author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8
parent     6a00f206debf8a5c8899055726ad127dbeeed098

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/DAC960.c | 23
-rw-r--r--  drivers/block/Kconfig | 39
-rw-r--r--  drivers/block/Makefile | 4
-rw-r--r--  drivers/block/amiflop.c | 91
-rw-r--r--  drivers/block/aoe/Makefile | 2
-rw-r--r--  drivers/block/aoe/aoeblk.c | 16
-rw-r--r--  drivers/block/aoe/aoechr.c | 10
-rw-r--r--  drivers/block/aoe/aoecmd.c | 6
-rw-r--r--  drivers/block/aoe/aoedev.c | 4
-rw-r--r--  drivers/block/ataflop.c | 78
-rw-r--r--  drivers/block/brd.c | 50
-rw-r--r--  drivers/block/cciss.c | 1691
-rw-r--r--  drivers/block/cciss.h | 18
-rw-r--r--  drivers/block/cciss_cmd.h | 14
-rw-r--r--  drivers/block/cciss_scsi.c | 62
-rw-r--r--  drivers/block/cciss_scsi.h | 4
-rw-r--r--  drivers/block/cpqarray.c | 18
-rw-r--r--  drivers/block/drbd/drbd_actlog.c | 404
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 790
-rw-r--r--  drivers/block/drbd/drbd_int.h | 575
-rw-r--r--  drivers/block/drbd/drbd_main.c | 1321
-rw-r--r--  drivers/block/drbd/drbd_nl.c | 648
-rw-r--r--  drivers/block/drbd/drbd_proc.c | 137
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 1766
-rw-r--r--  drivers/block/drbd/drbd_req.c | 388
-rw-r--r--  drivers/block/drbd/drbd_req.h | 102
-rw-r--r--  drivers/block/drbd/drbd_strings.c | 6
-rw-r--r--  drivers/block/drbd/drbd_vli.h | 2
-rw-r--r--  drivers/block/drbd/drbd_worker.c | 672
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h | 20
-rw-r--r--  drivers/block/floppy.c | 114
-rw-r--r--  drivers/block/hd.c | 2
-rw-r--r--  drivers/block/loop.c | 174
-rw-r--r--  drivers/block/nbd.c | 26
-rw-r--r--  drivers/block/osdblk.c | 5
-rw-r--r--  drivers/block/paride/pcd.c | 33
-rw-r--r--  drivers/block/paride/pd.c | 21
-rw-r--r--  drivers/block/paride/pf.c | 26
-rw-r--r--  drivers/block/paride/pg.c | 8
-rw-r--r--  drivers/block/paride/pt.c | 20
-rw-r--r--  drivers/block/pktcdvd.c | 55
-rw-r--r--  drivers/block/ps3disk.c | 2
-rw-r--r--  drivers/block/rbd.c | 2499
-rw-r--r--  drivers/block/rbd_types.h | 73
-rw-r--r--  drivers/block/smart1,2.h | 2
-rw-r--r--  drivers/block/swim.c | 22
-rw-r--r--  drivers/block/swim3.c | 25
-rw-r--r--  drivers/block/ub.c | 26
-rw-r--r--  drivers/block/umem.c | 26
-rw-r--r--  drivers/block/viodasd.c | 13
-rw-r--r--  drivers/block/virtio_blk.c | 145
-rw-r--r--  drivers/block/xd.c | 7
-rw-r--r--  drivers/block/xen-blkback/Makefile | 3
-rw-r--r--  drivers/block/xen-blkback/blkback.c | 826
-rw-r--r--  drivers/block/xen-blkback/common.h | 233
-rw-r--r--  drivers/block/xen-blkback/xenbus.c | 767
-rw-r--r--  drivers/block/xen-blkfront.c | 230
-rw-r--r--  drivers/block/xsysace.c | 41
-rw-r--r--  drivers/block/z2ram.c | 21
59 files changed, 10494 insertions, 3912 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 4e2c367fec11..e086fbbbe853 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -36,7 +36,7 @@
 #include <linux/ioport.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/reboot.h>
@@ -54,6 +54,7 @@
 #define DAC960_GAM_MINOR 252
 
 
+static DEFINE_MUTEX(DAC960_mutex);
 static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
 static int DAC960_ControllerCount;
 static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
@@ -81,7 +82,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
 	int drive_nr = (long)disk->private_data;
 	int ret = -ENXIO;
 
-	lock_kernel();
+	mutex_lock(&DAC960_mutex);
 	if (p->FirmwareType == DAC960_V1_Controller) {
 		if (p->V1.LogicalDriveInformation[drive_nr].
 		    LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
@@ -99,7 +100,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
 		goto out;
 	ret = 0;
 out:
-	unlock_kernel();
+	mutex_unlock(&DAC960_mutex);
 	return ret;
 }
 
@@ -139,13 +140,14 @@ static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int DAC960_media_changed(struct gendisk *disk)
+static unsigned int DAC960_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
 
 	if (!p->LogicalDriveInitiallyAccessible[drive_nr])
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -162,7 +164,7 @@ static const struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner = THIS_MODULE,
 	.open = DAC960_open,
 	.getgeo = DAC960_getgeo,
-	.media_changed = DAC960_media_changed,
+	.check_events = DAC960_check_events,
 	.revalidate_disk = DAC960_revalidate_disk,
 };
 
@@ -1788,7 +1790,7 @@ static bool DAC960_V2_ReadControllerConfiguration(DAC960_Controller_T
 	unsigned short LogicalDeviceNumber = 0;
 	int ModelNameLength;
 
-	/* Get data into dma-able area, then copy into permanant location */
+	/* Get data into dma-able area, then copy into permanent location */
 	if (!DAC960_V2_NewControllerInfo(Controller))
 		return DAC960_Failure(Controller, "GET CONTROLLER INFO");
 	memcpy(ControllerInfo, Controller->V2.NewControllerInformation,
@@ -6625,7 +6627,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 	long ErrorCode = 0;
 	if (!capable(CAP_SYS_ADMIN)) return -EACCES;
 
-	lock_kernel();
+	mutex_lock(&DAC960_mutex);
 	switch (Request)
 	{
 	case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7056,13 +7058,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 	default:
 		ErrorCode = -ENOTTY;
 	}
-	unlock_kernel();
+	mutex_unlock(&DAC960_mutex);
 	return ErrorCode;
 }
 
 static const struct file_operations DAC960_gam_fops = {
 	.owner = THIS_MODULE,
-	.unlocked_ioctl = DAC960_gam_ioctl
+	.unlocked_ioctl = DAC960_gam_ioctl,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice DAC960_gam_dev = {
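
The DAC960 hunks above show the two mechanical conversions that repeat throughout this merge (amiflop, ataflop, aoe, brd, cciss and others): the big kernel lock (lock_kernel()/unlock_kernel()) becomes a driver-local mutex, and the .media_changed block_device_operations method becomes .check_events returning DISK_EVENT_MEDIA_CHANGE. A minimal sketch of that pattern for a hypothetical driver "foo" follows; foo_locked_open() and foo_test_media_change() are illustrative stand-ins, not functions from this patch.

/*
 * Illustrative sketch only: a hypothetical driver converted the same
 * way as DAC960 above.  The foo_* helpers are assumed to exist in the
 * driver; they are not part of this commit.
 */
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>

static DEFINE_MUTEX(foo_mutex);			/* replaces the BKL */

static int foo_open(struct block_device *bdev, fmode_t mode)
{
	int ret;

	mutex_lock(&foo_mutex);			/* was: lock_kernel()   */
	ret = foo_locked_open(bdev, mode);
	mutex_unlock(&foo_mutex);		/* was: unlock_kernel() */
	return ret;
}

/* was: static int foo_media_changed(struct gendisk *disk) */
static unsigned int foo_check_events(struct gendisk *disk,
				     unsigned int clearing)
{
	/* report media change as an event mask instead of a 0/1 flag */
	return foo_test_media_change(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations foo_fops = {
	.owner		= THIS_MODULE,
	.open		= foo_open,
	.check_events	= foo_check_events,	/* was: .media_changed */
};
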
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..717d6e4e18d3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -464,11 +464,33 @@ config XEN_BLKDEV_FRONTEND
464 tristate "Xen virtual block device support" 464 tristate "Xen virtual block device support"
465 depends on XEN 465 depends on XEN
466 default y 466 default y
467 select XEN_XENBUS_FRONTEND
467 help 468 help
468 This driver implements the front-end of the Xen virtual 469 This driver implements the front-end of the Xen virtual
469 block device driver. It communicates with a back-end driver 470 block device driver. It communicates with a back-end driver
470 in another domain which drives the actual block device. 471 in another domain which drives the actual block device.
471 472
473config XEN_BLKDEV_BACKEND
474 tristate "Block-device backend driver"
475 depends on XEN_BACKEND
476 help
477 The block-device backend driver allows the kernel to export its
478 block devices to other guests via a high-performance shared-memory
479 interface.
480
481 The corresponding Linux frontend driver is enabled by the
482 CONFIG_XEN_BLKDEV_FRONTEND configuration option.
483
484 The backend driver attaches itself to a any block device specified
485 in the XenBus configuration. There are no limits to what the block
486 device as long as it has a major and minor.
487
488 If you are compiling a kernel to run in a Xen block backend driver
489 domain (often this is domain 0) you should say Y here. To
490 compile this driver as a module, chose M here: the module
491 will be called xen-blkback.
492
493
472config VIRTIO_BLK 494config VIRTIO_BLK
473 tristate "Virtio block driver (EXPERIMENTAL)" 495 tristate "Virtio block driver (EXPERIMENTAL)"
474 depends on EXPERIMENTAL && VIRTIO 496 depends on EXPERIMENTAL && VIRTIO
@@ -488,4 +510,21 @@ config BLK_DEV_HD
488 510
489 If unsure, say N. 511 If unsure, say N.
490 512
513config BLK_DEV_RBD
514 tristate "Rados block device (RBD)"
515 depends on INET && EXPERIMENTAL && BLOCK
516 select CEPH_LIB
517 select LIBCRC32C
518 select CRYPTO_AES
519 select CRYPTO
520 default n
521 help
522 Say Y here if you want include the Rados block device, which stripes
523 a block device over objects stored in the Ceph distributed object
524 store.
525
526 More information at http://ceph.newdream.net/.
527
528 If unsure, say N.
529
491endif # BLK_DEV 530endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..76646e9a1c91 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,6 +36,8 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
 obj-$(CONFIG_BLK_DEV_HD) += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
 obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
 
-swim_mod-objs := swim.o swim_asm.o
+swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 76f114f0bba3..8eba86bba599 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -60,7 +60,7 @@
60#include <linux/hdreg.h> 60#include <linux/hdreg.h>
61#include <linux/delay.h> 61#include <linux/delay.h>
62#include <linux/init.h> 62#include <linux/init.h>
63#include <linux/smp_lock.h> 63#include <linux/mutex.h>
64#include <linux/amifdreg.h> 64#include <linux/amifdreg.h>
65#include <linux/amifd.h> 65#include <linux/amifd.h>
66#include <linux/buffer_head.h> 66#include <linux/buffer_head.h>
@@ -109,13 +109,12 @@
109#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */ 109#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */
110#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */ 110#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */
111 111
112static DEFINE_MUTEX(amiflop_mutex);
112static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ 113static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
113 114
114module_param(fd_def_df0, ulong, 0); 115module_param(fd_def_df0, ulong, 0);
115MODULE_LICENSE("GPL"); 116MODULE_LICENSE("GPL");
116 117
117static struct request_queue *floppy_queue;
118
119/* 118/*
120 * Macros 119 * Macros
121 */ 120 */
@@ -164,6 +163,7 @@ static volatile int selected = -1; /* currently selected drive */
164static int writepending; 163static int writepending;
165static int writefromint; 164static int writefromint;
166static char *raw_buf; 165static char *raw_buf;
166static int fdc_queue;
167 167
168static DEFINE_SPINLOCK(amiflop_lock); 168static DEFINE_SPINLOCK(amiflop_lock);
169 169
@@ -1334,6 +1334,42 @@ static int get_track(int drive, int track)
1334 return -1; 1334 return -1;
1335} 1335}
1336 1336
1337/*
1338 * Round-robin between our available drives, doing one request from each
1339 */
1340static struct request *set_next_request(void)
1341{
1342 struct request_queue *q;
1343 int cnt = FD_MAX_UNITS;
1344 struct request *rq = NULL;
1345
1346 /* Find next queue we can dispatch from */
1347 fdc_queue = fdc_queue + 1;
1348 if (fdc_queue == FD_MAX_UNITS)
1349 fdc_queue = 0;
1350
1351 for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
1352
1353 if (unit[fdc_queue].type->code == FD_NODRIVE) {
1354 if (++fdc_queue == FD_MAX_UNITS)
1355 fdc_queue = 0;
1356 continue;
1357 }
1358
1359 q = unit[fdc_queue].gendisk->queue;
1360 if (q) {
1361 rq = blk_fetch_request(q);
1362 if (rq)
1363 break;
1364 }
1365
1366 if (++fdc_queue == FD_MAX_UNITS)
1367 fdc_queue = 0;
1368 }
1369
1370 return rq;
1371}
1372
1337static void redo_fd_request(void) 1373static void redo_fd_request(void)
1338{ 1374{
1339 struct request *rq; 1375 struct request *rq;
@@ -1345,7 +1381,7 @@ static void redo_fd_request(void)
1345 int err; 1381 int err;
1346 1382
1347next_req: 1383next_req:
1348 rq = blk_fetch_request(floppy_queue); 1384 rq = set_next_request();
1349 if (!rq) { 1385 if (!rq) {
1350 /* Nothing left to do */ 1386 /* Nothing left to do */
1351 return; 1387 return;
@@ -1506,9 +1542,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
1506{ 1542{
1507 int ret; 1543 int ret;
1508 1544
1509 lock_kernel(); 1545 mutex_lock(&amiflop_mutex);
1510 ret = fd_locked_ioctl(bdev, mode, cmd, param); 1546 ret = fd_locked_ioctl(bdev, mode, cmd, param);
1511 unlock_kernel(); 1547 mutex_unlock(&amiflop_mutex);
1512 1548
1513 return ret; 1549 return ret;
1514} 1550}
@@ -1555,11 +1591,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1555 int old_dev; 1591 int old_dev;
1556 unsigned long flags; 1592 unsigned long flags;
1557 1593
1558 lock_kernel(); 1594 mutex_lock(&amiflop_mutex);
1559 old_dev = fd_device[drive]; 1595 old_dev = fd_device[drive];
1560 1596
1561 if (fd_ref[drive] && old_dev != system) { 1597 if (fd_ref[drive] && old_dev != system) {
1562 unlock_kernel(); 1598 mutex_unlock(&amiflop_mutex);
1563 return -EBUSY; 1599 return -EBUSY;
1564 } 1600 }
1565 1601
@@ -1575,7 +1611,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1575 rel_fdc(); 1611 rel_fdc();
1576 1612
1577 if (wrprot) { 1613 if (wrprot) {
1578 unlock_kernel(); 1614 mutex_unlock(&amiflop_mutex);
1579 return -EROFS; 1615 return -EROFS;
1580 } 1616 }
1581 } 1617 }
@@ -1594,7 +1630,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1594 printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, 1630 printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
1595 unit[drive].type->name, data_types[system].name); 1631 unit[drive].type->name, data_types[system].name);
1596 1632
1597 unlock_kernel(); 1633 mutex_unlock(&amiflop_mutex);
1598 return 0; 1634 return 0;
1599} 1635}
1600 1636
@@ -1603,7 +1639,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
1603 struct amiga_floppy_struct *p = disk->private_data; 1639 struct amiga_floppy_struct *p = disk->private_data;
1604 int drive = p - unit; 1640 int drive = p - unit;
1605 1641
1606 lock_kernel(); 1642 mutex_lock(&amiflop_mutex);
1607 if (unit[drive].dirty == 1) { 1643 if (unit[drive].dirty == 1) {
1608 del_timer (flush_track_timer + drive); 1644 del_timer (flush_track_timer + drive);
1609 non_int_flush_track (drive); 1645 non_int_flush_track (drive);
@@ -1617,17 +1653,17 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
1617/* the mod_use counter is handled this way */ 1653/* the mod_use counter is handled this way */
1618 floppy_off (drive | 0x40000000); 1654 floppy_off (drive | 0x40000000);
1619#endif 1655#endif
1620 unlock_kernel(); 1656 mutex_unlock(&amiflop_mutex);
1621 return 0; 1657 return 0;
1622} 1658}
1623 1659
1624/* 1660/*
1625 * floppy-change is never called from an interrupt, so we can relax a bit 1661 * check_events is never called from an interrupt, so we can relax a bit
1626 * here, sleep etc. Note that floppy-on tries to set current_DOR to point 1662 * here, sleep etc. Note that floppy-on tries to set current_DOR to point
1627 * to the desired drive, but it will probably not survive the sleep if 1663 * to the desired drive, but it will probably not survive the sleep if
1628 * several floppies are used at the same time: thus the loop. 1664 * several floppies are used at the same time: thus the loop.
1629 */ 1665 */
1630static int amiga_floppy_change(struct gendisk *disk) 1666static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
1631{ 1667{
1632 struct amiga_floppy_struct *p = disk->private_data; 1668 struct amiga_floppy_struct *p = disk->private_data;
1633 int drive = p - unit; 1669 int drive = p - unit;
@@ -1650,7 +1686,7 @@ static int amiga_floppy_change(struct gendisk *disk)
1650 p->dirty = 0; 1686 p->dirty = 0;
1651 writepending = 0; /* if this was true before, too bad! */ 1687 writepending = 0; /* if this was true before, too bad! */
1652 writefromint = 0; 1688 writefromint = 0;
1653 return 1; 1689 return DISK_EVENT_MEDIA_CHANGE;
1654 } 1690 }
1655 return 0; 1691 return 0;
1656} 1692}
@@ -1661,7 +1697,7 @@ static const struct block_device_operations floppy_fops = {
1661 .release = floppy_release, 1697 .release = floppy_release,
1662 .ioctl = fd_ioctl, 1698 .ioctl = fd_ioctl,
1663 .getgeo = fd_getgeo, 1699 .getgeo = fd_getgeo,
1664 .media_changed = amiga_floppy_change, 1700 .check_events = amiga_check_events,
1665}; 1701};
1666 1702
1667static int __init fd_probe_drives(void) 1703static int __init fd_probe_drives(void)
@@ -1682,6 +1718,13 @@ static int __init fd_probe_drives(void)
1682 continue; 1718 continue;
1683 } 1719 }
1684 unit[drive].gendisk = disk; 1720 unit[drive].gendisk = disk;
1721
1722 disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
1723 if (!disk->queue) {
1724 unit[drive].type->code = FD_NODRIVE;
1725 continue;
1726 }
1727
1685 drives++; 1728 drives++;
1686 if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { 1729 if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
1687 printk("no mem for "); 1730 printk("no mem for ");
@@ -1695,7 +1738,6 @@ static int __init fd_probe_drives(void)
1695 disk->fops = &floppy_fops; 1738 disk->fops = &floppy_fops;
1696 sprintf(disk->disk_name, "fd%d", drive); 1739 sprintf(disk->disk_name, "fd%d", drive);
1697 disk->private_data = &unit[drive]; 1740 disk->private_data = &unit[drive];
1698 disk->queue = floppy_queue;
1699 set_capacity(disk, 880*2); 1741 set_capacity(disk, 880*2);
1700 add_disk(disk); 1742 add_disk(disk);
1701 } 1743 }
@@ -1726,8 +1768,8 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
1726 return -EBUSY; 1768 return -EBUSY;
1727 1769
1728 ret = -ENOMEM; 1770 ret = -ENOMEM;
1729 if ((raw_buf = (char *)amiga_chip_alloc (RAW_BUF_SIZE, "Floppy")) == 1771 raw_buf = amiga_chip_alloc(RAW_BUF_SIZE, "Floppy");
1730 NULL) { 1772 if (!raw_buf) {
1731 printk("fd: cannot get chip mem buffer\n"); 1773 printk("fd: cannot get chip mem buffer\n");
1732 goto out_blkdev; 1774 goto out_blkdev;
1733 } 1775 }
@@ -1743,11 +1785,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
1743 goto out_irq2; 1785 goto out_irq2;
1744 } 1786 }
1745 1787
1746 ret = -ENOMEM;
1747 floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
1748 if (!floppy_queue)
1749 goto out_queue;
1750
1751 ret = -ENODEV; 1788 ret = -ENODEV;
1752 if (fd_probe_drives() < 1) /* No usable drives */ 1789 if (fd_probe_drives() < 1) /* No usable drives */
1753 goto out_probe; 1790 goto out_probe;
@@ -1791,8 +1828,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
1791 return 0; 1828 return 0;
1792 1829
1793out_probe: 1830out_probe:
1794 blk_cleanup_queue(floppy_queue);
1795out_queue:
1796 free_irq(IRQ_AMIGA_CIAA_TB, NULL); 1831 free_irq(IRQ_AMIGA_CIAA_TB, NULL);
1797out_irq2: 1832out_irq2:
1798 free_irq(IRQ_AMIGA_DSKBLK, NULL); 1833 free_irq(IRQ_AMIGA_DSKBLK, NULL);
@@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
1810 1845
1811 for( i = 0; i < FD_MAX_UNITS; i++) { 1846 for( i = 0; i < FD_MAX_UNITS; i++) {
1812 if (unit[i].type->code != FD_NODRIVE) { 1847 if (unit[i].type->code != FD_NODRIVE) {
1848 struct request_queue *q = unit[i].gendisk->queue;
1813 del_gendisk(unit[i].gendisk); 1849 del_gendisk(unit[i].gendisk);
1814 put_disk(unit[i].gendisk); 1850 put_disk(unit[i].gendisk);
1815 kfree(unit[i].trackbuf); 1851 kfree(unit[i].trackbuf);
1852 if (q)
1853 blk_cleanup_queue(q);
1816 } 1854 }
1817 } 1855 }
1818 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 1856 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
1820 free_irq(IRQ_AMIGA_DSKBLK, NULL); 1858 free_irq(IRQ_AMIGA_DSKBLK, NULL);
1821 custom.dmacon = DMAF_DISK; /* disable DMA */ 1859 custom.dmacon = DMAF_DISK; /* disable DMA */
1822 amiga_chip_free(raw_buf); 1860 amiga_chip_free(raw_buf);
1823 blk_cleanup_queue(floppy_queue);
1824 unregister_blkdev(FLOPPY_MAJOR, "fd"); 1861 unregister_blkdev(FLOPPY_MAJOR, "fd");
1825} 1862}
1826#endif 1863#endif
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile
index e76d997183c6..06ea82cdf27d 100644
--- a/drivers/block/aoe/Makefile
+++ b/drivers/block/aoe/Makefile
@@ -3,4 +3,4 @@
 #
 
 obj-$(CONFIG_ATA_OVER_ETH) += aoe.o
-aoe-objs := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
+aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index a946929735a5..528f6318ded1 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -4,17 +4,20 @@
  * block device routines
  */
 
+#include <linux/kernel.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/ioctl.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include <linux/genhd.h>
 #include <linux/netdevice.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include "aoe.h"
 
+static DEFINE_MUTEX(aoeblk_mutex);
 static struct kmem_cache *buf_pool_cache;
 
 static ssize_t aoedisk_show_state(struct device *dev,
@@ -125,16 +128,16 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
 	struct aoedev *d = bdev->bd_disk->private_data;
 	ulong flags;
 
-	lock_kernel();
+	mutex_lock(&aoeblk_mutex);
 	spin_lock_irqsave(&d->lock, flags);
 	if (d->flags & DEVFL_UP) {
 		d->nopen++;
 		spin_unlock_irqrestore(&d->lock, flags);
-		unlock_kernel();
+		mutex_unlock(&aoeblk_mutex);
 		return 0;
 	}
 	spin_unlock_irqrestore(&d->lock, flags);
-	unlock_kernel();
+	mutex_unlock(&aoeblk_mutex);
 	return -ENODEV;
 }
 
@@ -177,9 +180,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 		BUG();
 		bio_endio(bio, -ENXIO);
 		return 0;
-	} else if (bio->bi_rw & REQ_HARDBARRIER) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
 	} else if (bio->bi_io_vec == NULL) {
 		printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
 		BUG();
@@ -206,7 +206,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 	spin_lock_irqsave(&d->lock, flags);
 
 	if ((d->flags & DEVFL_UP) == 0) {
-		printk(KERN_INFO "aoe: device %ld.%d is not up\n",
+		pr_info_ratelimited("aoe: device %ld.%d is not up\n",
 			d->aoemajor, d->aoeminor);
 		spin_unlock_irqrestore(&d->lock, flags);
 		mempool_free(buf, d->bufpool);
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 4a1b9e7464aa..146296ca4965 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -9,7 +9,7 @@
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/skbuff.h>
 #include "aoe.h"
 
@@ -37,6 +37,7 @@ struct ErrMsg {
 	char *msg;
 };
 
+static DEFINE_MUTEX(aoechr_mutex);
 static struct ErrMsg emsgs[NMSG];
 static int emsgs_head_idx, emsgs_tail_idx;
 static struct completion emsgs_comp;
@@ -183,16 +184,16 @@ aoechr_open(struct inode *inode, struct file *filp)
 {
 	int n, i;
 
-	lock_kernel();
+	mutex_lock(&aoechr_mutex);
 	n = iminor(inode);
 	filp->private_data = (void *) (unsigned long) n;
 
 	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
 		if (chardevs[i].minor == n) {
-			unlock_kernel();
+			mutex_unlock(&aoechr_mutex);
 			return 0;
 		}
-	unlock_kernel();
+	mutex_unlock(&aoechr_mutex);
 	return -EINVAL;
 }
 
@@ -265,6 +266,7 @@ static const struct file_operations aoe_fops = {
 	.open = aoechr_open,
 	.release = aoechr_rel,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static char *aoe_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 5674bd01d96d..de0435e63b02 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -297,8 +297,8 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
 	struct sk_buff *skb;
 	struct net_device *ifp;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, ifp) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, ifp) {
 		dev_hold(ifp);
 		if (!is_aoe_netif(ifp))
 			goto cont;
@@ -325,7 +325,7 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
 cont:
 		dev_put(ifp);
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static void
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 0849280bfc1c..6b5110a47458 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -102,6 +102,7 @@ aoedev_freedev(struct aoedev *d)
 {
 	struct aoetgt **t, **e;
 
+	cancel_work_sync(&d->work);
 	if (d->gd) {
 		aoedisk_rm_sysfs(d);
 		del_gendisk(d->gd);
@@ -135,7 +136,6 @@ aoedev_flush(const char __user *str, size_t cnt)
 		all = !strncmp(buf, "all", 3);
 	}
 
-	flush_scheduled_work();
 	spin_lock_irqsave(&devlist_lock, flags);
 	dd = &devlist;
 	while ((d = *dd)) {
@@ -257,8 +257,6 @@ aoedev_exit(void)
 	struct aoedev *d;
 	ulong flags;
 
-	flush_scheduled_work();
-
 	while ((d = devlist)) {
 		devlist = d->next;
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index aceb96476524..ede16c64ff07 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,7 +67,7 @@
67#include <linux/delay.h> 67#include <linux/delay.h>
68#include <linux/init.h> 68#include <linux/init.h>
69#include <linux/blkdev.h> 69#include <linux/blkdev.h>
70#include <linux/smp_lock.h> 70#include <linux/mutex.h>
71 71
72#include <asm/atafd.h> 72#include <asm/atafd.h>
73#include <asm/atafdreg.h> 73#include <asm/atafdreg.h>
@@ -79,8 +79,9 @@
79 79
80#undef DEBUG 80#undef DEBUG
81 81
82static struct request_queue *floppy_queue; 82static DEFINE_MUTEX(ataflop_mutex);
83static struct request *fd_request; 83static struct request *fd_request;
84static int fdc_queue;
84 85
85/* Disk types: DD, HD, ED */ 86/* Disk types: DD, HD, ED */
86static struct atari_disk_type { 87static struct atari_disk_type {
@@ -1323,23 +1324,24 @@ static void finish_fdc_done( int dummy )
1323 * due to unrecognised disk changes. 1324 * due to unrecognised disk changes.
1324 */ 1325 */
1325 1326
1326static int check_floppy_change(struct gendisk *disk) 1327static unsigned int floppy_check_events(struct gendisk *disk,
1328 unsigned int clearing)
1327{ 1329{
1328 struct atari_floppy_struct *p = disk->private_data; 1330 struct atari_floppy_struct *p = disk->private_data;
1329 unsigned int drive = p - unit; 1331 unsigned int drive = p - unit;
1330 if (test_bit (drive, &fake_change)) { 1332 if (test_bit (drive, &fake_change)) {
1331 /* simulated change (e.g. after formatting) */ 1333 /* simulated change (e.g. after formatting) */
1332 return 1; 1334 return DISK_EVENT_MEDIA_CHANGE;
1333 } 1335 }
1334 if (test_bit (drive, &changed_floppies)) { 1336 if (test_bit (drive, &changed_floppies)) {
1335 /* surely changed (the WP signal changed at least once) */ 1337 /* surely changed (the WP signal changed at least once) */
1336 return 1; 1338 return DISK_EVENT_MEDIA_CHANGE;
1337 } 1339 }
1338 if (UD.wpstat) { 1340 if (UD.wpstat) {
1339 /* WP is on -> could be changed: to be sure, buffers should be 1341 /* WP is on -> could be changed: to be sure, buffers should be
1340 * invalidated... 1342 * invalidated...
1341 */ 1343 */
1342 return 1; 1344 return DISK_EVENT_MEDIA_CHANGE;
1343 } 1345 }
1344 1346
1345 return 0; 1347 return 0;
@@ -1391,6 +1393,29 @@ static void setup_req_params( int drive )
1391 ReqTrack, ReqSector, (unsigned long)ReqData )); 1393 ReqTrack, ReqSector, (unsigned long)ReqData ));
1392} 1394}
1393 1395
1396/*
1397 * Round-robin between our available drives, doing one request from each
1398 */
1399static struct request *set_next_request(void)
1400{
1401 struct request_queue *q;
1402 int old_pos = fdc_queue;
1403 struct request *rq = NULL;
1404
1405 do {
1406 q = unit[fdc_queue].disk->queue;
1407 if (++fdc_queue == FD_MAX_UNITS)
1408 fdc_queue = 0;
1409 if (q) {
1410 rq = blk_fetch_request(q);
1411 if (rq)
1412 break;
1413 }
1414 } while (fdc_queue != old_pos);
1415
1416 return rq;
1417}
1418
1394 1419
1395static void redo_fd_request(void) 1420static void redo_fd_request(void)
1396{ 1421{
@@ -1405,7 +1430,7 @@ static void redo_fd_request(void)
1405 1430
1406repeat: 1431repeat:
1407 if (!fd_request) { 1432 if (!fd_request) {
1408 fd_request = blk_fetch_request(floppy_queue); 1433 fd_request = set_next_request();
1409 if (!fd_request) 1434 if (!fd_request)
1410 goto the_end; 1435 goto the_end;
1411 } 1436 }
@@ -1546,7 +1571,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
1546 * or the next access will revalidate - and clear UDT :-( 1571 * or the next access will revalidate - and clear UDT :-(
1547 */ 1572 */
1548 1573
1549 if (check_floppy_change(disk)) 1574 if (floppy_check_events(disk, 0))
1550 floppy_revalidate(disk); 1575 floppy_revalidate(disk);
1551 1576
1552 if (UD.flags & FTD_MSG) 1577 if (UD.flags & FTD_MSG)
@@ -1671,9 +1696,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
1671{ 1696{
1672 int ret; 1697 int ret;
1673 1698
1674 lock_kernel(); 1699 mutex_lock(&ataflop_mutex);
1675 ret = fd_locked_ioctl(bdev, mode, cmd, arg); 1700 ret = fd_locked_ioctl(bdev, mode, cmd, arg);
1676 unlock_kernel(); 1701 mutex_unlock(&ataflop_mutex);
1677 1702
1678 return ret; 1703 return ret;
1679} 1704}
@@ -1854,9 +1879,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
1854{ 1879{
1855 int ret; 1880 int ret;
1856 1881
1857 lock_kernel(); 1882 mutex_lock(&ataflop_mutex);
1858 ret = floppy_open(bdev, mode); 1883 ret = floppy_open(bdev, mode);
1859 unlock_kernel(); 1884 mutex_unlock(&ataflop_mutex);
1860 1885
1861 return ret; 1886 return ret;
1862} 1887}
@@ -1864,14 +1889,14 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
1864static int floppy_release(struct gendisk *disk, fmode_t mode) 1889static int floppy_release(struct gendisk *disk, fmode_t mode)
1865{ 1890{
1866 struct atari_floppy_struct *p = disk->private_data; 1891 struct atari_floppy_struct *p = disk->private_data;
1867 lock_kernel(); 1892 mutex_lock(&ataflop_mutex);
1868 if (p->ref < 0) 1893 if (p->ref < 0)
1869 p->ref = 0; 1894 p->ref = 0;
1870 else if (!p->ref--) { 1895 else if (!p->ref--) {
1871 printk(KERN_ERR "floppy_release with fd_ref == 0"); 1896 printk(KERN_ERR "floppy_release with fd_ref == 0");
1872 p->ref = 0; 1897 p->ref = 0;
1873 } 1898 }
1874 unlock_kernel(); 1899 mutex_unlock(&ataflop_mutex);
1875 return 0; 1900 return 0;
1876} 1901}
1877 1902
@@ -1880,7 +1905,7 @@ static const struct block_device_operations floppy_fops = {
1880 .open = floppy_unlocked_open, 1905 .open = floppy_unlocked_open,
1881 .release = floppy_release, 1906 .release = floppy_release,
1882 .ioctl = fd_ioctl, 1907 .ioctl = fd_ioctl,
1883 .media_changed = check_floppy_change, 1908 .check_events = floppy_check_events,
1884 .revalidate_disk= floppy_revalidate, 1909 .revalidate_disk= floppy_revalidate,
1885}; 1910};
1886 1911
@@ -1932,10 +1957,6 @@ static int __init atari_floppy_init (void)
1932 PhysTrackBuffer = virt_to_phys(TrackBuffer); 1957 PhysTrackBuffer = virt_to_phys(TrackBuffer);
1933 BufferDrive = BufferSide = BufferTrack = -1; 1958 BufferDrive = BufferSide = BufferTrack = -1;
1934 1959
1935 floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
1936 if (!floppy_queue)
1937 goto Enomem;
1938
1939 for (i = 0; i < FD_MAX_UNITS; i++) { 1960 for (i = 0; i < FD_MAX_UNITS; i++) {
1940 unit[i].track = -1; 1961 unit[i].track = -1;
1941 unit[i].flags = 0; 1962 unit[i].flags = 0;
@@ -1944,7 +1965,10 @@ static int __init atari_floppy_init (void)
1944 sprintf(unit[i].disk->disk_name, "fd%d", i); 1965 sprintf(unit[i].disk->disk_name, "fd%d", i);
1945 unit[i].disk->fops = &floppy_fops; 1966 unit[i].disk->fops = &floppy_fops;
1946 unit[i].disk->private_data = &unit[i]; 1967 unit[i].disk->private_data = &unit[i];
1947 unit[i].disk->queue = floppy_queue; 1968 unit[i].disk->queue = blk_init_queue(do_fd_request,
1969 &ataflop_lock);
1970 if (!unit[i].disk->queue)
1971 goto Enomem;
1948 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); 1972 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
1949 add_disk(unit[i].disk); 1973 add_disk(unit[i].disk);
1950 } 1974 }
@@ -1959,10 +1983,14 @@ static int __init atari_floppy_init (void)
1959 1983
1960 return 0; 1984 return 0;
1961Enomem: 1985Enomem:
1962 while (i--) 1986 while (i--) {
1987 struct request_queue *q = unit[i].disk->queue;
1988
1963 put_disk(unit[i].disk); 1989 put_disk(unit[i].disk);
1964 if (floppy_queue) 1990 if (q)
1965 blk_cleanup_queue(floppy_queue); 1991 blk_cleanup_queue(q);
1992 }
1993
1966 unregister_blkdev(FLOPPY_MAJOR, "fd"); 1994 unregister_blkdev(FLOPPY_MAJOR, "fd");
1967 return -ENOMEM; 1995 return -ENOMEM;
1968} 1996}
@@ -2011,12 +2039,14 @@ static void __exit atari_floppy_exit(void)
2011 int i; 2039 int i;
2012 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 2040 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
2013 for (i = 0; i < FD_MAX_UNITS; i++) { 2041 for (i = 0; i < FD_MAX_UNITS; i++) {
2042 struct request_queue *q = unit[i].disk->queue;
2043
2014 del_gendisk(unit[i].disk); 2044 del_gendisk(unit[i].disk);
2015 put_disk(unit[i].disk); 2045 put_disk(unit[i].disk);
2046 blk_cleanup_queue(q);
2016 } 2047 }
2017 unregister_blkdev(FLOPPY_MAJOR, "fd"); 2048 unregister_blkdev(FLOPPY_MAJOR, "fd");
2018 2049
2019 blk_cleanup_queue(floppy_queue);
2020 del_timer_sync(&fd_timer); 2050 del_timer_sync(&fd_timer);
2021 atari_stram_free( DMABuffer ); 2051 atari_stram_free( DMABuffer );
2022} 2052}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f63792ff8..dba1c32e1ddf 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,7 +15,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/radix-tree.h>
 #include <linux/buffer_head.h> /* invalidate_bh_lrus() */
 #include <linux/slab.h>
@@ -35,10 +35,6 @@
  */
 struct brd_device {
 	int brd_number;
-	int brd_refcnt;
-	loff_t brd_offset;
-	loff_t brd_sizelimit;
-	unsigned brd_blocksize;
 
 	struct request_queue *brd_queue;
 	struct gendisk *brd_disk;
@@ -55,6 +51,7 @@ struct brd_device {
 /*
  * Look up and return a brd's page for a given sector.
  */
+static DEFINE_MUTEX(brd_mutex);
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
 	pgoff_t idx;
@@ -402,7 +399,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	 * ram device BLKFLSBUF has special semantics, we want to actually
 	 * release and destroy the ramdisk data.
 	 */
-	lock_kernel();
+	mutex_lock(&brd_mutex);
 	mutex_lock(&bdev->bd_mutex);
 	error = -EBUSY;
 	if (bdev->bd_openers <= 1) {
@@ -419,7 +416,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 		error = 0;
 	}
 	mutex_unlock(&bdev->bd_mutex);
-	unlock_kernel();
+	mutex_unlock(&brd_mutex);
 
 	return error;
 }
@@ -439,11 +436,11 @@ static int rd_nr;
 int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 static int max_part;
 static int part_shift;
-module_param(rd_nr, int, 0);
+module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
-module_param(rd_size, int, 0);
+module_param(rd_size, int, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
-module_param(max_part, int, 0);
+module_param(max_part, int, S_IRUGO);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
@@ -482,7 +479,6 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
@@ -552,7 +548,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
 	struct kobject *kobj;
 
 	mutex_lock(&brd_devices_mutex);
-	brd = brd_init_one(dev & MINORMASK);
+	brd = brd_init_one(MINOR(dev) >> part_shift);
 	kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
 	mutex_unlock(&brd_devices_mutex);
 
@@ -575,25 +571,39 @@ static int __init brd_init(void)
 	 *
 	 * (1) if rd_nr is specified, create that many upfront, and this
	 *     also becomes a hard limit.
-	 * (2) if rd_nr is not specified, create 1 rd device on module
-	 *     load, user can further extend brd device by create dev node
-	 *     themselves and have kernel automatically instantiate actual
-	 *     device on-demand.
+	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
+	 *     (default 16) rd device on module load, user can further
+	 *     extend brd device by create dev node themselves and have
+	 *     kernel automatically instantiate actual device on-demand.
 	 */
 
 	part_shift = 0;
-	if (max_part > 0)
+	if (max_part > 0) {
 		part_shift = fls(max_part);
 
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that user can decide correct minor number
+		 * if [s]he want to create more devices.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
+	if ((1UL << part_shift) > DISK_MAX_PARTS)
+		return -EINVAL;
+
 	if (rd_nr > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;
 
 	if (rd_nr) {
 		nr = rd_nr;
-		range = rd_nr;
+		range = rd_nr << part_shift;
 	} else {
 		nr = CONFIG_BLK_DEV_RAM_COUNT;
-		range = 1UL << (MINORBITS - part_shift);
+		range = 1UL << MINORBITS;
 	}
 
 	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
@@ -632,7 +642,7 @@ static void __exit brd_exit(void)
 	unsigned long range;
 	struct brd_device *brd, *next;
 
-	range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift);
+	range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
 
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
 		brd_del_one(brd);
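
The brd.c hunks above change the minor-number layout, and the new comment in brd_init() is easier to follow with concrete numbers. A worked sketch, assuming the module is loaded with max_part=15; the brd_example_* helpers are illustrative only and not part of brd.c.

/*
 * Illustrative only.  With max_part=15, brd_init() computes
 * part_shift = fls(15) = 4 and then max_part = (1UL << 4) - 1 = 15,
 * because minor 0 of each device addresses the whole disk.
 */
#include <linux/types.h>
#include <linux/kdev_t.h>	/* MINOR() */

static unsigned int brd_example_minor(unsigned int index, unsigned int part,
				      unsigned int part_shift)
{
	/* e.g. /dev/ram2, partition 3: (2 << 4) | 3 = minor 35 */
	return (index << part_shift) | part;
}

static unsigned int brd_example_index(dev_t dev, unsigned int part_shift)
{
	/* the reverse mapping used by brd_probe(): 35 >> 4 = device 2 */
	return MINOR(dev) >> part_shift;
}

This is also why brd_init() now reserves range = rd_nr << part_shift minors in the hunk above: each RAM disk consumes 1 << part_shift consecutive minor numbers.
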
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 5e4fadcdece9..8f4ef656a1af 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -26,7 +26,6 @@
26#include <linux/pci.h> 26#include <linux/pci.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/smp_lock.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/major.h> 30#include <linux/major.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -65,12 +64,13 @@ MODULE_DESCRIPTION("Driver for HP Smart Array Controllers");
65MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers"); 64MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
66MODULE_VERSION("3.6.26"); 65MODULE_VERSION("3.6.26");
67MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67static int cciss_tape_cmds = 6;
68module_param(cciss_tape_cmds, int, 0644);
69MODULE_PARM_DESC(cciss_tape_cmds,
70 "number of commands to allocate for tape devices (default: 6)");
68 71
69static int cciss_allow_hpsa; 72static DEFINE_MUTEX(cciss_mutex);
70module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); 73static struct proc_dir_entry *proc_cciss;
71MODULE_PARM_DESC(cciss_allow_hpsa,
72 "Prevent cciss driver from accessing hardware known to be "
73 " supported by the hpsa driver");
74 74
75#include "cciss_cmd.h" 75#include "cciss_cmd.h"
76#include "cciss.h" 76#include "cciss.h"
@@ -98,18 +98,6 @@ static const struct pci_device_id cciss_pci_device_id[] = {
98 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215}, 98 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215},
99 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237}, 99 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237},
100 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D}, 100 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D},
101 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241},
102 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243},
103 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245},
104 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247},
105 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
106 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A},
107 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B},
108 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250},
109 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251},
110 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252},
111 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253},
112 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254},
113 {0,} 101 {0,}
114}; 102};
115 103
@@ -130,6 +118,8 @@ static struct board_type products[] = {
130 {0x409D0E11, "Smart Array 6400 EM", &SA5_access}, 118 {0x409D0E11, "Smart Array 6400 EM", &SA5_access},
131 {0x40910E11, "Smart Array 6i", &SA5_access}, 119 {0x40910E11, "Smart Array 6i", &SA5_access},
132 {0x3225103C, "Smart Array P600", &SA5_access}, 120 {0x3225103C, "Smart Array P600", &SA5_access},
121 {0x3223103C, "Smart Array P800", &SA5_access},
122 {0x3234103C, "Smart Array P400", &SA5_access},
133 {0x3235103C, "Smart Array P400i", &SA5_access}, 123 {0x3235103C, "Smart Array P400i", &SA5_access},
134 {0x3211103C, "Smart Array E200i", &SA5_access}, 124 {0x3211103C, "Smart Array E200i", &SA5_access},
135 {0x3212103C, "Smart Array E200", &SA5_access}, 125 {0x3212103C, "Smart Array E200", &SA5_access},
@@ -137,23 +127,9 @@ static struct board_type products[] = {
137 {0x3214103C, "Smart Array E200i", &SA5_access}, 127 {0x3214103C, "Smart Array E200i", &SA5_access},
138 {0x3215103C, "Smart Array E200i", &SA5_access}, 128 {0x3215103C, "Smart Array E200i", &SA5_access},
139 {0x3237103C, "Smart Array E500", &SA5_access}, 129 {0x3237103C, "Smart Array E500", &SA5_access},
140/* controllers below this line are also supported by the hpsa driver. */
141#define HPSA_BOUNDARY 0x3223103C
142 {0x3223103C, "Smart Array P800", &SA5_access}, 130 {0x3223103C, "Smart Array P800", &SA5_access},
143 {0x3234103C, "Smart Array P400", &SA5_access}, 131 {0x3234103C, "Smart Array P400", &SA5_access},
144 {0x323D103C, "Smart Array P700m", &SA5_access}, 132 {0x323D103C, "Smart Array P700m", &SA5_access},
145 {0x3241103C, "Smart Array P212", &SA5_access},
146 {0x3243103C, "Smart Array P410", &SA5_access},
147 {0x3245103C, "Smart Array P410i", &SA5_access},
148 {0x3247103C, "Smart Array P411", &SA5_access},
149 {0x3249103C, "Smart Array P812", &SA5_access},
150 {0x324A103C, "Smart Array P712m", &SA5_access},
151 {0x324B103C, "Smart Array P711m", &SA5_access},
152 {0x3250103C, "Smart Array", &SA5_access},
153 {0x3251103C, "Smart Array", &SA5_access},
154 {0x3252103C, "Smart Array", &SA5_access},
155 {0x3253103C, "Smart Array", &SA5_access},
156 {0x3254103C, "Smart Array", &SA5_access},
157}; 133};
158 134
159/* How long to wait (in milliseconds) for board to go into simple mode */ 135/* How long to wait (in milliseconds) for board to go into simple mode */
@@ -221,7 +197,9 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev,
221 u64 *cfg_offset); 197 u64 *cfg_offset);
222static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, 198static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
223 unsigned long *memory_bar); 199 unsigned long *memory_bar);
224 200static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
201static __devinit int write_driver_ver_to_cfgtable(
202 CfgTable_struct __iomem *cfgtable);
225 203
226/* performant mode helper functions */ 204/* performant mode helper functions */
227static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, 205static void calc_bucket_map(int *bucket, int num_buckets, int nsgs,
@@ -259,16 +237,16 @@ static const struct block_device_operations cciss_fops = {
259 */ 237 */
260static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c) 238static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
261{ 239{
262 if (likely(h->transMethod == CFGTBL_Trans_Performant)) 240 if (likely(h->transMethod & CFGTBL_Trans_Performant))
263 c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); 241 c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
264} 242}
265 243
266/* 244/*
267 * Enqueuing and dequeuing functions for cmdlists. 245 * Enqueuing and dequeuing functions for cmdlists.
268 */ 246 */
269static inline void addQ(struct hlist_head *list, CommandList_struct *c) 247static inline void addQ(struct list_head *list, CommandList_struct *c)
270{ 248{
271 hlist_add_head(&c->list, list); 249 list_add_tail(&c->list, list);
272} 250}
273 251
274static inline void removeQ(CommandList_struct *c) 252static inline void removeQ(CommandList_struct *c)
@@ -281,12 +259,12 @@ static inline void removeQ(CommandList_struct *c)
281 * them off as 'stale' to prevent the driver from 259 * them off as 'stale' to prevent the driver from
282 * falling over. 260 * falling over.
283 */ 261 */
284 if (WARN_ON(hlist_unhashed(&c->list))) { 262 if (WARN_ON(list_empty(&c->list))) {
285 c->cmd_type = CMD_MSG_STALE; 263 c->cmd_type = CMD_MSG_STALE;
286 return; 264 return;
287 } 265 }
288 266
289 hlist_del_init(&c->list); 267 list_del_init(&c->list);
290} 268}
291 269
292static void enqueue_cmd_and_start_io(ctlr_info_t *h, 270static void enqueue_cmd_and_start_io(ctlr_info_t *h,
@@ -392,8 +370,6 @@ static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
392#define ENG_GIG_FACTOR (ENG_GIG/512) 370#define ENG_GIG_FACTOR (ENG_GIG/512)
393#define ENGAGE_SCSI "engage scsi" 371#define ENGAGE_SCSI "engage scsi"
394 372
395static struct proc_dir_entry *proc_cciss;
396
397static void cciss_seq_show_header(struct seq_file *seq) 373static void cciss_seq_show_header(struct seq_file *seq)
398{ 374{
399 ctlr_info_t *h = seq->private; 375 ctlr_info_t *h = seq->private;
@@ -586,6 +562,66 @@ static void __devinit cciss_procinit(ctlr_info_t *h)
586#define to_hba(n) container_of(n, struct ctlr_info, dev) 562#define to_hba(n) container_of(n, struct ctlr_info, dev)
587#define to_drv(n) container_of(n, drive_info_struct, dev) 563#define to_drv(n) container_of(n, drive_info_struct, dev)
588 564
565/* List of controllers which cannot be hard reset on kexec with reset_devices */
566static u32 unresettable_controller[] = {
567 0x324a103C, /* Smart Array P712m */
568 0x324b103C, /* SmartArray P711m */
569 0x3223103C, /* Smart Array P800 */
570 0x3234103C, /* Smart Array P400 */
571 0x3235103C, /* Smart Array P400i */
572 0x3211103C, /* Smart Array E200i */
573 0x3212103C, /* Smart Array E200 */
574 0x3213103C, /* Smart Array E200i */
575 0x3214103C, /* Smart Array E200i */
576 0x3215103C, /* Smart Array E200i */
577 0x3237103C, /* Smart Array E500 */
578 0x323D103C, /* Smart Array P700m */
579 0x409C0E11, /* Smart Array 6400 */
580 0x409D0E11, /* Smart Array 6400 EM */
581};
582
583/* List of controllers which cannot even be soft reset */
584static u32 soft_unresettable_controller[] = {
585 0x409C0E11, /* Smart Array 6400 */
586 0x409D0E11, /* Smart Array 6400 EM */
587};
588
589static int ctlr_is_hard_resettable(u32 board_id)
590{
591 int i;
592
593 for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
594 if (unresettable_controller[i] == board_id)
595 return 0;
596 return 1;
597}
598
599static int ctlr_is_soft_resettable(u32 board_id)
600{
601 int i;
602
603 for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++)
604 if (soft_unresettable_controller[i] == board_id)
605 return 0;
606 return 1;
607}
608
609static int ctlr_is_resettable(u32 board_id)
610{
611 return ctlr_is_hard_resettable(board_id) ||
612 ctlr_is_soft_resettable(board_id);
613}
614
615static ssize_t host_show_resettable(struct device *dev,
616 struct device_attribute *attr,
617 char *buf)
618{
619 struct ctlr_info *h = to_hba(dev);
620
621 return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id));
622}
623static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
624
589static ssize_t host_store_rescan(struct device *dev, 625static ssize_t host_store_rescan(struct device *dev,
590 struct device_attribute *attr, 626 struct device_attribute *attr,
591 const char *buf, size_t count) 627 const char *buf, size_t count)
@@ -771,6 +807,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
771 807
772static struct attribute *cciss_host_attrs[] = { 808static struct attribute *cciss_host_attrs[] = {
773 &dev_attr_rescan.attr, 809 &dev_attr_rescan.attr,
810 &dev_attr_resettable.attr,
774 NULL 811 NULL
775}; 812};
776 813
@@ -935,7 +972,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h)
935 972
936 c->cmdindex = i; 973 c->cmdindex = i;
937 974
938 INIT_HLIST_NODE(&c->list); 975 INIT_LIST_HEAD(&c->list);
939 c->busaddr = (__u32) cmd_dma_handle; 976 c->busaddr = (__u32) cmd_dma_handle;
940 temp64.val = (__u64) err_dma_handle; 977 temp64.val = (__u64) err_dma_handle;
941 c->ErrDesc.Addr.lower = temp64.val32.lower; 978 c->ErrDesc.Addr.lower = temp64.val32.lower;
@@ -974,7 +1011,7 @@ static CommandList_struct *cmd_special_alloc(ctlr_info_t *h)
974 } 1011 }
975 memset(c->err_info, 0, sizeof(ErrorInfo_struct)); 1012 memset(c->err_info, 0, sizeof(ErrorInfo_struct));
976 1013
977 INIT_HLIST_NODE(&c->list); 1014 INIT_LIST_HEAD(&c->list);
978 c->busaddr = (__u32) cmd_dma_handle; 1015 c->busaddr = (__u32) cmd_dma_handle;
979 temp64.val = (__u64) err_dma_handle; 1016 temp64.val = (__u64) err_dma_handle;
980 c->ErrDesc.Addr.lower = temp64.val32.lower; 1017 c->ErrDesc.Addr.lower = temp64.val32.lower;
@@ -1003,8 +1040,8 @@ static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c)
1003 temp64.val32.upper = c->ErrDesc.Addr.upper; 1040 temp64.val32.upper = c->ErrDesc.Addr.upper;
1004 pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct), 1041 pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
1005 c->err_info, (dma_addr_t) temp64.val); 1042 c->err_info, (dma_addr_t) temp64.val);
1006 pci_free_consistent(h->pdev, sizeof(CommandList_struct), 1043 pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
1007 c, (dma_addr_t) c->busaddr); 1044 (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
1008} 1045}
1009 1046
1010static inline ctlr_info_t *get_host(struct gendisk *disk) 1047static inline ctlr_info_t *get_host(struct gendisk *disk)
@@ -1059,9 +1096,9 @@ static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
1059{ 1096{
1060 int ret; 1097 int ret;
1061 1098
1062 lock_kernel(); 1099 mutex_lock(&cciss_mutex);
1063 ret = cciss_open(bdev, mode); 1100 ret = cciss_open(bdev, mode);
1064 unlock_kernel(); 1101 mutex_unlock(&cciss_mutex);
1065 1102
1066 return ret; 1103 return ret;
1067} 1104}
@@ -1074,13 +1111,13 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
1074 ctlr_info_t *h; 1111 ctlr_info_t *h;
1075 drive_info_struct *drv; 1112 drive_info_struct *drv;
1076 1113
1077 lock_kernel(); 1114 mutex_lock(&cciss_mutex);
1078 h = get_host(disk); 1115 h = get_host(disk);
1079 drv = get_drv(disk); 1116 drv = get_drv(disk);
1080 dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name); 1117 dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
1081 drv->usage_count--; 1118 drv->usage_count--;
1082 h->usage_count--; 1119 h->usage_count--;
1083 unlock_kernel(); 1120 mutex_unlock(&cciss_mutex);
1084 return 0; 1121 return 0;
1085} 1122}
1086 1123
@@ -1088,9 +1125,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
1088 unsigned cmd, unsigned long arg) 1125 unsigned cmd, unsigned long arg)
1089{ 1126{
1090 int ret; 1127 int ret;
1091 lock_kernel(); 1128 mutex_lock(&cciss_mutex);
1092 ret = cciss_ioctl(bdev, mode, cmd, arg); 1129 ret = cciss_ioctl(bdev, mode, cmd, arg);
1093 unlock_kernel(); 1130 mutex_unlock(&cciss_mutex);
1094 return ret; 1131 return ret;
1095} 1132}
1096 1133
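Editor's note: the three hunks above replace the big kernel lock (lock_kernel/unlock_kernel) with a driver-local mutex around the open, release and ioctl entry points. A minimal sketch of that pattern, assuming the mutex is defined once at file scope as the new calls imply (the wrapper below is illustrative, not the driver's exact code):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(cciss_mutex);

	/* Illustrative wrapper: serialize a legacy entry point with a
	 * file-scope mutex instead of the big kernel lock. */
	static int cciss_serialized_call(int (*fn)(void *), void *arg)
	{
		int ret;

		mutex_lock(&cciss_mutex);
		ret = fn(arg);
		mutex_unlock(&cciss_mutex);
		return ret;
	}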
@@ -1182,6 +1219,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
1182 int err; 1219 int err;
1183 u32 cp; 1220 u32 cp;
1184 1221
1222 memset(&arg64, 0, sizeof(arg64));
1185 err = 0; 1223 err = 0;
1186 err |= 1224 err |=
1187 copy_from_user(&arg64.LUN_info, &arg32->LUN_info, 1225 copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
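Editor's note: the added memset() zeroes the on-stack 64-bit argument structure before it is pieced together field by field from the 32-bit user copy, so padding and any fields not explicitly copied do not carry stale stack contents further down the call chain. A hedged sketch of the general compat conversion pattern (struct names and fields are illustrative, not the driver's):

	#include <string.h>

	/* Illustrative only: zero the native struct before selectively
	 * copying fields from a 32-bit userspace layout. */
	struct foo64 { unsigned long long buf; unsigned int len; unsigned int pad; };
	struct foo32 { unsigned int buf; unsigned int len; };

	static void foo_from_compat(struct foo64 *dst, const struct foo32 *src)
	{
		memset(dst, 0, sizeof(*dst));	/* no stale data in pad/unset fields */
		dst->buf = src->buf;
		dst->len = src->len;
	}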
@@ -1232,470 +1270,451 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
1232 c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) 1270 c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
1233 (void)check_for_unit_attention(h, c); 1271 (void)check_for_unit_attention(h, c);
1234} 1272}
1235/* 1273
1236 * ioctl 1274static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
1237 */
1238static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1239 unsigned int cmd, unsigned long arg)
1240{ 1275{
1241 struct gendisk *disk = bdev->bd_disk; 1276 cciss_pci_info_struct pciinfo;
1242 ctlr_info_t *h = get_host(disk);
1243 drive_info_struct *drv = get_drv(disk);
1244 void __user *argp = (void __user *)arg;
1245 1277
1246 dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", 1278 if (!argp)
1247 cmd, arg); 1279 return -EINVAL;
1248 switch (cmd) { 1280 pciinfo.domain = pci_domain_nr(h->pdev->bus);
1249 case CCISS_GETPCIINFO: 1281 pciinfo.bus = h->pdev->bus->number;
1250 { 1282 pciinfo.dev_fn = h->pdev->devfn;
1251 cciss_pci_info_struct pciinfo; 1283 pciinfo.board_id = h->board_id;
1252 1284 if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
1253 if (!arg) 1285 return -EFAULT;
1254 return -EINVAL; 1286 return 0;
1255 pciinfo.domain = pci_domain_nr(h->pdev->bus); 1287}
1256 pciinfo.bus = h->pdev->bus->number;
1257 pciinfo.dev_fn = h->pdev->devfn;
1258 pciinfo.board_id = h->board_id;
1259 if (copy_to_user
1260 (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
1261 return -EFAULT;
1262 return 0;
1263 }
1264 case CCISS_GETINTINFO:
1265 {
1266 cciss_coalint_struct intinfo;
1267 if (!arg)
1268 return -EINVAL;
1269 intinfo.delay =
1270 readl(&h->cfgtable->HostWrite.CoalIntDelay);
1271 intinfo.count =
1272 readl(&h->cfgtable->HostWrite.CoalIntCount);
1273 if (copy_to_user
1274 (argp, &intinfo, sizeof(cciss_coalint_struct)))
1275 return -EFAULT;
1276 return 0;
1277 }
1278 case CCISS_SETINTINFO:
1279 {
1280 cciss_coalint_struct intinfo;
1281 unsigned long flags;
1282 int i;
1283
1284 if (!arg)
1285 return -EINVAL;
1286 if (!capable(CAP_SYS_ADMIN))
1287 return -EPERM;
1288 if (copy_from_user
1289 (&intinfo, argp, sizeof(cciss_coalint_struct)))
1290 return -EFAULT;
1291 if ((intinfo.delay == 0) && (intinfo.count == 0))
1292 return -EINVAL;
1293 spin_lock_irqsave(&h->lock, flags);
1294 /* Update the field, and then ring the doorbell */
1295 writel(intinfo.delay,
1296 &(h->cfgtable->HostWrite.CoalIntDelay));
1297 writel(intinfo.count,
1298 &(h->cfgtable->HostWrite.CoalIntCount));
1299 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1300
1301 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1302 if (!(readl(h->vaddr + SA5_DOORBELL)
1303 & CFGTBL_ChangeReq))
1304 break;
1305 /* delay and try again */
1306 udelay(1000);
1307 }
1308 spin_unlock_irqrestore(&h->lock, flags);
1309 if (i >= MAX_IOCTL_CONFIG_WAIT)
1310 return -EAGAIN;
1311 return 0;
1312 }
1313 case CCISS_GETNODENAME:
1314 {
1315 NodeName_type NodeName;
1316 int i;
1317
1318 if (!arg)
1319 return -EINVAL;
1320 for (i = 0; i < 16; i++)
1321 NodeName[i] =
1322 readb(&h->cfgtable->ServerName[i]);
1323 if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
1324 return -EFAULT;
1325 return 0;
1326 }
1327 case CCISS_SETNODENAME:
1328 {
1329 NodeName_type NodeName;
1330 unsigned long flags;
1331 int i;
1332 1288
1333 if (!arg) 1289static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
1334 return -EINVAL; 1290{
1335 if (!capable(CAP_SYS_ADMIN)) 1291 cciss_coalint_struct intinfo;
1336 return -EPERM;
1337 1292
1338 if (copy_from_user 1293 if (!argp)
1339 (NodeName, argp, sizeof(NodeName_type))) 1294 return -EINVAL;
1340 return -EFAULT; 1295 intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
1296 intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
1297 if (copy_to_user
1298 (argp, &intinfo, sizeof(cciss_coalint_struct)))
1299 return -EFAULT;
1300 return 0;
1301}
1341 1302
1342 spin_lock_irqsave(&h->lock, flags); 1303static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
1304{
1305 cciss_coalint_struct intinfo;
1306 unsigned long flags;
1307 int i;
1343 1308
1344 /* Update the field, and then ring the doorbell */ 1309 if (!argp)
1345 for (i = 0; i < 16; i++) 1310 return -EINVAL;
1346 writeb(NodeName[i], 1311 if (!capable(CAP_SYS_ADMIN))
1347 &h->cfgtable->ServerName[i]); 1312 return -EPERM;
1313 if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
1314 return -EFAULT;
1315 if ((intinfo.delay == 0) && (intinfo.count == 0))
1316 return -EINVAL;
1317 spin_lock_irqsave(&h->lock, flags);
1318 /* Update the field, and then ring the doorbell */
1319 writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
1320 writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
1321 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1348 1322
1349 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); 1323 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1324 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
1325 break;
1326 udelay(1000); /* delay and try again */
1327 }
1328 spin_unlock_irqrestore(&h->lock, flags);
1329 if (i >= MAX_IOCTL_CONFIG_WAIT)
1330 return -EAGAIN;
1331 return 0;
1332}
1350 1333
1351 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { 1334static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
1352 if (!(readl(h->vaddr + SA5_DOORBELL) 1335{
1353 & CFGTBL_ChangeReq)) 1336 NodeName_type NodeName;
1354 break; 1337 int i;
1355 /* delay and try again */
1356 udelay(1000);
1357 }
1358 spin_unlock_irqrestore(&h->lock, flags);
1359 if (i >= MAX_IOCTL_CONFIG_WAIT)
1360 return -EAGAIN;
1361 return 0;
1362 }
1363 1338
1364 case CCISS_GETHEARTBEAT: 1339 if (!argp)
1365 { 1340 return -EINVAL;
1366 Heartbeat_type heartbeat; 1341 for (i = 0; i < 16; i++)
1367 1342 NodeName[i] = readb(&h->cfgtable->ServerName[i]);
1368 if (!arg) 1343 if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
1369 return -EINVAL; 1344 return -EFAULT;
1370 heartbeat = readl(&h->cfgtable->HeartBeat); 1345 return 0;
1371 if (copy_to_user 1346}
1372 (argp, &heartbeat, sizeof(Heartbeat_type)))
1373 return -EFAULT;
1374 return 0;
1375 }
1376 case CCISS_GETBUSTYPES:
1377 {
1378 BusTypes_type BusTypes;
1379
1380 if (!arg)
1381 return -EINVAL;
1382 BusTypes = readl(&h->cfgtable->BusTypes);
1383 if (copy_to_user
1384 (argp, &BusTypes, sizeof(BusTypes_type)))
1385 return -EFAULT;
1386 return 0;
1387 }
1388 case CCISS_GETFIRMVER:
1389 {
1390 FirmwareVer_type firmware;
1391 1347
1392 if (!arg) 1348static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
1393 return -EINVAL; 1349{
1394 memcpy(firmware, h->firm_ver, 4); 1350 NodeName_type NodeName;
1351 unsigned long flags;
1352 int i;
1395 1353
1396 if (copy_to_user 1354 if (!argp)
1397 (argp, firmware, sizeof(FirmwareVer_type))) 1355 return -EINVAL;
1398 return -EFAULT; 1356 if (!capable(CAP_SYS_ADMIN))
1399 return 0; 1357 return -EPERM;
1400 } 1358 if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
1401 case CCISS_GETDRIVVER: 1359 return -EFAULT;
1402 { 1360 spin_lock_irqsave(&h->lock, flags);
1403 DriverVer_type DriverVer = DRIVER_VERSION; 1361 /* Update the field, and then ring the doorbell */
1362 for (i = 0; i < 16; i++)
1363 writeb(NodeName[i], &h->cfgtable->ServerName[i]);
1364 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1365 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1366 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
1367 break;
1368 udelay(1000); /* delay and try again */
1369 }
1370 spin_unlock_irqrestore(&h->lock, flags);
1371 if (i >= MAX_IOCTL_CONFIG_WAIT)
1372 return -EAGAIN;
1373 return 0;
1374}
1404 1375
1405 if (!arg) 1376static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
1406 return -EINVAL; 1377{
1378 Heartbeat_type heartbeat;
1407 1379
1408 if (copy_to_user 1380 if (!argp)
1409 (argp, &DriverVer, sizeof(DriverVer_type))) 1381 return -EINVAL;
1410 return -EFAULT; 1382 heartbeat = readl(&h->cfgtable->HeartBeat);
1411 return 0; 1383 if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
1412 } 1384 return -EFAULT;
1385 return 0;
1386}
1413 1387
1414 case CCISS_DEREGDISK: 1388static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
1415 case CCISS_REGNEWD: 1389{
1416 case CCISS_REVALIDVOLS: 1390 BusTypes_type BusTypes;
1417 return rebuild_lun_table(h, 0, 1);
1418 1391
1419 case CCISS_GETLUNINFO:{ 1392 if (!argp)
1420 LogvolInfo_struct luninfo; 1393 return -EINVAL;
1394 BusTypes = readl(&h->cfgtable->BusTypes);
1395 if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
1396 return -EFAULT;
1397 return 0;
1398}
1421 1399
1422 memcpy(&luninfo.LunID, drv->LunID, 1400static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
1423 sizeof(luninfo.LunID)); 1401{
1424 luninfo.num_opens = drv->usage_count; 1402 FirmwareVer_type firmware;
1425 luninfo.num_parts = 0; 1403
1426 if (copy_to_user(argp, &luninfo, 1404 if (!argp)
1427 sizeof(LogvolInfo_struct))) 1405 return -EINVAL;
1428 return -EFAULT; 1406 memcpy(firmware, h->firm_ver, 4);
1429 return 0; 1407
1408 if (copy_to_user
1409 (argp, firmware, sizeof(FirmwareVer_type)))
1410 return -EFAULT;
1411 return 0;
1412}
1413
1414static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
1415{
1416 DriverVer_type DriverVer = DRIVER_VERSION;
1417
1418 if (!argp)
1419 return -EINVAL;
1420 if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
1421 return -EFAULT;
1422 return 0;
1423}
1424
1425static int cciss_getluninfo(ctlr_info_t *h,
1426 struct gendisk *disk, void __user *argp)
1427{
1428 LogvolInfo_struct luninfo;
1429 drive_info_struct *drv = get_drv(disk);
1430
1431 if (!argp)
1432 return -EINVAL;
1433 memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
1434 luninfo.num_opens = drv->usage_count;
1435 luninfo.num_parts = 0;
1436 if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
1437 return -EFAULT;
1438 return 0;
1439}
1440
1441static int cciss_passthru(ctlr_info_t *h, void __user *argp)
1442{
1443 IOCTL_Command_struct iocommand;
1444 CommandList_struct *c;
1445 char *buff = NULL;
1446 u64bit temp64;
1447 DECLARE_COMPLETION_ONSTACK(wait);
1448
1449 if (!argp)
1450 return -EINVAL;
1451
1452 if (!capable(CAP_SYS_RAWIO))
1453 return -EPERM;
1454
1455 if (copy_from_user
1456 (&iocommand, argp, sizeof(IOCTL_Command_struct)))
1457 return -EFAULT;
1458 if ((iocommand.buf_size < 1) &&
1459 (iocommand.Request.Type.Direction != XFER_NONE)) {
1460 return -EINVAL;
1461 }
1462 if (iocommand.buf_size > 0) {
1463 buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
1464 if (buff == NULL)
1465 return -EFAULT;
1466 }
1467 if (iocommand.Request.Type.Direction == XFER_WRITE) {
1468 /* Copy the data into the buffer we created */
1469 if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
1470 kfree(buff);
1471 return -EFAULT;
1430 } 1472 }
1431 case CCISS_PASSTHRU: 1473 } else {
1432 { 1474 memset(buff, 0, iocommand.buf_size);
1433 IOCTL_Command_struct iocommand; 1475 }
1434 CommandList_struct *c; 1476 c = cmd_special_alloc(h);
1435 char *buff = NULL; 1477 if (!c) {
1436 u64bit temp64; 1478 kfree(buff);
1437 DECLARE_COMPLETION_ONSTACK(wait); 1479 return -ENOMEM;
1438 1480 }
1439 if (!arg) 1481 /* Fill in the command type */
1440 return -EINVAL; 1482 c->cmd_type = CMD_IOCTL_PEND;
1441 1483 /* Fill in Command Header */
1442 if (!capable(CAP_SYS_RAWIO)) 1484 c->Header.ReplyQueue = 0; /* unused in simple mode */
1443 return -EPERM; 1485 if (iocommand.buf_size > 0) { /* buffer to fill */
1444 1486 c->Header.SGList = 1;
1445 if (copy_from_user 1487 c->Header.SGTotal = 1;
1446 (&iocommand, argp, sizeof(IOCTL_Command_struct))) 1488 } else { /* no buffers to fill */
1447 return -EFAULT; 1489 c->Header.SGList = 0;
1448 if ((iocommand.buf_size < 1) && 1490 c->Header.SGTotal = 0;
1449 (iocommand.Request.Type.Direction != XFER_NONE)) { 1491 }
1450 return -EINVAL; 1492 c->Header.LUN = iocommand.LUN_info;
1451 } 1493 /* use the kernel address the cmd block for tag */
1452#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */ 1494 c->Header.Tag.lower = c->busaddr;
1453 /* Check kmalloc limits */
1454 if (iocommand.buf_size > 128000)
1455 return -EINVAL;
1456#endif
1457 if (iocommand.buf_size > 0) {
1458 buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
1459 if (buff == NULL)
1460 return -EFAULT;
1461 }
1462 if (iocommand.Request.Type.Direction == XFER_WRITE) {
1463 /* Copy the data into the buffer we created */
1464 if (copy_from_user
1465 (buff, iocommand.buf, iocommand.buf_size)) {
1466 kfree(buff);
1467 return -EFAULT;
1468 }
1469 } else {
1470 memset(buff, 0, iocommand.buf_size);
1471 }
1472 c = cmd_special_alloc(h);
1473 if (!c) {
1474 kfree(buff);
1475 return -ENOMEM;
1476 }
1477 /* Fill in the command type */
1478 c->cmd_type = CMD_IOCTL_PEND;
1479 /* Fill in Command Header */
1480 c->Header.ReplyQueue = 0; /* unused in simple mode */
1481 if (iocommand.buf_size > 0) /* buffer to fill */
1482 {
1483 c->Header.SGList = 1;
1484 c->Header.SGTotal = 1;
1485 } else /* no buffers to fill */
1486 {
1487 c->Header.SGList = 0;
1488 c->Header.SGTotal = 0;
1489 }
1490 c->Header.LUN = iocommand.LUN_info;
1491 /* use the kernel address the cmd block for tag */
1492 c->Header.Tag.lower = c->busaddr;
1493
1494 /* Fill in Request block */
1495 c->Request = iocommand.Request;
1496
1497 /* Fill in the scatter gather information */
1498 if (iocommand.buf_size > 0) {
1499 temp64.val = pci_map_single(h->pdev, buff,
1500 iocommand.buf_size,
1501 PCI_DMA_BIDIRECTIONAL);
1502 c->SG[0].Addr.lower = temp64.val32.lower;
1503 c->SG[0].Addr.upper = temp64.val32.upper;
1504 c->SG[0].Len = iocommand.buf_size;
1505 c->SG[0].Ext = 0; /* we are not chaining */
1506 }
1507 c->waiting = &wait;
1508 1495
1509 enqueue_cmd_and_start_io(h, c); 1496 /* Fill in Request block */
1510 wait_for_completion(&wait); 1497 c->Request = iocommand.Request;
1511 1498
1512 /* unlock the buffers from DMA */ 1499 /* Fill in the scatter gather information */
1513 temp64.val32.lower = c->SG[0].Addr.lower; 1500 if (iocommand.buf_size > 0) {
1514 temp64.val32.upper = c->SG[0].Addr.upper; 1501 temp64.val = pci_map_single(h->pdev, buff,
1515 pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, 1502 iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
1516 iocommand.buf_size, 1503 c->SG[0].Addr.lower = temp64.val32.lower;
1517 PCI_DMA_BIDIRECTIONAL); 1504 c->SG[0].Addr.upper = temp64.val32.upper;
1505 c->SG[0].Len = iocommand.buf_size;
1506 c->SG[0].Ext = 0; /* we are not chaining */
1507 }
1508 c->waiting = &wait;
1518 1509
1519 check_ioctl_unit_attention(h, c); 1510 enqueue_cmd_and_start_io(h, c);
1511 wait_for_completion(&wait);
1520 1512
1521 /* Copy the error information out */ 1513 /* unlock the buffers from DMA */
1522 iocommand.error_info = *(c->err_info); 1514 temp64.val32.lower = c->SG[0].Addr.lower;
1523 if (copy_to_user 1515 temp64.val32.upper = c->SG[0].Addr.upper;
1524 (argp, &iocommand, sizeof(IOCTL_Command_struct))) { 1516 pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
1525 kfree(buff); 1517 PCI_DMA_BIDIRECTIONAL);
1526 cmd_special_free(h, c); 1518 check_ioctl_unit_attention(h, c);
1527 return -EFAULT; 1519
1528 } 1520 /* Copy the error information out */
1521 iocommand.error_info = *(c->err_info);
1522 if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
1523 kfree(buff);
1524 cmd_special_free(h, c);
1525 return -EFAULT;
1526 }
1529 1527
1530 if (iocommand.Request.Type.Direction == XFER_READ) { 1528 if (iocommand.Request.Type.Direction == XFER_READ) {
1531 /* Copy the data out of the buffer we created */ 1529 /* Copy the data out of the buffer we created */
1532 if (copy_to_user 1530 if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
1533 (iocommand.buf, buff, iocommand.buf_size)) {
1534 kfree(buff);
1535 cmd_special_free(h, c);
1536 return -EFAULT;
1537 }
1538 }
1539 kfree(buff); 1531 kfree(buff);
1540 cmd_special_free(h, c); 1532 cmd_special_free(h, c);
1541 return 0; 1533 return -EFAULT;
1542 } 1534 }
1543 case CCISS_BIG_PASSTHRU:{ 1535 }
1544 BIG_IOCTL_Command_struct *ioc; 1536 kfree(buff);
1545 CommandList_struct *c; 1537 cmd_special_free(h, c);
1546 unsigned char **buff = NULL; 1538 return 0;
1547 int *buff_size = NULL; 1539}
1548 u64bit temp64; 1540
1549 BYTE sg_used = 0; 1541static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
1550 int status = 0; 1542{
1551 int i; 1543 BIG_IOCTL_Command_struct *ioc;
1552 DECLARE_COMPLETION_ONSTACK(wait); 1544 CommandList_struct *c;
1553 __u32 left; 1545 unsigned char **buff = NULL;
1554 __u32 sz; 1546 int *buff_size = NULL;
1555 BYTE __user *data_ptr; 1547 u64bit temp64;
1556 1548 BYTE sg_used = 0;
1557 if (!arg) 1549 int status = 0;
1558 return -EINVAL; 1550 int i;
1559 if (!capable(CAP_SYS_RAWIO)) 1551 DECLARE_COMPLETION_ONSTACK(wait);
1560 return -EPERM; 1552 __u32 left;
1561 ioc = (BIG_IOCTL_Command_struct *) 1553 __u32 sz;
1562 kmalloc(sizeof(*ioc), GFP_KERNEL); 1554 BYTE __user *data_ptr;
1563 if (!ioc) { 1555
1564 status = -ENOMEM; 1556 if (!argp)
1565 goto cleanup1; 1557 return -EINVAL;
1566 } 1558 if (!capable(CAP_SYS_RAWIO))
1567 if (copy_from_user(ioc, argp, sizeof(*ioc))) { 1559 return -EPERM;
1560 ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
1561 if (!ioc) {
1562 status = -ENOMEM;
1563 goto cleanup1;
1564 }
1565 if (copy_from_user(ioc, argp, sizeof(*ioc))) {
1566 status = -EFAULT;
1567 goto cleanup1;
1568 }
1569 if ((ioc->buf_size < 1) &&
1570 (ioc->Request.Type.Direction != XFER_NONE)) {
1571 status = -EINVAL;
1572 goto cleanup1;
1573 }
1574 /* Check kmalloc limits using all SGs */
1575 if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
1576 status = -EINVAL;
1577 goto cleanup1;
1578 }
1579 if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
1580 status = -EINVAL;
1581 goto cleanup1;
1582 }
1583 buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
1584 if (!buff) {
1585 status = -ENOMEM;
1586 goto cleanup1;
1587 }
1588 buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
1589 if (!buff_size) {
1590 status = -ENOMEM;
1591 goto cleanup1;
1592 }
1593 left = ioc->buf_size;
1594 data_ptr = ioc->buf;
1595 while (left) {
1596 sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
1597 buff_size[sg_used] = sz;
1598 buff[sg_used] = kmalloc(sz, GFP_KERNEL);
1599 if (buff[sg_used] == NULL) {
1600 status = -ENOMEM;
1601 goto cleanup1;
1602 }
1603 if (ioc->Request.Type.Direction == XFER_WRITE) {
1604 if (copy_from_user(buff[sg_used], data_ptr, sz)) {
1568 status = -EFAULT; 1605 status = -EFAULT;
1569 goto cleanup1; 1606 goto cleanup1;
1570 } 1607 }
1571 if ((ioc->buf_size < 1) && 1608 } else {
1572 (ioc->Request.Type.Direction != XFER_NONE)) { 1609 memset(buff[sg_used], 0, sz);
1573 status = -EINVAL; 1610 }
1574 goto cleanup1; 1611 left -= sz;
1575 } 1612 data_ptr += sz;
1576 /* Check kmalloc limits using all SGs */ 1613 sg_used++;
1577 if (ioc->malloc_size > MAX_KMALLOC_SIZE) { 1614 }
1578 status = -EINVAL; 1615 c = cmd_special_alloc(h);
1579 goto cleanup1; 1616 if (!c) {
1580 } 1617 status = -ENOMEM;
1581 if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { 1618 goto cleanup1;
1582 status = -EINVAL; 1619 }
1583 goto cleanup1; 1620 c->cmd_type = CMD_IOCTL_PEND;
1584 } 1621 c->Header.ReplyQueue = 0;
1585 buff = 1622 c->Header.SGList = sg_used;
1586 kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); 1623 c->Header.SGTotal = sg_used;
1587 if (!buff) { 1624 c->Header.LUN = ioc->LUN_info;
1588 status = -ENOMEM; 1625 c->Header.Tag.lower = c->busaddr;
1589 goto cleanup1;
1590 }
1591 buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
1592 GFP_KERNEL);
1593 if (!buff_size) {
1594 status = -ENOMEM;
1595 goto cleanup1;
1596 }
1597 left = ioc->buf_size;
1598 data_ptr = ioc->buf;
1599 while (left) {
1600 sz = (left >
1601 ioc->malloc_size) ? ioc->
1602 malloc_size : left;
1603 buff_size[sg_used] = sz;
1604 buff[sg_used] = kmalloc(sz, GFP_KERNEL);
1605 if (buff[sg_used] == NULL) {
1606 status = -ENOMEM;
1607 goto cleanup1;
1608 }
1609 if (ioc->Request.Type.Direction == XFER_WRITE) {
1610 if (copy_from_user
1611 (buff[sg_used], data_ptr, sz)) {
1612 status = -EFAULT;
1613 goto cleanup1;
1614 }
1615 } else {
1616 memset(buff[sg_used], 0, sz);
1617 }
1618 left -= sz;
1619 data_ptr += sz;
1620 sg_used++;
1621 }
1622 c = cmd_special_alloc(h);
1623 if (!c) {
1624 status = -ENOMEM;
1625 goto cleanup1;
1626 }
1627 c->cmd_type = CMD_IOCTL_PEND;
1628 c->Header.ReplyQueue = 0;
1629 1626
1630 if (ioc->buf_size > 0) { 1627 c->Request = ioc->Request;
1631 c->Header.SGList = sg_used; 1628 for (i = 0; i < sg_used; i++) {
1632 c->Header.SGTotal = sg_used; 1629 temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
1633 } else { 1630 PCI_DMA_BIDIRECTIONAL);
1634 c->Header.SGList = 0; 1631 c->SG[i].Addr.lower = temp64.val32.lower;
1635 c->Header.SGTotal = 0; 1632 c->SG[i].Addr.upper = temp64.val32.upper;
1636 } 1633 c->SG[i].Len = buff_size[i];
1637 c->Header.LUN = ioc->LUN_info; 1634 c->SG[i].Ext = 0; /* we are not chaining */
1638 c->Header.Tag.lower = c->busaddr; 1635 }
1639 1636 c->waiting = &wait;
1640 c->Request = ioc->Request; 1637 enqueue_cmd_and_start_io(h, c);
1641 if (ioc->buf_size > 0) { 1638 wait_for_completion(&wait);
1642 for (i = 0; i < sg_used; i++) { 1639 /* unlock the buffers from DMA */
1643 temp64.val = 1640 for (i = 0; i < sg_used; i++) {
1644 pci_map_single(h->pdev, buff[i], 1641 temp64.val32.lower = c->SG[i].Addr.lower;
1645 buff_size[i], 1642 temp64.val32.upper = c->SG[i].Addr.upper;
1646 PCI_DMA_BIDIRECTIONAL); 1643 pci_unmap_single(h->pdev,
1647 c->SG[i].Addr.lower = 1644 (dma_addr_t) temp64.val, buff_size[i],
1648 temp64.val32.lower; 1645 PCI_DMA_BIDIRECTIONAL);
1649 c->SG[i].Addr.upper = 1646 }
1650 temp64.val32.upper; 1647 check_ioctl_unit_attention(h, c);
1651 c->SG[i].Len = buff_size[i]; 1648 /* Copy the error information out */
1652 c->SG[i].Ext = 0; /* we are not chaining */ 1649 ioc->error_info = *(c->err_info);
1653 } 1650 if (copy_to_user(argp, ioc, sizeof(*ioc))) {
1654 } 1651 cmd_special_free(h, c);
1655 c->waiting = &wait; 1652 status = -EFAULT;
1656 enqueue_cmd_and_start_io(h, c); 1653 goto cleanup1;
1657 wait_for_completion(&wait); 1654 }
1658 /* unlock the buffers from DMA */ 1655 if (ioc->Request.Type.Direction == XFER_READ) {
1659 for (i = 0; i < sg_used; i++) { 1656 /* Copy the data out of the buffer we created */
1660 temp64.val32.lower = c->SG[i].Addr.lower; 1657 BYTE __user *ptr = ioc->buf;
1661 temp64.val32.upper = c->SG[i].Addr.upper; 1658 for (i = 0; i < sg_used; i++) {
1662 pci_unmap_single(h->pdev, 1659 if (copy_to_user(ptr, buff[i], buff_size[i])) {
1663 (dma_addr_t) temp64.val, buff_size[i],
1664 PCI_DMA_BIDIRECTIONAL);
1665 }
1666 check_ioctl_unit_attention(h, c);
1667 /* Copy the error information out */
1668 ioc->error_info = *(c->err_info);
1669 if (copy_to_user(argp, ioc, sizeof(*ioc))) {
1670 cmd_special_free(h, c); 1660 cmd_special_free(h, c);
1671 status = -EFAULT; 1661 status = -EFAULT;
1672 goto cleanup1; 1662 goto cleanup1;
1673 } 1663 }
1674 if (ioc->Request.Type.Direction == XFER_READ) { 1664 ptr += buff_size[i];
1675 /* Copy the data out of the buffer we created */
1676 BYTE __user *ptr = ioc->buf;
1677 for (i = 0; i < sg_used; i++) {
1678 if (copy_to_user
1679 (ptr, buff[i], buff_size[i])) {
1680 cmd_special_free(h, c);
1681 status = -EFAULT;
1682 goto cleanup1;
1683 }
1684 ptr += buff_size[i];
1685 }
1686 }
1687 cmd_special_free(h, c);
1688 status = 0;
1689 cleanup1:
1690 if (buff) {
1691 for (i = 0; i < sg_used; i++)
1692 kfree(buff[i]);
1693 kfree(buff);
1694 }
1695 kfree(buff_size);
1696 kfree(ioc);
1697 return status;
1698 } 1665 }
1666 }
1667 cmd_special_free(h, c);
1668 status = 0;
1669cleanup1:
1670 if (buff) {
1671 for (i = 0; i < sg_used; i++)
1672 kfree(buff[i]);
1673 kfree(buff);
1674 }
1675 kfree(buff_size);
1676 kfree(ioc);
1677 return status;
1678}
1679
1680static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1681 unsigned int cmd, unsigned long arg)
1682{
1683 struct gendisk *disk = bdev->bd_disk;
1684 ctlr_info_t *h = get_host(disk);
1685 void __user *argp = (void __user *)arg;
1686
1687 dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
1688 cmd, arg);
1689 switch (cmd) {
1690 case CCISS_GETPCIINFO:
1691 return cciss_getpciinfo(h, argp);
1692 case CCISS_GETINTINFO:
1693 return cciss_getintinfo(h, argp);
1694 case CCISS_SETINTINFO:
1695 return cciss_setintinfo(h, argp);
1696 case CCISS_GETNODENAME:
1697 return cciss_getnodename(h, argp);
1698 case CCISS_SETNODENAME:
1699 return cciss_setnodename(h, argp);
1700 case CCISS_GETHEARTBEAT:
1701 return cciss_getheartbeat(h, argp);
1702 case CCISS_GETBUSTYPES:
1703 return cciss_getbustypes(h, argp);
1704 case CCISS_GETFIRMVER:
1705 return cciss_getfirmver(h, argp);
1706 case CCISS_GETDRIVVER:
1707 return cciss_getdrivver(h, argp);
1708 case CCISS_DEREGDISK:
1709 case CCISS_REGNEWD:
1710 case CCISS_REVALIDVOLS:
1711 return rebuild_lun_table(h, 0, 1);
1712 case CCISS_GETLUNINFO:
1713 return cciss_getluninfo(h, disk, argp);
1714 case CCISS_PASSTHRU:
1715 return cciss_passthru(h, argp);
1716 case CCISS_BIG_PASSTHRU:
1717 return cciss_bigpassthru(h, argp);
1699 1718
1700 /* scsi_cmd_ioctl handles these, below, though some are not */ 1719 /* scsi_cmd_ioctl handles these, below, though some are not */
1701 /* very meaningful for cciss. SG_IO is the main one people want. */ 1720 /* very meaningful for cciss. SG_IO is the main one people want. */
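Editor's note: most of the hunk above is mechanical restructuring; the single large switch in cciss_ioctl() is split into one helper per ioctl command, and the switch now only dispatches. A generic, self-contained sketch of that shape (command numbers and helper names are made up for illustration):

	/* Hedged sketch of the "one helper per command" ioctl dispatch style. */
	#include <stdio.h>

	static int do_get_info(void *argp) { (void)argp; printf("get info\n"); return 0; }
	static int do_set_info(void *argp) { (void)argp; printf("set info\n"); return 0; }

	static int demo_ioctl(unsigned int cmd, void *argp)
	{
		switch (cmd) {
		case 1: return do_get_info(argp);
		case 2: return do_set_info(argp);
		default: return -1;	/* unknown command */
		}
	}

	int main(void)
	{
		return demo_ioctl(1, NULL);
	}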
@@ -2576,7 +2595,7 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2576 } 2595 }
2577 } else if (cmd_type == TYPE_MSG) { 2596 } else if (cmd_type == TYPE_MSG) {
2578 switch (cmd) { 2597 switch (cmd) {
2579 case 0: /* ABORT message */ 2598 case CCISS_ABORT_MSG:
2580 c->Request.CDBLen = 12; 2599 c->Request.CDBLen = 12;
2581 c->Request.Type.Attribute = ATTR_SIMPLE; 2600 c->Request.Type.Attribute = ATTR_SIMPLE;
2582 c->Request.Type.Direction = XFER_WRITE; 2601 c->Request.Type.Direction = XFER_WRITE;
@@ -2586,16 +2605,16 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2586 /* buff contains the tag of the command to abort */ 2605 /* buff contains the tag of the command to abort */
2587 memcpy(&c->Request.CDB[4], buff, 8); 2606 memcpy(&c->Request.CDB[4], buff, 8);
2588 break; 2607 break;
2589 case 1: /* RESET message */ 2608 case CCISS_RESET_MSG:
2590 c->Request.CDBLen = 16; 2609 c->Request.CDBLen = 16;
2591 c->Request.Type.Attribute = ATTR_SIMPLE; 2610 c->Request.Type.Attribute = ATTR_SIMPLE;
2592 c->Request.Type.Direction = XFER_NONE; 2611 c->Request.Type.Direction = XFER_NONE;
2593 c->Request.Timeout = 0; 2612 c->Request.Timeout = 0;
2594 memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); 2613 memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
2595 c->Request.CDB[0] = cmd; /* reset */ 2614 c->Request.CDB[0] = cmd; /* reset */
2596 c->Request.CDB[1] = 0x03; /* reset a target */ 2615 c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET;
2597 break; 2616 break;
2598 case 3: /* No-Op message */ 2617 case CCISS_NOOP_MSG:
2599 c->Request.CDBLen = 1; 2618 c->Request.CDBLen = 1;
2600 c->Request.Type.Attribute = ATTR_SIMPLE; 2619 c->Request.Type.Attribute = ATTR_SIMPLE;
2601 c->Request.Type.Direction = XFER_WRITE; 2620 c->Request.Type.Direction = XFER_WRITE;
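Editor's note: the magic CDB message opcodes 0, 1 and 3 are replaced here with named constants, and the literal reset-type byte with CCISS_RESET_TYPE_TARGET. The values below are an assumption chosen to match the literals they replace; the authoritative definitions live in the driver's headers, not in this hunk:

	/* Assumed values, mirroring the removed literals; for illustration only. */
	#define CCISS_ABORT_MSG		0x00
	#define CCISS_RESET_MSG		0x01
	#define CCISS_NOOP_MSG		0x03
	#define CCISS_RESET_TYPE_TARGET	0x03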
@@ -2624,6 +2643,31 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2624 return status; 2643 return status;
2625} 2644}
2626 2645
2646static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr,
2647 u8 reset_type)
2648{
2649 CommandList_struct *c;
2650 int return_status;
2651
2652 c = cmd_alloc(h);
2653 if (!c)
2654 return -ENOMEM;
2655 return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0,
2656 CTLR_LUNID, TYPE_MSG);
2657 c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */
2658 if (return_status != IO_OK) {
2659 cmd_special_free(h, c);
2660 return return_status;
2661 }
2662 c->waiting = NULL;
2663 enqueue_cmd_and_start_io(h, c);
2664 /* Don't wait for completion, the reset won't complete. Don't free
2665 * the command either. This is the last command we will send before
2666 * re-initializing everything, so it doesn't matter and won't leak.
2667 */
2668 return 0;
2669}
2670
2627static int check_target_status(ctlr_info_t *h, CommandList_struct *c) 2671static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
2628{ 2672{
2629 switch (c->err_info->ScsiStatus) { 2673 switch (c->err_info->ScsiStatus) {
@@ -2700,6 +2744,10 @@ static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
2700 c->Request.CDB[0]); 2744 c->Request.CDB[0]);
2701 return_status = IO_NEEDS_RETRY; 2745 return_status = IO_NEEDS_RETRY;
2702 break; 2746 break;
2747 case CMD_UNABORTABLE:
2748 dev_warn(&h->pdev->dev, "cmd unabortable\n");
2749 return_status = IO_ERROR;
2750 break;
2703 default: 2751 default:
2704 dev_warn(&h->pdev->dev, "cmd 0x%02x returned " 2752 dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
2705 "unknown status %x\n", c->Request.CDB[0], 2753 "unknown status %x\n", c->Request.CDB[0],
@@ -2880,7 +2928,9 @@ static int cciss_revalidate(struct gendisk *disk)
2880 sector_t total_size; 2928 sector_t total_size;
2881 InquiryData_struct *inq_buff = NULL; 2929 InquiryData_struct *inq_buff = NULL;
2882 2930
2883 for (logvol = 0; logvol < CISS_MAX_LUN; logvol++) { 2931 for (logvol = 0; logvol <= h->highest_lun; logvol++) {
2932 if (!h->drv[logvol])
2933 continue;
2884 if (memcmp(h->drv[logvol]->LunID, drv->LunID, 2934 if (memcmp(h->drv[logvol]->LunID, drv->LunID,
2885 sizeof(drv->LunID)) == 0) { 2935 sizeof(drv->LunID)) == 0) {
2886 FOUND = 1; 2936 FOUND = 1;
@@ -2933,8 +2983,8 @@ static void start_io(ctlr_info_t *h)
2933{ 2983{
2934 CommandList_struct *c; 2984 CommandList_struct *c;
2935 2985
2936 while (!hlist_empty(&h->reqQ)) { 2986 while (!list_empty(&h->reqQ)) {
2937 c = hlist_entry(h->reqQ.first, CommandList_struct, list); 2987 c = list_entry(h->reqQ.next, CommandList_struct, list);
2938 /* can't do anything if fifo is full */ 2988 /* can't do anything if fifo is full */
2939 if ((h->access.fifo_full(h))) { 2989 if ((h->access.fifo_full(h))) {
2940 dev_warn(&h->pdev->dev, "fifo full\n"); 2990 dev_warn(&h->pdev->dev, "fifo full\n");
@@ -3148,6 +3198,13 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
3148 (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 3198 (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
3149 DID_PASSTHROUGH : DID_ERROR); 3199 DID_PASSTHROUGH : DID_ERROR);
3150 break; 3200 break;
3201 case CMD_UNABORTABLE:
3202 dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
3203 rq->errors = make_status_bytes(SAM_STAT_GOOD,
3204 cmd->err_info->CommandStatus, DRIVER_OK,
3205 cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
3206 DID_PASSTHROUGH : DID_ERROR);
3207 break;
3151 default: 3208 default:
3152 dev_warn(&h->pdev->dev, "cmd %p returned " 3209 dev_warn(&h->pdev->dev, "cmd %p returned "
3153 "unknown status %x\n", cmd, 3210 "unknown status %x\n", cmd,
@@ -3181,10 +3238,13 @@ static inline u32 cciss_tag_to_index(u32 tag)
3181 return tag >> DIRECT_LOOKUP_SHIFT; 3238 return tag >> DIRECT_LOOKUP_SHIFT;
3182} 3239}
3183 3240
3184static inline u32 cciss_tag_discard_error_bits(u32 tag) 3241static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
3185{ 3242{
3186#define CCISS_ERROR_BITS 0x03 3243#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)
3187 return tag & ~CCISS_ERROR_BITS; 3244#define CCISS_SIMPLE_ERROR_BITS 0x03
3245 if (likely(h->transMethod & CFGTBL_Trans_Performant))
3246 return tag & ~CCISS_PERF_ERROR_BITS;
3247 return tag & ~CCISS_SIMPLE_ERROR_BITS;
3188} 3248}
3189 3249
3190static inline void cciss_mark_tag_indexed(u32 *tag) 3250static inline void cciss_mark_tag_indexed(u32 *tag)
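Editor's note: in performant mode the low DIRECT_LOOKUP_SHIFT bits of a completed tag carry index/status information rather than address bits, so the mask used to recover the command's bus address now depends on the transport method. A standalone sketch of the arithmetic; the shift value of 4 is an assumption made only for this example:

	#include <stdio.h>
	#include <stdint.h>

	#define DIRECT_LOOKUP_SHIFT 4	/* assumed value, for illustration */

	static uint32_t discard_error_bits(int performant, uint32_t tag)
	{
		uint32_t perf_mask   = (1u << DIRECT_LOOKUP_SHIFT) - 1;	/* 0x0f */
		uint32_t simple_mask = 0x03;

		return performant ? (tag & ~perf_mask) : (tag & ~simple_mask);
	}

	int main(void)
	{
		uint32_t raw = 0x1234567f;

		printf("performant: 0x%08x\n", discard_error_bits(1, raw)); /* 0x12345670 */
		printf("simple:     0x%08x\n", discard_error_bits(0, raw)); /* 0x1234567c */
		return 0;
	}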
@@ -3215,12 +3275,6 @@ static void do_cciss_request(struct request_queue *q)
3215 int sg_index = 0; 3275 int sg_index = 0;
3216 int chained = 0; 3276 int chained = 0;
3217 3277
3218 /* We call start_io here in case there is a command waiting on the
3219 * queue that has not been sent.
3220 */
3221 if (blk_queue_plugged(q))
3222 goto startio;
3223
3224 queue: 3278 queue:
3225 creq = blk_peek_request(q); 3279 creq = blk_peek_request(q);
3226 if (!creq) 3280 if (!creq)
@@ -3410,7 +3464,7 @@ static inline u32 next_command(ctlr_info_t *h)
3410{ 3464{
3411 u32 a; 3465 u32 a;
3412 3466
3413 if (unlikely(h->transMethod != CFGTBL_Trans_Performant)) 3467 if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
3414 return h->access.command_completed(h); 3468 return h->access.command_completed(h);
3415 3469
3416 if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) { 3470 if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
@@ -3445,15 +3499,12 @@ static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag)
3445/* process completion of a non-indexed command */ 3499/* process completion of a non-indexed command */
3446static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) 3500static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
3447{ 3501{
3448 u32 tag;
3449 CommandList_struct *c = NULL; 3502 CommandList_struct *c = NULL;
3450 struct hlist_node *tmp;
3451 __u32 busaddr_masked, tag_masked; 3503 __u32 busaddr_masked, tag_masked;
3452 3504
3453 tag = cciss_tag_discard_error_bits(raw_tag); 3505 tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
3454 hlist_for_each_entry(c, tmp, &h->cmpQ, list) { 3506 list_for_each_entry(c, &h->cmpQ, list) {
3455 busaddr_masked = cciss_tag_discard_error_bits(c->busaddr); 3507 busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
3456 tag_masked = cciss_tag_discard_error_bits(tag);
3457 if (busaddr_masked == tag_masked) { 3508 if (busaddr_masked == tag_masked) {
3458 finish_cmd(h, c, raw_tag); 3509 finish_cmd(h, c, raw_tag);
3459 return next_command(h); 3510 return next_command(h);
@@ -3463,6 +3514,63 @@ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
3463 return next_command(h); 3514 return next_command(h);
3464} 3515}
3465 3516
3517/* Some controllers, like p400, will give us one interrupt
3518 * after a soft reset, even if we turned interrupts off.
3519 * Only need to check for this in the cciss_xxx_discard_completions
3520 * functions.
3521 */
3522static int ignore_bogus_interrupt(ctlr_info_t *h)
3523{
3524 if (likely(!reset_devices))
3525 return 0;
3526
3527 if (likely(h->interrupts_enabled))
3528 return 0;
3529
3530 dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled "
3531 "(known firmware bug.) Ignoring.\n");
3532
3533 return 1;
3534}
3535
3536static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id)
3537{
3538 ctlr_info_t *h = dev_id;
3539 unsigned long flags;
3540 u32 raw_tag;
3541
3542 if (ignore_bogus_interrupt(h))
3543 return IRQ_NONE;
3544
3545 if (interrupt_not_for_us(h))
3546 return IRQ_NONE;
3547 spin_lock_irqsave(&h->lock, flags);
3548 while (interrupt_pending(h)) {
3549 raw_tag = get_next_completion(h);
3550 while (raw_tag != FIFO_EMPTY)
3551 raw_tag = next_command(h);
3552 }
3553 spin_unlock_irqrestore(&h->lock, flags);
3554 return IRQ_HANDLED;
3555}
3556
3557static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id)
3558{
3559 ctlr_info_t *h = dev_id;
3560 unsigned long flags;
3561 u32 raw_tag;
3562
3563 if (ignore_bogus_interrupt(h))
3564 return IRQ_NONE;
3565
3566 spin_lock_irqsave(&h->lock, flags);
3567 raw_tag = get_next_completion(h);
3568 while (raw_tag != FIFO_EMPTY)
3569 raw_tag = next_command(h);
3570 spin_unlock_irqrestore(&h->lock, flags);
3571 return IRQ_HANDLED;
3572}
3573
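Editor's note: the two *_discard_completions handlers above exist to drain the single spurious interrupt some controllers raise right after a soft reset; they are meant to be installed only on the reset_devices (kdump) path and swapped for the normal handlers once the controller is re-initialized. A hedged sketch of how such a handler could be wired up; the IRQ number and flags are placeholders, and the driver's actual registration code is not part of this hunk:

	#include <linux/interrupt.h>

	/* Illustrative only: during a kdump-style soft reset, point the IRQ at
	 * the "discard" handler so stale completions are thrown away, then
	 * re-request the normal handler once the controller is back up.
	 */
	static int wire_up_discard_handler(ctlr_info_t *h, unsigned int irq,
					   bool use_msix)
	{
		irq_handler_t fn = use_msix ? cciss_msix_discard_completions
					    : cciss_intx_discard_completions;

		return request_irq(irq, fn, use_msix ? 0 : IRQF_SHARED,
				   "cciss-discard", h);
	}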
3466static irqreturn_t do_cciss_intx(int irq, void *dev_id) 3574static irqreturn_t do_cciss_intx(int irq, void *dev_id)
3467{ 3575{
3468 ctlr_info_t *h = dev_id; 3576 ctlr_info_t *h = dev_id;
@@ -3801,11 +3909,12 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
3801 for (i = 0; i < MAX_CONFIG_WAIT; i++) { 3909 for (i = 0; i < MAX_CONFIG_WAIT; i++) {
3802 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) 3910 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
3803 break; 3911 break;
3804 msleep(10); 3912 usleep_range(10000, 20000);
3805 } 3913 }
3806} 3914}
3807 3915
3808static __devinit void cciss_enter_performant_mode(ctlr_info_t *h) 3916static __devinit void cciss_enter_performant_mode(ctlr_info_t *h,
3917 u32 use_short_tags)
3809{ 3918{
3810 /* This is a bit complicated. There are 8 registers on 3919 /* This is a bit complicated. There are 8 registers on
3811 * the controller which we write to to tell it 8 different 3920 * the controller which we write to to tell it 8 different
@@ -3860,7 +3969,7 @@ static __devinit void cciss_enter_performant_mode(ctlr_info_t *h)
3860 writel(0, &h->transtable->RepQCtrAddrHigh32); 3969 writel(0, &h->transtable->RepQCtrAddrHigh32);
3861 writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32); 3970 writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
3862 writel(0, &h->transtable->RepQAddr0High32); 3971 writel(0, &h->transtable->RepQAddr0High32);
3863 writel(CFGTBL_Trans_Performant, 3972 writel(CFGTBL_Trans_Performant | use_short_tags,
3864 &(h->cfgtable->HostWrite.TransportRequest)); 3973 &(h->cfgtable->HostWrite.TransportRequest));
3865 3974
3866 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); 3975 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
@@ -3907,7 +4016,8 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h)
3907 if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL)) 4016 if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
3908 goto clean_up; 4017 goto clean_up;
3909 4018
3910 cciss_enter_performant_mode(h); 4019 cciss_enter_performant_mode(h,
4020 trans_support & CFGTBL_Trans_use_short_tags);
3911 4021
3912 /* Change the access methods to the performant access methods */ 4022 /* Change the access methods to the performant access methods */
3913 h->access = SA5_performant_access; 4023 h->access = SA5_performant_access;
@@ -3985,13 +4095,9 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
3985 *board_id = ((subsystem_device_id << 16) & 0xffff0000) | 4095 *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
3986 subsystem_vendor_id; 4096 subsystem_vendor_id;
3987 4097
3988 for (i = 0; i < ARRAY_SIZE(products); i++) { 4098 for (i = 0; i < ARRAY_SIZE(products); i++)
3989 /* Stand aside for hpsa driver on request */
3990 if (cciss_allow_hpsa && products[i].board_id == HPSA_BOUNDARY)
3991 return -ENODEV;
3992 if (*board_id == products[i].board_id) 4099 if (*board_id == products[i].board_id)
3993 return i; 4100 return i;
3994 }
3995 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", 4101 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
3996 *board_id); 4102 *board_id);
3997 return -ENODEV; 4103 return -ENODEV;
@@ -4022,18 +4128,31 @@ static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
4022 return -ENODEV; 4128 return -ENODEV;
4023} 4129}
4024 4130
4025static int __devinit cciss_wait_for_board_ready(ctlr_info_t *h) 4131static int __devinit cciss_wait_for_board_state(struct pci_dev *pdev,
4132 void __iomem *vaddr, int wait_for_ready)
4133#define BOARD_READY 1
4134#define BOARD_NOT_READY 0
4026{ 4135{
4027 int i; 4136 int i, iterations;
4028 u32 scratchpad; 4137 u32 scratchpad;
4029 4138
4030 for (i = 0; i < CCISS_BOARD_READY_ITERATIONS; i++) { 4139 if (wait_for_ready)
4031 scratchpad = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); 4140 iterations = CCISS_BOARD_READY_ITERATIONS;
4032 if (scratchpad == CCISS_FIRMWARE_READY) 4141 else
4033 return 0; 4142 iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
4143
4144 for (i = 0; i < iterations; i++) {
4145 scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
4146 if (wait_for_ready) {
4147 if (scratchpad == CCISS_FIRMWARE_READY)
4148 return 0;
4149 } else {
4150 if (scratchpad != CCISS_FIRMWARE_READY)
4151 return 0;
4152 }
4034 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); 4153 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
4035 } 4154 }
4036 dev_warn(&h->pdev->dev, "board not ready, timed out.\n"); 4155 dev_warn(&pdev->dev, "board not ready, timed out.\n");
4037 return -ENODEV; 4156 return -ENODEV;
4038} 4157}
4039 4158
@@ -4069,6 +4188,9 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
4069 cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); 4188 cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable));
4070 if (!h->cfgtable) 4189 if (!h->cfgtable)
4071 return -ENOMEM; 4190 return -ENOMEM;
4191 rc = write_driver_ver_to_cfgtable(h->cfgtable);
4192 if (rc)
4193 return rc;
4072 /* Find performant mode table. */ 4194 /* Find performant mode table. */
4073 trans_offset = readl(&h->cfgtable->TransMethodOffset); 4195 trans_offset = readl(&h->cfgtable->TransMethodOffset);
4074 h->transtable = remap_pci_mem(pci_resource_start(h->pdev, 4196 h->transtable = remap_pci_mem(pci_resource_start(h->pdev,
@@ -4082,6 +4204,11 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
4082static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) 4204static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
4083{ 4205{
4084 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands)); 4206 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
4207
4208 /* Limit commands in memory limited kdump scenario. */
4209 if (reset_devices && h->max_commands > 32)
4210 h->max_commands = 32;
4211
4085 if (h->max_commands < 16) { 4212 if (h->max_commands < 16) {
4086 dev_warn(&h->pdev->dev, "Controller reports " 4213 dev_warn(&h->pdev->dev, "Controller reports "
4087 "max supported commands of %d, an obvious lie. " 4214 "max supported commands of %d, an obvious lie. "
@@ -4098,7 +4225,7 @@ static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
4098static void __devinit cciss_find_board_params(ctlr_info_t *h) 4225static void __devinit cciss_find_board_params(ctlr_info_t *h)
4099{ 4226{
4100 cciss_get_max_perf_mode_cmds(h); 4227 cciss_get_max_perf_mode_cmds(h);
4101 h->nr_cmds = h->max_commands - 4; /* Allow room for some ioctls */ 4228 h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds;
4102 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); 4229 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements));
4103 /* 4230 /*
4104 * Limit in-command s/g elements to 32 save dma'able memory. 4231 * Limit in-command s/g elements to 32 save dma'able memory.
@@ -4199,7 +4326,7 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
4199 err = -ENOMEM; 4326 err = -ENOMEM;
4200 goto err_out_free_res; 4327 goto err_out_free_res;
4201 } 4328 }
4202 err = cciss_wait_for_board_ready(h); 4329 err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
4203 if (err) 4330 if (err)
4204 goto err_out_free_res; 4331 goto err_out_free_res;
4205 err = cciss_find_cfgtables(h); 4332 err = cciss_find_cfgtables(h);
@@ -4334,7 +4461,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4334 tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); 4461 tag = readl(vaddr + SA5_REPLY_PORT_OFFSET);
4335 if ((tag & ~3) == paddr32) 4462 if ((tag & ~3) == paddr32)
4336 break; 4463 break;
4337 schedule_timeout_uninterruptible(HZ); 4464 msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS);
4338 } 4465 }
4339 4466
4340 iounmap(vaddr); 4467 iounmap(vaddr);
@@ -4361,41 +4488,10 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4361 return 0; 4488 return 0;
4362} 4489}
4363 4490
4364#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
4365#define cciss_noop(p) cciss_message(p, 3, 0) 4491#define cciss_noop(p) cciss_message(p, 3, 0)
4366 4492
4367static __devinit int cciss_reset_msi(struct pci_dev *pdev)
4368{
4369/* the #defines are stolen from drivers/pci/msi.h. */
4370#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
4371#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
4372
4373 int pos;
4374 u16 control = 0;
4375
4376 pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
4377 if (pos) {
4378 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4379 if (control & PCI_MSI_FLAGS_ENABLE) {
4380 dev_info(&pdev->dev, "resetting MSI\n");
4381 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE);
4382 }
4383 }
4384
4385 pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
4386 if (pos) {
4387 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4388 if (control & PCI_MSIX_FLAGS_ENABLE) {
4389 dev_info(&pdev->dev, "resetting MSI-X\n");
4390 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE);
4391 }
4392 }
4393
4394 return 0;
4395}
4396
4397static int cciss_controller_hard_reset(struct pci_dev *pdev, 4493static int cciss_controller_hard_reset(struct pci_dev *pdev,
4398 void * __iomem vaddr, bool use_doorbell) 4494 void * __iomem vaddr, u32 use_doorbell)
4399{ 4495{
4400 u16 pmcsr; 4496 u16 pmcsr;
4401 int pos; 4497 int pos;
@@ -4406,8 +4502,7 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4406 * other way using the doorbell register. 4502 * other way using the doorbell register.
4407 */ 4503 */
4408 dev_info(&pdev->dev, "using doorbell to reset controller\n"); 4504 dev_info(&pdev->dev, "using doorbell to reset controller\n");
4409 writel(DOORBELL_CTLR_RESET, vaddr + SA5_DOORBELL); 4505 writel(use_doorbell, vaddr + SA5_DOORBELL);
4410 msleep(1000);
4411 } else { /* Try to do it the PCI power state way */ 4506 } else { /* Try to do it the PCI power state way */
4412 4507
4413 /* Quoting from the Open CISS Specification: "The Power 4508 /* Quoting from the Open CISS Specification: "The Power
@@ -4438,27 +4533,79 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4438 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 4533 pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
4439 pmcsr |= PCI_D0; 4534 pmcsr |= PCI_D0;
4440 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); 4535 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
4441
4442 msleep(500);
4443 } 4536 }
4444 return 0; 4537 return 0;
4445} 4538}
4446 4539
4540static __devinit void init_driver_version(char *driver_version, int len)
4541{
4542 memset(driver_version, 0, len);
4543 strncpy(driver_version, "cciss " DRIVER_NAME, len - 1);
4544}
4545
4546static __devinit int write_driver_ver_to_cfgtable(
4547 CfgTable_struct __iomem *cfgtable)
4548{
4549 char *driver_version;
4550 int i, size = sizeof(cfgtable->driver_version);
4551
4552 driver_version = kmalloc(size, GFP_KERNEL);
4553 if (!driver_version)
4554 return -ENOMEM;
4555
4556 init_driver_version(driver_version, size);
4557 for (i = 0; i < size; i++)
4558 writeb(driver_version[i], &cfgtable->driver_version[i]);
4559 kfree(driver_version);
4560 return 0;
4561}
4562
4563static __devinit void read_driver_ver_from_cfgtable(
4564 CfgTable_struct __iomem *cfgtable, unsigned char *driver_ver)
4565{
4566 int i;
4567
4568 for (i = 0; i < sizeof(cfgtable->driver_version); i++)
4569 driver_ver[i] = readb(&cfgtable->driver_version[i]);
4570}
4571
4572static __devinit int controller_reset_failed(
4573 CfgTable_struct __iomem *cfgtable)
4574{
4575
4576 char *driver_ver, *old_driver_ver;
4577 int rc, size = sizeof(cfgtable->driver_version);
4578
4579 old_driver_ver = kmalloc(2 * size, GFP_KERNEL);
4580 if (!old_driver_ver)
4581 return -ENOMEM;
4582 driver_ver = old_driver_ver + size;
4583
4584 /* After a reset, the 32 bytes of "driver version" in the cfgtable
4585 * should have been changed, otherwise we know the reset failed.
4586 */
4587 init_driver_version(old_driver_ver, size);
4588 read_driver_ver_from_cfgtable(cfgtable, driver_ver);
4589 rc = !memcmp(driver_ver, old_driver_ver, size);
4590 kfree(old_driver_ver);
4591 return rc;
4592}
4593
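Editor's note: the new driver_version bytes in the config table act purely as a reset detector; a known string is written before the reset, and if exactly the same bytes read back afterwards, the firmware never actually went through reset. A standalone illustration of that comparison logic; the buffer contents here are made up for the example:

	#include <stdio.h>
	#include <string.h>

	#define VER_LEN 32

	/* Returns 1 if the bytes are unchanged, i.e. the reset did not happen. */
	static int reset_failed(const char before[VER_LEN], const char after[VER_LEN])
	{
		return memcmp(before, after, VER_LEN) == 0;
	}

	int main(void)
	{
		char written[VER_LEN]  = "cciss 3.6.26";	/* value written pre-reset (example) */
		char survived[VER_LEN] = "cciss 3.6.26";	/* reset failed: bytes intact */
		char cleared[VER_LEN]  = "";			/* reset ok: firmware wiped it */

		printf("intact -> reset_failed=%d\n", reset_failed(written, survived));
		printf("wiped  -> reset_failed=%d\n", reset_failed(written, cleared));
		return 0;
	}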
4447/* This does a hard reset of the controller using PCI power management 4594/* This does a hard reset of the controller using PCI power management
4448 * states or using the doorbell register. */ 4595 * states or using the doorbell register. */
4449static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) 4596static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4450{ 4597{
4451 u16 saved_config_space[32];
4452 u64 cfg_offset; 4598 u64 cfg_offset;
4453 u32 cfg_base_addr; 4599 u32 cfg_base_addr;
4454 u64 cfg_base_addr_index; 4600 u64 cfg_base_addr_index;
4455 void __iomem *vaddr; 4601 void __iomem *vaddr;
4456 unsigned long paddr; 4602 unsigned long paddr;
4457 u32 misc_fw_support, active_transport; 4603 u32 misc_fw_support;
4458 int rc, i; 4604 int rc;
4459 CfgTable_struct __iomem *cfgtable; 4605 CfgTable_struct __iomem *cfgtable;
4460 bool use_doorbell; 4606 u32 use_doorbell;
4461 u32 board_id; 4607 u32 board_id;
4608 u16 command_register;
4462 4609
4463 /* For controllers as old a the p600, this is very nearly 4610 /* For controllers as old a the p600, this is very nearly
4464 * the same thing as 4611 * the same thing as
@@ -4468,14 +4615,6 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4468 * pci_set_power_state(pci_dev, PCI_D0); 4615 * pci_set_power_state(pci_dev, PCI_D0);
4469 * pci_restore_state(pci_dev); 4616 * pci_restore_state(pci_dev);
4470 * 4617 *
4471 * but we can't use these nice canned kernel routines on
4472 * kexec, because they also check the MSI/MSI-X state in PCI
4473 * configuration space and do the wrong thing when it is
4474 * set/cleared. Also, the pci_save/restore_state functions
4475 * violate the ordering requirements for restoring the
4476 * configuration space from the CCISS document (see the
4477 * comment below). So we roll our own ....
4478 *
4479 * For controllers newer than the P600, the pci power state 4618 * For controllers newer than the P600, the pci power state
4480 * method of resetting doesn't work so we have another way 4619 * method of resetting doesn't work so we have another way
4481 * using the doorbell register. 4620 * using the doorbell register.
@@ -4488,14 +4627,23 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4488 * likely not be happy. Just forbid resetting this conjoined mess. 4627 * likely not be happy. Just forbid resetting this conjoined mess.
4489 */ 4628 */
4490 cciss_lookup_board_id(pdev, &board_id); 4629 cciss_lookup_board_id(pdev, &board_id);
4491 if (board_id == 0x409C0E11 || board_id == 0x409D0E11) { 4630 if (!ctlr_is_resettable(board_id)) {
4492 dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " 4631 dev_warn(&pdev->dev, "Cannot reset Smart Array 640x "
4493 "due to shared cache module."); 4632 "due to shared cache module.");
4494 return -ENODEV; 4633 return -ENODEV;
4495 } 4634 }
4496 4635
4497 for (i = 0; i < 32; i++) 4636 /* if controller is soft- but not hard resettable... */
4498 pci_read_config_word(pdev, 2*i, &saved_config_space[i]); 4637 if (!ctlr_is_hard_resettable(board_id))
4638 return -ENOTSUPP; /* try soft reset later. */
4639
4640 /* Save the PCI command register */
4641 pci_read_config_word(pdev, 4, &command_register);
4642 /* Turn the board off. This is so that later pci_restore_state()
4643 * won't turn the board on before the rest of config space is ready.
4644 */
4645 pci_disable_device(pdev);
4646 pci_save_state(pdev);
4499 4647
4500 /* find the first memory BAR, so we can find the cfg table */ 4648 /* find the first memory BAR, so we can find the cfg table */
4501 rc = cciss_pci_find_memory_BAR(pdev, &paddr); 4649 rc = cciss_pci_find_memory_BAR(pdev, &paddr);
@@ -4516,51 +4664,70 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4516 rc = -ENOMEM; 4664 rc = -ENOMEM;
4517 goto unmap_vaddr; 4665 goto unmap_vaddr;
4518 } 4666 }
4667 rc = write_driver_ver_to_cfgtable(cfgtable);
4668 if (rc)
4669 goto unmap_vaddr;
4519 4670
4520 /* If reset via doorbell register is supported, use that. */ 4671 /* If reset via doorbell register is supported, use that.
4521 misc_fw_support = readl(&cfgtable->misc_fw_support); 4672 * There are two such methods. Favor the newest method.
4522 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
4523
4524 /* The doorbell reset seems to cause lockups on some Smart
4525 * Arrays (e.g. P410, P410i, maybe others). Until this is
4526 * fixed or at least isolated, avoid the doorbell reset.
4527 */ 4673 */
4528 use_doorbell = 0; 4674 misc_fw_support = readl(&cfgtable->misc_fw_support);
4675 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2;
4676 if (use_doorbell) {
4677 use_doorbell = DOORBELL_CTLR_RESET2;
4678 } else {
4679 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
4680 if (use_doorbell) {
4681 dev_warn(&pdev->dev, "Controller claims that "
4682 "'Bit 2 doorbell reset' is "
4683 "supported, but not 'bit 5 doorbell reset'. "
4684 "Firmware update is recommended.\n");
4685 rc = -ENOTSUPP; /* use the soft reset */
4686 goto unmap_cfgtable;
4687 }
4688 }
4529 4689
4530 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); 4690 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
4531 if (rc) 4691 if (rc)
4532 goto unmap_cfgtable; 4692 goto unmap_cfgtable;
4533 4693 pci_restore_state(pdev);
4534 /* Restore the PCI configuration space. The Open CISS 4694 rc = pci_enable_device(pdev);
4535 * Specification says, "Restore the PCI Configuration 4695 if (rc) {
4536 * Registers, offsets 00h through 60h. It is important to 4696 dev_warn(&pdev->dev, "failed to enable device.\n");
4537 * restore the command register, 16-bits at offset 04h, 4697 goto unmap_cfgtable;
4538 * last. Do not restore the configuration status register,
4539 * 16-bits at offset 06h." Note that the offset is 2*i.
4540 */
4541 for (i = 0; i < 32; i++) {
4542 if (i == 2 || i == 3)
4543 continue;
4544 pci_write_config_word(pdev, 2*i, saved_config_space[i]);
4545 } 4698 }
4546 wmb(); 4699 pci_write_config_word(pdev, 4, command_register);
4547 pci_write_config_word(pdev, 4, saved_config_space[2]);
4548 4700
4549 /* Some devices (notably the HP Smart Array 5i Controller) 4701 /* Some devices (notably the HP Smart Array 5i Controller)
4550 need a little pause here */ 4702 need a little pause here */
4551 msleep(CCISS_POST_RESET_PAUSE_MSECS); 4703 msleep(CCISS_POST_RESET_PAUSE_MSECS);
4552 4704
4553 /* Controller should be in simple mode at this point. If it's not, 4705 /* Wait for board to become not ready, then ready. */
4554 * It means we're on one of those controllers which doesn't support 4706 dev_info(&pdev->dev, "Waiting for board to reset.\n");
4555 * the doorbell reset method and on which the PCI power management reset 4707 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
4556 * method doesn't work (P800, for example.) 4708 if (rc) {
4557 * In those cases, don't try to proceed, as it generally doesn't work. 4709 dev_warn(&pdev->dev, "Failed waiting for board to hard reset."
4558 */ 4710 " Will try soft reset.\n");
4559 active_transport = readl(&cfgtable->TransportActive); 4711 rc = -ENOTSUPP; /* Not expected, but try soft reset later */
4560 if (active_transport & PERFORMANT_MODE) { 4712 goto unmap_cfgtable;
4561 dev_warn(&pdev->dev, "Unable to successfully reset controller," 4713 }
4562 " Ignoring controller.\n"); 4714 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
4563 rc = -ENODEV; 4715 if (rc) {
4716 dev_warn(&pdev->dev,
4717 "failed waiting for board to become ready "
4718 "after hard reset\n");
4719 goto unmap_cfgtable;
4720 }
4721
4722 rc = controller_reset_failed(vaddr);
4723 if (rc < 0)
4724 goto unmap_cfgtable;
4725 if (rc) {
4726 dev_warn(&pdev->dev, "Unable to successfully hard reset "
4727 "controller. Will try soft reset.\n");
4728 rc = -ENOTSUPP; /* Not expected, but try soft reset later */
4729 } else {
4730 dev_info(&pdev->dev, "Board ready after hard reset.\n");
4564 } 4731 }
4565 4732
4566unmap_cfgtable: 4733unmap_cfgtable:
@@ -4587,13 +4754,12 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4587 * due to concerns about shared bbwc between 6402/6404 pair. 4754 * due to concerns about shared bbwc between 6402/6404 pair.
4588 */ 4755 */
4589 if (rc == -ENOTSUPP) 4756 if (rc == -ENOTSUPP)
4590 return 0; /* just try to do the kdump anyhow. */ 4757 return rc; /* just try to do the kdump anyhow. */
4591 if (rc) 4758 if (rc)
4592 return -ENODEV; 4759 return -ENODEV;
4593 if (cciss_reset_msi(pdev))
4594 return -ENODEV;
4595 4760
4596 /* Now try to get the controller to respond to a no-op */ 4761 /* Now try to get the controller to respond to a no-op */
4762 dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n");
4597 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { 4763 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
4598 if (cciss_noop(pdev) == 0) 4764 if (cciss_noop(pdev) == 0)
4599 break; 4765 break;
@@ -4606,6 +4772,148 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4606 return 0; 4772 return 0;
4607} 4773}
4608 4774
4775static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h)
4776{
4777 h->cmd_pool_bits = kmalloc(
4778 DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) *
4779 sizeof(unsigned long), GFP_KERNEL);
4780 h->cmd_pool = pci_alloc_consistent(h->pdev,
4781 h->nr_cmds * sizeof(CommandList_struct),
4782 &(h->cmd_pool_dhandle));
4783 h->errinfo_pool = pci_alloc_consistent(h->pdev,
4784 h->nr_cmds * sizeof(ErrorInfo_struct),
4785 &(h->errinfo_pool_dhandle));
4786 if ((h->cmd_pool_bits == NULL)
4787 || (h->cmd_pool == NULL)
4788 || (h->errinfo_pool == NULL)) {
4789 dev_err(&h->pdev->dev, "out of memory");
4790 return -ENOMEM;
4791 }
4792 return 0;
4793}
4794
4795static __devinit int cciss_allocate_scatterlists(ctlr_info_t *h)
4796{
4797 int i;
4798
4799 /* zero it, so that on free we need not know how many were alloc'ed */
4800 h->scatter_list = kzalloc(h->max_commands *
4801 sizeof(struct scatterlist *), GFP_KERNEL);
4802 if (!h->scatter_list)
4803 return -ENOMEM;
4804
4805 for (i = 0; i < h->nr_cmds; i++) {
4806 h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) *
4807 h->maxsgentries, GFP_KERNEL);
4808 if (h->scatter_list[i] == NULL) {
4809 dev_err(&h->pdev->dev, "could not allocate "
4810 "s/g lists\n");
4811 return -ENOMEM;
4812 }
4813 }
4814 return 0;
4815}
4816
4817static void cciss_free_scatterlists(ctlr_info_t *h)
4818{
4819 int i;
4820
4821 if (h->scatter_list) {
4822 for (i = 0; i < h->nr_cmds; i++)
4823 kfree(h->scatter_list[i]);
4824 kfree(h->scatter_list);
4825 }
4826}
4827
4828static void cciss_free_cmd_pool(ctlr_info_t *h)
4829{
4830 kfree(h->cmd_pool_bits);
4831 if (h->cmd_pool)
4832 pci_free_consistent(h->pdev,
4833 h->nr_cmds * sizeof(CommandList_struct),
4834 h->cmd_pool, h->cmd_pool_dhandle);
4835 if (h->errinfo_pool)
4836 pci_free_consistent(h->pdev,
4837 h->nr_cmds * sizeof(ErrorInfo_struct),
4838 h->errinfo_pool, h->errinfo_pool_dhandle);
4839}
4840
4841static int cciss_request_irq(ctlr_info_t *h,
4842 irqreturn_t (*msixhandler)(int, void *),
4843 irqreturn_t (*intxhandler)(int, void *))
4844{
4845 if (h->msix_vector || h->msi_vector) {
4846 if (!request_irq(h->intr[PERF_MODE_INT], msixhandler,
4847 IRQF_DISABLED, h->devname, h))
4848 return 0;
4849 dev_err(&h->pdev->dev, "Unable to get msi irq %d"
4850 " for %s\n", h->intr[PERF_MODE_INT],
4851 h->devname);
4852 return -1;
4853 }
4854
4855 if (!request_irq(h->intr[PERF_MODE_INT], intxhandler,
4856 IRQF_DISABLED, h->devname, h))
4857 return 0;
4858 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4859 h->intr[PERF_MODE_INT], h->devname);
4860 return -1;
4861}
4862
4863static int __devinit cciss_kdump_soft_reset(ctlr_info_t *h)
4864{
4865 if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) {
4866 dev_warn(&h->pdev->dev, "Resetting array controller failed.\n");
4867 return -EIO;
4868 }
4869
4870 dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n");
4871 if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) {
4872 dev_warn(&h->pdev->dev, "Soft reset had no effect.\n");
4873 return -1;
4874 }
4875
4876 dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n");
4877 if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) {
4878 dev_warn(&h->pdev->dev, "Board failed to become ready "
4879 "after soft reset.\n");
4880 return -1;
4881 }
4882
4883 return 0;
4884}
4885
4886static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
4887{
4888 int ctlr = h->ctlr;
4889
4890 free_irq(h->intr[PERF_MODE_INT], h);
4891#ifdef CONFIG_PCI_MSI
4892 if (h->msix_vector)
4893 pci_disable_msix(h->pdev);
4894 else if (h->msi_vector)
4895 pci_disable_msi(h->pdev);
4896#endif /* CONFIG_PCI_MSI */
4897 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
4898 cciss_free_scatterlists(h);
4899 cciss_free_cmd_pool(h);
4900 kfree(h->blockFetchTable);
4901 if (h->reply_pool)
4902 pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
4903 h->reply_pool, h->reply_pool_dhandle);
4904 if (h->transtable)
4905 iounmap(h->transtable);
4906 if (h->cfgtable)
4907 iounmap(h->cfgtable);
4908 if (h->vaddr)
4909 iounmap(h->vaddr);
4910 unregister_blkdev(h->major, h->devname);
4911 cciss_destroy_hba_sysfs_entry(h);
4912 pci_release_regions(h->pdev);
4913 kfree(h);
4914 hba[ctlr] = NULL;
4915}
4916
4609/* 4917/*
4610 * This is it. Find all the controllers and register them. I really hate 4918 * This is it. Find all the controllers and register them. I really hate
4611 * stealing all these major device numbers. 4919 * stealing all these major device numbers.
@@ -4616,15 +4924,28 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4616{ 4924{
4617 int i; 4925 int i;
4618 int j = 0; 4926 int j = 0;
4619 int k = 0;
4620 int rc; 4927 int rc;
4928 int try_soft_reset = 0;
4621 int dac, return_code; 4929 int dac, return_code;
4622 InquiryData_struct *inq_buff; 4930 InquiryData_struct *inq_buff;
4623 ctlr_info_t *h; 4931 ctlr_info_t *h;
4932 unsigned long flags;
4624 4933
4625 rc = cciss_init_reset_devices(pdev); 4934 rc = cciss_init_reset_devices(pdev);
4626 if (rc) 4935 if (rc) {
4627 return rc; 4936 if (rc != -ENOTSUPP)
4937 return rc;
4938 /* If the reset fails in a particular way (it has no way to do
4939 * a proper hard reset, so returns -ENOTSUPP) we can try to do
4940 * a soft reset once we get the controller configured up to the
4941 * point that it can accept a command.
4942 */
4943 try_soft_reset = 1;
4944 rc = 0;
4945 }
4946
4947reinit_after_soft_reset:
4948
4628 i = alloc_cciss_hba(pdev); 4949 i = alloc_cciss_hba(pdev);
4629 if (i < 0) 4950 if (i < 0)
4630 return -1; 4951 return -1;
@@ -4632,8 +4953,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4632 h = hba[i]; 4953 h = hba[i];
4633 h->pdev = pdev; 4954 h->pdev = pdev;
4634 h->busy_initializing = 1; 4955 h->busy_initializing = 1;
4635 INIT_HLIST_HEAD(&h->cmpQ); 4956 INIT_LIST_HEAD(&h->cmpQ);
4636 INIT_HLIST_HEAD(&h->reqQ); 4957 INIT_LIST_HEAD(&h->reqQ);
4637 mutex_init(&h->busy_shutting_down); 4958 mutex_init(&h->busy_shutting_down);
4638 4959
4639 if (cciss_pci_init(h) != 0) 4960 if (cciss_pci_init(h) != 0)
@@ -4642,6 +4963,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4642 sprintf(h->devname, "cciss%d", i); 4963 sprintf(h->devname, "cciss%d", i);
4643 h->ctlr = i; 4964 h->ctlr = i;
4644 4965
4966 if (cciss_tape_cmds < 2)
4967 cciss_tape_cmds = 2;
4968 if (cciss_tape_cmds > 16)
4969 cciss_tape_cmds = 16;
4970
4645 init_completion(&h->scan_wait); 4971 init_completion(&h->scan_wait);
4646 4972
4647 if (cciss_create_hba_sysfs_entry(h)) 4973 if (cciss_create_hba_sysfs_entry(h))
@@ -4677,62 +5003,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4677 5003
4678 /* make sure the board interrupts are off */ 5004 /* make sure the board interrupts are off */
4679 h->access.set_intr_mask(h, CCISS_INTR_OFF); 5005 h->access.set_intr_mask(h, CCISS_INTR_OFF);
4680 if (h->msi_vector || h->msix_vector) { 5006 rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx);
4681 if (request_irq(h->intr[PERF_MODE_INT], 5007 if (rc)
4682 do_cciss_msix_intr, 5008 goto clean2;
4683 IRQF_DISABLED, h->devname, h)) {
4684 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4685 h->intr[PERF_MODE_INT], h->devname);
4686 goto clean2;
4687 }
4688 } else {
4689 if (request_irq(h->intr[PERF_MODE_INT], do_cciss_intx,
4690 IRQF_DISABLED, h->devname, h)) {
4691 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4692 h->intr[PERF_MODE_INT], h->devname);
4693 goto clean2;
4694 }
4695 }
4696 5009
4697 dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", 5010 dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
4698 h->devname, pdev->device, pci_name(pdev), 5011 h->devname, pdev->device, pci_name(pdev),
4699 h->intr[PERF_MODE_INT], dac ? "" : " not"); 5012 h->intr[PERF_MODE_INT], dac ? "" : " not");
4700 5013
4701 h->cmd_pool_bits = 5014 if (cciss_allocate_cmd_pool(h))
4702 kmalloc(DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG)
4703 * sizeof(unsigned long), GFP_KERNEL);
4704 h->cmd_pool = (CommandList_struct *)
4705 pci_alloc_consistent(h->pdev,
4706 h->nr_cmds * sizeof(CommandList_struct),
4707 &(h->cmd_pool_dhandle));
4708 h->errinfo_pool = (ErrorInfo_struct *)
4709 pci_alloc_consistent(h->pdev,
4710 h->nr_cmds * sizeof(ErrorInfo_struct),
4711 &(h->errinfo_pool_dhandle));
4712 if ((h->cmd_pool_bits == NULL)
4713 || (h->cmd_pool == NULL)
4714 || (h->errinfo_pool == NULL)) {
4715 dev_err(&h->pdev->dev, "out of memory");
4716 goto clean4; 5015 goto clean4;
4717 }
4718 5016
4719 /* Need space for temp scatter list */ 5017 if (cciss_allocate_scatterlists(h))
4720 h->scatter_list = kmalloc(h->max_commands *
4721 sizeof(struct scatterlist *),
4722 GFP_KERNEL);
4723 if (!h->scatter_list)
4724 goto clean4; 5018 goto clean4;
4725 5019
4726 for (k = 0; k < h->nr_cmds; k++) {
4727 h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
4728 h->maxsgentries,
4729 GFP_KERNEL);
4730 if (h->scatter_list[k] == NULL) {
4731 dev_err(&h->pdev->dev,
4732 "could not allocate s/g lists\n");
4733 goto clean4;
4734 }
4735 }
4736 h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, 5020 h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
4737 h->chainsize, h->nr_cmds); 5021 h->chainsize, h->nr_cmds);
4738 if (!h->cmd_sg_list && h->chainsize > 0) 5022 if (!h->cmd_sg_list && h->chainsize > 0)
@@ -4756,6 +5040,62 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4756 h->gendisk[j] = NULL; 5040 h->gendisk[j] = NULL;
4757 } 5041 }
4758 5042
5043 /* At this point, the controller is ready to take commands.
5044 * Now, if reset_devices and the hard reset didn't work, try
5045 * the soft reset and see if that works.
5046 */
5047 if (try_soft_reset) {
5048
5049 /* This is kind of gross. We may or may not get a completion
5050 * from the soft reset command, and if we do, then the value
5051 * from the fifo may or may not be valid. So, we wait 10 secs
5052 * after the reset throwing away any completions we get during
5053 * that time. Unregister the interrupt handler and register
5054 * fake ones to scoop up any residual completions.
5055 */
5056 spin_lock_irqsave(&h->lock, flags);
5057 h->access.set_intr_mask(h, CCISS_INTR_OFF);
5058 spin_unlock_irqrestore(&h->lock, flags);
5059 free_irq(h->intr[PERF_MODE_INT], h);
5060 rc = cciss_request_irq(h, cciss_msix_discard_completions,
5061 cciss_intx_discard_completions);
5062 if (rc) {
5063 dev_warn(&h->pdev->dev, "Failed to request_irq after "
5064 "soft reset.\n");
5065 goto clean4;
5066 }
5067
5068 rc = cciss_kdump_soft_reset(h);
5069 if (rc) {
5070 dev_warn(&h->pdev->dev, "Soft reset failed.\n");
5071 goto clean4;
5072 }
5073
5074 dev_info(&h->pdev->dev, "Board READY.\n");
5075 dev_info(&h->pdev->dev,
5076 "Waiting for stale completions to drain.\n");
5077 h->access.set_intr_mask(h, CCISS_INTR_ON);
5078 msleep(10000);
5079 h->access.set_intr_mask(h, CCISS_INTR_OFF);
5080
5081 rc = controller_reset_failed(h->cfgtable);
5082 if (rc)
5083 dev_info(&h->pdev->dev,
5084 "Soft reset appears to have failed.\n");
5085
5086 /* since the controller's reset, we have to go back and re-init
5087 * everything. Easiest to just forget what we've done and do it
5088 * all over again.
5089 */
5090 cciss_undo_allocations_after_kdump_soft_reset(h);
5091 try_soft_reset = 0;
5092 if (rc)
5093 /* don't go to clean4, we already unallocated */
5094 return -ENODEV;
5095
5096 goto reinit_after_soft_reset;
5097 }
5098
4759 cciss_scsi_setup(h); 5099 cciss_scsi_setup(h);
4760 5100
4761 /* Turn the interrupts on so we can service requests */ 5101 /* Turn the interrupts on so we can service requests */
@@ -4790,21 +5130,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4790 return 1; 5130 return 1;
4791 5131
4792clean4: 5132clean4:
4793 kfree(h->cmd_pool_bits); 5133 cciss_free_cmd_pool(h);
4794 /* Free up sg elements */ 5134 cciss_free_scatterlists(h);
4795 for (k-- ; k >= 0; k--)
4796 kfree(h->scatter_list[k]);
4797 kfree(h->scatter_list);
4798 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); 5135 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
4799 if (h->cmd_pool)
4800 pci_free_consistent(h->pdev,
4801 h->nr_cmds * sizeof(CommandList_struct),
4802 h->cmd_pool, h->cmd_pool_dhandle);
4803 if (h->errinfo_pool)
4804 pci_free_consistent(h->pdev,
4805 h->nr_cmds * sizeof(ErrorInfo_struct),
4806 h->errinfo_pool,
4807 h->errinfo_pool_dhandle);
4808 free_irq(h->intr[PERF_MODE_INT], h); 5136 free_irq(h->intr[PERF_MODE_INT], h);
4809clean2: 5137clean2:
4810 unregister_blkdev(h->major, h->devname); 5138 unregister_blkdev(h->major, h->devname);
@@ -4902,16 +5230,16 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
4902 iounmap(h->cfgtable); 5230 iounmap(h->cfgtable);
4903 iounmap(h->vaddr); 5231 iounmap(h->vaddr);
4904 5232
4905 pci_free_consistent(h->pdev, h->nr_cmds * sizeof(CommandList_struct), 5233 cciss_free_cmd_pool(h);
4906 h->cmd_pool, h->cmd_pool_dhandle);
4907 pci_free_consistent(h->pdev, h->nr_cmds * sizeof(ErrorInfo_struct),
4908 h->errinfo_pool, h->errinfo_pool_dhandle);
4909 kfree(h->cmd_pool_bits);
4910 /* Free up sg elements */ 5234 /* Free up sg elements */
4911 for (j = 0; j < h->nr_cmds; j++) 5235 for (j = 0; j < h->nr_cmds; j++)
4912 kfree(h->scatter_list[j]); 5236 kfree(h->scatter_list[j]);
4913 kfree(h->scatter_list); 5237 kfree(h->scatter_list);
4914 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); 5238 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
5239 kfree(h->blockFetchTable);
5240 if (h->reply_pool)
5241 pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
5242 h->reply_pool, h->reply_pool_dhandle);
4915 /* 5243 /*
4916 * Deliberately omit pci_disable_device(): it does something nasty to 5244 * Deliberately omit pci_disable_device(): it does something nasty to
4917 * Smart Array controllers that pci_enable_device does not undo 5245 * Smart Array controllers that pci_enable_device does not undo
@@ -4987,7 +5315,8 @@ static void __exit cciss_cleanup(void)
4987 } 5315 }
4988 } 5316 }
4989 kthread_stop(cciss_scan_thread); 5317 kthread_stop(cciss_scan_thread);
4990 remove_proc_entry("driver/cciss", NULL); 5318 if (proc_cciss)
5319 remove_proc_entry("driver/cciss", NULL);
4991 bus_unregister(&cciss_bus_type); 5320 bus_unregister(&cciss_bus_type);
4992} 5321}
4993 5322
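The reworked cciss_kdump_hard_reset_controller() above prefers the newer bit-5 doorbell reset and gives up on the hard reset (so the caller can fall back to a soft reset) when only the bit-2 method is advertised. A minimal user-space sketch of that selection logic, using the MISC_FW_*/DOORBELL_* values added elsewhere in this diff; the main() harness and the userspace ENOTSUPP definition are illustrative only:

#include <stdio.h>
#include <errno.h>

#ifndef ENOTSUPP
#define ENOTSUPP 524	/* kernel-internal errno, not in userspace headers */
#endif

#define MISC_FW_DOORBELL_RESET   0x02	/* cfgtable advertises bit-2 (old) doorbell reset */
#define MISC_FW_DOORBELL_RESET2  0x10	/* cfgtable advertises bit-5 (preferred) doorbell reset */
#define DOORBELL_CTLR_RESET      0x04
#define DOORBELL_CTLR_RESET2     0x20

/*
 * Decide how to hard-reset the controller, mirroring the hunk above:
 * favor the bit-5 doorbell reset; if only the bit-2 method is advertised,
 * return -ENOTSUPP so the caller tries a soft reset instead; if neither
 * is advertised, return 0 and let the non-doorbell reset path run.
 */
static int pick_doorbell_reset(unsigned int misc_fw_support)
{
	if (misc_fw_support & MISC_FW_DOORBELL_RESET2)
		return DOORBELL_CTLR_RESET2;
	if (misc_fw_support & MISC_FW_DOORBELL_RESET)
		return -ENOTSUPP;	/* old method only: firmware update recommended */
	return 0;			/* no doorbell reset supported at all */
}

int main(void)
{
	printf("both bits set -> 0x%02x\n", pick_doorbell_reset(0x12));
	printf("bit 2 only    -> %d\n", pick_doorbell_reset(0x02));
	printf("neither bit   -> %d\n", pick_doorbell_reset(0x00));
	return 0;
}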
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index ae340ffc8f81..16b4d58d84dd 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -103,8 +103,8 @@ struct ctlr_info
103 struct access_method access; 103 struct access_method access;
104 104
105 /* queue and queue Info */ 105 /* queue and queue Info */
106 struct hlist_head reqQ; 106 struct list_head reqQ;
107 struct hlist_head cmpQ; 107 struct list_head cmpQ;
108 unsigned int Qdepth; 108 unsigned int Qdepth;
109 unsigned int maxQsinceinit; 109 unsigned int maxQsinceinit;
110 unsigned int maxSG; 110 unsigned int maxSG;
@@ -200,13 +200,18 @@ struct ctlr_info
200 * the above. 200 * the above.
201 */ 201 */
202#define CCISS_BOARD_READY_WAIT_SECS (120) 202#define CCISS_BOARD_READY_WAIT_SECS (120)
203#define CCISS_BOARD_NOT_READY_WAIT_SECS (100)
203#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) 204#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
204#define CCISS_BOARD_READY_ITERATIONS \ 205#define CCISS_BOARD_READY_ITERATIONS \
205 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ 206 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
206 CCISS_BOARD_READY_POLL_INTERVAL_MSECS) 207 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
208#define CCISS_BOARD_NOT_READY_ITERATIONS \
209 ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
210 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
207#define CCISS_POST_RESET_PAUSE_MSECS (3000) 211#define CCISS_POST_RESET_PAUSE_MSECS (3000)
208#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) 212#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000)
209#define CCISS_POST_RESET_NOOP_RETRIES (12) 213#define CCISS_POST_RESET_NOOP_RETRIES (12)
214#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000)
210 215
211/* 216/*
212 Send the command to the hardware 217 Send the command to the hardware
@@ -218,6 +223,7 @@ static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c)
218 h->ctlr, c->busaddr); 223 h->ctlr, c->busaddr);
219#endif /* CCISS_DEBUG */ 224#endif /* CCISS_DEBUG */
220 writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET); 225 writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
226 readl(h->vaddr + SA5_REQUEST_PORT_OFFSET);
221 h->commands_outstanding++; 227 h->commands_outstanding++;
222 if ( h->commands_outstanding > h->max_outstanding) 228 if ( h->commands_outstanding > h->max_outstanding)
223 h->max_outstanding = h->commands_outstanding; 229 h->max_outstanding = h->commands_outstanding;
@@ -234,11 +240,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
234 { /* Turn interrupts on */ 240 { /* Turn interrupts on */
235 h->interrupts_enabled = 1; 241 h->interrupts_enabled = 1;
236 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 242 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
243 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
237 } else /* Turn them off */ 244 } else /* Turn them off */
238 { 245 {
239 h->interrupts_enabled = 0; 246 h->interrupts_enabled = 0;
240 writel( SA5_INTR_OFF, 247 writel( SA5_INTR_OFF,
241 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 248 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
249 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
242 } 250 }
243} 251}
244/* 252/*
@@ -252,11 +260,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val)
252 { /* Turn interrupts on */ 260 { /* Turn interrupts on */
253 h->interrupts_enabled = 1; 261 h->interrupts_enabled = 1;
254 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 262 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
263 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
255 } else /* Turn them off */ 264 } else /* Turn them off */
256 { 265 {
257 h->interrupts_enabled = 0; 266 h->interrupts_enabled = 0;
258 writel( SA5B_INTR_OFF, 267 writel( SA5B_INTR_OFF,
259 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 268 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
269 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
260 } 270 }
261} 271}
262 272
@@ -266,10 +276,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val)
266 if (val) { /* turn on interrupts */ 276 if (val) { /* turn on interrupts */
267 h->interrupts_enabled = 1; 277 h->interrupts_enabled = 1;
268 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 278 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
279 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
269 } else { 280 } else {
270 h->interrupts_enabled = 0; 281 h->interrupts_enabled = 0;
271 writel(SA5_PERF_INTR_OFF, 282 writel(SA5_PERF_INTR_OFF,
272 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 283 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
284 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
273 } 285 }
274} 286}
275 287
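The cciss.h hunks above pair every writel() to the request-port and interrupt-mask registers with a readl() of the same register. A small stand-alone model of that posted-write flush, assuming a plain volatile variable in place of the MMIO register and an illustrative (not real) SA5_INTR_OFF value:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the SA5 reply-interrupt-mask register; the real driver
 * uses writel()/readl() on h->vaddr + SA5_REPLY_INTR_MASK_OFFSET. */
static volatile uint32_t intr_mask_reg;

#define SA5_INTR_OFF 0x08u	/* illustrative value only */

static void sa5_intr_mask(int enable)
{
	/* writel(...) */
	intr_mask_reg = enable ? 0 : SA5_INTR_OFF;
	/* readl(...) of the same register: on real hardware this forces the
	 * posted MMIO write out of any bridge write buffer before the caller
	 * goes on to, e.g., free the IRQ or reset the board. */
	(void)intr_mask_reg;
}

int main(void)
{
	sa5_intr_mask(0);
	printf("interrupts masked, register = 0x%x\n", (unsigned)intr_mask_reg);
	sa5_intr_mask(1);
	printf("interrupts enabled, register = 0x%x\n", (unsigned)intr_mask_reg);
	return 0;
}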
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index eb060f1b00b6..d9be6b4d49a6 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -53,9 +53,11 @@
53#define CFGTBL_ChangeReq 0x00000001l 53#define CFGTBL_ChangeReq 0x00000001l
54#define CFGTBL_AccCmds 0x00000001l 54#define CFGTBL_AccCmds 0x00000001l
55#define DOORBELL_CTLR_RESET 0x00000004l 55#define DOORBELL_CTLR_RESET 0x00000004l
56#define DOORBELL_CTLR_RESET2 0x00000020l
56 57
57#define CFGTBL_Trans_Simple 0x00000002l 58#define CFGTBL_Trans_Simple 0x00000002l
58#define CFGTBL_Trans_Performant 0x00000004l 59#define CFGTBL_Trans_Performant 0x00000004l
60#define CFGTBL_Trans_use_short_tags 0x20000000l
59 61
60#define CFGTBL_BusType_Ultra2 0x00000001l 62#define CFGTBL_BusType_Ultra2 0x00000001l
61#define CFGTBL_BusType_Ultra3 0x00000002l 63#define CFGTBL_BusType_Ultra3 0x00000002l
@@ -141,6 +143,14 @@ typedef struct _ReadCapdata_struct_16
141#define BMIC_CACHE_FLUSH 0xc2 143#define BMIC_CACHE_FLUSH 0xc2
142#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ 144#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */
143 145
146#define CCISS_ABORT_MSG 0x00
147#define CCISS_RESET_MSG 0x01
148#define CCISS_RESET_TYPE_CONTROLLER 0x00
149#define CCISS_RESET_TYPE_BUS 0x01
150#define CCISS_RESET_TYPE_TARGET 0x03
151#define CCISS_RESET_TYPE_LUN 0x04
152#define CCISS_NOOP_MSG 0x03
153
144/* Command List Structure */ 154/* Command List Structure */
145#define CTLR_LUNID "\0\0\0\0\0\0\0\0" 155#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
146 156
@@ -195,7 +205,7 @@ typedef struct _CommandList_struct {
195 int ctlr; 205 int ctlr;
196 int cmd_type; 206 int cmd_type;
197 long cmdindex; 207 long cmdindex;
198 struct hlist_node list; 208 struct list_head list;
199 struct request * rq; 209 struct request * rq;
200 struct completion *waiting; 210 struct completion *waiting;
201 int retry_count; 211 int retry_count;
@@ -234,6 +244,8 @@ typedef struct _CfgTable_struct {
234 u8 reserved[0x78 - 0x58]; 244 u8 reserved[0x78 - 0x58];
235 u32 misc_fw_support; /* offset 0x78 */ 245 u32 misc_fw_support; /* offset 0x78 */
236#define MISC_FW_DOORBELL_RESET (0x02) 246#define MISC_FW_DOORBELL_RESET (0x02)
247#define MISC_FW_DOORBELL_RESET2 (0x10)
248 u8 driver_version[32];
237} CfgTable_struct; 249} CfgTable_struct;
238 250
239struct TransTable_struct { 251struct TransTable_struct {
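Both reset paths above poll the board state through cciss_wait_for_board_state(), whose polling budget comes from the CCISS_BOARD_READY_* and CCISS_BOARD_NOT_READY_* macros added in cciss.h. A rough stand-alone sketch of that budget arithmetic, with a counter standing in for the real register read and the msleep() elided:

#include <stdio.h>

/* Values mirror the cciss.h hunk above. */
#define CCISS_BOARD_READY_WAIT_SECS		120
#define CCISS_BOARD_NOT_READY_WAIT_SECS		100
#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS	100

#define ITERATIONS(secs) (((secs) * 1000) / CCISS_BOARD_READY_POLL_INTERVAL_MSECS)

/* Model of the wait loop: poll a "ready" predicate at a fixed interval
 * until it matches the wanted state or the budget runs out. */
static int wait_for_state(int want_ready, int becomes_ready_after)
{
	int budget = want_ready ? ITERATIONS(CCISS_BOARD_READY_WAIT_SECS)
				: ITERATIONS(CCISS_BOARD_NOT_READY_WAIT_SECS);
	int i, ready;

	for (i = 0; i < budget; i++) {
		ready = (i >= becomes_ready_after);	/* fake register read */
		if (ready == want_ready)
			return 0;
		/* real code: msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); */
	}
	return -1;	/* timed out */
}

int main(void)
{
	printf("ready budget: %d polls, not-ready budget: %d polls\n",
	       ITERATIONS(CCISS_BOARD_READY_WAIT_SECS),
	       ITERATIONS(CCISS_BOARD_NOT_READY_WAIT_SECS));
	printf("wait for ready (comes up after 50 polls): %d\n",
	       wait_for_state(1, 50));
	return 0;
}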
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 575495f3c4b8..696100241a6f 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -62,8 +62,8 @@ static int cciss_scsi_proc_info(
62 int length, /* length of data in buffer */ 62 int length, /* length of data in buffer */
63 int func); /* 0 == read, 1 == write */ 63 int func); /* 0 == read, 1 == write */
64 64
65static int cciss_scsi_queue_command (struct scsi_cmnd *cmd, 65static int cciss_scsi_queue_command (struct Scsi_Host *h,
66 void (* done)(struct scsi_cmnd *)); 66 struct scsi_cmnd *cmd);
67static int cciss_eh_device_reset_handler(struct scsi_cmnd *); 67static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
68static int cciss_eh_abort_handler(struct scsi_cmnd *); 68static int cciss_eh_abort_handler(struct scsi_cmnd *);
69 69
@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = {
84 .proc_name = "cciss", 84 .proc_name = "cciss",
85 .proc_info = cciss_scsi_proc_info, 85 .proc_info = cciss_scsi_proc_info,
86 .queuecommand = cciss_scsi_queue_command, 86 .queuecommand = cciss_scsi_queue_command,
87 .can_queue = SCSI_CCISS_CAN_QUEUE,
88 .this_id = 7, 87 .this_id = 7,
89 .cmd_per_lun = 1, 88 .cmd_per_lun = 1,
90 .use_clustering = DISABLE_CLUSTERING, 89 .use_clustering = DISABLE_CLUSTERING,
@@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t {
108 107
109#pragma pack() 108#pragma pack()
110 109
111#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \
112 CCISS_MAX_SCSI_DEVS_PER_HBA + 2)
113 // plus two for init time usage
114
115#pragma pack(1) 110#pragma pack(1)
116struct cciss_scsi_cmd_stack_t { 111struct cciss_scsi_cmd_stack_t {
117 struct cciss_scsi_cmd_stack_elem_t *pool; 112 struct cciss_scsi_cmd_stack_elem_t *pool;
118 struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE]; 113 struct cciss_scsi_cmd_stack_elem_t **elem;
119 dma_addr_t cmd_pool_handle; 114 dma_addr_t cmd_pool_handle;
120 int top; 115 int top;
116 int nelems;
121}; 117};
122#pragma pack() 118#pragma pack()
123 119
@@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
191 sa = h->scsi_ctlr; 187 sa = h->scsi_ctlr;
192 stk = &sa->cmd_stack; 188 stk = &sa->cmd_stack;
193 stk->top++; 189 stk->top++;
194 if (stk->top >= CMD_STACK_SIZE) { 190 if (stk->top >= stk->nelems) {
195 dev_err(&h->pdev->dev, 191 dev_err(&h->pdev->dev,
196 "scsi_cmd_free called too many times.\n"); 192 "scsi_cmd_free called too many times.\n");
197 BUG(); 193 BUG();
@@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
206 struct cciss_scsi_cmd_stack_t *stk; 202 struct cciss_scsi_cmd_stack_t *stk;
207 size_t size; 203 size_t size;
208 204
205 stk = &sa->cmd_stack;
206 stk->nelems = cciss_tape_cmds + 2;
209 sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, 207 sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
210 h->chainsize, CMD_STACK_SIZE); 208 h->chainsize, stk->nelems);
211 if (!sa->cmd_sg_list && h->chainsize > 0) 209 if (!sa->cmd_sg_list && h->chainsize > 0)
212 return -ENOMEM; 210 return -ENOMEM;
213 211
214 stk = &sa->cmd_stack; 212 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
215 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
216 213
217 /* Check alignment, see cciss_cmd.h near CommandList_struct def. */ 214 /* Check alignment, see cciss_cmd.h near CommandList_struct def. */
218 BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); 215 BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
@@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
221 pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); 218 pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle);
222 219
223 if (stk->pool == NULL) { 220 if (stk->pool == NULL) {
224 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); 221 cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
225 sa->cmd_sg_list = NULL; 222 sa->cmd_sg_list = NULL;
226 return -ENOMEM; 223 return -ENOMEM;
227 } 224 }
228 225 stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL);
229 for (i=0; i<CMD_STACK_SIZE; i++) { 226 if (!stk->elem) {
227 pci_free_consistent(h->pdev, size, stk->pool,
228 stk->cmd_pool_handle);
229 return -1;
230 }
231 for (i = 0; i < stk->nelems; i++) {
230 stk->elem[i] = &stk->pool[i]; 232 stk->elem[i] = &stk->pool[i];
231 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 233 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
232 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); 234 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
233 stk->elem[i]->cmdindex = i; 235 stk->elem[i]->cmdindex = i;
234 } 236 }
235 stk->top = CMD_STACK_SIZE-1; 237 stk->top = stk->nelems-1;
236 return 0; 238 return 0;
237} 239}
238 240
@@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h)
245 247
246 sa = h->scsi_ctlr; 248 sa = h->scsi_ctlr;
247 stk = &sa->cmd_stack; 249 stk = &sa->cmd_stack;
248 if (stk->top != CMD_STACK_SIZE-1) { 250 if (stk->top != stk->nelems-1) {
249 dev_warn(&h->pdev->dev, 251 dev_warn(&h->pdev->dev,
250 "bug: %d scsi commands are still outstanding.\n", 252 "bug: %d scsi commands are still outstanding.\n",
251 CMD_STACK_SIZE - stk->top); 253 stk->nelems - stk->top);
252 } 254 }
253 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; 255 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
254 256
255 pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); 257 pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle);
256 stk->pool = NULL; 258 stk->pool = NULL;
257 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); 259 cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
260 kfree(stk->elem);
261 stk->elem = NULL;
258} 262}
259 263
260#if 0 264#if 0
@@ -824,13 +828,18 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
824 break; 828 break;
825 case CMD_UNSOLICITED_ABORT: 829 case CMD_UNSOLICITED_ABORT:
826 cmd->result = DID_ABORT << 16; 830 cmd->result = DID_ABORT << 16;
827 dev_warn(&h->pdev->dev, "%p aborted do to an " 831 dev_warn(&h->pdev->dev, "%p aborted due to an "
828 "unsolicited abort\n", c); 832 "unsolicited abort\n", c);
829 break; 833 break;
830 case CMD_TIMEOUT: 834 case CMD_TIMEOUT:
831 cmd->result = DID_TIME_OUT << 16; 835 cmd->result = DID_TIME_OUT << 16;
832 dev_warn(&h->pdev->dev, "%p timedout\n", c); 836 dev_warn(&h->pdev->dev, "%p timedout\n", c);
833 break; 837 break;
838 case CMD_UNABORTABLE:
839 cmd->result = DID_ERROR << 16;
840 dev_warn(&h->pdev->dev, "c %p command "
841 "unabortable\n", c);
842 break;
834 default: 843 default:
835 cmd->result = DID_ERROR << 16; 844 cmd->result = DID_ERROR << 16;
836 dev_warn(&h->pdev->dev, 845 dev_warn(&h->pdev->dev,
@@ -854,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h)
854 sh->io_port = 0; // good enough? FIXME, 863 sh->io_port = 0; // good enough? FIXME,
855 sh->n_io_port = 0; // I don't think we use these two... 864 sh->n_io_port = 0; // I don't think we use these two...
856 sh->this_id = SELF_SCSI_ID; 865 sh->this_id = SELF_SCSI_ID;
866 sh->can_queue = cciss_tape_cmds;
857 sh->sg_tablesize = h->maxsgentries; 867 sh->sg_tablesize = h->maxsgentries;
858 sh->max_cmd_len = MAX_COMMAND_SIZE; 868 sh->max_cmd_len = MAX_COMMAND_SIZE;
859 869
@@ -1007,11 +1017,15 @@ cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
1007 break; 1017 break;
1008 case CMD_UNSOLICITED_ABORT: 1018 case CMD_UNSOLICITED_ABORT:
1009 dev_warn(&h->pdev->dev, 1019 dev_warn(&h->pdev->dev,
1010 "%p aborted do to an unsolicited abort\n", c); 1020 "%p aborted due to an unsolicited abort\n", c);
1011 break; 1021 break;
1012 case CMD_TIMEOUT: 1022 case CMD_TIMEOUT:
1013 dev_warn(&h->pdev->dev, "%p timedout\n", c); 1023 dev_warn(&h->pdev->dev, "%p timedout\n", c);
1014 break; 1024 break;
1025 case CMD_UNABORTABLE:
1026 dev_warn(&h->pdev->dev,
1027 "%p unabortable\n", c);
1028 break;
1015 default: 1029 default:
1016 dev_warn(&h->pdev->dev, 1030 dev_warn(&h->pdev->dev,
1017 "%p returned unknown status %x\n", 1031 "%p returned unknown status %x\n",
@@ -1406,7 +1420,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
1406 1420
1407 1421
1408static int 1422static int
1409cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *)) 1423cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
1410{ 1424{
1411 ctlr_info_t *h; 1425 ctlr_info_t *h;
1412 int rc; 1426 int rc;
@@ -1504,6 +1518,8 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
1504 return 0; 1518 return 0;
1505} 1519}
1506 1520
1521static DEF_SCSI_QCMD(cciss_scsi_queue_command)
1522
1507static void cciss_unregister_scsi(ctlr_info_t *h) 1523static void cciss_unregister_scsi(ctlr_info_t *h)
1508{ 1524{
1509 struct cciss_scsi_adapter_data_t *sa; 1525 struct cciss_scsi_adapter_data_t *sa;
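cciss_scsi.c above replaces the fixed CMD_STACK_SIZE with a command stack sized from the cciss_tape_cmds module parameter, which cciss_init_one() clamps to the range 2..16 before use. A compact user-space model of that sizing, where plain integers stand in for the real pool entries:

#include <stdio.h>
#include <stdlib.h>

/* Clamp mirrors cciss_init_one() above. */
static int clamp_tape_cmds(int requested)
{
	if (requested < 2)
		return 2;
	if (requested > 16)
		return 16;
	return requested;
}

/* Minimal model of the now dynamically sized command stack: nelems is
 * cciss_tape_cmds + 2 (the "+ 2" covers init-time usage) and the stack
 * starts full, with top == nelems - 1. */
struct cmd_stack {
	int *elem;	/* stands in for the array of command pointers */
	int nelems;
	int top;
};

static int cmd_stack_setup(struct cmd_stack *stk, int tape_cmds)
{
	int i;

	stk->nelems = clamp_tape_cmds(tape_cmds) + 2;
	stk->elem = malloc(sizeof(*stk->elem) * stk->nelems);
	if (!stk->elem)
		return -1;
	for (i = 0; i < stk->nelems; i++)
		stk->elem[i] = i;	/* pretend these are pool entries */
	stk->top = stk->nelems - 1;
	return 0;
}

int main(void)
{
	struct cmd_stack stk;

	if (cmd_stack_setup(&stk, 64))	/* module parameter asked for 64 */
		return 1;
	printf("nelems = %d, top = %d\n", stk.nelems, stk.top);
	free(stk.elem);
	return 0;
}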
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
index 6d5822fe851a..e71d986727ca 100644
--- a/drivers/block/cciss_scsi.h
+++ b/drivers/block/cciss_scsi.h
@@ -36,13 +36,9 @@
36 addressable natively, and may in fact turn 36

37 out to be not scsi at all. */ 37 out to be not scsi at all. */
38 38
39#define SCSI_CCISS_CAN_QUEUE 2
40 39
41/* 40/*
42 41
43Note, cmd_per_lun could give us some trouble, so I'm setting it very low.
44Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively.
45
46If the upper scsi layer tries to track how many commands we have 42If the upper scsi layer tries to track how many commands we have
47outstanding, it will be operating under the misapprehension that it is 43outstanding, it will be operating under the misapprehension that it is
48the only one sending us requests. We also have the block interface, 44the only one sending us requests. We also have the block interface,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index d53b0291c44b..b2fceb53e809 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -35,7 +35,7 @@
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/hdreg.h> 37#include <linux/hdreg.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/spinlock.h> 39#include <linux/spinlock.h>
40#include <linux/blkdev.h> 40#include <linux/blkdev.h>
41#include <linux/genhd.h> 41#include <linux/genhd.h>
@@ -68,6 +68,7 @@ MODULE_LICENSE("GPL");
68 68
69#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */ 69#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */
70 70
71static DEFINE_MUTEX(cpqarray_mutex);
71static int nr_ctlr; 72static int nr_ctlr;
72static ctlr_info_t *hba[MAX_CTLR]; 73static ctlr_info_t *hba[MAX_CTLR];
73 74
@@ -845,9 +846,9 @@ static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
845{ 846{
846 int ret; 847 int ret;
847 848
848 lock_kernel(); 849 mutex_lock(&cpqarray_mutex);
849 ret = ida_open(bdev, mode); 850 ret = ida_open(bdev, mode);
850 unlock_kernel(); 851 mutex_unlock(&cpqarray_mutex);
851 852
852 return ret; 853 return ret;
853} 854}
@@ -859,10 +860,10 @@ static int ida_release(struct gendisk *disk, fmode_t mode)
859{ 860{
860 ctlr_info_t *host; 861 ctlr_info_t *host;
861 862
862 lock_kernel(); 863 mutex_lock(&cpqarray_mutex);
863 host = get_host(disk); 864 host = get_host(disk);
864 host->usage_count--; 865 host->usage_count--;
865 unlock_kernel(); 866 mutex_unlock(&cpqarray_mutex);
866 867
867 return 0; 868 return 0;
868} 869}
@@ -910,9 +911,6 @@ static void do_ida_request(struct request_queue *q)
910 struct scatterlist tmp_sg[SG_MAX]; 911 struct scatterlist tmp_sg[SG_MAX];
911 int i, dir, seg; 912 int i, dir, seg;
912 913
913 if (blk_queue_plugged(q))
914 goto startio;
915
916queue_next: 914queue_next:
917 creq = blk_peek_request(q); 915 creq = blk_peek_request(q);
918 if (!creq) 916 if (!creq)
@@ -1217,9 +1215,9 @@ static int ida_ioctl(struct block_device *bdev, fmode_t mode,
1217{ 1215{
1218 int ret; 1216 int ret;
1219 1217
1220 lock_kernel(); 1218 mutex_lock(&cpqarray_mutex);
1221 ret = ida_locked_ioctl(bdev, mode, cmd, param); 1219 ret = ida_locked_ioctl(bdev, mode, cmd, param);
1222 unlock_kernel(); 1220 mutex_unlock(&cpqarray_mutex);
1223 1221
1224 return ret; 1222 return ret;
1225} 1223}
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9400845d602e..cf0e63dd97da 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -28,9 +28,9 @@
28#include "drbd_int.h" 28#include "drbd_int.h"
29#include "drbd_wrappers.h" 29#include "drbd_wrappers.h"
30 30
31/* We maintain a trivial check sum in our on disk activity log. 31/* We maintain a trivial checksum in our on disk activity log.
32 * With that we can ensure correct operation even when the storage 32 * With that we can ensure correct operation even when the storage
33 * device might do a partial (last) sector write while loosing power. 33 * device might do a partial (last) sector write while losing power.
34 */ 34 */
35struct __packed al_transaction { 35struct __packed al_transaction {
36 u32 magic; 36 u32 magic;
@@ -78,11 +78,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
78 init_completion(&md_io.event); 78 init_completion(&md_io.event);
79 md_io.error = 0; 79 md_io.error = 0;
80 80
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) 81 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
82 rw |= REQ_HARDBARRIER; 82 rw |= REQ_FUA | REQ_FLUSH;
83 rw |= REQ_UNPLUG | REQ_SYNC; 83 rw |= REQ_SYNC;
84 84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1); 85 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev; 86 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector; 87 bio->bi_sector = sector;
@@ -93,24 +92,13 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
93 bio->bi_end_io = drbd_md_io_complete; 92 bio->bi_end_io = drbd_md_io_complete;
94 bio->bi_rw = rw; 93 bio->bi_rw = rw;
95 94
96 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 95 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
97 bio_endio(bio, -EIO); 96 bio_endio(bio, -EIO);
98 else 97 else
99 submit_bio(rw, bio); 98 submit_bio(rw, bio);
100 wait_for_completion(&md_io.event); 99 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; 100 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102 101
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
106 if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~REQ_HARDBARRIER;
111 bio_put(bio);
112 goto retry;
113 }
114 out: 102 out:
115 bio_put(bio); 103 bio_put(bio);
116 return ok; 104 return ok;
@@ -188,13 +176,17 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
188 struct lc_element *al_ext; 176 struct lc_element *al_ext;
189 struct lc_element *tmp; 177 struct lc_element *tmp;
190 unsigned long al_flags = 0; 178 unsigned long al_flags = 0;
179 int wake;
191 180
192 spin_lock_irq(&mdev->al_lock); 181 spin_lock_irq(&mdev->al_lock);
193 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 182 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
194 if (unlikely(tmp != NULL)) { 183 if (unlikely(tmp != NULL)) {
195 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 184 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
196 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 185 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
186 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
197 spin_unlock_irq(&mdev->al_lock); 187 spin_unlock_irq(&mdev->al_lock);
188 if (wake)
189 wake_up(&mdev->al_wait);
198 return NULL; 190 return NULL;
199 } 191 }
200 } 192 }
@@ -270,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
270 spin_unlock_irqrestore(&mdev->al_lock, flags); 262 spin_unlock_irqrestore(&mdev->al_lock, flags);
271} 263}
272 264
265#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
266/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
267 * are still coupled, or assume too much about their relation.
268 * Code below will not work if this is violated.
269 * Will be cleaned up with some followup patch.
270 */
271# error FIXME
272#endif
273
274static unsigned int al_extent_to_bm_page(unsigned int al_enr)
275{
276 return al_enr >>
277 /* bit to page */
278 ((PAGE_SHIFT + 3) -
279 /* al extent number to bit */
280 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
281}
282
283static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
284{
285 return rs_enr >>
286 /* bit to page */
287 ((PAGE_SHIFT + 3) -
288 /* al extent number to bit */
289 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
290}
291
273int 292int
274w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) 293w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
275{ 294{
@@ -284,18 +303,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
284 u32 xor_sum = 0; 303 u32 xor_sum = 0;
285 304
286 if (!get_ldev(mdev)) { 305 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); 306 dev_err(DEV,
307 "disk is %s, cannot start al transaction (-%d +%d)\n",
308 drbd_disk_str(mdev->state.disk), evicted, new_enr);
288 complete(&((struct update_al_work *)w)->event); 309 complete(&((struct update_al_work *)w)->event);
289 return 1; 310 return 1;
290 } 311 }
291 /* do we have to do a bitmap write, first? 312 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency: 313 * TODO reduce maximum latency:
293 * submit both bios, then wait for both, 314 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */ 315 * instead of doing two synchronous sector writes.
316 * For now, we must not write the transaction,
317 * if we cannot write out the bitmap of the evicted extent. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) 318 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); 319 drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
320
321 /* The bitmap write may have failed, causing a state change. */
322 if (mdev->state.disk < D_INCONSISTENT) {
323 dev_err(DEV,
324 "disk is %s, cannot write al transaction (-%d +%d)\n",
325 drbd_disk_str(mdev->state.disk), evicted, new_enr);
326 complete(&((struct update_al_work *)w)->event);
327 put_ldev(mdev);
328 return 1;
329 }
297 330
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ 331 mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page); 332 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300 333
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); 334 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
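The new al_extent_to_bm_page() and rs_extent_to_bm_page() helpers above reduce an extent number to the bitmap page holding its bits with a single right shift. A stand-alone illustration of that arithmetic, assuming the usual 4 KiB page/bitmap-block size and 4 MiB AL extents (the real shift constants come from the drbd headers):

#include <stdio.h>

/* Assumed values for illustration. */
#define PAGE_SHIFT	12
#define BM_BLOCK_SHIFT	12
#define AL_EXTENT_SHIFT	22

/* Same arithmetic as al_extent_to_bm_page() above: one AL extent covers
 * 2^(AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) bitmap bits, and one page holds
 * 2^(PAGE_SHIFT + 3) bits, so dividing the two is one right shift. */
static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
	return al_enr >> ((PAGE_SHIFT + 3) - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}

int main(void)
{
	unsigned int enr;

	for (enr = 0; enr < 100; enr += 33)
		printf("AL extent %3u lives in bitmap page %u\n",
		       enr, al_extent_to_bm_page(enr));
	return 0;
}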
@@ -332,7 +365,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
332 + mdev->ldev->md.al_offset + mdev->al_tr_pos; 365 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
333 366
334 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) 367 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
335 drbd_chk_io_error(mdev, 1, TRUE); 368 drbd_chk_io_error(mdev, 1, true);
336 369
337 if (++mdev->al_tr_pos > 370 if (++mdev->al_tr_pos >
338 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) 371 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -509,227 +542,6 @@ cancel:
509 return 1; 542 return 1;
510} 543}
511 544
512static void atodb_endio(struct bio *bio, int error)
513{
514 struct drbd_atodb_wait *wc = bio->bi_private;
515 struct drbd_conf *mdev = wc->mdev;
516 struct page *page;
517 int uptodate = bio_flagged(bio, BIO_UPTODATE);
518
519 /* strange behavior of some lower level drivers...
520 * fail the request by clearing the uptodate flag,
521 * but do not return any error?! */
522 if (!error && !uptodate)
523 error = -EIO;
524
525 drbd_chk_io_error(mdev, error, TRUE);
526 if (error && wc->error == 0)
527 wc->error = error;
528
529 if (atomic_dec_and_test(&wc->count))
530 complete(&wc->io_done);
531
532 page = bio->bi_io_vec[0].bv_page;
533 put_page(page);
534 bio_put(bio);
535 mdev->bm_writ_cnt++;
536 put_ldev(mdev);
537}
538
539/* sector to word */
540#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
541
542/* activity log to on disk bitmap -- prepare bio unless that sector
543 * is already covered by previously prepared bios */
544static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
545 struct bio **bios,
546 unsigned int enr,
547 struct drbd_atodb_wait *wc) __must_hold(local)
548{
549 struct bio *bio;
550 struct page *page;
551 sector_t on_disk_sector;
552 unsigned int page_offset = PAGE_SIZE;
553 int offset;
554 int i = 0;
555 int err = -ENOMEM;
556
557 /* We always write aligned, full 4k blocks,
558 * so we can ignore the logical_block_size (for now) */
559 enr &= ~7U;
560 on_disk_sector = enr + mdev->ldev->md.md_offset
561 + mdev->ldev->md.bm_offset;
562
563 D_ASSERT(!(on_disk_sector & 7U));
564
565 /* Check if that enr is already covered by an already created bio.
566 * Caution, bios[] is not NULL terminated,
567 * but only initialized to all NULL.
568 * For completely scattered activity log,
569 * the last invocation iterates over all bios,
570 * and finds the last NULL entry.
571 */
572 while ((bio = bios[i])) {
573 if (bio->bi_sector == on_disk_sector)
574 return 0;
575 i++;
576 }
577 /* bios[i] == NULL, the next not yet used slot */
578
579 /* GFP_KERNEL, we are not in the write-out path */
580 bio = bio_alloc(GFP_KERNEL, 1);
581 if (bio == NULL)
582 return -ENOMEM;
583
584 if (i > 0) {
585 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
586 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
587 page = prev_bv->bv_page;
588 }
589 if (page_offset == PAGE_SIZE) {
590 page = alloc_page(__GFP_HIGHMEM);
591 if (page == NULL)
592 goto out_bio_put;
593 page_offset = 0;
594 } else {
595 get_page(page);
596 }
597
598 offset = S2W(enr);
599 drbd_bm_get_lel(mdev, offset,
600 min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
601 kmap(page) + page_offset);
602 kunmap(page);
603
604 bio->bi_private = wc;
605 bio->bi_end_io = atodb_endio;
606 bio->bi_bdev = mdev->ldev->md_bdev;
607 bio->bi_sector = on_disk_sector;
608
609 if (bio_add_page(bio, page, 4096, page_offset) != 4096)
610 goto out_put_page;
611
612 atomic_inc(&wc->count);
613 /* we already know that we may do this...
614 * get_ldev_if_state(mdev,D_ATTACHING);
615 * just get the extra reference, so that the local_cnt reflects
616 * the number of pending IO requests DRBD at its backing device.
617 */
618 atomic_inc(&mdev->local_cnt);
619
620 bios[i] = bio;
621
622 return 0;
623
624out_put_page:
625 err = -EINVAL;
626 put_page(page);
627out_bio_put:
628 bio_put(bio);
629 return err;
630}
631
632/**
633 * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
634 * @mdev: DRBD device.
635 *
636 * Called when we detach (unconfigure) local storage,
637 * or when we go from R_PRIMARY to R_SECONDARY role.
638 */
639void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
640{
641 int i, nr_elements;
642 unsigned int enr;
643 struct bio **bios;
644 struct drbd_atodb_wait wc;
645
646 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
647 return; /* sorry, I don't have any act_log etc... */
648
649 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
650
651 nr_elements = mdev->act_log->nr_elements;
652
653 /* GFP_KERNEL, we are not in anyone's write-out path */
654 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
655 if (!bios)
656 goto submit_one_by_one;
657
658 atomic_set(&wc.count, 0);
659 init_completion(&wc.io_done);
660 wc.mdev = mdev;
661 wc.error = 0;
662
663 for (i = 0; i < nr_elements; i++) {
664 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
665 if (enr == LC_FREE)
666 continue;
667 /* next statement also does atomic_inc wc.count and local_cnt */
668 if (atodb_prepare_unless_covered(mdev, bios,
669 enr/AL_EXT_PER_BM_SECT,
670 &wc))
671 goto free_bios_submit_one_by_one;
672 }
673
674 /* unnecessary optimization? */
675 lc_unlock(mdev->act_log);
676 wake_up(&mdev->al_wait);
677
678 /* all prepared, submit them */
679 for (i = 0; i < nr_elements; i++) {
680 if (bios[i] == NULL)
681 break;
682 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
683 bios[i]->bi_rw = WRITE;
684 bio_endio(bios[i], -EIO);
685 } else {
686 submit_bio(WRITE, bios[i]);
687 }
688 }
689
690 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
691
692 /* always (try to) flush bitmap to stable storage */
693 drbd_md_flush(mdev);
694
695 /* In case we did not submit a single IO do not wait for
696 * them to complete. ( Because we would wait forever here. )
697 *
698 * In case we had IOs and they are already complete, there
699 * is not point in waiting anyways.
700 * Therefore this if () ... */
701 if (atomic_read(&wc.count))
702 wait_for_completion(&wc.io_done);
703
704 put_ldev(mdev);
705
706 kfree(bios);
707 return;
708
709 free_bios_submit_one_by_one:
710 /* free everything by calling the endio callback directly. */
711 for (i = 0; i < nr_elements && bios[i]; i++)
712 bio_endio(bios[i], 0);
713
714 kfree(bios);
715
716 submit_one_by_one:
717 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
718
719 for (i = 0; i < mdev->act_log->nr_elements; i++) {
720 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
721 if (enr == LC_FREE)
722 continue;
723 /* Really slow: if we have al-extents 16..19 active,
724 * sector 4 will be written four times! Synchronous! */
725 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
726 }
727
728 lc_unlock(mdev->act_log);
729 wake_up(&mdev->al_wait);
730 put_ldev(mdev);
731}
732
733/** 545/**
734 * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents 546
735 * @mdev: DRBD device. 547 * @mdev: DRBD device.
@@ -739,7 +551,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
739 unsigned int enr; 551 unsigned int enr;
740 unsigned long add = 0; 552 unsigned long add = 0;
741 char ppb[10]; 553 char ppb[10];
742 int i; 554 int i, tmp;
743 555
744 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 556 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
745 557
@@ -747,7 +559,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
747 enr = lc_element_by_index(mdev->act_log, i)->lc_number; 559 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
748 if (enr == LC_FREE) 560 if (enr == LC_FREE)
749 continue; 561 continue;
750 add += drbd_bm_ALe_set_all(mdev, enr); 562 tmp = drbd_bm_ALe_set_all(mdev, enr);
563 dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
564 add += tmp;
751 } 565 }
752 566
753 lc_unlock(mdev->act_log); 567 lc_unlock(mdev->act_log);
@@ -807,7 +621,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
807 return 1; 621 return 1;
808 } 622 }
809 623
810 drbd_bm_write_sect(mdev, udw->enr); 624 drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
811 put_ldev(mdev); 625 put_ldev(mdev);
812 626
813 kfree(udw); 627 kfree(udw);
@@ -887,7 +701,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
887 dev_warn(DEV, "Kicking resync_lru element enr=%u " 701 dev_warn(DEV, "Kicking resync_lru element enr=%u "
888 "out with rs_failed=%d\n", 702 "out with rs_failed=%d\n",
889 ext->lce.lc_number, ext->rs_failed); 703 ext->lce.lc_number, ext->rs_failed);
890 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
891 } 704 }
892 ext->rs_left = rs_left; 705 ext->rs_left = rs_left;
893 ext->rs_failed = success ? 0 : count; 706 ext->rs_failed = success ? 0 : count;
@@ -906,7 +719,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
906 drbd_queue_work_front(&mdev->data.work, &udw->w); 719 drbd_queue_work_front(&mdev->data.work, &udw->w);
907 } else { 720 } else {
908 dev_warn(DEV, "Could not kmalloc an udw\n"); 721 dev_warn(DEV, "Could not kmalloc an udw\n");
909 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
910 } 722 }
911 } 723 }
912 } else { 724 } else {
@@ -917,6 +729,22 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
917 } 729 }
918} 730}
919 731
732void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
733{
734 unsigned long now = jiffies;
735 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
736 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
737 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
738 if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
739 mdev->state.conn != C_PAUSED_SYNC_T &&
740 mdev->state.conn != C_PAUSED_SYNC_S) {
741 mdev->rs_mark_time[next] = now;
742 mdev->rs_mark_left[next] = still_to_go;
743 mdev->rs_last_mark = next;
744 }
745 }
746}
747
920/* clear the bit corresponding to the piece of storage in question: 748/* clear the bit corresponding to the piece of storage in question:
921 * size bytes of data starting from sector. Only clear the bits of the affected 749
922 * one or more _aligned_ BM_BLOCK_SIZE blocks. 750
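drbd_advance_rs_marks() above turns the old single rs_mark_time/rs_mark_left pair into a small ring of samples, advanced at most once per mark step, so the resync-speed estimate becomes a rolling window rather than an average since the start of the resync. A stand-alone sketch of that ring, with assumed values for the mark count and step:

#include <stdio.h>

/* Assumed values for illustration; the driver defines DRBD_SYNC_MARKS
 * and DRBD_SYNC_MARK_STEP in its headers. */
#define SYNC_MARKS	8
#define MARK_STEP	3	/* seconds between samples */

static unsigned long mark_left[SYNC_MARKS];
static unsigned long mark_time[SYNC_MARKS];
static int last_mark;

/* Keep a ring of (timestamp, blocks-still-out-of-sync) samples,
 * advancing at most once per MARK_STEP and only when progress was made. */
static void advance_rs_marks(unsigned long now, unsigned long still_to_go)
{
	int next = (last_mark + 1) % SYNC_MARKS;

	if (now >= mark_time[last_mark] + MARK_STEP &&
	    mark_left[last_mark] != still_to_go) {
		mark_time[next] = now;
		mark_left[next] = still_to_go;
		last_mark = next;
	}
}

int main(void)
{
	unsigned long t, left = 100000;

	for (t = 0; t < 30; t++, left -= 2000)
		advance_rs_marks(t, left);
	printf("newest mark: t=%lu left=%lu\n",
	       mark_time[last_mark], mark_left[last_mark]);
	printf("oldest kept: t=%lu left=%lu\n",
	       mark_time[(last_mark + 1) % SYNC_MARKS],
	       mark_left[(last_mark + 1) % SYNC_MARKS]);
	return 0;
}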
@@ -934,7 +762,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
934 int wake_up = 0; 762 int wake_up = 0;
935 unsigned long flags; 763 unsigned long flags;
936 764
937 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 765 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
938 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 766 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
939 (unsigned long long)sector, size); 767 (unsigned long long)sector, size);
940 return; 768 return;
@@ -965,29 +793,18 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
965 * ok, (capacity & 7) != 0 sometimes, but who cares... 793 * ok, (capacity & 7) != 0 sometimes, but who cares...
966 * we count rs_{total,left} in bits, not sectors. 794 * we count rs_{total,left} in bits, not sectors.
967 */ 795 */
968 spin_lock_irqsave(&mdev->al_lock, flags);
969 count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 796 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
970 if (count) { 797 if (count && get_ldev(mdev)) {
971 /* we need the lock for drbd_try_clear_on_disk_bm */ 798 drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
972 if (jiffies - mdev->rs_mark_time > HZ*10) { 799 spin_lock_irqsave(&mdev->al_lock, flags);
973 /* should be rolling marks, 800 drbd_try_clear_on_disk_bm(mdev, sector, count, true);
974 * but we estimate only anyways. */ 801 spin_unlock_irqrestore(&mdev->al_lock, flags);
975 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && 802
976 mdev->state.conn != C_PAUSED_SYNC_T &&
977 mdev->state.conn != C_PAUSED_SYNC_S) {
978 mdev->rs_mark_time = jiffies;
979 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
980 }
981 }
982 if (get_ldev(mdev)) {
983 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
984 put_ldev(mdev);
985 }
986 /* just wake_up unconditionally now, various lc_changed(), 803
987 * lc_put() in drbd_try_clear_on_disk_bm(). */ 804 * lc_put() in drbd_try_clear_on_disk_bm(). */
988 wake_up = 1; 805 wake_up = 1;
806 put_ldev(mdev);
989 } 807 }
990 spin_unlock_irqrestore(&mdev->al_lock, flags);
991 if (wake_up) 808 if (wake_up)
992 wake_up(&mdev->al_wait); 809 wake_up(&mdev->al_wait);
993} 810}
@@ -995,27 +812,27 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
995/* 812/*
996 * this is intended to set one request worth of data out of sync. 813 * this is intended to set one request worth of data out of sync.
997 * affects at least 1 bit, 814 * affects at least 1 bit,
998 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. 815 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
999 * 816 *
1000 * called by tl_clear and drbd_send_dblock (==drbd_make_request). 817 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
1001 * so this can be _any_ process. 818 * so this can be _any_ process.
1002 */ 819 */
1003void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, 820int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1004 const char *file, const unsigned int line) 821 const char *file, const unsigned int line)
1005{ 822{
1006 unsigned long sbnr, ebnr, lbnr, flags; 823 unsigned long sbnr, ebnr, lbnr, flags;
1007 sector_t esector, nr_sectors; 824 sector_t esector, nr_sectors;
1008 unsigned int enr, count; 825 unsigned int enr, count = 0;
1009 struct lc_element *e; 826 struct lc_element *e;
1010 827
1011 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 828 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
1012 dev_err(DEV, "sector: %llus, size: %d\n", 829 dev_err(DEV, "sector: %llus, size: %d\n",
1013 (unsigned long long)sector, size); 830 (unsigned long long)sector, size);
1014 return; 831 return 0;
1015 } 832 }
1016 833
1017 if (!get_ldev(mdev)) 834 if (!get_ldev(mdev))
1018 return; /* no disk, no metadata, no bitmap to set bits in */ 835 return 0; /* no disk, no metadata, no bitmap to set bits in */
1019 836
1020 nr_sectors = drbd_get_capacity(mdev->this_bdev); 837 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1021 esector = sector + (size >> 9) - 1; 838 esector = sector + (size >> 9) - 1;
@@ -1045,6 +862,8 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1045 862
1046out: 863out:
1047 put_ldev(mdev); 864 put_ldev(mdev);
865
866 return count;
1048} 867}
1049 868
1050static 869static
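Both the set-in-sync and set-out-of-sync paths first translate the (sector, size) request into an inclusive range of bitmap bit numbers; as the comment above notes, a set-out-of-sync request touches at least one bit and at most 1 + DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. A minimal userspace sketch of that conversion, assuming 512-byte sectors and one bitmap bit per 4 KiB block as described in the bitmap limitations comment; the constant names are illustrative, not the driver's:

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SIZE   512
#define BM_BLOCK_SIZE 4096                              /* one bitmap bit covers 4 KiB */
#define SECT_PER_BIT  (BM_BLOCK_SIZE / SECTOR_SIZE)     /* 8 sectors per bit */

/* inclusive range of bits touched when marking [sector, sector + size) out of sync */
static void out_of_sync_range(uint64_t sector, unsigned int size,
                              uint64_t *sbnr, uint64_t *ebnr)
{
    uint64_t esector = sector + size / SECTOR_SIZE - 1; /* last sector, inclusive */

    *sbnr = sector / SECT_PER_BIT;      /* round outward: any overlap marks the bit */
    *ebnr = esector / SECT_PER_BIT;
}

int main(void)
{
    uint64_t s, e;

    out_of_sync_range(8, 512, &s, &e);                  /* one sector, bit-aligned */
    printf("512 B -> bits %llu..%llu (%llu bit)\n",
           (unsigned long long)s, (unsigned long long)e,
           (unsigned long long)(e - s + 1));

    out_of_sync_range(7, 4096, &s, &e);                 /* 4 KiB straddling a bit boundary */
    printf("4 KiB -> bits %llu..%llu (%llu bits)\n",
           (unsigned long long)s, (unsigned long long)e,
           (unsigned long long)(e - s + 1));
    return 0;
}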
@@ -1118,40 +937,50 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1118 * @mdev: DRBD device. 937 * @mdev: DRBD device.
1119 * @sector: The sector number. 938 * @sector: The sector number.
1120 * 939 *
 1121 * This function sleeps on al_wait. Returns 1 on success, 0 if interrupted. 940 * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
1122 */ 941 */
1123int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 942int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1124{ 943{
1125 unsigned int enr = BM_SECT_TO_EXT(sector); 944 unsigned int enr = BM_SECT_TO_EXT(sector);
1126 struct bm_extent *bm_ext; 945 struct bm_extent *bm_ext;
1127 int i, sig; 946 int i, sig;
947 int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
948 200 times -> 20 seconds. */
1128 949
950retry:
1129 sig = wait_event_interruptible(mdev->al_wait, 951 sig = wait_event_interruptible(mdev->al_wait,
1130 (bm_ext = _bme_get(mdev, enr))); 952 (bm_ext = _bme_get(mdev, enr)));
1131 if (sig) 953 if (sig)
1132 return 0; 954 return -EINTR;
1133 955
1134 if (test_bit(BME_LOCKED, &bm_ext->flags)) 956 if (test_bit(BME_LOCKED, &bm_ext->flags))
1135 return 1; 957 return 0;
1136 958
1137 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 959 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1138 sig = wait_event_interruptible(mdev->al_wait, 960 sig = wait_event_interruptible(mdev->al_wait,
1139 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); 961 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
1140 if (sig) { 962 test_bit(BME_PRIORITY, &bm_ext->flags));
963
964 if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
1141 spin_lock_irq(&mdev->al_lock); 965 spin_lock_irq(&mdev->al_lock);
1142 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 966 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1143 clear_bit(BME_NO_WRITES, &bm_ext->flags); 967 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
1144 mdev->resync_locked--; 968 mdev->resync_locked--;
1145 wake_up(&mdev->al_wait); 969 wake_up(&mdev->al_wait);
1146 } 970 }
1147 spin_unlock_irq(&mdev->al_lock); 971 spin_unlock_irq(&mdev->al_lock);
1148 return 0; 972 if (sig)
973 return -EINTR;
974 if (schedule_timeout_interruptible(HZ/10))
975 return -EINTR;
976 if (sa && --sa == 0)
 977 dev_warn(DEV, "drbd_rs_begin_io() stepped aside for 20sec."
 978 " Resync stalled?\n");
979 goto retry;
1149 } 980 }
1150 } 981 }
1151
1152 set_bit(BME_LOCKED, &bm_ext->flags); 982 set_bit(BME_LOCKED, &bm_ext->flags);
1153 983 return 0;
1154 return 1;
1155} 984}
1156 985
1157/** 986/**
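The reworked drbd_rs_begin_io() above no longer starves application writers: when BME_PRIORITY is set it drops its reference, sleeps for about HZ/10, and retries, stepping aside at most 200 times (roughly 20 seconds) before it warns and takes the extent anyway. Below is a control-flow sketch of that back-off loop in plain C; the extent structure and the app_io_pending() predicate are stand-ins for the driver's lru extent and BME_PRIORITY flag, not real API.

#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>

struct extent { bool app_io_waiting; };          /* stand-in for BME_PRIORITY */

/* simulated: the application writer keeps priority raised for the first 5 polls */
static bool app_io_pending(const struct extent *e, int iteration)
{
    return e->app_io_waiting && iteration < 5;
}

static int rs_begin_io(struct extent *e)
{
    int sa = 200;                /* step aside at most 200 times, ~20 s in the driver */
    int iteration = 0;

    for (;;) {
        if (!app_io_pending(e, iteration++))
            return 0;            /* extent is ours, resync IO may start */

        /* application IO wants this extent first: drop it and back off briefly */
        usleep(1000);            /* the driver sleeps HZ/10 (~100 ms); shortened here */
        if (sa && --sa == 0) {
            fprintf(stderr, "stepped aside long enough, taking the extent anyway\n");
            return 0;
        }
    }
}

int main(void)
{
    struct extent e = { .app_io_waiting = true };

    if (rs_begin_io(&e) == 0)
        printf("resync IO may proceed\n");
    return 0;
}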
@@ -1290,8 +1119,7 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1290 } 1119 }
1291 1120
1292 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 1121 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1293 clear_bit(BME_LOCKED, &bm_ext->flags); 1122 bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
1294 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1295 mdev->resync_locked--; 1123 mdev->resync_locked--;
1296 wake_up(&mdev->al_wait); 1124 wake_up(&mdev->al_wait);
1297 } 1125 }
@@ -1382,7 +1210,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1382 sector_t esector, nr_sectors; 1210 sector_t esector, nr_sectors;
1383 int wake_up = 0; 1211 int wake_up = 0;
1384 1212
1385 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1213 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
1386 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1214 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1387 (unsigned long long)sector, size); 1215 (unsigned long long)sector, size);
1388 return; 1216 return;
@@ -1419,7 +1247,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1419 mdev->rs_failed += count; 1247 mdev->rs_failed += count;
1420 1248
1421 if (get_ldev(mdev)) { 1249 if (get_ldev(mdev)) {
1422 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); 1250 drbd_try_clear_on_disk_bm(mdev, sector, count, false);
1423 put_ldev(mdev); 1251 put_ldev(mdev);
1424 } 1252 }
1425 1253
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e3f88d6e1412..7b976296b564 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -28,18 +28,56 @@
28#include <linux/drbd.h> 28#include <linux/drbd.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
31
31#include "drbd_int.h" 32#include "drbd_int.h"
32 33
34
33/* OPAQUE outside this file! 35/* OPAQUE outside this file!
34 * interface defined in drbd_int.h 36 * interface defined in drbd_int.h
35 37
36 * convention: 38 * convention:
37 * function name drbd_bm_... => used elsewhere, "public". 39 * function name drbd_bm_... => used elsewhere, "public".
38 * function name bm_... => internal to implementation, "private". 40 * function name bm_... => internal to implementation, "private".
41 */
42
43
44/*
45 * LIMITATIONS:
 46 * We want to support a petabyte or more of backend storage, while for now still using
47 * a granularity of one bit per 4KiB of storage.
48 * 1 << 50 bytes backend storage (1 PiB)
49 * 1 << (50 - 12) bits needed
50 * 38 --> we need u64 to index and count bits
51 * 1 << (38 - 3) bitmap bytes needed
52 * 35 --> we still need u64 to index and count bytes
53 * (that's 32 GiB of bitmap for 1 PiB storage)
54 * 1 << (35 - 2) 32bit longs needed
55 * 33 --> we'd even need u64 to index and count 32bit long words.
56 * 1 << (35 - 3) 64bit longs needed
57 * 32 --> we could get away with a 32bit unsigned int to index and count
58 * 64bit long words, but I rather stay with unsigned long for now.
59 * We probably should neither count nor point to bytes or long words
60 * directly, but either by bitnumber, or by page index and offset.
61 * 1 << (35 - 12)
 62 * 23 --> we need that many 4KiB pages of bitmap.
 63 * 1 << (23 + 3) --> on a 64bit arch,
 64 * we need 64 MiB to store the array of page pointers.
65 *
66 * Because I'm lazy, and because the resulting patch was too large, too ugly
67 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
68 * (1 << 32) bits * 4k storage.
69 *
39 70
40 * Note that since find_first_bit returns int, at the current granularity of 71 * bitmap storage and IO:
41 * the bitmap (4KB per byte), this implementation "only" supports up to 72 * Bitmap is stored little endian on disk, and is kept little endian in
42 * 1<<(32+12) == 16 TB... 73 * core memory. Currently we still hold the full bitmap in core as long
74 * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
75 * seems excessive.
76 *
77 * We plan to reduce the amount of in-core bitmap pages by paging them in
78 * and out against their on-disk location as necessary, but need to make
79 * sure we don't cause too much meta data IO, and must not deadlock in
80 * tight memory situations. This needs some more work.
43 */ 81 */
44 82
45/* 83/*
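A quick userspace check of the sizing arithmetic in the limitations comment above, at one bitmap bit per 4 KiB of storage (the variable names are illustrative): 1 PiB of backend needs 2^38 bits, 2^35 bytes (32 GiB) of bitmap, 2^23 4-KiB bitmap pages, and a 64 MiB page-pointer array on a 64-bit host.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const uint64_t storage_bytes = 1ULL << 50;             /* 1 PiB backend */
    const uint64_t bits          = storage_bytes >> 12;    /* one bit per 4 KiB */
    const uint64_t bitmap_bytes  = bits >> 3;
    const uint64_t words_64      = bitmap_bytes >> 3;
    const uint64_t pages         = bitmap_bytes >> 12;     /* 4 KiB bitmap pages */
    const uint64_t ptr_array     = pages * sizeof(void *); /* pointer array, 64-bit host */

    printf("bits needed    : %llu (2^38)\n", (unsigned long long)bits);
    printf("bitmap size    : %llu GiB\n",    (unsigned long long)(bitmap_bytes >> 30));
    printf("64-bit words   : %llu\n",        (unsigned long long)words_64);
    printf("bitmap pages   : %llu (2^23)\n", (unsigned long long)pages);
    printf("pointer array  : %llu MiB\n",    (unsigned long long)(ptr_array >> 20));
    return 0;
}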
@@ -55,13 +93,9 @@
55struct drbd_bitmap { 93struct drbd_bitmap {
56 struct page **bm_pages; 94 struct page **bm_pages;
57 spinlock_t bm_lock; 95 spinlock_t bm_lock;
58 /* WARNING unsigned long bm_*: 96
59 * 32bit number of bit offset is just enough for 512 MB bitmap. 97 /* see LIMITATIONS: above */
60 * it will blow up if we make the bitmap bigger... 98
61 * not that it makes much sense to have a bitmap that large,
62 * rather change the granularity to 16k or 64k or something.
63 * (that implies other problems, however...)
64 */
65 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ 99 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
66 unsigned long bm_bits; 100 unsigned long bm_bits;
67 size_t bm_words; 101 size_t bm_words;
@@ -69,29 +103,15 @@ struct drbd_bitmap {
69 sector_t bm_dev_capacity; 103 sector_t bm_dev_capacity;
70 struct mutex bm_change; /* serializes resize operations */ 104 struct mutex bm_change; /* serializes resize operations */
71 105
72 atomic_t bm_async_io; 106 wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
73 wait_queue_head_t bm_io_wait;
74 107
75 unsigned long bm_flags; 108 enum bm_flag bm_flags;
76 109
77 /* debugging aid, in case we are still racy somewhere */ 110 /* debugging aid, in case we are still racy somewhere */
78 char *bm_why; 111 char *bm_why;
79 struct task_struct *bm_task; 112 struct task_struct *bm_task;
80}; 113};
81 114
82/* definition of bits in bm_flags */
83#define BM_LOCKED 0
84#define BM_MD_IO_ERROR 1
85#define BM_P_VMALLOCED 2
86
87static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
88 unsigned long e, int val, const enum km_type km);
89
90static int bm_is_locked(struct drbd_bitmap *b)
91{
92 return test_bit(BM_LOCKED, &b->bm_flags);
93}
94
95#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) 115#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
96static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) 116static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
97{ 117{
@@ -108,7 +128,7 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
108 b->bm_task == mdev->worker.task ? "worker" : "?"); 128 b->bm_task == mdev->worker.task ? "worker" : "?");
109} 129}
110 130
111void drbd_bm_lock(struct drbd_conf *mdev, char *why) 131void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
112{ 132{
113 struct drbd_bitmap *b = mdev->bitmap; 133 struct drbd_bitmap *b = mdev->bitmap;
114 int trylock_failed; 134 int trylock_failed;
@@ -131,8 +151,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why)
131 b->bm_task == mdev->worker.task ? "worker" : "?"); 151 b->bm_task == mdev->worker.task ? "worker" : "?");
132 mutex_lock(&b->bm_change); 152 mutex_lock(&b->bm_change);
133 } 153 }
134 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) 154 if (BM_LOCKED_MASK & b->bm_flags)
135 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); 155 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
156 b->bm_flags |= flags & BM_LOCKED_MASK;
136 157
137 b->bm_why = why; 158 b->bm_why = why;
138 b->bm_task = current; 159 b->bm_task = current;
@@ -146,31 +167,137 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
146 return; 167 return;
147 } 168 }
148 169
149 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) 170 if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
150 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); 171 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
151 172
173 b->bm_flags &= ~BM_LOCKED_MASK;
152 b->bm_why = NULL; 174 b->bm_why = NULL;
153 b->bm_task = NULL; 175 b->bm_task = NULL;
154 mutex_unlock(&b->bm_change); 176 mutex_unlock(&b->bm_change);
155} 177}
156 178
157/* word offset to long pointer */ 179/* we store some "meta" info about our pages in page->private */
158static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) 180/* at a granularity of 4k storage per bitmap bit:
 181 * one petabyte of storage: 1<<50 bytes, 1<<38 4k storage blocks
182 * 1<<38 bits,
183 * 1<<23 4k bitmap pages.
184 * Use 24 bits as page index, covers 2 peta byte storage
185 * at a granularity of 4k per bit.
186 * Used to report the failed page idx on io error from the endio handlers.
187 */
188#define BM_PAGE_IDX_MASK ((1UL<<24)-1)
189/* this page is currently read in, or written back */
190#define BM_PAGE_IO_LOCK 31
191/* if there has been an IO error for this page */
192#define BM_PAGE_IO_ERROR 30
193/* this is to be able to intelligently skip disk IO,
194 * set if bits have been set since last IO. */
195#define BM_PAGE_NEED_WRITEOUT 29
196/* to mark for lazy writeout once syncer cleared all clearable bits,
 197 * set if bits have been cleared since last IO. */
198#define BM_PAGE_LAZY_WRITEOUT 28
199
200/* store_page_idx uses non-atomic assignment. It is only used directly after
201 * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
202 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
203 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
204 * requires it all to be atomic as well. */
205static void bm_store_page_idx(struct page *page, unsigned long idx)
159{ 206{
160 struct page *page; 207 BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
161 unsigned long page_nr; 208 page_private(page) |= idx;
209}
210
211static unsigned long bm_page_to_idx(struct page *page)
212{
213 return page_private(page) & BM_PAGE_IDX_MASK;
214}
215
 216/* As it is very unlikely that the same page is under IO from more than one
217 * context, we can get away with a bit per page and one wait queue per bitmap.
218 */
219static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
220{
221 struct drbd_bitmap *b = mdev->bitmap;
222 void *addr = &page_private(b->bm_pages[page_nr]);
223 wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
224}
225
226static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
227{
228 struct drbd_bitmap *b = mdev->bitmap;
229 void *addr = &page_private(b->bm_pages[page_nr]);
230 clear_bit(BM_PAGE_IO_LOCK, addr);
231 smp_mb__after_clear_bit();
232 wake_up(&mdev->bitmap->bm_io_wait);
233}
162 234
235/* set _before_ submit_io, so it may be reset due to being changed
236 * while this page is in flight... will get submitted later again */
237static void bm_set_page_unchanged(struct page *page)
238{
239 /* use cmpxchg? */
240 clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
241 clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
242}
243
244static void bm_set_page_need_writeout(struct page *page)
245{
246 set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
247}
248
249static int bm_test_page_unchanged(struct page *page)
250{
251 volatile const unsigned long *addr = &page_private(page);
252 return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
253}
254
255static void bm_set_page_io_err(struct page *page)
256{
257 set_bit(BM_PAGE_IO_ERROR, &page_private(page));
258}
259
260static void bm_clear_page_io_err(struct page *page)
261{
262 clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
263}
264
265static void bm_set_page_lazy_writeout(struct page *page)
266{
267 set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
268}
269
270static int bm_test_page_lazy_writeout(struct page *page)
271{
272 return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
273}
274
275/* on a 32bit box, this would allow for exactly (2<<38) bits. */
276static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
277{
163 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ 278 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
164 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); 279 unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
165 BUG_ON(page_nr >= b->bm_number_of_pages); 280 BUG_ON(page_nr >= b->bm_number_of_pages);
166 page = b->bm_pages[page_nr]; 281 return page_nr;
282}
283
284static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
285{
286 /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
287 unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
288 BUG_ON(page_nr >= b->bm_number_of_pages);
289 return page_nr;
290}
167 291
292static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
293{
294 struct page *page = b->bm_pages[idx];
168 return (unsigned long *) kmap_atomic(page, km); 295 return (unsigned long *) kmap_atomic(page, km);
169} 296}
170 297
171static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) 298static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
172{ 299{
173 return __bm_map_paddr(b, offset, KM_IRQ1); 300 return __bm_map_pidx(b, idx, KM_IRQ1);
174} 301}
175 302
176static void __bm_unmap(unsigned long *p_addr, const enum km_type km) 303static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
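The helpers above keep all per-page bookkeeping directly in page->private: the low 24 bits hold the page's index into the bitmap (enough for 2 PiB of storage at 4 KiB per bit), while bits 28 to 31 carry the IO-lock, IO-error and writeout flags. A self-contained userspace model of that packing and of the bit-number to page-index arithmetic follows; a plain unsigned long stands in for page->private and PAGE_SHIFT is assumed to be 12.

#include <stdio.h>

#define BM_PAGE_IDX_MASK      ((1UL << 24) - 1)
#define BM_PAGE_IO_LOCK       31
#define BM_PAGE_IO_ERROR      30
#define BM_PAGE_NEED_WRITEOUT 29
#define BM_PAGE_LAZY_WRITEOUT 28

static unsigned long page_priv;                      /* stands in for page->private */

static void store_page_idx(unsigned long idx) { page_priv |= idx & BM_PAGE_IDX_MASK; }
static unsigned long page_to_idx(void)        { return page_priv & BM_PAGE_IDX_MASK; }
static void set_flag(int bit)                 { page_priv |= 1UL << bit; }
static int  test_flag(int bit)                { return !!(page_priv & (1UL << bit)); }

/* which bitmap page holds bit 'bitnr': 4 KiB pages (PAGE_SHIFT 12), 8 bits per byte */
static unsigned int bit_to_page_idx(unsigned long long bitnr)
{
    return (unsigned int)(bitnr >> (12 + 3));
}

int main(void)
{
    store_page_idx(123456);
    set_flag(BM_PAGE_NEED_WRITEOUT);

    printf("idx=%lu need_writeout=%d io_lock=%d\n",
           page_to_idx(), test_flag(BM_PAGE_NEED_WRITEOUT), test_flag(BM_PAGE_IO_LOCK));
    printf("bit 1<<30 lives in bitmap page %u\n", bit_to_page_idx(1ULL << 30));
    return 0;
}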
@@ -188,7 +315,7 @@ static void bm_unmap(unsigned long *p_addr)
188/* word offset from start of bitmap to word number _in_page_ 315/* word offset from start of bitmap to word number _in_page_
189 * modulo longs per page 316 * modulo longs per page
190#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) 317#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
191 hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) 318 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
192 so do it explicitly: 319 so do it explicitly:
193 */ 320 */
194#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) 321#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
@@ -202,6 +329,7 @@ static void bm_unmap(unsigned long *p_addr)
202 * to be able to report device specific. 329 * to be able to report device specific.
203 */ 330 */
204 331
332
205static void bm_free_pages(struct page **pages, unsigned long number) 333static void bm_free_pages(struct page **pages, unsigned long number)
206{ 334{
207 unsigned long i; 335 unsigned long i;
@@ -269,6 +397,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
269 bm_vk_free(new_pages, vmalloced); 397 bm_vk_free(new_pages, vmalloced);
270 return NULL; 398 return NULL;
271 } 399 }
400 /* we want to know which page it is
401 * from the endio handlers */
402 bm_store_page_idx(page, i);
272 new_pages[i] = page; 403 new_pages[i] = page;
273 } 404 }
274 } else { 405 } else {
@@ -280,9 +411,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
280 } 411 }
281 412
282 if (vmalloced) 413 if (vmalloced)
283 set_bit(BM_P_VMALLOCED, &b->bm_flags); 414 b->bm_flags |= BM_P_VMALLOCED;
284 else 415 else
285 clear_bit(BM_P_VMALLOCED, &b->bm_flags); 416 b->bm_flags &= ~BM_P_VMALLOCED;
286 417
287 return new_pages; 418 return new_pages;
288} 419}
@@ -319,7 +450,7 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
319{ 450{
320 ERR_IF (!mdev->bitmap) return; 451 ERR_IF (!mdev->bitmap) return;
321 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); 452 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
322 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); 453 bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
323 kfree(mdev->bitmap); 454 kfree(mdev->bitmap);
324 mdev->bitmap = NULL; 455 mdev->bitmap = NULL;
325} 456}
@@ -329,22 +460,39 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
329 * this masks out the remaining bits. 460 * this masks out the remaining bits.
330 * Returns the number of bits cleared. 461 * Returns the number of bits cleared.
331 */ 462 */
463#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
464#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
465#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
332static int bm_clear_surplus(struct drbd_bitmap *b) 466static int bm_clear_surplus(struct drbd_bitmap *b)
333{ 467{
334 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 468 unsigned long mask;
335 size_t w = b->bm_bits >> LN2_BPL;
336 int cleared = 0;
337 unsigned long *p_addr, *bm; 469 unsigned long *p_addr, *bm;
470 int tmp;
471 int cleared = 0;
338 472
339 p_addr = bm_map_paddr(b, w); 473 /* number of bits modulo bits per page */
340 bm = p_addr + MLPP(w); 474 tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
341 if (w < b->bm_words) { 475 /* mask the used bits of the word containing the last bit */
476 mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
477 /* bitmap is always stored little endian,
478 * on disk and in core memory alike */
479 mask = cpu_to_lel(mask);
480
481 p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
482 bm = p_addr + (tmp/BITS_PER_LONG);
483 if (mask) {
484 /* If mask != 0, we are not exactly aligned, so bm now points
485 * to the long containing the last bit.
486 * If mask == 0, bm already points to the word immediately
487 * after the last (long word aligned) bit. */
342 cleared = hweight_long(*bm & ~mask); 488 cleared = hweight_long(*bm & ~mask);
343 *bm &= mask; 489 *bm &= mask;
344 w++; bm++; 490 bm++;
345 } 491 }
346 492
347 if (w < b->bm_words) { 493 if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
494 /* on a 32bit arch, we may need to zero out
495 * a padding long to align with a 64bit remote */
348 cleared += hweight_long(*bm); 496 cleared += hweight_long(*bm);
349 *bm = 0; 497 *bm = 0;
350 } 498 }
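bm_clear_surplus() masks off any bits of the last used word that lie beyond bm_bits, so stray set bits past the end of the device never get counted. A minimal sketch of the same masking on a flat array (no paging, native endianness, names illustrative), returning how many surplus bits were cleared:

#include <stdio.h>

#define BPL (8 * (int)sizeof(unsigned long))

/* clear all bits >= nbits in the last used word; return how many were set */
static int clear_surplus(unsigned long *bm, unsigned long nbits)
{
    unsigned long used = nbits & (BPL - 1);          /* bits used in the last word */
    unsigned long *last = bm + (nbits - 1) / BPL;
    int cleared = 0;

    if (used) {                                      /* partially used last word */
        unsigned long mask = (1UL << used) - 1;
        cleared = __builtin_popcountl(*last & ~mask);
        *last &= mask;
    }
    return cleared;
}

int main(void)
{
    unsigned long bm[2] = { ~0UL, ~0UL };            /* pretend every bit is set */
    unsigned long nbits = BPL + 5;                   /* only 5 bits valid in word 1 */

    printf("cleared %d surplus bits, last word now %#lx\n",
           clear_surplus(bm, nbits), bm[1]);
    return 0;
}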
@@ -354,66 +502,75 @@ static int bm_clear_surplus(struct drbd_bitmap *b)
354 502
355static void bm_set_surplus(struct drbd_bitmap *b) 503static void bm_set_surplus(struct drbd_bitmap *b)
356{ 504{
357 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 505 unsigned long mask;
358 size_t w = b->bm_bits >> LN2_BPL;
359 unsigned long *p_addr, *bm; 506 unsigned long *p_addr, *bm;
360 507 int tmp;
361 p_addr = bm_map_paddr(b, w); 508
362 bm = p_addr + MLPP(w); 509 /* number of bits modulo bits per page */
363 if (w < b->bm_words) { 510 tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
511 /* mask the used bits of the word containing the last bit */
512 mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
513 /* bitmap is always stored little endian,
514 * on disk and in core memory alike */
515 mask = cpu_to_lel(mask);
516
517 p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
518 bm = p_addr + (tmp/BITS_PER_LONG);
519 if (mask) {
520 /* If mask != 0, we are not exactly aligned, so bm now points
521 * to the long containing the last bit.
522 * If mask == 0, bm already points to the word immediately
523 * after the last (long word aligned) bit. */
364 *bm |= ~mask; 524 *bm |= ~mask;
365 bm++; w++; 525 bm++;
366 } 526 }
367 527
368 if (w < b->bm_words) { 528 if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
369 *bm = ~(0UL); 529 /* on a 32bit arch, we may need to zero out
530 * a padding long to align with a 64bit remote */
531 *bm = ~0UL;
370 } 532 }
371 bm_unmap(p_addr); 533 bm_unmap(p_addr);
372} 534}
373 535
374static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) 536/* you better not modify the bitmap while this is running,
537 * or its results will be stale */
538static unsigned long bm_count_bits(struct drbd_bitmap *b)
375{ 539{
376 unsigned long *p_addr, *bm, offset = 0; 540 unsigned long *p_addr;
377 unsigned long bits = 0; 541 unsigned long bits = 0;
378 unsigned long i, do_now; 542 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
379 543 int idx, i, last_word;
380 while (offset < b->bm_words) { 544
381 i = do_now = min_t(size_t, b->bm_words-offset, LWPP); 545 /* all but last page */
382 p_addr = __bm_map_paddr(b, offset, KM_USER0); 546 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
383 bm = p_addr + MLPP(offset); 547 p_addr = __bm_map_pidx(b, idx, KM_USER0);
384 while (i--) { 548 for (i = 0; i < LWPP; i++)
385#ifndef __LITTLE_ENDIAN 549 bits += hweight_long(p_addr[i]);
386 if (swap_endian)
387 *bm = lel_to_cpu(*bm);
388#endif
389 bits += hweight_long(*bm++);
390 }
391 __bm_unmap(p_addr, KM_USER0); 550 __bm_unmap(p_addr, KM_USER0);
392 offset += do_now;
393 cond_resched(); 551 cond_resched();
394 } 552 }
395 553 /* last (or only) page */
554 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
555 p_addr = __bm_map_pidx(b, idx, KM_USER0);
556 for (i = 0; i < last_word; i++)
557 bits += hweight_long(p_addr[i]);
558 p_addr[last_word] &= cpu_to_lel(mask);
559 bits += hweight_long(p_addr[last_word]);
560 /* 32bit arch, may have an unused padding long */
561 if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
562 p_addr[last_word+1] = 0;
563 __bm_unmap(p_addr, KM_USER0);
396 return bits; 564 return bits;
397} 565}
398 566
399static unsigned long bm_count_bits(struct drbd_bitmap *b)
400{
401 return __bm_count_bits(b, 0);
402}
403
404static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
405{
406 return __bm_count_bits(b, 1);
407}
408
409/* offset and len in long words.*/ 567/* offset and len in long words.*/
410static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) 568static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
411{ 569{
412 unsigned long *p_addr, *bm; 570 unsigned long *p_addr, *bm;
571 unsigned int idx;
413 size_t do_now, end; 572 size_t do_now, end;
414 573
415#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
416
417 end = offset + len; 574 end = offset + len;
418 575
419 if (end > b->bm_words) { 576 if (end > b->bm_words) {
@@ -423,15 +580,16 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
423 580
424 while (offset < end) { 581 while (offset < end) {
425 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; 582 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
426 p_addr = bm_map_paddr(b, offset); 583 idx = bm_word_to_page_idx(b, offset);
584 p_addr = bm_map_pidx(b, idx);
427 bm = p_addr + MLPP(offset); 585 bm = p_addr + MLPP(offset);
428 if (bm+do_now > p_addr + LWPP) { 586 if (bm+do_now > p_addr + LWPP) {
429 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", 587 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
430 p_addr, bm, (int)do_now); 588 p_addr, bm, (int)do_now);
431 break; /* breaks to after catch_oob_access_end() only! */ 589 } else
432 } 590 memset(bm, c, do_now * sizeof(long));
433 memset(bm, c, do_now * sizeof(long));
434 bm_unmap(p_addr); 591 bm_unmap(p_addr);
592 bm_set_page_need_writeout(b->bm_pages[idx]);
435 offset += do_now; 593 offset += do_now;
436 } 594 }
437} 595}
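bm_count_bits() now walks the bitmap page by page, popcounting whole longs and stopping at the last used word, instead of the old word-offset loop with optional endian swapping. A compact userspace model over an array of "pages" of LWPP longs each, assuming surplus bits past bm_bits are already cleared:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define LWPP (PAGE_SIZE / (int)sizeof(unsigned long))    /* longs per bitmap page */
#define BPL  (8 * (int)sizeof(unsigned long))

/* count set bits in a bitmap of 'nbits' bits spread over 'npages' pages */
static unsigned long count_bits(unsigned long **pages, int npages, unsigned long nbits)
{
    int last_word = (int)(((nbits - 1) % (PAGE_SIZE * 8)) / BPL);
    unsigned long total = 0;
    int p, i;

    for (p = 0; p < npages - 1; p++)                 /* all but the last page */
        for (i = 0; i < LWPP; i++)
            total += __builtin_popcountl(pages[p][i]);

    for (i = 0; i <= last_word; i++)                 /* last (or only) page */
        total += __builtin_popcountl(pages[npages - 1][i]);
    return total;
}

int main(void)
{
    unsigned long *pg[2] = {
        calloc(LWPP, sizeof(unsigned long)),         /* allocation errors ignored in this sketch */
        calloc(LWPP, sizeof(unsigned long)),
    };

    pg[0][0] = 0xffUL;                               /* 8 bits set in page 0 */
    pg[1][3] = 0x1UL;                                /* 1 bit set in page 1 */

    printf("set bits: %lu\n", count_bits(pg, 2, PAGE_SIZE * 8 + 4 * BPL));
    free(pg[0]);
    free(pg[1]);
    return 0;
}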
@@ -447,7 +605,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
447int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) 605int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
448{ 606{
449 struct drbd_bitmap *b = mdev->bitmap; 607 struct drbd_bitmap *b = mdev->bitmap;
450 unsigned long bits, words, owords, obits, *p_addr, *bm; 608 unsigned long bits, words, owords, obits;
451 unsigned long want, have, onpages; /* number of pages */ 609 unsigned long want, have, onpages; /* number of pages */
452 struct page **npages, **opages = NULL; 610 struct page **npages, **opages = NULL;
453 int err = 0, growing; 611 int err = 0, growing;
@@ -455,7 +613,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
455 613
456 ERR_IF(!b) return -ENOMEM; 614 ERR_IF(!b) return -ENOMEM;
457 615
458 drbd_bm_lock(mdev, "resize"); 616 drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
459 617
460 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", 618 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
461 (unsigned long long)capacity); 619 (unsigned long long)capacity);
@@ -463,7 +621,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
463 if (capacity == b->bm_dev_capacity) 621 if (capacity == b->bm_dev_capacity)
464 goto out; 622 goto out;
465 623
466 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); 624 opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
467 625
468 if (capacity == 0) { 626 if (capacity == 0) {
469 spin_lock_irq(&b->bm_lock); 627 spin_lock_irq(&b->bm_lock);
@@ -491,18 +649,23 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
491 words = ALIGN(bits, 64) >> LN2_BPL; 649 words = ALIGN(bits, 64) >> LN2_BPL;
492 650
493 if (get_ldev(mdev)) { 651 if (get_ldev(mdev)) {
494 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); 652 u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
495 put_ldev(mdev); 653 put_ldev(mdev);
654 if (bits > bits_on_disk) {
655 dev_info(DEV, "bits = %lu\n", bits);
656 dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
657 err = -ENOSPC;
658 goto out;
659 }
496 } 660 }
497 661
498 /* one extra long to catch off by one errors */ 662 want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
499 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
500 have = b->bm_number_of_pages; 663 have = b->bm_number_of_pages;
501 if (want == have) { 664 if (want == have) {
502 D_ASSERT(b->bm_pages != NULL); 665 D_ASSERT(b->bm_pages != NULL);
503 npages = b->bm_pages; 666 npages = b->bm_pages;
504 } else { 667 } else {
505 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) 668 if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
506 npages = NULL; 669 npages = NULL;
507 else 670 else
508 npages = bm_realloc_pages(b, want); 671 npages = bm_realloc_pages(b, want);
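drbd_bm_resize() now refuses to grow the bitmap beyond what the on-disk meta-data area can hold: each 512-byte meta-data sector stores 4096 bitmap bits, which is where the shift by 12 above comes from. A small sketch of that capacity check; MD_BM_OFFSET stands for the meta-data sectors reserved ahead of the bitmap area and its value here is purely illustrative.

#include <stdio.h>
#include <stdint.h>

#define MD_BM_OFFSET 8        /* illustrative: meta-data sectors in front of the bitmap area */

/* bitmap bits that fit into a meta-data area of md_size_sect 512-byte sectors */
static uint64_t bits_on_disk(uint64_t md_size_sect)
{
    return (md_size_sect - MD_BM_OFFSET) << 12;      /* 512 bytes == 4096 bits per sector */
}

int main(void)
{
    uint64_t md_size_sect = 1ULL << 15;              /* a 16 MiB meta-data area, say */
    uint64_t want_bits    = (1ULL << 40) >> 12;      /* 1 TiB backend at 4 KiB per bit */

    if (want_bits > bits_on_disk(md_size_sect))
        printf("would not fit: need %llu bits, on-disk area holds %llu\n",
               (unsigned long long)want_bits,
               (unsigned long long)bits_on_disk(md_size_sect));
    else
        printf("fits\n");
    return 0;
}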
@@ -542,11 +705,6 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
542 bm_free_pages(opages + want, have - want); 705 bm_free_pages(opages + want, have - want);
543 } 706 }
544 707
545 p_addr = bm_map_paddr(b, words);
546 bm = p_addr + MLPP(words);
547 *bm = DRBD_MAGIC;
548 bm_unmap(p_addr);
549
550 (void)bm_clear_surplus(b); 708 (void)bm_clear_surplus(b);
551 709
552 spin_unlock_irq(&b->bm_lock); 710 spin_unlock_irq(&b->bm_lock);
@@ -554,7 +712,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
554 bm_vk_free(opages, opages_vmalloced); 712 bm_vk_free(opages, opages_vmalloced);
555 if (!growing) 713 if (!growing)
556 b->bm_set = bm_count_bits(b); 714 b->bm_set = bm_count_bits(b);
557 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); 715 dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
558 716
559 out: 717 out:
560 drbd_bm_unlock(mdev); 718 drbd_bm_unlock(mdev);
@@ -569,7 +727,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
569 * 727 *
570 * maybe bm_set should be atomic_t ? 728 * maybe bm_set should be atomic_t ?
571 */ 729 */
572static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) 730unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
573{ 731{
574 struct drbd_bitmap *b = mdev->bitmap; 732 struct drbd_bitmap *b = mdev->bitmap;
575 unsigned long s; 733 unsigned long s;
@@ -624,6 +782,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
624 struct drbd_bitmap *b = mdev->bitmap; 782 struct drbd_bitmap *b = mdev->bitmap;
625 unsigned long *p_addr, *bm; 783 unsigned long *p_addr, *bm;
626 unsigned long word, bits; 784 unsigned long word, bits;
785 unsigned int idx;
627 size_t end, do_now; 786 size_t end, do_now;
628 787
629 end = offset + number; 788 end = offset + number;
@@ -638,16 +797,18 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
638 spin_lock_irq(&b->bm_lock); 797 spin_lock_irq(&b->bm_lock);
639 while (offset < end) { 798 while (offset < end) {
640 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 799 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
641 p_addr = bm_map_paddr(b, offset); 800 idx = bm_word_to_page_idx(b, offset);
801 p_addr = bm_map_pidx(b, idx);
642 bm = p_addr + MLPP(offset); 802 bm = p_addr + MLPP(offset);
643 offset += do_now; 803 offset += do_now;
644 while (do_now--) { 804 while (do_now--) {
645 bits = hweight_long(*bm); 805 bits = hweight_long(*bm);
646 word = *bm | lel_to_cpu(*buffer++); 806 word = *bm | *buffer++;
647 *bm++ = word; 807 *bm++ = word;
648 b->bm_set += hweight_long(word) - bits; 808 b->bm_set += hweight_long(word) - bits;
649 } 809 }
650 bm_unmap(p_addr); 810 bm_unmap(p_addr);
811 bm_set_page_need_writeout(b->bm_pages[idx]);
651 } 812 }
652 /* with 32bit <-> 64bit cross-platform connect 813 /* with 32bit <-> 64bit cross-platform connect
653 * this is only correct for current usage, 814 * this is only correct for current usage,
@@ -656,7 +817,6 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
656 */ 817 */
657 if (end == b->bm_words) 818 if (end == b->bm_words)
658 b->bm_set -= bm_clear_surplus(b); 819 b->bm_set -= bm_clear_surplus(b);
659
660 spin_unlock_irq(&b->bm_lock); 820 spin_unlock_irq(&b->bm_lock);
661} 821}
662 822
@@ -686,11 +846,11 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
686 else { 846 else {
687 while (offset < end) { 847 while (offset < end) {
688 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 848 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
689 p_addr = bm_map_paddr(b, offset); 849 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
690 bm = p_addr + MLPP(offset); 850 bm = p_addr + MLPP(offset);
691 offset += do_now; 851 offset += do_now;
692 while (do_now--) 852 while (do_now--)
693 *buffer++ = cpu_to_lel(*bm++); 853 *buffer++ = *bm++;
694 bm_unmap(p_addr); 854 bm_unmap(p_addr);
695 } 855 }
696 } 856 }
@@ -724,9 +884,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
724 spin_unlock_irq(&b->bm_lock); 884 spin_unlock_irq(&b->bm_lock);
725} 885}
726 886
887struct bm_aio_ctx {
888 struct drbd_conf *mdev;
889 atomic_t in_flight;
890 struct completion done;
891 unsigned flags;
892#define BM_AIO_COPY_PAGES 1
893 int error;
894};
895
896/* bv_page may be a copy, or may be the original */
727static void bm_async_io_complete(struct bio *bio, int error) 897static void bm_async_io_complete(struct bio *bio, int error)
728{ 898{
729 struct drbd_bitmap *b = bio->bi_private; 899 struct bm_aio_ctx *ctx = bio->bi_private;
900 struct drbd_conf *mdev = ctx->mdev;
901 struct drbd_bitmap *b = mdev->bitmap;
902 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
730 int uptodate = bio_flagged(bio, BIO_UPTODATE); 903 int uptodate = bio_flagged(bio, BIO_UPTODATE);
731 904
732 905
@@ -737,127 +910,171 @@ static void bm_async_io_complete(struct bio *bio, int error)
737 if (!error && !uptodate) 910 if (!error && !uptodate)
738 error = -EIO; 911 error = -EIO;
739 912
913 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
914 !bm_test_page_unchanged(b->bm_pages[idx]))
915 dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
916
740 if (error) { 917 if (error) {
741 /* doh. what now? 918 /* ctx error will hold the completed-last non-zero error code,
742 * for now, set all bits, and flag MD_IO_ERROR */ 919 * in case error codes differ. */
743 __set_bit(BM_MD_IO_ERROR, &b->bm_flags); 920 ctx->error = error;
921 bm_set_page_io_err(b->bm_pages[idx]);
922 /* Not identical to on disk version of it.
923 * Is BM_PAGE_IO_ERROR enough? */
924 if (__ratelimit(&drbd_ratelimit_state))
925 dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
926 error, idx);
927 } else {
928 bm_clear_page_io_err(b->bm_pages[idx]);
929 dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
744 } 930 }
745 if (atomic_dec_and_test(&b->bm_async_io)) 931
746 wake_up(&b->bm_io_wait); 932 bm_page_unlock_io(mdev, idx);
933
934 /* FIXME give back to page pool */
935 if (ctx->flags & BM_AIO_COPY_PAGES)
936 put_page(bio->bi_io_vec[0].bv_page);
747 937
748 bio_put(bio); 938 bio_put(bio);
939
940 if (atomic_dec_and_test(&ctx->in_flight))
941 complete(&ctx->done);
749} 942}
750 943
751static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) 944static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
752{ 945{
753 /* we are process context. we always get a bio */ 946 /* we are process context. we always get a bio */
754 struct bio *bio = bio_alloc(GFP_KERNEL, 1); 947 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
948 struct drbd_conf *mdev = ctx->mdev;
949 struct drbd_bitmap *b = mdev->bitmap;
950 struct page *page;
755 unsigned int len; 951 unsigned int len;
952
756 sector_t on_disk_sector = 953 sector_t on_disk_sector =
757 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; 954 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
758 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); 955 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
759 956
760 /* this might happen with very small 957 /* this might happen with very small
761 * flexible external meta data device */ 958 * flexible external meta data device,
959 * or with PAGE_SIZE > 4k */
762 len = min_t(unsigned int, PAGE_SIZE, 960 len = min_t(unsigned int, PAGE_SIZE,
763 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); 961 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
764 962
963 /* serialize IO on this page */
964 bm_page_lock_io(mdev, page_nr);
965 /* before memcpy and submit,
966 * so it can be redirtied any time */
967 bm_set_page_unchanged(b->bm_pages[page_nr]);
968
969 if (ctx->flags & BM_AIO_COPY_PAGES) {
970 /* FIXME alloc_page is good enough for now, but actually needs
971 * to use pre-allocated page pool */
972 void *src, *dest;
973 page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
974 dest = kmap_atomic(page, KM_USER0);
975 src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
976 memcpy(dest, src, PAGE_SIZE);
977 kunmap_atomic(src, KM_USER1);
978 kunmap_atomic(dest, KM_USER0);
979 bm_store_page_idx(page, page_nr);
980 } else
981 page = b->bm_pages[page_nr];
982
765 bio->bi_bdev = mdev->ldev->md_bdev; 983 bio->bi_bdev = mdev->ldev->md_bdev;
766 bio->bi_sector = on_disk_sector; 984 bio->bi_sector = on_disk_sector;
767 bio_add_page(bio, b->bm_pages[page_nr], len, 0); 985 bio_add_page(bio, page, len, 0);
768 bio->bi_private = b; 986 bio->bi_private = ctx;
769 bio->bi_end_io = bm_async_io_complete; 987 bio->bi_end_io = bm_async_io_complete;
770 988
771 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { 989 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
772 bio->bi_rw |= rw; 990 bio->bi_rw |= rw;
773 bio_endio(bio, -EIO); 991 bio_endio(bio, -EIO);
774 } else { 992 } else {
775 submit_bio(rw, bio); 993 submit_bio(rw, bio);
994 /* this should not count as user activity and cause the
995 * resync to throttle -- see drbd_rs_should_slow_down(). */
996 atomic_add(len >> 9, &mdev->rs_sect_ev);
776 } 997 }
777} 998}
778 999
779# if defined(__LITTLE_ENDIAN)
780 /* nothing to do, on disk == in memory */
781# define bm_cpu_to_lel(x) ((void)0)
782# else
783static void bm_cpu_to_lel(struct drbd_bitmap *b)
784{
785 /* need to cpu_to_lel all the pages ...
786 * this may be optimized by using
787 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
788 * the following is still not optimal, but better than nothing */
789 unsigned int i;
790 unsigned long *p_addr, *bm;
791 if (b->bm_set == 0) {
792 /* no page at all; avoid swap if all is 0 */
793 i = b->bm_number_of_pages;
794 } else if (b->bm_set == b->bm_bits) {
795 /* only the last page */
796 i = b->bm_number_of_pages - 1;
797 } else {
798 /* all pages */
799 i = 0;
800 }
801 for (; i < b->bm_number_of_pages; i++) {
802 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
803 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
804 *bm = cpu_to_lel(*bm);
805 kunmap_atomic(p_addr, KM_USER0);
806 }
807}
808# endif
809/* lel_to_cpu == cpu_to_lel */
810# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
811
812/* 1000/*
813 * bm_rw: read/write the whole bitmap from/to its on disk location. 1001 * bm_rw: read/write the whole bitmap from/to its on disk location.
814 */ 1002 */
815static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) 1003static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
816{ 1004{
1005 struct bm_aio_ctx ctx = {
1006 .mdev = mdev,
1007 .in_flight = ATOMIC_INIT(1),
1008 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1009 .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
1010 };
817 struct drbd_bitmap *b = mdev->bitmap; 1011 struct drbd_bitmap *b = mdev->bitmap;
818 /* sector_t sector; */ 1012 int num_pages, i, count = 0;
819 int bm_words, num_pages, i;
820 unsigned long now; 1013 unsigned long now;
821 char ppb[10]; 1014 char ppb[10];
822 int err = 0; 1015 int err = 0;
823 1016
824 WARN_ON(!bm_is_locked(b)); 1017 /*
825 1018 * We are protected against bitmap disappearing/resizing by holding an
826 /* no spinlock here, the drbd_bm_lock should be enough! */ 1019 * ldev reference (caller must have called get_ldev()).
827 1020 * For read/write, we are protected against changes to the bitmap by
828 bm_words = drbd_bm_words(mdev); 1021 * the bitmap lock (see drbd_bitmap_io).
829 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; 1022 * For lazy writeout, we don't care for ongoing changes to the bitmap,
1023 * as we submit copies of pages anyways.
1024 */
1025 if (!ctx.flags)
1026 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
830 1027
831 /* on disk bitmap is little endian */ 1028 num_pages = b->bm_number_of_pages;
832 if (rw == WRITE)
833 bm_cpu_to_lel(b);
834 1029
835 now = jiffies; 1030 now = jiffies;
836 atomic_set(&b->bm_async_io, num_pages);
837 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
838 1031
839 /* let the layers below us try to merge these bios... */ 1032 /* let the layers below us try to merge these bios... */
840 for (i = 0; i < num_pages; i++) 1033 for (i = 0; i < num_pages; i++) {
841 bm_page_io_async(mdev, b, i, rw); 1034 /* ignore completely unchanged pages */
1035 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1036 break;
1037 if (rw & WRITE) {
1038 if (bm_test_page_unchanged(b->bm_pages[i])) {
1039 dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
1040 continue;
1041 }
1042 /* during lazy writeout,
1043 * ignore those pages not marked for lazy writeout. */
1044 if (lazy_writeout_upper_idx &&
1045 !bm_test_page_lazy_writeout(b->bm_pages[i])) {
1046 dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
1047 continue;
1048 }
1049 }
1050 atomic_inc(&ctx.in_flight);
1051 bm_page_io_async(&ctx, i, rw);
1052 ++count;
1053 cond_resched();
1054 }
842 1055
843 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); 1056 /*
844 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); 1057 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
1058 * will not complete() early, and decrement / test it here. If there
1059 * are still some bios in flight, we need to wait for them here.
1060 */
1061 if (!atomic_dec_and_test(&ctx.in_flight))
1062 wait_for_completion(&ctx.done);
1063 dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
1064 rw == WRITE ? "WRITE" : "READ",
1065 count, jiffies - now);
845 1066
846 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { 1067 if (ctx.error) {
847 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); 1068 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
848 drbd_chk_io_error(mdev, 1, TRUE); 1069 drbd_chk_io_error(mdev, 1, true);
849 err = -EIO; 1070 err = -EIO; /* ctx.error ? */
850 } 1071 }
851 1072
852 now = jiffies; 1073 now = jiffies;
853 if (rw == WRITE) { 1074 if (rw == WRITE) {
854 /* swap back endianness */
855 bm_lel_to_cpu(b);
856 /* flush bitmap to stable storage */
857 drbd_md_flush(mdev); 1075 drbd_md_flush(mdev);
858 } else /* rw == READ */ { 1076 } else /* rw == READ */ {
859 /* just read, if necessary adjust endianness */ 1077 b->bm_set = bm_count_bits(b);
860 b->bm_set = bm_count_bits_swap_endian(b);
861 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", 1078 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
862 jiffies - now); 1079 jiffies - now);
863 } 1080 }
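bm_rw() initializes ctx.in_flight to one before submitting anything, takes one extra reference per submitted page, and only sleeps on ctx.done if its own final decrement is not the last one, so the completion cannot fire while submissions are still being issued. A userspace model of that reference-counted completion pattern using C11 atomics and POSIX threads (the "IO" is just a short sleep); compile with -pthread.

#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h>
#include <unistd.h>

static atomic_int in_flight = 1;             /* bias of one held by the submitter */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int done;

static void complete(void)
{
    pthread_mutex_lock(&lock);
    done = 1;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
}

static void *io_complete(void *arg)          /* plays the part of the endio handler */
{
    usleep(10000);                           /* pretend the IO took a while */
    if (atomic_fetch_sub(&in_flight, 1) == 1)
        complete();                          /* last reference gone: wake the waiter */
    return arg;
}

int main(void)
{
    pthread_t t[4];
    int i;

    for (i = 0; i < 4; i++) {
        atomic_fetch_add(&in_flight, 1);     /* one reference per submitted IO */
        pthread_create(&t[i], NULL, io_complete, NULL);
    }
    if (atomic_fetch_sub(&in_flight, 1) != 1) {  /* drop the submitter's bias */
        pthread_mutex_lock(&lock);
        while (!done)
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
    }
    for (i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    printf("all bitmap IO completed\n");
    return 0;
}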
@@ -875,112 +1092,128 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
875 */ 1092 */
876int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) 1093int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
877{ 1094{
878 return bm_rw(mdev, READ); 1095 return bm_rw(mdev, READ, 0);
879} 1096}
880 1097
881/** 1098/**
882 * drbd_bm_write() - Write the whole bitmap to its on disk location. 1099 * drbd_bm_write() - Write the whole bitmap to its on disk location.
883 * @mdev: DRBD device. 1100 * @mdev: DRBD device.
1101 *
1102 * Will only write pages that have changed since last IO.
884 */ 1103 */
885int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) 1104int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
886{ 1105{
887 return bm_rw(mdev, WRITE); 1106 return bm_rw(mdev, WRITE, 0);
888} 1107}
889 1108
890/** 1109/**
891 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap 1110 * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
892 * @mdev: DRBD device. 1111 * @mdev: DRBD device.
893 * @enr: Extent number in the resync lru (happens to be sector offset) 1112 * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
894 *
895 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
896 * by a single sector write. Therefore enr == sector offset from the
897 * start of the bitmap.
898 */ 1113 */
899int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) 1114int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
900{ 1115{
901 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset 1116 return bm_rw(mdev, WRITE, upper_idx);
902 + mdev->ldev->md.bm_offset; 1117}
903 int bm_words, num_words, offset;
904 int err = 0;
905 1118
906 mutex_lock(&mdev->md_io_mutex); 1119
907 bm_words = drbd_bm_words(mdev); 1120/**
908 offset = S2W(enr); /* word offset into bitmap */ 1121 * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
909 num_words = min(S2W(1), bm_words - offset); 1122 * @mdev: DRBD device.
910 if (num_words < S2W(1)) 1123 * @idx: bitmap page index
911 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); 1124 *
912 drbd_bm_get_lel(mdev, offset, num_words, 1125 * We don't want to special case on logical_block_size of the backend device,
913 page_address(mdev->md_io_page)); 1126 * so we submit PAGE_SIZE aligned pieces.
914 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { 1127 * Note that on "most" systems, PAGE_SIZE is 4k.
915 int i; 1128 *
916 err = -EIO; 1129 * In case this becomes an issue on systems with larger PAGE_SIZE,
917 dev_err(DEV, "IO ERROR writing bitmap sector %lu " 1130 * we may want to change this again to write 4k aligned 4k pieces.
918 "(meta-disk sector %llus)\n", 1131 */
919 enr, (unsigned long long)on_disk_sector); 1132int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
920 drbd_chk_io_error(mdev, 1, TRUE); 1133{
921 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) 1134 struct bm_aio_ctx ctx = {
922 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); 1135 .mdev = mdev,
1136 .in_flight = ATOMIC_INIT(1),
1137 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1138 .flags = BM_AIO_COPY_PAGES,
1139 };
1140
1141 if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
1142 dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
1143 return 0;
923 } 1144 }
1145
1146 bm_page_io_async(&ctx, idx, WRITE_SYNC);
1147 wait_for_completion(&ctx.done);
1148
1149 if (ctx.error)
1150 drbd_chk_io_error(mdev, 1, true);
1151 /* that should force detach, so the in memory bitmap will be
1152 * gone in a moment as well. */
1153
924 mdev->bm_writ_cnt++; 1154 mdev->bm_writ_cnt++;
925 mutex_unlock(&mdev->md_io_mutex); 1155 return ctx.error;
926 return err;
927} 1156}
928 1157
929/* NOTE 1158/* NOTE
930 * find_first_bit returns int, we return unsigned long. 1159 * find_first_bit returns int, we return unsigned long.
931 * should not make much difference anyways, but ... 1160 * For this to work on 32bit arch with bitnumbers > (1<<32),
1161 * we'd need to return u64, and get a whole lot of other places
1162 * fixed where we still use unsigned long.
932 * 1163 *
933 * this returns a bit number, NOT a sector! 1164 * this returns a bit number, NOT a sector!
934 */ 1165 */
935#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
936static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, 1166static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
937 const int find_zero_bit, const enum km_type km) 1167 const int find_zero_bit, const enum km_type km)
938{ 1168{
939 struct drbd_bitmap *b = mdev->bitmap; 1169 struct drbd_bitmap *b = mdev->bitmap;
940 unsigned long i = -1UL;
941 unsigned long *p_addr; 1170 unsigned long *p_addr;
942 unsigned long bit_offset; /* bit offset of the mapped page. */ 1171 unsigned long bit_offset;
1172 unsigned i;
1173
943 1174
944 if (bm_fo > b->bm_bits) { 1175 if (bm_fo > b->bm_bits) {
945 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); 1176 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
1177 bm_fo = DRBD_END_OF_BITMAP;
946 } else { 1178 } else {
947 while (bm_fo < b->bm_bits) { 1179 while (bm_fo < b->bm_bits) {
948 unsigned long offset; 1180 /* bit offset of the first bit in the page */
949 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ 1181 bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
950 offset = bit_offset >> LN2_BPL; /* word offset of the page */ 1182 p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
951 p_addr = __bm_map_paddr(b, offset, km);
952 1183
953 if (find_zero_bit) 1184 if (find_zero_bit)
954 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 1185 i = find_next_zero_bit_le(p_addr,
1186 PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
955 else 1187 else
956 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 1188 i = find_next_bit_le(p_addr,
1189 PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
957 1190
958 __bm_unmap(p_addr, km); 1191 __bm_unmap(p_addr, km);
959 if (i < PAGE_SIZE*8) { 1192 if (i < PAGE_SIZE*8) {
960 i = bit_offset + i; 1193 bm_fo = bit_offset + i;
961 if (i >= b->bm_bits) 1194 if (bm_fo >= b->bm_bits)
962 break; 1195 break;
963 goto found; 1196 goto found;
964 } 1197 }
965 bm_fo = bit_offset + PAGE_SIZE*8; 1198 bm_fo = bit_offset + PAGE_SIZE*8;
966 } 1199 }
967 i = -1UL; 1200 bm_fo = DRBD_END_OF_BITMAP;
968 } 1201 }
969 found: 1202 found:
970 return i; 1203 return bm_fo;
971} 1204}
972 1205
973static unsigned long bm_find_next(struct drbd_conf *mdev, 1206static unsigned long bm_find_next(struct drbd_conf *mdev,
974 unsigned long bm_fo, const int find_zero_bit) 1207 unsigned long bm_fo, const int find_zero_bit)
975{ 1208{
976 struct drbd_bitmap *b = mdev->bitmap; 1209 struct drbd_bitmap *b = mdev->bitmap;
977 unsigned long i = -1UL; 1210 unsigned long i = DRBD_END_OF_BITMAP;
978 1211
979 ERR_IF(!b) return i; 1212 ERR_IF(!b) return i;
980 ERR_IF(!b->bm_pages) return i; 1213 ERR_IF(!b->bm_pages) return i;
981 1214
982 spin_lock_irq(&b->bm_lock); 1215 spin_lock_irq(&b->bm_lock);
983 if (bm_is_locked(b)) 1216 if (BM_DONT_TEST & b->bm_flags)
984 bm_print_lock_info(mdev); 1217 bm_print_lock_info(mdev);
985 1218
986 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); 1219 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
@@ -1006,13 +1239,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
1006 * you must take drbd_bm_lock() first */ 1239 * you must take drbd_bm_lock() first */
1007unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) 1240unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1008{ 1241{
1009 /* WARN_ON(!bm_is_locked(mdev)); */ 1242 /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1010 return __bm_find_next(mdev, bm_fo, 0, KM_USER1); 1243 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1011} 1244}
1012 1245
1013unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) 1246unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1014{ 1247{
1015 /* WARN_ON(!bm_is_locked(mdev)); */ 1248 /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1016 return __bm_find_next(mdev, bm_fo, 1, KM_USER1); 1249 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1017} 1250}
1018 1251
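__bm_find_next() now maps one bitmap page at a time and scans it with find_next_bit_le()/find_next_zero_bit_le(), returning DRBD_END_OF_BITMAP when nothing is found. A simplified flat-array version of the same scan, ignoring paging and on-disk endianness and using the count-trailing-zeros builtin in place of find_next_bit; it assumes surplus bits beyond nbits are clear.

#include <stdio.h>
#include <stddef.h>

#define BPL (8 * (size_t)sizeof(unsigned long))

/* number of the first set bit >= start, or nbits if there is none */
static size_t find_next_set(const unsigned long *bm, size_t nbits, size_t start)
{
    size_t w, found, nwords = (nbits + BPL - 1) / BPL;
    unsigned long word;

    if (start >= nbits)
        return nbits;                          /* "end of bitmap" sentinel */

    w = start / BPL;
    word = bm[w] & (~0UL << (start % BPL));    /* ignore bits below 'start' */
    while (!word) {
        if (++w >= nwords)
            return nbits;
        word = bm[w];
    }
    found = w * BPL + (size_t)__builtin_ctzl(word);
    return found < nbits ? found : nbits;
}

int main(void)
{
    unsigned long bm[2] = { 0, 0 };

    bm[1] = 1UL << 7;                          /* bit BPL + 7 is set */
    printf("next set bit from 0: %zu\n", find_next_set(bm, 2 * BPL, 0));
    printf("next set bit from %zu: %zu\n",
           BPL + 8, find_next_set(bm, 2 * BPL, BPL + 8));
    return 0;
}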
@@ -1023,13 +1256,14 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
1023 * expected to be called for only a few bits (e - s about BITS_PER_LONG). 1256 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1024 * Must hold bitmap lock already. */ 1257 * Must hold bitmap lock already. */
1025static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1258static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1026 unsigned long e, int val, const enum km_type km) 1259 unsigned long e, int val)
1027{ 1260{
1028 struct drbd_bitmap *b = mdev->bitmap; 1261 struct drbd_bitmap *b = mdev->bitmap;
1029 unsigned long *p_addr = NULL; 1262 unsigned long *p_addr = NULL;
1030 unsigned long bitnr; 1263 unsigned long bitnr;
1031 unsigned long last_page_nr = -1UL; 1264 unsigned int last_page_nr = -1U;
1032 int c = 0; 1265 int c = 0;
1266 int changed_total = 0;
1033 1267
1034 if (e >= b->bm_bits) { 1268 if (e >= b->bm_bits) {
1035 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", 1269 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1037,23 +1271,33 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1037 e = b->bm_bits ? b->bm_bits -1 : 0; 1271 e = b->bm_bits ? b->bm_bits -1 : 0;
1038 } 1272 }
1039 for (bitnr = s; bitnr <= e; bitnr++) { 1273 for (bitnr = s; bitnr <= e; bitnr++) {
1040 unsigned long offset = bitnr>>LN2_BPL; 1274 unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
1041 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1042 if (page_nr != last_page_nr) { 1275 if (page_nr != last_page_nr) {
1043 if (p_addr) 1276 if (p_addr)
1044 __bm_unmap(p_addr, km); 1277 __bm_unmap(p_addr, KM_IRQ1);
1045 p_addr = __bm_map_paddr(b, offset, km); 1278 if (c < 0)
1279 bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1280 else if (c > 0)
1281 bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1282 changed_total += c;
1283 c = 0;
1284 p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
1046 last_page_nr = page_nr; 1285 last_page_nr = page_nr;
1047 } 1286 }
1048 if (val) 1287 if (val)
1049 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); 1288 c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1050 else 1289 else
1051 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); 1290 c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1052 } 1291 }
1053 if (p_addr) 1292 if (p_addr)
1054 __bm_unmap(p_addr, km); 1293 __bm_unmap(p_addr, KM_IRQ1);
1055 b->bm_set += c; 1294 if (c < 0)
1056 return c; 1295 bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1296 else if (c > 0)
1297 bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1298 changed_total += c;
1299 b->bm_set += changed_total;
1300 return changed_total;
1057} 1301}
1058 1302
1059/* returns number of bits actually changed. 1303/* returns number of bits actually changed.
@@ -1071,10 +1315,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1071 ERR_IF(!b->bm_pages) return 0; 1315 ERR_IF(!b->bm_pages) return 0;
1072 1316
1073 spin_lock_irqsave(&b->bm_lock, flags); 1317 spin_lock_irqsave(&b->bm_lock, flags);
1074 if (bm_is_locked(b)) 1318 if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
1075 bm_print_lock_info(mdev); 1319 bm_print_lock_info(mdev);
1076 1320
1077 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); 1321 c = __bm_change_bits_to(mdev, s, e, val);
1078 1322
1079 spin_unlock_irqrestore(&b->bm_lock, flags); 1323 spin_unlock_irqrestore(&b->bm_lock, flags);
1080 return c; 1324 return c;
@@ -1099,16 +1343,17 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1099{ 1343{
1100 int i; 1344 int i;
1101 int bits; 1345 int bits;
1102 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); 1346 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
1103 for (i = first_word; i < last_word; i++) { 1347 for (i = first_word; i < last_word; i++) {
1104 bits = hweight_long(paddr[i]); 1348 bits = hweight_long(paddr[i]);
1105 paddr[i] = ~0UL; 1349 paddr[i] = ~0UL;
1106 b->bm_set += BITS_PER_LONG - bits; 1350 b->bm_set += BITS_PER_LONG - bits;
1107 } 1351 }
1108 kunmap_atomic(paddr, KM_USER0); 1352 kunmap_atomic(paddr, KM_IRQ1);
1109} 1353}
1110 1354
1111/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. 1355/* Same thing as drbd_bm_set_bits,
1356 * but more efficient for a large bit range.
1112 * You must first drbd_bm_lock(). 1357 * You must first drbd_bm_lock().
1113 * Can be called to set the whole bitmap in one go. 1358 * Can be called to set the whole bitmap in one go.
1114 * Sets bits from s to e _inclusive_. */ 1359 * Sets bits from s to e _inclusive_. */
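bm_set_full_words_within_one_page() above keeps b->bm_set exact while filling whole longs with ~0UL: each word contributes BITS_PER_LONG minus the number of bits that were already set. A quick sketch of that bookkeeping, with __builtin_popcountl() standing in for the kernel's hweight_long() and assuming 64-bit longs:

#include <stdio.h>

/* Fill words [first, last) with all-ones; return how many bits became newly set.
 * __builtin_popcountl plays the role of hweight_long() here. */
static unsigned long fill_words(unsigned long *w, int first, int last)
{
    unsigned long newly_set = 0;
    for (int i = first; i < last; i++) {
        newly_set += (sizeof(long) * 8) - __builtin_popcountl(w[i]);
        w[i] = ~0UL;
    }
    return newly_set;
}

int main(void)
{
    unsigned long words[4] = { 0x0UL, 0xffUL, ~0UL, 0x1UL };
    /* word 0 gains 64, word 1 gains 56, word 2 gains 0, word 3 gains 63 (64-bit) */
    printf("newly set: %lu\n", fill_words(words, 0, 4));   /* prints 183 */
    return 0;
}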
@@ -1122,6 +1367,7 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1122 * Do not use memset, because we must account for changes, 1367 * Do not use memset, because we must account for changes,
1123 * so we need to loop over the words with hweight() anyways. 1368 * so we need to loop over the words with hweight() anyways.
1124 */ 1369 */
1370 struct drbd_bitmap *b = mdev->bitmap;
1125 unsigned long sl = ALIGN(s,BITS_PER_LONG); 1371 unsigned long sl = ALIGN(s,BITS_PER_LONG);
1126 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); 1372 unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1127 int first_page; 1373 int first_page;
@@ -1132,15 +1378,19 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1132 1378
1133 if (e - s <= 3*BITS_PER_LONG) { 1379 if (e - s <= 3*BITS_PER_LONG) {
1134 /* don't bother; el and sl may even be wrong. */ 1380 /* don't bother; el and sl may even be wrong. */
1135 __bm_change_bits_to(mdev, s, e, 1, KM_USER0); 1381 spin_lock_irq(&b->bm_lock);
1382 __bm_change_bits_to(mdev, s, e, 1);
1383 spin_unlock_irq(&b->bm_lock);
1136 return; 1384 return;
1137 } 1385 }
1138 1386
1139 /* difference is large enough that we can trust sl and el */ 1387 /* difference is large enough that we can trust sl and el */
1140 1388
1389 spin_lock_irq(&b->bm_lock);
1390
1141 /* bits filling the current long */ 1391 /* bits filling the current long */
1142 if (sl) 1392 if (sl)
1143 __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); 1393 __bm_change_bits_to(mdev, s, sl-1, 1);
1144 1394
1145 first_page = sl >> (3 + PAGE_SHIFT); 1395 first_page = sl >> (3 + PAGE_SHIFT);
1146 last_page = el >> (3 + PAGE_SHIFT); 1396 last_page = el >> (3 + PAGE_SHIFT);
@@ -1153,8 +1403,10 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1153 /* first and full pages, unless first page == last page */ 1403 /* first and full pages, unless first page == last page */
1154 for (page_nr = first_page; page_nr < last_page; page_nr++) { 1404 for (page_nr = first_page; page_nr < last_page; page_nr++) {
1155 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); 1405 bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1406 spin_unlock_irq(&b->bm_lock);
1156 cond_resched(); 1407 cond_resched();
1157 first_word = 0; 1408 first_word = 0;
1409 spin_lock_irq(&b->bm_lock);
1158 } 1410 }
1159 1411
1160 /* last page (respectively only page, for first page == last page) */ 1412 /* last page (respectively only page, for first page == last page) */
@@ -1167,7 +1419,8 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1167 * it would trigger an assert in __bm_change_bits_to() 1419 * it would trigger an assert in __bm_change_bits_to()
1168 */ 1420 */
1169 if (el <= e) 1421 if (el <= e)
1170 __bm_change_bits_to(mdev, el, e, 1, KM_USER0); 1422 __bm_change_bits_to(mdev, el, e, 1);
1423 spin_unlock_irq(&b->bm_lock);
1171} 1424}
1172 1425
1173/* returns bit state 1426/* returns bit state
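_drbd_bm_set_bits() splits the inclusive range [s, e] into a bit-wise head up to the first long boundary (sl = ALIGN(s, BITS_PER_LONG)), a body of full longs filled page by page under the spinlock, and a bit-wise tail starting at el, which is e+1 rounded down to a long boundary. A worked example of that split, assuming 64-bit longs; ALIGN_UP/ALIGN_DOWN are local helpers, not kernel macros:

#include <stdio.h>

#define BITS_PER_LONG 64
#define ALIGN_UP(x, a)   (((x) + (a) - 1) / (a) * (a))
#define ALIGN_DOWN(x, a) ((x) / (a) * (a))

int main(void)
{
    unsigned long s = 100, e = 1000;                      /* inclusive range */
    unsigned long sl = ALIGN_UP(s, BITS_PER_LONG);        /* first boundary >= s  */
    unsigned long el = ALIGN_DOWN(e + 1, BITS_PER_LONG);  /* last boundary <= e+1 */

    /* head: bits s .. sl-1 handled one bit at a time (here 100..127)     */
    /* body: longs covering bits sl .. el-1 filled word-wise (128..959)   */
    /* tail: bits el .. e handled one bit at a time again (960..1000)     */
    printf("head %lu..%lu, body %lu..%lu, tail %lu..%lu\n",
           s, sl - 1, sl, el - 1, el, e);
    return 0;
}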
@@ -1188,12 +1441,11 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1188 ERR_IF(!b->bm_pages) return 0; 1441 ERR_IF(!b->bm_pages) return 0;
1189 1442
1190 spin_lock_irqsave(&b->bm_lock, flags); 1443 spin_lock_irqsave(&b->bm_lock, flags);
1191 if (bm_is_locked(b)) 1444 if (BM_DONT_TEST & b->bm_flags)
1192 bm_print_lock_info(mdev); 1445 bm_print_lock_info(mdev);
1193 if (bitnr < b->bm_bits) { 1446 if (bitnr < b->bm_bits) {
1194 unsigned long offset = bitnr>>LN2_BPL; 1447 p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
1195 p_addr = bm_map_paddr(b, offset); 1448 i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
1196 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1197 bm_unmap(p_addr); 1449 bm_unmap(p_addr);
1198 } else if (bitnr == b->bm_bits) { 1450 } else if (bitnr == b->bm_bits) {
1199 i = -1; 1451 i = -1;
@@ -1211,10 +1463,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1211{ 1463{
1212 unsigned long flags; 1464 unsigned long flags;
1213 struct drbd_bitmap *b = mdev->bitmap; 1465 struct drbd_bitmap *b = mdev->bitmap;
1214 unsigned long *p_addr = NULL, page_nr = -1; 1466 unsigned long *p_addr = NULL;
1215 unsigned long bitnr; 1467 unsigned long bitnr;
1468 unsigned int page_nr = -1U;
1216 int c = 0; 1469 int c = 0;
1217 size_t w;
1218 1470
1219 /* If this is called without a bitmap, that is a bug. But just to be 1471 /* If this is called without a bitmap, that is a bug. But just to be
1220 * robust in case we screwed up elsewhere, in that case pretend there 1472 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1224,20 +1476,20 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1224 ERR_IF(!b->bm_pages) return 1; 1476 ERR_IF(!b->bm_pages) return 1;
1225 1477
1226 spin_lock_irqsave(&b->bm_lock, flags); 1478 spin_lock_irqsave(&b->bm_lock, flags);
1227 if (bm_is_locked(b)) 1479 if (BM_DONT_TEST & b->bm_flags)
1228 bm_print_lock_info(mdev); 1480 bm_print_lock_info(mdev);
1229 for (bitnr = s; bitnr <= e; bitnr++) { 1481 for (bitnr = s; bitnr <= e; bitnr++) {
1230 w = bitnr >> LN2_BPL; 1482 unsigned int idx = bm_bit_to_page_idx(b, bitnr);
1231 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { 1483 if (page_nr != idx) {
1232 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); 1484 page_nr = idx;
1233 if (p_addr) 1485 if (p_addr)
1234 bm_unmap(p_addr); 1486 bm_unmap(p_addr);
1235 p_addr = bm_map_paddr(b, w); 1487 p_addr = bm_map_pidx(b, idx);
1236 } 1488 }
1237 ERR_IF (bitnr >= b->bm_bits) { 1489 ERR_IF (bitnr >= b->bm_bits) {
1238 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); 1490 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1239 } else { 1491 } else {
1240 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); 1492 c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1241 } 1493 }
1242 } 1494 }
1243 if (p_addr) 1495 if (p_addr)
@@ -1272,7 +1524,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1272 ERR_IF(!b->bm_pages) return 0; 1524 ERR_IF(!b->bm_pages) return 0;
1273 1525
1274 spin_lock_irqsave(&b->bm_lock, flags); 1526 spin_lock_irqsave(&b->bm_lock, flags);
1275 if (bm_is_locked(b)) 1527 if (BM_DONT_TEST & b->bm_flags)
1276 bm_print_lock_info(mdev); 1528 bm_print_lock_info(mdev);
1277 1529
1278 s = S2W(enr); 1530 s = S2W(enr);
@@ -1280,7 +1532,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1280 count = 0; 1532 count = 0;
1281 if (s < b->bm_words) { 1533 if (s < b->bm_words) {
1282 int n = e-s; 1534 int n = e-s;
1283 p_addr = bm_map_paddr(b, s); 1535 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1284 bm = p_addr + MLPP(s); 1536 bm = p_addr + MLPP(s);
1285 while (n--) 1537 while (n--)
1286 count += hweight_long(*bm++); 1538 count += hweight_long(*bm++);
@@ -1292,18 +1544,20 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1292 return count; 1544 return count;
1293} 1545}
1294 1546
1295/* set all bits covered by the AL-extent al_enr */ 1547/* Set all bits covered by the AL-extent al_enr.
1548 * Returns number of bits changed. */
1296unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) 1549unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1297{ 1550{
1298 struct drbd_bitmap *b = mdev->bitmap; 1551 struct drbd_bitmap *b = mdev->bitmap;
1299 unsigned long *p_addr, *bm; 1552 unsigned long *p_addr, *bm;
1300 unsigned long weight; 1553 unsigned long weight;
1301 int count, s, e, i, do_now; 1554 unsigned long s, e;
1555 int count, i, do_now;
1302 ERR_IF(!b) return 0; 1556 ERR_IF(!b) return 0;
1303 ERR_IF(!b->bm_pages) return 0; 1557 ERR_IF(!b->bm_pages) return 0;
1304 1558
1305 spin_lock_irq(&b->bm_lock); 1559 spin_lock_irq(&b->bm_lock);
1306 if (bm_is_locked(b)) 1560 if (BM_DONT_SET & b->bm_flags)
1307 bm_print_lock_info(mdev); 1561 bm_print_lock_info(mdev);
1308 weight = b->bm_set; 1562 weight = b->bm_set;
1309 1563
@@ -1315,7 +1569,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1315 count = 0; 1569 count = 0;
1316 if (s < b->bm_words) { 1570 if (s < b->bm_words) {
1317 i = do_now = e-s; 1571 i = do_now = e-s;
1318 p_addr = bm_map_paddr(b, s); 1572 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1319 bm = p_addr + MLPP(s); 1573 bm = p_addr + MLPP(s);
1320 while (i--) { 1574 while (i--) {
1321 count += hweight_long(*bm); 1575 count += hweight_long(*bm);
@@ -1327,7 +1581,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1327 if (e == b->bm_words) 1581 if (e == b->bm_words)
1328 b->bm_set -= bm_clear_surplus(b); 1582 b->bm_set -= bm_clear_surplus(b);
1329 } else { 1583 } else {
1330 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); 1584 dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
1331 } 1585 }
1332 weight = b->bm_set - weight; 1586 weight = b->bm_set - weight;
1333 spin_unlock_irq(&b->bm_lock); 1587 spin_unlock_irq(&b->bm_lock);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..ef2ceed3be4b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -42,6 +42,7 @@
42#include <linux/genhd.h> 42#include <linux/genhd.h>
43#include <net/tcp.h> 43#include <net/tcp.h>
44#include <linux/lru_cache.h> 44#include <linux/lru_cache.h>
45#include <linux/prefetch.h>
45 46
46#ifdef __CHECKER__ 47#ifdef __CHECKER__
47# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) 48# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@@ -72,13 +73,6 @@ extern int fault_devs;
72extern char usermode_helper[]; 73extern char usermode_helper[];
73 74
74 75
75#ifndef TRUE
76#define TRUE 1
77#endif
78#ifndef FALSE
79#define FALSE 0
80#endif
81
82/* I don't remember why XCPU ... 76/* I don't remember why XCPU ...
83 * This is used to wake the asender, 77 * This is used to wake the asender,
84 * and to interrupt sending the sending task 78 * and to interrupt sending the sending task
@@ -104,6 +98,7 @@ extern char usermode_helper[];
104#define ID_SYNCER (-1ULL) 98#define ID_SYNCER (-1ULL)
105#define ID_VACANT 0 99#define ID_VACANT 0
106#define is_syncer_block_id(id) ((id) == ID_SYNCER) 100#define is_syncer_block_id(id) ((id) == ID_SYNCER)
101#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
107 102
108struct drbd_conf; 103struct drbd_conf;
109 104
@@ -114,11 +109,11 @@ struct drbd_conf;
114#define D_ASSERT(exp) if (!(exp)) \ 109#define D_ASSERT(exp) if (!(exp)) \
115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) 110 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
116 111
117#define ERR_IF(exp) if (({ \ 112#define ERR_IF(exp) if (({ \
118 int _b = (exp) != 0; \ 113 int _b = (exp) != 0; \
119 if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ 114 if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
120 __func__, #exp, __FILE__, __LINE__); \ 115 __func__, #exp, __FILE__, __LINE__); \
121 _b; \ 116 _b; \
122 })) 117 }))
123 118
124/* Defines to control fault insertion */ 119/* Defines to control fault insertion */
@@ -137,20 +132,19 @@ enum {
137 DRBD_FAULT_MAX, 132 DRBD_FAULT_MAX,
138}; 133};
139 134
140#ifdef CONFIG_DRBD_FAULT_INJECTION
141extern unsigned int 135extern unsigned int
142_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); 136_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
137
143static inline int 138static inline int
144drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { 139drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
140#ifdef CONFIG_DRBD_FAULT_INJECTION
145 return fault_rate && 141 return fault_rate &&
146 (enable_faults & (1<<type)) && 142 (enable_faults & (1<<type)) &&
147 _drbd_insert_fault(mdev, type); 143 _drbd_insert_fault(mdev, type);
148}
149#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
150
151#else 144#else
152#define FAULT_ACTIVE(_m, _t) (0) 145 return 0;
153#endif 146#endif
147}
154 148
155/* integer division, round _UP_ to the next integer */ 149/* integer division, round _UP_ to the next integer */
156#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) 150#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
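The hunk above folds the old FAULT_ACTIVE() macro into drbd_insert_fault() by moving the #ifdef inside the inline function, so call sites read the same whether fault injection is configured in or not and the disabled case collapses to a constant 0. A standalone sketch of that pattern; FAULT_INJECTION, maybe_fail() and fault_rate here are illustrative stand-ins, not the kernel symbols:

#include <stdio.h>
#include <stdlib.h>

/* #define FAULT_INJECTION 1   <- compile-time switch, standing in for CONFIG_DRBD_FAULT_INJECTION */

static unsigned int fault_rate = 10;    /* percent of calls that should fail when enabled */

static inline int maybe_fail(unsigned int type)
{
    (void)type;
#ifdef FAULT_INJECTION
    return fault_rate && (unsigned)(rand() % 100) < fault_rate;
#else
    return 0;                           /* compiles down to "never" when disabled */
#endif
}

int main(void)
{
    int hits = 0;
    for (int i = 0; i < 1000; i++)      /* callers look identical either way */
        hits += maybe_fail(0);
    printf("fault_rate=%u%%, injected %d of 1000\n", fault_rate, hits);
    return 0;
}

Keeping the function defined in both configurations also lets the compiler type-check the arguments even when the feature is compiled out.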
@@ -212,8 +206,10 @@ enum drbd_packets {
212 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ 206 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
213 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ 207 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
214 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ 208 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
209 P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
210 P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
215 211
216 P_MAX_CMD = 0x28, 212 P_MAX_CMD = 0x2A,
217 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 213 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
218 P_MAX_OPT_CMD = 0x101, 214 P_MAX_OPT_CMD = 0x101,
219 215
@@ -269,6 +265,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
269 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", 265 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
270 [P_COMPRESSED_BITMAP] = "CBitmap", 266 [P_COMPRESSED_BITMAP] = "CBitmap",
271 [P_DELAY_PROBE] = "DelayProbe", 267 [P_DELAY_PROBE] = "DelayProbe",
268 [P_OUT_OF_SYNC] = "OutOfSync",
272 [P_MAX_CMD] = NULL, 269 [P_MAX_CMD] = NULL,
273 }; 270 };
274 271
@@ -337,13 +334,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
337 * NOTE that the payload starts at a long aligned offset, 334 * NOTE that the payload starts at a long aligned offset,
338 * regardless of 32 or 64 bit arch! 335 * regardless of 32 or 64 bit arch!
339 */ 336 */
340struct p_header { 337struct p_header80 {
341 u32 magic; 338 u32 magic;
342 u16 command; 339 u16 command;
343 u16 length; /* bytes of data after this header */ 340 u16 length; /* bytes of data after this header */
344 u8 payload[0]; 341 u8 payload[0];
345} __packed; 342} __packed;
346/* 8 bytes. packet FIXED for the next century! */ 343
344/* Header for big packets, Used for data packets exceeding 64kB */
345struct p_header95 {
346 u16 magic; /* use DRBD_MAGIC_BIG here */
347 u16 command;
348 u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
349 u8 payload[0];
350} __packed;
351
352union p_header {
353 struct p_header80 h80;
354 struct p_header95 h95;
355};
347 356
348/* 357/*
349 * short commands, packets without payload, plain p_header: 358 * short commands, packets without payload, plain p_header:
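p_header95 keeps the on-wire header at 8 bytes while widening the length field; since only the low 24 bits of length are meant to be used, data packets can now exceed the 64 KiB limit implied by the 16-bit length of p_header80. A quick user-space check of the two layouts, with uint16_t/uint32_t standing in for the kernel's u16/u32, __attribute__((packed)) for __packed, and the zero-length payload member omitted:

#include <stdio.h>
#include <stdint.h>

struct hdr80 {                 /* mirrors struct p_header80 */
    uint32_t magic;
    uint16_t command;
    uint16_t length;           /* bytes of payload after the header */
} __attribute__((packed));

struct hdr95 {                 /* mirrors struct p_header95 */
    uint16_t magic;            /* DRBD_MAGIC_BIG on the wire */
    uint16_t command;
    uint32_t length;           /* only the low 24 bits are significant */
} __attribute__((packed));

int main(void)
{
    printf("sizeof hdr80 = %zu, sizeof hdr95 = %zu\n",
           sizeof(struct hdr80), sizeof(struct hdr95));   /* both print 8 */
    printf("max 16-bit payload = %u, max 24-bit payload = %u\n",
           0xffffu, (1u << 24) - 1);
    return 0;
}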
@@ -362,12 +371,16 @@ struct p_header {
362 */ 371 */
363 372
364/* these defines must not be changed without changing the protocol version */ 373/* these defines must not be changed without changing the protocol version */
365#define DP_HARDBARRIER 1 374#define DP_HARDBARRIER 1 /* depricated */
366#define DP_RW_SYNC 2 375#define DP_RW_SYNC 2 /* equals REQ_SYNC */
367#define DP_MAY_SET_IN_SYNC 4 376#define DP_MAY_SET_IN_SYNC 4
377#define DP_UNPLUG 8 /* not used anymore */
378#define DP_FUA 16 /* equals REQ_FUA */
379#define DP_FLUSH 32 /* equals REQ_FLUSH */
380#define DP_DISCARD 64 /* equals REQ_DISCARD */
368 381
369struct p_data { 382struct p_data {
370 struct p_header head; 383 union p_header head;
371 u64 sector; /* 64 bits sector number */ 384 u64 sector; /* 64 bits sector number */
372 u64 block_id; /* to identify the request in protocol B&C */ 385 u64 block_id; /* to identify the request in protocol B&C */
373 u32 seq_num; 386 u32 seq_num;
@@ -383,7 +396,7 @@ struct p_data {
383 * P_DATA_REQUEST, P_RS_DATA_REQUEST 396 * P_DATA_REQUEST, P_RS_DATA_REQUEST
384 */ 397 */
385struct p_block_ack { 398struct p_block_ack {
386 struct p_header head; 399 struct p_header80 head;
387 u64 sector; 400 u64 sector;
388 u64 block_id; 401 u64 block_id;
389 u32 blksize; 402 u32 blksize;
@@ -392,7 +405,7 @@ struct p_block_ack {
392 405
393 406
394struct p_block_req { 407struct p_block_req {
395 struct p_header head; 408 struct p_header80 head;
396 u64 sector; 409 u64 sector;
397 u64 block_id; 410 u64 block_id;
398 u32 blksize; 411 u32 blksize;
@@ -409,7 +422,7 @@ struct p_block_req {
409 */ 422 */
410 423
411struct p_handshake { 424struct p_handshake {
412 struct p_header head; /* 8 bytes */ 425 struct p_header80 head; /* 8 bytes */
413 u32 protocol_min; 426 u32 protocol_min;
414 u32 feature_flags; 427 u32 feature_flags;
415 u32 protocol_max; 428 u32 protocol_max;
@@ -424,19 +437,19 @@ struct p_handshake {
424/* 80 bytes, FIXED for the next century */ 437/* 80 bytes, FIXED for the next century */
425 438
426struct p_barrier { 439struct p_barrier {
427 struct p_header head; 440 struct p_header80 head;
428 u32 barrier; /* barrier number _handle_ only */ 441 u32 barrier; /* barrier number _handle_ only */
429 u32 pad; /* to multiple of 8 Byte */ 442 u32 pad; /* to multiple of 8 Byte */
430} __packed; 443} __packed;
431 444
432struct p_barrier_ack { 445struct p_barrier_ack {
433 struct p_header head; 446 struct p_header80 head;
434 u32 barrier; 447 u32 barrier;
435 u32 set_size; 448 u32 set_size;
436} __packed; 449} __packed;
437 450
438struct p_rs_param { 451struct p_rs_param {
439 struct p_header head; 452 struct p_header80 head;
440 u32 rate; 453 u32 rate;
441 454
442 /* Since protocol version 88 and higher. */ 455 /* Since protocol version 88 and higher. */
@@ -444,20 +457,31 @@ struct p_rs_param {
444} __packed; 457} __packed;
445 458
446struct p_rs_param_89 { 459struct p_rs_param_89 {
447 struct p_header head; 460 struct p_header80 head;
448 u32 rate; 461 u32 rate;
449 /* protocol version 89: */ 462 /* protocol version 89: */
450 char verify_alg[SHARED_SECRET_MAX]; 463 char verify_alg[SHARED_SECRET_MAX];
451 char csums_alg[SHARED_SECRET_MAX]; 464 char csums_alg[SHARED_SECRET_MAX];
452} __packed; 465} __packed;
453 466
467struct p_rs_param_95 {
468 struct p_header80 head;
469 u32 rate;
470 char verify_alg[SHARED_SECRET_MAX];
471 char csums_alg[SHARED_SECRET_MAX];
472 u32 c_plan_ahead;
473 u32 c_delay_target;
474 u32 c_fill_target;
475 u32 c_max_rate;
476} __packed;
477
454enum drbd_conn_flags { 478enum drbd_conn_flags {
455 CF_WANT_LOSE = 1, 479 CF_WANT_LOSE = 1,
456 CF_DRY_RUN = 2, 480 CF_DRY_RUN = 2,
457}; 481};
458 482
459struct p_protocol { 483struct p_protocol {
460 struct p_header head; 484 struct p_header80 head;
461 u32 protocol; 485 u32 protocol;
462 u32 after_sb_0p; 486 u32 after_sb_0p;
463 u32 after_sb_1p; 487 u32 after_sb_1p;
@@ -471,38 +495,38 @@ struct p_protocol {
471} __packed; 495} __packed;
472 496
473struct p_uuids { 497struct p_uuids {
474 struct p_header head; 498 struct p_header80 head;
475 u64 uuid[UI_EXTENDED_SIZE]; 499 u64 uuid[UI_EXTENDED_SIZE];
476} __packed; 500} __packed;
477 501
478struct p_rs_uuid { 502struct p_rs_uuid {
479 struct p_header head; 503 struct p_header80 head;
480 u64 uuid; 504 u64 uuid;
481} __packed; 505} __packed;
482 506
483struct p_sizes { 507struct p_sizes {
484 struct p_header head; 508 struct p_header80 head;
485 u64 d_size; /* size of disk */ 509 u64 d_size; /* size of disk */
486 u64 u_size; /* user requested size */ 510 u64 u_size; /* user requested size */
487 u64 c_size; /* current exported size */ 511 u64 c_size; /* current exported size */
488 u32 max_segment_size; /* Maximal size of a BIO */ 512 u32 max_bio_size; /* Maximal size of a BIO */
489 u16 queue_order_type; /* not yet implemented in DRBD*/ 513 u16 queue_order_type; /* not yet implemented in DRBD*/
490 u16 dds_flags; /* use enum dds_flags here. */ 514 u16 dds_flags; /* use enum dds_flags here. */
491} __packed; 515} __packed;
492 516
493struct p_state { 517struct p_state {
494 struct p_header head; 518 struct p_header80 head;
495 u32 state; 519 u32 state;
496} __packed; 520} __packed;
497 521
498struct p_req_state { 522struct p_req_state {
499 struct p_header head; 523 struct p_header80 head;
500 u32 mask; 524 u32 mask;
501 u32 val; 525 u32 val;
502} __packed; 526} __packed;
503 527
504struct p_req_state_reply { 528struct p_req_state_reply {
505 struct p_header head; 529 struct p_header80 head;
506 u32 retcode; 530 u32 retcode;
507} __packed; 531} __packed;
508 532
@@ -517,12 +541,19 @@ struct p_drbd06_param {
517} __packed; 541} __packed;
518 542
519struct p_discard { 543struct p_discard {
520 struct p_header head; 544 struct p_header80 head;
521 u64 block_id; 545 u64 block_id;
522 u32 seq_num; 546 u32 seq_num;
523 u32 pad; 547 u32 pad;
524} __packed; 548} __packed;
525 549
550struct p_block_desc {
551 struct p_header80 head;
552 u64 sector;
553 u32 blksize;
554 u32 pad; /* to multiple of 8 Byte */
555} __packed;
556
526/* Valid values for the encoding field. 557/* Valid values for the encoding field.
527 * Bump proto version when changing this. */ 558 * Bump proto version when changing this. */
528enum drbd_bitmap_code { 559enum drbd_bitmap_code {
@@ -533,7 +564,7 @@ enum drbd_bitmap_code {
533}; 564};
534 565
535struct p_compressed_bm { 566struct p_compressed_bm {
536 struct p_header head; 567 struct p_header80 head;
537 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code 568 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
538 * (encoding & 0x80): polarity (set/unset) of first runlength 569 * (encoding & 0x80): polarity (set/unset) of first runlength
539 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits 570 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -544,10 +575,10 @@ struct p_compressed_bm {
544 u8 code[0]; 575 u8 code[0];
545} __packed; 576} __packed;
546 577
547struct p_delay_probe { 578struct p_delay_probe93 {
548 struct p_header head; 579 struct p_header80 head;
549 u32 seq_num; /* sequence number to match the two probe packets */ 580 u32 seq_num; /* sequence number to match the two probe packets */
550 u32 offset; /* usecs the probe got sent after the reference time point */ 581 u32 offset; /* usecs the probe got sent after the reference time point */
551} __packed; 582} __packed;
552 583
553/* DCBP: Drbd Compressed Bitmap Packet ... */ 584/* DCBP: Drbd Compressed Bitmap Packet ... */
@@ -592,9 +623,9 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
592/* one bitmap packet, including the p_header, 623/* one bitmap packet, including the p_header,
593 * should fit within one _architecture independend_ page. 624 * should fit within one _architecture independend_ page.
594 * so we need to use the fixed size 4KiB page size 625 * so we need to use the fixed size 4KiB page size
595 * most architechtures have used for a long time. 626 * most architectures have used for a long time.
596 */ 627 */
597#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) 628#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
598#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) 629#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
599#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) 630#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
600#if (PAGE_SIZE < 4096) 631#if (PAGE_SIZE < 4096)
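For reference, with the 8-byte p_header80 a single 4 KiB bitmap packet carries 4096 - 8 = 4088 payload bytes, i.e. 511 longs on a 64-bit host (1022 on 32-bit); a trivial check:

#include <stdio.h>

int main(void)
{
    unsigned int header  = 8;                      /* sizeof(struct p_header80) */
    unsigned int payload = 4096 - header;          /* BM_PACKET_PAYLOAD_BYTES   */
    printf("payload %u bytes -> %zu longs per packet\n",
           payload, payload / sizeof(long));       /* 511 with 8-byte longs     */
    return 0;
}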
@@ -603,13 +634,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
603#endif 634#endif
604 635
605union p_polymorph { 636union p_polymorph {
606 struct p_header header; 637 union p_header header;
607 struct p_handshake handshake; 638 struct p_handshake handshake;
608 struct p_data data; 639 struct p_data data;
609 struct p_block_ack block_ack; 640 struct p_block_ack block_ack;
610 struct p_barrier barrier; 641 struct p_barrier barrier;
611 struct p_barrier_ack barrier_ack; 642 struct p_barrier_ack barrier_ack;
612 struct p_rs_param_89 rs_param_89; 643 struct p_rs_param_89 rs_param_89;
644 struct p_rs_param_95 rs_param_95;
613 struct p_protocol protocol; 645 struct p_protocol protocol;
614 struct p_sizes sizes; 646 struct p_sizes sizes;
615 struct p_uuids uuids; 647 struct p_uuids uuids;
@@ -617,6 +649,9 @@ union p_polymorph {
617 struct p_req_state req_state; 649 struct p_req_state req_state;
618 struct p_req_state_reply req_state_reply; 650 struct p_req_state_reply req_state_reply;
619 struct p_block_req block_req; 651 struct p_block_req block_req;
652 struct p_delay_probe93 delay_probe93;
653 struct p_rs_uuid rs_uuid;
654 struct p_block_desc block_desc;
620} __packed; 655} __packed;
621 656
622/**********************************************************************/ 657/**********************************************************************/
@@ -647,13 +682,6 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
647 return thi->t_state; 682 return thi->t_state;
648} 683}
649 684
650
651/*
652 * Having this as the first member of a struct provides sort of "inheritance".
653 * "derived" structs can be "drbd_queue_work()"ed.
654 * The callback should know and cast back to the descendant struct.
655 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
656 */
657struct drbd_work; 685struct drbd_work;
658typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); 686typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
659struct drbd_work { 687struct drbd_work {
@@ -672,7 +700,7 @@ struct drbd_request {
672 * see drbd_endio_pri(). */ 700 * see drbd_endio_pri(). */
673 struct bio *private_bio; 701 struct bio *private_bio;
674 702
675 struct hlist_node colision; 703 struct hlist_node collision;
676 sector_t sector; 704 sector_t sector;
677 unsigned int size; 705 unsigned int size;
678 unsigned int epoch; /* barrier_nr */ 706 unsigned int epoch; /* barrier_nr */
@@ -682,9 +710,6 @@ struct drbd_request {
682 * starting a new epoch... 710 * starting a new epoch...
683 */ 711 */
684 712
685 /* up to here, the struct layout is identical to drbd_epoch_entry;
686 * we might be able to use that to our advantage... */
687
688 struct list_head tl_requests; /* ring list in the transfer log */ 713 struct list_head tl_requests; /* ring list in the transfer log */
689 struct bio *master_bio; /* master bio pointer */ 714 struct bio *master_bio; /* master bio pointer */
690 unsigned long rq_state; /* see comments above _req_mod() */ 715 unsigned long rq_state; /* see comments above _req_mod() */
@@ -697,7 +722,7 @@ struct drbd_tl_epoch {
697 struct list_head requests; /* requests before */ 722 struct list_head requests; /* requests before */
698 struct drbd_tl_epoch *next; /* pointer to the next barrier */ 723 struct drbd_tl_epoch *next; /* pointer to the next barrier */
699 unsigned int br_number; /* the barriers identifier. */ 724 unsigned int br_number; /* the barriers identifier. */
700 int n_req; /* number of requests attached before this barrier */ 725 int n_writes; /* number of requests attached before this barrier */
701}; 726};
702 727
703struct drbd_request; 728struct drbd_request;
@@ -719,17 +744,12 @@ struct drbd_epoch {
719 744
720/* drbd_epoch flag bits */ 745/* drbd_epoch flag bits */
721enum { 746enum {
722 DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
723 DE_BARRIER_IN_NEXT_EPOCH_DONE,
724 DE_CONTAINS_A_BARRIER,
725 DE_HAVE_BARRIER_NUMBER, 747 DE_HAVE_BARRIER_NUMBER,
726 DE_IS_FINISHING,
727}; 748};
728 749
729enum epoch_event { 750enum epoch_event {
730 EV_PUT, 751 EV_PUT,
731 EV_GOT_BARRIER_NR, 752 EV_GOT_BARRIER_NR,
732 EV_BARRIER_DONE,
733 EV_BECAME_LAST, 753 EV_BECAME_LAST,
734 EV_CLEANUP = 32, /* used as flag */ 754 EV_CLEANUP = 32, /* used as flag */
735}; 755};
@@ -746,8 +766,8 @@ struct digest_info {
746 766
747struct drbd_epoch_entry { 767struct drbd_epoch_entry {
748 struct drbd_work w; 768 struct drbd_work w;
749 struct hlist_node colision; 769 struct hlist_node collision;
750 struct drbd_epoch *epoch; 770 struct drbd_epoch *epoch; /* for writes */
751 struct drbd_conf *mdev; 771 struct drbd_conf *mdev;
752 struct page *pages; 772 struct page *pages;
753 atomic_t pending_bios; 773 atomic_t pending_bios;
@@ -755,7 +775,10 @@ struct drbd_epoch_entry {
755 /* see comments on ee flag bits below */ 775 /* see comments on ee flag bits below */
756 unsigned long flags; 776 unsigned long flags;
757 sector_t sector; 777 sector_t sector;
758 u64 block_id; 778 union {
779 u64 block_id;
780 struct digest_info *digest;
781 };
759}; 782};
760 783
761/* ee flag bits. 784/* ee flag bits.
@@ -768,11 +791,6 @@ enum {
768 __EE_CALL_AL_COMPLETE_IO, 791 __EE_CALL_AL_COMPLETE_IO,
769 __EE_MAY_SET_IN_SYNC, 792 __EE_MAY_SET_IN_SYNC,
770 793
771 /* This epoch entry closes an epoch using a barrier.
772 * On sucessful completion, the epoch is released,
773 * and the P_BARRIER_ACK send. */
774 __EE_IS_BARRIER,
775
776 /* In case a barrier failed, 794 /* In case a barrier failed,
777 * we need to resubmit without the barrier flag. */ 795 * we need to resubmit without the barrier flag. */
778 __EE_RESUBMITTED, 796 __EE_RESUBMITTED,
@@ -781,20 +799,22 @@ enum {
781 * if any of those fail, we set this flag atomically 799 * if any of those fail, we set this flag atomically
782 * from the endio callback */ 800 * from the endio callback */
783 __EE_WAS_ERROR, 801 __EE_WAS_ERROR,
802
803 /* This ee has a pointer to a digest instead of a block id */
804 __EE_HAS_DIGEST,
784}; 805};
785#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 806#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
786#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 807#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
787#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
788#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 808#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
789#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 809#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
810#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
790 811
791/* global flag bits */ 812/* global flag bits */
792enum { 813enum {
793 CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ 814 CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
794 SIGNAL_ASENDER, /* whether asender wants to be interrupted */ 815 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
795 SEND_PING, /* whether asender should send a ping asap */ 816 SEND_PING, /* whether asender should send a ping asap */
796 817
797 STOP_SYNC_TIMER, /* tell timer to cancel itself */
798 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ 818 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
799 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ 819 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
800 MD_DIRTY, /* current uuids and flags not yet on disk */ 820 MD_DIRTY, /* current uuids and flags not yet on disk */
@@ -806,16 +826,16 @@ enum {
806 CRASHED_PRIMARY, /* This node was a crashed primary. 826 CRASHED_PRIMARY, /* This node was a crashed primary.
807 * Gets cleared when the state.conn 827 * Gets cleared when the state.conn
808 * goes into C_CONNECTED state. */ 828 * goes into C_CONNECTED state. */
809 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
810 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ 829 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
811 CONSIDER_RESYNC, 830 CONSIDER_RESYNC,
812 831
813 MD_NO_BARRIER, /* meta data device does not support barriers, 832 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
814 so don't even try */
815 SUSPEND_IO, /* suspend application io */ 833 SUSPEND_IO, /* suspend application io */
816 BITMAP_IO, /* suspend application io; 834 BITMAP_IO, /* suspend application io;
817 once no more io in flight, start bitmap io */ 835 once no more io in flight, start bitmap io */
818 BITMAP_IO_QUEUED, /* Started bitmap IO */ 836 BITMAP_IO_QUEUED, /* Started bitmap IO */
837 GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
838 WAS_IO_ERROR, /* Local disk failed returned IO error */
819 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ 839 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
820 NET_CONGESTED, /* The data socket is congested */ 840 NET_CONGESTED, /* The data socket is congested */
821 841
@@ -829,19 +849,44 @@ enum {
829 * the peer, if it changed there as well. */ 849 * the peer, if it changed there as well. */
830 CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ 850 CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
831 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ 851 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
852 NEW_CUR_UUID, /* Create new current UUID when thawing IO */
853 AL_SUSPENDED, /* Activity logging is currently suspended. */
854 AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
832}; 855};
833 856
834struct drbd_bitmap; /* opaque for drbd_conf */ 857struct drbd_bitmap; /* opaque for drbd_conf */
835 858
859/* definition of bits in bm_flags to be used in drbd_bm_lock
860 * and drbd_bitmap_io and friends. */
861enum bm_flag {
862 /* do we need to kfree, or vfree bm_pages? */
863 BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
864
865 /* currently locked for bulk operation */
866 BM_LOCKED_MASK = 0x7,
867
868 /* in detail, that is: */
869 BM_DONT_CLEAR = 0x1,
870 BM_DONT_SET = 0x2,
871 BM_DONT_TEST = 0x4,
872
873 /* (test bit, count bit) allowed (common case) */
874 BM_LOCKED_TEST_ALLOWED = 0x3,
875
876 /* testing bits, as well as setting new bits allowed, but clearing bits
877 * would be unexpected. Used during bitmap receive. Setting new bits
878 * requires sending of "out-of-sync" information, though. */
879 BM_LOCKED_SET_ALLOWED = 0x1,
880
881 /* clear is not expected while bitmap is locked for bulk operation */
882};
883
884
836/* TODO sort members for performance 885/* TODO sort members for performance
837 * MAYBE group them further */ 886 * MAYBE group them further */
838 887
839/* THINK maybe we actually want to use the default "event/%s" worker threads 888/* THINK maybe we actually want to use the default "event/%s" worker threads
840 * or similar in linux 2.6, which uses per cpu data and threads. 889 * or similar in linux 2.6, which uses per cpu data and threads.
841 *
842 * To be general, this might need a spin_lock member.
843 * For now, please use the mdev->req_lock to protect list_head,
844 * see drbd_queue_work below.
845 */ 890 */
846struct drbd_work_queue { 891struct drbd_work_queue {
847 struct list_head q; 892 struct list_head q;
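The bm_flag bits above replace the old all-or-nothing bm_is_locked(): a bulk operation locks the bitmap with exactly the operations it does not expect, and each accessor only complains when its own BM_DONT_* bit is set, as in the "(val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags" test earlier in this diff. A standalone sketch of that policy check, with the enum values copied from the hunk and change_forbidden() as an invented helper:

#include <stdio.h>

enum bm_flag {                    /* values as in the hunk above */
    BM_DONT_CLEAR = 0x1,
    BM_DONT_SET   = 0x2,
    BM_DONT_TEST  = 0x4,
    BM_LOCKED_TEST_ALLOWED = 0x3, /* set+clear forbidden, test/count fine */
    BM_LOCKED_SET_ALLOWED  = 0x1, /* only clearing is unexpected          */
};

/* Would changing bits to 'val' (1 = set, 0 = clear) violate the lock? */
static int change_forbidden(unsigned flags, int val)
{
    return (val ? BM_DONT_SET : BM_DONT_CLEAR) & flags;
}

int main(void)
{
    unsigned flags = BM_LOCKED_SET_ALLOWED;   /* e.g. while receiving a bitmap */
    printf("set   forbidden: %d\n", !!change_forbidden(flags, 1));  /* 0 */
    printf("clear forbidden: %d\n", !!change_forbidden(flags, 0));  /* 1 */
    printf("test  forbidden: %d\n", !!(flags & BM_DONT_TEST));      /* 0 */
    return 0;
}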
@@ -888,8 +933,6 @@ struct drbd_md {
888struct drbd_backing_dev { 933struct drbd_backing_dev {
889 struct block_device *backing_bdev; 934 struct block_device *backing_bdev;
890 struct block_device *md_bdev; 935 struct block_device *md_bdev;
891 struct file *lo_file;
892 struct file *md_file;
893 struct drbd_md md; 936 struct drbd_md md;
894 struct disk_conf dc; /* The user provided config... */ 937 struct disk_conf dc; /* The user provided config... */
895 sector_t known_size; /* last known size of that backing device */ 938 sector_t known_size; /* last known size of that backing device */
@@ -904,6 +947,7 @@ struct drbd_md_io {
904struct bm_io_work { 947struct bm_io_work {
905 struct drbd_work w; 948 struct drbd_work w;
906 char *why; 949 char *why;
950 enum bm_flag flags;
907 int (*io_fn)(struct drbd_conf *mdev); 951 int (*io_fn)(struct drbd_conf *mdev);
908 void (*done)(struct drbd_conf *mdev, int rv); 952 void (*done)(struct drbd_conf *mdev, int rv);
909}; 953};
@@ -912,7 +956,12 @@ enum write_ordering_e {
912 WO_none, 956 WO_none,
913 WO_drain_io, 957 WO_drain_io,
914 WO_bdev_flush, 958 WO_bdev_flush,
915 WO_bio_barrier 959};
960
961struct fifo_buffer {
962 int *values;
963 unsigned int head_index;
964 unsigned int size;
916}; 965};
917 966
918struct drbd_conf { 967struct drbd_conf {
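struct fifo_buffer above is the ring of correction values behind the new dynamic resync-rate controller (see c_plan_ahead and friends in p_rs_param_95). A minimal sketch of such a fixed-size ring, written so that the push returns the value it evicts and the caller can keep a running sum of what is still planned; fifo_init()/fifo_push() are illustrative here, not necessarily drbd's own helpers, and error handling is omitted:

#include <stdio.h>
#include <stdlib.h>

struct fifo {                       /* shaped like struct fifo_buffer above */
    int *values;
    unsigned int head_index;
    unsigned int size;
};

static void fifo_init(struct fifo *fb, unsigned int size)
{
    fb->values = calloc(size, sizeof(int));
    fb->head_index = 0;
    fb->size = size;
}

/* Store a new value at the head, return the value it replaces. */
static int fifo_push(struct fifo *fb, int value)
{
    int old = fb->values[fb->head_index];
    fb->values[fb->head_index] = value;
    fb->head_index = (fb->head_index + 1) % fb->size;
    return old;
}

int main(void)
{
    struct fifo plan;
    int sum = 0;                    /* running total of what is still "planned" */

    fifo_init(&plan, 4);
    for (int step = 1; step <= 6; step++)
        sum += step - fifo_push(&plan, step);
    printf("sum of last 4 entries: %d\n", sum);   /* 3+4+5+6 = 18 */
    free(plan.values);
    return 0;
}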
@@ -936,9 +985,19 @@ struct drbd_conf {
936 unsigned int ko_count; 985 unsigned int ko_count;
937 struct drbd_work resync_work, 986 struct drbd_work resync_work,
938 unplug_work, 987 unplug_work,
939 md_sync_work; 988 go_diskless,
989 md_sync_work,
990 start_resync_work;
940 struct timer_list resync_timer; 991 struct timer_list resync_timer;
941 struct timer_list md_sync_timer; 992 struct timer_list md_sync_timer;
993 struct timer_list start_resync_timer;
994 struct timer_list request_timer;
995#ifdef DRBD_DEBUG_MD_SYNC
996 struct {
997 unsigned int line;
998 const char* func;
999 } last_md_mark_dirty;
1000#endif
942 1001
943 /* Used after attach while negotiating new disk state. */ 1002 /* Used after attach while negotiating new disk state. */
944 union drbd_state new_state_tmp; 1003 union drbd_state new_state_tmp;
@@ -946,6 +1005,7 @@ struct drbd_conf {
946 union drbd_state state; 1005 union drbd_state state;
947 wait_queue_head_t misc_wait; 1006 wait_queue_head_t misc_wait;
948 wait_queue_head_t state_wait; /* upon each state change. */ 1007 wait_queue_head_t state_wait; /* upon each state change. */
1008 wait_queue_head_t net_cnt_wait;
949 unsigned int send_cnt; 1009 unsigned int send_cnt;
950 unsigned int recv_cnt; 1010 unsigned int recv_cnt;
951 unsigned int read_cnt; 1011 unsigned int read_cnt;
@@ -966,20 +1026,24 @@ struct drbd_conf {
966 struct hlist_head *tl_hash; 1026 struct hlist_head *tl_hash;
967 unsigned int tl_hash_s; 1027 unsigned int tl_hash_s;
968 1028
969 /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ 1029 /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
970 unsigned long rs_total; 1030 unsigned long rs_total;
971 /* number of sync IOs that failed in this run */ 1031 /* number of resync blocks that failed in this run */
972 unsigned long rs_failed; 1032 unsigned long rs_failed;
973 /* Syncer's start time [unit jiffies] */ 1033 /* Syncer's start time [unit jiffies] */
974 unsigned long rs_start; 1034 unsigned long rs_start;
975 /* cumulated time in PausedSyncX state [unit jiffies] */ 1035 /* cumulated time in PausedSyncX state [unit jiffies] */
976 unsigned long rs_paused; 1036 unsigned long rs_paused;
1037 /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
1038 unsigned long rs_same_csum;
1039#define DRBD_SYNC_MARKS 8
1040#define DRBD_SYNC_MARK_STEP (3*HZ)
977 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ 1041 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
978 unsigned long rs_mark_left; 1042 unsigned long rs_mark_left[DRBD_SYNC_MARKS];
979 /* marks's time [unit jiffies] */ 1043 /* marks's time [unit jiffies] */
980 unsigned long rs_mark_time; 1044 unsigned long rs_mark_time[DRBD_SYNC_MARKS];
981 /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ 1045 /* current index into rs_mark_{left,time} */
982 unsigned long rs_same_csum; 1046 int rs_last_mark;
983 1047
984 /* where does the admin want us to start? (sector) */ 1048 /* where does the admin want us to start? (sector) */
985 sector_t ov_start_sector; 1049 sector_t ov_start_sector;
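rs_mark_left[]/rs_mark_time[] turn the single progress mark into a ring of DRBD_SYNC_MARKS samples taken every DRBD_SYNC_MARK_STEP, so the resync speed can be averaged over a longer window. A sketch of deriving a rate from the oldest retained mark, using plain seconds instead of jiffies; the struct and field names below are made up:

#include <stdio.h>

#define MARKS 8

struct progress {
    unsigned long left[MARKS];   /* blocks still to sync at each mark */
    unsigned long time[MARKS];   /* timestamp of each mark, seconds   */
    int last;                    /* index of the most recent mark     */
};

/* Average rate (blocks/second) between the oldest retained mark and "now". */
static double rate_from_marks(const struct progress *p,
                              unsigned long now, unsigned long left_now)
{
    int oldest = (p->last + 1) % MARKS;       /* ring: next slot is the oldest */
    unsigned long dt = now - p->time[oldest];
    return dt ? (double)(p->left[oldest] - left_now) / dt : 0.0;
}

int main(void)
{
    struct progress p = { .last = MARKS - 1 };
    for (int i = 0; i < MARKS; i++) {         /* 3 s spacing, 300 blocks per step */
        p.time[i] = 3 * i;
        p.left[i] = 10000 - 300 * i;
    }
    /* now = 24 s, 7600 blocks left -> (10000 - 7600) / (24 - 0) = 100 blocks/s */
    printf("%.1f blocks/s\n", rate_from_marks(&p, 24, 7600));
    return 0;
}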
@@ -1012,10 +1076,10 @@ struct drbd_conf {
1012 spinlock_t epoch_lock; 1076 spinlock_t epoch_lock;
1013 unsigned int epochs; 1077 unsigned int epochs;
1014 enum write_ordering_e write_ordering; 1078 enum write_ordering_e write_ordering;
1015 struct list_head active_ee; /* IO in progress */ 1079 struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
1016 struct list_head sync_ee; /* IO in progress */ 1080 struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
1017 struct list_head done_ee; /* send ack */ 1081 struct list_head done_ee; /* send ack */
1018 struct list_head read_ee; /* IO in progress */ 1082 struct list_head read_ee; /* IO in progress (any read) */
1019 struct list_head net_ee; /* zero-copy network send in progress */ 1083 struct list_head net_ee; /* zero-copy network send in progress */
1020 struct hlist_head *ee_hash; /* is proteced by req_lock! */ 1084 struct hlist_head *ee_hash; /* is proteced by req_lock! */
1021 unsigned int ee_hash_s; 1085 unsigned int ee_hash_s;
@@ -1026,7 +1090,8 @@ struct drbd_conf {
1026 int next_barrier_nr; 1090 int next_barrier_nr;
1027 struct hlist_head *app_reads_hash; /* is proteced by req_lock */ 1091 struct hlist_head *app_reads_hash; /* is proteced by req_lock */
1028 struct list_head resync_reads; 1092 struct list_head resync_reads;
1029 atomic_t pp_in_use; 1093 atomic_t pp_in_use; /* allocated from page pool */
1094 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
1030 wait_queue_head_t ee_wait; 1095 wait_queue_head_t ee_wait;
1031 struct page *md_io_page; /* one page buffer for md_io */ 1096 struct page *md_io_page; /* one page buffer for md_io */
1032 struct page *md_io_tmpp; /* for logical_block_size != 512 */ 1097 struct page *md_io_tmpp; /* for logical_block_size != 512 */
@@ -1054,6 +1119,18 @@ struct drbd_conf {
1054 u64 ed_uuid; /* UUID of the exposed data */ 1119 u64 ed_uuid; /* UUID of the exposed data */
1055 struct mutex state_mutex; 1120 struct mutex state_mutex;
1056 char congestion_reason; /* Why we where congested... */ 1121 char congestion_reason; /* Why we where congested... */
1122 atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
1123 atomic_t rs_sect_ev; /* for submitted resync data rate, both */
1124 int rs_last_sect_ev; /* counter to compare with */
1125 int rs_last_events; /* counter of read or write "events" (unit sectors)
1126 * on the lower level device when we last looked. */
1127 int c_sync_rate; /* current resync rate after syncer throttle magic */
1128 struct fifo_buffer rs_plan_s; /* correction values of resync planer */
1129 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
1130 int rs_planed; /* resync sectors already planned */
1131 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1132 int peer_max_bio_size;
1133 int local_max_bio_size;
1057}; 1134};
1058 1135
1059static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1136static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1070,7 +1147,7 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
1070 return mdev->minor; 1147 return mdev->minor;
1071} 1148}
1072 1149
1073/* returns 1 if it was successfull, 1150/* returns 1 if it was successful,
1074 * returns 0 if there was no data socket. 1151 * returns 0 if there was no data socket.
1075 * so wherever you are going to use the data.socket, e.g. do 1152 * so wherever you are going to use the data.socket, e.g. do
1076 * if (!drbd_get_data_sock(mdev)) 1153 * if (!drbd_get_data_sock(mdev))
@@ -1115,14 +1192,19 @@ enum dds_flags {
1115}; 1192};
1116 1193
1117extern void drbd_init_set_defaults(struct drbd_conf *mdev); 1194extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1118extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 1195extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
1119 union drbd_state mask, union drbd_state val); 1196 enum chg_state_flags f,
1197 union drbd_state mask,
1198 union drbd_state val);
1120extern void drbd_force_state(struct drbd_conf *, union drbd_state, 1199extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1121 union drbd_state); 1200 union drbd_state);
1122extern int _drbd_request_state(struct drbd_conf *, union drbd_state, 1201extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
1123 union drbd_state, enum chg_state_flags); 1202 union drbd_state,
1124extern int __drbd_set_state(struct drbd_conf *, union drbd_state, 1203 union drbd_state,
1125 enum chg_state_flags, struct completion *done); 1204 enum chg_state_flags);
1205extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
1206 enum chg_state_flags,
1207 struct completion *done);
1126extern void print_st_err(struct drbd_conf *, union drbd_state, 1208extern void print_st_err(struct drbd_conf *, union drbd_state,
1127 union drbd_state, int); 1209 union drbd_state, int);
1128extern int drbd_thread_start(struct drbd_thread *thi); 1210extern int drbd_thread_start(struct drbd_thread *thi);
@@ -1145,17 +1227,17 @@ extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1145extern int drbd_send_protocol(struct drbd_conf *mdev); 1227extern int drbd_send_protocol(struct drbd_conf *mdev);
1146extern int drbd_send_uuids(struct drbd_conf *mdev); 1228extern int drbd_send_uuids(struct drbd_conf *mdev);
1147extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1229extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1148extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); 1230extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
1149extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); 1231extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
1150extern int _drbd_send_state(struct drbd_conf *mdev); 1232extern int _drbd_send_state(struct drbd_conf *mdev);
1151extern int drbd_send_state(struct drbd_conf *mdev); 1233extern int drbd_send_state(struct drbd_conf *mdev);
1152extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1234extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1153 enum drbd_packets cmd, struct p_header *h, 1235 enum drbd_packets cmd, struct p_header80 *h,
1154 size_t size, unsigned msg_flags); 1236 size_t size, unsigned msg_flags);
1155#define USE_DATA_SOCKET 1 1237#define USE_DATA_SOCKET 1
1156#define USE_META_SOCKET 0 1238#define USE_META_SOCKET 0
1157extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1239extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1158 enum drbd_packets cmd, struct p_header *h, 1240 enum drbd_packets cmd, struct p_header80 *h,
1159 size_t size); 1241 size_t size);
1160extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, 1242extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1161 char *data, size_t size); 1243 char *data, size_t size);
@@ -1167,14 +1249,13 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1167extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, 1249extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1168 struct p_block_req *rp); 1250 struct p_block_req *rp);
1169extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, 1251extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1170 struct p_data *dp); 1252 struct p_data *dp, int data_size);
1171extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, 1253extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1172 sector_t sector, int blksize, u64 block_id); 1254 sector_t sector, int blksize, u64 block_id);
1255extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
1173extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, 1256extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1174 struct drbd_epoch_entry *e); 1257 struct drbd_epoch_entry *e);
1175extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); 1258extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1176extern int _drbd_send_barrier(struct drbd_conf *mdev,
1177 struct drbd_tl_epoch *barrier);
1178extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, 1259extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1179 sector_t sector, int size, u64 block_id); 1260 sector_t sector, int size, u64 block_id);
1180extern int drbd_send_drequest_csum(struct drbd_conf *mdev, 1261extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
@@ -1185,14 +1266,13 @@ extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size)
1185 1266
1186extern int drbd_send_bitmap(struct drbd_conf *mdev); 1267extern int drbd_send_bitmap(struct drbd_conf *mdev);
1187extern int _drbd_send_bitmap(struct drbd_conf *mdev); 1268extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1188extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); 1269extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
1189extern void drbd_free_bc(struct drbd_backing_dev *ldev); 1270extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1190extern void drbd_mdev_cleanup(struct drbd_conf *mdev); 1271extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1272void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
1191 1273
1192/* drbd_meta-data.c (still in drbd_main.c) */
1193extern void drbd_md_sync(struct drbd_conf *mdev); 1274extern void drbd_md_sync(struct drbd_conf *mdev);
1194extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); 1275extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1195/* maybe define them below as inline? */
1196extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); 1276extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1197extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); 1277extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1198extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); 1278extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
@@ -1201,14 +1281,24 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
1201extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); 1281extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1202extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); 1282extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1203extern int drbd_md_test_flag(struct drbd_backing_dev *, int); 1283extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1284#ifndef DRBD_DEBUG_MD_SYNC
1204extern void drbd_md_mark_dirty(struct drbd_conf *mdev); 1285extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1286#else
1287#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
1288extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
1289 unsigned int line, const char *func);
1290#endif
1205extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, 1291extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1206 int (*io_fn)(struct drbd_conf *), 1292 int (*io_fn)(struct drbd_conf *),
1207 void (*done)(struct drbd_conf *, int), 1293 void (*done)(struct drbd_conf *, int),
1208 char *why); 1294 char *why, enum bm_flag flags);
1295extern int drbd_bitmap_io(struct drbd_conf *mdev,
1296 int (*io_fn)(struct drbd_conf *),
1297 char *why, enum bm_flag flags);
1209extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); 1298extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1210extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1299extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1211extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); 1300extern void drbd_go_diskless(struct drbd_conf *mdev);
1301extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1212 1302
1213 1303
1214/* Meta data layout 1304/* Meta data layout
@@ -1255,6 +1345,7 @@ struct bm_extent {
1255 1345
1256#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ 1346#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
1257#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ 1347#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
1348#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
1258 1349
1259/* drbd_bitmap.c */ 1350/* drbd_bitmap.c */
1260/* 1351/*
@@ -1264,6 +1355,8 @@ struct bm_extent {
1264 * Bit 1 ==> local node thinks this block needs to be synced. 1355 * Bit 1 ==> local node thinks this block needs to be synced.
1265 */ 1356 */
1266 1357
1358#define SLEEP_TIME (HZ/10)
1359
1267#define BM_BLOCK_SHIFT 12 /* 4k per bit */ 1360#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1268#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) 1361#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1269/* (9+3) : 512 bytes @ 8 bits; representing 16M storage 1362/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
@@ -1330,15 +1423,20 @@ struct bm_extent {
1330 * you should use 64bit OS for that much storage, anyways. */ 1423 * you should use 64bit OS for that much storage, anyways. */
1331#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) 1424#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
1332#else 1425#else
1333#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) 1426/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
1427#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
1428/* corresponds to (1UL << 38) bits right now. */
1334#endif 1429#endif
1335#endif 1430#endif
1336 1431
1337/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. 1432/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
1338 * With a value of 6 all IO in one 32K block make it to the same slot of the 1433 * With a value of 8 all IO in one 128K block make it to the same slot of the
1339 * hash table. */ 1434 * hash table. */
1340#define HT_SHIFT 6 1435#define HT_SHIFT 8
1341#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) 1436#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
1437#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
1438
1439#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
1342 1440
1343/* Number of elements in the app_reads_hash */ 1441/* Number of elements in the app_reads_hash */
1344#define APP_R_HSIZE 15 1442#define APP_R_HSIZE 15
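Raising HT_SHIFT from 6 to 8 lifts DRBD_MAX_BIO_SIZE from 1<<(9+6) = 32 KiB to 1<<(9+8) = 128 KiB, and with 512-byte sectors every sector inside one aligned 128 KiB block shares the value of sector >> HT_SHIFT, which is what the tl_hash/ee_hash comment relies on. A quick check of that arithmetic (the real hash functions live elsewhere and are not reproduced here):

#include <stdio.h>

#define HT_SHIFT          8
#define DRBD_MAX_BIO_SIZE (1U << (9 + HT_SHIFT))

int main(void)
{
    unsigned long first = 123456UL & ~((1UL << HT_SHIFT) - 1); /* start of a 128K block */
    unsigned long last  = first + (DRBD_MAX_BIO_SIZE / 512) - 1;

    printf("max bio: %u bytes (%u sectors)\n",
           DRBD_MAX_BIO_SIZE, DRBD_MAX_BIO_SIZE / 512);        /* 131072 bytes, 256 sectors */
    printf("same bucket key? %s\n",
           (first >> HT_SHIFT) == (last >> HT_SHIFT) ? "yes" : "no");
    return 0;
}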
@@ -1348,16 +1446,20 @@ extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new
1348extern void drbd_bm_cleanup(struct drbd_conf *mdev); 1446extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1349extern void drbd_bm_set_all(struct drbd_conf *mdev); 1447extern void drbd_bm_set_all(struct drbd_conf *mdev);
1350extern void drbd_bm_clear_all(struct drbd_conf *mdev); 1448extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1449/* set/clear/test only a few bits at a time */
1351extern int drbd_bm_set_bits( 1450extern int drbd_bm_set_bits(
1352 struct drbd_conf *mdev, unsigned long s, unsigned long e); 1451 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1353extern int drbd_bm_clear_bits( 1452extern int drbd_bm_clear_bits(
1354 struct drbd_conf *mdev, unsigned long s, unsigned long e); 1453 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1355/* bm_set_bits variant for use while holding drbd_bm_lock */ 1454extern int drbd_bm_count_bits(
1455 struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1456/* bm_set_bits variant for use while holding drbd_bm_lock,
1457 * may process the whole bitmap in one go */
1356extern void _drbd_bm_set_bits(struct drbd_conf *mdev, 1458extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1357 const unsigned long s, const unsigned long e); 1459 const unsigned long s, const unsigned long e);
1358extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); 1460extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1359extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); 1461extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1360extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); 1462extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
1361extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); 1463extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1362extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); 1464extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1363extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, 1465extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
@@ -1365,23 +1467,24 @@ extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1365extern size_t drbd_bm_words(struct drbd_conf *mdev); 1467extern size_t drbd_bm_words(struct drbd_conf *mdev);
1366extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); 1468extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1367extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); 1469extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1470
1471#define DRBD_END_OF_BITMAP (~(unsigned long)0)
1368extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1472extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1369/* bm_find_next variants for use while you hold drbd_bm_lock() */ 1473/* bm_find_next variants for use while you hold drbd_bm_lock() */
1370extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1474extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1371extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); 1475extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1476extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
1372extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); 1477extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1373extern int drbd_bm_rs_done(struct drbd_conf *mdev); 1478extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1374/* for receive_bitmap */ 1479/* for receive_bitmap */
1375extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, 1480extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1376 size_t number, unsigned long *buffer); 1481 size_t number, unsigned long *buffer);
1377/* for _drbd_send_bitmap and drbd_bm_write_sect */ 1482/* for _drbd_send_bitmap */
1378extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, 1483extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1379 size_t number, unsigned long *buffer); 1484 size_t number, unsigned long *buffer);
1380 1485
1381extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); 1486extern void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags);
1382extern void drbd_bm_unlock(struct drbd_conf *mdev); 1487extern void drbd_bm_unlock(struct drbd_conf *mdev);
1383
1384extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1385/* drbd_main.c */ 1488/* drbd_main.c */
1386 1489
1387extern struct kmem_cache *drbd_request_cache; 1490extern struct kmem_cache *drbd_request_cache;
@@ -1404,7 +1507,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev);
1404extern int proc_details; 1507extern int proc_details;
1405 1508
1406/* drbd_req */ 1509/* drbd_req */
1407extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); 1510extern int drbd_make_request(struct request_queue *q, struct bio *bio);
1408extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); 1511extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1409extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); 1512extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1410extern int is_valid_ar_handle(struct drbd_request *, sector_t); 1513extern int is_valid_ar_handle(struct drbd_request *, sector_t);
@@ -1416,12 +1519,14 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
1416extern char *ppsize(char *buf, unsigned long long size); 1519extern char *ppsize(char *buf, unsigned long long size);
1417extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); 1520extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
1418enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1521enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1419extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1522extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1420extern void resync_after_online_grow(struct drbd_conf *); 1523extern void resync_after_online_grow(struct drbd_conf *);
1421extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1524extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
1422extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1525extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
1423 int force); 1526 enum drbd_role new_role,
1424enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); 1527 int force);
1528extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1529extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
1425extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); 1530extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1426 1531
1427/* drbd_worker.c */ 1532/* drbd_worker.c */
@@ -1435,6 +1540,7 @@ extern int drbd_resync_finished(struct drbd_conf *mdev);
1435extern int drbd_md_sync_page_io(struct drbd_conf *mdev, 1540extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1436 struct drbd_backing_dev *bdev, sector_t sector, int rw); 1541 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1437extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); 1542extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1543extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
1438 1544
1439static inline void ov_oos_print(struct drbd_conf *mdev) 1545static inline void ov_oos_print(struct drbd_conf *mdev)
1440{ 1546{
@@ -1458,19 +1564,23 @@ extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
1458extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); 1564extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
1459extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); 1565extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
1460extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); 1566extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
1461extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); 1567extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
1462extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); 1568extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
1463extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); 1569extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
1464extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
1465extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); 1570extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
1466extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); 1571extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1467extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); 1572extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1468extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); 1573extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1469extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); 1574extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1575extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
1576extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
1577extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
1470 1578
1471extern void resync_timer_fn(unsigned long data); 1579extern void resync_timer_fn(unsigned long data);
1580extern void start_resync_timer_fn(unsigned long data);
1472 1581
1473/* drbd_receiver.c */ 1582/* drbd_receiver.c */
1583extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
1474extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, 1584extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1475 const unsigned rw, const int fault_type); 1585 const unsigned rw, const int fault_type);
1476extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1586extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1479,7 +1589,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1479 sector_t sector, 1589 sector_t sector,
1480 unsigned int data_size, 1590 unsigned int data_size,
1481 gfp_t gfp_mask) __must_hold(local); 1591 gfp_t gfp_mask) __must_hold(local);
1482extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); 1592extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1593 int is_net);
1594#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
1595#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
1483extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1596extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1484 struct list_head *head); 1597 struct list_head *head);
1485extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1598extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
@@ -1487,6 +1600,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1487extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); 1600extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1488extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); 1601extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1489extern void drbd_flush_workqueue(struct drbd_conf *mdev); 1602extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1603extern void drbd_free_tl_hash(struct drbd_conf *mdev);
1490 1604
1491/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to 1605/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1492 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ 1606 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@ -1549,16 +1663,16 @@ extern int drbd_rs_del_all(struct drbd_conf *mdev);
1549extern void drbd_rs_failed_io(struct drbd_conf *mdev, 1663extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1550 sector_t sector, int size); 1664 sector_t sector, int size);
1551extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); 1665extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1666extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
1552extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, 1667extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1553 int size, const char *file, const unsigned int line); 1668 int size, const char *file, const unsigned int line);
1554#define drbd_set_in_sync(mdev, sector, size) \ 1669#define drbd_set_in_sync(mdev, sector, size) \
1555 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) 1670 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1556extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, 1671extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1557 int size, const char *file, const unsigned int line); 1672 int size, const char *file, const unsigned int line);
1558#define drbd_set_out_of_sync(mdev, sector, size) \ 1673#define drbd_set_out_of_sync(mdev, sector, size) \
1559 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) 1674 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1560extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); 1675extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1561extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1562extern void drbd_al_shrink(struct drbd_conf *mdev); 1676extern void drbd_al_shrink(struct drbd_conf *mdev);
1563 1677
1564 1678
@@ -1600,6 +1714,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
1600#define susp_MASK 1 1714#define susp_MASK 1
1601#define user_isp_MASK 1 1715#define user_isp_MASK 1
1602#define aftr_isp_MASK 1 1716#define aftr_isp_MASK 1
1717#define susp_nod_MASK 1
1718#define susp_fen_MASK 1
1603 1719
1604#define NS(T, S) \ 1720#define NS(T, S) \
1605 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ 1721 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@ -1675,11 +1791,11 @@ static inline void drbd_state_unlock(struct drbd_conf *mdev)
1675 wake_up(&mdev->misc_wait); 1791 wake_up(&mdev->misc_wait);
1676} 1792}
1677 1793
1678static inline int _drbd_set_state(struct drbd_conf *mdev, 1794static inline enum drbd_state_rv
1679 union drbd_state ns, enum chg_state_flags flags, 1795_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1680 struct completion *done) 1796 enum chg_state_flags flags, struct completion *done)
1681{ 1797{
1682 int rv; 1798 enum drbd_state_rv rv;
1683 1799
1684 read_lock(&global_state_lock); 1800 read_lock(&global_state_lock);
1685 rv = __drbd_set_state(mdev, ns, flags, done); 1801 rv = __drbd_set_state(mdev, ns, flags, done);
@@ -1712,17 +1828,19 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
1712 case EP_PASS_ON: 1828 case EP_PASS_ON:
1713 if (!forcedetach) { 1829 if (!forcedetach) {
1714 if (__ratelimit(&drbd_ratelimit_state)) 1830 if (__ratelimit(&drbd_ratelimit_state))
1715 dev_err(DEV, "Local IO failed in %s." 1831 dev_err(DEV, "Local IO failed in %s.\n", where);
1716 "Passing error on...\n", where); 1832 if (mdev->state.disk > D_INCONSISTENT)
1833 _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
1717 break; 1834 break;
1718 } 1835 }
1719 /* NOTE fall through to detach case if forcedetach set */ 1836 /* NOTE fall through to detach case if forcedetach set */
1720 case EP_DETACH: 1837 case EP_DETACH:
1721 case EP_CALL_HELPER: 1838 case EP_CALL_HELPER:
1839 set_bit(WAS_IO_ERROR, &mdev->flags);
1722 if (mdev->state.disk > D_FAILED) { 1840 if (mdev->state.disk > D_FAILED) {
1723 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); 1841 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1724 dev_err(DEV, "Local IO failed in %s." 1842 dev_err(DEV,
1725 "Detaching...\n", where); 1843 "Local IO failed in %s. Detaching...\n", where);
1726 } 1844 }
1727 break; 1845 break;
1728 } 1846 }
@@ -1788,7 +1906,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1788static inline sector_t drbd_get_capacity(struct block_device *bdev) 1906static inline sector_t drbd_get_capacity(struct block_device *bdev)
1789{ 1907{
1790 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ 1908 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1791 return bdev ? bdev->bd_inode->i_size >> 9 : 0; 1909 return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
1792} 1910}
1793 1911
1794/** 1912/**
@@ -1856,13 +1974,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1856} 1974}
1857 1975
1858static inline void 1976static inline void
1859_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1860{
1861 list_add_tail(&w->list, &q->q);
1862 up(&q->s);
1863}
1864
1865static inline void
1866drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) 1977drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1867{ 1978{
1868 unsigned long flags; 1979 unsigned long flags;
@@ -1899,35 +2010,35 @@ static inline void request_ping(struct drbd_conf *mdev)
1899static inline int drbd_send_short_cmd(struct drbd_conf *mdev, 2010static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1900 enum drbd_packets cmd) 2011 enum drbd_packets cmd)
1901{ 2012{
1902 struct p_header h; 2013 struct p_header80 h;
1903 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); 2014 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1904} 2015}
1905 2016
1906static inline int drbd_send_ping(struct drbd_conf *mdev) 2017static inline int drbd_send_ping(struct drbd_conf *mdev)
1907{ 2018{
1908 struct p_header h; 2019 struct p_header80 h;
1909 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); 2020 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1910} 2021}
1911 2022
1912static inline int drbd_send_ping_ack(struct drbd_conf *mdev) 2023static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1913{ 2024{
1914 struct p_header h; 2025 struct p_header80 h;
1915 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); 2026 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1916} 2027}
1917 2028
1918static inline void drbd_thread_stop(struct drbd_thread *thi) 2029static inline void drbd_thread_stop(struct drbd_thread *thi)
1919{ 2030{
1920 _drbd_thread_stop(thi, FALSE, TRUE); 2031 _drbd_thread_stop(thi, false, true);
1921} 2032}
1922 2033
1923static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) 2034static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1924{ 2035{
1925 _drbd_thread_stop(thi, FALSE, FALSE); 2036 _drbd_thread_stop(thi, false, false);
1926} 2037}
1927 2038
1928static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) 2039static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1929{ 2040{
1930 _drbd_thread_stop(thi, TRUE, FALSE); 2041 _drbd_thread_stop(thi, true, false);
1931} 2042}
1932 2043
1933/* counts how many answer packets we expect from our peer, 2044/* counts how many answer packets we expect from our peer,
@@ -1972,7 +2083,7 @@ static inline void inc_ap_pending(struct drbd_conf *mdev)
1972/* counts how many resync-related answers we still expect from the peer 2083/* counts how many resync-related answers we still expect from the peer
1973 * increase decrease 2084 * increase decrease
1974 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) 2085 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
1975 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER) 2086 * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
1976 * (or P_NEG_ACK with ID_SYNCER) 2087 * (or P_NEG_ACK with ID_SYNCER)
1977 */ 2088 */
1978static inline void inc_rs_pending(struct drbd_conf *mdev) 2089static inline void inc_rs_pending(struct drbd_conf *mdev)
@@ -2013,7 +2124,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
2013static inline void put_net_conf(struct drbd_conf *mdev) 2124static inline void put_net_conf(struct drbd_conf *mdev)
2014{ 2125{
2015 if (atomic_dec_and_test(&mdev->net_cnt)) 2126 if (atomic_dec_and_test(&mdev->net_cnt))
2016 wake_up(&mdev->misc_wait); 2127 wake_up(&mdev->net_cnt_wait);
2017} 2128}
2018 2129
2019/** 2130/**
@@ -2044,10 +2155,22 @@ static inline int get_net_conf(struct drbd_conf *mdev)
2044 2155
2045static inline void put_ldev(struct drbd_conf *mdev) 2156static inline void put_ldev(struct drbd_conf *mdev)
2046{ 2157{
2158 int i = atomic_dec_return(&mdev->local_cnt);
2159
2160 /* This may be called from some endio handler,
2161 * so we must not sleep here. */
2162
2047 __release(local); 2163 __release(local);
2048 if (atomic_dec_and_test(&mdev->local_cnt)) 2164 D_ASSERT(i >= 0);
2165 if (i == 0) {
2166 if (mdev->state.disk == D_DISKLESS)
2167 /* even internal references gone, safe to destroy */
2168 drbd_ldev_destroy(mdev);
2169 if (mdev->state.disk == D_FAILED)
2170 /* all application IO references gone. */
2171 drbd_go_diskless(mdev);
2049 wake_up(&mdev->misc_wait); 2172 wake_up(&mdev->misc_wait);
2050 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); 2173 }
2051} 2174}
2052 2175
2053#ifndef __CHECKER__ 2176#ifndef __CHECKER__
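For readers skimming the put_ldev() change above, a hedged userspace sketch of the pattern it introduces: the reference drop and the "last reference gone" decision happen on the same atomic transition, and the follow-up action depends on the disk state at that moment. The types and messages below are placeholders, not kernel code.

        #include <stdatomic.h>
        #include <stdio.h>

        enum disk_state { D_DISKLESS, D_FAILED, D_UP_TO_DATE };

        struct dev {
                atomic_int local_cnt;
                enum disk_state disk;
        };

        static void put_local_ref(struct dev *d)
        {
                int i = atomic_fetch_sub(&d->local_cnt, 1) - 1;   /* value after the drop */

                if (i == 0) {
                        if (d->disk == D_DISKLESS)
                                puts("last reference gone: destroy backing device");
                        else if (d->disk == D_FAILED)
                                puts("last reference gone: transition to diskless");
                        puts("wake up waiters");
                }
        }

        int main(void)
        {
                struct dev d = { .disk = D_FAILED };

                atomic_init(&d.local_cnt, 2);
                put_local_ref(&d);      /* 2 -> 1: nothing special */
                put_local_ref(&d);      /* 1 -> 0: act on the final drop */
                return 0;
        }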
@@ -2055,6 +2178,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
2055{ 2178{
2056 int io_allowed; 2179 int io_allowed;
2057 2180
2181 /* never get a reference while D_DISKLESS */
2182 if (mdev->state.disk == D_DISKLESS)
2183 return 0;
2184
2058 atomic_inc(&mdev->local_cnt); 2185 atomic_inc(&mdev->local_cnt);
2059 io_allowed = (mdev->state.disk >= mins); 2186 io_allowed = (mdev->state.disk >= mins);
2060 if (!io_allowed) 2187 if (!io_allowed)
@@ -2069,17 +2196,18 @@ extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
2069static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, 2196static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2070 unsigned long *bits_left, unsigned int *per_mil_done) 2197 unsigned long *bits_left, unsigned int *per_mil_done)
2071{ 2198{
2072 /* 2199 /* this is to break it at compile time when we change that, in case we
2073 * this is to break it at compile time when we change that 2200 * want to support more than (1<<32) bits on a 32bit arch. */
2074 * (we may feel 4TB maximum storage per drbd is not enough)
2075 */
2076 typecheck(unsigned long, mdev->rs_total); 2201 typecheck(unsigned long, mdev->rs_total);
2077 2202
2078 /* note: both rs_total and rs_left are in bits, i.e. in 2203 /* note: both rs_total and rs_left are in bits, i.e. in
2079 * units of BM_BLOCK_SIZE. 2204 * units of BM_BLOCK_SIZE.
2080 * for the percentage, we don't care. */ 2205 * for the percentage, we don't care. */
2081 2206
2082 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; 2207 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2208 *bits_left = mdev->ov_left;
2209 else
2210 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2083 /* >> 10 to prevent overflow, 2211 /* >> 10 to prevent overflow,
2084 * +1 to prevent division by zero */ 2212 * +1 to prevent division by zero */
2085 if (*bits_left > mdev->rs_total) { 2213 if (*bits_left > mdev->rs_total) {
@@ -2094,10 +2222,19 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2094 *bits_left, mdev->rs_total, mdev->rs_failed); 2222 *bits_left, mdev->rs_total, mdev->rs_failed);
2095 *per_mil_done = 0; 2223 *per_mil_done = 0;
2096 } else { 2224 } else {
2097 /* make sure the calculation happens in long context */ 2225 /* Make sure the division happens in long context.
2098 unsigned long tmp = 1000UL - 2226 * We allow up to one petabyte storage right now,
2099 (*bits_left >> 10)*1000UL 2227 * at a granularity of 4k per bit that is 2**38 bits.
2100 / ((mdev->rs_total >> 10) + 1UL); 2228 * After shift right and multiplication by 1000,
2229 * this should still fit easily into a 32bit long,
2230 * so we don't need a 64bit division on 32bit arch.
2231 * Note: currently we don't support such large bitmaps on 32bit
2232 * arch anyways, but no harm done to be prepared for it here.
2233 */
2234 unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
2235 unsigned long left = *bits_left >> shift;
2236 unsigned long total = 1UL + (mdev->rs_total >> shift);
2237 unsigned long tmp = 1000UL - left * 1000UL/total;
2101 *per_mil_done = tmp; 2238 *per_mil_done = tmp;
2102 } 2239 }
2103} 2240}
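A worked example may help with the overflow argument in the comment above: both counters are shifted right before the multiplication by 1000, so the intermediate product stays well below 2^32 even for the largest supported bitmaps. The standalone helper below uses plain C types and made-up numbers; it only mirrors the shift-then-divide arithmetic and is not the driver code.

        #include <stdio.h>

        static unsigned int per_mil_done(unsigned long long bits_left,
                                         unsigned long long rs_total)
        {
                /* >= 2^32 bits: shift by 16 so left * 1000 still fits in 32 bits */
                unsigned int shift = rs_total >= (1ULL << 32) ? 16 : 10;
                unsigned long left  = (unsigned long)(bits_left >> shift);
                unsigned long total = 1UL + (unsigned long)(rs_total >> shift);

                return (unsigned int)(1000UL - left * 1000UL / total);
        }

        int main(void)
        {
                /* 1 TiB at 4 KiB per bit is 2^28 bits; pretend a quarter is still out of sync */
                unsigned long long total = 1ULL << 28;
                unsigned long long left  = total / 4;
                unsigned int pm = per_mil_done(left, total);

                printf("resync %u.%u%% done\n", pm / 10, pm % 10);   /* roughly 75% */
                return 0;
        }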
@@ -2116,8 +2253,9 @@ static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2116 return mxb; 2253 return mxb;
2117} 2254}
2118 2255
2119static inline int drbd_state_is_stable(union drbd_state s) 2256static inline int drbd_state_is_stable(struct drbd_conf *mdev)
2120{ 2257{
2258 union drbd_state s = mdev->state;
2121 2259
2122 /* DO NOT add a default clause, we want the compiler to warn us 2260 /* DO NOT add a default clause, we want the compiler to warn us
2123 * for any newly introduced state we may have forgotten to add here */ 2261 * for any newly introduced state we may have forgotten to add here */
@@ -2134,11 +2272,9 @@ static inline int drbd_state_is_stable(union drbd_state s)
2134 case C_VERIFY_T: 2272 case C_VERIFY_T:
2135 case C_PAUSED_SYNC_S: 2273 case C_PAUSED_SYNC_S:
2136 case C_PAUSED_SYNC_T: 2274 case C_PAUSED_SYNC_T:
2137 /* maybe stable, look at the disk state */ 2275 case C_AHEAD:
2138 break; 2276 case C_BEHIND:
2139 2277 /* transitional states, IO allowed */
2140 /* no new io accepted during transitional states
2141 * like handshake or teardown */
2142 case C_DISCONNECTING: 2278 case C_DISCONNECTING:
2143 case C_UNCONNECTED: 2279 case C_UNCONNECTED:
2144 case C_TIMEOUT: 2280 case C_TIMEOUT:
@@ -2149,7 +2285,15 @@ static inline int drbd_state_is_stable(union drbd_state s)
2149 case C_WF_REPORT_PARAMS: 2285 case C_WF_REPORT_PARAMS:
2150 case C_STARTING_SYNC_S: 2286 case C_STARTING_SYNC_S:
2151 case C_STARTING_SYNC_T: 2287 case C_STARTING_SYNC_T:
2288 break;
2289
2290 /* Allow IO in BM exchange states with new protocols */
2152 case C_WF_BITMAP_S: 2291 case C_WF_BITMAP_S:
2292 if (mdev->agreed_pro_version < 96)
2293 return 0;
2294 break;
2295
2296 /* no new io accepted in these states */
2153 case C_WF_BITMAP_T: 2297 case C_WF_BITMAP_T:
2154 case C_WF_SYNC_UUID: 2298 case C_WF_SYNC_UUID:
2155 case C_MASK: 2299 case C_MASK:
@@ -2179,41 +2323,52 @@ static inline int drbd_state_is_stable(union drbd_state s)
2179 return 1; 2323 return 1;
2180} 2324}
2181 2325
2182static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) 2326static inline int is_susp(union drbd_state s)
2327{
2328 return s.susp || s.susp_nod || s.susp_fen;
2329}
2330
2331static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
2183{ 2332{
2184 int mxb = drbd_get_max_buffers(mdev); 2333 int mxb = drbd_get_max_buffers(mdev);
2185 2334
2186 if (mdev->state.susp) 2335 if (is_susp(mdev->state))
2187 return 0; 2336 return false;
2188 if (test_bit(SUSPEND_IO, &mdev->flags)) 2337 if (test_bit(SUSPEND_IO, &mdev->flags))
2189 return 0; 2338 return false;
2190 2339
2191 /* to avoid potential deadlock or bitmap corruption, 2340 /* to avoid potential deadlock or bitmap corruption,
2192 * in various places, we only allow new application io 2341 * in various places, we only allow new application io
2193 * to start during "stable" states. */ 2342 * to start during "stable" states. */
2194 2343
2195 /* no new io accepted when attaching or detaching the disk */ 2344 /* no new io accepted when attaching or detaching the disk */
2196 if (!drbd_state_is_stable(mdev->state)) 2345 if (!drbd_state_is_stable(mdev))
2197 return 0; 2346 return false;
2198 2347
2199 /* since some older kernels don't have atomic_add_unless, 2348 /* since some older kernels don't have atomic_add_unless,
2200 * and we are within the spinlock anyways, we have this workaround. */ 2349 * and we are within the spinlock anyways, we have this workaround. */
2201 if (atomic_read(&mdev->ap_bio_cnt) > mxb) 2350 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2202 return 0; 2351 return false;
2203 if (test_bit(BITMAP_IO, &mdev->flags)) 2352 if (test_bit(BITMAP_IO, &mdev->flags))
2204 return 0; 2353 return false;
2205 return 1; 2354 return true;
2206} 2355}
2207 2356
2208/* I'd like to use wait_event_lock_irq, 2357static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
2209 * but I'm not sure when it got introduced,
2210 * and not sure when it has 3 or 4 arguments */
2211static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2212{ 2358{
2213 /* compare with after_state_ch, 2359 bool rv = false;
2214 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ 2360
2215 DEFINE_WAIT(wait); 2361 spin_lock_irq(&mdev->req_lock);
2362 rv = may_inc_ap_bio(mdev);
2363 if (rv)
2364 atomic_add(count, &mdev->ap_bio_cnt);
2365 spin_unlock_irq(&mdev->req_lock);
2366
2367 return rv;
2368}
2216 2369
2370static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2371{
2217 /* we wait here 2372 /* we wait here
2218 * as long as the device is suspended 2373 * as long as the device is suspended
2219 * until the bitmap is no longer on the fly during connection 2374 * until the bitmap is no longer on the fly during connection
@@ -2222,16 +2377,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2222 * to avoid races with the reconnect code, 2377 * to avoid races with the reconnect code,
2223 * we need to atomic_inc within the spinlock. */ 2378 * we need to atomic_inc within the spinlock. */
2224 2379
2225 spin_lock_irq(&mdev->req_lock); 2380 wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
2226 while (!__inc_ap_bio_cond(mdev)) {
2227 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
2228 spin_unlock_irq(&mdev->req_lock);
2229 schedule();
2230 finish_wait(&mdev->misc_wait, &wait);
2231 spin_lock_irq(&mdev->req_lock);
2232 }
2233 atomic_add(count, &mdev->ap_bio_cnt);
2234 spin_unlock_irq(&mdev->req_lock);
2235} 2381}
2236 2382
2237static inline void dec_ap_bio(struct drbd_conf *mdev) 2383static inline void dec_ap_bio(struct drbd_conf *mdev)
@@ -2251,9 +2397,11 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
2251 } 2397 }
2252} 2398}
2253 2399
2254static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) 2400static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2255{ 2401{
2402 int changed = mdev->ed_uuid != val;
2256 mdev->ed_uuid = val; 2403 mdev->ed_uuid = val;
2404 return changed;
2257} 2405}
2258 2406
2259static inline int seq_cmp(u32 a, u32 b) 2407static inline int seq_cmp(u32 a, u32 b)
@@ -2300,31 +2448,16 @@ static inline int drbd_queue_order_type(struct drbd_conf *mdev)
2300 return QUEUE_ORDERED_NONE; 2448 return QUEUE_ORDERED_NONE;
2301} 2449}
2302 2450
2303static inline void drbd_blk_run_queue(struct request_queue *q)
2304{
2305 if (q && q->unplug_fn)
2306 q->unplug_fn(q);
2307}
2308
2309static inline void drbd_kick_lo(struct drbd_conf *mdev)
2310{
2311 if (get_ldev(mdev)) {
2312 drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
2313 put_ldev(mdev);
2314 }
2315}
2316
2317static inline void drbd_md_flush(struct drbd_conf *mdev) 2451static inline void drbd_md_flush(struct drbd_conf *mdev)
2318{ 2452{
2319 int r; 2453 int r;
2320 2454
2321 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2455 if (test_bit(MD_NO_FUA, &mdev->flags))
2322 return; 2456 return;
2323 2457
2324 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, 2458 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
2325 BLKDEV_IFL_WAIT);
2326 if (r) { 2459 if (r) {
2327 set_bit(MD_NO_BARRIER, &mdev->flags); 2460 set_bit(MD_NO_FUA, &mdev->flags);
2328 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2461 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2329 } 2462 }
2330} 2463}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa650dd85b90..0358e55356c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -32,7 +32,7 @@
32#include <asm/types.h> 32#include <asm/types.h>
33#include <net/sock.h> 33#include <net/sock.h>
34#include <linux/ctype.h> 34#include <linux/ctype.h>
35#include <linux/smp_lock.h> 35#include <linux/mutex.h>
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/file.h> 37#include <linux/file.h>
38#include <linux/proc_fs.h> 38#include <linux/proc_fs.h>
@@ -64,6 +64,7 @@ struct after_state_chg_work {
64 struct completion *done; 64 struct completion *done;
65}; 65};
66 66
67static DEFINE_MUTEX(drbd_main_mutex);
67int drbdd_init(struct drbd_thread *); 68int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *); 69int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *); 70int drbd_asender(struct drbd_thread *);
@@ -77,13 +78,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); 78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data); 79static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); 80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80 82
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 83MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>"); 84 "Lars Ellenberg <lars@linbit.com>");
83MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); 85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84MODULE_VERSION(REL_VERSION); 86MODULE_VERSION(REL_VERSION);
85MODULE_LICENSE("GPL"); 87MODULE_LICENSE("GPL");
86MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); 88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
87MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); 90MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88 91
89#include <linux/moduleparam.h> 92#include <linux/moduleparam.h>
@@ -113,7 +116,7 @@ module_param(fault_devs, int, 0644);
113#endif 116#endif
114 117
115/* module parameter, defined */ 118/* module parameter, defined */
116unsigned int minor_count = 32; 119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
117int disable_sendpage; 120int disable_sendpage;
118int allow_oos; 121int allow_oos;
119unsigned int cn_idx = CN_IDX_DRBD; 122unsigned int cn_idx = CN_IDX_DRBD;
@@ -199,7 +202,7 @@ static int tl_init(struct drbd_conf *mdev)
199 INIT_LIST_HEAD(&b->w.list); 202 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL; 203 b->next = NULL;
201 b->br_number = 4711; 204 b->br_number = 4711;
202 b->n_req = 0; 205 b->n_writes = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 206 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204 207
205 mdev->oldest_tle = b; 208 mdev->oldest_tle = b;
@@ -240,7 +243,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
240 INIT_LIST_HEAD(&new->w.list); 243 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 244 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL; 245 new->next = NULL;
243 new->n_req = 0; 246 new->n_writes = 0;
244 247
245 newest_before = mdev->newest_tle; 248 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased 249 /* never send a barrier number == 0, because that is special-cased
@@ -284,9 +287,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
284 barrier_nr, b->br_number); 287 barrier_nr, b->br_number);
285 goto bail; 288 goto bail;
286 } 289 }
287 if (b->n_req != set_size) { 290 if (b->n_writes != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", 291 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
289 barrier_nr, set_size, b->n_req); 292 barrier_nr, set_size, b->n_writes);
290 goto bail; 293 goto bail;
291 } 294 }
292 295
@@ -335,57 +338,98 @@ bail:
335 338
336 339
337/** 340/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL 341 * _tl_restart() - Walks the transfer log, and applies an action to all requests
339 * @mdev: DRBD device. 342 * @mdev: DRBD device.
343 * @what: The action/event to perform with all request objects
340 * 344 *
341 * This is called after the connection to the peer was lost. The storage covered 345 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
342 * by the requests on the transfer gets marked as out of sync. Called from the 346 * restart_frozen_disk_io.
343 * receiver thread and the worker thread.
344 */ 347 */
345void tl_clear(struct drbd_conf *mdev) 348static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
346{ 349{
347 struct drbd_tl_epoch *b, *tmp; 350 struct drbd_tl_epoch *b, *tmp, **pn;
348 struct list_head *le, *tle; 351 struct list_head *le, *tle, carry_reads;
349 struct drbd_request *r; 352 struct drbd_request *req;
350 int new_initial_bnr = net_random(); 353 int rv, n_writes, n_reads;
351
352 spin_lock_irq(&mdev->req_lock);
353 354
354 b = mdev->oldest_tle; 355 b = mdev->oldest_tle;
356 pn = &mdev->oldest_tle;
355 while (b) { 357 while (b) {
358 n_writes = 0;
359 n_reads = 0;
360 INIT_LIST_HEAD(&carry_reads);
356 list_for_each_safe(le, tle, &b->requests) { 361 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests); 362 req = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock. 363 rv = _req_mod(req, what);
359 * But this is easier for now. */ 364
360 _req_mod(r, connection_lost_while_pending); 365 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
366 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
361 } 367 }
362 tmp = b->next; 368 tmp = b->next;
363 369
364 /* there could still be requests on that ring list, 370 if (n_writes) {
365 * in case local io is still pending */ 371 if (what == resend) {
366 list_del(&b->requests); 372 b->n_writes = n_writes;
367 373 if (b->w.cb == NULL) {
368 /* dec_ap_pending corresponding to queue_barrier. 374 b->w.cb = w_send_barrier;
369 * the newest barrier may not have been queued yet, 375 inc_ap_pending(mdev);
370 * in which case w.cb is still NULL. */ 376 set_bit(CREATE_BARRIER, &mdev->flags);
371 if (b->w.cb != NULL) 377 }
372 dec_ap_pending(mdev); 378
373 379 drbd_queue_work(&mdev->data.work, &b->w);
374 if (b == mdev->newest_tle) { 380 }
375 /* recycle, but reinit! */ 381 pn = &b->next;
376 D_ASSERT(tmp == NULL); 382 } else {
377 INIT_LIST_HEAD(&b->requests); 383 if (n_reads)
378 INIT_LIST_HEAD(&b->w.list); 384 list_add(&carry_reads, &b->requests);
379 b->w.cb = NULL; 385 /* there could still be requests on that ring list,
380 b->br_number = new_initial_bnr; 386 * in case local io is still pending */
381 b->n_req = 0; 387 list_del(&b->requests);
382 388
383 mdev->oldest_tle = b; 389 /* dec_ap_pending corresponding to queue_barrier.
384 break; 390 * the newest barrier may not have been queued yet,
391 * in which case w.cb is still NULL. */
392 if (b->w.cb != NULL)
393 dec_ap_pending(mdev);
394
395 if (b == mdev->newest_tle) {
396 /* recycle, but reinit! */
397 D_ASSERT(tmp == NULL);
398 INIT_LIST_HEAD(&b->requests);
399 list_splice(&carry_reads, &b->requests);
400 INIT_LIST_HEAD(&b->w.list);
401 b->w.cb = NULL;
402 b->br_number = net_random();
403 b->n_writes = 0;
404
405 *pn = b;
406 break;
407 }
408 *pn = tmp;
409 kfree(b);
385 } 410 }
386 kfree(b);
387 b = tmp; 411 b = tmp;
412 list_splice(&carry_reads, &b->requests);
388 } 413 }
414}
415
416
417/**
418 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
419 * @mdev: DRBD device.
420 *
421 * This is called after the connection to the peer was lost. The storage covered
422 * by the requests on the transfer gets marked as out of sync. Called from the
423 * receiver thread and the worker thread.
424 */
425void tl_clear(struct drbd_conf *mdev)
426{
427 struct list_head *le, *tle;
428 struct drbd_request *r;
429
430 spin_lock_irq(&mdev->req_lock);
431
432 _tl_restart(mdev, connection_lost_while_pending);
389 433
390 /* we expect this list to be empty. */ 434 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); 435 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
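_tl_restart() above folds a write and a read indication into the single return value of _req_mod() and accumulates them per epoch. The toy program below shows just that bit-packing; the MR_* values mirror what drbd_req.h is believed to define, but treat them as assumptions here.

        #include <stdio.h>

        #define MR_WRITE_SHIFT  0
        #define MR_WRITE        (1 << MR_WRITE_SHIFT)
        #define MR_READ_SHIFT   1
        #define MR_READ         (1 << MR_READ_SHIFT)

        /* stand-in for _req_mod(): report whether the request was a write or a read */
        static int req_mod_demo(int is_write)
        {
                return is_write ? MR_WRITE : MR_READ;
        }

        int main(void)
        {
                int n_writes = 0, n_reads = 0, i;

                for (i = 0; i < 5; i++) {
                        int rv = req_mod_demo(i & 1);           /* alternate read/write */

                        n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
                        n_reads  += (rv & MR_READ)  >> MR_READ_SHIFT;
                }
                printf("n_writes=%d n_reads=%d\n", n_writes, n_reads); /* 2 and 3 */
                return 0;
        }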
@@ -401,11 +445,20 @@ void tl_clear(struct drbd_conf *mdev)
401 /* ensure bit indicating barrier is required is clear */ 445 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags); 446 clear_bit(CREATE_BARRIER, &mdev->flags);
403 447
448 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
449
450 spin_unlock_irq(&mdev->req_lock);
451}
452
453void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
454{
455 spin_lock_irq(&mdev->req_lock);
456 _tl_restart(mdev, what);
404 spin_unlock_irq(&mdev->req_lock); 457 spin_unlock_irq(&mdev->req_lock);
405} 458}
406 459
407/** 460/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one 461 * cl_wide_st_chg() - true if the state change is a cluster wide one
409 * @mdev: DRBD device. 462 * @mdev: DRBD device.
410 * @os: old (current) state. 463 * @os: old (current) state.
411 * @ns: new (wanted) state. 464 * @ns: new (wanted) state.
@@ -422,12 +475,13 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); 475 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423} 476}
424 477
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 478enum drbd_state_rv
426 union drbd_state mask, union drbd_state val) 479drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
480 union drbd_state mask, union drbd_state val)
427{ 481{
428 unsigned long flags; 482 unsigned long flags;
429 union drbd_state os, ns; 483 union drbd_state os, ns;
430 int rv; 484 enum drbd_state_rv rv;
431 485
432 spin_lock_irqsave(&mdev->req_lock, flags); 486 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state; 487 os = mdev->state;
@@ -451,20 +505,22 @@ void drbd_force_state(struct drbd_conf *mdev,
451 drbd_change_state(mdev, CS_HARD, mask, val); 505 drbd_change_state(mdev, CS_HARD, mask, val);
452} 506}
453 507
454static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); 508static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
455static int is_valid_state_transition(struct drbd_conf *, 509static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state); 510 union drbd_state,
511 union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 512static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort); 513 union drbd_state ns, const char **warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *, 514int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state); 515 union drbd_state, union drbd_state);
461 516
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, 517static enum drbd_state_rv
463 union drbd_state mask, union drbd_state val) 518_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
519 union drbd_state val)
464{ 520{
465 union drbd_state os, ns; 521 union drbd_state os, ns;
466 unsigned long flags; 522 unsigned long flags;
467 int rv; 523 enum drbd_state_rv rv;
468 524
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) 525 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS; 526 return SS_CW_SUCCESS;
@@ -485,7 +541,7 @@ static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
485 if (rv == SS_SUCCESS) { 541 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os); 542 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS) 543 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */ 544 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
489 } 545 }
490 } 546 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags); 547 spin_unlock_irqrestore(&mdev->req_lock, flags);
@@ -503,14 +559,14 @@ static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
503 * Should not be called directly, use drbd_request_state() or 559 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state(). 560 * _drbd_request_state().
505 */ 561 */
506static int drbd_req_state(struct drbd_conf *mdev, 562static enum drbd_state_rv
507 union drbd_state mask, union drbd_state val, 563drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
508 enum chg_state_flags f) 564 union drbd_state val, enum chg_state_flags f)
509{ 565{
510 struct completion done; 566 struct completion done;
511 unsigned long flags; 567 unsigned long flags;
512 union drbd_state os, ns; 568 union drbd_state os, ns;
513 int rv; 569 enum drbd_state_rv rv;
514 570
515 init_completion(&done); 571 init_completion(&done);
516 572
@@ -585,10 +641,11 @@ abort:
585 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE 641 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586 * flag, or when logging of failed state change requests is not desired. 642 * flag, or when logging of failed state change requests is not desired.
587 */ 643 */
588int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, 644enum drbd_state_rv
589 union drbd_state val, enum chg_state_flags f) 645_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
646 union drbd_state val, enum chg_state_flags f)
590{ 647{
591 int rv; 648 enum drbd_state_rv rv;
592 649
593 wait_event(mdev->state_wait, 650 wait_event(mdev->state_wait,
594 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); 651 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
@@ -605,15 +662,15 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
605 drbd_role_str(ns.peer), 662 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk), 663 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk), 664 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r', 665 is_susp(ns) ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-', 666 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-', 667 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-' 668 ns.user_isp ? 'u' : '-'
612 ); 669 );
613} 670}
614 671
615void print_st_err(struct drbd_conf *mdev, 672void print_st_err(struct drbd_conf *mdev, union drbd_state os,
616 union drbd_state os, union drbd_state ns, int err) 673 union drbd_state ns, enum drbd_state_rv err)
617{ 674{
618 if (err == SS_IN_TRANSIENT_STATE) 675 if (err == SS_IN_TRANSIENT_STATE)
619 return; 676 return;
@@ -623,32 +680,18 @@ void print_st_err(struct drbd_conf *mdev,
623} 680}
624 681
625 682
626#define drbd_peer_str drbd_role_str
627#define drbd_pdsk_str drbd_disk_str
628
629#define drbd_susp_str(A) ((A) ? "1" : "0")
630#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632#define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634#define PSC(A) \
635 ({ if (ns.A != os.A) { \
636 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637 drbd_##A##_str(os.A), \
638 drbd_##A##_str(ns.A)); \
639 } })
640
641/** 683/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid 684 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device. 685 * @mdev: DRBD device.
644 * @ns: State to consider. 686 * @ns: State to consider.
645 */ 687 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) 688static enum drbd_state_rv
689is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{ 690{
648 /* See drbd_state_sw_errors in drbd_strings.c */ 691 /* See drbd_state_sw_errors in drbd_strings.c */
649 692
650 enum drbd_fencing_p fp; 693 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS; 694 enum drbd_state_rv rv = SS_SUCCESS;
652 695
653 fp = FP_DONT_CARE; 696 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) { 697 if (get_ldev(mdev)) {
@@ -702,6 +745,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
702 mdev->agreed_pro_version < 88) 745 mdev->agreed_pro_version < 88)
703 rv = SS_NOT_SUPPORTED; 746 rv = SS_NOT_SUPPORTED;
704 747
748 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
749 rv = SS_CONNECTED_OUTDATES;
750
705 return rv; 751 return rv;
706} 752}
707 753
@@ -711,10 +757,11 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
711 * @ns: new state. 757 * @ns: new state.
712 * @os: old state. 758 * @os: old state.
713 */ 759 */
714static int is_valid_state_transition(struct drbd_conf *mdev, 760static enum drbd_state_rv
715 union drbd_state ns, union drbd_state os) 761is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
762 union drbd_state os)
716{ 763{
717 int rv = SS_SUCCESS; 764 enum drbd_state_rv rv = SS_SUCCESS;
718 765
719 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && 766 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
720 os.conn > C_CONNECTED) 767 os.conn > C_CONNECTED)
@@ -749,6 +796,10 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
749 os.conn < C_CONNECTED) 796 os.conn < C_CONNECTED)
750 rv = SS_NEED_CONNECTION; 797 rv = SS_NEED_CONNECTION;
751 798
799 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
800 && os.conn < C_WF_REPORT_PARAMS)
801 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
802
752 return rv; 803 return rv;
753} 804}
754 805
@@ -763,9 +814,10 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
763 * to D_UNKNOWN. This rule and many more along those lines are in this function. 814 * to D_UNKNOWN. This rule and many more along those lines are in this function.
764 */ 815 */
765static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 816static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
766 union drbd_state ns, int *warn_sync_abort) 817 union drbd_state ns, const char **warn_sync_abort)
767{ 818{
768 enum drbd_fencing_p fp; 819 enum drbd_fencing_p fp;
820 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
769 821
770 fp = FP_DONT_CARE; 822 fp = FP_DONT_CARE;
771 if (get_ldev(mdev)) { 823 if (get_ldev(mdev)) {
@@ -778,11 +830,21 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
778 os.conn <= C_DISCONNECTING) 830 os.conn <= C_DISCONNECTING)
779 ns.conn = os.conn; 831 ns.conn = os.conn;
780 832
781 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ 833 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
834 * If you try to go into some Sync* state, that shall fail (elsewhere). */
782 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && 835 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
783 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) 836 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
784 ns.conn = os.conn; 837 ns.conn = os.conn;
785 838
839 /* we cannot fail (again) if we already detached */
840 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
841 ns.disk = D_DISKLESS;
842
843 /* if we are only D_ATTACHING yet,
844 * we can (and should) go directly to D_DISKLESS. */
845 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
846 ns.disk = D_DISKLESS;
847
786 /* After C_DISCONNECTING only C_STANDALONE may follow */ 848 /* After C_DISCONNECTING only C_STANDALONE may follow */
787 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 849 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
788 ns.conn = os.conn; 850 ns.conn = os.conn;
@@ -798,67 +860,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
798 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) 860 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
799 ns.aftr_isp = 0; 861 ns.aftr_isp = 0;
800 862
801 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
802 ns.pdsk = D_UNKNOWN;
803
804 /* Abort resync if a disk fails/detaches */ 863 /* Abort resync if a disk fails/detaches */
805 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && 864 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
806 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { 865 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
807 if (warn_sync_abort) 866 if (warn_sync_abort)
808 *warn_sync_abort = 1; 867 *warn_sync_abort =
868 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
869 "Online-verify" : "Resync";
809 ns.conn = C_CONNECTED; 870 ns.conn = C_CONNECTED;
810 } 871 }
811 872
812 if (ns.conn >= C_CONNECTED &&
813 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
814 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
815 switch (ns.conn) {
816 case C_WF_BITMAP_T:
817 case C_PAUSED_SYNC_T:
818 ns.disk = D_OUTDATED;
819 break;
820 case C_CONNECTED:
821 case C_WF_BITMAP_S:
822 case C_SYNC_SOURCE:
823 case C_PAUSED_SYNC_S:
824 ns.disk = D_UP_TO_DATE;
825 break;
826 case C_SYNC_TARGET:
827 ns.disk = D_INCONSISTENT;
828 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
829 break;
830 }
831 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
832 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
833 }
834
835 if (ns.conn >= C_CONNECTED &&
836 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
837 switch (ns.conn) {
838 case C_CONNECTED:
839 case C_WF_BITMAP_T:
840 case C_PAUSED_SYNC_T:
841 case C_SYNC_TARGET:
842 ns.pdsk = D_UP_TO_DATE;
843 break;
844 case C_WF_BITMAP_S:
845 case C_PAUSED_SYNC_S:
846 /* remap any consistent state to D_OUTDATED,
847 * but disallow "upgrade" of not even consistent states.
848 */
849 ns.pdsk =
850 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851 ? os.pdsk : D_OUTDATED;
852 break;
853 case C_SYNC_SOURCE:
854 ns.pdsk = D_INCONSISTENT;
855 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
856 break;
857 }
858 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
859 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
860 }
861
862 /* Connection breaks down before we finished "Negotiating" */ 873 /* Connection breaks down before we finished "Negotiating" */
863 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && 874 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
864 get_ldev_if_state(mdev, D_NEGOTIATING)) { 875 get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -873,10 +884,103 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
873 put_ldev(mdev); 884 put_ldev(mdev);
874 } 885 }
875 886
887 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
888 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
889 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
890 ns.disk = D_UP_TO_DATE;
891 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
892 ns.pdsk = D_UP_TO_DATE;
893 }
894
895 /* Implications of the connection stat on the disk states */
896 disk_min = D_DISKLESS;
897 disk_max = D_UP_TO_DATE;
898 pdsk_min = D_INCONSISTENT;
899 pdsk_max = D_UNKNOWN;
900 switch ((enum drbd_conns)ns.conn) {
901 case C_WF_BITMAP_T:
902 case C_PAUSED_SYNC_T:
903 case C_STARTING_SYNC_T:
904 case C_WF_SYNC_UUID:
905 case C_BEHIND:
906 disk_min = D_INCONSISTENT;
907 disk_max = D_OUTDATED;
908 pdsk_min = D_UP_TO_DATE;
909 pdsk_max = D_UP_TO_DATE;
910 break;
911 case C_VERIFY_S:
912 case C_VERIFY_T:
913 disk_min = D_UP_TO_DATE;
914 disk_max = D_UP_TO_DATE;
915 pdsk_min = D_UP_TO_DATE;
916 pdsk_max = D_UP_TO_DATE;
917 break;
918 case C_CONNECTED:
919 disk_min = D_DISKLESS;
920 disk_max = D_UP_TO_DATE;
921 pdsk_min = D_DISKLESS;
922 pdsk_max = D_UP_TO_DATE;
923 break;
924 case C_WF_BITMAP_S:
925 case C_PAUSED_SYNC_S:
926 case C_STARTING_SYNC_S:
927 case C_AHEAD:
928 disk_min = D_UP_TO_DATE;
929 disk_max = D_UP_TO_DATE;
930 pdsk_min = D_INCONSISTENT;
931 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
932 break;
933 case C_SYNC_TARGET:
934 disk_min = D_INCONSISTENT;
935 disk_max = D_INCONSISTENT;
936 pdsk_min = D_UP_TO_DATE;
937 pdsk_max = D_UP_TO_DATE;
938 break;
939 case C_SYNC_SOURCE:
940 disk_min = D_UP_TO_DATE;
941 disk_max = D_UP_TO_DATE;
942 pdsk_min = D_INCONSISTENT;
943 pdsk_max = D_INCONSISTENT;
944 break;
945 case C_STANDALONE:
946 case C_DISCONNECTING:
947 case C_UNCONNECTED:
948 case C_TIMEOUT:
949 case C_BROKEN_PIPE:
950 case C_NETWORK_FAILURE:
951 case C_PROTOCOL_ERROR:
952 case C_TEAR_DOWN:
953 case C_WF_CONNECTION:
954 case C_WF_REPORT_PARAMS:
955 case C_MASK:
956 break;
957 }
958 if (ns.disk > disk_max)
959 ns.disk = disk_max;
960
961 if (ns.disk < disk_min) {
962 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
963 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
964 ns.disk = disk_min;
965 }
966 if (ns.pdsk > pdsk_max)
967 ns.pdsk = pdsk_max;
968
969 if (ns.pdsk < pdsk_min) {
970 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
971 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
972 ns.pdsk = pdsk_min;
973 }
974
876 if (fp == FP_STONITH && 975 if (fp == FP_STONITH &&
877 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && 976 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
878 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) 977 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
879 ns.susp = 1; 978 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
979
980 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
981 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
982 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
983 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
880 984
881 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { 985 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
882 if (ns.conn == C_SYNC_SOURCE) 986 if (ns.conn == C_SYNC_SOURCE)
@@ -896,6 +1000,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
896/* helper for __drbd_set_state */ 1000/* helper for __drbd_set_state */
897static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) 1001static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
898{ 1002{
1003 if (mdev->agreed_pro_version < 90)
1004 mdev->ov_start_sector = 0;
1005 mdev->rs_total = drbd_bm_bits(mdev);
1006 mdev->ov_position = 0;
899 if (cs == C_VERIFY_T) { 1007 if (cs == C_VERIFY_T) {
900 /* starting online verify from an arbitrary position 1008 /* starting online verify from an arbitrary position
901 * does not fit well into the existing protocol. 1009 * does not fit well into the existing protocol.
@@ -905,11 +1013,21 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
905 mdev->ov_start_sector = ~(sector_t)0; 1013 mdev->ov_start_sector = ~(sector_t)0;
906 } else { 1014 } else {
907 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); 1015 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
908 if (bit >= mdev->rs_total) 1016 if (bit >= mdev->rs_total) {
909 mdev->ov_start_sector = 1017 mdev->ov_start_sector =
910 BM_BIT_TO_SECT(mdev->rs_total - 1); 1018 BM_BIT_TO_SECT(mdev->rs_total - 1);
1019 mdev->rs_total = 1;
1020 } else
1021 mdev->rs_total -= bit;
911 mdev->ov_position = mdev->ov_start_sector; 1022 mdev->ov_position = mdev->ov_start_sector;
912 } 1023 }
1024 mdev->ov_left = mdev->rs_total;
1025}
1026
1027static void drbd_resume_al(struct drbd_conf *mdev)
1028{
1029 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1030 dev_info(DEV, "Resumed AL updates\n");
913} 1031}
914 1032
915/** 1033/**
@@ -921,13 +1039,13 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
921 * 1039 *
922 * Caller needs to hold req_lock, and global_state_lock. Do not call directly. 1040 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
923 */ 1041 */
924int __drbd_set_state(struct drbd_conf *mdev, 1042enum drbd_state_rv
925 union drbd_state ns, enum chg_state_flags flags, 1043__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
926 struct completion *done) 1044 enum chg_state_flags flags, struct completion *done)
927{ 1045{
928 union drbd_state os; 1046 union drbd_state os;
929 int rv = SS_SUCCESS; 1047 enum drbd_state_rv rv = SS_SUCCESS;
930 int warn_sync_abort = 0; 1048 const char *warn_sync_abort = NULL;
931 struct after_state_chg_work *ascw; 1049 struct after_state_chg_work *ascw;
932 1050
933 os = mdev->state; 1051 os = mdev->state;
@@ -946,14 +1064,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
946 /* If the old state was illegal as well, then let 1064 /* If the old state was illegal as well, then let
947 this happen...*/ 1065 this happen...*/
948 1066
949 if (is_valid_state(mdev, os) == rv) { 1067 if (is_valid_state(mdev, os) == rv)
950 dev_err(DEV, "Considering state change from bad state. "
951 "Error would be: '%s'\n",
952 drbd_set_st_err_str(rv));
953 print_st(mdev, "old", os);
954 print_st(mdev, "new", ns);
955 rv = is_valid_state_transition(mdev, ns, os); 1068 rv = is_valid_state_transition(mdev, ns, os);
956 }
957 } else 1069 } else
958 rv = is_valid_state_transition(mdev, ns, os); 1070 rv = is_valid_state_transition(mdev, ns, os);
959 } 1071 }
@@ -965,22 +1077,49 @@ int __drbd_set_state(struct drbd_conf *mdev,
965 } 1077 }
966 1078
967 if (warn_sync_abort) 1079 if (warn_sync_abort)
968 dev_warn(DEV, "Resync aborted.\n"); 1080 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
969 1081
970 { 1082 {
971 char *pbp, pb[300]; 1083 char *pbp, pb[300];
972 pbp = pb; 1084 pbp = pb;
973 *pbp = 0; 1085 *pbp = 0;
974 PSC(role); 1086 if (ns.role != os.role)
975 PSC(peer); 1087 pbp += sprintf(pbp, "role( %s -> %s ) ",
976 PSC(conn); 1088 drbd_role_str(os.role),
977 PSC(disk); 1089 drbd_role_str(ns.role));
978 PSC(pdsk); 1090 if (ns.peer != os.peer)
979 PSC(susp); 1091 pbp += sprintf(pbp, "peer( %s -> %s ) ",
980 PSC(aftr_isp); 1092 drbd_role_str(os.peer),
981 PSC(peer_isp); 1093 drbd_role_str(ns.peer));
982 PSC(user_isp); 1094 if (ns.conn != os.conn)
983 dev_info(DEV, "%s\n", pb); 1095 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1096 drbd_conn_str(os.conn),
1097 drbd_conn_str(ns.conn));
1098 if (ns.disk != os.disk)
1099 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1100 drbd_disk_str(os.disk),
1101 drbd_disk_str(ns.disk));
1102 if (ns.pdsk != os.pdsk)
1103 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1104 drbd_disk_str(os.pdsk),
1105 drbd_disk_str(ns.pdsk));
1106 if (is_susp(ns) != is_susp(os))
1107 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1108 is_susp(os),
1109 is_susp(ns));
1110 if (ns.aftr_isp != os.aftr_isp)
1111 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1112 os.aftr_isp,
1113 ns.aftr_isp);
1114 if (ns.peer_isp != os.peer_isp)
1115 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1116 os.peer_isp,
1117 ns.peer_isp);
1118 if (ns.user_isp != os.user_isp)
1119 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1120 os.user_isp,
1121 ns.user_isp);
1122 dev_info(DEV, "%s\n", pb);
984 } 1123 }
985 1124
986 /* solve the race between becoming unconfigured, 1125 /* solve the race between becoming unconfigured,
@@ -997,21 +1136,27 @@ int __drbd_set_state(struct drbd_conf *mdev,
997 !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) 1136 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
998 set_bit(DEVICE_DYING, &mdev->flags); 1137 set_bit(DEVICE_DYING, &mdev->flags);
999 1138
1000 mdev->state.i = ns.i; 1139 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1140 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1141 * drbd_ldev_destroy() won't happen before our corresponding
1142 * after_state_ch works run, where we put_ldev again. */
1143 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1144 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1145 atomic_inc(&mdev->local_cnt);
1146
1147 mdev->state = ns;
1148
1149 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1150 drbd_print_uuids(mdev, "attached to UUIDs");
1151
1001 wake_up(&mdev->misc_wait); 1152 wake_up(&mdev->misc_wait);
1002 wake_up(&mdev->state_wait); 1153 wake_up(&mdev->state_wait);
1003 1154
1004 /* post-state-change actions */
1005 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1006 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007 mod_timer(&mdev->resync_timer, jiffies);
1008 }
1009
1010 /* aborted verify run. log the last position */ 1155 /* aborted verify run. log the last position */
1011 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && 1156 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012 ns.conn < C_CONNECTED) { 1157 ns.conn < C_CONNECTED) {
1013 mdev->ov_start_sector = 1158 mdev->ov_start_sector =
1014 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); 1159 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1015 dev_info(DEV, "Online Verify reached sector %llu\n", 1160 dev_info(DEV, "Online Verify reached sector %llu\n",
1016 (unsigned long long)mdev->ov_start_sector); 1161 (unsigned long long)mdev->ov_start_sector);
1017 } 1162 }
@@ -1019,41 +1164,37 @@ int __drbd_set_state(struct drbd_conf *mdev,
1019 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && 1164 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { 1165 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1021 dev_info(DEV, "Syncer continues.\n"); 1166 dev_info(DEV, "Syncer continues.\n");
1022 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; 1167 mdev->rs_paused += (long)jiffies
1023 if (ns.conn == C_SYNC_TARGET) { 1168 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1024 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) 1169 if (ns.conn == C_SYNC_TARGET)
1025 mod_timer(&mdev->resync_timer, jiffies); 1170 mod_timer(&mdev->resync_timer, jiffies);
1026 /* This if (!test_bit) is only needed for the case
1027 that a device that has ceased to use its timer,
1028 i.e. it is already in drbd_resync_finished() gets
1029 paused and resumed. */
1030 }
1031 } 1171 }
1032 1172
1033 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && 1173 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1034 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { 1174 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035 dev_info(DEV, "Resync suspended\n"); 1175 dev_info(DEV, "Resync suspended\n");
1036 mdev->rs_mark_time = jiffies; 1176 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1037 if (ns.conn == C_PAUSED_SYNC_T)
1038 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039 } 1177 }
1040 1178
1041 if (os.conn == C_CONNECTED && 1179 if (os.conn == C_CONNECTED &&
1042 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { 1180 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1043 mdev->ov_position = 0; 1181 unsigned long now = jiffies;
1044 mdev->rs_total = 1182 int i;
1045 mdev->rs_mark_left = drbd_bm_bits(mdev); 1183
1046 if (mdev->agreed_pro_version >= 90) 1184 set_ov_position(mdev, ns.conn);
1047 set_ov_position(mdev, ns.conn); 1185 mdev->rs_start = now;
1048 else 1186 mdev->rs_last_events = 0;
1049 mdev->ov_start_sector = 0; 1187 mdev->rs_last_sect_ev = 0;
1050 mdev->ov_left = mdev->rs_total
1051 - BM_SECT_TO_BIT(mdev->ov_position);
1052 mdev->rs_start =
1053 mdev->rs_mark_time = jiffies;
1054 mdev->ov_last_oos_size = 0; 1188 mdev->ov_last_oos_size = 0;
1055 mdev->ov_last_oos_start = 0; 1189 mdev->ov_last_oos_start = 0;
1056 1190
1191 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1192 mdev->rs_mark_left[i] = mdev->ov_left;
1193 mdev->rs_mark_time[i] = now;
1194 }
1195
1196 drbd_rs_controller_reset(mdev);
1197
1057 if (ns.conn == C_VERIFY_S) { 1198 if (ns.conn == C_VERIFY_S) {
1058 dev_info(DEV, "Starting Online Verify from sector %llu\n", 1199 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059 (unsigned long long)mdev->ov_position); 1200 (unsigned long long)mdev->ov_position);
@@ -1106,6 +1247,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
1106 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1247 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107 drbd_thread_restart_nowait(&mdev->receiver); 1248 drbd_thread_restart_nowait(&mdev->receiver);
1108 1249
1250 /* Resume AL writing if we get a connection */
1251 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1252 drbd_resume_al(mdev);
1253
1109 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); 1254 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110 if (ascw) { 1255 if (ascw) {
1111 ascw->os = os; 1256 ascw->os = os;
@@ -1153,6 +1298,26 @@ static void abw_start_sync(struct drbd_conf *mdev, int rv)
1153 } 1298 }
1154} 1299}
1155 1300
1301int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1302 int (*io_fn)(struct drbd_conf *),
1303 char *why, enum bm_flag flags)
1304{
1305 int rv;
1306
1307 D_ASSERT(current == mdev->worker.task);
1308
1309 /* open coded non-blocking drbd_suspend_io(mdev); */
1310 set_bit(SUSPEND_IO, &mdev->flags);
1311
1312 drbd_bm_lock(mdev, why, flags);
1313 rv = io_fn(mdev);
1314 drbd_bm_unlock(mdev);
1315
1316 drbd_resume_io(mdev);
1317
1318 return rv;
1319}
1320
1156/** 1321/**
1157 * after_state_ch() - Perform after state change actions that may sleep 1322 * after_state_ch() - Perform after state change actions that may sleep
1158 * @mdev: DRBD device. 1323 * @mdev: DRBD device.
@@ -1164,6 +1329,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164 union drbd_state ns, enum chg_state_flags flags) 1329 union drbd_state ns, enum chg_state_flags flags)
1165{ 1330{
1166 enum drbd_fencing_p fp; 1331 enum drbd_fencing_p fp;
1332 enum drbd_req_event what = nothing;
1333 union drbd_state nsm = (union drbd_state){ .i = -1 };
1167 1334
1168 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { 1335 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169 clear_bit(CRASHED_PRIMARY, &mdev->flags); 1336 clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1187,24 +1354,70 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1187 /* Here we have the actions that are performed after a 1354 /* Here we have the actions that are performed after a
1188 state change. This function might sleep */ 1355 state change. This function might sleep */
1189 1356
1190 if (fp == FP_STONITH && ns.susp) { 1357 nsm.i = -1;
1191 /* case1: The outdate peer handler is successful: 1358 if (ns.susp_nod) {
1192 * case2: The connection was established again: */ 1359 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1193 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || 1360 what = resend;
1194 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { 1361
1362 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1363 what = restart_frozen_disk_io;
1364
1365 if (what != nothing)
1366 nsm.susp_nod = 0;
1367 }
1368
1369 if (ns.susp_fen) {
1370 /* case1: The outdate peer handler is successful: */
1371 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1195 tl_clear(mdev); 1372 tl_clear(mdev);
1373 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1374 drbd_uuid_new_current(mdev);
1375 clear_bit(NEW_CUR_UUID, &mdev->flags);
1376 }
1196 spin_lock_irq(&mdev->req_lock); 1377 spin_lock_irq(&mdev->req_lock);
1197 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); 1378 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1198 spin_unlock_irq(&mdev->req_lock); 1379 spin_unlock_irq(&mdev->req_lock);
1199 } 1380 }
1381 /* case2: The connection was established again: */
1382 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1383 clear_bit(NEW_CUR_UUID, &mdev->flags);
1384 what = resend;
1385 nsm.susp_fen = 0;
1386 }
1387 }
1388
1389 if (what != nothing) {
1390 spin_lock_irq(&mdev->req_lock);
1391 _tl_restart(mdev, what);
1392 nsm.i &= mdev->state.i;
1393 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1394 spin_unlock_irq(&mdev->req_lock);
1395 }
1396
1397 /* Became sync source. With protocol >= 96, we still need to send out
1398 * the sync uuid now. Need to do that before any drbd_send_state, or
1399 * the other side may go "paused sync" before receiving the sync uuids,
1400 * which is unexpected. */
1401 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1402 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1403 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1404 drbd_gen_and_send_sync_uuid(mdev);
1405 put_ldev(mdev);
1200 } 1406 }
1407
1201 /* Do not change the order of the if above and the two below... */ 1408 /* Do not change the order of the if above and the two below... */
1202 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ 1409 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1203 drbd_send_uuids(mdev); 1410 drbd_send_uuids(mdev);
1204 drbd_send_state(mdev); 1411 drbd_send_state(mdev);
1205 } 1412 }
1206 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) 1413 /* No point in queuing send_bitmap if we don't have a connection
1207 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); 1414 * anymore, so check also the _current_ state, not only the new state
1415 * at the time this work was queued. */
1416 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1417 mdev->state.conn == C_WF_BITMAP_S)
1418 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1419 "send_bitmap (WFBitMapS)",
1420 BM_LOCKED_TEST_ALLOWED);
1208 1421
1209 /* Lost contact to peer's copy of the data */ 1422 /* Lost contact to peer's copy of the data */
1210 if ((os.pdsk >= D_INCONSISTENT && 1423 if ((os.pdsk >= D_INCONSISTENT &&
@@ -1216,20 +1429,42 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1216 if (get_ldev(mdev)) { 1429 if (get_ldev(mdev)) {
1217 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1430 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1218 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1431 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1219 drbd_uuid_new_current(mdev); 1432 if (is_susp(mdev->state)) {
1220 drbd_send_uuids(mdev); 1433 set_bit(NEW_CUR_UUID, &mdev->flags);
1434 } else {
1435 drbd_uuid_new_current(mdev);
1436 drbd_send_uuids(mdev);
1437 }
1221 } 1438 }
1222 put_ldev(mdev); 1439 put_ldev(mdev);
1223 } 1440 }
1224 } 1441 }
1225 1442
1226 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1443 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1227 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1444 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1228 drbd_uuid_new_current(mdev); 1445 drbd_uuid_new_current(mdev);
1446 drbd_send_uuids(mdev);
1447 }
1229 1448
1230 /* D_DISKLESS Peer becomes secondary */ 1449 /* D_DISKLESS Peer becomes secondary */
1231 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1450 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1232 drbd_al_to_on_disk_bm(mdev); 1451 /* We may still be Primary ourselves.
1452 * No harm done if the bitmap still changes,
1453 * redirtied pages will follow later. */
1454 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1455 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1456 put_ldev(mdev);
1457 }
1458
1459 /* Write out all changed bits on demote.
1460 * Though, no need to do that just yet
1461 * if there is a resync going on still */
1462 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1463 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1464 /* No changes to the bitmap expected this time, so assert that,
1465 * even though no harm was done if it did change. */
1466 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1467 "demote", BM_LOCKED_TEST_ALLOWED);
1233 put_ldev(mdev); 1468 put_ldev(mdev);
1234 } 1469 }
1235 1470
@@ -1257,64 +1492,86 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1257 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1492 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1258 drbd_send_state(mdev); 1493 drbd_send_state(mdev);
1259 1494
1495 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1496 drbd_send_state(mdev);
1497
1260 /* We are in the progress to start a full sync... */ 1498 /* We are in the progress to start a full sync... */
1261 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 1499 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1262 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) 1500 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1263 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); 1501 /* no other bitmap changes expected during this phase */
1502 drbd_queue_bitmap_io(mdev,
1503 &drbd_bmio_set_n_write, &abw_start_sync,
1504 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1264 1505
1265 /* We are invalidating our self... */ 1506 /* We are invalidating our self... */
1266 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && 1507 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1267 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1508 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1268 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1509 /* other bitmap operation expected during this phase */
1510 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1511 "set_n_write from invalidate", BM_LOCKED_MASK);
1269 1512
1270 if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1513 /* first half of local IO error, failure to attach,
1514 * or administrative detach */
1515 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1271 enum drbd_io_error_p eh; 1516 enum drbd_io_error_p eh;
1272 1517 int was_io_error;
1273 eh = EP_PASS_ON; 1518 /* corresponding get_ldev was in __drbd_set_state, to serialize
1274 if (get_ldev_if_state(mdev, D_FAILED)) { 1519 * our cleanup here with the transition to D_DISKLESS,
1275 eh = mdev->ldev->dc.on_io_error; 1520 * so it is safe to dereference ldev here. */
1276 put_ldev(mdev); 1521 eh = mdev->ldev->dc.on_io_error;
1277 } 1522 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1523
1524 /* current state still has to be D_FAILED,
1525 * there is only one way out: to D_DISKLESS,
1526 * and that may only happen after our put_ldev below. */
1527 if (mdev->state.disk != D_FAILED)
1528 dev_err(DEV,
1529 "ASSERT FAILED: disk is %s during detach\n",
1530 drbd_disk_str(mdev->state.disk));
1531
1532 if (drbd_send_state(mdev))
1533 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1534 else
1535 dev_err(DEV, "Sending state for detaching disk failed\n");
1278 1536
1279 drbd_rs_cancel_all(mdev); 1537 drbd_rs_cancel_all(mdev);
1280 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1281 and it is D_DISKLESS here, local_cnt can only go down, it can
1282 not increase... It will reach zero */
1283 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1284 mdev->rs_total = 0;
1285 mdev->rs_failed = 0;
1286 atomic_set(&mdev->rs_pending_cnt, 0);
1287 1538
1288 spin_lock_irq(&mdev->req_lock); 1539 /* In case we want to get something to stable storage still,
1289 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); 1540 * this may be the last chance.
1290 spin_unlock_irq(&mdev->req_lock); 1541 * Following put_ldev may transition to D_DISKLESS. */
1542 drbd_md_sync(mdev);
1543 put_ldev(mdev);
1291 1544
1292 if (eh == EP_CALL_HELPER) 1545 if (was_io_error && eh == EP_CALL_HELPER)
1293 drbd_khelper(mdev, "local-io-error"); 1546 drbd_khelper(mdev, "local-io-error");
1294 } 1547 }
1295 1548
1296 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { 1549 /* second half of local IO error, failure to attach,
1297 1550 * or administrative detach,
1298 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { 1551 * after local_cnt references have reached zero again */
1299 if (drbd_send_state(mdev)) 1552 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1300 dev_warn(DEV, "Notified peer that my disk is broken.\n"); 1553 /* We must still be diskless,
1301 else 1554 * re-attach has to be serialized with this! */
1302 dev_err(DEV, "Sending state in drbd_io_error() failed\n"); 1555 if (mdev->state.disk != D_DISKLESS)
1303 } 1556 dev_err(DEV,
1304 1557 "ASSERT FAILED: disk is %s while going diskless\n",
1305 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); 1558 drbd_disk_str(mdev->state.disk));
1306 lc_destroy(mdev->resync); 1559
1307 mdev->resync = NULL; 1560 mdev->rs_total = 0;
1308 lc_destroy(mdev->act_log); 1561 mdev->rs_failed = 0;
1309 mdev->act_log = NULL; 1562 atomic_set(&mdev->rs_pending_cnt, 0);
1310 __no_warn(local, 1563
1311 drbd_free_bc(mdev->ldev); 1564 if (drbd_send_state(mdev))
1312 mdev->ldev = NULL;); 1565 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1313 1566 /* corresponding get_ldev in __drbd_set_state
1314 if (mdev->md_io_tmpp) 1567 * this may finally trigger drbd_ldev_destroy. */
1315 __free_page(mdev->md_io_tmpp); 1568 put_ldev(mdev);
1316 } 1569 }
1317 1570
1571 /* Notify peer that I had a local IO error, and did not detach. */
1572 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1573 drbd_send_state(mdev);
1574
1318 /* Disks got bigger while they were detached */ 1575 /* Disks got bigger while they were detached */
1319 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && 1576 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1320 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { 1577 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
@@ -1328,6 +1585,28 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1328 (os.user_isp && !ns.user_isp)) 1585 (os.user_isp && !ns.user_isp))
1329 resume_next_sg(mdev); 1586 resume_next_sg(mdev);
1330 1587
1588 /* sync target done with resync. Explicitly notify peer, even though
1589 * it should (at least for non-empty resyncs) already know itself. */
1590 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1591 drbd_send_state(mdev);
1592
1593 /* This triggers bitmap writeout of potentially still unwritten pages
1594 * if the resync finished cleanly, or aborted because of peer disk
1595 * failure, or because of connection loss.
1596 * For resync aborted because of local disk failure, we cannot do
1597 * any bitmap writeout anymore.
1598 * No harm done if some bits change during this phase.
1599 */
1600 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1601 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1602 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
1603 put_ldev(mdev);
1604 }
1605
1606 /* free tl_hash if we got thawed and are C_STANDALONE */
1607 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1608 drbd_free_tl_hash(mdev);
1609
1331 /* Upon network connection, we need to start the receiver */ 1610 /* Upon network connection, we need to start the receiver */
1332 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) 1611 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1333 drbd_thread_start(&mdev->receiver); 1612 drbd_thread_start(&mdev->receiver);
@@ -1424,7 +1703,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1424 if (!try_module_get(THIS_MODULE)) { 1703 if (!try_module_get(THIS_MODULE)) {
1425 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); 1704 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1426 spin_unlock_irqrestore(&thi->t_lock, flags); 1705 spin_unlock_irqrestore(&thi->t_lock, flags);
1427 return FALSE; 1706 return false;
1428 } 1707 }
1429 1708
1430 init_completion(&thi->stop); 1709 init_completion(&thi->stop);
@@ -1441,7 +1720,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1441 dev_err(DEV, "Couldn't start thread\n"); 1720 dev_err(DEV, "Couldn't start thread\n");
1442 1721
1443 module_put(THIS_MODULE); 1722 module_put(THIS_MODULE);
1444 return FALSE; 1723 return false;
1445 } 1724 }
1446 spin_lock_irqsave(&thi->t_lock, flags); 1725 spin_lock_irqsave(&thi->t_lock, flags);
1447 thi->task = nt; 1726 thi->task = nt;
@@ -1461,7 +1740,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1461 break; 1740 break;
1462 } 1741 }
1463 1742
1464 return TRUE; 1743 return true;
1465} 1744}
1466 1745
1467 1746
@@ -1554,23 +1833,23 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1554 1833
1555/* the appropriate socket mutex must be held already */ 1834/* the appropriate socket mutex must be held already */
1556int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1835int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1557 enum drbd_packets cmd, struct p_header *h, 1836 enum drbd_packets cmd, struct p_header80 *h,
1558 size_t size, unsigned msg_flags) 1837 size_t size, unsigned msg_flags)
1559{ 1838{
1560 int sent, ok; 1839 int sent, ok;
1561 1840
1562 ERR_IF(!h) return FALSE; 1841 ERR_IF(!h) return false;
1563 ERR_IF(!size) return FALSE; 1842 ERR_IF(!size) return false;
1564 1843
1565 h->magic = BE_DRBD_MAGIC; 1844 h->magic = BE_DRBD_MAGIC;
1566 h->command = cpu_to_be16(cmd); 1845 h->command = cpu_to_be16(cmd);
1567 h->length = cpu_to_be16(size-sizeof(struct p_header)); 1846 h->length = cpu_to_be16(size-sizeof(struct p_header80));
1568 1847
1569 sent = drbd_send(mdev, sock, h, size, msg_flags); 1848 sent = drbd_send(mdev, sock, h, size, msg_flags);
1570 1849
1571 ok = (sent == size); 1850 ok = (sent == size);
1572 if (!ok) 1851 if (!ok && !signal_pending(current))
1573 dev_err(DEV, "short sent %s size=%d sent=%d\n", 1852 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1574 cmdname(cmd), (int)size, sent); 1853 cmdname(cmd), (int)size, sent);
1575 return ok; 1854 return ok;
1576} 1855}
@@ -1579,7 +1858,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1579 * when we hold the appropriate socket mutex. 1858 * when we hold the appropriate socket mutex.
1580 */ 1859 */
1581int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1860int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1582 enum drbd_packets cmd, struct p_header *h, size_t size) 1861 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1583{ 1862{
1584 int ok = 0; 1863 int ok = 0;
1585 struct socket *sock; 1864 struct socket *sock;
@@ -1607,7 +1886,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1607int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, 1886int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1608 size_t size) 1887 size_t size)
1609{ 1888{
1610 struct p_header h; 1889 struct p_header80 h;
1611 int ok; 1890 int ok;
1612 1891
1613 h.magic = BE_DRBD_MAGIC; 1892 h.magic = BE_DRBD_MAGIC;
@@ -1629,7 +1908,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1629 1908
1630int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) 1909int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1631{ 1910{
1632 struct p_rs_param_89 *p; 1911 struct p_rs_param_95 *p;
1633 struct socket *sock; 1912 struct socket *sock;
1634 int size, rv; 1913 int size, rv;
1635 const int apv = mdev->agreed_pro_version; 1914 const int apv = mdev->agreed_pro_version;
@@ -1637,7 +1916,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1637 size = apv <= 87 ? sizeof(struct p_rs_param) 1916 size = apv <= 87 ? sizeof(struct p_rs_param)
1638 : apv == 88 ? sizeof(struct p_rs_param) 1917 : apv == 88 ? sizeof(struct p_rs_param)
1639 + strlen(mdev->sync_conf.verify_alg) + 1 1918 + strlen(mdev->sync_conf.verify_alg) + 1
1640 : /* 89 */ sizeof(struct p_rs_param_89); 1919 : apv <= 94 ? sizeof(struct p_rs_param_89)
1920 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1641 1921
1642 /* used from admin command context and receiver/worker context. 1922 /* used from admin command context and receiver/worker context.
1643 * to avoid kmalloc, grab the socket right here, 1923 * to avoid kmalloc, grab the socket right here,
@@ -1648,12 +1928,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1648 if (likely(sock != NULL)) { 1928 if (likely(sock != NULL)) {
1649 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; 1929 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1650 1930
1651 p = &mdev->data.sbuf.rs_param_89; 1931 p = &mdev->data.sbuf.rs_param_95;
1652 1932
1653 /* initialize verify_alg and csums_alg */ 1933 /* initialize verify_alg and csums_alg */
1654 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 1934 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1655 1935
1656 p->rate = cpu_to_be32(sc->rate); 1936 p->rate = cpu_to_be32(sc->rate);
1937 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1938 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1939 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1940 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1657 1941
1658 if (apv >= 88) 1942 if (apv >= 88)
1659 strcpy(p->verify_alg, mdev->sync_conf.verify_alg); 1943 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
@@ -1700,7 +1984,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
1700 else { 1984 else {
1701 dev_err(DEV, "--dry-run is not supported by peer"); 1985 dev_err(DEV, "--dry-run is not supported by peer");
1702 kfree(p); 1986 kfree(p);
1703 return 0; 1987 return -1;
1704 } 1988 }
1705 } 1989 }
1706 p->conn_flags = cpu_to_be32(cf); 1990 p->conn_flags = cpu_to_be32(cf);
@@ -1709,7 +1993,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
1709 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); 1993 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1710 1994
1711 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, 1995 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1712 (struct p_header *)p, size); 1996 (struct p_header80 *)p, size);
1713 kfree(p); 1997 kfree(p);
1714 return rv; 1998 return rv;
1715} 1999}
@@ -1735,7 +2019,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1735 put_ldev(mdev); 2019 put_ldev(mdev);
1736 2020
1737 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, 2021 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1738 (struct p_header *)&p, sizeof(p)); 2022 (struct p_header80 *)&p, sizeof(p));
1739} 2023}
1740 2024
1741int drbd_send_uuids(struct drbd_conf *mdev) 2025int drbd_send_uuids(struct drbd_conf *mdev)
@@ -1748,22 +2032,46 @@ int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1748 return _drbd_send_uuids(mdev, 8); 2032 return _drbd_send_uuids(mdev, 8);
1749} 2033}
1750 2034
2035void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2036{
2037 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2038 u64 *uuid = mdev->ldev->md.uuid;
2039 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2040 text,
2041 (unsigned long long)uuid[UI_CURRENT],
2042 (unsigned long long)uuid[UI_BITMAP],
2043 (unsigned long long)uuid[UI_HISTORY_START],
2044 (unsigned long long)uuid[UI_HISTORY_END]);
2045 put_ldev(mdev);
2046 } else {
2047 dev_info(DEV, "%s effective data uuid: %016llX\n",
2048 text,
2049 (unsigned long long)mdev->ed_uuid);
2050 }
2051}
1751 2052
1752int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) 2053int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
1753{ 2054{
1754 struct p_rs_uuid p; 2055 struct p_rs_uuid p;
2056 u64 uuid;
1755 2057
1756 p.uuid = cpu_to_be64(val); 2058 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2059
2060 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2061 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2062 drbd_print_uuids(mdev, "updated sync UUID");
2063 drbd_md_sync(mdev);
2064 p.uuid = cpu_to_be64(uuid);
1757 2065
1758 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, 2066 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1759 (struct p_header *)&p, sizeof(p)); 2067 (struct p_header80 *)&p, sizeof(p));
1760} 2068}
1761 2069
1762int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) 2070int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1763{ 2071{
1764 struct p_sizes p; 2072 struct p_sizes p;
1765 sector_t d_size, u_size; 2073 sector_t d_size, u_size;
1766 int q_order_type; 2074 int q_order_type, max_bio_size;
1767 int ok; 2075 int ok;
1768 2076
1769 if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 2077 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -1771,22 +2079,25 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
1771 d_size = drbd_get_max_capacity(mdev->ldev); 2079 d_size = drbd_get_max_capacity(mdev->ldev);
1772 u_size = mdev->ldev->dc.disk_size; 2080 u_size = mdev->ldev->dc.disk_size;
1773 q_order_type = drbd_queue_order_type(mdev); 2081 q_order_type = drbd_queue_order_type(mdev);
2082 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
1774 put_ldev(mdev); 2084 put_ldev(mdev);
1775 } else { 2085 } else {
1776 d_size = 0; 2086 d_size = 0;
1777 u_size = 0; 2087 u_size = 0;
1778 q_order_type = QUEUE_ORDERED_NONE; 2088 q_order_type = QUEUE_ORDERED_NONE;
2089 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
1779 } 2090 }
1780 2091
1781 p.d_size = cpu_to_be64(d_size); 2092 p.d_size = cpu_to_be64(d_size);
1782 p.u_size = cpu_to_be64(u_size); 2093 p.u_size = cpu_to_be64(u_size);
1783 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 2094 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1784 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); 2095 p.max_bio_size = cpu_to_be32(max_bio_size);
1785 p.queue_order_type = cpu_to_be16(q_order_type); 2096 p.queue_order_type = cpu_to_be16(q_order_type);
1786 p.dds_flags = cpu_to_be16(flags); 2097 p.dds_flags = cpu_to_be16(flags);
1787 2098
1788 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, 2099 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1789 (struct p_header *)&p, sizeof(p)); 2100 (struct p_header80 *)&p, sizeof(p));
1790 return ok; 2101 return ok;
1791} 2102}
1792 2103
@@ -1811,7 +2122,7 @@ int drbd_send_state(struct drbd_conf *mdev)
1811 2122
1812 if (likely(sock != NULL)) { 2123 if (likely(sock != NULL)) {
1813 ok = _drbd_send_cmd(mdev, sock, P_STATE, 2124 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1814 (struct p_header *)&p, sizeof(p), 0); 2125 (struct p_header80 *)&p, sizeof(p), 0);
1815 } 2126 }
1816 2127
1817 mutex_unlock(&mdev->data.mutex); 2128 mutex_unlock(&mdev->data.mutex);
@@ -1829,17 +2140,17 @@ int drbd_send_state_req(struct drbd_conf *mdev,
1829 p.val = cpu_to_be32(val.i); 2140 p.val = cpu_to_be32(val.i);
1830 2141
1831 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, 2142 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1832 (struct p_header *)&p, sizeof(p)); 2143 (struct p_header80 *)&p, sizeof(p));
1833} 2144}
1834 2145
1835int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) 2146int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
1836{ 2147{
1837 struct p_req_state_reply p; 2148 struct p_req_state_reply p;
1838 2149
1839 p.retcode = cpu_to_be32(retcode); 2150 p.retcode = cpu_to_be32(retcode);
1840 2151
1841 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, 2152 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1842 (struct p_header *)&p, sizeof(p)); 2153 (struct p_header80 *)&p, sizeof(p));
1843} 2154}
1844 2155
1845int fill_bitmap_rle_bits(struct drbd_conf *mdev, 2156int fill_bitmap_rle_bits(struct drbd_conf *mdev,
@@ -1936,9 +2247,15 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1936 return len; 2247 return len;
1937} 2248}
1938 2249
1939enum { OK, FAILED, DONE } 2250/**
2251 * send_bitmap_rle_or_plain
2252 *
2253 * Return 0 when done, 1 when another iteration is needed, and a negative error
2254 * code upon failure.
2255 */
2256static int
1940send_bitmap_rle_or_plain(struct drbd_conf *mdev, 2257send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1941 struct p_header *h, struct bm_xfer_ctx *c) 2258 struct p_header80 *h, struct bm_xfer_ctx *c)
1942{ 2259{
1943 struct p_compressed_bm *p = (void*)h; 2260 struct p_compressed_bm *p = (void*)h;
1944 unsigned long num_words; 2261 unsigned long num_words;
@@ -1948,7 +2265,7 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1948 len = fill_bitmap_rle_bits(mdev, p, c); 2265 len = fill_bitmap_rle_bits(mdev, p, c);
1949 2266
1950 if (len < 0) 2267 if (len < 0)
1951 return FAILED; 2268 return -EIO;
1952 2269
1953 if (len) { 2270 if (len) {
1954 DCBP_set_code(p, RLE_VLI_Bits); 2271 DCBP_set_code(p, RLE_VLI_Bits);
@@ -1968,38 +2285,41 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1968 if (len) 2285 if (len)
1969 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); 2286 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1970 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, 2287 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1971 h, sizeof(struct p_header) + len, 0); 2288 h, sizeof(struct p_header80) + len, 0);
1972 c->word_offset += num_words; 2289 c->word_offset += num_words;
1973 c->bit_offset = c->word_offset * BITS_PER_LONG; 2290 c->bit_offset = c->word_offset * BITS_PER_LONG;
1974 2291
1975 c->packets[1]++; 2292 c->packets[1]++;
1976 c->bytes[1] += sizeof(struct p_header) + len; 2293 c->bytes[1] += sizeof(struct p_header80) + len;
1977 2294
1978 if (c->bit_offset > c->bm_bits) 2295 if (c->bit_offset > c->bm_bits)
1979 c->bit_offset = c->bm_bits; 2296 c->bit_offset = c->bm_bits;
1980 } 2297 }
1981 ok = ok ? ((len == 0) ? DONE : OK) : FAILED; 2298 if (ok) {
1982 2299 if (len == 0) {
1983 if (ok == DONE) 2300 INFO_bm_xfer_stats(mdev, "send", c);
1984 INFO_bm_xfer_stats(mdev, "send", c); 2301 return 0;
1985 return ok; 2302 } else
2303 return 1;
2304 }
2305 return -EIO;
1986} 2306}
1987 2307
1988/* See the comment at receive_bitmap() */ 2308/* See the comment at receive_bitmap() */
1989int _drbd_send_bitmap(struct drbd_conf *mdev) 2309int _drbd_send_bitmap(struct drbd_conf *mdev)
1990{ 2310{
1991 struct bm_xfer_ctx c; 2311 struct bm_xfer_ctx c;
1992 struct p_header *p; 2312 struct p_header80 *p;
1993 int ret; 2313 int err;
1994 2314
1995 ERR_IF(!mdev->bitmap) return FALSE; 2315 ERR_IF(!mdev->bitmap) return false;
1996 2316
1997 /* maybe we should use some per thread scratch page, 2317 /* maybe we should use some per thread scratch page,
1998 * and allocate that during initial device creation? */ 2318 * and allocate that during initial device creation? */
1999 p = (struct p_header *) __get_free_page(GFP_NOIO); 2319 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2000 if (!p) { 2320 if (!p) {
2001 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); 2321 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2002 return FALSE; 2322 return false;
2003 } 2323 }
2004 2324
2005 if (get_ldev(mdev)) { 2325 if (get_ldev(mdev)) {
@@ -2025,11 +2345,11 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
2025 }; 2345 };
2026 2346
2027 do { 2347 do {
2028 ret = send_bitmap_rle_or_plain(mdev, p, &c); 2348 err = send_bitmap_rle_or_plain(mdev, p, &c);
2029 } while (ret == OK); 2349 } while (err > 0);
2030 2350
2031 free_page((unsigned long) p); 2351 free_page((unsigned long) p);
2032 return (ret == DONE); 2352 return err == 0;
2033} 2353}
2034 2354
2035int drbd_send_bitmap(struct drbd_conf *mdev) 2355int drbd_send_bitmap(struct drbd_conf *mdev)
@@ -2052,9 +2372,9 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2052 p.set_size = cpu_to_be32(set_size); 2372 p.set_size = cpu_to_be32(set_size);
2053 2373
2054 if (mdev->state.conn < C_CONNECTED) 2374 if (mdev->state.conn < C_CONNECTED)
2055 return FALSE; 2375 return false;
2056 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, 2376 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2057 (struct p_header *)&p, sizeof(p)); 2377 (struct p_header80 *)&p, sizeof(p));
2058 return ok; 2378 return ok;
2059} 2379}
2060 2380
@@ -2080,19 +2400,20 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2080 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); 2400 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2081 2401
2082 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) 2402 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2083 return FALSE; 2403 return false;
2084 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, 2404 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2085 (struct p_header *)&p, sizeof(p)); 2405 (struct p_header80 *)&p, sizeof(p));
2086 return ok; 2406 return ok;
2087} 2407}
2088 2408
2409/* dp->sector and dp->block_id already/still in network byte order,
2410 * data_size is payload size according to dp->head,
2411 * and may need to be corrected for digest size. */
2089int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, 2412int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2090 struct p_data *dp) 2413 struct p_data *dp, int data_size)
2091{ 2414{
2092 const int header_size = sizeof(struct p_data) 2415 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2093 - sizeof(struct p_header); 2416 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2094 int data_size = ((struct p_header *)dp)->length - header_size;
2095
2096 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), 2417 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2097 dp->block_id); 2418 dp->block_id);
2098} 2419}
@@ -2140,7 +2461,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2140 p.blksize = cpu_to_be32(size); 2461 p.blksize = cpu_to_be32(size);
2141 2462
2142 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, 2463 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2143 (struct p_header *)&p, sizeof(p)); 2464 (struct p_header80 *)&p, sizeof(p));
2144 return ok; 2465 return ok;
2145} 2466}
2146 2467
@@ -2158,7 +2479,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
2158 2479
2159 p.head.magic = BE_DRBD_MAGIC; 2480 p.head.magic = BE_DRBD_MAGIC;
2160 p.head.command = cpu_to_be16(cmd); 2481 p.head.command = cpu_to_be16(cmd);
2161 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); 2482 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2162 2483
2163 mutex_lock(&mdev->data.mutex); 2484 mutex_lock(&mdev->data.mutex);
2164 2485
@@ -2180,13 +2501,13 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2180 p.blksize = cpu_to_be32(size); 2501 p.blksize = cpu_to_be32(size);
2181 2502
2182 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, 2503 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2183 (struct p_header *)&p, sizeof(p)); 2504 (struct p_header80 *)&p, sizeof(p));
2184 return ok; 2505 return ok;
2185} 2506}
2186 2507
2187/* called on sndtimeo 2508/* called on sndtimeo
2188 * returns FALSE if we should retry, 2509 * returns false if we should retry,
2189 * TRUE if we think connection is dead 2510 * true if we think connection is dead
2190 */ 2511 */
2191static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) 2512static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2192{ 2513{
@@ -2199,7 +2520,7 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
2199 || mdev->state.conn < C_CONNECTED; 2520 || mdev->state.conn < C_CONNECTED;
2200 2521
2201 if (drop_it) 2522 if (drop_it)
2202 return TRUE; 2523 return true;
2203 2524
2204 drop_it = !--mdev->ko_count; 2525 drop_it = !--mdev->ko_count;
2205 if (!drop_it) { 2526 if (!drop_it) {
@@ -2332,6 +2653,17 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2332 return 1; 2653 return 1;
2333} 2654}
2334 2655
2656static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2657{
2658 if (mdev->agreed_pro_version >= 95)
2659 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2660 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2661 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2662 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2663 else
2664 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2665}
2666
2335/* Used to send write requests 2667/* Used to send write requests
2336 * R_PRIMARY -> Peer (P_DATA) 2668 * R_PRIMARY -> Peer (P_DATA)
2337 */ 2669 */
@@ -2349,30 +2681,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2349 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 2681 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2350 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2682 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2351 2683
2352 p.head.magic = BE_DRBD_MAGIC; 2684 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2353 p.head.command = cpu_to_be16(P_DATA); 2685 p.head.h80.magic = BE_DRBD_MAGIC;
2354 p.head.length = 2686 p.head.h80.command = cpu_to_be16(P_DATA);
2355 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); 2687 p.head.h80.length =
2688 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2689 } else {
2690 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2691 p.head.h95.command = cpu_to_be16(P_DATA);
2692 p.head.h95.length =
2693 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2694 }
2356 2695
2357 p.sector = cpu_to_be64(req->sector); 2696 p.sector = cpu_to_be64(req->sector);
2358 p.block_id = (unsigned long)req; 2697 p.block_id = (unsigned long)req;
2359 p.seq_num = cpu_to_be32(req->seq_num = 2698 p.seq_num = cpu_to_be32(req->seq_num =
2360 atomic_add_return(1, &mdev->packet_seq)); 2699 atomic_add_return(1, &mdev->packet_seq));
2361 dp_flags = 0;
2362 2700
2363 /* NOTE: no need to check if barriers supported here as we would 2701 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2364 * not pass the test in make_request_common in that case 2702
2365 */
2366 if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
2367 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2368 /* dp_flags |= DP_HARDBARRIER; */
2369 }
2370 if (req->master_bio->bi_rw & REQ_SYNC)
2371 dp_flags |= DP_RW_SYNC;
2372 /* for now handle SYNCIO and UNPLUG
2373 * as if they still were one and the same flag */
2374 if (req->master_bio->bi_rw & REQ_UNPLUG)
2375 dp_flags |= DP_RW_SYNC;
2376 if (mdev->state.conn >= C_SYNC_SOURCE && 2703 if (mdev->state.conn >= C_SYNC_SOURCE &&
2377 mdev->state.conn <= C_PAUSED_SYNC_T) 2704 mdev->state.conn <= C_PAUSED_SYNC_T)
2378 dp_flags |= DP_MAY_SET_IN_SYNC; 2705 dp_flags |= DP_MAY_SET_IN_SYNC;
@@ -2384,13 +2711,39 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2384 if (ok && dgs) { 2711 if (ok && dgs) {
2385 dgb = mdev->int_dig_out; 2712 dgb = mdev->int_dig_out;
2386 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2713 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2387 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2714 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2388 } 2715 }
2389 if (ok) { 2716 if (ok) {
2390 if (mdev->net_conf->wire_protocol == DRBD_PROT_A) 2717 /* For protocol A, we have to memcpy the payload into
2718 * socket buffers, as we may complete right away
2719 * as soon as we handed it over to tcp, at which point the data
2720 * pages may become invalid.
2721 *
2722 * For data-integrity enabled, we copy it as well, so we can be
2723 * sure that even if the bio pages may still be modified, it
2724 * won't change the data on the wire, thus if the digest checks
2725 * out ok after sending on this side, but does not fit on the
2726 * receiving side, we sure have detected corruption elsewhere.
2727 */
2728 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2391 ok = _drbd_send_bio(mdev, req->master_bio); 2729 ok = _drbd_send_bio(mdev, req->master_bio);
2392 else 2730 else
2393 ok = _drbd_send_zc_bio(mdev, req->master_bio); 2731 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2732
2733 /* double check digest, sometimes buffers have been modified in flight. */
2734 if (dgs > 0 && dgs <= 64) {
2735 /* 64 byte, 512 bit, is the largest digest size
2736 * currently supported in kernel crypto. */
2737 unsigned char digest[64];
2738 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2739 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2740 dev_warn(DEV,
2741 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2742 (unsigned long long)req->sector, req->size);
2743 }
2744 } /* else if (dgs > 64) {
2745 ... Be noisy about digest too large ...
2746 } */
2394 } 2747 }
2395 2748
2396 drbd_put_data_sock(mdev); 2749 drbd_put_data_sock(mdev);
@@ -2413,10 +2766,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2413 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 2766 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2414 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2767 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2415 2768
2416 p.head.magic = BE_DRBD_MAGIC; 2769 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2417 p.head.command = cpu_to_be16(cmd); 2770 p.head.h80.magic = BE_DRBD_MAGIC;
2418 p.head.length = 2771 p.head.h80.command = cpu_to_be16(cmd);
2419 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); 2772 p.head.h80.length =
2773 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2774 } else {
2775 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2776 p.head.h95.command = cpu_to_be16(cmd);
2777 p.head.h95.length =
2778 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2779 }
2420 2780
2421 p.sector = cpu_to_be64(e->sector); 2781 p.sector = cpu_to_be64(e->sector);
2422 p.block_id = e->block_id; 2782 p.block_id = e->block_id;
@@ -2429,12 +2789,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2429 if (!drbd_get_data_sock(mdev)) 2789 if (!drbd_get_data_sock(mdev))
2430 return 0; 2790 return 0;
2431 2791
2432 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, 2792 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2433 sizeof(p), dgs ? MSG_MORE : 0);
2434 if (ok && dgs) { 2793 if (ok && dgs) {
2435 dgb = mdev->int_dig_out; 2794 dgb = mdev->int_dig_out;
2436 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); 2795 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2437 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2796 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2438 } 2797 }
2439 if (ok) 2798 if (ok)
2440 ok = _drbd_send_zc_ee(mdev, e); 2799 ok = _drbd_send_zc_ee(mdev, e);
@@ -2444,6 +2803,16 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2444 return ok; 2803 return ok;
2445} 2804}
2446 2805
2806int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2807{
2808 struct p_block_desc p;
2809
2810 p.sector = cpu_to_be64(req->sector);
2811 p.blksize = cpu_to_be32(req->size);
2812
2813 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2814}
2815
2447/* 2816/*
2448 drbd_send distinguishes two cases: 2817 drbd_send distinguishes two cases:
2449 2818
@@ -2536,7 +2905,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2536 unsigned long flags; 2905 unsigned long flags;
2537 int rv = 0; 2906 int rv = 0;
2538 2907
2539 lock_kernel(); 2908 mutex_lock(&drbd_main_mutex);
2540 spin_lock_irqsave(&mdev->req_lock, flags); 2909 spin_lock_irqsave(&mdev->req_lock, flags);
2541 /* to have a stable mdev->state.role 2910 /* to have a stable mdev->state.role
2542 * and no race with updating open_cnt */ 2911 * and no race with updating open_cnt */
@@ -2551,7 +2920,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2551 if (!rv) 2920 if (!rv)
2552 mdev->open_cnt++; 2921 mdev->open_cnt++;
2553 spin_unlock_irqrestore(&mdev->req_lock, flags); 2922 spin_unlock_irqrestore(&mdev->req_lock, flags);
2554 unlock_kernel(); 2923 mutex_unlock(&drbd_main_mutex);
2555 2924
2556 return rv; 2925 return rv;
2557} 2926}
@@ -2559,41 +2928,12 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2559static int drbd_release(struct gendisk *gd, fmode_t mode) 2928static int drbd_release(struct gendisk *gd, fmode_t mode)
2560{ 2929{
2561 struct drbd_conf *mdev = gd->private_data; 2930 struct drbd_conf *mdev = gd->private_data;
2562 lock_kernel(); 2931 mutex_lock(&drbd_main_mutex);
2563 mdev->open_cnt--; 2932 mdev->open_cnt--;
2564 unlock_kernel(); 2933 mutex_unlock(&drbd_main_mutex);
2565 return 0; 2934 return 0;
2566} 2935}
2567 2936
2568static void drbd_unplug_fn(struct request_queue *q)
2569{
2570 struct drbd_conf *mdev = q->queuedata;
2571
2572 /* unplug FIRST */
2573 spin_lock_irq(q->queue_lock);
2574 blk_remove_plug(q);
2575 spin_unlock_irq(q->queue_lock);
2576
2577 /* only if connected */
2578 spin_lock_irq(&mdev->req_lock);
2579 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2580 D_ASSERT(mdev->state.role == R_PRIMARY);
2581 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2582 /* add to the data.work queue,
2583 * unless already queued.
2584 * XXX this might be a good addition to drbd_queue_work
2585 * anyways, to detect "double queuing" ... */
2586 if (list_empty(&mdev->unplug_work.list))
2587 drbd_queue_work(&mdev->data.work,
2588 &mdev->unplug_work);
2589 }
2590 }
2591 spin_unlock_irq(&mdev->req_lock);
2592
2593 if (mdev->state.disk >= D_INCONSISTENT)
2594 drbd_kick_lo(mdev);
2595}
2596
2597static void drbd_set_defaults(struct drbd_conf *mdev) 2937static void drbd_set_defaults(struct drbd_conf *mdev)
2598{ 2938{
2599 /* This way we get a compile error when sync_conf grows, 2939 /* This way we get a compile error when sync_conf grows,
@@ -2605,7 +2945,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
2605 /* .verify_alg = */ {}, 0, 2945 /* .verify_alg = */ {}, 0,
2606 /* .cpu_mask = */ {}, 0, 2946 /* .cpu_mask = */ {}, 0,
2607 /* .csums_alg = */ {}, 0, 2947 /* .csums_alg = */ {}, 0,
2608 /* .use_rle = */ 0 2948 /* .use_rle = */ 0,
2949 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2950 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2951 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2952 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
2953 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2954 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
2609 }; 2955 };
2610 2956
2611 /* Have to use that way, because the layout differs between 2957 /* Have to use that way, because the layout differs between
@@ -2616,7 +2962,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
2616 .conn = C_STANDALONE, 2962 .conn = C_STANDALONE,
2617 .disk = D_DISKLESS, 2963 .disk = D_DISKLESS,
2618 .pdsk = D_UNKNOWN, 2964 .pdsk = D_UNKNOWN,
2619 .susp = 0 2965 .susp = 0,
2966 .susp_nod = 0,
2967 .susp_fen = 0
2620 } }; 2968 } };
2621} 2969}
2622 2970
@@ -2627,11 +2975,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2627 2975
2628 drbd_set_defaults(mdev); 2976 drbd_set_defaults(mdev);
2629 2977
2630 /* for now, we do NOT yet support it,
2631 * even though we start some framework
2632 * to eventually support barriers */
2633 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2634
2635 atomic_set(&mdev->ap_bio_cnt, 0); 2978 atomic_set(&mdev->ap_bio_cnt, 0);
2636 atomic_set(&mdev->ap_pending_cnt, 0); 2979 atomic_set(&mdev->ap_pending_cnt, 0);
2637 atomic_set(&mdev->rs_pending_cnt, 0); 2980 atomic_set(&mdev->rs_pending_cnt, 0);
@@ -2640,6 +2983,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2640 atomic_set(&mdev->net_cnt, 0); 2983 atomic_set(&mdev->net_cnt, 0);
2641 atomic_set(&mdev->packet_seq, 0); 2984 atomic_set(&mdev->packet_seq, 0);
2642 atomic_set(&mdev->pp_in_use, 0); 2985 atomic_set(&mdev->pp_in_use, 0);
2986 atomic_set(&mdev->pp_in_use_by_net, 0);
2987 atomic_set(&mdev->rs_sect_in, 0);
2988 atomic_set(&mdev->rs_sect_ev, 0);
2989 atomic_set(&mdev->ap_in_flight, 0);
2643 2990
2644 mutex_init(&mdev->md_io_mutex); 2991 mutex_init(&mdev->md_io_mutex);
2645 mutex_init(&mdev->data.mutex); 2992 mutex_init(&mdev->data.mutex);
@@ -2666,22 +3013,33 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2666 INIT_LIST_HEAD(&mdev->meta.work.q); 3013 INIT_LIST_HEAD(&mdev->meta.work.q);
2667 INIT_LIST_HEAD(&mdev->resync_work.list); 3014 INIT_LIST_HEAD(&mdev->resync_work.list);
2668 INIT_LIST_HEAD(&mdev->unplug_work.list); 3015 INIT_LIST_HEAD(&mdev->unplug_work.list);
3016 INIT_LIST_HEAD(&mdev->go_diskless.list);
2669 INIT_LIST_HEAD(&mdev->md_sync_work.list); 3017 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3018 INIT_LIST_HEAD(&mdev->start_resync_work.list);
2670 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 3019 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2671 3020
2672 mdev->resync_work.cb = w_resync_inactive; 3021 mdev->resync_work.cb = w_resync_timer;
2673 mdev->unplug_work.cb = w_send_write_hint; 3022 mdev->unplug_work.cb = w_send_write_hint;
3023 mdev->go_diskless.cb = w_go_diskless;
2674 mdev->md_sync_work.cb = w_md_sync; 3024 mdev->md_sync_work.cb = w_md_sync;
2675 mdev->bm_io_work.w.cb = w_bitmap_io; 3025 mdev->bm_io_work.w.cb = w_bitmap_io;
3026 mdev->start_resync_work.cb = w_start_resync;
2676 init_timer(&mdev->resync_timer); 3027 init_timer(&mdev->resync_timer);
2677 init_timer(&mdev->md_sync_timer); 3028 init_timer(&mdev->md_sync_timer);
3029 init_timer(&mdev->start_resync_timer);
3030 init_timer(&mdev->request_timer);
2678 mdev->resync_timer.function = resync_timer_fn; 3031 mdev->resync_timer.function = resync_timer_fn;
2679 mdev->resync_timer.data = (unsigned long) mdev; 3032 mdev->resync_timer.data = (unsigned long) mdev;
2680 mdev->md_sync_timer.function = md_sync_timer_fn; 3033 mdev->md_sync_timer.function = md_sync_timer_fn;
2681 mdev->md_sync_timer.data = (unsigned long) mdev; 3034 mdev->md_sync_timer.data = (unsigned long) mdev;
3035 mdev->start_resync_timer.function = start_resync_timer_fn;
3036 mdev->start_resync_timer.data = (unsigned long) mdev;
3037 mdev->request_timer.function = request_timer_fn;
3038 mdev->request_timer.data = (unsigned long) mdev;
2682 3039
2683 init_waitqueue_head(&mdev->misc_wait); 3040 init_waitqueue_head(&mdev->misc_wait);
2684 init_waitqueue_head(&mdev->state_wait); 3041 init_waitqueue_head(&mdev->state_wait);
3042 init_waitqueue_head(&mdev->net_cnt_wait);
2685 init_waitqueue_head(&mdev->ee_wait); 3043 init_waitqueue_head(&mdev->ee_wait);
2686 init_waitqueue_head(&mdev->al_wait); 3044 init_waitqueue_head(&mdev->al_wait);
2687 init_waitqueue_head(&mdev->seq_wait); 3045 init_waitqueue_head(&mdev->seq_wait);
@@ -2691,12 +3049,15 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2691 drbd_thread_init(mdev, &mdev->asender, drbd_asender); 3049 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2692 3050
2693 mdev->agreed_pro_version = PRO_VERSION_MAX; 3051 mdev->agreed_pro_version = PRO_VERSION_MAX;
2694 mdev->write_ordering = WO_bio_barrier; 3052 mdev->write_ordering = WO_bdev_flush;
2695 mdev->resync_wenr = LC_FREE; 3053 mdev->resync_wenr = LC_FREE;
3054 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2696} 3056}
2697 3057
2698void drbd_mdev_cleanup(struct drbd_conf *mdev) 3058void drbd_mdev_cleanup(struct drbd_conf *mdev)
2699{ 3059{
3060 int i;
2700 if (mdev->receiver.t_state != None) 3061 if (mdev->receiver.t_state != None)
2701 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", 3062 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2702 mdev->receiver.t_state); 3063 mdev->receiver.t_state);
@@ -2713,9 +3074,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2713 mdev->p_size = 3074 mdev->p_size =
2714 mdev->rs_start = 3075 mdev->rs_start =
2715 mdev->rs_total = 3076 mdev->rs_total =
2716 mdev->rs_failed = 3077 mdev->rs_failed = 0;
2717 mdev->rs_mark_left = 3078 mdev->rs_last_events = 0;
2718 mdev->rs_mark_time = 0; 3079 mdev->rs_last_sect_ev = 0;
3080 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3081 mdev->rs_mark_left[i] = 0;
3082 mdev->rs_mark_time[i] = 0;
3083 }
2719 D_ASSERT(mdev->net_conf == NULL); 3084 D_ASSERT(mdev->net_conf == NULL);
2720 3085
2721 drbd_set_my_capacity(mdev, 0); 3086 drbd_set_my_capacity(mdev, 0);
@@ -2726,6 +3091,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2726 } 3091 }
2727 3092
2728 drbd_free_resources(mdev); 3093 drbd_free_resources(mdev);
3094 clear_bit(AL_SUSPENDED, &mdev->flags);
2729 3095
2730 /* 3096 /*
2731 * currently we drbd_init_ee only on module load, so 3097 * currently we drbd_init_ee only on module load, so
@@ -2741,7 +3107,9 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2741 D_ASSERT(list_empty(&mdev->meta.work.q)); 3107 D_ASSERT(list_empty(&mdev->meta.work.q));
2742 D_ASSERT(list_empty(&mdev->resync_work.list)); 3108 D_ASSERT(list_empty(&mdev->resync_work.list));
2743 D_ASSERT(list_empty(&mdev->unplug_work.list)); 3109 D_ASSERT(list_empty(&mdev->unplug_work.list));
3110 D_ASSERT(list_empty(&mdev->go_diskless.list));
2744 3111
3112 drbd_set_defaults(mdev);
2745} 3113}
2746 3114
2747 3115
@@ -2784,7 +3152,7 @@ static void drbd_destroy_mempools(void)
2784static int drbd_create_mempools(void) 3152static int drbd_create_mempools(void)
2785{ 3153{
2786 struct page *page; 3154 struct page *page;
2787 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; 3155 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
2788 int i; 3156 int i;
2789 3157
2790 /* prepare our caches and mempools */ 3158 /* prepare our caches and mempools */
@@ -2824,7 +3192,7 @@ static int drbd_create_mempools(void)
2824 3192
2825 drbd_ee_mempool = mempool_create(number, 3193 drbd_ee_mempool = mempool_create(number,
2826 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); 3194 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2827 if (drbd_request_mempool == NULL) 3195 if (drbd_ee_mempool == NULL)
2828 goto Enomem; 3196 goto Enomem;
2829 3197
2830 /* drbd's page pool */ 3198 /* drbd's page pool */
@@ -2919,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor)
2919 3287
2920 drbd_release_ee_lists(mdev); 3288 drbd_release_ee_lists(mdev);
2921 3289
2922 /* should be free'd on disconnect? */ 3290 /* should be freed on disconnect? */
2923 kfree(mdev->ee_hash); 3291 kfree(mdev->ee_hash);
2924 /* 3292 /*
2925 mdev->ee_hash_s = 0; 3293 mdev->ee_hash_s = 0;
@@ -2948,11 +3316,20 @@ static void drbd_cleanup(void)
2948 3316
2949 unregister_reboot_notifier(&drbd_notifier); 3317 unregister_reboot_notifier(&drbd_notifier);
2950 3318
3319 /* first remove proc,
3320 * drbdsetup uses its presence to detect
3321 * whether DRBD is loaded.
3322 * If we got stuck in proc removal
3323 * while netlink was already deregistered,
3324 * some drbdsetup commands may wait forever
3325 * for an answer.
3326 */
3327 if (drbd_proc)
3328 remove_proc_entry("drbd", NULL);
3329
2951 drbd_nl_cleanup(); 3330 drbd_nl_cleanup();
2952 3331
2953 if (minor_table) { 3332 if (minor_table) {
2954 if (drbd_proc)
2955 remove_proc_entry("drbd", NULL);
2956 i = minor_count; 3333 i = minor_count;
2957 while (i--) 3334 while (i--)
2958 drbd_delete_device(i); 3335 drbd_delete_device(i);
@@ -2980,7 +3357,7 @@ static int drbd_congested(void *congested_data, int bdi_bits)
2980 char reason = '-'; 3357 char reason = '-';
2981 int r = 0; 3358 int r = 0;
2982 3359
2983 if (!__inc_ap_bio_cond(mdev)) { 3360 if (!may_inc_ap_bio(mdev)) {
2984 /* DRBD has frozen IO */ 3361 /* DRBD has frozen IO */
2985 r = bdi_bits; 3362 r = bdi_bits;
2986 reason = 'd'; 3363 reason = 'd';
@@ -3033,7 +3410,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3033 goto out_no_disk; 3410 goto out_no_disk;
3034 mdev->vdisk = disk; 3411 mdev->vdisk = disk;
3035 3412
3036 set_disk_ro(disk, TRUE); 3413 set_disk_ro(disk, true);
3037 3414
3038 disk->queue = q; 3415 disk->queue = q;
3039 disk->major = DRBD_MAJOR; 3416 disk->major = DRBD_MAJOR;
@@ -3049,13 +3426,13 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3049 q->backing_dev_info.congested_fn = drbd_congested; 3426 q->backing_dev_info.congested_fn = drbd_congested;
3050 q->backing_dev_info.congested_data = mdev; 3427 q->backing_dev_info.congested_data = mdev;
3051 3428
3052 blk_queue_make_request(q, drbd_make_request_26); 3429 blk_queue_make_request(q, drbd_make_request);
3053 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); 3430 /* Setting the max_hw_sectors to an odd value of 8 KiB here;
 3431 this triggers a max_bio_size message upon first attach or connect */
3432 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3054 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 3433 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3055 blk_queue_merge_bvec(q, drbd_merge_bvec); 3434 blk_queue_merge_bvec(q, drbd_merge_bvec);
3056 q->queue_lock = &mdev->req_lock; /* needed since we use */ 3435 q->queue_lock = &mdev->req_lock;
3057 /* plugging on a queue, that actually has no requests! */
3058 q->unplug_fn = drbd_unplug_fn;
3059 3436
3060 mdev->md_io_page = alloc_page(GFP_KERNEL); 3437 mdev->md_io_page = alloc_page(GFP_KERNEL);
3061 if (!mdev->md_io_page) 3438 if (!mdev->md_io_page)
@@ -3114,6 +3491,7 @@ void drbd_free_mdev(struct drbd_conf *mdev)
3114 put_disk(mdev->vdisk); 3491 put_disk(mdev->vdisk);
3115 blk_cleanup_queue(mdev->rq_queue); 3492 blk_cleanup_queue(mdev->rq_queue);
3116 free_cpumask_var(mdev->cpu_mask); 3493 free_cpumask_var(mdev->cpu_mask);
3494 drbd_free_tl_hash(mdev);
3117 kfree(mdev); 3495 kfree(mdev);
3118} 3496}
3119 3497
@@ -3129,7 +3507,7 @@ int __init drbd_init(void)
3129 return -EINVAL; 3507 return -EINVAL;
3130 } 3508 }
3131 3509
3132 if (1 > minor_count || minor_count > 255) { 3510 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3133 printk(KERN_ERR 3511 printk(KERN_ERR
3134 "drbd: invalid minor_count (%d)\n", minor_count); 3512 "drbd: invalid minor_count (%d)\n", minor_count);
3135#ifdef MODULE 3513#ifdef MODULE
@@ -3203,11 +3581,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
3203 if (ldev == NULL) 3581 if (ldev == NULL)
3204 return; 3582 return;
3205 3583
3206 bd_release(ldev->backing_bdev); 3584 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3207 bd_release(ldev->md_bdev); 3585 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3208
3209 fput(ldev->lo_file);
3210 fput(ldev->md_file);
3211 3586
3212 kfree(ldev); 3587 kfree(ldev);
3213} 3588}
@@ -3266,7 +3641,8 @@ struct meta_data_on_disk {
3266 /* `-- act_log->nr_elements <-- sync_conf.al_extents */ 3641 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3267 u32 bm_offset; /* offset to the bitmap, from here */ 3642 u32 bm_offset; /* offset to the bitmap, from here */
3268 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 3643 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3269 u32 reserved_u32[4]; 3644 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3645 u32 reserved_u32[3];
3270 3646
3271} __packed; 3647} __packed;
3272 3648
@@ -3280,9 +3656,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
3280 sector_t sector; 3656 sector_t sector;
3281 int i; 3657 int i;
3282 3658
3659 del_timer(&mdev->md_sync_timer);
3660 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3283 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 3661 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3284 return; 3662 return;
3285 del_timer(&mdev->md_sync_timer);
3286 3663
3287 /* We use here D_FAILED and not D_ATTACHING because we try to write 3664 /* We use here D_FAILED and not D_ATTACHING because we try to write
3288 * metadata even if we detach due to a disk failure! */ 3665 * metadata even if we detach due to a disk failure! */
@@ -3306,17 +3683,15 @@ void drbd_md_sync(struct drbd_conf *mdev)
3306 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); 3683 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3307 3684
3308 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 3685 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3686 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3309 3687
3310 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 3688 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3311 sector = mdev->ldev->md.md_offset; 3689 sector = mdev->ldev->md.md_offset;
3312 3690
3313 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 3691 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3314 clear_bit(MD_DIRTY, &mdev->flags);
3315 } else {
3316 /* this was a try anyways ... */ 3692 /* this was a try anyways ... */
3317 dev_err(DEV, "meta data update failed!\n"); 3693 dev_err(DEV, "meta data update failed!\n");
3318 3694 drbd_chk_io_error(mdev, 1, true);
3319 drbd_chk_io_error(mdev, 1, TRUE);
3320 } 3695 }
3321 3696
3322 /* Update mdev->ldev->md.la_size_sect, 3697 /* Update mdev->ldev->md.la_size_sect,
@@ -3332,7 +3707,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
3332 * @mdev: DRBD device. 3707 * @mdev: DRBD device.
3333 * @bdev: Device from which the meta data should be read in. 3708 * @bdev: Device from which the meta data should be read in.
3334 * 3709 *
3335 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case 3710 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3336 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. 3711 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3337 */ 3712 */
3338int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 3713int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
@@ -3347,7 +3722,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3347 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); 3722 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3348 3723
3349 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { 3724 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3350 /* NOTE: cant do normal error processing here as this is 3725 /* NOTE: can't do normal error processing here as this is
3351 called BEFORE disk is attached */ 3726 called BEFORE disk is attached */
3352 dev_err(DEV, "Error while reading metadata.\n"); 3727 dev_err(DEV, "Error while reading metadata.\n");
3353 rv = ERR_IO_MD_DISK; 3728 rv = ERR_IO_MD_DISK;
@@ -3392,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3392 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); 3767 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3393 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); 3768 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3394 3769
3770 spin_lock_irq(&mdev->req_lock);
3771 if (mdev->state.conn < C_CONNECTED) {
3772 int peer;
3773 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3774 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3775 mdev->peer_max_bio_size = peer;
3776 }
3777 spin_unlock_irq(&mdev->req_lock);
3778
3395 if (mdev->sync_conf.al_extents < 7) 3779 if (mdev->sync_conf.al_extents < 7)
3396 mdev->sync_conf.al_extents = 127; 3780 mdev->sync_conf.al_extents = 127;
3397 3781
@@ -3410,12 +3794,22 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3410 * the meta-data super block. This function sets MD_DIRTY, and starts a 3794 * the meta-data super block. This function sets MD_DIRTY, and starts a
3411 * timer that ensures that within five seconds you have to call drbd_md_sync(). 3795 * timer that ensures that within five seconds you have to call drbd_md_sync().
3412 */ 3796 */
3797#ifdef DEBUG
3798void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3799{
3800 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3801 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3802 mdev->last_md_mark_dirty.line = line;
3803 mdev->last_md_mark_dirty.func = func;
3804 }
3805}
3806#else
3413void drbd_md_mark_dirty(struct drbd_conf *mdev) 3807void drbd_md_mark_dirty(struct drbd_conf *mdev)
3414{ 3808{
3415 set_bit(MD_DIRTY, &mdev->flags); 3809 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3416 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); 3810 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3417} 3811}
3418 3812#endif
3419 3813
3420static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) 3814static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3421{ 3815{
@@ -3460,13 +3854,18 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3460void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) 3854void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3461{ 3855{
3462 u64 val; 3856 u64 val;
3857 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3858
3859 if (bm_uuid)
3860 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3463 3861
3464 dev_info(DEV, "Creating new current UUID\n");
3465 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3466 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; 3862 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3467 3863
3468 get_random_bytes(&val, sizeof(u64)); 3864 get_random_bytes(&val, sizeof(u64));
3469 _drbd_uuid_set(mdev, UI_CURRENT, val); 3865 _drbd_uuid_set(mdev, UI_CURRENT, val);
3866 drbd_print_uuids(mdev, "new current UUID");
3867 /* get it to stable storage _now_ */
3868 drbd_md_sync(mdev);
3470} 3869}
3471 3870
3472void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) 3871void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
@@ -3479,12 +3878,11 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3479 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; 3878 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3480 mdev->ldev->md.uuid[UI_BITMAP] = 0; 3879 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3481 } else { 3880 } else {
3482 if (mdev->ldev->md.uuid[UI_BITMAP]) 3881 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3483 dev_warn(DEV, "bm UUID already set"); 3882 if (bm_uuid)
3484 3883 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3485 mdev->ldev->md.uuid[UI_BITMAP] = val;
3486 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3487 3884
3885 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3488 } 3886 }
3489 drbd_md_mark_dirty(mdev); 3887 drbd_md_mark_dirty(mdev);
3490} 3888}
@@ -3527,6 +3925,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3527{ 3925{
3528 int rv = -EIO; 3926 int rv = -EIO;
3529 3927
3928 drbd_resume_al(mdev);
3530 if (get_ldev_if_state(mdev, D_ATTACHING)) { 3929 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3531 drbd_bm_clear_all(mdev); 3930 drbd_bm_clear_all(mdev);
3532 rv = drbd_bm_write(mdev); 3931 rv = drbd_bm_write(mdev);
@@ -3539,15 +3938,19 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3539static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3938static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3540{ 3939{
3541 struct bm_io_work *work = container_of(w, struct bm_io_work, w); 3940 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3542 int rv; 3941 int rv = -EIO;
3543 3942
3544 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); 3943 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3545 3944
3546 drbd_bm_lock(mdev, work->why); 3945 if (get_ldev(mdev)) {
3547 rv = work->io_fn(mdev); 3946 drbd_bm_lock(mdev, work->why, work->flags);
3548 drbd_bm_unlock(mdev); 3947 rv = work->io_fn(mdev);
3948 drbd_bm_unlock(mdev);
3949 put_ldev(mdev);
3950 }
3549 3951
3550 clear_bit(BITMAP_IO, &mdev->flags); 3952 clear_bit(BITMAP_IO, &mdev->flags);
3953 smp_mb__after_clear_bit();
3551 wake_up(&mdev->misc_wait); 3954 wake_up(&mdev->misc_wait);
3552 3955
3553 if (work->done) 3956 if (work->done)
@@ -3555,10 +3958,46 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3555 3958
3556 clear_bit(BITMAP_IO_QUEUED, &mdev->flags); 3959 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3557 work->why = NULL; 3960 work->why = NULL;
3961 work->flags = 0;
3962
3963 return 1;
3964}
3965
3966void drbd_ldev_destroy(struct drbd_conf *mdev)
3967{
3968 lc_destroy(mdev->resync);
3969 mdev->resync = NULL;
3970 lc_destroy(mdev->act_log);
3971 mdev->act_log = NULL;
3972 __no_warn(local,
3973 drbd_free_bc(mdev->ldev);
3974 mdev->ldev = NULL;);
3558 3975
3976 if (mdev->md_io_tmpp) {
3977 __free_page(mdev->md_io_tmpp);
3978 mdev->md_io_tmpp = NULL;
3979 }
3980 clear_bit(GO_DISKLESS, &mdev->flags);
3981}
3982
3983static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3984{
3985 D_ASSERT(mdev->state.disk == D_FAILED);
3986 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3987 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3988 * the protected members anymore, though, so once put_ldev reaches zero
3989 * again, it will be safe to free them. */
3990 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3559 return 1; 3991 return 1;
3560} 3992}
3561 3993
3994void drbd_go_diskless(struct drbd_conf *mdev)
3995{
3996 D_ASSERT(mdev->state.disk == D_FAILED);
3997 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3998 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3999}
4000
3562/** 4001/**
3563 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 4002 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3564 * @mdev: DRBD device. 4003 * @mdev: DRBD device.
@@ -3574,7 +4013,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3574void drbd_queue_bitmap_io(struct drbd_conf *mdev, 4013void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3575 int (*io_fn)(struct drbd_conf *), 4014 int (*io_fn)(struct drbd_conf *),
3576 void (*done)(struct drbd_conf *, int), 4015 void (*done)(struct drbd_conf *, int),
3577 char *why) 4016 char *why, enum bm_flag flags)
3578{ 4017{
3579 D_ASSERT(current == mdev->worker.task); 4018 D_ASSERT(current == mdev->worker.task);
3580 4019
@@ -3588,15 +4027,15 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3588 mdev->bm_io_work.io_fn = io_fn; 4027 mdev->bm_io_work.io_fn = io_fn;
3589 mdev->bm_io_work.done = done; 4028 mdev->bm_io_work.done = done;
3590 mdev->bm_io_work.why = why; 4029 mdev->bm_io_work.why = why;
4030 mdev->bm_io_work.flags = flags;
3591 4031
4032 spin_lock_irq(&mdev->req_lock);
3592 set_bit(BITMAP_IO, &mdev->flags); 4033 set_bit(BITMAP_IO, &mdev->flags);
3593 if (atomic_read(&mdev->ap_bio_cnt) == 0) { 4034 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3594 if (list_empty(&mdev->bm_io_work.w.list)) { 4035 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3595 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3596 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); 4036 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3597 } else
3598 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3599 } 4037 }
4038 spin_unlock_irq(&mdev->req_lock);
3600} 4039}
3601 4040
3602/** 4041/**
@@ -3608,19 +4047,22 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3608 * freezes application IO while the actual IO operation runs. This 4047
3609 * function MAY NOT be called from worker context. 4048 * function MAY NOT be called from worker context.
3610 */ 4049 */
3611int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) 4050int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4051 char *why, enum bm_flag flags)
3612{ 4052{
3613 int rv; 4053 int rv;
3614 4054
3615 D_ASSERT(current != mdev->worker.task); 4055 D_ASSERT(current != mdev->worker.task);
3616 4056
3617 drbd_suspend_io(mdev); 4057 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4058 drbd_suspend_io(mdev);
3618 4059
3619 drbd_bm_lock(mdev, why); 4060 drbd_bm_lock(mdev, why, flags);
3620 rv = io_fn(mdev); 4061 rv = io_fn(mdev);
3621 drbd_bm_unlock(mdev); 4062 drbd_bm_unlock(mdev);
3622 4063
3623 drbd_resume_io(mdev); 4064 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4065 drbd_resume_io(mdev);
3624 4066
3625 return rv; 4067 return rv;
3626} 4068}
@@ -3655,8 +4097,11 @@ static void md_sync_timer_fn(unsigned long data)
3655static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) 4097static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3656{ 4098{
3657 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); 4099 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4100#ifdef DEBUG
4101 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4102 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4103#endif
3658 drbd_md_sync(mdev); 4104 drbd_md_sync(mdev);
3659
3660 return 1; 4105 return 1;
3661} 4106}
3662 4107
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73131c5ae339..515bcd948a43 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -33,10 +33,13 @@
33#include <linux/blkpg.h> 33#include <linux/blkpg.h>
34#include <linux/cpumask.h> 34#include <linux/cpumask.h>
35#include "drbd_int.h" 35#include "drbd_int.h"
36#include "drbd_req.h"
36#include "drbd_wrappers.h" 37#include "drbd_wrappers.h"
37#include <asm/unaligned.h> 38#include <asm/unaligned.h>
38#include <linux/drbd_tag_magic.h> 39#include <linux/drbd_tag_magic.h>
39#include <linux/drbd_limits.h> 40#include <linux/drbd_limits.h>
41#include <linux/compiler.h>
42#include <linux/kthread.h>
40 43
41static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); 44static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
42static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); 45static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
@@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
169 put_net_conf(mdev); 172 put_net_conf(mdev);
170 } 173 }
171 174
175 /* The helper may take some time.
176 * write out any unsynced meta data changes now */
177 drbd_md_sync(mdev);
178
172 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); 179 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
173 180
174 drbd_bcast_ev_helper(mdev, cmd); 181 drbd_bcast_ev_helper(mdev, cmd);
@@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
202 put_ldev(mdev); 209 put_ldev(mdev);
203 } else { 210 } else {
204 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); 211 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
205 return mdev->state.pdsk; 212 nps = mdev->state.pdsk;
213 goto out;
206 } 214 }
207 215
208 if (fp == FP_STONITH)
209 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
210
211 r = drbd_khelper(mdev, "fence-peer"); 216 r = drbd_khelper(mdev, "fence-peer");
212 217
213 switch ((r>>8) & 0xff) { 218 switch ((r>>8) & 0xff) {
@@ -252,14 +257,61 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
252 257
253 dev_info(DEV, "fence-peer helper returned %d (%s)\n", 258 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
254 (r>>8) & 0xff, ex_to_string); 259 (r>>8) & 0xff, ex_to_string);
260
261out:
262 if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
263 /* The handler was not successful... unfreeze here, the
264 state engine can not unfreeze... */
265 _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
266 }
267
255 return nps; 268 return nps;
256} 269}
257 270
271static int _try_outdate_peer_async(void *data)
272{
273 struct drbd_conf *mdev = (struct drbd_conf *)data;
274 enum drbd_disk_state nps;
275 union drbd_state ns;
276
277 nps = drbd_try_outdate_peer(mdev);
278
279 /* Not using
280 drbd_request_state(mdev, NS(pdsk, nps));
281 here, because we might have been able to re-establish the connection
282 in the meantime. This can only partially be solved in the state
283 engine's is_valid_state() and is_valid_state_transition()
284 functions.
285
286 nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
287 pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
288 therefore we have to have the pre state change check here.
289 */
290 spin_lock_irq(&mdev->req_lock);
291 ns = mdev->state;
292 if (ns.conn < C_WF_REPORT_PARAMS) {
293 ns.pdsk = nps;
294 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
295 }
296 spin_unlock_irq(&mdev->req_lock);
297
298 return 0;
299}
300
301void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
302{
303 struct task_struct *opa;
304
305 opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
306 if (IS_ERR(opa))
307 dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
308}
258 309
259int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) 310enum drbd_state_rv
311drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
260{ 312{
261 const int max_tries = 4; 313 const int max_tries = 4;
262 int r = 0; 314 enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
263 int try = 0; 315 int try = 0;
264 int forced = 0; 316 int forced = 0;
265 union drbd_state mask, val; 317 union drbd_state mask, val;
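The newly added _try_outdate_peer_async() above runs the fence-peer helper on its own kthread and applies the resulting peer disk state only after re-checking, under req_lock, that the connection has not come back in the meantime. A rough userspace analogue of that check-before-apply pattern, using pthreads and invented names rather than the drbd API, might look like this:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int connected;			/* may flip while the helper runs */
static int peer_disk_state;		/* 0 = unknown, 1 = outdated */

static void *outdate_peer_async(void *arg)
{
	int nps = 1;			/* pretend the fence-peer helper said "outdated" */

	(void)arg;
	pthread_mutex_lock(&state_lock);
	if (!connected)			/* re-check: the connection may be back by now */
		peer_disk_state = nps;
	pthread_mutex_unlock(&state_lock);
	return NULL;
}

int main(void)
{
	pthread_t helper;

	pthread_create(&helper, NULL, outdate_peer_async, NULL);
	pthread_join(helper, NULL);
	printf("peer disk state: %d\n", peer_disk_state);	/* 1: peer marked outdated */
	return 0;
}
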
@@ -274,17 +326,17 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
274 val.i = 0; val.role = new_role; 326 val.i = 0; val.role = new_role;
275 327
276 while (try++ < max_tries) { 328 while (try++ < max_tries) {
277 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); 329 rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
278 330
279 /* in case we first succeeded to outdate, 331 /* in case we first succeeded to outdate,
280 * but now suddenly could establish a connection */ 332 * but now suddenly could establish a connection */
281 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { 333 if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
282 val.pdsk = 0; 334 val.pdsk = 0;
283 mask.pdsk = 0; 335 mask.pdsk = 0;
284 continue; 336 continue;
285 } 337 }
286 338
287 if (r == SS_NO_UP_TO_DATE_DISK && force && 339 if (rv == SS_NO_UP_TO_DATE_DISK && force &&
288 (mdev->state.disk < D_UP_TO_DATE && 340 (mdev->state.disk < D_UP_TO_DATE &&
289 mdev->state.disk >= D_INCONSISTENT)) { 341 mdev->state.disk >= D_INCONSISTENT)) {
290 mask.disk = D_MASK; 342 mask.disk = D_MASK;
@@ -293,7 +345,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
293 continue; 345 continue;
294 } 346 }
295 347
296 if (r == SS_NO_UP_TO_DATE_DISK && 348 if (rv == SS_NO_UP_TO_DATE_DISK &&
297 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { 349 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
298 D_ASSERT(mdev->state.pdsk == D_UNKNOWN); 350 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
299 nps = drbd_try_outdate_peer(mdev); 351 nps = drbd_try_outdate_peer(mdev);
@@ -309,9 +361,9 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
309 continue; 361 continue;
310 } 362 }
311 363
312 if (r == SS_NOTHING_TO_DO) 364 if (rv == SS_NOTHING_TO_DO)
313 goto fail; 365 goto fail;
314 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { 366 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
315 nps = drbd_try_outdate_peer(mdev); 367 nps = drbd_try_outdate_peer(mdev);
316 368
317 if (force && nps > D_OUTDATED) { 369 if (force && nps > D_OUTDATED) {
@@ -324,25 +376,24 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
324 376
325 continue; 377 continue;
326 } 378 }
327 if (r == SS_TWO_PRIMARIES) { 379 if (rv == SS_TWO_PRIMARIES) {
328 /* Maybe the peer is detected as dead very soon... 380 /* Maybe the peer is detected as dead very soon...
329 retry at most once more in this case. */ 381 retry at most once more in this case. */
330 __set_current_state(TASK_INTERRUPTIBLE); 382 schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
331 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
332 if (try < max_tries) 383 if (try < max_tries)
333 try = max_tries - 1; 384 try = max_tries - 1;
334 continue; 385 continue;
335 } 386 }
336 if (r < SS_SUCCESS) { 387 if (rv < SS_SUCCESS) {
337 r = _drbd_request_state(mdev, mask, val, 388 rv = _drbd_request_state(mdev, mask, val,
338 CS_VERBOSE + CS_WAIT_COMPLETE); 389 CS_VERBOSE + CS_WAIT_COMPLETE);
339 if (r < SS_SUCCESS) 390 if (rv < SS_SUCCESS)
340 goto fail; 391 goto fail;
341 } 392 }
342 break; 393 break;
343 } 394 }
344 395
345 if (r < SS_SUCCESS) 396 if (rv < SS_SUCCESS)
346 goto fail; 397 goto fail;
347 398
348 if (forced) 399 if (forced)
@@ -352,7 +403,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
352 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); 403 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
353 404
354 if (new_role == R_SECONDARY) { 405 if (new_role == R_SECONDARY) {
355 set_disk_ro(mdev->vdisk, TRUE); 406 set_disk_ro(mdev->vdisk, true);
356 if (get_ldev(mdev)) { 407 if (get_ldev(mdev)) {
357 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 408 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
358 put_ldev(mdev); 409 put_ldev(mdev);
@@ -362,7 +413,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
362 mdev->net_conf->want_lose = 0; 413 mdev->net_conf->want_lose = 0;
363 put_net_conf(mdev); 414 put_net_conf(mdev);
364 } 415 }
365 set_disk_ro(mdev->vdisk, FALSE); 416 set_disk_ro(mdev->vdisk, false);
366 if (get_ldev(mdev)) { 417 if (get_ldev(mdev)) {
367 if (((mdev->state.conn < C_CONNECTED || 418 if (((mdev->state.conn < C_CONNECTED ||
368 mdev->state.pdsk <= D_FAILED) 419 mdev->state.pdsk <= D_FAILED)
@@ -374,10 +425,8 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
374 } 425 }
375 } 426 }
376 427
377 if ((new_role == R_SECONDARY) && get_ldev(mdev)) { 428 /* writeout of activity log covered areas of the bitmap
378 drbd_al_to_on_disk_bm(mdev); 429 * to stable storage is already done in the after-state-change work */
379 put_ldev(mdev);
380 }
381 430
382 if (mdev->state.conn >= C_WF_REPORT_PARAMS) { 431 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
383 /* if this was forced, we should consider sync */ 432 /* if this was forced, we should consider sync */
@@ -391,9 +440,42 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
391 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 440 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
392 fail: 441 fail:
393 mutex_unlock(&mdev->state_mutex); 442 mutex_unlock(&mdev->state_mutex);
394 return r; 443 return rv;
395} 444}
396 445
446static struct drbd_conf *ensure_mdev(int minor, int create)
447{
448 struct drbd_conf *mdev;
449
450 if (minor >= minor_count)
451 return NULL;
452
453 mdev = minor_to_mdev(minor);
454
455 if (!mdev && create) {
456 struct gendisk *disk = NULL;
457 mdev = drbd_new_device(minor);
458
459 spin_lock_irq(&drbd_pp_lock);
460 if (minor_table[minor] == NULL) {
461 minor_table[minor] = mdev;
462 disk = mdev->vdisk;
463 mdev = NULL;
464 } /* else: we lost the race */
465 spin_unlock_irq(&drbd_pp_lock);
466
467 if (disk) /* we won the race above */
468 /* in case we ever add a drbd_delete_device(),
469 * don't forget the del_gendisk! */
470 add_disk(disk);
471 else /* we lost the race above */
472 drbd_free_mdev(mdev);
473
474 mdev = minor_to_mdev(minor);
475 }
476
477 return mdev;
478}
397 479
398static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 480static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
399 struct drbd_nl_cfg_reply *reply) 481 struct drbd_nl_cfg_reply *reply)
@@ -463,17 +545,19 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
463 } 545 }
464} 546}
465 547
548/* input size is expected to be in KB */
466char *ppsize(char *buf, unsigned long long size) 549char *ppsize(char *buf, unsigned long long size)
467{ 550{
468 /* Needs 9 bytes at max. */ 551 /* Needs 9 bytes at max including trailing NUL:
552 * -1ULL ==> "16384 EB" */
469 static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; 553 static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
470 int base = 0; 554 int base = 0;
471 while (size >= 10000) { 555 while (size >= 10000 && base < sizeof(units)-1) {
472 /* shift + round */ 556 /* shift + round */
473 size = (size >> 10) + !!(size & (1<<9)); 557 size = (size >> 10) + !!(size & (1<<9));
474 base++; 558 base++;
475 } 559 }
476 sprintf(buf, "%lu %cB", (long)size, units[base]); 560 sprintf(buf, "%u %cB", (unsigned)size, units[base]);
477 561
478 return buf; 562 return buf;
479} 563}
@@ -494,6 +578,8 @@ char *ppsize(char *buf, unsigned long long size)
494void drbd_suspend_io(struct drbd_conf *mdev) 578void drbd_suspend_io(struct drbd_conf *mdev)
495{ 579{
496 set_bit(SUSPEND_IO, &mdev->flags); 580 set_bit(SUSPEND_IO, &mdev->flags);
581 if (is_susp(mdev->state))
582 return;
497 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); 583 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
498} 584}
499 585
@@ -510,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
510 * Returns 0 on success, negative return values indicate errors. 596 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function. 597 * You should call drbd_md_sync() after calling this function.
512 */ 598 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 599enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
514{ 600{
515 sector_t prev_first_sect, prev_size; /* previous meta location */ 601 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size; 602 sector_t la_size;
@@ -575,11 +661,19 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_
575 || prev_size != mdev->ldev->md.md_size_sect; 661 || prev_size != mdev->ldev->md.md_size_sect;
576 662
577 if (la_size_changed || md_moved) { 663 if (la_size_changed || md_moved) {
664 int err;
665
578 drbd_al_shrink(mdev); /* All extents inactive. */ 666 drbd_al_shrink(mdev); /* All extents inactive. */
579 dev_info(DEV, "Writing the whole bitmap, %s\n", 667 dev_info(DEV, "Writing the whole bitmap, %s\n",
580 la_size_changed && md_moved ? "size changed and md moved" : 668 la_size_changed && md_moved ? "size changed and md moved" :
581 la_size_changed ? "size changed" : "md moved"); 669 la_size_changed ? "size changed" : "md moved");
582 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ 670 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
671 err = drbd_bitmap_io(mdev, &drbd_bm_write,
672 "size changed", BM_LOCKED_MASK);
673 if (err) {
674 rv = dev_size_error;
675 goto out;
676 }
583 drbd_md_mark_dirty(mdev); 677 drbd_md_mark_dirty(mdev);
584 } 678 }
585 679
@@ -698,45 +792,91 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
698 return 0; 792 return 0;
699} 793}
700 794
701void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) 795static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
702{ 796{
703 struct request_queue * const q = mdev->rq_queue; 797 struct request_queue * const q = mdev->rq_queue;
704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 798 int max_hw_sectors = max_bio_size >> 9;
705 int max_segments = mdev->ldev->dc.max_bio_bvecs; 799 int max_segments = 0;
706 800
707 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); 801 if (get_ldev_if_state(mdev, D_ATTACHING)) {
802 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
803
804 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
805 max_segments = mdev->ldev->dc.max_bio_bvecs;
806 put_ldev(mdev);
807 }
708 808
709 blk_queue_max_hw_sectors(q, max_seg_s >> 9);
710 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
711 blk_queue_max_segment_size(q, max_seg_s);
712 blk_queue_logical_block_size(q, 512); 809 blk_queue_logical_block_size(q, 512);
713 blk_queue_segment_boundary(q, PAGE_SIZE-1); 810 blk_queue_max_hw_sectors(q, max_hw_sectors);
714 blk_stack_limits(&q->limits, &b->limits, 0); 811 /* This is the workaround for "bio would need to, but cannot, be split" */
812 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
813 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
814
815 if (get_ldev_if_state(mdev, D_ATTACHING)) {
816 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
817
818 blk_queue_stack_limits(q, b);
819
820 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
821 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
822 q->backing_dev_info.ra_pages,
823 b->backing_dev_info.ra_pages);
824 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
825 }
826 put_ldev(mdev);
827 }
828}
715 829
716 if (b->merge_bvec_fn) 830void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
717 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", 831{
718 b->merge_bvec_fn); 832 int now, new, local, peer;
719 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); 833
834 now = queue_max_hw_sectors(mdev->rq_queue) << 9;
835 local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
836 peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */
837
838 if (get_ldev_if_state(mdev, D_ATTACHING)) {
839 local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
840 mdev->local_max_bio_size = local;
841 put_ldev(mdev);
842 }
720 843
721 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 844 /* We may ignore peer limits if the peer is modern enough:
722 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 845 from 8.3.8 onwards the peer can use multiple
723 q->backing_dev_info.ra_pages, 846 BIOs for a single peer_request */
724 b->backing_dev_info.ra_pages); 847 if (mdev->state.conn >= C_CONNECTED) {
725 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 848 if (mdev->agreed_pro_version < 94)
849 peer = mdev->peer_max_bio_size;
850 else if (mdev->agreed_pro_version == 94)
851 peer = DRBD_MAX_SIZE_H80_PACKET;
852 else /* drbd 8.3.8 onwards */
853 peer = DRBD_MAX_BIO_SIZE;
726 } 854 }
855
856 new = min_t(int, local, peer);
857
858 if (mdev->state.role == R_PRIMARY && new < now)
859 dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
860
861 if (new != now)
862 dev_info(DEV, "max BIO size = %u\n", new);
863
864 drbd_setup_queue_param(mdev, new);
727} 865}
728 866
729/* serialize deconfig (worker exiting, doing cleanup) 867/* serialize deconfig (worker exiting, doing cleanup)
730 * and reconfig (drbdsetup disk, drbdsetup net) 868 * and reconfig (drbdsetup disk, drbdsetup net)
731 * 869 *
732 * wait for a potentially exiting worker, then restart it, 870 * Wait for a potentially exiting worker, then restart it,
733 * or start a new one. 871 * or start a new one. Flush any pending work; there may still be an
872 * after_state_change queued.
734 */ 873 */
735static void drbd_reconfig_start(struct drbd_conf *mdev) 874static void drbd_reconfig_start(struct drbd_conf *mdev)
736{ 875{
737 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); 876 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
738 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); 877 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
739 drbd_thread_start(&mdev->worker); 878 drbd_thread_start(&mdev->worker);
879 drbd_flush_workqueue(mdev);
740} 880}
741 881
742/* if still unconfigured, stops worker again. 882/* if still unconfigured, stops worker again.
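To make the negotiation in drbd_reconsider_max_bio_size() above easier to follow: the effective limit is the minimum of the local capability and the peer's, where the peer value depends on the agreed protocol version (pre-94 keeps the last known value, exactly 94 is capped at the 8.0-era packet size, newer peers can split BIOs). The sketch below uses placeholder constants for the demo, not the real DRBD_MAX_* values.

#include <stdio.h>

/* placeholder limits for the demo, not the real DRBD_MAX_* constants */
#define DEMO_MAX_BIO_SIZE_SAFE	(32u * 1024)	/* assumed pre-connect fallback */
#define DEMO_MAX_SIZE_H80	(32u * 1024)	/* assumed cap for protocol-94 peers */
#define DEMO_MAX_BIO_SIZE	(128u * 1024)	/* assumed cap for 8.3.8+ peers */

static unsigned negotiate_max_bio_size(unsigned local, unsigned last_known_peer,
				       int connected, int agreed_pro_version)
{
	unsigned peer = last_known_peer;	/* not connected: stick with the stored value */

	if (connected) {
		if (agreed_pro_version < 94)
			peer = last_known_peer;		/* old peer: trust the last known value */
		else if (agreed_pro_version == 94)
			peer = DEMO_MAX_SIZE_H80;
		else
			peer = DEMO_MAX_BIO_SIZE;	/* 8.3.8 onwards: peer can split BIOs */
	}
	return local < peer ? local : peer;
}

int main(void)
{
	unsigned local = 128u * 1024;

	printf("%u\n", negotiate_max_bio_size(local, DEMO_MAX_BIO_SIZE_SAFE, 0, 0));	/* 32768 */
	printf("%u\n", negotiate_max_bio_size(local, DEMO_MAX_BIO_SIZE_SAFE, 1, 96));	/* 131072 */
	return 0;
}
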
@@ -756,20 +896,43 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
756 wake_up(&mdev->state_wait); 896 wake_up(&mdev->state_wait);
757} 897}
758 898
899/* Make sure IO is suspended before calling this function(). */
900static void drbd_suspend_al(struct drbd_conf *mdev)
901{
902 int s = 0;
903
904 if (lc_try_lock(mdev->act_log)) {
905 drbd_al_shrink(mdev);
906 lc_unlock(mdev->act_log);
907 } else {
908 dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
909 return;
910 }
911
912 spin_lock_irq(&mdev->req_lock);
913 if (mdev->state.conn < C_CONNECTED)
914 s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
915
916 spin_unlock_irq(&mdev->req_lock);
917
918 if (s)
919 dev_info(DEV, "Suspended AL updates\n");
920}
921
759/* does always return 0; 922/* does always return 0;
760 * interesting return code is in reply->ret_code */ 923 * interesting return code is in reply->ret_code */
761static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 924static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
762 struct drbd_nl_cfg_reply *reply) 925 struct drbd_nl_cfg_reply *reply)
763{ 926{
764 enum drbd_ret_codes retcode; 927 enum drbd_ret_code retcode;
765 enum determine_dev_size dd; 928 enum determine_dev_size dd;
766 sector_t max_possible_sectors; 929 sector_t max_possible_sectors;
767 sector_t min_md_device_sectors; 930 sector_t min_md_device_sectors;
768 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ 931 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
769 struct inode *inode, *inode2; 932 struct block_device *bdev;
770 struct lru_cache *resync_lru = NULL; 933 struct lru_cache *resync_lru = NULL;
771 union drbd_state ns, os; 934 union drbd_state ns, os;
772 int rv; 935 enum drbd_state_rv rv;
773 int cp_discovered = 0; 936 int cp_discovered = 0;
774 int logical_block_size; 937 int logical_block_size;
775 938
@@ -780,6 +943,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
780 retcode = ERR_DISK_CONFIGURED; 943 retcode = ERR_DISK_CONFIGURED;
781 goto fail; 944 goto fail;
782 } 945 }
946 /* It may just now have detached because of IO error. Make sure
947 * drbd_ldev_destroy is already done; we may end up here very fast,
948 * e.g. if someone calls attach from the on-io-error handler,
949 * to realize a "hot spare" feature (not that I'd recommend that) */
950 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
783 951
784 /* allocation not in the IO path, cqueue thread context */ 952 /* allocation not in the IO path, cqueue thread context */
785 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); 953 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -803,46 +971,49 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
803 goto fail; 971 goto fail;
804 } 972 }
805 973
806 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); 974 if (get_net_conf(mdev)) {
807 if (IS_ERR(nbc->lo_file)) { 975 int prot = mdev->net_conf->wire_protocol;
808 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, 976 put_net_conf(mdev);
809 PTR_ERR(nbc->lo_file)); 977 if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
810 nbc->lo_file = NULL; 978 retcode = ERR_STONITH_AND_PROT_A;
811 retcode = ERR_OPEN_DISK; 979 goto fail;
812 goto fail; 980 }
813 } 981 }
814 982
815 inode = nbc->lo_file->f_dentry->d_inode; 983 bdev = blkdev_get_by_path(nbc->dc.backing_dev,
816 984 FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
817 if (!S_ISBLK(inode->i_mode)) { 985 if (IS_ERR(bdev)) {
818 retcode = ERR_DISK_NOT_BDEV; 986 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
987 PTR_ERR(bdev));
988 retcode = ERR_OPEN_DISK;
819 goto fail; 989 goto fail;
820 } 990 }
991 nbc->backing_bdev = bdev;
821 992
822 nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); 993 /*
823 if (IS_ERR(nbc->md_file)) { 994 * meta_dev_idx >= 0: external fixed size, possibly multiple
995 * drbd sharing one meta device. TODO in that case, paranoia
996 * check that [md_bdev, meta_dev_idx] is not yet used by some
997 * other drbd minor! (if you use drbd.conf + drbdadm, that
998 * should check it for you already; but if you don't, or
999 * someone fooled it, we need to double check here)
1000 */
1001 bdev = blkdev_get_by_path(nbc->dc.meta_dev,
1002 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1003 (nbc->dc.meta_dev_idx < 0) ?
1004 (void *)mdev : (void *)drbd_m_holder);
1005 if (IS_ERR(bdev)) {
824 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, 1006 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
825 PTR_ERR(nbc->md_file)); 1007 PTR_ERR(bdev));
826 nbc->md_file = NULL;
827 retcode = ERR_OPEN_MD_DISK; 1008 retcode = ERR_OPEN_MD_DISK;
828 goto fail; 1009 goto fail;
829 } 1010 }
1011 nbc->md_bdev = bdev;
830 1012
831 inode2 = nbc->md_file->f_dentry->d_inode; 1013 if ((nbc->backing_bdev == nbc->md_bdev) !=
832 1014 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
833 if (!S_ISBLK(inode2->i_mode)) { 1015 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
834 retcode = ERR_MD_NOT_BDEV; 1016 retcode = ERR_MD_IDX_INVALID;
835 goto fail;
836 }
837
838 nbc->backing_bdev = inode->i_bdev;
839 if (bd_claim(nbc->backing_bdev, mdev)) {
840 printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
841 nbc->backing_bdev, mdev,
842 nbc->backing_bdev->bd_holder,
843 nbc->backing_bdev->bd_contains->bd_holder,
844 nbc->backing_bdev->bd_holders);
845 retcode = ERR_BDCLAIM_DISK;
846 goto fail; 1017 goto fail;
847 } 1018 }
848 1019
@@ -851,28 +1022,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
851 offsetof(struct bm_extent, lce)); 1022 offsetof(struct bm_extent, lce));
852 if (!resync_lru) { 1023 if (!resync_lru) {
853 retcode = ERR_NOMEM; 1024 retcode = ERR_NOMEM;
854 goto release_bdev_fail; 1025 goto fail;
855 }
856
857 /* meta_dev_idx >= 0: external fixed size,
858 * possibly multiple drbd sharing one meta device.
859 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
860 * not yet used by some other drbd minor!
861 * (if you use drbd.conf + drbdadm,
862 * that should check it for you already; but if you don't, or someone
863 * fooled it, we need to double check here) */
864 nbc->md_bdev = inode2->i_bdev;
865 if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
866 : (void *) drbd_m_holder)) {
867 retcode = ERR_BDCLAIM_MD_DISK;
868 goto release_bdev_fail;
869 }
870
871 if ((nbc->backing_bdev == nbc->md_bdev) !=
872 (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
873 nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
874 retcode = ERR_MD_IDX_INVALID;
875 goto release_bdev2_fail;
876 } 1026 }
877 1027
878 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ 1028 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
@@ -883,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
883 (unsigned long long) drbd_get_max_capacity(nbc), 1033 (unsigned long long) drbd_get_max_capacity(nbc),
884 (unsigned long long) nbc->dc.disk_size); 1034 (unsigned long long) nbc->dc.disk_size);
885 retcode = ERR_DISK_TO_SMALL; 1035 retcode = ERR_DISK_TO_SMALL;
886 goto release_bdev2_fail; 1036 goto fail;
887 } 1037 }
888 1038
889 if (nbc->dc.meta_dev_idx < 0) { 1039 if (nbc->dc.meta_dev_idx < 0) {
@@ -900,7 +1050,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
900 dev_warn(DEV, "refusing attach: md-device too small, " 1050 dev_warn(DEV, "refusing attach: md-device too small, "
901 "at least %llu sectors needed for this meta-disk type\n", 1051 "at least %llu sectors needed for this meta-disk type\n",
902 (unsigned long long) min_md_device_sectors); 1052 (unsigned long long) min_md_device_sectors);
903 goto release_bdev2_fail; 1053 goto fail;
904 } 1054 }
905 1055
906 /* Make sure the new disk is big enough 1056 /* Make sure the new disk is big enough
@@ -908,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
908 if (drbd_get_max_capacity(nbc) < 1058 if (drbd_get_max_capacity(nbc) <
909 drbd_get_capacity(mdev->this_bdev)) { 1059 drbd_get_capacity(mdev->this_bdev)) {
910 retcode = ERR_DISK_TO_SMALL; 1060 retcode = ERR_DISK_TO_SMALL;
911 goto release_bdev2_fail; 1061 goto fail;
912 } 1062 }
913 1063
914 nbc->known_size = drbd_get_capacity(nbc->backing_bdev); 1064 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -924,14 +1074,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
924 1074
925 drbd_suspend_io(mdev); 1075 drbd_suspend_io(mdev);
926 /* also wait for the last barrier ack. */ 1076 /* also wait for the last barrier ack. */
927 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); 1077 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
928 /* and for any other previously queued work */ 1078 /* and for any other previously queued work */
929 drbd_flush_workqueue(mdev); 1079 drbd_flush_workqueue(mdev);
930 1080
931 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); 1081 rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
1082 retcode = rv; /* FIXME: Type mismatch. */
932 drbd_resume_io(mdev); 1083 drbd_resume_io(mdev);
933 if (retcode < SS_SUCCESS) 1084 if (rv < SS_SUCCESS)
934 goto release_bdev2_fail; 1085 goto fail;
935 1086
936 if (!get_ldev_if_state(mdev, D_ATTACHING)) 1087 if (!get_ldev_if_state(mdev, D_ATTACHING))
937 goto force_diskless; 1088 goto force_diskless;
@@ -999,9 +1150,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
999 /* Reset the "barriers don't work" bits here, then force meta data to 1150 /* Reset the "barriers don't work" bits here, then force meta data to
1000 * be written, to ensure we determine if barriers are supported. */ 1151 * be written, to ensure we determine if barriers are supported. */
1001 if (nbc->dc.no_md_flush) 1152 if (nbc->dc.no_md_flush)
1002 set_bit(MD_NO_BARRIER, &mdev->flags); 1153 set_bit(MD_NO_FUA, &mdev->flags);
1003 else 1154 else
1004 clear_bit(MD_NO_BARRIER, &mdev->flags); 1155 clear_bit(MD_NO_FUA, &mdev->flags);
1005 1156
1006 /* Point of no return reached. 1157 /* Point of no return reached.
1007 * Devices and memory are no longer released by error cleanup below. 1158 * Devices and memory are no longer released by error cleanup below.
@@ -1013,15 +1164,16 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1013 nbc = NULL; 1164 nbc = NULL;
1014 resync_lru = NULL; 1165 resync_lru = NULL;
1015 1166
1016 mdev->write_ordering = WO_bio_barrier; 1167 mdev->write_ordering = WO_bdev_flush;
1017 drbd_bump_write_ordering(mdev, WO_bio_barrier); 1168 drbd_bump_write_ordering(mdev, WO_bdev_flush);
1018 1169
1019 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) 1170 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1020 set_bit(CRASHED_PRIMARY, &mdev->flags); 1171 set_bit(CRASHED_PRIMARY, &mdev->flags);
1021 else 1172 else
1022 clear_bit(CRASHED_PRIMARY, &mdev->flags); 1173 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1023 1174
1024 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { 1175 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1176 !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
1025 set_bit(CRASHED_PRIMARY, &mdev->flags); 1177 set_bit(CRASHED_PRIMARY, &mdev->flags);
1026 cp_discovered = 1; 1178 cp_discovered = 1;
1027 } 1179 }
@@ -1031,7 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1031 mdev->read_cnt = 0; 1183 mdev->read_cnt = 0;
1032 mdev->writ_cnt = 0; 1184 mdev->writ_cnt = 0;
1033 1185
1034 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); 1186 drbd_reconsider_max_bio_size(mdev);
1035 1187
1036 /* If I am currently not R_PRIMARY, 1188 /* If I am currently not R_PRIMARY,
1037 * but meta data primary indicator is set, 1189 * but meta data primary indicator is set,
@@ -1053,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1053 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) 1205 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1054 set_bit(USE_DEGR_WFC_T, &mdev->flags); 1206 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1055 1207
1056 dd = drbd_determin_dev_size(mdev, 0); 1208 dd = drbd_determine_dev_size(mdev, 0);
1057 if (dd == dev_size_error) { 1209 if (dd == dev_size_error) {
1058 retcode = ERR_NOMEM_BITMAP; 1210 retcode = ERR_NOMEM_BITMAP;
1059 goto force_diskless_dec; 1211 goto force_diskless_dec;
@@ -1063,12 +1215,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1063 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { 1215 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1064 dev_info(DEV, "Assuming that all blocks are out of sync " 1216 dev_info(DEV, "Assuming that all blocks are out of sync "
1065 "(aka FullSync)\n"); 1217 "(aka FullSync)\n");
1066 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { 1218 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
1219 "set_n_write from attaching", BM_LOCKED_MASK)) {
1067 retcode = ERR_IO_MD_DISK; 1220 retcode = ERR_IO_MD_DISK;
1068 goto force_diskless_dec; 1221 goto force_diskless_dec;
1069 } 1222 }
1070 } else { 1223 } else {
1071 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { 1224 if (drbd_bitmap_io(mdev, &drbd_bm_read,
1225 "read from attaching", BM_LOCKED_MASK) < 0) {
1072 retcode = ERR_IO_MD_DISK; 1226 retcode = ERR_IO_MD_DISK;
1073 goto force_diskless_dec; 1227 goto force_diskless_dec;
1074 } 1228 }
@@ -1076,9 +1230,16 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1076 1230
1077 if (cp_discovered) { 1231 if (cp_discovered) {
1078 drbd_al_apply_to_bm(mdev); 1232 drbd_al_apply_to_bm(mdev);
1079 drbd_al_to_on_disk_bm(mdev); 1233 if (drbd_bitmap_io(mdev, &drbd_bm_write,
1234 "crashed primary apply AL", BM_LOCKED_MASK)) {
1235 retcode = ERR_IO_MD_DISK;
1236 goto force_diskless_dec;
1237 }
1080 } 1238 }
1081 1239
1240 if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
1241 drbd_suspend_al(mdev); /* IO is still suspended here... */
1242
1082 spin_lock_irq(&mdev->req_lock); 1243 spin_lock_irq(&mdev->req_lock);
1083 os = mdev->state; 1244 os = mdev->state;
1084 ns.i = os.i; 1245 ns.i = os.i;
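
The two added lines above suspend the activity log as soon as every bitmap bit is set, i.e. when the whole device is already known to be out of sync (for instance right after the full-sync branch a few hunks earlier); per-extent write tracking adds no information in that state. As a quick sanity check on the condition, with DRBD's 4 KiB-per-bit bitmap granularity a 1 TiB backing device has 2^40 B / 2^12 B = 2^28 = 268,435,456 bits, and drbd_suspend_al() only runs when _drbd_bm_total_weight() equals exactly that count. This is presumably the same suspended-AL state surfaced as the extra 's' flag (AL_SUSPENDED) added to the /proc/drbd status line in the drbd_proc.c hunks further down.
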
@@ -1146,20 +1307,16 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1146 force_diskless_dec: 1307 force_diskless_dec:
1147 put_ldev(mdev); 1308 put_ldev(mdev);
1148 force_diskless: 1309 force_diskless:
1149 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 1310 drbd_force_state(mdev, NS(disk, D_FAILED));
1150 drbd_md_sync(mdev); 1311 drbd_md_sync(mdev);
1151 release_bdev2_fail:
1152 if (nbc)
1153 bd_release(nbc->md_bdev);
1154 release_bdev_fail:
1155 if (nbc)
1156 bd_release(nbc->backing_bdev);
1157 fail: 1312 fail:
1158 if (nbc) { 1313 if (nbc) {
1159 if (nbc->lo_file) 1314 if (nbc->backing_bdev)
1160 fput(nbc->lo_file); 1315 blkdev_put(nbc->backing_bdev,
1161 if (nbc->md_file) 1316 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1162 fput(nbc->md_file); 1317 if (nbc->md_bdev)
1318 blkdev_put(nbc->md_bdev,
1319 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1163 kfree(nbc); 1320 kfree(nbc);
1164 } 1321 }
1165 lc_destroy(resync_lru); 1322 lc_destroy(resync_lru);
@@ -1169,10 +1326,27 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1169 return 0; 1326 return 0;
1170} 1327}
1171 1328
1329/* Detaching the disk is a process in multiple stages. First we need to lock
1330 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1331 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1332 * internal references as well.
1333 * Only then we have finally detached. */
1172static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1334static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1173 struct drbd_nl_cfg_reply *reply) 1335 struct drbd_nl_cfg_reply *reply)
1174{ 1336{
1175 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1337 enum drbd_ret_code retcode;
1338 int ret;
1339 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1340 retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
1341 /* D_FAILED will transition to DISKLESS. */
1342 ret = wait_event_interruptible(mdev->misc_wait,
1343 mdev->state.disk != D_FAILED);
1344 drbd_resume_io(mdev);
1345 if ((int)retcode == (int)SS_IS_DISKLESS)
1346 retcode = SS_NOTHING_TO_DO;
1347 if (ret)
1348 retcode = ERR_INTR;
1349 reply->ret_code = retcode;
1176 return 0; 1350 return 0;
1177} 1351}
1178 1352
@@ -1180,7 +1354,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1180 struct drbd_nl_cfg_reply *reply) 1354 struct drbd_nl_cfg_reply *reply)
1181{ 1355{
1182 int i, ns; 1356 int i, ns;
1183 enum drbd_ret_codes retcode; 1357 enum drbd_ret_code retcode;
1184 struct net_conf *new_conf = NULL; 1358 struct net_conf *new_conf = NULL;
1185 struct crypto_hash *tfm = NULL; 1359 struct crypto_hash *tfm = NULL;
1186 struct crypto_hash *integrity_w_tfm = NULL; 1360 struct crypto_hash *integrity_w_tfm = NULL;
@@ -1225,6 +1399,8 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1225 new_conf->wire_protocol = DRBD_PROT_C; 1399 new_conf->wire_protocol = DRBD_PROT_C;
1226 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; 1400 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1227 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; 1401 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1402 new_conf->on_congestion = DRBD_ON_CONGESTION_DEF;
1403 new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF;
1228 1404
1229 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { 1405 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1230 retcode = ERR_MANDATORY_TAG; 1406 retcode = ERR_MANDATORY_TAG;
@@ -1235,7 +1411,21 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1235 && (new_conf->wire_protocol != DRBD_PROT_C)) { 1411 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1236 retcode = ERR_NOT_PROTO_C; 1412 retcode = ERR_NOT_PROTO_C;
1237 goto fail; 1413 goto fail;
1238 }; 1414 }
1415
1416 if (get_ldev(mdev)) {
1417 enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
1418 put_ldev(mdev);
1419 if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
1420 retcode = ERR_STONITH_AND_PROT_A;
1421 goto fail;
1422 }
1423 }
1424
1425 if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
1426 retcode = ERR_CONG_NOT_PROTO_A;
1427 goto fail;
1428 }
1239 1429
1240 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { 1430 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1241 retcode = ERR_DISCARD; 1431 retcode = ERR_DISCARD;
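
The two new validations above reject protocol A combined with stonith fencing (the fencing check only runs while a local disk is attached, hence the get_ldev/put_ldev pair) and restrict non-blocking on_congestion policies to protocol A. A throwaway userspace restatement of just those two checks, using local stand-in enums rather than DRBD's real definitions:

    #include <stdio.h>

    enum proto   { PROT_A, PROT_B, PROT_C };
    enum fencing { FP_DONT_CARE, FP_RESOURCE, FP_STONITH };
    enum on_cong { OC_BLOCK, OC_PULL_AHEAD, OC_DISCONNECT };

    static const char *check_net_conf(enum proto p, enum fencing fp, enum on_cong oc)
    {
        if (p == PROT_A && fp == FP_STONITH)
            return "ERR_STONITH_AND_PROT_A";
        if (oc != OC_BLOCK && p != PROT_A)
            return "ERR_CONG_NOT_PROTO_A";
        return "NO_ERROR";
    }

    int main(void)
    {
        /* a non-blocking congestion policy is only accepted with protocol A */
        printf("%s\n", check_net_conf(PROT_C, FP_DONT_CARE, OC_PULL_AHEAD));
        printf("%s\n", check_net_conf(PROT_A, FP_DONT_CARE, OC_PULL_AHEAD));
        return 0;
    }

Run as-is this prints ERR_CONG_NOT_PROTO_A for the protocol-C case and NO_ERROR for the protocol-A one.
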
@@ -1350,6 +1540,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1350 } 1540 }
1351 } 1541 }
1352 1542
1543 drbd_flush_workqueue(mdev);
1353 spin_lock_irq(&mdev->req_lock); 1544 spin_lock_irq(&mdev->req_lock);
1354 if (mdev->net_conf != NULL) { 1545 if (mdev->net_conf != NULL) {
1355 retcode = ERR_NET_CONFIGURED; 1546 retcode = ERR_NET_CONFIGURED;
@@ -1388,10 +1579,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1388 mdev->int_dig_out=int_dig_out; 1579 mdev->int_dig_out=int_dig_out;
1389 mdev->int_dig_in=int_dig_in; 1580 mdev->int_dig_in=int_dig_in;
1390 mdev->int_dig_vv=int_dig_vv; 1581 mdev->int_dig_vv=int_dig_vv;
1582 retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
1391 spin_unlock_irq(&mdev->req_lock); 1583 spin_unlock_irq(&mdev->req_lock);
1392 1584
1393 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1394
1395 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1585 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1396 reply->ret_code = retcode; 1586 reply->ret_code = retcode;
1397 drbd_reconfig_done(mdev); 1587 drbd_reconfig_done(mdev);
@@ -1417,6 +1607,21 @@ static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1417 struct drbd_nl_cfg_reply *reply) 1607 struct drbd_nl_cfg_reply *reply)
1418{ 1608{
1419 int retcode; 1609 int retcode;
1610 struct disconnect dc;
1611
1612 memset(&dc, 0, sizeof(struct disconnect));
1613 if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
1614 retcode = ERR_MANDATORY_TAG;
1615 goto fail;
1616 }
1617
1618 if (dc.force) {
1619 spin_lock_irq(&mdev->req_lock);
1620 if (mdev->state.conn >= C_WF_CONNECTION)
1621 _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
1622 spin_unlock_irq(&mdev->req_lock);
1623 goto done;
1624 }
1420 1625
1421 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); 1626 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1422 1627
@@ -1514,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1514 1719
1515 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; 1720 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1516 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 1721 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
1517 dd = drbd_determin_dev_size(mdev, ddsf); 1722 dd = drbd_determine_dev_size(mdev, ddsf);
1518 drbd_md_sync(mdev); 1723 drbd_md_sync(mdev);
1519 put_ldev(mdev); 1724 put_ldev(mdev);
1520 if (dd == dev_size_error) { 1725 if (dd == dev_size_error) {
@@ -1546,6 +1751,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1546 struct crypto_hash *csums_tfm = NULL; 1751 struct crypto_hash *csums_tfm = NULL;
1547 struct syncer_conf sc; 1752 struct syncer_conf sc;
1548 cpumask_var_t new_cpu_mask; 1753 cpumask_var_t new_cpu_mask;
1754 int *rs_plan_s = NULL;
1755 int fifo_size;
1549 1756
1550 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { 1757 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1551 retcode = ERR_NOMEM; 1758 retcode = ERR_NOMEM;
@@ -1557,6 +1764,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1557 sc.rate = DRBD_RATE_DEF; 1764 sc.rate = DRBD_RATE_DEF;
1558 sc.after = DRBD_AFTER_DEF; 1765 sc.after = DRBD_AFTER_DEF;
1559 sc.al_extents = DRBD_AL_EXTENTS_DEF; 1766 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1767 sc.on_no_data = DRBD_ON_NO_DATA_DEF;
1768 sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
1769 sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
1770 sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
1771 sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
1772 sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
1560 } else 1773 } else
1561 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); 1774 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1562 1775
@@ -1634,6 +1847,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1634 } 1847 }
1635#undef AL_MAX 1848#undef AL_MAX
1636 1849
1850 /* to avoid spurious errors when configuring minors before configuring
1851 * the minors they depend on: if necessary, first create the minor we
1852 * depend on */
1853 if (sc.after >= 0)
1854 ensure_mdev(sc.after, 1);
1855
1637 /* most sanity checks done, try to assign the new sync-after 1856 /* most sanity checks done, try to assign the new sync-after
1638 * dependency. need to hold the global lock in there, 1857 * dependency. need to hold the global lock in there,
1639 * to avoid a race in the dependency loop check. */ 1858 * to avoid a race in the dependency loop check. */
@@ -1641,6 +1860,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1641 if (retcode != NO_ERROR) 1860 if (retcode != NO_ERROR)
1642 goto fail; 1861 goto fail;
1643 1862
1863 fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1864 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
1865 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
1866 if (!rs_plan_s) {
1867 dev_err(DEV, "kmalloc of fifo_buffer failed");
1868 retcode = ERR_NOMEM;
1869 goto fail;
1870 }
1871 }
1872
1644 /* ok, assign the rest of it as well. 1873 /* ok, assign the rest of it as well.
1645 * lock against receive_SyncParam() */ 1874 * lock against receive_SyncParam() */
1646 spin_lock(&mdev->peer_seq_lock); 1875 spin_lock(&mdev->peer_seq_lock);
@@ -1657,6 +1886,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1657 mdev->verify_tfm = verify_tfm; 1886 mdev->verify_tfm = verify_tfm;
1658 verify_tfm = NULL; 1887 verify_tfm = NULL;
1659 } 1888 }
1889
1890 if (fifo_size != mdev->rs_plan_s.size) {
1891 kfree(mdev->rs_plan_s.values);
1892 mdev->rs_plan_s.values = rs_plan_s;
1893 mdev->rs_plan_s.size = fifo_size;
1894 mdev->rs_planed = 0;
1895 rs_plan_s = NULL;
1896 }
1897
1660 spin_unlock(&mdev->peer_seq_lock); 1898 spin_unlock(&mdev->peer_seq_lock);
1661 1899
1662 if (get_ldev(mdev)) { 1900 if (get_ldev(mdev)) {
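
The dynamic resync-controller plumbing added in the two hunks above sizes its FIFO as fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ. Assuming SLEEP_TIME is drbd's usual HZ/10 (100 ms) worker interval, that is one int slot per tenth of a second of planning horizon: c_plan_ahead = 20 (2.0 s) gives (20 * 10 * HZ/10) / HZ = 20 entries, roughly 80 bytes. The buffer is allocated before taking peer_seq_lock, swapped into mdev->rs_plan_s only when the size actually changed, and the local pointer is NULLed on success, so the kfree(rs_plan_s) added under the fail: label frees it only when it was never installed.
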
@@ -1688,6 +1926,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1688 1926
1689 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1927 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1690fail: 1928fail:
1929 kfree(rs_plan_s);
1691 free_cpumask_var(new_cpu_mask); 1930 free_cpumask_var(new_cpu_mask);
1692 crypto_free_hash(csums_tfm); 1931 crypto_free_hash(csums_tfm);
1693 crypto_free_hash(verify_tfm); 1932 crypto_free_hash(verify_tfm);
@@ -1700,6 +1939,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1700{ 1939{
1701 int retcode; 1940 int retcode;
1702 1941
1942 /* If there is still bitmap IO pending, probably because of a previous
1943 * resync just being finished, wait for it before requesting a new resync. */
1944 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1945
1703 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 1946 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1704 1947
1705 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) 1948 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
@@ -1721,12 +1964,42 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1721 return 0; 1964 return 0;
1722} 1965}
1723 1966
1967static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
1968{
1969 int rv;
1970
1971 rv = drbd_bmio_set_n_write(mdev);
1972 drbd_suspend_al(mdev);
1973 return rv;
1974}
1975
1724static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1976static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1725 struct drbd_nl_cfg_reply *reply) 1977 struct drbd_nl_cfg_reply *reply)
1726{ 1978{
1979 int retcode;
1727 1980
1728 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 1981 /* If there is still bitmap IO pending, probably because of a previous
1982 * resync just being finished, wait for it before requesting a new resync. */
1983 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1984
1985 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
1986
1987 if (retcode < SS_SUCCESS) {
1988 if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
1989 /* The peer will get a resync upon connect anyways. Just make that
1990 into a full resync. */
1991 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
1992 if (retcode >= SS_SUCCESS) {
1993 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
1994 "set_n_write from invalidate_peer",
1995 BM_LOCKED_SET_ALLOWED))
1996 retcode = ERR_IO_MD_DISK;
1997 }
1998 } else
1999 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
2000 }
1729 2001
2002 reply->ret_code = retcode;
1730 return 0; 2003 return 0;
1731} 2004}
1732 2005
@@ -1746,9 +2019,17 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1746 struct drbd_nl_cfg_reply *reply) 2019 struct drbd_nl_cfg_reply *reply)
1747{ 2020{
1748 int retcode = NO_ERROR; 2021 int retcode = NO_ERROR;
2022 union drbd_state s;
1749 2023
1750 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) 2024 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
1751 retcode = ERR_PAUSE_IS_CLEAR; 2025 s = mdev->state;
2026 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
2027 retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
2028 s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
2029 } else {
2030 retcode = ERR_PAUSE_IS_CLEAR;
2031 }
2032 }
1752 2033
1753 reply->ret_code = retcode; 2034 reply->ret_code = retcode;
1754 return 0; 2035 return 0;
@@ -1765,7 +2046,20 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1765static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 2046static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1766 struct drbd_nl_cfg_reply *reply) 2047 struct drbd_nl_cfg_reply *reply)
1767{ 2048{
1768 reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); 2049 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
2050 drbd_uuid_new_current(mdev);
2051 clear_bit(NEW_CUR_UUID, &mdev->flags);
2052 }
2053 drbd_suspend_io(mdev);
2054 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
2055 if (reply->ret_code == SS_SUCCESS) {
2056 if (mdev->state.conn < C_CONNECTED)
2057 tl_clear(mdev);
2058 if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
2059 tl_restart(mdev, fail_frozen_disk_io);
2060 }
2061 drbd_resume_io(mdev);
2062
1769 return 0; 2063 return 0;
1770} 2064}
1771 2065
@@ -1873,6 +2167,11 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1873 reply->ret_code = ERR_MANDATORY_TAG; 2167 reply->ret_code = ERR_MANDATORY_TAG;
1874 return 0; 2168 return 0;
1875 } 2169 }
2170
2171 /* If there is still bitmap IO pending, e.g. previous resync or verify
2172 * just being finished, wait for it before requesting a new resync. */
2173 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2174
1876 /* w_make_ov_request expects position to be aligned */ 2175 /* w_make_ov_request expects position to be aligned */
1877 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; 2176 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
1878 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); 2177 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
@@ -1916,7 +2215,8 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1916 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ 2215 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
1917 2216
1918 if (args.clear_bm) { 2217 if (args.clear_bm) {
1919 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); 2218 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
2219 "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
1920 if (err) { 2220 if (err) {
1921 dev_err(DEV, "Writing bitmap failed with %d\n",err); 2221 dev_err(DEV, "Writing bitmap failed with %d\n",err);
1922 retcode = ERR_IO_MD_DISK; 2222 retcode = ERR_IO_MD_DISK;
@@ -1924,6 +2224,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1924 if (skip_initial_sync) { 2224 if (skip_initial_sync) {
1925 drbd_send_uuids_skip_initial_sync(mdev); 2225 drbd_send_uuids_skip_initial_sync(mdev);
1926 _drbd_uuid_set(mdev, UI_BITMAP, 0); 2226 _drbd_uuid_set(mdev, UI_BITMAP, 0);
2227 drbd_print_uuids(mdev, "cleared bitmap UUID");
1927 spin_lock_irq(&mdev->req_lock); 2228 spin_lock_irq(&mdev->req_lock);
1928 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 2229 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
1929 CS_VERBOSE, NULL); 2230 CS_VERBOSE, NULL);
@@ -1941,40 +2242,6 @@ out:
1941 return 0; 2242 return 0;
1942} 2243}
1943 2244
1944static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1945{
1946 struct drbd_conf *mdev;
1947
1948 if (nlp->drbd_minor >= minor_count)
1949 return NULL;
1950
1951 mdev = minor_to_mdev(nlp->drbd_minor);
1952
1953 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1954 struct gendisk *disk = NULL;
1955 mdev = drbd_new_device(nlp->drbd_minor);
1956
1957 spin_lock_irq(&drbd_pp_lock);
1958 if (minor_table[nlp->drbd_minor] == NULL) {
1959 minor_table[nlp->drbd_minor] = mdev;
1960 disk = mdev->vdisk;
1961 mdev = NULL;
1962 } /* else: we lost the race */
1963 spin_unlock_irq(&drbd_pp_lock);
1964
1965 if (disk) /* we won the race above */
1966 /* in case we ever add a drbd_delete_device(),
1967 * don't forget the del_gendisk! */
1968 add_disk(disk);
1969 else /* we lost the race above */
1970 drbd_free_mdev(mdev);
1971
1972 mdev = minor_to_mdev(nlp->drbd_minor);
1973 }
1974
1975 return mdev;
1976}
1977
1978struct cn_handler_struct { 2245struct cn_handler_struct {
1979 int (*function)(struct drbd_conf *, 2246 int (*function)(struct drbd_conf *,
1980 struct drbd_nl_cfg_req *, 2247 struct drbd_nl_cfg_req *,
@@ -2030,18 +2297,20 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2030 return; 2297 return;
2031 } 2298 }
2032 2299
2033 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { 2300 if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
2034 retcode = ERR_PERM; 2301 retcode = ERR_PERM;
2035 goto fail; 2302 goto fail;
2036 } 2303 }
2037 2304
2038 mdev = ensure_mdev(nlp); 2305 mdev = ensure_mdev(nlp->drbd_minor,
2306 (nlp->flags & DRBD_NL_CREATE_DEVICE));
2039 if (!mdev) { 2307 if (!mdev) {
2040 retcode = ERR_MINOR_INVALID; 2308 retcode = ERR_MINOR_INVALID;
2041 goto fail; 2309 goto fail;
2042 } 2310 }
2043 2311
2044 if (nlp->packet_type >= P_nl_after_last_packet) { 2312 if (nlp->packet_type >= P_nl_after_last_packet ||
2313 nlp->packet_type == P_return_code_only) {
2045 retcode = ERR_PACKET_NR; 2314 retcode = ERR_PACKET_NR;
2046 goto fail; 2315 goto fail;
2047 } 2316 }
@@ -2057,7 +2326,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2057 reply_size += cm->reply_body_size; 2326 reply_size += cm->reply_body_size;
2058 2327
2059 /* allocation not in the IO path, cqueue thread context */ 2328 /* allocation not in the IO path, cqueue thread context */
2060 cn_reply = kmalloc(reply_size, GFP_KERNEL); 2329 cn_reply = kzalloc(reply_size, GFP_KERNEL);
2061 if (!cn_reply) { 2330 if (!cn_reply) {
2062 retcode = ERR_NOMEM; 2331 retcode = ERR_NOMEM;
2063 goto fail; 2332 goto fail;
@@ -2065,7 +2334,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2065 reply = (struct drbd_nl_cfg_reply *) cn_reply->data; 2334 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2066 2335
2067 reply->packet_type = 2336 reply->packet_type =
2068 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; 2337 cm->reply_body_size ? nlp->packet_type : P_return_code_only;
2069 reply->minor = nlp->drbd_minor; 2338 reply->minor = nlp->drbd_minor;
2070 reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ 2339 reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
2071 /* reply->tag_list; might be modified by cm->function. */ 2340 /* reply->tag_list; might be modified by cm->function. */
@@ -2228,7 +2497,7 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2228 /* receiver thread context, which is not in the writeout path (of this node), 2497 /* receiver thread context, which is not in the writeout path (of this node),
2229 * but may be in the writeout path of the _other_ node. 2498 * but may be in the writeout path of the _other_ node.
2230 * GFP_NOIO to avoid potential "distributed deadlock". */ 2499 * GFP_NOIO to avoid potential "distributed deadlock". */
2231 cn_reply = kmalloc( 2500 cn_reply = kzalloc(
2232 sizeof(struct cn_msg)+ 2501 sizeof(struct cn_msg)+
2233 sizeof(struct drbd_nl_cfg_reply)+ 2502 sizeof(struct drbd_nl_cfg_reply)+
2234 sizeof(struct dump_ee_tag_len_struct)+ 2503 sizeof(struct dump_ee_tag_len_struct)+
@@ -2250,10 +2519,11 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2250 tl = tl_add_int(tl, T_ee_sector, &e->sector); 2519 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2251 tl = tl_add_int(tl, T_ee_block_id, &e->block_id); 2520 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2252 2521
2522 /* dump the first 32k */
2523 len = min_t(unsigned, e->size, 32 << 10);
2253 put_unaligned(T_ee_data, tl++); 2524 put_unaligned(T_ee_data, tl++);
2254 put_unaligned(e->size, tl++); 2525 put_unaligned(len, tl++);
2255 2526
2256 len = e->size;
2257 page = e->pages; 2527 page = e->pages;
2258 page_chain_for_each(page) { 2528 page_chain_for_each(page) {
2259 void *d = kmap_atomic(page, KM_USER0); 2529 void *d = kmap_atomic(page, KM_USER0);
@@ -2262,6 +2532,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2262 kunmap_atomic(d, KM_USER0); 2532 kunmap_atomic(d, KM_USER0);
2263 tl = (unsigned short*)((char*)tl + l); 2533 tl = (unsigned short*)((char*)tl + l);
2264 len -= l; 2534 len -= l;
2535 if (len == 0)
2536 break;
2265 } 2537 }
2266 put_unaligned(TT_END, tl++); /* Close the tag list */ 2538 put_unaligned(TT_END, tl++); /* Close the tag list */
2267 2539
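
The broadcast of a conflicting epoch entry is now bounded: len = min_t(unsigned, e->size, 32 << 10) caps the dumped payload at 32,768 bytes (eight pages at the usual 4 KiB PAGE_SIZE), the tag length written into the reply is that capped value rather than e->size, and the added 'if (len == 0) break;' stops the page_chain walk once the cap is reached instead of continuing over the remaining pages.
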
@@ -2360,6 +2632,7 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2360 (struct drbd_nl_cfg_reply *)cn_reply->data; 2632 (struct drbd_nl_cfg_reply *)cn_reply->data;
2361 int rr; 2633 int rr;
2362 2634
2635 memset(buffer, 0, sizeof(buffer));
2363 cn_reply->id = req->id; 2636 cn_reply->id = req->id;
2364 2637
2365 cn_reply->seq = req->seq; 2638 cn_reply->seq = req->seq;
@@ -2367,6 +2640,7 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2367 cn_reply->len = sizeof(struct drbd_nl_cfg_reply); 2640 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2368 cn_reply->flags = 0; 2641 cn_reply->flags = 0;
2369 2642
2643 reply->packet_type = P_return_code_only;
2370 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; 2644 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2371 reply->ret_code = ret_code; 2645 reply->ret_code = ret_code;
2372 2646
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..2959cdfb77f5 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -34,6 +34,7 @@
34#include "drbd_int.h" 34#include "drbd_int.h"
35 35
36static int drbd_proc_open(struct inode *inode, struct file *file); 36static int drbd_proc_open(struct inode *inode, struct file *file);
37static int drbd_proc_release(struct inode *inode, struct file *file);
37 38
38 39
39struct proc_dir_entry *drbd_proc; 40struct proc_dir_entry *drbd_proc;
@@ -42,9 +43,22 @@ const struct file_operations drbd_proc_fops = {
42 .open = drbd_proc_open, 43 .open = drbd_proc_open,
43 .read = seq_read, 44 .read = seq_read,
44 .llseek = seq_lseek, 45 .llseek = seq_lseek,
45 .release = single_release, 46 .release = drbd_proc_release,
46}; 47};
47 48
49void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
50{
51 /* v is in kB/sec. We don't expect TiByte/sec yet. */
52 if (unlikely(v >= 1000000)) {
53 /* cool: > GiByte/s */
54 seq_printf(seq, "%ld,", v / 1000000);
55 v /= 1000000;
56 seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
57 } else if (likely(v >= 1000))
58 seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
59 else
60 seq_printf(seq, "%ld", v);
61}
48 62
49/*lge 63/*lge
50 * progress bars shamelessly adapted from driver/md/md.c 64 * progress bars shamelessly adapted from driver/md/md.c
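
A compilable userspace sketch of the new grouping helper (the snprintf-based output and the names here are stand-ins for the seq_file calls); it also shows, as plain arithmetic, what the v >= 1000000 branch produces when it divides v rather than taking the remainder:

    #include <stdio.h>

    /* use_remainder == 0 mirrors the hunk's "v /= 1000000" branch,
     * use_remainder == 1 keeps the sub-million digits instead. */
    static void group_thousands(char *buf, size_t len, long v, int use_remainder)
    {
        if (v >= 1000000) {
            long millions = v / 1000000;
            long rest = use_remainder ? v % 1000000 : millions;
            snprintf(buf, len, "%ld,%03ld,%03ld", millions, rest / 1000, rest % 1000);
        } else if (v >= 1000) {
            snprintf(buf, len, "%ld,%03ld", v / 1000, v % 1000);
        } else {
            snprintf(buf, len, "%ld", v);
        }
    }

    int main(void)
    {
        long samples[] = { 950, 12345, 2345678 };
        char buf[32];
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            group_thousands(buf, sizeof(buf), samples[i], 0);
            printf("as in the hunk:  %8ld -> %s\n", samples[i], buf);
            group_thousands(buf, sizeof(buf), samples[i], 1);
            printf("with remainder:  %8ld -> %s\n", samples[i], buf);
        }
        return 0;
    }

Built with a stock gcc this prints "2,000,002" for 2345678 in the variant matching the hunk and "2,345,678" in the remainder variant; the two lower branches are identical either way.
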
@@ -57,6 +71,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
57 unsigned long db, dt, dbdt, rt, rs_left; 71 unsigned long db, dt, dbdt, rt, rs_left;
58 unsigned int res; 72 unsigned int res;
59 int i, x, y; 73 int i, x, y;
74 int stalled = 0;
60 75
61 drbd_get_syncer_progress(mdev, &rs_left, &res); 76 drbd_get_syncer_progress(mdev, &rs_left, &res);
62 77
@@ -70,10 +85,15 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
70 seq_printf(seq, "."); 85 seq_printf(seq, ".");
71 seq_printf(seq, "] "); 86 seq_printf(seq, "] ");
72 87
73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); 88 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
74 /* if more than 1 GB display in MB */ 89 seq_printf(seq, "verified:");
75 if (mdev->rs_total > 0x100000L) 90 else
76 seq_printf(seq, "(%lu/%lu)M\n\t", 91 seq_printf(seq, "sync'ed:");
92 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
93
94 /* if more than a few GB, display in MB */
95 if (mdev->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
96 seq_printf(seq, "(%lu/%lu)M",
77 (unsigned long) Bit2KB(rs_left >> 10), 97 (unsigned long) Bit2KB(rs_left >> 10),
78 (unsigned long) Bit2KB(mdev->rs_total >> 10)); 98 (unsigned long) Bit2KB(mdev->rs_total >> 10));
79 else 99 else
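
The size threshold for switching the (done/total) figures to MB is now written as 4UL << (30 - BM_BLOCK_SHIFT), i.e. the number of bitmap bits covering 4 GiB (1 GiB is 2^30 bytes and each bit covers 2^BM_BLOCK_SHIFT bytes). With the usual BM_BLOCK_SHIFT of 12 (4 KiB per bit) that is 2^20 = 0x100000 bits, numerically the same literal as before, so the visible change is the self-describing constant and the corrected comment: 0x100000 bits at 4 KiB each is about 4 GiB, not the "1 GB" the old comment claimed.
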
@@ -90,45 +110,76 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
90 * db: blocks written from mark until now 110 * db: blocks written from mark until now
91 * rt: remaining time 111 * rt: remaining time
92 */ 112 */
93 dt = (jiffies - mdev->rs_mark_time) / HZ; 113 /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
94 114 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
95 if (dt > 20) { 115 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
96 /* if we made no update to rs_mark_time for too long, 116 /* ------------------------ ~18s average ------------------------ */
97 * we are stalled. show that. */ 117 i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
98 seq_printf(seq, "stalled\n"); 118 dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
99 return; 119 if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
100 } 120 stalled = 1;
101 121
102 if (!dt) 122 if (!dt)
103 dt++; 123 dt++;
104 db = mdev->rs_mark_left - rs_left; 124 db = mdev->rs_mark_left[i] - rs_left;
105 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ 125 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
106 126
107 seq_printf(seq, "finish: %lu:%02lu:%02lu", 127 seq_printf(seq, "finish: %lu:%02lu:%02lu",
108 rt / 3600, (rt % 3600) / 60, rt % 60); 128 rt / 3600, (rt % 3600) / 60, rt % 60);
109 129
110 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
111 dbdt = Bit2KB(db/dt); 130 dbdt = Bit2KB(db/dt);
112 if (dbdt > 1000) 131 seq_printf(seq, " speed: ");
113 seq_printf(seq, " speed: %ld,%03ld", 132 seq_printf_with_thousands_grouping(seq, dbdt);
114 dbdt/1000, dbdt % 1000); 133 seq_printf(seq, " (");
115 else 134 /* ------------------------- ~3s average ------------------------ */
116 seq_printf(seq, " speed: %ld", dbdt); 135 if (proc_details >= 1) {
136 /* this is what drbd_rs_should_slow_down() uses */
137 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
138 dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
139 if (!dt)
140 dt++;
141 db = mdev->rs_mark_left[i] - rs_left;
142 dbdt = Bit2KB(db/dt);
143 seq_printf_with_thousands_grouping(seq, dbdt);
144 seq_printf(seq, " -- ");
145 }
117 146
147 /* --------------------- long term average ---------------------- */
118 /* mean speed since syncer started 148 /* mean speed since syncer started
119 * we do account for PausedSync periods */ 149 * we do account for PausedSync periods */
120 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; 150 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
121 if (dt <= 0) 151 if (dt == 0)
122 dt = 1; 152 dt = 1;
123 db = mdev->rs_total - rs_left; 153 db = mdev->rs_total - rs_left;
124 dbdt = Bit2KB(db/dt); 154 dbdt = Bit2KB(db/dt);
125 if (dbdt > 1000) 155 seq_printf_with_thousands_grouping(seq, dbdt);
126 seq_printf(seq, " (%ld,%03ld)", 156 seq_printf(seq, ")");
127 dbdt/1000, dbdt % 1000);
128 else
129 seq_printf(seq, " (%ld)", dbdt);
130 157
131 seq_printf(seq, " K/sec\n"); 158 if (mdev->state.conn == C_SYNC_TARGET ||
159 mdev->state.conn == C_VERIFY_S) {
160 seq_printf(seq, " want: ");
161 seq_printf_with_thousands_grouping(seq, mdev->c_sync_rate);
162 }
163 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
164
165 if (proc_details >= 1) {
166 /* 64 bit:
167 * we convert to sectors in the display below. */
168 unsigned long bm_bits = drbd_bm_bits(mdev);
169 unsigned long bit_pos;
170 if (mdev->state.conn == C_VERIFY_S ||
171 mdev->state.conn == C_VERIFY_T)
172 bit_pos = bm_bits - mdev->ov_left;
173 else
174 bit_pos = mdev->bm_resync_fo;
175 /* Total sectors may be slightly off for oddly
176 * sized devices. So what. */
177 seq_printf(seq,
178 "\t%3d%% sector pos: %llu/%llu\n",
179 (int)(bit_pos / (bm_bits/100+1)),
180 (unsigned long long)bit_pos * BM_SECT_PER_BIT,
181 (unsigned long long)bm_bits * BM_SECT_PER_BIT);
182 }
132} 183}
133 184
134static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) 185static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
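
To put numbers on the reworked averaging (hypothetical values, 4 KiB per bitmap bit): if the ~18 s mark window yields dt = 18 s, db = 45,000 bits cleared and rs_left = 2,500,000 bits, then dbdt = Bit2KB(45,000 / 18) = Bit2KB(2,500) = 10,000 kB/s and rt = (18 * (2,500,000 / (45,000/100 + 1))) / 100 = (18 * 5,543) / 100 = 997 s, shown as "finish: 0:16:37". The ~3 s figure printed with proc_details >= 1 runs the same speed formula over the most recent mark slot, and the long-term average still discounts rs_paused time as before.
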
@@ -151,7 +202,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
151 [WO_none] = 'n', 202 [WO_none] = 'n',
152 [WO_drain_io] = 'd', 203 [WO_drain_io] = 'd',
153 [WO_bdev_flush] = 'f', 204 [WO_bdev_flush] = 'f',
154 [WO_bio_barrier] = 'b',
155 }; 205 };
156 206
157 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 207 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
@@ -196,7 +246,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
196 seq_printf(seq, "%2d: cs:Unconfigured\n", i); 246 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
197 } else { 247 } else {
198 seq_printf(seq, 248 seq_printf(seq,
199 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" 249 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
200 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " 250 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
201 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", 251 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
202 i, sn, 252 i, sn,
@@ -206,11 +256,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
206 drbd_disk_str(mdev->state.pdsk), 256 drbd_disk_str(mdev->state.pdsk),
207 (mdev->net_conf == NULL ? ' ' : 257 (mdev->net_conf == NULL ? ' ' :
208 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), 258 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
209 mdev->state.susp ? 's' : 'r', 259 is_susp(mdev->state) ? 's' : 'r',
210 mdev->state.aftr_isp ? 'a' : '-', 260 mdev->state.aftr_isp ? 'a' : '-',
211 mdev->state.peer_isp ? 'p' : '-', 261 mdev->state.peer_isp ? 'p' : '-',
212 mdev->state.user_isp ? 'u' : '-', 262 mdev->state.user_isp ? 'u' : '-',
213 mdev->congestion_reason ?: '-', 263 mdev->congestion_reason ?: '-',
264 test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
214 mdev->send_cnt/2, 265 mdev->send_cnt/2,
215 mdev->recv_cnt/2, 266 mdev->recv_cnt/2,
216 mdev->writ_cnt/2, 267 mdev->writ_cnt/2,
@@ -225,20 +276,16 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
225 mdev->epochs, 276 mdev->epochs,
226 write_ordering_chars[mdev->write_ordering] 277 write_ordering_chars[mdev->write_ordering]
227 ); 278 );
228 seq_printf(seq, " oos:%lu\n", 279 seq_printf(seq, " oos:%llu\n",
229 Bit2KB(drbd_bm_total_weight(mdev))); 280 Bit2KB((unsigned long long)
281 drbd_bm_total_weight(mdev)));
230 } 282 }
231 if (mdev->state.conn == C_SYNC_SOURCE || 283 if (mdev->state.conn == C_SYNC_SOURCE ||
232 mdev->state.conn == C_SYNC_TARGET) 284 mdev->state.conn == C_SYNC_TARGET ||
285 mdev->state.conn == C_VERIFY_S ||
286 mdev->state.conn == C_VERIFY_T)
233 drbd_syncer_progress(mdev, seq); 287 drbd_syncer_progress(mdev, seq);
234 288
235 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
236 seq_printf(seq, "\t%3d%% %lu/%lu\n",
237 (int)((mdev->rs_total-mdev->ov_left) /
238 (mdev->rs_total/100+1)),
239 mdev->rs_total - mdev->ov_left,
240 mdev->rs_total);
241
242 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { 289 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
243 lc_seq_printf_stats(seq, mdev->resync); 290 lc_seq_printf_stats(seq, mdev->resync);
244 lc_seq_printf_stats(seq, mdev->act_log); 291 lc_seq_printf_stats(seq, mdev->act_log);
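
Casting to unsigned long long before Bit2KB() for the oos: figure is presumably a 32-bit consideration: the value used to be printed with %lu, i.e. as an unsigned long, and Bit2KB() multiplies the bit count by four (4 KiB per bit, reported in kB), so once more than 2^30 bits are out of sync - 4 TiB of data - the product no longer fits in 32 bits. Widening before the multiplication keeps the printed value exact on such builds.
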
@@ -258,7 +305,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
258 305
259static int drbd_proc_open(struct inode *inode, struct file *file) 306static int drbd_proc_open(struct inode *inode, struct file *file)
260{ 307{
261 return single_open(file, drbd_seq_show, PDE(inode)->data); 308 if (try_module_get(THIS_MODULE))
309 return single_open(file, drbd_seq_show, PDE(inode)->data);
310 return -ENODEV;
311}
312
313static int drbd_proc_release(struct inode *inode, struct file *file)
314{
315 module_put(THIS_MODULE);
316 return single_release(inode, file);
262} 317}
263 318
264/* PROC FS stuff end */ 319/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..43beaca53179 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -36,7 +36,6 @@
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/smp_lock.h>
40#include <linux/pkt_sched.h> 39#include <linux/pkt_sched.h>
41#define __KERNEL_SYSCALLS__ 40#define __KERNEL_SYSCALLS__
42#include <linux/unistd.h> 41#include <linux/unistd.h>
@@ -49,11 +48,6 @@
49 48
50#include "drbd_vli.h" 49#include "drbd_vli.h"
51 50
52struct flush_work {
53 struct drbd_work w;
54 struct drbd_epoch *epoch;
55};
56
57enum finish_epoch { 51enum finish_epoch {
58 FE_STILL_LIVE, 52 FE_STILL_LIVE,
59 FE_DESTROYED, 53 FE_DESTROYED,
@@ -66,16 +60,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
66static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); 60static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
67static int e_end_block(struct drbd_conf *, struct drbd_work *, int); 61static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
68 62
69static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
70{
71 struct drbd_epoch *prev;
72 spin_lock(&mdev->epoch_lock);
73 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
74 if (prev == epoch || prev == mdev->current_epoch)
75 prev = NULL;
76 spin_unlock(&mdev->epoch_lock);
77 return prev;
78}
79 63
80#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 64#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
81 65
@@ -203,15 +187,6 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
203 return NULL; 187 return NULL;
204} 188}
205 189
206/* kick lower level device, if we have more than (arbitrary number)
207 * reference counts on it, which typically are locally submitted io
208 * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
209static void maybe_kick_lo(struct drbd_conf *mdev)
210{
211 if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
212 drbd_kick_lo(mdev);
213}
214
215static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) 190static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
216{ 191{
217 struct drbd_epoch_entry *e; 192 struct drbd_epoch_entry *e;
@@ -235,13 +210,12 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
235 LIST_HEAD(reclaimed); 210 LIST_HEAD(reclaimed);
236 struct drbd_epoch_entry *e, *t; 211 struct drbd_epoch_entry *e, *t;
237 212
238 maybe_kick_lo(mdev);
239 spin_lock_irq(&mdev->req_lock); 213 spin_lock_irq(&mdev->req_lock);
240 reclaim_net_ee(mdev, &reclaimed); 214 reclaim_net_ee(mdev, &reclaimed);
241 spin_unlock_irq(&mdev->req_lock); 215 spin_unlock_irq(&mdev->req_lock);
242 216
243 list_for_each_entry_safe(e, t, &reclaimed, w.list) 217 list_for_each_entry_safe(e, t, &reclaimed, w.list)
244 drbd_free_ee(mdev, e); 218 drbd_free_net_ee(mdev, e);
245} 219}
246 220
247/** 221/**
@@ -298,10 +272,12 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
298 * Is also used from inside an other spin_lock_irq(&mdev->req_lock); 272 * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
299 * Either links the page chain back to the global pool, 273 * Either links the page chain back to the global pool,
300 * or returns all pages to the system. */ 274 * or returns all pages to the system. */
301static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) 275static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
302{ 276{
277 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
303 int i; 278 int i;
304 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) 279
280 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
305 i = page_chain_free(page); 281 i = page_chain_free(page);
306 else { 282 else {
307 struct page *tmp; 283 struct page *tmp;
@@ -311,10 +287,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
311 drbd_pp_vacant += i; 287 drbd_pp_vacant += i;
312 spin_unlock(&drbd_pp_lock); 288 spin_unlock(&drbd_pp_lock);
313 } 289 }
314 atomic_sub(i, &mdev->pp_in_use); 290 i = atomic_sub_return(i, a);
315 i = atomic_read(&mdev->pp_in_use);
316 if (i < 0) 291 if (i < 0)
317 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); 292 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
293 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
318 wake_up(&drbd_pp_wait); 294 wake_up(&drbd_pp_wait);
319} 295}
320 296
@@ -343,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
343 struct page *page; 319 struct page *page;
344 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 320 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
345 321
346 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) 322 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
347 return NULL; 323 return NULL;
348 324
349 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); 325 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
@@ -357,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
357 if (!page) 333 if (!page)
358 goto fail; 334 goto fail;
359 335
360 INIT_HLIST_NODE(&e->colision); 336 INIT_HLIST_NODE(&e->collision);
361 e->epoch = NULL; 337 e->epoch = NULL;
362 e->mdev = mdev; 338 e->mdev = mdev;
363 e->pages = page; 339 e->pages = page;
@@ -365,7 +341,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
365 e->size = data_size; 341 e->size = data_size;
366 e->flags = 0; 342 e->flags = 0;
367 e->sector = sector; 343 e->sector = sector;
368 e->sector = sector;
369 e->block_id = id; 344 e->block_id = id;
370 345
371 return e; 346 return e;
@@ -375,11 +350,13 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
375 return NULL; 350 return NULL;
376} 351}
377 352
378void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 353void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
379{ 354{
380 drbd_pp_free(mdev, e->pages); 355 if (e->flags & EE_HAS_DIGEST)
356 kfree(e->digest);
357 drbd_pp_free(mdev, e->pages, is_net);
381 D_ASSERT(atomic_read(&e->pending_bios) == 0); 358 D_ASSERT(atomic_read(&e->pending_bios) == 0);
382 D_ASSERT(hlist_unhashed(&e->colision)); 359 D_ASSERT(hlist_unhashed(&e->collision));
383 mempool_free(e, drbd_ee_mempool); 360 mempool_free(e, drbd_ee_mempool);
384} 361}
385 362
@@ -388,13 +365,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
388 LIST_HEAD(work_list); 365 LIST_HEAD(work_list);
389 struct drbd_epoch_entry *e, *t; 366 struct drbd_epoch_entry *e, *t;
390 int count = 0; 367 int count = 0;
368 int is_net = list == &mdev->net_ee;
391 369
392 spin_lock_irq(&mdev->req_lock); 370 spin_lock_irq(&mdev->req_lock);
393 list_splice_init(list, &work_list); 371 list_splice_init(list, &work_list);
394 spin_unlock_irq(&mdev->req_lock); 372 spin_unlock_irq(&mdev->req_lock);
395 373
396 list_for_each_entry_safe(e, t, &work_list, w.list) { 374 list_for_each_entry_safe(e, t, &work_list, w.list) {
397 drbd_free_ee(mdev, e); 375 drbd_free_some_ee(mdev, e, is_net);
398 count++; 376 count++;
399 } 377 }
400 return count; 378 return count;
@@ -423,7 +401,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
423 spin_unlock_irq(&mdev->req_lock); 401 spin_unlock_irq(&mdev->req_lock);
424 402
425 list_for_each_entry_safe(e, t, &reclaimed, w.list) 403 list_for_each_entry_safe(e, t, &reclaimed, w.list)
426 drbd_free_ee(mdev, e); 404 drbd_free_net_ee(mdev, e);
427 405
428 /* possible callbacks here: 406 /* possible callbacks here:
429 * e_end_block, and e_end_resync_block, e_send_discard_ack. 407 * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -448,8 +426,7 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
448 while (!list_empty(head)) { 426 while (!list_empty(head)) {
449 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); 427 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
450 spin_unlock_irq(&mdev->req_lock); 428 spin_unlock_irq(&mdev->req_lock);
451 drbd_kick_lo(mdev); 429 io_schedule();
452 schedule();
453 finish_wait(&mdev->ee_wait, &wait); 430 finish_wait(&mdev->ee_wait, &wait);
454 spin_lock_irq(&mdev->req_lock); 431 spin_lock_irq(&mdev->req_lock);
455 } 432 }
@@ -719,14 +696,14 @@ out:
719static int drbd_send_fp(struct drbd_conf *mdev, 696static int drbd_send_fp(struct drbd_conf *mdev,
720 struct socket *sock, enum drbd_packets cmd) 697 struct socket *sock, enum drbd_packets cmd)
721{ 698{
722 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 699 struct p_header80 *h = &mdev->data.sbuf.header.h80;
723 700
724 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); 701 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
725} 702}
726 703
727static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) 704static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
728{ 705{
729 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 706 struct p_header80 *h = &mdev->data.rbuf.header.h80;
730 int rr; 707 int rr;
731 708
732 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); 709 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -748,16 +725,16 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
748 char tb[4]; 725 char tb[4];
749 726
750 if (!*sock) 727 if (!*sock)
751 return FALSE; 728 return false;
752 729
753 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); 730 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
754 731
755 if (rr > 0 || rr == -EAGAIN) { 732 if (rr > 0 || rr == -EAGAIN) {
756 return TRUE; 733 return true;
757 } else { 734 } else {
758 sock_release(*sock); 735 sock_release(*sock);
759 *sock = NULL; 736 *sock = NULL;
760 return FALSE; 737 return false;
761 } 738 }
762} 739}
763 740
@@ -776,9 +753,6 @@ static int drbd_connect(struct drbd_conf *mdev)
776 753
777 D_ASSERT(!mdev->data.socket); 754 D_ASSERT(!mdev->data.socket);
778 755
779 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
780 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
781
782 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) 756 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
783 return -2; 757 return -2;
784 758
@@ -794,8 +768,7 @@ static int drbd_connect(struct drbd_conf *mdev)
794 if (s || ++try >= 3) 768 if (s || ++try >= 3)
795 break; 769 break;
796 /* give the other side time to call bind() & listen() */ 770 /* give the other side time to call bind() & listen() */
797 __set_current_state(TASK_INTERRUPTIBLE); 771 schedule_timeout_interruptible(HZ / 10);
798 schedule_timeout(HZ / 10);
799 } 772 }
800 773
801 if (s) { 774 if (s) {
@@ -814,8 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev)
814 } 787 }
815 788
816 if (sock && msock) { 789 if (sock && msock) {
817 __set_current_state(TASK_INTERRUPTIBLE); 790 schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
818 schedule_timeout(HZ / 10);
819 ok = drbd_socket_okay(mdev, &sock); 791 ok = drbd_socket_okay(mdev, &sock);
820 ok = drbd_socket_okay(mdev, &msock) && ok; 792 ok = drbd_socket_okay(mdev, &msock) && ok;
821 if (ok) 793 if (ok)
@@ -890,7 +862,7 @@ retry:
890 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; 862 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
891 863
892 /* we don't want delays. 864 /* we don't want delays.
893 * we use TCP_CORK where apropriate, though */ 865 * we use TCP_CORK where appropriate, though */
894 drbd_tcp_nodelay(sock); 866 drbd_tcp_nodelay(sock);
895 drbd_tcp_nodelay(msock); 867 drbd_tcp_nodelay(msock);
896 868
@@ -927,7 +899,7 @@ retry:
927 899
928 drbd_thread_start(&mdev->asender); 900 drbd_thread_start(&mdev->asender);
929 901
930 if (!drbd_send_protocol(mdev)) 902 if (drbd_send_protocol(mdev) == -1)
931 return -1; 903 return -1;
932 drbd_send_sync_param(mdev, &mdev->sync_conf); 904 drbd_send_sync_param(mdev, &mdev->sync_conf);
933 drbd_send_sizes(mdev, 0, 0); 905 drbd_send_sizes(mdev, 0, 0);
@@ -935,6 +907,7 @@ retry:
935 drbd_send_state(mdev); 907 drbd_send_state(mdev);
936 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 908 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
937 clear_bit(RESIZE_PENDING, &mdev->flags); 909 clear_bit(RESIZE_PENDING, &mdev->flags);
910 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
938 911
939 return 1; 912 return 1;
940 913
@@ -946,36 +919,43 @@ out_release_sockets:
946 return -1; 919 return -1;
947} 920}
948 921
949static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) 922static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
950{ 923{
924 union p_header *h = &mdev->data.rbuf.header;
951 int r; 925 int r;
952 926
953 r = drbd_recv(mdev, h, sizeof(*h)); 927 r = drbd_recv(mdev, h, sizeof(*h));
954
955 if (unlikely(r != sizeof(*h))) { 928 if (unlikely(r != sizeof(*h))) {
956 dev_err(DEV, "short read expecting header on sock: r=%d\n", r); 929 if (!signal_pending(current))
957 return FALSE; 930 dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
958 }; 931 return false;
959 h->command = be16_to_cpu(h->command); 932 }
960 h->length = be16_to_cpu(h->length); 933
961 if (unlikely(h->magic != BE_DRBD_MAGIC)) { 934 if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
962 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", 935 *cmd = be16_to_cpu(h->h80.command);
963 (long)be32_to_cpu(h->magic), 936 *packet_size = be16_to_cpu(h->h80.length);
964 h->command, h->length); 937 } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
965 return FALSE; 938 *cmd = be16_to_cpu(h->h95.command);
939 *packet_size = be32_to_cpu(h->h95.length);
940 } else {
941 dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
942 be32_to_cpu(h->h80.magic),
943 be16_to_cpu(h->h80.command),
944 be16_to_cpu(h->h80.length));
945 return false;
966 } 946 }
967 mdev->last_received = jiffies; 947 mdev->last_received = jiffies;
968 948
969 return TRUE; 949 return true;
970} 950}
971 951
972static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) 952static void drbd_flush(struct drbd_conf *mdev)
973{ 953{
974 int rv; 954 int rv;
975 955
976 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 956 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
977 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, 957 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
978 NULL, BLKDEV_IFL_WAIT); 958 NULL);
979 if (rv) { 959 if (rv) {
980 dev_err(DEV, "local disk flush failed with status %d\n", rv); 960 dev_err(DEV, "local disk flush failed with status %d\n", rv);
981 /* would rather check on EOPNOTSUPP, but that is not reliable. 961 /* would rather check on EOPNOTSUPP, but that is not reliable.
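
The reworked drbd_recv_header() above returns the command and payload size through out-parameters and accepts two header flavours read into a union: the original one, whose length is decoded with be16_to_cpu and therefore describes at most 65,535 bytes of payload, and the BE_DRBD_MAGIC_BIG variant whose length is decoded with be32_to_cpu for bigger packets. A minimal userspace sketch of that dispatch follows; the struct layouts and magic constants are illustrative assumptions, not DRBD's wire-format definitions.

    #include <arpa/inet.h>   /* ntohs/ntohl */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct hdr80 { uint32_t magic; uint16_t command; uint16_t length; }; /* 16-bit length */
    struct hdr95 { uint16_t magic; uint16_t command; uint32_t length; }; /* 32-bit length */

    #define MAGIC_80 0x11223344u   /* placeholder magics, not DRBD's values */
    #define MAGIC_95 0x5566u

    static int parse_header(const void *buf, unsigned *cmd, unsigned *size)
    {
        const struct hdr80 *h80 = buf;
        const struct hdr95 *h95 = buf;

        if (ntohl(h80->magic) == MAGIC_80) {        /* try the old format first */
            *cmd  = ntohs(h80->command);
            *size = ntohs(h80->length);             /* at most 65535 bytes */
        } else if (ntohs(h95->magic) == MAGIC_95) { /* "big" packets */
            *cmd  = ntohs(h95->command);
            *size = ntohl(h95->length);
        } else {
            return -1;                              /* unknown magic */
        }
        return 0;
    }

    int main(void)
    {
        unsigned char buf[sizeof(struct hdr80)];
        struct hdr80 h = { htonl(MAGIC_80), htons(7), htons(4096) };
        unsigned cmd, size;

        memcpy(buf, &h, sizeof(h));
        if (parse_header(buf, &cmd, &size) == 0)
            printf("cmd=%u size=%u\n", cmd, size);  /* prints cmd=7 size=4096 */
        return 0;
    }
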
@@ -985,24 +965,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
985 } 965 }
986 put_ldev(mdev); 966 put_ldev(mdev);
987 } 967 }
988
989 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
990}
991
992static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
993{
994 struct flush_work *fw = (struct flush_work *)w;
995 struct drbd_epoch *epoch = fw->epoch;
996
997 kfree(w);
998
999 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
1000 drbd_flush_after_epoch(mdev, epoch);
1001
1002 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1003 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1004
1005 return 1;
1006} 968}
1007 969
1008/** 970/**
@@ -1015,15 +977,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1015 struct drbd_epoch *epoch, 977 struct drbd_epoch *epoch,
1016 enum epoch_event ev) 978 enum epoch_event ev)
1017{ 979{
1018 int finish, epoch_size; 980 int epoch_size;
1019 struct drbd_epoch *next_epoch; 981 struct drbd_epoch *next_epoch;
1020 int schedule_flush = 0;
1021 enum finish_epoch rv = FE_STILL_LIVE; 982 enum finish_epoch rv = FE_STILL_LIVE;
1022 983
1023 spin_lock(&mdev->epoch_lock); 984 spin_lock(&mdev->epoch_lock);
1024 do { 985 do {
1025 next_epoch = NULL; 986 next_epoch = NULL;
1026 finish = 0;
1027 987
1028 epoch_size = atomic_read(&epoch->epoch_size); 988 epoch_size = atomic_read(&epoch->epoch_size);
1029 989
@@ -1033,16 +993,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1033 break; 993 break;
1034 case EV_GOT_BARRIER_NR: 994 case EV_GOT_BARRIER_NR:
1035 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 995 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1036
1037 /* Special case: If we just switched from WO_bio_barrier to
1038 WO_bdev_flush we should not finish the current epoch */
1039 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1040 mdev->write_ordering != WO_bio_barrier &&
1041 epoch == mdev->current_epoch)
1042 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1043 break;
1044 case EV_BARRIER_DONE:
1045 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1046 break; 996 break;
1047 case EV_BECAME_LAST: 997 case EV_BECAME_LAST:
1048 /* nothing to do*/ 998 /* nothing to do*/
@@ -1051,23 +1001,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1051 1001
1052 if (epoch_size != 0 && 1002 if (epoch_size != 0 &&
1053 atomic_read(&epoch->active) == 0 && 1003 atomic_read(&epoch->active) == 0 &&
1054 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && 1004 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
1055 epoch->list.prev == &mdev->current_epoch->list &&
1056 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1057 /* Nearly all conditions are met to finish that epoch... */
1058 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1059 mdev->write_ordering == WO_none ||
1060 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1061 ev & EV_CLEANUP) {
1062 finish = 1;
1063 set_bit(DE_IS_FINISHING, &epoch->flags);
1064 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1065 mdev->write_ordering == WO_bio_barrier) {
1066 atomic_inc(&epoch->active);
1067 schedule_flush = 1;
1068 }
1069 }
1070 if (finish) {
1071 if (!(ev & EV_CLEANUP)) { 1005 if (!(ev & EV_CLEANUP)) {
1072 spin_unlock(&mdev->epoch_lock); 1006 spin_unlock(&mdev->epoch_lock);
1073 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); 1007 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1090,6 +1024,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1090 /* atomic_set(&epoch->active, 0); is already zero */ 1024 /* atomic_set(&epoch->active, 0); is already zero */
1091 if (rv == FE_STILL_LIVE) 1025 if (rv == FE_STILL_LIVE)
1092 rv = FE_RECYCLED; 1026 rv = FE_RECYCLED;
1027 wake_up(&mdev->ee_wait);
1093 } 1028 }
1094 } 1029 }
1095 1030
@@ -1101,22 +1036,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1101 1036
1102 spin_unlock(&mdev->epoch_lock); 1037 spin_unlock(&mdev->epoch_lock);
1103 1038
1104 if (schedule_flush) {
1105 struct flush_work *fw;
1106 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1107 if (fw) {
1108 fw->w.cb = w_flush;
1109 fw->epoch = epoch;
1110 drbd_queue_work(&mdev->data.work, &fw->w);
1111 } else {
1112 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1113 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1114 /* That is not a recursion, only one level */
1115 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1116 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1117 }
1118 }
1119
1120 return rv; 1039 return rv;
1121} 1040}
1122 1041
@@ -1132,19 +1051,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1132 [WO_none] = "none", 1051 [WO_none] = "none",
1133 [WO_drain_io] = "drain", 1052 [WO_drain_io] = "drain",
1134 [WO_bdev_flush] = "flush", 1053 [WO_bdev_flush] = "flush",
1135 [WO_bio_barrier] = "barrier",
1136 }; 1054 };
1137 1055
1138 pwo = mdev->write_ordering; 1056 pwo = mdev->write_ordering;
1139 wo = min(pwo, wo); 1057 wo = min(pwo, wo);
1140 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1141 wo = WO_bdev_flush;
1142 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) 1058 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1143 wo = WO_drain_io; 1059 wo = WO_drain_io;
1144 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) 1060 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1145 wo = WO_none; 1061 wo = WO_none;
1146 mdev->write_ordering = wo; 1062 mdev->write_ordering = wo;
1147 if (pwo != mdev->write_ordering || wo == WO_bio_barrier) 1063 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
1148 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); 1064 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1149} 1065}
1150 1066
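With WO_bio_barrier gone, drbd_bump_write_ordering() above only demotes along flush -> drain -> none: the requested method is capped at the current one and stepped down whenever the backing device lacks support. A minimal userspace sketch of that demotion chain, assuming only the enum ordering visible in this hunk; the *_supported parameters stand in for the dc.no_disk_* settings, and the sketch is illustrative, not the kernel function:

#include <stdio.h>

enum write_ordering_e { WO_none, WO_drain_io, WO_bdev_flush };

static enum write_ordering_e
bump_write_ordering(enum write_ordering_e current_wo, enum write_ordering_e requested,
		    int flush_supported, int drain_supported)
{
	/* never go up again: cap the request at the current method */
	enum write_ordering_e wo = requested < current_wo ? requested : current_wo;

	if (wo == WO_bdev_flush && !flush_supported)
		wo = WO_drain_io;
	if (wo == WO_drain_io && !drain_supported)
		wo = WO_none;
	return wo;
}

int main(void)
{
	static const char *names[] = { "none", "drain", "flush" };
	/* a device that cannot flush but can drain: a flush request degrades to drain */
	printf("%s\n", names[bump_write_ordering(WO_bdev_flush, WO_bdev_flush, 0, 1)]);
	return 0;
}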
@@ -1153,6 +1069,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1153 * @mdev: DRBD device. 1069 * @mdev: DRBD device.
1154 * @e: epoch entry 1070 * @e: epoch entry
1155 * @rw: flag field, see bio->bi_rw 1071 * @rw: flag field, see bio->bi_rw
1072 *
1073 * May spread the pages to multiple bios,
1074 * depending on bio_add_page restrictions.
1075 *
1076 * Returns 0 if all bios have been submitted,
1077 * -ENOMEM if we could not allocate enough bios,
1078 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1079 * single page to an empty bio (which should never happen and likely indicates
1080 * that the lower level IO stack is in some way broken). This has been observed
1081 * on certain Xen deployments.
1156 */ 1082 */
1157/* TODO allocate from our own bio_set. */ 1083/* TODO allocate from our own bio_set. */
1158int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, 1084int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
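The comment block above documents the contract of drbd_submit_ee(): the pages of one epoch entry may be spread over several bios when bio_add_page() refuses more, and the caller gets 0 on success, -ENOMEM when bios could not be allocated, or -ENOSPC when not even a single page fits an empty bio. A small userspace model of that splitting loop; MAX_PAGES_PER_BIO and struct toy_bio are invented stand-ins for the real bio_add_page() restrictions:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_PAGES_PER_BIO 4	/* invented stand-in for bio_add_page() limits */

struct toy_bio {
	int n_pages;
	struct toy_bio *next;
};

static int toy_bio_add_page(struct toy_bio *bio)
{
	if (bio->n_pages >= MAX_PAGES_PER_BIO)
		return 0;	/* refused; caller opens the next bio */
	bio->n_pages++;
	return 1;
}

/* Pack nr_pages into as many toy bios as needed: 0 on success, -ENOMEM if a
 * bio cannot be allocated, -ENOSPC if even an empty bio refuses a page. */
static int split_into_bios(int nr_pages, struct toy_bio **out)
{
	struct toy_bio *head = NULL, *bio = NULL;
	int err = -ENOMEM;

	while (nr_pages > 0) {
		if (!bio) {
			bio = calloc(1, sizeof(*bio));
			if (!bio)
				goto fail;
			bio->next = head;
			head = bio;
		}
		if (!toy_bio_add_page(bio)) {
			if (bio->n_pages == 0) {
				/* "a single page must always be possible" */
				err = -ENOSPC;
				goto fail;
			}
			bio = NULL;	/* open the next bio, like "goto next_bio" in the hunks below */
			continue;
		}
		nr_pages--;
	}
	*out = head;
	return 0;

fail:
	while (head) {
		struct toy_bio *next = head->next;
		free(head);
		head = next;
	}
	return err;
}

int main(void)
{
	struct toy_bio *bios = NULL, *next;

	printf("rc=%d\n", split_into_bios(10, &bios));	/* splits into 4+4+2 */
	for (; bios; bios = next) {
		next = bios->next;
		printf("toy bio holds %d pages\n", bios->n_pages);
		free(bios);
	}
	return 0;
}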
@@ -1165,6 +1091,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1165 unsigned ds = e->size; 1091 unsigned ds = e->size;
1166 unsigned n_bios = 0; 1092 unsigned n_bios = 0;
1167 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; 1093 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1094 int err = -ENOMEM;
1168 1095
1169 /* In most cases, we will only need one bio. But in case the lower 1096 /* In most cases, we will only need one bio. But in case the lower
1170 * level restrictions happen to be different at this offset on this 1097 * level restrictions happen to be different at this offset on this
@@ -1179,8 +1106,6 @@ next_bio:
1179 /* > e->sector, unless this is the first bio */ 1106 /* > e->sector, unless this is the first bio */
1180 bio->bi_sector = sector; 1107 bio->bi_sector = sector;
1181 bio->bi_bdev = mdev->ldev->backing_bdev; 1108 bio->bi_bdev = mdev->ldev->backing_bdev;
1182 /* we special case some flags in the multi-bio case, see below
1183 * (REQ_UNPLUG, REQ_HARDBARRIER) */
1184 bio->bi_rw = rw; 1109 bio->bi_rw = rw;
1185 bio->bi_private = e; 1110 bio->bi_private = e;
1186 bio->bi_end_io = drbd_endio_sec; 1111 bio->bi_end_io = drbd_endio_sec;
@@ -1192,8 +1117,17 @@ next_bio:
1192 page_chain_for_each(page) { 1117 page_chain_for_each(page) {
1193 unsigned len = min_t(unsigned, ds, PAGE_SIZE); 1118 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1194 if (!bio_add_page(bio, page, len, 0)) { 1119 if (!bio_add_page(bio, page, len, 0)) {
1195 /* a single page must always be possible! */ 1120 /* A single page must always be possible!
1196 BUG_ON(bio->bi_vcnt == 0); 1121 * But in case it fails anyways,
1122 * we deal with it, and complain (below). */
1123 if (bio->bi_vcnt == 0) {
1124 dev_err(DEV,
1125 "bio_add_page failed for len=%u, "
1126 "bi_vcnt=0 (bi_sector=%llu)\n",
1127 len, (unsigned long long)bio->bi_sector);
1128 err = -ENOSPC;
1129 goto fail;
1130 }
1197 goto next_bio; 1131 goto next_bio;
1198 } 1132 }
1199 ds -= len; 1133 ds -= len;
@@ -1209,18 +1143,8 @@ next_bio:
1209 bios = bios->bi_next; 1143 bios = bios->bi_next;
1210 bio->bi_next = NULL; 1144 bio->bi_next = NULL;
1211 1145
1212 /* strip off REQ_UNPLUG unless it is the last bio */
1213 if (bios)
1214 bio->bi_rw &= ~REQ_UNPLUG;
1215
1216 drbd_generic_make_request(mdev, fault_type, bio); 1146 drbd_generic_make_request(mdev, fault_type, bio);
1217
1218 /* strip off REQ_HARDBARRIER,
1219 * unless it is the first or last bio */
1220 if (bios && bios->bi_next)
1221 bios->bi_rw &= ~REQ_HARDBARRIER;
1222 } while (bios); 1147 } while (bios);
1223 maybe_kick_lo(mdev);
1224 return 0; 1148 return 0;
1225 1149
1226fail: 1150fail:
@@ -1229,61 +1153,17 @@ fail:
1229 bios = bios->bi_next; 1153 bios = bios->bi_next;
1230 bio_put(bio); 1154 bio_put(bio);
1231 } 1155 }
1232 return -ENOMEM; 1156 return err;
1233}
1234
1235/**
1236 * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1237 * @mdev: DRBD device.
1238 * @w: work object.
1239 * @cancel: The connection will be closed anyways (unused in this callback)
1240 */
1241int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1242{
1243 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1244 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1245 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1246 so that we can finish that epoch in drbd_may_finish_epoch().
1247 That is necessary if we already have a long chain of Epochs, before
1248 we realize that REQ_HARDBARRIER is actually not supported */
1249
1250 /* As long as the -ENOTSUPP on the barrier is reported immediately
1251 that will never trigger. If it is reported late, we will just
1252 print that warning and continue correctly for all future requests
1253 with WO_bdev_flush */
1254 if (previous_epoch(mdev, e->epoch))
1255 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1256
1257 /* we still have a local reference,
1258 * get_ldev was done in receive_Data. */
1259
1260 e->w.cb = e_end_block;
1261 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1262 /* drbd_submit_ee fails for one reason only:
1263 * if was not able to allocate sufficient bios.
1264 * requeue, try again later. */
1265 e->w.cb = w_e_reissue;
1266 drbd_queue_work(&mdev->data.work, &e->w);
1267 }
1268 return 1;
1269} 1157}
1270 1158
1271static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) 1159static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1272{ 1160{
1273 int rv, issue_flush; 1161 int rv;
1274 struct p_barrier *p = (struct p_barrier *)h; 1162 struct p_barrier *p = &mdev->data.rbuf.barrier;
1275 struct drbd_epoch *epoch; 1163 struct drbd_epoch *epoch;
1276 1164
1277 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1278
1279 rv = drbd_recv(mdev, h->payload, h->length);
1280 ERR_IF(rv != h->length) return FALSE;
1281
1282 inc_unacked(mdev); 1165 inc_unacked(mdev);
1283 1166
1284 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1285 drbd_kick_lo(mdev);
1286
1287 mdev->current_epoch->barrier_nr = p->barrier; 1167 mdev->current_epoch->barrier_nr = p->barrier;
1288 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); 1168 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1289 1169
@@ -1293,44 +1173,40 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1293 * Therefore we must send the barrier_ack after the barrier request was 1173 * Therefore we must send the barrier_ack after the barrier request was
1294 * completed. */ 1174 * completed. */
1295 switch (mdev->write_ordering) { 1175 switch (mdev->write_ordering) {
1296 case WO_bio_barrier:
1297 case WO_none: 1176 case WO_none:
1298 if (rv == FE_RECYCLED) 1177 if (rv == FE_RECYCLED)
1299 return TRUE; 1178 return true;
1300 break; 1179
1180 /* receiver context, in the writeout path of the other node.
1181 * avoid potential distributed deadlock */
1182 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1183 if (epoch)
1184 break;
1185 else
1186 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1187 /* Fall through */
1301 1188
1302 case WO_bdev_flush: 1189 case WO_bdev_flush:
1303 case WO_drain_io: 1190 case WO_drain_io:
1304 if (rv == FE_STILL_LIVE) {
1305 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1306 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1307 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1308 }
1309 if (rv == FE_RECYCLED)
1310 return TRUE;
1311
1312 /* The asender will send all the ACKs and barrier ACKs out, since
1313 all EEs moved from the active_ee to the done_ee. We need to
1314 provide a new epoch object for the EEs that come in soon */
1315 break;
1316 }
1317
1318 /* receiver context, in the writeout path of the other node.
1319 * avoid potential distributed deadlock */
1320 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1321 if (!epoch) {
1322 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1323 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1324 drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 1191 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1325 if (issue_flush) { 1192 drbd_flush(mdev);
1326 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); 1193
1327 if (rv == FE_RECYCLED) 1194 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1328 return TRUE; 1195 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1196 if (epoch)
1197 break;
1329 } 1198 }
1330 1199
1331 drbd_wait_ee_list_empty(mdev, &mdev->done_ee); 1200 epoch = mdev->current_epoch;
1201 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1202
1203 D_ASSERT(atomic_read(&epoch->active) == 0);
1204 D_ASSERT(epoch->flags == 0);
1332 1205
1333 return TRUE; 1206 return true;
1207 default:
1208 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1209 return false;
1334 } 1210 }
1335 1211
1336 epoch->flags = 0; 1212 epoch->flags = 0;
@@ -1348,7 +1224,7 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1348 } 1224 }
1349 spin_unlock(&mdev->epoch_lock); 1225 spin_unlock(&mdev->epoch_lock);
1350 1226
1351 return TRUE; 1227 return true;
1352} 1228}
1353 1229
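After draining and flushing, the rewritten receive_Barrier() above needs a fresh epoch object only while the current epoch still holds entries; if the allocation fails, or the epoch is already empty, it waits for the current epoch to drain and recycles it. A compact userspace model of that decision; kmalloc and the ee_wait wait_event are replaced by stand-ins and the sketch ignores locking entirely:

#include <stdio.h>
#include <stdlib.h>

struct toy_epoch {
	int epoch_size;		/* entries still attached to this epoch */
};

/* returns the epoch new writes should go into next; allocation failure simply
 * falls back to draining and reusing the current epoch, as in the hunk above */
static struct toy_epoch *next_epoch_after_barrier(struct toy_epoch *current)
{
	if (current->epoch_size) {
		struct toy_epoch *fresh = calloc(1, sizeof(*fresh));
		if (fresh)
			return fresh;
		/* "Allocation of an epoch failed, slowing down" */
	}
	/* wait_event(ee_wait, epoch_size == 0) in the kernel; modeled as done */
	current->epoch_size = 0;
	return current;
}

int main(void)
{
	struct toy_epoch cur = { .epoch_size = 3 };
	struct toy_epoch *next = next_epoch_after_barrier(&cur);

	printf("%s epoch\n", next == &cur ? "recycled current" : "allocated new");
	if (next != &cur)
		free(next);
	return 0;
}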
1354/* used from receive_RSDataReply (recv_resync_read) 1230/* used from receive_RSDataReply (recv_resync_read)
@@ -1370,21 +1246,25 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1370 if (dgs) { 1246 if (dgs) {
1371 rr = drbd_recv(mdev, dig_in, dgs); 1247 rr = drbd_recv(mdev, dig_in, dgs);
1372 if (rr != dgs) { 1248 if (rr != dgs) {
1373 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", 1249 if (!signal_pending(current))
1374 rr, dgs); 1250 dev_warn(DEV,
1251 "short read receiving data digest: read %d expected %d\n",
1252 rr, dgs);
1375 return NULL; 1253 return NULL;
1376 } 1254 }
1377 } 1255 }
1378 1256
1379 data_size -= dgs; 1257 data_size -= dgs;
1380 1258
1259 ERR_IF(data_size == 0) return NULL;
1381 ERR_IF(data_size & 0x1ff) return NULL; 1260 ERR_IF(data_size & 0x1ff) return NULL;
1382 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; 1261 ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
1383 1262
1384 /* even though we trust our peer, 1263 /* even though we trust our peer,
1385 * we sometimes have to double check. */ 1264 * we sometimes have to double check. */
1386 if (sector + (data_size>>9) > capacity) { 1265 if (sector + (data_size>>9) > capacity) {
1387 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n", 1266 dev_err(DEV, "request from peer beyond end of local disk: "
1267 "capacity: %llus < sector: %llus + size: %u\n",
1388 (unsigned long long)capacity, 1268 (unsigned long long)capacity,
1389 (unsigned long long)sector, data_size); 1269 (unsigned long long)sector, data_size);
1390 return NULL; 1270 return NULL;
@@ -1403,15 +1283,16 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1403 unsigned len = min_t(int, ds, PAGE_SIZE); 1283 unsigned len = min_t(int, ds, PAGE_SIZE);
1404 data = kmap(page); 1284 data = kmap(page);
1405 rr = drbd_recv(mdev, data, len); 1285 rr = drbd_recv(mdev, data, len);
1406 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { 1286 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
1407 dev_err(DEV, "Fault injection: Corrupting data on receive\n"); 1287 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1408 data[0] = data[0] ^ (unsigned long)-1; 1288 data[0] = data[0] ^ (unsigned long)-1;
1409 } 1289 }
1410 kunmap(page); 1290 kunmap(page);
1411 if (rr != len) { 1291 if (rr != len) {
1412 drbd_free_ee(mdev, e); 1292 drbd_free_ee(mdev, e);
1413 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1293 if (!signal_pending(current))
1414 rr, len); 1294 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1295 rr, len);
1415 return NULL; 1296 return NULL;
1416 } 1297 }
1417 ds -= rr; 1298 ds -= rr;
@@ -1420,7 +1301,8 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1420 if (dgs) { 1301 if (dgs) {
1421 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); 1302 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1422 if (memcmp(dig_in, dig_vv, dgs)) { 1303 if (memcmp(dig_in, dig_vv, dgs)) {
1423 dev_err(DEV, "Digest integrity check FAILED.\n"); 1304 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1305 (unsigned long long)sector, data_size);
1424 drbd_bcast_ee(mdev, "digest failed", 1306 drbd_bcast_ee(mdev, "digest failed",
1425 dgs, dig_in, dig_vv, e); 1307 dgs, dig_in, dig_vv, e);
1426 drbd_free_ee(mdev, e); 1308 drbd_free_ee(mdev, e);
@@ -1441,7 +1323,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1441 void *data; 1323 void *data;
1442 1324
1443 if (!data_size) 1325 if (!data_size)
1444 return TRUE; 1326 return true;
1445 1327
1446 page = drbd_pp_alloc(mdev, 1, 1); 1328 page = drbd_pp_alloc(mdev, 1, 1);
1447 1329
@@ -1450,14 +1332,16 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1450 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); 1332 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1451 if (rr != min_t(int, data_size, PAGE_SIZE)) { 1333 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1452 rv = 0; 1334 rv = 0;
1453 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1335 if (!signal_pending(current))
1454 rr, min_t(int, data_size, PAGE_SIZE)); 1336 dev_warn(DEV,
1337 "short read receiving data: read %d expected %d\n",
1338 rr, min_t(int, data_size, PAGE_SIZE));
1455 break; 1339 break;
1456 } 1340 }
1457 data_size -= rr; 1341 data_size -= rr;
1458 } 1342 }
1459 kunmap(page); 1343 kunmap(page);
1460 drbd_pp_free(mdev, page); 1344 drbd_pp_free(mdev, page, 0);
1461 return rv; 1345 return rv;
1462} 1346}
1463 1347
@@ -1476,8 +1360,10 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1476 if (dgs) { 1360 if (dgs) {
1477 rr = drbd_recv(mdev, dig_in, dgs); 1361 rr = drbd_recv(mdev, dig_in, dgs);
1478 if (rr != dgs) { 1362 if (rr != dgs) {
1479 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", 1363 if (!signal_pending(current))
1480 rr, dgs); 1364 dev_warn(DEV,
1365 "short read receiving data reply digest: read %d expected %d\n",
1366 rr, dgs);
1481 return 0; 1367 return 0;
1482 } 1368 }
1483 } 1369 }
@@ -1498,9 +1384,10 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1498 expect); 1384 expect);
1499 kunmap(bvec->bv_page); 1385 kunmap(bvec->bv_page);
1500 if (rr != expect) { 1386 if (rr != expect) {
1501 dev_warn(DEV, "short read receiving data reply: " 1387 if (!signal_pending(current))
1502 "read %d expected %d\n", 1388 dev_warn(DEV, "short read receiving data reply: "
1503 rr, expect); 1389 "read %d expected %d\n",
1390 rr, expect);
1504 return 0; 1391 return 0;
1505 } 1392 }
1506 data_size -= rr; 1393 data_size -= rr;
@@ -1526,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
1526 sector_t sector = e->sector; 1413 sector_t sector = e->sector;
1527 int ok; 1414 int ok;
1528 1415
1529 D_ASSERT(hlist_unhashed(&e->colision)); 1416 D_ASSERT(hlist_unhashed(&e->collision));
1530 1417
1531 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1418 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1532 drbd_set_in_sync(mdev, sector, e->size); 1419 drbd_set_in_sync(mdev, sector, e->size);
@@ -1562,30 +1449,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1562 list_add(&e->w.list, &mdev->sync_ee); 1449 list_add(&e->w.list, &mdev->sync_ee);
1563 spin_unlock_irq(&mdev->req_lock); 1450 spin_unlock_irq(&mdev->req_lock);
1564 1451
1452 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1565 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) 1453 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1566 return TRUE; 1454 return true;
1455
1456 /* don't care for the reason here */
1457 dev_err(DEV, "submit failed, triggering re-connect\n");
1458 spin_lock_irq(&mdev->req_lock);
1459 list_del(&e->w.list);
1460 spin_unlock_irq(&mdev->req_lock);
1567 1461
1568 drbd_free_ee(mdev, e); 1462 drbd_free_ee(mdev, e);
1569fail: 1463fail:
1570 put_ldev(mdev); 1464 put_ldev(mdev);
1571 return FALSE; 1465 return false;
1572} 1466}
1573 1467
1574static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) 1468static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1575{ 1469{
1576 struct drbd_request *req; 1470 struct drbd_request *req;
1577 sector_t sector; 1471 sector_t sector;
1578 unsigned int header_size, data_size;
1579 int ok; 1472 int ok;
1580 struct p_data *p = (struct p_data *)h; 1473 struct p_data *p = &mdev->data.rbuf.data;
1581
1582 header_size = sizeof(*p) - sizeof(*h);
1583 data_size = h->length - header_size;
1584
1585 ERR_IF(data_size == 0) return FALSE;
1586
1587 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1588 return FALSE;
1589 1474
1590 sector = be64_to_cpu(p->sector); 1475 sector = be64_to_cpu(p->sector);
1591 1476
@@ -1594,10 +1479,10 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1594 spin_unlock_irq(&mdev->req_lock); 1479 spin_unlock_irq(&mdev->req_lock);
1595 if (unlikely(!req)) { 1480 if (unlikely(!req)) {
1596 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); 1481 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1597 return FALSE; 1482 return false;
1598 } 1483 }
1599 1484
1600 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid 1485 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
1601 * special casing it there for the various failure cases. 1486 * special casing it there for the various failure cases.
1602 * still no race with drbd_fail_pending_reads */ 1487 * still no race with drbd_fail_pending_reads */
1603 ok = recv_dless_read(mdev, req, sector, data_size); 1488 ok = recv_dless_read(mdev, req, sector, data_size);
@@ -1611,20 +1496,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1611 return ok; 1496 return ok;
1612} 1497}
1613 1498
1614static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) 1499static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1615{ 1500{
1616 sector_t sector; 1501 sector_t sector;
1617 unsigned int header_size, data_size;
1618 int ok; 1502 int ok;
1619 struct p_data *p = (struct p_data *)h; 1503 struct p_data *p = &mdev->data.rbuf.data;
1620
1621 header_size = sizeof(*p) - sizeof(*h);
1622 data_size = h->length - header_size;
1623
1624 ERR_IF(data_size == 0) return FALSE;
1625
1626 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1627 return FALSE;
1628 1504
1629 sector = be64_to_cpu(p->sector); 1505 sector = be64_to_cpu(p->sector);
1630 D_ASSERT(p->block_id == ID_SYNCER); 1506 D_ASSERT(p->block_id == ID_SYNCER);
@@ -1640,9 +1516,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1640 1516
1641 ok = drbd_drain_block(mdev, data_size); 1517 ok = drbd_drain_block(mdev, data_size);
1642 1518
1643 drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1519 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
1644 } 1520 }
1645 1521
1522 atomic_add(data_size >> 9, &mdev->rs_sect_in);
1523
1646 return ok; 1524 return ok;
1647} 1525}
1648 1526
@@ -1653,15 +1531,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1653{ 1531{
1654 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1532 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1655 sector_t sector = e->sector; 1533 sector_t sector = e->sector;
1656 struct drbd_epoch *epoch;
1657 int ok = 1, pcmd; 1534 int ok = 1, pcmd;
1658 1535
1659 if (e->flags & EE_IS_BARRIER) {
1660 epoch = previous_epoch(mdev, e->epoch);
1661 if (epoch)
1662 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1663 }
1664
1665 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1536 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1666 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1537 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1667 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1538 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1682,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1682 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 1553 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1683 if (mdev->net_conf->two_primaries) { 1554 if (mdev->net_conf->two_primaries) {
1684 spin_lock_irq(&mdev->req_lock); 1555 spin_lock_irq(&mdev->req_lock);
1685 D_ASSERT(!hlist_unhashed(&e->colision)); 1556 D_ASSERT(!hlist_unhashed(&e->collision));
1686 hlist_del_init(&e->colision); 1557 hlist_del_init(&e->collision);
1687 spin_unlock_irq(&mdev->req_lock); 1558 spin_unlock_irq(&mdev->req_lock);
1688 } else { 1559 } else {
1689 D_ASSERT(hlist_unhashed(&e->colision)); 1560 D_ASSERT(hlist_unhashed(&e->collision));
1690 } 1561 }
1691 1562
1692 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 1563 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
@@ -1703,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
1703 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); 1574 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1704 1575
1705 spin_lock_irq(&mdev->req_lock); 1576 spin_lock_irq(&mdev->req_lock);
1706 D_ASSERT(!hlist_unhashed(&e->colision)); 1577 D_ASSERT(!hlist_unhashed(&e->collision));
1707 hlist_del_init(&e->colision); 1578 hlist_del_init(&e->collision);
1708 spin_unlock_irq(&mdev->req_lock); 1579 spin_unlock_irq(&mdev->req_lock);
1709 1580
1710 dec_unacked(mdev); 1581 dec_unacked(mdev);
@@ -1765,34 +1636,33 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1765 return ret; 1636 return ret;
1766} 1637}
1767 1638
1639/* see also bio_flags_to_wire()
1640 * DRBD_REQ_*, because we need to semantically map the flags to data packet
1641 * flags and back. We may replicate to other kernel versions. */
1642static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
1643{
1644 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1645 (dpf & DP_FUA ? REQ_FUA : 0) |
1646 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
1647 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
1648}
1649
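wire_flags_to_bio() above decodes the DP_* packet flags the peer sent into local request flags; the sender encodes them with a matching bio_flags_to_wire() in drbd_main.c. A self-contained round-trip demo of that idea; the bit values and the inverse function below are made up for illustration and need not match the kernel's definitions:

#include <stdio.h>

/* illustrative bit values only, not the kernel's definitions */
#define DP_RW_SYNC  (1u << 0)
#define DP_FUA      (1u << 1)
#define DP_FLUSH    (1u << 2)
#define DP_DISCARD  (1u << 3)

#define REQ_SYNC    (1u << 4)
#define REQ_FUA     (1u << 5)
#define REQ_FLUSH   (1u << 6)
#define REQ_DISCARD (1u << 7)

static unsigned long wire_flags_to_bio(unsigned int dpf)
{
	return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
	       (dpf & DP_FUA     ? REQ_FUA : 0) |
	       (dpf & DP_FLUSH   ? REQ_FLUSH : 0) |
	       (dpf & DP_DISCARD ? REQ_DISCARD : 0);
}

/* plausible inverse for the demo; the real bio_flags_to_wire() may differ */
static unsigned int bio_flags_to_wire(unsigned long rw)
{
	return (rw & REQ_SYNC    ? DP_RW_SYNC : 0) |
	       (rw & REQ_FUA     ? DP_FUA : 0) |
	       (rw & REQ_FLUSH   ? DP_FLUSH : 0) |
	       (rw & REQ_DISCARD ? DP_DISCARD : 0);
}

int main(void)
{
	unsigned long rw = REQ_SYNC | REQ_FUA;

	/* sender encodes, receiver decodes, the write semantics survive the wire */
	printf("round trip ok: %d\n", wire_flags_to_bio(bio_flags_to_wire(rw)) == rw);
	return 0;
}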
1768/* mirrored write */ 1650/* mirrored write */
1769static int receive_Data(struct drbd_conf *mdev, struct p_header *h) 1651static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1770{ 1652{
1771 sector_t sector; 1653 sector_t sector;
1772 struct drbd_epoch_entry *e; 1654 struct drbd_epoch_entry *e;
1773 struct p_data *p = (struct p_data *)h; 1655 struct p_data *p = &mdev->data.rbuf.data;
1774 int header_size, data_size;
1775 int rw = WRITE; 1656 int rw = WRITE;
1776 u32 dp_flags; 1657 u32 dp_flags;
1777 1658
1778 header_size = sizeof(*p) - sizeof(*h);
1779 data_size = h->length - header_size;
1780
1781 ERR_IF(data_size == 0) return FALSE;
1782
1783 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1784 return FALSE;
1785
1786 if (!get_ldev(mdev)) { 1659 if (!get_ldev(mdev)) {
1787 if (__ratelimit(&drbd_ratelimit_state))
1788 dev_err(DEV, "Can not write mirrored data block "
1789 "to local disk.\n");
1790 spin_lock(&mdev->peer_seq_lock); 1660 spin_lock(&mdev->peer_seq_lock);
1791 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) 1661 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1792 mdev->peer_seq++; 1662 mdev->peer_seq++;
1793 spin_unlock(&mdev->peer_seq_lock); 1663 spin_unlock(&mdev->peer_seq_lock);
1794 1664
1795 drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1665 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
1796 atomic_inc(&mdev->current_epoch->epoch_size); 1666 atomic_inc(&mdev->current_epoch->epoch_size);
1797 return drbd_drain_block(mdev, data_size); 1667 return drbd_drain_block(mdev, data_size);
1798 } 1668 }
@@ -1806,48 +1676,23 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1806 e = read_in_block(mdev, p->block_id, sector, data_size); 1676 e = read_in_block(mdev, p->block_id, sector, data_size);
1807 if (!e) { 1677 if (!e) {
1808 put_ldev(mdev); 1678 put_ldev(mdev);
1809 return FALSE; 1679 return false;
1810 } 1680 }
1811 1681
1812 e->w.cb = e_end_block; 1682 e->w.cb = e_end_block;
1813 1683
1684 dp_flags = be32_to_cpu(p->dp_flags);
1685 rw |= wire_flags_to_bio(mdev, dp_flags);
1686
1687 if (dp_flags & DP_MAY_SET_IN_SYNC)
1688 e->flags |= EE_MAY_SET_IN_SYNC;
1689
1814 spin_lock(&mdev->epoch_lock); 1690 spin_lock(&mdev->epoch_lock);
1815 e->epoch = mdev->current_epoch; 1691 e->epoch = mdev->current_epoch;
1816 atomic_inc(&e->epoch->epoch_size); 1692 atomic_inc(&e->epoch->epoch_size);
1817 atomic_inc(&e->epoch->active); 1693 atomic_inc(&e->epoch->active);
1818
1819 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1820 struct drbd_epoch *epoch;
1821 /* Issue a barrier if we start a new epoch, and the previous epoch
1822 was not an epoch containing a single request which already was
1823 a Barrier. */
1824 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1825 if (epoch == e->epoch) {
1826 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1827 rw |= REQ_HARDBARRIER;
1828 e->flags |= EE_IS_BARRIER;
1829 } else {
1830 if (atomic_read(&epoch->epoch_size) > 1 ||
1831 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1832 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1833 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1834 rw |= REQ_HARDBARRIER;
1835 e->flags |= EE_IS_BARRIER;
1836 }
1837 }
1838 }
1839 spin_unlock(&mdev->epoch_lock); 1694 spin_unlock(&mdev->epoch_lock);
1840 1695
1841 dp_flags = be32_to_cpu(p->dp_flags);
1842 if (dp_flags & DP_HARDBARRIER) {
1843 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1844 /* rw |= REQ_HARDBARRIER; */
1845 }
1846 if (dp_flags & DP_RW_SYNC)
1847 rw |= REQ_SYNC | REQ_UNPLUG;
1848 if (dp_flags & DP_MAY_SET_IN_SYNC)
1849 e->flags |= EE_MAY_SET_IN_SYNC;
1850
1851 /* I'm the receiver, I do hold a net_cnt reference. */ 1696 /* I'm the receiver, I do hold a net_cnt reference. */
1852 if (!mdev->net_conf->two_primaries) { 1697 if (!mdev->net_conf->two_primaries) {
1853 spin_lock_irq(&mdev->req_lock); 1698 spin_lock_irq(&mdev->req_lock);
@@ -1905,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1905 1750
1906 spin_lock_irq(&mdev->req_lock); 1751 spin_lock_irq(&mdev->req_lock);
1907 1752
1908 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); 1753 hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
1909 1754
1910#define OVERLAPS overlaps(i->sector, i->size, sector, size) 1755#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1911 slot = tl_hash_slot(mdev, sector); 1756 slot = tl_hash_slot(mdev, sector);
@@ -1915,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1915 int have_conflict = 0; 1760 int have_conflict = 0;
1916 prepare_to_wait(&mdev->misc_wait, &wait, 1761 prepare_to_wait(&mdev->misc_wait, &wait,
1917 TASK_INTERRUPTIBLE); 1762 TASK_INTERRUPTIBLE);
1918 hlist_for_each_entry(i, n, slot, colision) { 1763 hlist_for_each_entry(i, n, slot, collision) {
1919 if (OVERLAPS) { 1764 if (OVERLAPS) {
1920 /* only ALERT on first iteration, 1765 /* only ALERT on first iteration,
1921 * we may be woken up early... */ 1766 * we may be woken up early... */
@@ -1950,11 +1795,11 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1950 put_ldev(mdev); 1795 put_ldev(mdev);
1951 wake_asender(mdev); 1796 wake_asender(mdev);
1952 finish_wait(&mdev->misc_wait, &wait); 1797 finish_wait(&mdev->misc_wait, &wait);
1953 return TRUE; 1798 return true;
1954 } 1799 }
1955 1800
1956 if (signal_pending(current)) { 1801 if (signal_pending(current)) {
1957 hlist_del_init(&e->colision); 1802 hlist_del_init(&e->collision);
1958 1803
1959 spin_unlock_irq(&mdev->req_lock); 1804 spin_unlock_irq(&mdev->req_lock);
1960 1805
@@ -1997,61 +1842,150 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1997 break; 1842 break;
1998 } 1843 }
1999 1844
2000 if (mdev->state.pdsk == D_DISKLESS) { 1845 if (mdev->state.pdsk < D_INCONSISTENT) {
2001 /* In case we have the only disk of the cluster, */ 1846 /* In case we have the only disk of the cluster, */
2002 drbd_set_out_of_sync(mdev, e->sector, e->size); 1847 drbd_set_out_of_sync(mdev, e->sector, e->size);
2003 e->flags |= EE_CALL_AL_COMPLETE_IO; 1848 e->flags |= EE_CALL_AL_COMPLETE_IO;
1849 e->flags &= ~EE_MAY_SET_IN_SYNC;
2004 drbd_al_begin_io(mdev, e->sector); 1850 drbd_al_begin_io(mdev, e->sector);
2005 } 1851 }
2006 1852
2007 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) 1853 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
2008 return TRUE; 1854 return true;
1855
1856 /* don't care for the reason here */
1857 dev_err(DEV, "submit failed, triggering re-connect\n");
1858 spin_lock_irq(&mdev->req_lock);
1859 list_del(&e->w.list);
1860 hlist_del_init(&e->collision);
1861 spin_unlock_irq(&mdev->req_lock);
1862 if (e->flags & EE_CALL_AL_COMPLETE_IO)
1863 drbd_al_complete_io(mdev, e->sector);
2009 1864
2010out_interrupted: 1865out_interrupted:
2011 /* yes, the epoch_size now is imbalanced. 1866 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
2012 * but we drop the connection anyways, so we don't have a chance to
2013 * receive a barrier... atomic_inc(&mdev->epoch_size); */
2014 put_ldev(mdev); 1867 put_ldev(mdev);
2015 drbd_free_ee(mdev, e); 1868 drbd_free_ee(mdev, e);
2016 return FALSE; 1869 return false;
2017} 1870}
2018 1871
2019static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) 1872/* We may throttle resync, if the lower device seems to be busy,
1873 * and current sync rate is above c_min_rate.
1874 *
1875 * To decide whether or not the lower device is busy, we use a scheme similar
 1876 * to MD RAID is_mddev_idle(): if the partition stats reveal a "significant"
 1877 * amount (more than 64 sectors) of activity we cannot account for with our own resync
1878 * activity, it obviously is "busy".
1879 *
1880 * The current sync rate used here uses only the most recent two step marks,
1881 * to have a short time average so we can react faster.
1882 */
1883int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
1884{
1885 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
1886 unsigned long db, dt, dbdt;
1887 struct lc_element *tmp;
1888 int curr_events;
1889 int throttle = 0;
1890
1891 /* feature disabled? */
1892 if (mdev->sync_conf.c_min_rate == 0)
1893 return 0;
1894
1895 spin_lock_irq(&mdev->al_lock);
1896 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
1897 if (tmp) {
1898 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
1899 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
1900 spin_unlock_irq(&mdev->al_lock);
1901 return 0;
1902 }
1903 /* Do not slow down if app IO is already waiting for this extent */
1904 }
1905 spin_unlock_irq(&mdev->al_lock);
1906
1907 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
1908 (int)part_stat_read(&disk->part0, sectors[1]) -
1909 atomic_read(&mdev->rs_sect_ev);
1910
1911 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
1912 unsigned long rs_left;
1913 int i;
1914
1915 mdev->rs_last_events = curr_events;
1916
1917 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
1918 * approx. */
1919 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
1920
1921 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
1922 rs_left = mdev->ov_left;
1923 else
1924 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
1925
1926 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
1927 if (!dt)
1928 dt++;
1929 db = mdev->rs_mark_left[i] - rs_left;
1930 dbdt = Bit2KB(db/dt);
1931
1932 if (dbdt > mdev->sync_conf.c_min_rate)
1933 throttle = 1;
1934 }
1935 return throttle;
1936}
1937
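drbd_rs_should_slow_down() above only throttles when the recent resync rate, averaged over roughly the last two sync marks, is already above c_min_rate. A standalone sketch of just that rate arithmetic, assuming the usual 4 KiB of data per bitmap bit for Bit2KB(); the numbers in main() are invented for the example:

#include <stdio.h>

/* assuming the usual DRBD bitmap granularity of 4 KiB per bit */
static unsigned long Bit2KB(unsigned long bits)
{
	return bits << 2;
}

/* dt: seconds covered by the last two sync marks, db: bitmap bits cleared in
 * that time; non-zero when the resulting KiB/s rate exceeds c_min_rate */
static int should_throttle(unsigned long db, unsigned long dt,
			   unsigned long c_min_rate_kb)
{
	unsigned long dbdt;

	if (c_min_rate_kb == 0)		/* feature disabled */
		return 0;
	if (!dt)
		dt = 1;
	dbdt = Bit2KB(db / dt);
	return dbdt > c_min_rate_kb;
}

int main(void)
{
	/* 60000 bits resynced over 6 s is 40000 KiB/s, above a 250 KiB/s floor */
	printf("throttle=%d\n", should_throttle(60000, 6, 250));
	return 0;
}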
1938
1939static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
2020{ 1940{
2021 sector_t sector; 1941 sector_t sector;
2022 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 1942 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
2023 struct drbd_epoch_entry *e; 1943 struct drbd_epoch_entry *e;
2024 struct digest_info *di = NULL; 1944 struct digest_info *di = NULL;
2025 int size, digest_size; 1945 int size, verb;
2026 unsigned int fault_type; 1946 unsigned int fault_type;
2027 struct p_block_req *p = 1947 struct p_block_req *p = &mdev->data.rbuf.block_req;
2028 (struct p_block_req *)h;
2029 const int brps = sizeof(*p)-sizeof(*h);
2030
2031 if (drbd_recv(mdev, h->payload, brps) != brps)
2032 return FALSE;
2033 1948
2034 sector = be64_to_cpu(p->sector); 1949 sector = be64_to_cpu(p->sector);
2035 size = be32_to_cpu(p->blksize); 1950 size = be32_to_cpu(p->blksize);
2036 1951
2037 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1952 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
2038 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1953 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2039 (unsigned long long)sector, size); 1954 (unsigned long long)sector, size);
2040 return FALSE; 1955 return false;
2041 } 1956 }
2042 if (sector + (size>>9) > capacity) { 1957 if (sector + (size>>9) > capacity) {
2043 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1958 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2044 (unsigned long long)sector, size); 1959 (unsigned long long)sector, size);
2045 return FALSE; 1960 return false;
2046 } 1961 }
2047 1962
2048 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { 1963 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
2049 if (__ratelimit(&drbd_ratelimit_state)) 1964 verb = 1;
1965 switch (cmd) {
1966 case P_DATA_REQUEST:
1967 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
1968 break;
1969 case P_RS_DATA_REQUEST:
1970 case P_CSUM_RS_REQUEST:
1971 case P_OV_REQUEST:
1972 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
1973 break;
1974 case P_OV_REPLY:
1975 verb = 0;
1976 dec_rs_pending(mdev);
1977 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
1978 break;
1979 default:
1980 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
1981 cmdname(cmd));
1982 }
1983 if (verb && __ratelimit(&drbd_ratelimit_state))
2050 dev_err(DEV, "Can not satisfy peer's read request, " 1984 dev_err(DEV, "Can not satisfy peer's read request, "
2051 "no local data.\n"); 1985 "no local data.\n");
2052 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : 1986
2053 P_NEG_RS_DREPLY , p); 1987 /* drain possibly payload */
2054 return drbd_drain_block(mdev, h->length - brps); 1988 return drbd_drain_block(mdev, digest_size);
2055 } 1989 }
2056 1990
2057 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1991 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2060,34 +1994,26 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2060 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); 1994 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2061 if (!e) { 1995 if (!e) {
2062 put_ldev(mdev); 1996 put_ldev(mdev);
2063 return FALSE; 1997 return false;
2064 } 1998 }
2065 1999
2066 switch (h->command) { 2000 switch (cmd) {
2067 case P_DATA_REQUEST: 2001 case P_DATA_REQUEST:
2068 e->w.cb = w_e_end_data_req; 2002 e->w.cb = w_e_end_data_req;
2069 fault_type = DRBD_FAULT_DT_RD; 2003 fault_type = DRBD_FAULT_DT_RD;
2070 break; 2004 /* application IO, don't drbd_rs_begin_io */
2005 goto submit;
2006
2071 case P_RS_DATA_REQUEST: 2007 case P_RS_DATA_REQUEST:
2072 e->w.cb = w_e_end_rsdata_req; 2008 e->w.cb = w_e_end_rsdata_req;
2073 fault_type = DRBD_FAULT_RS_RD; 2009 fault_type = DRBD_FAULT_RS_RD;
2074 /* Eventually this should become asynchronously. Currently it 2010 /* used in the sector offset progress display */
2075 * blocks the whole receiver just to delay the reading of a 2011 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
2076 * resync data block.
2077 * the drbd_work_queue mechanism is made for this...
2078 */
2079 if (!drbd_rs_begin_io(mdev, sector)) {
2080 /* we have been interrupted,
2081 * probably connection lost! */
2082 D_ASSERT(signal_pending(current));
2083 goto out_free_e;
2084 }
2085 break; 2012 break;
2086 2013
2087 case P_OV_REPLY: 2014 case P_OV_REPLY:
2088 case P_CSUM_RS_REQUEST: 2015 case P_CSUM_RS_REQUEST:
2089 fault_type = DRBD_FAULT_RS_RD; 2016 fault_type = DRBD_FAULT_RS_RD;
2090 digest_size = h->length - brps ;
2091 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); 2017 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2092 if (!di) 2018 if (!di)
2093 goto out_free_e; 2019 goto out_free_e;
@@ -2095,75 +2021,105 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2095 di->digest_size = digest_size; 2021 di->digest_size = digest_size;
2096 di->digest = (((char *)di)+sizeof(struct digest_info)); 2022 di->digest = (((char *)di)+sizeof(struct digest_info));
2097 2023
2024 e->digest = di;
2025 e->flags |= EE_HAS_DIGEST;
2026
2098 if (drbd_recv(mdev, di->digest, digest_size) != digest_size) 2027 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2099 goto out_free_e; 2028 goto out_free_e;
2100 2029
2101 e->block_id = (u64)(unsigned long)di; 2030 if (cmd == P_CSUM_RS_REQUEST) {
2102 if (h->command == P_CSUM_RS_REQUEST) {
2103 D_ASSERT(mdev->agreed_pro_version >= 89); 2031 D_ASSERT(mdev->agreed_pro_version >= 89);
2104 e->w.cb = w_e_end_csum_rs_req; 2032 e->w.cb = w_e_end_csum_rs_req;
2105 } else if (h->command == P_OV_REPLY) { 2033 /* used in the sector offset progress display */
2034 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
2035 } else if (cmd == P_OV_REPLY) {
2036 /* track progress, we may need to throttle */
2037 atomic_add(size >> 9, &mdev->rs_sect_in);
2106 e->w.cb = w_e_end_ov_reply; 2038 e->w.cb = w_e_end_ov_reply;
2107 dec_rs_pending(mdev); 2039 dec_rs_pending(mdev);
2108 break; 2040 /* drbd_rs_begin_io done when we sent this request,
2109 } 2041 * but accounting still needs to be done. */
2110 2042 goto submit_for_resync;
2111 if (!drbd_rs_begin_io(mdev, sector)) {
2112 /* we have been interrupted, probably connection lost! */
2113 D_ASSERT(signal_pending(current));
2114 goto out_free_e;
2115 } 2043 }
2116 break; 2044 break;
2117 2045
2118 case P_OV_REQUEST: 2046 case P_OV_REQUEST:
2119 if (mdev->state.conn >= C_CONNECTED &&
2120 mdev->state.conn != C_VERIFY_T)
2121 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2122 drbd_conn_str(mdev->state.conn));
2123 if (mdev->ov_start_sector == ~(sector_t)0 && 2047 if (mdev->ov_start_sector == ~(sector_t)0 &&
2124 mdev->agreed_pro_version >= 90) { 2048 mdev->agreed_pro_version >= 90) {
2049 unsigned long now = jiffies;
2050 int i;
2125 mdev->ov_start_sector = sector; 2051 mdev->ov_start_sector = sector;
2126 mdev->ov_position = sector; 2052 mdev->ov_position = sector;
2127 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); 2053 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2054 mdev->rs_total = mdev->ov_left;
2055 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2056 mdev->rs_mark_left[i] = mdev->ov_left;
2057 mdev->rs_mark_time[i] = now;
2058 }
2128 dev_info(DEV, "Online Verify start sector: %llu\n", 2059 dev_info(DEV, "Online Verify start sector: %llu\n",
2129 (unsigned long long)sector); 2060 (unsigned long long)sector);
2130 } 2061 }
2131 e->w.cb = w_e_end_ov_req; 2062 e->w.cb = w_e_end_ov_req;
2132 fault_type = DRBD_FAULT_RS_RD; 2063 fault_type = DRBD_FAULT_RS_RD;
2133 /* Eventually this should become asynchronous. Currently it
2134 * blocks the whole receiver just to delay the reading of a
2135 * resync data block.
2136 * the drbd_work_queue mechanism is made for this...
2137 */
2138 if (!drbd_rs_begin_io(mdev, sector)) {
2139 /* we have been interrupted,
2140 * probably connection lost! */
2141 D_ASSERT(signal_pending(current));
2142 goto out_free_e;
2143 }
2144 break; 2064 break;
2145 2065
2146
2147 default: 2066 default:
2148 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", 2067 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2149 cmdname(h->command)); 2068 cmdname(cmd));
2150 fault_type = DRBD_FAULT_MAX; 2069 fault_type = DRBD_FAULT_MAX;
2070 goto out_free_e;
2151 } 2071 }
2152 2072
2153 spin_lock_irq(&mdev->req_lock); 2073 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2154 list_add(&e->w.list, &mdev->read_ee); 2074 * wrt the receiver, but it is not as straightforward as it may seem.
2155 spin_unlock_irq(&mdev->req_lock); 2075 * Various places in the resync start and stop logic assume resync
2076 * requests are processed in order, requeuing this on the worker thread
2077 * introduces a bunch of new code for synchronization between threads.
2078 *
2079 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2080 * "forever", throttling after drbd_rs_begin_io will lock that extent
2081 * for application writes for the same time. For now, just throttle
2082 * here, where the rest of the code expects the receiver to sleep for
2083 * a while, anyways.
2084 */
2156 2085
2086 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2087 * this defers syncer requests for some time, before letting at least
 2088 * one request through. The resync controller on the receiving side
2089 * will adapt to the incoming rate accordingly.
2090 *
2091 * We cannot throttle here if remote is Primary/SyncTarget:
2092 * we would also throttle its application reads.
2093 * In that case, throttling is done on the SyncTarget only.
2094 */
2095 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2096 schedule_timeout_uninterruptible(HZ/10);
2097 if (drbd_rs_begin_io(mdev, sector))
2098 goto out_free_e;
2099
2100submit_for_resync:
2101 atomic_add(size >> 9, &mdev->rs_sect_ev);
2102
2103submit:
2157 inc_unacked(mdev); 2104 inc_unacked(mdev);
2105 spin_lock_irq(&mdev->req_lock);
2106 list_add_tail(&e->w.list, &mdev->read_ee);
2107 spin_unlock_irq(&mdev->req_lock);
2158 2108
2159 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) 2109 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2160 return TRUE; 2110 return true;
2111
2112 /* don't care for the reason here */
2113 dev_err(DEV, "submit failed, triggering re-connect\n");
2114 spin_lock_irq(&mdev->req_lock);
2115 list_del(&e->w.list);
2116 spin_unlock_irq(&mdev->req_lock);
2117 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2161 2118
2162out_free_e: 2119out_free_e:
2163 kfree(di);
2164 put_ldev(mdev); 2120 put_ldev(mdev);
2165 drbd_free_ee(mdev, e); 2121 drbd_free_ee(mdev, e);
2166 return FALSE; 2122 return false;
2167} 2123}
2168 2124
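The comments inside receive_DataRequest() above fix the ordering: throttle before drbd_rs_begin_io(), because begin_io locks the extent against application writes, and skip throttling entirely when the peer is Primary so its application reads are not slowed down. A compressed model of that decision order; the helper names below are stand-ins for the kernel calls:

#include <stdio.h>
#include <time.h>

/* stubs standing in for the kernel helpers used in the hunk above */
static int peer_is_primary(void)     { return 0; }
static int rs_should_slow_down(void) { return 1; }
static int rs_begin_io(void)         { return 0; }	/* 0: extent locked, as in the new hunk */

static void sleep_100ms(void)
{
	struct timespec ts = { 0, 100 * 1000 * 1000 };
	nanosleep(&ts, NULL);	/* schedule_timeout_uninterruptible(HZ/10) */
}

/* order matters: throttle first, only then lock the resync extent */
static int prepare_resync_read(void)
{
	if (!peer_is_primary() && rs_should_slow_down())
		sleep_100ms();
	if (rs_begin_io())
		return -1;	/* would goto out_free_e in the kernel */
	return 0;		/* fall through to submit_for_resync/submit */
}

int main(void)
{
	printf("prepare_resync_read() = %d\n", prepare_resync_read());
	return 0;
}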
2169static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) 2125static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
@@ -2240,10 +2196,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2240 2196
2241static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) 2197static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2242{ 2198{
2243 int self, peer, hg, rv = -100; 2199 int hg, rv = -100;
2244
2245 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2246 peer = mdev->p_uuid[UI_BITMAP] & 1;
2247 2200
2248 switch (mdev->net_conf->after_sb_1p) { 2201 switch (mdev->net_conf->after_sb_1p) {
2249 case ASB_DISCARD_YOUNGER_PRI: 2202 case ASB_DISCARD_YOUNGER_PRI:
@@ -2270,12 +2223,14 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2270 case ASB_CALL_HELPER: 2223 case ASB_CALL_HELPER:
2271 hg = drbd_asb_recover_0p(mdev); 2224 hg = drbd_asb_recover_0p(mdev);
2272 if (hg == -1 && mdev->state.role == R_PRIMARY) { 2225 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2273 self = drbd_set_role(mdev, R_SECONDARY, 0); 2226 enum drbd_state_rv rv2;
2227
2228 drbd_set_role(mdev, R_SECONDARY, 0);
2274 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2229 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2275 * we might be here in C_WF_REPORT_PARAMS which is transient. 2230 * we might be here in C_WF_REPORT_PARAMS which is transient.
2276 * we do not need to wait for the after state change work either. */ 2231 * we do not need to wait for the after state change work either. */
2277 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); 2232 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2278 if (self != SS_SUCCESS) { 2233 if (rv2 != SS_SUCCESS) {
2279 drbd_khelper(mdev, "pri-lost-after-sb"); 2234 drbd_khelper(mdev, "pri-lost-after-sb");
2280 } else { 2235 } else {
2281 dev_warn(DEV, "Successfully gave up primary role.\n"); 2236 dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2290,10 +2245,7 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2290 2245
2291static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) 2246static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2292{ 2247{
2293 int self, peer, hg, rv = -100; 2248 int hg, rv = -100;
2294
2295 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2296 peer = mdev->p_uuid[UI_BITMAP] & 1;
2297 2249
2298 switch (mdev->net_conf->after_sb_2p) { 2250 switch (mdev->net_conf->after_sb_2p) {
2299 case ASB_DISCARD_YOUNGER_PRI: 2251 case ASB_DISCARD_YOUNGER_PRI:
@@ -2313,11 +2265,13 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2313 case ASB_CALL_HELPER: 2265 case ASB_CALL_HELPER:
2314 hg = drbd_asb_recover_0p(mdev); 2266 hg = drbd_asb_recover_0p(mdev);
2315 if (hg == -1) { 2267 if (hg == -1) {
2268 enum drbd_state_rv rv2;
2269
2316 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2270 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2317 * we might be here in C_WF_REPORT_PARAMS which is transient. 2271 * we might be here in C_WF_REPORT_PARAMS which is transient.
2318 * we do not need to wait for the after state change work either. */ 2272 * we do not need to wait for the after state change work either. */
2319 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); 2273 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2320 if (self != SS_SUCCESS) { 2274 if (rv2 != SS_SUCCESS) {
2321 drbd_khelper(mdev, "pri-lost-after-sb"); 2275 drbd_khelper(mdev, "pri-lost-after-sb");
2322 } else { 2276 } else {
2323 dev_warn(DEV, "Successfully gave up primary role.\n"); 2277 dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2356,6 +2310,8 @@ static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2356 -2 C_SYNC_TARGET set BitMap 2310 -2 C_SYNC_TARGET set BitMap
2357 -100 after split brain, disconnect 2311 -100 after split brain, disconnect
2358-1000 unrelated data 2312-1000 unrelated data
2313-1091 requires proto 91
2314-1096 requires proto 96
2359 */ 2315 */
2360static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) 2316static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2361{ 2317{
@@ -2385,7 +2341,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2385 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { 2341 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2386 2342
2387 if (mdev->agreed_pro_version < 91) 2343 if (mdev->agreed_pro_version < 91)
2388 return -1001; 2344 return -1091;
2389 2345
2390 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && 2346 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2391 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { 2347 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
@@ -2406,7 +2362,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2406 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { 2362 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2407 2363
2408 if (mdev->agreed_pro_version < 91) 2364 if (mdev->agreed_pro_version < 91)
2409 return -1001; 2365 return -1091;
2410 2366
2411 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && 2367 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2412 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { 2368 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
@@ -2451,17 +2407,22 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2451 *rule_nr = 51; 2407 *rule_nr = 51;
2452 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); 2408 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2453 if (self == peer) { 2409 if (self == peer) {
2454 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 2410 if (mdev->agreed_pro_version < 96 ?
2455 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); 2411 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2456 if (self == peer) { 2412 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2413 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
 2457 /* The last P_SYNC_UUID did not get through. Undo the last start of 2414 /* The last P_SYNC_UUID did not get through. Undo the last start of
2458 resync as sync source modifications of the peer's UUIDs. */ 2415 resync as sync source modifications of the peer's UUIDs. */
2459 2416
2460 if (mdev->agreed_pro_version < 91) 2417 if (mdev->agreed_pro_version < 91)
2461 return -1001; 2418 return -1091;
2462 2419
2463 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; 2420 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2464 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; 2421 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2422
 2423 dev_info(DEV, "Did not get last syncUUID packet, corrected:\n");
2424 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2425
2465 return -1; 2426 return -1;
2466 } 2427 }
2467 } 2428 }
@@ -2483,20 +2444,20 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2483 *rule_nr = 71; 2444 *rule_nr = 71;
2484 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 2445 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2485 if (self == peer) { 2446 if (self == peer) {
2486 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); 2447 if (mdev->agreed_pro_version < 96 ?
2487 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); 2448 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2488 if (self == peer) { 2449 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2450 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
 2489 /* The last P_SYNC_UUID did not get through. Undo the last start of 2451 /* The last P_SYNC_UUID did not get through. Undo the last start of
2490 resync as sync source modifications of our UUIDs. */ 2452 resync as sync source modifications of our UUIDs. */
2491 2453
2492 if (mdev->agreed_pro_version < 91) 2454 if (mdev->agreed_pro_version < 91)
2493 return -1001; 2455 return -1091;
2494 2456
2495 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); 2457 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2496 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); 2458 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2497 2459
2498 dev_info(DEV, "Undid last start of resync:\n"); 2460 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
2499
2500 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, 2461 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2501 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); 2462 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2502 2463
@@ -2559,8 +2520,8 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2559 dev_alert(DEV, "Unrelated data, aborting!\n"); 2520 dev_alert(DEV, "Unrelated data, aborting!\n");
2560 return C_MASK; 2521 return C_MASK;
2561 } 2522 }
2562 if (hg == -1001) { 2523 if (hg < -1000) {
2563 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); 2524 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
2564 return C_MASK; 2525 return C_MASK;
2565 } 2526 }
2566 2527
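The handshake hunk above decodes the new return convention of drbd_uuid_compare(): "requires protocol N" is encoded as -(1000 + N), so -1091 asks for protocol 91 and -1096 for protocol 96. A two-line worked check of that decoding:

#include <stdio.h>

int main(void)
{
	int hg = -1091;		/* "requires proto 91" per the legend above */

	if (hg < -1000)
		printf("To resolve this both sides have to support at least protocol %d\n",
		       -hg - 1000);
	return 0;
}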
@@ -2659,7 +2620,8 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2659 2620
2660 if (abs(hg) >= 2) { 2621 if (abs(hg) >= 2) {
2661 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); 2622 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2662 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) 2623 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
2624 BM_LOCKED_SET_ALLOWED))
2663 return C_MASK; 2625 return C_MASK;
2664 } 2626 }
2665 2627
@@ -2699,20 +2661,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2699 return 1; 2661 return 1;
2700} 2662}
2701 2663
2702static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) 2664static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2703{ 2665{
2704 struct p_protocol *p = (struct p_protocol *)h; 2666 struct p_protocol *p = &mdev->data.rbuf.protocol;
2705 int header_size, data_size;
2706 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; 2667 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2707 int p_want_lose, p_two_primaries, cf; 2668 int p_want_lose, p_two_primaries, cf;
2708 char p_integrity_alg[SHARED_SECRET_MAX] = ""; 2669 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2709 2670
2710 header_size = sizeof(*p) - sizeof(*h);
2711 data_size = h->length - header_size;
2712
2713 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2714 return FALSE;
2715
2716 p_proto = be32_to_cpu(p->protocol); 2671 p_proto = be32_to_cpu(p->protocol);
2717 p_after_sb_0p = be32_to_cpu(p->after_sb_0p); 2672 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2718 p_after_sb_1p = be32_to_cpu(p->after_sb_1p); 2673 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
@@ -2760,7 +2715,7 @@ static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2760 unsigned char *my_alg = mdev->net_conf->integrity_alg; 2715 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2761 2716
2762 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) 2717 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2763 return FALSE; 2718 return false;
2764 2719
2765 p_integrity_alg[SHARED_SECRET_MAX-1] = 0; 2720 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2766 if (strcmp(p_integrity_alg, my_alg)) { 2721 if (strcmp(p_integrity_alg, my_alg)) {
@@ -2771,11 +2726,11 @@ static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2771 my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); 2726 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2772 } 2727 }
2773 2728
2774 return TRUE; 2729 return true;
2775 2730
2776disconnect: 2731disconnect:
2777 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2732 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2778 return FALSE; 2733 return false;
2779} 2734}
2780 2735
2781/* helper function 2736/* helper function
@@ -2805,40 +2760,47 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2805 return tfm; 2760 return tfm;
2806} 2761}
2807 2762
2808static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) 2763static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
2809{ 2764{
2810 int ok = TRUE; 2765 int ok = true;
2811 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; 2766 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
2812 unsigned int header_size, data_size, exp_max_sz; 2767 unsigned int header_size, data_size, exp_max_sz;
2813 struct crypto_hash *verify_tfm = NULL; 2768 struct crypto_hash *verify_tfm = NULL;
2814 struct crypto_hash *csums_tfm = NULL; 2769 struct crypto_hash *csums_tfm = NULL;
2815 const int apv = mdev->agreed_pro_version; 2770 const int apv = mdev->agreed_pro_version;
2771 int *rs_plan_s = NULL;
2772 int fifo_size = 0;
2816 2773
2817 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) 2774 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2818 : apv == 88 ? sizeof(struct p_rs_param) 2775 : apv == 88 ? sizeof(struct p_rs_param)
2819 + SHARED_SECRET_MAX 2776 + SHARED_SECRET_MAX
2820 : /* 89 */ sizeof(struct p_rs_param_89); 2777 : apv <= 94 ? sizeof(struct p_rs_param_89)
2778 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2821 2779
2822 if (h->length > exp_max_sz) { 2780 if (packet_size > exp_max_sz) {
2823 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", 2781 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2824 h->length, exp_max_sz); 2782 packet_size, exp_max_sz);
2825 return FALSE; 2783 return false;
2826 } 2784 }
2827 2785
2828 if (apv <= 88) { 2786 if (apv <= 88) {
2829 header_size = sizeof(struct p_rs_param) - sizeof(*h); 2787 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2830 data_size = h->length - header_size; 2788 data_size = packet_size - header_size;
2831 } else /* apv >= 89 */ { 2789 } else if (apv <= 94) {
2832 header_size = sizeof(struct p_rs_param_89) - sizeof(*h); 2790 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2833 data_size = h->length - header_size; 2791 data_size = packet_size - header_size;
2792 D_ASSERT(data_size == 0);
2793 } else {
2794 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2795 data_size = packet_size - header_size;
2834 D_ASSERT(data_size == 0); 2796 D_ASSERT(data_size == 0);
2835 } 2797 }
2836 2798
2837 /* initialize verify_alg and csums_alg */ 2799 /* initialize verify_alg and csums_alg */
2838 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 2800 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2839 2801
2840 if (drbd_recv(mdev, h->payload, header_size) != header_size) 2802 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
2841 return FALSE; 2803 return false;
2842 2804
2843 mdev->sync_conf.rate = be32_to_cpu(p->rate); 2805 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2844 2806
@@ -2848,11 +2810,11 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2848 dev_err(DEV, "verify-alg too long, " 2810 dev_err(DEV, "verify-alg too long, "
2849 "peer wants %u, accepting only %u byte\n", 2811 "peer wants %u, accepting only %u byte\n",
2850 data_size, SHARED_SECRET_MAX); 2812 data_size, SHARED_SECRET_MAX);
2851 return FALSE; 2813 return false;
2852 } 2814 }
2853 2815
2854 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) 2816 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2855 return FALSE; 2817 return false;
2856 2818
2857 /* we expect NUL terminated string */ 2819 /* we expect NUL terminated string */
2858 /* but just in case someone tries to be evil */ 2820 /* but just in case someone tries to be evil */
@@ -2896,6 +2858,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2896 } 2858 }
2897 } 2859 }
2898 2860
2861 if (apv > 94) {
2862 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2863 mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2864 mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2865 mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2866 mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
2867
2868 fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2869 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2870 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2871 if (!rs_plan_s) {
2872 dev_err(DEV, "kmalloc of fifo_buffer failed");
2873 goto disconnect;
2874 }
2875 }
2876 }
2899 2877
2900 spin_lock(&mdev->peer_seq_lock); 2878 spin_lock(&mdev->peer_seq_lock);
2901 /* lock against drbd_nl_syncer_conf() */ 2879 /* lock against drbd_nl_syncer_conf() */
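The fifo sizing in the hunk above reduces to one slot per planning step when SLEEP_TIME is HZ/10, which is how drbd defines it elsewhere; under that assumption fifo_size simply equals c_plan_ahead. A small stand-alone sketch of the arithmetic (values and names are local to the example):

/* Sketch of the fifo_size arithmetic, assuming SLEEP_TIME == HZ/10. */
#include <stdio.h>
#include <stdlib.h>

#define HZ		250		/* example; typical kernel HZ values behave the same */
#define SLEEP_TIME	(HZ / 10)

int main(void)
{
	int c_plan_ahead = 20;		/* 2.0 s of planning, in 0.1 s units */
	int fifo_size = (c_plan_ahead * 10 * SLEEP_TIME) / HZ;
	int *rs_plan_s = calloc(fifo_size, sizeof(int));

	if (!rs_plan_s)
		return 1;
	printf("fifo_size = %d\n", fifo_size);	/* prints 20 */
	free(rs_plan_s);
	return 0;
}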
@@ -2913,6 +2891,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2913 mdev->csums_tfm = csums_tfm; 2891 mdev->csums_tfm = csums_tfm;
2914 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); 2892 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2915 } 2893 }
2894 if (fifo_size != mdev->rs_plan_s.size) {
2895 kfree(mdev->rs_plan_s.values);
2896 mdev->rs_plan_s.values = rs_plan_s;
2897 mdev->rs_plan_s.size = fifo_size;
2898 mdev->rs_planed = 0;
2899 }
2916 spin_unlock(&mdev->peer_seq_lock); 2900 spin_unlock(&mdev->peer_seq_lock);
2917 } 2901 }
2918 2902
@@ -2924,13 +2908,7 @@ disconnect:
2924 /* but free the verify_tfm again, if csums_tfm did not work out */ 2908 /* but free the verify_tfm again, if csums_tfm did not work out */
2925 crypto_free_hash(verify_tfm); 2909 crypto_free_hash(verify_tfm);
2926 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2910 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2927 return FALSE; 2911 return false;
2928}
2929
2930static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
2931{
2932 /* sorry, we currently have no working implementation
2933 * of distributed TCQ */
2934} 2912}
2935 2913
2936/* warn if the arguments differ by more than 12.5% */ 2914/* warn if the arguments differ by more than 12.5% */
@@ -2946,33 +2924,27 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
2946 (unsigned long long)a, (unsigned long long)b); 2924 (unsigned long long)a, (unsigned long long)b);
2947} 2925}
2948 2926
2949static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) 2927static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2950{ 2928{
2951 struct p_sizes *p = (struct p_sizes *)h; 2929 struct p_sizes *p = &mdev->data.rbuf.sizes;
2952 enum determine_dev_size dd = unchanged; 2930 enum determine_dev_size dd = unchanged;
2953 unsigned int max_seg_s;
2954 sector_t p_size, p_usize, my_usize; 2931 sector_t p_size, p_usize, my_usize;
2955 int ldsc = 0; /* local disk size changed */ 2932 int ldsc = 0; /* local disk size changed */
2956 enum dds_flags ddsf; 2933 enum dds_flags ddsf;
2957 2934
2958 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2959 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2960 return FALSE;
2961
2962 p_size = be64_to_cpu(p->d_size); 2935 p_size = be64_to_cpu(p->d_size);
2963 p_usize = be64_to_cpu(p->u_size); 2936 p_usize = be64_to_cpu(p->u_size);
2964 2937
2965 if (p_size == 0 && mdev->state.disk == D_DISKLESS) { 2938 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2966 dev_err(DEV, "some backing storage is needed\n"); 2939 dev_err(DEV, "some backing storage is needed\n");
2967 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2940 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2968 return FALSE; 2941 return false;
2969 } 2942 }
2970 2943
2971 /* just store the peer's disk size for now. 2944 /* just store the peer's disk size for now.
2972 * we still need to figure out whether we accept that. */ 2945 * we still need to figure out whether we accept that. */
2973 mdev->p_size = p_size; 2946 mdev->p_size = p_size;
2974 2947
2975#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2976 if (get_ldev(mdev)) { 2948 if (get_ldev(mdev)) {
2977 warn_if_differ_considerably(mdev, "lower level device sizes", 2949 warn_if_differ_considerably(mdev, "lower level device sizes",
2978 p_size, drbd_get_max_capacity(mdev->ldev)); 2950 p_size, drbd_get_max_capacity(mdev->ldev));
@@ -3003,39 +2975,32 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
3003 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2975 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3004 mdev->ldev->dc.disk_size = my_usize; 2976 mdev->ldev->dc.disk_size = my_usize;
3005 put_ldev(mdev); 2977 put_ldev(mdev);
3006 return FALSE; 2978 return false;
3007 } 2979 }
3008 put_ldev(mdev); 2980 put_ldev(mdev);
3009 } 2981 }
3010#undef min_not_zero
3011 2982
3012 ddsf = be16_to_cpu(p->dds_flags); 2983 ddsf = be16_to_cpu(p->dds_flags);
3013 if (get_ldev(mdev)) { 2984 if (get_ldev(mdev)) {
3014 dd = drbd_determin_dev_size(mdev, ddsf); 2985 dd = drbd_determine_dev_size(mdev, ddsf);
3015 put_ldev(mdev); 2986 put_ldev(mdev);
3016 if (dd == dev_size_error) 2987 if (dd == dev_size_error)
3017 return FALSE; 2988 return false;
3018 drbd_md_sync(mdev); 2989 drbd_md_sync(mdev);
3019 } else { 2990 } else {
3020 /* I am diskless, need to accept the peer's size. */ 2991 /* I am diskless, need to accept the peer's size. */
3021 drbd_set_my_capacity(mdev, p_size); 2992 drbd_set_my_capacity(mdev, p_size);
3022 } 2993 }
3023 2994
2995 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
2996 drbd_reconsider_max_bio_size(mdev);
2997
3024 if (get_ldev(mdev)) { 2998 if (get_ldev(mdev)) {
3025 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 2999 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3026 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 3000 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3027 ldsc = 1; 3001 ldsc = 1;
3028 } 3002 }
3029 3003
3030 if (mdev->agreed_pro_version < 94)
3031 max_seg_s = be32_to_cpu(p->max_segment_size);
3032 else /* drbd 8.3.8 onwards */
3033 max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3034
3035 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
3036 drbd_setup_queue_param(mdev, max_seg_s);
3037
3038 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3039 put_ldev(mdev); 3004 put_ldev(mdev);
3040 } 3005 }
3041 3006
@@ -3059,18 +3024,14 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
3059 } 3024 }
3060 } 3025 }
3061 3026
3062 return TRUE; 3027 return true;
3063} 3028}
3064 3029
3065static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) 3030static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3066{ 3031{
3067 struct p_uuids *p = (struct p_uuids *)h; 3032 struct p_uuids *p = &mdev->data.rbuf.uuids;
3068 u64 *p_uuid; 3033 u64 *p_uuid;
3069 int i; 3034 int i, updated_uuids = 0;
3070
3071 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3072 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3073 return FALSE;
3074 3035
3075 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); 3036 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3076 3037
@@ -3087,7 +3048,7 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
3087 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", 3048 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3088 (unsigned long long)mdev->ed_uuid); 3049 (unsigned long long)mdev->ed_uuid);
3089 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3050 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3090 return FALSE; 3051 return false;
3091 } 3052 }
3092 3053
3093 if (get_ldev(mdev)) { 3054 if (get_ldev(mdev)) {
@@ -3099,14 +3060,21 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
3099 if (skip_initial_sync) { 3060 if (skip_initial_sync) {
3100 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); 3061 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3101 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, 3062 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3102 "clear_n_write from receive_uuids"); 3063 "clear_n_write from receive_uuids",
3064 BM_LOCKED_TEST_ALLOWED);
3103 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); 3065 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3104 _drbd_uuid_set(mdev, UI_BITMAP, 0); 3066 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3105 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 3067 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3106 CS_VERBOSE, NULL); 3068 CS_VERBOSE, NULL);
3107 drbd_md_sync(mdev); 3069 drbd_md_sync(mdev);
3070 updated_uuids = 1;
3108 } 3071 }
3109 put_ldev(mdev); 3072 put_ldev(mdev);
3073 } else if (mdev->state.disk < D_INCONSISTENT &&
3074 mdev->state.role == R_PRIMARY) {
3075 /* I am a diskless primary, the peer just created a new current UUID
3076 for me. */
3077 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3110 } 3078 }
3111 3079
3112 /* Before we test for the disk state, we should wait until an eventually 3080 /* Before we test for the disk state, we should wait until an eventually
@@ -3115,9 +3083,12 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
3115 new disk state... */ 3083 new disk state... */
3116 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); 3084 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3117 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) 3085 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3118 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); 3086 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3087
3088 if (updated_uuids)
3089 drbd_print_uuids(mdev, "receiver updated UUIDs to");
3119 3090
3120 return TRUE; 3091 return true;
3121} 3092}
3122 3093
3123/** 3094/**
@@ -3150,15 +3121,11 @@ static union drbd_state convert_state(union drbd_state ps)
3150 return ms; 3121 return ms;
3151} 3122}
3152 3123
3153static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) 3124static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3154{ 3125{
3155 struct p_req_state *p = (struct p_req_state *)h; 3126 struct p_req_state *p = &mdev->data.rbuf.req_state;
3156 union drbd_state mask, val; 3127 union drbd_state mask, val;
3157 int rv; 3128 enum drbd_state_rv rv;
3158
3159 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3160 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3161 return FALSE;
3162 3129
3163 mask.i = be32_to_cpu(p->mask); 3130 mask.i = be32_to_cpu(p->mask);
3164 val.i = be32_to_cpu(p->val); 3131 val.i = be32_to_cpu(p->val);
@@ -3166,7 +3133,7 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3166 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && 3133 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3167 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { 3134 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3168 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); 3135 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3169 return TRUE; 3136 return true;
3170 } 3137 }
3171 3138
3172 mask = convert_state(mask); 3139 mask = convert_state(mask);
@@ -3177,23 +3144,17 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3177 drbd_send_sr_reply(mdev, rv); 3144 drbd_send_sr_reply(mdev, rv);
3178 drbd_md_sync(mdev); 3145 drbd_md_sync(mdev);
3179 3146
3180 return TRUE; 3147 return true;
3181} 3148}
3182 3149
3183static int receive_state(struct drbd_conf *mdev, struct p_header *h) 3150static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3184{ 3151{
3185 struct p_state *p = (struct p_state *)h; 3152 struct p_state *p = &mdev->data.rbuf.state;
3186 enum drbd_conns nconn, oconn; 3153 union drbd_state os, ns, peer_state;
3187 union drbd_state ns, peer_state;
3188 enum drbd_disk_state real_peer_disk; 3154 enum drbd_disk_state real_peer_disk;
3155 enum chg_state_flags cs_flags;
3189 int rv; 3156 int rv;
3190 3157
3191 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3192 return FALSE;
3193
3194 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3195 return FALSE;
3196
3197 peer_state.i = be32_to_cpu(p->state); 3158 peer_state.i = be32_to_cpu(p->state);
3198 3159
3199 real_peer_disk = peer_state.disk; 3160 real_peer_disk = peer_state.disk;
@@ -3204,77 +3165,124 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3204 3165
3205 spin_lock_irq(&mdev->req_lock); 3166 spin_lock_irq(&mdev->req_lock);
3206 retry: 3167 retry:
3207 oconn = nconn = mdev->state.conn; 3168 os = ns = mdev->state;
3208 spin_unlock_irq(&mdev->req_lock); 3169 spin_unlock_irq(&mdev->req_lock);
3209 3170
3210 if (nconn == C_WF_REPORT_PARAMS) 3171 /* peer says his disk is uptodate, while we think it is inconsistent,
3211 nconn = C_CONNECTED; 3172 * and this happens while we think we have a sync going on. */
3173 if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
3174 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3175 /* If we are (becoming) SyncSource, but peer is still in sync
3176 * preparation, ignore its uptodate-ness to avoid flapping, it
3177 * will change to inconsistent once the peer reaches active
3178 * syncing states.
3179 * It may have changed syncer-paused flags, however, so we
3180 * cannot ignore this completely. */
3181 if (peer_state.conn > C_CONNECTED &&
3182 peer_state.conn < C_SYNC_SOURCE)
3183 real_peer_disk = D_INCONSISTENT;
3184
3185 /* if peer_state changes to connected at the same time,
3186 * it explicitly notifies us that it finished resync.
3187 * Maybe we should finish it up, too? */
3188 else if (os.conn >= C_SYNC_SOURCE &&
3189 peer_state.conn == C_CONNECTED) {
3190 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3191 drbd_resync_finished(mdev);
3192 return true;
3193 }
3194 }
3195
3196 /* peer says his disk is inconsistent, while we think it is uptodate,
3197 * and this happens while the peer still thinks we have a sync going on,
3198 * but we think we are already done with the sync.
3199 * We ignore this to avoid flapping pdsk.
3200 * This should not happen, if the peer is a recent version of drbd. */
3201 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3202 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3203 real_peer_disk = D_UP_TO_DATE;
3204
3205 if (ns.conn == C_WF_REPORT_PARAMS)
3206 ns.conn = C_CONNECTED;
3207
3208 if (peer_state.conn == C_AHEAD)
3209 ns.conn = C_BEHIND;
3212 3210
3213 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && 3211 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3214 get_ldev_if_state(mdev, D_NEGOTIATING)) { 3212 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3215 int cr; /* consider resync */ 3213 int cr; /* consider resync */
3216 3214
3217 /* if we established a new connection */ 3215 /* if we established a new connection */
3218 cr = (oconn < C_CONNECTED); 3216 cr = (os.conn < C_CONNECTED);
3219 /* if we had an established connection 3217 /* if we had an established connection
3220 * and one of the nodes newly attaches a disk */ 3218 * and one of the nodes newly attaches a disk */
3221 cr |= (oconn == C_CONNECTED && 3219 cr |= (os.conn == C_CONNECTED &&
3222 (peer_state.disk == D_NEGOTIATING || 3220 (peer_state.disk == D_NEGOTIATING ||
3223 mdev->state.disk == D_NEGOTIATING)); 3221 os.disk == D_NEGOTIATING));
3224 /* if we have both been inconsistent, and the peer has been 3222 /* if we have both been inconsistent, and the peer has been
3225 * forced to be UpToDate with --overwrite-data */ 3223 * forced to be UpToDate with --overwrite-data */
3226 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); 3224 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3227 /* if we had been plain connected, and the admin requested to 3225 /* if we had been plain connected, and the admin requested to
3228 * start a sync by "invalidate" or "invalidate-remote" */ 3226 * start a sync by "invalidate" or "invalidate-remote" */
3229 cr |= (oconn == C_CONNECTED && 3227 cr |= (os.conn == C_CONNECTED &&
3230 (peer_state.conn >= C_STARTING_SYNC_S && 3228 (peer_state.conn >= C_STARTING_SYNC_S &&
3231 peer_state.conn <= C_WF_BITMAP_T)); 3229 peer_state.conn <= C_WF_BITMAP_T));
3232 3230
3233 if (cr) 3231 if (cr)
3234 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); 3232 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3235 3233
3236 put_ldev(mdev); 3234 put_ldev(mdev);
3237 if (nconn == C_MASK) { 3235 if (ns.conn == C_MASK) {
3238 nconn = C_CONNECTED; 3236 ns.conn = C_CONNECTED;
3239 if (mdev->state.disk == D_NEGOTIATING) { 3237 if (mdev->state.disk == D_NEGOTIATING) {
3240 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3238 drbd_force_state(mdev, NS(disk, D_FAILED));
3241 } else if (peer_state.disk == D_NEGOTIATING) { 3239 } else if (peer_state.disk == D_NEGOTIATING) {
3242 dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); 3240 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3243 peer_state.disk = D_DISKLESS; 3241 peer_state.disk = D_DISKLESS;
3244 real_peer_disk = D_DISKLESS; 3242 real_peer_disk = D_DISKLESS;
3245 } else { 3243 } else {
3246 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) 3244 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3247 return FALSE; 3245 return false;
3248 D_ASSERT(oconn == C_WF_REPORT_PARAMS); 3246 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
3249 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3247 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3250 return FALSE; 3248 return false;
3251 } 3249 }
3252 } 3250 }
3253 } 3251 }
3254 3252
3255 spin_lock_irq(&mdev->req_lock); 3253 spin_lock_irq(&mdev->req_lock);
3256 if (mdev->state.conn != oconn) 3254 if (mdev->state.i != os.i)
3257 goto retry; 3255 goto retry;
3258 clear_bit(CONSIDER_RESYNC, &mdev->flags); 3256 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3259 ns.i = mdev->state.i;
3260 ns.conn = nconn;
3261 ns.peer = peer_state.role; 3257 ns.peer = peer_state.role;
3262 ns.pdsk = real_peer_disk; 3258 ns.pdsk = real_peer_disk;
3263 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); 3259 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3264 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) 3260 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3265 ns.disk = mdev->new_state_tmp.disk; 3261 ns.disk = mdev->new_state_tmp.disk;
3266 3262 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
3267 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); 3263 if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
3264 test_bit(NEW_CUR_UUID, &mdev->flags)) {
3265 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3266 for temporary network outages! */
3267 spin_unlock_irq(&mdev->req_lock);
3268 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3269 tl_clear(mdev);
3270 drbd_uuid_new_current(mdev);
3271 clear_bit(NEW_CUR_UUID, &mdev->flags);
3272 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3273 return false;
3274 }
3275 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
3268 ns = mdev->state; 3276 ns = mdev->state;
3269 spin_unlock_irq(&mdev->req_lock); 3277 spin_unlock_irq(&mdev->req_lock);
3270 3278
3271 if (rv < SS_SUCCESS) { 3279 if (rv < SS_SUCCESS) {
3272 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3280 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3273 return FALSE; 3281 return false;
3274 } 3282 }
3275 3283
3276 if (oconn > C_WF_REPORT_PARAMS) { 3284 if (os.conn > C_WF_REPORT_PARAMS) {
3277 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && 3285 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3278 peer_state.disk != D_NEGOTIATING ) { 3286 peer_state.disk != D_NEGOTIATING ) {
3279 /* we want resync, peer has not yet decided to sync... */ 3287 /* we want resync, peer has not yet decided to sync... */
3280 /* Nowadays only used when forcing a node into primary role and 3288 /* Nowadays only used when forcing a node into primary role and
@@ -3288,56 +3296,63 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3288 3296
3289 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ 3297 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3290 3298
3291 return TRUE; 3299 return true;
3292} 3300}
3293 3301
3294static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) 3302static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3295{ 3303{
3296 struct p_rs_uuid *p = (struct p_rs_uuid *)h; 3304 struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
3297 3305
3298 wait_event(mdev->misc_wait, 3306 wait_event(mdev->misc_wait,
3299 mdev->state.conn == C_WF_SYNC_UUID || 3307 mdev->state.conn == C_WF_SYNC_UUID ||
3308 mdev->state.conn == C_BEHIND ||
3300 mdev->state.conn < C_CONNECTED || 3309 mdev->state.conn < C_CONNECTED ||
3301 mdev->state.disk < D_NEGOTIATING); 3310 mdev->state.disk < D_NEGOTIATING);
3302 3311
3303 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ 3312 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3304 3313
3305 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3306 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3307 return FALSE;
3308
3309 /* Here the _drbd_uuid_ functions are right, current should 3314 /* Here the _drbd_uuid_ functions are right, current should
3310 _not_ be rotated into the history */ 3315 _not_ be rotated into the history */
3311 if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 3316 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3312 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); 3317 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3313 _drbd_uuid_set(mdev, UI_BITMAP, 0UL); 3318 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3314 3319
3320 drbd_print_uuids(mdev, "updated sync uuid");
3315 drbd_start_resync(mdev, C_SYNC_TARGET); 3321 drbd_start_resync(mdev, C_SYNC_TARGET);
3316 3322
3317 put_ldev(mdev); 3323 put_ldev(mdev);
3318 } else 3324 } else
3319 dev_err(DEV, "Ignoring SyncUUID packet!\n"); 3325 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3320 3326
3321 return TRUE; 3327 return true;
3322} 3328}
3323 3329
3324enum receive_bitmap_ret { OK, DONE, FAILED }; 3330/**
3325 3331 * receive_bitmap_plain
3326static enum receive_bitmap_ret 3332 *
3327receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, 3333 * Return 0 when done, 1 when another iteration is needed, and a negative error
3328 unsigned long *buffer, struct bm_xfer_ctx *c) 3334 * code upon failure.
3335 */
3336static int
3337receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3338 unsigned long *buffer, struct bm_xfer_ctx *c)
3329{ 3339{
3330 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); 3340 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3331 unsigned want = num_words * sizeof(long); 3341 unsigned want = num_words * sizeof(long);
3342 int err;
3332 3343
3333 if (want != h->length) { 3344 if (want != data_size) {
3334 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); 3345 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
3335 return FAILED; 3346 return -EIO;
3336 } 3347 }
3337 if (want == 0) 3348 if (want == 0)
3338 return DONE; 3349 return 0;
3339 if (drbd_recv(mdev, buffer, want) != want) 3350 err = drbd_recv(mdev, buffer, want);
3340 return FAILED; 3351 if (err != want) {
3352 if (err >= 0)
3353 err = -EIO;
3354 return err;
3355 }
3341 3356
3342 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); 3357 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3343 3358
@@ -3346,10 +3361,16 @@ receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3346 if (c->bit_offset > c->bm_bits) 3361 if (c->bit_offset > c->bm_bits)
3347 c->bit_offset = c->bm_bits; 3362 c->bit_offset = c->bm_bits;
3348 3363
3349 return OK; 3364 return 1;
3350} 3365}
3351 3366
3352static enum receive_bitmap_ret 3367/**
3368 * recv_bm_rle_bits
3369 *
3370 * Return 0 when done, 1 when another iteration is needed, and a negative error
3371 * code upon failure.
3372 */
3373static int
3353recv_bm_rle_bits(struct drbd_conf *mdev, 3374recv_bm_rle_bits(struct drbd_conf *mdev,
3354 struct p_compressed_bm *p, 3375 struct p_compressed_bm *p,
3355 struct bm_xfer_ctx *c) 3376 struct bm_xfer_ctx *c)
@@ -3360,7 +3381,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3360 u64 tmp; 3381 u64 tmp;
3361 unsigned long s = c->bit_offset; 3382 unsigned long s = c->bit_offset;
3362 unsigned long e; 3383 unsigned long e;
3363 int len = p->head.length - (sizeof(*p) - sizeof(p->head)); 3384 int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
3364 int toggle = DCBP_get_start(p); 3385 int toggle = DCBP_get_start(p);
3365 int have; 3386 int have;
3366 int bits; 3387 int bits;
@@ -3369,18 +3390,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3369 3390
3370 bits = bitstream_get_bits(&bs, &look_ahead, 64); 3391 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3371 if (bits < 0) 3392 if (bits < 0)
3372 return FAILED; 3393 return -EIO;
3373 3394
3374 for (have = bits; have > 0; s += rl, toggle = !toggle) { 3395 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3375 bits = vli_decode_bits(&rl, look_ahead); 3396 bits = vli_decode_bits(&rl, look_ahead);
3376 if (bits <= 0) 3397 if (bits <= 0)
3377 return FAILED; 3398 return -EIO;
3378 3399
3379 if (toggle) { 3400 if (toggle) {
3380 e = s + rl -1; 3401 e = s + rl -1;
3381 if (e >= c->bm_bits) { 3402 if (e >= c->bm_bits) {
3382 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); 3403 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3383 return FAILED; 3404 return -EIO;
3384 } 3405 }
3385 _drbd_bm_set_bits(mdev, s, e); 3406 _drbd_bm_set_bits(mdev, s, e);
3386 } 3407 }
@@ -3390,14 +3411,14 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3390 have, bits, look_ahead, 3411 have, bits, look_ahead,
3391 (unsigned int)(bs.cur.b - p->code), 3412 (unsigned int)(bs.cur.b - p->code),
3392 (unsigned int)bs.buf_len); 3413 (unsigned int)bs.buf_len);
3393 return FAILED; 3414 return -EIO;
3394 } 3415 }
3395 look_ahead >>= bits; 3416 look_ahead >>= bits;
3396 have -= bits; 3417 have -= bits;
3397 3418
3398 bits = bitstream_get_bits(&bs, &tmp, 64 - have); 3419 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3399 if (bits < 0) 3420 if (bits < 0)
3400 return FAILED; 3421 return -EIO;
3401 look_ahead |= tmp << have; 3422 look_ahead |= tmp << have;
3402 have += bits; 3423 have += bits;
3403 } 3424 }
@@ -3405,10 +3426,16 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3405 c->bit_offset = s; 3426 c->bit_offset = s;
3406 bm_xfer_ctx_bit_to_word_offset(c); 3427 bm_xfer_ctx_bit_to_word_offset(c);
3407 3428
3408 return (s == c->bm_bits) ? DONE : OK; 3429 return (s != c->bm_bits);
3409} 3430}
3410 3431
3411static enum receive_bitmap_ret 3432/**
3433 * decode_bitmap_c
3434 *
3435 * Return 0 when done, 1 when another iteration is needed, and a negative error
3436 * code upon failure.
3437 */
3438static int
3412decode_bitmap_c(struct drbd_conf *mdev, 3439decode_bitmap_c(struct drbd_conf *mdev,
3413 struct p_compressed_bm *p, 3440 struct p_compressed_bm *p,
3414 struct bm_xfer_ctx *c) 3441 struct bm_xfer_ctx *c)
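The three bitmap helpers above now share one return convention: 0 means done, 1 means call again, and a negative errno means failure. A stand-alone sketch of a consumer loop for that convention (step() is a stand-in, not a drbd function):

/* Minimal consumer of the 0 / 1 / negative-errno iteration convention. */
#include <errno.h>
#include <stdio.h>

static int step(int *remaining)
{
	if (*remaining < 0)
		return -EIO;		/* failure */
	if (*remaining == 0)
		return 0;		/* done */
	(*remaining)--;
	return 1;			/* more work, call again */
}

int main(void)
{
	int remaining = 3;
	int err;

	while ((err = step(&remaining)) > 0)
		;			/* keep iterating while err == 1 */

	if (err < 0)
		printf("failed: %d\n", err);
	else
		printf("done\n");
	return 0;
}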
@@ -3422,14 +3449,14 @@ decode_bitmap_c(struct drbd_conf *mdev,
3422 3449
3423 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); 3450 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3424 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3451 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3425 return FAILED; 3452 return -EIO;
3426} 3453}
3427 3454
3428void INFO_bm_xfer_stats(struct drbd_conf *mdev, 3455void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3429 const char *direction, struct bm_xfer_ctx *c) 3456 const char *direction, struct bm_xfer_ctx *c)
3430{ 3457{
3431 /* what would it take to transfer it "plaintext" */ 3458 /* what would it take to transfer it "plaintext" */
3432 unsigned plain = sizeof(struct p_header) * 3459 unsigned plain = sizeof(struct p_header80) *
3433 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) 3460 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3434 + c->bm_words * sizeof(long); 3461 + c->bm_words * sizeof(long);
3435 unsigned total = c->bytes[0] + c->bytes[1]; 3462 unsigned total = c->bytes[0] + c->bytes[1];
@@ -3467,16 +3494,17 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3467 in order to be agnostic to the 32 vs 64 bits issue. 3494 in order to be agnostic to the 32 vs 64 bits issue.
3468 3495
3469 returns 0 on failure, 1 if we successfully received it. */ 3496 returns 0 on failure, 1 if we successfully received it. */
3470static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) 3497static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3471{ 3498{
3472 struct bm_xfer_ctx c; 3499 struct bm_xfer_ctx c;
3473 void *buffer; 3500 void *buffer;
3474 enum receive_bitmap_ret ret; 3501 int err;
3475 int ok = FALSE; 3502 int ok = false;
3476 3503 struct p_header80 *h = &mdev->data.rbuf.header.h80;
3477 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3478 3504
3479 drbd_bm_lock(mdev, "receive bitmap"); 3505 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
3506 /* you are supposed to send additional out-of-sync information
3507 * if you actually set bits during this phase */
3480 3508
3481 /* maybe we should use some per thread scratch page, 3509 /* maybe we should use some per thread scratch page,
3482 * and allocate that during initial device creation? */ 3510 * and allocate that during initial device creation? */
@@ -3491,54 +3519,56 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3491 .bm_words = drbd_bm_words(mdev), 3519 .bm_words = drbd_bm_words(mdev),
3492 }; 3520 };
3493 3521
3494 do { 3522 for(;;) {
3495 if (h->command == P_BITMAP) { 3523 if (cmd == P_BITMAP) {
3496 ret = receive_bitmap_plain(mdev, h, buffer, &c); 3524 err = receive_bitmap_plain(mdev, data_size, buffer, &c);
3497 } else if (h->command == P_COMPRESSED_BITMAP) { 3525 } else if (cmd == P_COMPRESSED_BITMAP) {
3498 /* MAYBE: sanity check that we speak proto >= 90, 3526 /* MAYBE: sanity check that we speak proto >= 90,
3499 * and the feature is enabled! */ 3527 * and the feature is enabled! */
3500 struct p_compressed_bm *p; 3528 struct p_compressed_bm *p;
3501 3529
3502 if (h->length > BM_PACKET_PAYLOAD_BYTES) { 3530 if (data_size > BM_PACKET_PAYLOAD_BYTES) {
3503 dev_err(DEV, "ReportCBitmap packet too large\n"); 3531 dev_err(DEV, "ReportCBitmap packet too large\n");
3504 goto out; 3532 goto out;
3505 } 3533 }
3506 /* use the page buff */ 3534 /* use the page buff */
3507 p = buffer; 3535 p = buffer;
3508 memcpy(p, h, sizeof(*h)); 3536 memcpy(p, h, sizeof(*h));
3509 if (drbd_recv(mdev, p->head.payload, h->length) != h->length) 3537 if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
3538 goto out;
3539 if (data_size <= (sizeof(*p) - sizeof(p->head))) {
3540 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
3510 goto out; 3541 goto out;
3511 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3512 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3513 return FAILED;
3514 } 3542 }
3515 ret = decode_bitmap_c(mdev, p, &c); 3543 err = decode_bitmap_c(mdev, p, &c);
3516 } else { 3544 } else {
3517 dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); 3545 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
3518 goto out; 3546 goto out;
3519 } 3547 }
3520 3548
3521 c.packets[h->command == P_BITMAP]++; 3549 c.packets[cmd == P_BITMAP]++;
3522 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; 3550 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
3523 3551
3524 if (ret != OK) 3552 if (err <= 0) {
3553 if (err < 0)
3554 goto out;
3525 break; 3555 break;
3526 3556 }
3527 if (!drbd_recv_header(mdev, h)) 3557 if (!drbd_recv_header(mdev, &cmd, &data_size))
3528 goto out; 3558 goto out;
3529 } while (ret == OK); 3559 }
3530 if (ret == FAILED)
3531 goto out;
3532 3560
3533 INFO_bm_xfer_stats(mdev, "receive", &c); 3561 INFO_bm_xfer_stats(mdev, "receive", &c);
3534 3562
3535 if (mdev->state.conn == C_WF_BITMAP_T) { 3563 if (mdev->state.conn == C_WF_BITMAP_T) {
3564 enum drbd_state_rv rv;
3565
3536 ok = !drbd_send_bitmap(mdev); 3566 ok = !drbd_send_bitmap(mdev);
3537 if (!ok) 3567 if (!ok)
3538 goto out; 3568 goto out;
3539 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ 3569 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3540 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); 3570 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3541 D_ASSERT(ok == SS_SUCCESS); 3571 D_ASSERT(rv == SS_SUCCESS);
3542 } else if (mdev->state.conn != C_WF_BITMAP_S) { 3572 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3543 /* admin may have requested C_DISCONNECTING, 3573 /* admin may have requested C_DISCONNECTING,
3544 * other threads may have noticed network errors */ 3574 * other threads may have noticed network errors */
@@ -3546,7 +3576,7 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3546 drbd_conn_str(mdev->state.conn)); 3576 drbd_conn_str(mdev->state.conn));
3547 } 3577 }
3548 3578
3549 ok = TRUE; 3579 ok = true;
3550 out: 3580 out:
3551 drbd_bm_unlock(mdev); 3581 drbd_bm_unlock(mdev);
3552 if (ok && mdev->state.conn == C_WF_BITMAP_S) 3582 if (ok && mdev->state.conn == C_WF_BITMAP_S)
@@ -3555,17 +3585,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3555 return ok; 3585 return ok;
3556} 3586}
3557 3587
3558static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) 3588static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3559{ 3589{
3560 /* TODO zero copy sink :) */ 3590 /* TODO zero copy sink :) */
3561 static char sink[128]; 3591 static char sink[128];
3562 int size, want, r; 3592 int size, want, r;
3563 3593
3564 if (!silent) 3594 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3565 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", 3595 cmd, data_size);
3566 h->command, h->length);
3567 3596
3568 size = h->length; 3597 size = data_size;
3569 while (size > 0) { 3598 while (size > 0) {
3570 want = min_t(int, size, sizeof(sink)); 3599 want = min_t(int, size, sizeof(sink));
3571 r = drbd_recv(mdev, sink, want); 3600 r = drbd_recv(mdev, sink, want);
@@ -3575,130 +3604,126 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
3575 return size == 0; 3604 return size == 0;
3576} 3605}
3577 3606
3578static int receive_skip(struct drbd_conf *mdev, struct p_header *h) 3607static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3579{ 3608{
3580 return receive_skip_(mdev, h, 0); 3609 /* Make sure we've acked all the TCP data associated
3581} 3610 * with the data requests being unplugged */
3611 drbd_tcp_quickack(mdev->data.socket);
3582 3612
3583static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h) 3613 return true;
3584{
3585 return receive_skip_(mdev, h, 1);
3586} 3614}
3587 3615
3588static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) 3616static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3589{ 3617{
3590 if (mdev->state.disk >= D_INCONSISTENT) 3618 struct p_block_desc *p = &mdev->data.rbuf.block_desc;
3591 drbd_kick_lo(mdev);
3592 3619
3593 /* Make sure we've acked all the TCP data associated 3620 switch (mdev->state.conn) {
3594 * with the data requests being unplugged */ 3621 case C_WF_SYNC_UUID:
3595 drbd_tcp_quickack(mdev->data.socket); 3622 case C_WF_BITMAP_T:
3623 case C_BEHIND:
3624 break;
3625 default:
3626 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
3627 drbd_conn_str(mdev->state.conn));
3628 }
3596 3629
3597 return TRUE; 3630 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
3631
3632 return true;
3598} 3633}
3599 3634
3600typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); 3635typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
3601 3636
3602static drbd_cmd_handler_f drbd_default_handler[] = { 3637struct data_cmd {
3603 [P_DATA] = receive_Data, 3638 int expect_payload;
3604 [P_DATA_REPLY] = receive_DataReply, 3639 size_t pkt_size;
3605 [P_RS_DATA_REPLY] = receive_RSDataReply, 3640 drbd_cmd_handler_f function;
3606 [P_BARRIER] = receive_Barrier, 3641};
3607 [P_BITMAP] = receive_bitmap, 3642
3608 [P_COMPRESSED_BITMAP] = receive_bitmap, 3643static struct data_cmd drbd_cmd_handler[] = {
3609 [P_UNPLUG_REMOTE] = receive_UnplugRemote, 3644 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
3610 [P_DATA_REQUEST] = receive_DataRequest, 3645 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
3611 [P_RS_DATA_REQUEST] = receive_DataRequest, 3646 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3612 [P_SYNC_PARAM] = receive_SyncParam, 3647 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3613 [P_SYNC_PARAM89] = receive_SyncParam, 3648 [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3614 [P_PROTOCOL] = receive_protocol, 3649 [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3615 [P_UUIDS] = receive_uuids, 3650 [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3616 [P_SIZES] = receive_sizes, 3651 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3617 [P_STATE] = receive_state, 3652 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3618 [P_STATE_CHG_REQ] = receive_req_state, 3653 [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
3619 [P_SYNC_UUID] = receive_sync_uuid, 3654 [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
3620 [P_OV_REQUEST] = receive_DataRequest, 3655 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
3621 [P_OV_REPLY] = receive_DataRequest, 3656 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
3622 [P_CSUM_RS_REQUEST] = receive_DataRequest, 3657 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
3623 [P_DELAY_PROBE] = receive_skip_silent, 3658 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
3659 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
3660 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3661 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3662 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3663 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3664 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
3665 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
3624 /* anything missing from this table is in 3666 /* anything missing from this table is in
3625 * the asender_tbl, see get_asender_cmd */ 3667 * the asender_tbl, see get_asender_cmd */
3626 [P_MAX_CMD] = NULL, 3668 [P_MAX_CMD] = { 0, 0, NULL },
3627}; 3669};
3628 3670
3629 static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; 3671 /* All handler functions that expect a sub-header get that sub-header in
3630static drbd_cmd_handler_f *drbd_opt_cmd_handler; 3672 mdev->data.rbuf.header.head.payload.
3673
3674 Usually in mdev->data.rbuf.header.head the callback can find the usual
3675 p_header, but they may not rely on that, since there is also p_header95.
3676 */
3631 3677
3632static void drbdd(struct drbd_conf *mdev) 3678static void drbdd(struct drbd_conf *mdev)
3633{ 3679{
3634 drbd_cmd_handler_f handler; 3680 union p_header *header = &mdev->data.rbuf.header;
3635 struct p_header *header = &mdev->data.rbuf.header; 3681 unsigned int packet_size;
3682 enum drbd_packets cmd;
3683 size_t shs; /* sub header size */
3684 int rv;
3636 3685
3637 while (get_t_state(&mdev->receiver) == Running) { 3686 while (get_t_state(&mdev->receiver) == Running) {
3638 drbd_thread_current_set_cpu(mdev); 3687 drbd_thread_current_set_cpu(mdev);
3639 if (!drbd_recv_header(mdev, header)) { 3688 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3640 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3689 goto err_out;
3641 break;
3642 }
3643 3690
3644 if (header->command < P_MAX_CMD) 3691 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3645 handler = drbd_cmd_handler[header->command]; 3692 dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3646 else if (P_MAY_IGNORE < header->command 3693 goto err_out;
3647 && header->command < P_MAX_OPT_CMD) 3694 }
3648 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3649 else if (header->command > P_MAX_OPT_CMD)
3650 handler = receive_skip;
3651 else
3652 handler = NULL;
3653 3695
3654 if (unlikely(!handler)) { 3696 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
3655 dev_err(DEV, "unknown packet type %d, l: %d!\n", 3697 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3656 header->command, header->length); 3698 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3657 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3699 goto err_out;
3658 break;
3659 } 3700 }
3660 if (unlikely(!handler(mdev, header))) { 3701
3661 dev_err(DEV, "error receiving %s, l: %d!\n", 3702 if (shs) {
3662 cmdname(header->command), header->length); 3703 rv = drbd_recv(mdev, &header->h80.payload, shs);
3663 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3704 if (unlikely(rv != shs)) {
3664 break; 3705 if (!signal_pending(current))
3706 dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
3707 goto err_out;
3708 }
3665 } 3709 }
3666 }
3667}
3668 3710
3669static void drbd_fail_pending_reads(struct drbd_conf *mdev) 3711 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3670{
3671 struct hlist_head *slot;
3672 struct hlist_node *pos;
3673 struct hlist_node *tmp;
3674 struct drbd_request *req;
3675 int i;
3676 3712
3677 /* 3713 if (unlikely(!rv)) {
3678 * Application READ requests 3714 dev_err(DEV, "error receiving %s, l: %d!\n",
3679 */ 3715 cmdname(cmd), packet_size);
3680 spin_lock_irq(&mdev->req_lock); 3716 goto err_out;
3681 for (i = 0; i < APP_R_HSIZE; i++) {
3682 slot = mdev->app_reads_hash+i;
3683 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3684 /* it may (but should not any longer!)
3685 * be on the work queue; if that assert triggers,
3686 * we need to also grab the
3687 * spin_lock_irq(&mdev->data.work.q_lock);
3688 * and list_del_init here. */
3689 D_ASSERT(list_empty(&req->w.list));
3690 /* It would be nice to complete outside of spinlock.
3691 * But this is easier for now. */
3692 _req_mod(req, connection_lost_while_pending);
3693 } 3717 }
3694 } 3718 }
3695 for (i = 0; i < APP_R_HSIZE; i++)
3696 if (!hlist_empty(mdev->app_reads_hash+i))
3697 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3698 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3699 3719
3700 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); 3720 if (0) {
3701 spin_unlock_irq(&mdev->req_lock); 3721 err_out:
3722 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3723 }
3724 /* If we leave here, we probably want to update at least the
3725 * "Connected" indicator on stable storage. Do so explicitly here. */
3726 drbd_md_sync(mdev);
3702} 3727}
3703 3728
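drbdd() above becomes a table-driven dispatcher: each packet type carries its fixed sub-header size and whether a variable payload is allowed, and both are checked before the handler runs. A toy stand-alone version of that pattern (commands, sizes, and handlers are invented for illustration, not drbd's):

/* Toy table-driven packet dispatch; not drbd code. */
#include <stdio.h>
#include <stddef.h>

enum cmd { CMD_PING, CMD_DATA, CMD_MAX };

struct data_cmd {
	int expect_payload;
	size_t pkt_size;			/* fixed part, including header */
	int (*function)(unsigned int payload_len);
};

static int do_ping(unsigned int len) { printf("ping\n"); return 1; }
static int do_data(unsigned int len) { printf("data, %u payload bytes\n", len); return 1; }

static const struct data_cmd handlers[CMD_MAX] = {
	[CMD_PING] = { 0, 8,  do_ping },
	[CMD_DATA] = { 1, 24, do_data },
};

static int dispatch(unsigned int cmd, unsigned int packet_size)
{
	if (cmd >= CMD_MAX || !handlers[cmd].function)
		return 0;				/* unknown packet type */
	if (packet_size > handlers[cmd].pkt_size && !handlers[cmd].expect_payload)
		return 0;				/* unexpected payload */
	return handlers[cmd].function(packet_size - handlers[cmd].pkt_size);
}

int main(void)
{
	dispatch(CMD_PING, 8);
	dispatch(CMD_DATA, 24 + 4096);
	return 0;
}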
3704void drbd_flush_workqueue(struct drbd_conf *mdev) 3729void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -3711,6 +3736,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
3711 wait_for_completion(&barr.done); 3736 wait_for_completion(&barr.done);
3712} 3737}
3713 3738
3739void drbd_free_tl_hash(struct drbd_conf *mdev)
3740{
3741 struct hlist_head *h;
3742
3743 spin_lock_irq(&mdev->req_lock);
3744
3745 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3746 spin_unlock_irq(&mdev->req_lock);
3747 return;
3748 }
3749 /* paranoia code */
3750 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3751 if (h->first)
3752 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3753 (int)(h - mdev->ee_hash), h->first);
3754 kfree(mdev->ee_hash);
3755 mdev->ee_hash = NULL;
3756 mdev->ee_hash_s = 0;
3757
3758 /* paranoia code */
3759 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3760 if (h->first)
3761 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3762 (int)(h - mdev->tl_hash), h->first);
3763 kfree(mdev->tl_hash);
3764 mdev->tl_hash = NULL;
3765 mdev->tl_hash_s = 0;
3766 spin_unlock_irq(&mdev->req_lock);
3767}
3768
3714static void drbd_disconnect(struct drbd_conf *mdev) 3769static void drbd_disconnect(struct drbd_conf *mdev)
3715{ 3770{
3716 enum drbd_fencing_p fp; 3771 enum drbd_fencing_p fp;
@@ -3720,14 +3775,12 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3720 3775
3721 if (mdev->state.conn == C_STANDALONE) 3776 if (mdev->state.conn == C_STANDALONE)
3722 return; 3777 return;
3723 if (mdev->state.conn >= C_WF_CONNECTION)
3724 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3725 drbd_conn_str(mdev->state.conn));
3726 3778
3727 /* asender does not clean up anything. it must not interfere, either */ 3779 /* asender does not clean up anything. it must not interfere, either */
3728 drbd_thread_stop(&mdev->asender); 3780 drbd_thread_stop(&mdev->asender);
3729 drbd_free_sock(mdev); 3781 drbd_free_sock(mdev);
3730 3782
3783 /* wait for current activity to cease. */
3731 spin_lock_irq(&mdev->req_lock); 3784 spin_lock_irq(&mdev->req_lock);
3732 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 3785 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3733 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); 3786 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
@@ -3750,9 +3803,10 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3750 atomic_set(&mdev->rs_pending_cnt, 0); 3803 atomic_set(&mdev->rs_pending_cnt, 0);
3751 wake_up(&mdev->misc_wait); 3804 wake_up(&mdev->misc_wait);
3752 3805
3806 del_timer(&mdev->request_timer);
3807
3753 /* make sure syncer is stopped and w_resume_next_sg queued */ 3808 /* make sure syncer is stopped and w_resume_next_sg queued */
3754 del_timer_sync(&mdev->resync_timer); 3809 del_timer_sync(&mdev->resync_timer);
3755 set_bit(STOP_SYNC_TIMER, &mdev->flags);
3756 resync_timer_fn((unsigned long)mdev); 3810 resync_timer_fn((unsigned long)mdev);
3757 3811
3758 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, 3812 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
@@ -3767,11 +3821,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3767 kfree(mdev->p_uuid); 3821 kfree(mdev->p_uuid);
3768 mdev->p_uuid = NULL; 3822 mdev->p_uuid = NULL;
3769 3823
3770 if (!mdev->state.susp) 3824 if (!is_susp(mdev->state))
3771 tl_clear(mdev); 3825 tl_clear(mdev);
3772 3826
3773 drbd_fail_pending_reads(mdev);
3774
3775 dev_info(DEV, "Connection closed\n"); 3827 dev_info(DEV, "Connection closed\n");
3776 3828
3777 drbd_md_sync(mdev); 3829 drbd_md_sync(mdev);
@@ -3782,12 +3834,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3782 put_ldev(mdev); 3834 put_ldev(mdev);
3783 } 3835 }
3784 3836
3785 if (mdev->state.role == R_PRIMARY) { 3837 if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3786 if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { 3838 drbd_try_outdate_peer_async(mdev);
3787 enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
3788 drbd_request_state(mdev, NS(pdsk, nps));
3789 }
3790 }
3791 3839
3792 spin_lock_irq(&mdev->req_lock); 3840 spin_lock_irq(&mdev->req_lock);
3793 os = mdev->state; 3841 os = mdev->state;
@@ -3800,32 +3848,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3800 spin_unlock_irq(&mdev->req_lock); 3848 spin_unlock_irq(&mdev->req_lock);
3801 3849
3802 if (os.conn == C_DISCONNECTING) { 3850 if (os.conn == C_DISCONNECTING) {
3803 struct hlist_head *h; 3851 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3804 wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
3805
3806 /* we must not free the tl_hash
3807 * while application io is still on the fly */
3808 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
3809
3810 spin_lock_irq(&mdev->req_lock);
3811 /* paranoia code */
3812 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3813 if (h->first)
3814 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3815 (int)(h - mdev->ee_hash), h->first);
3816 kfree(mdev->ee_hash);
3817 mdev->ee_hash = NULL;
3818 mdev->ee_hash_s = 0;
3819
3820 /* paranoia code */
3821 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3822 if (h->first)
3823 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3824 (int)(h - mdev->tl_hash), h->first);
3825 kfree(mdev->tl_hash);
3826 mdev->tl_hash = NULL;
3827 mdev->tl_hash_s = 0;
3828 spin_unlock_irq(&mdev->req_lock);
3829 3852
3830 crypto_free_hash(mdev->cram_hmac_tfm); 3853 crypto_free_hash(mdev->cram_hmac_tfm);
3831 mdev->cram_hmac_tfm = NULL; 3854 mdev->cram_hmac_tfm = NULL;
@@ -3835,6 +3858,10 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3835 drbd_request_state(mdev, NS(conn, C_STANDALONE)); 3858 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3836 } 3859 }
3837 3860
3861 /* serialize with bitmap writeout triggered by the state change,
3862 * if any. */
3863 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
3864
3838 /* tcp_close and release of sendpage pages can be deferred. I don't 3865 /* tcp_close and release of sendpage pages can be deferred. I don't
3839 * want to use SO_LINGER, because apparently it can be deferred for 3866 * want to use SO_LINGER, because apparently it can be deferred for
3840 * more than 20 seconds (longest time I checked). 3867 * more than 20 seconds (longest time I checked).
@@ -3845,6 +3872,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3845 i = drbd_release_ee(mdev, &mdev->net_ee); 3872 i = drbd_release_ee(mdev, &mdev->net_ee);
3846 if (i) 3873 if (i)
3847 dev_info(DEV, "net_ee not empty, killed %u entries\n", i); 3874 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3875 i = atomic_read(&mdev->pp_in_use_by_net);
3876 if (i)
3877 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
3848 i = atomic_read(&mdev->pp_in_use); 3878 i = atomic_read(&mdev->pp_in_use);
3849 if (i) 3879 if (i)
3850 dev_info(DEV, "pp_in_use = %d, expected 0\n", i); 3880 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@ -3888,7 +3918,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
3888 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); 3918 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3889 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); 3919 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3890 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, 3920 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3891 (struct p_header *)p, sizeof(*p), 0 ); 3921 (struct p_header80 *)p, sizeof(*p), 0 );
3892 mutex_unlock(&mdev->data.mutex); 3922 mutex_unlock(&mdev->data.mutex);
3893 return ok; 3923 return ok;
3894} 3924}
@@ -3904,34 +3934,36 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
3904{ 3934{
3905 /* ASSERT current == mdev->receiver ... */ 3935 /* ASSERT current == mdev->receiver ... */
3906 struct p_handshake *p = &mdev->data.rbuf.handshake; 3936 struct p_handshake *p = &mdev->data.rbuf.handshake;
3907 const int expect = sizeof(struct p_handshake) 3937 const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
3908 -sizeof(struct p_header); 3938 unsigned int length;
3939 enum drbd_packets cmd;
3909 int rv; 3940 int rv;
3910 3941
3911 rv = drbd_send_handshake(mdev); 3942 rv = drbd_send_handshake(mdev);
3912 if (!rv) 3943 if (!rv)
3913 return 0; 3944 return 0;
3914 3945
3915 rv = drbd_recv_header(mdev, &p->head); 3946 rv = drbd_recv_header(mdev, &cmd, &length);
3916 if (!rv) 3947 if (!rv)
3917 return 0; 3948 return 0;
3918 3949
3919 if (p->head.command != P_HAND_SHAKE) { 3950 if (cmd != P_HAND_SHAKE) {
3920 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", 3951 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3921 cmdname(p->head.command), p->head.command); 3952 cmdname(cmd), cmd);
3922 return -1; 3953 return -1;
3923 } 3954 }
3924 3955
3925 if (p->head.length != expect) { 3956 if (length != expect) {
3926 dev_err(DEV, "expected HandShake length: %u, received: %u\n", 3957 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3927 expect, p->head.length); 3958 expect, length);
3928 return -1; 3959 return -1;
3929 } 3960 }
3930 3961
3931 rv = drbd_recv(mdev, &p->head.payload, expect); 3962 rv = drbd_recv(mdev, &p->head.payload, expect);
3932 3963
3933 if (rv != expect) { 3964 if (rv != expect) {
3934 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); 3965 if (!signal_pending(current))
3966 dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
3935 return 0; 3967 return 0;
3936 } 3968 }
3937 3969
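The hunk above switches drbd_do_handshake() from peeking at a struct p_header to using the command and length values that the reworked drbd_recv_header() now returns, so the advertised payload size is validated before anything is read from the socket. A minimal userspace sketch of that validate-before-read pattern follows; PKT_HAND_SHAKE, the sizes and the helper name are illustrative, not the DRBD wire protocol.

#include <stdio.h>

enum pkt_type { PKT_HAND_SHAKE = 0xfe1a };   /* placeholder command value */

/* Return 0 when it is safe to read 'expect' payload bytes, -1 on a
 * protocol violation (wrong command or wrong advertised length). */
static int check_handshake_header(unsigned int cmd, unsigned int length,
                                  unsigned int expect)
{
        if (cmd != PKT_HAND_SHAKE) {
                fprintf(stderr, "expected HandShake, got 0x%04x\n", cmd);
                return -1;
        }
        if (length != expect) {
                fprintf(stderr, "expected length %u, got %u\n", expect, length);
                return -1;
        }
        return 0;
}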
@@ -3982,10 +4014,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
3982 char *response = NULL; 4014 char *response = NULL;
3983 char *right_response = NULL; 4015 char *right_response = NULL;
3984 char *peers_ch = NULL; 4016 char *peers_ch = NULL;
3985 struct p_header p;
3986 unsigned int key_len = strlen(mdev->net_conf->shared_secret); 4017 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3987 unsigned int resp_size; 4018 unsigned int resp_size;
3988 struct hash_desc desc; 4019 struct hash_desc desc;
4020 enum drbd_packets cmd;
4021 unsigned int length;
3989 int rv; 4022 int rv;
3990 4023
3991 desc.tfm = mdev->cram_hmac_tfm; 4024 desc.tfm = mdev->cram_hmac_tfm;
@@ -4005,34 +4038,35 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4005 if (!rv) 4038 if (!rv)
4006 goto fail; 4039 goto fail;
4007 4040
4008 rv = drbd_recv_header(mdev, &p); 4041 rv = drbd_recv_header(mdev, &cmd, &length);
4009 if (!rv) 4042 if (!rv)
4010 goto fail; 4043 goto fail;
4011 4044
4012 if (p.command != P_AUTH_CHALLENGE) { 4045 if (cmd != P_AUTH_CHALLENGE) {
4013 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", 4046 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4014 cmdname(p.command), p.command); 4047 cmdname(cmd), cmd);
4015 rv = 0; 4048 rv = 0;
4016 goto fail; 4049 goto fail;
4017 } 4050 }
4018 4051
4019 if (p.length > CHALLENGE_LEN*2) { 4052 if (length > CHALLENGE_LEN * 2) {
4020 dev_err(DEV, "expected AuthChallenge payload too big.\n"); 4053 dev_err(DEV, "expected AuthChallenge payload too big.\n");
4021 rv = -1; 4054 rv = -1;
4022 goto fail; 4055 goto fail;
4023 } 4056 }
4024 4057
4025 peers_ch = kmalloc(p.length, GFP_NOIO); 4058 peers_ch = kmalloc(length, GFP_NOIO);
4026 if (peers_ch == NULL) { 4059 if (peers_ch == NULL) {
4027 dev_err(DEV, "kmalloc of peers_ch failed\n"); 4060 dev_err(DEV, "kmalloc of peers_ch failed\n");
4028 rv = -1; 4061 rv = -1;
4029 goto fail; 4062 goto fail;
4030 } 4063 }
4031 4064
4032 rv = drbd_recv(mdev, peers_ch, p.length); 4065 rv = drbd_recv(mdev, peers_ch, length);
4033 4066
4034 if (rv != p.length) { 4067 if (rv != length) {
4035 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); 4068 if (!signal_pending(current))
4069 dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
4036 rv = 0; 4070 rv = 0;
4037 goto fail; 4071 goto fail;
4038 } 4072 }
@@ -4046,7 +4080,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4046 } 4080 }
4047 4081
4048 sg_init_table(&sg, 1); 4082 sg_init_table(&sg, 1);
4049 sg_set_buf(&sg, peers_ch, p.length); 4083 sg_set_buf(&sg, peers_ch, length);
4050 4084
4051 rv = crypto_hash_digest(&desc, &sg, sg.length, response); 4085 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4052 if (rv) { 4086 if (rv) {
@@ -4059,18 +4093,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4059 if (!rv) 4093 if (!rv)
4060 goto fail; 4094 goto fail;
4061 4095
4062 rv = drbd_recv_header(mdev, &p); 4096 rv = drbd_recv_header(mdev, &cmd, &length);
4063 if (!rv) 4097 if (!rv)
4064 goto fail; 4098 goto fail;
4065 4099
4066 if (p.command != P_AUTH_RESPONSE) { 4100 if (cmd != P_AUTH_RESPONSE) {
4067 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", 4101 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
4068 cmdname(p.command), p.command); 4102 cmdname(cmd), cmd);
4069 rv = 0; 4103 rv = 0;
4070 goto fail; 4104 goto fail;
4071 } 4105 }
4072 4106
4073 if (p.length != resp_size) { 4107 if (length != resp_size) {
4074 dev_err(DEV, "expected AuthResponse payload of wrong size\n"); 4108 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4075 rv = 0; 4109 rv = 0;
4076 goto fail; 4110 goto fail;
@@ -4079,7 +4113,8 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4079 rv = drbd_recv(mdev, response , resp_size); 4113 rv = drbd_recv(mdev, response , resp_size);
4080 4114
4081 if (rv != resp_size) { 4115 if (rv != resp_size) {
4082 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); 4116 if (!signal_pending(current))
4117 dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
4083 rv = 0; 4118 rv = 0;
4084 goto fail; 4119 goto fail;
4085 } 4120 }
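The drbd_do_auth() hunks above keep the same overall shape while moving to the (cmd, length) pair from drbd_recv_header(): the challenge length is capped before any allocation, and short reads are only warned about when no signal is pending. Below is a hedged userspace sketch of the cap-then-allocate-then-read step; CHALLENGE_LEN and recv_exact() are stand-ins, not the DRBD definitions.

#include <stdlib.h>

#define CHALLENGE_LEN 64                      /* illustrative cap */

extern int recv_exact(void *buf, unsigned int len);   /* hypothetical helper */

static void *read_challenge(unsigned int length)
{
        void *peers_ch;

        if (length > CHALLENGE_LEN * 2)       /* refuse oversized challenges */
                return NULL;
        peers_ch = malloc(length);
        if (!peers_ch)
                return NULL;
        if (recv_exact(peers_ch, length) != 0) {
                free(peers_ch);               /* short read: give up cleanly */
                return NULL;
        }
        return peers_ch;
}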
@@ -4131,8 +4166,7 @@ int drbdd_init(struct drbd_thread *thi)
4131 h = drbd_connect(mdev); 4166 h = drbd_connect(mdev);
4132 if (h == 0) { 4167 if (h == 0) {
4133 drbd_disconnect(mdev); 4168 drbd_disconnect(mdev);
4134 __set_current_state(TASK_INTERRUPTIBLE); 4169 schedule_timeout_interruptible(HZ);
4135 schedule_timeout(HZ);
4136 } 4170 }
4137 if (h == -1) { 4171 if (h == -1) {
4138 dev_warn(DEV, "Discarding network configuration.\n"); 4172 dev_warn(DEV, "Discarding network configuration.\n");
@@ -4155,7 +4189,7 @@ int drbdd_init(struct drbd_thread *thi)
4155 4189
4156/* ********* acknowledge sender ******** */ 4190/* ********* acknowledge sender ******** */
4157 4191
4158static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) 4192static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
4159{ 4193{
4160 struct p_req_state_reply *p = (struct p_req_state_reply *)h; 4194 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4161 4195
@@ -4170,26 +4204,26 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4170 } 4204 }
4171 wake_up(&mdev->state_wait); 4205 wake_up(&mdev->state_wait);
4172 4206
4173 return TRUE; 4207 return true;
4174} 4208}
4175 4209
4176static int got_Ping(struct drbd_conf *mdev, struct p_header *h) 4210static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
4177{ 4211{
4178 return drbd_send_ping_ack(mdev); 4212 return drbd_send_ping_ack(mdev);
4179 4213
4180} 4214}
4181 4215
4182static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) 4216static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
4183{ 4217{
4184 /* restore idle timeout */ 4218 /* restore idle timeout */
4185 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; 4219 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4186 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) 4220 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4187 wake_up(&mdev->misc_wait); 4221 wake_up(&mdev->misc_wait);
4188 4222
4189 return TRUE; 4223 return true;
4190} 4224}
4191 4225
4192static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) 4226static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
4193{ 4227{
4194 struct p_block_ack *p = (struct p_block_ack *)h; 4228 struct p_block_ack *p = (struct p_block_ack *)h;
4195 sector_t sector = be64_to_cpu(p->sector); 4229 sector_t sector = be64_to_cpu(p->sector);
@@ -4199,13 +4233,17 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4199 4233
4200 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4234 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4201 4235
4202 drbd_rs_complete_io(mdev, sector); 4236 if (get_ldev(mdev)) {
4203 drbd_set_in_sync(mdev, sector, blksize); 4237 drbd_rs_complete_io(mdev, sector);
4204 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ 4238 drbd_set_in_sync(mdev, sector, blksize);
4205 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); 4239 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4240 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4241 put_ldev(mdev);
4242 }
4206 dec_rs_pending(mdev); 4243 dec_rs_pending(mdev);
4244 atomic_add(blksize >> 9, &mdev->rs_sect_in);
4207 4245
4208 return TRUE; 4246 return true;
4209} 4247}
4210 4248
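got_IsInSync() above now only touches the resync bookkeeping while holding a reference on the local disk (get_ldev()/put_ldev()) and additionally feeds the acknowledged amount into rs_sect_in. The blksize >> 9 expression converts bytes to 512-byte sectors; a trivial sketch of that conversion, for reference:

/* bytes -> 512-byte sectors, as used for the rs_sect_in accounting.
 * 4096 >> 9 == 8, i.e. one 4 KiB block is eight sectors. */
static inline unsigned int bytes_to_sectors(unsigned int bytes)
{
        return bytes >> 9;
}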
4211/* when we receive the ACK for a write request, 4249/* when we receive the ACK for a write request,
@@ -4217,7 +4255,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4217 struct hlist_node *n; 4255 struct hlist_node *n;
4218 struct drbd_request *req; 4256 struct drbd_request *req;
4219 4257
4220 hlist_for_each_entry(req, n, slot, colision) { 4258 hlist_for_each_entry(req, n, slot, collision) {
4221 if ((unsigned long)req == (unsigned long)id) { 4259 if ((unsigned long)req == (unsigned long)id) {
4222 if (req->sector != sector) { 4260 if (req->sector != sector) {
4223 dev_err(DEV, "_ack_id_to_req: found req %p but it has " 4261 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
@@ -4229,8 +4267,6 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4229 return req; 4267 return req;
4230 } 4268 }
4231 } 4269 }
4232 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4233 (void *)(unsigned long)id, (unsigned long long)sector);
4234 return NULL; 4270 return NULL;
4235} 4271}
4236 4272
@@ -4248,18 +4284,20 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
4248 req = validator(mdev, id, sector); 4284 req = validator(mdev, id, sector);
4249 if (unlikely(!req)) { 4285 if (unlikely(!req)) {
4250 spin_unlock_irq(&mdev->req_lock); 4286 spin_unlock_irq(&mdev->req_lock);
4251 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); 4287
4252 return FALSE; 4288 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
4289 (void *)(unsigned long)id, (unsigned long long)sector);
4290 return false;
4253 } 4291 }
4254 __req_mod(req, what, &m); 4292 __req_mod(req, what, &m);
4255 spin_unlock_irq(&mdev->req_lock); 4293 spin_unlock_irq(&mdev->req_lock);
4256 4294
4257 if (m.bio) 4295 if (m.bio)
4258 complete_master_bio(mdev, &m); 4296 complete_master_bio(mdev, &m);
4259 return TRUE; 4297 return true;
4260} 4298}
4261 4299
4262static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) 4300static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4263{ 4301{
4264 struct p_block_ack *p = (struct p_block_ack *)h; 4302 struct p_block_ack *p = (struct p_block_ack *)h;
4265 sector_t sector = be64_to_cpu(p->sector); 4303 sector_t sector = be64_to_cpu(p->sector);
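The hunk above moves the "failed to find req" diagnostic out of _ack_id_to_req() and into validate_req_change_req_state(), which keeps one pattern in a single place: look the request up under req_lock, apply the state transition while still holding the lock, and complete the master bio only after dropping it. A hedged pthread-based sketch of that shape, with opaque stand-in types rather than the DRBD ones:

#include <pthread.h>
#include <stdbool.h>

struct req;                         /* opaque stand-in for drbd_request */

static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;

static bool change_req_state(struct req *(*find)(unsigned long id),
                             void (*transition)(struct req *req),
                             void (*complete)(struct req *req),
                             unsigned long id)
{
        struct req *req;

        pthread_mutex_lock(&req_lock);
        req = find(id);             /* validator runs under the lock */
        if (!req) {
                pthread_mutex_unlock(&req_lock);
                return false;       /* stale or corrupt block_id */
        }
        transition(req);            /* __req_mod() equivalent */
        pthread_mutex_unlock(&req_lock);
        complete(req);              /* finish the master bio outside the lock */
        return true;
}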
@@ -4271,7 +4309,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4271 if (is_syncer_block_id(p->block_id)) { 4309 if (is_syncer_block_id(p->block_id)) {
4272 drbd_set_in_sync(mdev, sector, blksize); 4310 drbd_set_in_sync(mdev, sector, blksize);
4273 dec_rs_pending(mdev); 4311 dec_rs_pending(mdev);
4274 return TRUE; 4312 return true;
4275 } 4313 }
4276 switch (be16_to_cpu(h->command)) { 4314 switch (be16_to_cpu(h->command)) {
4277 case P_RS_WRITE_ACK: 4315 case P_RS_WRITE_ACK:
@@ -4292,34 +4330,58 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4292 break; 4330 break;
4293 default: 4331 default:
4294 D_ASSERT(0); 4332 D_ASSERT(0);
4295 return FALSE; 4333 return false;
4296 } 4334 }
4297 4335
4298 return validate_req_change_req_state(mdev, p->block_id, sector, 4336 return validate_req_change_req_state(mdev, p->block_id, sector,
4299 _ack_id_to_req, __func__ , what); 4337 _ack_id_to_req, __func__ , what);
4300} 4338}
4301 4339
4302static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) 4340static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
4303{ 4341{
4304 struct p_block_ack *p = (struct p_block_ack *)h; 4342 struct p_block_ack *p = (struct p_block_ack *)h;
4305 sector_t sector = be64_to_cpu(p->sector); 4343 sector_t sector = be64_to_cpu(p->sector);
4306 4344 int size = be32_to_cpu(p->blksize);
4307 if (__ratelimit(&drbd_ratelimit_state)) 4345 struct drbd_request *req;
4308 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); 4346 struct bio_and_error m;
4309 4347
4310 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4348 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4311 4349
4312 if (is_syncer_block_id(p->block_id)) { 4350 if (is_syncer_block_id(p->block_id)) {
4313 int size = be32_to_cpu(p->blksize);
4314 dec_rs_pending(mdev); 4351 dec_rs_pending(mdev);
4315 drbd_rs_failed_io(mdev, sector, size); 4352 drbd_rs_failed_io(mdev, sector, size);
4316 return TRUE; 4353 return true;
4317 } 4354 }
4318 return validate_req_change_req_state(mdev, p->block_id, sector, 4355
4319 _ack_id_to_req, __func__ , neg_acked); 4356 spin_lock_irq(&mdev->req_lock);
4357 req = _ack_id_to_req(mdev, p->block_id, sector);
4358 if (!req) {
4359 spin_unlock_irq(&mdev->req_lock);
4360 if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
4361 mdev->net_conf->wire_protocol == DRBD_PROT_B) {
4362 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
4363 The master bio might already be completed, therefore the
4364 request is no longer in the collision hash.
4365 => Do not try to validate block_id as request. */
4366 /* In Protocol B we might already have got a P_RECV_ACK
 4367 but then get a P_NEG_ACK afterwards. */
4368 drbd_set_out_of_sync(mdev, sector, size);
4369 return true;
4370 } else {
4371 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
4372 (void *)(unsigned long)p->block_id, (unsigned long long)sector);
4373 return false;
4374 }
4375 }
4376 __req_mod(req, neg_acked, &m);
4377 spin_unlock_irq(&mdev->req_lock);
4378
4379 if (m.bio)
4380 complete_master_bio(mdev, &m);
4381 return true;
4320} 4382}
4321 4383
4322static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) 4384static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
4323{ 4385{
4324 struct p_block_ack *p = (struct p_block_ack *)h; 4386 struct p_block_ack *p = (struct p_block_ack *)h;
4325 sector_t sector = be64_to_cpu(p->sector); 4387 sector_t sector = be64_to_cpu(p->sector);
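The rewritten got_NegAck() above distinguishes by wire protocol when the request can no longer be found in the collision hash: under protocols A and B that is expected (the master bio may already be completed, or a P_RECV_ACK may have preceded the P_NEG_ACK), so the range is simply marked out of sync; otherwise it is reported as an error. A compact sketch of that decision, with illustrative names rather than the kernel's constants:

#include <stdbool.h>

enum wire_proto { PROT_A = 1, PROT_B = 2, PROT_C = 3 };

/* true: fall back to marking the sector range out of sync;
 * false: the request should have been found, report a protocol error. */
static bool neg_ack_without_request_ok(enum wire_proto proto)
{
        return proto == PROT_A || proto == PROT_B;
}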
@@ -4332,7 +4394,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4332 _ar_id_to_req, __func__ , neg_acked); 4394 _ar_id_to_req, __func__ , neg_acked);
4333} 4395}
4334 4396
4335static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) 4397static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
4336{ 4398{
4337 sector_t sector; 4399 sector_t sector;
4338 int size; 4400 int size;
@@ -4347,23 +4409,39 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4347 4409
4348 if (get_ldev_if_state(mdev, D_FAILED)) { 4410 if (get_ldev_if_state(mdev, D_FAILED)) {
4349 drbd_rs_complete_io(mdev, sector); 4411 drbd_rs_complete_io(mdev, sector);
4350 drbd_rs_failed_io(mdev, sector, size); 4412 switch (be16_to_cpu(h->command)) {
4413 case P_NEG_RS_DREPLY:
4414 drbd_rs_failed_io(mdev, sector, size);
4415 case P_RS_CANCEL:
4416 break;
4417 default:
4418 D_ASSERT(0);
4419 put_ldev(mdev);
4420 return false;
4421 }
4351 put_ldev(mdev); 4422 put_ldev(mdev);
4352 } 4423 }
4353 4424
4354 return TRUE; 4425 return true;
4355} 4426}
4356 4427
4357static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) 4428static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4358{ 4429{
4359 struct p_barrier_ack *p = (struct p_barrier_ack *)h; 4430 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4360 4431
4361 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); 4432 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4362 4433
4363 return TRUE; 4434 if (mdev->state.conn == C_AHEAD &&
4435 atomic_read(&mdev->ap_in_flight) == 0 &&
4436 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
4437 mdev->start_resync_timer.expires = jiffies + HZ;
4438 add_timer(&mdev->start_resync_timer);
4439 }
4440
4441 return true;
4364} 4442}
4365 4443
4366static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) 4444static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4367{ 4445{
4368 struct p_block_ack *p = (struct p_block_ack *)h; 4446 struct p_block_ack *p = (struct p_block_ack *)h;
4369 struct drbd_work *w; 4447 struct drbd_work *w;
@@ -4380,10 +4458,19 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4380 else 4458 else
4381 ov_oos_print(mdev); 4459 ov_oos_print(mdev);
4382 4460
4461 if (!get_ldev(mdev))
4462 return true;
4463
4383 drbd_rs_complete_io(mdev, sector); 4464 drbd_rs_complete_io(mdev, sector);
4384 dec_rs_pending(mdev); 4465 dec_rs_pending(mdev);
4385 4466
4386 if (--mdev->ov_left == 0) { 4467 --mdev->ov_left;
4468
4469 /* let's advance progress step marks only for every other megabyte */
4470 if ((mdev->ov_left & 0x200) == 0x200)
4471 drbd_advance_rs_marks(mdev, mdev->ov_left);
4472
4473 if (mdev->ov_left == 0) {
4387 w = kmalloc(sizeof(*w), GFP_NOIO); 4474 w = kmalloc(sizeof(*w), GFP_NOIO);
4388 if (w) { 4475 if (w) {
4389 w->cb = w_ov_finished; 4476 w->cb = w_ov_finished;
@@ -4394,18 +4481,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4394 drbd_resync_finished(mdev); 4481 drbd_resync_finished(mdev);
4395 } 4482 }
4396 } 4483 }
4397 return TRUE; 4484 put_ldev(mdev);
4485 return true;
4398} 4486}
4399 4487
4400static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h) 4488static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
4401{ 4489{
4402 /* IGNORE */ 4490 return true;
4403 return TRUE;
4404} 4491}
4405 4492
4406struct asender_cmd { 4493struct asender_cmd {
4407 size_t pkt_size; 4494 size_t pkt_size;
4408 int (*process)(struct drbd_conf *mdev, struct p_header *h); 4495 int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
4409}; 4496};
4410 4497
4411static struct asender_cmd *get_asender_cmd(int cmd) 4498static struct asender_cmd *get_asender_cmd(int cmd)
@@ -4414,8 +4501,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4414 /* anything missing from this table is in 4501 /* anything missing from this table is in
4415 * the drbd_cmd_handler (drbd_default_handler) table, 4502 * the drbd_cmd_handler (drbd_default_handler) table,
4416 * see the beginning of drbdd() */ 4503 * see the beginning of drbdd() */
4417 [P_PING] = { sizeof(struct p_header), got_Ping }, 4504 [P_PING] = { sizeof(struct p_header80), got_Ping },
4418 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, 4505 [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
4419 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4506 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4420 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4507 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4421 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4508 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -4427,7 +4514,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4427 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 4514 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4428 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4515 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4429 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4516 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4430 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m }, 4517 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
4518 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
4431 [P_MAX_CMD] = { 0, NULL }, 4519 [P_MAX_CMD] = { 0, NULL },
4432 }; 4520 };
4433 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4521 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
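The table above maps each meta-socket packet type to its expected on-wire size and handler, and the lookup rejects anything past P_MAX_CMD or without a handler. A self-contained sketch of the same dispatch-table pattern; the types, sizes and handlers are made up for illustration.

#include <stddef.h>

struct cmd_entry {
        size_t pkt_size;                      /* full expected packet size */
        int (*process)(void *ctx, void *hdr); /* returns nonzero on success */
};

enum { CMD_PING, CMD_PING_ACK, CMD_MAX };

static int handle_ping(void *ctx, void *hdr)     { return 1; }
static int handle_ping_ack(void *ctx, void *hdr) { return 1; }

static const struct cmd_entry cmd_tbl[] = {
        [CMD_PING]     = { 8, handle_ping },
        [CMD_PING_ACK] = { 8, handle_ping_ack },
        [CMD_MAX]      = { 0, NULL },
};

static const struct cmd_entry *lookup_cmd(unsigned int cmd)
{
        if (cmd > CMD_MAX || cmd_tbl[cmd].process == NULL)
                return NULL;                  /* unknown or unhandled type */
        return &cmd_tbl[cmd];
}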
@@ -4438,14 +4526,15 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4438int drbd_asender(struct drbd_thread *thi) 4526int drbd_asender(struct drbd_thread *thi)
4439{ 4527{
4440 struct drbd_conf *mdev = thi->mdev; 4528 struct drbd_conf *mdev = thi->mdev;
4441 struct p_header *h = &mdev->meta.rbuf.header; 4529 struct p_header80 *h = &mdev->meta.rbuf.header.h80;
4442 struct asender_cmd *cmd = NULL; 4530 struct asender_cmd *cmd = NULL;
4443 4531
4444 int rv, len; 4532 int rv, len;
4445 void *buf = h; 4533 void *buf = h;
4446 int received = 0; 4534 int received = 0;
4447 int expect = sizeof(struct p_header); 4535 int expect = sizeof(struct p_header80);
4448 int empty; 4536 int empty;
4537 int ping_timeout_active = 0;
4449 4538
4450 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); 4539 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4451 4540
@@ -4458,6 +4547,7 @@ int drbd_asender(struct drbd_thread *thi)
4458 ERR_IF(!drbd_send_ping(mdev)) goto reconnect; 4547 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4459 mdev->meta.socket->sk->sk_rcvtimeo = 4548 mdev->meta.socket->sk->sk_rcvtimeo =
4460 mdev->net_conf->ping_timeo*HZ/10; 4549 mdev->net_conf->ping_timeo*HZ/10;
4550 ping_timeout_active = 1;
4461 } 4551 }
4462 4552
4463 /* conditionally cork; 4553 /* conditionally cork;
@@ -4468,10 +4558,8 @@ int drbd_asender(struct drbd_thread *thi)
4468 while (1) { 4558 while (1) {
4469 clear_bit(SIGNAL_ASENDER, &mdev->flags); 4559 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4470 flush_signals(current); 4560 flush_signals(current);
4471 if (!drbd_process_done_ee(mdev)) { 4561 if (!drbd_process_done_ee(mdev))
4472 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4473 goto reconnect; 4562 goto reconnect;
4474 }
4475 /* to avoid race with newly queued ACKs */ 4563 /* to avoid race with newly queued ACKs */
4476 set_bit(SIGNAL_ASENDER, &mdev->flags); 4564 set_bit(SIGNAL_ASENDER, &mdev->flags);
4477 spin_lock_irq(&mdev->req_lock); 4565 spin_lock_irq(&mdev->req_lock);
@@ -4514,8 +4602,12 @@ int drbd_asender(struct drbd_thread *thi)
4514 dev_err(DEV, "meta connection shut down by peer.\n"); 4602 dev_err(DEV, "meta connection shut down by peer.\n");
4515 goto reconnect; 4603 goto reconnect;
4516 } else if (rv == -EAGAIN) { 4604 } else if (rv == -EAGAIN) {
4517 if (mdev->meta.socket->sk->sk_rcvtimeo == 4605 /* If the data socket received something meanwhile,
4518 mdev->net_conf->ping_timeo*HZ/10) { 4606 * that is good enough: peer is still alive. */
4607 if (time_after(mdev->last_received,
4608 jiffies - mdev->meta.socket->sk->sk_rcvtimeo))
4609 continue;
4610 if (ping_timeout_active) {
4519 dev_err(DEV, "PingAck did not arrive in time.\n"); 4611 dev_err(DEV, "PingAck did not arrive in time.\n");
4520 goto reconnect; 4612 goto reconnect;
4521 } 4613 }
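The -EAGAIN branch above no longer infers a pending ping from the current socket timeout value; it keeps an explicit ping_timeout_active flag and, before escalating, checks whether anything arrived on the connection within the last receive-timeout window. The jiffies comparison uses time_after(), which is wrap-around safe. A minimal sketch of that liveness test; the macro mirrors the kernel's definition, the helper name is made up.

#include <stdbool.h>

/* wrap-around safe "a is after b", as in include/linux/jiffies.h */
#define time_after(a, b) ((long)((b) - (a)) < 0)

static bool peer_looks_alive(unsigned long last_received,
                             unsigned long now, unsigned long rcvtimeo)
{
        /* anything received within one timeout window counts as life */
        return time_after(last_received, now - rcvtimeo);
}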
@@ -4530,31 +4622,39 @@ int drbd_asender(struct drbd_thread *thi)
4530 4622
4531 if (received == expect && cmd == NULL) { 4623 if (received == expect && cmd == NULL) {
4532 if (unlikely(h->magic != BE_DRBD_MAGIC)) { 4624 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4533 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", 4625 dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
4534 (long)be32_to_cpu(h->magic), 4626 be32_to_cpu(h->magic),
4535 h->command, h->length); 4627 be16_to_cpu(h->command),
4628 be16_to_cpu(h->length));
4536 goto reconnect; 4629 goto reconnect;
4537 } 4630 }
4538 cmd = get_asender_cmd(be16_to_cpu(h->command)); 4631 cmd = get_asender_cmd(be16_to_cpu(h->command));
4539 len = be16_to_cpu(h->length); 4632 len = be16_to_cpu(h->length);
4540 if (unlikely(cmd == NULL)) { 4633 if (unlikely(cmd == NULL)) {
4541 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", 4634 dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
4542 (long)be32_to_cpu(h->magic), 4635 be32_to_cpu(h->magic),
4543 h->command, h->length); 4636 be16_to_cpu(h->command),
4637 be16_to_cpu(h->length));
4544 goto disconnect; 4638 goto disconnect;
4545 } 4639 }
4546 expect = cmd->pkt_size; 4640 expect = cmd->pkt_size;
4547 ERR_IF(len != expect-sizeof(struct p_header)) 4641 ERR_IF(len != expect-sizeof(struct p_header80))
4548 goto reconnect; 4642 goto reconnect;
4549 } 4643 }
4550 if (received == expect) { 4644 if (received == expect) {
4645 mdev->last_received = jiffies;
4551 D_ASSERT(cmd != NULL); 4646 D_ASSERT(cmd != NULL);
4552 if (!cmd->process(mdev, h)) 4647 if (!cmd->process(mdev, h))
4553 goto reconnect; 4648 goto reconnect;
4554 4649
4650 /* the idle_timeout (ping-int)
4651 * has been restored in got_PingAck() */
4652 if (cmd == get_asender_cmd(P_PING_ACK))
4653 ping_timeout_active = 0;
4654
4555 buf = h; 4655 buf = h;
4556 received = 0; 4656 received = 0;
4557 expect = sizeof(struct p_header); 4657 expect = sizeof(struct p_header80);
4558 cmd = NULL; 4658 cmd = NULL;
4559 } 4659 }
4560 } 4660 }
@@ -4562,10 +4662,12 @@ int drbd_asender(struct drbd_thread *thi)
4562 if (0) { 4662 if (0) {
4563reconnect: 4663reconnect:
4564 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); 4664 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4665 drbd_md_sync(mdev);
4565 } 4666 }
4566 if (0) { 4667 if (0) {
4567disconnect: 4668disconnect:
4568 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 4669 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4670 drbd_md_sync(mdev);
4569 } 4671 }
4570 clear_bit(SIGNAL_ASENDER, &mdev->flags); 4672 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4571 4673
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f761d98a4e90..3424d675b769 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) 59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
60{ 60{
61 const unsigned long s = req->rq_state; 61 const unsigned long s = req->rq_state;
62
63 /* remove it from the transfer log.
64 * well, only if it had been there in the first
65 * place... if it had not (local only or conflicting
66 * and never sent), it should still be "empty" as
67 * initialized in drbd_req_new(), so we can list_del() it
68 * here unconditionally */
69 list_del(&req->tl_requests);
70
62 /* if it was a write, we may have to set the corresponding 71 /* if it was a write, we may have to set the corresponding
63 * bit(s) out-of-sync first. If it had a local part, we need to 72 * bit(s) out-of-sync first. If it had a local part, we need to
64 * release the reference to the activity log. */ 73 * release the reference to the activity log. */
65 if (rw == WRITE) { 74 if (rw == WRITE) {
66 /* remove it from the transfer log.
67 * well, only if it had been there in the first
68 * place... if it had not (local only or conflicting
69 * and never sent), it should still be "empty" as
70 * initialized in drbd_req_new(), so we can list_del() it
71 * here unconditionally */
72 list_del(&req->tl_requests);
73 /* Set out-of-sync unless both OK flags are set 75 /* Set out-of-sync unless both OK flags are set
74 * (local only or remote failed). 76 * (local only or remote failed).
75 * Other places where we set out-of-sync: 77 * Other places where we set out-of-sync:
@@ -92,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
92 */ 94 */
93 if (s & RQ_LOCAL_MASK) { 95 if (s & RQ_LOCAL_MASK) {
94 if (get_ldev_if_state(mdev, D_FAILED)) { 96 if (get_ldev_if_state(mdev, D_FAILED)) {
95 drbd_al_complete_io(mdev, req->sector); 97 if (s & RQ_IN_ACT_LOG)
98 drbd_al_complete_io(mdev, req->sector);
96 put_ldev(mdev); 99 put_ldev(mdev);
97 } else if (__ratelimit(&drbd_ratelimit_state)) { 100 } else if (__ratelimit(&drbd_ratelimit_state)) {
98 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " 101 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
@@ -137,9 +140,14 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
137 struct hlist_node *n; 140 struct hlist_node *n;
138 struct hlist_head *slot; 141 struct hlist_head *slot;
139 142
140 /* before we can signal completion to the upper layers, 143 /* Before we can signal completion to the upper layers,
141 * we may need to close the current epoch */ 144 * we may need to close the current epoch.
145 * We can skip this, if this request has not even been sent, because we
146 * did not have a fully established connection yet/anymore, during
147 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
148 */
142 if (mdev->state.conn >= C_CONNECTED && 149 if (mdev->state.conn >= C_CONNECTED &&
150 (s & RQ_NET_SENT) != 0 &&
143 req->epoch == mdev->newest_tle->br_number) 151 req->epoch == mdev->newest_tle->br_number)
144 queue_barrier(mdev); 152 queue_barrier(mdev);
145 153
@@ -155,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
155 * they must have been failed on the spot */ 163 * they must have been failed on the spot */
156#define OVERLAPS overlaps(sector, size, i->sector, i->size) 164#define OVERLAPS overlaps(sector, size, i->sector, i->size)
157 slot = tl_hash_slot(mdev, sector); 165 slot = tl_hash_slot(mdev, sector);
158 hlist_for_each_entry(i, n, slot, colision) { 166 hlist_for_each_entry(i, n, slot, collision) {
159 if (OVERLAPS) { 167 if (OVERLAPS) {
160 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " 168 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
161 "other: %p %llus +%u\n", 169 "other: %p %llus +%u\n",
@@ -179,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
179#undef OVERLAPS 187#undef OVERLAPS
180#define OVERLAPS overlaps(sector, size, e->sector, e->size) 188#define OVERLAPS overlaps(sector, size, e->sector, e->size)
181 slot = ee_hash_slot(mdev, req->sector); 189 slot = ee_hash_slot(mdev, req->sector);
182 hlist_for_each_entry(e, n, slot, colision) { 190 hlist_for_each_entry(e, n, slot, collision) {
183 if (OVERLAPS) { 191 if (OVERLAPS) {
184 wake_up(&mdev->misc_wait); 192 wake_up(&mdev->misc_wait);
185 break; 193 break;
@@ -252,10 +260,10 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
252 260
253 /* remove the request from the conflict detection 261 /* remove the request from the conflict detection
254 * respective block_id verification hash */ 262 * respective block_id verification hash */
255 if (!hlist_unhashed(&req->colision)) 263 if (!hlist_unhashed(&req->collision))
256 hlist_del(&req->colision); 264 hlist_del(&req->collision);
257 else 265 else
258 D_ASSERT((s & RQ_NET_MASK) == 0); 266 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
259 267
260 /* for writes we need to do some extra housekeeping */ 268 /* for writes we need to do some extra housekeeping */
261 if (rw == WRITE) 269 if (rw == WRITE)
@@ -280,6 +288,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
280 * protocol A or B, barrier ack still pending... */ 288 * protocol A or B, barrier ack still pending... */
281} 289}
282 290
291static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
292{
293 struct drbd_conf *mdev = req->mdev;
294
295 if (!is_susp(mdev->state))
296 _req_may_be_done(req, m);
297}
298
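The new helper above gates request completion on the suspend state; most transitions in __req_mod() go through it, while the few that may complete during suspension call _req_may_be_done() directly and are marked "Allowed while state.susp" in the hunks below. A bare sketch of the gate, with stand-in types and a hypothetical completion function:

struct req_ctx;                                /* opaque stand-in */

extern void req_may_be_done(struct req_ctx *req);   /* hypothetical */

static void req_may_be_done_not_susp(struct req_ctx *req, int suspended)
{
        if (!suspended)
                req_may_be_done(req);          /* defer while I/O is suspended */
}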
283/* 299/*
284 * checks whether there was an overlapping request 300 * checks whether there was an overlapping request
285 * or ee already registered. 301 * or ee already registered.
@@ -313,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req)
313 struct hlist_node *n; 329 struct hlist_node *n;
314 struct hlist_head *slot; 330 struct hlist_head *slot;
315 331
316 D_ASSERT(hlist_unhashed(&req->colision)); 332 D_ASSERT(hlist_unhashed(&req->collision));
317 333
318 if (!get_net_conf(mdev)) 334 if (!get_net_conf(mdev))
319 return 0; 335 return 0;
@@ -325,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req)
325 341
326#define OVERLAPS overlaps(i->sector, i->size, sector, size) 342#define OVERLAPS overlaps(i->sector, i->size, sector, size)
327 slot = tl_hash_slot(mdev, sector); 343 slot = tl_hash_slot(mdev, sector);
328 hlist_for_each_entry(i, n, slot, colision) { 344 hlist_for_each_entry(i, n, slot, collision) {
329 if (OVERLAPS) { 345 if (OVERLAPS) {
330 dev_alert(DEV, "%s[%u] Concurrent local write detected! " 346 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
331 "[DISCARD L] new: %llus +%u; " 347 "[DISCARD L] new: %llus +%u; "
@@ -343,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req)
343#undef OVERLAPS 359#undef OVERLAPS
344#define OVERLAPS overlaps(e->sector, e->size, sector, size) 360#define OVERLAPS overlaps(e->sector, e->size, sector, size)
345 slot = ee_hash_slot(mdev, sector); 361 slot = ee_hash_slot(mdev, sector);
346 hlist_for_each_entry(e, n, slot, colision) { 362 hlist_for_each_entry(e, n, slot, collision) {
347 if (OVERLAPS) { 363 if (OVERLAPS) {
348 dev_alert(DEV, "%s[%u] Concurrent remote write detected!" 364 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
349 " [DISCARD L] new: %llus +%u; " 365 " [DISCARD L] new: %llus +%u; "
@@ -380,10 +396,11 @@ out_conflict:
380 * and it enforces that we have to think in a very structured manner 396 * and it enforces that we have to think in a very structured manner
381 * about the "events" that may happen to a request during its life time ... 397 * about the "events" that may happen to a request during its life time ...
382 */ 398 */
383void __req_mod(struct drbd_request *req, enum drbd_req_event what, 399int __req_mod(struct drbd_request *req, enum drbd_req_event what,
384 struct bio_and_error *m) 400 struct bio_and_error *m)
385{ 401{
386 struct drbd_conf *mdev = req->mdev; 402 struct drbd_conf *mdev = req->mdev;
403 int rv = 0;
387 m->bio = NULL; 404 m->bio = NULL;
388 405
389 switch (what) { 406 switch (what) {
@@ -420,7 +437,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
420 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); 437 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
421 req->rq_state &= ~RQ_LOCAL_PENDING; 438 req->rq_state &= ~RQ_LOCAL_PENDING;
422 439
423 _req_may_be_done(req, m); 440 _req_may_be_done_not_susp(req, m);
424 put_ldev(mdev); 441 put_ldev(mdev);
425 break; 442 break;
426 443
@@ -428,8 +445,8 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
428 req->rq_state |= RQ_LOCAL_COMPLETED; 445 req->rq_state |= RQ_LOCAL_COMPLETED;
429 req->rq_state &= ~RQ_LOCAL_PENDING; 446 req->rq_state &= ~RQ_LOCAL_PENDING;
430 447
431 __drbd_chk_io_error(mdev, FALSE); 448 __drbd_chk_io_error(mdev, false);
432 _req_may_be_done(req, m); 449 _req_may_be_done_not_susp(req, m);
433 put_ldev(mdev); 450 put_ldev(mdev);
434 break; 451 break;
435 452
@@ -437,7 +454,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
437 /* it is legal to fail READA */ 454 /* it is legal to fail READA */
438 req->rq_state |= RQ_LOCAL_COMPLETED; 455 req->rq_state |= RQ_LOCAL_COMPLETED;
439 req->rq_state &= ~RQ_LOCAL_PENDING; 456 req->rq_state &= ~RQ_LOCAL_PENDING;
440 _req_may_be_done(req, m); 457 _req_may_be_done_not_susp(req, m);
441 put_ldev(mdev); 458 put_ldev(mdev);
442 break; 459 break;
443 460
@@ -449,13 +466,13 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
449 466
450 D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 467 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
451 468
452 __drbd_chk_io_error(mdev, FALSE); 469 __drbd_chk_io_error(mdev, false);
453 put_ldev(mdev); 470 put_ldev(mdev);
454 471
455 /* no point in retrying if there is no good remote data, 472 /* no point in retrying if there is no good remote data,
456 * or we have no connection. */ 473 * or we have no connection. */
457 if (mdev->state.pdsk != D_UP_TO_DATE) { 474 if (mdev->state.pdsk != D_UP_TO_DATE) {
458 _req_may_be_done(req, m); 475 _req_may_be_done_not_susp(req, m);
459 break; 476 break;
460 } 477 }
461 478
@@ -474,7 +491,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
474 491
475 /* so we can verify the handle in the answer packet 492 /* so we can verify the handle in the answer packet
476 * corresponding hlist_del is in _req_may_be_done() */ 493 * corresponding hlist_del is in _req_may_be_done() */
477 hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); 494 hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
478 495
479 set_bit(UNPLUG_REMOTE, &mdev->flags); 496 set_bit(UNPLUG_REMOTE, &mdev->flags);
480 497
@@ -490,7 +507,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
490 /* assert something? */ 507 /* assert something? */
491 /* from drbd_make_request_common only */ 508 /* from drbd_make_request_common only */
492 509
493 hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); 510 hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
494 /* corresponding hlist_del is in _req_may_be_done() */ 511 /* corresponding hlist_del is in _req_may_be_done() */
495 512
496 /* NOTE 513 /* NOTE
@@ -517,11 +534,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
517 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); 534 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
518 535
519 req->epoch = mdev->newest_tle->br_number; 536 req->epoch = mdev->newest_tle->br_number;
520 list_add_tail(&req->tl_requests,
521 &mdev->newest_tle->requests);
522 537
523 /* increment size of current epoch */ 538 /* increment size of current epoch */
524 mdev->newest_tle->n_req++; 539 mdev->newest_tle->n_writes++;
525 540
526 /* queue work item to send data */ 541 /* queue work item to send data */
527 D_ASSERT(req->rq_state & RQ_NET_PENDING); 542 D_ASSERT(req->rq_state & RQ_NET_PENDING);
@@ -530,11 +545,19 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
530 drbd_queue_work(&mdev->data.work, &req->w); 545 drbd_queue_work(&mdev->data.work, &req->w);
531 546
532 /* close the epoch, in case it outgrew the limit */ 547 /* close the epoch, in case it outgrew the limit */
533 if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) 548 if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
534 queue_barrier(mdev); 549 queue_barrier(mdev);
535 550
536 break; 551 break;
537 552
553 case queue_for_send_oos:
554 req->rq_state |= RQ_NET_QUEUED;
555 req->w.cb = w_send_oos;
556 drbd_queue_work(&mdev->data.work, &req->w);
557 break;
558
559 case oos_handed_to_network:
560 /* actually the same */
538 case send_canceled: 561 case send_canceled:
539 /* treat it the same */ 562 /* treat it the same */
540 case send_failed: 563 case send_failed:
@@ -543,11 +566,14 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
543 req->rq_state &= ~RQ_NET_QUEUED; 566 req->rq_state &= ~RQ_NET_QUEUED;
544 /* if we did it right, tl_clear should be scheduled only after 567 /* if we did it right, tl_clear should be scheduled only after
545 * this, so this should not be necessary! */ 568 * this, so this should not be necessary! */
546 _req_may_be_done(req, m); 569 _req_may_be_done_not_susp(req, m);
547 break; 570 break;
548 571
549 case handed_over_to_network: 572 case handed_over_to_network:
550 /* assert something? */ 573 /* assert something? */
574 if (bio_data_dir(req->master_bio) == WRITE)
575 atomic_add(req->size>>9, &mdev->ap_in_flight);
576
551 if (bio_data_dir(req->master_bio) == WRITE && 577 if (bio_data_dir(req->master_bio) == WRITE &&
552 mdev->net_conf->wire_protocol == DRBD_PROT_A) { 578 mdev->net_conf->wire_protocol == DRBD_PROT_A) {
553 /* this is what is dangerous about protocol A: 579 /* this is what is dangerous about protocol A:
@@ -568,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
568 * "completed_ok" events came in, once we return from 594 * "completed_ok" events came in, once we return from
569 * _drbd_send_zc_bio (drbd_send_dblock), we have to check 595 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
570 * whether it is done already, and end it. */ 596 * whether it is done already, and end it. */
571 _req_may_be_done(req, m); 597 _req_may_be_done_not_susp(req, m);
572 break; 598 break;
573 599
574 case read_retry_remote_canceled: 600 case read_retry_remote_canceled:
@@ -581,10 +607,13 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
581 dec_ap_pending(mdev); 607 dec_ap_pending(mdev);
582 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 608 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
583 req->rq_state |= RQ_NET_DONE; 609 req->rq_state |= RQ_NET_DONE;
610 if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
611 atomic_sub(req->size>>9, &mdev->ap_in_flight);
612
584 /* if it is still queued, we may not complete it here. 613 /* if it is still queued, we may not complete it here.
585 * it will be canceled soon. */ 614 * it will be canceled soon. */
586 if (!(req->rq_state & RQ_NET_QUEUED)) 615 if (!(req->rq_state & RQ_NET_QUEUED))
587 _req_may_be_done(req, m); 616 _req_may_be_done(req, m); /* Allowed while state.susp */
588 break; 617 break;
589 618
590 case write_acked_by_peer_and_sis: 619 case write_acked_by_peer_and_sis:
@@ -618,22 +647,64 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
618 req->rq_state |= RQ_NET_OK; 647 req->rq_state |= RQ_NET_OK;
619 D_ASSERT(req->rq_state & RQ_NET_PENDING); 648 D_ASSERT(req->rq_state & RQ_NET_PENDING);
620 dec_ap_pending(mdev); 649 dec_ap_pending(mdev);
650 atomic_sub(req->size>>9, &mdev->ap_in_flight);
621 req->rq_state &= ~RQ_NET_PENDING; 651 req->rq_state &= ~RQ_NET_PENDING;
622 _req_may_be_done(req, m); 652 _req_may_be_done_not_susp(req, m);
623 break; 653 break;
624 654
625 case neg_acked: 655 case neg_acked:
626 /* assert something? */ 656 /* assert something? */
627 if (req->rq_state & RQ_NET_PENDING) 657 if (req->rq_state & RQ_NET_PENDING) {
628 dec_ap_pending(mdev); 658 dec_ap_pending(mdev);
659 atomic_sub(req->size>>9, &mdev->ap_in_flight);
660 }
629 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 661 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
630 662
631 req->rq_state |= RQ_NET_DONE; 663 req->rq_state |= RQ_NET_DONE;
632 _req_may_be_done(req, m); 664 _req_may_be_done_not_susp(req, m);
633 /* else: done by handed_over_to_network */ 665 /* else: done by handed_over_to_network */
634 break; 666 break;
635 667
668 case fail_frozen_disk_io:
669 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
670 break;
671
672 _req_may_be_done(req, m); /* Allowed while state.susp */
673 break;
674
675 case restart_frozen_disk_io:
676 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
677 break;
678
679 req->rq_state &= ~RQ_LOCAL_COMPLETED;
680
681 rv = MR_READ;
682 if (bio_data_dir(req->master_bio) == WRITE)
683 rv = MR_WRITE;
684
685 get_ldev(mdev);
686 req->w.cb = w_restart_disk_io;
687 drbd_queue_work(&mdev->data.work, &req->w);
688 break;
689
690 case resend:
691 /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
692 before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 693 Throwing them out of the TL here by pretending we got a BARRIER_ACK.
694 We ensure that the peer was not rebooted */
695 if (!(req->rq_state & RQ_NET_OK)) {
696 if (req->w.cb) {
697 drbd_queue_work(&mdev->data.work, &req->w);
698 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
699 }
700 break;
701 }
702 /* else, fall through to barrier_acked */
703
636 case barrier_acked: 704 case barrier_acked:
705 if (!(req->rq_state & RQ_WRITE))
706 break;
707
637 if (req->rq_state & RQ_NET_PENDING) { 708 if (req->rq_state & RQ_NET_PENDING) {
638 /* barrier came in before all requests have been acked. 709 /* barrier came in before all requests have been acked.
639 * this is bad, because if the connection is lost now, 710 * this is bad, because if the connection is lost now,
@@ -641,9 +712,12 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
641 dev_err(DEV, "FIXME (barrier_acked but pending)\n"); 712 dev_err(DEV, "FIXME (barrier_acked but pending)\n");
642 list_move(&req->tl_requests, &mdev->out_of_sequence_requests); 713 list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
643 } 714 }
644 D_ASSERT(req->rq_state & RQ_NET_SENT); 715 if ((req->rq_state & RQ_NET_MASK) != 0) {
645 req->rq_state |= RQ_NET_DONE; 716 req->rq_state |= RQ_NET_DONE;
646 _req_may_be_done(req, m); 717 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
718 atomic_sub(req->size>>9, &mdev->ap_in_flight);
719 }
720 _req_may_be_done(req, m); /* Allowed while state.susp */
647 break; 721 break;
648 722
649 case data_received: 723 case data_received:
@@ -651,9 +725,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
651 dec_ap_pending(mdev); 725 dec_ap_pending(mdev);
652 req->rq_state &= ~RQ_NET_PENDING; 726 req->rq_state &= ~RQ_NET_PENDING;
653 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); 727 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
654 _req_may_be_done(req, m); 728 _req_may_be_done_not_susp(req, m);
655 break; 729 break;
656 }; 730 };
731
732 return rv;
657} 733}
658 734
659/* we may do a local read if: 735/* we may do a local read if:
@@ -687,14 +763,14 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
687 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); 763 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
688} 764}
689 765
690static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) 766static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
691{ 767{
692 const int rw = bio_rw(bio); 768 const int rw = bio_rw(bio);
693 const int size = bio->bi_size; 769 const int size = bio->bi_size;
694 const sector_t sector = bio->bi_sector; 770 const sector_t sector = bio->bi_sector;
695 struct drbd_tl_epoch *b = NULL; 771 struct drbd_tl_epoch *b = NULL;
696 struct drbd_request *req; 772 struct drbd_request *req;
697 int local, remote; 773 int local, remote, send_oos = 0;
698 int err = -EIO; 774 int err = -EIO;
699 int ret = 0; 775 int ret = 0;
700 776
@@ -708,6 +784,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
708 bio_endio(bio, -ENOMEM); 784 bio_endio(bio, -ENOMEM);
709 return 0; 785 return 0;
710 } 786 }
787 req->start_time = start_time;
711 788
712 local = get_ldev(mdev); 789 local = get_ldev(mdev);
713 if (!local) { 790 if (!local) {
@@ -752,15 +829,18 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
752 * resync extent to finish, and, if necessary, pulls in the target 829 * resync extent to finish, and, if necessary, pulls in the target
753 * extent into the activity log, which involves further disk io because 830 * extent into the activity log, which involves further disk io because
754 * of transactional on-disk meta data updates. */ 831 * of transactional on-disk meta data updates. */
755 if (rw == WRITE && local) 832 if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
833 req->rq_state |= RQ_IN_ACT_LOG;
756 drbd_al_begin_io(mdev, sector); 834 drbd_al_begin_io(mdev, sector);
835 }
757 836
758 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || 837 remote = remote && drbd_should_do_remote(mdev->state);
759 (mdev->state.pdsk == D_INCONSISTENT && 838 send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
760 mdev->state.conn >= C_CONNECTED)); 839 D_ASSERT(!(remote && send_oos));
761 840
762 if (!(local || remote) && !mdev->state.susp) { 841 if (!(local || remote) && !is_susp(mdev->state)) {
763 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 842 if (__ratelimit(&drbd_ratelimit_state))
843 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
764 goto fail_free_complete; 844 goto fail_free_complete;
765 } 845 }
766 846
@@ -770,7 +850,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
770 * but there is a race between testing the bit and pointer outside the 850 * but there is a race between testing the bit and pointer outside the
771 * spinlock, and grabbing the spinlock. 851 * spinlock, and grabbing the spinlock.
772 * if we lost that race, we retry. */ 852 * if we lost that race, we retry. */
773 if (rw == WRITE && remote && 853 if (rw == WRITE && (remote || send_oos) &&
774 mdev->unused_spare_tle == NULL && 854 mdev->unused_spare_tle == NULL &&
775 test_bit(CREATE_BARRIER, &mdev->flags)) { 855 test_bit(CREATE_BARRIER, &mdev->flags)) {
776allocate_barrier: 856allocate_barrier:
@@ -785,21 +865,22 @@ allocate_barrier:
785 /* GOOD, everything prepared, grab the spin_lock */ 865 /* GOOD, everything prepared, grab the spin_lock */
786 spin_lock_irq(&mdev->req_lock); 866 spin_lock_irq(&mdev->req_lock);
787 867
788 if (mdev->state.susp) { 868 if (is_susp(mdev->state)) {
789 /* If we got suspended, use the retry mechanism of 869 /* If we got suspended, use the retry mechanism of
790 generic_make_request() to restart processing of this 870 generic_make_request() to restart processing of this
791 bio. In the next call to drbd_make_request_26 871 bio. In the next call to drbd_make_request
792 we sleep in inc_ap_bio() */ 872 we sleep in inc_ap_bio() */
793 ret = 1; 873 ret = 1;
794 spin_unlock_irq(&mdev->req_lock); 874 spin_unlock_irq(&mdev->req_lock);
795 goto fail_free_complete; 875 goto fail_free_complete;
796 } 876 }
797 877
798 if (remote) { 878 if (remote || send_oos) {
799 remote = (mdev->state.pdsk == D_UP_TO_DATE || 879 remote = drbd_should_do_remote(mdev->state);
800 (mdev->state.pdsk == D_INCONSISTENT && 880 send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
801 mdev->state.conn >= C_CONNECTED)); 881 D_ASSERT(!(remote && send_oos));
802 if (!remote) 882
883 if (!(remote || send_oos))
803 dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); 884 dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
804 if (!(local || remote)) { 885 if (!(local || remote)) {
805 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 886 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
@@ -812,7 +893,7 @@ allocate_barrier:
812 mdev->unused_spare_tle = b; 893 mdev->unused_spare_tle = b;
813 b = NULL; 894 b = NULL;
814 } 895 }
815 if (rw == WRITE && remote && 896 if (rw == WRITE && (remote || send_oos) &&
816 mdev->unused_spare_tle == NULL && 897 mdev->unused_spare_tle == NULL &&
817 test_bit(CREATE_BARRIER, &mdev->flags)) { 898 test_bit(CREATE_BARRIER, &mdev->flags)) {
818 /* someone closed the current epoch 899 /* someone closed the current epoch
@@ -835,7 +916,7 @@ allocate_barrier:
835 * barrier packet. To get the write ordering right, we only have to 916 * barrier packet. To get the write ordering right, we only have to
836 * make sure that, if this is a write request and it triggered a 917 * make sure that, if this is a write request and it triggered a
837 * barrier packet, this request is queued within the same spinlock. */ 918 * barrier packet, this request is queued within the same spinlock. */
838 if (remote && mdev->unused_spare_tle && 919 if ((remote || send_oos) && mdev->unused_spare_tle &&
839 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { 920 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
840 _tl_add_barrier(mdev, mdev->unused_spare_tle); 921 _tl_add_barrier(mdev, mdev->unused_spare_tle);
841 mdev->unused_spare_tle = NULL; 922 mdev->unused_spare_tle = NULL;
@@ -867,30 +948,10 @@ allocate_barrier:
867 /* check this request on the collision detection hash tables. 948 /* check this request on the collision detection hash tables.
868 * if we have a conflict, just complete it here. 949 * if we have a conflict, just complete it here.
869 * THINK do we want to check reads, too? (I don't think so...) */ 950 * THINK do we want to check reads, too? (I don't think so...) */
870 if (rw == WRITE && _req_conflicts(req)) { 951 if (rw == WRITE && _req_conflicts(req))
871 /* this is a conflicting request. 952 goto fail_conflicting;
872 * even though it may have been only _partially_ 953
873 * overlapping with one of the currently pending requests, 954 list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
874 * without even submitting or sending it, we will
875 * pretend that it was successfully served right now.
876 */
877 if (local) {
878 bio_put(req->private_bio);
879 req->private_bio = NULL;
880 drbd_al_complete_io(mdev, req->sector);
881 put_ldev(mdev);
882 local = 0;
883 }
884 if (remote)
885 dec_ap_pending(mdev);
886 _drbd_end_io_acct(mdev, req);
887 /* THINK: do we want to fail it (-EIO), or pretend success? */
888 bio_endio(req->master_bio, 0);
889 req->master_bio = NULL;
890 dec_ap_bio(mdev);
891 drbd_req_free(req);
892 remote = 0;
893 }
894 955
895 /* NOTE remote first: to get the concurrent write detection right, 956 /* NOTE remote first: to get the concurrent write detection right,
896 * we must register the request before start of local IO. */ 957 * we must register the request before start of local IO. */
@@ -903,28 +964,76 @@ allocate_barrier:
903 ? queue_for_net_write 964 ? queue_for_net_write
904 : queue_for_net_read); 965 : queue_for_net_read);
905 } 966 }
967 if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
968 _req_mod(req, queue_for_send_oos);
969
970 if (remote &&
971 mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
972 int congested = 0;
973
974 if (mdev->net_conf->cong_fill &&
975 atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
976 dev_info(DEV, "Congestion-fill threshold reached\n");
977 congested = 1;
978 }
979
980 if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
981 dev_info(DEV, "Congestion-extents threshold reached\n");
982 congested = 1;
983 }
984
985 if (congested) {
986 queue_barrier(mdev); /* last barrier, after mirrored writes */
987
988 if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
989 _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
990 else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
991 _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
992 }
993 }
994
906 spin_unlock_irq(&mdev->req_lock); 995 spin_unlock_irq(&mdev->req_lock);
907 kfree(b); /* if someone else has beaten us to it... */ 996 kfree(b); /* if someone else has beaten us to it... */
908 997
909 if (local) { 998 if (local) {
910 req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 999 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
911 1000
912 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR 1001 /* State may have changed since we grabbed our reference on the
913 : rw == READ ? DRBD_FAULT_DT_RD 1002 * mdev->ldev member. Double check, and short-circuit to endio.
914 : DRBD_FAULT_DT_RA)) 1003 * In case the last activity log transaction failed to get on
1004 * stable storage, and this is a WRITE, we may not even submit
1005 * this bio. */
1006 if (get_ldev(mdev)) {
1007 if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
1008 : rw == READ ? DRBD_FAULT_DT_RD
1009 : DRBD_FAULT_DT_RA))
1010 bio_endio(req->private_bio, -EIO);
1011 else
1012 generic_make_request(req->private_bio);
1013 put_ldev(mdev);
1014 } else
915 bio_endio(req->private_bio, -EIO); 1015 bio_endio(req->private_bio, -EIO);
916 else
917 generic_make_request(req->private_bio);
918 } 1016 }
919 1017
920 /* we need to plug ALWAYS since we possibly need to kick lo_dev.
921 * we plug after submit, so we won't miss an unplug event */
922 drbd_plug_device(mdev);
923
924 return 0; 1018 return 0;
925 1019
1020fail_conflicting:
1021 /* this is a conflicting request.
1022 * even though it may have been only _partially_
1023 * overlapping with one of the currently pending requests,
1024 * without even submitting or sending it, we will
1025 * pretend that it was successfully served right now.
1026 */
1027 _drbd_end_io_acct(mdev, req);
1028 spin_unlock_irq(&mdev->req_lock);
1029 if (remote)
1030 dec_ap_pending(mdev);
1031 /* THINK: do we want to fail it (-EIO), or pretend success?
1032 * this pretends success. */
1033 err = 0;
1034
926fail_free_complete: 1035fail_free_complete:
927 if (rw == WRITE && local) 1036 if (req->rq_state & RQ_IN_ACT_LOG)
928 drbd_al_complete_io(mdev, sector); 1037 drbd_al_complete_io(mdev, sector);
929fail_and_free_req: 1038fail_and_free_req:
930 if (local) { 1039 if (local) {
@@ -961,47 +1070,21 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
961 return 1; 1070 return 1;
962 } 1071 }
963 1072
964 /*
965 * Paranoia: we might have been primary, but sync target, or
966 * even diskless, then lost the connection.
967 * This should have been handled (panic? suspend?) somewhere
968 * else. But maybe it was not, so check again here.
969 * Caution: as long as we do not have a read/write lock on mdev,
970 * to serialize state changes, this is racy, since we may lose
971 * the connection *after* we test for the cstate.
972 */
973 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
974 if (__ratelimit(&drbd_ratelimit_state))
975 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
976 return 1;
977 }
978
979 return 0; 1073 return 0;
980} 1074}
981 1075
982int drbd_make_request_26(struct request_queue *q, struct bio *bio) 1076int drbd_make_request(struct request_queue *q, struct bio *bio)
983{ 1077{
984 unsigned int s_enr, e_enr; 1078 unsigned int s_enr, e_enr;
985 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; 1079 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1080 unsigned long start_time;
986 1081
987 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { 1082 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
988 bio_endio(bio, -EPERM); 1083 bio_endio(bio, -EPERM);
989 return 0; 1084 return 0;
990 } 1085 }
991 1086
992 /* Reject barrier requests if we know the underlying device does 1087 start_time = jiffies;
993 * not support them.
994 * XXX: Need to get this info from peer as well some how so we
995 * XXX: reject if EITHER side/data/metadata area does not support them.
996 *
997 * because of those XXX, this is not yet enabled,
998 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
999 */
1000 if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) {
1001 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1002 bio_endio(bio, -EOPNOTSUPP);
1003 return 0;
1004 }
1005 1088
1006 /* 1089 /*
1007 * what we "blindly" assume: 1090 * what we "blindly" assume:
@@ -1017,12 +1100,12 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1017 1100
1018 if (likely(s_enr == e_enr)) { 1101 if (likely(s_enr == e_enr)) {
1019 inc_ap_bio(mdev, 1); 1102 inc_ap_bio(mdev, 1);
1020 return drbd_make_request_common(mdev, bio); 1103 return drbd_make_request_common(mdev, bio, start_time);
1021 } 1104 }
1022 1105
1023 /* can this bio be split generically? 1106 /* can this bio be split generically?
1024 * Maybe add our own split-arbitrary-bios function. */ 1107 * Maybe add our own split-arbitrary-bios function. */
1025 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { 1108 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
1026 /* rather error out here than BUG in bio_split */ 1109 /* rather error out here than BUG in bio_split */
1027 dev_err(DEV, "bio would need to, but cannot, be split: " 1110 dev_err(DEV, "bio would need to, but cannot, be split: "
1028 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", 1111 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
@@ -1044,11 +1127,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1044 const int sps = 1 << HT_SHIFT; /* sectors per slot */ 1127 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1045 const int mask = sps - 1; 1128 const int mask = sps - 1;
1046 const sector_t first_sectors = sps - (sect & mask); 1129 const sector_t first_sectors = sps - (sect & mask);
1047 bp = bio_split(bio, 1130 bp = bio_split(bio, first_sectors);
1048#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1049 bio_split_pool,
1050#endif
1051 first_sectors);
1052 1131
1053 /* we need to get a "reference count" (ap_bio_cnt) 1132 /* we need to get a "reference count" (ap_bio_cnt)
1054 * to avoid races with the disconnect/reconnect/suspend code. 1133 * to avoid races with the disconnect/reconnect/suspend code.
@@ -1059,10 +1138,10 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1059 1138
1060 D_ASSERT(e_enr == s_enr + 1); 1139 D_ASSERT(e_enr == s_enr + 1);
1061 1140
1062 while (drbd_make_request_common(mdev, &bp->bio1)) 1141 while (drbd_make_request_common(mdev, &bp->bio1, start_time))
1063 inc_ap_bio(mdev, 1); 1142 inc_ap_bio(mdev, 1);
1064 1143
1065 while (drbd_make_request_common(mdev, &bp->bio2)) 1144 while (drbd_make_request_common(mdev, &bp->bio2, start_time))
1066 inc_ap_bio(mdev, 1); 1145 inc_ap_bio(mdev, 1);
1067 1146
1068 dec_ap_bio(mdev); 1147 dec_ap_bio(mdev);
@@ -1073,7 +1152,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1073} 1152}
1074 1153
1075/* This is called by bio_add_page(). With this function we reduce 1154/* This is called by bio_add_page(). With this function we reduce
1076 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs 1155 * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
1077 * units (was AL_EXTENTs). 1156 * units (was AL_EXTENTs).
1078 * 1157 *
1079 * we do the calculation within the lower 32bit of the byte offsets, 1158 * we do the calculation within the lower 32bit of the byte offsets,
@@ -1083,7 +1162,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1083 * As long as the BIO is empty we have to allow at least one bvec, 1162 * As long as the BIO is empty we have to allow at least one bvec,
1084 * regardless of size and offset. so the resulting bio may still 1163 * regardless of size and offset. so the resulting bio may still
1085 * cross extent boundaries. those are dealt with (bio_split) in 1164 * cross extent boundaries. those are dealt with (bio_split) in
1086 * drbd_make_request_26. 1165 * drbd_make_request.
1087 */ 1166 */
1088int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) 1167int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1089{ 1168{
@@ -1093,8 +1172,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1093 unsigned int bio_size = bvm->bi_size; 1172 unsigned int bio_size = bvm->bi_size;
1094 int limit, backing_limit; 1173 int limit, backing_limit;
1095 1174
1096 limit = DRBD_MAX_SEGMENT_SIZE 1175 limit = DRBD_MAX_BIO_SIZE
1097 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); 1176 - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
1098 if (limit < 0) 1177 if (limit < 0)
1099 limit = 0; 1178 limit = 0;
1100 if (bio_size == 0) { 1179 if (bio_size == 0) {
@@ -1111,3 +1190,42 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1111 } 1190 }
1112 return limit; 1191 return limit;
1113} 1192}
1193
1194void request_timer_fn(unsigned long data)
1195{
1196 struct drbd_conf *mdev = (struct drbd_conf *) data;
1197 struct drbd_request *req; /* oldest request */
1198 struct list_head *le;
1199 unsigned long et = 0; /* effective timeout = ko_count * timeout */
1200
1201 if (get_net_conf(mdev)) {
1202 et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
1203 put_net_conf(mdev);
1204 }
1205 if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
1206 return; /* Recurring timer stopped */
1207
1208 spin_lock_irq(&mdev->req_lock);
1209 le = &mdev->oldest_tle->requests;
1210 if (list_empty(le)) {
1211 spin_unlock_irq(&mdev->req_lock);
1212 mod_timer(&mdev->request_timer, jiffies + et);
1213 return;
1214 }
1215
1216 le = le->prev;
1217 req = list_entry(le, struct drbd_request, tl_requests);
1218 if (time_is_before_eq_jiffies(req->start_time + et)) {
1219 if (req->rq_state & RQ_NET_PENDING) {
1220 dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
1221 _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
1222 } else {
1223 dev_warn(DEV, "Local backing block device frozen?\n");
1224 mod_timer(&mdev->request_timer, jiffies + et);
1225 }
1226 } else {
1227 mod_timer(&mdev->request_timer, req->start_time + et);
1228 }
1229
1230 spin_unlock_irq(&mdev->req_lock);
1231}
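
The request_timer_fn() added above re-arms itself with an effective timeout of ko_count times the network timeout; the *HZ/10 conversion suggests the timeout field is kept in tenths of a second. A minimal user-space sketch of that arithmetic and the expiry test, with plain unsigned longs standing in for jiffies and an invented struct in place of net_conf:

/* Minimal sketch of the effective-timeout check in request_timer_fn().
 * HZ and the struct fields are stand-ins, not the kernel's. */
#include <stdio.h>

#define HZ 250                        /* assumed tick rate */

struct fake_net_conf {
        unsigned int timeout;         /* network timeout, in tenths of a second */
        unsigned int ko_count;        /* knock-out count */
};

/* effective timeout in jiffies: ko_count * timeout */
static unsigned long effective_timeout(const struct fake_net_conf *nc)
{
        return (unsigned long)nc->timeout * HZ / 10 * nc->ko_count;
}

/* 1 if the oldest request has been pending longer than the effective timeout */
static int request_timed_out(unsigned long now, unsigned long start_time,
                             unsigned long et)
{
        return now - start_time >= et;   /* wrap-safe with unsigned arithmetic */
}

int main(void)
{
        struct fake_net_conf nc = { .timeout = 60 /* 6.0 s */, .ko_count = 7 };
        unsigned long et = effective_timeout(&nc);
        unsigned long start = 1000, now = start + et + 1;

        printf("effective timeout: %lu jiffies (%.1f s)\n", et, (double)et / HZ);
        printf("timed out: %s\n", request_timed_out(now, start, et) ? "yes" : "no");
        return 0;
}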
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 02d575d24518..68a234a5fdc5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -82,14 +82,16 @@ enum drbd_req_event {
82 to_be_submitted, 82 to_be_submitted,
83 83
84 /* XXX yes, now I am inconsistent... 84 /* XXX yes, now I am inconsistent...
85 * these two are not "events" but "actions" 85 * these are not "events" but "actions"
86 * oh, well... */ 86 * oh, well... */
87 queue_for_net_write, 87 queue_for_net_write,
88 queue_for_net_read, 88 queue_for_net_read,
89 queue_for_send_oos,
89 90
90 send_canceled, 91 send_canceled,
91 send_failed, 92 send_failed,
92 handed_over_to_network, 93 handed_over_to_network,
94 oos_handed_to_network,
93 connection_lost_while_pending, 95 connection_lost_while_pending,
94 read_retry_remote_canceled, 96 read_retry_remote_canceled,
95 recv_acked_by_peer, 97 recv_acked_by_peer,
@@ -104,6 +106,9 @@ enum drbd_req_event {
104 read_ahead_completed_with_error, 106 read_ahead_completed_with_error,
105 write_completed_with_error, 107 write_completed_with_error,
106 completed_ok, 108 completed_ok,
109 resend,
110 fail_frozen_disk_io,
111 restart_frozen_disk_io,
107 nothing, /* for tracing only */ 112 nothing, /* for tracing only */
108}; 113};
109 114
@@ -183,6 +188,12 @@ enum drbd_req_state_bits {
183 188
184 /* keep this last, its for the RQ_NET_MASK */ 189 /* keep this last, its for the RQ_NET_MASK */
185 __RQ_NET_MAX, 190 __RQ_NET_MAX,
191
192 /* Set when this is a write, clear for a read */
193 __RQ_WRITE,
194
195 /* Should call drbd_al_complete_io() for this request... */
196 __RQ_IN_ACT_LOG,
186}; 197};
187 198
188#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) 199#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@@ -201,6 +212,16 @@ enum drbd_req_state_bits {
201/* 0x1f8 */ 212/* 0x1f8 */
202#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) 213#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
203 214
215#define RQ_WRITE (1UL << __RQ_WRITE)
216#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
217
218/* For waking up the frozen transfer log mod_req() has to return if the request
219 should be counted in the epoch object*/
220#define MR_WRITE_SHIFT 0
221#define MR_WRITE (1 << MR_WRITE_SHIFT)
222#define MR_READ_SHIFT 1
223#define MR_READ (1 << MR_READ_SHIFT)
224
204/* epoch entries */ 225/* epoch entries */
205static inline 226static inline
206struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) 227struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
@@ -235,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
235 struct hlist_node *n; 256 struct hlist_node *n;
236 struct drbd_request *req; 257 struct drbd_request *req;
237 258
238 hlist_for_each_entry(req, n, slot, colision) { 259 hlist_for_each_entry(req, n, slot, collision) {
239 if ((unsigned long)req == (unsigned long)id) { 260 if ((unsigned long)req == (unsigned long)id) {
240 D_ASSERT(req->sector == sector); 261 D_ASSERT(req->sector == sector);
241 return req; 262 return req;
@@ -244,30 +265,35 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
244 return NULL; 265 return NULL;
245} 266}
246 267
268static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
269{
270 struct bio *bio;
271 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
272
273 req->private_bio = bio;
274
275 bio->bi_private = req;
276 bio->bi_end_io = drbd_endio_pri;
277 bio->bi_next = NULL;
278}
279
247static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, 280static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
248 struct bio *bio_src) 281 struct bio *bio_src)
249{ 282{
250 struct bio *bio;
251 struct drbd_request *req = 283 struct drbd_request *req =
252 mempool_alloc(drbd_request_mempool, GFP_NOIO); 284 mempool_alloc(drbd_request_mempool, GFP_NOIO);
253 if (likely(req)) { 285 if (likely(req)) {
254 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ 286 drbd_req_make_private_bio(req, bio_src);
255 287
256 req->rq_state = 0; 288 req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
257 req->mdev = mdev; 289 req->mdev = mdev;
258 req->master_bio = bio_src; 290 req->master_bio = bio_src;
259 req->private_bio = bio;
260 req->epoch = 0; 291 req->epoch = 0;
261 req->sector = bio->bi_sector; 292 req->sector = bio_src->bi_sector;
262 req->size = bio->bi_size; 293 req->size = bio_src->bi_size;
263 req->start_time = jiffies; 294 INIT_HLIST_NODE(&req->collision);
264 INIT_HLIST_NODE(&req->colision);
265 INIT_LIST_HEAD(&req->tl_requests); 295 INIT_LIST_HEAD(&req->tl_requests);
266 INIT_LIST_HEAD(&req->w.list); 296 INIT_LIST_HEAD(&req->w.list);
267
268 bio->bi_private = req;
269 bio->bi_end_io = drbd_endio_pri;
270 bio->bi_next = NULL;
271 } 297 }
272 return req; 298 return req;
273} 299}
@@ -292,36 +318,66 @@ struct bio_and_error {
292 318
293extern void _req_may_be_done(struct drbd_request *req, 319extern void _req_may_be_done(struct drbd_request *req,
294 struct bio_and_error *m); 320 struct bio_and_error *m);
295extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, 321extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
296 struct bio_and_error *m); 322 struct bio_and_error *m);
297extern void complete_master_bio(struct drbd_conf *mdev, 323extern void complete_master_bio(struct drbd_conf *mdev,
298 struct bio_and_error *m); 324 struct bio_and_error *m);
325extern void request_timer_fn(unsigned long data);
326extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
299 327
300/* use this if you don't want to deal with calling complete_master_bio() 328/* use this if you don't want to deal with calling complete_master_bio()
301 * outside the spinlock, e.g. when walking some list on cleanup. */ 329 * outside the spinlock, e.g. when walking some list on cleanup. */
302static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) 330static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
303{ 331{
304 struct drbd_conf *mdev = req->mdev; 332 struct drbd_conf *mdev = req->mdev;
305 struct bio_and_error m; 333 struct bio_and_error m;
334 int rv;
306 335
307 /* __req_mod possibly frees req, do not touch req after that! */ 336 /* __req_mod possibly frees req, do not touch req after that! */
308 __req_mod(req, what, &m); 337 rv = __req_mod(req, what, &m);
309 if (m.bio) 338 if (m.bio)
310 complete_master_bio(mdev, &m); 339 complete_master_bio(mdev, &m);
340
341 return rv;
311} 342}
312 343
313/* completion of master bio is outside of spinlock. 344/* completion of master bio is outside of our spinlock.
314 * If you need it irqsave, do it your self! */ 345 * We still may or may not be inside some irqs disabled section
315static inline void req_mod(struct drbd_request *req, 346 * of the lower level driver completion callback, so we need to
347 * spin_lock_irqsave here. */
348static inline int req_mod(struct drbd_request *req,
316 enum drbd_req_event what) 349 enum drbd_req_event what)
317{ 350{
351 unsigned long flags;
318 struct drbd_conf *mdev = req->mdev; 352 struct drbd_conf *mdev = req->mdev;
319 struct bio_and_error m; 353 struct bio_and_error m;
320 spin_lock_irq(&mdev->req_lock); 354 int rv;
321 __req_mod(req, what, &m); 355
322 spin_unlock_irq(&mdev->req_lock); 356 spin_lock_irqsave(&mdev->req_lock, flags);
357 rv = __req_mod(req, what, &m);
358 spin_unlock_irqrestore(&mdev->req_lock, flags);
323 359
324 if (m.bio) 360 if (m.bio)
325 complete_master_bio(mdev, &m); 361 complete_master_bio(mdev, &m);
362
363 return rv;
364}
365
366static inline bool drbd_should_do_remote(union drbd_state s)
367{
368 return s.pdsk == D_UP_TO_DATE ||
369 (s.pdsk >= D_INCONSISTENT &&
370 s.conn >= C_WF_BITMAP_T &&
371 s.conn < C_AHEAD);
372 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
373 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
374 states. */
326} 375}
376static inline bool drbd_should_send_oos(union drbd_state s)
377{
378 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
379 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
380 since we enter state C_AHEAD only if proto >= 96 */
381}
382
327#endif 383#endif
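
The header now tags each request with __RQ_WRITE and __RQ_IN_ACT_LOG bits, and the __req_mod()/_req_mod()/req_mod() family returns a value encoded with MR_WRITE/MR_READ so the transfer-log restart path can tell how a request should be counted. A toy stand-alone illustration of that flag layout and return encoding; the struct and helper below are invented, not the kernel's:

/* Toy illustration of the flag layout added to drbd_req.h: per-request
 * state bits plus the MR_WRITE/MR_READ value reported back to callers. */
#include <stdio.h>

enum { __RQ_LOCAL_PENDING, __RQ_WRITE, __RQ_IN_ACT_LOG };

#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_WRITE         (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG    (1UL << __RQ_IN_ACT_LOG)

#define MR_WRITE (1 << 0)   /* request should be counted as a write */
#define MR_READ  (1 << 1)   /* request should be counted as a read  */

struct toy_req { unsigned long rq_state; };

/* what a simplified __req_mod() might report back to the caller */
static int toy_req_mod(const struct toy_req *req)
{
        return (req->rq_state & RQ_WRITE) ? MR_WRITE : MR_READ;
}

int main(void)
{
        struct toy_req w = { .rq_state = RQ_WRITE | RQ_IN_ACT_LOG };
        struct toy_req r = { .rq_state = 0 };

        printf("write req -> %s\n", toy_req_mod(&w) & MR_WRITE ? "MR_WRITE" : "MR_READ");
        printf("read  req -> %s\n", toy_req_mod(&r) & MR_READ  ? "MR_READ"  : "MR_WRITE");
        return 0;
}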
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 85179e1fb50a..c44a2a602772 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -48,6 +48,8 @@ static const char *drbd_conn_s_names[] = {
48 [C_PAUSED_SYNC_T] = "PausedSyncT", 48 [C_PAUSED_SYNC_T] = "PausedSyncT",
49 [C_VERIFY_S] = "VerifyS", 49 [C_VERIFY_S] = "VerifyS",
50 [C_VERIFY_T] = "VerifyT", 50 [C_VERIFY_T] = "VerifyT",
51 [C_AHEAD] = "Ahead",
52 [C_BEHIND] = "Behind",
51}; 53};
52 54
53static const char *drbd_role_s_names[] = { 55static const char *drbd_role_s_names[] = {
@@ -92,7 +94,7 @@ static const char *drbd_state_sw_errors[] = {
92const char *drbd_conn_str(enum drbd_conns s) 94const char *drbd_conn_str(enum drbd_conns s)
93{ 95{
94 /* enums are unsigned... */ 96 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; 97 return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
96} 98}
97 99
98const char *drbd_role_str(enum drbd_role s) 100const char *drbd_role_str(enum drbd_role s)
@@ -105,7 +107,7 @@ const char *drbd_disk_str(enum drbd_disk_state s)
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; 107 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106} 108}
107 109
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) 110const char *drbd_set_st_err_str(enum drbd_state_rv err)
109{ 111{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : 112 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE" 113 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
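
drbd_strings.c grows the connection-state table by C_AHEAD and C_BEHIND and moves the upper bound in drbd_conn_str() from C_PAUSED_SYNC_T to C_BEHIND. The pattern is a designated-initializer lookup table guarded against out-of-range values; a minimal self-contained version of it, with an invented enum, might look like:

/* Minimal sketch of the guarded enum-to-string lookup used in
 * drbd_strings.c; the enum and names below are made up for the example. */
#include <stdio.h>

enum toy_conn { T_STANDALONE, T_CONNECTED, T_AHEAD, T_BEHIND, T_MAX };

static const char *toy_conn_names[] = {
        [T_STANDALONE] = "StandAlone",
        [T_CONNECTED]  = "Connected",
        [T_AHEAD]      = "Ahead",
        [T_BEHIND]     = "Behind",
};

/* the cast catches both negative and too-large values in one comparison */
static const char *toy_conn_str(enum toy_conn s)
{
        return (unsigned)s >= T_MAX ? "TOO_LARGE" : toy_conn_names[s];
}

int main(void)
{
        printf("%s\n", toy_conn_str(T_BEHIND));           /* Behind */
        printf("%s\n", toy_conn_str((enum toy_conn)99));  /* TOO_LARGE */
        return 0;
}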
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
index fc824006e721..8cb1532a3816 100644
--- a/drivers/block/drbd/drbd_vli.h
+++ b/drivers/block/drbd/drbd_vli.h
@@ -32,7 +32,7 @@
32 * the bitmap transfer time can take much too long, 32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text. 33 * if transmitted in plain text.
34 * 34 *
35 * We try to reduce the transfered bitmap information 35 * We try to reduce the transferred bitmap information
36 * by encoding runlengths of bit polarity. 36 * by encoding runlengths of bit polarity.
37 * 37 *
38 * We never actually need to encode a "zero" (runlengths are positive). 38 * We never actually need to encode a "zero" (runlengths are positive).
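
The drbd_vli.h comment describes shrinking bitmap transfers by sending run lengths of alternating bit polarity, so a literal zero never needs to be encoded. A small sketch of that first step, turning a bitmap into run lengths; the variable-length integer coding of the runs is not shown, and the helper names are invented:

/* Sketch of the first half of the VLI idea from drbd_vli.h: turn a bitmap
 * into run lengths of alternating polarity. Runs are always >= 1. */
#include <stdio.h>
#include <stdint.h>

static int bit_at(const uint8_t *bm, size_t i)
{
        return (bm[i / 8] >> (i % 8)) & 1;
}

/* Emit the starting polarity followed by the run lengths. */
static void emit_runs(const uint8_t *bm, size_t nbits)
{
        size_t i, run = 1;
        int cur = bit_at(bm, 0);

        printf("start polarity: %d\nruns:", cur);
        for (i = 1; i < nbits; i++) {
                if (bit_at(bm, i) == cur) {
                        run++;
                } else {
                        printf(" %zu", run);
                        cur = !cur;
                        run = 1;
                }
        }
        printf(" %zu\n", run);
}

int main(void)
{
        /* bits 0..15, LSB first: 0 1111 00000 111111 */
        uint8_t bm[2] = { 0x1e, 0xfc };

        emit_runs(bm, 16);      /* expected: start 0, runs 1 4 5 6 */
        return 0;
}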
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16cea2d8..4d3e6f6213ba 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -26,7 +26,6 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/drbd.h> 27#include <linux/drbd.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/smp_lock.h>
30#include <linux/wait.h> 29#include <linux/wait.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
@@ -39,21 +38,18 @@
39#include "drbd_int.h" 38#include "drbd_int.h"
40#include "drbd_req.h" 39#include "drbd_req.h"
41 40
42#define SLEEP_TIME (HZ/10)
43
44static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); 41static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
42static int w_make_resync_request(struct drbd_conf *mdev,
43 struct drbd_work *w, int cancel);
45 44
46 45
47 46
48/* defined here: 47/* endio handlers:
49 drbd_md_io_complete 48 * drbd_md_io_complete (defined here)
50 drbd_endio_sec 49 * drbd_endio_pri (defined here)
51 drbd_endio_pri 50 * drbd_endio_sec (defined here)
52 51 * bm_async_io_complete (defined in drbd_bitmap.c)
53 * more endio handlers: 52 *
54 atodb_endio in drbd_actlog.c
55 drbd_bm_async_io_complete in drbd_bitmap.c
56
57 * For all these callbacks, note the following: 53 * For all these callbacks, note the following:
58 * The callbacks will be called in irq context by the IDE drivers, 54 * The callbacks will be called in irq context by the IDE drivers,
59 * and in Softirqs/Tasklets/BH context by the SCSI drivers. 55 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
@@ -97,19 +93,13 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
97 if (list_empty(&mdev->read_ee)) 93 if (list_empty(&mdev->read_ee))
98 wake_up(&mdev->ee_wait); 94 wake_up(&mdev->ee_wait);
99 if (test_bit(__EE_WAS_ERROR, &e->flags)) 95 if (test_bit(__EE_WAS_ERROR, &e->flags))
100 __drbd_chk_io_error(mdev, FALSE); 96 __drbd_chk_io_error(mdev, false);
101 spin_unlock_irqrestore(&mdev->req_lock, flags); 97 spin_unlock_irqrestore(&mdev->req_lock, flags);
102 98
103 drbd_queue_work(&mdev->data.work, &e->w); 99 drbd_queue_work(&mdev->data.work, &e->w);
104 put_ldev(mdev); 100 put_ldev(mdev);
105} 101}
106 102
107static int is_failed_barrier(int ee_flags)
108{
109 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
110 == (EE_IS_BARRIER|EE_WAS_ERROR);
111}
112
113/* writes on behalf of the partner, or resync writes, 103/* writes on behalf of the partner, or resync writes,
114 * "submitted" by the receiver, final stage. */ 104 * "submitted" by the receiver, final stage. */
115static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) 105static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -121,21 +111,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
121 int is_syncer_req; 111 int is_syncer_req;
122 int do_al_complete_io; 112 int do_al_complete_io;
123 113
124 /* if this is a failed barrier request, disable use of barriers,
125 * and schedule for resubmission */
126 if (is_failed_barrier(e->flags)) {
127 drbd_bump_write_ordering(mdev, WO_bdev_flush);
128 spin_lock_irqsave(&mdev->req_lock, flags);
129 list_del(&e->w.list);
130 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
131 e->w.cb = w_e_reissue;
132 /* put_ldev actually happens below, once we come here again. */
133 __release(local);
134 spin_unlock_irqrestore(&mdev->req_lock, flags);
135 drbd_queue_work(&mdev->data.work, &e->w);
136 return;
137 }
138
139 D_ASSERT(e->block_id != ID_VACANT); 114 D_ASSERT(e->block_id != ID_VACANT);
140 115
141 /* after we moved e to done_ee, 116 /* after we moved e to done_ee,
@@ -151,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
151 list_del(&e->w.list); /* has been on active_ee or sync_ee */ 126 list_del(&e->w.list); /* has been on active_ee or sync_ee */
152 list_add_tail(&e->w.list, &mdev->done_ee); 127 list_add_tail(&e->w.list, &mdev->done_ee);
153 128
154 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, 129 /* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
155 * neither did we wake possibly waiting conflicting requests. 130 * neither did we wake possibly waiting conflicting requests.
156 * done from "drbd_process_done_ee" within the appropriate w.cb 131 * done from "drbd_process_done_ee" within the appropriate w.cb
157 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ 132 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
@@ -161,7 +136,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
161 : list_empty(&mdev->active_ee); 136 : list_empty(&mdev->active_ee);
162 137
163 if (test_bit(__EE_WAS_ERROR, &e->flags)) 138 if (test_bit(__EE_WAS_ERROR, &e->flags))
164 __drbd_chk_io_error(mdev, FALSE); 139 __drbd_chk_io_error(mdev, false);
165 spin_unlock_irqrestore(&mdev->req_lock, flags); 140 spin_unlock_irqrestore(&mdev->req_lock, flags);
166 141
167 if (is_syncer_req) 142 if (is_syncer_req)
@@ -187,14 +162,15 @@ void drbd_endio_sec(struct bio *bio, int error)
187 int uptodate = bio_flagged(bio, BIO_UPTODATE); 162 int uptodate = bio_flagged(bio, BIO_UPTODATE);
188 int is_write = bio_data_dir(bio) == WRITE; 163 int is_write = bio_data_dir(bio) == WRITE;
189 164
190 if (error) 165 if (error && __ratelimit(&drbd_ratelimit_state))
191 dev_warn(DEV, "%s: error=%d s=%llus\n", 166 dev_warn(DEV, "%s: error=%d s=%llus\n",
192 is_write ? "write" : "read", error, 167 is_write ? "write" : "read", error,
193 (unsigned long long)e->sector); 168 (unsigned long long)e->sector);
194 if (!error && !uptodate) { 169 if (!error && !uptodate) {
195 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", 170 if (__ratelimit(&drbd_ratelimit_state))
196 is_write ? "write" : "read", 171 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
197 (unsigned long long)e->sector); 172 is_write ? "write" : "read",
173 (unsigned long long)e->sector);
198 /* strange behavior of some lower level drivers... 174 /* strange behavior of some lower level drivers...
199 * fail the request by clearing the uptodate flag, 175 * fail the request by clearing the uptodate flag,
200 * but do not return any error?! */ 176 * but do not return any error?! */
@@ -246,6 +222,7 @@ void drbd_endio_pri(struct bio *bio, int error)
246 bio_put(req->private_bio); 222 bio_put(req->private_bio);
247 req->private_bio = ERR_PTR(error); 223 req->private_bio = ERR_PTR(error);
248 224
225 /* not req_mod(), we need irqsave here! */
249 spin_lock_irqsave(&mdev->req_lock, flags); 226 spin_lock_irqsave(&mdev->req_lock, flags);
250 __req_mod(req, what, &m); 227 __req_mod(req, what, &m);
251 spin_unlock_irqrestore(&mdev->req_lock, flags); 228 spin_unlock_irqrestore(&mdev->req_lock, flags);
@@ -273,13 +250,6 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
273 return w_send_read_req(mdev, w, 0); 250 return w_send_read_req(mdev, w, 0);
274} 251}
275 252
276int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
277{
278 ERR_IF(cancel) return 1;
279 dev_err(DEV, "resync inactive, but callback triggered??\n");
280 return 1; /* Simply ignore this! */
281}
282
283void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) 253void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
284{ 254{
285 struct hash_desc desc; 255 struct hash_desc desc;
@@ -327,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
327 crypto_hash_final(&desc, digest); 297 crypto_hash_final(&desc, digest);
328} 298}
329 299
330static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 300/* TODO merge common code with w_e_end_ov_req */
301int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
331{ 302{
332 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 303 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
333 int digest_size; 304 int digest_size;
334 void *digest; 305 void *digest;
335 int ok; 306 int ok = 1;
336 307
337 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); 308 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
338 309
339 if (unlikely(cancel)) { 310 if (unlikely(cancel))
340 drbd_free_ee(mdev, e); 311 goto out;
341 return 1;
342 }
343 312
344 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 313 if (likely((e->flags & EE_WAS_ERROR) != 0))
345 digest_size = crypto_hash_digestsize(mdev->csums_tfm); 314 goto out;
346 digest = kmalloc(digest_size, GFP_NOIO);
347 if (digest) {
348 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
349 315
350 inc_rs_pending(mdev); 316 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
351 ok = drbd_send_drequest_csum(mdev, 317 digest = kmalloc(digest_size, GFP_NOIO);
352 e->sector, 318 if (digest) {
353 e->size, 319 sector_t sector = e->sector;
354 digest, 320 unsigned int size = e->size;
355 digest_size, 321 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
356 P_CSUM_RS_REQUEST); 322 /* Free e and pages before send.
357 kfree(digest); 323 * In case we block on congestion, we could otherwise run into
358 } else { 324 * some distributed deadlock, if the other side blocks on
359 dev_err(DEV, "kmalloc() of digest failed.\n"); 325 * congestion as well, because our receiver blocks in
360 ok = 0; 326 * drbd_pp_alloc due to pp_in_use > max_buffers. */
361 } 327 drbd_free_ee(mdev, e);
362 } else 328 e = NULL;
363 ok = 1; 329 inc_rs_pending(mdev);
330 ok = drbd_send_drequest_csum(mdev, sector, size,
331 digest, digest_size,
332 P_CSUM_RS_REQUEST);
333 kfree(digest);
334 } else {
335 dev_err(DEV, "kmalloc() of digest failed.\n");
336 ok = 0;
337 }
364 338
365 drbd_free_ee(mdev, e); 339out:
340 if (e)
341 drbd_free_ee(mdev, e);
366 342
367 if (unlikely(!ok)) 343 if (unlikely(!ok))
368 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); 344 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
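
The reworked w_e_send_csum() above copies what it needs into the digest, frees the epoch entry and its pages, and only then performs the possibly blocking send, to avoid the distributed deadlock described in its comment (both peers stuck on congestion while their receivers wait for pages). A schematic, non-kernel sketch of that ordering; the pool counter, entry layout and blocking_send() are invented for illustration:

/* Schematic of the "free payload before a potentially blocking send"
 * ordering: 1. summarize, 2. release buffers, 3. send. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int pages_in_use;              /* stand-in for pp_in_use */

struct toy_entry {
        unsigned char *data;
        size_t len;
        int npages;
};

static unsigned char toy_digest(const unsigned char *d, size_t len)
{
        unsigned char x = 0;
        while (len--)
                x ^= *d++;            /* trivial stand-in for a real checksum */
        return x;
}

static void toy_free_entry(struct toy_entry *e)
{
        pages_in_use -= e->npages;    /* give buffers back before we may block */
        free(e->data);
        free(e);
}

static void blocking_send(unsigned char digest)
{
        /* may block on congestion; buffers were already returned above */
        printf("sent digest 0x%02x, pages still held: %d\n", digest, pages_in_use);
}

int main(void)
{
        struct toy_entry *e = malloc(sizeof(*e));

        e->len = 4096;
        e->npages = 1;
        e->data = malloc(e->len);
        memset(e->data, 0xab, e->len);
        pages_in_use += e->npages;

        unsigned char digest = toy_digest(e->data, e->len);  /* 1. summarize */
        toy_free_entry(e);                                   /* 2. free      */
        blocking_send(digest);                               /* 3. then send */
        return 0;
}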
@@ -376,109 +352,194 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
376 struct drbd_epoch_entry *e; 352 struct drbd_epoch_entry *e;
377 353
378 if (!get_ldev(mdev)) 354 if (!get_ldev(mdev))
379 return 0; 355 return -EIO;
356
357 if (drbd_rs_should_slow_down(mdev, sector))
358 goto defer;
380 359
381 /* GFP_TRY, because if there is no memory available right now, this may 360 /* GFP_TRY, because if there is no memory available right now, this may
382 * be rescheduled for later. It is "only" background resync, after all. */ 361 * be rescheduled for later. It is "only" background resync, after all. */
383 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); 362 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
384 if (!e) 363 if (!e)
385 goto fail; 364 goto defer;
386 365
366 e->w.cb = w_e_send_csum;
387 spin_lock_irq(&mdev->req_lock); 367 spin_lock_irq(&mdev->req_lock);
388 list_add(&e->w.list, &mdev->read_ee); 368 list_add(&e->w.list, &mdev->read_ee);
389 spin_unlock_irq(&mdev->req_lock); 369 spin_unlock_irq(&mdev->req_lock);
390 370
391 e->w.cb = w_e_send_csum; 371 atomic_add(size >> 9, &mdev->rs_sect_ev);
392 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) 372 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
393 return 1; 373 return 0;
374
375 /* If it failed because of ENOMEM, retry should help. If it failed
376 * because bio_add_page failed (probably broken lower level driver),
377 * retry may or may not help.
378 * If it does not, you may need to force disconnect. */
379 spin_lock_irq(&mdev->req_lock);
380 list_del(&e->w.list);
381 spin_unlock_irq(&mdev->req_lock);
394 382
395 drbd_free_ee(mdev, e); 383 drbd_free_ee(mdev, e);
396fail: 384defer:
397 put_ldev(mdev); 385 put_ldev(mdev);
398 return 2; 386 return -EAGAIN;
387}
388
389int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
390{
391 switch (mdev->state.conn) {
392 case C_VERIFY_S:
393 w_make_ov_request(mdev, w, cancel);
394 break;
395 case C_SYNC_TARGET:
396 w_make_resync_request(mdev, w, cancel);
397 break;
398 }
399
400 return 1;
399} 401}
400 402
401void resync_timer_fn(unsigned long data) 403void resync_timer_fn(unsigned long data)
402{ 404{
403 unsigned long flags;
404 struct drbd_conf *mdev = (struct drbd_conf *) data; 405 struct drbd_conf *mdev = (struct drbd_conf *) data;
405 int queue;
406 406
407 spin_lock_irqsave(&mdev->req_lock, flags); 407 if (list_empty(&mdev->resync_work.list))
408 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
409}
408 410
409 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { 411static void fifo_set(struct fifo_buffer *fb, int value)
410 queue = 1; 412{
411 if (mdev->state.conn == C_VERIFY_S) 413 int i;
412 mdev->resync_work.cb = w_make_ov_request; 414
413 else 415 for (i = 0; i < fb->size; i++)
414 mdev->resync_work.cb = w_make_resync_request; 416 fb->values[i] = value;
415 } else { 417}
416 queue = 0; 418
417 mdev->resync_work.cb = w_resync_inactive; 419static int fifo_push(struct fifo_buffer *fb, int value)
420{
421 int ov;
422
423 ov = fb->values[fb->head_index];
424 fb->values[fb->head_index++] = value;
425
426 if (fb->head_index >= fb->size)
427 fb->head_index = 0;
428
429 return ov;
430}
431
432static void fifo_add_val(struct fifo_buffer *fb, int value)
433{
434 int i;
435
436 for (i = 0; i < fb->size; i++)
437 fb->values[i] += value;
438}
439
440static int drbd_rs_controller(struct drbd_conf *mdev)
441{
442 unsigned int sect_in; /* Number of sectors that came in since the last turn */
443 unsigned int want; /* The number of sectors we want in the proxy */
444 int req_sect; /* Number of sectors to request in this turn */
445 int correction; /* Number of sectors more we need in the proxy*/
446 int cps; /* correction per invocation of drbd_rs_controller() */
447 int steps; /* Number of time steps to plan ahead */
448 int curr_corr;
449 int max_sect;
450
451 sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
452 mdev->rs_in_flight -= sect_in;
453
454 spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
455
456 steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
457
458 if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
459 want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
460 } else { /* normal path */
461 want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
462 sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
418 } 463 }
419 464
420 spin_unlock_irqrestore(&mdev->req_lock, flags); 465 correction = want - mdev->rs_in_flight - mdev->rs_planed;
421 466
422 /* harmless race: list_empty outside data.work.q_lock */ 467 /* Plan ahead */
423 if (list_empty(&mdev->resync_work.list) && queue) 468 cps = correction / steps;
424 drbd_queue_work(&mdev->data.work, &mdev->resync_work); 469 fifo_add_val(&mdev->rs_plan_s, cps);
470 mdev->rs_planed += cps * steps;
471
472 /* What we do in this step */
473 curr_corr = fifo_push(&mdev->rs_plan_s, 0);
474 spin_unlock(&mdev->peer_seq_lock);
475 mdev->rs_planed -= curr_corr;
476
477 req_sect = sect_in + curr_corr;
478 if (req_sect < 0)
479 req_sect = 0;
480
481 max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
482 if (req_sect > max_sect)
483 req_sect = max_sect;
484
485 /*
486 dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
487 sect_in, mdev->rs_in_flight, want, correction,
488 steps, cps, mdev->rs_planed, curr_corr, req_sect);
489 */
490
491 return req_sect;
425} 492}
426 493
427int w_make_resync_request(struct drbd_conf *mdev, 494static int drbd_rs_number_requests(struct drbd_conf *mdev)
428 struct drbd_work *w, int cancel) 495{
496 int number;
497 if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
498 number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
499 mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
500 } else {
501 mdev->c_sync_rate = mdev->sync_conf.rate;
502 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
503 }
504
505 /* ignore the amount of pending requests, the resync controller should
506 * throttle down to incoming reply rate soon enough anyways. */
507 return number;
508}
509
510static int w_make_resync_request(struct drbd_conf *mdev,
511 struct drbd_work *w, int cancel)
429{ 512{
430 unsigned long bit; 513 unsigned long bit;
431 sector_t sector; 514 sector_t sector;
432 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 515 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
433 int max_segment_size; 516 int max_bio_size;
434 int number, i, size, pe, mx; 517 int number, rollback_i, size;
435 int align, queued, sndbuf; 518 int align, queued, sndbuf;
519 int i = 0;
436 520
437 if (unlikely(cancel)) 521 if (unlikely(cancel))
438 return 1; 522 return 1;
439 523
440 if (unlikely(mdev->state.conn < C_CONNECTED)) { 524 if (mdev->rs_total == 0) {
441 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); 525 /* empty resync? */
442 return 0; 526 drbd_resync_finished(mdev);
527 return 1;
443 } 528 }
444 529
445 if (mdev->state.conn != C_SYNC_TARGET)
446 dev_err(DEV, "%s in w_make_resync_request\n",
447 drbd_conn_str(mdev->state.conn));
448
449 if (!get_ldev(mdev)) { 530 if (!get_ldev(mdev)) {
450 /* Since we only need to access mdev->rsync a 531 /* Since we only need to access mdev->rsync a
451 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but 532 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
452 to continue resync with a broken disk makes no sense at 533 to continue resync with a broken disk makes no sense at
453 all */ 534 all */
454 dev_err(DEV, "Disk broke down during resync!\n"); 535 dev_err(DEV, "Disk broke down during resync!\n");
455 mdev->resync_work.cb = w_resync_inactive;
456 return 1; 536 return 1;
457 } 537 }
458 538
459 /* starting with drbd 8.3.8, we can handle multi-bio EEs, 539 max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
460 * if it should be necessary */ 540 number = drbd_rs_number_requests(mdev);
461 max_segment_size = mdev->agreed_pro_version < 94 ? 541 if (number == 0)
462 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; 542 goto requeue;
463
464 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ);
465 pe = atomic_read(&mdev->rs_pending_cnt);
466
467 mutex_lock(&mdev->data.mutex);
468 if (mdev->data.socket)
469 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
470 else
471 mx = 1;
472 mutex_unlock(&mdev->data.mutex);
473
474 /* For resync rates >160MB/sec, allow more pending RS requests */
475 if (number > mx)
476 mx = number;
477
478 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
479 if ((pe + number) > mx) {
480 number = mx - pe;
481 }
482 543
483 for (i = 0; i < number; i++) { 544 for (i = 0; i < number; i++) {
484 /* Stop generating RS requests, when half of the send buffer is filled */ 545 /* Stop generating RS requests, when half of the send buffer is filled */
@@ -498,16 +559,16 @@ next_sector:
498 size = BM_BLOCK_SIZE; 559 size = BM_BLOCK_SIZE;
499 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); 560 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
500 561
501 if (bit == -1UL) { 562 if (bit == DRBD_END_OF_BITMAP) {
502 mdev->bm_resync_fo = drbd_bm_bits(mdev); 563 mdev->bm_resync_fo = drbd_bm_bits(mdev);
503 mdev->resync_work.cb = w_resync_inactive;
504 put_ldev(mdev); 564 put_ldev(mdev);
505 return 1; 565 return 1;
506 } 566 }
507 567
508 sector = BM_BIT_TO_SECT(bit); 568 sector = BM_BIT_TO_SECT(bit);
509 569
510 if (drbd_try_rs_begin_io(mdev, sector)) { 570 if (drbd_rs_should_slow_down(mdev, sector) ||
571 drbd_try_rs_begin_io(mdev, sector)) {
511 mdev->bm_resync_fo = bit; 572 mdev->bm_resync_fo = bit;
512 goto requeue; 573 goto requeue;
513 } 574 }
@@ -518,7 +579,7 @@ next_sector:
518 goto next_sector; 579 goto next_sector;
519 } 580 }
520 581
521#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE 582#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
522 /* try to find some adjacent bits. 583 /* try to find some adjacent bits.
523 * we stop if we have already the maximum req size. 584 * we stop if we have already the maximum req size.
524 * 585 *
@@ -526,8 +587,9 @@ next_sector:
526 * be prepared for all stripe sizes of software RAIDs. 587 * be prepared for all stripe sizes of software RAIDs.
527 */ 588 */
528 align = 1; 589 align = 1;
590 rollback_i = i;
529 for (;;) { 591 for (;;) {
530 if (size + BM_BLOCK_SIZE > max_segment_size) 592 if (size + BM_BLOCK_SIZE > max_bio_size)
531 break; 593 break;
532 594
533 /* Be always aligned */ 595 /* Be always aligned */
@@ -561,14 +623,19 @@ next_sector:
561 size = (capacity-sector)<<9; 623 size = (capacity-sector)<<9;
562 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { 624 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
563 switch (read_for_csum(mdev, sector, size)) { 625 switch (read_for_csum(mdev, sector, size)) {
564 case 0: /* Disk failure*/ 626 case -EIO: /* Disk failure */
565 put_ldev(mdev); 627 put_ldev(mdev);
566 return 0; 628 return 0;
567 case 2: /* Allocation failed */ 629 case -EAGAIN: /* allocation failed, or ldev busy */
568 drbd_rs_complete_io(mdev, sector); 630 drbd_rs_complete_io(mdev, sector);
569 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); 631 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
632 i = rollback_i;
570 goto requeue; 633 goto requeue;
571 /* case 1: everything ok */ 634 case 0:
635 /* everything ok */
636 break;
637 default:
638 BUG();
572 } 639 }
573 } else { 640 } else {
574 inc_rs_pending(mdev); 641 inc_rs_pending(mdev);
@@ -589,12 +656,12 @@ next_sector:
589 * resync data block, and the last bit is cleared. 656 * resync data block, and the last bit is cleared.
590 * until then resync "work" is "inactive" ... 657 * until then resync "work" is "inactive" ...
591 */ 658 */
592 mdev->resync_work.cb = w_resync_inactive;
593 put_ldev(mdev); 659 put_ldev(mdev);
594 return 1; 660 return 1;
595 } 661 }
596 662
597 requeue: 663 requeue:
664 mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
598 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 665 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
599 put_ldev(mdev); 666 put_ldev(mdev);
600 return 1; 667 return 1;
@@ -609,27 +676,18 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
609 if (unlikely(cancel)) 676 if (unlikely(cancel))
610 return 1; 677 return 1;
611 678
612 if (unlikely(mdev->state.conn < C_CONNECTED)) { 679 number = drbd_rs_number_requests(mdev);
613 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
614 return 0;
615 }
616
617 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
618 if (atomic_read(&mdev->rs_pending_cnt) > number)
619 goto requeue;
620
621 number -= atomic_read(&mdev->rs_pending_cnt);
622 680
623 sector = mdev->ov_position; 681 sector = mdev->ov_position;
624 for (i = 0; i < number; i++) { 682 for (i = 0; i < number; i++) {
625 if (sector >= capacity) { 683 if (sector >= capacity) {
626 mdev->resync_work.cb = w_resync_inactive;
627 return 1; 684 return 1;
628 } 685 }
629 686
630 size = BM_BLOCK_SIZE; 687 size = BM_BLOCK_SIZE;
631 688
632 if (drbd_try_rs_begin_io(mdev, sector)) { 689 if (drbd_rs_should_slow_down(mdev, sector) ||
690 drbd_try_rs_begin_io(mdev, sector)) {
633 mdev->ov_position = sector; 691 mdev->ov_position = sector;
634 goto requeue; 692 goto requeue;
635 } 693 }
@@ -647,11 +705,33 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
647 mdev->ov_position = sector; 705 mdev->ov_position = sector;
648 706
649 requeue: 707 requeue:
708 mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
650 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 709 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
651 return 1; 710 return 1;
652} 711}
653 712
654 713
714void start_resync_timer_fn(unsigned long data)
715{
716 struct drbd_conf *mdev = (struct drbd_conf *) data;
717
718 drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
719}
720
721int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
722{
723 if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
724 dev_warn(DEV, "w_start_resync later...\n");
725 mdev->start_resync_timer.expires = jiffies + HZ/10;
726 add_timer(&mdev->start_resync_timer);
727 return 1;
728 }
729
730 drbd_start_resync(mdev, C_SYNC_SOURCE);
731 clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
732 return 1;
733}
734
655int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 735int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
656{ 736{
657 kfree(w); 737 kfree(w);
@@ -670,6 +750,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
670 return 1; 750 return 1;
671} 751}
672 752
753static void ping_peer(struct drbd_conf *mdev)
754{
755 clear_bit(GOT_PING_ACK, &mdev->flags);
756 request_ping(mdev);
757 wait_event(mdev->misc_wait,
758 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
759}
760
673int drbd_resync_finished(struct drbd_conf *mdev) 761int drbd_resync_finished(struct drbd_conf *mdev)
674{ 762{
675 unsigned long db, dt, dbdt; 763 unsigned long db, dt, dbdt;
@@ -677,6 +765,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
677 union drbd_state os, ns; 765 union drbd_state os, ns;
678 struct drbd_work *w; 766 struct drbd_work *w;
679 char *khelper_cmd = NULL; 767 char *khelper_cmd = NULL;
768 int verify_done = 0;
680 769
681 /* Remove all elements from the resync LRU. Since future actions 770 /* Remove all elements from the resync LRU. Since future actions
682 * might set bits in the (main) bitmap, then the entries in the 771 * might set bits in the (main) bitmap, then the entries in the
@@ -687,9 +776,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
687 * queue (or even the read operations for those packets 776 * queue (or even the read operations for those packets
688 * is not finished by now). Retry in 100ms. */ 777 * is not finished by now). Retry in 100ms. */
689 778
690 drbd_kick_lo(mdev); 779 schedule_timeout_interruptible(HZ / 10);
691 __set_current_state(TASK_INTERRUPTIBLE);
692 schedule_timeout(HZ / 10);
693 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); 780 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
694 if (w) { 781 if (w) {
695 w->cb = w_resync_finished; 782 w->cb = w_resync_finished;
@@ -709,9 +796,13 @@ int drbd_resync_finished(struct drbd_conf *mdev)
709 if (!get_ldev(mdev)) 796 if (!get_ldev(mdev))
710 goto out; 797 goto out;
711 798
799 ping_peer(mdev);
800
712 spin_lock_irq(&mdev->req_lock); 801 spin_lock_irq(&mdev->req_lock);
713 os = mdev->state; 802 os = mdev->state;
714 803
804 verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
805
715 /* This protects us against multiple calls (that can happen in the presence 806 /* This protects us against multiple calls (that can happen in the presence
716 of application IO), and against connectivity loss just before we arrive here. */ 807 of application IO), and against connectivity loss just before we arrive here. */
717 if (os.conn <= C_CONNECTED) 808 if (os.conn <= C_CONNECTED)
@@ -721,8 +812,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
721 ns.conn = C_CONNECTED; 812 ns.conn = C_CONNECTED;
722 813
723 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", 814 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
724 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? 815 verify_done ? "Online verify " : "Resync",
725 "Online verify " : "Resync",
726 dt + mdev->rs_paused, mdev->rs_paused, dbdt); 816 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
727 817
728 n_oos = drbd_bm_total_weight(mdev); 818 n_oos = drbd_bm_total_weight(mdev);
@@ -745,7 +835,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
745 const int ratio = 835 const int ratio =
746 (t == 0) ? 0 : 836 (t == 0) ? 0 :
747 (t < 100000) ? ((s*100)/t) : (s/(t/100)); 837 (t < 100000) ? ((s*100)/t) : (s/(t/100));
748 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " 838 dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
749 "transferred %luK total %luK\n", 839 "transferred %luK total %luK\n",
750 ratio, 840 ratio,
751 Bit2KB(mdev->rs_same_csum), 841 Bit2KB(mdev->rs_same_csum),
@@ -780,14 +870,18 @@ int drbd_resync_finished(struct drbd_conf *mdev)
780 } 870 }
781 } 871 }
782 872
783 drbd_uuid_set_bm(mdev, 0UL); 873 if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
784 874 /* for verify runs, we don't update uuids here,
785 if (mdev->p_uuid) { 875 * so there would be nothing to report. */
786 /* Now the two UUID sets are equal, update what we 876 drbd_uuid_set_bm(mdev, 0UL);
787 * know of the peer. */ 877 drbd_print_uuids(mdev, "updated UUIDs");
788 int i; 878 if (mdev->p_uuid) {
789 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) 879 /* Now the two UUID sets are equal, update what we
790 mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; 880 * know of the peer. */
881 int i;
882 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
883 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
884 }
791 } 885 }
792 } 886 }
793 887
@@ -799,12 +893,10 @@ out:
799 mdev->rs_total = 0; 893 mdev->rs_total = 0;
800 mdev->rs_failed = 0; 894 mdev->rs_failed = 0;
801 mdev->rs_paused = 0; 895 mdev->rs_paused = 0;
802 mdev->ov_start_sector = 0; 896 if (verify_done)
897 mdev->ov_start_sector = 0;
803 898
804 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { 899 drbd_md_sync(mdev);
805 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
806 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
807 }
808 900
809 if (khelper_cmd) 901 if (khelper_cmd)
810 drbd_khelper(mdev, khelper_cmd); 902 drbd_khelper(mdev, khelper_cmd);
@@ -817,9 +909,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
817{ 909{
818 if (drbd_ee_has_active_page(e)) { 910 if (drbd_ee_has_active_page(e)) {
819 /* This might happen if sendpage() has not finished */ 911 /* This might happen if sendpage() has not finished */
912 int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
913 atomic_add(i, &mdev->pp_in_use_by_net);
914 atomic_sub(i, &mdev->pp_in_use);
820 spin_lock_irq(&mdev->req_lock); 915 spin_lock_irq(&mdev->req_lock);
821 list_add_tail(&e->w.list, &mdev->net_ee); 916 list_add_tail(&e->w.list, &mdev->net_ee);
822 spin_unlock_irq(&mdev->req_lock); 917 spin_unlock_irq(&mdev->req_lock);
918 wake_up(&drbd_pp_wait);
823 } else 919 } else
824 drbd_free_ee(mdev, e); 920 drbd_free_ee(mdev, e);
825} 921}
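
move_to_net_ee_or_free() above parks an epoch entry on net_ee when sendpage() may still reference its pages, shifting the page count from pp_in_use to pp_in_use_by_net instead of freeing it right away. A toy sketch of that accounting; the counters, page size and names below are stand-ins:

/* Sketch of the page accounting in move_to_net_ee_or_free(): pages that may
 * still be referenced by an in-flight zero-copy send are moved to a separate
 * counter and the buffer is parked instead of freed. */
#include <stdio.h>
#include <stdbool.h>

#define TOY_PAGE_SIZE 4096

static int pp_in_use, pp_in_use_by_net;

static int pages_for(unsigned int size)
{
        return (size + TOY_PAGE_SIZE - 1) / TOY_PAGE_SIZE;   /* round up */
}

static void done_with_buffer(unsigned int size, bool still_referenced)
{
        int n = pages_for(size);

        if (still_referenced) {
                pp_in_use_by_net += n;   /* park it, free later */
                pp_in_use -= n;
        } else {
                pp_in_use -= n;          /* free immediately */
        }
}

int main(void)
{
        pp_in_use = pages_for(12288) + pages_for(4096);

        done_with_buffer(12288, true);   /* sendpage still holds these pages */
        done_with_buffer(4096, false);
        printf("in use: %d, in use by net: %d\n", pp_in_use, pp_in_use_by_net);
        return 0;
}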
@@ -882,7 +978,9 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
882 put_ldev(mdev); 978 put_ldev(mdev);
883 } 979 }
884 980
885 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 981 if (mdev->state.conn == C_AHEAD) {
982 ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
983 } else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
886 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { 984 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
887 inc_rs_pending(mdev); 985 inc_rs_pending(mdev);
888 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 986 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -926,9 +1024,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
926 return 1; 1024 return 1;
927 } 1025 }
928 1026
929 drbd_rs_complete_io(mdev, e->sector); 1027 if (get_ldev(mdev)) {
1028 drbd_rs_complete_io(mdev, e->sector);
1029 put_ldev(mdev);
1030 }
930 1031
931 di = (struct digest_info *)(unsigned long)e->block_id; 1032 di = e->digest;
932 1033
933 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1034 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
934 /* quick hack to try to avoid a race against reconfiguration. 1035 /* quick hack to try to avoid a race against reconfiguration.
@@ -952,7 +1053,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
952 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); 1053 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
953 } else { 1054 } else {
954 inc_rs_pending(mdev); 1055 inc_rs_pending(mdev);
955 e->block_id = ID_SYNCER; 1056 e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1057 e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
1058 kfree(di);
956 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 1059 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
957 } 1060 }
958 } else { 1061 } else {
@@ -962,9 +1065,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
962 } 1065 }
963 1066
964 dec_unacked(mdev); 1067 dec_unacked(mdev);
965
966 kfree(di);
967
968 move_to_net_ee_or_free(mdev, e); 1068 move_to_net_ee_or_free(mdev, e);
969 1069
970 if (unlikely(!ok)) 1070 if (unlikely(!ok))
@@ -972,9 +1072,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
972 return ok; 1072 return ok;
973} 1073}
974 1074
1075/* TODO merge common code with w_e_send_csum */
975int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1076int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
976{ 1077{
977 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 1078 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1079 sector_t sector = e->sector;
1080 unsigned int size = e->size;
978 int digest_size; 1081 int digest_size;
979 void *digest; 1082 void *digest;
980 int ok = 1; 1083 int ok = 1;
@@ -982,27 +1085,37 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
982 if (unlikely(cancel)) 1085 if (unlikely(cancel))
983 goto out; 1086 goto out;
984 1087
985 if (unlikely((e->flags & EE_WAS_ERROR) != 0))
986 goto out;
987
988 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1088 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
989 /* FIXME if this allocation fails, online verify will not terminate! */
990 digest = kmalloc(digest_size, GFP_NOIO); 1089 digest = kmalloc(digest_size, GFP_NOIO);
991 if (digest) { 1090 if (!digest) {
992 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); 1091 ok = 0; /* terminate the connection in case the allocation failed */
993 inc_rs_pending(mdev); 1092 goto out;
994 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
995 digest, digest_size, P_OV_REPLY);
996 if (!ok)
997 dec_rs_pending(mdev);
998 kfree(digest);
999 } 1093 }
1000 1094
1001out: 1095 if (likely(!(e->flags & EE_WAS_ERROR)))
1096 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1097 else
1098 memset(digest, 0, digest_size);
1099
1100 /* Free e and pages before send.
1101 * In case we block on congestion, we could otherwise run into
1102 * some distributed deadlock, if the other side blocks on
1103 * congestion as well, because our receiver blocks in
1104 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1002 drbd_free_ee(mdev, e); 1105 drbd_free_ee(mdev, e);
1106 e = NULL;
1107 inc_rs_pending(mdev);
1108 ok = drbd_send_drequest_csum(mdev, sector, size,
1109 digest, digest_size,
1110 P_OV_REPLY);
1111 if (!ok)
1112 dec_rs_pending(mdev);
1113 kfree(digest);
1003 1114
1115out:
1116 if (e)
1117 drbd_free_ee(mdev, e);
1004 dec_unacked(mdev); 1118 dec_unacked(mdev);
1005
1006 return ok; 1119 return ok;
1007} 1120}
1008 1121
@@ -1015,15 +1128,16 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1015 mdev->ov_last_oos_size = size>>9; 1128 mdev->ov_last_oos_size = size>>9;
1016 } 1129 }
1017 drbd_set_out_of_sync(mdev, sector, size); 1130 drbd_set_out_of_sync(mdev, sector, size);
1018 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1019} 1131}
1020 1132
1021int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1133int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1022{ 1134{
1023 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 1135 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1024 struct digest_info *di; 1136 struct digest_info *di;
1025 int digest_size;
1026 void *digest; 1137 void *digest;
1138 sector_t sector = e->sector;
1139 unsigned int size = e->size;
1140 int digest_size;
1027 int ok, eq = 0; 1141 int ok, eq = 0;
1028 1142
1029 if (unlikely(cancel)) { 1143 if (unlikely(cancel)) {
@@ -1034,9 +1148,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1034 1148
1035 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all 1149 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1036 * the resync lru has been cleaned up already */ 1150 * the resync lru has been cleaned up already */
1037 drbd_rs_complete_io(mdev, e->sector); 1151 if (get_ldev(mdev)) {
1152 drbd_rs_complete_io(mdev, e->sector);
1153 put_ldev(mdev);
1154 }
1038 1155
1039 di = (struct digest_info *)(unsigned long)e->block_id; 1156 di = e->digest;
1040 1157
1041 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1158 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1042 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1159 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
@@ -1048,27 +1165,31 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1048 eq = !memcmp(digest, di->digest, digest_size); 1165 eq = !memcmp(digest, di->digest, digest_size);
1049 kfree(digest); 1166 kfree(digest);
1050 } 1167 }
1051 } else {
1052 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1053 if (__ratelimit(&drbd_ratelimit_state))
1054 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1055 } 1168 }
1056 1169
1057 dec_unacked(mdev); 1170 /* Free e and pages before send.
1058 1171 * In case we block on congestion, we could otherwise run into
1059 kfree(di); 1172 * some distributed deadlock, if the other side blocks on
1060 1173 * congestion as well, because our receiver blocks in
1174 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1175 drbd_free_ee(mdev, e);
1061 if (!eq) 1176 if (!eq)
1062 drbd_ov_oos_found(mdev, e->sector, e->size); 1177 drbd_ov_oos_found(mdev, sector, size);
1063 else 1178 else
1064 ov_oos_print(mdev); 1179 ov_oos_print(mdev);
1065 1180
1066 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, 1181 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
1067 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); 1182 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1068 1183
1069 drbd_free_ee(mdev, e); 1184 dec_unacked(mdev);
1070 1185
1071 if (--mdev->ov_left == 0) { 1186 --mdev->ov_left;
1187
1188 /* let's advance progress step marks only for every other megabyte */
1189 if ((mdev->ov_left & 0x200) == 0x200)
1190 drbd_advance_rs_marks(mdev, mdev->ov_left);
1191
1192 if (mdev->ov_left == 0) {
1072 ov_oos_print(mdev); 1193 ov_oos_print(mdev);
1073 drbd_resync_finished(mdev); 1194 drbd_resync_finished(mdev);
1074 } 1195 }
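
w_e_end_ov_reply() now decrements ov_left itself and refreshes the resync progress marks only when bit 9 of the remaining-bit count is set, i.e. (ov_left & 0x200) == 0x200; with DRBD's 4 KiB per bitmap bit that corresponds to 2 MiB-sized bands, so the marks are sampled on alternating bands of remaining work instead of on every reply. A toy count of how often the condition fires, purely illustrative and outside the kernel:

#include <stdio.h>

/* Sample a decrementing work counter only while bit 9 is set.  With
 * 4 KiB per bitmap bit, 0x200 bits are 2 MiB, so the progress marks are
 * refreshed on alternating 2 MiB bands of remaining work rather than on
 * every completed request. */
static int should_update_mark(unsigned long left)
{
	return (left & 0x200) == 0x200;
}

int main(void)
{
	unsigned long left;
	unsigned long updates = 0, total = 4096;

	for (left = total; left > 0; left--)
		if (should_update_mark(left))
			updates++;

	/* exactly half of the values qualify: 2048 of 4096 */
	printf("updated marks on %lu of %lu steps\n", updates, total);
	return 0;
}
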
@@ -1108,7 +1229,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1108 * dec_ap_pending will be done in got_BarrierAck 1229 * dec_ap_pending will be done in got_BarrierAck
1109 * or (on connection loss) in w_clear_epoch. */ 1230 * or (on connection loss) in w_clear_epoch. */
1110 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, 1231 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1111 (struct p_header *)p, sizeof(*p), 0); 1232 (struct p_header80 *)p, sizeof(*p), 0);
1112 drbd_put_data_sock(mdev); 1233 drbd_put_data_sock(mdev);
1113 1234
1114 return ok; 1235 return ok;
@@ -1121,6 +1242,22 @@ int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1121 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); 1242 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1122} 1243}
1123 1244
1245int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1246{
1247 struct drbd_request *req = container_of(w, struct drbd_request, w);
1248 int ok;
1249
1250 if (unlikely(cancel)) {
1251 req_mod(req, send_canceled);
1252 return 1;
1253 }
1254
1255 ok = drbd_send_oos(mdev, req);
1256 req_mod(req, oos_handed_to_network);
1257
1258 return ok;
1259}
1260
1124/** 1261/**
1125 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request 1262 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1126 * @mdev: DRBD device. 1263 * @mdev: DRBD device.
@@ -1173,6 +1310,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1173 return ok; 1310 return ok;
1174} 1311}
1175 1312
1313int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1314{
1315 struct drbd_request *req = container_of(w, struct drbd_request, w);
1316
1317 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1318 drbd_al_begin_io(mdev, req->sector);
1319 /* Calling drbd_al_begin_io() out of the worker might deadlock
1320 theoretically. Practically it cannot deadlock, since this is
1321 only used when unfreezing IOs. All the extents of the requests
1322 that made it into the TL are already active */
1323
1324 drbd_req_make_private_bio(req, req->master_bio);
1325 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
1326 generic_make_request(req->private_bio);
1327
1328 return 1;
1329}
1330
1176static int _drbd_may_sync_now(struct drbd_conf *mdev) 1331static int _drbd_may_sync_now(struct drbd_conf *mdev)
1177{ 1332{
1178 struct drbd_conf *odev = mdev; 1333 struct drbd_conf *odev = mdev;
@@ -1298,12 +1453,15 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
1298 return retcode; 1453 return retcode;
1299} 1454}
1300 1455
1301static void ping_peer(struct drbd_conf *mdev) 1456void drbd_rs_controller_reset(struct drbd_conf *mdev)
1302{ 1457{
1303 clear_bit(GOT_PING_ACK, &mdev->flags); 1458 atomic_set(&mdev->rs_sect_in, 0);
1304 request_ping(mdev); 1459 atomic_set(&mdev->rs_sect_ev, 0);
1305 wait_event(mdev->misc_wait, 1460 mdev->rs_in_flight = 0;
1306 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); 1461 mdev->rs_planed = 0;
1462 spin_lock(&mdev->peer_seq_lock);
1463 fifo_set(&mdev->rs_plan_s, 0);
1464 spin_unlock(&mdev->peer_seq_lock);
1307} 1465}
1308 1466
1309/** 1467/**
@@ -1319,13 +1477,18 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1319 union drbd_state ns; 1477 union drbd_state ns;
1320 int r; 1478 int r;
1321 1479
1322 if (mdev->state.conn >= C_SYNC_SOURCE) { 1480 if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
1323 dev_err(DEV, "Resync already running!\n"); 1481 dev_err(DEV, "Resync already running!\n");
1324 return; 1482 return;
1325 } 1483 }
1326 1484
1327 /* In case a previous resync run was aborted by an IO error/detach on the peer. */ 1485 if (mdev->state.conn < C_AHEAD) {
1328 drbd_rs_cancel_all(mdev); 1486 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1487 drbd_rs_cancel_all(mdev);
1488 /* This should be done when we abort the resync. We definitely do not
1489 want to have this for connections going back and forth between
1490 Ahead/Behind and SyncSource/SyncTarget */
1491 }
1329 1492
1330 if (side == C_SYNC_TARGET) { 1493 if (side == C_SYNC_TARGET) {
1331 /* Since application IO was locked out during C_WF_BITMAP_T and 1494 /* Since application IO was locked out during C_WF_BITMAP_T and
@@ -1339,6 +1502,20 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1339 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 1502 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1340 return; 1503 return;
1341 } 1504 }
1505 } else /* C_SYNC_SOURCE */ {
1506 r = drbd_khelper(mdev, "before-resync-source");
1507 r = (r >> 8) & 0xff;
1508 if (r > 0) {
1509 if (r == 3) {
1510 dev_info(DEV, "before-resync-source handler returned %d, "
1511 "ignoring. Old userland tools?", r);
1512 } else {
1513 dev_info(DEV, "before-resync-source handler returned %d, "
1514 "dropping connection.\n", r);
1515 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1516 return;
1517 }
1518 }
1342 } 1519 }
1343 1520
1344 drbd_state_lock(mdev); 1521 drbd_state_lock(mdev);
@@ -1348,18 +1525,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1348 return; 1525 return;
1349 } 1526 }
1350 1527
1351 if (side == C_SYNC_TARGET) {
1352 mdev->bm_resync_fo = 0;
1353 } else /* side == C_SYNC_SOURCE */ {
1354 u64 uuid;
1355
1356 get_random_bytes(&uuid, sizeof(u64));
1357 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1358 drbd_send_sync_uuid(mdev, uuid);
1359
1360 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1361 }
1362
1363 write_lock_irq(&global_state_lock); 1528 write_lock_irq(&global_state_lock);
1364 ns = mdev->state; 1529 ns = mdev->state;
1365 1530
@@ -1379,30 +1544,62 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1379 r = SS_UNKNOWN_ERROR; 1544 r = SS_UNKNOWN_ERROR;
1380 1545
1381 if (r == SS_SUCCESS) { 1546 if (r == SS_SUCCESS) {
1382 mdev->rs_total = 1547 unsigned long tw = drbd_bm_total_weight(mdev);
1383 mdev->rs_mark_left = drbd_bm_total_weight(mdev); 1548 unsigned long now = jiffies;
1549 int i;
1550
1384 mdev->rs_failed = 0; 1551 mdev->rs_failed = 0;
1385 mdev->rs_paused = 0; 1552 mdev->rs_paused = 0;
1386 mdev->rs_start =
1387 mdev->rs_mark_time = jiffies;
1388 mdev->rs_same_csum = 0; 1553 mdev->rs_same_csum = 0;
1554 mdev->rs_last_events = 0;
1555 mdev->rs_last_sect_ev = 0;
1556 mdev->rs_total = tw;
1557 mdev->rs_start = now;
1558 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1559 mdev->rs_mark_left[i] = tw;
1560 mdev->rs_mark_time[i] = now;
1561 }
1389 _drbd_pause_after(mdev); 1562 _drbd_pause_after(mdev);
1390 } 1563 }
1391 write_unlock_irq(&global_state_lock); 1564 write_unlock_irq(&global_state_lock);
1392 put_ldev(mdev);
1393 1565
1394 if (r == SS_SUCCESS) { 1566 if (r == SS_SUCCESS) {
1395 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", 1567 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1396 drbd_conn_str(ns.conn), 1568 drbd_conn_str(ns.conn),
1397 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), 1569 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1398 (unsigned long) mdev->rs_total); 1570 (unsigned long) mdev->rs_total);
1399 1571 if (side == C_SYNC_TARGET)
1400 if (mdev->rs_total == 0) { 1572 mdev->bm_resync_fo = 0;
1401 /* Peer still reachable? Beware of failing before-resync-target handlers! */ 1573
1402 ping_peer(mdev); 1574 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1575 * with w_send_oos, or the sync target will get confused as to
1576 * how many bits to resync. We cannot do that always, because for an
1577 * empty resync and protocol < 95, we need to do it here, as we call
1578 * drbd_resync_finished from here in that case.
1579 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1580 * and from after_state_ch otherwise. */
1581 if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
1582 drbd_gen_and_send_sync_uuid(mdev);
1583
1584 if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
1585 /* This still has a race (about when exactly the peers
1586 * detect connection loss) that can lead to a full sync
1587 * on next handshake. In 8.3.9 we fixed this with explicit
1588 * resync-finished notifications, but the fix
1589 * introduces a protocol change. Sleeping for some
1590 * time longer than the ping interval + timeout on the
1591 * SyncSource, to give the SyncTarget the chance to
1592 * detect connection loss, then waiting for a ping
1593 * response (implicit in drbd_resync_finished) reduces
1594 * the race considerably, but does not solve it. */
1595 if (side == C_SYNC_SOURCE)
1596 schedule_timeout_interruptible(
1597 mdev->net_conf->ping_int * HZ +
1598 mdev->net_conf->ping_timeo*HZ/9);
1403 drbd_resync_finished(mdev); 1599 drbd_resync_finished(mdev);
1404 } 1600 }
1405 1601
1602 drbd_rs_controller_reset(mdev);
1406 /* ns.conn may already be != mdev->state.conn, 1603 /* ns.conn may already be != mdev->state.conn,
1407 * we may have been paused in between, or become paused until 1604 * we may have been paused in between, or become paused until
1408 * the timer triggers. 1605 * the timer triggers.
@@ -1412,6 +1609,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1412 1609
1413 drbd_md_sync(mdev); 1610 drbd_md_sync(mdev);
1414 } 1611 }
1612 put_ldev(mdev);
1415 drbd_state_unlock(mdev); 1613 drbd_state_unlock(mdev);
1416} 1614}
1417 1615
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index defdb5013ea3..151f1a37478f 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -39,30 +39,12 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
39 return; 39 return;
40 } 40 }
41 41
42 if (FAULT_ACTIVE(mdev, fault_type)) 42 if (drbd_insert_fault(mdev, fault_type))
43 bio_endio(bio, -EIO); 43 bio_endio(bio, -EIO);
44 else 44 else
45 generic_make_request(bio); 45 generic_make_request(bio);
46} 46}
47 47
48static inline void drbd_plug_device(struct drbd_conf *mdev)
49{
50 struct request_queue *q;
51 q = bdev_get_queue(mdev->this_bdev);
52
53 spin_lock_irq(q->queue_lock);
54
55/* XXX the check on !blk_queue_plugged is redundant,
56 * implicitly checked in blk_plug_device */
57
58 if (!blk_queue_plugged(q)) {
59 blk_plug_device(q);
60 del_timer(&q->unplug_timer);
61 /* unplugging should not happen automatically... */
62 }
63 spin_unlock_irq(q->queue_lock);
64}
65
66static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) 48static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
67{ 49{
68 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) 50 return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cf04c1b234ed..98de8f418676 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -178,7 +178,6 @@ static int print_unex = 1;
178#include <linux/slab.h> 178#include <linux/slab.h>
179#include <linux/mm.h> 179#include <linux/mm.h>
180#include <linux/bio.h> 180#include <linux/bio.h>
181#include <linux/smp_lock.h>
182#include <linux/string.h> 181#include <linux/string.h>
183#include <linux/jiffies.h> 182#include <linux/jiffies.h>
184#include <linux/fcntl.h> 183#include <linux/fcntl.h>
@@ -199,6 +198,7 @@ static int print_unex = 1;
199 * It's been recommended that take about 1/4 of the default speed 198 * It's been recommended that take about 1/4 of the default speed
200 * in some more extreme cases. 199 * in some more extreme cases.
201 */ 200 */
201static DEFINE_MUTEX(floppy_mutex);
202static int slow_floppy; 202static int slow_floppy;
203 203
204#include <asm/dma.h> 204#include <asm/dma.h>
@@ -258,8 +258,8 @@ static int irqdma_allocated;
258#include <linux/completion.h> 258#include <linux/completion.h>
259 259
260static struct request *current_req; 260static struct request *current_req;
261static struct request_queue *floppy_queue;
262static void do_fd_request(struct request_queue *q); 261static void do_fd_request(struct request_queue *q);
262static int set_next_request(void);
263 263
264#ifndef fd_get_dma_residue 264#ifndef fd_get_dma_residue
265#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) 265#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
413static struct block_device *opened_bdev[N_DRIVE]; 413static struct block_device *opened_bdev[N_DRIVE];
414static DEFINE_MUTEX(open_lock); 414static DEFINE_MUTEX(open_lock);
415static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; 415static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
416static int fdc_queue;
416 417
417/* 418/*
418 * This struct defines the different floppy types. 419 * This struct defines the different floppy types.
@@ -596,6 +597,11 @@ static unsigned char fsector_t; /* sector in track */
596static unsigned char in_sector_offset; /* offset within physical sector, 597static unsigned char in_sector_offset; /* offset within physical sector,
597 * expressed in units of 512 bytes */ 598 * expressed in units of 512 bytes */
598 599
600static inline bool drive_no_geom(int drive)
601{
602 return !current_type[drive] && !ITYPE(UDRS->fd_device);
603}
604
599#ifndef fd_eject 605#ifndef fd_eject
600static inline int fd_eject(int drive) 606static inline int fd_eject(int drive)
601{ 607{
@@ -890,8 +896,8 @@ static void unlock_fdc(void)
890 del_timer(&fd_timeout); 896 del_timer(&fd_timeout);
891 cont = NULL; 897 cont = NULL;
892 clear_bit(0, &fdc_busy); 898 clear_bit(0, &fdc_busy);
893 if (current_req || blk_peek_request(floppy_queue)) 899 if (current_req || set_next_request())
894 do_fd_request(floppy_queue); 900 do_fd_request(current_req->q);
895 spin_unlock_irqrestore(&floppy_lock, flags); 901 spin_unlock_irqrestore(&floppy_lock, flags);
896 wake_up(&fdc_wait); 902 wake_up(&fdc_wait);
897} 903}
@@ -1032,6 +1038,7 @@ static void floppy_disable_hlt(void)
1032{ 1038{
1033 unsigned long flags; 1039 unsigned long flags;
1034 1040
1041 WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
1035 spin_lock_irqsave(&floppy_hlt_lock, flags); 1042 spin_lock_irqsave(&floppy_hlt_lock, flags);
1036 if (!hlt_disabled) { 1043 if (!hlt_disabled) {
1037 hlt_disabled = 1; 1044 hlt_disabled = 1;
@@ -2243,8 +2250,8 @@ static void floppy_end_request(struct request *req, int error)
2243 * logical buffer */ 2250 * logical buffer */
2244static void request_done(int uptodate) 2251static void request_done(int uptodate)
2245{ 2252{
2246 struct request_queue *q = floppy_queue;
2247 struct request *req = current_req; 2253 struct request *req = current_req;
2254 struct request_queue *q;
2248 unsigned long flags; 2255 unsigned long flags;
2249 int block; 2256 int block;
2250 char msg[sizeof("request done ") + sizeof(int) * 3]; 2257 char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2258,6 +2265,8 @@ static void request_done(int uptodate)
2258 return; 2265 return;
2259 } 2266 }
2260 2267
2268 q = req->q;
2269
2261 if (uptodate) { 2270 if (uptodate) {
2262 /* maintain values for invalidation on geometry 2271 /* maintain values for invalidation on geometry
2263 * change */ 2272 * change */
@@ -2811,6 +2820,28 @@ static int make_raw_rw_request(void)
2811 return 2; 2820 return 2;
2812} 2821}
2813 2822
2823/*
2824 * Round-robin between our available drives, doing one request from each
2825 */
2826static int set_next_request(void)
2827{
2828 struct request_queue *q;
2829 int old_pos = fdc_queue;
2830
2831 do {
2832 q = disks[fdc_queue]->queue;
2833 if (++fdc_queue == N_DRIVE)
2834 fdc_queue = 0;
2835 if (q) {
2836 current_req = blk_fetch_request(q);
2837 if (current_req)
2838 break;
2839 }
2840 } while (fdc_queue != old_pos);
2841
2842 return current_req != NULL;
2843}
2844
2814static void redo_fd_request(void) 2845static void redo_fd_request(void)
2815{ 2846{
2816 int drive; 2847 int drive;
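
floppy now keeps one request queue per drive and round-robins over them in set_next_request(), remembering its position in fdc_queue so one busy drive cannot starve the others; unlock_fdc() and redo_fd_request() both restart the scan from there. A standalone sketch of the same scan, with queue_fetch(), pending[] and a fixed N_DRIVE of 4 standing in for the block-layer queues:

#include <stdio.h>

#define N_DRIVE 4

/* toy per-drive "queues": a positive value means requests are pending */
static int pending[N_DRIVE] = { 0, 0, 3, 1 };
static int fdc_queue;          /* where the previous scan stopped */

static int queue_fetch(int q)  /* stand-in for blk_fetch_request() */
{
	if (!pending[q])
		return -1;
	pending[q]--;
	return q;              /* the "request" is just the drive number here */
}

/* Scan all drives at most once, starting where the last scan stopped,
 * and return the first drive that has work, or -1 if all are idle. */
static int set_next_request(void)
{
	int old_pos = fdc_queue;
	int req;

	do {
		req = queue_fetch(fdc_queue);
		if (++fdc_queue == N_DRIVE)
			fdc_queue = 0;
		if (req >= 0)
			break;
	} while (fdc_queue != old_pos);

	return req;
}

int main(void)
{
	int req;

	while ((req = set_next_request()) >= 0)
		printf("servicing drive %d\n", req);
	return 0;
}
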
@@ -2822,17 +2853,17 @@ static void redo_fd_request(void)
2822 2853
2823do_request: 2854do_request:
2824 if (!current_req) { 2855 if (!current_req) {
2825 struct request *req; 2856 int pending;
2857
2858 spin_lock_irq(&floppy_lock);
2859 pending = set_next_request();
2860 spin_unlock_irq(&floppy_lock);
2826 2861
2827 spin_lock_irq(floppy_queue->queue_lock); 2862 if (!pending) {
2828 req = blk_fetch_request(floppy_queue);
2829 spin_unlock_irq(floppy_queue->queue_lock);
2830 if (!req) {
2831 do_floppy = NULL; 2863 do_floppy = NULL;
2832 unlock_fdc(); 2864 unlock_fdc();
2833 return; 2865 return;
2834 } 2866 }
2835 current_req = req;
2836 } 2867 }
2837 drive = (long)current_req->rq_disk->private_data; 2868 drive = (long)current_req->rq_disk->private_data;
2838 set_fdc(drive); 2869 set_fdc(drive);
@@ -3251,7 +3282,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
3251 struct block_device *bdev = opened_bdev[cnt]; 3282 struct block_device *bdev = opened_bdev[cnt];
3252 if (!bdev || ITYPE(drive_state[cnt].fd_device) != type) 3283 if (!bdev || ITYPE(drive_state[cnt].fd_device) != type)
3253 continue; 3284 continue;
3254 __invalidate_device(bdev); 3285 __invalidate_device(bdev, true);
3255 } 3286 }
3256 mutex_unlock(&open_lock); 3287 mutex_unlock(&open_lock);
3257 } else { 3288 } else {
@@ -3553,9 +3584,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
3553{ 3584{
3554 int ret; 3585 int ret;
3555 3586
3556 lock_kernel(); 3587 mutex_lock(&floppy_mutex);
3557 ret = fd_locked_ioctl(bdev, mode, cmd, param); 3588 ret = fd_locked_ioctl(bdev, mode, cmd, param);
3558 unlock_kernel(); 3589 mutex_unlock(&floppy_mutex);
3559 3590
3560 return ret; 3591 return ret;
3561} 3592}
@@ -3616,7 +3647,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
3616{ 3647{
3617 int drive = (long)disk->private_data; 3648 int drive = (long)disk->private_data;
3618 3649
3619 lock_kernel(); 3650 mutex_lock(&floppy_mutex);
3620 mutex_lock(&open_lock); 3651 mutex_lock(&open_lock);
3621 if (UDRS->fd_ref < 0) 3652 if (UDRS->fd_ref < 0)
3622 UDRS->fd_ref = 0; 3653 UDRS->fd_ref = 0;
@@ -3627,7 +3658,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
3627 if (!UDRS->fd_ref) 3658 if (!UDRS->fd_ref)
3628 opened_bdev[drive] = NULL; 3659 opened_bdev[drive] = NULL;
3629 mutex_unlock(&open_lock); 3660 mutex_unlock(&open_lock);
3630 unlock_kernel(); 3661 mutex_unlock(&floppy_mutex);
3631 3662
3632 return 0; 3663 return 0;
3633} 3664}
@@ -3645,7 +3676,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3645 int res = -EBUSY; 3676 int res = -EBUSY;
3646 char *tmp; 3677 char *tmp;
3647 3678
3648 lock_kernel(); 3679 mutex_lock(&floppy_mutex);
3649 mutex_lock(&open_lock); 3680 mutex_lock(&open_lock);
3650 old_dev = UDRS->fd_device; 3681 old_dev = UDRS->fd_device;
3651 if (opened_bdev[drive] && opened_bdev[drive] != bdev) 3682 if (opened_bdev[drive] && opened_bdev[drive] != bdev)
@@ -3722,7 +3753,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3722 goto out; 3753 goto out;
3723 } 3754 }
3724 mutex_unlock(&open_lock); 3755 mutex_unlock(&open_lock);
3725 unlock_kernel(); 3756 mutex_unlock(&floppy_mutex);
3726 return 0; 3757 return 0;
3727out: 3758out:
3728 if (UDRS->fd_ref < 0) 3759 if (UDRS->fd_ref < 0)
@@ -3733,20 +3764,21 @@ out:
3733 opened_bdev[drive] = NULL; 3764 opened_bdev[drive] = NULL;
3734out2: 3765out2:
3735 mutex_unlock(&open_lock); 3766 mutex_unlock(&open_lock);
3736 unlock_kernel(); 3767 mutex_unlock(&floppy_mutex);
3737 return res; 3768 return res;
3738} 3769}
3739 3770
3740/* 3771/*
3741 * Check if the disk has been changed or if a change has been faked. 3772 * Check if the disk has been changed or if a change has been faked.
3742 */ 3773 */
3743static int check_floppy_change(struct gendisk *disk) 3774static unsigned int floppy_check_events(struct gendisk *disk,
3775 unsigned int clearing)
3744{ 3776{
3745 int drive = (long)disk->private_data; 3777 int drive = (long)disk->private_data;
3746 3778
3747 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || 3779 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3748 test_bit(FD_VERIFY_BIT, &UDRS->flags)) 3780 test_bit(FD_VERIFY_BIT, &UDRS->flags))
3749 return 1; 3781 return DISK_EVENT_MEDIA_CHANGE;
3750 3782
3751 if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { 3783 if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
3752 lock_fdc(drive, false); 3784 lock_fdc(drive, false);
@@ -3757,8 +3789,8 @@ static int check_floppy_change(struct gendisk *disk)
3757 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || 3789 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3758 test_bit(FD_VERIFY_BIT, &UDRS->flags) || 3790 test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
3759 test_bit(drive, &fake_change) || 3791 test_bit(drive, &fake_change) ||
3760 (!ITYPE(UDRS->fd_device) && !current_type[drive])) 3792 drive_no_geom(drive))
3761 return 1; 3793 return DISK_EVENT_MEDIA_CHANGE;
3762 return 0; 3794 return 0;
3763} 3795}
3764 3796
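
check_floppy_change() becomes floppy_check_events(): instead of a 0/1 media-changed answer, the callback now returns a bitmask of DISK_EVENT_* flags that the block layer's event polling consumes (the same conversion is applied to pcd, pd and pf further down). A minimal sketch of the flag-based contract; the EV_* constants, the toy disk_changed state and the clearing handling are invented for the example:

#include <stdio.h>

/* event bits in the style of the block layer's DISK_EVENT_* flags */
#define EV_MEDIA_CHANGE  (1u << 0)
#define EV_EJECT_REQUEST (1u << 1)

static int disk_changed = 1;   /* toy drive state */

/* Old style: a boolean "has the media changed?" answer.
 * New style: a mask of events, so several event types can be reported
 * from one callback and the caller says which ones it is consuming. */
static unsigned int check_events(unsigned int clearing)
{
	unsigned int events = 0;

	if (disk_changed) {
		events |= EV_MEDIA_CHANGE;
		if (clearing & EV_MEDIA_CHANGE)
			disk_changed = 0;  /* acknowledge only what the caller clears */
	}
	return events;
}

int main(void)
{
	unsigned int ev = check_events(EV_MEDIA_CHANGE);

	if (ev & EV_MEDIA_CHANGE)
		printf("media change reported\n");
	printf("second poll: 0x%x\n", check_events(EV_MEDIA_CHANGE));
	return 0;
}
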
@@ -3807,7 +3839,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
3807 bio.bi_end_io = floppy_rb0_complete; 3839 bio.bi_end_io = floppy_rb0_complete;
3808 3840
3809 submit_bio(READ, &bio); 3841 submit_bio(READ, &bio);
3810 generic_unplug_device(bdev_get_queue(bdev));
3811 process_fd_request(); 3842 process_fd_request();
3812 wait_for_completion(&complete); 3843 wait_for_completion(&complete);
3813 3844
@@ -3823,13 +3854,13 @@ static int __floppy_read_block_0(struct block_device *bdev)
3823static int floppy_revalidate(struct gendisk *disk) 3854static int floppy_revalidate(struct gendisk *disk)
3824{ 3855{
3825 int drive = (long)disk->private_data; 3856 int drive = (long)disk->private_data;
3826#define NO_GEOM (!current_type[drive] && !ITYPE(UDRS->fd_device))
3827 int cf; 3857 int cf;
3828 int res = 0; 3858 int res = 0;
3829 3859
3830 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || 3860 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3831 test_bit(FD_VERIFY_BIT, &UDRS->flags) || 3861 test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
3832 test_bit(drive, &fake_change) || NO_GEOM) { 3862 test_bit(drive, &fake_change) ||
3863 drive_no_geom(drive)) {
3833 if (WARN(atomic_read(&usage_count) == 0, 3864 if (WARN(atomic_read(&usage_count) == 0,
3834 "VFS: revalidate called on non-open device.\n")) 3865 "VFS: revalidate called on non-open device.\n"))
3835 return -EFAULT; 3866 return -EFAULT;
@@ -3837,7 +3868,7 @@ static int floppy_revalidate(struct gendisk *disk)
3837 lock_fdc(drive, false); 3868 lock_fdc(drive, false);
3838 cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || 3869 cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
3839 test_bit(FD_VERIFY_BIT, &UDRS->flags)); 3870 test_bit(FD_VERIFY_BIT, &UDRS->flags));
3840 if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) { 3871 if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) {
3841 process_fd_request(); /*already done by another thread */ 3872 process_fd_request(); /*already done by another thread */
3842 return 0; 3873 return 0;
3843 } 3874 }
@@ -3849,7 +3880,7 @@ static int floppy_revalidate(struct gendisk *disk)
3849 clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); 3880 clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
3850 if (cf) 3881 if (cf)
3851 UDRS->generation++; 3882 UDRS->generation++;
3852 if (NO_GEOM) { 3883 if (drive_no_geom(drive)) {
3853 /* auto-sensing */ 3884 /* auto-sensing */
3854 res = __floppy_read_block_0(opened_bdev[drive]); 3885 res = __floppy_read_block_0(opened_bdev[drive]);
3855 } else { 3886 } else {
@@ -3868,7 +3899,7 @@ static const struct block_device_operations floppy_fops = {
3868 .release = floppy_release, 3899 .release = floppy_release,
3869 .ioctl = fd_ioctl, 3900 .ioctl = fd_ioctl,
3870 .getgeo = fd_getgeo, 3901 .getgeo = fd_getgeo,
3871 .media_changed = check_floppy_change, 3902 .check_events = floppy_check_events,
3872 .revalidate_disk = floppy_revalidate, 3903 .revalidate_disk = floppy_revalidate,
3873}; 3904};
3874 3905
@@ -4165,6 +4196,13 @@ static int __init floppy_init(void)
4165 goto out_put_disk; 4196 goto out_put_disk;
4166 } 4197 }
4167 4198
4199 disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
4200 if (!disks[dr]->queue) {
4201 err = -ENOMEM;
4202 goto out_put_disk;
4203 }
4204
4205 blk_queue_max_hw_sectors(disks[dr]->queue, 64);
4168 disks[dr]->major = FLOPPY_MAJOR; 4206 disks[dr]->major = FLOPPY_MAJOR;
4169 disks[dr]->first_minor = TOMINOR(dr); 4207 disks[dr]->first_minor = TOMINOR(dr);
4170 disks[dr]->fops = &floppy_fops; 4208 disks[dr]->fops = &floppy_fops;
@@ -4183,13 +4221,6 @@ static int __init floppy_init(void)
4183 if (err) 4221 if (err)
4184 goto out_unreg_blkdev; 4222 goto out_unreg_blkdev;
4185 4223
4186 floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
4187 if (!floppy_queue) {
4188 err = -ENOMEM;
4189 goto out_unreg_driver;
4190 }
4191 blk_queue_max_hw_sectors(floppy_queue, 64);
4192
4193 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, 4224 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
4194 floppy_find, NULL, NULL); 4225 floppy_find, NULL, NULL);
4195 4226
@@ -4317,7 +4348,6 @@ static int __init floppy_init(void)
4317 4348
4318 /* to be cleaned up... */ 4349 /* to be cleaned up... */
4319 disks[drive]->private_data = (void *)(long)drive; 4350 disks[drive]->private_data = (void *)(long)drive;
4320 disks[drive]->queue = floppy_queue;
4321 disks[drive]->flags |= GENHD_FL_REMOVABLE; 4351 disks[drive]->flags |= GENHD_FL_REMOVABLE;
4322 disks[drive]->driverfs_dev = &floppy_device[drive].dev; 4352 disks[drive]->driverfs_dev = &floppy_device[drive].dev;
4323 add_disk(disks[drive]); 4353 add_disk(disks[drive]);
@@ -4328,19 +4358,19 @@ static int __init floppy_init(void)
4328out_unreg_platform_dev: 4358out_unreg_platform_dev:
4329 platform_device_unregister(&floppy_device[drive]); 4359 platform_device_unregister(&floppy_device[drive]);
4330out_flush_work: 4360out_flush_work:
4331 flush_scheduled_work(); 4361 flush_work_sync(&floppy_work);
4332 if (atomic_read(&usage_count)) 4362 if (atomic_read(&usage_count))
4333 floppy_release_irq_and_dma(); 4363 floppy_release_irq_and_dma();
4334out_unreg_region: 4364out_unreg_region:
4335 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 4365 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
4336 blk_cleanup_queue(floppy_queue);
4337out_unreg_driver:
4338 platform_driver_unregister(&floppy_driver); 4366 platform_driver_unregister(&floppy_driver);
4339out_unreg_blkdev: 4367out_unreg_blkdev:
4340 unregister_blkdev(FLOPPY_MAJOR, "fd"); 4368 unregister_blkdev(FLOPPY_MAJOR, "fd");
4341out_put_disk: 4369out_put_disk:
4342 while (dr--) { 4370 while (dr--) {
4343 del_timer(&motor_off_timer[dr]); 4371 del_timer(&motor_off_timer[dr]);
4372 if (disks[dr]->queue)
4373 blk_cleanup_queue(disks[dr]->queue);
4344 put_disk(disks[dr]); 4374 put_disk(disks[dr]);
4345 } 4375 }
4346 return err; 4376 return err;
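
Because each disk now owns its queue, floppy_init()'s out_put_disk path unwinds per drive: the while (dr--) loop releases, in reverse order, only what the successful iterations created, checking disks[dr]->queue before calling blk_cleanup_queue(). The same reverse-unwind idiom in a self-contained sketch, where setup(), teardown() and the simulated failure are placeholders:

#include <stdio.h>
#include <stdlib.h>

#define N 4

static void *objs[N];

static void *setup(int i)
{
	if (i == 2)                 /* simulate a failure part way through */
		return NULL;
	return malloc(16);
}

static void teardown(int i)
{
	printf("tearing down %d\n", i);
	free(objs[i]);
}

int main(void)
{
	int i;

	for (i = 0; i < N; i++) {
		objs[i] = setup(i);
		if (!objs[i])
			goto out_unwind;
	}
	return 0;

out_unwind:
	/* unwind only what was successfully set up, in reverse order,
	 * like the while (dr--) loop in the driver's error path */
	while (i--)
		teardown(i);
	return 1;
}
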
@@ -4398,7 +4428,7 @@ static int floppy_grab_irq_and_dma(void)
4398 * We might have scheduled a free_irq(), wait it to 4428 * We might have scheduled a free_irq(), wait it to
4399 * drain first: 4429 * drain first:
4400 */ 4430 */
4401 flush_scheduled_work(); 4431 flush_work_sync(&floppy_work);
4402 4432
4403 if (fd_request_irq()) { 4433 if (fd_request_irq()) {
4404 DPRINT("Unable to grab IRQ%d for the floppy driver\n", 4434 DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4549,12 +4579,12 @@ static void __exit floppy_module_exit(void)
4549 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); 4579 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
4550 platform_device_unregister(&floppy_device[drive]); 4580 platform_device_unregister(&floppy_device[drive]);
4551 } 4581 }
4582 blk_cleanup_queue(disks[drive]->queue);
4552 put_disk(disks[drive]); 4583 put_disk(disks[drive]);
4553 } 4584 }
4554 4585
4555 del_timer_sync(&fd_timeout); 4586 del_timer_sync(&fd_timeout);
4556 del_timer_sync(&fd_timer); 4587 del_timer_sync(&fd_timer);
4557 blk_cleanup_queue(floppy_queue);
4558 4588
4559 if (atomic_read(&usage_count)) 4589 if (atomic_read(&usage_count))
4560 floppy_release_irq_and_dma(); 4590 floppy_release_irq_and_dma();
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 30ec6b37424e..007c630904c1 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -733,7 +733,7 @@ static int __init hd_init(void)
733 * the BIOS or CMOS. This doesn't work all that well, 733 * the BIOS or CMOS. This doesn't work all that well,
734 * since this assumes that this is a primary or secondary 734 * since this assumes that this is a primary or secondary
735 * drive, and if we're using this legacy driver, it's 735 * drive, and if we're using this legacy driver, it's
736 * probably an auxilliary controller added to recover 736 * probably an auxiliary controller added to recover
737 * legacy data off an ST-506 drive. Either way, it's 737 * legacy data off an ST-506 drive. Either way, it's
738 * definitely safest to have the user explicitly specify 738 * definitely safest to have the user explicitly specify
739 * the information. 739 * the information.
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 91797bbbe702..76c8da78212b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -67,13 +67,14 @@
67#include <linux/compat.h> 67#include <linux/compat.h>
68#include <linux/suspend.h> 68#include <linux/suspend.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/smp_lock.h> 70#include <linux/mutex.h>
71#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/buffer_head.h> /* for invalidate_bdev() */ 72#include <linux/buffer_head.h> /* for invalidate_bdev() */
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/highmem.h> 74#include <linux/highmem.h>
75#include <linux/kthread.h> 75#include <linux/kthread.h>
76#include <linux/splice.h> 76#include <linux/splice.h>
77#include <linux/sysfs.h>
77 78
78#include <asm/uaccess.h> 79#include <asm/uaccess.h>
79 80
@@ -99,8 +100,8 @@ static int transfer_none(struct loop_device *lo, int cmd,
99 else 100 else
100 memcpy(raw_buf, loop_buf, size); 101 memcpy(raw_buf, loop_buf, size);
101 102
102 kunmap_atomic(raw_buf, KM_USER0);
103 kunmap_atomic(loop_buf, KM_USER1); 103 kunmap_atomic(loop_buf, KM_USER1);
104 kunmap_atomic(raw_buf, KM_USER0);
104 cond_resched(); 105 cond_resched();
105 return 0; 106 return 0;
106} 107}
@@ -128,8 +129,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
128 for (i = 0; i < size; i++) 129 for (i = 0; i < size; i++)
129 *out++ = *in++ ^ key[(i & 511) % keysize]; 130 *out++ = *in++ ^ key[(i & 511) % keysize];
130 131
131 kunmap_atomic(raw_buf, KM_USER0);
132 kunmap_atomic(loop_buf, KM_USER1); 132 kunmap_atomic(loop_buf, KM_USER1);
133 kunmap_atomic(raw_buf, KM_USER0);
133 cond_resched(); 134 cond_resched();
134 return 0; 135 return 0;
135} 136}
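
The two transfer helpers now call kunmap_atomic() in the reverse order of the kmap_atomic() calls: atomic kmaps behave like a small per-CPU stack of mapping slots, so the most recently established mapping has to be torn down first. A user-space analogue of that LIFO rule, with map(), unmap() and the slot stack invented purely for illustration:

#include <assert.h>
#include <stdio.h>

#define SLOTS 2

static int slot_stack[SLOTS];
static int top;

static int map(int page)        /* kmap_atomic() analogue: push a slot */
{
	assert(top < SLOTS);
	slot_stack[top] = page;
	return top++;
}

static void unmap(int slot)     /* kunmap_atomic() analogue: must pop LIFO */
{
	assert(slot == top - 1);    /* out-of-order release would corrupt the stack */
	top--;
}

int main(void)
{
	int raw = map(1);           /* raw_buf,  KM_USER0 */
	int loop = map(2);          /* loop_buf, KM_USER1 */

	/* copy between the two mappings here ... */

	unmap(loop);                /* most recent mapping first */
	unmap(raw);
	printf("mappings released in LIFO order\n");
	return 0;
}
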
@@ -393,11 +394,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
393 struct loop_device *lo = p->lo; 394 struct loop_device *lo = p->lo;
394 struct page *page = buf->page; 395 struct page *page = buf->page;
395 sector_t IV; 396 sector_t IV;
396 int size, ret; 397 int size;
397
398 ret = buf->ops->confirm(pipe, buf);
399 if (unlikely(ret))
400 return ret;
401 398
402 IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + 399 IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
403 (buf->offset >> 9); 400 (buf->offset >> 9);
@@ -477,17 +474,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; 474 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
478 475
479 if (bio_rw(bio) == WRITE) { 476 if (bio_rw(bio) == WRITE) {
480 bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
481 struct file *file = lo->lo_backing_file; 477 struct file *file = lo->lo_backing_file;
482 478
483 if (barrier) { 479 if (bio->bi_rw & REQ_FLUSH) {
484 if (unlikely(!file->f_op->fsync)) {
485 ret = -EOPNOTSUPP;
486 goto out;
487 }
488
489 ret = vfs_fsync(file, 0); 480 ret = vfs_fsync(file, 0);
490 if (unlikely(ret)) { 481 if (unlikely(ret && ret != -EINVAL)) {
491 ret = -EIO; 482 ret = -EIO;
492 goto out; 483 goto out;
493 } 484 }
@@ -495,9 +486,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
495 486
496 ret = lo_send(lo, bio, pos); 487 ret = lo_send(lo, bio, pos);
497 488
498 if (barrier && !ret) { 489 if ((bio->bi_rw & REQ_FUA) && !ret) {
499 ret = vfs_fsync(file, 0); 490 ret = vfs_fsync(file, 0);
500 if (unlikely(ret)) 491 if (unlikely(ret && ret != -EINVAL))
501 ret = -EIO; 492 ret = -EIO;
502 } 493 }
503 } else 494 } else
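
With REQ_HARDBARRIER gone, loop maps the block layer's flush semantics onto the backing file: REQ_FLUSH triggers a vfs_fsync() before the data is written, REQ_FUA another one afterwards, and -EINVAL from file systems without a useful fsync is tolerated. A rough user-space sketch of the same ordering against an ordinary file descriptor; the MY_REQ_* flags, write_bio() and the file name are made up:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define MY_REQ_FLUSH (1u << 0)   /* flush earlier data before this write */
#define MY_REQ_FUA   (1u << 1)   /* this write must be durable on return */

static int write_bio(int fd, const void *buf, size_t len, unsigned int flags)
{
	if ((flags & MY_REQ_FLUSH) && fsync(fd) < 0)
		return -1;                        /* preceding data must be stable first */

	if (write(fd, buf, len) != (ssize_t)len)
		return -1;

	if ((flags & MY_REQ_FUA) && fsync(fd) < 0)
		return -1;                        /* the new data itself must be stable */
	return 0;
}

int main(void)
{
	int fd = open("loop-sketch.dat", O_RDWR | O_CREAT | O_TRUNC, 0600);
	const char payload[] = "journal commit block\n";

	if (fd < 0)
		return 1;
	if (write_bio(fd, payload, strlen(payload), MY_REQ_FLUSH | MY_REQ_FUA))
		perror("write_bio");
	close(fd);
	return 0;
}
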
@@ -549,17 +540,6 @@ out:
549 return 0; 540 return 0;
550} 541}
551 542
552/*
553 * kick off io on the underlying address space
554 */
555static void loop_unplug(struct request_queue *q)
556{
557 struct loop_device *lo = q->queuedata;
558
559 queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q);
560 blk_run_address_space(lo->lo_backing_file->f_mapping);
561}
562
563struct switch_request { 543struct switch_request {
564 struct file *file; 544 struct file *file;
565 struct completion wait; 545 struct completion wait;
@@ -737,6 +717,103 @@ static inline int is_loop_device(struct file *file)
737 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; 717 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
738} 718}
739 719
720/* loop sysfs attributes */
721
722static ssize_t loop_attr_show(struct device *dev, char *page,
723 ssize_t (*callback)(struct loop_device *, char *))
724{
725 struct loop_device *l, *lo = NULL;
726
727 mutex_lock(&loop_devices_mutex);
728 list_for_each_entry(l, &loop_devices, lo_list)
729 if (disk_to_dev(l->lo_disk) == dev) {
730 lo = l;
731 break;
732 }
733 mutex_unlock(&loop_devices_mutex);
734
735 return lo ? callback(lo, page) : -EIO;
736}
737
738#define LOOP_ATTR_RO(_name) \
739static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
740static ssize_t loop_attr_do_show_##_name(struct device *d, \
741 struct device_attribute *attr, char *b) \
742{ \
743 return loop_attr_show(d, b, loop_attr_##_name##_show); \
744} \
745static struct device_attribute loop_attr_##_name = \
746 __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
747
748static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
749{
750 ssize_t ret;
751 char *p = NULL;
752
753 mutex_lock(&lo->lo_ctl_mutex);
754 if (lo->lo_backing_file)
755 p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
756 mutex_unlock(&lo->lo_ctl_mutex);
757
758 if (IS_ERR_OR_NULL(p))
759 ret = PTR_ERR(p);
760 else {
761 ret = strlen(p);
762 memmove(buf, p, ret);
763 buf[ret++] = '\n';
764 buf[ret] = 0;
765 }
766
767 return ret;
768}
769
770static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
771{
772 return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
773}
774
775static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
776{
777 return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
778}
779
780static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
781{
782 int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
783
784 return sprintf(buf, "%s\n", autoclear ? "1" : "0");
785}
786
787LOOP_ATTR_RO(backing_file);
788LOOP_ATTR_RO(offset);
789LOOP_ATTR_RO(sizelimit);
790LOOP_ATTR_RO(autoclear);
791
792static struct attribute *loop_attrs[] = {
793 &loop_attr_backing_file.attr,
794 &loop_attr_offset.attr,
795 &loop_attr_sizelimit.attr,
796 &loop_attr_autoclear.attr,
797 NULL,
798};
799
800static struct attribute_group loop_attribute_group = {
801 .name = "loop",
802 .attrs= loop_attrs,
803};
804
805static int loop_sysfs_init(struct loop_device *lo)
806{
807 return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
808 &loop_attribute_group);
809}
810
811static void loop_sysfs_exit(struct loop_device *lo)
812{
813 sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
814 &loop_attribute_group);
815}
816
740static int loop_set_fd(struct loop_device *lo, fmode_t mode, 817static int loop_set_fd(struct loop_device *lo, fmode_t mode,
741 struct block_device *bdev, unsigned int arg) 818 struct block_device *bdev, unsigned int arg)
742{ 819{
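
The new sysfs support follows a familiar kernel pattern: a generic loop_attr_show() maps the struct device back to its loop_device, and the LOOP_ATTR_RO() macro stamps out a trampoline plus a device_attribute for each read-only field, all collected into one attribute_group under /sys/block/loopN/loop. The standalone sketch below mimics that macro-generated accessor table in plain C; struct loop_dev, the field values and the attr table are placeholders, not the sysfs API:

#include <stdio.h>

struct loop_dev {
	const char *backing_file;
	unsigned long long offset;
};

typedef int (*show_fn)(const struct loop_dev *, char *, size_t);

struct attr {
	const char *name;
	show_fn show;
};

/* Stamp out one show function per read-only attribute, in the spirit of
 * the driver's LOOP_ATTR_RO() macro. */
#define LOOP_ATTR_RO(_name, _fmt, _expr)                                  \
static int show_##_name(const struct loop_dev *lo, char *buf, size_t n)  \
{                                                                         \
	return snprintf(buf, n, _fmt "\n", _expr);                        \
}

LOOP_ATTR_RO(backing_file, "%s", lo->backing_file)
LOOP_ATTR_RO(offset, "%llu", lo->offset)

static const struct attr attrs[] = {
	{ "backing_file", show_backing_file },
	{ "offset",       show_offset },
};

int main(void)
{
	struct loop_dev lo = { .backing_file = "/var/tmp/disk.img", .offset = 4096 };
	char buf[128];
	size_t i;

	for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		attrs[i].show(&lo, buf, sizeof(buf));
		printf("loop/%s: %s", attrs[i].name, buf);
	}
	return 0;
}
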
@@ -829,13 +906,13 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
829 */ 906 */
830 blk_queue_make_request(lo->lo_queue, loop_make_request); 907 blk_queue_make_request(lo->lo_queue, loop_make_request);
831 lo->lo_queue->queuedata = lo; 908 lo->lo_queue->queuedata = lo;
832 lo->lo_queue->unplug_fn = loop_unplug;
833 909
834 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 910 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
835 blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN); 911 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
836 912
837 set_capacity(lo->lo_disk, size); 913 set_capacity(lo->lo_disk, size);
838 bd_set_size(bdev, size << 9); 914 bd_set_size(bdev, size << 9);
915 loop_sysfs_init(lo);
839 /* let user-space know about the new size */ 916 /* let user-space know about the new size */
840 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 917 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
841 918
@@ -854,6 +931,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
854 return 0; 931 return 0;
855 932
856out_clr: 933out_clr:
934 loop_sysfs_exit(lo);
857 lo->lo_thread = NULL; 935 lo->lo_thread = NULL;
858 lo->lo_device = NULL; 936 lo->lo_device = NULL;
859 lo->lo_backing_file = NULL; 937 lo->lo_backing_file = NULL;
@@ -929,7 +1007,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
929 1007
930 kthread_stop(lo->lo_thread); 1008 kthread_stop(lo->lo_thread);
931 1009
932 lo->lo_queue->unplug_fn = NULL;
933 lo->lo_backing_file = NULL; 1010 lo->lo_backing_file = NULL;
934 1011
935 loop_release_xfer(lo); 1012 loop_release_xfer(lo);
@@ -948,6 +1025,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
948 if (bdev) 1025 if (bdev)
949 invalidate_bdev(bdev); 1026 invalidate_bdev(bdev);
950 set_capacity(lo->lo_disk, 0); 1027 set_capacity(lo->lo_disk, 0);
1028 loop_sysfs_exit(lo);
951 if (bdev) { 1029 if (bdev) {
952 bd_set_size(bdev, 0); 1030 bd_set_size(bdev, 0);
953 /* let user-space know about this change */ 1031 /* let user-space know about this change */
@@ -1409,11 +1487,9 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
1409{ 1487{
1410 struct loop_device *lo = bdev->bd_disk->private_data; 1488 struct loop_device *lo = bdev->bd_disk->private_data;
1411 1489
1412 lock_kernel();
1413 mutex_lock(&lo->lo_ctl_mutex); 1490 mutex_lock(&lo->lo_ctl_mutex);
1414 lo->lo_refcnt++; 1491 lo->lo_refcnt++;
1415 mutex_unlock(&lo->lo_ctl_mutex); 1492 mutex_unlock(&lo->lo_ctl_mutex);
1416 unlock_kernel();
1417 1493
1418 return 0; 1494 return 0;
1419} 1495}
@@ -1423,7 +1499,6 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
1423 struct loop_device *lo = disk->private_data; 1499 struct loop_device *lo = disk->private_data;
1424 int err; 1500 int err;
1425 1501
1426 lock_kernel();
1427 mutex_lock(&lo->lo_ctl_mutex); 1502 mutex_lock(&lo->lo_ctl_mutex);
1428 1503
1429 if (--lo->lo_refcnt) 1504 if (--lo->lo_refcnt)
@@ -1448,7 +1523,6 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
1448out: 1523out:
1449 mutex_unlock(&lo->lo_ctl_mutex); 1524 mutex_unlock(&lo->lo_ctl_mutex);
1450out_unlocked: 1525out_unlocked:
1451 lock_kernel();
1452 return 0; 1526 return 0;
1453} 1527}
1454 1528
@@ -1466,9 +1540,9 @@ static const struct block_device_operations lo_fops = {
1466 * And now the modules code and kernel interface. 1540 * And now the modules code and kernel interface.
1467 */ 1541 */
1468static int max_loop; 1542static int max_loop;
1469module_param(max_loop, int, 0); 1543module_param(max_loop, int, S_IRUGO);
1470MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); 1544MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
1471module_param(max_part, int, 0); 1545module_param(max_part, int, S_IRUGO);
1472MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); 1546MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
1473MODULE_LICENSE("GPL"); 1547MODULE_LICENSE("GPL");
1474MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); 1548MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
@@ -1584,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1584 struct kobject *kobj; 1658 struct kobject *kobj;
1585 1659
1586 mutex_lock(&loop_devices_mutex); 1660 mutex_lock(&loop_devices_mutex);
1587 lo = loop_init_one(dev & MINORMASK); 1661 lo = loop_init_one(MINOR(dev) >> part_shift);
1588 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); 1662 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
1589 mutex_unlock(&loop_devices_mutex); 1663 mutex_unlock(&loop_devices_mutex);
1590 1664
@@ -1614,18 +1688,32 @@ static int __init loop_init(void)
1614 */ 1688 */
1615 1689
1616 part_shift = 0; 1690 part_shift = 0;
1617 if (max_part > 0) 1691 if (max_part > 0) {
1618 part_shift = fls(max_part); 1692 part_shift = fls(max_part);
1619 1693
1694 /*
1695 * Adjust max_part according to part_shift as it is exported
1696 * to user space so that users can decide the correct minor number
1697 * if they want to create more devices.
1698 *
1699 * Note that -1 is required because partition 0 is reserved
1700 * for the whole disk.
1701 */
1702 max_part = (1UL << part_shift) - 1;
1703 }
1704
1705 if ((1UL << part_shift) > DISK_MAX_PARTS)
1706 return -EINVAL;
1707
1620 if (max_loop > 1UL << (MINORBITS - part_shift)) 1708 if (max_loop > 1UL << (MINORBITS - part_shift))
1621 return -EINVAL; 1709 return -EINVAL;
1622 1710
1623 if (max_loop) { 1711 if (max_loop) {
1624 nr = max_loop; 1712 nr = max_loop;
1625 range = max_loop; 1713 range = max_loop << part_shift;
1626 } else { 1714 } else {
1627 nr = 8; 1715 nr = 8;
1628 range = 1UL << (MINORBITS - part_shift); 1716 range = 1UL << MINORBITS;
1629 } 1717 }
1630 1718
1631 if (register_blkdev(LOOP_MAJOR, "loop")) 1719 if (register_blkdev(LOOP_MAJOR, "loop"))
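
loop_init() now derives the whole minor-number layout from max_part: part_shift = fls(max_part) bits are reserved for partitions, max_part is rounded up to (1 << part_shift) - 1 (partition 0 is the whole disk), loop_probe() recovers the device index as MINOR(dev) >> part_shift, and the registered range becomes max_loop << part_shift. The arithmetic, checked in a standalone program (fls() is reimplemented with __builtin_clz since it is a kernel helper; the sample values are arbitrary):

#include <stdio.h>

/* find last (most significant) set bit, 1-based; 0 for x == 0,
 * mirroring the kernel's fls() */
static int fls_user(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int max_part = 15;     /* e.g. the module parameter max_part=15 */
	unsigned int part_shift = 0;
	unsigned int minor = 70;        /* an arbitrary minor number to decode */

	if (max_part > 0) {
		part_shift = fls_user(max_part);
		/* round up to a full power-of-two range; -1 because minor 0
		 * of each device addresses the whole disk */
		max_part = (1u << part_shift) - 1;
	}

	printf("part_shift=%u, usable partitions per device=%u\n",
	       part_shift, max_part);

	/* recovering the device index from a minor, as loop_probe() does
	 * with MINOR(dev) >> part_shift */
	printf("minor %u -> device %u, partition %u\n",
	       minor, minor >> part_shift, minor & max_part);
	return 0;
}
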
@@ -1664,7 +1752,7 @@ static void __exit loop_exit(void)
1664 unsigned long range; 1752 unsigned long range;
1665 struct loop_device *lo, *next; 1753 struct loop_device *lo, *next;
1666 1754
1667 range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); 1755 range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
1668 1756
1669 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1757 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1670 loop_del_one(lo); 1758 loop_del_one(lo);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0daa422aa281..f533f3375e24 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -24,7 +24,7 @@
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/file.h> 25#include <linux/file.h>
26#include <linux/ioctl.h> 26#include <linux/ioctl.h>
27#include <linux/smp_lock.h> 27#include <linux/mutex.h>
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/kernel.h> 30#include <linux/kernel.h>
@@ -192,7 +192,8 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
192 if (lo->xmit_timeout) 192 if (lo->xmit_timeout)
193 del_timer_sync(&ti); 193 del_timer_sync(&ti);
194 } else 194 } else
195 result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0); 195 result = kernel_recvmsg(sock, &msg, &iov, 1, size,
196 msg.msg_flags);
196 197
197 if (signal_pending(current)) { 198 if (signal_pending(current)) {
198 siginfo_t info; 199 siginfo_t info;
@@ -717,11 +718,9 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
717 dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", 718 dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
718 lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); 719 lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
719 720
720 lock_kernel();
721 mutex_lock(&lo->tx_lock); 721 mutex_lock(&lo->tx_lock);
722 error = __nbd_ioctl(bdev, lo, cmd, arg); 722 error = __nbd_ioctl(bdev, lo, cmd, arg);
723 mutex_unlock(&lo->tx_lock); 723 mutex_unlock(&lo->tx_lock);
724 unlock_kernel();
725 724
726 return error; 725 return error;
727} 726}
@@ -755,9 +754,26 @@ static int __init nbd_init(void)
755 return -ENOMEM; 754 return -ENOMEM;
756 755
757 part_shift = 0; 756 part_shift = 0;
758 if (max_part > 0) 757 if (max_part > 0) {
759 part_shift = fls(max_part); 758 part_shift = fls(max_part);
760 759
760 /*
761 * Adjust max_part according to part_shift as it is exported
762 * to user space so that users can know the max number of
763 * partitions the kernel should be able to manage.
764 *
765 * Note that -1 is required because partition 0 is reserved
766 * for the whole disk.
767 */
768 max_part = (1UL << part_shift) - 1;
769 }
770
771 if ((1UL << part_shift) > DISK_MAX_PARTS)
772 return -EINVAL;
773
774 if (nbds_max > 1UL << (MINORBITS - part_shift))
775 return -EINVAL;
776
761 for (i = 0; i < nbds_max; i++) { 777 for (i = 0; i < nbds_max; i++) {
762 struct gendisk *disk = alloc_disk(1 << part_shift); 778 struct gendisk *disk = alloc_disk(1 << part_shift);
763 if (!disk) 779 if (!disk)
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c62..87311ebac0db 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
310 break; 310 break;
311 311
312 /* filter out block requests we don't understand */ 312 /* filter out block requests we don't understand */
313 if (rq->cmd_type != REQ_TYPE_FS && 313 if (rq->cmd_type != REQ_TYPE_FS) {
314 !(rq->cmd_flags & REQ_HARDBARRIER)) {
315 blk_end_request_all(rq, 0); 314 blk_end_request_all(rq, 0);
316 continue; 315 continue;
317 } 316 }
@@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
439 blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); 438 blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
440 439
441 blk_queue_prep_rq(q, blk_queue_start_tag); 440 blk_queue_prep_rq(q, blk_queue_start_tag);
442 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); 441 blk_queue_flush(q, REQ_FLUSH);
443 442
444 disk->queue = q; 443 disk->queue = q;
445 444
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 76f8565e1e8d..46b8136c31bb 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,9 +138,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
138#include <linux/cdrom.h> 138#include <linux/cdrom.h>
139#include <linux/spinlock.h> 139#include <linux/spinlock.h>
140#include <linux/blkdev.h> 140#include <linux/blkdev.h>
141#include <linux/smp_lock.h> 141#include <linux/mutex.h>
142#include <asm/uaccess.h> 142#include <asm/uaccess.h>
143 143
144static DEFINE_MUTEX(pcd_mutex);
144static DEFINE_SPINLOCK(pcd_lock); 145static DEFINE_SPINLOCK(pcd_lock);
145 146
146module_param(verbose, bool, 0644); 147module_param(verbose, bool, 0644);
@@ -171,7 +172,8 @@ module_param_array(drive3, int, NULL, 0);
171static int pcd_open(struct cdrom_device_info *cdi, int purpose); 172static int pcd_open(struct cdrom_device_info *cdi, int purpose);
172static void pcd_release(struct cdrom_device_info *cdi); 173static void pcd_release(struct cdrom_device_info *cdi);
173static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr); 174static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
174static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr); 175static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
176 unsigned int clearing, int slot_nr);
175static int pcd_tray_move(struct cdrom_device_info *cdi, int position); 177static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
176static int pcd_lock_door(struct cdrom_device_info *cdi, int lock); 178static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
177static int pcd_drive_reset(struct cdrom_device_info *cdi); 179static int pcd_drive_reset(struct cdrom_device_info *cdi);
@@ -227,9 +229,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
227 struct pcd_unit *cd = bdev->bd_disk->private_data; 229 struct pcd_unit *cd = bdev->bd_disk->private_data;
228 int ret; 230 int ret;
229 231
230 lock_kernel(); 232 mutex_lock(&pcd_mutex);
231 ret = cdrom_open(&cd->info, bdev, mode); 233 ret = cdrom_open(&cd->info, bdev, mode);
232 unlock_kernel(); 234 mutex_unlock(&pcd_mutex);
233 235
234 return ret; 236 return ret;
235} 237}
@@ -237,9 +239,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
237static int pcd_block_release(struct gendisk *disk, fmode_t mode) 239static int pcd_block_release(struct gendisk *disk, fmode_t mode)
238{ 240{
239 struct pcd_unit *cd = disk->private_data; 241 struct pcd_unit *cd = disk->private_data;
240 lock_kernel(); 242 mutex_lock(&pcd_mutex);
241 cdrom_release(&cd->info, mode); 243 cdrom_release(&cd->info, mode);
242 unlock_kernel(); 244 mutex_unlock(&pcd_mutex);
243 return 0; 245 return 0;
244} 246}
245 247
@@ -249,17 +251,18 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
249 struct pcd_unit *cd = bdev->bd_disk->private_data; 251 struct pcd_unit *cd = bdev->bd_disk->private_data;
250 int ret; 252 int ret;
251 253
252 lock_kernel(); 254 mutex_lock(&pcd_mutex);
253 ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg); 255 ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
254 unlock_kernel(); 256 mutex_unlock(&pcd_mutex);
255 257
256 return ret; 258 return ret;
257} 259}
258 260
259static int pcd_block_media_changed(struct gendisk *disk) 261static unsigned int pcd_block_check_events(struct gendisk *disk,
262 unsigned int clearing)
260{ 263{
261 struct pcd_unit *cd = disk->private_data; 264 struct pcd_unit *cd = disk->private_data;
262 return cdrom_media_changed(&cd->info); 265 return cdrom_check_events(&cd->info, clearing);
263} 266}
264 267
265static const struct block_device_operations pcd_bdops = { 268static const struct block_device_operations pcd_bdops = {
@@ -267,14 +270,14 @@ static const struct block_device_operations pcd_bdops = {
267 .open = pcd_block_open, 270 .open = pcd_block_open,
268 .release = pcd_block_release, 271 .release = pcd_block_release,
269 .ioctl = pcd_block_ioctl, 272 .ioctl = pcd_block_ioctl,
270 .media_changed = pcd_block_media_changed, 273 .check_events = pcd_block_check_events,
271}; 274};
272 275
273static struct cdrom_device_ops pcd_dops = { 276static struct cdrom_device_ops pcd_dops = {
274 .open = pcd_open, 277 .open = pcd_open,
275 .release = pcd_release, 278 .release = pcd_release,
276 .drive_status = pcd_drive_status, 279 .drive_status = pcd_drive_status,
277 .media_changed = pcd_media_changed, 280 .check_events = pcd_check_events,
278 .tray_move = pcd_tray_move, 281 .tray_move = pcd_tray_move,
279 .lock_door = pcd_lock_door, 282 .lock_door = pcd_lock_door,
280 .get_mcn = pcd_get_mcn, 283 .get_mcn = pcd_get_mcn,
@@ -317,6 +320,7 @@ static void pcd_init_units(void)
317 disk->first_minor = unit; 320 disk->first_minor = unit;
318 strcpy(disk->disk_name, cd->name); /* umm... */ 321 strcpy(disk->disk_name, cd->name); /* umm... */
319 disk->fops = &pcd_bdops; 322 disk->fops = &pcd_bdops;
323 disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
320 } 324 }
321} 325}
322 326
@@ -501,13 +505,14 @@ static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
501 505
502#define DBMSG(msg) ((verbose>1)?(msg):NULL) 506#define DBMSG(msg) ((verbose>1)?(msg):NULL)
503 507
504static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr) 508static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
509 unsigned int clearing, int slot_nr)
505{ 510{
506 struct pcd_unit *cd = cdi->handle; 511 struct pcd_unit *cd = cdi->handle;
507 int res = cd->changed; 512 int res = cd->changed;
508 if (res) 513 if (res)
509 cd->changed = 0; 514 cd->changed = 0;
510 return res; 515 return res ? DISK_EVENT_MEDIA_CHANGE : 0;
511} 516}
512 517
513static int pcd_lock_door(struct cdrom_device_info *cdi, int lock) 518static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 985f0d4f1d1e..869e7676d46f 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -153,10 +153,11 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
153#include <linux/blkdev.h> 153#include <linux/blkdev.h>
154#include <linux/blkpg.h> 154#include <linux/blkpg.h>
155#include <linux/kernel.h> 155#include <linux/kernel.h>
156#include <linux/smp_lock.h> 156#include <linux/mutex.h>
157#include <asm/uaccess.h> 157#include <asm/uaccess.h>
158#include <linux/workqueue.h> 158#include <linux/workqueue.h>
159 159
160static DEFINE_MUTEX(pd_mutex);
160static DEFINE_SPINLOCK(pd_lock); 161static DEFINE_SPINLOCK(pd_lock);
161 162
162module_param(verbose, bool, 0); 163module_param(verbose, bool, 0);
@@ -736,14 +737,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode)
736{ 737{
737 struct pd_unit *disk = bdev->bd_disk->private_data; 738 struct pd_unit *disk = bdev->bd_disk->private_data;
738 739
739 lock_kernel(); 740 mutex_lock(&pd_mutex);
740 disk->access++; 741 disk->access++;
741 742
742 if (disk->removable) { 743 if (disk->removable) {
743 pd_special_command(disk, pd_media_check); 744 pd_special_command(disk, pd_media_check);
744 pd_special_command(disk, pd_door_lock); 745 pd_special_command(disk, pd_door_lock);
745 } 746 }
746 unlock_kernel(); 747 mutex_unlock(&pd_mutex);
747 return 0; 748 return 0;
748} 749}
749 750
@@ -771,10 +772,10 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
771 772
772 switch (cmd) { 773 switch (cmd) {
773 case CDROMEJECT: 774 case CDROMEJECT:
774 lock_kernel(); 775 mutex_lock(&pd_mutex);
775 if (disk->access == 1) 776 if (disk->access == 1)
776 pd_special_command(disk, pd_eject); 777 pd_special_command(disk, pd_eject);
777 unlock_kernel(); 778 mutex_unlock(&pd_mutex);
778 return 0; 779 return 0;
779 default: 780 default:
780 return -EINVAL; 781 return -EINVAL;
@@ -785,15 +786,15 @@ static int pd_release(struct gendisk *p, fmode_t mode)
785{ 786{
786 struct pd_unit *disk = p->private_data; 787 struct pd_unit *disk = p->private_data;
787 788
788 lock_kernel(); 789 mutex_lock(&pd_mutex);
789 if (!--disk->access && disk->removable) 790 if (!--disk->access && disk->removable)
790 pd_special_command(disk, pd_door_unlock); 791 pd_special_command(disk, pd_door_unlock);
791 unlock_kernel(); 792 mutex_unlock(&pd_mutex);
792 793
793 return 0; 794 return 0;
794} 795}
795 796
796static int pd_check_media(struct gendisk *p) 797static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
797{ 798{
798 struct pd_unit *disk = p->private_data; 799 struct pd_unit *disk = p->private_data;
799 int r; 800 int r;
@@ -802,7 +803,7 @@ static int pd_check_media(struct gendisk *p)
802 pd_special_command(disk, pd_media_check); 803 pd_special_command(disk, pd_media_check);
803 r = disk->changed; 804 r = disk->changed;
804 disk->changed = 0; 805 disk->changed = 0;
805 return r; 806 return r ? DISK_EVENT_MEDIA_CHANGE : 0;
806} 807}
807 808
808static int pd_revalidate(struct gendisk *p) 809static int pd_revalidate(struct gendisk *p)
@@ -821,7 +822,7 @@ static const struct block_device_operations pd_fops = {
821 .release = pd_release, 822 .release = pd_release,
822 .ioctl = pd_ioctl, 823 .ioctl = pd_ioctl,
823 .getgeo = pd_getgeo, 824 .getgeo = pd_getgeo,
824 .media_changed = pd_check_media, 825 .check_events = pd_check_events,
825 .revalidate_disk= pd_revalidate 826 .revalidate_disk= pd_revalidate
826}; 827};
827 828
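
pd.c also follows the BKL push-down applied across these drivers: <linux/smp_lock.h> is dropped, a file-local mutex is introduced, and each lock_kernel()/unlock_kernel() pair becomes mutex_lock()/mutex_unlock() on that mutex, preserving the old serialization of open/ioctl/release. Sketched below for a hypothetical driver; the mydrv names are placeholders:

static DEFINE_MUTEX(mydrv_mutex);	/* stands in for the old big kernel lock */

static int mydrv_open(struct block_device *bdev, fmode_t mode)
{
	struct mydrv_unit *u = bdev->bd_disk->private_data;

	mutex_lock(&mydrv_mutex);	/* was lock_kernel() */
	u->access++;
	mutex_unlock(&mydrv_mutex);	/* was unlock_kernel() */
	return 0;
}
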
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 4457b494882a..f21b520ef419 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -152,9 +152,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
152#include <linux/spinlock.h> 152#include <linux/spinlock.h>
153#include <linux/blkdev.h> 153#include <linux/blkdev.h>
154#include <linux/blkpg.h> 154#include <linux/blkpg.h>
155#include <linux/smp_lock.h> 155#include <linux/mutex.h>
156#include <asm/uaccess.h> 156#include <asm/uaccess.h>
157 157
158static DEFINE_MUTEX(pf_mutex);
158static DEFINE_SPINLOCK(pf_spin_lock); 159static DEFINE_SPINLOCK(pf_spin_lock);
159 160
160module_param(verbose, bool, 0644); 161module_param(verbose, bool, 0644);
@@ -242,7 +243,8 @@ static struct pf_unit units[PF_UNITS];
242static int pf_identify(struct pf_unit *pf); 243static int pf_identify(struct pf_unit *pf);
243static void pf_lock(struct pf_unit *pf, int func); 244static void pf_lock(struct pf_unit *pf, int func);
244static void pf_eject(struct pf_unit *pf); 245static void pf_eject(struct pf_unit *pf);
245static int pf_check_media(struct gendisk *disk); 246static unsigned int pf_check_events(struct gendisk *disk,
247 unsigned int clearing);
246 248
247static char pf_scratch[512]; /* scratch block buffer */ 249static char pf_scratch[512]; /* scratch block buffer */
248 250
@@ -269,7 +271,7 @@ static const struct block_device_operations pf_fops = {
269 .release = pf_release, 271 .release = pf_release,
270 .ioctl = pf_ioctl, 272 .ioctl = pf_ioctl,
271 .getgeo = pf_getgeo, 273 .getgeo = pf_getgeo,
272 .media_changed = pf_check_media, 274 .check_events = pf_check_events,
273}; 275};
274 276
275static void __init pf_init_units(void) 277static void __init pf_init_units(void)
@@ -302,7 +304,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
302 struct pf_unit *pf = bdev->bd_disk->private_data; 304 struct pf_unit *pf = bdev->bd_disk->private_data;
303 int ret; 305 int ret;
304 306
305 lock_kernel(); 307 mutex_lock(&pf_mutex);
306 pf_identify(pf); 308 pf_identify(pf);
307 309
308 ret = -ENODEV; 310 ret = -ENODEV;
@@ -318,7 +320,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
318 if (pf->removable) 320 if (pf->removable)
319 pf_lock(pf, 1); 321 pf_lock(pf, 1);
320out: 322out:
321 unlock_kernel(); 323 mutex_unlock(&pf_mutex);
322 return ret; 324 return ret;
323} 325}
324 326
@@ -349,9 +351,9 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
349 351
350 if (pf->access != 1) 352 if (pf->access != 1)
351 return -EBUSY; 353 return -EBUSY;
352 lock_kernel(); 354 mutex_lock(&pf_mutex);
353 pf_eject(pf); 355 pf_eject(pf);
354 unlock_kernel(); 356 mutex_unlock(&pf_mutex);
355 357
356 return 0; 358 return 0;
357} 359}
@@ -360,9 +362,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
360{ 362{
361 struct pf_unit *pf = disk->private_data; 363 struct pf_unit *pf = disk->private_data;
362 364
363 lock_kernel(); 365 mutex_lock(&pf_mutex);
364 if (pf->access <= 0) { 366 if (pf->access <= 0) {
365 unlock_kernel(); 367 mutex_unlock(&pf_mutex);
366 return -EINVAL; 368 return -EINVAL;
367 } 369 }
368 370
@@ -371,14 +373,14 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
371 if (!pf->access && pf->removable) 373 if (!pf->access && pf->removable)
372 pf_lock(pf, 0); 374 pf_lock(pf, 0);
373 375
374 unlock_kernel(); 376 mutex_unlock(&pf_mutex);
375 return 0; 377 return 0;
376 378
377} 379}
378 380
379static int pf_check_media(struct gendisk *disk) 381static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
380{ 382{
381 return 1; 383 return DISK_EVENT_MEDIA_CHANGE;
382} 384}
383 385
384static inline int status_reg(struct pf_unit *pf) 386static inline int status_reg(struct pf_unit *pf)
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3ddba9b..6b9a2000d56a 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -162,7 +162,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
162#include <linux/pg.h> 162#include <linux/pg.h>
163#include <linux/device.h> 163#include <linux/device.h>
164#include <linux/sched.h> /* current, TASK_* */ 164#include <linux/sched.h> /* current, TASK_* */
165#include <linux/smp_lock.h> 165#include <linux/mutex.h>
166#include <linux/jiffies.h> 166#include <linux/jiffies.h>
167 167
168#include <asm/uaccess.h> 168#include <asm/uaccess.h>
@@ -193,6 +193,7 @@ module_param_array(drive3, int, NULL, 0);
193 193
194#define ATAPI_IDENTIFY 0x12 194#define ATAPI_IDENTIFY 0x12
195 195
196static DEFINE_MUTEX(pg_mutex);
196static int pg_open(struct inode *inode, struct file *file); 197static int pg_open(struct inode *inode, struct file *file);
197static int pg_release(struct inode *inode, struct file *file); 198static int pg_release(struct inode *inode, struct file *file);
198static ssize_t pg_read(struct file *filp, char __user *buf, 199static ssize_t pg_read(struct file *filp, char __user *buf,
@@ -234,6 +235,7 @@ static const struct file_operations pg_fops = {
234 .write = pg_write, 235 .write = pg_write,
235 .open = pg_open, 236 .open = pg_open,
236 .release = pg_release, 237 .release = pg_release,
238 .llseek = noop_llseek,
237}; 239};
238 240
239static void pg_init_units(void) 241static void pg_init_units(void)
@@ -518,7 +520,7 @@ static int pg_open(struct inode *inode, struct file *file)
518 struct pg *dev = &devices[unit]; 520 struct pg *dev = &devices[unit];
519 int ret = 0; 521 int ret = 0;
520 522
521 lock_kernel(); 523 mutex_lock(&pg_mutex);
522 if ((unit >= PG_UNITS) || (!dev->present)) { 524 if ((unit >= PG_UNITS) || (!dev->present)) {
523 ret = -ENODEV; 525 ret = -ENODEV;
524 goto out; 526 goto out;
@@ -547,7 +549,7 @@ static int pg_open(struct inode *inode, struct file *file)
547 file->private_data = dev; 549 file->private_data = dev;
548 550
549out: 551out:
550 unlock_kernel(); 552 mutex_unlock(&pg_mutex);
551 return ret; 553 return ret;
552} 554}
553 555
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825fdeaab..7179f79d7468 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -146,7 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
146#include <linux/mtio.h> 146#include <linux/mtio.h>
147#include <linux/device.h> 147#include <linux/device.h>
148#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */ 148#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */
149#include <linux/smp_lock.h> 149#include <linux/mutex.h>
150 150
151#include <asm/uaccess.h> 151#include <asm/uaccess.h>
152 152
@@ -189,6 +189,7 @@ module_param_array(drive3, int, NULL, 0);
189#define ATAPI_MODE_SENSE 0x1a 189#define ATAPI_MODE_SENSE 0x1a
190#define ATAPI_LOG_SENSE 0x4d 190#define ATAPI_LOG_SENSE 0x4d
191 191
192static DEFINE_MUTEX(pt_mutex);
192static int pt_open(struct inode *inode, struct file *file); 193static int pt_open(struct inode *inode, struct file *file);
193static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 194static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
194static int pt_release(struct inode *inode, struct file *file); 195static int pt_release(struct inode *inode, struct file *file);
@@ -239,6 +240,7 @@ static const struct file_operations pt_fops = {
239 .unlocked_ioctl = pt_ioctl, 240 .unlocked_ioctl = pt_ioctl,
240 .open = pt_open, 241 .open = pt_open,
241 .release = pt_release, 242 .release = pt_release,
243 .llseek = noop_llseek,
242}; 244};
243 245
244/* sysfs class support */ 246/* sysfs class support */
@@ -650,9 +652,9 @@ static int pt_open(struct inode *inode, struct file *file)
650 struct pt_unit *tape = pt + unit; 652 struct pt_unit *tape = pt + unit;
651 int err; 653 int err;
652 654
653 lock_kernel(); 655 mutex_lock(&pt_mutex);
654 if (unit >= PT_UNITS || (!tape->present)) { 656 if (unit >= PT_UNITS || (!tape->present)) {
655 unlock_kernel(); 657 mutex_unlock(&pt_mutex);
656 return -ENODEV; 658 return -ENODEV;
657 } 659 }
658 660
@@ -681,12 +683,12 @@ static int pt_open(struct inode *inode, struct file *file)
681 } 683 }
682 684
683 file->private_data = tape; 685 file->private_data = tape;
684 unlock_kernel(); 686 mutex_unlock(&pt_mutex);
685 return 0; 687 return 0;
686 688
687out: 689out:
688 atomic_inc(&tape->available); 690 atomic_inc(&tape->available);
689 unlock_kernel(); 691 mutex_unlock(&pt_mutex);
690 return err; 692 return err;
691} 693}
692 694
@@ -704,15 +706,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
704 switch (mtop.mt_op) { 706 switch (mtop.mt_op) {
705 707
706 case MTREW: 708 case MTREW:
707 lock_kernel(); 709 mutex_lock(&pt_mutex);
708 pt_rewind(tape); 710 pt_rewind(tape);
709 unlock_kernel(); 711 mutex_unlock(&pt_mutex);
710 return 0; 712 return 0;
711 713
712 case MTWEOF: 714 case MTWEOF:
713 lock_kernel(); 715 mutex_lock(&pt_mutex);
714 pt_write_fm(tape); 716 pt_write_fm(tape);
715 unlock_kernel(); 717 mutex_unlock(&pt_mutex);
716 return 0; 718 return 0;
717 719
718 default: 720 default:
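
pg.c and pt.c additionally gain an explicit .llseek in their file_operations. Leaving the field NULL falls back to default_llseek(), so character drivers that do not really support seeking now state their behaviour explicitly: noop_llseek() accepts the seek without moving the file position, while no_llseek() rejects it with -ESPIPE. A sketch of such an ops table (hypothetical handlers):

static const struct file_operations mydrv_fops = {
	.owner          = THIS_MODULE,
	.open           = mydrv_open,
	.release        = mydrv_release,
	.unlocked_ioctl = mydrv_ioctl,
	.llseek         = noop_llseek,	/* seeks succeed but the position never moves */
};
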
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 37a2bb595076..07a382eaf0a8 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -57,7 +57,6 @@
57#include <linux/seq_file.h> 57#include <linux/seq_file.h>
58#include <linux/miscdevice.h> 58#include <linux/miscdevice.h>
59#include <linux/freezer.h> 59#include <linux/freezer.h>
60#include <linux/smp_lock.h>
61#include <linux/mutex.h> 60#include <linux/mutex.h>
62#include <linux/slab.h> 61#include <linux/slab.h>
63#include <scsi/scsi_cmnd.h> 62#include <scsi/scsi_cmnd.h>
@@ -86,6 +85,7 @@
86 85
87#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) 86#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
88 87
88static DEFINE_MUTEX(pktcdvd_mutex);
89static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; 89static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
90static struct proc_dir_entry *pkt_proc; 90static struct proc_dir_entry *pkt_proc;
91static int pktdev_major; 91static int pktdev_major;
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
753 753
754 rq->timeout = 60*HZ; 754 rq->timeout = 60*HZ;
755 rq->cmd_type = REQ_TYPE_BLOCK_PC; 755 rq->cmd_type = REQ_TYPE_BLOCK_PC;
756 rq->cmd_flags |= REQ_HARDBARRIER;
757 if (cgc->quiet) 756 if (cgc->quiet)
758 rq->cmd_flags |= REQ_QUIET; 757 rq->cmd_flags |= REQ_QUIET;
759 758
@@ -1607,8 +1606,6 @@ static int kcdrwd(void *foobar)
1607 min_sleep_time = pkt->sleep_time; 1606 min_sleep_time = pkt->sleep_time;
1608 } 1607 }
1609 1608
1610 generic_unplug_device(bdev_get_queue(pd->bdev));
1611
1612 VPRINTK("kcdrwd: sleeping\n"); 1609 VPRINTK("kcdrwd: sleeping\n");
1613 residue = schedule_timeout(min_sleep_time); 1610 residue = schedule_timeout(min_sleep_time);
1614 VPRINTK("kcdrwd: wake up\n"); 1611 VPRINTK("kcdrwd: wake up\n");
@@ -2297,15 +2294,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2297 * so bdget() can't fail. 2294 * so bdget() can't fail.
2298 */ 2295 */
2299 bdget(pd->bdev->bd_dev); 2296 bdget(pd->bdev->bd_dev);
2300 if ((ret = blkdev_get(pd->bdev, FMODE_READ))) 2297 if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
2301 goto out; 2298 goto out;
2302 2299
2303 if ((ret = bd_claim(pd->bdev, pd)))
2304 goto out_putdev;
2305
2306 if ((ret = pkt_get_last_written(pd, &lba))) { 2300 if ((ret = pkt_get_last_written(pd, &lba))) {
2307 printk(DRIVER_NAME": pkt_get_last_written failed\n"); 2301 printk(DRIVER_NAME": pkt_get_last_written failed\n");
2308 goto out_unclaim; 2302 goto out_putdev;
2309 } 2303 }
2310 2304
2311 set_capacity(pd->disk, lba << 2); 2305 set_capacity(pd->disk, lba << 2);
@@ -2315,7 +2309,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2315 q = bdev_get_queue(pd->bdev); 2309 q = bdev_get_queue(pd->bdev);
2316 if (write) { 2310 if (write) {
2317 if ((ret = pkt_open_write(pd))) 2311 if ((ret = pkt_open_write(pd)))
2318 goto out_unclaim; 2312 goto out_putdev;
2319 /* 2313 /*
2320 * Some CDRW drives can not handle writes larger than one packet, 2314 * Some CDRW drives can not handle writes larger than one packet,
2321 * even if the size is a multiple of the packet size. 2315 * even if the size is a multiple of the packet size.
@@ -2330,23 +2324,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2330 } 2324 }
2331 2325
2332 if ((ret = pkt_set_segment_merging(pd, q))) 2326 if ((ret = pkt_set_segment_merging(pd, q)))
2333 goto out_unclaim; 2327 goto out_putdev;
2334 2328
2335 if (write) { 2329 if (write) {
2336 if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { 2330 if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
2337 printk(DRIVER_NAME": not enough memory for buffers\n"); 2331 printk(DRIVER_NAME": not enough memory for buffers\n");
2338 ret = -ENOMEM; 2332 ret = -ENOMEM;
2339 goto out_unclaim; 2333 goto out_putdev;
2340 } 2334 }
2341 printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); 2335 printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
2342 } 2336 }
2343 2337
2344 return 0; 2338 return 0;
2345 2339
2346out_unclaim:
2347 bd_release(pd->bdev);
2348out_putdev: 2340out_putdev:
2349 blkdev_put(pd->bdev, FMODE_READ); 2341 blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
2350out: 2342out:
2351 return ret; 2343 return ret;
2352} 2344}
@@ -2363,8 +2355,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
2363 pkt_lock_door(pd, 0); 2355 pkt_lock_door(pd, 0);
2364 2356
2365 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); 2357 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
2366 bd_release(pd->bdev); 2358 blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
2367 blkdev_put(pd->bdev, FMODE_READ);
2368 2359
2369 pkt_shrink_pktlist(pd); 2360 pkt_shrink_pktlist(pd);
2370} 2361}
@@ -2383,7 +2374,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2383 2374
2384 VPRINTK(DRIVER_NAME": entering open\n"); 2375 VPRINTK(DRIVER_NAME": entering open\n");
2385 2376
2386 lock_kernel(); 2377 mutex_lock(&pktcdvd_mutex);
2387 mutex_lock(&ctl_mutex); 2378 mutex_lock(&ctl_mutex);
2388 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); 2379 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
2389 if (!pd) { 2380 if (!pd) {
@@ -2411,7 +2402,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2411 } 2402 }
2412 2403
2413 mutex_unlock(&ctl_mutex); 2404 mutex_unlock(&ctl_mutex);
2414 unlock_kernel(); 2405 mutex_unlock(&pktcdvd_mutex);
2415 return 0; 2406 return 0;
2416 2407
2417out_dec: 2408out_dec:
@@ -2419,7 +2410,7 @@ out_dec:
2419out: 2410out:
2420 VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); 2411 VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
2421 mutex_unlock(&ctl_mutex); 2412 mutex_unlock(&ctl_mutex);
2422 unlock_kernel(); 2413 mutex_unlock(&pktcdvd_mutex);
2423 return ret; 2414 return ret;
2424} 2415}
2425 2416
@@ -2428,7 +2419,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
2428 struct pktcdvd_device *pd = disk->private_data; 2419 struct pktcdvd_device *pd = disk->private_data;
2429 int ret = 0; 2420 int ret = 0;
2430 2421
2431 lock_kernel(); 2422 mutex_lock(&pktcdvd_mutex);
2432 mutex_lock(&ctl_mutex); 2423 mutex_lock(&ctl_mutex);
2433 pd->refcnt--; 2424 pd->refcnt--;
2434 BUG_ON(pd->refcnt < 0); 2425 BUG_ON(pd->refcnt < 0);
@@ -2437,7 +2428,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
2437 pkt_release_dev(pd, flush); 2428 pkt_release_dev(pd, flush);
2438 } 2429 }
2439 mutex_unlock(&ctl_mutex); 2430 mutex_unlock(&ctl_mutex);
2440 unlock_kernel(); 2431 mutex_unlock(&pktcdvd_mutex);
2441 return ret; 2432 return ret;
2442} 2433}
2443 2434
@@ -2734,7 +2725,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
2734 bdev = bdget(dev); 2725 bdev = bdget(dev);
2735 if (!bdev) 2726 if (!bdev)
2736 return -ENOMEM; 2727 return -ENOMEM;
2737 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY); 2728 ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2738 if (ret) 2729 if (ret)
2739 return ret; 2730 return ret;
2740 2731
@@ -2773,7 +2764,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2773 VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, 2764 VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
2774 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); 2765 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2775 2766
2776 lock_kernel(); 2767 mutex_lock(&pktcdvd_mutex);
2777 switch (cmd) { 2768 switch (cmd) {
2778 case CDROMEJECT: 2769 case CDROMEJECT:
2779 /* 2770 /*
@@ -2798,12 +2789,13 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2798 VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); 2789 VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
2799 ret = -ENOTTY; 2790 ret = -ENOTTY;
2800 } 2791 }
2801 unlock_kernel(); 2792 mutex_unlock(&pktcdvd_mutex);
2802 2793
2803 return ret; 2794 return ret;
2804} 2795}
2805 2796
2806static int pkt_media_changed(struct gendisk *disk) 2797static unsigned int pkt_check_events(struct gendisk *disk,
2798 unsigned int clearing)
2807{ 2799{
2808 struct pktcdvd_device *pd = disk->private_data; 2800 struct pktcdvd_device *pd = disk->private_data;
2809 struct gendisk *attached_disk; 2801 struct gendisk *attached_disk;
@@ -2813,9 +2805,9 @@ static int pkt_media_changed(struct gendisk *disk)
2813 if (!pd->bdev) 2805 if (!pd->bdev)
2814 return 0; 2806 return 0;
2815 attached_disk = pd->bdev->bd_disk; 2807 attached_disk = pd->bdev->bd_disk;
2816 if (!attached_disk) 2808 if (!attached_disk || !attached_disk->fops->check_events)
2817 return 0; 2809 return 0;
2818 return attached_disk->fops->media_changed(attached_disk); 2810 return attached_disk->fops->check_events(attached_disk, clearing);
2819} 2811}
2820 2812
2821static const struct block_device_operations pktcdvd_ops = { 2813static const struct block_device_operations pktcdvd_ops = {
@@ -2823,7 +2815,7 @@ static const struct block_device_operations pktcdvd_ops = {
2823 .open = pkt_open, 2815 .open = pkt_open,
2824 .release = pkt_close, 2816 .release = pkt_close,
2825 .ioctl = pkt_ioctl, 2817 .ioctl = pkt_ioctl,
2826 .media_changed = pkt_media_changed, 2818 .check_events = pkt_check_events,
2827}; 2819};
2828 2820
2829static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode) 2821static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
@@ -2896,6 +2888,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2896 if (ret) 2888 if (ret)
2897 goto out_new_dev; 2889 goto out_new_dev;
2898 2890
2891 /* inherit events of the host device */
2892 disk->events = pd->bdev->bd_disk->events;
2893 disk->async_events = pd->bdev->bd_disk->async_events;
2894
2899 add_disk(disk); 2895 add_disk(disk);
2900 2896
2901 pkt_sysfs_dev_new(pd); 2897 pkt_sysfs_dev_new(pd);
@@ -3046,6 +3042,7 @@ static const struct file_operations pkt_ctl_fops = {
3046 .compat_ioctl = pkt_ctl_compat_ioctl, 3042 .compat_ioctl = pkt_ctl_compat_ioctl,
3047#endif 3043#endif
3048 .owner = THIS_MODULE, 3044 .owner = THIS_MODULE,
3045 .llseek = no_llseek,
3049}; 3046};
3050 3047
3051static struct miscdevice pkt_misc = { 3048static struct miscdevice pkt_misc = {
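
In pktcdvd.c the separate bd_claim()/bd_release() calls are folded into the open and close paths: passing FMODE_EXCL together with a holder pointer to blkdev_get() takes the exclusive claim, and blkdev_put() with the same mode drops it again. Roughly, assuming a block_device reference and a holder cookie:

	/* open and claim exclusively */
	ret = blkdev_get(bdev, FMODE_READ | FMODE_EXCL, holder);
	if (ret)
		return ret;

	/* ... use the device ... */

	/* drop the exclusive claim and the reference together */
	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
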
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 03688c2da319..8e1ce2e2916a 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
468 blk_queue_dma_alignment(queue, dev->blk_size-1); 468 blk_queue_dma_alignment(queue, dev->blk_size-1);
469 blk_queue_logical_block_size(queue, dev->blk_size); 469 blk_queue_logical_block_size(queue, dev->blk_size);
470 470
471 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH); 471 blk_queue_flush(queue, REQ_FLUSH);
472 472
473 blk_queue_max_segments(queue, -1); 473 blk_queue_max_segments(queue, -1);
474 blk_queue_max_segment_size(queue, dev->bounce_size); 474 blk_queue_max_segment_size(queue, dev->bounce_size);
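
ps3disk moves from the barrier-era blk_queue_ordered() to the flush interface: the driver only declares what the hardware supports, and the block layer builds the FLUSH/FUA sequencing itself. A sketch, assuming a device with a volatile write cache:

	blk_queue_flush(queue, REQ_FLUSH);		/* device needs explicit cache flushes */
	/* or, if the device also honours forced-unit-access writes: */
	blk_queue_flush(queue, REQ_FLUSH | REQ_FUA);
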
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..1278098624e6
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,2499 @@
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24 For usage instructions, please refer to:
25
26 Documentation/ABI/testing/sysfs-bus-rbd
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
34#include <linux/parser.h>
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
44#define DRV_NAME "rbd"
45#define DRV_NAME_LONG "rbd (rados block device)"
46
47#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48
49#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
50#define RBD_MAX_POOL_NAME_LEN 64
51#define RBD_MAX_SNAP_NAME_LEN 32
52#define RBD_MAX_OPT_LEN 1024
53
54#define RBD_SNAP_HEAD_NAME "-"
55
56#define DEV_NAME_LEN 32
57
58#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
60/*
61 * block device image metadata (in-memory version)
62 */
63struct rbd_image_header {
64 u64 image_size;
65 char block_name[32];
66 __u8 obj_order;
67 __u8 crypt_type;
68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc;
71 size_t snap_names_len;
72 u64 snap_seq;
73 u32 total_snaps;
74
75 char *snap_names;
76 u64 *snap_sizes;
77
78 u64 obj_version;
79};
80
81struct rbd_options {
82 int notify_timeout;
83};
84
85/*
86 * an instance of the client. multiple devices may share a client.
87 */
88struct rbd_client {
89 struct ceph_client *client;
90 struct rbd_options *rbd_opts;
91 struct kref kref;
92 struct list_head node;
93};
94
95struct rbd_req_coll;
96
97/*
98 * a single io request
99 */
100struct rbd_request {
101 struct request *rq; /* blk layer request */
102 struct bio *bio; /* cloned bio */
103 struct page **pages; /* list of used pages */
104 u64 len;
105 int coll_index;
106 struct rbd_req_coll *coll;
107};
108
109struct rbd_req_status {
110 int done;
111 int rc;
112 u64 bytes;
113};
114
115/*
116 * a collection of requests
117 */
118struct rbd_req_coll {
119 int total;
120 int num_done;
121 struct kref kref;
122 struct rbd_req_status status[0];
123};
124
125struct rbd_snap {
126 struct device dev;
127 const char *name;
128 size_t size;
129 struct list_head node;
130 u64 id;
131};
132
133/*
134 * a single device
135 */
136struct rbd_device {
137 int id; /* blkdev unique id */
138
139 int major; /* blkdev assigned major */
140 struct gendisk *disk; /* blkdev's gendisk and rq */
141 struct request_queue *q;
142
143 struct ceph_client *client;
144 struct rbd_client *rbd_client;
145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147
148 spinlock_t lock; /* queue lock */
149
150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid;
156
157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request;
159
160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */
163 int read_only;
164
165 struct list_head node;
166
167 /* list of snapshots */
168 struct list_head snaps;
169
170 /* sysfs related */
171 struct device dev;
172};
173
174static struct bus_type rbd_bus_type = {
175 .name = "rbd",
176};
177
178static spinlock_t node_lock; /* protects client get/put */
179
180static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
181static LIST_HEAD(rbd_dev_list); /* devices */
182static LIST_HEAD(rbd_client_list); /* clients */
183
184static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185static void rbd_dev_release(struct device *dev);
186static ssize_t rbd_snap_rollback(struct device *dev,
187 struct device_attribute *attr,
188 const char *buf,
189 size_t size);
190static ssize_t rbd_snap_add(struct device *dev,
191 struct device_attribute *attr,
192 const char *buf,
193 size_t count);
194static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
195				  struct rbd_snap *snap);
196
197
198static struct rbd_device *dev_to_rbd(struct device *dev)
199{
200 return container_of(dev, struct rbd_device, dev);
201}
202
203static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
204{
205 return get_device(&rbd_dev->dev);
206}
207
208static void rbd_put_dev(struct rbd_device *rbd_dev)
209{
210 put_device(&rbd_dev->dev);
211}
212
213static int __rbd_update_snaps(struct rbd_device *rbd_dev);
214
215static int rbd_open(struct block_device *bdev, fmode_t mode)
216{
217 struct gendisk *disk = bdev->bd_disk;
218 struct rbd_device *rbd_dev = disk->private_data;
219
220 rbd_get_dev(rbd_dev);
221
222 set_device_ro(bdev, rbd_dev->read_only);
223
224 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
225 return -EROFS;
226
227 return 0;
228}
229
230static int rbd_release(struct gendisk *disk, fmode_t mode)
231{
232 struct rbd_device *rbd_dev = disk->private_data;
233
234 rbd_put_dev(rbd_dev);
235
236 return 0;
237}
238
239static const struct block_device_operations rbd_bd_ops = {
240 .owner = THIS_MODULE,
241 .open = rbd_open,
242 .release = rbd_release,
243};
244
245/*
246 * Initialize an rbd client instance.
247 * We own *opt.
248 */
249static struct rbd_client *rbd_client_create(struct ceph_options *opt,
250 struct rbd_options *rbd_opts)
251{
252 struct rbd_client *rbdc;
253 int ret = -ENOMEM;
254
255 dout("rbd_client_create\n");
256 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
257 if (!rbdc)
258 goto out_opt;
259
260 kref_init(&rbdc->kref);
261 INIT_LIST_HEAD(&rbdc->node);
262
263 rbdc->client = ceph_create_client(opt, rbdc);
264 if (IS_ERR(rbdc->client))
265 goto out_rbdc;
266 opt = NULL; /* Now rbdc->client is responsible for opt */
267
268 ret = ceph_open_session(rbdc->client);
269 if (ret < 0)
270 goto out_err;
271
272 rbdc->rbd_opts = rbd_opts;
273
274 spin_lock(&node_lock);
275 list_add_tail(&rbdc->node, &rbd_client_list);
276 spin_unlock(&node_lock);
277
278 dout("rbd_client_create created %p\n", rbdc);
279 return rbdc;
280
281out_err:
282 ceph_destroy_client(rbdc->client);
283out_rbdc:
284 kfree(rbdc);
285out_opt:
286 if (opt)
287 ceph_destroy_options(opt);
288 return ERR_PTR(ret);
289}
290
291/*
292 * Find a ceph client with specific addr and configuration.
293 */
294static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
295{
296 struct rbd_client *client_node;
297
298 if (opt->flags & CEPH_OPT_NOSHARE)
299 return NULL;
300
301 list_for_each_entry(client_node, &rbd_client_list, node)
302 if (ceph_compare_options(opt, client_node->client) == 0)
303 return client_node;
304 return NULL;
305}
306
307/*
308 * mount options
309 */
310enum {
311 Opt_notify_timeout,
312 Opt_last_int,
313 /* int args above */
314 Opt_last_string,
315 /* string args above */
316};
317
318static match_table_t rbdopt_tokens = {
319 {Opt_notify_timeout, "notify_timeout=%d"},
320 /* int args above */
321 /* string args above */
322 {-1, NULL}
323};
324
325static int parse_rbd_opts_token(char *c, void *private)
326{
327 struct rbd_options *rbdopt = private;
328 substring_t argstr[MAX_OPT_ARGS];
329 int token, intval, ret;
330
331 token = match_token((char *)c, rbdopt_tokens, argstr);
332 if (token < 0)
333 return -EINVAL;
334
335 if (token < Opt_last_int) {
336 ret = match_int(&argstr[0], &intval);
337 if (ret < 0) {
338 pr_err("bad mount option arg (not int) "
339 "at '%s'\n", c);
340 return ret;
341 }
342 dout("got int token %d val %d\n", token, intval);
343 } else if (token > Opt_last_int && token < Opt_last_string) {
344 dout("got string token %d val %s\n", token,
345 argstr[0].from);
346 } else {
347 dout("got token %d\n", token);
348 }
349
350 switch (token) {
351 case Opt_notify_timeout:
352 rbdopt->notify_timeout = intval;
353 break;
354 default:
355 BUG_ON(token);
356 }
357 return 0;
358}
359
360/*
361 * Get a ceph client with specific addr and configuration, if one does
362 * not exist create it.
363 */
364static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
365 char *options)
366{
367 struct rbd_client *rbdc;
368 struct ceph_options *opt;
369 int ret;
370 struct rbd_options *rbd_opts;
371
372 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
373 if (!rbd_opts)
374 return -ENOMEM;
375
376 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
377
378 ret = ceph_parse_options(&opt, options, mon_addr,
379 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
380 if (ret < 0)
381 goto done_err;
382
383 spin_lock(&node_lock);
384 rbdc = __rbd_client_find(opt);
385 if (rbdc) {
386 ceph_destroy_options(opt);
387
388 /* using an existing client */
389 kref_get(&rbdc->kref);
390 rbd_dev->rbd_client = rbdc;
391 rbd_dev->client = rbdc->client;
392 spin_unlock(&node_lock);
393 return 0;
394 }
395 spin_unlock(&node_lock);
396
397 rbdc = rbd_client_create(opt, rbd_opts);
398 if (IS_ERR(rbdc)) {
399 ret = PTR_ERR(rbdc);
400 goto done_err;
401 }
402
403 rbd_dev->rbd_client = rbdc;
404 rbd_dev->client = rbdc->client;
405 return 0;
406done_err:
407 kfree(rbd_opts);
408 return ret;
409}
410
411/*
412 * Destroy ceph client
413 */
414static void rbd_client_release(struct kref *kref)
415{
416 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
417
418 dout("rbd_release_client %p\n", rbdc);
419 spin_lock(&node_lock);
420 list_del(&rbdc->node);
421 spin_unlock(&node_lock);
422
423 ceph_destroy_client(rbdc->client);
424 kfree(rbdc->rbd_opts);
425 kfree(rbdc);
426}
427
428/*
429 * Drop reference to ceph client node. If it's not referenced anymore, release
430 * it.
431 */
432static void rbd_put_client(struct rbd_device *rbd_dev)
433{
434 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
435 rbd_dev->rbd_client = NULL;
436 rbd_dev->client = NULL;
437}
438
439/*
440 * Destroy requests collection
441 */
442static void rbd_coll_release(struct kref *kref)
443{
444 struct rbd_req_coll *coll =
445 container_of(kref, struct rbd_req_coll, kref);
446
447 dout("rbd_coll_release %p\n", coll);
448 kfree(coll);
449}
450
451/*
452 * Create a new header structure, translate header format from the on-disk
453 * header.
454 */
455static int rbd_header_from_disk(struct rbd_image_header *header,
456 struct rbd_image_header_ondisk *ondisk,
457 int allocated_snaps,
458 gfp_t gfp_flags)
459{
460 int i;
461 u32 snap_count = le32_to_cpu(ondisk->snap_count);
462 int ret = -ENOMEM;
463
464 init_rwsem(&header->snap_rwsem);
465 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
466 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
467 snap_count *
468 sizeof(struct rbd_image_snap_ondisk),
469 gfp_flags);
470 if (!header->snapc)
471 return -ENOMEM;
472 if (snap_count) {
473 header->snap_names = kmalloc(header->snap_names_len,
474 GFP_KERNEL);
475 if (!header->snap_names)
476 goto err_snapc;
477 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
478 GFP_KERNEL);
479 if (!header->snap_sizes)
480 goto err_names;
481 } else {
482 header->snap_names = NULL;
483 header->snap_sizes = NULL;
484 }
485 memcpy(header->block_name, ondisk->block_name,
486 sizeof(ondisk->block_name));
487
488 header->image_size = le64_to_cpu(ondisk->image_size);
489 header->obj_order = ondisk->options.order;
490 header->crypt_type = ondisk->options.crypt_type;
491 header->comp_type = ondisk->options.comp_type;
492
493 atomic_set(&header->snapc->nref, 1);
494 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
495 header->snapc->num_snaps = snap_count;
496 header->total_snaps = snap_count;
497
498 if (snap_count &&
499 allocated_snaps == snap_count) {
500 for (i = 0; i < snap_count; i++) {
501 header->snapc->snaps[i] =
502 le64_to_cpu(ondisk->snaps[i].id);
503 header->snap_sizes[i] =
504 le64_to_cpu(ondisk->snaps[i].image_size);
505 }
506
507 /* copy snapshot names */
508 memcpy(header->snap_names, &ondisk->snaps[i],
509 header->snap_names_len);
510 }
511
512 return 0;
513
514err_names:
515 kfree(header->snap_names);
516err_snapc:
517 kfree(header->snapc);
518 return ret;
519}
520
521static int snap_index(struct rbd_image_header *header, int snap_num)
522{
523 return header->total_snaps - snap_num;
524}
525
526static u64 cur_snap_id(struct rbd_device *rbd_dev)
527{
528 struct rbd_image_header *header = &rbd_dev->header;
529
530 if (!rbd_dev->cur_snap)
531 return 0;
532
533 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
534}
535
536static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
537 u64 *seq, u64 *size)
538{
539 int i;
540 char *p = header->snap_names;
541
542 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
543 if (strcmp(snap_name, p) == 0)
544 break;
545 }
546 if (i == header->total_snaps)
547 return -ENOENT;
548 if (seq)
549 *seq = header->snapc->snaps[i];
550
551 if (size)
552 *size = header->snap_sizes[i];
553
554 return i;
555}
556
557static int rbd_header_set_snap(struct rbd_device *dev,
558 const char *snap_name,
559 u64 *size)
560{
561 struct rbd_image_header *header = &dev->header;
562 struct ceph_snap_context *snapc = header->snapc;
563 int ret = -ENOENT;
564
565 down_write(&header->snap_rwsem);
566
567 if (!snap_name ||
568 !*snap_name ||
569 strcmp(snap_name, "-") == 0 ||
570 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
571 if (header->total_snaps)
572 snapc->seq = header->snap_seq;
573 else
574 snapc->seq = 0;
575 dev->cur_snap = 0;
576 dev->read_only = 0;
577 if (size)
578 *size = header->image_size;
579 } else {
580 ret = snap_by_name(header, snap_name, &snapc->seq, size);
581 if (ret < 0)
582 goto done;
583
584 dev->cur_snap = header->total_snaps - ret;
585 dev->read_only = 1;
586 }
587
588 ret = 0;
589done:
590 up_write(&header->snap_rwsem);
591 return ret;
592}
593
594static void rbd_header_free(struct rbd_image_header *header)
595{
596 kfree(header->snapc);
597 kfree(header->snap_names);
598 kfree(header->snap_sizes);
599}
600
601/*
602 * get the actual striped segment name, offset and length
603 */
604static u64 rbd_get_segment(struct rbd_image_header *header,
605 const char *block_name,
606 u64 ofs, u64 len,
607 char *seg_name, u64 *segofs)
608{
609 u64 seg = ofs >> header->obj_order;
610
611 if (seg_name)
612 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
613 "%s.%012llx", block_name, seg);
614
615 ofs = ofs & ((1 << header->obj_order) - 1);
616 len = min_t(u64, len, (1 << header->obj_order) - ofs);
617
618 if (segofs)
619 *segofs = ofs;
620
621 return len;
622}
623
624static int rbd_get_num_segments(struct rbd_image_header *header,
625 u64 ofs, u64 len)
626{
627 u64 start_seg = ofs >> header->obj_order;
628 u64 end_seg = (ofs + len - 1) >> header->obj_order;
629 return end_seg - start_seg + 1;
630}
631
632/*
633 * bio helpers
634 */
635
636static void bio_chain_put(struct bio *chain)
637{
638 struct bio *tmp;
639
640 while (chain) {
641 tmp = chain;
642 chain = chain->bi_next;
643 bio_put(tmp);
644 }
645}
646
647/*
648 * zeros a bio chain, starting at specific offset
649 */
650static void zero_bio_chain(struct bio *chain, int start_ofs)
651{
652 struct bio_vec *bv;
653 unsigned long flags;
654 void *buf;
655 int i;
656 int pos = 0;
657
658 while (chain) {
659 bio_for_each_segment(bv, chain, i) {
660 if (pos + bv->bv_len > start_ofs) {
661 int remainder = max(start_ofs - pos, 0);
662 buf = bvec_kmap_irq(bv, &flags);
663 memset(buf + remainder, 0,
664 bv->bv_len - remainder);
665 bvec_kunmap_irq(buf, &flags);
666 }
667 pos += bv->bv_len;
668 }
669
670 chain = chain->bi_next;
671 }
672}
673
674/*
675 * bio_chain_clone - clone a chain of bios up to a certain length.
676 * might return a bio_pair that will need to be released.
677 */
678static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
679 struct bio_pair **bp,
680 int len, gfp_t gfpmask)
681{
682 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
683 int total = 0;
684
685 if (*bp) {
686 bio_pair_release(*bp);
687 *bp = NULL;
688 }
689
690 while (old_chain && (total < len)) {
691 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
692 if (!tmp)
693 goto err_out;
694
695 if (total + old_chain->bi_size > len) {
696 struct bio_pair *bp;
697
698 /*
699 * this split can only happen with a single paged bio,
700 * split_bio will BUG_ON if this is not the case
701 */
702 dout("bio_chain_clone split! total=%d remaining=%d"
703 "bi_size=%d\n",
704 (int)total, (int)len-total,
705 (int)old_chain->bi_size);
706
707 /* split the bio. We'll release it either in the next
708 call, or it will have to be released outside */
709 bp = bio_split(old_chain, (len - total) / 512ULL);
710 if (!bp)
711 goto err_out;
712
713 __bio_clone(tmp, &bp->bio1);
714
715 *next = &bp->bio2;
716 } else {
717 __bio_clone(tmp, old_chain);
718 *next = old_chain->bi_next;
719 }
720
721 tmp->bi_bdev = NULL;
722 gfpmask &= ~__GFP_WAIT;
723 tmp->bi_next = NULL;
724
725 if (!new_chain) {
726 new_chain = tail = tmp;
727 } else {
728 tail->bi_next = tmp;
729 tail = tmp;
730 }
731 old_chain = old_chain->bi_next;
732
733 total += tmp->bi_size;
734 }
735
736 BUG_ON(total < len);
737
738 if (tail)
739 tail->bi_next = NULL;
740
741 *old = old_chain;
742
743 return new_chain;
744
745err_out:
746 dout("bio_chain_clone with err\n");
747 bio_chain_put(new_chain);
748 return NULL;
749}
750
751/*
752 * helpers for osd request op vectors.
753 */
754static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
755 int num_ops,
756 int opcode,
757 u32 payload_len)
758{
759 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
760 GFP_NOIO);
761 if (!*ops)
762 return -ENOMEM;
763 (*ops)[0].op = opcode;
764 /*
765 * op extent offset and length will be set later on
766 * in calc_raw_layout()
767 */
768 (*ops)[0].payload_len = payload_len;
769 return 0;
770}
771
772static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
773{
774 kfree(ops);
775}
776
777static void rbd_coll_end_req_index(struct request *rq,
778 struct rbd_req_coll *coll,
779 int index,
780 int ret, u64 len)
781{
782 struct request_queue *q;
783 int min, max, i;
784
785 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
786 coll, index, ret, len);
787
788 if (!rq)
789 return;
790
791 if (!coll) {
792 blk_end_request(rq, ret, len);
793 return;
794 }
795
796 q = rq->q;
797
798 spin_lock_irq(q->queue_lock);
799 coll->status[index].done = 1;
800 coll->status[index].rc = ret;
801 coll->status[index].bytes = len;
802 max = min = coll->num_done;
803 while (max < coll->total && coll->status[max].done)
804 max++;
805
806 for (i = min; i<max; i++) {
807 __blk_end_request(rq, coll->status[i].rc,
808 coll->status[i].bytes);
809 coll->num_done++;
810 kref_put(&coll->kref, rbd_coll_release);
811 }
812 spin_unlock_irq(q->queue_lock);
813}
814
815static void rbd_coll_end_req(struct rbd_request *req,
816 int ret, u64 len)
817{
818 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
819}
820
821/*
822 * Send ceph osd request
823 */
824static int rbd_do_request(struct request *rq,
825 struct rbd_device *dev,
826 struct ceph_snap_context *snapc,
827 u64 snapid,
828 const char *obj, u64 ofs, u64 len,
829 struct bio *bio,
830 struct page **pages,
831 int num_pages,
832 int flags,
833 struct ceph_osd_req_op *ops,
834 int num_reply,
835 struct rbd_req_coll *coll,
836 int coll_index,
837 void (*rbd_cb)(struct ceph_osd_request *req,
838 struct ceph_msg *msg),
839 struct ceph_osd_request **linger_req,
840 u64 *ver)
841{
842 struct ceph_osd_request *req;
843 struct ceph_file_layout *layout;
844 int ret;
845 u64 bno;
846 struct timespec mtime = CURRENT_TIME;
847 struct rbd_request *req_data;
848 struct ceph_osd_request_head *reqhead;
849 struct rbd_image_header *header = &dev->header;
850
851 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
852 if (!req_data) {
853 if (coll)
854 rbd_coll_end_req_index(rq, coll, coll_index,
855 -ENOMEM, len);
856 return -ENOMEM;
857 }
858
859 if (coll) {
860 req_data->coll = coll;
861 req_data->coll_index = coll_index;
862 }
863
864 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
865
866 down_read(&header->snap_rwsem);
867
868 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
869 snapc,
870 ops,
871 false,
872 GFP_NOIO, pages, bio);
873 if (!req) {
874 up_read(&header->snap_rwsem);
875 ret = -ENOMEM;
876 goto done_pages;
877 }
878
879 req->r_callback = rbd_cb;
880
881 req_data->rq = rq;
882 req_data->bio = bio;
883 req_data->pages = pages;
884 req_data->len = len;
885
886 req->r_priv = req_data;
887
888 reqhead = req->r_request->front.iov_base;
889 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
890
891 strncpy(req->r_oid, obj, sizeof(req->r_oid));
892 req->r_oid_len = strlen(req->r_oid);
893
894 layout = &req->r_file_layout;
895 memset(layout, 0, sizeof(*layout));
896 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
897 layout->fl_stripe_count = cpu_to_le32(1);
898 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
899 layout->fl_pg_preferred = cpu_to_le32(-1);
900 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
901 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
902 ofs, &len, &bno, req, ops);
903
904 ceph_osdc_build_request(req, ofs, &len,
905 ops,
906 snapc,
907 &mtime,
908 req->r_oid, req->r_oid_len);
909 up_read(&header->snap_rwsem);
910
911 if (linger_req) {
912 ceph_osdc_set_request_linger(&dev->client->osdc, req);
913 *linger_req = req;
914 }
915
916 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
917 if (ret < 0)
918 goto done_err;
919
920 if (!rbd_cb) {
921 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
922 if (ver)
923 *ver = le64_to_cpu(req->r_reassert_version.version);
924 dout("reassert_ver=%lld\n",
925 le64_to_cpu(req->r_reassert_version.version));
926 ceph_osdc_put_request(req);
927 }
928 return ret;
929
930done_err:
931 bio_chain_put(req_data->bio);
932 ceph_osdc_put_request(req);
933done_pages:
934 rbd_coll_end_req(req_data, ret, len);
935 kfree(req_data);
936 return ret;
937}
938
939/*
940 * Ceph osd op callback
941 */
942static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
943{
944 struct rbd_request *req_data = req->r_priv;
945 struct ceph_osd_reply_head *replyhead;
946 struct ceph_osd_op *op;
947 __s32 rc;
948 u64 bytes;
949 int read_op;
950
951 /* parse reply */
952 replyhead = msg->front.iov_base;
953 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
954 op = (void *)(replyhead + 1);
955 rc = le32_to_cpu(replyhead->result);
956 bytes = le64_to_cpu(op->extent.length);
957 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
958
959 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
960
961 if (rc == -ENOENT && read_op) {
962 zero_bio_chain(req_data->bio, 0);
963 rc = 0;
964 } else if (rc == 0 && read_op && bytes < req_data->len) {
965 zero_bio_chain(req_data->bio, bytes);
966 bytes = req_data->len;
967 }
968
969 rbd_coll_end_req(req_data, rc, bytes);
970
971 if (req_data->bio)
972 bio_chain_put(req_data->bio);
973
974 ceph_osdc_put_request(req);
975 kfree(req_data);
976}
977
978static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
979{
980 ceph_osdc_put_request(req);
981}
982
983/*
984 * Do a synchronous ceph osd operation
985 */
986static int rbd_req_sync_op(struct rbd_device *dev,
987 struct ceph_snap_context *snapc,
988 u64 snapid,
989 int opcode,
990 int flags,
991 struct ceph_osd_req_op *orig_ops,
992 int num_reply,
993 const char *obj,
994 u64 ofs, u64 len,
995 char *buf,
996 struct ceph_osd_request **linger_req,
997 u64 *ver)
998{
999 int ret;
1000 struct page **pages;
1001 int num_pages;
1002 struct ceph_osd_req_op *ops = orig_ops;
1003 u32 payload_len;
1004
1005 num_pages = calc_pages_for(ofs , len);
1006 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1007 if (IS_ERR(pages))
1008 return PTR_ERR(pages);
1009
1010 if (!orig_ops) {
1011 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1012 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1013 if (ret < 0)
1014 goto done;
1015
1016 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1017 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1018 if (ret < 0)
1019 goto done_ops;
1020 }
1021 }
1022
1023 ret = rbd_do_request(NULL, dev, snapc, snapid,
1024 obj, ofs, len, NULL,
1025 pages, num_pages,
1026 flags,
1027 ops,
1028 2,
1029 NULL, 0,
1030 NULL,
1031 linger_req, ver);
1032 if (ret < 0)
1033 goto done_ops;
1034
1035 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1036 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1037
1038done_ops:
1039 if (!orig_ops)
1040 rbd_destroy_ops(ops);
1041done:
1042 ceph_release_page_vector(pages, num_pages);
1043 return ret;
1044}
1045
1046/*
1047 * Do an asynchronous ceph osd operation
1048 */
1049static int rbd_do_op(struct request *rq,
1050 struct rbd_device *rbd_dev ,
1051 struct ceph_snap_context *snapc,
1052 u64 snapid,
1053 int opcode, int flags, int num_reply,
1054 u64 ofs, u64 len,
1055 struct bio *bio,
1056 struct rbd_req_coll *coll,
1057 int coll_index)
1058{
1059 char *seg_name;
1060 u64 seg_ofs;
1061 u64 seg_len;
1062 int ret;
1063 struct ceph_osd_req_op *ops;
1064 u32 payload_len;
1065
1066 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1067 if (!seg_name)
1068 return -ENOMEM;
1069
1070 seg_len = rbd_get_segment(&rbd_dev->header,
1071 rbd_dev->header.block_name,
1072 ofs, len,
1073 seg_name, &seg_ofs);
1074
1075 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1076
1077 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1078 if (ret < 0)
1079 goto done;
1080
1081 /* we've taken care of segment sizes earlier when we
1082 cloned the bios. We should never have a segment
1083 truncated at this point */
1084 BUG_ON(seg_len < len);
1085
1086 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1087 seg_name, seg_ofs, seg_len,
1088 bio,
1089 NULL, 0,
1090 flags,
1091 ops,
1092 num_reply,
1093 coll, coll_index,
1094 rbd_req_cb, 0, NULL);
1095
1096 rbd_destroy_ops(ops);
1097done:
1098 kfree(seg_name);
1099 return ret;
1100}
1101
1102/*
1103 * Request async osd write
1104 */
1105static int rbd_req_write(struct request *rq,
1106 struct rbd_device *rbd_dev,
1107 struct ceph_snap_context *snapc,
1108 u64 ofs, u64 len,
1109 struct bio *bio,
1110 struct rbd_req_coll *coll,
1111 int coll_index)
1112{
1113 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1114 CEPH_OSD_OP_WRITE,
1115 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1116 2,
1117 ofs, len, bio, coll, coll_index);
1118}
1119
1120/*
1121 * Request async osd read
1122 */
1123static int rbd_req_read(struct request *rq,
1124 struct rbd_device *rbd_dev,
1125 u64 snapid,
1126 u64 ofs, u64 len,
1127 struct bio *bio,
1128 struct rbd_req_coll *coll,
1129 int coll_index)
1130{
1131 return rbd_do_op(rq, rbd_dev, NULL,
1132 (snapid ? snapid : CEPH_NOSNAP),
1133 CEPH_OSD_OP_READ,
1134 CEPH_OSD_FLAG_READ,
1135 2,
1136 ofs, len, bio, coll, coll_index);
1137}
1138
1139/*
1140 * Request sync osd read
1141 */
1142static int rbd_req_sync_read(struct rbd_device *dev,
1143 struct ceph_snap_context *snapc,
1144 u64 snapid,
1145 const char *obj,
1146 u64 ofs, u64 len,
1147 char *buf,
1148 u64 *ver)
1149{
1150 return rbd_req_sync_op(dev, NULL,
1151 (snapid ? snapid : CEPH_NOSNAP),
1152 CEPH_OSD_OP_READ,
1153 CEPH_OSD_FLAG_READ,
1154 NULL,
1155 1, obj, ofs, len, buf, NULL, ver);
1156}
1157
1158/*
1159 * Request sync osd watch
1160 */
1161static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1162 u64 ver,
1163 u64 notify_id,
1164 const char *obj)
1165{
1166 struct ceph_osd_req_op *ops;
1167 struct page **pages = NULL;
1168 int ret;
1169
1170 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1171 if (ret < 0)
1172 return ret;
1173
1174 ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1175 ops[0].watch.cookie = notify_id;
1176 ops[0].watch.flag = 0;
1177
1178 ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1179 obj, 0, 0, NULL,
1180 pages, 0,
1181 CEPH_OSD_FLAG_READ,
1182 ops,
1183 1,
1184 NULL, 0,
1185 rbd_simple_req_cb, 0, NULL);
1186
1187 rbd_destroy_ops(ops);
1188 return ret;
1189}
1190
1191static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1192{
1193 struct rbd_device *dev = (struct rbd_device *)data;
1194 int rc;
1195
1196 if (!dev)
1197 return;
1198
1199 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1200 notify_id, (int)opcode);
1201 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1202 rc = __rbd_update_snaps(dev);
1203 mutex_unlock(&ctl_mutex);
1204 if (rc)
1205 pr_warning(DRV_NAME "%d got notification but failed to update"
1206 " snaps: %d\n", dev->major, rc);
1207
1208 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1209}
1210
1211/*
1212 * Request sync osd watch
1213 */
1214static int rbd_req_sync_watch(struct rbd_device *dev,
1215 const char *obj,
1216 u64 ver)
1217{
1218 struct ceph_osd_req_op *ops;
1219 struct ceph_osd_client *osdc = &dev->client->osdc;
1220
1221 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1222 if (ret < 0)
1223 return ret;
1224
1225 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1226 (void *)dev, &dev->watch_event);
1227 if (ret < 0)
1228 goto fail;
1229
1230 ops[0].watch.ver = cpu_to_le64(ver);
1231 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1232 ops[0].watch.flag = 1;
1233
1234 ret = rbd_req_sync_op(dev, NULL,
1235 CEPH_NOSNAP,
1236 0,
1237 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1238 ops,
1239 1, obj, 0, 0, NULL,
1240 &dev->watch_request, NULL);
1241
1242 if (ret < 0)
1243 goto fail_event;
1244
1245 rbd_destroy_ops(ops);
1246 return 0;
1247
1248fail_event:
1249 ceph_osdc_cancel_event(dev->watch_event);
1250 dev->watch_event = NULL;
1251fail:
1252 rbd_destroy_ops(ops);
1253 return ret;
1254}
1255
1256struct rbd_notify_info {
1257 struct rbd_device *dev;
1258};
1259
1260static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1261{
1262 struct rbd_device *dev = (struct rbd_device *)data;
1263 if (!dev)
1264 return;
1265
1266 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1267 notify_id, (int)opcode);
1268}
1269
1270/*
1271 * Request sync osd notify
1272 */
1273static int rbd_req_sync_notify(struct rbd_device *dev,
1274 const char *obj)
1275{
1276 struct ceph_osd_req_op *ops;
1277 struct ceph_osd_client *osdc = &dev->client->osdc;
1278 struct ceph_osd_event *event;
1279 struct rbd_notify_info info;
1280 int payload_len = sizeof(u32) + sizeof(u32);
1281 int ret;
1282
1283 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1284 if (ret < 0)
1285 return ret;
1286
1287 info.dev = dev;
1288
1289 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1290 (void *)&info, &event);
1291 if (ret < 0)
1292 goto fail;
1293
1294 ops[0].watch.ver = 1;
1295 ops[0].watch.flag = 1;
1296 ops[0].watch.cookie = event->cookie;
1297 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1298 ops[0].watch.timeout = 12;
1299
1300 ret = rbd_req_sync_op(dev, NULL,
1301 CEPH_NOSNAP,
1302 0,
1303 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1304 ops,
1305 1, obj, 0, 0, NULL, NULL, NULL);
1306 if (ret < 0)
1307 goto fail_event;
1308
1309 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1310 dout("ceph_osdc_wait_event returned %d\n", ret);
1311 rbd_destroy_ops(ops);
1312 return 0;
1313
1314fail_event:
1315 ceph_osdc_cancel_event(event);
1316fail:
1317 rbd_destroy_ops(ops);
1318 return ret;
1319}
1320
1321/*
1322 * Request sync osd rollback
1323 */
1324static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
1325 u64 snapid,
1326 const char *obj)
1327{
1328 struct ceph_osd_req_op *ops;
1329 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
1330 if (ret < 0)
1331 return ret;
1332
1333 ops[0].snap.snapid = snapid;
1334
1335 ret = rbd_req_sync_op(dev, NULL,
1336 CEPH_NOSNAP,
1337 0,
1338 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1339 ops,
1340 1, obj, 0, 0, NULL, NULL, NULL);
1341
1342 rbd_destroy_ops(ops);
1343
1344 return ret;
1345}
1346
1347/*
1348 * Request sync osd read
1349 */
1350static int rbd_req_sync_exec(struct rbd_device *dev,
1351 const char *obj,
1352 const char *cls,
1353 const char *method,
1354 const char *data,
1355 int len,
1356 u64 *ver)
1357{
1358 struct ceph_osd_req_op *ops;
1359 int cls_len = strlen(cls);
1360 int method_len = strlen(method);
1361 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1362 cls_len + method_len + len);
1363 if (ret < 0)
1364 return ret;
1365
1366 ops[0].cls.class_name = cls;
1367 ops[0].cls.class_len = (__u8)cls_len;
1368 ops[0].cls.method_name = method;
1369 ops[0].cls.method_len = (__u8)method_len;
1370 ops[0].cls.argc = 0;
1371 ops[0].cls.indata = data;
1372 ops[0].cls.indata_len = len;
1373
1374 ret = rbd_req_sync_op(dev, NULL,
1375 CEPH_NOSNAP,
1376 0,
1377 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1378 ops,
1379 1, obj, 0, 0, NULL, NULL, ver);
1380
1381 rbd_destroy_ops(ops);
1382
1383 dout("cls_exec returned %d\n", ret);
1384 return ret;
1385}
1386
1387static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1388{
1389 struct rbd_req_coll *coll =
1390 kzalloc(sizeof(struct rbd_req_coll) +
1391 sizeof(struct rbd_req_status) * num_reqs,
1392 GFP_ATOMIC);
1393
1394 if (!coll)
1395 return NULL;
1396 coll->total = num_reqs;
1397 kref_init(&coll->kref);
1398 return coll;
1399}
1400
1401/*
1402 * block device queue callback
1403 */
1404static void rbd_rq_fn(struct request_queue *q)
1405{
1406 struct rbd_device *rbd_dev = q->queuedata;
1407 struct request *rq;
1408 struct bio_pair *bp = NULL;
1409
1410 rq = blk_fetch_request(q);
1411
1412 while (1) {
1413 struct bio *bio;
1414 struct bio *rq_bio, *next_bio = NULL;
1415 bool do_write;
1416 int size, op_size = 0;
1417 u64 ofs;
1418 int num_segs, cur_seg = 0;
1419 struct rbd_req_coll *coll;
1420
1421 /* peek at request from block layer */
1422 if (!rq)
1423 break;
1424
1425 dout("fetched request\n");
1426
1427 /* filter out block requests we don't understand */
1428 if ((rq->cmd_type != REQ_TYPE_FS)) {
1429 __blk_end_request_all(rq, 0);
1430 goto next;
1431 }
1432
1433 /* deduce our operation (read, write) */
1434 do_write = (rq_data_dir(rq) == WRITE);
1435
1436 size = blk_rq_bytes(rq);
1437 ofs = blk_rq_pos(rq) * 512ULL;
1438 rq_bio = rq->bio;
1439 if (do_write && rbd_dev->read_only) {
1440 __blk_end_request_all(rq, -EROFS);
1441 goto next;
1442 }
1443
1444 spin_unlock_irq(q->queue_lock);
1445
1446 dout("%s 0x%x bytes at 0x%llx\n",
1447 do_write ? "write" : "read",
1448 size, blk_rq_pos(rq) * 512ULL);
1449
1450 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1451 coll = rbd_alloc_coll(num_segs);
1452 if (!coll) {
1453 spin_lock_irq(q->queue_lock);
1454 __blk_end_request_all(rq, -ENOMEM);
1455 goto next;
1456 }
1457
1458 do {
1459 /* a bio clone to be passed down to OSD req */
1460 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1461 op_size = rbd_get_segment(&rbd_dev->header,
1462 rbd_dev->header.block_name,
1463 ofs, size,
1464 NULL, NULL);
1465 kref_get(&coll->kref);
1466 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1467 op_size, GFP_ATOMIC);
1468 if (!bio) {
1469 rbd_coll_end_req_index(rq, coll, cur_seg,
1470 -ENOMEM, op_size);
1471 goto next_seg;
1472 }
1473
1474
1475 /* init OSD command: write or read */
1476 if (do_write)
1477 rbd_req_write(rq, rbd_dev,
1478 rbd_dev->header.snapc,
1479 ofs,
1480 op_size, bio,
1481 coll, cur_seg);
1482 else
1483 rbd_req_read(rq, rbd_dev,
1484 cur_snap_id(rbd_dev),
1485 ofs,
1486 op_size, bio,
1487 coll, cur_seg);
1488
1489next_seg:
1490 size -= op_size;
1491 ofs += op_size;
1492
1493 cur_seg++;
1494 rq_bio = next_bio;
1495 } while (size > 0);
1496 kref_put(&coll->kref, rbd_coll_release);
1497
1498 if (bp)
1499 bio_pair_release(bp);
1500 spin_lock_irq(q->queue_lock);
1501next:
1502 rq = blk_fetch_request(q);
1503 }
1504}
1505
1506/*
1507 * a queue callback. Makes sure that we don't create a bio that spans
1508 * multiple osd objects. One exception is a single-page bio,
1509 * which we handle later in bio_chain_clone()
1510 */
1511static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1512 struct bio_vec *bvec)
1513{
1514 struct rbd_device *rbd_dev = q->queuedata;
1515 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1516 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1517 unsigned int bio_sectors = bmd->bi_size >> 9;
1518 int max;
1519
1520 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1521 + bio_sectors)) << 9;
1522 if (max < 0)
1523 max = 0; /* bio_add cannot handle a negative return */
1524 if (max <= bvec->bv_len && bio_sectors == 0)
1525 return bvec->bv_len;
1526 return max;
1527}
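/*
 * Worked example (assuming the default obj_order of 22, i.e. 4 MB objects as
 * defined by RBD_DEFAULT_OBJ_ORDER in rbd_types.h): chunk_sectors is
 * 1 << (22 - 9) = 8192.  For an empty bio (bio_sectors == 0) that starts
 * 8000 sectors into an object, max = (8192 - 8000) << 9 = 98304 bytes, so at
 * most 192 further sectors fit before the bio would cross into the next
 * object.
 */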
1528
1529static void rbd_free_disk(struct rbd_device *rbd_dev)
1530{
1531 struct gendisk *disk = rbd_dev->disk;
1532
1533 if (!disk)
1534 return;
1535
1536 rbd_header_free(&rbd_dev->header);
1537
1538 if (disk->flags & GENHD_FL_UP)
1539 del_gendisk(disk);
1540 if (disk->queue)
1541 blk_cleanup_queue(disk->queue);
1542 put_disk(disk);
1543}
1544
1545/*
1546 * reload the on-disk header
1547 */
1548static int rbd_read_header(struct rbd_device *rbd_dev,
1549 struct rbd_image_header *header)
1550{
1551 ssize_t rc;
1552 struct rbd_image_header_ondisk *dh;
1553 int snap_count = 0;
1554 u64 snap_names_len = 0;
1555 u64 ver;
1556
1557 while (1) {
1558 int len = sizeof(*dh) +
1559 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1560 snap_names_len;
1561
1562 rc = -ENOMEM;
1563 dh = kmalloc(len, GFP_KERNEL);
1564 if (!dh)
1565 return -ENOMEM;
1566
1567 rc = rbd_req_sync_read(rbd_dev,
1568 NULL, CEPH_NOSNAP,
1569 rbd_dev->obj_md_name,
1570 0, len,
1571 (char *)dh, &ver);
1572 if (rc < 0)
1573 goto out_dh;
1574
1575 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
1576 if (rc < 0)
1577 goto out_dh;
1578
1579 if (snap_count != header->total_snaps) {
1580 snap_count = header->total_snaps;
1581 snap_names_len = header->snap_names_len;
1582 rbd_header_free(header);
1583 kfree(dh);
1584 continue;
1585 }
1586 break;
1587 }
1588 header->obj_version = ver;
1589
1590out_dh:
1591 kfree(dh);
1592 return rc;
1593}
1594
1595/*
1596 * create a snapshot
1597 */
1598static int rbd_header_add_snap(struct rbd_device *dev,
1599 const char *snap_name,
1600 gfp_t gfp_flags)
1601{
1602 int name_len = strlen(snap_name);
1603 u64 new_snapid;
1604 int ret;
1605 void *data, *p, *e;
1606 u64 ver;
1607
1608 /* we should create a snapshot only if we're pointing at the head */
1609 if (dev->cur_snap)
1610 return -EINVAL;
1611
1612 ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1613 &new_snapid);
1614 dout("created snapid=%lld\n", new_snapid);
1615 if (ret < 0)
1616 return ret;
1617
1618 data = kmalloc(name_len + 16, gfp_flags);
1619 if (!data)
1620 return -ENOMEM;
1621
1622 p = data;
1623 e = data + name_len + 16;
1624
1625 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1626 ceph_encode_64_safe(&p, e, new_snapid, bad);
1627
1628 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1629 data, p - data, &ver);
1630
1631 kfree(data);
1632
1633 if (ret < 0)
1634 return ret;
1635
1636 dev->header.snapc->seq = new_snapid;
1637
1638 return 0;
1639bad:
1640 return -ERANGE;
1641}
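/*
 * The payload built above is simply the snapshot name followed by the newly
 * allocated snap id, serialized with the ceph encode helpers (a
 * length-prefixed string and a 64-bit id), so the name_len + 16 byte buffer
 * comfortably holds both.
 */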
1642
1643static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1644{
1645 struct rbd_snap *snap;
1646
1647 while (!list_empty(&rbd_dev->snaps)) {
1648 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1649 __rbd_remove_snap_dev(rbd_dev, snap);
1650 }
1651}
1652
1653/*
1654 * re-read the on-disk header and refresh the in-memory snapshot state
1655 */
1656static int __rbd_update_snaps(struct rbd_device *rbd_dev)
1657{
1658 int ret;
1659 struct rbd_image_header h;
1660 u64 snap_seq;
1661 int follow_seq = 0;
1662
1663 ret = rbd_read_header(rbd_dev, &h);
1664 if (ret < 0)
1665 return ret;
1666
1667 /* resized? */
1668 set_capacity(rbd_dev->disk, h.image_size / 512ULL);
1669
1670 down_write(&rbd_dev->header.snap_rwsem);
1671
1672 snap_seq = rbd_dev->header.snapc->seq;
1673 if (rbd_dev->header.total_snaps &&
1674 rbd_dev->header.snapc->snaps[0] == snap_seq)
1675 /* pointing at the head; we'll need to follow it
1676 if the head moves */
1677 follow_seq = 1;
1678
1679 kfree(rbd_dev->header.snapc);
1680 kfree(rbd_dev->header.snap_names);
1681 kfree(rbd_dev->header.snap_sizes);
1682
1683 rbd_dev->header.total_snaps = h.total_snaps;
1684 rbd_dev->header.snapc = h.snapc;
1685 rbd_dev->header.snap_names = h.snap_names;
1686 rbd_dev->header.snap_names_len = h.snap_names_len;
1687 rbd_dev->header.snap_sizes = h.snap_sizes;
1688 if (follow_seq)
1689 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1690 else
1691 rbd_dev->header.snapc->seq = snap_seq;
1692
1693 ret = __rbd_init_snaps_header(rbd_dev);
1694
1695 up_write(&rbd_dev->header.snap_rwsem);
1696
1697 return ret;
1698}
1699
1700static int rbd_init_disk(struct rbd_device *rbd_dev)
1701{
1702 struct gendisk *disk;
1703 struct request_queue *q;
1704 int rc;
1705 u64 total_size = 0;
1706
1707 /* contact OSD, request size info about the object being mapped */
1708 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1709 if (rc)
1710 return rc;
1711
1712 /* no need to lock here, as rbd_dev is not registered yet */
1713 rc = __rbd_init_snaps_header(rbd_dev);
1714 if (rc)
1715 return rc;
1716
1717 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1718 if (rc)
1719 return rc;
1720
1721 /* create gendisk info */
1722 rc = -ENOMEM;
1723 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1724 if (!disk)
1725 goto out;
1726
1727 snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
1728 rbd_dev->id);
1729 disk->major = rbd_dev->major;
1730 disk->first_minor = 0;
1731 disk->fops = &rbd_bd_ops;
1732 disk->private_data = rbd_dev;
1733
1734 /* init rq */
1735 rc = -ENOMEM;
1736 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1737 if (!q)
1738 goto out_disk;
1739 blk_queue_merge_bvec(q, rbd_merge_bvec);
1740 disk->queue = q;
1741
1742 q->queuedata = rbd_dev;
1743
1744 rbd_dev->disk = disk;
1745 rbd_dev->q = q;
1746
1747 /* finally, announce the disk to the world */
1748 set_capacity(disk, total_size / 512ULL);
1749 add_disk(disk);
1750
1751 pr_info("%s: added with size 0x%llx\n",
1752 disk->disk_name, (unsigned long long)total_size);
1753 return 0;
1754
1755out_disk:
1756 put_disk(disk);
1757out:
1758 return rc;
1759}
1760
1761/*
1762 sysfs
1763*/
1764
1765static ssize_t rbd_size_show(struct device *dev,
1766 struct device_attribute *attr, char *buf)
1767{
1768 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1769
1770 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1771}
1772
1773static ssize_t rbd_major_show(struct device *dev,
1774 struct device_attribute *attr, char *buf)
1775{
1776 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1777
1778 return sprintf(buf, "%d\n", rbd_dev->major);
1779}
1780
1781static ssize_t rbd_client_id_show(struct device *dev,
1782 struct device_attribute *attr, char *buf)
1783{
1784 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1785
1786 return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
1787}
1788
1789static ssize_t rbd_pool_show(struct device *dev,
1790 struct device_attribute *attr, char *buf)
1791{
1792 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1793
1794 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1795}
1796
1797static ssize_t rbd_name_show(struct device *dev,
1798 struct device_attribute *attr, char *buf)
1799{
1800 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1801
1802 return sprintf(buf, "%s\n", rbd_dev->obj);
1803}
1804
1805static ssize_t rbd_snap_show(struct device *dev,
1806 struct device_attribute *attr,
1807 char *buf)
1808{
1809 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1810
1811 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1812}
1813
1814static ssize_t rbd_image_refresh(struct device *dev,
1815 struct device_attribute *attr,
1816 const char *buf,
1817 size_t size)
1818{
1819 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1820 int rc;
1821 int ret = size;
1822
1823 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1824
1825 rc = __rbd_update_snaps(rbd_dev);
1826 if (rc < 0)
1827 ret = rc;
1828
1829 mutex_unlock(&ctl_mutex);
1830 return ret;
1831}
1832
1833static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1834static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1835static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1836static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1837static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1838static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1839static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1840static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1841static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
1842
1843static struct attribute *rbd_attrs[] = {
1844 &dev_attr_size.attr,
1845 &dev_attr_major.attr,
1846 &dev_attr_client_id.attr,
1847 &dev_attr_pool.attr,
1848 &dev_attr_name.attr,
1849 &dev_attr_current_snap.attr,
1850 &dev_attr_refresh.attr,
1851 &dev_attr_create_snap.attr,
1852 &dev_attr_rollback_snap.attr,
1853 NULL
1854};
1855
1856static struct attribute_group rbd_attr_group = {
1857 .attrs = rbd_attrs,
1858};
1859
1860static const struct attribute_group *rbd_attr_groups[] = {
1861 &rbd_attr_group,
1862 NULL
1863};
1864
1865static void rbd_sysfs_dev_release(struct device *dev)
1866{
1867}
1868
1869static struct device_type rbd_device_type = {
1870 .name = "rbd",
1871 .groups = rbd_attr_groups,
1872 .release = rbd_sysfs_dev_release,
1873};
1874
1875
1876/*
1877 sysfs - snapshots
1878*/
1879
1880static ssize_t rbd_snap_size_show(struct device *dev,
1881 struct device_attribute *attr,
1882 char *buf)
1883{
1884 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1885
1886 return sprintf(buf, "%lld\n", (long long)snap->size);
1887}
1888
1889static ssize_t rbd_snap_id_show(struct device *dev,
1890 struct device_attribute *attr,
1891 char *buf)
1892{
1893 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1894
1895 return sprintf(buf, "%lld\n", (long long)snap->id);
1896}
1897
1898static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1899static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1900
1901static struct attribute *rbd_snap_attrs[] = {
1902 &dev_attr_snap_size.attr,
1903 &dev_attr_snap_id.attr,
1904 NULL,
1905};
1906
1907static struct attribute_group rbd_snap_attr_group = {
1908 .attrs = rbd_snap_attrs,
1909};
1910
1911static void rbd_snap_dev_release(struct device *dev)
1912{
1913 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1914 kfree(snap->name);
1915 kfree(snap);
1916}
1917
1918static const struct attribute_group *rbd_snap_attr_groups[] = {
1919 &rbd_snap_attr_group,
1920 NULL
1921};
1922
1923static struct device_type rbd_snap_device_type = {
1924 .groups = rbd_snap_attr_groups,
1925 .release = rbd_snap_dev_release,
1926};
1927
1928static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1929 struct rbd_snap *snap)
1930{
1931 list_del(&snap->node);
1932 device_unregister(&snap->dev);
1933}
1934
1935static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1936 struct rbd_snap *snap,
1937 struct device *parent)
1938{
1939 struct device *dev = &snap->dev;
1940 int ret;
1941
1942 dev->type = &rbd_snap_device_type;
1943 dev->parent = parent;
1944 dev->release = rbd_snap_dev_release;
1945 dev_set_name(dev, "snap_%s", snap->name);
1946 ret = device_register(dev);
1947
1948 return ret;
1949}
1950
1951static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1952 int i, const char *name,
1953 struct rbd_snap **snapp)
1954{
1955 int ret;
1956 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1957 if (!snap)
1958 return -ENOMEM;
1959 snap->name = kstrdup(name, GFP_KERNEL);
1960 snap->size = rbd_dev->header.snap_sizes[i];
1961 snap->id = rbd_dev->header.snapc->snaps[i];
1962 if (device_is_registered(&rbd_dev->dev)) {
1963 ret = rbd_register_snap_dev(rbd_dev, snap,
1964 &rbd_dev->dev);
1965 if (ret < 0)
1966 goto err;
1967 }
1968 *snapp = snap;
1969 return 0;
1970err:
1971 kfree(snap->name);
1972 kfree(snap);
1973 return ret;
1974}
1975
1976/*
1977 * search for the previous snap in a NUL-delimited string list
1978 */
1979const char *rbd_prev_snap_name(const char *name, const char *start)
1980{
1981 if (name < start + 2)
1982 return NULL;
1983
1984 name -= 2;
1985 while (*name) {
1986 if (name == start)
1987 return start;
1988 name--;
1989 }
1990 return name + 1;
1991}
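/*
 * Example of the traversal: for a buffer laid out as "snap3\0snap2\0snap1\0",
 * a first call with name pointing just past the final NUL returns a pointer
 * to "snap1"; calling again from there returns "snap2", then "snap3" (the
 * start of the buffer), and finally NULL when called with the start of the
 * buffer itself.
 */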
1992
1993/*
1994 * compare the old list of snapshots that we have to what's in the header
1995 * and update it accordingly. Note that the header holds the snapshots
1996 * in reverse order (from newest to oldest) and we need to go from
1997 * oldest to newest so that we don't get a duplicate snap name while
1998 * doing so (e.g., a snapshot was removed and a new one recreated
1999 * with the same name).
2000 */
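/*
 * Illustrative scenario for the ordering above: if snapshot "s1" is deleted
 * and a new snapshot is later created with the same name, the old entry has
 * the lower id.  Walking from oldest to newest lets us unregister the stale
 * "snap_s1" sysfs device before the new one is added, avoiding a name clash.
 */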
2001static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2002{
2003 const char *name, *first_name;
2004 int i = rbd_dev->header.total_snaps;
2005 struct rbd_snap *snap, *old_snap = NULL;
2006 int ret;
2007 struct list_head *p, *n;
2008
2009 first_name = rbd_dev->header.snap_names;
2010 name = first_name + rbd_dev->header.snap_names_len;
2011
2012 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2013 u64 cur_id;
2014
2015 old_snap = list_entry(p, struct rbd_snap, node);
2016
2017 if (i)
2018 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2019
2020 if (!i || old_snap->id < cur_id) {
2021 /* old_snap->id was skipped, thus was removed */
2022 __rbd_remove_snap_dev(rbd_dev, old_snap);
2023 continue;
2024 }
2025 if (old_snap->id == cur_id) {
2026 /* we have this snapshot already */
2027 i--;
2028 name = rbd_prev_snap_name(name, first_name);
2029 continue;
2030 }
2031 for (; i > 0;
2032 i--, name = rbd_prev_snap_name(name, first_name)) {
2033 if (!name) {
2034 WARN_ON(1);
2035 return -EINVAL;
2036 }
2037 cur_id = rbd_dev->header.snapc->snaps[i];
2038 /* snapshot removal? handle it above */
2039 if (cur_id >= old_snap->id)
2040 break;
2041 /* a new snapshot */
2042 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2043 if (ret < 0)
2044 return ret;
2045
2046 /* note that we add it backward so using n and not p */
2047 list_add(&snap->node, n);
2048 p = &snap->node;
2049 }
2050 }
2051 /* we're done going over the old snap list, just add what's left */
2052 for (; i > 0; i--) {
2053 name = rbd_prev_snap_name(name, first_name);
2054 if (!name) {
2055 WARN_ON(1);
2056 return -EINVAL;
2057 }
2058 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2059 if (ret < 0)
2060 return ret;
2061 list_add(&snap->node, &rbd_dev->snaps);
2062 }
2063
2064 return 0;
2065}
2066
2067
2068static void rbd_root_dev_release(struct device *dev)
2069{
2070}
2071
2072static struct device rbd_root_dev = {
2073 .init_name = "rbd",
2074 .release = rbd_root_dev_release,
2075};
2076
2077static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2078{
2079 int ret = -ENOMEM;
2080 struct device *dev;
2081 struct rbd_snap *snap;
2082
2083 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2084 dev = &rbd_dev->dev;
2085
2086 dev->bus = &rbd_bus_type;
2087 dev->type = &rbd_device_type;
2088 dev->parent = &rbd_root_dev;
2089 dev->release = rbd_dev_release;
2090 dev_set_name(dev, "%d", rbd_dev->id);
2091 ret = device_register(dev);
2092 if (ret < 0)
2093 goto done_free;
2094
2095 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2096 ret = rbd_register_snap_dev(rbd_dev, snap,
2097 &rbd_dev->dev);
2098 if (ret < 0)
2099 break;
2100 }
2101
2102 mutex_unlock(&ctl_mutex);
2103 return 0;
2104done_free:
2105 mutex_unlock(&ctl_mutex);
2106 return ret;
2107}
2108
2109static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2110{
2111 device_unregister(&rbd_dev->dev);
2112}
2113
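/*
 * Register a watch on the image's header object.  The retry below assumes
 * that -ERANGE indicates the header object version we passed is stale, so we
 * refresh the in-memory header (which also updates obj_version) and try
 * again.
 */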
2114static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2115{
2116 int ret, rc;
2117
2118 do {
2119 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2120 rbd_dev->header.obj_version);
2121 if (ret == -ERANGE) {
2122 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2123 rc = __rbd_update_snaps(rbd_dev);
2124 mutex_unlock(&ctl_mutex);
2125 if (rc < 0)
2126 return rc;
2127 }
2128 } while (ret == -ERANGE);
2129
2130 return ret;
2131}
2132
2133static ssize_t rbd_add(struct bus_type *bus,
2134 const char *buf,
2135 size_t count)
2136{
2137 struct ceph_osd_client *osdc;
2138 struct rbd_device *rbd_dev;
2139 ssize_t rc = -ENOMEM;
2140 int irc, new_id = 0;
2141 struct list_head *tmp;
2142 char *mon_dev_name;
2143 char *options;
2144
2145 if (!try_module_get(THIS_MODULE))
2146 return -ENODEV;
2147
2148 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2149 if (!mon_dev_name)
2150 goto err_out_mod;
2151
2152 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2153 if (!options)
2154 goto err_mon_dev;
2155
2156 /* new rbd_device object */
2157 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2158 if (!rbd_dev)
2159 goto err_out_opt;
2160
2161 /* static rbd_device initialization */
2162 spin_lock_init(&rbd_dev->lock);
2163 INIT_LIST_HEAD(&rbd_dev->node);
2164 INIT_LIST_HEAD(&rbd_dev->snaps);
2165
2166 /* generate unique id: find highest unique id, add one */
2167 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2168
2169 list_for_each(tmp, &rbd_dev_list) {
2170 struct rbd_device *rbd_dev;
2171
2172 rbd_dev = list_entry(tmp, struct rbd_device, node);
2173 if (rbd_dev->id >= new_id)
2174 new_id = rbd_dev->id + 1;
2175 }
2176
2177 rbd_dev->id = new_id;
2178
2179 /* add to global list */
2180 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2181
2182 /* parse add command */
2183 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2184 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2185 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2186 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2187 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2188 mon_dev_name, options, rbd_dev->pool_name,
2189 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2190 rc = -EINVAL;
2191 goto err_out_slot;
2192 }
2193
2194 if (rbd_dev->snap_name[0] == 0)
2195 rbd_dev->snap_name[0] = '-';
2196
2197 rbd_dev->obj_len = strlen(rbd_dev->obj);
2198 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2199 rbd_dev->obj, RBD_SUFFIX);
2200
2201 /* initialize rest of new object */
2202 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2203 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2204 if (rc < 0)
2205 goto err_out_slot;
2206
2207 mutex_unlock(&ctl_mutex);
2208
2209 /* pick the pool */
2210 osdc = &rbd_dev->client->osdc;
2211 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2212 if (rc < 0)
2213 goto err_out_client;
2214 rbd_dev->poolid = rc;
2215
2216 /* register our block device */
2217 irc = register_blkdev(0, rbd_dev->name);
2218 if (irc < 0) {
2219 rc = irc;
2220 goto err_out_client;
2221 }
2222 rbd_dev->major = irc;
2223
2224 rc = rbd_bus_add_dev(rbd_dev);
2225 if (rc)
2226 goto err_out_blkdev;
2227
2228 /* set up and announce blkdev mapping */
2229 rc = rbd_init_disk(rbd_dev);
2230 if (rc)
2231 goto err_out_bus;
2232
2233 rc = rbd_init_watch_dev(rbd_dev);
2234 if (rc)
2235 goto err_out_bus;
2236
2237 return count;
2238
2239err_out_bus:
2240 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2241 list_del_init(&rbd_dev->node);
2242 mutex_unlock(&ctl_mutex);
2243
2244 /* this will also clean up the rest of the rbd_dev state */
2245
2246 rbd_bus_del_dev(rbd_dev);
2247 kfree(options);
2248 kfree(mon_dev_name);
2249 return rc;
2250
2251err_out_blkdev:
2252 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2253err_out_client:
2254 rbd_put_client(rbd_dev);
2255 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2256err_out_slot:
2257 list_del_init(&rbd_dev->node);
2258 mutex_unlock(&ctl_mutex);
2259
2260 kfree(rbd_dev);
2261err_out_opt:
2262 kfree(options);
2263err_mon_dev:
2264 kfree(mon_dev_name);
2265err_out_mod:
2266 dout("Error adding device %s\n", buf);
2267 module_put(THIS_MODULE);
2268 return rc;
2269}
2270
2271static struct rbd_device *__rbd_get_dev(unsigned long id)
2272{
2273 struct list_head *tmp;
2274 struct rbd_device *rbd_dev;
2275
2276 list_for_each(tmp, &rbd_dev_list) {
2277 rbd_dev = list_entry(tmp, struct rbd_device, node);
2278 if (rbd_dev->id == id)
2279 return rbd_dev;
2280 }
2281 return NULL;
2282}
2283
2284static void rbd_dev_release(struct device *dev)
2285{
2286 struct rbd_device *rbd_dev =
2287 container_of(dev, struct rbd_device, dev);
2288
2289 if (rbd_dev->watch_request)
2290 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
2291 rbd_dev->watch_request);
2292 if (rbd_dev->watch_event)
2293 ceph_osdc_cancel_event(rbd_dev->watch_event);
2294
2295 rbd_put_client(rbd_dev);
2296
2297 /* clean up and free blkdev */
2298 rbd_free_disk(rbd_dev);
2299 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2300 kfree(rbd_dev);
2301
2302 /* release module ref */
2303 module_put(THIS_MODULE);
2304}
2305
2306static ssize_t rbd_remove(struct bus_type *bus,
2307 const char *buf,
2308 size_t count)
2309{
2310 struct rbd_device *rbd_dev = NULL;
2311 int target_id, rc;
2312 unsigned long ul;
2313 int ret = count;
2314
2315 rc = strict_strtoul(buf, 10, &ul);
2316 if (rc)
2317 return rc;
2318
2319 /* convert to int; abort if we lost anything in the conversion */
2320 target_id = (int) ul;
2321 if (target_id != ul)
2322 return -EINVAL;
2323
2324 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2325
2326 rbd_dev = __rbd_get_dev(target_id);
2327 if (!rbd_dev) {
2328 ret = -ENOENT;
2329 goto done;
2330 }
2331
2332 list_del_init(&rbd_dev->node);
2333
2334 __rbd_remove_all_snaps(rbd_dev);
2335 rbd_bus_del_dev(rbd_dev);
2336
2337done:
2338 mutex_unlock(&ctl_mutex);
2339 return ret;
2340}
2341
2342static ssize_t rbd_snap_add(struct device *dev,
2343 struct device_attribute *attr,
2344 const char *buf,
2345 size_t count)
2346{
2347 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2348 int ret;
2349 char *name = kmalloc(count + 1, GFP_KERNEL);
2350 if (!name)
2351 return -ENOMEM;
2352
2353 snprintf(name, count, "%s", buf);
2354
2355 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2356
2357 ret = rbd_header_add_snap(rbd_dev,
2358 name, GFP_KERNEL);
2359 if (ret < 0)
2360 goto err_unlock;
2361
2362 ret = __rbd_update_snaps(rbd_dev);
2363 if (ret < 0)
2364 goto err_unlock;
2365
2366 /* we shouldn't hold ctl_mutex when notifying; the notify might
2367 trigger a watch callback that would need to take that mutex */
2368 mutex_unlock(&ctl_mutex);
2369
2370 /* make a best effort, don't error if failed */
2371 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2372
2373 ret = count;
2374 kfree(name);
2375 return ret;
2376
2377err_unlock:
2378 mutex_unlock(&ctl_mutex);
2379 kfree(name);
2380 return ret;
2381}
2382
2383static ssize_t rbd_snap_rollback(struct device *dev,
2384 struct device_attribute *attr,
2385 const char *buf,
2386 size_t count)
2387{
2388 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2389 int ret;
2390 u64 snapid;
2391 u64 cur_ofs;
2392 char *seg_name = NULL;
2393 char *snap_name = kmalloc(count + 1, GFP_KERNEL);
2394 ret = -ENOMEM;
2395 if (!snap_name)
2396 return ret;
2397
2398 /* parse the snapshot name from the rollback command */
2399 snprintf(snap_name, count, "%s", buf);
2400 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
2401 if (!seg_name)
2402 goto done;
2403
2404 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2405
2406 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
2407 if (ret < 0)
2408 goto done_unlock;
2409
2410 dout("snapid=%lld\n", snapid);
2411
2412 cur_ofs = 0;
2413 while (cur_ofs < rbd_dev->header.image_size) {
2414 cur_ofs += rbd_get_segment(&rbd_dev->header,
2415 rbd_dev->obj,
2416 cur_ofs, (u64)-1,
2417 seg_name, NULL);
2418 dout("seg_name=%s\n", seg_name);
2419
2420 ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
2421 if (ret < 0)
2422 pr_warning("could not roll back obj %s err=%d\n",
2423 seg_name, ret);
2424 }
2425
2426 ret = __rbd_update_snaps(rbd_dev);
2427 if (ret < 0)
2428 goto done_unlock;
2429
2430 ret = count;
2431
2432done_unlock:
2433 mutex_unlock(&ctl_mutex);
2434done:
2435 kfree(seg_name);
2436 kfree(snap_name);
2437
2438 return ret;
2439}
2440
2441static struct bus_attribute rbd_bus_attrs[] = {
2442 __ATTR(add, S_IWUSR, NULL, rbd_add),
2443 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
2444 __ATTR_NULL
2445};
2446
2447/*
2448 * create control files in sysfs
2449 * /sys/bus/rbd/...
2450 */
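/*
 * Usage sketch (the values below are purely illustrative): rbd_add() expects
 * at least four whitespace-separated fields -- monitor address(es), options,
 * pool and image name -- plus an optional snapshot name, e.g.
 *
 *   echo "1.2.3.4:6789 name=admin rbd myimage" > /sys/bus/rbd/add
 *
 * rbd_remove() takes the numeric id of a mapped device:
 *
 *   echo 0 > /sys/bus/rbd/remove
 */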
2451static int rbd_sysfs_init(void)
2452{
2453 int ret;
2454
2455 rbd_bus_type.bus_attrs = rbd_bus_attrs;
2456
2457 ret = bus_register(&rbd_bus_type);
2458 if (ret < 0)
2459 return ret;
2460
2461 ret = device_register(&rbd_root_dev);
2462
2463 return ret;
2464}
2465
2466static void rbd_sysfs_cleanup(void)
2467{
2468 device_unregister(&rbd_root_dev);
2469 bus_unregister(&rbd_bus_type);
2470}
2471
2472int __init rbd_init(void)
2473{
2474 int rc;
2475
2476 rc = rbd_sysfs_init();
2477 if (rc)
2478 return rc;
2479 spin_lock_init(&node_lock);
2480 pr_info("loaded " DRV_NAME_LONG "\n");
2481 return 0;
2482}
2483
2484void __exit rbd_exit(void)
2485{
2486 rbd_sysfs_cleanup();
2487}
2488
2489module_init(rbd_init);
2490module_exit(rbd_exit);
2491
2492MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2493MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2494MODULE_DESCRIPTION("rados block device");
2495
2496/* following authorship retained from original osdblk.c */
2497MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2498
2499MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#ifndef CEPH_RBD_TYPES_H
14#define CEPH_RBD_TYPES_H
15
16#include <linux/types.h>
17
18/*
19 * rbd image 'foo' consists of objects
20 * foo.rbd - image metadata
21 * foo.00000000
22 * foo.00000001
23 * ... - data
24 */
25
26#define RBD_SUFFIX ".rbd"
27#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info"
29
30#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
31#define RBD_MIN_OBJ_ORDER 16
32#define RBD_MAX_OBJ_ORDER 30
33
34#define RBD_MAX_OBJ_NAME_LEN 96
35#define RBD_MAX_SEG_NAME_LEN 128
36
37#define RBD_COMP_NONE 0
38#define RBD_CRYPT_NONE 0
39
40#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
41#define RBD_HEADER_SIGNATURE "RBD"
42#define RBD_HEADER_VERSION "001.005"
43
44struct rbd_info {
45 __le64 max_id;
46} __attribute__ ((packed));
47
48struct rbd_image_snap_ondisk {
49 __le64 id;
50 __le64 image_size;
51} __attribute__((packed));
52
53struct rbd_image_header_ondisk {
54 char text[40];
55 char block_name[24];
56 char signature[4];
57 char version[8];
58 struct {
59 __u8 order;
60 __u8 crypt_type;
61 __u8 comp_type;
62 __u8 unused;
63 } __attribute__((packed)) options;
64 __le64 image_size;
65 __le64 snap_seq;
66 __le32 snap_count;
67 __le32 reserved;
68 __le64 snap_names_len;
69 struct rbd_image_snap_ondisk snaps[0];
70} __attribute__((packed));
71
72
73#endif
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
index a0b403a6b4ed..e5565fbaeb30 100644
--- a/drivers/block/smart1,2.h
+++ b/drivers/block/smart1,2.h
@@ -95,7 +95,7 @@ static unsigned long smart4_completed(ctlr_info_t *h)
95 /* 95 /*
96 * This hardware returns interrupt pending at a different place and 96 * This hardware returns interrupt pending at a different place and
97 * it does not tell us if the fifo is empty, we will have check 97 * it does not tell us if the fifo is empty, we will have check
98 * that by getting a 0 back from the comamnd_completed call. 98 * that by getting a 0 back from the command_completed call.
99 */ 99 */
100static unsigned long smart4_intr_pending(ctlr_info_t *h) 100static unsigned long smart4_intr_pending(ctlr_info_t *h)
101{ 101{
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 2e46815876df..fd5adcd55944 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -20,7 +20,7 @@
20#include <linux/fd.h> 20#include <linux/fd.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/smp_lock.h> 23#include <linux/mutex.h>
24#include <linux/hdreg.h> 24#include <linux/hdreg.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/delay.h> 26#include <linux/delay.h>
@@ -222,6 +222,7 @@ extern int swim_read_sector_header(struct swim __iomem *base,
222extern int swim_read_sector_data(struct swim __iomem *base, 222extern int swim_read_sector_data(struct swim __iomem *base,
223 unsigned char *data); 223 unsigned char *data);
224 224
225static DEFINE_MUTEX(swim_mutex);
225static inline void set_swim_mode(struct swim __iomem *base, int enable) 226static inline void set_swim_mode(struct swim __iomem *base, int enable)
226{ 227{
227 struct iwm __iomem *iwm_base; 228 struct iwm __iomem *iwm_base;
@@ -666,9 +667,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
666{ 667{
667 int ret; 668 int ret;
668 669
669 lock_kernel(); 670 mutex_lock(&swim_mutex);
670 ret = floppy_open(bdev, mode); 671 ret = floppy_open(bdev, mode);
671 unlock_kernel(); 672 mutex_unlock(&swim_mutex);
672 673
673 return ret; 674 return ret;
674} 675}
@@ -678,7 +679,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
678 struct floppy_state *fs = disk->private_data; 679 struct floppy_state *fs = disk->private_data;
679 struct swim __iomem *base = fs->swd->base; 680 struct swim __iomem *base = fs->swd->base;
680 681
681 lock_kernel(); 682 mutex_lock(&swim_mutex);
682 if (fs->ref_count < 0) 683 if (fs->ref_count < 0)
683 fs->ref_count = 0; 684 fs->ref_count = 0;
684 else if (fs->ref_count > 0) 685 else if (fs->ref_count > 0)
@@ -686,7 +687,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
686 687
687 if (fs->ref_count == 0) 688 if (fs->ref_count == 0)
688 swim_motor(base, OFF); 689 swim_motor(base, OFF);
689 unlock_kernel(); 690 mutex_unlock(&swim_mutex);
690 691
691 return 0; 692 return 0;
692} 693}
@@ -704,9 +705,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
704 case FDEJECT: 705 case FDEJECT:
705 if (fs->ref_count != 1) 706 if (fs->ref_count != 1)
706 return -EBUSY; 707 return -EBUSY;
707 lock_kernel(); 708 mutex_lock(&swim_mutex);
708 err = floppy_eject(fs); 709 err = floppy_eject(fs);
709 unlock_kernel(); 710 mutex_unlock(&swim_mutex);
710 return err; 711 return err;
711 712
712 case FDGETPRM: 713 case FDGETPRM:
@@ -740,11 +741,12 @@ static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
740 return 0; 741 return 0;
741} 742}
742 743
743static int floppy_check_change(struct gendisk *disk) 744static unsigned int floppy_check_events(struct gendisk *disk,
745 unsigned int clearing)
744{ 746{
745 struct floppy_state *fs = disk->private_data; 747 struct floppy_state *fs = disk->private_data;
746 748
747 return fs->ejected; 749 return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
748} 750}
749 751
750static int floppy_revalidate(struct gendisk *disk) 752static int floppy_revalidate(struct gendisk *disk)
@@ -771,7 +773,7 @@ static const struct block_device_operations floppy_fops = {
771 .release = floppy_release, 773 .release = floppy_release,
772 .ioctl = floppy_ioctl, 774 .ioctl = floppy_ioctl,
773 .getgeo = floppy_getgeo, 775 .getgeo = floppy_getgeo,
774 .media_changed = floppy_check_change, 776 .check_events = floppy_check_events,
775 .revalidate_disk = floppy_revalidate, 777 .revalidate_disk = floppy_revalidate,
776}; 778};
777 779
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index cc6a3864822c..773bfa792777 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -25,7 +25,7 @@
25#include <linux/ioctl.h> 25#include <linux/ioctl.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/smp_lock.h> 28#include <linux/mutex.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <asm/io.h> 31#include <asm/io.h>
@@ -36,6 +36,7 @@
36#include <asm/machdep.h> 36#include <asm/machdep.h>
37#include <asm/pmac_feature.h> 37#include <asm/pmac_feature.h>
38 38
39static DEFINE_MUTEX(swim3_mutex);
39static struct request_queue *swim3_queue; 40static struct request_queue *swim3_queue;
40static struct gendisk *disks[2]; 41static struct gendisk *disks[2];
41static struct request *fd_req; 42static struct request *fd_req;
@@ -249,7 +250,8 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
249 unsigned int cmd, unsigned long param); 250 unsigned int cmd, unsigned long param);
250static int floppy_open(struct block_device *bdev, fmode_t mode); 251static int floppy_open(struct block_device *bdev, fmode_t mode);
251static int floppy_release(struct gendisk *disk, fmode_t mode); 252static int floppy_release(struct gendisk *disk, fmode_t mode);
252static int floppy_check_change(struct gendisk *disk); 253static unsigned int floppy_check_events(struct gendisk *disk,
254 unsigned int clearing);
253static int floppy_revalidate(struct gendisk *disk); 255static int floppy_revalidate(struct gendisk *disk);
254 256
255static bool swim3_end_request(int err, unsigned int nr_bytes) 257static bool swim3_end_request(int err, unsigned int nr_bytes)
@@ -873,9 +875,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
873{ 875{
874 int ret; 876 int ret;
875 877
876 lock_kernel(); 878 mutex_lock(&swim3_mutex);
877 ret = floppy_locked_ioctl(bdev, mode, cmd, param); 879 ret = floppy_locked_ioctl(bdev, mode, cmd, param);
878 unlock_kernel(); 880 mutex_unlock(&swim3_mutex);
879 881
880 return ret; 882 return ret;
881} 883}
@@ -953,9 +955,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
953{ 955{
954 int ret; 956 int ret;
955 957
956 lock_kernel(); 958 mutex_lock(&swim3_mutex);
957 ret = floppy_open(bdev, mode); 959 ret = floppy_open(bdev, mode);
958 unlock_kernel(); 960 mutex_unlock(&swim3_mutex);
959 961
960 return ret; 962 return ret;
961} 963}
@@ -964,20 +966,21 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
964{ 966{
965 struct floppy_state *fs = disk->private_data; 967 struct floppy_state *fs = disk->private_data;
966 struct swim3 __iomem *sw = fs->swim3; 968 struct swim3 __iomem *sw = fs->swim3;
967 lock_kernel(); 969 mutex_lock(&swim3_mutex);
968 if (fs->ref_count > 0 && --fs->ref_count == 0) { 970 if (fs->ref_count > 0 && --fs->ref_count == 0) {
969 swim3_action(fs, MOTOR_OFF); 971 swim3_action(fs, MOTOR_OFF);
970 out_8(&sw->control_bic, 0xff); 972 out_8(&sw->control_bic, 0xff);
971 swim3_select(fs, RELAX); 973 swim3_select(fs, RELAX);
972 } 974 }
973 unlock_kernel(); 975 mutex_unlock(&swim3_mutex);
974 return 0; 976 return 0;
975} 977}
976 978
977static int floppy_check_change(struct gendisk *disk) 979static unsigned int floppy_check_events(struct gendisk *disk,
980 unsigned int clearing)
978{ 981{
979 struct floppy_state *fs = disk->private_data; 982 struct floppy_state *fs = disk->private_data;
980 return fs->ejected; 983 return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
981} 984}
982 985
983static int floppy_revalidate(struct gendisk *disk) 986static int floppy_revalidate(struct gendisk *disk)
@@ -1024,7 +1027,7 @@ static const struct block_device_operations floppy_fops = {
1024 .open = floppy_unlocked_open, 1027 .open = floppy_unlocked_open,
1025 .release = floppy_release, 1028 .release = floppy_release,
1026 .ioctl = floppy_ioctl, 1029 .ioctl = floppy_ioctl,
1027 .media_changed = floppy_check_change, 1030 .check_events = floppy_check_events,
1028 .revalidate_disk= floppy_revalidate, 1031 .revalidate_disk= floppy_revalidate,
1029}; 1032};
1030 1033
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index c48e14878582..0e376d46bdd1 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -28,7 +28,7 @@
28#include <linux/timer.h> 28#include <linux/timer.h>
29#include <linux/scatterlist.h> 29#include <linux/scatterlist.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/smp_lock.h> 31#include <linux/mutex.h>
32#include <scsi/scsi.h> 32#include <scsi/scsi.h>
33 33
34#define DRV_NAME "ub" 34#define DRV_NAME "ub"
@@ -248,6 +248,7 @@ struct ub_completion {
248 spinlock_t lock; 248 spinlock_t lock;
249}; 249};
250 250
251static DEFINE_MUTEX(ub_mutex);
251static inline void ub_init_completion(struct ub_completion *x) 252static inline void ub_init_completion(struct ub_completion *x)
252{ 253{
253 x->done = 0; 254 x->done = 0;
@@ -396,7 +397,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum);
396#else 397#else
397 398
398static const struct usb_device_id ub_usb_ids[] = { 399static const struct usb_device_id ub_usb_ids[] = {
399 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) }, 400 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) },
400 { } 401 { }
401}; 402};
402 403
@@ -1715,9 +1716,9 @@ static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode)
1715{ 1716{
1716 int ret; 1717 int ret;
1717 1718
1718 lock_kernel(); 1719 mutex_lock(&ub_mutex);
1719 ret = ub_bd_open(bdev, mode); 1720 ret = ub_bd_open(bdev, mode);
1720 unlock_kernel(); 1721 mutex_unlock(&ub_mutex);
1721 1722
1722 return ret; 1723 return ret;
1723} 1724}
@@ -1730,9 +1731,9 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
1730 struct ub_lun *lun = disk->private_data; 1731 struct ub_lun *lun = disk->private_data;
1731 struct ub_dev *sc = lun->udev; 1732 struct ub_dev *sc = lun->udev;
1732 1733
1733 lock_kernel(); 1734 mutex_lock(&ub_mutex);
1734 ub_put(sc); 1735 ub_put(sc);
1735 unlock_kernel(); 1736 mutex_unlock(&ub_mutex);
1736 1737
1737 return 0; 1738 return 0;
1738} 1739}
@@ -1747,9 +1748,9 @@ static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
1747 void __user *usermem = (void __user *) arg; 1748 void __user *usermem = (void __user *) arg;
1748 int ret; 1749 int ret;
1749 1750
1750 lock_kernel(); 1751 mutex_lock(&ub_mutex);
1751 ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem); 1752 ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
1752 unlock_kernel(); 1753 mutex_unlock(&ub_mutex);
1753 1754
1754 return ret; 1755 return ret;
1755} 1756}
@@ -1787,7 +1788,8 @@ static int ub_bd_revalidate(struct gendisk *disk)
1787 * 1788 *
1788 * The return code is bool! 1789 * The return code is bool!
1789 */ 1790 */
1790static int ub_bd_media_changed(struct gendisk *disk) 1791static unsigned int ub_bd_check_events(struct gendisk *disk,
1792 unsigned int clearing)
1791{ 1793{
1792 struct ub_lun *lun = disk->private_data; 1794 struct ub_lun *lun = disk->private_data;
1793 1795
@@ -1805,10 +1807,10 @@ static int ub_bd_media_changed(struct gendisk *disk)
1805 */ 1807 */
1806 if (ub_sync_tur(lun->udev, lun) != 0) { 1808 if (ub_sync_tur(lun->udev, lun) != 0) {
1807 lun->changed = 1; 1809 lun->changed = 1;
1808 return 1; 1810 return DISK_EVENT_MEDIA_CHANGE;
1809 } 1811 }
1810 1812
1811 return lun->changed; 1813 return lun->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
1812} 1814}
1813 1815
1814static const struct block_device_operations ub_bd_fops = { 1816static const struct block_device_operations ub_bd_fops = {
@@ -1816,7 +1818,7 @@ static const struct block_device_operations ub_bd_fops = {
1816 .open = ub_bd_unlocked_open, 1818 .open = ub_bd_unlocked_open,
1817 .release = ub_bd_release, 1819 .release = ub_bd_release,
1818 .ioctl = ub_bd_ioctl, 1820 .ioctl = ub_bd_ioctl,
1819 .media_changed = ub_bd_media_changed, 1821 .check_events = ub_bd_check_events,
1820 .revalidate_disk = ub_bd_revalidate, 1822 .revalidate_disk = ub_bd_revalidate,
1821}; 1823};
1822 1824
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8be57151f5d6..031ca720d926 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -241,8 +241,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
241 * 241 *
242 * Whenever IO on the active page completes, the Ready page is activated 242 * Whenever IO on the active page completes, the Ready page is activated
243 * and the ex-Active page is clean out and made Ready. 243 * and the ex-Active page is clean out and made Ready.
244 * Otherwise the Ready page is only activated when it becomes full, or 244 * Otherwise the Ready page is only activated when it becomes full.
245 * when mm_unplug_device is called via the unplug_io_fn.
246 * 245 *
247 * If a request arrives while both pages a full, it is queued, and b_rdev is 246 * If a request arrives while both pages a full, it is queued, and b_rdev is
248 * overloaded to record whether it was a read or a write. 247 * overloaded to record whether it was a read or a write.
@@ -333,17 +332,6 @@ static inline void reset_page(struct mm_page *page)
333 page->biotail = &page->bio; 332 page->biotail = &page->bio;
334} 333}
335 334
336static void mm_unplug_device(struct request_queue *q)
337{
338 struct cardinfo *card = q->queuedata;
339 unsigned long flags;
340
341 spin_lock_irqsave(&card->lock, flags);
342 if (blk_remove_plug(q))
343 activate(card);
344 spin_unlock_irqrestore(&card->lock, flags);
345}
346
347/* 335/*
348 * If there is room on Ready page, take 336 * If there is room on Ready page, take
349 * one bh off list and add it. 337 * one bh off list and add it.
@@ -535,7 +523,6 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
535 *card->biotail = bio; 523 *card->biotail = bio;
536 bio->bi_next = NULL; 524 bio->bi_next = NULL;
537 card->biotail = &bio->bi_next; 525 card->biotail = &bio->bi_next;
538 blk_plug_device(q);
539 spin_unlock_irq(&card->lock); 526 spin_unlock_irq(&card->lock);
540 527
541 return 0; 528 return 0;
@@ -779,20 +766,10 @@ static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
779 return 0; 766 return 0;
780} 767}
781 768
782/*
783 * Future support for removable devices
784 */
785static int mm_check_change(struct gendisk *disk)
786{
787/* struct cardinfo *dev = disk->private_data; */
788 return 0;
789}
790
791static const struct block_device_operations mm_fops = { 769static const struct block_device_operations mm_fops = {
792 .owner = THIS_MODULE, 770 .owner = THIS_MODULE,
793 .getgeo = mm_getgeo, 771 .getgeo = mm_getgeo,
794 .revalidate_disk = mm_revalidate, 772 .revalidate_disk = mm_revalidate,
795 .media_changed = mm_check_change,
796}; 773};
797 774
798static int __devinit mm_pci_probe(struct pci_dev *dev, 775static int __devinit mm_pci_probe(struct pci_dev *dev,
@@ -907,7 +884,6 @@ static int __devinit mm_pci_probe(struct pci_dev *dev,
907 blk_queue_make_request(card->queue, mm_make_request); 884 blk_queue_make_request(card->queue, mm_make_request);
908 card->queue->queue_lock = &card->lock; 885 card->queue->queue_lock = &card->lock;
909 card->queue->queuedata = card; 886 card->queue->queuedata = card;
910 card->queue->unplug_fn = mm_unplug_device;
911 887
912 tasklet_init(&card->tasklet, process_page, (unsigned long)card); 888 tasklet_init(&card->tasklet, process_page, (unsigned long)card);
913 889
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index f651e51a3319..9a5b2a2d616d 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -41,7 +41,7 @@
41#include <linux/errno.h> 41#include <linux/errno.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/string.h> 43#include <linux/string.h>
44#include <linux/smp_lock.h> 44#include <linux/mutex.h>
45#include <linux/dma-mapping.h> 45#include <linux/dma-mapping.h>
46#include <linux/completion.h> 46#include <linux/completion.h>
47#include <linux/device.h> 47#include <linux/device.h>
@@ -73,6 +73,7 @@ enum {
73 MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name) 73 MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
74}; 74};
75 75
76static DEFINE_MUTEX(viodasd_mutex);
76static DEFINE_SPINLOCK(viodasd_spinlock); 77static DEFINE_SPINLOCK(viodasd_spinlock);
77 78
78#define VIOMAXREQ 16 79#define VIOMAXREQ 16
@@ -93,7 +94,7 @@ static const struct vio_error_entry viodasd_err_table[] = {
93 { 0x0204, EIO, "Use Error" }, 94 { 0x0204, EIO, "Use Error" },
94 { 0x0205, EIO, "Release Error" }, 95 { 0x0205, EIO, "Release Error" },
95 { 0x0206, EINVAL, "Invalid Disk" }, 96 { 0x0206, EINVAL, "Invalid Disk" },
96 { 0x0207, EBUSY, "Cant Lock" }, 97 { 0x0207, EBUSY, "Can't Lock" },
97 { 0x0208, EIO, "Already Locked" }, 98 { 0x0208, EIO, "Already Locked" },
98 { 0x0209, EIO, "Already Unlocked" }, 99 { 0x0209, EIO, "Already Unlocked" },
99 { 0x020A, EIO, "Invalid Arg" }, 100 { 0x020A, EIO, "Invalid Arg" },
@@ -180,9 +181,9 @@ static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
180{ 181{
181 int ret; 182 int ret;
182 183
183 lock_kernel(); 184 mutex_lock(&viodasd_mutex);
184 ret = viodasd_open(bdev, mode); 185 ret = viodasd_open(bdev, mode);
185 unlock_kernel(); 186 mutex_unlock(&viodasd_mutex);
186 187
187 return ret; 188 return ret;
188} 189}
@@ -196,7 +197,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
196 struct viodasd_device *d = disk->private_data; 197 struct viodasd_device *d = disk->private_data;
197 HvLpEvent_Rc hvrc; 198 HvLpEvent_Rc hvrc;
198 199
199 lock_kernel(); 200 mutex_lock(&viodasd_mutex);
200 /* Send the event to OS/400. We DON'T expect a response */ 201 /* Send the event to OS/400. We DON'T expect a response */
201 hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, 202 hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
202 HvLpEvent_Type_VirtualIo, 203 HvLpEvent_Type_VirtualIo,
@@ -210,7 +211,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
210 if (hvrc != 0) 211 if (hvrc != 0)
211 pr_warning("HV close call failed %d\n", (int)hvrc); 212 pr_warning("HV close call failed %d\n", (int)hvrc);
212 213
213 unlock_kernel(); 214 mutex_unlock(&viodasd_mutex);
214 215
215 return 0; 216 return 0;
216} 217}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..079c08808d8a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,15 +2,17 @@
2#include <linux/spinlock.h> 2#include <linux/spinlock.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/blkdev.h> 4#include <linux/blkdev.h>
5#include <linux/smp_lock.h>
6#include <linux/hdreg.h> 5#include <linux/hdreg.h>
7#include <linux/virtio.h> 6#include <linux/virtio.h>
8#include <linux/virtio_blk.h> 7#include <linux/virtio_blk.h>
9#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9#include <linux/string_helpers.h>
10#include <scsi/scsi_cmnd.h>
10 11
11#define PART_BITS 4 12#define PART_BITS 4
12 13
13static int major, index; 14static int major, index;
15struct workqueue_struct *virtblk_wq;
14 16
15struct virtio_blk 17struct virtio_blk
16{ 18{
@@ -27,6 +29,9 @@ struct virtio_blk
27 29
28 mempool_t *pool; 30 mempool_t *pool;
29 31
32 /* Process context for config space updates */
33 struct work_struct config_work;
34
30 /* What host tells us, plus 2 for header & tailer. */ 35 /* What host tells us, plus 2 for header & tailer. */
31 unsigned int sg_elems; 36 unsigned int sg_elems;
32 37
@@ -128,9 +133,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
128 } 133 }
129 } 134 }
130 135
131 if (vbr->req->cmd_flags & REQ_HARDBARRIER)
132 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
133
134 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 136 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
135 137
136 /* 138 /*
@@ -145,7 +147,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
145 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); 147 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
146 148
147 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { 149 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
148 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); 150 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
149 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, 151 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
150 sizeof(vbr->in_hdr)); 152 sizeof(vbr->in_hdr));
151 } 153 }
@@ -222,8 +224,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
222 return err; 224 return err;
223} 225}
224 226
225static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, 227static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
226 unsigned cmd, unsigned long data) 228 unsigned int cmd, unsigned long data)
227{ 229{
228 struct gendisk *disk = bdev->bd_disk; 230 struct gendisk *disk = bdev->bd_disk;
229 struct virtio_blk *vblk = disk->private_data; 231 struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +240,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
238 (void __user *)data); 240 (void __user *)data);
239} 241}
240 242
241static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
242 unsigned int cmd, unsigned long param)
243{
244 int ret;
245
246 lock_kernel();
247 ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
248 unlock_kernel();
249
250 return ret;
251}
252
253/* We provide getgeo only to please some old bootloader/partitioning tools */ 243/* We provide getgeo only to please some old bootloader/partitioning tools */
254static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) 244static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
255{ 245{
@@ -307,6 +297,46 @@ static ssize_t virtblk_serial_show(struct device *dev,
307} 297}
308DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); 298DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
309 299
300static void virtblk_config_changed_work(struct work_struct *work)
301{
302 struct virtio_blk *vblk =
303 container_of(work, struct virtio_blk, config_work);
304 struct virtio_device *vdev = vblk->vdev;
305 struct request_queue *q = vblk->disk->queue;
306 char cap_str_2[10], cap_str_10[10];
307 u64 capacity, size;
308
309 /* Host must always specify the capacity. */
310 vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
311 &capacity, sizeof(capacity));
312
313 /* If capacity is too big, truncate with warning. */
314 if ((sector_t)capacity != capacity) {
315 dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
316 (unsigned long long)capacity);
317 capacity = (sector_t)-1;
318 }
319
320 size = capacity * queue_logical_block_size(q);
321 string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
322 string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
323
324 dev_notice(&vdev->dev,
325 "new size: %llu %d-byte logical blocks (%s/%s)\n",
326 (unsigned long long)capacity,
327 queue_logical_block_size(q),
328 cap_str_10, cap_str_2);
329
330 set_capacity(vblk->disk, capacity);
331}
332
333static void virtblk_config_changed(struct virtio_device *vdev)
334{
335 struct virtio_blk *vblk = vdev->priv;
336
337 queue_work(virtblk_wq, &vblk->config_work);
338}
339
310static int __devinit virtblk_probe(struct virtio_device *vdev) 340static int __devinit virtblk_probe(struct virtio_device *vdev)
311{ 341{
312 struct virtio_blk *vblk; 342 struct virtio_blk *vblk;
@@ -343,6 +373,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
343 vblk->vdev = vdev; 373 vblk->vdev = vdev;
344 vblk->sg_elems = sg_elems; 374 vblk->sg_elems = sg_elems;
345 sg_init_table(vblk->sg, vblk->sg_elems); 375 sg_init_table(vblk->sg, vblk->sg_elems);
376 INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
346 377
347 /* We expect one virtqueue, for output. */ 378 /* We expect one virtqueue, for output. */
348 vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); 379 vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
@@ -392,31 +423,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
392 vblk->disk->driverfs_dev = &vdev->dev; 423 vblk->disk->driverfs_dev = &vdev->dev;
393 index++; 424 index++;
394 425
395 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) { 426 /* configure queue flush support */
396 /* 427 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
397 * If the FLUSH feature is supported we do have support for 428 blk_queue_flush(q, REQ_FLUSH);
398 * flushing a volatile write cache on the host. Use that
399 * to implement write barrier support.
400 */
401 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
402 } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
403 /*
404 * If the BARRIER feature is supported the host expects us
405 * to order request by tags. This implies there is not
406 * volatile write cache on the host, and that the host
407 * never re-orders outstanding I/O. This feature is not
408 * useful for real life scenarious and deprecated.
409 */
410 blk_queue_ordered(q, QUEUE_ORDERED_TAG);
411 } else {
412 /*
413 * If the FLUSH feature is not supported we must assume that
414 * the host does not perform any kind of volatile write
415 * caching. We still need to drain the queue to provider
416 * proper barrier semantics.
417 */
418 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
419 }
420 429
421 /* If disk is read-only in the host, the guest should obey */ 430 /* If disk is read-only in the host, the guest should obey */
422 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 431 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -515,6 +524,8 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
515{ 524{
516 struct virtio_blk *vblk = vdev->priv; 525 struct virtio_blk *vblk = vdev->priv;
517 526
527 flush_work(&vblk->config_work);
528
518 /* Nothing should be pending. */ 529 /* Nothing should be pending. */
519 BUG_ON(!list_empty(&vblk->reqs)); 530 BUG_ON(!list_empty(&vblk->reqs));
520 531
@@ -535,9 +546,9 @@ static const struct virtio_device_id id_table[] = {
535}; 546};
536 547
537static unsigned int features[] = { 548static unsigned int features[] = {
538 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, 549 VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
539 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, 550 VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
540 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY 551 VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
541}; 552};
542 553
543/* 554/*
@@ -546,27 +557,47 @@ static unsigned int features[] = {
546 * Use __refdata to avoid this warning. 557 * Use __refdata to avoid this warning.
547 */ 558 */
548static struct virtio_driver __refdata virtio_blk = { 559static struct virtio_driver __refdata virtio_blk = {
549 .feature_table = features, 560 .feature_table = features,
550 .feature_table_size = ARRAY_SIZE(features), 561 .feature_table_size = ARRAY_SIZE(features),
551 .driver.name = KBUILD_MODNAME, 562 .driver.name = KBUILD_MODNAME,
552 .driver.owner = THIS_MODULE, 563 .driver.owner = THIS_MODULE,
553 .id_table = id_table, 564 .id_table = id_table,
554 .probe = virtblk_probe, 565 .probe = virtblk_probe,
555 .remove = __devexit_p(virtblk_remove), 566 .remove = __devexit_p(virtblk_remove),
567 .config_changed = virtblk_config_changed,
556}; 568};
557 569
558static int __init init(void) 570static int __init init(void)
559{ 571{
572 int error;
573
574 virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
575 if (!virtblk_wq)
576 return -ENOMEM;
577
560 major = register_blkdev(0, "virtblk"); 578 major = register_blkdev(0, "virtblk");
561 if (major < 0) 579 if (major < 0) {
562 return major; 580 error = major;
563 return register_virtio_driver(&virtio_blk); 581 goto out_destroy_workqueue;
582 }
583
584 error = register_virtio_driver(&virtio_blk);
585 if (error)
586 goto out_unregister_blkdev;
587 return 0;
588
589out_unregister_blkdev:
590 unregister_blkdev(major, "virtblk");
591out_destroy_workqueue:
592 destroy_workqueue(virtblk_wq);
593 return error;
564} 594}
565 595
566static void __exit fini(void) 596static void __exit fini(void)
567{ 597{
568 unregister_blkdev(major, "virtblk"); 598 unregister_blkdev(major, "virtblk");
569 unregister_virtio_driver(&virtio_blk); 599 unregister_virtio_driver(&virtio_blk);
600 destroy_workqueue(virtblk_wq);
570} 601}
571module_init(init); 602module_init(init);
572module_exit(fini); 603module_exit(fini);
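
The reworked init() above acquires three resources (workqueue, block major, virtio driver registration) and unwinds them in reverse order on failure. A minimal user-space sketch of the same goto-unwind pattern; acquire_a/acquire_b/acquire_c are illustrative stand-ins, not kernel functions:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the real resource constructors; names are made up for the sketch. */
static void *acquire_a(void) { return malloc(16); }
static void *acquire_b(void) { return malloc(16); }
static void *acquire_c(int fail) { return fail ? NULL : malloc(16); }

static int setup(int fail_last)
{
	void *a, *b, *c;
	int err = -1;

	a = acquire_a();
	if (!a)
		return -1;

	b = acquire_b();
	if (!b)
		goto out_free_a;

	c = acquire_c(fail_last);
	if (!c)
		goto out_free_b;

	/* Demo only: release everything on success too, to avoid leaking. */
	free(c);
	free(b);
	free(a);
	return 0;

	/* Unwind in the reverse order of acquisition, exactly like init(). */
out_free_b:
	free(b);
out_free_a:
	free(a);
	return err;
}

int main(void)
{
	printf("all ok: %d\n", setup(0));
	printf("last step fails: %d\n", setup(1));
	return 0;
}
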
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d5a3cd750561..4abd2bcd20fb 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -46,7 +46,7 @@
46#include <linux/init.h> 46#include <linux/init.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/blkdev.h> 48#include <linux/blkdev.h>
49#include <linux/smp_lock.h> 49#include <linux/mutex.h>
50#include <linux/blkpg.h> 50#include <linux/blkpg.h>
51#include <linux/delay.h> 51#include <linux/delay.h>
52#include <linux/io.h> 52#include <linux/io.h>
@@ -58,6 +58,7 @@
58 58
59#include "xd.h" 59#include "xd.h"
60 60
61static DEFINE_MUTEX(xd_mutex);
61static void __init do_xd_setup (int *integers); 62static void __init do_xd_setup (int *integers);
62#ifdef MODULE 63#ifdef MODULE
63static int xd[5] = { -1,-1,-1,-1, }; 64static int xd[5] = { -1,-1,-1,-1, };
@@ -381,9 +382,9 @@ static int xd_ioctl(struct block_device *bdev, fmode_t mode,
381{ 382{
382 int ret; 383 int ret;
383 384
384 lock_kernel(); 385 mutex_lock(&xd_mutex);
385 ret = xd_locked_ioctl(bdev, mode, cmd, param); 386 ret = xd_locked_ioctl(bdev, mode, cmd, param);
386 unlock_kernel(); 387 mutex_unlock(&xd_mutex);
387 388
388 return ret; 389 return ret;
389} 390}
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile
new file mode 100644
index 000000000000..e491c1b76878
--- /dev/null
+++ b/drivers/block/xen-blkback/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
2
3xen-blkback-y := blkback.o xenbus.o
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
new file mode 100644
index 000000000000..5cf2993a8338
--- /dev/null
+++ b/drivers/block/xen-blkback/blkback.c
@@ -0,0 +1,826 @@
1/******************************************************************************
2 *
3 * Back-end of the driver for virtual block devices. This portion of the
4 * driver exports a 'unified' block-device interface that can be accessed
5 * by any operating system that implements a compatible front end. A
6 * reference front-end implementation can be found in:
7 * drivers/block/xen-blkfront.c
8 *
9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#include <linux/spinlock.h>
38#include <linux/kthread.h>
39#include <linux/list.h>
40#include <linux/delay.h>
41#include <linux/freezer.h>
42
43#include <xen/events.h>
44#include <xen/page.h>
45#include <asm/xen/hypervisor.h>
46#include <asm/xen/hypercall.h>
47#include "common.h"
48
49/*
50 * These are rather arbitrary. They are fairly large because adjacent requests
51 * pulled from a communication ring are quite likely to end up being part of
52 * the same scatter/gather request at the disc.
53 *
54 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
55 *
56 * This will increase the chances of being able to write whole tracks.
57 * 64 should be enough to keep us competitive with Linux.
58 */
59static int xen_blkif_reqs = 64;
60module_param_named(reqs, xen_blkif_reqs, int, 0);
61MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
62
63/* Run-time switchable: /sys/module/blkback/parameters/ */
64static unsigned int log_stats;
65module_param(log_stats, int, 0644);
66
67/*
68 * Each outstanding request that we've passed to the lower device layers has a
69 * 'pending_req' allocated to it. Each buffer_head that completes decrements
70 * the pendcnt towards zero. When it hits zero, the specified domain has a
71 * response queued for it, with the saved 'id' passed back.
72 */
73struct pending_req {
74 struct xen_blkif *blkif;
75 u64 id;
76 int nr_pages;
77 atomic_t pendcnt;
78 unsigned short operation;
79 int status;
80 struct list_head free_list;
81};
82
83#define BLKBACK_INVALID_HANDLE (~0)
84
85struct xen_blkbk {
86 struct pending_req *pending_reqs;
87 /* List of all 'pending_req' available */
88 struct list_head pending_free;
89 /* And its spinlock. */
90 spinlock_t pending_free_lock;
91 wait_queue_head_t pending_free_wq;
92 /* The list of all pages that are available. */
93 struct page **pending_pages;
94 /* And the grant handles that are available. */
95 grant_handle_t *pending_grant_handles;
96};
97
98static struct xen_blkbk *blkbk;
99
100/*
101 * Little helpful macro to figure out the index and virtual address of the
102 * pending_pages[..]. For each 'pending_req' we have up to
103 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg index runs from 0 through
104 * 10 and indexes into the pending_pages[..].
105 */
106static inline int vaddr_pagenr(struct pending_req *req, int seg)
107{
108 return (req - blkbk->pending_reqs) *
109 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
110}
111
112#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
113
114static inline unsigned long vaddr(struct pending_req *req, int seg)
115{
116 unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
117 return (unsigned long)pfn_to_kaddr(pfn);
118}
119
120#define pending_handle(_req, _seg) \
121 (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
122
123
124static int do_block_io_op(struct xen_blkif *blkif);
125static int dispatch_rw_block_io(struct xen_blkif *blkif,
126 struct blkif_request *req,
127 struct pending_req *pending_req);
128static void make_response(struct xen_blkif *blkif, u64 id,
129 unsigned short op, int st);
130
131/*
132 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
133 */
134static struct pending_req *alloc_req(void)
135{
136 struct pending_req *req = NULL;
137 unsigned long flags;
138
139 spin_lock_irqsave(&blkbk->pending_free_lock, flags);
140 if (!list_empty(&blkbk->pending_free)) {
141 req = list_entry(blkbk->pending_free.next, struct pending_req,
142 free_list);
143 list_del(&req->free_list);
144 }
145 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
146 return req;
147}
148
149/*
150 * Return the 'pending_req' structure to the free pool. We also
151 * wake up the thread if it was waiting for a free pending_req.
152 */
153static void free_req(struct pending_req *req)
154{
155 unsigned long flags;
156 int was_empty;
157
158 spin_lock_irqsave(&blkbk->pending_free_lock, flags);
159 was_empty = list_empty(&blkbk->pending_free);
160 list_add(&req->free_list, &blkbk->pending_free);
161 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
162 if (was_empty)
163 wake_up(&blkbk->pending_free_wq);
164}
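
alloc_req() and free_req() above manage a fixed pool of pending_req slots behind a spinlock and wake a waiter only when the pool goes from empty to non-empty. A rough user-space analogue using pthreads in place of the kernel spinlock and wait queue; the slot/pool names are made up for the sketch:

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 4

struct slot {
	int id;
	struct slot *next;   /* simple singly linked free list */
};

static struct slot pool[NSLOTS];
static struct slot *free_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pool_nonempty = PTHREAD_COND_INITIALIZER;

static void pool_init(void)
{
	for (int i = 0; i < NSLOTS; i++) {
		pool[i].id = i;
		pool[i].next = free_head;
		free_head = &pool[i];
	}
}

/* Like alloc_req(): return NULL instead of blocking when the pool is empty. */
static struct slot *slot_alloc(void)
{
	struct slot *s = NULL;

	pthread_mutex_lock(&pool_lock);
	if (free_head) {
		s = free_head;
		free_head = s->next;
	}
	pthread_mutex_unlock(&pool_lock);
	return s;
}

/* Like free_req(): wake a waiter only on the empty -> non-empty transition. */
static void slot_free(struct slot *s)
{
	int was_empty;

	pthread_mutex_lock(&pool_lock);
	was_empty = (free_head == NULL);
	s->next = free_head;
	free_head = s;
	pthread_mutex_unlock(&pool_lock);
	if (was_empty)
		pthread_cond_signal(&pool_nonempty);
}

int main(void)
{
	pool_init();
	struct slot *a = slot_alloc(), *b = slot_alloc();
	printf("got slots %d and %d\n", a->id, b->id);
	slot_free(a);
	slot_free(b);
	return 0;
}
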
165
166/*
167 * Routines for managing virtual block devices (vbds).
168 */
169static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
170 int operation)
171{
172 struct xen_vbd *vbd = &blkif->vbd;
173 int rc = -EACCES;
174
175 if ((operation != READ) && vbd->readonly)
176 goto out;
177
178 if (likely(req->nr_sects)) {
179 blkif_sector_t end = req->sector_number + req->nr_sects;
180
181 if (unlikely(end < req->sector_number))
182 goto out;
183 if (unlikely(end > vbd_sz(vbd)))
184 goto out;
185 }
186
187 req->dev = vbd->pdevice;
188 req->bdev = vbd->bdev;
189 rc = 0;
190
191 out:
192 return rc;
193}
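
xen_vbd_translate() rejects a request unless [sector_number, sector_number + nr_sects) both fits inside the device and does not wrap around. A tiny stand-alone version of that bounds check, assuming 64-bit sector arithmetic; range_ok and vbd_size are placeholder names:

#include <stdint.h>
#include <stdio.h>

/* Accept only ranges that neither wrap around nor run past the device end. */
static int range_ok(uint64_t start, uint64_t nr_sects, uint64_t vbd_size)
{
	uint64_t end = start + nr_sects;

	if (nr_sects == 0)
		return 1;          /* nothing to transfer is trivially fine */
	if (end < start)
		return 0;          /* 64-bit overflow, as in "end < sector_number" */
	if (end > vbd_size)
		return 0;          /* runs past the end of the virtual disk */
	return 1;
}

int main(void)
{
	printf("%d\n", range_ok(100, 8, 2048));              /* 1: fits */
	printf("%d\n", range_ok(2047, 8, 2048));             /* 0: past the end */
	printf("%d\n", range_ok(UINT64_MAX - 3, 8, 2048));   /* 0: wraps */
	return 0;
}
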
194
195static void xen_vbd_resize(struct xen_blkif *blkif)
196{
197 struct xen_vbd *vbd = &blkif->vbd;
198 struct xenbus_transaction xbt;
199 int err;
200 struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
201 unsigned long long new_size = vbd_sz(vbd);
202
203 pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
204 blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
205 pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
206 vbd->size = new_size;
207again:
208 err = xenbus_transaction_start(&xbt);
209 if (err) {
210 pr_warn(DRV_PFX "Error starting transaction");
211 return;
212 }
213 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
214 (unsigned long long)vbd_sz(vbd));
215 if (err) {
216 pr_warn(DRV_PFX "Error writing new size");
217 goto abort;
218 }
219 /*
220 * Write the current state; we will use this to synchronize
221 * the front-end. If the current state is "connected" the
222 * front-end will get the new size information online.
223 */
224 err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
225 if (err) {
226 pr_warn(DRV_PFX "Error writing the state");
227 goto abort;
228 }
229
230 err = xenbus_transaction_end(xbt, 0);
231 if (err == -EAGAIN)
232 goto again;
233 if (err)
234 pr_warn(DRV_PFX "Error ending transaction");
235 return;
236abort:
237 xenbus_transaction_end(xbt, 1);
238}
239
240/*
241 * Notification from the guest OS.
242 */
243static void blkif_notify_work(struct xen_blkif *blkif)
244{
245 blkif->waiting_reqs = 1;
246 wake_up(&blkif->wq);
247}
248
249irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
250{
251 blkif_notify_work(dev_id);
252 return IRQ_HANDLED;
253}
254
255/*
256 * SCHEDULER FUNCTIONS
257 */
258
259static void print_stats(struct xen_blkif *blkif)
260{
261 pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n",
262 current->comm, blkif->st_oo_req,
263 blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
264 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
265 blkif->st_rd_req = 0;
266 blkif->st_wr_req = 0;
267 blkif->st_oo_req = 0;
268}
269
270int xen_blkif_schedule(void *arg)
271{
272 struct xen_blkif *blkif = arg;
273 struct xen_vbd *vbd = &blkif->vbd;
274
275 xen_blkif_get(blkif);
276
277 while (!kthread_should_stop()) {
278 if (try_to_freeze())
279 continue;
280 if (unlikely(vbd->size != vbd_sz(vbd)))
281 xen_vbd_resize(blkif);
282
283 wait_event_interruptible(
284 blkif->wq,
285 blkif->waiting_reqs || kthread_should_stop());
286 wait_event_interruptible(
287 blkbk->pending_free_wq,
288 !list_empty(&blkbk->pending_free) ||
289 kthread_should_stop());
290
291 blkif->waiting_reqs = 0;
292 smp_mb(); /* clear flag *before* checking for work */
293
294 if (do_block_io_op(blkif))
295 blkif->waiting_reqs = 1;
296
297 if (log_stats && time_after(jiffies, blkif->st_print))
298 print_stats(blkif);
299 }
300
301 if (log_stats)
302 print_stats(blkif);
303
304 blkif->xenblkd = NULL;
305 xen_blkif_put(blkif);
306
307 return 0;
308}
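
The scheduler thread above relies on a small ordering idiom: clear waiting_reqs, issue a barrier, then process the ring, and re-set the flag if work is left so the next wait does not sleep through it. A compressed user-space sketch of that "clear flag before checking for work" ordering with C11 atomics; the ring is simulated by a plain counter:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int waiting_reqs = 1;
static int ring_items = 3;          /* pretend three requests are queued */

/* Returns non-zero if requests were left unprocessed (more_to_do). */
static int do_block_io_op_sim(void)
{
	if (ring_items > 0) {
		ring_items--;
		printf("handled one request, %d left\n", ring_items);
	}
	return ring_items > 0;
}

int main(void)
{
	/* One pass of the xen_blkif_schedule() body, minus the sleeping. */
	while (atomic_load(&waiting_reqs)) {
		atomic_store(&waiting_reqs, 0);
		atomic_thread_fence(memory_order_seq_cst); /* clear flag *before* checking */

		if (do_block_io_op_sim())
			atomic_store(&waiting_reqs, 1);
	}
	printf("no more work, would now sleep on the wait queue\n");
	return 0;
}
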
309
310struct seg_buf {
311 unsigned long buf;
312 unsigned int nsec;
313};
314/*
315 * Unmap the grant references, and also remove the M2P over-rides
316 * used in the 'pending_req'.
317 */
318static void xen_blkbk_unmap(struct pending_req *req)
319{
320 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321 unsigned int i, invcount = 0;
322 grant_handle_t handle;
323 int ret;
324
325 for (i = 0; i < req->nr_pages; i++) {
326 handle = pending_handle(req, i);
327 if (handle == BLKBACK_INVALID_HANDLE)
328 continue;
329 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
330 GNTMAP_host_map, handle);
331 pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
332 invcount++;
333 }
334
335 ret = HYPERVISOR_grant_table_op(
336 GNTTABOP_unmap_grant_ref, unmap, invcount);
337 BUG_ON(ret);
338 /*
339 * Note, we use invcount, not nr_pages, so we can't index
340 * using vaddr(req, i).
341 */
342 for (i = 0; i < invcount; i++) {
343 ret = m2p_remove_override(
344 virt_to_page(unmap[i].host_addr), false);
345 if (ret) {
346 pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
347 (unsigned long)unmap[i].host_addr);
348 continue;
349 }
350 }
351}
352
353static int xen_blkbk_map(struct blkif_request *req,
354 struct pending_req *pending_req,
355 struct seg_buf seg[])
356{
357 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
358 int i;
359 int nseg = req->nr_segments;
360 int ret = 0;
361
362 /*
363 * Fill out preq.nr_sects with the proper number of sectors, and set up
364 * map[..] with the PFN of the page in our domain and the
365 * corresponding grant reference for each page.
366 */
367 for (i = 0; i < nseg; i++) {
368 uint32_t flags;
369
370 flags = GNTMAP_host_map;
371 if (pending_req->operation != BLKIF_OP_READ)
372 flags |= GNTMAP_readonly;
373 gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
374 req->u.rw.seg[i].gref,
375 pending_req->blkif->domid);
376 }
377
378 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
379 BUG_ON(ret);
380
381 /*
382 * Now swizzle the MFN in our domain with the MFN from the other domain
383 * so that when we access vaddr(pending_req,i) it has the contents of
384 * the page from the other domain.
385 */
386 for (i = 0; i < nseg; i++) {
387 if (unlikely(map[i].status != 0)) {
388 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
389 map[i].handle = BLKBACK_INVALID_HANDLE;
390 ret |= 1;
391 }
392
393 pending_handle(pending_req, i) = map[i].handle;
394
395 if (ret)
396 continue;
397
398 ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
399 blkbk->pending_page(pending_req, i), false);
400 if (ret) {
401 pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
402 (unsigned long)map[i].dev_bus_addr, ret);
403 /* We could switch over to GNTTABOP_copy */
404 continue;
405 }
406
407 seg[i].buf = map[i].dev_bus_addr |
408 (req->u.rw.seg[i].first_sect << 9);
409 }
410 return ret;
411}
412
413/*
414 * Completion callback on the bio's. Called as bh->b_end_io()
415 */
416
417static void __end_block_io_op(struct pending_req *pending_req, int error)
418{
419 /* An error fails the entire request. */
420 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
421 (error == -EOPNOTSUPP)) {
422 pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
423 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
424 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
425 } else if (error) {
426 pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
427 " error=%d\n", error);
428 pending_req->status = BLKIF_RSP_ERROR;
429 }
430
431 /*
432 * If all of the bio's have completed it is time to unmap
433 * the grant references associated with 'request' and provide
434 * the proper response on the ring.
435 */
436 if (atomic_dec_and_test(&pending_req->pendcnt)) {
437 xen_blkbk_unmap(pending_req);
438 make_response(pending_req->blkif, pending_req->id,
439 pending_req->operation, pending_req->status);
440 xen_blkif_put(pending_req->blkif);
441 free_req(pending_req);
442 }
443}
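
__end_block_io_op() lets whichever bio finishes last do the unmap and response: pendcnt starts at the number of bios and each completion does an atomic decrement-and-test. A stand-alone C11 rendering of the same last-completer pattern; the cleanup work is reduced to a printf:

#include <stdatomic.h>
#include <stdio.h>

struct pending {
	atomic_int pendcnt;
	int status;
};

/* Called once per finished sub-I/O; only the final caller cleans up. */
static void end_one(struct pending *p, int error)
{
	if (error)
		p->status = -1;                    /* any error fails the whole request */

	if (atomic_fetch_sub(&p->pendcnt, 1) == 1) {
		/* we were the last completion: respond and release */
		printf("all pieces done, overall status %d\n", p->status);
	}
}

int main(void)
{
	struct pending req = { .status = 0 };
	int nbio = 3;

	atomic_init(&req.pendcnt, nbio);           /* like atomic_set(&pendcnt, nbio) */
	end_one(&req, 0);
	end_one(&req, 0);
	end_one(&req, 0);                          /* third call prints the summary */
	return 0;
}
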
444
445/*
446 * bio callback.
447 */
448static void end_block_io_op(struct bio *bio, int error)
449{
450 __end_block_io_op(bio->bi_private, error);
451 bio_put(bio);
452}
453
454
455
456/*
457 * Function to copy the 'struct blkif_request' from the ring buffer
458 * (which has the sectors we want, the number of them, grant references, etc.),
459 * and transmute it to the block API to hand it over to the proper block device.
460 */
461static int do_block_io_op(struct xen_blkif *blkif)
462{
463 union blkif_back_rings *blk_rings = &blkif->blk_rings;
464 struct blkif_request req;
465 struct pending_req *pending_req;
466 RING_IDX rc, rp;
467 int more_to_do = 0;
468
469 rc = blk_rings->common.req_cons;
470 rp = blk_rings->common.sring->req_prod;
471 rmb(); /* Ensure we see queued requests up to 'rp'. */
472
473 while (rc != rp) {
474
475 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
476 break;
477
478 if (kthread_should_stop()) {
479 more_to_do = 1;
480 break;
481 }
482
483 pending_req = alloc_req();
484 if (NULL == pending_req) {
485 blkif->st_oo_req++;
486 more_to_do = 1;
487 break;
488 }
489
490 switch (blkif->blk_protocol) {
491 case BLKIF_PROTOCOL_NATIVE:
492 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
493 break;
494 case BLKIF_PROTOCOL_X86_32:
495 blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
496 break;
497 case BLKIF_PROTOCOL_X86_64:
498 blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
499 break;
500 default:
501 BUG();
502 }
503 blk_rings->common.req_cons = ++rc; /* before make_response() */
504
505 /* Apply all sanity checks to /private copy/ of request. */
506 barrier();
507
508 if (dispatch_rw_block_io(blkif, &req, pending_req))
509 break;
510
511 /* Yield point for this unbounded loop. */
512 cond_resched();
513 }
514
515 return more_to_do;
516}
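
do_block_io_op() walks the shared ring between a locally cached consumer index (rc) and the producer index published by the frontend (rp), with a read barrier so request payloads are only read after the index. A stripped-down single-producer/single-consumer version of that consumption loop, using C11 acquire/release in place of rmb(); the overflow check against outstanding requests is omitted:

#include <stdatomic.h>
#include <stdio.h>

#define RING_SIZE 8                 /* must be a power of two */

struct ring {
	int slots[RING_SIZE];
	atomic_uint req_prod;       /* written by the producer (frontend) */
	unsigned int req_cons;      /* private to the consumer (backend) */
};

static void produce(struct ring *r, int v)
{
	unsigned int p = atomic_load_explicit(&r->req_prod, memory_order_relaxed);

	r->slots[p % RING_SIZE] = v;
	/* release: the payload is visible before the new producer index */
	atomic_store_explicit(&r->req_prod, p + 1, memory_order_release);
}

static void consume_all(struct ring *r)
{
	/* acquire pairs with the release above, standing in for rmb() */
	unsigned int rp = atomic_load_explicit(&r->req_prod, memory_order_acquire);

	while (r->req_cons != rp) {
		int v = r->slots[r->req_cons % RING_SIZE];

		r->req_cons++;      /* advance before "responding", as in the driver */
		printf("dispatching request %d\n", v);
	}
}

int main(void)
{
	struct ring r = { .req_cons = 0 };

	atomic_init(&r.req_prod, 0);
	produce(&r, 10);
	produce(&r, 11);
	produce(&r, 12);
	consume_all(&r);
	return 0;
}
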
517
518/*
519 * Transmute the 'struct blkif_request' into a proper 'struct bio'
520 * and call 'submit_bio' to pass it to the underlying storage.
521 */
522static int dispatch_rw_block_io(struct xen_blkif *blkif,
523 struct blkif_request *req,
524 struct pending_req *pending_req)
525{
526 struct phys_req preq;
527 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
528 unsigned int nseg;
529 struct bio *bio = NULL;
530 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
531 int i, nbio = 0;
532 int operation;
533 struct blk_plug plug;
534
535 switch (req->operation) {
536 case BLKIF_OP_READ:
537 blkif->st_rd_req++;
538 operation = READ;
539 break;
540 case BLKIF_OP_WRITE:
541 blkif->st_wr_req++;
542 operation = WRITE_ODIRECT;
543 break;
544 case BLKIF_OP_FLUSH_DISKCACHE:
545 blkif->st_f_req++;
546 operation = WRITE_FLUSH;
547 break;
548 case BLKIF_OP_WRITE_BARRIER:
549 default:
550 operation = 0; /* make gcc happy */
551 goto fail_response;
552 break;
553 }
554
555 /* Check that the number of segments is sane. */
556 nseg = req->nr_segments;
557 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
558 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
559 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
560 nseg);
561 /* Haven't submitted any bio's yet. */
562 goto fail_response;
563 }
564
565 preq.dev = req->handle;
566 preq.sector_number = req->u.rw.sector_number;
567 preq.nr_sects = 0;
568
569 pending_req->blkif = blkif;
570 pending_req->id = req->id;
571 pending_req->operation = req->operation;
572 pending_req->status = BLKIF_RSP_OKAY;
573 pending_req->nr_pages = nseg;
574
575 for (i = 0; i < nseg; i++) {
576 seg[i].nsec = req->u.rw.seg[i].last_sect -
577 req->u.rw.seg[i].first_sect + 1;
578 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
579 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
580 goto fail_response;
581 preq.nr_sects += seg[i].nsec;
582
583 }
584
585 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
586 pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
587 operation == READ ? "read" : "write",
588 preq.sector_number,
589 preq.sector_number + preq.nr_sects, preq.dev);
590 goto fail_response;
591 }
592
593 /*
594 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
595 * is set there.
596 */
597 for (i = 0; i < nseg; i++) {
598 if (((int)preq.sector_number|(int)seg[i].nsec) &
599 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
600 pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
601 blkif->domid);
602 goto fail_response;
603 }
604 }
605
606 /*
607 * If we have failed at this point, we need to undo the M2P override,
608 * set gnttab_set_unmap_op on all of the grant references and perform
609 * the hypercall to unmap the grants - that is all done in
610 * xen_blkbk_unmap.
611 */
612 if (xen_blkbk_map(req, pending_req, seg))
613 goto fail_flush;
614
615 /* This corresponding xen_blkif_put is done in __end_block_io_op */
616 xen_blkif_get(blkif);
617
618 for (i = 0; i < nseg; i++) {
619 while ((bio == NULL) ||
620 (bio_add_page(bio,
621 blkbk->pending_page(pending_req, i),
622 seg[i].nsec << 9,
623 seg[i].buf & ~PAGE_MASK) == 0)) {
624
625 bio = bio_alloc(GFP_KERNEL, nseg-i);
626 if (unlikely(bio == NULL))
627 goto fail_put_bio;
628
629 biolist[nbio++] = bio;
630 bio->bi_bdev = preq.bdev;
631 bio->bi_private = pending_req;
632 bio->bi_end_io = end_block_io_op;
633 bio->bi_sector = preq.sector_number;
634 }
635
636 preq.sector_number += seg[i].nsec;
637 }
638
639 /* This will be hit if the operation was a flush. */
640 if (!bio) {
641 BUG_ON(operation != WRITE_FLUSH);
642
643 bio = bio_alloc(GFP_KERNEL, 0);
644 if (unlikely(bio == NULL))
645 goto fail_put_bio;
646
647 biolist[nbio++] = bio;
648 bio->bi_bdev = preq.bdev;
649 bio->bi_private = pending_req;
650 bio->bi_end_io = end_block_io_op;
651 }
652
653 /*
654 * We set it in one go so that the individual submit_bio calls do not
655 * have to call atomic_inc.
656 */
657 atomic_set(&pending_req->pendcnt, nbio);
658
659 /* Get a reference count for the disk queue and start sending I/O */
660 blk_start_plug(&plug);
661
662 for (i = 0; i < nbio; i++)
663 submit_bio(operation, biolist[i]);
664
665 /* Let the I/Os go.. */
666 blk_finish_plug(&plug);
667
668 if (operation == READ)
669 blkif->st_rd_sect += preq.nr_sects;
670 else if (operation == WRITE || operation == WRITE_FLUSH)
671 blkif->st_wr_sect += preq.nr_sects;
672
673 return 0;
674
675 fail_flush:
676 xen_blkbk_unmap(pending_req);
677 fail_response:
678 /* Haven't submitted any bio's yet. */
679 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
680 free_req(pending_req);
681 msleep(1); /* back off a bit */
682 return -EIO;
683
684 fail_put_bio:
685 for (i = 0; i < nbio; i++)
686 bio_put(biolist[i]);
687 __end_block_io_op(pending_req, -EINVAL);
688 msleep(1); /* back off a bit */
689 return -EIO;
690}
691
692
693
694/*
695 * Put a response on the ring on how the operation fared.
696 */
697static void make_response(struct xen_blkif *blkif, u64 id,
698 unsigned short op, int st)
699{
700 struct blkif_response resp;
701 unsigned long flags;
702 union blkif_back_rings *blk_rings = &blkif->blk_rings;
703 int more_to_do = 0;
704 int notify;
705
706 resp.id = id;
707 resp.operation = op;
708 resp.status = st;
709
710 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
711 /* Place on the response ring for the relevant domain. */
712 switch (blkif->blk_protocol) {
713 case BLKIF_PROTOCOL_NATIVE:
714 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
715 &resp, sizeof(resp));
716 break;
717 case BLKIF_PROTOCOL_X86_32:
718 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
719 &resp, sizeof(resp));
720 break;
721 case BLKIF_PROTOCOL_X86_64:
722 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
723 &resp, sizeof(resp));
724 break;
725 default:
726 BUG();
727 }
728 blk_rings->common.rsp_prod_pvt++;
729 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
730 if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
731 /*
732 * Tail check for pending requests. Allows frontend to avoid
733 * notifications if requests are already in flight (lower
734 * overheads and promotes batching).
735 */
736 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
737
738 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
739 more_to_do = 1;
740 }
741
742 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
743
744 if (more_to_do)
745 blkif_notify_work(blkif);
746 if (notify)
747 notify_remote_via_irq(blkif->irq);
748}
749
750static int __init xen_blkif_init(void)
751{
752 int i, mmap_pages;
753 int rc = 0;
754
755 if (!xen_pv_domain())
756 return -ENODEV;
757
758 blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
759 if (!blkbk) {
760 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
761 return -ENOMEM;
762 }
763
764 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
765
766 blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) *
767 xen_blkif_reqs, GFP_KERNEL);
768 blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
769 mmap_pages, GFP_KERNEL);
770 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
771 mmap_pages, GFP_KERNEL);
772
773 if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
774 !blkbk->pending_pages) {
775 rc = -ENOMEM;
776 goto out_of_memory;
777 }
778
779 for (i = 0; i < mmap_pages; i++) {
780 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
781 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
782 if (blkbk->pending_pages[i] == NULL) {
783 rc = -ENOMEM;
784 goto out_of_memory;
785 }
786 }
787 rc = xen_blkif_interface_init();
788 if (rc)
789 goto failed_init;
790
791 memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
792
793 INIT_LIST_HEAD(&blkbk->pending_free);
794 spin_lock_init(&blkbk->pending_free_lock);
795 init_waitqueue_head(&blkbk->pending_free_wq);
796
797 for (i = 0; i < xen_blkif_reqs; i++)
798 list_add_tail(&blkbk->pending_reqs[i].free_list,
799 &blkbk->pending_free);
800
801 rc = xen_blkif_xenbus_init();
802 if (rc)
803 goto failed_init;
804
805 return 0;
806
807 out_of_memory:
808 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
809 failed_init:
810 kfree(blkbk->pending_reqs);
811 kfree(blkbk->pending_grant_handles);
812 if (blkbk->pending_pages) {
813 for (i = 0; i < mmap_pages; i++) {
814 if (blkbk->pending_pages[i])
815 __free_page(blkbk->pending_pages[i]);
816 }
817 kfree(blkbk->pending_pages);
818 }
819 kfree(blkbk);
820 blkbk = NULL;
821 return rc;
822}
823
824module_init(xen_blkif_init);
825
826MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
new file mode 100644
index 000000000000..9e40b283a468
--- /dev/null
+++ b/drivers/block/xen-blkback/common.h
@@ -0,0 +1,233 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License version 2
4 * as published by the Free Software Foundation; or, when distributed
5 * separately from the Linux kernel or incorporated into other
6 * software packages, subject to the following license:
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this source file (the "Software"), to deal in the Software without
10 * restriction, including without limitation the rights to use, copy, modify,
11 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
12 * and to permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * IN THE SOFTWARE.
25 */
26
27#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
28#define __XEN_BLKIF__BACKEND__COMMON_H__
29
30#include <linux/version.h>
31#include <linux/module.h>
32#include <linux/interrupt.h>
33#include <linux/slab.h>
34#include <linux/blkdev.h>
35#include <linux/vmalloc.h>
36#include <linux/wait.h>
37#include <linux/io.h>
38#include <asm/setup.h>
39#include <asm/pgalloc.h>
40#include <asm/hypervisor.h>
41#include <xen/grant_table.h>
42#include <xen/xenbus.h>
43#include <xen/interface/io/ring.h>
44#include <xen/interface/io/blkif.h>
45#include <xen/interface/io/protocols.h>
46
47#define DRV_PFX "xen-blkback:"
48#define DPRINTK(fmt, args...) \
49 pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \
50 __func__, __LINE__, ##args)
51
52
53/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can
56 * avoid using switch(protocol) in a number of places. */
57struct blkif_common_request {
58 char dummy;
59};
60struct blkif_common_response {
61 char dummy;
62};
63
64/* i386 protocol version */
65#pragma pack(push, 4)
66struct blkif_x86_32_request {
67 uint8_t operation; /* BLKIF_OP_??? */
68 uint8_t nr_segments; /* number of segments */
69 blkif_vdev_t handle; /* only for read/write requests */
70 uint64_t id; /* private guest value, echoed in resp */
71 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
72 struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
73};
74struct blkif_x86_32_response {
75 uint64_t id; /* copied from request */
76 uint8_t operation; /* copied from request */
77 int16_t status; /* BLKIF_RSP_??? */
78};
79#pragma pack(pop)
80
81/* x86_64 protocol version */
82struct blkif_x86_64_request {
83 uint8_t operation; /* BLKIF_OP_??? */
84 uint8_t nr_segments; /* number of segments */
85 blkif_vdev_t handle; /* only for read/write requests */
86 uint64_t __attribute__((__aligned__(8))) id;
87 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
88 struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
89};
90struct blkif_x86_64_response {
91 uint64_t __attribute__((__aligned__(8))) id;
92 uint8_t operation; /* copied from request */
93 int16_t status; /* BLKIF_RSP_??? */
94};
95
96DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
97 struct blkif_common_response);
98DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
99 struct blkif_x86_32_response);
100DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
101 struct blkif_x86_64_response);
102
103union blkif_back_rings {
104 struct blkif_back_ring native;
105 struct blkif_common_back_ring common;
106 struct blkif_x86_32_back_ring x86_32;
107 struct blkif_x86_64_back_ring x86_64;
108};
109
110enum blkif_protocol {
111 BLKIF_PROTOCOL_NATIVE = 1,
112 BLKIF_PROTOCOL_X86_32 = 2,
113 BLKIF_PROTOCOL_X86_64 = 3,
114};
115
116struct xen_vbd {
117 /* What the domain refers to this vbd as. */
118 blkif_vdev_t handle;
119 /* Non-zero -> read-only */
120 unsigned char readonly;
121 /* VDISK_xxx */
122 unsigned char type;
123 /* phys device that this vbd maps to. */
124 u32 pdevice;
125 struct block_device *bdev;
126 /* Cached size parameter. */
127 sector_t size;
128 bool flush_support;
129};
130
131struct backend_info;
132
133struct xen_blkif {
134 /* Unique identifier for this interface. */
135 domid_t domid;
136 unsigned int handle;
137 /* Physical parameters of the comms window. */
138 unsigned int irq;
139 /* Comms information. */
140 enum blkif_protocol blk_protocol;
141 union blkif_back_rings blk_rings;
142 struct vm_struct *blk_ring_area;
143 /* The VBD attached to this interface. */
144 struct xen_vbd vbd;
145 /* Back pointer to the backend_info. */
146 struct backend_info *be;
147 /* Private fields. */
148 spinlock_t blk_ring_lock;
149 atomic_t refcnt;
150
151 wait_queue_head_t wq;
152 /* One thread per one blkif. */
153 struct task_struct *xenblkd;
154 unsigned int waiting_reqs;
155
156 /* statistics */
157 unsigned long st_print;
158 int st_rd_req;
159 int st_wr_req;
160 int st_oo_req;
161 int st_f_req;
162 int st_rd_sect;
163 int st_wr_sect;
164
165 wait_queue_head_t waiting_to_free;
166
167 grant_handle_t shmem_handle;
168 grant_ref_t shmem_ref;
169};
170
171
172#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
173 (_v)->bdev->bd_part->nr_sects : \
174 get_capacity((_v)->bdev->bd_disk))
175
176#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
177#define xen_blkif_put(_b) \
178 do { \
179 if (atomic_dec_and_test(&(_b)->refcnt)) \
180 wake_up(&(_b)->waiting_to_free);\
181 } while (0)
182
183struct phys_req {
184 unsigned short dev;
185 unsigned short nr_sects;
186 struct block_device *bdev;
187 blkif_sector_t sector_number;
188};
189int xen_blkif_interface_init(void);
190
191int xen_blkif_xenbus_init(void);
192
193irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
194int xen_blkif_schedule(void *arg);
195
196int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
197 struct backend_info *be, int state);
198
199struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
200
201static inline void blkif_get_x86_32_req(struct blkif_request *dst,
202 struct blkif_x86_32_request *src)
203{
204 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
205 dst->operation = src->operation;
206 dst->nr_segments = src->nr_segments;
207 dst->handle = src->handle;
208 dst->id = src->id;
209 dst->u.rw.sector_number = src->sector_number;
210 barrier();
211 if (n > dst->nr_segments)
212 n = dst->nr_segments;
213 for (i = 0; i < n; i++)
214 dst->u.rw.seg[i] = src->seg[i];
215}
216
217static inline void blkif_get_x86_64_req(struct blkif_request *dst,
218 struct blkif_x86_64_request *src)
219{
220 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
221 dst->operation = src->operation;
222 dst->nr_segments = src->nr_segments;
223 dst->handle = src->handle;
224 dst->id = src->id;
225 dst->u.rw.sector_number = src->sector_number;
226 barrier();
227 if (n > dst->nr_segments)
228 n = dst->nr_segments;
229 for (i = 0; i < n; i++)
230 dst->u.rw.seg[i] = src->seg[i];
231}
232
233#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
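
blkif_get_x86_32_req() and blkif_get_x86_64_req() above copy nr_segments into a private structure, re-read it after a barrier, and clamp it to BLKIF_MAX_SEGMENTS_PER_REQUEST so a misbehaving frontend cannot make the copy loop overrun the destination array. A stand-alone illustration of clamping an untrusted length before copying; the structures here are simplified placeholders, not the real blkif layouts:

#include <stdio.h>
#include <string.h>

#define MAX_SEGS 11

struct src_req { unsigned char nr_segments; int seg[32];       };  /* "shared" copy */
struct dst_req { unsigned char nr_segments; int seg[MAX_SEGS]; };  /* private copy  */

static void get_req(struct dst_req *dst, const struct src_req *src)
{
	int n = MAX_SEGS;

	dst->nr_segments = src->nr_segments;
	/* clamp against our own private copy, never against shared memory */
	if (n > dst->nr_segments)
		n = dst->nr_segments;
	for (int i = 0; i < n; i++)
		dst->seg[i] = src->seg[i];
}

int main(void)
{
	struct src_req hostile;
	struct dst_req safe;

	memset(&hostile, 0, sizeof(hostile));
	hostile.nr_segments = 32;              /* bogus value from the other domain */
	for (int i = 0; i < 32; i++)
		hostile.seg[i] = i;

	get_req(&safe, &hostile);
	printf("copied at most %d segments\n", MAX_SEGS);
	return 0;
}
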
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
new file mode 100644
index 000000000000..6cc0db1bf522
--- /dev/null
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -0,0 +1,767 @@
1/* Xenbus code for blkif backend
2 Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
3 Copyright (C) 2005 XenSource Ltd
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15*/
16
17#include <stdarg.h>
18#include <linux/module.h>
19#include <linux/kthread.h>
20#include <xen/events.h>
21#include <xen/grant_table.h>
22#include "common.h"
23
24struct backend_info {
25 struct xenbus_device *dev;
26 struct xen_blkif *blkif;
27 struct xenbus_watch backend_watch;
28 unsigned major;
29 unsigned minor;
30 char *mode;
31};
32
33static struct kmem_cache *xen_blkif_cachep;
34static void connect(struct backend_info *);
35static int connect_ring(struct backend_info *);
36static void backend_changed(struct xenbus_watch *, const char **,
37 unsigned int);
38
39struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
40{
41 return be->dev;
42}
43
44static int blkback_name(struct xen_blkif *blkif, char *buf)
45{
46 char *devpath, *devname;
47 struct xenbus_device *dev = blkif->be->dev;
48
49 devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
50 if (IS_ERR(devpath))
51 return PTR_ERR(devpath);
52
53 devname = strstr(devpath, "/dev/");
54 if (devname != NULL)
55 devname += strlen("/dev/");
56 else
57 devname = devpath;
58
59 snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
60 kfree(devpath);
61
62 return 0;
63}
64
65static void xen_update_blkif_status(struct xen_blkif *blkif)
66{
67 int err;
68 char name[TASK_COMM_LEN];
69
70 /* Not ready to connect? */
71 if (!blkif->irq || !blkif->vbd.bdev)
72 return;
73
74 /* Already connected? */
75 if (blkif->be->dev->state == XenbusStateConnected)
76 return;
77
78 /* Attempt to connect: exit if we fail to. */
79 connect(blkif->be);
80 if (blkif->be->dev->state != XenbusStateConnected)
81 return;
82
83 err = blkback_name(blkif, name);
84 if (err) {
85 xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
86 return;
87 }
88
89 err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
90 if (err) {
91 xenbus_dev_error(blkif->be->dev, err, "block flush");
92 return;
93 }
94 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
95
96 blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, name);
97 if (IS_ERR(blkif->xenblkd)) {
98 err = PTR_ERR(blkif->xenblkd);
99 blkif->xenblkd = NULL;
100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
101 }
102}
103
104static struct xen_blkif *xen_blkif_alloc(domid_t domid)
105{
106 struct xen_blkif *blkif;
107
108 blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL);
109 if (!blkif)
110 return ERR_PTR(-ENOMEM);
111
112 memset(blkif, 0, sizeof(*blkif));
113 blkif->domid = domid;
114 spin_lock_init(&blkif->blk_ring_lock);
115 atomic_set(&blkif->refcnt, 1);
116 init_waitqueue_head(&blkif->wq);
117 blkif->st_print = jiffies;
118 init_waitqueue_head(&blkif->waiting_to_free);
119
120 return blkif;
121}
122
123static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page)
124{
125 struct gnttab_map_grant_ref op;
126
127 gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
128 GNTMAP_host_map, shared_page, blkif->domid);
129
130 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
131 BUG();
132
133 if (op.status) {
134 DPRINTK("Grant table operation failure !\n");
135 return op.status;
136 }
137
138 blkif->shmem_ref = shared_page;
139 blkif->shmem_handle = op.handle;
140
141 return 0;
142}
143
144static void unmap_frontend_page(struct xen_blkif *blkif)
145{
146 struct gnttab_unmap_grant_ref op;
147
148 gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
149 GNTMAP_host_map, blkif->shmem_handle);
150
151 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
152 BUG();
153}
154
155static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
156 unsigned int evtchn)
157{
158 int err;
159
160 /* Already connected through? */
161 if (blkif->irq)
162 return 0;
163
164 blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE);
165 if (!blkif->blk_ring_area)
166 return -ENOMEM;
167
168 err = map_frontend_page(blkif, shared_page);
169 if (err) {
170 free_vm_area(blkif->blk_ring_area);
171 return err;
172 }
173
174 switch (blkif->blk_protocol) {
175 case BLKIF_PROTOCOL_NATIVE:
176 {
177 struct blkif_sring *sring;
178 sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
179 BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
180 break;
181 }
182 case BLKIF_PROTOCOL_X86_32:
183 {
184 struct blkif_x86_32_sring *sring_x86_32;
185 sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
186 BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
187 break;
188 }
189 case BLKIF_PROTOCOL_X86_64:
190 {
191 struct blkif_x86_64_sring *sring_x86_64;
192 sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
193 BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
194 break;
195 }
196 default:
197 BUG();
198 }
199
200 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
201 xen_blkif_be_int, 0,
202 "blkif-backend", blkif);
203 if (err < 0) {
204 unmap_frontend_page(blkif);
205 free_vm_area(blkif->blk_ring_area);
206 blkif->blk_rings.common.sring = NULL;
207 return err;
208 }
209 blkif->irq = err;
210
211 return 0;
212}
213
214static void xen_blkif_disconnect(struct xen_blkif *blkif)
215{
216 if (blkif->xenblkd) {
217 kthread_stop(blkif->xenblkd);
218 blkif->xenblkd = NULL;
219 }
220
221 atomic_dec(&blkif->refcnt);
222 wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
223 atomic_inc(&blkif->refcnt);
224
225 if (blkif->irq) {
226 unbind_from_irqhandler(blkif->irq, blkif);
227 blkif->irq = 0;
228 }
229
230 if (blkif->blk_rings.common.sring) {
231 unmap_frontend_page(blkif);
232 free_vm_area(blkif->blk_ring_area);
233 blkif->blk_rings.common.sring = NULL;
234 }
235}
236
237void xen_blkif_free(struct xen_blkif *blkif)
238{
239 if (!atomic_dec_and_test(&blkif->refcnt))
240 BUG();
241 kmem_cache_free(xen_blkif_cachep, blkif);
242}
243
244int __init xen_blkif_interface_init(void)
245{
246 xen_blkif_cachep = kmem_cache_create("blkif_cache",
247 sizeof(struct xen_blkif),
248 0, 0, NULL);
249 if (!xen_blkif_cachep)
250 return -ENOMEM;
251
252 return 0;
253}
254
255/*
256 * sysfs interface for VBD I/O requests
257 */
258
259#define VBD_SHOW(name, format, args...) \
260 static ssize_t show_##name(struct device *_dev, \
261 struct device_attribute *attr, \
262 char *buf) \
263 { \
264 struct xenbus_device *dev = to_xenbus_device(_dev); \
265 struct backend_info *be = dev_get_drvdata(&dev->dev); \
266 \
267 return sprintf(buf, format, ##args); \
268 } \
269 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
270
271VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
272VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
273VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
274VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req);
275VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
276VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
277
278static struct attribute *xen_vbdstat_attrs[] = {
279 &dev_attr_oo_req.attr,
280 &dev_attr_rd_req.attr,
281 &dev_attr_wr_req.attr,
282 &dev_attr_f_req.attr,
283 &dev_attr_rd_sect.attr,
284 &dev_attr_wr_sect.attr,
285 NULL
286};
287
288static struct attribute_group xen_vbdstat_group = {
289 .name = "statistics",
290 .attrs = xen_vbdstat_attrs,
291};
292
293VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
294VBD_SHOW(mode, "%s\n", be->mode);
295
296int xenvbd_sysfs_addif(struct xenbus_device *dev)
297{
298 int error;
299
300 error = device_create_file(&dev->dev, &dev_attr_physical_device);
301 if (error)
302 goto fail1;
303
304 error = device_create_file(&dev->dev, &dev_attr_mode);
305 if (error)
306 goto fail2;
307
308 error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group);
309 if (error)
310 goto fail3;
311
312 return 0;
313
314fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
315fail2: device_remove_file(&dev->dev, &dev_attr_mode);
316fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
317 return error;
318}
319
320void xenvbd_sysfs_delif(struct xenbus_device *dev)
321{
322 sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
323 device_remove_file(&dev->dev, &dev_attr_mode);
324 device_remove_file(&dev->dev, &dev_attr_physical_device);
325}
326
327
328static void xen_vbd_free(struct xen_vbd *vbd)
329{
330 if (vbd->bdev)
331 blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
332 vbd->bdev = NULL;
333}
334
335static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
336 unsigned major, unsigned minor, int readonly,
337 int cdrom)
338{
339 struct xen_vbd *vbd;
340 struct block_device *bdev;
341 struct request_queue *q;
342
343 vbd = &blkif->vbd;
344 vbd->handle = handle;
345 vbd->readonly = readonly;
346 vbd->type = 0;
347
348 vbd->pdevice = MKDEV(major, minor);
349
350 bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
351 FMODE_READ : FMODE_WRITE, NULL);
352
353 if (IS_ERR(bdev)) {
354 DPRINTK("xen_vbd_create: device %08x could not be opened.\n",
355 vbd->pdevice);
356 return -ENOENT;
357 }
358
359 vbd->bdev = bdev;
360 if (vbd->bdev->bd_disk == NULL) {
361 DPRINTK("xen_vbd_create: device %08x doesn't exist.\n",
362 vbd->pdevice);
363 xen_vbd_free(vbd);
364 return -ENOENT;
365 }
366 vbd->size = vbd_sz(vbd);
367
368 if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
369 vbd->type |= VDISK_CDROM;
370 if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
371 vbd->type |= VDISK_REMOVABLE;
372
373 q = bdev_get_queue(bdev);
374 if (q && q->flush_flags)
375 vbd->flush_support = true;
376
377 DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
378 handle, blkif->domid);
379 return 0;
380}
381static int xen_blkbk_remove(struct xenbus_device *dev)
382{
383 struct backend_info *be = dev_get_drvdata(&dev->dev);
384
385 DPRINTK("");
386
387 if (be->major || be->minor)
388 xenvbd_sysfs_delif(dev);
389
390 if (be->backend_watch.node) {
391 unregister_xenbus_watch(&be->backend_watch);
392 kfree(be->backend_watch.node);
393 be->backend_watch.node = NULL;
394 }
395
396 if (be->blkif) {
397 xen_blkif_disconnect(be->blkif);
398 xen_vbd_free(&be->blkif->vbd);
399 xen_blkif_free(be->blkif);
400 be->blkif = NULL;
401 }
402
403 kfree(be);
404 dev_set_drvdata(&dev->dev, NULL);
405 return 0;
406}
407
408int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
409 struct backend_info *be, int state)
410{
411 struct xenbus_device *dev = be->dev;
412 int err;
413
414 err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
415 "%d", state);
416 if (err)
417 xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
418
419 return err;
420}
421
422/*
423 * Entry point to this code when a new device is created. Allocate the basic
424 * structures, and watch the store waiting for the hotplug scripts to tell us
425 * the device's physical major and minor numbers. Switch to InitWait.
426 */
427static int xen_blkbk_probe(struct xenbus_device *dev,
428 const struct xenbus_device_id *id)
429{
430 int err;
431 struct backend_info *be = kzalloc(sizeof(struct backend_info),
432 GFP_KERNEL);
433 if (!be) {
434 xenbus_dev_fatal(dev, -ENOMEM,
435 "allocating backend structure");
436 return -ENOMEM;
437 }
438 be->dev = dev;
439 dev_set_drvdata(&dev->dev, be);
440
441 be->blkif = xen_blkif_alloc(dev->otherend_id);
442 if (IS_ERR(be->blkif)) {
443 err = PTR_ERR(be->blkif);
444 be->blkif = NULL;
445 xenbus_dev_fatal(dev, err, "creating block interface");
446 goto fail;
447 }
448
449 /* setup back pointer */
450 be->blkif->be = be;
451
452 err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
453 "%s/%s", dev->nodename, "physical-device");
454 if (err)
455 goto fail;
456
457 err = xenbus_switch_state(dev, XenbusStateInitWait);
458 if (err)
459 goto fail;
460
461 return 0;
462
463fail:
464 DPRINTK("failed");
465 xen_blkbk_remove(dev);
466 return err;
467}
468
469
470/*
471 * Callback received when the hotplug scripts have placed the physical-device
472 * node. Read it and the mode node, and create a vbd. If the frontend is
473 * ready, connect.
474 */
475static void backend_changed(struct xenbus_watch *watch,
476 const char **vec, unsigned int len)
477{
478 int err;
479 unsigned major;
480 unsigned minor;
481 struct backend_info *be
482 = container_of(watch, struct backend_info, backend_watch);
483 struct xenbus_device *dev = be->dev;
484 int cdrom = 0;
485 char *device_type;
486
487 DPRINTK("");
488
489 err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
490 &major, &minor);
491 if (XENBUS_EXIST_ERR(err)) {
492 /*
493 * Since this watch will fire once immediately after it is
494 * registered, we expect this. Ignore it, and wait for the
495 * hotplug scripts.
496 */
497 return;
498 }
499 if (err != 2) {
500 xenbus_dev_fatal(dev, err, "reading physical-device");
501 return;
502 }
503
504 if ((be->major || be->minor) &&
505 ((be->major != major) || (be->minor != minor))) {
506 pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n",
507 be->major, be->minor, major, minor);
508 return;
509 }
510
511 be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
512 if (IS_ERR(be->mode)) {
513 err = PTR_ERR(be->mode);
514 be->mode = NULL;
515 xenbus_dev_fatal(dev, err, "reading mode");
516 return;
517 }
518
519 device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
520 if (!IS_ERR(device_type)) {
521 cdrom = strcmp(device_type, "cdrom") == 0;
522 kfree(device_type);
523 }
524
525 if (be->major == 0 && be->minor == 0) {
526 /* Front end dir is a number, which is used as the handle. */
527
528 char *p = strrchr(dev->otherend, '/') + 1;
529 long handle;
530 err = strict_strtoul(p, 0, &handle);
531 if (err)
532 return;
533
534 be->major = major;
535 be->minor = minor;
536
537 err = xen_vbd_create(be->blkif, handle, major, minor,
538 (NULL == strchr(be->mode, 'w')), cdrom);
539 if (err) {
540 be->major = 0;
541 be->minor = 0;
542 xenbus_dev_fatal(dev, err, "creating vbd structure");
543 return;
544 }
545
546 err = xenvbd_sysfs_addif(dev);
547 if (err) {
548 xen_vbd_free(&be->blkif->vbd);
549 be->major = 0;
550 be->minor = 0;
551 xenbus_dev_fatal(dev, err, "creating sysfs entries");
552 return;
553 }
554
555 /* We're potentially connected now */
556 xen_update_blkif_status(be->blkif);
557 }
558}
559
560
561/*
562 * Callback received when the frontend's state changes.
563 */
564static void frontend_changed(struct xenbus_device *dev,
565 enum xenbus_state frontend_state)
566{
567 struct backend_info *be = dev_get_drvdata(&dev->dev);
568 int err;
569
570 DPRINTK("%s", xenbus_strstate(frontend_state));
571
572 switch (frontend_state) {
573 case XenbusStateInitialising:
574 if (dev->state == XenbusStateClosed) {
575 pr_info(DRV_PFX "%s: prepare for reconnect\n",
576 dev->nodename);
577 xenbus_switch_state(dev, XenbusStateInitWait);
578 }
579 break;
580
581 case XenbusStateInitialised:
582 case XenbusStateConnected:
583 /*
584 * Ensure we connect even when two watches fire in
585 * close succession and we miss the intermediate value
586 * of frontend_state.
587 */
588 if (dev->state == XenbusStateConnected)
589 break;
590
591 /*
592 * Enforce precondition before potential leak point.
593 * blkif_disconnect() is idempotent.
594 */
595 xen_blkif_disconnect(be->blkif);
596
597 err = connect_ring(be);
598 if (err)
599 break;
600 xen_update_blkif_status(be->blkif);
601 break;
602
603 case XenbusStateClosing:
604 xen_blkif_disconnect(be->blkif);
605 xenbus_switch_state(dev, XenbusStateClosing);
606 break;
607
608 case XenbusStateClosed:
609 xenbus_switch_state(dev, XenbusStateClosed);
610 if (xenbus_dev_is_online(dev))
611 break;
612 /* fall through if not online */
613 case XenbusStateUnknown:
614 /* implies blkif_disconnect() via blkback_remove() */
615 device_unregister(&dev->dev);
616 break;
617
618 default:
619 xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
620 frontend_state);
621 break;
622 }
623}
624
625
626/* ** Connection ** */
627
628
629/*
630 * Write the physical details regarding the block device to the store, and
631 * switch to Connected state.
632 */
633static void connect(struct backend_info *be)
634{
635 struct xenbus_transaction xbt;
636 int err;
637 struct xenbus_device *dev = be->dev;
638
639 DPRINTK("%s", dev->otherend);
640
641 /* Supply the information about the device the frontend needs */
642again:
643 err = xenbus_transaction_start(&xbt);
644 if (err) {
645 xenbus_dev_fatal(dev, err, "starting transaction");
646 return;
647 }
648
649 err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
650 if (err)
651 goto abort;
652
653 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
654 (unsigned long long)vbd_sz(&be->blkif->vbd));
655 if (err) {
656 xenbus_dev_fatal(dev, err, "writing %s/sectors",
657 dev->nodename);
658 goto abort;
659 }
660
661 /* FIXME: use a typename instead */
662 err = xenbus_printf(xbt, dev->nodename, "info", "%u",
663 be->blkif->vbd.type |
664 (be->blkif->vbd.readonly ? VDISK_READONLY : 0));
665 if (err) {
666 xenbus_dev_fatal(dev, err, "writing %s/info",
667 dev->nodename);
668 goto abort;
669 }
670 err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
671 (unsigned long)
672 bdev_logical_block_size(be->blkif->vbd.bdev));
673 if (err) {
674 xenbus_dev_fatal(dev, err, "writing %s/sector-size",
675 dev->nodename);
676 goto abort;
677 }
678
679 err = xenbus_transaction_end(xbt, 0);
680 if (err == -EAGAIN)
681 goto again;
682 if (err)
683 xenbus_dev_fatal(dev, err, "ending transaction");
684
685 err = xenbus_switch_state(dev, XenbusStateConnected);
686 if (err)
687 xenbus_dev_fatal(dev, err, "switching to Connected state",
688 dev->nodename);
689
690 return;
691 abort:
692 xenbus_transaction_end(xbt, 1);
693}
694
695
696static int connect_ring(struct backend_info *be)
697{
698 struct xenbus_device *dev = be->dev;
699 unsigned long ring_ref;
700 unsigned int evtchn;
701 char protocol[64] = "";
702 int err;
703
704 DPRINTK("%s", dev->otherend);
705
706 err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
707 &ring_ref, "event-channel", "%u", &evtchn, NULL);
708 if (err) {
709 xenbus_dev_fatal(dev, err,
710 "reading %s/ring-ref and event-channel",
711 dev->otherend);
712 return err;
713 }
714
715 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
716 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
717 "%63s", protocol, NULL);
718 if (err)
719 strcpy(protocol, "unspecified, assuming native");
720 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
721 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
722 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
723 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
724 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
725 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
726 else {
727 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
728 return -1;
729 }
730 pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
731 ring_ref, evtchn, be->blkif->blk_protocol, protocol);
732
733 /* Map the shared frame, irq etc. */
734 err = xen_blkif_map(be->blkif, ring_ref, evtchn);
735 if (err) {
736 xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
737 ring_ref, evtchn);
738 return err;
739 }
740
741 return 0;
742}
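
connect_ring() reads an optional "protocol" key from xenstore and maps it onto the BLKIF_PROTOCOL_* enum, defaulting to native when the key is absent and rejecting anything it does not recognise. A small sketch of that negotiation step in plain C; the literal ABI strings stand in for the XEN_IO_PROTO_ABI_* macros and the xenbus read is replaced by a string argument:

#include <stdio.h>
#include <string.h>

enum blkif_protocol {
	PROTO_NATIVE  = 1,
	PROTO_X86_32  = 2,
	PROTO_X86_64  = 3,
	PROTO_UNKNOWN = -1,
};

/* NULL stands for "key not present in xenstore": assume native. */
static enum blkif_protocol parse_protocol(const char *s)
{
	if (s == NULL)
		return PROTO_NATIVE;
	if (strcmp(s, "native") == 0)
		return PROTO_NATIVE;
	if (strcmp(s, "x86_32-abi") == 0)
		return PROTO_X86_32;
	if (strcmp(s, "x86_64-abi") == 0)
		return PROTO_X86_64;
	return PROTO_UNKNOWN;        /* connect_ring() fails the connection here */
}

int main(void)
{
	printf("%d\n", parse_protocol(NULL));          /* 1: unspecified -> native */
	printf("%d\n", parse_protocol("x86_64-abi"));  /* 3 */
	printf("%d\n", parse_protocol("ia64-abi"));    /* -1: rejected */
	return 0;
}
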
743
744
745/* ** Driver Registration ** */
746
747
748static const struct xenbus_device_id xen_blkbk_ids[] = {
749 { "vbd" },
750 { "" }
751};
752
753
754static struct xenbus_driver xen_blkbk = {
755 .name = "vbd",
756 .owner = THIS_MODULE,
757 .ids = xen_blkbk_ids,
758 .probe = xen_blkbk_probe,
759 .remove = xen_blkbk_remove,
760 .otherend_changed = frontend_changed
761};
762
763
764int xen_blkif_xenbus_init(void)
765{
766 return xenbus_register_backend(&xen_blkbk);
767}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ab735a605cf3..b536a9cef917 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -41,7 +41,7 @@
41#include <linux/cdrom.h> 41#include <linux/cdrom.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/smp_lock.h> 44#include <linux/mutex.h>
45#include <linux/scatterlist.h> 45#include <linux/scatterlist.h>
46 46
47#include <xen/xen.h> 47#include <xen/xen.h>
@@ -65,13 +65,14 @@ enum blkif_state {
65 65
66struct blk_shadow { 66struct blk_shadow {
67 struct blkif_request req; 67 struct blkif_request req;
68 unsigned long request; 68 struct request *request;
69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
70}; 70};
71 71
72static DEFINE_MUTEX(blkfront_mutex);
72static const struct block_device_operations xlvbd_block_fops; 73static const struct block_device_operations xlvbd_block_fops;
73 74
74#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) 75#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
75 76
76/* 77/*
77 * We have one of these per vbd, whether ide, scsi or 'other'. They 78 * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -95,7 +96,8 @@ struct blkfront_info
95 struct gnttab_free_callback callback; 96 struct gnttab_free_callback callback;
96 struct blk_shadow shadow[BLK_RING_SIZE]; 97 struct blk_shadow shadow[BLK_RING_SIZE];
97 unsigned long shadow_free; 98 unsigned long shadow_free;
98 int feature_barrier; 99 unsigned int feature_flush;
100 unsigned int flush_op;
99 int is_ready; 101 int is_ready;
100}; 102};
101 103
@@ -119,6 +121,10 @@ static DEFINE_SPINLOCK(minor_lock);
119#define EXTENDED (1<<EXT_SHIFT) 121#define EXTENDED (1<<EXT_SHIFT)
120#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) 122#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
121#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) 123#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
124#define EMULATED_HD_DISK_MINOR_OFFSET (0)
125#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
126#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
127#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
122 128
123#define DEV_NAME "xvd" /* name in /dev */ 129#define DEV_NAME "xvd" /* name in /dev */
124 130
@@ -135,7 +141,7 @@ static void add_id_to_freelist(struct blkfront_info *info,
135 unsigned long id) 141 unsigned long id)
136{ 142{
137 info->shadow[id].req.id = info->shadow_free; 143 info->shadow[id].req.id = info->shadow_free;
138 info->shadow[id].request = 0; 144 info->shadow[id].request = NULL;
139 info->shadow_free = id; 145 info->shadow_free = id;
140} 146}
141 147
@@ -244,14 +250,10 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
244} 250}
245 251
246/* 252/*
247 * blkif_queue_request 253 * Generate a Xen blkfront IO request from a blk layer request. Reads
254 * and writes are handled as expected.
248 * 255 *
249 * request block io 256 * @req: a request struct
250 *
251 * id: for guest use only.
252 * operation: BLKIF_OP_{READ,WRITE,PROBE}
253 * buffer: buffer to read/write into. this should be a
254 * virtual address in the guest os.
255 */ 257 */
256static int blkif_queue_request(struct request *req) 258static int blkif_queue_request(struct request *req)
257{ 259{
@@ -280,16 +282,25 @@ static int blkif_queue_request(struct request *req)
280 /* Fill out a communications ring structure. */ 282 /* Fill out a communications ring structure. */
281 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 283 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
282 id = get_id_from_freelist(info); 284 id = get_id_from_freelist(info);
283 info->shadow[id].request = (unsigned long)req; 285 info->shadow[id].request = req;
284 286
285 ring_req->id = id; 287 ring_req->id = id;
286 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); 288 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
287 ring_req->handle = info->handle; 289 ring_req->handle = info->handle;
288 290
289 ring_req->operation = rq_data_dir(req) ? 291 ring_req->operation = rq_data_dir(req) ?
290 BLKIF_OP_WRITE : BLKIF_OP_READ; 292 BLKIF_OP_WRITE : BLKIF_OP_READ;
291 if (req->cmd_flags & REQ_HARDBARRIER) 293
292 ring_req->operation = BLKIF_OP_WRITE_BARRIER; 294 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
295 /*
296 * Ideally we can do an unordered flush-to-disk. In case the
297 * backend only supports barriers, use that. A barrier request
298 * is a superset of FUA, so we can implement it the same
299 * way. (It's also a FLUSH+FUA, since it is
300 * guaranteed ordered WRT previous writes.)
301 */
302 ring_req->operation = info->flush_op;
303 }
293 304
294 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 305 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
295 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); 306 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
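For orientation, the operation selection introduced above reduces to a small decision that can be sketched on its own (an illustrative sketch only; blkif_op_for() is a hypothetical helper name, and the patch keeps this logic open-coded in blkif_queue_request):

static int blkif_op_for(struct blkfront_info *info, struct request *req)
{
	int op = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ;

	/* Flush/FUA requests are funnelled through whichever cache-control
	 * operation was negotiated with the backend at connect time, either
	 * BLKIF_OP_WRITE_BARRIER or BLKIF_OP_FLUSH_DISKCACHE. */
	if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
		op = info->flush_op;

	return op;
}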
@@ -309,7 +320,7 @@ static int blkif_queue_request(struct request *req)
309 rq_data_dir(req) ); 320 rq_data_dir(req) );
310 321
311 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); 322 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
312 ring_req->seg[i] = 323 ring_req->u.rw.seg[i] =
313 (struct blkif_request_segment) { 324 (struct blkif_request_segment) {
314 .gref = ref, 325 .gref = ref,
315 .first_sect = fsect, 326 .first_sect = fsect,
@@ -418,36 +429,84 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
418} 429}
419 430
420 431
421static int xlvbd_barrier(struct blkfront_info *info) 432static void xlvbd_flush(struct blkfront_info *info)
422{ 433{
423 int err; 434 blk_queue_flush(info->rq, info->feature_flush);
424 const char *barrier; 435 printk(KERN_INFO "blkfront: %s: %s: %s\n",
436 info->gd->disk_name,
437 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
438 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
439 "flush diskcache" : "barrier or flush"),
440 info->feature_flush ? "enabled" : "disabled");
441}
425 442
426 switch (info->feature_barrier) { 443static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
427 case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break; 444{
428 case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break; 445 int major;
429 case QUEUE_ORDERED_NONE: barrier = "disabled"; break; 446 major = BLKIF_MAJOR(vdevice);
430 default: return -EINVAL; 447 *minor = BLKIF_MINOR(vdevice);
448 switch (major) {
449 case XEN_IDE0_MAJOR:
450 *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
451 *minor = ((*minor / 64) * PARTS_PER_DISK) +
452 EMULATED_HD_DISK_MINOR_OFFSET;
453 break;
454 case XEN_IDE1_MAJOR:
455 *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
456 *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
457 EMULATED_HD_DISK_MINOR_OFFSET;
458 break;
459 case XEN_SCSI_DISK0_MAJOR:
460 *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
461 *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
462 break;
463 case XEN_SCSI_DISK1_MAJOR:
464 case XEN_SCSI_DISK2_MAJOR:
465 case XEN_SCSI_DISK3_MAJOR:
466 case XEN_SCSI_DISK4_MAJOR:
467 case XEN_SCSI_DISK5_MAJOR:
468 case XEN_SCSI_DISK6_MAJOR:
469 case XEN_SCSI_DISK7_MAJOR:
470 *offset = (*minor / PARTS_PER_DISK) +
471 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
472 EMULATED_SD_DISK_NAME_OFFSET;
473 *minor = *minor +
474 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
475 EMULATED_SD_DISK_MINOR_OFFSET;
476 break;
477 case XEN_SCSI_DISK8_MAJOR:
478 case XEN_SCSI_DISK9_MAJOR:
479 case XEN_SCSI_DISK10_MAJOR:
480 case XEN_SCSI_DISK11_MAJOR:
481 case XEN_SCSI_DISK12_MAJOR:
482 case XEN_SCSI_DISK13_MAJOR:
483 case XEN_SCSI_DISK14_MAJOR:
484 case XEN_SCSI_DISK15_MAJOR:
485 *offset = (*minor / PARTS_PER_DISK) +
486 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
487 EMULATED_SD_DISK_NAME_OFFSET;
488 *minor = *minor +
489 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
490 EMULATED_SD_DISK_MINOR_OFFSET;
491 break;
492 case XENVBD_MAJOR:
493 *offset = *minor / PARTS_PER_DISK;
494 break;
495 default:
496 printk(KERN_WARNING "blkfront: your disk configuration is "
497 "incorrect, please use an xvd device instead\n");
498 return -ENODEV;
431 } 499 }
432
433 err = blk_queue_ordered(info->rq, info->feature_barrier);
434
435 if (err)
436 return err;
437
438 printk(KERN_INFO "blkfront: %s: barriers %s\n",
439 info->gd->disk_name, barrier);
440 return 0; 500 return 0;
441} 501}
442 502
443
444static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 503static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
445 struct blkfront_info *info, 504 struct blkfront_info *info,
446 u16 vdisk_info, u16 sector_size) 505 u16 vdisk_info, u16 sector_size)
447{ 506{
448 struct gendisk *gd; 507 struct gendisk *gd;
449 int nr_minors = 1; 508 int nr_minors = 1;
450 int err = -ENODEV; 509 int err;
451 unsigned int offset; 510 unsigned int offset;
452 int minor; 511 int minor;
453 int nr_parts; 512 int nr_parts;
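A quick worked example of the xen_translate_vdev() mapping above (illustrative only, assuming PARTS_PER_DISK == 16 as defined elsewhere in this file): for the first emulated SCSI disk, BLKIF_MAJOR(vdevice) is XEN_SCSI_DISK0_MAJOR and the minor is 0, so

	*offset = (0 / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;	/* 0 + 4, i.e. "xvde" */
	*minor  = 0 + EMULATED_SD_DISK_MINOR_OFFSET;			/* 0 + 64 */

That is, an emulated /dev/sda surfaces as xvde with minor 64, leaving minors 0-63 for the four emulated IDE disks (4 disks * 16 minors each).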
@@ -462,12 +521,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
462 } 521 }
463 522
464 if (!VDEV_IS_EXTENDED(info->vdevice)) { 523 if (!VDEV_IS_EXTENDED(info->vdevice)) {
465 minor = BLKIF_MINOR(info->vdevice); 524 err = xen_translate_vdev(info->vdevice, &minor, &offset);
466 nr_parts = PARTS_PER_DISK; 525 if (err)
526 return err;
527 nr_parts = PARTS_PER_DISK;
467 } else { 528 } else {
468 minor = BLKIF_MINOR_EXT(info->vdevice); 529 minor = BLKIF_MINOR_EXT(info->vdevice);
469 nr_parts = PARTS_PER_EXT_DISK; 530 nr_parts = PARTS_PER_EXT_DISK;
531 offset = minor / nr_parts;
532 if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
533 printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
534 "emulated IDE disks,\n\t choose an xvd device name"
535 "from xvde on\n", info->vdevice);
470 } 536 }
537 err = -ENODEV;
471 538
472 if ((minor % nr_parts) == 0) 539 if ((minor % nr_parts) == 0)
473 nr_minors = nr_parts; 540 nr_minors = nr_parts;
@@ -481,8 +548,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
481 if (gd == NULL) 548 if (gd == NULL)
482 goto release; 549 goto release;
483 550
484 offset = minor / nr_parts;
485
486 if (nr_minors > 1) { 551 if (nr_minors > 1) {
487 if (offset < 26) 552 if (offset < 26)
488 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); 553 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
@@ -516,7 +581,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
516 info->rq = gd->queue; 581 info->rq = gd->queue;
517 info->gd = gd; 582 info->gd = gd;
518 583
519 xlvbd_barrier(info); 584 xlvbd_flush(info);
520 585
521 if (vdisk_info & VDISK_READONLY) 586 if (vdisk_info & VDISK_READONLY)
522 set_disk_ro(gd, 1); 587 set_disk_ro(gd, 1);
@@ -553,7 +618,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
553 spin_unlock_irqrestore(&blkif_io_lock, flags); 618 spin_unlock_irqrestore(&blkif_io_lock, flags);
554 619
555 /* Flush gnttab callback work. Must be done with no locks held. */ 620 /* Flush gnttab callback work. Must be done with no locks held. */
556 flush_scheduled_work(); 621 flush_work_sync(&info->work);
557 622
558 del_gendisk(info->gd); 623 del_gendisk(info->gd);
559 624
@@ -602,7 +667,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
602 spin_unlock_irq(&blkif_io_lock); 667 spin_unlock_irq(&blkif_io_lock);
603 668
604 /* Flush gnttab callback work. Must be done with no locks held. */ 669 /* Flush gnttab callback work. Must be done with no locks held. */
605 flush_scheduled_work(); 670 flush_work_sync(&info->work);
606 671
607 /* Free resources associated with old device channel. */ 672 /* Free resources associated with old device channel. */
608 if (info->ring_ref != GRANT_INVALID_REF) { 673 if (info->ring_ref != GRANT_INVALID_REF) {
@@ -621,7 +686,7 @@ static void blkif_completion(struct blk_shadow *s)
621{ 686{
622 int i; 687 int i;
623 for (i = 0; i < s->req.nr_segments; i++) 688 for (i = 0; i < s->req.nr_segments; i++)
624 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); 689 gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
625} 690}
626 691
627static irqreturn_t blkif_interrupt(int irq, void *dev_id) 692static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -649,7 +714,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
649 714
650 bret = RING_GET_RESPONSE(&info->ring, i); 715 bret = RING_GET_RESPONSE(&info->ring, i);
651 id = bret->id; 716 id = bret->id;
652 req = (struct request *)info->shadow[id].request; 717 req = info->shadow[id].request;
653 718
654 blkif_completion(&info->shadow[id]); 719 blkif_completion(&info->shadow[id]);
655 720
@@ -657,13 +722,29 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
657 722
658 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; 723 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
659 switch (bret->operation) { 724 switch (bret->operation) {
725 case BLKIF_OP_FLUSH_DISKCACHE:
660 case BLKIF_OP_WRITE_BARRIER: 726 case BLKIF_OP_WRITE_BARRIER:
661 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 727 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
662 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 728 printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
729 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
730 "barrier" : "flush disk cache",
663 info->gd->disk_name); 731 info->gd->disk_name);
664 error = -EOPNOTSUPP; 732 error = -EOPNOTSUPP;
665 info->feature_barrier = QUEUE_ORDERED_NONE; 733 }
666 xlvbd_barrier(info); 734 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
735 info->shadow[id].req.nr_segments == 0)) {
736 printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
737 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
738 "barrier" : "flush disk cache",
739 info->gd->disk_name);
740 error = -EOPNOTSUPP;
741 }
742 if (unlikely(error)) {
743 if (error == -EOPNOTSUPP)
744 error = 0;
745 info->feature_flush = 0;
746 info->flush_op = 0;
747 xlvbd_flush(info);
667 } 748 }
668 /* fall through */ 749 /* fall through */
669 case BLKIF_OP_READ: 750 case BLKIF_OP_READ:
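The net effect of the expanded completion handling above: any error on a barrier/flush response makes the frontend stop issuing cache-control requests, and the EOPNOTSUPP-style failures (including an empty barrier/flush the backend rejects) are reported upward as success rather than as I/O errors. The fallback itself amounts to the following (an illustrative sketch with a hypothetical helper name; the patch keeps it inline in blkif_interrupt):

static void blkfront_disable_flush(struct blkfront_info *info)
{
	/* Forget the negotiated cache-control operation and let
	 * xlvbd_flush() re-announce to the block layer that flushes
	 * are no longer supported. */
	info->feature_flush = 0;
	info->flush_op = 0;
	xlvbd_flush(info);
}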
@@ -914,7 +995,7 @@ static int blkif_recover(struct blkfront_info *info)
914 /* Stage 3: Find pending requests and requeue them. */ 995 /* Stage 3: Find pending requests and requeue them. */
915 for (i = 0; i < BLK_RING_SIZE; i++) { 996 for (i = 0; i < BLK_RING_SIZE; i++) {
916 /* Not in use? */ 997 /* Not in use? */
917 if (copy[i].request == 0) 998 if (!copy[i].request)
918 continue; 999 continue;
919 1000
920 /* Grab a request slot and copy shadow state into it. */ 1001 /* Grab a request slot and copy shadow state into it. */
@@ -928,12 +1009,10 @@ static int blkif_recover(struct blkfront_info *info)
928 /* Rewrite any grant references invalidated by susp/resume. */ 1009 /* Rewrite any grant references invalidated by susp/resume. */
929 for (j = 0; j < req->nr_segments; j++) 1010 for (j = 0; j < req->nr_segments; j++)
930 gnttab_grant_foreign_access_ref( 1011 gnttab_grant_foreign_access_ref(
931 req->seg[j].gref, 1012 req->u.rw.seg[j].gref,
932 info->xbdev->otherend_id, 1013 info->xbdev->otherend_id,
933 pfn_to_mfn(info->shadow[req->id].frame[j]), 1014 pfn_to_mfn(info->shadow[req->id].frame[j]),
934 rq_data_dir( 1015 rq_data_dir(info->shadow[req->id].request));
935 (struct request *)
936 info->shadow[req->id].request));
937 info->shadow[req->id].req = *req; 1016 info->shadow[req->id].req = *req;
938 1017
939 info->ring.req_prod_pvt++; 1018 info->ring.req_prod_pvt++;
@@ -1029,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info)
1029 unsigned long sector_size; 1108 unsigned long sector_size;
1030 unsigned int binfo; 1109 unsigned int binfo;
1031 int err; 1110 int err;
1032 int barrier; 1111 int barrier, flush;
1033 1112
1034 switch (info->connected) { 1113 switch (info->connected) {
1035 case BLKIF_STATE_CONNECTED: 1114 case BLKIF_STATE_CONNECTED:
@@ -1069,28 +1148,37 @@ static void blkfront_connect(struct blkfront_info *info)
1069 return; 1148 return;
1070 } 1149 }
1071 1150
1151 info->feature_flush = 0;
1152 info->flush_op = 0;
1153
1072 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1154 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1073 "feature-barrier", "%lu", &barrier, 1155 "feature-barrier", "%d", &barrier,
1074 NULL); 1156 NULL);
1075 1157
1076 /* 1158 /*
1077 * If there's no "feature-barrier" defined, then it means 1159 * If there's no "feature-barrier" defined, then it means
1078 * we're dealing with a very old backend which writes 1160 * we're dealing with a very old backend which writes
1079 * synchronously; draining will do what needs to get done. 1161 * synchronously; nothing to do.
1080 *
1081 * If there are barriers, then we can do full queued writes
1082 * with tagged barriers.
1083 * 1162 *
1084 * If barriers are not supported, then there's no much we can 1163 * If there are barriers, then we use flush.
1085 * do, so just set ordering to NONE.
1086 */ 1164 */
1087 if (err) 1165 if (!err && barrier) {
1088 info->feature_barrier = QUEUE_ORDERED_DRAIN; 1166 info->feature_flush = REQ_FLUSH | REQ_FUA;
1089 else if (barrier) 1167 info->flush_op = BLKIF_OP_WRITE_BARRIER;
1090 info->feature_barrier = QUEUE_ORDERED_TAG; 1168 }
1091 else 1169 /*
1092 info->feature_barrier = QUEUE_ORDERED_NONE; 1170 * And if there is "feature-flush-cache", prefer it over
1171 * barriers.
1172 */
1173 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1174 "feature-flush-cache", "%d", &flush,
1175 NULL);
1093 1176
1177 if (!err && flush) {
1178 info->feature_flush = REQ_FLUSH;
1179 info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1180 }
1181
1094 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1182 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1095 if (err) { 1183 if (err) {
1096 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1184 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
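The two xenbus_gather() calls above implement a simple precedence rule for the backend's advertised features. Restated as a compact sketch (illustrative; blkfront_pick_flush_op() is a hypothetical name and the driver keeps this open-coded in blkfront_connect):

static void blkfront_pick_flush_op(struct blkfront_info *info,
				   int barrier, int flush)
{
	info->feature_flush = 0;
	info->flush_op = 0;

	/* feature-barrier: emulate flush/FUA with a write barrier. */
	if (barrier) {
		info->feature_flush = REQ_FLUSH | REQ_FUA;
		info->flush_op = BLKIF_OP_WRITE_BARRIER;
	}

	/* feature-flush-cache takes precedence when both are present. */
	if (flush) {
		info->feature_flush = REQ_FLUSH;
		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
	}
}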
@@ -1125,6 +1213,8 @@ static void blkback_changed(struct xenbus_device *dev,
1125 case XenbusStateInitialising: 1213 case XenbusStateInitialising:
1126 case XenbusStateInitWait: 1214 case XenbusStateInitWait:
1127 case XenbusStateInitialised: 1215 case XenbusStateInitialised:
1216 case XenbusStateReconfiguring:
1217 case XenbusStateReconfigured:
1128 case XenbusStateUnknown: 1218 case XenbusStateUnknown:
1129 case XenbusStateClosed: 1219 case XenbusStateClosed:
1130 break; 1220 break;
@@ -1201,7 +1291,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
1201 struct blkfront_info *info; 1291 struct blkfront_info *info;
1202 int err = 0; 1292 int err = 0;
1203 1293
1204 lock_kernel(); 1294 mutex_lock(&blkfront_mutex);
1205 1295
1206 info = disk->private_data; 1296 info = disk->private_data;
1207 if (!info) { 1297 if (!info) {
@@ -1219,7 +1309,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
1219 mutex_unlock(&info->mutex); 1309 mutex_unlock(&info->mutex);
1220 1310
1221out: 1311out:
1222 unlock_kernel(); 1312 mutex_unlock(&blkfront_mutex);
1223 return err; 1313 return err;
1224} 1314}
1225 1315
@@ -1229,7 +1319,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
1229 struct block_device *bdev; 1319 struct block_device *bdev;
1230 struct xenbus_device *xbdev; 1320 struct xenbus_device *xbdev;
1231 1321
1232 lock_kernel(); 1322 mutex_lock(&blkfront_mutex);
1233 1323
1234 bdev = bdget_disk(disk, 0); 1324 bdev = bdget_disk(disk, 0);
1235 bdput(bdev); 1325 bdput(bdev);
@@ -1263,7 +1353,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
1263 } 1353 }
1264 1354
1265out: 1355out:
1266 unlock_kernel(); 1356 mutex_unlock(&blkfront_mutex);
1267 return 0; 1357 return 0;
1268} 1358}
1269 1359
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 057413bb16e2..6c7fd7db6dff 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,7 +89,7 @@
89#include <linux/delay.h> 89#include <linux/delay.h>
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/blkdev.h> 91#include <linux/blkdev.h>
92#include <linux/smp_lock.h> 92#include <linux/mutex.h>
93#include <linux/ata.h> 93#include <linux/ata.h>
94#include <linux/hdreg.h> 94#include <linux/hdreg.h>
95#include <linux/platform_device.h> 95#include <linux/platform_device.h>
@@ -214,6 +214,7 @@ struct ace_device {
214 u16 cf_id[ATA_ID_WORDS]; 214 u16 cf_id[ATA_ID_WORDS];
215}; 215};
216 216
217static DEFINE_MUTEX(xsysace_mutex);
217static int ace_major; 218static int ace_major;
218 219
219/* --------------------------------------------------------------------- 220/* ---------------------------------------------------------------------
@@ -620,7 +621,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
620 ace_dump_mem(ace->cf_id, 512); /* Debug: Dump out disk ID */ 621 ace_dump_mem(ace->cf_id, 512); /* Debug: Dump out disk ID */
621 622
622 if (ace->data_result) { 623 if (ace->data_result) {
623 /* Error occured, disable the disk */ 624 /* Error occurred, disable the disk */
624 ace->media_change = 1; 625 ace->media_change = 1;
625 set_capacity(ace->gd, 0); 626 set_capacity(ace->gd, 0);
626 dev_err(ace->dev, "error fetching CF id (%i)\n", 627 dev_err(ace->dev, "error fetching CF id (%i)\n",
@@ -800,7 +801,7 @@ static int ace_interrupt_checkstate(struct ace_device *ace)
800 u32 sreg = ace_in32(ace, ACE_STATUS); 801 u32 sreg = ace_in32(ace, ACE_STATUS);
801 u16 creg = ace_in(ace, ACE_CTRL); 802 u16 creg = ace_in(ace, ACE_CTRL);
802 803
803 /* Check for error occurance */ 804 /* Check for error occurrence */
804 if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) && 805 if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) &&
805 (creg & ACE_CTRL_ERRORIRQ)) { 806 (creg & ACE_CTRL_ERRORIRQ)) {
806 dev_err(ace->dev, "transfer failure\n"); 807 dev_err(ace->dev, "transfer failure\n");
@@ -866,12 +867,12 @@ static void ace_request(struct request_queue * q)
866 } 867 }
867} 868}
868 869
869static int ace_media_changed(struct gendisk *gd) 870static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
870{ 871{
871 struct ace_device *ace = gd->private_data; 872 struct ace_device *ace = gd->private_data;
872 dev_dbg(ace->dev, "ace_media_changed(): %i\n", ace->media_change); 873 dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change);
873 874
874 return ace->media_change; 875 return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
875} 876}
876 877
877static int ace_revalidate_disk(struct gendisk *gd) 878static int ace_revalidate_disk(struct gendisk *gd)
@@ -903,13 +904,13 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
903 904
904 dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1); 905 dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
905 906
906 lock_kernel(); 907 mutex_lock(&xsysace_mutex);
907 spin_lock_irqsave(&ace->lock, flags); 908 spin_lock_irqsave(&ace->lock, flags);
908 ace->users++; 909 ace->users++;
909 spin_unlock_irqrestore(&ace->lock, flags); 910 spin_unlock_irqrestore(&ace->lock, flags);
910 911
911 check_disk_change(bdev); 912 check_disk_change(bdev);
912 unlock_kernel(); 913 mutex_unlock(&xsysace_mutex);
913 914
914 return 0; 915 return 0;
915} 916}
@@ -922,7 +923,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
922 923
923 dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1); 924 dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
924 925
925 lock_kernel(); 926 mutex_lock(&xsysace_mutex);
926 spin_lock_irqsave(&ace->lock, flags); 927 spin_lock_irqsave(&ace->lock, flags);
927 ace->users--; 928 ace->users--;
928 if (ace->users == 0) { 929 if (ace->users == 0) {
@@ -930,7 +931,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
930 ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ); 931 ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
931 } 932 }
932 spin_unlock_irqrestore(&ace->lock, flags); 933 spin_unlock_irqrestore(&ace->lock, flags);
933 unlock_kernel(); 934 mutex_unlock(&xsysace_mutex);
934 return 0; 935 return 0;
935} 936}
936 937
@@ -952,7 +953,7 @@ static const struct block_device_operations ace_fops = {
952 .owner = THIS_MODULE, 953 .owner = THIS_MODULE,
953 .open = ace_open, 954 .open = ace_open,
954 .release = ace_release, 955 .release = ace_release,
955 .media_changed = ace_media_changed, 956 .check_events = ace_check_events,
956 .revalidate_disk = ace_revalidate_disk, 957 .revalidate_disk = ace_revalidate_disk,
957 .getgeo = ace_getgeo, 958 .getgeo = ace_getgeo,
958}; 959};
@@ -1167,7 +1168,7 @@ static int __devinit ace_probe(struct platform_device *dev)
1167 irq = dev->resource[i].start; 1168 irq = dev->resource[i].start;
1168 } 1169 }
1169 1170
1170 /* Call the bus-independant setup code */ 1171 /* Call the bus-independent setup code */
1171 return ace_alloc(&dev->dev, id, physaddr, irq, bus_width); 1172 return ace_alloc(&dev->dev, id, physaddr, irq, bus_width);
1172} 1173}
1173 1174
@@ -1194,16 +1195,13 @@ static struct platform_driver ace_platform_driver = {
1194 */ 1195 */
1195 1196
1196#if defined(CONFIG_OF) 1197#if defined(CONFIG_OF)
1197static int __devinit 1198static int __devinit ace_of_probe(struct platform_device *op)
1198ace_of_probe(struct platform_device *op, const struct of_device_id *match)
1199{ 1199{
1200 struct resource res; 1200 struct resource res;
1201 resource_size_t physaddr; 1201 resource_size_t physaddr;
1202 const u32 *id; 1202 const u32 *id;
1203 int irq, bus_width, rc; 1203 int irq, bus_width, rc;
1204 1204
1205 dev_dbg(&op->dev, "ace_of_probe(%p, %p)\n", op, match);
1206
1207 /* device id */ 1205 /* device id */
1208 id = of_get_property(op->dev.of_node, "port-number", NULL); 1206 id = of_get_property(op->dev.of_node, "port-number", NULL);
1209 1207
@@ -1223,8 +1221,9 @@ ace_of_probe(struct platform_device *op, const struct of_device_id *match)
1223 if (of_find_property(op->dev.of_node, "8-bit", NULL)) 1221 if (of_find_property(op->dev.of_node, "8-bit", NULL))
1224 bus_width = ACE_BUS_WIDTH_8; 1222 bus_width = ACE_BUS_WIDTH_8;
1225 1223
1226 /* Call the bus-independant setup code */ 1224 /* Call the bus-independent setup code */
1227 return ace_alloc(&op->dev, id ? *id : 0, physaddr, irq, bus_width); 1225 return ace_alloc(&op->dev, id ? be32_to_cpup(id) : 0,
1226 physaddr, irq, bus_width);
1228} 1227}
1229 1228
1230static int __devexit ace_of_remove(struct platform_device *op) 1229static int __devexit ace_of_remove(struct platform_device *op)
@@ -1243,7 +1242,7 @@ static const struct of_device_id ace_of_match[] __devinitconst = {
1243}; 1242};
1244MODULE_DEVICE_TABLE(of, ace_of_match); 1243MODULE_DEVICE_TABLE(of, ace_of_match);
1245 1244
1246static struct of_platform_driver ace_of_driver = { 1245static struct platform_driver ace_of_driver = {
1247 .probe = ace_of_probe, 1246 .probe = ace_of_probe,
1248 .remove = __devexit_p(ace_of_remove), 1247 .remove = __devexit_p(ace_of_remove),
1249 .driver = { 1248 .driver = {
@@ -1257,12 +1256,12 @@ static struct of_platform_driver ace_of_driver = {
1257static inline int __init ace_of_register(void) 1256static inline int __init ace_of_register(void)
1258{ 1257{
1259 pr_debug("xsysace: registering OF binding\n"); 1258 pr_debug("xsysace: registering OF binding\n");
1260 return of_register_platform_driver(&ace_of_driver); 1259 return platform_driver_register(&ace_of_driver);
1261} 1260}
1262 1261
1263static inline void __exit ace_of_unregister(void) 1262static inline void __exit ace_of_unregister(void)
1264{ 1263{
1265 of_unregister_platform_driver(&ace_of_driver); 1264 platform_driver_unregister(&ace_of_driver);
1266} 1265}
1267#else /* CONFIG_OF */ 1266#else /* CONFIG_OF */
1268/* CONFIG_OF not enabled; do nothing helpers */ 1267/* CONFIG_OF not enabled; do nothing helpers */
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d75b2bb601ad..a22e3f895947 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,7 +33,7 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/blkdev.h> 34#include <linux/blkdev.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/smp_lock.h> 36#include <linux/mutex.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38 38
39#include <asm/setup.h> 39#include <asm/setup.h>
@@ -57,6 +57,7 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
57 57
58#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 ) 58#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 )
59 59
60static DEFINE_MUTEX(z2ram_mutex);
60static u_long *z2ram_map = NULL; 61static u_long *z2ram_map = NULL;
61static u_long z2ram_size = 0; 62static u_long z2ram_size = 0;
62static int z2_count = 0; 63static int z2_count = 0;
@@ -79,8 +80,10 @@ static void do_z2_request(struct request_queue *q)
79 int err = 0; 80 int err = 0;
80 81
81 if (start + len > z2ram_size) { 82 if (start + len > z2ram_size) {
82 printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", 83 pr_err(DEVICE_NAME ": bad access: block=%llu, "
83 blk_rq_pos(req), blk_rq_cur_sectors(req)); 84 "count=%u\n",
85 (unsigned long long)blk_rq_pos(req),
86 blk_rq_cur_sectors(req));
84 err = -EIO; 87 err = -EIO;
85 goto done; 88 goto done;
86 } 89 }
@@ -154,7 +157,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
154 157
155 device = MINOR(bdev->bd_dev); 158 device = MINOR(bdev->bd_dev);
156 159
157 lock_kernel(); 160 mutex_lock(&z2ram_mutex);
158 if ( current_device != -1 && current_device != device ) 161 if ( current_device != -1 && current_device != device )
159 { 162 {
160 rc = -EBUSY; 163 rc = -EBUSY;
@@ -296,25 +299,25 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
296 set_capacity(z2ram_gendisk, z2ram_size >> 9); 299 set_capacity(z2ram_gendisk, z2ram_size >> 9);
297 } 300 }
298 301
299 unlock_kernel(); 302 mutex_unlock(&z2ram_mutex);
300 return 0; 303 return 0;
301 304
302err_out_kfree: 305err_out_kfree:
303 kfree(z2ram_map); 306 kfree(z2ram_map);
304err_out: 307err_out:
305 unlock_kernel(); 308 mutex_unlock(&z2ram_mutex);
306 return rc; 309 return rc;
307} 310}
308 311
309static int 312static int
310z2_release(struct gendisk *disk, fmode_t mode) 313z2_release(struct gendisk *disk, fmode_t mode)
311{ 314{
312 lock_kernel(); 315 mutex_lock(&z2ram_mutex);
313 if ( current_device == -1 ) { 316 if ( current_device == -1 ) {
314 unlock_kernel(); 317 mutex_unlock(&z2ram_mutex);
315 return 0; 318 return 0;
316 } 319 }
317 unlock_kernel(); 320 mutex_unlock(&z2ram_mutex);
318 /* 321 /*
319 * FIXME: unmap memory 322 * FIXME: unmap memory
320 */ 323 */