Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/DAC960.c                 14
-rw-r--r--  drivers/block/Kconfig                  17
-rw-r--r--  drivers/block/Makefile                  1
-rw-r--r--  drivers/block/amiflop.c                79
-rw-r--r--  drivers/block/aoe/aoeblk.c              9
-rw-r--r--  drivers/block/aoe/aoechr.c             10
-rw-r--r--  drivers/block/ataflop.c                65
-rw-r--r--  drivers/block/brd.c                     8
-rw-r--r--  drivers/block/cciss.c                 878
-rw-r--r--  drivers/block/cpqarray.c               15
-rw-r--r--  drivers/block/drbd/drbd_actlog.c       41
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c        2
-rw-r--r--  drivers/block/drbd/drbd_int.h         219
-rw-r--r--  drivers/block/drbd/drbd_main.c        604
-rw-r--r--  drivers/block/drbd/drbd_nl.c          270
-rw-r--r--  drivers/block/drbd/drbd_proc.c         34
-rw-r--r--  drivers/block/drbd/drbd_receiver.c    949
-rw-r--r--  drivers/block/drbd/drbd_req.c         165
-rw-r--r--  drivers/block/drbd/drbd_req.h          62
-rw-r--r--  drivers/block/drbd/drbd_worker.c      292
-rw-r--r--  drivers/block/floppy.c                 82
-rw-r--r--  drivers/block/loop.c                  132
-rw-r--r--  drivers/block/nbd.c                     7
-rw-r--r--  drivers/block/osdblk.c                  5
-rw-r--r--  drivers/block/paride/pcd.c             15
-rw-r--r--  drivers/block/paride/pd.c              15
-rw-r--r--  drivers/block/paride/pf.c              17
-rw-r--r--  drivers/block/paride/pg.c               8
-rw-r--r--  drivers/block/paride/pt.c              20
-rw-r--r--  drivers/block/pktcdvd.c                18
-rw-r--r--  drivers/block/ps3disk.c                 2
-rw-r--r--  drivers/block/rbd.c                  1841
-rw-r--r--  drivers/block/rbd_types.h              73
-rw-r--r--  drivers/block/swim.c                   15
-rw-r--r--  drivers/block/swim3.c                  15
-rw-r--r--  drivers/block/ub.c                     15
-rw-r--r--  drivers/block/viodasd.c                11
-rw-r--r--  drivers/block/virtio_blk.c             54
-rw-r--r--  drivers/block/xd.c                      7
-rw-r--r--  drivers/block/xen-blkfront.c           65
-rw-r--r--  drivers/block/xsysace.c                11
-rw-r--r--  drivers/block/z2ram.c                  15
42 files changed, 4549 insertions, 1628 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 4e2c367fec11..1f286ab461d3 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -36,7 +36,7 @@
36#include <linux/ioport.h> 36#include <linux/ioport.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/smp_lock.h> 39#include <linux/mutex.h>
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/seq_file.h> 41#include <linux/seq_file.h>
42#include <linux/reboot.h> 42#include <linux/reboot.h>
@@ -54,6 +54,7 @@
54#define DAC960_GAM_MINOR 252 54#define DAC960_GAM_MINOR 252
55 55
56 56
57static DEFINE_MUTEX(DAC960_mutex);
57static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers]; 58static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
58static int DAC960_ControllerCount; 59static int DAC960_ControllerCount;
59static struct proc_dir_entry *DAC960_ProcDirectoryEntry; 60static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
@@ -81,7 +82,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
81 int drive_nr = (long)disk->private_data; 82 int drive_nr = (long)disk->private_data;
82 int ret = -ENXIO; 83 int ret = -ENXIO;
83 84
84 lock_kernel(); 85 mutex_lock(&DAC960_mutex);
85 if (p->FirmwareType == DAC960_V1_Controller) { 86 if (p->FirmwareType == DAC960_V1_Controller) {
86 if (p->V1.LogicalDriveInformation[drive_nr]. 87 if (p->V1.LogicalDriveInformation[drive_nr].
87 LogicalDriveState == DAC960_V1_LogicalDrive_Offline) 88 LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
@@ -99,7 +100,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
99 goto out; 100 goto out;
100 ret = 0; 101 ret = 0;
101out: 102out:
102 unlock_kernel(); 103 mutex_unlock(&DAC960_mutex);
103 return ret; 104 return ret;
104} 105}
105 106
@@ -6625,7 +6626,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
6625 long ErrorCode = 0; 6626 long ErrorCode = 0;
6626 if (!capable(CAP_SYS_ADMIN)) return -EACCES; 6627 if (!capable(CAP_SYS_ADMIN)) return -EACCES;
6627 6628
6628 lock_kernel(); 6629 mutex_lock(&DAC960_mutex);
6629 switch (Request) 6630 switch (Request)
6630 { 6631 {
6631 case DAC960_IOCTL_GET_CONTROLLER_COUNT: 6632 case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7056,13 +7057,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
7056 default: 7057 default:
7057 ErrorCode = -ENOTTY; 7058 ErrorCode = -ENOTTY;
7058 } 7059 }
7059 unlock_kernel(); 7060 mutex_unlock(&DAC960_mutex);
7060 return ErrorCode; 7061 return ErrorCode;
7061} 7062}
7062 7063
7063static const struct file_operations DAC960_gam_fops = { 7064static const struct file_operations DAC960_gam_fops = {
7064 .owner = THIS_MODULE, 7065 .owner = THIS_MODULE,
7065 .unlocked_ioctl = DAC960_gam_ioctl 7066 .unlocked_ioctl = DAC960_gam_ioctl,
7067 .llseek = noop_llseek,
7066}; 7068};
7067 7069
7068static struct miscdevice DAC960_gam_dev = { 7070static struct miscdevice DAC960_gam_dev = {
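
The DAC960 hunks above are representative of the conversion applied across this whole series: the big kernel lock (lock_kernel()/unlock_kernel()) is pushed down into each driver as a file-scoped mutex that serializes the open, release and ioctl paths, and character-device fops gain an explicit .llseek = noop_llseek now that the BKL no longer implies one. A minimal sketch of the resulting shape, assuming only the standard 2.6.36-era block layer API; the mydrv_* names are illustrative, not identifiers from the patch:

/*
 * Sketch of the BKL -> per-driver mutex conversion used throughout
 * this series.  "mydrv" names are placeholders, not from the patch.
 */
#include <linux/mutex.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

static DEFINE_MUTEX(mydrv_mutex);		/* replaces the global BKL */

static int mydrv_locked_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;				/* real per-command work goes here */
}

static int mydrv_ioctl(struct block_device *bdev, fmode_t mode,
		       unsigned int cmd, unsigned long arg)
{
	int ret;

	mutex_lock(&mydrv_mutex);		/* was: lock_kernel() */
	ret = mydrv_locked_ioctl(bdev, mode, cmd, arg);
	mutex_unlock(&mydrv_mutex);		/* was: unlock_kernel() */

	return ret;
}

Because each driver now takes its own private mutex rather than the one global lock, open/ioctl paths of unrelated drivers no longer serialize against each other.
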
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
488 488
489 If unsure, say N. 489 If unsure, say N.
490 490
491config BLK_DEV_RBD
492 tristate "Rados block device (RBD)"
493 depends on INET && EXPERIMENTAL && BLOCK
494 select CEPH_LIB
495 select LIBCRC32C
496 select CRYPTO_AES
497 select CRYPTO
498 default n
499 help
500 Say Y here if you want include the Rados block device, which stripes
501 a block device over objects stored in the Ceph distributed object
502 store.
503
504 More information at http://ceph.newdream.net/.
505
506 If unsure, say N.
507
491endif # BLK_DEV 508endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ 39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
40obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
40 41
41swim_mod-objs := swim.o swim_asm.o 42swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 76f114f0bba3..a1725e6488d3 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -60,7 +60,7 @@
60#include <linux/hdreg.h> 60#include <linux/hdreg.h>
61#include <linux/delay.h> 61#include <linux/delay.h>
62#include <linux/init.h> 62#include <linux/init.h>
63#include <linux/smp_lock.h> 63#include <linux/mutex.h>
64#include <linux/amifdreg.h> 64#include <linux/amifdreg.h>
65#include <linux/amifd.h> 65#include <linux/amifd.h>
66#include <linux/buffer_head.h> 66#include <linux/buffer_head.h>
@@ -109,13 +109,12 @@
109#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */ 109#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */
110#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */ 110#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */
111 111
112static DEFINE_MUTEX(amiflop_mutex);
112static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ 113static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
113 114
114module_param(fd_def_df0, ulong, 0); 115module_param(fd_def_df0, ulong, 0);
115MODULE_LICENSE("GPL"); 116MODULE_LICENSE("GPL");
116 117
117static struct request_queue *floppy_queue;
118
119/* 118/*
120 * Macros 119 * Macros
121 */ 120 */
@@ -164,6 +163,7 @@ static volatile int selected = -1; /* currently selected drive */
164static int writepending; 163static int writepending;
165static int writefromint; 164static int writefromint;
166static char *raw_buf; 165static char *raw_buf;
166static int fdc_queue;
167 167
168static DEFINE_SPINLOCK(amiflop_lock); 168static DEFINE_SPINLOCK(amiflop_lock);
169 169
@@ -1334,6 +1334,42 @@ static int get_track(int drive, int track)
1334 return -1; 1334 return -1;
1335} 1335}
1336 1336
1337/*
1338 * Round-robin between our available drives, doing one request from each
1339 */
1340static struct request *set_next_request(void)
1341{
1342 struct request_queue *q;
1343 int cnt = FD_MAX_UNITS;
1344 struct request *rq;
1345
1346 /* Find next queue we can dispatch from */
1347 fdc_queue = fdc_queue + 1;
1348 if (fdc_queue == FD_MAX_UNITS)
1349 fdc_queue = 0;
1350
1351 for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
1352
1353 if (unit[fdc_queue].type->code == FD_NODRIVE) {
1354 if (++fdc_queue == FD_MAX_UNITS)
1355 fdc_queue = 0;
1356 continue;
1357 }
1358
1359 q = unit[fdc_queue].gendisk->queue;
1360 if (q) {
1361 rq = blk_fetch_request(q);
1362 if (rq)
1363 break;
1364 }
1365
1366 if (++fdc_queue == FD_MAX_UNITS)
1367 fdc_queue = 0;
1368 }
1369
1370 return rq;
1371}
1372
1337static void redo_fd_request(void) 1373static void redo_fd_request(void)
1338{ 1374{
1339 struct request *rq; 1375 struct request *rq;
@@ -1345,7 +1381,7 @@ static void redo_fd_request(void)
1345 int err; 1381 int err;
1346 1382
1347next_req: 1383next_req:
1348 rq = blk_fetch_request(floppy_queue); 1384 rq = set_next_request();
1349 if (!rq) { 1385 if (!rq) {
1350 /* Nothing left to do */ 1386 /* Nothing left to do */
1351 return; 1387 return;
@@ -1506,9 +1542,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
1506{ 1542{
1507 int ret; 1543 int ret;
1508 1544
1509 lock_kernel(); 1545 mutex_lock(&amiflop_mutex);
1510 ret = fd_locked_ioctl(bdev, mode, cmd, param); 1546 ret = fd_locked_ioctl(bdev, mode, cmd, param);
1511 unlock_kernel(); 1547 mutex_unlock(&amiflop_mutex);
1512 1548
1513 return ret; 1549 return ret;
1514} 1550}
@@ -1555,11 +1591,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1555 int old_dev; 1591 int old_dev;
1556 unsigned long flags; 1592 unsigned long flags;
1557 1593
1558 lock_kernel(); 1594 mutex_lock(&amiflop_mutex);
1559 old_dev = fd_device[drive]; 1595 old_dev = fd_device[drive];
1560 1596
1561 if (fd_ref[drive] && old_dev != system) { 1597 if (fd_ref[drive] && old_dev != system) {
1562 unlock_kernel(); 1598 mutex_unlock(&amiflop_mutex);
1563 return -EBUSY; 1599 return -EBUSY;
1564 } 1600 }
1565 1601
@@ -1575,7 +1611,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1575 rel_fdc(); 1611 rel_fdc();
1576 1612
1577 if (wrprot) { 1613 if (wrprot) {
1578 unlock_kernel(); 1614 mutex_unlock(&amiflop_mutex);
1579 return -EROFS; 1615 return -EROFS;
1580 } 1616 }
1581 } 1617 }
@@ -1594,7 +1630,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
1594 printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, 1630 printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
1595 unit[drive].type->name, data_types[system].name); 1631 unit[drive].type->name, data_types[system].name);
1596 1632
1597 unlock_kernel(); 1633 mutex_unlock(&amiflop_mutex);
1598 return 0; 1634 return 0;
1599} 1635}
1600 1636
@@ -1603,7 +1639,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
1603 struct amiga_floppy_struct *p = disk->private_data; 1639 struct amiga_floppy_struct *p = disk->private_data;
1604 int drive = p - unit; 1640 int drive = p - unit;
1605 1641
1606 lock_kernel(); 1642 mutex_lock(&amiflop_mutex);
1607 if (unit[drive].dirty == 1) { 1643 if (unit[drive].dirty == 1) {
1608 del_timer (flush_track_timer + drive); 1644 del_timer (flush_track_timer + drive);
1609 non_int_flush_track (drive); 1645 non_int_flush_track (drive);
@@ -1617,7 +1653,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
1617/* the mod_use counter is handled this way */ 1653/* the mod_use counter is handled this way */
1618 floppy_off (drive | 0x40000000); 1654 floppy_off (drive | 0x40000000);
1619#endif 1655#endif
1620 unlock_kernel(); 1656 mutex_unlock(&amiflop_mutex);
1621 return 0; 1657 return 0;
1622} 1658}
1623 1659
@@ -1682,6 +1718,13 @@ static int __init fd_probe_drives(void)
1682 continue; 1718 continue;
1683 } 1719 }
1684 unit[drive].gendisk = disk; 1720 unit[drive].gendisk = disk;
1721
1722 disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
1723 if (!disk->queue) {
1724 unit[drive].type->code = FD_NODRIVE;
1725 continue;
1726 }
1727
1685 drives++; 1728 drives++;
1686 if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { 1729 if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
1687 printk("no mem for "); 1730 printk("no mem for ");
@@ -1695,7 +1738,6 @@ static int __init fd_probe_drives(void)
1695 disk->fops = &floppy_fops; 1738 disk->fops = &floppy_fops;
1696 sprintf(disk->disk_name, "fd%d", drive); 1739 sprintf(disk->disk_name, "fd%d", drive);
1697 disk->private_data = &unit[drive]; 1740 disk->private_data = &unit[drive];
1698 disk->queue = floppy_queue;
1699 set_capacity(disk, 880*2); 1741 set_capacity(disk, 880*2);
1700 add_disk(disk); 1742 add_disk(disk);
1701 } 1743 }
@@ -1743,11 +1785,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
1743 goto out_irq2; 1785 goto out_irq2;
1744 } 1786 }
1745 1787
1746 ret = -ENOMEM;
1747 floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
1748 if (!floppy_queue)
1749 goto out_queue;
1750
1751 ret = -ENODEV; 1788 ret = -ENODEV;
1752 if (fd_probe_drives() < 1) /* No usable drives */ 1789 if (fd_probe_drives() < 1) /* No usable drives */
1753 goto out_probe; 1790 goto out_probe;
@@ -1791,8 +1828,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
1791 return 0; 1828 return 0;
1792 1829
1793out_probe: 1830out_probe:
1794 blk_cleanup_queue(floppy_queue);
1795out_queue:
1796 free_irq(IRQ_AMIGA_CIAA_TB, NULL); 1831 free_irq(IRQ_AMIGA_CIAA_TB, NULL);
1797out_irq2: 1832out_irq2:
1798 free_irq(IRQ_AMIGA_DSKBLK, NULL); 1833 free_irq(IRQ_AMIGA_DSKBLK, NULL);
@@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
1810 1845
1811 for( i = 0; i < FD_MAX_UNITS; i++) { 1846 for( i = 0; i < FD_MAX_UNITS; i++) {
1812 if (unit[i].type->code != FD_NODRIVE) { 1847 if (unit[i].type->code != FD_NODRIVE) {
1848 struct request_queue *q = unit[i].gendisk->queue;
1813 del_gendisk(unit[i].gendisk); 1849 del_gendisk(unit[i].gendisk);
1814 put_disk(unit[i].gendisk); 1850 put_disk(unit[i].gendisk);
1815 kfree(unit[i].trackbuf); 1851 kfree(unit[i].trackbuf);
1852 if (q)
1853 blk_cleanup_queue(q);
1816 } 1854 }
1817 } 1855 }
1818 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 1856 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
1820 free_irq(IRQ_AMIGA_DSKBLK, NULL); 1858 free_irq(IRQ_AMIGA_DSKBLK, NULL);
1821 custom.dmacon = DMAF_DISK; /* disable DMA */ 1859 custom.dmacon = DMAF_DISK; /* disable DMA */
1822 amiga_chip_free(raw_buf); 1860 amiga_chip_free(raw_buf);
1823 blk_cleanup_queue(floppy_queue);
1824 unregister_blkdev(FLOPPY_MAJOR, "fd"); 1861 unregister_blkdev(FLOPPY_MAJOR, "fd");
1825} 1862}
1826#endif 1863#endif
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index a946929735a5..f21c237a9e5e 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -12,9 +12,10 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/genhd.h> 13#include <linux/genhd.h>
14#include <linux/netdevice.h> 14#include <linux/netdevice.h>
15#include <linux/smp_lock.h> 15#include <linux/mutex.h>
16#include "aoe.h" 16#include "aoe.h"
17 17
18static DEFINE_MUTEX(aoeblk_mutex);
18static struct kmem_cache *buf_pool_cache; 19static struct kmem_cache *buf_pool_cache;
19 20
20static ssize_t aoedisk_show_state(struct device *dev, 21static ssize_t aoedisk_show_state(struct device *dev,
@@ -125,16 +126,16 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
125 struct aoedev *d = bdev->bd_disk->private_data; 126 struct aoedev *d = bdev->bd_disk->private_data;
126 ulong flags; 127 ulong flags;
127 128
128 lock_kernel(); 129 mutex_lock(&aoeblk_mutex);
129 spin_lock_irqsave(&d->lock, flags); 130 spin_lock_irqsave(&d->lock, flags);
130 if (d->flags & DEVFL_UP) { 131 if (d->flags & DEVFL_UP) {
131 d->nopen++; 132 d->nopen++;
132 spin_unlock_irqrestore(&d->lock, flags); 133 spin_unlock_irqrestore(&d->lock, flags);
133 unlock_kernel(); 134 mutex_unlock(&aoeblk_mutex);
134 return 0; 135 return 0;
135 } 136 }
136 spin_unlock_irqrestore(&d->lock, flags); 137 spin_unlock_irqrestore(&d->lock, flags);
137 unlock_kernel(); 138 mutex_unlock(&aoeblk_mutex);
138 return -ENODEV; 139 return -ENODEV;
139} 140}
140 141
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 4a1b9e7464aa..146296ca4965 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -9,7 +9,7 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/smp_lock.h> 12#include <linux/mutex.h>
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include "aoe.h" 14#include "aoe.h"
15 15
@@ -37,6 +37,7 @@ struct ErrMsg {
37 char *msg; 37 char *msg;
38}; 38};
39 39
40static DEFINE_MUTEX(aoechr_mutex);
40static struct ErrMsg emsgs[NMSG]; 41static struct ErrMsg emsgs[NMSG];
41static int emsgs_head_idx, emsgs_tail_idx; 42static int emsgs_head_idx, emsgs_tail_idx;
42static struct completion emsgs_comp; 43static struct completion emsgs_comp;
@@ -183,16 +184,16 @@ aoechr_open(struct inode *inode, struct file *filp)
183{ 184{
184 int n, i; 185 int n, i;
185 186
186 lock_kernel(); 187 mutex_lock(&aoechr_mutex);
187 n = iminor(inode); 188 n = iminor(inode);
188 filp->private_data = (void *) (unsigned long) n; 189 filp->private_data = (void *) (unsigned long) n;
189 190
190 for (i = 0; i < ARRAY_SIZE(chardevs); ++i) 191 for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
191 if (chardevs[i].minor == n) { 192 if (chardevs[i].minor == n) {
192 unlock_kernel(); 193 mutex_unlock(&aoechr_mutex);
193 return 0; 194 return 0;
194 } 195 }
195 unlock_kernel(); 196 mutex_unlock(&aoechr_mutex);
196 return -EINVAL; 197 return -EINVAL;
197} 198}
198 199
@@ -265,6 +266,7 @@ static const struct file_operations aoe_fops = {
265 .open = aoechr_open, 266 .open = aoechr_open,
266 .release = aoechr_rel, 267 .release = aoechr_rel,
267 .owner = THIS_MODULE, 268 .owner = THIS_MODULE,
269 .llseek = noop_llseek,
268}; 270};
269 271
270static char *aoe_devnode(struct device *dev, mode_t *mode) 272static char *aoe_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index aceb96476524..4e4cc6c828cb 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,7 +67,7 @@
67#include <linux/delay.h> 67#include <linux/delay.h>
68#include <linux/init.h> 68#include <linux/init.h>
69#include <linux/blkdev.h> 69#include <linux/blkdev.h>
70#include <linux/smp_lock.h> 70#include <linux/mutex.h>
71 71
72#include <asm/atafd.h> 72#include <asm/atafd.h>
73#include <asm/atafdreg.h> 73#include <asm/atafdreg.h>
@@ -79,8 +79,9 @@
79 79
80#undef DEBUG 80#undef DEBUG
81 81
82static struct request_queue *floppy_queue; 82static DEFINE_MUTEX(ataflop_mutex);
83static struct request *fd_request; 83static struct request *fd_request;
84static int fdc_queue;
84 85
85/* Disk types: DD, HD, ED */ 86/* Disk types: DD, HD, ED */
86static struct atari_disk_type { 87static struct atari_disk_type {
@@ -1391,6 +1392,29 @@ static void setup_req_params( int drive )
1391 ReqTrack, ReqSector, (unsigned long)ReqData )); 1392 ReqTrack, ReqSector, (unsigned long)ReqData ));
1392} 1393}
1393 1394
1395/*
1396 * Round-robin between our available drives, doing one request from each
1397 */
1398static struct request *set_next_request(void)
1399{
1400 struct request_queue *q;
1401 int old_pos = fdc_queue;
1402 struct request *rq;
1403
1404 do {
1405 q = unit[fdc_queue].disk->queue;
1406 if (++fdc_queue == FD_MAX_UNITS)
1407 fdc_queue = 0;
1408 if (q) {
1409 rq = blk_fetch_request(q);
1410 if (rq)
1411 break;
1412 }
1413 } while (fdc_queue != old_pos);
1414
1415 return rq;
1416}
1417
1394 1418
1395static void redo_fd_request(void) 1419static void redo_fd_request(void)
1396{ 1420{
@@ -1405,7 +1429,7 @@ static void redo_fd_request(void)
1405 1429
1406repeat: 1430repeat:
1407 if (!fd_request) { 1431 if (!fd_request) {
1408 fd_request = blk_fetch_request(floppy_queue); 1432 fd_request = set_next_request();
1409 if (!fd_request) 1433 if (!fd_request)
1410 goto the_end; 1434 goto the_end;
1411 } 1435 }
@@ -1671,9 +1695,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
1671{ 1695{
1672 int ret; 1696 int ret;
1673 1697
1674 lock_kernel(); 1698 mutex_lock(&ataflop_mutex);
1675 ret = fd_locked_ioctl(bdev, mode, cmd, arg); 1699 ret = fd_locked_ioctl(bdev, mode, cmd, arg);
1676 unlock_kernel(); 1700 mutex_unlock(&ataflop_mutex);
1677 1701
1678 return ret; 1702 return ret;
1679} 1703}
@@ -1854,9 +1878,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
1854{ 1878{
1855 int ret; 1879 int ret;
1856 1880
1857 lock_kernel(); 1881 mutex_lock(&ataflop_mutex);
1858 ret = floppy_open(bdev, mode); 1882 ret = floppy_open(bdev, mode);
1859 unlock_kernel(); 1883 mutex_unlock(&ataflop_mutex);
1860 1884
1861 return ret; 1885 return ret;
1862} 1886}
@@ -1864,14 +1888,14 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
1864static int floppy_release(struct gendisk *disk, fmode_t mode) 1888static int floppy_release(struct gendisk *disk, fmode_t mode)
1865{ 1889{
1866 struct atari_floppy_struct *p = disk->private_data; 1890 struct atari_floppy_struct *p = disk->private_data;
1867 lock_kernel(); 1891 mutex_lock(&ataflop_mutex);
1868 if (p->ref < 0) 1892 if (p->ref < 0)
1869 p->ref = 0; 1893 p->ref = 0;
1870 else if (!p->ref--) { 1894 else if (!p->ref--) {
1871 printk(KERN_ERR "floppy_release with fd_ref == 0"); 1895 printk(KERN_ERR "floppy_release with fd_ref == 0");
1872 p->ref = 0; 1896 p->ref = 0;
1873 } 1897 }
1874 unlock_kernel(); 1898 mutex_unlock(&ataflop_mutex);
1875 return 0; 1899 return 0;
1876} 1900}
1877 1901
@@ -1932,10 +1956,6 @@ static int __init atari_floppy_init (void)
1932 PhysTrackBuffer = virt_to_phys(TrackBuffer); 1956 PhysTrackBuffer = virt_to_phys(TrackBuffer);
1933 BufferDrive = BufferSide = BufferTrack = -1; 1957 BufferDrive = BufferSide = BufferTrack = -1;
1934 1958
1935 floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
1936 if (!floppy_queue)
1937 goto Enomem;
1938
1939 for (i = 0; i < FD_MAX_UNITS; i++) { 1959 for (i = 0; i < FD_MAX_UNITS; i++) {
1940 unit[i].track = -1; 1960 unit[i].track = -1;
1941 unit[i].flags = 0; 1961 unit[i].flags = 0;
@@ -1944,7 +1964,10 @@ static int __init atari_floppy_init (void)
1944 sprintf(unit[i].disk->disk_name, "fd%d", i); 1964 sprintf(unit[i].disk->disk_name, "fd%d", i);
1945 unit[i].disk->fops = &floppy_fops; 1965 unit[i].disk->fops = &floppy_fops;
1946 unit[i].disk->private_data = &unit[i]; 1966 unit[i].disk->private_data = &unit[i];
1947 unit[i].disk->queue = floppy_queue; 1967 unit[i].disk->queue = blk_init_queue(do_fd_request,
1968 &ataflop_lock);
1969 if (!unit[i].disk->queue)
1970 goto Enomem;
1948 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); 1971 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
1949 add_disk(unit[i].disk); 1972 add_disk(unit[i].disk);
1950 } 1973 }
@@ -1959,10 +1982,14 @@ static int __init atari_floppy_init (void)
1959 1982
1960 return 0; 1983 return 0;
1961Enomem: 1984Enomem:
1962 while (i--) 1985 while (i--) {
1986 struct request_queue *q = unit[i].disk->queue;
1987
1963 put_disk(unit[i].disk); 1988 put_disk(unit[i].disk);
1964 if (floppy_queue) 1989 if (q)
1965 blk_cleanup_queue(floppy_queue); 1990 blk_cleanup_queue(q);
1991 }
1992
1966 unregister_blkdev(FLOPPY_MAJOR, "fd"); 1993 unregister_blkdev(FLOPPY_MAJOR, "fd");
1967 return -ENOMEM; 1994 return -ENOMEM;
1968} 1995}
@@ -2011,12 +2038,14 @@ static void __exit atari_floppy_exit(void)
2011 int i; 2038 int i;
2012 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 2039 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
2013 for (i = 0; i < FD_MAX_UNITS; i++) { 2040 for (i = 0; i < FD_MAX_UNITS; i++) {
2041 struct request_queue *q = unit[i].disk->queue;
2042
2014 del_gendisk(unit[i].disk); 2043 del_gendisk(unit[i].disk);
2015 put_disk(unit[i].disk); 2044 put_disk(unit[i].disk);
2045 blk_cleanup_queue(q);
2016 } 2046 }
2017 unregister_blkdev(FLOPPY_MAJOR, "fd"); 2047 unregister_blkdev(FLOPPY_MAJOR, "fd");
2018 2048
2019 blk_cleanup_queue(floppy_queue);
2020 del_timer_sync(&fd_timer); 2049 del_timer_sync(&fd_timer);
2021 atari_stram_free( DMABuffer ); 2050 atari_stram_free( DMABuffer );
2022} 2051}
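
Both the amiflop and ataflop hunks above also drop the single shared floppy_queue in favour of one request queue per gendisk: each unit's queue is allocated with blk_init_queue() at probe time, the request function round-robins across the live per-unit queues via set_next_request(), and teardown calls blk_cleanup_queue() alongside del_gendisk()/put_disk(). A minimal sketch of that per-unit queue lifecycle for a single drive, under the same 2.6.36-era API; the my_* names are illustrative and registration details (major number, fops, capacity) are trimmed:

/*
 * Per-unit request queue lifecycle, as the floppy drivers above now do it.
 * Illustrative sketch only; disk registration details are omitted.
 */
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/init.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static struct gendisk *my_disk;

static void my_request_fn(struct request_queue *q)
{
	/* blk_fetch_request(q) and service requests for this unit only */
}

static int __init my_probe(void)
{
	my_disk = alloc_disk(1);
	if (!my_disk)
		return -ENOMEM;

	/* one queue per disk instead of a single shared floppy_queue */
	my_disk->queue = blk_init_queue(my_request_fn, &my_lock);
	if (!my_disk->queue) {
		put_disk(my_disk);
		return -ENOMEM;
	}
	add_disk(my_disk);
	return 0;
}

static void my_remove(void)
{
	struct request_queue *q = my_disk->queue;

	del_gendisk(my_disk);
	put_disk(my_disk);
	if (q)
		blk_cleanup_queue(q);	/* each unit now cleans up its own queue */
}
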
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f63792ff8..b7f51e4594f8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,7 +15,7 @@
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/bio.h> 16#include <linux/bio.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/buffer_head.h> /* invalidate_bh_lrus() */ 20#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
21#include <linux/slab.h> 21#include <linux/slab.h>
@@ -55,6 +55,7 @@ struct brd_device {
55/* 55/*
56 * Look up and return a brd's page for a given sector. 56 * Look up and return a brd's page for a given sector.
57 */ 57 */
58static DEFINE_MUTEX(brd_mutex);
58static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) 59static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
59{ 60{
60 pgoff_t idx; 61 pgoff_t idx;
@@ -402,7 +403,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
402 * ram device BLKFLSBUF has special semantics, we want to actually 403 * ram device BLKFLSBUF has special semantics, we want to actually
403 * release and destroy the ramdisk data. 404 * release and destroy the ramdisk data.
404 */ 405 */
405 lock_kernel(); 406 mutex_lock(&brd_mutex);
406 mutex_lock(&bdev->bd_mutex); 407 mutex_lock(&bdev->bd_mutex);
407 error = -EBUSY; 408 error = -EBUSY;
408 if (bdev->bd_openers <= 1) { 409 if (bdev->bd_openers <= 1) {
@@ -419,7 +420,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
419 error = 0; 420 error = 0;
420 } 421 }
421 mutex_unlock(&bdev->bd_mutex); 422 mutex_unlock(&bdev->bd_mutex);
422 unlock_kernel(); 423 mutex_unlock(&brd_mutex);
423 424
424 return error; 425 return error;
425} 426}
@@ -482,7 +483,6 @@ static struct brd_device *brd_alloc(int i)
482 if (!brd->brd_queue) 483 if (!brd->brd_queue)
483 goto out_free_dev; 484 goto out_free_dev;
484 blk_queue_make_request(brd->brd_queue, brd_make_request); 485 blk_queue_make_request(brd->brd_queue, brd_make_request);
485 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
486 blk_queue_max_hw_sectors(brd->brd_queue, 1024); 486 blk_queue_max_hw_sectors(brd->brd_queue, 1024);
487 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 487 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
488 488
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 5e4fadcdece9..f09e6df15aa7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -26,7 +26,6 @@
26#include <linux/pci.h> 26#include <linux/pci.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/smp_lock.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/major.h> 30#include <linux/major.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -66,6 +65,7 @@ MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
66MODULE_VERSION("3.6.26"); 65MODULE_VERSION("3.6.26");
67MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
68 67
68static DEFINE_MUTEX(cciss_mutex);
69static int cciss_allow_hpsa; 69static int cciss_allow_hpsa;
70module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); 70module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
71MODULE_PARM_DESC(cciss_allow_hpsa, 71MODULE_PARM_DESC(cciss_allow_hpsa,
@@ -105,11 +105,12 @@ static const struct pci_device_id cciss_pci_device_id[] = {
105 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249}, 105 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
106 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A}, 106 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A},
107 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B}, 107 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B},
108 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250}, 108 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3350},
109 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251}, 109 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3351},
110 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252}, 110 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3352},
111 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253}, 111 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3353},
112 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254}, 112 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3354},
113 {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3355},
113 {0,} 114 {0,}
114}; 115};
115 116
@@ -149,11 +150,12 @@ static struct board_type products[] = {
149 {0x3249103C, "Smart Array P812", &SA5_access}, 150 {0x3249103C, "Smart Array P812", &SA5_access},
150 {0x324A103C, "Smart Array P712m", &SA5_access}, 151 {0x324A103C, "Smart Array P712m", &SA5_access},
151 {0x324B103C, "Smart Array P711m", &SA5_access}, 152 {0x324B103C, "Smart Array P711m", &SA5_access},
152 {0x3250103C, "Smart Array", &SA5_access}, 153 {0x3350103C, "Smart Array", &SA5_access},
153 {0x3251103C, "Smart Array", &SA5_access}, 154 {0x3351103C, "Smart Array", &SA5_access},
154 {0x3252103C, "Smart Array", &SA5_access}, 155 {0x3352103C, "Smart Array", &SA5_access},
155 {0x3253103C, "Smart Array", &SA5_access}, 156 {0x3353103C, "Smart Array", &SA5_access},
156 {0x3254103C, "Smart Array", &SA5_access}, 157 {0x3354103C, "Smart Array", &SA5_access},
158 {0x3355103C, "Smart Array", &SA5_access},
157}; 159};
158 160
159/* How long to wait (in milliseconds) for board to go into simple mode */ 161/* How long to wait (in milliseconds) for board to go into simple mode */
@@ -1059,9 +1061,9 @@ static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
1059{ 1061{
1060 int ret; 1062 int ret;
1061 1063
1062 lock_kernel(); 1064 mutex_lock(&cciss_mutex);
1063 ret = cciss_open(bdev, mode); 1065 ret = cciss_open(bdev, mode);
1064 unlock_kernel(); 1066 mutex_unlock(&cciss_mutex);
1065 1067
1066 return ret; 1068 return ret;
1067} 1069}
@@ -1074,13 +1076,13 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
1074 ctlr_info_t *h; 1076 ctlr_info_t *h;
1075 drive_info_struct *drv; 1077 drive_info_struct *drv;
1076 1078
1077 lock_kernel(); 1079 mutex_lock(&cciss_mutex);
1078 h = get_host(disk); 1080 h = get_host(disk);
1079 drv = get_drv(disk); 1081 drv = get_drv(disk);
1080 dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name); 1082 dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
1081 drv->usage_count--; 1083 drv->usage_count--;
1082 h->usage_count--; 1084 h->usage_count--;
1083 unlock_kernel(); 1085 mutex_unlock(&cciss_mutex);
1084 return 0; 1086 return 0;
1085} 1087}
1086 1088
@@ -1088,9 +1090,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
1088 unsigned cmd, unsigned long arg) 1090 unsigned cmd, unsigned long arg)
1089{ 1091{
1090 int ret; 1092 int ret;
1091 lock_kernel(); 1093 mutex_lock(&cciss_mutex);
1092 ret = cciss_ioctl(bdev, mode, cmd, arg); 1094 ret = cciss_ioctl(bdev, mode, cmd, arg);
1093 unlock_kernel(); 1095 mutex_unlock(&cciss_mutex);
1094 return ret; 1096 return ret;
1095} 1097}
1096 1098
@@ -1232,470 +1234,452 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
1232 c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) 1234 c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
1233 (void)check_for_unit_attention(h, c); 1235 (void)check_for_unit_attention(h, c);
1234} 1236}
1235/* 1237
1236 * ioctl 1238static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
1237 */
1238static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1239 unsigned int cmd, unsigned long arg)
1240{ 1239{
1241 struct gendisk *disk = bdev->bd_disk; 1240 cciss_pci_info_struct pciinfo;
1242 ctlr_info_t *h = get_host(disk);
1243 drive_info_struct *drv = get_drv(disk);
1244 void __user *argp = (void __user *)arg;
1245 1241
1246 dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", 1242 if (!argp)
1247 cmd, arg); 1243 return -EINVAL;
1248 switch (cmd) { 1244 pciinfo.domain = pci_domain_nr(h->pdev->bus);
1249 case CCISS_GETPCIINFO: 1245 pciinfo.bus = h->pdev->bus->number;
1250 { 1246 pciinfo.dev_fn = h->pdev->devfn;
1251 cciss_pci_info_struct pciinfo; 1247 pciinfo.board_id = h->board_id;
1252 1248 if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
1253 if (!arg) 1249 return -EFAULT;
1254 return -EINVAL; 1250 return 0;
1255 pciinfo.domain = pci_domain_nr(h->pdev->bus); 1251}
1256 pciinfo.bus = h->pdev->bus->number;
1257 pciinfo.dev_fn = h->pdev->devfn;
1258 pciinfo.board_id = h->board_id;
1259 if (copy_to_user
1260 (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
1261 return -EFAULT;
1262 return 0;
1263 }
1264 case CCISS_GETINTINFO:
1265 {
1266 cciss_coalint_struct intinfo;
1267 if (!arg)
1268 return -EINVAL;
1269 intinfo.delay =
1270 readl(&h->cfgtable->HostWrite.CoalIntDelay);
1271 intinfo.count =
1272 readl(&h->cfgtable->HostWrite.CoalIntCount);
1273 if (copy_to_user
1274 (argp, &intinfo, sizeof(cciss_coalint_struct)))
1275 return -EFAULT;
1276 return 0;
1277 }
1278 case CCISS_SETINTINFO:
1279 {
1280 cciss_coalint_struct intinfo;
1281 unsigned long flags;
1282 int i;
1283
1284 if (!arg)
1285 return -EINVAL;
1286 if (!capable(CAP_SYS_ADMIN))
1287 return -EPERM;
1288 if (copy_from_user
1289 (&intinfo, argp, sizeof(cciss_coalint_struct)))
1290 return -EFAULT;
1291 if ((intinfo.delay == 0) && (intinfo.count == 0))
1292 return -EINVAL;
1293 spin_lock_irqsave(&h->lock, flags);
1294 /* Update the field, and then ring the doorbell */
1295 writel(intinfo.delay,
1296 &(h->cfgtable->HostWrite.CoalIntDelay));
1297 writel(intinfo.count,
1298 &(h->cfgtable->HostWrite.CoalIntCount));
1299 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1300
1301 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1302 if (!(readl(h->vaddr + SA5_DOORBELL)
1303 & CFGTBL_ChangeReq))
1304 break;
1305 /* delay and try again */
1306 udelay(1000);
1307 }
1308 spin_unlock_irqrestore(&h->lock, flags);
1309 if (i >= MAX_IOCTL_CONFIG_WAIT)
1310 return -EAGAIN;
1311 return 0;
1312 }
1313 case CCISS_GETNODENAME:
1314 {
1315 NodeName_type NodeName;
1316 int i;
1317
1318 if (!arg)
1319 return -EINVAL;
1320 for (i = 0; i < 16; i++)
1321 NodeName[i] =
1322 readb(&h->cfgtable->ServerName[i]);
1323 if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
1324 return -EFAULT;
1325 return 0;
1326 }
1327 case CCISS_SETNODENAME:
1328 {
1329 NodeName_type NodeName;
1330 unsigned long flags;
1331 int i;
1332 1252
1333 if (!arg) 1253static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
1334 return -EINVAL; 1254{
1335 if (!capable(CAP_SYS_ADMIN)) 1255 cciss_coalint_struct intinfo;
1336 return -EPERM;
1337 1256
1338 if (copy_from_user 1257 if (!argp)
1339 (NodeName, argp, sizeof(NodeName_type))) 1258 return -EINVAL;
1340 return -EFAULT; 1259 intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
1260 intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
1261 if (copy_to_user
1262 (argp, &intinfo, sizeof(cciss_coalint_struct)))
1263 return -EFAULT;
1264 return 0;
1265}
1341 1266
1342 spin_lock_irqsave(&h->lock, flags); 1267static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
1268{
1269 cciss_coalint_struct intinfo;
1270 unsigned long flags;
1271 int i;
1343 1272
1344 /* Update the field, and then ring the doorbell */ 1273 if (!argp)
1345 for (i = 0; i < 16; i++) 1274 return -EINVAL;
1346 writeb(NodeName[i], 1275 if (!capable(CAP_SYS_ADMIN))
1347 &h->cfgtable->ServerName[i]); 1276 return -EPERM;
1277 if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
1278 return -EFAULT;
1279 if ((intinfo.delay == 0) && (intinfo.count == 0))
1280 return -EINVAL;
1281 spin_lock_irqsave(&h->lock, flags);
1282 /* Update the field, and then ring the doorbell */
1283 writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
1284 writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
1285 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1348 1286
1349 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); 1287 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1288 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
1289 break;
1290 udelay(1000); /* delay and try again */
1291 }
1292 spin_unlock_irqrestore(&h->lock, flags);
1293 if (i >= MAX_IOCTL_CONFIG_WAIT)
1294 return -EAGAIN;
1295 return 0;
1296}
1350 1297
1351 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { 1298static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
1352 if (!(readl(h->vaddr + SA5_DOORBELL) 1299{
1353 & CFGTBL_ChangeReq)) 1300 NodeName_type NodeName;
1354 break; 1301 int i;
1355 /* delay and try again */
1356 udelay(1000);
1357 }
1358 spin_unlock_irqrestore(&h->lock, flags);
1359 if (i >= MAX_IOCTL_CONFIG_WAIT)
1360 return -EAGAIN;
1361 return 0;
1362 }
1363 1302
1364 case CCISS_GETHEARTBEAT: 1303 if (!argp)
1365 { 1304 return -EINVAL;
1366 Heartbeat_type heartbeat; 1305 for (i = 0; i < 16; i++)
1367 1306 NodeName[i] = readb(&h->cfgtable->ServerName[i]);
1368 if (!arg) 1307 if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
1369 return -EINVAL; 1308 return -EFAULT;
1370 heartbeat = readl(&h->cfgtable->HeartBeat); 1309 return 0;
1371 if (copy_to_user 1310}
1372 (argp, &heartbeat, sizeof(Heartbeat_type)))
1373 return -EFAULT;
1374 return 0;
1375 }
1376 case CCISS_GETBUSTYPES:
1377 {
1378 BusTypes_type BusTypes;
1379
1380 if (!arg)
1381 return -EINVAL;
1382 BusTypes = readl(&h->cfgtable->BusTypes);
1383 if (copy_to_user
1384 (argp, &BusTypes, sizeof(BusTypes_type)))
1385 return -EFAULT;
1386 return 0;
1387 }
1388 case CCISS_GETFIRMVER:
1389 {
1390 FirmwareVer_type firmware;
1391 1311
1392 if (!arg) 1312static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
1393 return -EINVAL; 1313{
1394 memcpy(firmware, h->firm_ver, 4); 1314 NodeName_type NodeName;
1315 unsigned long flags;
1316 int i;
1395 1317
1396 if (copy_to_user 1318 if (!argp)
1397 (argp, firmware, sizeof(FirmwareVer_type))) 1319 return -EINVAL;
1398 return -EFAULT; 1320 if (!capable(CAP_SYS_ADMIN))
1399 return 0; 1321 return -EPERM;
1400 } 1322 if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
1401 case CCISS_GETDRIVVER: 1323 return -EFAULT;
1402 { 1324 spin_lock_irqsave(&h->lock, flags);
1403 DriverVer_type DriverVer = DRIVER_VERSION; 1325 /* Update the field, and then ring the doorbell */
1326 for (i = 0; i < 16; i++)
1327 writeb(NodeName[i], &h->cfgtable->ServerName[i]);
1328 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
1329 for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
1330 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
1331 break;
1332 udelay(1000); /* delay and try again */
1333 }
1334 spin_unlock_irqrestore(&h->lock, flags);
1335 if (i >= MAX_IOCTL_CONFIG_WAIT)
1336 return -EAGAIN;
1337 return 0;
1338}
1404 1339
1405 if (!arg) 1340static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
1406 return -EINVAL; 1341{
1342 Heartbeat_type heartbeat;
1407 1343
1408 if (copy_to_user 1344 if (!argp)
1409 (argp, &DriverVer, sizeof(DriverVer_type))) 1345 return -EINVAL;
1410 return -EFAULT; 1346 heartbeat = readl(&h->cfgtable->HeartBeat);
1411 return 0; 1347 if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
1412 } 1348 return -EFAULT;
1349 return 0;
1350}
1413 1351
1414 case CCISS_DEREGDISK: 1352static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
1415 case CCISS_REGNEWD: 1353{
1416 case CCISS_REVALIDVOLS: 1354 BusTypes_type BusTypes;
1417 return rebuild_lun_table(h, 0, 1); 1355
1356 if (!argp)
1357 return -EINVAL;
1358 BusTypes = readl(&h->cfgtable->BusTypes);
1359 if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
1360 return -EFAULT;
1361 return 0;
1362}
1418 1363
1419 case CCISS_GETLUNINFO:{ 1364static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
1420 LogvolInfo_struct luninfo; 1365{
1366 FirmwareVer_type firmware;
1421 1367
1422 memcpy(&luninfo.LunID, drv->LunID, 1368 if (!argp)
1423 sizeof(luninfo.LunID)); 1369 return -EINVAL;
1424 luninfo.num_opens = drv->usage_count; 1370 memcpy(firmware, h->firm_ver, 4);
1425 luninfo.num_parts = 0; 1371
1426 if (copy_to_user(argp, &luninfo, 1372 if (copy_to_user
1427 sizeof(LogvolInfo_struct))) 1373 (argp, firmware, sizeof(FirmwareVer_type)))
1428 return -EFAULT; 1374 return -EFAULT;
1429 return 0; 1375 return 0;
1376}
1377
1378static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
1379{
1380 DriverVer_type DriverVer = DRIVER_VERSION;
1381
1382 if (!argp)
1383 return -EINVAL;
1384 if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
1385 return -EFAULT;
1386 return 0;
1387}
1388
1389static int cciss_getluninfo(ctlr_info_t *h,
1390 struct gendisk *disk, void __user *argp)
1391{
1392 LogvolInfo_struct luninfo;
1393 drive_info_struct *drv = get_drv(disk);
1394
1395 if (!argp)
1396 return -EINVAL;
1397 memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
1398 luninfo.num_opens = drv->usage_count;
1399 luninfo.num_parts = 0;
1400 if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
1401 return -EFAULT;
1402 return 0;
1403}
1404
1405static int cciss_passthru(ctlr_info_t *h, void __user *argp)
1406{
1407 IOCTL_Command_struct iocommand;
1408 CommandList_struct *c;
1409 char *buff = NULL;
1410 u64bit temp64;
1411 DECLARE_COMPLETION_ONSTACK(wait);
1412
1413 if (!argp)
1414 return -EINVAL;
1415
1416 if (!capable(CAP_SYS_RAWIO))
1417 return -EPERM;
1418
1419 if (copy_from_user
1420 (&iocommand, argp, sizeof(IOCTL_Command_struct)))
1421 return -EFAULT;
1422 if ((iocommand.buf_size < 1) &&
1423 (iocommand.Request.Type.Direction != XFER_NONE)) {
1424 return -EINVAL;
1425 }
1426 if (iocommand.buf_size > 0) {
1427 buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
1428 if (buff == NULL)
1429 return -EFAULT;
1430 }
1431 if (iocommand.Request.Type.Direction == XFER_WRITE) {
1432 /* Copy the data into the buffer we created */
1433 if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
1434 kfree(buff);
1435 return -EFAULT;
1430 } 1436 }
1431 case CCISS_PASSTHRU: 1437 } else {
1432 { 1438 memset(buff, 0, iocommand.buf_size);
1433 IOCTL_Command_struct iocommand; 1439 }
1434 CommandList_struct *c; 1440 c = cmd_special_alloc(h);
1435 char *buff = NULL; 1441 if (!c) {
1436 u64bit temp64; 1442 kfree(buff);
1437 DECLARE_COMPLETION_ONSTACK(wait); 1443 return -ENOMEM;
1438 1444 }
1439 if (!arg) 1445 /* Fill in the command type */
1440 return -EINVAL; 1446 c->cmd_type = CMD_IOCTL_PEND;
1441 1447 /* Fill in Command Header */
1442 if (!capable(CAP_SYS_RAWIO)) 1448 c->Header.ReplyQueue = 0; /* unused in simple mode */
1443 return -EPERM; 1449 if (iocommand.buf_size > 0) { /* buffer to fill */
1444 1450 c->Header.SGList = 1;
1445 if (copy_from_user 1451 c->Header.SGTotal = 1;
1446 (&iocommand, argp, sizeof(IOCTL_Command_struct))) 1452 } else { /* no buffers to fill */
1447 return -EFAULT; 1453 c->Header.SGList = 0;
1448 if ((iocommand.buf_size < 1) && 1454 c->Header.SGTotal = 0;
1449 (iocommand.Request.Type.Direction != XFER_NONE)) { 1455 }
1450 return -EINVAL; 1456 c->Header.LUN = iocommand.LUN_info;
1451 } 1457 /* use the kernel address the cmd block for tag */
1452#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */ 1458 c->Header.Tag.lower = c->busaddr;
1453 /* Check kmalloc limits */
1454 if (iocommand.buf_size > 128000)
1455 return -EINVAL;
1456#endif
1457 if (iocommand.buf_size > 0) {
1458 buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
1459 if (buff == NULL)
1460 return -EFAULT;
1461 }
1462 if (iocommand.Request.Type.Direction == XFER_WRITE) {
1463 /* Copy the data into the buffer we created */
1464 if (copy_from_user
1465 (buff, iocommand.buf, iocommand.buf_size)) {
1466 kfree(buff);
1467 return -EFAULT;
1468 }
1469 } else {
1470 memset(buff, 0, iocommand.buf_size);
1471 }
1472 c = cmd_special_alloc(h);
1473 if (!c) {
1474 kfree(buff);
1475 return -ENOMEM;
1476 }
1477 /* Fill in the command type */
1478 c->cmd_type = CMD_IOCTL_PEND;
1479 /* Fill in Command Header */
1480 c->Header.ReplyQueue = 0; /* unused in simple mode */
1481 if (iocommand.buf_size > 0) /* buffer to fill */
1482 {
1483 c->Header.SGList = 1;
1484 c->Header.SGTotal = 1;
1485 } else /* no buffers to fill */
1486 {
1487 c->Header.SGList = 0;
1488 c->Header.SGTotal = 0;
1489 }
1490 c->Header.LUN = iocommand.LUN_info;
1491 /* use the kernel address the cmd block for tag */
1492 c->Header.Tag.lower = c->busaddr;
1493
1494 /* Fill in Request block */
1495 c->Request = iocommand.Request;
1496
1497 /* Fill in the scatter gather information */
1498 if (iocommand.buf_size > 0) {
1499 temp64.val = pci_map_single(h->pdev, buff,
1500 iocommand.buf_size,
1501 PCI_DMA_BIDIRECTIONAL);
1502 c->SG[0].Addr.lower = temp64.val32.lower;
1503 c->SG[0].Addr.upper = temp64.val32.upper;
1504 c->SG[0].Len = iocommand.buf_size;
1505 c->SG[0].Ext = 0; /* we are not chaining */
1506 }
1507 c->waiting = &wait;
1508 1459
1509 enqueue_cmd_and_start_io(h, c); 1460 /* Fill in Request block */
1510 wait_for_completion(&wait); 1461 c->Request = iocommand.Request;
1511 1462
1512 /* unlock the buffers from DMA */ 1463 /* Fill in the scatter gather information */
1513 temp64.val32.lower = c->SG[0].Addr.lower; 1464 if (iocommand.buf_size > 0) {
1514 temp64.val32.upper = c->SG[0].Addr.upper; 1465 temp64.val = pci_map_single(h->pdev, buff,
1515 pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, 1466 iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
1516 iocommand.buf_size, 1467 c->SG[0].Addr.lower = temp64.val32.lower;
1517 PCI_DMA_BIDIRECTIONAL); 1468 c->SG[0].Addr.upper = temp64.val32.upper;
1469 c->SG[0].Len = iocommand.buf_size;
1470 c->SG[0].Ext = 0; /* we are not chaining */
1471 }
1472 c->waiting = &wait;
1518 1473
1519 check_ioctl_unit_attention(h, c); 1474 enqueue_cmd_and_start_io(h, c);
1475 wait_for_completion(&wait);
1520 1476
1521 /* Copy the error information out */ 1477 /* unlock the buffers from DMA */
1522 iocommand.error_info = *(c->err_info); 1478 temp64.val32.lower = c->SG[0].Addr.lower;
1523 if (copy_to_user 1479 temp64.val32.upper = c->SG[0].Addr.upper;
1524 (argp, &iocommand, sizeof(IOCTL_Command_struct))) { 1480 pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
1525 kfree(buff); 1481 PCI_DMA_BIDIRECTIONAL);
1526 cmd_special_free(h, c); 1482 check_ioctl_unit_attention(h, c);
1527 return -EFAULT; 1483
1528 } 1484 /* Copy the error information out */
1485 iocommand.error_info = *(c->err_info);
1486 if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
1487 kfree(buff);
1488 cmd_special_free(h, c);
1489 return -EFAULT;
1490 }
1529 1491
1530 if (iocommand.Request.Type.Direction == XFER_READ) { 1492 if (iocommand.Request.Type.Direction == XFER_READ) {
1531 /* Copy the data out of the buffer we created */ 1493 /* Copy the data out of the buffer we created */
1532 if (copy_to_user 1494 if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
1533 (iocommand.buf, buff, iocommand.buf_size)) {
1534 kfree(buff);
1535 cmd_special_free(h, c);
1536 return -EFAULT;
1537 }
1538 }
1539 kfree(buff); 1495 kfree(buff);
1540 cmd_special_free(h, c); 1496 cmd_special_free(h, c);
1541 return 0; 1497 return -EFAULT;
1542 } 1498 }
1543 case CCISS_BIG_PASSTHRU:{ 1499 }
1544 BIG_IOCTL_Command_struct *ioc; 1500 kfree(buff);
1545 CommandList_struct *c; 1501 cmd_special_free(h, c);
1546 unsigned char **buff = NULL; 1502 return 0;
1547 int *buff_size = NULL; 1503}
1548 u64bit temp64; 1504
1549 BYTE sg_used = 0; 1505static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
1550 int status = 0; 1506{
1551 int i; 1507 BIG_IOCTL_Command_struct *ioc;
1552 DECLARE_COMPLETION_ONSTACK(wait); 1508 CommandList_struct *c;
1553 __u32 left; 1509 unsigned char **buff = NULL;
1554 __u32 sz; 1510 int *buff_size = NULL;
1555 BYTE __user *data_ptr; 1511 u64bit temp64;
1556 1512 BYTE sg_used = 0;
1557 if (!arg) 1513 int status = 0;
1558 return -EINVAL; 1514 int i;
1559 if (!capable(CAP_SYS_RAWIO)) 1515 DECLARE_COMPLETION_ONSTACK(wait);
1560 return -EPERM; 1516 __u32 left;
1561 ioc = (BIG_IOCTL_Command_struct *) 1517 __u32 sz;
1562 kmalloc(sizeof(*ioc), GFP_KERNEL); 1518 BYTE __user *data_ptr;
1563 if (!ioc) { 1519
1564 status = -ENOMEM; 1520 if (!argp)
1565 goto cleanup1; 1521 return -EINVAL;
1566 } 1522 if (!capable(CAP_SYS_RAWIO))
1567 if (copy_from_user(ioc, argp, sizeof(*ioc))) { 1523 return -EPERM;
1524 ioc = (BIG_IOCTL_Command_struct *)
1525 kmalloc(sizeof(*ioc), GFP_KERNEL);
1526 if (!ioc) {
1527 status = -ENOMEM;
1528 goto cleanup1;
1529 }
1530 if (copy_from_user(ioc, argp, sizeof(*ioc))) {
1531 status = -EFAULT;
1532 goto cleanup1;
1533 }
1534 if ((ioc->buf_size < 1) &&
1535 (ioc->Request.Type.Direction != XFER_NONE)) {
1536 status = -EINVAL;
1537 goto cleanup1;
1538 }
1539 /* Check kmalloc limits using all SGs */
1540 if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
1541 status = -EINVAL;
1542 goto cleanup1;
1543 }
1544 if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
1545 status = -EINVAL;
1546 goto cleanup1;
1547 }
1548 buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
1549 if (!buff) {
1550 status = -ENOMEM;
1551 goto cleanup1;
1552 }
1553 buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
1554 if (!buff_size) {
1555 status = -ENOMEM;
1556 goto cleanup1;
1557 }
1558 left = ioc->buf_size;
1559 data_ptr = ioc->buf;
1560 while (left) {
1561 sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
1562 buff_size[sg_used] = sz;
1563 buff[sg_used] = kmalloc(sz, GFP_KERNEL);
1564 if (buff[sg_used] == NULL) {
1565 status = -ENOMEM;
1566 goto cleanup1;
1567 }
1568 if (ioc->Request.Type.Direction == XFER_WRITE) {
1569 if (copy_from_user(buff[sg_used], data_ptr, sz)) {
1568 status = -EFAULT; 1570 status = -EFAULT;
1569 goto cleanup1; 1571 goto cleanup1;
1570 } 1572 }
1571 if ((ioc->buf_size < 1) && 1573 } else {
1572 (ioc->Request.Type.Direction != XFER_NONE)) { 1574 memset(buff[sg_used], 0, sz);
1573 status = -EINVAL; 1575 }
1574 goto cleanup1; 1576 left -= sz;
1575 } 1577 data_ptr += sz;
1576 /* Check kmalloc limits using all SGs */ 1578 sg_used++;
1577 if (ioc->malloc_size > MAX_KMALLOC_SIZE) { 1579 }
1578 status = -EINVAL; 1580 c = cmd_special_alloc(h);
1579 goto cleanup1; 1581 if (!c) {
1580 } 1582 status = -ENOMEM;
1581 if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { 1583 goto cleanup1;
1582 status = -EINVAL; 1584 }
1583 goto cleanup1; 1585 c->cmd_type = CMD_IOCTL_PEND;
1584 } 1586 c->Header.ReplyQueue = 0;
1585 buff = 1587 c->Header.SGList = sg_used;
1586 kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); 1588 c->Header.SGTotal = sg_used;
1587 if (!buff) { 1589 c->Header.LUN = ioc->LUN_info;
1588 status = -ENOMEM; 1590 c->Header.Tag.lower = c->busaddr;
1589 goto cleanup1;
1590 }
1591 buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
1592 GFP_KERNEL);
1593 if (!buff_size) {
1594 status = -ENOMEM;
1595 goto cleanup1;
1596 }
1597 left = ioc->buf_size;
1598 data_ptr = ioc->buf;
1599 while (left) {
1600 sz = (left >
1601 ioc->malloc_size) ? ioc->
1602 malloc_size : left;
1603 buff_size[sg_used] = sz;
1604 buff[sg_used] = kmalloc(sz, GFP_KERNEL);
1605 if (buff[sg_used] == NULL) {
1606 status = -ENOMEM;
1607 goto cleanup1;
1608 }
1609 if (ioc->Request.Type.Direction == XFER_WRITE) {
1610 if (copy_from_user
1611 (buff[sg_used], data_ptr, sz)) {
1612 status = -EFAULT;
1613 goto cleanup1;
1614 }
1615 } else {
1616 memset(buff[sg_used], 0, sz);
1617 }
1618 left -= sz;
1619 data_ptr += sz;
1620 sg_used++;
1621 }
1622 c = cmd_special_alloc(h);
1623 if (!c) {
1624 status = -ENOMEM;
1625 goto cleanup1;
1626 }
1627 c->cmd_type = CMD_IOCTL_PEND;
1628 c->Header.ReplyQueue = 0;
1629 1591
1630 if (ioc->buf_size > 0) { 1592 c->Request = ioc->Request;
1631 c->Header.SGList = sg_used; 1593 for (i = 0; i < sg_used; i++) {
1632 c->Header.SGTotal = sg_used; 1594 temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
1633 } else { 1595 PCI_DMA_BIDIRECTIONAL);
1634 c->Header.SGList = 0; 1596 c->SG[i].Addr.lower = temp64.val32.lower;
1635 c->Header.SGTotal = 0; 1597 c->SG[i].Addr.upper = temp64.val32.upper;
1636 } 1598 c->SG[i].Len = buff_size[i];
1637 c->Header.LUN = ioc->LUN_info; 1599 c->SG[i].Ext = 0; /* we are not chaining */
1638 c->Header.Tag.lower = c->busaddr; 1600 }
1639 1601 c->waiting = &wait;
1640 c->Request = ioc->Request; 1602 enqueue_cmd_and_start_io(h, c);
1641 if (ioc->buf_size > 0) { 1603 wait_for_completion(&wait);
1642 for (i = 0; i < sg_used; i++) { 1604 /* unlock the buffers from DMA */
1643 temp64.val = 1605 for (i = 0; i < sg_used; i++) {
1644 pci_map_single(h->pdev, buff[i], 1606 temp64.val32.lower = c->SG[i].Addr.lower;
1645 buff_size[i], 1607 temp64.val32.upper = c->SG[i].Addr.upper;
1646 PCI_DMA_BIDIRECTIONAL); 1608 pci_unmap_single(h->pdev,
1647 c->SG[i].Addr.lower = 1609 (dma_addr_t) temp64.val, buff_size[i],
1648 temp64.val32.lower; 1610 PCI_DMA_BIDIRECTIONAL);
1649 c->SG[i].Addr.upper = 1611 }
1650 temp64.val32.upper; 1612 check_ioctl_unit_attention(h, c);
1651 c->SG[i].Len = buff_size[i]; 1613 /* Copy the error information out */
1652 c->SG[i].Ext = 0; /* we are not chaining */ 1614 ioc->error_info = *(c->err_info);
1653 } 1615 if (copy_to_user(argp, ioc, sizeof(*ioc))) {
1654 } 1616 cmd_special_free(h, c);
1655 c->waiting = &wait; 1617 status = -EFAULT;
1656 enqueue_cmd_and_start_io(h, c); 1618 goto cleanup1;
1657 wait_for_completion(&wait); 1619 }
1658 /* unlock the buffers from DMA */ 1620 if (ioc->Request.Type.Direction == XFER_READ) {
1659 for (i = 0; i < sg_used; i++) { 1621 /* Copy the data out of the buffer we created */
1660 temp64.val32.lower = c->SG[i].Addr.lower; 1622 BYTE __user *ptr = ioc->buf;
1661 temp64.val32.upper = c->SG[i].Addr.upper; 1623 for (i = 0; i < sg_used; i++) {
1662 pci_unmap_single(h->pdev, 1624 if (copy_to_user(ptr, buff[i], buff_size[i])) {
1663 (dma_addr_t) temp64.val, buff_size[i],
1664 PCI_DMA_BIDIRECTIONAL);
1665 }
1666 check_ioctl_unit_attention(h, c);
1667 /* Copy the error information out */
1668 ioc->error_info = *(c->err_info);
1669 if (copy_to_user(argp, ioc, sizeof(*ioc))) {
1670 cmd_special_free(h, c); 1625 cmd_special_free(h, c);
1671 status = -EFAULT; 1626 status = -EFAULT;
1672 goto cleanup1; 1627 goto cleanup1;
1673 } 1628 }
1674 if (ioc->Request.Type.Direction == XFER_READ) { 1629 ptr += buff_size[i];
1675 /* Copy the data out of the buffer we created */
1676 BYTE __user *ptr = ioc->buf;
1677 for (i = 0; i < sg_used; i++) {
1678 if (copy_to_user
1679 (ptr, buff[i], buff_size[i])) {
1680 cmd_special_free(h, c);
1681 status = -EFAULT;
1682 goto cleanup1;
1683 }
1684 ptr += buff_size[i];
1685 }
1686 }
1687 cmd_special_free(h, c);
1688 status = 0;
1689 cleanup1:
1690 if (buff) {
1691 for (i = 0; i < sg_used; i++)
1692 kfree(buff[i]);
1693 kfree(buff);
1694 }
1695 kfree(buff_size);
1696 kfree(ioc);
1697 return status;
1698 } 1630 }
1631 }
1632 cmd_special_free(h, c);
1633 status = 0;
1634cleanup1:
1635 if (buff) {
1636 for (i = 0; i < sg_used; i++)
1637 kfree(buff[i]);
1638 kfree(buff);
1639 }
1640 kfree(buff_size);
1641 kfree(ioc);
1642 return status;
1643}
1644
1645static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
1646 unsigned int cmd, unsigned long arg)
1647{
1648 struct gendisk *disk = bdev->bd_disk;
1649 ctlr_info_t *h = get_host(disk);
1650 void __user *argp = (void __user *)arg;
1651
1652 dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
1653 cmd, arg);
1654 switch (cmd) {
1655 case CCISS_GETPCIINFO:
1656 return cciss_getpciinfo(h, argp);
1657 case CCISS_GETINTINFO:
1658 return cciss_getintinfo(h, argp);
1659 case CCISS_SETINTINFO:
1660 return cciss_setintinfo(h, argp);
1661 case CCISS_GETNODENAME:
1662 return cciss_getnodename(h, argp);
1663 case CCISS_SETNODENAME:
1664 return cciss_setnodename(h, argp);
1665 case CCISS_GETHEARTBEAT:
1666 return cciss_getheartbeat(h, argp);
1667 case CCISS_GETBUSTYPES:
1668 return cciss_getbustypes(h, argp);
1669 case CCISS_GETFIRMVER:
1670 return cciss_getfirmver(h, argp);
1671 case CCISS_GETDRIVVER:
1672 return cciss_getdrivver(h, argp);
1673 case CCISS_DEREGDISK:
1674 case CCISS_REGNEWD:
1675 case CCISS_REVALIDVOLS:
1676 return rebuild_lun_table(h, 0, 1);
1677 case CCISS_GETLUNINFO:
1678 return cciss_getluninfo(h, disk, argp);
1679 case CCISS_PASSTHRU:
1680 return cciss_passthru(h, argp);
1681 case CCISS_BIG_PASSTHRU:
1682 return cciss_bigpassthru(h, argp);
1699 1683
1700 /* scsi_cmd_ioctl handles these, below, though some are not */ 1684 /* scsi_cmd_ioctl handles these, below, though some are not */
1701 /* very meaningful for cciss. SG_IO is the main one people want. */ 1685 /* very meaningful for cciss. SG_IO is the main one people want. */
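The cciss hunk above turns one monolithic ioctl body into small per-command helpers; the switch in the new cciss_ioctl() only dispatches. A minimal userspace sketch of that dispatch shape (hypothetical command codes and helpers, not the driver code) looks like this:

#include <stdio.h>

#define DEMO_GETINFO  0x01   /* hypothetical command codes */
#define DEMO_PASSTHRU 0x02

static int demo_getinfo(void *arg)  { (void)arg; puts("getinfo");  return 0; }
static int demo_passthru(void *arg) { (void)arg; puts("passthru"); return 0; }

/* each case is one line: all real work lives in the helper */
static int demo_ioctl(unsigned int cmd, void *arg)
{
	switch (cmd) {
	case DEMO_GETINFO:
		return demo_getinfo(arg);
	case DEMO_PASSTHRU:
		return demo_passthru(arg);
	default:
		return -1;	/* unknown command */
	}
}

int main(void)
{
	return demo_ioctl(DEMO_PASSTHRU, NULL);
}

The payoff is the same as in the driver: each helper owns its own cleanup path instead of sharing one giant function's labels.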
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index d53b0291c44b..946dad4caef3 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -35,7 +35,7 @@
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/hdreg.h> 37#include <linux/hdreg.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/spinlock.h> 39#include <linux/spinlock.h>
40#include <linux/blkdev.h> 40#include <linux/blkdev.h>
41#include <linux/genhd.h> 41#include <linux/genhd.h>
@@ -68,6 +68,7 @@ MODULE_LICENSE("GPL");
68 68
69#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */ 69#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */
70 70
71static DEFINE_MUTEX(cpqarray_mutex);
71static int nr_ctlr; 72static int nr_ctlr;
72static ctlr_info_t *hba[MAX_CTLR]; 73static ctlr_info_t *hba[MAX_CTLR];
73 74
@@ -845,9 +846,9 @@ static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
845{ 846{
846 int ret; 847 int ret;
847 848
848 lock_kernel(); 849 mutex_lock(&cpqarray_mutex);
849 ret = ida_open(bdev, mode); 850 ret = ida_open(bdev, mode);
850 unlock_kernel(); 851 mutex_unlock(&cpqarray_mutex);
851 852
852 return ret; 853 return ret;
853} 854}
@@ -859,10 +860,10 @@ static int ida_release(struct gendisk *disk, fmode_t mode)
859{ 860{
860 ctlr_info_t *host; 861 ctlr_info_t *host;
861 862
862 lock_kernel(); 863 mutex_lock(&cpqarray_mutex);
863 host = get_host(disk); 864 host = get_host(disk);
864 host->usage_count--; 865 host->usage_count--;
865 unlock_kernel(); 866 mutex_unlock(&cpqarray_mutex);
866 867
867 return 0; 868 return 0;
868} 869}
@@ -1217,9 +1218,9 @@ static int ida_ioctl(struct block_device *bdev, fmode_t mode,
1217{ 1218{
1218 int ret; 1219 int ret;
1219 1220
1220 lock_kernel(); 1221 mutex_lock(&cpqarray_mutex);
1221 ret = ida_locked_ioctl(bdev, mode, cmd, param); 1222 ret = ida_locked_ioctl(bdev, mode, cmd, param);
1222 unlock_kernel(); 1223 mutex_unlock(&cpqarray_mutex);
1223 1224
1224 return ret; 1225 return ret;
1225} 1226}
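The cpqarray hunks show the recurring pattern of this series: a file-scope mutex takes over the serialization the big kernel lock used to provide around open/release/ioctl. The wrapper shape can be modelled in plain C with a pthread mutex; names below are hypothetical, with the kernel equivalents noted in comments.

#include <pthread.h>
#include <stdio.h>

/* kernel: static DEFINE_MUTEX(cpqarray_mutex); */
static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;

static int demo_locked_open(void)
{
	puts("open");		/* the original, still-serialized open path */
	return 0;
}

static int demo_unlocked_open(void)
{
	int ret;

	pthread_mutex_lock(&demo_mutex);	/* kernel: mutex_lock(), was lock_kernel() */
	ret = demo_locked_open();
	pthread_mutex_unlock(&demo_mutex);	/* kernel: mutex_unlock(), was unlock_kernel() */

	return ret;
}

int main(void)
{
	return demo_unlocked_open();
}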
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9400845d602e..ac04ef97eac2 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
965 * ok, (capacity & 7) != 0 sometimes, but who cares... 965 * ok, (capacity & 7) != 0 sometimes, but who cares...
966 * we count rs_{total,left} in bits, not sectors. 966 * we count rs_{total,left} in bits, not sectors.
967 */ 967 */
968 spin_lock_irqsave(&mdev->al_lock, flags);
969 count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 968 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
970 if (count) { 969 if (count && get_ldev(mdev)) {
971 /* we need the lock for drbd_try_clear_on_disk_bm */ 970 unsigned long now = jiffies;
972 if (jiffies - mdev->rs_mark_time > HZ*10) { 971 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
973 /* should be rolling marks, 972 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
974 * but we estimate only anyways. */ 973 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
975 if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && 974 unsigned long tw = drbd_bm_total_weight(mdev);
975 if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
976 mdev->state.conn != C_PAUSED_SYNC_T && 976 mdev->state.conn != C_PAUSED_SYNC_T &&
977 mdev->state.conn != C_PAUSED_SYNC_S) { 977 mdev->state.conn != C_PAUSED_SYNC_S) {
978 mdev->rs_mark_time = jiffies; 978 mdev->rs_mark_time[next] = now;
979 mdev->rs_mark_left = drbd_bm_total_weight(mdev); 979 mdev->rs_mark_left[next] = tw;
980 mdev->rs_last_mark = next;
980 } 981 }
981 } 982 }
982 if (get_ldev(mdev)) { 983 spin_lock_irqsave(&mdev->al_lock, flags);
983 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); 984 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
984 put_ldev(mdev); 985 spin_unlock_irqrestore(&mdev->al_lock, flags);
985 } 986
986 /* just wake_up unconditional now, various lc_chaged(), 987 /* just wake_up unconditional now, various lc_chaged(),
987 * lc_put() in drbd_try_clear_on_disk_bm(). */ 988 * lc_put() in drbd_try_clear_on_disk_bm(). */
988 wake_up = 1; 989 wake_up = 1;
990 put_ldev(mdev);
989 } 991 }
990 spin_unlock_irqrestore(&mdev->al_lock, flags);
991 if (wake_up) 992 if (wake_up)
992 wake_up(&mdev->al_wait); 993 wake_up(&mdev->al_wait);
993} 994}
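The rewritten __drbd_set_in_sync keeps a small ring of (timestamp, bits-left) marks instead of a single pair, so the resync rate can be averaged over the last few three-second intervals. A standalone sketch of such a ring (userspace, seconds instead of jiffies, hypothetical helper names):

#include <stdio.h>
#include <time.h>

#define SYNC_MARKS     8	/* ring size, mirrors DRBD_SYNC_MARKS        */
#define SYNC_MARK_STEP 3	/* seconds between marks (3*HZ in the driver) */

struct sync_marks {
	unsigned long left[SYNC_MARKS];	/* work still to do at each mark */
	time_t        when[SYNC_MARKS];	/* time the mark was taken       */
	int           last;		/* index of the newest mark      */
};

/* record a new mark only if at least one step has elapsed */
static void mark_progress(struct sync_marks *m, unsigned long left, time_t now)
{
	int next = (m->last + 1) % SYNC_MARKS;

	if (now >= m->when[m->last] + SYNC_MARK_STEP) {
		m->when[next] = now;
		m->left[next] = left;
		m->last = next;
	}
}

/* average rate over the whole ring: oldest mark vs. newest mark */
static double rate(const struct sync_marks *m)
{
	int oldest = (m->last + 1) % SYNC_MARKS;
	double dt = difftime(m->when[m->last], m->when[oldest]);

	return dt > 0 ? (m->left[oldest] - m->left[m->last]) / dt : 0.0;
}

int main(void)
{
	struct sync_marks m = { .left = { 1000 }, .when = { 0 }, .last = 0 };

	for (time_t t = 3; t <= 24; t += 3)
		mark_progress(&m, 1000 - 10 * t, t);	/* 10 units/second */

	printf("estimated rate: %.1f units/s\n", rate(&m));
	return 0;
}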
@@ -1118,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
1118 * @mdev: DRBD device. 1119 * @mdev: DRBD device.
1119 * @sector: The sector number. 1120 * @sector: The sector number.
1120 * 1121 *
1121 * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. 1122 * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
1122 */ 1123 */
1123int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 1124int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1124{ 1125{
@@ -1129,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1129 sig = wait_event_interruptible(mdev->al_wait, 1130 sig = wait_event_interruptible(mdev->al_wait,
1130 (bm_ext = _bme_get(mdev, enr))); 1131 (bm_ext = _bme_get(mdev, enr)));
1131 if (sig) 1132 if (sig)
1132 return 0; 1133 return -EINTR;
1133 1134
1134 if (test_bit(BME_LOCKED, &bm_ext->flags)) 1135 if (test_bit(BME_LOCKED, &bm_ext->flags))
1135 return 1; 1136 return 0;
1136 1137
1137 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1138 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1138 sig = wait_event_interruptible(mdev->al_wait, 1139 sig = wait_event_interruptible(mdev->al_wait,
@@ -1145,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1145 wake_up(&mdev->al_wait); 1146 wake_up(&mdev->al_wait);
1146 } 1147 }
1147 spin_unlock_irq(&mdev->al_lock); 1148 spin_unlock_irq(&mdev->al_lock);
1148 return 0; 1149 return -EINTR;
1149 } 1150 }
1150 } 1151 }
1151
1152 set_bit(BME_LOCKED, &bm_ext->flags); 1152 set_bit(BME_LOCKED, &bm_ext->flags);
1153 1153 return 0;
1154 return 1;
1155} 1154}
1156 1155
1157/** 1156/**
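drbd_rs_begin_io now follows the usual kernel convention: 0 on success, a negative errno (-EINTR) when the wait was interrupted, rather than 1/0. Callers must flip their checks accordingly; a tiny sketch of the new-style caller, with a stub standing in for the real function:

#include <errno.h>
#include <stdio.h>

/* stub: pretend the wait was interrupted by a signal */
static int rs_begin_io(void)
{
	return -EINTR;
}

int main(void)
{
	/* old convention: a zero return meant "interrupted"      */
	/* new convention: any non-zero return is an error code   */
	int err = rs_begin_io();

	if (err) {
		printf("resync request aborted: %d\n", err);
		return 1;
	}
	printf("extent locked, proceed with resync IO\n");
	return 0;
}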
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e3f88d6e1412..fd42832f785b 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
569 * 569 *
570 * maybe bm_set should be atomic_t ? 570 * maybe bm_set should be atomic_t ?
571 */ 571 */
572static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) 572unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
573{ 573{
574 struct drbd_bitmap *b = mdev->bitmap; 574 struct drbd_bitmap *b = mdev->bitmap;
575 unsigned long s; 575 unsigned long s;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..9bdcf4393c0a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
337 * NOTE that the payload starts at a long aligned offset, 337 * NOTE that the payload starts at a long aligned offset,
338 * regardless of 32 or 64 bit arch! 338 * regardless of 32 or 64 bit arch!
339 */ 339 */
340struct p_header { 340struct p_header80 {
341 u32 magic; 341 u32 magic;
342 u16 command; 342 u16 command;
343 u16 length; /* bytes of data after this header */ 343 u16 length; /* bytes of data after this header */
344 u8 payload[0]; 344 u8 payload[0];
345} __packed; 345} __packed;
346/* 8 bytes. packet FIXED for the next century! */ 346
347/* Header for big packets, Used for data packets exceeding 64kB */
348struct p_header95 {
349 u16 magic; /* use DRBD_MAGIC_BIG here */
350 u16 command;
351 u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
352 u8 payload[0];
353} __packed;
354
355union p_header {
356 struct p_header80 h80;
357 struct p_header95 h95;
358};
347 359
348/* 360/*
349 * short commands, packets without payload, plain p_header: 361 * short commands, packets without payload, plain p_header:
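The old 8-byte header (now p_header80) carries a 16-bit length and therefore caps a packet at 64 KiB of payload; p_header95 widens the length field to 32 bits of which only the low 24 are meaningful. A self-contained sketch of the two layouts and of masking the big-packet length (plain C, host byte order, not the on-the-wire code):

#include <stdint.h>
#include <stdio.h>

struct hdr80 {			/* mirrors p_header80: fixed 8 bytes */
	uint32_t magic;
	uint16_t command;
	uint16_t length;	/* payload bytes after the header    */
} __attribute__((packed));

struct hdr95 {			/* mirrors p_header95: for > 64 KiB  */
	uint16_t magic;		/* DRBD_MAGIC_BIG in the real packet */
	uint16_t command;
	uint32_t length;	/* only the low 24 bits are used     */
} __attribute__((packed));

int main(void)
{
	struct hdr95 h = { .magic = 0, .command = 0, .length = 0x01020304 };

	printf("sizeof hdr80 = %zu, sizeof hdr95 = %zu\n",
	       sizeof(struct hdr80), sizeof(struct hdr95));
	printf("big-packet payload length = %u bytes\n",
	       h.length & 0x00ffffffu);	/* ignore the highest 8 bits */
	return 0;
}

Both layouts stay 8 bytes, so the receive path can peek at the magic first and then decide which header it is looking at.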
@@ -362,12 +374,16 @@ struct p_header {
362 */ 374 */
363 375
364/* these defines must not be changed without changing the protocol version */ 376/* these defines must not be changed without changing the protocol version */
365#define DP_HARDBARRIER 1 377#define DP_HARDBARRIER 1 /* depricated */
366#define DP_RW_SYNC 2 378#define DP_RW_SYNC 2 /* equals REQ_SYNC */
367#define DP_MAY_SET_IN_SYNC 4 379#define DP_MAY_SET_IN_SYNC 4
380#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
381#define DP_FUA 16 /* equals REQ_FUA */
382#define DP_FLUSH 32 /* equals REQ_FLUSH */
383#define DP_DISCARD 64 /* equals REQ_DISCARD */
368 384
369struct p_data { 385struct p_data {
370 struct p_header head; 386 union p_header head;
371 u64 sector; /* 64 bits sector number */ 387 u64 sector; /* 64 bits sector number */
372 u64 block_id; /* to identify the request in protocol B&C */ 388 u64 block_id; /* to identify the request in protocol B&C */
373 u32 seq_num; 389 u32 seq_num;
@@ -383,7 +399,7 @@ struct p_data {
383 * P_DATA_REQUEST, P_RS_DATA_REQUEST 399 * P_DATA_REQUEST, P_RS_DATA_REQUEST
384 */ 400 */
385struct p_block_ack { 401struct p_block_ack {
386 struct p_header head; 402 struct p_header80 head;
387 u64 sector; 403 u64 sector;
388 u64 block_id; 404 u64 block_id;
389 u32 blksize; 405 u32 blksize;
@@ -392,7 +408,7 @@ struct p_block_ack {
392 408
393 409
394struct p_block_req { 410struct p_block_req {
395 struct p_header head; 411 struct p_header80 head;
396 u64 sector; 412 u64 sector;
397 u64 block_id; 413 u64 block_id;
398 u32 blksize; 414 u32 blksize;
@@ -409,7 +425,7 @@ struct p_block_req {
409 */ 425 */
410 426
411struct p_handshake { 427struct p_handshake {
412 struct p_header head; /* 8 bytes */ 428 struct p_header80 head; /* 8 bytes */
413 u32 protocol_min; 429 u32 protocol_min;
414 u32 feature_flags; 430 u32 feature_flags;
415 u32 protocol_max; 431 u32 protocol_max;
@@ -424,19 +440,19 @@ struct p_handshake {
424/* 80 bytes, FIXED for the next century */ 440/* 80 bytes, FIXED for the next century */
425 441
426struct p_barrier { 442struct p_barrier {
427 struct p_header head; 443 struct p_header80 head;
428 u32 barrier; /* barrier number _handle_ only */ 444 u32 barrier; /* barrier number _handle_ only */
429 u32 pad; /* to multiple of 8 Byte */ 445 u32 pad; /* to multiple of 8 Byte */
430} __packed; 446} __packed;
431 447
432struct p_barrier_ack { 448struct p_barrier_ack {
433 struct p_header head; 449 struct p_header80 head;
434 u32 barrier; 450 u32 barrier;
435 u32 set_size; 451 u32 set_size;
436} __packed; 452} __packed;
437 453
438struct p_rs_param { 454struct p_rs_param {
439 struct p_header head; 455 struct p_header80 head;
440 u32 rate; 456 u32 rate;
441 457
442 /* Since protocol version 88 and higher. */ 458 /* Since protocol version 88 and higher. */
@@ -444,20 +460,31 @@ struct p_rs_param {
444} __packed; 460} __packed;
445 461
446struct p_rs_param_89 { 462struct p_rs_param_89 {
447 struct p_header head; 463 struct p_header80 head;
448 u32 rate; 464 u32 rate;
449 /* protocol version 89: */ 465 /* protocol version 89: */
450 char verify_alg[SHARED_SECRET_MAX]; 466 char verify_alg[SHARED_SECRET_MAX];
451 char csums_alg[SHARED_SECRET_MAX]; 467 char csums_alg[SHARED_SECRET_MAX];
452} __packed; 468} __packed;
453 469
470struct p_rs_param_95 {
471 struct p_header80 head;
472 u32 rate;
473 char verify_alg[SHARED_SECRET_MAX];
474 char csums_alg[SHARED_SECRET_MAX];
475 u32 c_plan_ahead;
476 u32 c_delay_target;
477 u32 c_fill_target;
478 u32 c_max_rate;
479} __packed;
480
454enum drbd_conn_flags { 481enum drbd_conn_flags {
455 CF_WANT_LOSE = 1, 482 CF_WANT_LOSE = 1,
456 CF_DRY_RUN = 2, 483 CF_DRY_RUN = 2,
457}; 484};
458 485
459struct p_protocol { 486struct p_protocol {
460 struct p_header head; 487 struct p_header80 head;
461 u32 protocol; 488 u32 protocol;
462 u32 after_sb_0p; 489 u32 after_sb_0p;
463 u32 after_sb_1p; 490 u32 after_sb_1p;
@@ -471,17 +498,17 @@ struct p_protocol {
471} __packed; 498} __packed;
472 499
473struct p_uuids { 500struct p_uuids {
474 struct p_header head; 501 struct p_header80 head;
475 u64 uuid[UI_EXTENDED_SIZE]; 502 u64 uuid[UI_EXTENDED_SIZE];
476} __packed; 503} __packed;
477 504
478struct p_rs_uuid { 505struct p_rs_uuid {
479 struct p_header head; 506 struct p_header80 head;
480 u64 uuid; 507 u64 uuid;
481} __packed; 508} __packed;
482 509
483struct p_sizes { 510struct p_sizes {
484 struct p_header head; 511 struct p_header80 head;
485 u64 d_size; /* size of disk */ 512 u64 d_size; /* size of disk */
486 u64 u_size; /* user requested size */ 513 u64 u_size; /* user requested size */
487 u64 c_size; /* current exported size */ 514 u64 c_size; /* current exported size */
@@ -491,18 +518,18 @@ struct p_sizes {
491} __packed; 518} __packed;
492 519
493struct p_state { 520struct p_state {
494 struct p_header head; 521 struct p_header80 head;
495 u32 state; 522 u32 state;
496} __packed; 523} __packed;
497 524
498struct p_req_state { 525struct p_req_state {
499 struct p_header head; 526 struct p_header80 head;
500 u32 mask; 527 u32 mask;
501 u32 val; 528 u32 val;
502} __packed; 529} __packed;
503 530
504struct p_req_state_reply { 531struct p_req_state_reply {
505 struct p_header head; 532 struct p_header80 head;
506 u32 retcode; 533 u32 retcode;
507} __packed; 534} __packed;
508 535
@@ -517,7 +544,7 @@ struct p_drbd06_param {
517} __packed; 544} __packed;
518 545
519struct p_discard { 546struct p_discard {
520 struct p_header head; 547 struct p_header80 head;
521 u64 block_id; 548 u64 block_id;
522 u32 seq_num; 549 u32 seq_num;
523 u32 pad; 550 u32 pad;
@@ -533,7 +560,7 @@ enum drbd_bitmap_code {
533}; 560};
534 561
535struct p_compressed_bm { 562struct p_compressed_bm {
536 struct p_header head; 563 struct p_header80 head;
537 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code 564 /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
538 * (encoding & 0x80): polarity (set/unset) of first runlength 565 * (encoding & 0x80): polarity (set/unset) of first runlength
539 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits 566 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -544,10 +571,10 @@ struct p_compressed_bm {
544 u8 code[0]; 571 u8 code[0];
545} __packed; 572} __packed;
546 573
547struct p_delay_probe { 574struct p_delay_probe93 {
548 struct p_header head; 575 struct p_header80 head;
549 u32 seq_num; /* sequence number to match the two probe packets */ 576 u32 seq_num; /* sequence number to match the two probe packets */
550 u32 offset; /* usecs the probe got sent after the reference time point */ 577 u32 offset; /* usecs the probe got sent after the reference time point */
551} __packed; 578} __packed;
552 579
553/* DCBP: Drbd Compressed Bitmap Packet ... */ 580/* DCBP: Drbd Compressed Bitmap Packet ... */
@@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
594 * so we need to use the fixed size 4KiB page size 621 * so we need to use the fixed size 4KiB page size
595 * most architechtures have used for a long time. 622 * most architechtures have used for a long time.
596 */ 623 */
597#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) 624#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
598#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) 625#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
599#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) 626#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
600#if (PAGE_SIZE < 4096) 627#if (PAGE_SIZE < 4096)
@@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
603#endif 630#endif
604 631
605union p_polymorph { 632union p_polymorph {
606 struct p_header header; 633 union p_header header;
607 struct p_handshake handshake; 634 struct p_handshake handshake;
608 struct p_data data; 635 struct p_data data;
609 struct p_block_ack block_ack; 636 struct p_block_ack block_ack;
610 struct p_barrier barrier; 637 struct p_barrier barrier;
611 struct p_barrier_ack barrier_ack; 638 struct p_barrier_ack barrier_ack;
612 struct p_rs_param_89 rs_param_89; 639 struct p_rs_param_89 rs_param_89;
640 struct p_rs_param_95 rs_param_95;
613 struct p_protocol protocol; 641 struct p_protocol protocol;
614 struct p_sizes sizes; 642 struct p_sizes sizes;
615 struct p_uuids uuids; 643 struct p_uuids uuids;
@@ -617,6 +645,8 @@ union p_polymorph {
617 struct p_req_state req_state; 645 struct p_req_state req_state;
618 struct p_req_state_reply req_state_reply; 646 struct p_req_state_reply req_state_reply;
619 struct p_block_req block_req; 647 struct p_block_req block_req;
648 struct p_delay_probe93 delay_probe93;
649 struct p_rs_uuid rs_uuid;
620} __packed; 650} __packed;
621 651
622/**********************************************************************/ 652/**********************************************************************/
@@ -697,7 +727,7 @@ struct drbd_tl_epoch {
697 struct list_head requests; /* requests before */ 727 struct list_head requests; /* requests before */
698 struct drbd_tl_epoch *next; /* pointer to the next barrier */ 728 struct drbd_tl_epoch *next; /* pointer to the next barrier */
699 unsigned int br_number; /* the barriers identifier. */ 729 unsigned int br_number; /* the barriers identifier. */
700 int n_req; /* number of requests attached before this barrier */ 730 int n_writes; /* number of requests attached before this barrier */
701}; 731};
702 732
703struct drbd_request; 733struct drbd_request;
@@ -747,7 +777,7 @@ struct digest_info {
747struct drbd_epoch_entry { 777struct drbd_epoch_entry {
748 struct drbd_work w; 778 struct drbd_work w;
749 struct hlist_node colision; 779 struct hlist_node colision;
750 struct drbd_epoch *epoch; 780 struct drbd_epoch *epoch; /* for writes */
751 struct drbd_conf *mdev; 781 struct drbd_conf *mdev;
752 struct page *pages; 782 struct page *pages;
753 atomic_t pending_bios; 783 atomic_t pending_bios;
@@ -755,7 +785,10 @@ struct drbd_epoch_entry {
755 /* see comments on ee flag bits below */ 785 /* see comments on ee flag bits below */
756 unsigned long flags; 786 unsigned long flags;
757 sector_t sector; 787 sector_t sector;
758 u64 block_id; 788 union {
789 u64 block_id;
790 struct digest_info *digest;
791 };
759}; 792};
760 793
761/* ee flag bits. 794/* ee flag bits.
@@ -781,12 +814,16 @@ enum {
781 * if any of those fail, we set this flag atomically 814 * if any of those fail, we set this flag atomically
782 * from the endio callback */ 815 * from the endio callback */
783 __EE_WAS_ERROR, 816 __EE_WAS_ERROR,
817
818 /* This ee has a pointer to a digest instead of a block id */
819 __EE_HAS_DIGEST,
784}; 820};
785#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 821#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
786#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 822#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
787#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) 823#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
788#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 824#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
789#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 825#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
826#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
790 827
791/* global flag bits */ 828/* global flag bits */
792enum { 829enum {
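An epoch entry now overlays block_id and a digest pointer in one union, with the new EE_HAS_DIGEST flag recording which member is live. The sketch below shows that flag-discriminated union pattern in isolation (hypothetical types, not the drbd structures):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_DIGEST (1u << 0)	/* plays the role of EE_HAS_DIGEST */

struct digest { unsigned char md[20]; };

struct entry {
	unsigned long flags;
	union {			/* only one member is valid at a time */
		uint64_t block_id;
		struct digest *digest;
	};
};

static void set_digest(struct entry *e, struct digest *d)
{
	e->digest = d;
	e->flags |= HAS_DIGEST;
}

static uint64_t get_block_id(const struct entry *e)
{
	assert(!(e->flags & HAS_DIGEST));	/* must not be a digest entry */
	return e->block_id;
}

int main(void)
{
	struct entry e = { .flags = 0, .block_id = 42 };
	struct digest d = { { 0 } };

	printf("block_id = %llu\n", (unsigned long long)get_block_id(&e));
	set_digest(&e, &d);
	printf("has digest: %s\n", (e.flags & HAS_DIGEST) ? "yes" : "no");
	return 0;
}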
@@ -794,7 +831,6 @@ enum {
794 SIGNAL_ASENDER, /* whether asender wants to be interrupted */ 831 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
795 SEND_PING, /* whether asender should send a ping asap */ 832 SEND_PING, /* whether asender should send a ping asap */
796 833
797 STOP_SYNC_TIMER, /* tell timer to cancel itself */
798 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ 834 UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
799 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ 835 UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
800 MD_DIRTY, /* current uuids and flags not yet on disk */ 836 MD_DIRTY, /* current uuids and flags not yet on disk */
@@ -816,6 +852,7 @@ enum {
816 BITMAP_IO, /* suspend application io; 852 BITMAP_IO, /* suspend application io;
817 once no more io in flight, start bitmap io */ 853 once no more io in flight, start bitmap io */
818 BITMAP_IO_QUEUED, /* Started bitmap IO */ 854 BITMAP_IO_QUEUED, /* Started bitmap IO */
855 GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */
819 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ 856 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
820 NET_CONGESTED, /* The data socket is congested */ 857 NET_CONGESTED, /* The data socket is congested */
821 858
@@ -829,6 +866,8 @@ enum {
829 * the peer, if it changed there as well. */ 866 * the peer, if it changed there as well. */
830 CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ 867 CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
831 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ 868 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
869 NEW_CUR_UUID, /* Create new current UUID when thawing IO */
870 AL_SUSPENDED, /* Activity logging is currently suspended. */
832}; 871};
833 872
834struct drbd_bitmap; /* opaque for drbd_conf */ 873struct drbd_bitmap; /* opaque for drbd_conf */
@@ -838,10 +877,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */
838 877
839/* THINK maybe we actually want to use the default "event/%s" worker threads 878/* THINK maybe we actually want to use the default "event/%s" worker threads
840 * or similar in linux 2.6, which uses per cpu data and threads. 879 * or similar in linux 2.6, which uses per cpu data and threads.
841 *
842 * To be general, this might need a spin_lock member.
843 * For now, please use the mdev->req_lock to protect list_head,
844 * see drbd_queue_work below.
845 */ 880 */
846struct drbd_work_queue { 881struct drbd_work_queue {
847 struct list_head q; 882 struct list_head q;
@@ -915,6 +950,12 @@ enum write_ordering_e {
915 WO_bio_barrier 950 WO_bio_barrier
916}; 951};
917 952
953struct fifo_buffer {
954 int *values;
955 unsigned int head_index;
956 unsigned int size;
957};
958
918struct drbd_conf { 959struct drbd_conf {
919 /* things that are stored as / read from meta data on disk */ 960 /* things that are stored as / read from meta data on disk */
920 unsigned long flags; 961 unsigned long flags;
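struct fifo_buffer is a plain ring of ints the resync planner uses to remember recent correction values. A minimal standalone model of that ring, with hypothetical push/sum helpers (the actual planner arithmetic lives in the syncer, not here):

#include <stdio.h>
#include <stdlib.h>

struct fifo_buffer {		/* same three fields as the drbd struct */
	int *values;
	unsigned int head_index;
	unsigned int size;
};

static void fifo_push(struct fifo_buffer *fb, int value)
{
	fb->values[fb->head_index] = value;		/* overwrite the oldest slot */
	fb->head_index = (fb->head_index + 1) % fb->size;
}

static int fifo_sum(const struct fifo_buffer *fb)
{
	int sum = 0;
	for (unsigned int i = 0; i < fb->size; i++)
		sum += fb->values[i];
	return sum;
}

int main(void)
{
	struct fifo_buffer fb = {
		.values = calloc(5, sizeof(int)),
		.head_index = 0,
		.size = 5,
	};

	for (int v = 1; v <= 7; v++)		/* 6 and 7 overwrite 1 and 2 */
		fifo_push(&fb, v);

	printf("sum of planned values: %d\n", fifo_sum(&fb));	/* 3+4+5+6+7 = 25 */
	free(fb.values);
	return 0;
}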
@@ -936,9 +977,16 @@ struct drbd_conf {
936 unsigned int ko_count; 977 unsigned int ko_count;
937 struct drbd_work resync_work, 978 struct drbd_work resync_work,
938 unplug_work, 979 unplug_work,
980 go_diskless,
939 md_sync_work; 981 md_sync_work;
940 struct timer_list resync_timer; 982 struct timer_list resync_timer;
941 struct timer_list md_sync_timer; 983 struct timer_list md_sync_timer;
984#ifdef DRBD_DEBUG_MD_SYNC
985 struct {
986 unsigned int line;
987 const char* func;
988 } last_md_mark_dirty;
989#endif
942 990
943 /* Used after attach while negotiating new disk state. */ 991 /* Used after attach while negotiating new disk state. */
944 union drbd_state new_state_tmp; 992 union drbd_state new_state_tmp;
@@ -946,6 +994,7 @@ struct drbd_conf {
946 union drbd_state state; 994 union drbd_state state;
947 wait_queue_head_t misc_wait; 995 wait_queue_head_t misc_wait;
948 wait_queue_head_t state_wait; /* upon each state change. */ 996 wait_queue_head_t state_wait; /* upon each state change. */
997 wait_queue_head_t net_cnt_wait;
949 unsigned int send_cnt; 998 unsigned int send_cnt;
950 unsigned int recv_cnt; 999 unsigned int recv_cnt;
951 unsigned int read_cnt; 1000 unsigned int read_cnt;
@@ -974,12 +1023,16 @@ struct drbd_conf {
974 unsigned long rs_start; 1023 unsigned long rs_start;
975 /* cumulated time in PausedSyncX state [unit jiffies] */ 1024 /* cumulated time in PausedSyncX state [unit jiffies] */
976 unsigned long rs_paused; 1025 unsigned long rs_paused;
1026 /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
1027 unsigned long rs_same_csum;
1028#define DRBD_SYNC_MARKS 8
1029#define DRBD_SYNC_MARK_STEP (3*HZ)
977 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ 1030 /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
978 unsigned long rs_mark_left; 1031 unsigned long rs_mark_left[DRBD_SYNC_MARKS];
979 /* marks's time [unit jiffies] */ 1032 /* marks's time [unit jiffies] */
980 unsigned long rs_mark_time; 1033 unsigned long rs_mark_time[DRBD_SYNC_MARKS];
981 /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ 1034 /* current index into rs_mark_{left,time} */
982 unsigned long rs_same_csum; 1035 int rs_last_mark;
983 1036
984 /* where does the admin want us to start? (sector) */ 1037 /* where does the admin want us to start? (sector) */
985 sector_t ov_start_sector; 1038 sector_t ov_start_sector;
@@ -1012,10 +1065,10 @@ struct drbd_conf {
1012 spinlock_t epoch_lock; 1065 spinlock_t epoch_lock;
1013 unsigned int epochs; 1066 unsigned int epochs;
1014 enum write_ordering_e write_ordering; 1067 enum write_ordering_e write_ordering;
1015 struct list_head active_ee; /* IO in progress */ 1068 struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
1016 struct list_head sync_ee; /* IO in progress */ 1069 struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
1017 struct list_head done_ee; /* send ack */ 1070 struct list_head done_ee; /* send ack */
1018 struct list_head read_ee; /* IO in progress */ 1071 struct list_head read_ee; /* IO in progress (any read) */
1019 struct list_head net_ee; /* zero-copy network send in progress */ 1072 struct list_head net_ee; /* zero-copy network send in progress */
1020 struct hlist_head *ee_hash; /* is proteced by req_lock! */ 1073 struct hlist_head *ee_hash; /* is proteced by req_lock! */
1021 unsigned int ee_hash_s; 1074 unsigned int ee_hash_s;
@@ -1026,7 +1079,8 @@ struct drbd_conf {
1026 int next_barrier_nr; 1079 int next_barrier_nr;
1027 struct hlist_head *app_reads_hash; /* is proteced by req_lock */ 1080 struct hlist_head *app_reads_hash; /* is proteced by req_lock */
1028 struct list_head resync_reads; 1081 struct list_head resync_reads;
1029 atomic_t pp_in_use; 1082 atomic_t pp_in_use; /* allocated from page pool */
1083 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
1030 wait_queue_head_t ee_wait; 1084 wait_queue_head_t ee_wait;
1031 struct page *md_io_page; /* one page buffer for md_io */ 1085 struct page *md_io_page; /* one page buffer for md_io */
1032 struct page *md_io_tmpp; /* for logical_block_size != 512 */ 1086 struct page *md_io_tmpp; /* for logical_block_size != 512 */
@@ -1054,6 +1108,15 @@ struct drbd_conf {
1054 u64 ed_uuid; /* UUID of the exposed data */ 1108 u64 ed_uuid; /* UUID of the exposed data */
1055 struct mutex state_mutex; 1109 struct mutex state_mutex;
1056 char congestion_reason; /* Why we where congested... */ 1110 char congestion_reason; /* Why we where congested... */
1111 atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
1112 atomic_t rs_sect_ev; /* for submitted resync data rate, both */
1113 int rs_last_sect_ev; /* counter to compare with */
1114 int rs_last_events; /* counter of read or write "events" (unit sectors)
1115 * on the lower level device when we last looked. */
1116 int c_sync_rate; /* current resync rate after syncer throttle magic */
1117 struct fifo_buffer rs_plan_s; /* correction values of resync planer */
1118 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
1119 int rs_planed; /* resync sectors already planed */
1057}; 1120};
1058 1121
1059static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1122static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1138,6 +1201,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
1138extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, 1201extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1139 unsigned int set_size); 1202 unsigned int set_size);
1140extern void tl_clear(struct drbd_conf *mdev); 1203extern void tl_clear(struct drbd_conf *mdev);
1204enum drbd_req_event;
1205extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
1141extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); 1206extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1142extern void drbd_free_sock(struct drbd_conf *mdev); 1207extern void drbd_free_sock(struct drbd_conf *mdev);
1143extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, 1208extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@ -1150,12 +1215,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f
1150extern int _drbd_send_state(struct drbd_conf *mdev); 1215extern int _drbd_send_state(struct drbd_conf *mdev);
1151extern int drbd_send_state(struct drbd_conf *mdev); 1216extern int drbd_send_state(struct drbd_conf *mdev);
1152extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1217extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1153 enum drbd_packets cmd, struct p_header *h, 1218 enum drbd_packets cmd, struct p_header80 *h,
1154 size_t size, unsigned msg_flags); 1219 size_t size, unsigned msg_flags);
1155#define USE_DATA_SOCKET 1 1220#define USE_DATA_SOCKET 1
1156#define USE_META_SOCKET 0 1221#define USE_META_SOCKET 0
1157extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1222extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1158 enum drbd_packets cmd, struct p_header *h, 1223 enum drbd_packets cmd, struct p_header80 *h,
1159 size_t size); 1224 size_t size);
1160extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, 1225extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
1161 char *data, size_t size); 1226 char *data, size_t size);
@@ -1167,7 +1232,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
1167extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, 1232extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
1168 struct p_block_req *rp); 1233 struct p_block_req *rp);
1169extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, 1234extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1170 struct p_data *dp); 1235 struct p_data *dp, int data_size);
1171extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, 1236extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1172 sector_t sector, int blksize, u64 block_id); 1237 sector_t sector, int blksize, u64 block_id);
1173extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, 1238extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
@@ -1201,7 +1266,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
1201extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); 1266extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
1202extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); 1267extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
1203extern int drbd_md_test_flag(struct drbd_backing_dev *, int); 1268extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
1269#ifndef DRBD_DEBUG_MD_SYNC
1204extern void drbd_md_mark_dirty(struct drbd_conf *mdev); 1270extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
1271#else
1272#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
1273extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
1274 unsigned int line, const char *func);
1275#endif
1205extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, 1276extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1206 int (*io_fn)(struct drbd_conf *), 1277 int (*io_fn)(struct drbd_conf *),
1207 void (*done)(struct drbd_conf *, int), 1278 void (*done)(struct drbd_conf *, int),
@@ -1209,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1209extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); 1280extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1210extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1281extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1211extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); 1282extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1283extern void drbd_go_diskless(struct drbd_conf *mdev);
1212 1284
1213 1285
1214/* Meta data layout 1286/* Meta data layout
@@ -1264,6 +1336,8 @@ struct bm_extent {
1264 * Bit 1 ==> local node thinks this block needs to be synced. 1336 * Bit 1 ==> local node thinks this block needs to be synced.
1265 */ 1337 */
1266 1338
1339#define SLEEP_TIME (HZ/10)
1340
1267#define BM_BLOCK_SHIFT 12 /* 4k per bit */ 1341#define BM_BLOCK_SHIFT 12 /* 4k per bit */
1268#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) 1342#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
1269/* (9+3) : 512 bytes @ 8 bits; representing 16M storage 1343/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
@@ -1335,11 +1409,13 @@ struct bm_extent {
1335#endif 1409#endif
1336 1410
1337/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. 1411/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
1338 * With a value of 6 all IO in one 32K block make it to the same slot of the 1412 * With a value of 8 all IO in one 128K block make it to the same slot of the
1339 * hash table. */ 1413 * hash table. */
1340#define HT_SHIFT 6 1414#define HT_SHIFT 8
1341#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) 1415#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
1342 1416
1417#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
1418
1343/* Number of elements in the app_reads_hash */ 1419/* Number of elements in the app_reads_hash */
1344#define APP_R_HSIZE 15 1420#define APP_R_HSIZE 15
1345 1421
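HT_SHIFT decides how many 512-byte sectors share one hash slot: raising it from 6 to 8 means all IO inside a 128 KiB block (2^(9+8) bytes) lands in the same slot, and DRBD_MAX_SEGMENT_SIZE grows to match. A tiny sketch of that slot computation (the table size is a made-up example):

#include <stdio.h>

#define HT_SHIFT 8			/* was 6: then only 32 KiB per slot */
#define TL_HASH_SLOTS 64		/* hypothetical table size          */

static unsigned int ht_slot(unsigned long long sector)
{
	/* sectors are 512 bytes, so one slot covers 2^(9+HT_SHIFT) bytes */
	return (unsigned int)((sector >> HT_SHIFT) % TL_HASH_SLOTS);
}

int main(void)
{
	/* sectors 0 and 128 are 64 KiB apart, still inside one 128 KiB block */
	printf("slot(0)   = %u\n", ht_slot(0));
	printf("slot(128) = %u\n", ht_slot(128));	/* same slot as sector 0 */
	printf("slot(256) = %u\n", ht_slot(256));	/* next 128 KiB block    */
	return 0;
}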
@@ -1369,6 +1445,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_
1369/* bm_find_next variants for use while you hold drbd_bm_lock() */ 1445/* bm_find_next variants for use while you hold drbd_bm_lock() */
1370extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1446extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1371extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); 1447extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
1448extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
1372extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); 1449extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
1373extern int drbd_bm_rs_done(struct drbd_conf *mdev); 1450extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1374/* for receive_bitmap */ 1451/* for receive_bitmap */
@@ -1421,7 +1498,8 @@ extern void resync_after_online_grow(struct drbd_conf *);
1421extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1498extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1422extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1499extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
1423 int force); 1500 int force);
1424enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); 1501extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1502extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
1425extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); 1503extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
1426 1504
1427/* drbd_worker.c */ 1505/* drbd_worker.c */
@@ -1467,10 +1545,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1467extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); 1545extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1468extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); 1546extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1469extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); 1547extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1548extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
1470 1549
1471extern void resync_timer_fn(unsigned long data); 1550extern void resync_timer_fn(unsigned long data);
1472 1551
1473/* drbd_receiver.c */ 1552/* drbd_receiver.c */
1553extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
1474extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, 1554extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1475 const unsigned rw, const int fault_type); 1555 const unsigned rw, const int fault_type);
1476extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1556extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1479,7 +1559,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1479 sector_t sector, 1559 sector_t sector,
1480 unsigned int data_size, 1560 unsigned int data_size,
1481 gfp_t gfp_mask) __must_hold(local); 1561 gfp_t gfp_mask) __must_hold(local);
1482extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); 1562extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1563 int is_net);
1564#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
1565#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
1483extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1566extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1484 struct list_head *head); 1567 struct list_head *head);
1485extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1568extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
@@ -1487,6 +1570,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
1487extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); 1570extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
1488extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); 1571extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
1489extern void drbd_flush_workqueue(struct drbd_conf *mdev); 1572extern void drbd_flush_workqueue(struct drbd_conf *mdev);
1573extern void drbd_free_tl_hash(struct drbd_conf *mdev);
1490 1574
1491/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to 1575/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
1492 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ 1576 * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@ -1600,6 +1684,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
1600#define susp_MASK 1 1684#define susp_MASK 1
1601#define user_isp_MASK 1 1685#define user_isp_MASK 1
1602#define aftr_isp_MASK 1 1686#define aftr_isp_MASK 1
1687#define susp_nod_MASK 1
1688#define susp_fen_MASK 1
1603 1689
1604#define NS(T, S) \ 1690#define NS(T, S) \
1605 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ 1691 ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@ -1856,13 +1942,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
1856} 1942}
1857 1943
1858static inline void 1944static inline void
1859_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1860{
1861 list_add_tail(&w->list, &q->q);
1862 up(&q->s);
1863}
1864
1865static inline void
1866drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) 1945drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
1867{ 1946{
1868 unsigned long flags; 1947 unsigned long flags;
@@ -1899,19 +1978,19 @@ static inline void request_ping(struct drbd_conf *mdev)
1899static inline int drbd_send_short_cmd(struct drbd_conf *mdev, 1978static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
1900 enum drbd_packets cmd) 1979 enum drbd_packets cmd)
1901{ 1980{
1902 struct p_header h; 1981 struct p_header80 h;
1903 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); 1982 return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
1904} 1983}
1905 1984
1906static inline int drbd_send_ping(struct drbd_conf *mdev) 1985static inline int drbd_send_ping(struct drbd_conf *mdev)
1907{ 1986{
1908 struct p_header h; 1987 struct p_header80 h;
1909 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); 1988 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
1910} 1989}
1911 1990
1912static inline int drbd_send_ping_ack(struct drbd_conf *mdev) 1991static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1913{ 1992{
1914 struct p_header h; 1993 struct p_header80 h;
1915 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); 1994 return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
1916} 1995}
1917 1996
@@ -2013,7 +2092,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
2013static inline void put_net_conf(struct drbd_conf *mdev) 2092static inline void put_net_conf(struct drbd_conf *mdev)
2014{ 2093{
2015 if (atomic_dec_and_test(&mdev->net_cnt)) 2094 if (atomic_dec_and_test(&mdev->net_cnt))
2016 wake_up(&mdev->misc_wait); 2095 wake_up(&mdev->net_cnt_wait);
2017} 2096}
2018 2097
2019/** 2098/**
@@ -2044,10 +2123,14 @@ static inline int get_net_conf(struct drbd_conf *mdev)
2044 2123
2045static inline void put_ldev(struct drbd_conf *mdev) 2124static inline void put_ldev(struct drbd_conf *mdev)
2046{ 2125{
2126 int i = atomic_dec_return(&mdev->local_cnt);
2047 __release(local); 2127 __release(local);
2048 if (atomic_dec_and_test(&mdev->local_cnt)) 2128 D_ASSERT(i >= 0);
2129 if (i == 0) {
2130 if (mdev->state.disk == D_FAILED)
2131 drbd_go_diskless(mdev);
2049 wake_up(&mdev->misc_wait); 2132 wake_up(&mdev->misc_wait);
2050 D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); 2133 }
2051} 2134}
2052 2135
2053#ifndef __CHECKER__ 2136#ifndef __CHECKER__
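put_ldev now looks at the value the counter drops to: when the last local reference goes away while the disk is D_FAILED, the cleanup to diskless is kicked off from that final put. The same drop-and-act-on-zero pattern in plain C11 atomics (hypothetical names, not the drbd code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int local_cnt = 2;	/* outstanding local references */
static bool disk_failed = true;		/* stands in for D_FAILED       */

static void go_diskless(void) { puts("last reference dropped: going diskless"); }

static void put_ldev(void)
{
	/* fetch_sub returns the old value; old - 1 is what gets checked */
	int i = atomic_fetch_sub(&local_cnt, 1) - 1;

	if (i == 0) {
		if (disk_failed)
			go_diskless();
		puts("waking up waiters");	/* wake_up(&mdev->misc_wait) */
	}
}

int main(void)
{
	put_ldev();	/* 2 -> 1: nothing to do                 */
	put_ldev();	/* 1 -> 0: triggers the diskless cleanup */
	return 0;
}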
@@ -2179,11 +2262,16 @@ static inline int drbd_state_is_stable(union drbd_state s)
2179 return 1; 2262 return 1;
2180} 2263}
2181 2264
2265static inline int is_susp(union drbd_state s)
2266{
2267 return s.susp || s.susp_nod || s.susp_fen;
2268}
2269
2182static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) 2270static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2183{ 2271{
2184 int mxb = drbd_get_max_buffers(mdev); 2272 int mxb = drbd_get_max_buffers(mdev);
2185 2273
2186 if (mdev->state.susp) 2274 if (is_susp(mdev->state))
2187 return 0; 2275 return 0;
2188 if (test_bit(SUSPEND_IO, &mdev->flags)) 2276 if (test_bit(SUSPEND_IO, &mdev->flags))
2189 return 0; 2277 return 0;
@@ -2321,8 +2409,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2321 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2409 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2322 return; 2410 return;
2323 2411
2324 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, 2412 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
2325 BLKDEV_IFL_WAIT);
2326 if (r) { 2413 if (r) {
2327 set_bit(MD_NO_BARRIER, &mdev->flags); 2414 set_bit(MD_NO_BARRIER, &mdev->flags);
2328 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2415 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa650dd85b90..c5dfe6486cf3 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -32,7 +32,7 @@
32#include <asm/types.h> 32#include <asm/types.h>
33#include <net/sock.h> 33#include <net/sock.h>
34#include <linux/ctype.h> 34#include <linux/ctype.h>
35#include <linux/smp_lock.h> 35#include <linux/mutex.h>
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/file.h> 37#include <linux/file.h>
38#include <linux/proc_fs.h> 38#include <linux/proc_fs.h>
@@ -64,6 +64,7 @@ struct after_state_chg_work {
64 struct completion *done; 64 struct completion *done;
65}; 65};
66 66
67static DEFINE_MUTEX(drbd_main_mutex);
67int drbdd_init(struct drbd_thread *); 68int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *); 69int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *); 70int drbd_asender(struct drbd_thread *);
@@ -77,6 +78,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); 78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data); 79static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); 80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80 82
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 83MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>"); 84 "Lars Ellenberg <lars@linbit.com>");
@@ -199,7 +201,7 @@ static int tl_init(struct drbd_conf *mdev)
199 INIT_LIST_HEAD(&b->w.list); 201 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL; 202 b->next = NULL;
201 b->br_number = 4711; 203 b->br_number = 4711;
202 b->n_req = 0; 204 b->n_writes = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 205 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204 206
205 mdev->oldest_tle = b; 207 mdev->oldest_tle = b;
@@ -240,7 +242,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
240 INIT_LIST_HEAD(&new->w.list); 242 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 243 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL; 244 new->next = NULL;
243 new->n_req = 0; 245 new->n_writes = 0;
244 246
245 newest_before = mdev->newest_tle; 247 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased 248 /* never send a barrier number == 0, because that is special-cased
@@ -284,9 +286,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
284 barrier_nr, b->br_number); 286 barrier_nr, b->br_number);
285 goto bail; 287 goto bail;
286 } 288 }
287 if (b->n_req != set_size) { 289 if (b->n_writes != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", 290 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
289 barrier_nr, set_size, b->n_req); 291 barrier_nr, set_size, b->n_writes);
290 goto bail; 292 goto bail;
291 } 293 }
292 294
@@ -333,6 +335,82 @@ bail:
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 335 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334} 336}
335 337
338/**
339 * _tl_restart() - Walks the transfer log, and applies an action to all requests
340 * @mdev: DRBD device.
341 * @what: The action/event to perform with all request objects
342 *
343 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344 * restart_frozen_disk_io.
345 */
346static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347{
348 struct drbd_tl_epoch *b, *tmp, **pn;
349 struct list_head *le, *tle, carry_reads;
350 struct drbd_request *req;
351 int rv, n_writes, n_reads;
352
353 b = mdev->oldest_tle;
354 pn = &mdev->oldest_tle;
355 while (b) {
356 n_writes = 0;
357 n_reads = 0;
358 INIT_LIST_HEAD(&carry_reads);
359 list_for_each_safe(le, tle, &b->requests) {
360 req = list_entry(le, struct drbd_request, tl_requests);
361 rv = _req_mod(req, what);
362
363 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
365 }
366 tmp = b->next;
367
368 if (n_writes) {
369 if (what == resend) {
370 b->n_writes = n_writes;
371 if (b->w.cb == NULL) {
372 b->w.cb = w_send_barrier;
373 inc_ap_pending(mdev);
374 set_bit(CREATE_BARRIER, &mdev->flags);
375 }
376
377 drbd_queue_work(&mdev->data.work, &b->w);
378 }
379 pn = &b->next;
380 } else {
381 if (n_reads)
382 list_add(&carry_reads, &b->requests);
383 /* there could still be requests on that ring list,
384 * in case local io is still pending */
385 list_del(&b->requests);
386
387 /* dec_ap_pending corresponding to queue_barrier.
388 * the newest barrier may not have been queued yet,
389 * in which case w.cb is still NULL. */
390 if (b->w.cb != NULL)
391 dec_ap_pending(mdev);
392
393 if (b == mdev->newest_tle) {
394 /* recycle, but reinit! */
395 D_ASSERT(tmp == NULL);
396 INIT_LIST_HEAD(&b->requests);
397 list_splice(&carry_reads, &b->requests);
398 INIT_LIST_HEAD(&b->w.list);
399 b->w.cb = NULL;
400 b->br_number = net_random();
401 b->n_writes = 0;
402
403 *pn = b;
404 break;
405 }
406 *pn = tmp;
407 kfree(b);
408 }
409 b = tmp;
410 list_splice(&carry_reads, &b->requests);
411 }
412}
413
336 414
337/** 415/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL 416 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
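The new _tl_restart relies on _req_mod() reporting, through bit flags in its return value, whether the request it just acted on was a read or a write, so one pass over the transfer log can count both. A standalone sketch of that flag-encoded return convention (the shift values here are illustrative only):

#include <stdio.h>

#define MR_WRITE_SHIFT 0		/* illustrative values, not the kernel's */
#define MR_WRITE       (1 << MR_WRITE_SHIFT)
#define MR_READ_SHIFT  1
#define MR_READ        (1 << MR_READ_SHIFT)

/* stand-in for _req_mod(): report what kind of request was handled */
static int req_mod(int is_write)
{
	return is_write ? MR_WRITE : MR_READ;
}

int main(void)
{
	int kinds[] = { 1, 1, 0, 1, 0 };	/* three writes, two reads */
	int n_writes = 0, n_reads = 0;

	for (unsigned int i = 0; i < sizeof(kinds) / sizeof(kinds[0]); i++) {
		int rv = req_mod(kinds[i]);

		n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
		n_reads  += (rv & MR_READ)  >> MR_READ_SHIFT;
	}

	printf("writes=%d reads=%d\n", n_writes, n_reads);	/* writes=3 reads=2 */
	return 0;
}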
@@ -344,48 +422,12 @@ bail:
344 */ 422 */
345void tl_clear(struct drbd_conf *mdev) 423void tl_clear(struct drbd_conf *mdev)
346{ 424{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle; 425 struct list_head *le, *tle;
349 struct drbd_request *r; 426 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351 427
352 spin_lock_irq(&mdev->req_lock); 428 spin_lock_irq(&mdev->req_lock);
353 429
354 b = mdev->oldest_tle; 430 _tl_restart(mdev, connection_lost_while_pending);
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389 431
390 /* we expect this list to be empty. */ 432 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); 433 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
@@ -401,6 +443,15 @@ void tl_clear(struct drbd_conf *mdev)
401 /* ensure bit indicating barrier is required is clear */ 443 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags); 444 clear_bit(CREATE_BARRIER, &mdev->flags);
403 445
446 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448 spin_unlock_irq(&mdev->req_lock);
449}
450
451void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452{
453 spin_lock_irq(&mdev->req_lock);
454 _tl_restart(mdev, what);
404 spin_unlock_irq(&mdev->req_lock); 455 spin_unlock_irq(&mdev->req_lock);
405} 456}
406 457
@@ -455,7 +506,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455static int is_valid_state_transition(struct drbd_conf *, 506static int is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state); 507 union drbd_state, union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 508static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort); 509 union drbd_state ns, const char **warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *, 510int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state); 511 union drbd_state, union drbd_state);
461 512
@@ -605,7 +656,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
605 drbd_role_str(ns.peer), 656 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk), 657 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk), 658 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r', 659 is_susp(ns) ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-', 660 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-', 661 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-' 662 ns.user_isp ? 'u' : '-'
@@ -763,7 +814,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
763 * to D_UNKNOWN. This rule and many more along those lines are in this function. 814 * to D_UNKNOWN. This rule and many more along those lines are in this function.
764 */ 815 */
765static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 816static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
766 union drbd_state ns, int *warn_sync_abort) 817 union drbd_state ns, const char **warn_sync_abort)
767{ 818{
768 enum drbd_fencing_p fp; 819 enum drbd_fencing_p fp;
769 820
@@ -778,9 +829,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
778 os.conn <= C_DISCONNECTING) 829 os.conn <= C_DISCONNECTING)
779 ns.conn = os.conn; 830 ns.conn = os.conn;
780 831
781 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ 832 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
833 * If you try to go into some Sync* state, that shall fail (elsewhere). */
782 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && 834 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
783 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) 835 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
784 ns.conn = os.conn; 836 ns.conn = os.conn;
785 837
786 /* After C_DISCONNECTING only C_STANDALONE may follow */ 838 /* After C_DISCONNECTING only C_STANDALONE may follow */
@@ -798,14 +850,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
798 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) 850 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
799 ns.aftr_isp = 0; 851 ns.aftr_isp = 0;
800 852
801 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
802 ns.pdsk = D_UNKNOWN;
803
804 /* Abort resync if a disk fails/detaches */ 853 /* Abort resync if a disk fails/detaches */
805 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && 854 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
806 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { 855 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
807 if (warn_sync_abort) 856 if (warn_sync_abort)
808 *warn_sync_abort = 1; 857 *warn_sync_abort =
858 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
859 "Online-verify" : "Resync";
809 ns.conn = C_CONNECTED; 860 ns.conn = C_CONNECTED;
810 } 861 }
811 862
@@ -876,7 +927,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
876 if (fp == FP_STONITH && 927 if (fp == FP_STONITH &&
877 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && 928 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
878 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) 929 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
879 ns.susp = 1; 930 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
931
932 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
933 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
934 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
 935		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data) */
880 936
881 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { 937 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
882 if (ns.conn == C_SYNC_SOURCE) 938 if (ns.conn == C_SYNC_SOURCE)
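The hunk above splits the single suspend flag into susp_fen (fence-peer handler running) and susp_nod (no accessible data); later hunks test the aggregate through is_susp(). A minimal sketch of such an aggregate check, reconstructed only from the field names visible in this diff (the real helper lives in drbd_int.h):

	/* Sketch: IO counts as suspended if any of the three reasons holds:
	 * explicit/user suspend, fence-peer handler running, or no data. */
	static inline int is_susp(union drbd_state s)
	{
		return s.susp || s.susp_fen || s.susp_nod;
	}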
@@ -912,6 +968,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
912 } 968 }
913} 969}
914 970
971static void drbd_resume_al(struct drbd_conf *mdev)
972{
973 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
974 dev_info(DEV, "Resumed AL updates\n");
975}
976
915/** 977/**
916 * __drbd_set_state() - Set a new DRBD state 978 * __drbd_set_state() - Set a new DRBD state
917 * @mdev: DRBD device. 979 * @mdev: DRBD device.
@@ -927,7 +989,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
927{ 989{
928 union drbd_state os; 990 union drbd_state os;
929 int rv = SS_SUCCESS; 991 int rv = SS_SUCCESS;
930 int warn_sync_abort = 0; 992 const char *warn_sync_abort = NULL;
931 struct after_state_chg_work *ascw; 993 struct after_state_chg_work *ascw;
932 994
933 os = mdev->state; 995 os = mdev->state;
@@ -946,14 +1008,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
946 /* If the old state was illegal as well, then let 1008 /* If the old state was illegal as well, then let
947 this happen...*/ 1009 this happen...*/
948 1010
949 if (is_valid_state(mdev, os) == rv) { 1011 if (is_valid_state(mdev, os) == rv)
950 dev_err(DEV, "Considering state change from bad state. "
951 "Error would be: '%s'\n",
952 drbd_set_st_err_str(rv));
953 print_st(mdev, "old", os);
954 print_st(mdev, "new", ns);
955 rv = is_valid_state_transition(mdev, ns, os); 1012 rv = is_valid_state_transition(mdev, ns, os);
956 }
957 } else 1013 } else
958 rv = is_valid_state_transition(mdev, ns, os); 1014 rv = is_valid_state_transition(mdev, ns, os);
959 } 1015 }
@@ -965,7 +1021,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
965 } 1021 }
966 1022
967 if (warn_sync_abort) 1023 if (warn_sync_abort)
968 dev_warn(DEV, "Resync aborted.\n"); 1024 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
969 1025
970 { 1026 {
971 char *pbp, pb[300]; 1027 char *pbp, pb[300];
@@ -976,7 +1032,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
976 PSC(conn); 1032 PSC(conn);
977 PSC(disk); 1033 PSC(disk);
978 PSC(pdsk); 1034 PSC(pdsk);
979 PSC(susp); 1035 if (is_susp(ns) != is_susp(os))
1036 pbp += sprintf(pbp, "susp( %s -> %s ) ",
1037 drbd_susp_str(is_susp(os)),
1038 drbd_susp_str(is_susp(ns)));
980 PSC(aftr_isp); 1039 PSC(aftr_isp);
981 PSC(peer_isp); 1040 PSC(peer_isp);
982 PSC(user_isp); 1041 PSC(user_isp);
@@ -1001,12 +1060,6 @@ int __drbd_set_state(struct drbd_conf *mdev,
1001 wake_up(&mdev->misc_wait); 1060 wake_up(&mdev->misc_wait);
1002 wake_up(&mdev->state_wait); 1061 wake_up(&mdev->state_wait);
1003 1062
1004 /* post-state-change actions */
1005 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1006 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007 mod_timer(&mdev->resync_timer, jiffies);
1008 }
1009
1010 /* aborted verify run. log the last position */ 1063 /* aborted verify run. log the last position */
1011 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && 1064 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012 ns.conn < C_CONNECTED) { 1065 ns.conn < C_CONNECTED) {
@@ -1019,41 +1072,42 @@ int __drbd_set_state(struct drbd_conf *mdev,
1019 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && 1072 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { 1073 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1021 dev_info(DEV, "Syncer continues.\n"); 1074 dev_info(DEV, "Syncer continues.\n");
1022 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; 1075 mdev->rs_paused += (long)jiffies
1023 if (ns.conn == C_SYNC_TARGET) { 1076 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1024 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) 1077 if (ns.conn == C_SYNC_TARGET)
1025 mod_timer(&mdev->resync_timer, jiffies); 1078 mod_timer(&mdev->resync_timer, jiffies);
1026 /* This if (!test_bit) is only needed for the case
1027 that a device that has ceased to used its timer,
1028 i.e. it is already in drbd_resync_finished() gets
1029 paused and resumed. */
1030 }
1031 } 1079 }
1032 1080
1033 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && 1081 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1034 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { 1082 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035 dev_info(DEV, "Resync suspended\n"); 1083 dev_info(DEV, "Resync suspended\n");
1036 mdev->rs_mark_time = jiffies; 1084 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1037 if (ns.conn == C_PAUSED_SYNC_T)
1038 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039 } 1085 }
1040 1086
1041 if (os.conn == C_CONNECTED && 1087 if (os.conn == C_CONNECTED &&
1042 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { 1088 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1089 unsigned long now = jiffies;
1090 int i;
1091
1043 mdev->ov_position = 0; 1092 mdev->ov_position = 0;
1044 mdev->rs_total = 1093 mdev->rs_total = drbd_bm_bits(mdev);
1045 mdev->rs_mark_left = drbd_bm_bits(mdev);
1046 if (mdev->agreed_pro_version >= 90) 1094 if (mdev->agreed_pro_version >= 90)
1047 set_ov_position(mdev, ns.conn); 1095 set_ov_position(mdev, ns.conn);
1048 else 1096 else
1049 mdev->ov_start_sector = 0; 1097 mdev->ov_start_sector = 0;
1050 mdev->ov_left = mdev->rs_total 1098 mdev->ov_left = mdev->rs_total
1051 - BM_SECT_TO_BIT(mdev->ov_position); 1099 - BM_SECT_TO_BIT(mdev->ov_position);
1052 mdev->rs_start = 1100 mdev->rs_start = now;
1053 mdev->rs_mark_time = jiffies; 1101 mdev->rs_last_events = 0;
1102 mdev->rs_last_sect_ev = 0;
1054 mdev->ov_last_oos_size = 0; 1103 mdev->ov_last_oos_size = 0;
1055 mdev->ov_last_oos_start = 0; 1104 mdev->ov_last_oos_start = 0;
1056 1105
1106 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1107 mdev->rs_mark_left[i] = mdev->rs_total;
1108 mdev->rs_mark_time[i] = now;
1109 }
1110
1057 if (ns.conn == C_VERIFY_S) { 1111 if (ns.conn == C_VERIFY_S) {
1058 dev_info(DEV, "Starting Online Verify from sector %llu\n", 1112 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059 (unsigned long long)mdev->ov_position); 1113 (unsigned long long)mdev->ov_position);
@@ -1106,6 +1160,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
1106 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1160 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107 drbd_thread_restart_nowait(&mdev->receiver); 1161 drbd_thread_restart_nowait(&mdev->receiver);
1108 1162
1163 /* Resume AL writing if we get a connection */
1164 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1165 drbd_resume_al(mdev);
1166
1109 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); 1167 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110 if (ascw) { 1168 if (ascw) {
1111 ascw->os = os; 1169 ascw->os = os;
@@ -1164,6 +1222,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164 union drbd_state ns, enum chg_state_flags flags) 1222 union drbd_state ns, enum chg_state_flags flags)
1165{ 1223{
1166 enum drbd_fencing_p fp; 1224 enum drbd_fencing_p fp;
1225 enum drbd_req_event what = nothing;
1226 union drbd_state nsm = (union drbd_state){ .i = -1 };
1167 1227
1168 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { 1228 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169 clear_bit(CRASHED_PRIMARY, &mdev->flags); 1229 clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1187,17 +1247,49 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1187 /* Here we have the actions that are performed after a 1247 /* Here we have the actions that are performed after a
1188 state change. This function might sleep */ 1248 state change. This function might sleep */
1189 1249
1190 if (fp == FP_STONITH && ns.susp) { 1250 nsm.i = -1;
1191 /* case1: The outdate peer handler is successful: 1251 if (ns.susp_nod) {
1192 * case2: The connection was established again: */ 1252 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1193 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || 1253 if (ns.conn == C_CONNECTED)
1194 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { 1254 what = resend, nsm.susp_nod = 0;
1255 else /* ns.conn > C_CONNECTED */
 1256				dev_err(DEV, "Unexpected Resync going on!\n");
1257 }
1258
1259 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1260 what = restart_frozen_disk_io, nsm.susp_nod = 0;
1261
1262 }
1263
1264 if (ns.susp_fen) {
1265 /* case1: The outdate peer handler is successful: */
1266 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1195 tl_clear(mdev); 1267 tl_clear(mdev);
1268 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1269 drbd_uuid_new_current(mdev);
1270 clear_bit(NEW_CUR_UUID, &mdev->flags);
1271 drbd_md_sync(mdev);
1272 }
1196 spin_lock_irq(&mdev->req_lock); 1273 spin_lock_irq(&mdev->req_lock);
1197 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); 1274 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1198 spin_unlock_irq(&mdev->req_lock); 1275 spin_unlock_irq(&mdev->req_lock);
1199 } 1276 }
1277 /* case2: The connection was established again: */
1278 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1279 clear_bit(NEW_CUR_UUID, &mdev->flags);
1280 what = resend;
1281 nsm.susp_fen = 0;
1282 }
1283 }
1284
1285 if (what != nothing) {
1286 spin_lock_irq(&mdev->req_lock);
1287 _tl_restart(mdev, what);
1288 nsm.i &= mdev->state.i;
1289 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1290 spin_unlock_irq(&mdev->req_lock);
1200 } 1291 }
1292
1201 /* Do not change the order of the if above and the two below... */ 1293 /* Do not change the order of the if above and the two below... */
1202 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ 1294 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1203 drbd_send_uuids(mdev); 1295 drbd_send_uuids(mdev);
@@ -1216,16 +1308,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1216 if (get_ldev(mdev)) { 1308 if (get_ldev(mdev)) {
1217 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1309 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1218 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1310 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1219 drbd_uuid_new_current(mdev); 1311 if (is_susp(mdev->state)) {
1220 drbd_send_uuids(mdev); 1312 set_bit(NEW_CUR_UUID, &mdev->flags);
1313 } else {
1314 drbd_uuid_new_current(mdev);
1315 drbd_send_uuids(mdev);
1316 }
1221 } 1317 }
1222 put_ldev(mdev); 1318 put_ldev(mdev);
1223 } 1319 }
1224 } 1320 }
1225 1321
1226 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1322 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1227 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1323 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1228 drbd_uuid_new_current(mdev); 1324 drbd_uuid_new_current(mdev);
1325 drbd_send_uuids(mdev);
1326 }
1229 1327
1230 /* D_DISKLESS Peer becomes secondary */ 1328 /* D_DISKLESS Peer becomes secondary */
1231 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1329 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1267,42 +1365,51 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1267 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1365 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1268 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1366 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1269 1367
1368 /* first half of local IO error */
1270 if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1369 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1271 enum drbd_io_error_p eh; 1370 enum drbd_io_error_p eh = EP_PASS_ON;
1371
1372 if (drbd_send_state(mdev))
1373 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1374 else
1375 dev_err(DEV, "Sending state for drbd_io_error() failed\n");
1376
1377 drbd_rs_cancel_all(mdev);
1272 1378
1273 eh = EP_PASS_ON;
1274 if (get_ldev_if_state(mdev, D_FAILED)) { 1379 if (get_ldev_if_state(mdev, D_FAILED)) {
1275 eh = mdev->ldev->dc.on_io_error; 1380 eh = mdev->ldev->dc.on_io_error;
1276 put_ldev(mdev); 1381 put_ldev(mdev);
1277 } 1382 }
1383 if (eh == EP_CALL_HELPER)
1384 drbd_khelper(mdev, "local-io-error");
1385 }
1278 1386
1279 drbd_rs_cancel_all(mdev); 1387
1280 /* since get_ldev() only works as long as disk>=D_INCONSISTENT, 1388 /* second half of local IO error handling,
1281 and it is D_DISKLESS here, local_cnt can only go down, it can 1389 * after local_cnt references have reached zero: */
1282 not increase... It will reach zero */ 1390 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
1283 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1284 mdev->rs_total = 0; 1391 mdev->rs_total = 0;
1285 mdev->rs_failed = 0; 1392 mdev->rs_failed = 0;
1286 atomic_set(&mdev->rs_pending_cnt, 0); 1393 atomic_set(&mdev->rs_pending_cnt, 0);
1287
1288 spin_lock_irq(&mdev->req_lock);
1289 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1290 spin_unlock_irq(&mdev->req_lock);
1291
1292 if (eh == EP_CALL_HELPER)
1293 drbd_khelper(mdev, "local-io-error");
1294 } 1394 }
1295 1395
1296 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { 1396 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1397 /* We must still be diskless,
1398 * re-attach has to be serialized with this! */
1399 if (mdev->state.disk != D_DISKLESS)
1400 dev_err(DEV,
1401 "ASSERT FAILED: disk is %s while going diskless\n",
1402 drbd_disk_str(mdev->state.disk));
1403
1404 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
1405 * will inc/dec it frequently. Since we became D_DISKLESS, no
1406 * one has touched the protected members anymore, though, so we
1407 * are safe to free them here. */
1408 if (drbd_send_state(mdev))
1409 dev_warn(DEV, "Notified peer that I detached my disk.\n");
1410 else
1411 dev_err(DEV, "Sending state for detach failed\n");
1297 1412
1298 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1299 if (drbd_send_state(mdev))
1300 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1301 else
1302 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1303 }
1304
1305 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1306 lc_destroy(mdev->resync); 1413 lc_destroy(mdev->resync);
1307 mdev->resync = NULL; 1414 mdev->resync = NULL;
1308 lc_destroy(mdev->act_log); 1415 lc_destroy(mdev->act_log);
@@ -1311,8 +1418,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1311 drbd_free_bc(mdev->ldev); 1418 drbd_free_bc(mdev->ldev);
1312 mdev->ldev = NULL;); 1419 mdev->ldev = NULL;);
1313 1420
1314 if (mdev->md_io_tmpp) 1421 if (mdev->md_io_tmpp) {
1315 __free_page(mdev->md_io_tmpp); 1422 __free_page(mdev->md_io_tmpp);
1423 mdev->md_io_tmpp = NULL;
1424 }
1316 } 1425 }
1317 1426
1318 /* Disks got bigger while they were detached */ 1427 /* Disks got bigger while they were detached */
@@ -1328,6 +1437,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1328 (os.user_isp && !ns.user_isp)) 1437 (os.user_isp && !ns.user_isp))
1329 resume_next_sg(mdev); 1438 resume_next_sg(mdev);
1330 1439
1440 /* sync target done with resync. Explicitly notify peer, even though
1441 * it should (at least for non-empty resyncs) already know itself. */
1442 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1443 drbd_send_state(mdev);
1444
1445 /* free tl_hash if we Got thawed and are C_STANDALONE */
1446 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1447 drbd_free_tl_hash(mdev);
1448
1331 /* Upon network connection, we need to start the receiver */ 1449 /* Upon network connection, we need to start the receiver */
1332 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) 1450 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1333 drbd_thread_start(&mdev->receiver); 1451 drbd_thread_start(&mdev->receiver);
@@ -1554,7 +1672,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1554 1672
1555/* the appropriate socket mutex must be held already */ 1673/* the appropriate socket mutex must be held already */
1556int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1674int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1557 enum drbd_packets cmd, struct p_header *h, 1675 enum drbd_packets cmd, struct p_header80 *h,
1558 size_t size, unsigned msg_flags) 1676 size_t size, unsigned msg_flags)
1559{ 1677{
1560 int sent, ok; 1678 int sent, ok;
@@ -1564,7 +1682,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1564 1682
1565 h->magic = BE_DRBD_MAGIC; 1683 h->magic = BE_DRBD_MAGIC;
1566 h->command = cpu_to_be16(cmd); 1684 h->command = cpu_to_be16(cmd);
1567 h->length = cpu_to_be16(size-sizeof(struct p_header)); 1685 h->length = cpu_to_be16(size-sizeof(struct p_header80));
1568 1686
1569 sent = drbd_send(mdev, sock, h, size, msg_flags); 1687 sent = drbd_send(mdev, sock, h, size, msg_flags);
1570 1688
@@ -1579,7 +1697,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1579 * when we hold the appropriate socket mutex. 1697 * when we hold the appropriate socket mutex.
1580 */ 1698 */
1581int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1699int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1582 enum drbd_packets cmd, struct p_header *h, size_t size) 1700 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1583{ 1701{
1584 int ok = 0; 1702 int ok = 0;
1585 struct socket *sock; 1703 struct socket *sock;
@@ -1607,7 +1725,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1607int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, 1725int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1608 size_t size) 1726 size_t size)
1609{ 1727{
1610 struct p_header h; 1728 struct p_header80 h;
1611 int ok; 1729 int ok;
1612 1730
1613 h.magic = BE_DRBD_MAGIC; 1731 h.magic = BE_DRBD_MAGIC;
@@ -1629,7 +1747,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1629 1747
1630int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) 1748int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1631{ 1749{
1632 struct p_rs_param_89 *p; 1750 struct p_rs_param_95 *p;
1633 struct socket *sock; 1751 struct socket *sock;
1634 int size, rv; 1752 int size, rv;
1635 const int apv = mdev->agreed_pro_version; 1753 const int apv = mdev->agreed_pro_version;
@@ -1637,7 +1755,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1637 size = apv <= 87 ? sizeof(struct p_rs_param) 1755 size = apv <= 87 ? sizeof(struct p_rs_param)
1638 : apv == 88 ? sizeof(struct p_rs_param) 1756 : apv == 88 ? sizeof(struct p_rs_param)
1639 + strlen(mdev->sync_conf.verify_alg) + 1 1757 + strlen(mdev->sync_conf.verify_alg) + 1
1640 : /* 89 */ sizeof(struct p_rs_param_89); 1758 : apv <= 94 ? sizeof(struct p_rs_param_89)
1759 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1641 1760
1642 /* used from admin command context and receiver/worker context. 1761 /* used from admin command context and receiver/worker context.
1643 * to avoid kmalloc, grab the socket right here, 1762 * to avoid kmalloc, grab the socket right here,
@@ -1648,12 +1767,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1648 if (likely(sock != NULL)) { 1767 if (likely(sock != NULL)) {
1649 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; 1768 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1650 1769
1651 p = &mdev->data.sbuf.rs_param_89; 1770 p = &mdev->data.sbuf.rs_param_95;
1652 1771
1653 /* initialize verify_alg and csums_alg */ 1772 /* initialize verify_alg and csums_alg */
1654 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 1773 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1655 1774
1656 p->rate = cpu_to_be32(sc->rate); 1775 p->rate = cpu_to_be32(sc->rate);
1776 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1777 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1778 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1779 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1657 1780
1658 if (apv >= 88) 1781 if (apv >= 88)
1659 strcpy(p->verify_alg, mdev->sync_conf.verify_alg); 1782 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
@@ -1709,7 +1832,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
1709 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); 1832 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1710 1833
1711 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, 1834 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1712 (struct p_header *)p, size); 1835 (struct p_header80 *)p, size);
1713 kfree(p); 1836 kfree(p);
1714 return rv; 1837 return rv;
1715} 1838}
@@ -1735,7 +1858,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1735 put_ldev(mdev); 1858 put_ldev(mdev);
1736 1859
1737 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, 1860 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1738 (struct p_header *)&p, sizeof(p)); 1861 (struct p_header80 *)&p, sizeof(p));
1739} 1862}
1740 1863
1741int drbd_send_uuids(struct drbd_conf *mdev) 1864int drbd_send_uuids(struct drbd_conf *mdev)
@@ -1756,7 +1879,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1756 p.uuid = cpu_to_be64(val); 1879 p.uuid = cpu_to_be64(val);
1757 1880
1758 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, 1881 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1759 (struct p_header *)&p, sizeof(p)); 1882 (struct p_header80 *)&p, sizeof(p));
1760} 1883}
1761 1884
1762int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) 1885int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
@@ -1786,7 +1909,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
1786 p.dds_flags = cpu_to_be16(flags); 1909 p.dds_flags = cpu_to_be16(flags);
1787 1910
1788 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, 1911 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1789 (struct p_header *)&p, sizeof(p)); 1912 (struct p_header80 *)&p, sizeof(p));
1790 return ok; 1913 return ok;
1791} 1914}
1792 1915
@@ -1811,7 +1934,7 @@ int drbd_send_state(struct drbd_conf *mdev)
1811 1934
1812 if (likely(sock != NULL)) { 1935 if (likely(sock != NULL)) {
1813 ok = _drbd_send_cmd(mdev, sock, P_STATE, 1936 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1814 (struct p_header *)&p, sizeof(p), 0); 1937 (struct p_header80 *)&p, sizeof(p), 0);
1815 } 1938 }
1816 1939
1817 mutex_unlock(&mdev->data.mutex); 1940 mutex_unlock(&mdev->data.mutex);
@@ -1829,7 +1952,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
1829 p.val = cpu_to_be32(val.i); 1952 p.val = cpu_to_be32(val.i);
1830 1953
1831 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, 1954 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1832 (struct p_header *)&p, sizeof(p)); 1955 (struct p_header80 *)&p, sizeof(p));
1833} 1956}
1834 1957
1835int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) 1958int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
@@ -1839,7 +1962,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1839 p.retcode = cpu_to_be32(retcode); 1962 p.retcode = cpu_to_be32(retcode);
1840 1963
1841 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, 1964 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1842 (struct p_header *)&p, sizeof(p)); 1965 (struct p_header80 *)&p, sizeof(p));
1843} 1966}
1844 1967
1845int fill_bitmap_rle_bits(struct drbd_conf *mdev, 1968int fill_bitmap_rle_bits(struct drbd_conf *mdev,
@@ -1938,7 +2061,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1938 2061
1939enum { OK, FAILED, DONE } 2062enum { OK, FAILED, DONE }
1940send_bitmap_rle_or_plain(struct drbd_conf *mdev, 2063send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1941 struct p_header *h, struct bm_xfer_ctx *c) 2064 struct p_header80 *h, struct bm_xfer_ctx *c)
1942{ 2065{
1943 struct p_compressed_bm *p = (void*)h; 2066 struct p_compressed_bm *p = (void*)h;
1944 unsigned long num_words; 2067 unsigned long num_words;
@@ -1968,12 +2091,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1968 if (len) 2091 if (len)
1969 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); 2092 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1970 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, 2093 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1971 h, sizeof(struct p_header) + len, 0); 2094 h, sizeof(struct p_header80) + len, 0);
1972 c->word_offset += num_words; 2095 c->word_offset += num_words;
1973 c->bit_offset = c->word_offset * BITS_PER_LONG; 2096 c->bit_offset = c->word_offset * BITS_PER_LONG;
1974 2097
1975 c->packets[1]++; 2098 c->packets[1]++;
1976 c->bytes[1] += sizeof(struct p_header) + len; 2099 c->bytes[1] += sizeof(struct p_header80) + len;
1977 2100
1978 if (c->bit_offset > c->bm_bits) 2101 if (c->bit_offset > c->bm_bits)
1979 c->bit_offset = c->bm_bits; 2102 c->bit_offset = c->bm_bits;
@@ -1989,14 +2112,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1989int _drbd_send_bitmap(struct drbd_conf *mdev) 2112int _drbd_send_bitmap(struct drbd_conf *mdev)
1990{ 2113{
1991 struct bm_xfer_ctx c; 2114 struct bm_xfer_ctx c;
1992 struct p_header *p; 2115 struct p_header80 *p;
1993 int ret; 2116 int ret;
1994 2117
1995 ERR_IF(!mdev->bitmap) return FALSE; 2118 ERR_IF(!mdev->bitmap) return FALSE;
1996 2119
1997 /* maybe we should use some per thread scratch page, 2120 /* maybe we should use some per thread scratch page,
1998 * and allocate that during initial device creation? */ 2121 * and allocate that during initial device creation? */
1999 p = (struct p_header *) __get_free_page(GFP_NOIO); 2122 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2000 if (!p) { 2123 if (!p) {
2001 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); 2124 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2002 return FALSE; 2125 return FALSE;
@@ -2054,7 +2177,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2054 if (mdev->state.conn < C_CONNECTED) 2177 if (mdev->state.conn < C_CONNECTED)
2055 return FALSE; 2178 return FALSE;
2056 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, 2179 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2057 (struct p_header *)&p, sizeof(p)); 2180 (struct p_header80 *)&p, sizeof(p));
2058 return ok; 2181 return ok;
2059} 2182}
2060 2183
@@ -2082,17 +2205,18 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2082 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) 2205 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2083 return FALSE; 2206 return FALSE;
2084 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, 2207 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2085 (struct p_header *)&p, sizeof(p)); 2208 (struct p_header80 *)&p, sizeof(p));
2086 return ok; 2209 return ok;
2087} 2210}
2088 2211
2212/* dp->sector and dp->block_id already/still in network byte order,
2213 * data_size is payload size according to dp->head,
2214 * and may need to be corrected for digest size. */
2089int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, 2215int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2090 struct p_data *dp) 2216 struct p_data *dp, int data_size)
2091{ 2217{
2092 const int header_size = sizeof(struct p_data) 2218 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2093 - sizeof(struct p_header); 2219 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2094 int data_size = ((struct p_header *)dp)->length - header_size;
2095
2096 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), 2220 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2097 dp->block_id); 2221 dp->block_id);
2098} 2222}
@@ -2140,7 +2264,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2140 p.blksize = cpu_to_be32(size); 2264 p.blksize = cpu_to_be32(size);
2141 2265
2142 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, 2266 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2143 (struct p_header *)&p, sizeof(p)); 2267 (struct p_header80 *)&p, sizeof(p));
2144 return ok; 2268 return ok;
2145} 2269}
2146 2270
@@ -2158,7 +2282,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
2158 2282
2159 p.head.magic = BE_DRBD_MAGIC; 2283 p.head.magic = BE_DRBD_MAGIC;
2160 p.head.command = cpu_to_be16(cmd); 2284 p.head.command = cpu_to_be16(cmd);
2161 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); 2285 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2162 2286
2163 mutex_lock(&mdev->data.mutex); 2287 mutex_lock(&mdev->data.mutex);
2164 2288
@@ -2180,7 +2304,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2180 p.blksize = cpu_to_be32(size); 2304 p.blksize = cpu_to_be32(size);
2181 2305
2182 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, 2306 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2183 (struct p_header *)&p, sizeof(p)); 2307 (struct p_header80 *)&p, sizeof(p));
2184 return ok; 2308 return ok;
2185} 2309}
2186 2310
@@ -2332,6 +2456,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2332 return 1; 2456 return 1;
2333} 2457}
2334 2458
2459static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2460{
2461 if (mdev->agreed_pro_version >= 95)
2462 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2463 (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
2464 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2465 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2466 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2467 else
2468 return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
2469}
2470
2335/* Used to send write requests 2471/* Used to send write requests
2336 * R_PRIMARY -> Peer (P_DATA) 2472 * R_PRIMARY -> Peer (P_DATA)
2337 */ 2473 */
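bio_flags_to_wire() above covers only the sending side; the peer has to map the DP_* wire flags back to bio rw flags before submitting the write. A hedged sketch of that inverse for protocol 95 and up (function name illustrative; the real translation lives in drbd_receiver.c and is not part of this hunk):

	/* Sketch: inverse of bio_flags_to_wire() for agreed_pro_version >= 95,
	 * using the flag names from the hunk above. */
	static unsigned long wire_flags_to_bio_rw(u32 dpf)
	{
		return  (dpf & DP_RW_SYNC ? REQ_SYNC    : 0) |
			(dpf & DP_UNPLUG  ? REQ_UNPLUG  : 0) |
			(dpf & DP_FUA     ? REQ_FUA     : 0) |
			(dpf & DP_FLUSH   ? REQ_FLUSH   : 0) |
			(dpf & DP_DISCARD ? REQ_DISCARD : 0);
	}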
@@ -2349,30 +2485,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2349 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 2485 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2350 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2486 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2351 2487
2352 p.head.magic = BE_DRBD_MAGIC; 2488 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2353 p.head.command = cpu_to_be16(P_DATA); 2489 p.head.h80.magic = BE_DRBD_MAGIC;
2354 p.head.length = 2490 p.head.h80.command = cpu_to_be16(P_DATA);
2355 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); 2491 p.head.h80.length =
2492 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2493 } else {
2494 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2495 p.head.h95.command = cpu_to_be16(P_DATA);
2496 p.head.h95.length =
2497 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2498 }
2356 2499
2357 p.sector = cpu_to_be64(req->sector); 2500 p.sector = cpu_to_be64(req->sector);
2358 p.block_id = (unsigned long)req; 2501 p.block_id = (unsigned long)req;
2359 p.seq_num = cpu_to_be32(req->seq_num = 2502 p.seq_num = cpu_to_be32(req->seq_num =
2360 atomic_add_return(1, &mdev->packet_seq)); 2503 atomic_add_return(1, &mdev->packet_seq));
2361 dp_flags = 0;
2362 2504
2363 /* NOTE: no need to check if barriers supported here as we would 2505 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2364 * not pass the test in make_request_common in that case 2506
2365 */
2366 if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
2367 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2368 /* dp_flags |= DP_HARDBARRIER; */
2369 }
2370 if (req->master_bio->bi_rw & REQ_SYNC)
2371 dp_flags |= DP_RW_SYNC;
2372 /* for now handle SYNCIO and UNPLUG
2373 * as if they still were one and the same flag */
2374 if (req->master_bio->bi_rw & REQ_UNPLUG)
2375 dp_flags |= DP_RW_SYNC;
2376 if (mdev->state.conn >= C_SYNC_SOURCE && 2507 if (mdev->state.conn >= C_SYNC_SOURCE &&
2377 mdev->state.conn <= C_PAUSED_SYNC_T) 2508 mdev->state.conn <= C_PAUSED_SYNC_T)
2378 dp_flags |= DP_MAY_SET_IN_SYNC; 2509 dp_flags |= DP_MAY_SET_IN_SYNC;
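drbd_send_dblock() above and drbd_send_block() in the next hunk repeat the same header selection: a payload whose length still fits the 16-bit length field of the legacy p_header80 keeps the old magic, anything larger switches to p_header95 with the big magic and a 32-bit length. Condensed into one helper for illustration (names taken from the hunks; presented as a sketch, not lifted from the tree):

	/* Sketch: pick the on-wire header variant by payload size.
	 * 'length' excludes the header itself, as in the hunks above. */
	static void prepare_header_sketch(union p_header *h, u16 cmd, u32 length)
	{
		if (length <= DRBD_MAX_SIZE_H80_PACKET) {	/* fits 16-bit length */
			h->h80.magic   = BE_DRBD_MAGIC;
			h->h80.command = cpu_to_be16(cmd);
			h->h80.length  = cpu_to_be16(length);
		} else {					/* needs 32-bit length */
			h->h95.magic   = BE_DRBD_MAGIC_BIG;
			h->h95.command = cpu_to_be16(cmd);
			h->h95.length  = cpu_to_be32(length);
		}
	}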
@@ -2413,10 +2544,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2413 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 2544 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2414 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2545 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2415 2546
2416 p.head.magic = BE_DRBD_MAGIC; 2547 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2417 p.head.command = cpu_to_be16(cmd); 2548 p.head.h80.magic = BE_DRBD_MAGIC;
2418 p.head.length = 2549 p.head.h80.command = cpu_to_be16(cmd);
2419 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); 2550 p.head.h80.length =
2551 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2552 } else {
2553 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2554 p.head.h95.command = cpu_to_be16(cmd);
2555 p.head.h95.length =
2556 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2557 }
2420 2558
2421 p.sector = cpu_to_be64(e->sector); 2559 p.sector = cpu_to_be64(e->sector);
2422 p.block_id = e->block_id; 2560 p.block_id = e->block_id;
@@ -2429,8 +2567,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2429 if (!drbd_get_data_sock(mdev)) 2567 if (!drbd_get_data_sock(mdev))
2430 return 0; 2568 return 0;
2431 2569
2432 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, 2570 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2433 sizeof(p), dgs ? MSG_MORE : 0);
2434 if (ok && dgs) { 2571 if (ok && dgs) {
2435 dgb = mdev->int_dig_out; 2572 dgb = mdev->int_dig_out;
2436 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); 2573 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
@@ -2536,7 +2673,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2536 unsigned long flags; 2673 unsigned long flags;
2537 int rv = 0; 2674 int rv = 0;
2538 2675
2539 lock_kernel(); 2676 mutex_lock(&drbd_main_mutex);
2540 spin_lock_irqsave(&mdev->req_lock, flags); 2677 spin_lock_irqsave(&mdev->req_lock, flags);
2541 /* to have a stable mdev->state.role 2678 /* to have a stable mdev->state.role
2542 * and no race with updating open_cnt */ 2679 * and no race with updating open_cnt */
@@ -2551,7 +2688,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2551 if (!rv) 2688 if (!rv)
2552 mdev->open_cnt++; 2689 mdev->open_cnt++;
2553 spin_unlock_irqrestore(&mdev->req_lock, flags); 2690 spin_unlock_irqrestore(&mdev->req_lock, flags);
2554 unlock_kernel(); 2691 mutex_unlock(&drbd_main_mutex);
2555 2692
2556 return rv; 2693 return rv;
2557} 2694}
@@ -2559,9 +2696,9 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
2559static int drbd_release(struct gendisk *gd, fmode_t mode) 2696static int drbd_release(struct gendisk *gd, fmode_t mode)
2560{ 2697{
2561 struct drbd_conf *mdev = gd->private_data; 2698 struct drbd_conf *mdev = gd->private_data;
2562 lock_kernel(); 2699 mutex_lock(&drbd_main_mutex);
2563 mdev->open_cnt--; 2700 mdev->open_cnt--;
2564 unlock_kernel(); 2701 mutex_unlock(&drbd_main_mutex);
2565 return 0; 2702 return 0;
2566} 2703}
2567 2704
@@ -2605,7 +2742,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
2605 /* .verify_alg = */ {}, 0, 2742 /* .verify_alg = */ {}, 0,
2606 /* .cpu_mask = */ {}, 0, 2743 /* .cpu_mask = */ {}, 0,
2607 /* .csums_alg = */ {}, 0, 2744 /* .csums_alg = */ {}, 0,
2608 /* .use_rle = */ 0 2745 /* .use_rle = */ 0,
2746 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2747 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2748 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2749 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
2750 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2751 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
2609 }; 2752 };
2610 2753
2611 /* Have to use that way, because the layout differs between 2754 /* Have to use that way, because the layout differs between
@@ -2616,7 +2759,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
2616 .conn = C_STANDALONE, 2759 .conn = C_STANDALONE,
2617 .disk = D_DISKLESS, 2760 .disk = D_DISKLESS,
2618 .pdsk = D_UNKNOWN, 2761 .pdsk = D_UNKNOWN,
2619 .susp = 0 2762 .susp = 0,
2763 .susp_nod = 0,
2764 .susp_fen = 0
2620 } }; 2765 } };
2621} 2766}
2622 2767
@@ -2640,6 +2785,9 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2640 atomic_set(&mdev->net_cnt, 0); 2785 atomic_set(&mdev->net_cnt, 0);
2641 atomic_set(&mdev->packet_seq, 0); 2786 atomic_set(&mdev->packet_seq, 0);
2642 atomic_set(&mdev->pp_in_use, 0); 2787 atomic_set(&mdev->pp_in_use, 0);
2788 atomic_set(&mdev->pp_in_use_by_net, 0);
2789 atomic_set(&mdev->rs_sect_in, 0);
2790 atomic_set(&mdev->rs_sect_ev, 0);
2643 2791
2644 mutex_init(&mdev->md_io_mutex); 2792 mutex_init(&mdev->md_io_mutex);
2645 mutex_init(&mdev->data.mutex); 2793 mutex_init(&mdev->data.mutex);
@@ -2666,11 +2814,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2666 INIT_LIST_HEAD(&mdev->meta.work.q); 2814 INIT_LIST_HEAD(&mdev->meta.work.q);
2667 INIT_LIST_HEAD(&mdev->resync_work.list); 2815 INIT_LIST_HEAD(&mdev->resync_work.list);
2668 INIT_LIST_HEAD(&mdev->unplug_work.list); 2816 INIT_LIST_HEAD(&mdev->unplug_work.list);
2817 INIT_LIST_HEAD(&mdev->go_diskless.list);
2669 INIT_LIST_HEAD(&mdev->md_sync_work.list); 2818 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2670 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 2819 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2671 2820
2672 mdev->resync_work.cb = w_resync_inactive; 2821 mdev->resync_work.cb = w_resync_inactive;
2673 mdev->unplug_work.cb = w_send_write_hint; 2822 mdev->unplug_work.cb = w_send_write_hint;
2823 mdev->go_diskless.cb = w_go_diskless;
2674 mdev->md_sync_work.cb = w_md_sync; 2824 mdev->md_sync_work.cb = w_md_sync;
2675 mdev->bm_io_work.w.cb = w_bitmap_io; 2825 mdev->bm_io_work.w.cb = w_bitmap_io;
2676 init_timer(&mdev->resync_timer); 2826 init_timer(&mdev->resync_timer);
@@ -2682,6 +2832,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2682 2832
2683 init_waitqueue_head(&mdev->misc_wait); 2833 init_waitqueue_head(&mdev->misc_wait);
2684 init_waitqueue_head(&mdev->state_wait); 2834 init_waitqueue_head(&mdev->state_wait);
2835 init_waitqueue_head(&mdev->net_cnt_wait);
2685 init_waitqueue_head(&mdev->ee_wait); 2836 init_waitqueue_head(&mdev->ee_wait);
2686 init_waitqueue_head(&mdev->al_wait); 2837 init_waitqueue_head(&mdev->al_wait);
2687 init_waitqueue_head(&mdev->seq_wait); 2838 init_waitqueue_head(&mdev->seq_wait);
@@ -2697,6 +2848,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2697 2848
2698void drbd_mdev_cleanup(struct drbd_conf *mdev) 2849void drbd_mdev_cleanup(struct drbd_conf *mdev)
2699{ 2850{
2851 int i;
2700 if (mdev->receiver.t_state != None) 2852 if (mdev->receiver.t_state != None)
2701 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", 2853 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2702 mdev->receiver.t_state); 2854 mdev->receiver.t_state);
@@ -2713,9 +2865,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2713 mdev->p_size = 2865 mdev->p_size =
2714 mdev->rs_start = 2866 mdev->rs_start =
2715 mdev->rs_total = 2867 mdev->rs_total =
2716 mdev->rs_failed = 2868 mdev->rs_failed = 0;
2717 mdev->rs_mark_left = 2869 mdev->rs_last_events = 0;
2718 mdev->rs_mark_time = 0; 2870 mdev->rs_last_sect_ev = 0;
2871 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2872 mdev->rs_mark_left[i] = 0;
2873 mdev->rs_mark_time[i] = 0;
2874 }
2719 D_ASSERT(mdev->net_conf == NULL); 2875 D_ASSERT(mdev->net_conf == NULL);
2720 2876
2721 drbd_set_my_capacity(mdev, 0); 2877 drbd_set_my_capacity(mdev, 0);
@@ -2726,6 +2882,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2726 } 2882 }
2727 2883
2728 drbd_free_resources(mdev); 2884 drbd_free_resources(mdev);
2885 clear_bit(AL_SUSPENDED, &mdev->flags);
2729 2886
2730 /* 2887 /*
2731 * currently we drbd_init_ee only on module load, so 2888 * currently we drbd_init_ee only on module load, so
@@ -2741,6 +2898,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2741 D_ASSERT(list_empty(&mdev->meta.work.q)); 2898 D_ASSERT(list_empty(&mdev->meta.work.q));
2742 D_ASSERT(list_empty(&mdev->resync_work.list)); 2899 D_ASSERT(list_empty(&mdev->resync_work.list));
2743 D_ASSERT(list_empty(&mdev->unplug_work.list)); 2900 D_ASSERT(list_empty(&mdev->unplug_work.list));
2901 D_ASSERT(list_empty(&mdev->go_diskless.list));
2744 2902
2745} 2903}
2746 2904
@@ -3280,9 +3438,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
3280 sector_t sector; 3438 sector_t sector;
3281 int i; 3439 int i;
3282 3440
3441 del_timer(&mdev->md_sync_timer);
3442 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3283 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 3443 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3284 return; 3444 return;
3285 del_timer(&mdev->md_sync_timer);
3286 3445
3287 /* We use here D_FAILED and not D_ATTACHING because we try to write 3446 /* We use here D_FAILED and not D_ATTACHING because we try to write
3288 * metadata even if we detach due to a disk failure! */ 3447 * metadata even if we detach due to a disk failure! */
@@ -3310,12 +3469,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
3310 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 3469 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3311 sector = mdev->ldev->md.md_offset; 3470 sector = mdev->ldev->md.md_offset;
3312 3471
3313 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 3472 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3314 clear_bit(MD_DIRTY, &mdev->flags);
3315 } else {
3316 /* this was a try anyways ... */ 3473 /* this was a try anyways ... */
3317 dev_err(DEV, "meta data update failed!\n"); 3474 dev_err(DEV, "meta data update failed!\n");
3318
3319 drbd_chk_io_error(mdev, 1, TRUE); 3475 drbd_chk_io_error(mdev, 1, TRUE);
3320 } 3476 }
3321 3477
@@ -3402,6 +3558,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3402 return rv; 3558 return rv;
3403} 3559}
3404 3560
3561static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3562{
3563 static char *uuid_str[UI_EXTENDED_SIZE] = {
3564 [UI_CURRENT] = "CURRENT",
3565 [UI_BITMAP] = "BITMAP",
3566 [UI_HISTORY_START] = "HISTORY_START",
3567 [UI_HISTORY_END] = "HISTORY_END",
3568 [UI_SIZE] = "SIZE",
3569 [UI_FLAGS] = "FLAGS",
3570 };
3571
3572 if (index >= UI_EXTENDED_SIZE) {
3573 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3574 return;
3575 }
3576
3577 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3578 uuid_str[index],
3579 (unsigned long long)mdev->ldev->md.uuid[index]);
3580}
3581
3582
3405/** 3583/**
3406 * drbd_md_mark_dirty() - Mark meta data super block as dirty 3584 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3407 * @mdev: DRBD device. 3585 * @mdev: DRBD device.
@@ -3410,19 +3588,31 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3410 * the meta-data super block. This function sets MD_DIRTY, and starts a 3588 * the meta-data super block. This function sets MD_DIRTY, and starts a
3411 * timer that ensures that within five seconds you have to call drbd_md_sync(). 3589 * timer that ensures that within five seconds you have to call drbd_md_sync().
3412 */ 3590 */
3591#ifdef DEBUG
3592void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3593{
3594 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3595 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3596 mdev->last_md_mark_dirty.line = line;
3597 mdev->last_md_mark_dirty.func = func;
3598 }
3599}
3600#else
3413void drbd_md_mark_dirty(struct drbd_conf *mdev) 3601void drbd_md_mark_dirty(struct drbd_conf *mdev)
3414{ 3602{
3415 set_bit(MD_DIRTY, &mdev->flags); 3603 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3416 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); 3604 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3417} 3605}
3418 3606#endif
3419 3607
3420static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) 3608static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3421{ 3609{
3422 int i; 3610 int i;
3423 3611
3424 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) 3612 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3425 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; 3613 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3614 debug_drbd_uuid(mdev, i+1);
3615 }
3426} 3616}
3427 3617
3428void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) 3618void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
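With the DEBUG variant above, drbd_md_mark_dirty_() also records which caller last dirtied the meta data; existing call sites presumably stay unchanged through a wrapper macro along these lines (assumed, it would live in drbd_int.h and is not shown in this diff):

	#ifdef DEBUG
	/* Assumed companion macro: keep the one-argument form at call sites
	 * while the DEBUG build records the position of the last dirtier. */
	#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__, __func__)
	#endif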
@@ -3437,6 +3627,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3437 } 3627 }
3438 3628
3439 mdev->ldev->md.uuid[idx] = val; 3629 mdev->ldev->md.uuid[idx] = val;
3630 debug_drbd_uuid(mdev, idx);
3440 drbd_md_mark_dirty(mdev); 3631 drbd_md_mark_dirty(mdev);
3441} 3632}
3442 3633
@@ -3446,6 +3637,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3446 if (mdev->ldev->md.uuid[idx]) { 3637 if (mdev->ldev->md.uuid[idx]) {
3447 drbd_uuid_move_history(mdev); 3638 drbd_uuid_move_history(mdev);
3448 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; 3639 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3640 debug_drbd_uuid(mdev, UI_HISTORY_START);
3449 } 3641 }
3450 _drbd_uuid_set(mdev, idx, val); 3642 _drbd_uuid_set(mdev, idx, val);
3451} 3643}
@@ -3464,6 +3656,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3464 dev_info(DEV, "Creating new current UUID\n"); 3656 dev_info(DEV, "Creating new current UUID\n");
3465 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); 3657 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3466 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; 3658 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3659 debug_drbd_uuid(mdev, UI_BITMAP);
3467 3660
3468 get_random_bytes(&val, sizeof(u64)); 3661 get_random_bytes(&val, sizeof(u64));
3469 _drbd_uuid_set(mdev, UI_CURRENT, val); 3662 _drbd_uuid_set(mdev, UI_CURRENT, val);
@@ -3478,6 +3671,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3478 drbd_uuid_move_history(mdev); 3671 drbd_uuid_move_history(mdev);
3479 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; 3672 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3480 mdev->ldev->md.uuid[UI_BITMAP] = 0; 3673 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3674 debug_drbd_uuid(mdev, UI_HISTORY_START);
3675 debug_drbd_uuid(mdev, UI_BITMAP);
3481 } else { 3676 } else {
3482 if (mdev->ldev->md.uuid[UI_BITMAP]) 3677 if (mdev->ldev->md.uuid[UI_BITMAP])
3483 dev_warn(DEV, "bm UUID already set"); 3678 dev_warn(DEV, "bm UUID already set");
@@ -3485,6 +3680,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3485 mdev->ldev->md.uuid[UI_BITMAP] = val; 3680 mdev->ldev->md.uuid[UI_BITMAP] = val;
3486 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); 3681 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3487 3682
3683 debug_drbd_uuid(mdev, UI_BITMAP);
3488 } 3684 }
3489 drbd_md_mark_dirty(mdev); 3685 drbd_md_mark_dirty(mdev);
3490} 3686}
@@ -3527,6 +3723,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3527{ 3723{
3528 int rv = -EIO; 3724 int rv = -EIO;
3529 3725
3726 drbd_resume_al(mdev);
3530 if (get_ldev_if_state(mdev, D_ATTACHING)) { 3727 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3531 drbd_bm_clear_all(mdev); 3728 drbd_bm_clear_all(mdev);
3532 rv = drbd_bm_write(mdev); 3729 rv = drbd_bm_write(mdev);
@@ -3559,6 +3756,32 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3559 return 1; 3756 return 1;
3560} 3757}
3561 3758
3759static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3760{
3761 D_ASSERT(mdev->state.disk == D_FAILED);
3762 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3763 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3764 * the protected members anymore, though, so in the after_state_ch work
3765 * it will be safe to free them. */
3766 drbd_force_state(mdev, NS(disk, D_DISKLESS));
 3768	/* We need to wait for return of references checked out while we were
 3769	 * still D_FAILED, though (drbd_md_sync, bitmap io). */
3769 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3770
3771 clear_bit(GO_DISKLESS, &mdev->flags);
3772 return 1;
3773}
3774
3775void drbd_go_diskless(struct drbd_conf *mdev)
3776{
3777 D_ASSERT(mdev->state.disk == D_FAILED);
3778 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3779 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3780 /* don't drbd_queue_work_front,
3781 * we need to serialize with the after_state_ch work
3782 * of the -> D_FAILED transition. */
3783}
3784
3562/** 3785/**
3563 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3786 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3564 * @mdev: DRBD device. 3787 * @mdev: DRBD device.
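Taken together with the after_state_ch() changes earlier in this patch, detaching now happens in two halves; a compressed editorial summary of the sequence these hunks implement (callers of drbd_go_diskless() are outside this excerpt):

	/*
	 * disk -> D_FAILED      after_state_ch(): notify the peer, cancel resync,
	 *                       optionally run the local-io-error helper
	 * drbd_go_diskless()    sets GO_DISKLESS, queues w_go_diskless on the worker
	 * w_go_diskless()       forces disk -> D_DISKLESS, then waits until
	 *                       local_cnt drops to zero
	 * disk -> D_DISKLESS    after_state_ch(): notify the peer, free act_log,
	 *                       the resync LRU and the backing device
	 */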
@@ -3655,8 +3878,11 @@ static void md_sync_timer_fn(unsigned long data)
3655static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3878static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3656{ 3879{
3657 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); 3880 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3881#ifdef DEBUG
3882 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3883 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3884#endif
3658 drbd_md_sync(mdev); 3885 drbd_md_sync(mdev);
3659
3660 return 1; 3886 return 1;
3661} 3887}
3662 3888
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73131c5ae339..87925e97e613 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -33,10 +33,13 @@
33#include <linux/blkpg.h> 33#include <linux/blkpg.h>
34#include <linux/cpumask.h> 34#include <linux/cpumask.h>
35#include "drbd_int.h" 35#include "drbd_int.h"
36#include "drbd_req.h"
36#include "drbd_wrappers.h" 37#include "drbd_wrappers.h"
37#include <asm/unaligned.h> 38#include <asm/unaligned.h>
38#include <linux/drbd_tag_magic.h> 39#include <linux/drbd_tag_magic.h>
39#include <linux/drbd_limits.h> 40#include <linux/drbd_limits.h>
41#include <linux/compiler.h>
42#include <linux/kthread.h>
40 43
41static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); 44static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
42static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); 45static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
@@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
169 put_net_conf(mdev); 172 put_net_conf(mdev);
170 } 173 }
171 174
175 /* The helper may take some time.
176 * write out any unsynced meta data changes now */
177 drbd_md_sync(mdev);
178
172 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); 179 dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
173 180
174 drbd_bcast_ev_helper(mdev, cmd); 181 drbd_bcast_ev_helper(mdev, cmd);
@@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
202 put_ldev(mdev); 209 put_ldev(mdev);
203 } else { 210 } else {
204 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); 211 dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
205 return mdev->state.pdsk; 212 nps = mdev->state.pdsk;
213 goto out;
206 } 214 }
207 215
208 if (fp == FP_STONITH)
209 _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
210
211 r = drbd_khelper(mdev, "fence-peer"); 216 r = drbd_khelper(mdev, "fence-peer");
212 217
213 switch ((r>>8) & 0xff) { 218 switch ((r>>8) & 0xff) {
@@ -252,9 +257,36 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
252 257
253 dev_info(DEV, "fence-peer helper returned %d (%s)\n", 258 dev_info(DEV, "fence-peer helper returned %d (%s)\n",
254 (r>>8) & 0xff, ex_to_string); 259 (r>>8) & 0xff, ex_to_string);
260
261out:
262 if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
263 /* The handler was not successful... unfreeze here, the
264 state engine can not unfreeze... */
265 _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
266 }
267
255 return nps; 268 return nps;
256} 269}
257 270
271static int _try_outdate_peer_async(void *data)
272{
273 struct drbd_conf *mdev = (struct drbd_conf *)data;
274 enum drbd_disk_state nps;
275
276 nps = drbd_try_outdate_peer(mdev);
277 drbd_request_state(mdev, NS(pdsk, nps));
278
279 return 0;
280}
281
282void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
283{
284 struct task_struct *opa;
285
286 opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
287 if (IS_ERR(opa))
288 dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
289}
258 290
259int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) 291int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
260{ 292{
@@ -394,6 +426,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
394 return r; 426 return r;
395} 427}
396 428
429static struct drbd_conf *ensure_mdev(int minor, int create)
430{
431 struct drbd_conf *mdev;
432
433 if (minor >= minor_count)
434 return NULL;
435
436 mdev = minor_to_mdev(minor);
437
438 if (!mdev && create) {
439 struct gendisk *disk = NULL;
440 mdev = drbd_new_device(minor);
441
442 spin_lock_irq(&drbd_pp_lock);
443 if (minor_table[minor] == NULL) {
444 minor_table[minor] = mdev;
445 disk = mdev->vdisk;
446 mdev = NULL;
447 } /* else: we lost the race */
448 spin_unlock_irq(&drbd_pp_lock);
449
450 if (disk) /* we won the race above */
451 /* in case we ever add a drbd_delete_device(),
452 * don't forget the del_gendisk! */
453 add_disk(disk);
454 else /* we lost the race above */
455 drbd_free_mdev(mdev);
456
457 mdev = minor_to_mdev(minor);
458 }
459
460 return mdev;
461}
397 462
398static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 463static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
399 struct drbd_nl_cfg_reply *reply) 464 struct drbd_nl_cfg_reply *reply)
@@ -494,6 +559,8 @@ char *ppsize(char *buf, unsigned long long size)
494void drbd_suspend_io(struct drbd_conf *mdev) 559void drbd_suspend_io(struct drbd_conf *mdev)
495{ 560{
496 set_bit(SUSPEND_IO, &mdev->flags); 561 set_bit(SUSPEND_IO, &mdev->flags);
562 if (is_susp(mdev->state))
563 return;
497 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); 564 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
498} 565}
499 566
@@ -713,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
713 blk_queue_segment_boundary(q, PAGE_SIZE-1); 780 blk_queue_segment_boundary(q, PAGE_SIZE-1);
714 blk_stack_limits(&q->limits, &b->limits, 0); 781 blk_stack_limits(&q->limits, &b->limits, 0);
715 782
716 if (b->merge_bvec_fn)
717 dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
718 b->merge_bvec_fn);
719 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); 783 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
720 784
721 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 785 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -729,14 +793,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
729/* serialize deconfig (worker exiting, doing cleanup) 793/* serialize deconfig (worker exiting, doing cleanup)
730 * and reconfig (drbdsetup disk, drbdsetup net) 794 * and reconfig (drbdsetup disk, drbdsetup net)
731 * 795 *
732 * wait for a potentially exiting worker, then restart it, 796 * Wait for a potentially exiting worker, then restart it,
733 * or start a new one. 797 * or start a new one. Flush any pending work, there may still be an
798 * after_state_change queued.
734 */ 799 */
735static void drbd_reconfig_start(struct drbd_conf *mdev) 800static void drbd_reconfig_start(struct drbd_conf *mdev)
736{ 801{
737 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); 802 wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
738 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); 803 wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
739 drbd_thread_start(&mdev->worker); 804 drbd_thread_start(&mdev->worker);
805 drbd_flush_workqueue(mdev);
740} 806}
741 807
742/* if still unconfigured, stops worker again. 808/* if still unconfigured, stops worker again.
@@ -756,6 +822,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
756 wake_up(&mdev->state_wait); 822 wake_up(&mdev->state_wait);
757} 823}
758 824
 825 /* Make sure IO is suspended before calling this function. */
826static void drbd_suspend_al(struct drbd_conf *mdev)
827{
828 int s = 0;
829
830 if (lc_try_lock(mdev->act_log)) {
831 drbd_al_shrink(mdev);
832 lc_unlock(mdev->act_log);
833 } else {
834 dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
835 return;
836 }
837
838 spin_lock_irq(&mdev->req_lock);
839 if (mdev->state.conn < C_CONNECTED)
840 s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
841
842 spin_unlock_irq(&mdev->req_lock);
843
844 if (s)
845 dev_info(DEV, "Suspended AL updates\n");
846}
847
759/* does always return 0; 848/* does always return 0;
760 * interesting return code is in reply->ret_code */ 849 * interesting return code is in reply->ret_code */
761static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 850static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -769,6 +858,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
769 struct inode *inode, *inode2; 858 struct inode *inode, *inode2;
770 struct lru_cache *resync_lru = NULL; 859 struct lru_cache *resync_lru = NULL;
771 union drbd_state ns, os; 860 union drbd_state ns, os;
861 unsigned int max_seg_s;
772 int rv; 862 int rv;
773 int cp_discovered = 0; 863 int cp_discovered = 0;
774 int logical_block_size; 864 int logical_block_size;
@@ -803,6 +893,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
803 goto fail; 893 goto fail;
804 } 894 }
805 895
896 if (get_net_conf(mdev)) {
897 int prot = mdev->net_conf->wire_protocol;
898 put_net_conf(mdev);
899 if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
900 retcode = ERR_STONITH_AND_PROT_A;
901 goto fail;
902 }
903 }
904
806 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); 905 nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
807 if (IS_ERR(nbc->lo_file)) { 906 if (IS_ERR(nbc->lo_file)) {
808 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, 907 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
@@ -924,7 +1023,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
924 1023
925 drbd_suspend_io(mdev); 1024 drbd_suspend_io(mdev);
926 /* also wait for the last barrier ack. */ 1025 /* also wait for the last barrier ack. */
927 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); 1026 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
928 /* and for any other previously queued work */ 1027 /* and for any other previously queued work */
929 drbd_flush_workqueue(mdev); 1028 drbd_flush_workqueue(mdev);
930 1029
@@ -1021,7 +1120,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1021 else 1120 else
1022 clear_bit(CRASHED_PRIMARY, &mdev->flags); 1121 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1023 1122
1024 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { 1123 if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1124 !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
1025 set_bit(CRASHED_PRIMARY, &mdev->flags); 1125 set_bit(CRASHED_PRIMARY, &mdev->flags);
1026 cp_discovered = 1; 1126 cp_discovered = 1;
1027 } 1127 }
@@ -1031,7 +1131,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1031 mdev->read_cnt = 0; 1131 mdev->read_cnt = 0;
1032 mdev->writ_cnt = 0; 1132 mdev->writ_cnt = 0;
1033 1133
1034 drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); 1134 max_seg_s = DRBD_MAX_SEGMENT_SIZE;
1135 if (mdev->state.conn == C_CONNECTED) {
1136 /* We are Primary, Connected, and now attach a new local
1137 * backing store. We must not increase the user visible maximum
1138 * bio size on this device to something the peer may not be
1139 * able to handle. */
1140 if (mdev->agreed_pro_version < 94)
1141 max_seg_s = queue_max_segment_size(mdev->rq_queue);
1142 else if (mdev->agreed_pro_version == 94)
1143 max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
1144 /* else: drbd 8.3.9 and later, stay with default */
1145 }
1146
1147 drbd_setup_queue_param(mdev, max_seg_s);
1035 1148
1036 /* If I am currently not R_PRIMARY, 1149 /* If I am currently not R_PRIMARY,
1037 * but meta data primary indicator is set, 1150 * but meta data primary indicator is set,
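For illustration only, not part of the patch: the new max_seg_s selection keeps the locally advertised maximum segment (BIO) size within what the connected peer can accept -- peers older than protocol 94 keep whatever the request queue already advertises, protocol 94 peers are capped at DRBD_MAX_SIZE_H80_PACKET, and newer peers get the full default. A hedged, self-contained sketch of that decision; the numeric constants below are invented placeholders, not the real DRBD values:

/* Illustrative sketch only -- not drbd code.  Constants are assumptions
 * chosen for the example. */
#include <stdio.h>

#define EX_MAX_SEGMENT_SIZE	(128u * 1024u)	/* assumed local default      */
#define EX_MAX_SIZE_H80_PACKET	(32u * 1024u)	/* assumed old-header cap     */

static unsigned int pick_max_seg(int connected, int peer_proto_version,
				 unsigned int current_queue_max)
{
	unsigned int max_seg = EX_MAX_SEGMENT_SIZE;

	if (connected) {
		if (peer_proto_version < 94)
			max_seg = current_queue_max;		/* don't grow it    */
		else if (peer_proto_version == 94)
			max_seg = EX_MAX_SIZE_H80_PACKET;	/* old header limit */
		/* else: protocol 95 and later, keep the full default */
	}
	return max_seg;
}

int main(void)
{
	printf("%u\n", pick_max_seg(1, 94, 64u * 1024u));	/* -> 32768 */
	return 0;
}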
@@ -1079,6 +1192,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1079 drbd_al_to_on_disk_bm(mdev); 1192 drbd_al_to_on_disk_bm(mdev);
1080 } 1193 }
1081 1194
1195 if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
1196 drbd_suspend_al(mdev); /* IO is still suspended here... */
1197
1082 spin_lock_irq(&mdev->req_lock); 1198 spin_lock_irq(&mdev->req_lock);
1083 os = mdev->state; 1199 os = mdev->state;
1084 ns.i = os.i; 1200 ns.i = os.i;
@@ -1235,7 +1351,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1235 && (new_conf->wire_protocol != DRBD_PROT_C)) { 1351 && (new_conf->wire_protocol != DRBD_PROT_C)) {
1236 retcode = ERR_NOT_PROTO_C; 1352 retcode = ERR_NOT_PROTO_C;
1237 goto fail; 1353 goto fail;
1238 }; 1354 }
1355
1356 if (get_ldev(mdev)) {
1357 enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
1358 put_ldev(mdev);
1359 if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
1360 retcode = ERR_STONITH_AND_PROT_A;
1361 goto fail;
1362 }
1363 }
1239 1364
1240 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { 1365 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1241 retcode = ERR_DISCARD; 1366 retcode = ERR_DISCARD;
@@ -1350,6 +1475,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1350 } 1475 }
1351 } 1476 }
1352 1477
1478 drbd_flush_workqueue(mdev);
1353 spin_lock_irq(&mdev->req_lock); 1479 spin_lock_irq(&mdev->req_lock);
1354 if (mdev->net_conf != NULL) { 1480 if (mdev->net_conf != NULL) {
1355 retcode = ERR_NET_CONFIGURED; 1481 retcode = ERR_NET_CONFIGURED;
@@ -1388,10 +1514,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1388 mdev->int_dig_out=int_dig_out; 1514 mdev->int_dig_out=int_dig_out;
1389 mdev->int_dig_in=int_dig_in; 1515 mdev->int_dig_in=int_dig_in;
1390 mdev->int_dig_vv=int_dig_vv; 1516 mdev->int_dig_vv=int_dig_vv;
1517 retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
1391 spin_unlock_irq(&mdev->req_lock); 1518 spin_unlock_irq(&mdev->req_lock);
1392 1519
1393 retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
1394
1395 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1520 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1396 reply->ret_code = retcode; 1521 reply->ret_code = retcode;
1397 drbd_reconfig_done(mdev); 1522 drbd_reconfig_done(mdev);
@@ -1546,6 +1671,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1546 struct crypto_hash *csums_tfm = NULL; 1671 struct crypto_hash *csums_tfm = NULL;
1547 struct syncer_conf sc; 1672 struct syncer_conf sc;
1548 cpumask_var_t new_cpu_mask; 1673 cpumask_var_t new_cpu_mask;
1674 int *rs_plan_s = NULL;
1675 int fifo_size;
1549 1676
1550 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { 1677 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
1551 retcode = ERR_NOMEM; 1678 retcode = ERR_NOMEM;
@@ -1557,6 +1684,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1557 sc.rate = DRBD_RATE_DEF; 1684 sc.rate = DRBD_RATE_DEF;
1558 sc.after = DRBD_AFTER_DEF; 1685 sc.after = DRBD_AFTER_DEF;
1559 sc.al_extents = DRBD_AL_EXTENTS_DEF; 1686 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1687 sc.on_no_data = DRBD_ON_NO_DATA_DEF;
1688 sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
1689 sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
1690 sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
1691 sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
1692 sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
1560 } else 1693 } else
1561 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); 1694 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1562 1695
@@ -1634,6 +1767,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1634 } 1767 }
1635#undef AL_MAX 1768#undef AL_MAX
1636 1769
1770 /* to avoid spurious errors when configuring minors before configuring
1771 * the minors they depend on: if necessary, first create the minor we
1772 * depend on */
1773 if (sc.after >= 0)
1774 ensure_mdev(sc.after, 1);
1775
1637 /* most sanity checks done, try to assign the new sync-after 1776 /* most sanity checks done, try to assign the new sync-after
1638 * dependency. need to hold the global lock in there, 1777 * dependency. need to hold the global lock in there,
1639 * to avoid a race in the dependency loop check. */ 1778 * to avoid a race in the dependency loop check. */
@@ -1641,6 +1780,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1641 if (retcode != NO_ERROR) 1780 if (retcode != NO_ERROR)
1642 goto fail; 1781 goto fail;
1643 1782
1783 fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1784 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
1785 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
1786 if (!rs_plan_s) {
1787 dev_err(DEV, "kmalloc of fifo_buffer failed");
1788 retcode = ERR_NOMEM;
1789 goto fail;
1790 }
1791 }
1792
1644 /* ok, assign the rest of it as well. 1793 /* ok, assign the rest of it as well.
1645 * lock against receive_SyncParam() */ 1794 * lock against receive_SyncParam() */
1646 spin_lock(&mdev->peer_seq_lock); 1795 spin_lock(&mdev->peer_seq_lock);
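For illustration only, not part of the patch: the fifo_size computed in drbd_nl_syncer_conf() gives the resync-rate controller one plan slot per controller step. Assuming SLEEP_TIME is one tenth of a second (HZ/10 jiffies, as in drbd_int.h -- an assumption for this example), the expression reduces to c_plan_ahead slots, since c_plan_ahead is configured in 0.1 s units:

/* Illustrative arithmetic only -- not drbd code. */
#include <stdio.h>

int main(void)
{
	const int HZ_ex = 250;			/* example jiffy rate             */
	const int SLEEP_TIME_ex = HZ_ex / 10;	/* assumed 100 ms controller step */
	const int c_plan_ahead = 20;		/* 2.0 s look-ahead, in 0.1 s units */

	/* fifo_size = (c_plan_ahead * 10 * SLEEP_TIME) / HZ */
	int fifo_size = (c_plan_ahead * 10 * SLEEP_TIME_ex) / HZ_ex;

	/* one slot per controller step: 2.0 s / 100 ms = 20 slots */
	printf("fifo_size = %d\n", fifo_size);	/* -> 20 */
	return 0;
}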
@@ -1657,6 +1806,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1657 mdev->verify_tfm = verify_tfm; 1806 mdev->verify_tfm = verify_tfm;
1658 verify_tfm = NULL; 1807 verify_tfm = NULL;
1659 } 1808 }
1809
1810 if (fifo_size != mdev->rs_plan_s.size) {
1811 kfree(mdev->rs_plan_s.values);
1812 mdev->rs_plan_s.values = rs_plan_s;
1813 mdev->rs_plan_s.size = fifo_size;
1814 mdev->rs_planed = 0;
1815 rs_plan_s = NULL;
1816 }
1817
1660 spin_unlock(&mdev->peer_seq_lock); 1818 spin_unlock(&mdev->peer_seq_lock);
1661 1819
1662 if (get_ldev(mdev)) { 1820 if (get_ldev(mdev)) {
@@ -1688,6 +1846,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1688 1846
1689 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1847 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1690fail: 1848fail:
1849 kfree(rs_plan_s);
1691 free_cpumask_var(new_cpu_mask); 1850 free_cpumask_var(new_cpu_mask);
1692 crypto_free_hash(csums_tfm); 1851 crypto_free_hash(csums_tfm);
1693 crypto_free_hash(verify_tfm); 1852 crypto_free_hash(verify_tfm);
@@ -1721,12 +1880,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1721 return 0; 1880 return 0;
1722} 1881}
1723 1882
1883static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
1884{
1885 int rv;
1886
1887 rv = drbd_bmio_set_n_write(mdev);
1888 drbd_suspend_al(mdev);
1889 return rv;
1890}
1891
1724static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1892static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1725 struct drbd_nl_cfg_reply *reply) 1893 struct drbd_nl_cfg_reply *reply)
1726{ 1894{
1895 int retcode;
1727 1896
1728 reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 1897 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
1898
1899 if (retcode < SS_SUCCESS) {
1900 if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
1901 /* The peer will get a resync upon connect anyways. Just make that
1902 into a full resync. */
1903 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
1904 if (retcode >= SS_SUCCESS) {
1905 /* open coded drbd_bitmap_io() */
1906 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
1907 "set_n_write from invalidate_peer"))
1908 retcode = ERR_IO_MD_DISK;
1909 }
1910 } else
1911 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
1912 }
1729 1913
1914 reply->ret_code = retcode;
1730 return 0; 1915 return 0;
1731} 1916}
1732 1917
@@ -1765,7 +1950,21 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1765static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1950static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1766 struct drbd_nl_cfg_reply *reply) 1951 struct drbd_nl_cfg_reply *reply)
1767{ 1952{
1768 reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); 1953 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1954 drbd_uuid_new_current(mdev);
1955 clear_bit(NEW_CUR_UUID, &mdev->flags);
1956 drbd_md_sync(mdev);
1957 }
1958 drbd_suspend_io(mdev);
1959 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
1960 if (reply->ret_code == SS_SUCCESS) {
1961 if (mdev->state.conn < C_CONNECTED)
1962 tl_clear(mdev);
1963 if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
1964 tl_restart(mdev, fail_frozen_disk_io);
1965 }
1966 drbd_resume_io(mdev);
1967
1769 return 0; 1968 return 0;
1770} 1969}
1771 1970
@@ -1941,40 +2140,6 @@ out:
1941 return 0; 2140 return 0;
1942} 2141}
1943 2142
1944static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
1945{
1946 struct drbd_conf *mdev;
1947
1948 if (nlp->drbd_minor >= minor_count)
1949 return NULL;
1950
1951 mdev = minor_to_mdev(nlp->drbd_minor);
1952
1953 if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
1954 struct gendisk *disk = NULL;
1955 mdev = drbd_new_device(nlp->drbd_minor);
1956
1957 spin_lock_irq(&drbd_pp_lock);
1958 if (minor_table[nlp->drbd_minor] == NULL) {
1959 minor_table[nlp->drbd_minor] = mdev;
1960 disk = mdev->vdisk;
1961 mdev = NULL;
1962 } /* else: we lost the race */
1963 spin_unlock_irq(&drbd_pp_lock);
1964
1965 if (disk) /* we won the race above */
1966 /* in case we ever add a drbd_delete_device(),
1967 * don't forget the del_gendisk! */
1968 add_disk(disk);
1969 else /* we lost the race above */
1970 drbd_free_mdev(mdev);
1971
1972 mdev = minor_to_mdev(nlp->drbd_minor);
1973 }
1974
1975 return mdev;
1976}
1977
1978struct cn_handler_struct { 2143struct cn_handler_struct {
1979 int (*function)(struct drbd_conf *, 2144 int (*function)(struct drbd_conf *,
1980 struct drbd_nl_cfg_req *, 2145 struct drbd_nl_cfg_req *,
@@ -2035,7 +2200,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2035 goto fail; 2200 goto fail;
2036 } 2201 }
2037 2202
2038 mdev = ensure_mdev(nlp); 2203 mdev = ensure_mdev(nlp->drbd_minor,
2204 (nlp->flags & DRBD_NL_CREATE_DEVICE));
2039 if (!mdev) { 2205 if (!mdev) {
2040 retcode = ERR_MINOR_INVALID; 2206 retcode = ERR_MINOR_INVALID;
2041 goto fail; 2207 goto fail;
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..ad325c5d0ce1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
57 unsigned long db, dt, dbdt, rt, rs_left; 57 unsigned long db, dt, dbdt, rt, rs_left;
58 unsigned int res; 58 unsigned int res;
59 int i, x, y; 59 int i, x, y;
60 int stalled = 0;
60 61
61 drbd_get_syncer_progress(mdev, &rs_left, &res); 62 drbd_get_syncer_progress(mdev, &rs_left, &res);
62 63
@@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
90 * db: blocks written from mark until now 91 * db: blocks written from mark until now
91 * rt: remaining time 92 * rt: remaining time
92 */ 93 */
93 dt = (jiffies - mdev->rs_mark_time) / HZ; 94 /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
94 95 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
95 if (dt > 20) { 96 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
96 /* if we made no update to rs_mark_time for too long, 97 i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
97 * we are stalled. show that. */ 98 dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
98 seq_printf(seq, "stalled\n"); 99 if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
99 return; 100 stalled = 1;
100 }
101 101
102 if (!dt) 102 if (!dt)
103 dt++; 103 dt++;
104 db = mdev->rs_mark_left - rs_left; 104 db = mdev->rs_mark_left[i] - rs_left;
105 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ 105 rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
106 106
107 seq_printf(seq, "finish: %lu:%02lu:%02lu", 107 seq_printf(seq, "finish: %lu:%02lu:%02lu",
@@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
118 /* mean speed since syncer started 118 /* mean speed since syncer started
119 * we do account for PausedSync periods */ 119 * we do account for PausedSync periods */
120 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; 120 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
121 if (dt <= 0) 121 if (dt == 0)
122 dt = 1; 122 dt = 1;
123 db = mdev->rs_total - rs_left; 123 db = mdev->rs_total - rs_left;
124 dbdt = Bit2KB(db/dt); 124 dbdt = Bit2KB(db/dt);
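For illustration only, not part of the patch: the finish-time estimate in drbd_syncer_progress() is rt = dt * rs_left / db, written as (dt * (rs_left / (db/100+1)))/100 so that dividing db first keeps the intermediate product from overflowing. A worked example with invented numbers (the 4 KiB-per-bitmap-bit factor behind Bit2KB is an assumption here):

/* Illustrative arithmetic only -- not drbd code. */
#include <stdio.h>

int main(void)
{
	unsigned long dt = 30;		/* seconds since rolling mark i      */
	unsigned long db = 200000;	/* bitmap bits cleared since mark i  */
	unsigned long rs_left = 600000;	/* bitmap bits still out of sync     */

	/* rt = dt * rs_left / db, overflow-safe form used by the driver */
	unsigned long rt = (dt * (rs_left / (db / 100 + 1))) / 100;
	printf("finish in ~%lu s (exact: %lu s)\n", rt, dt * rs_left / db);

	/* current speed in K/sec, assuming one bitmap bit covers 4 KiB */
	printf("speed: %lu K/sec\n", (db / dt) * 4);
	return 0;
}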
@@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
128 else 128 else
129 seq_printf(seq, " (%ld)", dbdt); 129 seq_printf(seq, " (%ld)", dbdt);
130 130
131 seq_printf(seq, " K/sec\n"); 131 if (mdev->state.conn == C_SYNC_TARGET) {
132 if (mdev->c_sync_rate > 1000)
133 seq_printf(seq, " want: %d,%03d",
134 mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
135 else
136 seq_printf(seq, " want: %d", mdev->c_sync_rate);
137 }
138 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
132} 139}
133 140
134static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) 141static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -196,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
196 seq_printf(seq, "%2d: cs:Unconfigured\n", i); 203 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
197 } else { 204 } else {
198 seq_printf(seq, 205 seq_printf(seq,
199 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" 206 "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
200 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " 207 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
201 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", 208 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
202 i, sn, 209 i, sn,
@@ -206,11 +213,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
206 drbd_disk_str(mdev->state.pdsk), 213 drbd_disk_str(mdev->state.pdsk),
207 (mdev->net_conf == NULL ? ' ' : 214 (mdev->net_conf == NULL ? ' ' :
208 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), 215 (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
209 mdev->state.susp ? 's' : 'r', 216 is_susp(mdev->state) ? 's' : 'r',
210 mdev->state.aftr_isp ? 'a' : '-', 217 mdev->state.aftr_isp ? 'a' : '-',
211 mdev->state.peer_isp ? 'p' : '-', 218 mdev->state.peer_isp ? 'p' : '-',
212 mdev->state.user_isp ? 'u' : '-', 219 mdev->state.user_isp ? 'u' : '-',
213 mdev->congestion_reason ?: '-', 220 mdev->congestion_reason ?: '-',
221 test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
214 mdev->send_cnt/2, 222 mdev->send_cnt/2,
215 mdev->recv_cnt/2, 223 mdev->recv_cnt/2,
216 mdev->writ_cnt/2, 224 mdev->writ_cnt/2,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..efd6169acf2f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
241 spin_unlock_irq(&mdev->req_lock); 241 spin_unlock_irq(&mdev->req_lock);
242 242
243 list_for_each_entry_safe(e, t, &reclaimed, w.list) 243 list_for_each_entry_safe(e, t, &reclaimed, w.list)
244 drbd_free_ee(mdev, e); 244 drbd_free_net_ee(mdev, e);
245} 245}
246 246
247/** 247/**
@@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
298 * Is also used from inside an other spin_lock_irq(&mdev->req_lock); 298 * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
299 * Either links the page chain back to the global pool, 299 * Either links the page chain back to the global pool,
300 * or returns all pages to the system. */ 300 * or returns all pages to the system. */
301static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) 301static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
302{ 302{
303 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
303 int i; 304 int i;
305
304 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) 306 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
305 i = page_chain_free(page); 307 i = page_chain_free(page);
306 else { 308 else {
@@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
311 drbd_pp_vacant += i; 313 drbd_pp_vacant += i;
312 spin_unlock(&drbd_pp_lock); 314 spin_unlock(&drbd_pp_lock);
313 } 315 }
314 atomic_sub(i, &mdev->pp_in_use); 316 i = atomic_sub_return(i, a);
315 i = atomic_read(&mdev->pp_in_use);
316 if (i < 0) 317 if (i < 0)
317 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); 318 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
319 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
318 wake_up(&drbd_pp_wait); 320 wake_up(&drbd_pp_wait);
319} 321}
320 322
@@ -365,7 +367,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
365 e->size = data_size; 367 e->size = data_size;
366 e->flags = 0; 368 e->flags = 0;
367 e->sector = sector; 369 e->sector = sector;
368 e->sector = sector;
369 e->block_id = id; 370 e->block_id = id;
370 371
371 return e; 372 return e;
@@ -375,9 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
375 return NULL; 376 return NULL;
376} 377}
377 378
378void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 379void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
379{ 380{
380 drbd_pp_free(mdev, e->pages); 381 if (e->flags & EE_HAS_DIGEST)
382 kfree(e->digest);
383 drbd_pp_free(mdev, e->pages, is_net);
381 D_ASSERT(atomic_read(&e->pending_bios) == 0); 384 D_ASSERT(atomic_read(&e->pending_bios) == 0);
382 D_ASSERT(hlist_unhashed(&e->colision)); 385 D_ASSERT(hlist_unhashed(&e->colision));
383 mempool_free(e, drbd_ee_mempool); 386 mempool_free(e, drbd_ee_mempool);
@@ -388,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
388 LIST_HEAD(work_list); 391 LIST_HEAD(work_list);
389 struct drbd_epoch_entry *e, *t; 392 struct drbd_epoch_entry *e, *t;
390 int count = 0; 393 int count = 0;
394 int is_net = list == &mdev->net_ee;
391 395
392 spin_lock_irq(&mdev->req_lock); 396 spin_lock_irq(&mdev->req_lock);
393 list_splice_init(list, &work_list); 397 list_splice_init(list, &work_list);
394 spin_unlock_irq(&mdev->req_lock); 398 spin_unlock_irq(&mdev->req_lock);
395 399
396 list_for_each_entry_safe(e, t, &work_list, w.list) { 400 list_for_each_entry_safe(e, t, &work_list, w.list) {
397 drbd_free_ee(mdev, e); 401 drbd_free_some_ee(mdev, e, is_net);
398 count++; 402 count++;
399 } 403 }
400 return count; 404 return count;
@@ -423,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
423 spin_unlock_irq(&mdev->req_lock); 427 spin_unlock_irq(&mdev->req_lock);
424 428
425 list_for_each_entry_safe(e, t, &reclaimed, w.list) 429 list_for_each_entry_safe(e, t, &reclaimed, w.list)
426 drbd_free_ee(mdev, e); 430 drbd_free_net_ee(mdev, e);
427 431
428 /* possible callbacks here: 432 /* possible callbacks here:
429 * e_end_block, and e_end_resync_block, e_send_discard_ack. 433 * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -719,14 +723,14 @@ out:
719static int drbd_send_fp(struct drbd_conf *mdev, 723static int drbd_send_fp(struct drbd_conf *mdev,
720 struct socket *sock, enum drbd_packets cmd) 724 struct socket *sock, enum drbd_packets cmd)
721{ 725{
722 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 726 struct p_header80 *h = &mdev->data.sbuf.header.h80;
723 727
724 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); 728 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
725} 729}
726 730
727static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) 731static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
728{ 732{
729 struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 733 struct p_header80 *h = &mdev->data.rbuf.header.h80;
730 int rr; 734 int rr;
731 735
732 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); 736 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -776,9 +780,6 @@ static int drbd_connect(struct drbd_conf *mdev)
776 780
777 D_ASSERT(!mdev->data.socket); 781 D_ASSERT(!mdev->data.socket);
778 782
779 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
780 dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
781
782 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) 783 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
783 return -2; 784 return -2;
784 785
@@ -927,6 +928,11 @@ retry:
927 928
928 drbd_thread_start(&mdev->asender); 929 drbd_thread_start(&mdev->asender);
929 930
931 if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
932 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
933 put_ldev(mdev);
934 }
935
930 if (!drbd_send_protocol(mdev)) 936 if (!drbd_send_protocol(mdev))
931 return -1; 937 return -1;
932 drbd_send_sync_param(mdev, &mdev->sync_conf); 938 drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -946,22 +952,28 @@ out_release_sockets:
946 return -1; 952 return -1;
947} 953}
948 954
949static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) 955static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
950{ 956{
957 union p_header *h = &mdev->data.rbuf.header;
951 int r; 958 int r;
952 959
953 r = drbd_recv(mdev, h, sizeof(*h)); 960 r = drbd_recv(mdev, h, sizeof(*h));
954
955 if (unlikely(r != sizeof(*h))) { 961 if (unlikely(r != sizeof(*h))) {
956 dev_err(DEV, "short read expecting header on sock: r=%d\n", r); 962 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
957 return FALSE; 963 return FALSE;
958 }; 964 }
959 h->command = be16_to_cpu(h->command); 965
960 h->length = be16_to_cpu(h->length); 966 if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
961 if (unlikely(h->magic != BE_DRBD_MAGIC)) { 967 *cmd = be16_to_cpu(h->h80.command);
962 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", 968 *packet_size = be16_to_cpu(h->h80.length);
963 (long)be32_to_cpu(h->magic), 969 } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
964 h->command, h->length); 970 *cmd = be16_to_cpu(h->h95.command);
971 *packet_size = be32_to_cpu(h->h95.length);
972 } else {
973 dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
974 be32_to_cpu(h->h80.magic),
975 be16_to_cpu(h->h80.command),
976 be16_to_cpu(h->h80.length));
965 return FALSE; 977 return FALSE;
966 } 978 }
967 mdev->last_received = jiffies; 979 mdev->last_received = jiffies;
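For illustration only, not part of the patch: drbd_recv_header() now accepts two on-wire header layouts -- the old one with a 32-bit magic and a 16-bit length, and the new one with a 16-bit magic and a 32-bit length, which allows payloads larger than 64 KiB. A simplified, self-contained sketch of that dispatch; the struct layouts and magic values below are placeholders, not the real p_header80/p_header95 definitions:

/* Illustrative sketch only -- not drbd code. */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define EX_MAGIC	0x83740267u	/* placeholder for BE_DRBD_MAGIC     */
#define EX_MAGIC_BIG	0x835a		/* placeholder for BE_DRBD_MAGIC_BIG */

struct hdr80 { uint32_t magic; uint16_t command; uint16_t length; };
struct hdr95 { uint16_t magic; uint16_t command; uint32_t length; };

/* buf must hold at least 8 bytes; returns 0 on success, -1 on unknown magic */
static int parse_header(const unsigned char *buf,
			unsigned int *cmd, unsigned int *payload_len)
{
	struct hdr80 h80;
	struct hdr95 h95;

	memcpy(&h80, buf, sizeof(h80));
	memcpy(&h95, buf, sizeof(h95));

	if (ntohl(h80.magic) == EX_MAGIC) {		/* old 16-bit length */
		*cmd = ntohs(h80.command);
		*payload_len = ntohs(h80.length);
	} else if (ntohs(h95.magic) == EX_MAGIC_BIG) {	/* new 32-bit length */
		*cmd = ntohs(h95.command);
		*payload_len = ntohl(h95.length);
	} else {
		return -1;
	}
	return 0;
}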
@@ -975,7 +987,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
975 987
976 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 988 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
977 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, 989 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
978 NULL, BLKDEV_IFL_WAIT); 990 NULL);
979 if (rv) { 991 if (rv) {
980 dev_err(DEV, "local disk flush failed with status %d\n", rv); 992 dev_err(DEV, "local disk flush failed with status %d\n", rv);
981 /* would rather check on EOPNOTSUPP, but that is not reliable. 993 /* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1268,17 +1280,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
1268 return 1; 1280 return 1;
1269} 1281}
1270 1282
1271static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) 1283static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1272{ 1284{
1273 int rv, issue_flush; 1285 int rv, issue_flush;
1274 struct p_barrier *p = (struct p_barrier *)h; 1286 struct p_barrier *p = &mdev->data.rbuf.barrier;
1275 struct drbd_epoch *epoch; 1287 struct drbd_epoch *epoch;
1276 1288
1277 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1278
1279 rv = drbd_recv(mdev, h->payload, h->length);
1280 ERR_IF(rv != h->length) return FALSE;
1281
1282 inc_unacked(mdev); 1289 inc_unacked(mdev);
1283 1290
1284 if (mdev->net_conf->wire_protocol != DRBD_PROT_C) 1291 if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@ -1457,7 +1464,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1457 data_size -= rr; 1464 data_size -= rr;
1458 } 1465 }
1459 kunmap(page); 1466 kunmap(page);
1460 drbd_pp_free(mdev, page); 1467 drbd_pp_free(mdev, page, 0);
1461 return rv; 1468 return rv;
1462} 1469}
1463 1470
@@ -1562,30 +1569,29 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1562 list_add(&e->w.list, &mdev->sync_ee); 1569 list_add(&e->w.list, &mdev->sync_ee);
1563 spin_unlock_irq(&mdev->req_lock); 1570 spin_unlock_irq(&mdev->req_lock);
1564 1571
1572 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1565 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) 1573 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1566 return TRUE; 1574 return TRUE;
1567 1575
1576 /* drbd_submit_ee currently fails for one reason only:
1577 * not being able to allocate enough bios.
1578 * Is dropping the connection going to help? */
1579 spin_lock_irq(&mdev->req_lock);
1580 list_del(&e->w.list);
1581 spin_unlock_irq(&mdev->req_lock);
1582
1568 drbd_free_ee(mdev, e); 1583 drbd_free_ee(mdev, e);
1569fail: 1584fail:
1570 put_ldev(mdev); 1585 put_ldev(mdev);
1571 return FALSE; 1586 return FALSE;
1572} 1587}
1573 1588
1574static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) 1589static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1575{ 1590{
1576 struct drbd_request *req; 1591 struct drbd_request *req;
1577 sector_t sector; 1592 sector_t sector;
1578 unsigned int header_size, data_size;
1579 int ok; 1593 int ok;
1580 struct p_data *p = (struct p_data *)h; 1594 struct p_data *p = &mdev->data.rbuf.data;
1581
1582 header_size = sizeof(*p) - sizeof(*h);
1583 data_size = h->length - header_size;
1584
1585 ERR_IF(data_size == 0) return FALSE;
1586
1587 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1588 return FALSE;
1589 1595
1590 sector = be64_to_cpu(p->sector); 1596 sector = be64_to_cpu(p->sector);
1591 1597
@@ -1611,20 +1617,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1611 return ok; 1617 return ok;
1612} 1618}
1613 1619
1614static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) 1620static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1615{ 1621{
1616 sector_t sector; 1622 sector_t sector;
1617 unsigned int header_size, data_size;
1618 int ok; 1623 int ok;
1619 struct p_data *p = (struct p_data *)h; 1624 struct p_data *p = &mdev->data.rbuf.data;
1620
1621 header_size = sizeof(*p) - sizeof(*h);
1622 data_size = h->length - header_size;
1623
1624 ERR_IF(data_size == 0) return FALSE;
1625
1626 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1627 return FALSE;
1628 1625
1629 sector = be64_to_cpu(p->sector); 1626 sector = be64_to_cpu(p->sector);
1630 D_ASSERT(p->block_id == ID_SYNCER); 1627 D_ASSERT(p->block_id == ID_SYNCER);
@@ -1640,9 +1637,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1640 1637
1641 ok = drbd_drain_block(mdev, data_size); 1638 ok = drbd_drain_block(mdev, data_size);
1642 1639
1643 drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1640 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
1644 } 1641 }
1645 1642
1643 atomic_add(data_size >> 9, &mdev->rs_sect_in);
1644
1646 return ok; 1645 return ok;
1647} 1646}
1648 1647
@@ -1765,24 +1764,27 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1765 return ret; 1764 return ret;
1766} 1765}
1767 1766
1767static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
1768{
1769 if (mdev->agreed_pro_version >= 95)
1770 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1771 (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
1772 (dpf & DP_FUA ? REQ_FUA : 0) |
1773 (dpf & DP_FLUSH ? REQ_FUA : 0) |
1774 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
1775 else
1776 return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
1777}
1778
1768/* mirrored write */ 1779/* mirrored write */
1769static int receive_Data(struct drbd_conf *mdev, struct p_header *h) 1780static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1770{ 1781{
1771 sector_t sector; 1782 sector_t sector;
1772 struct drbd_epoch_entry *e; 1783 struct drbd_epoch_entry *e;
1773 struct p_data *p = (struct p_data *)h; 1784 struct p_data *p = &mdev->data.rbuf.data;
1774 int header_size, data_size;
1775 int rw = WRITE; 1785 int rw = WRITE;
1776 u32 dp_flags; 1786 u32 dp_flags;
1777 1787
1778 header_size = sizeof(*p) - sizeof(*h);
1779 data_size = h->length - header_size;
1780
1781 ERR_IF(data_size == 0) return FALSE;
1782
1783 if (drbd_recv(mdev, h->payload, header_size) != header_size)
1784 return FALSE;
1785
1786 if (!get_ldev(mdev)) { 1788 if (!get_ldev(mdev)) {
1787 if (__ratelimit(&drbd_ratelimit_state)) 1789 if (__ratelimit(&drbd_ratelimit_state))
1788 dev_err(DEV, "Can not write mirrored data block " 1790 dev_err(DEV, "Can not write mirrored data block "
@@ -1792,7 +1794,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1792 mdev->peer_seq++; 1794 mdev->peer_seq++;
1793 spin_unlock(&mdev->peer_seq_lock); 1795 spin_unlock(&mdev->peer_seq_lock);
1794 1796
1795 drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1797 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
1796 atomic_inc(&mdev->current_epoch->epoch_size); 1798 atomic_inc(&mdev->current_epoch->epoch_size);
1797 return drbd_drain_block(mdev, data_size); 1799 return drbd_drain_block(mdev, data_size);
1798 } 1800 }
@@ -1839,12 +1841,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1839 spin_unlock(&mdev->epoch_lock); 1841 spin_unlock(&mdev->epoch_lock);
1840 1842
1841 dp_flags = be32_to_cpu(p->dp_flags); 1843 dp_flags = be32_to_cpu(p->dp_flags);
1842 if (dp_flags & DP_HARDBARRIER) { 1844 rw |= write_flags_to_bio(mdev, dp_flags);
1843 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); 1845
1844 /* rw |= REQ_HARDBARRIER; */
1845 }
1846 if (dp_flags & DP_RW_SYNC)
1847 rw |= REQ_SYNC | REQ_UNPLUG;
1848 if (dp_flags & DP_MAY_SET_IN_SYNC) 1846 if (dp_flags & DP_MAY_SET_IN_SYNC)
1849 e->flags |= EE_MAY_SET_IN_SYNC; 1847 e->flags |= EE_MAY_SET_IN_SYNC;
1850 1848
@@ -2007,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
2007 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) 2005 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
2008 return TRUE; 2006 return TRUE;
2009 2007
2008 /* drbd_submit_ee currently fails for one reason only:
2009 * not being able to allocate enough bios.
2010 * Is dropping the connection going to help? */
2011 spin_lock_irq(&mdev->req_lock);
2012 list_del(&e->w.list);
2013 hlist_del_init(&e->colision);
2014 spin_unlock_irq(&mdev->req_lock);
2015 if (e->flags & EE_CALL_AL_COMPLETE_IO)
2016 drbd_al_complete_io(mdev, e->sector);
2017
2010out_interrupted: 2018out_interrupted:
2011 /* yes, the epoch_size now is imbalanced. 2019 /* yes, the epoch_size now is imbalanced.
2012 * but we drop the connection anyways, so we don't have a chance to 2020 * but we drop the connection anyways, so we don't have a chance to
@@ -2016,20 +2024,64 @@ out_interrupted:
2016 return FALSE; 2024 return FALSE;
2017} 2025}
2018 2026
2019static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) 2027/* We may throttle resync, if the lower device seems to be busy,
2028 * and current sync rate is above c_min_rate.
2029 *
2030 * To decide whether or not the lower device is busy, we use a scheme similar
 2031 * to MD RAID is_mddev_idle(): if the partition stats reveal a "significant" amount
2032 * (more than 64 sectors) of activity we cannot account for with our own resync
2033 * activity, it obviously is "busy".
2034 *
2035 * The current sync rate used here uses only the most recent two step marks,
2036 * to have a short time average so we can react faster.
2037 */
2038int drbd_rs_should_slow_down(struct drbd_conf *mdev)
2039{
2040 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2041 unsigned long db, dt, dbdt;
2042 int curr_events;
2043 int throttle = 0;
2044
2045 /* feature disabled? */
2046 if (mdev->sync_conf.c_min_rate == 0)
2047 return 0;
2048
2049 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2050 (int)part_stat_read(&disk->part0, sectors[1]) -
2051 atomic_read(&mdev->rs_sect_ev);
2052 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2053 unsigned long rs_left;
2054 int i;
2055
2056 mdev->rs_last_events = curr_events;
2057
2058 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2059 * approx. */
2060 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
2061 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2062
2063 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2064 if (!dt)
2065 dt++;
2066 db = mdev->rs_mark_left[i] - rs_left;
2067 dbdt = Bit2KB(db/dt);
2068
2069 if (dbdt > mdev->sync_conf.c_min_rate)
2070 throttle = 1;
2071 }
2072 return throttle;
2073}
2074
2075
2076static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
2020{ 2077{
2021 sector_t sector; 2078 sector_t sector;
2022 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 2079 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
2023 struct drbd_epoch_entry *e; 2080 struct drbd_epoch_entry *e;
2024 struct digest_info *di = NULL; 2081 struct digest_info *di = NULL;
2025 int size, digest_size; 2082 int size, verb;
2026 unsigned int fault_type; 2083 unsigned int fault_type;
2027 struct p_block_req *p = 2084 struct p_block_req *p = &mdev->data.rbuf.block_req;
2028 (struct p_block_req *)h;
2029 const int brps = sizeof(*p)-sizeof(*h);
2030
2031 if (drbd_recv(mdev, h->payload, brps) != brps)
2032 return FALSE;
2033 2085
2034 sector = be64_to_cpu(p->sector); 2086 sector = be64_to_cpu(p->sector);
2035 size = be32_to_cpu(p->blksize); 2087 size = be32_to_cpu(p->blksize);
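For illustration only, not part of the patch: drbd_rs_should_slow_down() throttles resync reads when the backing device shows I/O that the resync itself did not submit (more than 64 sectors since the last check, judged from part_stat minus rs_sect_ev) and the short-term resync rate over the last two sync marks already exceeds c_min_rate. A hedged sketch of that decision with invented numbers:

/* Illustrative sketch only -- not drbd code. */
#include <stdio.h>

static int should_slow_down(long disk_sectors_total, long rs_sect_ev,
			    long *last_events,
			    unsigned long recent_kib_per_s,
			    unsigned long c_min_rate_kib_per_s)
{
	long curr_events = disk_sectors_total - rs_sect_ev;
	int throttle = 0;

	if (c_min_rate_kib_per_s == 0)		/* feature disabled */
		return 0;

	/* more than 64 sectors of activity we cannot account for? */
	if (!*last_events || curr_events - *last_events > 64) {
		*last_events = curr_events;
		if (recent_kib_per_s > c_min_rate_kib_per_s)
			throttle = 1;
	}
	return throttle;
}

int main(void)
{
	long last = 940;	/* value recorded at the previous check */

	/* 60 unexplained sectors: below the 64-sector threshold -> 0 */
	printf("%d\n", should_slow_down(5000, 4000, &last, 12000, 4000));

	/* 260 unexplained sectors, recent rate 12000 > c_min_rate 4000 -> 1 */
	printf("%d\n", should_slow_down(5200, 4000, &last, 12000, 4000));
	return 0;
}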
@@ -2046,12 +2098,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2046 } 2098 }
2047 2099
2048 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { 2100 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
2049 if (__ratelimit(&drbd_ratelimit_state)) 2101 verb = 1;
2102 switch (cmd) {
2103 case P_DATA_REQUEST:
2104 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2105 break;
2106 case P_RS_DATA_REQUEST:
2107 case P_CSUM_RS_REQUEST:
2108 case P_OV_REQUEST:
2109 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
2110 break;
2111 case P_OV_REPLY:
2112 verb = 0;
2113 dec_rs_pending(mdev);
2114 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2115 break;
2116 default:
2117 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2118 cmdname(cmd));
2119 }
2120 if (verb && __ratelimit(&drbd_ratelimit_state))
2050 dev_err(DEV, "Can not satisfy peer's read request, " 2121 dev_err(DEV, "Can not satisfy peer's read request, "
2051 "no local data.\n"); 2122 "no local data.\n");
2052 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : 2123
2053 P_NEG_RS_DREPLY , p); 2124 /* drain possibly payload */
2054 return drbd_drain_block(mdev, h->length - brps); 2125 return drbd_drain_block(mdev, digest_size);
2055 } 2126 }
2056 2127
2057 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2128 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2063,31 +2134,21 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2063 return FALSE; 2134 return FALSE;
2064 } 2135 }
2065 2136
2066 switch (h->command) { 2137 switch (cmd) {
2067 case P_DATA_REQUEST: 2138 case P_DATA_REQUEST:
2068 e->w.cb = w_e_end_data_req; 2139 e->w.cb = w_e_end_data_req;
2069 fault_type = DRBD_FAULT_DT_RD; 2140 fault_type = DRBD_FAULT_DT_RD;
2070 break; 2141 /* application IO, don't drbd_rs_begin_io */
2142 goto submit;
2143
2071 case P_RS_DATA_REQUEST: 2144 case P_RS_DATA_REQUEST:
2072 e->w.cb = w_e_end_rsdata_req; 2145 e->w.cb = w_e_end_rsdata_req;
2073 fault_type = DRBD_FAULT_RS_RD; 2146 fault_type = DRBD_FAULT_RS_RD;
2074 /* Eventually this should become asynchronously. Currently it
2075 * blocks the whole receiver just to delay the reading of a
2076 * resync data block.
2077 * the drbd_work_queue mechanism is made for this...
2078 */
2079 if (!drbd_rs_begin_io(mdev, sector)) {
2080 /* we have been interrupted,
2081 * probably connection lost! */
2082 D_ASSERT(signal_pending(current));
2083 goto out_free_e;
2084 }
2085 break; 2147 break;
2086 2148
2087 case P_OV_REPLY: 2149 case P_OV_REPLY:
2088 case P_CSUM_RS_REQUEST: 2150 case P_CSUM_RS_REQUEST:
2089 fault_type = DRBD_FAULT_RS_RD; 2151 fault_type = DRBD_FAULT_RS_RD;
2090 digest_size = h->length - brps ;
2091 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); 2152 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2092 if (!di) 2153 if (!di)
2093 goto out_free_e; 2154 goto out_free_e;
@@ -2095,31 +2156,25 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2095 di->digest_size = digest_size; 2156 di->digest_size = digest_size;
2096 di->digest = (((char *)di)+sizeof(struct digest_info)); 2157 di->digest = (((char *)di)+sizeof(struct digest_info));
2097 2158
2159 e->digest = di;
2160 e->flags |= EE_HAS_DIGEST;
2161
2098 if (drbd_recv(mdev, di->digest, digest_size) != digest_size) 2162 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2099 goto out_free_e; 2163 goto out_free_e;
2100 2164
2101 e->block_id = (u64)(unsigned long)di; 2165 if (cmd == P_CSUM_RS_REQUEST) {
2102 if (h->command == P_CSUM_RS_REQUEST) {
2103 D_ASSERT(mdev->agreed_pro_version >= 89); 2166 D_ASSERT(mdev->agreed_pro_version >= 89);
2104 e->w.cb = w_e_end_csum_rs_req; 2167 e->w.cb = w_e_end_csum_rs_req;
2105 } else if (h->command == P_OV_REPLY) { 2168 } else if (cmd == P_OV_REPLY) {
2106 e->w.cb = w_e_end_ov_reply; 2169 e->w.cb = w_e_end_ov_reply;
2107 dec_rs_pending(mdev); 2170 dec_rs_pending(mdev);
2108 break; 2171 /* drbd_rs_begin_io done when we sent this request,
2109 } 2172 * but accounting still needs to be done. */
2110 2173 goto submit_for_resync;
2111 if (!drbd_rs_begin_io(mdev, sector)) {
2112 /* we have been interrupted, probably connection lost! */
2113 D_ASSERT(signal_pending(current));
2114 goto out_free_e;
2115 } 2174 }
2116 break; 2175 break;
2117 2176
2118 case P_OV_REQUEST: 2177 case P_OV_REQUEST:
2119 if (mdev->state.conn >= C_CONNECTED &&
2120 mdev->state.conn != C_VERIFY_T)
2121 dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2122 drbd_conn_str(mdev->state.conn));
2123 if (mdev->ov_start_sector == ~(sector_t)0 && 2178 if (mdev->ov_start_sector == ~(sector_t)0 &&
2124 mdev->agreed_pro_version >= 90) { 2179 mdev->agreed_pro_version >= 90) {
2125 mdev->ov_start_sector = sector; 2180 mdev->ov_start_sector = sector;
@@ -2130,37 +2185,63 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2130 } 2185 }
2131 e->w.cb = w_e_end_ov_req; 2186 e->w.cb = w_e_end_ov_req;
2132 fault_type = DRBD_FAULT_RS_RD; 2187 fault_type = DRBD_FAULT_RS_RD;
2133 /* Eventually this should become asynchronous. Currently it
2134 * blocks the whole receiver just to delay the reading of a
2135 * resync data block.
2136 * the drbd_work_queue mechanism is made for this...
2137 */
2138 if (!drbd_rs_begin_io(mdev, sector)) {
2139 /* we have been interrupted,
2140 * probably connection lost! */
2141 D_ASSERT(signal_pending(current));
2142 goto out_free_e;
2143 }
2144 break; 2188 break;
2145 2189
2146
2147 default: 2190 default:
2148 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", 2191 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2149 cmdname(h->command)); 2192 cmdname(cmd));
2150 fault_type = DRBD_FAULT_MAX; 2193 fault_type = DRBD_FAULT_MAX;
2194 goto out_free_e;
2151 } 2195 }
2152 2196
2153 spin_lock_irq(&mdev->req_lock); 2197 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2154 list_add(&e->w.list, &mdev->read_ee); 2198 * wrt the receiver, but it is not as straightforward as it may seem.
2155 spin_unlock_irq(&mdev->req_lock); 2199 * Various places in the resync start and stop logic assume resync
2200 * requests are processed in order, requeuing this on the worker thread
2201 * introduces a bunch of new code for synchronization between threads.
2202 *
2203 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2204 * "forever", throttling after drbd_rs_begin_io will lock that extent
2205 * for application writes for the same time. For now, just throttle
2206 * here, where the rest of the code expects the receiver to sleep for
2207 * a while, anyways.
2208 */
2209
2210 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2211 * this defers syncer requests for some time, before letting at least
 2212 * one request through. The resync controller on the receiving side
2213 * will adapt to the incoming rate accordingly.
2214 *
2215 * We cannot throttle here if remote is Primary/SyncTarget:
2216 * we would also throttle its application reads.
2217 * In that case, throttling is done on the SyncTarget only.
2218 */
2219 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
2220 msleep(100);
2221 if (drbd_rs_begin_io(mdev, e->sector))
2222 goto out_free_e;
2156 2223
2224submit_for_resync:
2225 atomic_add(size >> 9, &mdev->rs_sect_ev);
2226
2227submit:
2157 inc_unacked(mdev); 2228 inc_unacked(mdev);
2229 spin_lock_irq(&mdev->req_lock);
2230 list_add_tail(&e->w.list, &mdev->read_ee);
2231 spin_unlock_irq(&mdev->req_lock);
2158 2232
2159 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) 2233 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2160 return TRUE; 2234 return TRUE;
2161 2235
2236 /* drbd_submit_ee currently fails for one reason only:
2237 * not being able to allocate enough bios.
2238 * Is dropping the connection going to help? */
2239 spin_lock_irq(&mdev->req_lock);
2240 list_del(&e->w.list);
2241 spin_unlock_irq(&mdev->req_lock);
2242 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2243
2162out_free_e: 2244out_free_e:
2163 kfree(di);
2164 put_ldev(mdev); 2245 put_ldev(mdev);
2165 drbd_free_ee(mdev, e); 2246 drbd_free_ee(mdev, e);
2166 return FALSE; 2247 return FALSE;
@@ -2699,20 +2780,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2699 return 1; 2780 return 1;
2700} 2781}
2701 2782
2702static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) 2783static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2703{ 2784{
2704 struct p_protocol *p = (struct p_protocol *)h; 2785 struct p_protocol *p = &mdev->data.rbuf.protocol;
2705 int header_size, data_size;
2706 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; 2786 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2707 int p_want_lose, p_two_primaries, cf; 2787 int p_want_lose, p_two_primaries, cf;
2708 char p_integrity_alg[SHARED_SECRET_MAX] = ""; 2788 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2709 2789
2710 header_size = sizeof(*p) - sizeof(*h);
2711 data_size = h->length - header_size;
2712
2713 if (drbd_recv(mdev, h->payload, header_size) != header_size)
2714 return FALSE;
2715
2716 p_proto = be32_to_cpu(p->protocol); 2790 p_proto = be32_to_cpu(p->protocol);
2717 p_after_sb_0p = be32_to_cpu(p->after_sb_0p); 2791 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2718 p_after_sb_1p = be32_to_cpu(p->after_sb_1p); 2792 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
@@ -2805,39 +2879,46 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2805 return tfm; 2879 return tfm;
2806} 2880}
2807 2881
2808static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) 2882static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
2809{ 2883{
2810 int ok = TRUE; 2884 int ok = TRUE;
2811 struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; 2885 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
2812 unsigned int header_size, data_size, exp_max_sz; 2886 unsigned int header_size, data_size, exp_max_sz;
2813 struct crypto_hash *verify_tfm = NULL; 2887 struct crypto_hash *verify_tfm = NULL;
2814 struct crypto_hash *csums_tfm = NULL; 2888 struct crypto_hash *csums_tfm = NULL;
2815 const int apv = mdev->agreed_pro_version; 2889 const int apv = mdev->agreed_pro_version;
2890 int *rs_plan_s = NULL;
2891 int fifo_size = 0;
2816 2892
2817 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) 2893 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2818 : apv == 88 ? sizeof(struct p_rs_param) 2894 : apv == 88 ? sizeof(struct p_rs_param)
2819 + SHARED_SECRET_MAX 2895 + SHARED_SECRET_MAX
2820 : /* 89 */ sizeof(struct p_rs_param_89); 2896 : apv <= 94 ? sizeof(struct p_rs_param_89)
2897 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2821 2898
2822 if (h->length > exp_max_sz) { 2899 if (packet_size > exp_max_sz) {
2823 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", 2900 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2824 h->length, exp_max_sz); 2901 packet_size, exp_max_sz);
2825 return FALSE; 2902 return FALSE;
2826 } 2903 }
2827 2904
2828 if (apv <= 88) { 2905 if (apv <= 88) {
2829 header_size = sizeof(struct p_rs_param) - sizeof(*h); 2906 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2830 data_size = h->length - header_size; 2907 data_size = packet_size - header_size;
2831 } else /* apv >= 89 */ { 2908 } else if (apv <= 94) {
2832 header_size = sizeof(struct p_rs_param_89) - sizeof(*h); 2909 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2833 data_size = h->length - header_size; 2910 data_size = packet_size - header_size;
2911 D_ASSERT(data_size == 0);
2912 } else {
2913 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2914 data_size = packet_size - header_size;
2834 D_ASSERT(data_size == 0); 2915 D_ASSERT(data_size == 0);
2835 } 2916 }
2836 2917
2837 /* initialize verify_alg and csums_alg */ 2918 /* initialize verify_alg and csums_alg */
2838 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 2919 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2839 2920
2840 if (drbd_recv(mdev, h->payload, header_size) != header_size) 2921 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
2841 return FALSE; 2922 return FALSE;
2842 2923
2843 mdev->sync_conf.rate = be32_to_cpu(p->rate); 2924 mdev->sync_conf.rate = be32_to_cpu(p->rate);
@@ -2896,6 +2977,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2896 } 2977 }
2897 } 2978 }
2898 2979
2980 if (apv > 94) {
2981 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2982 mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2983 mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2984 mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2985 mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
2986
2987 fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2988 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2989 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2990 if (!rs_plan_s) {
2991 dev_err(DEV, "kmalloc of fifo_buffer failed");
2992 goto disconnect;
2993 }
2994 }
2995 }
2899 2996
2900 spin_lock(&mdev->peer_seq_lock); 2997 spin_lock(&mdev->peer_seq_lock);
2901 /* lock against drbd_nl_syncer_conf() */ 2998 /* lock against drbd_nl_syncer_conf() */
@@ -2913,6 +3010,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2913 mdev->csums_tfm = csums_tfm; 3010 mdev->csums_tfm = csums_tfm;
2914 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); 3011 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2915 } 3012 }
3013 if (fifo_size != mdev->rs_plan_s.size) {
3014 kfree(mdev->rs_plan_s.values);
3015 mdev->rs_plan_s.values = rs_plan_s;
3016 mdev->rs_plan_s.size = fifo_size;
3017 mdev->rs_planed = 0;
3018 }
2916 spin_unlock(&mdev->peer_seq_lock); 3019 spin_unlock(&mdev->peer_seq_lock);
2917 } 3020 }
2918 3021
@@ -2946,19 +3049,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
2946 (unsigned long long)a, (unsigned long long)b); 3049 (unsigned long long)a, (unsigned long long)b);
2947} 3050}
2948 3051
2949static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) 3052static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2950{ 3053{
2951 struct p_sizes *p = (struct p_sizes *)h; 3054 struct p_sizes *p = &mdev->data.rbuf.sizes;
2952 enum determine_dev_size dd = unchanged; 3055 enum determine_dev_size dd = unchanged;
2953 unsigned int max_seg_s; 3056 unsigned int max_seg_s;
2954 sector_t p_size, p_usize, my_usize; 3057 sector_t p_size, p_usize, my_usize;
2955 int ldsc = 0; /* local disk size changed */ 3058 int ldsc = 0; /* local disk size changed */
2956 enum dds_flags ddsf; 3059 enum dds_flags ddsf;
2957 3060
2958 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2959 if (drbd_recv(mdev, h->payload, h->length) != h->length)
2960 return FALSE;
2961
2962 p_size = be64_to_cpu(p->d_size); 3061 p_size = be64_to_cpu(p->d_size);
2963 p_usize = be64_to_cpu(p->u_size); 3062 p_usize = be64_to_cpu(p->u_size);
2964 3063
@@ -2972,7 +3071,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2972 * we still need to figure out whether we accept that. */ 3071 * we still need to figure out whether we accept that. */
2973 mdev->p_size = p_size; 3072 mdev->p_size = p_size;
2974 3073
2975#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2976 if (get_ldev(mdev)) { 3074 if (get_ldev(mdev)) {
2977 warn_if_differ_considerably(mdev, "lower level device sizes", 3075 warn_if_differ_considerably(mdev, "lower level device sizes",
2978 p_size, drbd_get_max_capacity(mdev->ldev)); 3076 p_size, drbd_get_max_capacity(mdev->ldev));
@@ -3029,6 +3127,8 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
3029 3127
3030 if (mdev->agreed_pro_version < 94) 3128 if (mdev->agreed_pro_version < 94)
3031 max_seg_s = be32_to_cpu(p->max_segment_size); 3129 max_seg_s = be32_to_cpu(p->max_segment_size);
3130 else if (mdev->agreed_pro_version == 94)
3131 max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
3032 else /* drbd 8.3.8 onwards */ 3132 else /* drbd 8.3.8 onwards */
3033 max_seg_s = DRBD_MAX_SEGMENT_SIZE; 3133 max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3034 3134
@@ -3062,16 +3162,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
3062 return TRUE; 3162 return TRUE;
3063} 3163}
3064 3164
3065static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) 3165static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3066{ 3166{
3067 struct p_uuids *p = (struct p_uuids *)h; 3167 struct p_uuids *p = &mdev->data.rbuf.uuids;
3068 u64 *p_uuid; 3168 u64 *p_uuid;
3069 int i; 3169 int i;
3070 3170
3071 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3072 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3073 return FALSE;
3074
3075 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); 3171 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3076 3172
3077 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) 3173 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3107,6 +3203,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
3107 drbd_md_sync(mdev); 3203 drbd_md_sync(mdev);
3108 } 3204 }
3109 put_ldev(mdev); 3205 put_ldev(mdev);
3206 } else if (mdev->state.disk < D_INCONSISTENT &&
3207 mdev->state.role == R_PRIMARY) {
3208 /* I am a diskless primary, the peer just created a new current UUID
3209 for me. */
3210 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3110 } 3211 }
3111 3212
3112 /* Before we test for the disk state, we should wait until an eventually 3213 /* Before we test for the disk state, we should wait until an eventually
@@ -3150,16 +3251,12 @@ static union drbd_state convert_state(union drbd_state ps)
3150 return ms; 3251 return ms;
3151} 3252}
3152 3253
3153static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) 3254static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3154{ 3255{
3155 struct p_req_state *p = (struct p_req_state *)h; 3256 struct p_req_state *p = &mdev->data.rbuf.req_state;
3156 union drbd_state mask, val; 3257 union drbd_state mask, val;
3157 int rv; 3258 int rv;
3158 3259
3159 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3160 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3161 return FALSE;
3162
3163 mask.i = be32_to_cpu(p->mask); 3260 mask.i = be32_to_cpu(p->mask);
3164 val.i = be32_to_cpu(p->val); 3261 val.i = be32_to_cpu(p->val);
3165 3262
@@ -3180,20 +3277,14 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3180 return TRUE; 3277 return TRUE;
3181} 3278}
3182 3279
3183static int receive_state(struct drbd_conf *mdev, struct p_header *h) 3280static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3184{ 3281{
3185 struct p_state *p = (struct p_state *)h; 3282 struct p_state *p = &mdev->data.rbuf.state;
3186 enum drbd_conns nconn, oconn; 3283 union drbd_state os, ns, peer_state;
3187 union drbd_state ns, peer_state;
3188 enum drbd_disk_state real_peer_disk; 3284 enum drbd_disk_state real_peer_disk;
3285 enum chg_state_flags cs_flags;
3189 int rv; 3286 int rv;
3190 3287
3191 ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3192 return FALSE;
3193
3194 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3195 return FALSE;
3196
3197 peer_state.i = be32_to_cpu(p->state); 3288 peer_state.i = be32_to_cpu(p->state);
3198 3289
3199 real_peer_disk = peer_state.disk; 3290 real_peer_disk = peer_state.disk;
@@ -3204,38 +3295,72 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3204 3295
3205 spin_lock_irq(&mdev->req_lock); 3296 spin_lock_irq(&mdev->req_lock);
3206 retry: 3297 retry:
3207 oconn = nconn = mdev->state.conn; 3298 os = ns = mdev->state;
3208 spin_unlock_irq(&mdev->req_lock); 3299 spin_unlock_irq(&mdev->req_lock);
3209 3300
3210 if (nconn == C_WF_REPORT_PARAMS) 3301 /* peer says his disk is uptodate, while we think it is inconsistent,
3211 nconn = C_CONNECTED; 3302 * and this happens while we think we have a sync going on. */
3303 if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
3304 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3305 /* If we are (becoming) SyncSource, but peer is still in sync
3306 * preparation, ignore its uptodate-ness to avoid flapping, it
3307 * will change to inconsistent once the peer reaches active
3308 * syncing states.
3309 * It may have changed syncer-paused flags, however, so we
3310 * cannot ignore this completely. */
3311 if (peer_state.conn > C_CONNECTED &&
3312 peer_state.conn < C_SYNC_SOURCE)
3313 real_peer_disk = D_INCONSISTENT;
3314
3315 /* if peer_state changes to connected at the same time,
3316 * it explicitly notifies us that it finished resync.
3317 * Maybe we should finish it up, too? */
3318 else if (os.conn >= C_SYNC_SOURCE &&
3319 peer_state.conn == C_CONNECTED) {
3320 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3321 drbd_resync_finished(mdev);
3322 return TRUE;
3323 }
3324 }
3325
3326 /* peer says his disk is inconsistent, while we think it is uptodate,
3327 * and this happens while the peer still thinks we have a sync going on,
3328 * but we think we are already done with the sync.
3329 * We ignore this to avoid flapping pdsk.
3330 * This should not happen, if the peer is a recent version of drbd. */
3331 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3332 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3333 real_peer_disk = D_UP_TO_DATE;
3334
3335 if (ns.conn == C_WF_REPORT_PARAMS)
3336 ns.conn = C_CONNECTED;
3212 3337
3213 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && 3338 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3214 get_ldev_if_state(mdev, D_NEGOTIATING)) { 3339 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3215 int cr; /* consider resync */ 3340 int cr; /* consider resync */
3216 3341
3217 /* if we established a new connection */ 3342 /* if we established a new connection */
3218 cr = (oconn < C_CONNECTED); 3343 cr = (os.conn < C_CONNECTED);
3219 /* if we had an established connection 3344 /* if we had an established connection
3220 * and one of the nodes newly attaches a disk */ 3345 * and one of the nodes newly attaches a disk */
3221 cr |= (oconn == C_CONNECTED && 3346 cr |= (os.conn == C_CONNECTED &&
3222 (peer_state.disk == D_NEGOTIATING || 3347 (peer_state.disk == D_NEGOTIATING ||
3223 mdev->state.disk == D_NEGOTIATING)); 3348 os.disk == D_NEGOTIATING));
3224 /* if we have both been inconsistent, and the peer has been 3349 /* if we have both been inconsistent, and the peer has been
3225 * forced to be UpToDate with --overwrite-data */ 3350 * forced to be UpToDate with --overwrite-data */
3226 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); 3351 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3227 /* if we had been plain connected, and the admin requested to 3352 /* if we had been plain connected, and the admin requested to
3228 * start a sync by "invalidate" or "invalidate-remote" */ 3353 * start a sync by "invalidate" or "invalidate-remote" */
3229 cr |= (oconn == C_CONNECTED && 3354 cr |= (os.conn == C_CONNECTED &&
3230 (peer_state.conn >= C_STARTING_SYNC_S && 3355 (peer_state.conn >= C_STARTING_SYNC_S &&
3231 peer_state.conn <= C_WF_BITMAP_T)); 3356 peer_state.conn <= C_WF_BITMAP_T));
3232 3357
3233 if (cr) 3358 if (cr)
3234 nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); 3359 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3235 3360
3236 put_ldev(mdev); 3361 put_ldev(mdev);
3237 if (nconn == C_MASK) { 3362 if (ns.conn == C_MASK) {
3238 nconn = C_CONNECTED; 3363 ns.conn = C_CONNECTED;
3239 if (mdev->state.disk == D_NEGOTIATING) { 3364 if (mdev->state.disk == D_NEGOTIATING) {
3240 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3365 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3241 } else if (peer_state.disk == D_NEGOTIATING) { 3366 } else if (peer_state.disk == D_NEGOTIATING) {
@@ -3245,7 +3370,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3245 } else { 3370 } else {
3246 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) 3371 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3247 return FALSE; 3372 return FALSE;
3248 D_ASSERT(oconn == C_WF_REPORT_PARAMS); 3373 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
3249 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3374 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3250 return FALSE; 3375 return FALSE;
3251 } 3376 }
@@ -3253,18 +3378,28 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3253 } 3378 }
3254 3379
3255 spin_lock_irq(&mdev->req_lock); 3380 spin_lock_irq(&mdev->req_lock);
3256 if (mdev->state.conn != oconn) 3381 if (mdev->state.i != os.i)
3257 goto retry; 3382 goto retry;
3258 clear_bit(CONSIDER_RESYNC, &mdev->flags); 3383 clear_bit(CONSIDER_RESYNC, &mdev->flags);
3259 ns.i = mdev->state.i;
3260 ns.conn = nconn;
3261 ns.peer = peer_state.role; 3384 ns.peer = peer_state.role;
3262 ns.pdsk = real_peer_disk; 3385 ns.pdsk = real_peer_disk;
3263 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); 3386 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3264 if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) 3387 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3265 ns.disk = mdev->new_state_tmp.disk; 3388 ns.disk = mdev->new_state_tmp.disk;
3266 3389 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
3267 rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); 3390 if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
3391 test_bit(NEW_CUR_UUID, &mdev->flags)) {
3392 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
 3393	   for temporary network outages! */
3394 spin_unlock_irq(&mdev->req_lock);
3395 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3396 tl_clear(mdev);
3397 drbd_uuid_new_current(mdev);
3398 clear_bit(NEW_CUR_UUID, &mdev->flags);
3399 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3400 return FALSE;
3401 }
3402 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
3268 ns = mdev->state; 3403 ns = mdev->state;
3269 spin_unlock_irq(&mdev->req_lock); 3404 spin_unlock_irq(&mdev->req_lock);
3270 3405
@@ -3273,8 +3408,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3273 return FALSE; 3408 return FALSE;
3274 } 3409 }
3275 3410
3276 if (oconn > C_WF_REPORT_PARAMS) { 3411 if (os.conn > C_WF_REPORT_PARAMS) {
3277 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && 3412 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3278 peer_state.disk != D_NEGOTIATING ) { 3413 peer_state.disk != D_NEGOTIATING ) {
3279 /* we want resync, peer has not yet decided to sync... */ 3414 /* we want resync, peer has not yet decided to sync... */
3280 /* Nowadays only used when forcing a node into primary role and 3415 /* Nowadays only used when forcing a node into primary role and
@@ -3291,9 +3426,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3291 return TRUE; 3426 return TRUE;
3292} 3427}
3293 3428
3294static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) 3429static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3295{ 3430{
3296 struct p_rs_uuid *p = (struct p_rs_uuid *)h; 3431 struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
3297 3432
3298 wait_event(mdev->misc_wait, 3433 wait_event(mdev->misc_wait,
3299 mdev->state.conn == C_WF_SYNC_UUID || 3434 mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3302,10 +3437,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3302 3437
3303 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ 3438 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3304 3439
3305 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3306 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3307 return FALSE;
3308
3309 /* Here the _drbd_uuid_ functions are right, current should 3440 /* Here the _drbd_uuid_ functions are right, current should
3310 _not_ be rotated into the history */ 3441 _not_ be rotated into the history */
3311 if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 3442 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -3324,14 +3455,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3324enum receive_bitmap_ret { OK, DONE, FAILED }; 3455enum receive_bitmap_ret { OK, DONE, FAILED };
3325 3456
3326static enum receive_bitmap_ret 3457static enum receive_bitmap_ret
3327receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, 3458receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3328 unsigned long *buffer, struct bm_xfer_ctx *c) 3459 unsigned long *buffer, struct bm_xfer_ctx *c)
3329{ 3460{
3330 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); 3461 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3331 unsigned want = num_words * sizeof(long); 3462 unsigned want = num_words * sizeof(long);
3332 3463
3333 if (want != h->length) { 3464 if (want != data_size) {
3334 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); 3465 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
3335 return FAILED; 3466 return FAILED;
3336 } 3467 }
3337 if (want == 0) 3468 if (want == 0)
@@ -3360,7 +3491,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3360 u64 tmp; 3491 u64 tmp;
3361 unsigned long s = c->bit_offset; 3492 unsigned long s = c->bit_offset;
3362 unsigned long e; 3493 unsigned long e;
3363 int len = p->head.length - (sizeof(*p) - sizeof(p->head)); 3494 int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
3364 int toggle = DCBP_get_start(p); 3495 int toggle = DCBP_get_start(p);
3365 int have; 3496 int have;
3366 int bits; 3497 int bits;
@@ -3429,7 +3560,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3429 const char *direction, struct bm_xfer_ctx *c) 3560 const char *direction, struct bm_xfer_ctx *c)
3430{ 3561{
3431 /* what would it take to transfer it "plaintext" */ 3562 /* what would it take to transfer it "plaintext" */
3432 unsigned plain = sizeof(struct p_header) * 3563 unsigned plain = sizeof(struct p_header80) *
3433 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) 3564 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3434 + c->bm_words * sizeof(long); 3565 + c->bm_words * sizeof(long);
3435 unsigned total = c->bytes[0] + c->bytes[1]; 3566 unsigned total = c->bytes[0] + c->bytes[1];
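The "plaintext" figure above is header overhead plus the raw word array; a hedged restatement of that arithmetic as a standalone helper (bm_plain_bytes is an invented name, BM_PACKET_WORDS is the per-packet word budget used throughout this file):

/* Bytes a full, uncompressed bitmap transfer would cost: one p_header80 per
 * data packet, one extra for the terminating packet, plus the words. */
static unsigned int bm_plain_bytes(unsigned long bm_words)
{
	unsigned long packets = (bm_words + BM_PACKET_WORDS - 1) / BM_PACKET_WORDS;

	return sizeof(struct p_header80) * (packets + 1)
		+ bm_words * sizeof(long);
}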
@@ -3467,12 +3598,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3467 in order to be agnostic to the 32 vs 64 bits issue. 3598 in order to be agnostic to the 32 vs 64 bits issue.
3468 3599
3469 returns 0 on failure, 1 if we successfully received it. */ 3600 returns 0 on failure, 1 if we successfully received it. */
3470static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) 3601static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3471{ 3602{
3472 struct bm_xfer_ctx c; 3603 struct bm_xfer_ctx c;
3473 void *buffer; 3604 void *buffer;
3474 enum receive_bitmap_ret ret; 3605 enum receive_bitmap_ret ret;
3475 int ok = FALSE; 3606 int ok = FALSE;
3607 struct p_header80 *h = &mdev->data.rbuf.header.h80;
3476 3608
3477 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); 3609 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3478 3610
@@ -3492,39 +3624,39 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3492 }; 3624 };
3493 3625
3494 do { 3626 do {
3495 if (h->command == P_BITMAP) { 3627 if (cmd == P_BITMAP) {
3496 ret = receive_bitmap_plain(mdev, h, buffer, &c); 3628 ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
3497 } else if (h->command == P_COMPRESSED_BITMAP) { 3629 } else if (cmd == P_COMPRESSED_BITMAP) {
3498 /* MAYBE: sanity check that we speak proto >= 90, 3630 /* MAYBE: sanity check that we speak proto >= 90,
3499 * and the feature is enabled! */ 3631 * and the feature is enabled! */
3500 struct p_compressed_bm *p; 3632 struct p_compressed_bm *p;
3501 3633
3502 if (h->length > BM_PACKET_PAYLOAD_BYTES) { 3634 if (data_size > BM_PACKET_PAYLOAD_BYTES) {
3503 dev_err(DEV, "ReportCBitmap packet too large\n"); 3635 dev_err(DEV, "ReportCBitmap packet too large\n");
3504 goto out; 3636 goto out;
3505 } 3637 }
3506 /* use the page buff */ 3638 /* use the page buff */
3507 p = buffer; 3639 p = buffer;
3508 memcpy(p, h, sizeof(*h)); 3640 memcpy(p, h, sizeof(*h));
3509 if (drbd_recv(mdev, p->head.payload, h->length) != h->length) 3641 if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
3510 goto out; 3642 goto out;
3511 if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { 3643 if (data_size <= (sizeof(*p) - sizeof(p->head))) {
3512 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); 3644 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
3513 return FAILED; 3645 return FAILED;
3514 } 3646 }
3515 ret = decode_bitmap_c(mdev, p, &c); 3647 ret = decode_bitmap_c(mdev, p, &c);
3516 } else { 3648 } else {
3517 dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); 3649 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
3518 goto out; 3650 goto out;
3519 } 3651 }
3520 3652
3521 c.packets[h->command == P_BITMAP]++; 3653 c.packets[cmd == P_BITMAP]++;
3522 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; 3654 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
3523 3655
3524 if (ret != OK) 3656 if (ret != OK)
3525 break; 3657 break;
3526 3658
3527 if (!drbd_recv_header(mdev, h)) 3659 if (!drbd_recv_header(mdev, &cmd, &data_size))
3528 goto out; 3660 goto out;
3529 } while (ret == OK); 3661 } while (ret == OK);
3530 if (ret == FAILED) 3662 if (ret == FAILED)
@@ -3555,17 +3687,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3555 return ok; 3687 return ok;
3556} 3688}
3557 3689
3558static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent) 3690static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3559{ 3691{
3560 /* TODO zero copy sink :) */ 3692 /* TODO zero copy sink :) */
3561 static char sink[128]; 3693 static char sink[128];
3562 int size, want, r; 3694 int size, want, r;
3563 3695
3564 if (!silent) 3696 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3565 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", 3697 cmd, data_size);
3566 h->command, h->length);
3567 3698
3568 size = h->length; 3699 size = data_size;
3569 while (size > 0) { 3700 while (size > 0) {
3570 want = min_t(int, size, sizeof(sink)); 3701 want = min_t(int, size, sizeof(sink));
3571 r = drbd_recv(mdev, sink, want); 3702 r = drbd_recv(mdev, sink, want);
@@ -3575,17 +3706,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
3575 return size == 0; 3706 return size == 0;
3576} 3707}
3577 3708
3578static int receive_skip(struct drbd_conf *mdev, struct p_header *h) 3709static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3579{
3580 return receive_skip_(mdev, h, 0);
3581}
3582
3583static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
3584{
3585 return receive_skip_(mdev, h, 1);
3586}
3587
3588static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3589{ 3710{
3590 if (mdev->state.disk >= D_INCONSISTENT) 3711 if (mdev->state.disk >= D_INCONSISTENT)
3591 drbd_kick_lo(mdev); 3712 drbd_kick_lo(mdev);
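The receive_skip() loop above drains the advertised payload into a small scratch buffer so the byte stream stays in sync; the same pattern in isolation, with drain_payload as an invented name and the error path reduced to a break:

/* Consume data_size bytes from the data socket without using them. */
static int drain_payload(struct drbd_conf *mdev, unsigned int data_size)
{
	static char sink[128];
	int want, r;

	while (data_size > 0) {
		want = min_t(int, data_size, sizeof(sink));
		r = drbd_recv(mdev, sink, want);
		if (r <= 0)
			break;		/* socket error; caller notices the short drain */
		data_size -= r;
	}
	return data_size == 0;
}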
@@ -3597,108 +3718,94 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3597 return TRUE; 3718 return TRUE;
3598} 3719}
3599 3720
3600typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); 3721typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
3601 3722
3602static drbd_cmd_handler_f drbd_default_handler[] = { 3723struct data_cmd {
3603 [P_DATA] = receive_Data, 3724 int expect_payload;
3604 [P_DATA_REPLY] = receive_DataReply, 3725 size_t pkt_size;
3605 [P_RS_DATA_REPLY] = receive_RSDataReply, 3726 drbd_cmd_handler_f function;
3606 [P_BARRIER] = receive_Barrier, 3727};
3607 [P_BITMAP] = receive_bitmap, 3728
3608 [P_COMPRESSED_BITMAP] = receive_bitmap, 3729static struct data_cmd drbd_cmd_handler[] = {
3609 [P_UNPLUG_REMOTE] = receive_UnplugRemote, 3730 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
3610 [P_DATA_REQUEST] = receive_DataRequest, 3731 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
3611 [P_RS_DATA_REQUEST] = receive_DataRequest, 3732 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3612 [P_SYNC_PARAM] = receive_SyncParam, 3733 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3613 [P_SYNC_PARAM89] = receive_SyncParam, 3734 [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3614 [P_PROTOCOL] = receive_protocol, 3735 [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3615 [P_UUIDS] = receive_uuids, 3736 [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3616 [P_SIZES] = receive_sizes, 3737 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3617 [P_STATE] = receive_state, 3738 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3618 [P_STATE_CHG_REQ] = receive_req_state, 3739 [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
3619 [P_SYNC_UUID] = receive_sync_uuid, 3740 [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
3620 [P_OV_REQUEST] = receive_DataRequest, 3741 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
3621 [P_OV_REPLY] = receive_DataRequest, 3742 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
3622 [P_CSUM_RS_REQUEST] = receive_DataRequest, 3743 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
3623 [P_DELAY_PROBE] = receive_skip_silent, 3744 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
3745 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
3746 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3747 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3748 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3749 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3750 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
3624 /* anything missing from this table is in 3751 /* anything missing from this table is in
3625 * the asender_tbl, see get_asender_cmd */ 3752 * the asender_tbl, see get_asender_cmd */
3626 [P_MAX_CMD] = NULL, 3753 [P_MAX_CMD] = { 0, 0, NULL },
3627}; 3754};
3628 3755
3629static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; 3756/* All handler functions that expect a sub-header get that sub-header in
3630static drbd_cmd_handler_f *drbd_opt_cmd_handler; 3757 mdev->data.rbuf.header.head.payload.
3758
3759 Usually in mdev->data.rbuf.header.head the callback can find the usual
 3760   p_header, but it may not rely on that, since there is also p_header95.
3761 */
3631 3762
3632static void drbdd(struct drbd_conf *mdev) 3763static void drbdd(struct drbd_conf *mdev)
3633{ 3764{
3634 drbd_cmd_handler_f handler; 3765 union p_header *header = &mdev->data.rbuf.header;
3635 struct p_header *header = &mdev->data.rbuf.header; 3766 unsigned int packet_size;
3767 enum drbd_packets cmd;
3768 size_t shs; /* sub header size */
3769 int rv;
3636 3770
3637 while (get_t_state(&mdev->receiver) == Running) { 3771 while (get_t_state(&mdev->receiver) == Running) {
3638 drbd_thread_current_set_cpu(mdev); 3772 drbd_thread_current_set_cpu(mdev);
3639 if (!drbd_recv_header(mdev, header)) { 3773 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3640 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3774 goto err_out;
3641 break;
3642 }
3643 3775
3644 if (header->command < P_MAX_CMD) 3776 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3645 handler = drbd_cmd_handler[header->command]; 3777 dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3646 else if (P_MAY_IGNORE < header->command 3778 goto err_out;
3647 && header->command < P_MAX_OPT_CMD) 3779 }
3648 handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3649 else if (header->command > P_MAX_OPT_CMD)
3650 handler = receive_skip;
3651 else
3652 handler = NULL;
3653 3780
3654 if (unlikely(!handler)) { 3781 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
3655 dev_err(DEV, "unknown packet type %d, l: %d!\n", 3782 rv = drbd_recv(mdev, &header->h80.payload, shs);
3656 header->command, header->length); 3783 if (unlikely(rv != shs)) {
3657 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3784 dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
3658 break; 3785 goto err_out;
3659 } 3786 }
3660 if (unlikely(!handler(mdev, header))) { 3787
3661 dev_err(DEV, "error receiving %s, l: %d!\n", 3788 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3662 cmdname(header->command), header->length); 3789 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3663 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3790 goto err_out;
3664 break;
3665 } 3791 }
3666 }
3667}
3668 3792
3669static void drbd_fail_pending_reads(struct drbd_conf *mdev) 3793 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3670{
3671 struct hlist_head *slot;
3672 struct hlist_node *pos;
3673 struct hlist_node *tmp;
3674 struct drbd_request *req;
3675 int i;
3676 3794
3677 /* 3795 if (unlikely(!rv)) {
3678 * Application READ requests 3796 dev_err(DEV, "error receiving %s, l: %d!\n",
3679 */ 3797 cmdname(cmd), packet_size);
3680 spin_lock_irq(&mdev->req_lock); 3798 goto err_out;
3681 for (i = 0; i < APP_R_HSIZE; i++) {
3682 slot = mdev->app_reads_hash+i;
3683 hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
3684 /* it may (but should not any longer!)
3685 * be on the work queue; if that assert triggers,
3686 * we need to also grab the
3687 * spin_lock_irq(&mdev->data.work.q_lock);
3688 * and list_del_init here. */
3689 D_ASSERT(list_empty(&req->w.list));
3690 /* It would be nice to complete outside of spinlock.
3691 * But this is easier for now. */
3692 _req_mod(req, connection_lost_while_pending);
3693 } 3799 }
3694 } 3800 }
3695 for (i = 0; i < APP_R_HSIZE; i++)
3696 if (!hlist_empty(mdev->app_reads_hash+i))
3697 dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
3698 "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
3699 3801
3700 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); 3802 if (0) {
3701 spin_unlock_irq(&mdev->req_lock); 3803 err_out:
3804 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3805 }
3806 /* If we leave here, we probably want to update at least the
3807 * "Connected" indicator on stable storage. Do so explicitly here. */
3808 drbd_md_sync(mdev);
3702} 3809}
3703 3810
3704void drbd_flush_workqueue(struct drbd_conf *mdev) 3811void drbd_flush_workqueue(struct drbd_conf *mdev)
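A condensed sketch of one pass through the rewritten receive loop above, assuming the data_cmd table and drbd_recv_header() behave exactly as shown in this hunk; drbdd_one_packet is an invented name and error handling collapses to a plain 0 return:

/* One iteration of the new dispatch:
 *  1. read and validate the packet header (command + payload length),
 *  2. look the command up in the static data_cmd table,
 *  3. pull in the fixed-size sub-header,
 *  4. hand the remaining payload length to the per-command handler. */
static int drbdd_one_packet(struct drbd_conf *mdev)
{
	union p_header *header = &mdev->data.rbuf.header;
	unsigned int packet_size;
	enum drbd_packets cmd;
	size_t shs;
	int rv;

	if (!drbd_recv_header(mdev, &cmd, &packet_size))
		return 0;
	if (cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)
		return 0;				/* unknown packet type */

	shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
	rv = drbd_recv(mdev, &header->h80.payload, shs);
	if (rv != shs)
		return 0;				/* short read of sub-header */
	if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload)
		return 0;				/* payload where none belongs */

	return drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
}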
@@ -3711,6 +3818,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
3711 wait_for_completion(&barr.done); 3818 wait_for_completion(&barr.done);
3712} 3819}
3713 3820
3821void drbd_free_tl_hash(struct drbd_conf *mdev)
3822{
3823 struct hlist_head *h;
3824
3825 spin_lock_irq(&mdev->req_lock);
3826
3827 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3828 spin_unlock_irq(&mdev->req_lock);
3829 return;
3830 }
3831 /* paranoia code */
3832 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3833 if (h->first)
3834 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3835 (int)(h - mdev->ee_hash), h->first);
3836 kfree(mdev->ee_hash);
3837 mdev->ee_hash = NULL;
3838 mdev->ee_hash_s = 0;
3839
3840 /* paranoia code */
3841 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3842 if (h->first)
3843 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3844 (int)(h - mdev->tl_hash), h->first);
3845 kfree(mdev->tl_hash);
3846 mdev->tl_hash = NULL;
3847 mdev->tl_hash_s = 0;
3848 spin_unlock_irq(&mdev->req_lock);
3849}
3850
3714static void drbd_disconnect(struct drbd_conf *mdev) 3851static void drbd_disconnect(struct drbd_conf *mdev)
3715{ 3852{
3716 enum drbd_fencing_p fp; 3853 enum drbd_fencing_p fp;
@@ -3728,6 +3865,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3728 drbd_thread_stop(&mdev->asender); 3865 drbd_thread_stop(&mdev->asender);
3729 drbd_free_sock(mdev); 3866 drbd_free_sock(mdev);
3730 3867
3868 /* wait for current activity to cease. */
3731 spin_lock_irq(&mdev->req_lock); 3869 spin_lock_irq(&mdev->req_lock);
3732 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 3870 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3733 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); 3871 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
@@ -3752,7 +3890,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3752 3890
3753 /* make sure syncer is stopped and w_resume_next_sg queued */ 3891 /* make sure syncer is stopped and w_resume_next_sg queued */
3754 del_timer_sync(&mdev->resync_timer); 3892 del_timer_sync(&mdev->resync_timer);
3755 set_bit(STOP_SYNC_TIMER, &mdev->flags);
3756 resync_timer_fn((unsigned long)mdev); 3893 resync_timer_fn((unsigned long)mdev);
3757 3894
3758 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, 3895 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
@@ -3767,11 +3904,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3767 kfree(mdev->p_uuid); 3904 kfree(mdev->p_uuid);
3768 mdev->p_uuid = NULL; 3905 mdev->p_uuid = NULL;
3769 3906
3770 if (!mdev->state.susp) 3907 if (!is_susp(mdev->state))
3771 tl_clear(mdev); 3908 tl_clear(mdev);
3772 3909
3773 drbd_fail_pending_reads(mdev);
3774
3775 dev_info(DEV, "Connection closed\n"); 3910 dev_info(DEV, "Connection closed\n");
3776 3911
3777 drbd_md_sync(mdev); 3912 drbd_md_sync(mdev);
@@ -3782,12 +3917,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3782 put_ldev(mdev); 3917 put_ldev(mdev);
3783 } 3918 }
3784 3919
3785 if (mdev->state.role == R_PRIMARY) { 3920 if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3786 if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { 3921 drbd_try_outdate_peer_async(mdev);
3787 enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
3788 drbd_request_state(mdev, NS(pdsk, nps));
3789 }
3790 }
3791 3922
3792 spin_lock_irq(&mdev->req_lock); 3923 spin_lock_irq(&mdev->req_lock);
3793 os = mdev->state; 3924 os = mdev->state;
@@ -3800,32 +3931,14 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3800 spin_unlock_irq(&mdev->req_lock); 3931 spin_unlock_irq(&mdev->req_lock);
3801 3932
3802 if (os.conn == C_DISCONNECTING) { 3933 if (os.conn == C_DISCONNECTING) {
3803 struct hlist_head *h; 3934 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3804 wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
3805 3935
3806 /* we must not free the tl_hash 3936 if (!is_susp(mdev->state)) {
3807 * while application io is still on the fly */ 3937 /* we must not free the tl_hash
3808 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); 3938 * while application io is still on the fly */
3809 3939 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3810 spin_lock_irq(&mdev->req_lock); 3940 drbd_free_tl_hash(mdev);
3811 /* paranoia code */ 3941 }
3812 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3813 if (h->first)
3814 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3815 (int)(h - mdev->ee_hash), h->first);
3816 kfree(mdev->ee_hash);
3817 mdev->ee_hash = NULL;
3818 mdev->ee_hash_s = 0;
3819
3820 /* paranoia code */
3821 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3822 if (h->first)
3823 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3824 (int)(h - mdev->tl_hash), h->first);
3825 kfree(mdev->tl_hash);
3826 mdev->tl_hash = NULL;
3827 mdev->tl_hash_s = 0;
3828 spin_unlock_irq(&mdev->req_lock);
3829 3942
3830 crypto_free_hash(mdev->cram_hmac_tfm); 3943 crypto_free_hash(mdev->cram_hmac_tfm);
3831 mdev->cram_hmac_tfm = NULL; 3944 mdev->cram_hmac_tfm = NULL;
@@ -3845,6 +3958,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3845 i = drbd_release_ee(mdev, &mdev->net_ee); 3958 i = drbd_release_ee(mdev, &mdev->net_ee);
3846 if (i) 3959 if (i)
3847 dev_info(DEV, "net_ee not empty, killed %u entries\n", i); 3960 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3961 i = atomic_read(&mdev->pp_in_use_by_net);
3962 if (i)
3963 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
3848 i = atomic_read(&mdev->pp_in_use); 3964 i = atomic_read(&mdev->pp_in_use);
3849 if (i) 3965 if (i)
3850 dev_info(DEV, "pp_in_use = %d, expected 0\n", i); 3966 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@ -3888,7 +4004,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
3888 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); 4004 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3889 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); 4005 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3890 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, 4006 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3891 (struct p_header *)p, sizeof(*p), 0 ); 4007 (struct p_header80 *)p, sizeof(*p), 0 );
3892 mutex_unlock(&mdev->data.mutex); 4008 mutex_unlock(&mdev->data.mutex);
3893 return ok; 4009 return ok;
3894} 4010}
@@ -3904,27 +4020,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
3904{ 4020{
3905 /* ASSERT current == mdev->receiver ... */ 4021 /* ASSERT current == mdev->receiver ... */
3906 struct p_handshake *p = &mdev->data.rbuf.handshake; 4022 struct p_handshake *p = &mdev->data.rbuf.handshake;
3907 const int expect = sizeof(struct p_handshake) 4023 const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
3908 -sizeof(struct p_header); 4024 unsigned int length;
4025 enum drbd_packets cmd;
3909 int rv; 4026 int rv;
3910 4027
3911 rv = drbd_send_handshake(mdev); 4028 rv = drbd_send_handshake(mdev);
3912 if (!rv) 4029 if (!rv)
3913 return 0; 4030 return 0;
3914 4031
3915 rv = drbd_recv_header(mdev, &p->head); 4032 rv = drbd_recv_header(mdev, &cmd, &length);
3916 if (!rv) 4033 if (!rv)
3917 return 0; 4034 return 0;
3918 4035
3919 if (p->head.command != P_HAND_SHAKE) { 4036 if (cmd != P_HAND_SHAKE) {
3920 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", 4037 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3921 cmdname(p->head.command), p->head.command); 4038 cmdname(cmd), cmd);
3922 return -1; 4039 return -1;
3923 } 4040 }
3924 4041
3925 if (p->head.length != expect) { 4042 if (length != expect) {
3926 dev_err(DEV, "expected HandShake length: %u, received: %u\n", 4043 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3927 expect, p->head.length); 4044 expect, length);
3928 return -1; 4045 return -1;
3929 } 4046 }
3930 4047
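After the length check above, drbd_do_handshake() goes on to compare the peer's advertised version window with ours; that part lies outside this hunk, so the sketch below is only an assumption about how the two [protocol_min, protocol_max] ranges would be intersected (pick_protocol_version is an invented name):

/* The windows [PRO_VERSION_MIN, PRO_VERSION_MAX] and [peer_min, peer_max]
 * must overlap; agree on the highest version inside the overlap. */
static int pick_protocol_version(u32 peer_min, u32 peer_max)
{
	if (peer_max < PRO_VERSION_MIN || peer_min > PRO_VERSION_MAX)
		return -1;			/* incompatible peer */
	return min_t(u32, PRO_VERSION_MAX, peer_max);
}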
@@ -3982,10 +4099,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
3982 char *response = NULL; 4099 char *response = NULL;
3983 char *right_response = NULL; 4100 char *right_response = NULL;
3984 char *peers_ch = NULL; 4101 char *peers_ch = NULL;
3985 struct p_header p;
3986 unsigned int key_len = strlen(mdev->net_conf->shared_secret); 4102 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
3987 unsigned int resp_size; 4103 unsigned int resp_size;
3988 struct hash_desc desc; 4104 struct hash_desc desc;
4105 enum drbd_packets cmd;
4106 unsigned int length;
3989 int rv; 4107 int rv;
3990 4108
3991 desc.tfm = mdev->cram_hmac_tfm; 4109 desc.tfm = mdev->cram_hmac_tfm;
@@ -4005,33 +4123,33 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4005 if (!rv) 4123 if (!rv)
4006 goto fail; 4124 goto fail;
4007 4125
4008 rv = drbd_recv_header(mdev, &p); 4126 rv = drbd_recv_header(mdev, &cmd, &length);
4009 if (!rv) 4127 if (!rv)
4010 goto fail; 4128 goto fail;
4011 4129
4012 if (p.command != P_AUTH_CHALLENGE) { 4130 if (cmd != P_AUTH_CHALLENGE) {
4013 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", 4131 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4014 cmdname(p.command), p.command); 4132 cmdname(cmd), cmd);
4015 rv = 0; 4133 rv = 0;
4016 goto fail; 4134 goto fail;
4017 } 4135 }
4018 4136
4019 if (p.length > CHALLENGE_LEN*2) { 4137 if (length > CHALLENGE_LEN * 2) {
4020 dev_err(DEV, "expected AuthChallenge payload too big.\n"); 4138 dev_err(DEV, "expected AuthChallenge payload too big.\n");
4021 rv = -1; 4139 rv = -1;
4022 goto fail; 4140 goto fail;
4023 } 4141 }
4024 4142
4025 peers_ch = kmalloc(p.length, GFP_NOIO); 4143 peers_ch = kmalloc(length, GFP_NOIO);
4026 if (peers_ch == NULL) { 4144 if (peers_ch == NULL) {
4027 dev_err(DEV, "kmalloc of peers_ch failed\n"); 4145 dev_err(DEV, "kmalloc of peers_ch failed\n");
4028 rv = -1; 4146 rv = -1;
4029 goto fail; 4147 goto fail;
4030 } 4148 }
4031 4149
4032 rv = drbd_recv(mdev, peers_ch, p.length); 4150 rv = drbd_recv(mdev, peers_ch, length);
4033 4151
4034 if (rv != p.length) { 4152 if (rv != length) {
4035 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); 4153 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
4036 rv = 0; 4154 rv = 0;
4037 goto fail; 4155 goto fail;
@@ -4046,7 +4164,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4046 } 4164 }
4047 4165
4048 sg_init_table(&sg, 1); 4166 sg_init_table(&sg, 1);
4049 sg_set_buf(&sg, peers_ch, p.length); 4167 sg_set_buf(&sg, peers_ch, length);
4050 4168
4051 rv = crypto_hash_digest(&desc, &sg, sg.length, response); 4169 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4052 if (rv) { 4170 if (rv) {
@@ -4059,18 +4177,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4059 if (!rv) 4177 if (!rv)
4060 goto fail; 4178 goto fail;
4061 4179
4062 rv = drbd_recv_header(mdev, &p); 4180 rv = drbd_recv_header(mdev, &cmd, &length);
4063 if (!rv) 4181 if (!rv)
4064 goto fail; 4182 goto fail;
4065 4183
4066 if (p.command != P_AUTH_RESPONSE) { 4184 if (cmd != P_AUTH_RESPONSE) {
4067 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", 4185 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
4068 cmdname(p.command), p.command); 4186 cmdname(cmd), cmd);
4069 rv = 0; 4187 rv = 0;
4070 goto fail; 4188 goto fail;
4071 } 4189 }
4072 4190
4073 if (p.length != resp_size) { 4191 if (length != resp_size) {
4074 dev_err(DEV, "expected AuthResponse payload of wrong size\n"); 4192 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4075 rv = 0; 4193 rv = 0;
4076 goto fail; 4194 goto fail;
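The hunks above carry the challenge half of the CRAM-HMAC exchange; a compact sketch of just the digest step, using the same crypto_hash calls this function already makes and assuming the shared secret has already been set on mdev->cram_hmac_tfm (hash_challenge is an invented name):

/* response = HMAC(shared_secret, challenge) over the peer's challenge bytes. */
static int hash_challenge(struct drbd_conf *mdev,
			  void *challenge, unsigned int len, void *response)
{
	struct hash_desc desc;
	struct scatterlist sg;

	desc.tfm = mdev->cram_hmac_tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	sg_set_buf(&sg, challenge, len);

	return crypto_hash_digest(&desc, &sg, sg.length, response);
}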
@@ -4155,7 +4273,7 @@ int drbdd_init(struct drbd_thread *thi)
4155 4273
4156/* ********* acknowledge sender ******** */ 4274/* ********* acknowledge sender ******** */
4157 4275
4158static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) 4276static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
4159{ 4277{
4160 struct p_req_state_reply *p = (struct p_req_state_reply *)h; 4278 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4161 4279
@@ -4173,13 +4291,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4173 return TRUE; 4291 return TRUE;
4174} 4292}
4175 4293
4176static int got_Ping(struct drbd_conf *mdev, struct p_header *h) 4294static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
4177{ 4295{
4178 return drbd_send_ping_ack(mdev); 4296 return drbd_send_ping_ack(mdev);
4179 4297
4180} 4298}
4181 4299
4182static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) 4300static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
4183{ 4301{
4184 /* restore idle timeout */ 4302 /* restore idle timeout */
4185 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; 4303 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
@@ -4189,7 +4307,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4189 return TRUE; 4307 return TRUE;
4190} 4308}
4191 4309
4192static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) 4310static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
4193{ 4311{
4194 struct p_block_ack *p = (struct p_block_ack *)h; 4312 struct p_block_ack *p = (struct p_block_ack *)h;
4195 sector_t sector = be64_to_cpu(p->sector); 4313 sector_t sector = be64_to_cpu(p->sector);
@@ -4199,11 +4317,15 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4199 4317
4200 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4318 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4201 4319
4202 drbd_rs_complete_io(mdev, sector); 4320 if (get_ldev(mdev)) {
4203 drbd_set_in_sync(mdev, sector, blksize); 4321 drbd_rs_complete_io(mdev, sector);
4204 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ 4322 drbd_set_in_sync(mdev, sector, blksize);
4205 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); 4323 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4324 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4325 put_ldev(mdev);
4326 }
4206 dec_rs_pending(mdev); 4327 dec_rs_pending(mdev);
4328 atomic_add(blksize >> 9, &mdev->rs_sect_in);
4207 4329
4208 return TRUE; 4330 return TRUE;
4209} 4331}
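The hunk above wraps the local-disk work in a get_ldev()/put_ldev() pair so an ack arriving during detach never touches a vanished backing device; the same guard pattern in isolation (example_ack_path is an invented name):

/* Only touch the backing device while holding a local-disk reference;
 * protocol-level accounting happens whether or not the disk is there. */
static void example_ack_path(struct drbd_conf *mdev, sector_t sector, int blksize)
{
	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, sector);
		drbd_set_in_sync(mdev, sector, blksize);
		put_ldev(mdev);
	}
	dec_rs_pending(mdev);
}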
@@ -4259,7 +4381,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
4259 return TRUE; 4381 return TRUE;
4260} 4382}
4261 4383
4262static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) 4384static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4263{ 4385{
4264 struct p_block_ack *p = (struct p_block_ack *)h; 4386 struct p_block_ack *p = (struct p_block_ack *)h;
4265 sector_t sector = be64_to_cpu(p->sector); 4387 sector_t sector = be64_to_cpu(p->sector);
@@ -4299,7 +4421,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4299 _ack_id_to_req, __func__ , what); 4421 _ack_id_to_req, __func__ , what);
4300} 4422}
4301 4423
4302static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) 4424static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
4303{ 4425{
4304 struct p_block_ack *p = (struct p_block_ack *)h; 4426 struct p_block_ack *p = (struct p_block_ack *)h;
4305 sector_t sector = be64_to_cpu(p->sector); 4427 sector_t sector = be64_to_cpu(p->sector);
@@ -4319,7 +4441,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4319 _ack_id_to_req, __func__ , neg_acked); 4441 _ack_id_to_req, __func__ , neg_acked);
4320} 4442}
4321 4443
4322static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) 4444static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
4323{ 4445{
4324 struct p_block_ack *p = (struct p_block_ack *)h; 4446 struct p_block_ack *p = (struct p_block_ack *)h;
4325 sector_t sector = be64_to_cpu(p->sector); 4447 sector_t sector = be64_to_cpu(p->sector);
@@ -4332,7 +4454,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4332 _ar_id_to_req, __func__ , neg_acked); 4454 _ar_id_to_req, __func__ , neg_acked);
4333} 4455}
4334 4456
4335static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) 4457static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
4336{ 4458{
4337 sector_t sector; 4459 sector_t sector;
4338 int size; 4460 int size;
@@ -4354,7 +4476,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4354 return TRUE; 4476 return TRUE;
4355} 4477}
4356 4478
4357static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) 4479static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4358{ 4480{
4359 struct p_barrier_ack *p = (struct p_barrier_ack *)h; 4481 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4360 4482
@@ -4363,7 +4485,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4363 return TRUE; 4485 return TRUE;
4364} 4486}
4365 4487
4366static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) 4488static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4367{ 4489{
4368 struct p_block_ack *p = (struct p_block_ack *)h; 4490 struct p_block_ack *p = (struct p_block_ack *)h;
4369 struct drbd_work *w; 4491 struct drbd_work *w;
@@ -4380,6 +4502,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4380 else 4502 else
4381 ov_oos_print(mdev); 4503 ov_oos_print(mdev);
4382 4504
4505 if (!get_ldev(mdev))
4506 return TRUE;
4507
4383 drbd_rs_complete_io(mdev, sector); 4508 drbd_rs_complete_io(mdev, sector);
4384 dec_rs_pending(mdev); 4509 dec_rs_pending(mdev);
4385 4510
@@ -4394,18 +4519,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4394 drbd_resync_finished(mdev); 4519 drbd_resync_finished(mdev);
4395 } 4520 }
4396 } 4521 }
4522 put_ldev(mdev);
4397 return TRUE; 4523 return TRUE;
4398} 4524}
4399 4525
4400static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h) 4526static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
4401{ 4527{
4402 /* IGNORE */
4403 return TRUE; 4528 return TRUE;
4404} 4529}
4405 4530
4406struct asender_cmd { 4531struct asender_cmd {
4407 size_t pkt_size; 4532 size_t pkt_size;
4408 int (*process)(struct drbd_conf *mdev, struct p_header *h); 4533 int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
4409}; 4534};
4410 4535
4411static struct asender_cmd *get_asender_cmd(int cmd) 4536static struct asender_cmd *get_asender_cmd(int cmd)
@@ -4414,8 +4539,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4414 /* anything missing from this table is in 4539 /* anything missing from this table is in
4415 * the drbd_cmd_handler (drbd_default_handler) table, 4540 * the drbd_cmd_handler (drbd_default_handler) table,
4416 * see the beginning of drbdd() */ 4541 * see the beginning of drbdd() */
4417 [P_PING] = { sizeof(struct p_header), got_Ping }, 4542 [P_PING] = { sizeof(struct p_header80), got_Ping },
4418 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, 4543 [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
4419 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4544 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4420 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4545 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4421 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4546 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -4427,7 +4552,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4427 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 4552 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4428 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4553 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4429 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4554 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4430 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m }, 4555 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
4431 [P_MAX_CMD] = { 0, NULL }, 4556 [P_MAX_CMD] = { 0, NULL },
4432 }; 4557 };
4433 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4558 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
@@ -4438,13 +4563,13 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4438int drbd_asender(struct drbd_thread *thi) 4563int drbd_asender(struct drbd_thread *thi)
4439{ 4564{
4440 struct drbd_conf *mdev = thi->mdev; 4565 struct drbd_conf *mdev = thi->mdev;
4441 struct p_header *h = &mdev->meta.rbuf.header; 4566 struct p_header80 *h = &mdev->meta.rbuf.header.h80;
4442 struct asender_cmd *cmd = NULL; 4567 struct asender_cmd *cmd = NULL;
4443 4568
4444 int rv, len; 4569 int rv, len;
4445 void *buf = h; 4570 void *buf = h;
4446 int received = 0; 4571 int received = 0;
4447 int expect = sizeof(struct p_header); 4572 int expect = sizeof(struct p_header80);
4448 int empty; 4573 int empty;
4449 4574
4450 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); 4575 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
@@ -4468,10 +4593,8 @@ int drbd_asender(struct drbd_thread *thi)
4468 while (1) { 4593 while (1) {
4469 clear_bit(SIGNAL_ASENDER, &mdev->flags); 4594 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4470 flush_signals(current); 4595 flush_signals(current);
4471 if (!drbd_process_done_ee(mdev)) { 4596 if (!drbd_process_done_ee(mdev))
4472 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4473 goto reconnect; 4597 goto reconnect;
4474 }
4475 /* to avoid race with newly queued ACKs */ 4598 /* to avoid race with newly queued ACKs */
4476 set_bit(SIGNAL_ASENDER, &mdev->flags); 4599 set_bit(SIGNAL_ASENDER, &mdev->flags);
4477 spin_lock_irq(&mdev->req_lock); 4600 spin_lock_irq(&mdev->req_lock);
@@ -4530,21 +4653,23 @@ int drbd_asender(struct drbd_thread *thi)
4530 4653
4531 if (received == expect && cmd == NULL) { 4654 if (received == expect && cmd == NULL) {
4532 if (unlikely(h->magic != BE_DRBD_MAGIC)) { 4655 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4533 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", 4656 dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
4534 (long)be32_to_cpu(h->magic), 4657 be32_to_cpu(h->magic),
4535 h->command, h->length); 4658 be16_to_cpu(h->command),
4659 be16_to_cpu(h->length));
4536 goto reconnect; 4660 goto reconnect;
4537 } 4661 }
4538 cmd = get_asender_cmd(be16_to_cpu(h->command)); 4662 cmd = get_asender_cmd(be16_to_cpu(h->command));
4539 len = be16_to_cpu(h->length); 4663 len = be16_to_cpu(h->length);
4540 if (unlikely(cmd == NULL)) { 4664 if (unlikely(cmd == NULL)) {
4541 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", 4665 dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
4542 (long)be32_to_cpu(h->magic), 4666 be32_to_cpu(h->magic),
4543 h->command, h->length); 4667 be16_to_cpu(h->command),
4668 be16_to_cpu(h->length));
4544 goto disconnect; 4669 goto disconnect;
4545 } 4670 }
4546 expect = cmd->pkt_size; 4671 expect = cmd->pkt_size;
4547 ERR_IF(len != expect-sizeof(struct p_header)) 4672 ERR_IF(len != expect-sizeof(struct p_header80))
4548 goto reconnect; 4673 goto reconnect;
4549 } 4674 }
4550 if (received == expect) { 4675 if (received == expect) {
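The checks above operate on the raw 8.0-style header; a minimal decoding sketch, assuming only the big-endian field layout visible in this hunk (32-bit magic, 16-bit command, 16-bit length; unpack_header80 is an invented name):

/* Unpack a p_header80 as received from the meta socket and reject anything
 * that does not carry the DRBD magic. */
static int unpack_header80(struct p_header80 *h,
			   unsigned int *cmd, unsigned int *length)
{
	if (h->magic != BE_DRBD_MAGIC)
		return 0;			/* not a DRBD packet */
	*cmd = be16_to_cpu(h->command);
	*length = be16_to_cpu(h->length);
	return 1;
}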
@@ -4554,7 +4679,7 @@ int drbd_asender(struct drbd_thread *thi)
4554 4679
4555 buf = h; 4680 buf = h;
4556 received = 0; 4681 received = 0;
4557 expect = sizeof(struct p_header); 4682 expect = sizeof(struct p_header80);
4558 cmd = NULL; 4683 cmd = NULL;
4559 } 4684 }
4560 } 4685 }
@@ -4562,10 +4687,12 @@ int drbd_asender(struct drbd_thread *thi)
4562 if (0) { 4687 if (0) {
4563reconnect: 4688reconnect:
4564 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); 4689 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4690 drbd_md_sync(mdev);
4565 } 4691 }
4566 if (0) { 4692 if (0) {
4567disconnect: 4693disconnect:
4568 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 4694 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4695 drbd_md_sync(mdev);
4569 } 4696 }
4570 clear_bit(SIGNAL_ASENDER, &mdev->flags); 4697 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4571 4698
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f761d98a4e90..9e91a2545fc8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) 59static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
60{ 60{
61 const unsigned long s = req->rq_state; 61 const unsigned long s = req->rq_state;
62
63 /* remove it from the transfer log.
64 * well, only if it had been there in the first
65 * place... if it had not (local only or conflicting
66 * and never sent), it should still be "empty" as
67 * initialized in drbd_req_new(), so we can list_del() it
68 * here unconditionally */
69 list_del(&req->tl_requests);
70
62 /* if it was a write, we may have to set the corresponding 71 /* if it was a write, we may have to set the corresponding
63 * bit(s) out-of-sync first. If it had a local part, we need to 72 * bit(s) out-of-sync first. If it had a local part, we need to
64 * release the reference to the activity log. */ 73 * release the reference to the activity log. */
65 if (rw == WRITE) { 74 if (rw == WRITE) {
66 /* remove it from the transfer log.
67 * well, only if it had been there in the first
68 * place... if it had not (local only or conflicting
69 * and never sent), it should still be "empty" as
70 * initialized in drbd_req_new(), so we can list_del() it
71 * here unconditionally */
72 list_del(&req->tl_requests);
73 /* Set out-of-sync unless both OK flags are set 75 /* Set out-of-sync unless both OK flags are set
74 * (local only or remote failed). 76 * (local only or remote failed).
75 * Other places where we set out-of-sync: 77 * Other places where we set out-of-sync:
@@ -92,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
92 */ 94 */
93 if (s & RQ_LOCAL_MASK) { 95 if (s & RQ_LOCAL_MASK) {
94 if (get_ldev_if_state(mdev, D_FAILED)) { 96 if (get_ldev_if_state(mdev, D_FAILED)) {
95 drbd_al_complete_io(mdev, req->sector); 97 if (s & RQ_IN_ACT_LOG)
98 drbd_al_complete_io(mdev, req->sector);
96 put_ldev(mdev); 99 put_ldev(mdev);
97 } else if (__ratelimit(&drbd_ratelimit_state)) { 100 } else if (__ratelimit(&drbd_ratelimit_state)) {
98 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " 101 dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
@@ -280,6 +283,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
280 * protocol A or B, barrier ack still pending... */ 283 * protocol A or B, barrier ack still pending... */
281} 284}
282 285
286static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
287{
288 struct drbd_conf *mdev = req->mdev;
289
290 if (!is_susp(mdev->state))
291 _req_may_be_done(req, m);
292}
293
283/* 294/*
284 * checks whether there was an overlapping request 295 * checks whether there was an overlapping request
285 * or ee already registered. 296 * or ee already registered.
@@ -380,10 +391,11 @@ out_conflict:
380 * and it enforces that we have to think in a very structured manner 391 * and it enforces that we have to think in a very structured manner
381 * about the "events" that may happen to a request during its life time ... 392 * about the "events" that may happen to a request during its life time ...
382 */ 393 */
383void __req_mod(struct drbd_request *req, enum drbd_req_event what, 394int __req_mod(struct drbd_request *req, enum drbd_req_event what,
384 struct bio_and_error *m) 395 struct bio_and_error *m)
385{ 396{
386 struct drbd_conf *mdev = req->mdev; 397 struct drbd_conf *mdev = req->mdev;
398 int rv = 0;
387 m->bio = NULL; 399 m->bio = NULL;
388 400
389 switch (what) { 401 switch (what) {
@@ -420,7 +432,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
420 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); 432 req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
421 req->rq_state &= ~RQ_LOCAL_PENDING; 433 req->rq_state &= ~RQ_LOCAL_PENDING;
422 434
423 _req_may_be_done(req, m); 435 _req_may_be_done_not_susp(req, m);
424 put_ldev(mdev); 436 put_ldev(mdev);
425 break; 437 break;
426 438
@@ -429,7 +441,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
429 req->rq_state &= ~RQ_LOCAL_PENDING; 441 req->rq_state &= ~RQ_LOCAL_PENDING;
430 442
431 __drbd_chk_io_error(mdev, FALSE); 443 __drbd_chk_io_error(mdev, FALSE);
432 _req_may_be_done(req, m); 444 _req_may_be_done_not_susp(req, m);
433 put_ldev(mdev); 445 put_ldev(mdev);
434 break; 446 break;
435 447
@@ -437,7 +449,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
437 /* it is legal to fail READA */ 449 /* it is legal to fail READA */
438 req->rq_state |= RQ_LOCAL_COMPLETED; 450 req->rq_state |= RQ_LOCAL_COMPLETED;
439 req->rq_state &= ~RQ_LOCAL_PENDING; 451 req->rq_state &= ~RQ_LOCAL_PENDING;
440 _req_may_be_done(req, m); 452 _req_may_be_done_not_susp(req, m);
441 put_ldev(mdev); 453 put_ldev(mdev);
442 break; 454 break;
443 455
@@ -455,7 +467,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
455 /* no point in retrying if there is no good remote data, 467 /* no point in retrying if there is no good remote data,
456 * or we have no connection. */ 468 * or we have no connection. */
457 if (mdev->state.pdsk != D_UP_TO_DATE) { 469 if (mdev->state.pdsk != D_UP_TO_DATE) {
458 _req_may_be_done(req, m); 470 _req_may_be_done_not_susp(req, m);
459 break; 471 break;
460 } 472 }
461 473
@@ -517,11 +529,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
517 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); 529 D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
518 530
519 req->epoch = mdev->newest_tle->br_number; 531 req->epoch = mdev->newest_tle->br_number;
520 list_add_tail(&req->tl_requests,
521 &mdev->newest_tle->requests);
522 532
523 /* increment size of current epoch */ 533 /* increment size of current epoch */
524 mdev->newest_tle->n_req++; 534 mdev->newest_tle->n_writes++;
525 535
526 /* queue work item to send data */ 536 /* queue work item to send data */
527 D_ASSERT(req->rq_state & RQ_NET_PENDING); 537 D_ASSERT(req->rq_state & RQ_NET_PENDING);
@@ -530,7 +540,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
530 drbd_queue_work(&mdev->data.work, &req->w); 540 drbd_queue_work(&mdev->data.work, &req->w);
531 541
532 /* close the epoch, in case it outgrew the limit */ 542 /* close the epoch, in case it outgrew the limit */
533 if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) 543 if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
534 queue_barrier(mdev); 544 queue_barrier(mdev);
535 545
536 break; 546 break;
@@ -543,7 +553,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
543 req->rq_state &= ~RQ_NET_QUEUED; 553 req->rq_state &= ~RQ_NET_QUEUED;
544 /* if we did it right, tl_clear should be scheduled only after 554 /* if we did it right, tl_clear should be scheduled only after
545 * this, so this should not be necessary! */ 555 * this, so this should not be necessary! */
546 _req_may_be_done(req, m); 556 _req_may_be_done_not_susp(req, m);
547 break; 557 break;
548 558
549 case handed_over_to_network: 559 case handed_over_to_network:
@@ -568,7 +578,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
568 * "completed_ok" events came in, once we return from 578 * "completed_ok" events came in, once we return from
569 * _drbd_send_zc_bio (drbd_send_dblock), we have to check 579 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
570 * whether it is done already, and end it. */ 580 * whether it is done already, and end it. */
571 _req_may_be_done(req, m); 581 _req_may_be_done_not_susp(req, m);
572 break; 582 break;
573 583
574 case read_retry_remote_canceled: 584 case read_retry_remote_canceled:
@@ -584,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
584 /* if it is still queued, we may not complete it here. 594 /* if it is still queued, we may not complete it here.
585 * it will be canceled soon. */ 595 * it will be canceled soon. */
586 if (!(req->rq_state & RQ_NET_QUEUED)) 596 if (!(req->rq_state & RQ_NET_QUEUED))
587 _req_may_be_done(req, m); 597 _req_may_be_done(req, m); /* Allowed while state.susp */
588 break; 598 break;
589 599
590 case write_acked_by_peer_and_sis: 600 case write_acked_by_peer_and_sis:
@@ -619,7 +629,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
619 D_ASSERT(req->rq_state & RQ_NET_PENDING); 629 D_ASSERT(req->rq_state & RQ_NET_PENDING);
620 dec_ap_pending(mdev); 630 dec_ap_pending(mdev);
621 req->rq_state &= ~RQ_NET_PENDING; 631 req->rq_state &= ~RQ_NET_PENDING;
622 _req_may_be_done(req, m); 632 _req_may_be_done_not_susp(req, m);
623 break; 633 break;
624 634
625 case neg_acked: 635 case neg_acked:
@@ -629,11 +639,50 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
629 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 639 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
630 640
631 req->rq_state |= RQ_NET_DONE; 641 req->rq_state |= RQ_NET_DONE;
632 _req_may_be_done(req, m); 642 _req_may_be_done_not_susp(req, m);
633 /* else: done by handed_over_to_network */ 643 /* else: done by handed_over_to_network */
634 break; 644 break;
635 645
646 case fail_frozen_disk_io:
647 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
648 break;
649
650 _req_may_be_done(req, m); /* Allowed while state.susp */
651 break;
652
653 case restart_frozen_disk_io:
654 if (!(req->rq_state & RQ_LOCAL_COMPLETED))
655 break;
656
657 req->rq_state &= ~RQ_LOCAL_COMPLETED;
658
659 rv = MR_READ;
660 if (bio_data_dir(req->master_bio) == WRITE)
661 rv = MR_WRITE;
662
663 get_ldev(mdev);
664 req->w.cb = w_restart_disk_io;
665 drbd_queue_work(&mdev->data.work, &req->w);
666 break;
667
668 case resend:
669 /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
670 before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 671	   Throwing them out of the TL here by pretending we got a BARRIER_ACK.
 672	   We ensure that the peer was not rebooted */
673 if (!(req->rq_state & RQ_NET_OK)) {
674 if (req->w.cb) {
675 drbd_queue_work(&mdev->data.work, &req->w);
676 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
677 }
678 break;
679 }
680 /* else, fall through to barrier_acked */
681
636 case barrier_acked: 682 case barrier_acked:
683 if (!(req->rq_state & RQ_WRITE))
684 break;
685
637 if (req->rq_state & RQ_NET_PENDING) { 686 if (req->rq_state & RQ_NET_PENDING) {
638 /* barrier came in before all requests have been acked. 687 /* barrier came in before all requests have been acked.
639 * this is bad, because if the connection is lost now, 688 * this is bad, because if the connection is lost now,
@@ -643,7 +692,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
643 } 692 }
644 D_ASSERT(req->rq_state & RQ_NET_SENT); 693 D_ASSERT(req->rq_state & RQ_NET_SENT);
645 req->rq_state |= RQ_NET_DONE; 694 req->rq_state |= RQ_NET_DONE;
646 _req_may_be_done(req, m); 695 _req_may_be_done(req, m); /* Allowed while state.susp */
647 break; 696 break;
648 697
649 case data_received: 698 case data_received:
@@ -651,9 +700,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
651 dec_ap_pending(mdev); 700 dec_ap_pending(mdev);
652 req->rq_state &= ~RQ_NET_PENDING; 701 req->rq_state &= ~RQ_NET_PENDING;
653 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); 702 req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
654 _req_may_be_done(req, m); 703 _req_may_be_done_not_susp(req, m);
655 break; 704 break;
656 }; 705 };
706
707 return rv;
657} 708}
658 709
659/* we may do a local read if: 710/* we may do a local read if:
@@ -752,14 +803,16 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
752 * resync extent to finish, and, if necessary, pulls in the target 803 * resync extent to finish, and, if necessary, pulls in the target
753 * extent into the activity log, which involves further disk io because 804 * extent into the activity log, which involves further disk io because
754 * of transactional on-disk meta data updates. */ 805 * of transactional on-disk meta data updates. */
755 if (rw == WRITE && local) 806 if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
807 req->rq_state |= RQ_IN_ACT_LOG;
756 drbd_al_begin_io(mdev, sector); 808 drbd_al_begin_io(mdev, sector);
809 }
757 810
758 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || 811 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
759 (mdev->state.pdsk == D_INCONSISTENT && 812 (mdev->state.pdsk == D_INCONSISTENT &&
760 mdev->state.conn >= C_CONNECTED)); 813 mdev->state.conn >= C_CONNECTED));
761 814
762 if (!(local || remote) && !mdev->state.susp) { 815 if (!(local || remote) && !is_susp(mdev->state)) {
763 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 816 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
764 goto fail_free_complete; 817 goto fail_free_complete;
765 } 818 }
@@ -785,7 +838,7 @@ allocate_barrier:
785 /* GOOD, everything prepared, grab the spin_lock */ 838 /* GOOD, everything prepared, grab the spin_lock */
786 spin_lock_irq(&mdev->req_lock); 839 spin_lock_irq(&mdev->req_lock);
787 840
788 if (mdev->state.susp) { 841 if (is_susp(mdev->state)) {
789 /* If we got suspended, use the retry mechanism of 842 /* If we got suspended, use the retry mechanism of
790 generic_make_request() to restart processing of this 843 generic_make_request() to restart processing of this
791 bio. In the next call to drbd_make_request_26 844 bio. In the next call to drbd_make_request_26
@@ -867,30 +920,10 @@ allocate_barrier:
867 /* check this request on the collision detection hash tables. 920 /* check this request on the collision detection hash tables.
868 * if we have a conflict, just complete it here. 921 * if we have a conflict, just complete it here.
869 * THINK do we want to check reads, too? (I don't think so...) */ 922 * THINK do we want to check reads, too? (I don't think so...) */
870 if (rw == WRITE && _req_conflicts(req)) { 923 if (rw == WRITE && _req_conflicts(req))
871 /* this is a conflicting request. 924 goto fail_conflicting;
872 * even though it may have been only _partially_ 925
873 * overlapping with one of the currently pending requests, 926 list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
874 * without even submitting or sending it, we will
875 * pretend that it was successfully served right now.
876 */
877 if (local) {
878 bio_put(req->private_bio);
879 req->private_bio = NULL;
880 drbd_al_complete_io(mdev, req->sector);
881 put_ldev(mdev);
882 local = 0;
883 }
884 if (remote)
885 dec_ap_pending(mdev);
886 _drbd_end_io_acct(mdev, req);
887 /* THINK: do we want to fail it (-EIO), or pretend success? */
888 bio_endio(req->master_bio, 0);
889 req->master_bio = NULL;
890 dec_ap_bio(mdev);
891 drbd_req_free(req);
892 remote = 0;
893 }
894 927
895 /* NOTE remote first: to get the concurrent write detection right, 928 /* NOTE remote first: to get the concurrent write detection right,
896 * we must register the request before start of local IO. */ 929 * we must register the request before start of local IO. */
@@ -923,6 +956,21 @@ allocate_barrier:
923 956
924 return 0; 957 return 0;
925 958
959fail_conflicting:
960 /* this is a conflicting request.
961 * even though it may have been only _partially_
962 * overlapping with one of the currently pending requests,
963 * without even submitting or sending it, we will
964 * pretend that it was successfully served right now.
965 */
966 _drbd_end_io_acct(mdev, req);
967 spin_unlock_irq(&mdev->req_lock);
968 if (remote)
969 dec_ap_pending(mdev);
970 /* THINK: do we want to fail it (-EIO), or pretend success?
971 * this pretends success. */
972 err = 0;
973
926fail_free_complete: 974fail_free_complete:
927 if (rw == WRITE && local) 975 if (rw == WRITE && local)
928 drbd_al_complete_io(mdev, sector); 976 drbd_al_complete_io(mdev, sector);
@@ -961,21 +1009,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
961 return 1; 1009 return 1;
962 } 1010 }
963 1011
964 /*
965 * Paranoia: we might have been primary, but sync target, or
966 * even diskless, then lost the connection.
967 * This should have been handled (panic? suspend?) somewhere
968 * else. But maybe it was not, so check again here.
969 * Caution: as long as we do not have a read/write lock on mdev,
970 * to serialize state changes, this is racy, since we may lose
971 * the connection *after* we test for the cstate.
972 */
973 if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
974 if (__ratelimit(&drbd_ratelimit_state))
975 dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
976 return 1;
977 }
978
979 return 0; 1012 return 0;
980} 1013}
981 1014
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 02d575d24518..181ea0364822 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -104,6 +104,9 @@ enum drbd_req_event {
104 read_ahead_completed_with_error, 104 read_ahead_completed_with_error,
105 write_completed_with_error, 105 write_completed_with_error,
106 completed_ok, 106 completed_ok,
107 resend,
108 fail_frozen_disk_io,
109 restart_frozen_disk_io,
107 nothing, /* for tracing only */ 110 nothing, /* for tracing only */
108}; 111};
109 112
@@ -183,6 +186,12 @@ enum drbd_req_state_bits {
183 186
184 /* keep this last, its for the RQ_NET_MASK */ 187 /* keep this last, its for the RQ_NET_MASK */
185 __RQ_NET_MAX, 188 __RQ_NET_MAX,
189
190 /* Set when this is a write, clear for a read */
191 __RQ_WRITE,
192
193 /* Should call drbd_al_complete_io() for this request... */
194 __RQ_IN_ACT_LOG,
186}; 195};
187 196
188#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) 197#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@@ -201,6 +210,16 @@ enum drbd_req_state_bits {
201/* 0x1f8 */ 210/* 0x1f8 */
202#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) 211#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
203 212
213#define RQ_WRITE (1UL << __RQ_WRITE)
214#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
215
 216/* For waking up the frozen transfer log, mod_req() has to return whether the request
 217   should be counted in the epoch object. */
218#define MR_WRITE_SHIFT 0
219#define MR_WRITE (1 << MR_WRITE_SHIFT)
220#define MR_READ_SHIFT 1
221#define MR_READ (1 << MR_READ_SHIFT)
222
204/* epoch entries */ 223/* epoch entries */
205static inline 224static inline
206struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) 225struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
@@ -244,30 +263,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
244 return NULL; 263 return NULL;
245} 264}
246 265
266static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
267{
268 struct bio *bio;
269 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
270
271 req->private_bio = bio;
272
273 bio->bi_private = req;
274 bio->bi_end_io = drbd_endio_pri;
275 bio->bi_next = NULL;
276}
277
247static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, 278static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
248 struct bio *bio_src) 279 struct bio *bio_src)
249{ 280{
250 struct bio *bio;
251 struct drbd_request *req = 281 struct drbd_request *req =
252 mempool_alloc(drbd_request_mempool, GFP_NOIO); 282 mempool_alloc(drbd_request_mempool, GFP_NOIO);
253 if (likely(req)) { 283 if (likely(req)) {
254 bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ 284 drbd_req_make_private_bio(req, bio_src);
255 285
256 req->rq_state = 0; 286 req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
257 req->mdev = mdev; 287 req->mdev = mdev;
258 req->master_bio = bio_src; 288 req->master_bio = bio_src;
259 req->private_bio = bio;
260 req->epoch = 0; 289 req->epoch = 0;
261 req->sector = bio->bi_sector; 290 req->sector = bio_src->bi_sector;
262 req->size = bio->bi_size; 291 req->size = bio_src->bi_size;
263 req->start_time = jiffies; 292 req->start_time = jiffies;
264 INIT_HLIST_NODE(&req->colision); 293 INIT_HLIST_NODE(&req->colision);
265 INIT_LIST_HEAD(&req->tl_requests); 294 INIT_LIST_HEAD(&req->tl_requests);
266 INIT_LIST_HEAD(&req->w.list); 295 INIT_LIST_HEAD(&req->w.list);
267
268 bio->bi_private = req;
269 bio->bi_end_io = drbd_endio_pri;
270 bio->bi_next = NULL;
271 } 296 }
272 return req; 297 return req;
273} 298}
@@ -292,36 +317,43 @@ struct bio_and_error {
292 317
293extern void _req_may_be_done(struct drbd_request *req, 318extern void _req_may_be_done(struct drbd_request *req,
294 struct bio_and_error *m); 319 struct bio_and_error *m);
295extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, 320extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
296 struct bio_and_error *m); 321 struct bio_and_error *m);
297extern void complete_master_bio(struct drbd_conf *mdev, 322extern void complete_master_bio(struct drbd_conf *mdev,
298 struct bio_and_error *m); 323 struct bio_and_error *m);
299 324
300/* use this if you don't want to deal with calling complete_master_bio() 325/* use this if you don't want to deal with calling complete_master_bio()
301 * outside the spinlock, e.g. when walking some list on cleanup. */ 326 * outside the spinlock, e.g. when walking some list on cleanup. */
302static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) 327static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
303{ 328{
304 struct drbd_conf *mdev = req->mdev; 329 struct drbd_conf *mdev = req->mdev;
305 struct bio_and_error m; 330 struct bio_and_error m;
331 int rv;
306 332
307 /* __req_mod possibly frees req, do not touch req after that! */ 333 /* __req_mod possibly frees req, do not touch req after that! */
308 __req_mod(req, what, &m); 334 rv = __req_mod(req, what, &m);
309 if (m.bio) 335 if (m.bio)
310 complete_master_bio(mdev, &m); 336 complete_master_bio(mdev, &m);
337
338 return rv;
311} 339}
312 340
313/* completion of master bio is outside of spinlock. 341/* completion of master bio is outside of spinlock.
314 * If you need it irqsave, do it your self! */ 342 * If you need it irqsave, do it your self! */
315static inline void req_mod(struct drbd_request *req, 343static inline int req_mod(struct drbd_request *req,
316 enum drbd_req_event what) 344 enum drbd_req_event what)
317{ 345{
318 struct drbd_conf *mdev = req->mdev; 346 struct drbd_conf *mdev = req->mdev;
319 struct bio_and_error m; 347 struct bio_and_error m;
348 int rv;
349
320 spin_lock_irq(&mdev->req_lock); 350 spin_lock_irq(&mdev->req_lock);
321 __req_mod(req, what, &m); 351 rv = __req_mod(req, what, &m);
322 spin_unlock_irq(&mdev->req_lock); 352 spin_unlock_irq(&mdev->req_lock);
323 353
324 if (m.bio) 354 if (m.bio)
325 complete_master_bio(mdev, &m); 355 complete_master_bio(mdev, &m);
356
357 return rv;
326} 358}
327#endif 359#endif
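
The MR_WRITE/MR_READ flags introduced above are what the now int-returning __req_mod()/_req_mod()/req_mod() hand back, so a caller walking the transfer log can tell which direction a requeued request had and account for it in the current epoch. A standalone userspace sketch, not kernel code — the returned[] array and the tallying loop are invented purely to illustrate how the flag bits are meant to be accumulated:

    #include <stdio.h>

    #define MR_WRITE_SHIFT 0
    #define MR_WRITE       (1 << MR_WRITE_SHIFT)
    #define MR_READ_SHIFT  1
    #define MR_READ        (1 << MR_READ_SHIFT)

    /* pretend each element is a value returned by _req_mod(req, resend) */
    static const int returned[] = { MR_WRITE, MR_READ, MR_WRITE, 0, MR_WRITE };

    int main(void)
    {
            int i, writes = 0, reads = 0;

            for (i = 0; i < (int)(sizeof(returned) / sizeof(returned[0])); i++) {
                    if (returned[i] & MR_WRITE)
                            writes++;
                    if (returned[i] & MR_READ)
                            reads++;
            }
            /* the write count is what would be added back into the current epoch */
            printf("requeued: %d writes, %d reads\n", writes, reads);
            return 0;
    }
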
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16cea2d8..108d58015cd1 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,8 +39,6 @@
39#include "drbd_int.h" 39#include "drbd_int.h"
40#include "drbd_req.h" 40#include "drbd_req.h"
41 41
42#define SLEEP_TIME (HZ/10)
43
44static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); 42static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
45 43
46 44
@@ -217,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
217 */ 215 */
218void drbd_endio_pri(struct bio *bio, int error) 216void drbd_endio_pri(struct bio *bio, int error)
219{ 217{
220 unsigned long flags;
221 struct drbd_request *req = bio->bi_private; 218 struct drbd_request *req = bio->bi_private;
222 struct drbd_conf *mdev = req->mdev; 219 struct drbd_conf *mdev = req->mdev;
223 struct bio_and_error m;
224 enum drbd_req_event what; 220 enum drbd_req_event what;
225 int uptodate = bio_flagged(bio, BIO_UPTODATE); 221 int uptodate = bio_flagged(bio, BIO_UPTODATE);
226 222
@@ -246,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
246 bio_put(req->private_bio); 242 bio_put(req->private_bio);
247 req->private_bio = ERR_PTR(error); 243 req->private_bio = ERR_PTR(error);
248 244
249 spin_lock_irqsave(&mdev->req_lock, flags); 245 req_mod(req, what);
250 __req_mod(req, what, &m);
251 spin_unlock_irqrestore(&mdev->req_lock, flags);
252
253 if (m.bio)
254 complete_master_bio(mdev, &m);
255} 246}
256 247
257int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 248int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,54 +367,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
376 struct drbd_epoch_entry *e; 367 struct drbd_epoch_entry *e;
377 368
378 if (!get_ldev(mdev)) 369 if (!get_ldev(mdev))
379 return 0; 370 return -EIO;
371
372 if (drbd_rs_should_slow_down(mdev))
373 goto defer;
380 374
381 /* GFP_TRY, because if there is no memory available right now, this may 375 /* GFP_TRY, because if there is no memory available right now, this may
382 * be rescheduled for later. It is "only" background resync, after all. */ 376 * be rescheduled for later. It is "only" background resync, after all. */
383 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); 377 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
384 if (!e) 378 if (!e)
385 goto fail; 379 goto defer;
386 380
381 e->w.cb = w_e_send_csum;
387 spin_lock_irq(&mdev->req_lock); 382 spin_lock_irq(&mdev->req_lock);
388 list_add(&e->w.list, &mdev->read_ee); 383 list_add(&e->w.list, &mdev->read_ee);
389 spin_unlock_irq(&mdev->req_lock); 384 spin_unlock_irq(&mdev->req_lock);
390 385
391 e->w.cb = w_e_send_csum; 386 atomic_add(size >> 9, &mdev->rs_sect_ev);
392 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) 387 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
393 return 1; 388 return 0;
389
390 /* drbd_submit_ee currently fails for one reason only:
391 * not being able to allocate enough bios.
392 * Is dropping the connection going to help? */
393 spin_lock_irq(&mdev->req_lock);
394 list_del(&e->w.list);
395 spin_unlock_irq(&mdev->req_lock);
394 396
395 drbd_free_ee(mdev, e); 397 drbd_free_ee(mdev, e);
396fail: 398defer:
397 put_ldev(mdev); 399 put_ldev(mdev);
398 return 2; 400 return -EAGAIN;
399} 401}
400 402
401void resync_timer_fn(unsigned long data) 403void resync_timer_fn(unsigned long data)
402{ 404{
403 unsigned long flags;
404 struct drbd_conf *mdev = (struct drbd_conf *) data; 405 struct drbd_conf *mdev = (struct drbd_conf *) data;
405 int queue; 406 int queue;
406 407
407 spin_lock_irqsave(&mdev->req_lock, flags); 408 queue = 1;
408 409 switch (mdev->state.conn) {
409 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { 410 case C_VERIFY_S:
410 queue = 1; 411 mdev->resync_work.cb = w_make_ov_request;
411 if (mdev->state.conn == C_VERIFY_S) 412 break;
412 mdev->resync_work.cb = w_make_ov_request; 413 case C_SYNC_TARGET:
413 else 414 mdev->resync_work.cb = w_make_resync_request;
414 mdev->resync_work.cb = w_make_resync_request; 415 break;
415 } else { 416 default:
416 queue = 0; 417 queue = 0;
417 mdev->resync_work.cb = w_resync_inactive; 418 mdev->resync_work.cb = w_resync_inactive;
418 } 419 }
419 420
420 spin_unlock_irqrestore(&mdev->req_lock, flags);
421
422 /* harmless race: list_empty outside data.work.q_lock */ 421 /* harmless race: list_empty outside data.work.q_lock */
423 if (list_empty(&mdev->resync_work.list) && queue) 422 if (list_empty(&mdev->resync_work.list) && queue)
424 drbd_queue_work(&mdev->data.work, &mdev->resync_work); 423 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
425} 424}
426 425
426static void fifo_set(struct fifo_buffer *fb, int value)
427{
428 int i;
429
430 for (i = 0; i < fb->size; i++)
431 fb->values[i] = value;
432}
433
434static int fifo_push(struct fifo_buffer *fb, int value)
435{
436 int ov;
437
438 ov = fb->values[fb->head_index];
439 fb->values[fb->head_index++] = value;
440
441 if (fb->head_index >= fb->size)
442 fb->head_index = 0;
443
444 return ov;
445}
446
447static void fifo_add_val(struct fifo_buffer *fb, int value)
448{
449 int i;
450
451 for (i = 0; i < fb->size; i++)
452 fb->values[i] += value;
453}
454
455int drbd_rs_controller(struct drbd_conf *mdev)
456{
457 unsigned int sect_in; /* Number of sectors that came in since the last turn */
458 unsigned int want; /* The number of sectors we want in the proxy */
459 int req_sect; /* Number of sectors to request in this turn */
460 int correction; /* Number of sectors more we need in the proxy*/
461 int cps; /* correction per invocation of drbd_rs_controller() */
462 int steps; /* Number of time steps to plan ahead */
463 int curr_corr;
464 int max_sect;
465
466 sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
467 mdev->rs_in_flight -= sect_in;
468
469 spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
470
471 steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
472
473 if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
474 want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
475 } else { /* normal path */
476 want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
477 sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
478 }
479
480 correction = want - mdev->rs_in_flight - mdev->rs_planed;
481
482 /* Plan ahead */
483 cps = correction / steps;
484 fifo_add_val(&mdev->rs_plan_s, cps);
485 mdev->rs_planed += cps * steps;
486
487 /* What we do in this step */
488 curr_corr = fifo_push(&mdev->rs_plan_s, 0);
489 spin_unlock(&mdev->peer_seq_lock);
490 mdev->rs_planed -= curr_corr;
491
492 req_sect = sect_in + curr_corr;
493 if (req_sect < 0)
494 req_sect = 0;
495
496 max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
497 if (req_sect > max_sect)
498 req_sect = max_sect;
499
500 /*
501 dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
502 sect_in, mdev->rs_in_flight, want, correction,
503 steps, cps, mdev->rs_planed, curr_corr, req_sect);
504 */
505
506 return req_sect;
507}
508
427int w_make_resync_request(struct drbd_conf *mdev, 509int w_make_resync_request(struct drbd_conf *mdev,
428 struct drbd_work *w, int cancel) 510 struct drbd_work *w, int cancel)
429{ 511{
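
fifo_add_val() spreads a correction evenly over the whole plan buffer, while fifo_push() consumes the slot for the current tick and frees it up for future planning. A minimal standalone sketch of that ring-buffer behaviour (userspace C with a fixed-size demo buffer and an invented tick loop, not the kernel implementation):

    #include <stdio.h>

    #define STEPS 5                         /* demo plan-ahead depth */

    struct fifo_buffer {
            int head_index;
            int size;
            int values[STEPS];
    };

    static void fifo_set(struct fifo_buffer *fb, int value)
    {
            int i;
            for (i = 0; i < fb->size; i++)
                    fb->values[i] = value;
    }

    static int fifo_push(struct fifo_buffer *fb, int value)
    {
            int ov = fb->values[fb->head_index];
            fb->values[fb->head_index++] = value;
            if (fb->head_index >= fb->size)
                    fb->head_index = 0;
            return ov;
    }

    static void fifo_add_val(struct fifo_buffer *fb, int value)
    {
            int i;
            for (i = 0; i < fb->size; i++)
                    fb->values[i] += value;
    }

    int main(void)
    {
            struct fifo_buffer plan = { .head_index = 0, .size = STEPS };
            int tick, correction = 100, cps = correction / STEPS;

            fifo_set(&plan, 0);
            fifo_add_val(&plan, cps);       /* plan 100 extra sectors over 5 ticks */

            for (tick = 0; tick < STEPS; tick++) {
                    /* what this tick actually requests on top of the base rate */
                    int curr_corr = fifo_push(&plan, 0);
                    printf("tick %d: curr_corr=%d\n", tick, curr_corr);
            }
            return 0;
    }
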
@@ -431,8 +513,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
431 sector_t sector; 513 sector_t sector;
432 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 514 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
433 int max_segment_size; 515 int max_segment_size;
434 int number, i, size, pe, mx; 516 int number, rollback_i, size, pe, mx;
435 int align, queued, sndbuf; 517 int align, queued, sndbuf;
518 int i = 0;
436 519
437 if (unlikely(cancel)) 520 if (unlikely(cancel))
438 return 1; 521 return 1;
@@ -446,6 +529,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
446 dev_err(DEV, "%s in w_make_resync_request\n", 529 dev_err(DEV, "%s in w_make_resync_request\n",
447 drbd_conn_str(mdev->state.conn)); 530 drbd_conn_str(mdev->state.conn));
448 531
532 if (mdev->rs_total == 0) {
533 /* empty resync? */
534 drbd_resync_finished(mdev);
535 return 1;
536 }
537
449 if (!get_ldev(mdev)) { 538 if (!get_ldev(mdev)) {
450 /* Since we only need to access mdev->rsync a 539 /* Since we only need to access mdev->rsync a
451 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but 540 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -458,11 +547,25 @@ int w_make_resync_request(struct drbd_conf *mdev,
458 547
459 /* starting with drbd 8.3.8, we can handle multi-bio EEs, 548 /* starting with drbd 8.3.8, we can handle multi-bio EEs,
460 * if it should be necessary */ 549 * if it should be necessary */
461 max_segment_size = mdev->agreed_pro_version < 94 ? 550 max_segment_size =
462 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; 551 mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
552 mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
463 553
464 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ); 554 if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
465 pe = atomic_read(&mdev->rs_pending_cnt); 555 number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
556 mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
557 } else {
558 mdev->c_sync_rate = mdev->sync_conf.rate;
559 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
560 }
561
562 /* Throttle resync on lower level disk activity, which may also be
563 * caused by application IO on Primary/SyncTarget.
564 * Keep this after the call to drbd_rs_controller, as that assumes
565 * to be called as precisely as possible every SLEEP_TIME,
566 * and would be confused otherwise. */
567 if (drbd_rs_should_slow_down(mdev))
568 goto requeue;
466 569
467 mutex_lock(&mdev->data.mutex); 570 mutex_lock(&mdev->data.mutex);
468 if (mdev->data.socket) 571 if (mdev->data.socket)
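
As a rough plausibility check of the fixed-rate fallback branch, assuming the HZ/10 SLEEP_TIME removed from this file above and the 4 KiB bitmap block size: with a configured rate of 250 KiB/s, number = (HZ/10) * 250 / (4 * HZ) = 6 blocks per 100 ms tick, i.e. roughly 240 KiB/s of resync requests. A tiny sketch of the arithmetic (the rate value is only an example):

    #include <stdio.h>

    #define HZ 1000
    #define SLEEP_TIME (HZ / 10)            /* 100 ms resync tick */
    #define BM_BLOCK_SIZE 4096              /* one bitmap bit covers 4 KiB */

    int main(void)
    {
            int rate = 250;                 /* example c_sync_rate in KiB/s */
            int number = SLEEP_TIME * rate / ((BM_BLOCK_SIZE / 1024) * HZ);

            printf("%d blocks (%d KiB) requested per tick\n",
                   number, number * BM_BLOCK_SIZE / 1024);
            return 0;
    }
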
@@ -476,6 +579,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
476 mx = number; 579 mx = number;
477 580
478 /* Limit the number of pending RS requests to no more than the peer's receive buffer */ 581 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
582 pe = atomic_read(&mdev->rs_pending_cnt);
479 if ((pe + number) > mx) { 583 if ((pe + number) > mx) {
480 number = mx - pe; 584 number = mx - pe;
481 } 585 }
@@ -526,6 +630,7 @@ next_sector:
526 * be prepared for all stripe sizes of software RAIDs. 630 * be prepared for all stripe sizes of software RAIDs.
527 */ 631 */
528 align = 1; 632 align = 1;
633 rollback_i = i;
529 for (;;) { 634 for (;;) {
530 if (size + BM_BLOCK_SIZE > max_segment_size) 635 if (size + BM_BLOCK_SIZE > max_segment_size)
531 break; 636 break;
@@ -561,14 +666,19 @@ next_sector:
561 size = (capacity-sector)<<9; 666 size = (capacity-sector)<<9;
562 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { 667 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
563 switch (read_for_csum(mdev, sector, size)) { 668 switch (read_for_csum(mdev, sector, size)) {
564 case 0: /* Disk failure*/ 669 case -EIO: /* Disk failure */
565 put_ldev(mdev); 670 put_ldev(mdev);
566 return 0; 671 return 0;
567 case 2: /* Allocation failed */ 672 case -EAGAIN: /* allocation failed, or ldev busy */
568 drbd_rs_complete_io(mdev, sector); 673 drbd_rs_complete_io(mdev, sector);
569 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); 674 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
675 i = rollback_i;
570 goto requeue; 676 goto requeue;
571 /* case 1: everything ok */ 677 case 0:
678 /* everything ok */
679 break;
680 default:
681 BUG();
572 } 682 }
573 } else { 683 } else {
574 inc_rs_pending(mdev); 684 inc_rs_pending(mdev);
@@ -595,6 +705,7 @@ next_sector:
595 } 705 }
596 706
597 requeue: 707 requeue:
708 mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
598 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 709 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
599 put_ldev(mdev); 710 put_ldev(mdev);
600 return 1; 711 return 1;
@@ -670,6 +781,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
670 return 1; 781 return 1;
671} 782}
672 783
784static void ping_peer(struct drbd_conf *mdev)
785{
786 clear_bit(GOT_PING_ACK, &mdev->flags);
787 request_ping(mdev);
788 wait_event(mdev->misc_wait,
789 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
790}
791
673int drbd_resync_finished(struct drbd_conf *mdev) 792int drbd_resync_finished(struct drbd_conf *mdev)
674{ 793{
675 unsigned long db, dt, dbdt; 794 unsigned long db, dt, dbdt;
@@ -709,6 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
709 if (!get_ldev(mdev)) 828 if (!get_ldev(mdev))
710 goto out; 829 goto out;
711 830
831 ping_peer(mdev);
832
712 spin_lock_irq(&mdev->req_lock); 833 spin_lock_irq(&mdev->req_lock);
713 os = mdev->state; 834 os = mdev->state;
714 835
@@ -801,6 +922,8 @@ out:
801 mdev->rs_paused = 0; 922 mdev->rs_paused = 0;
802 mdev->ov_start_sector = 0; 923 mdev->ov_start_sector = 0;
803 924
925 drbd_md_sync(mdev);
926
804 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { 927 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
805 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); 928 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
806 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); 929 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
@@ -817,9 +940,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
817{ 940{
818 if (drbd_ee_has_active_page(e)) { 941 if (drbd_ee_has_active_page(e)) {
819 /* This might happen if sendpage() has not finished */ 942 /* This might happen if sendpage() has not finished */
943 int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
944 atomic_add(i, &mdev->pp_in_use_by_net);
945 atomic_sub(i, &mdev->pp_in_use);
820 spin_lock_irq(&mdev->req_lock); 946 spin_lock_irq(&mdev->req_lock);
821 list_add_tail(&e->w.list, &mdev->net_ee); 947 list_add_tail(&e->w.list, &mdev->net_ee);
822 spin_unlock_irq(&mdev->req_lock); 948 spin_unlock_irq(&mdev->req_lock);
949 wake_up(&drbd_pp_wait);
823 } else 950 } else
824 drbd_free_ee(mdev, e); 951 drbd_free_ee(mdev, e);
825} 952}
@@ -926,9 +1053,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
926 return 1; 1053 return 1;
927 } 1054 }
928 1055
929 drbd_rs_complete_io(mdev, e->sector); 1056 if (get_ldev(mdev)) {
1057 drbd_rs_complete_io(mdev, e->sector);
1058 put_ldev(mdev);
1059 }
930 1060
931 di = (struct digest_info *)(unsigned long)e->block_id; 1061 di = e->digest;
932 1062
933 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1063 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
934 /* quick hack to try to avoid a race against reconfiguration. 1064 /* quick hack to try to avoid a race against reconfiguration.
@@ -952,7 +1082,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
952 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); 1082 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
953 } else { 1083 } else {
954 inc_rs_pending(mdev); 1084 inc_rs_pending(mdev);
955 e->block_id = ID_SYNCER; 1085 e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1086 e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
1087 kfree(di);
956 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 1088 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
957 } 1089 }
958 } else { 1090 } else {
@@ -962,9 +1094,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
962 } 1094 }
963 1095
964 dec_unacked(mdev); 1096 dec_unacked(mdev);
965
966 kfree(di);
967
968 move_to_net_ee_or_free(mdev, e); 1097 move_to_net_ee_or_free(mdev, e);
969 1098
970 if (unlikely(!ok)) 1099 if (unlikely(!ok))
@@ -1034,9 +1163,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1034 1163
1035 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all 1164 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1036 * the resync lru has been cleaned up already */ 1165 * the resync lru has been cleaned up already */
1037 drbd_rs_complete_io(mdev, e->sector); 1166 if (get_ldev(mdev)) {
1167 drbd_rs_complete_io(mdev, e->sector);
1168 put_ldev(mdev);
1169 }
1038 1170
1039 di = (struct digest_info *)(unsigned long)e->block_id; 1171 di = e->digest;
1040 1172
1041 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1173 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1042 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1174 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
@@ -1055,9 +1187,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1055 } 1187 }
1056 1188
1057 dec_unacked(mdev); 1189 dec_unacked(mdev);
1058
1059 kfree(di);
1060
1061 if (!eq) 1190 if (!eq)
1062 drbd_ov_oos_found(mdev, e->sector, e->size); 1191 drbd_ov_oos_found(mdev, e->sector, e->size);
1063 else 1192 else
@@ -1108,7 +1237,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1108 * dec_ap_pending will be done in got_BarrierAck 1237 * dec_ap_pending will be done in got_BarrierAck
1109 * or (on connection loss) in w_clear_epoch. */ 1238 * or (on connection loss) in w_clear_epoch. */
1110 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, 1239 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1111 (struct p_header *)p, sizeof(*p), 0); 1240 (struct p_header80 *)p, sizeof(*p), 0);
1112 drbd_put_data_sock(mdev); 1241 drbd_put_data_sock(mdev);
1113 1242
1114 return ok; 1243 return ok;
@@ -1173,6 +1302,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1173 return ok; 1302 return ok;
1174} 1303}
1175 1304
1305int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1306{
1307 struct drbd_request *req = container_of(w, struct drbd_request, w);
1308
1309 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1310 drbd_al_begin_io(mdev, req->sector);
 1311	/* Calling drbd_al_begin_io() out of the worker might deadlock
 1312	   theoretically. Practically it cannot deadlock, since this is
1313 only used when unfreezing IOs. All the extents of the requests
1314 that made it into the TL are already active */
1315
1316 drbd_req_make_private_bio(req, req->master_bio);
1317 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
1318 generic_make_request(req->private_bio);
1319
1320 return 1;
1321}
1322
1176static int _drbd_may_sync_now(struct drbd_conf *mdev) 1323static int _drbd_may_sync_now(struct drbd_conf *mdev)
1177{ 1324{
1178 struct drbd_conf *odev = mdev; 1325 struct drbd_conf *odev = mdev;
@@ -1298,14 +1445,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
1298 return retcode; 1445 return retcode;
1299} 1446}
1300 1447
1301static void ping_peer(struct drbd_conf *mdev)
1302{
1303 clear_bit(GOT_PING_ACK, &mdev->flags);
1304 request_ping(mdev);
1305 wait_event(mdev->misc_wait,
1306 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
1307}
1308
1309/** 1448/**
1310 * drbd_start_resync() - Start the resync process 1449 * drbd_start_resync() - Start the resync process
1311 * @mdev: DRBD device. 1450 * @mdev: DRBD device.
@@ -1379,13 +1518,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1379 r = SS_UNKNOWN_ERROR; 1518 r = SS_UNKNOWN_ERROR;
1380 1519
1381 if (r == SS_SUCCESS) { 1520 if (r == SS_SUCCESS) {
1382 mdev->rs_total = 1521 unsigned long tw = drbd_bm_total_weight(mdev);
1383 mdev->rs_mark_left = drbd_bm_total_weight(mdev); 1522 unsigned long now = jiffies;
1523 int i;
1524
1384 mdev->rs_failed = 0; 1525 mdev->rs_failed = 0;
1385 mdev->rs_paused = 0; 1526 mdev->rs_paused = 0;
1386 mdev->rs_start =
1387 mdev->rs_mark_time = jiffies;
1388 mdev->rs_same_csum = 0; 1527 mdev->rs_same_csum = 0;
1528 mdev->rs_last_events = 0;
1529 mdev->rs_last_sect_ev = 0;
1530 mdev->rs_total = tw;
1531 mdev->rs_start = now;
1532 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1533 mdev->rs_mark_left[i] = tw;
1534 mdev->rs_mark_time[i] = now;
1535 }
1389 _drbd_pause_after(mdev); 1536 _drbd_pause_after(mdev);
1390 } 1537 }
1391 write_unlock_irq(&global_state_lock); 1538 write_unlock_irq(&global_state_lock);
@@ -1397,12 +1544,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1397 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), 1544 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1398 (unsigned long) mdev->rs_total); 1545 (unsigned long) mdev->rs_total);
1399 1546
1400 if (mdev->rs_total == 0) { 1547 if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
1401 /* Peer still reachable? Beware of failing before-resync-target handlers! */ 1548 /* This still has a race (about when exactly the peers
1402 ping_peer(mdev); 1549 * detect connection loss) that can lead to a full sync
1550 * on next handshake. In 8.3.9 we fixed this with explicit
1551 * resync-finished notifications, but the fix
1552 * introduces a protocol change. Sleeping for some
1553 * time longer than the ping interval + timeout on the
1554 * SyncSource, to give the SyncTarget the chance to
1555 * detect connection loss, then waiting for a ping
1556 * response (implicit in drbd_resync_finished) reduces
1557 * the race considerably, but does not solve it. */
1558 if (side == C_SYNC_SOURCE)
1559 schedule_timeout_interruptible(
1560 mdev->net_conf->ping_int * HZ +
1561 mdev->net_conf->ping_timeo*HZ/9);
1403 drbd_resync_finished(mdev); 1562 drbd_resync_finished(mdev);
1404 } 1563 }
1405 1564
1565 atomic_set(&mdev->rs_sect_in, 0);
1566 atomic_set(&mdev->rs_sect_ev, 0);
1567 mdev->rs_in_flight = 0;
1568 mdev->rs_planed = 0;
1569 spin_lock(&mdev->peer_seq_lock);
1570 fifo_set(&mdev->rs_plan_s, 0);
1571 spin_unlock(&mdev->peer_seq_lock);
1406 /* ns.conn may already be != mdev->state.conn, 1572 /* ns.conn may already be != mdev->state.conn,
1407 * we may have been paused in between, or become paused until 1573 * we may have been paused in between, or become paused until
1408 * the timer triggers. 1574 * the timer triggers.
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cf04c1b234ed..767107cce982 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -178,7 +178,6 @@ static int print_unex = 1;
178#include <linux/slab.h> 178#include <linux/slab.h>
179#include <linux/mm.h> 179#include <linux/mm.h>
180#include <linux/bio.h> 180#include <linux/bio.h>
181#include <linux/smp_lock.h>
182#include <linux/string.h> 181#include <linux/string.h>
183#include <linux/jiffies.h> 182#include <linux/jiffies.h>
184#include <linux/fcntl.h> 183#include <linux/fcntl.h>
@@ -199,6 +198,7 @@ static int print_unex = 1;
199 * It's been recommended that take about 1/4 of the default speed 198 * It's been recommended that take about 1/4 of the default speed
200 * in some more extreme cases. 199 * in some more extreme cases.
201 */ 200 */
201static DEFINE_MUTEX(floppy_mutex);
202static int slow_floppy; 202static int slow_floppy;
203 203
204#include <asm/dma.h> 204#include <asm/dma.h>
@@ -258,8 +258,8 @@ static int irqdma_allocated;
258#include <linux/completion.h> 258#include <linux/completion.h>
259 259
260static struct request *current_req; 260static struct request *current_req;
261static struct request_queue *floppy_queue;
262static void do_fd_request(struct request_queue *q); 261static void do_fd_request(struct request_queue *q);
262static int set_next_request(void);
263 263
264#ifndef fd_get_dma_residue 264#ifndef fd_get_dma_residue
265#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) 265#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
413static struct block_device *opened_bdev[N_DRIVE]; 413static struct block_device *opened_bdev[N_DRIVE];
414static DEFINE_MUTEX(open_lock); 414static DEFINE_MUTEX(open_lock);
415static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; 415static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
416static int fdc_queue;
416 417
417/* 418/*
418 * This struct defines the different floppy types. 419 * This struct defines the different floppy types.
@@ -890,8 +891,8 @@ static void unlock_fdc(void)
890 del_timer(&fd_timeout); 891 del_timer(&fd_timeout);
891 cont = NULL; 892 cont = NULL;
892 clear_bit(0, &fdc_busy); 893 clear_bit(0, &fdc_busy);
893 if (current_req || blk_peek_request(floppy_queue)) 894 if (current_req || set_next_request())
894 do_fd_request(floppy_queue); 895 do_fd_request(current_req->q);
895 spin_unlock_irqrestore(&floppy_lock, flags); 896 spin_unlock_irqrestore(&floppy_lock, flags);
896 wake_up(&fdc_wait); 897 wake_up(&fdc_wait);
897} 898}
@@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error)
2243 * logical buffer */ 2244 * logical buffer */
2244static void request_done(int uptodate) 2245static void request_done(int uptodate)
2245{ 2246{
2246 struct request_queue *q = floppy_queue;
2247 struct request *req = current_req; 2247 struct request *req = current_req;
2248 struct request_queue *q;
2248 unsigned long flags; 2249 unsigned long flags;
2249 int block; 2250 int block;
2250 char msg[sizeof("request done ") + sizeof(int) * 3]; 2251 char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2258,6 +2259,8 @@ static void request_done(int uptodate)
2258 return; 2259 return;
2259 } 2260 }
2260 2261
2262 q = req->q;
2263
2261 if (uptodate) { 2264 if (uptodate) {
2262 /* maintain values for invalidation on geometry 2265 /* maintain values for invalidation on geometry
2263 * change */ 2266 * change */
@@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void)
2811 return 2; 2814 return 2;
2812} 2815}
2813 2816
2817/*
2818 * Round-robin between our available drives, doing one request from each
2819 */
2820static int set_next_request(void)
2821{
2822 struct request_queue *q;
2823 int old_pos = fdc_queue;
2824
2825 do {
2826 q = disks[fdc_queue]->queue;
2827 if (++fdc_queue == N_DRIVE)
2828 fdc_queue = 0;
2829 if (q) {
2830 current_req = blk_fetch_request(q);
2831 if (current_req)
2832 break;
2833 }
2834 } while (fdc_queue != old_pos);
2835
2836 return current_req != NULL;
2837}
2838
2814static void redo_fd_request(void) 2839static void redo_fd_request(void)
2815{ 2840{
2816 int drive; 2841 int drive;
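
The round-robin in set_next_request() simply remembers where it left off and walks the per-drive queues until one yields a request. A standalone sketch of the same selection loop (userspace C; the queues are modelled as plain counters, N_DRIVE is reused as the array size, and everything else is invented for illustration):

    #include <stdio.h>

    #define N_DRIVE 4

    static int pending[N_DRIVE] = { 0, 2, 0, 1 };   /* fake per-drive request counts */
    static int fdc_queue;                           /* round-robin cursor */

    /* returns the drive a request was taken from, or -1 if all queues are empty */
    static int set_next_request(void)
    {
            int old_pos = fdc_queue;
            int taken = -1;

            do {
                    int q = fdc_queue;
                    if (++fdc_queue == N_DRIVE)
                            fdc_queue = 0;
                    if (pending[q]) {
                            pending[q]--;
                            taken = q;
                            break;
                    }
            } while (fdc_queue != old_pos);

            return taken;
    }

    int main(void)
    {
            int drive;
            while ((drive = set_next_request()) >= 0)
                    printf("servicing drive %d\n", drive);
            return 0;
    }
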
@@ -2822,17 +2847,17 @@ static void redo_fd_request(void)
2822 2847
2823do_request: 2848do_request:
2824 if (!current_req) { 2849 if (!current_req) {
2825 struct request *req; 2850 int pending;
2851
2852 spin_lock_irq(&floppy_lock);
2853 pending = set_next_request();
2854 spin_unlock_irq(&floppy_lock);
2826 2855
2827 spin_lock_irq(floppy_queue->queue_lock); 2856 if (!pending) {
2828 req = blk_fetch_request(floppy_queue);
2829 spin_unlock_irq(floppy_queue->queue_lock);
2830 if (!req) {
2831 do_floppy = NULL; 2857 do_floppy = NULL;
2832 unlock_fdc(); 2858 unlock_fdc();
2833 return; 2859 return;
2834 } 2860 }
2835 current_req = req;
2836 } 2861 }
2837 drive = (long)current_req->rq_disk->private_data; 2862 drive = (long)current_req->rq_disk->private_data;
2838 set_fdc(drive); 2863 set_fdc(drive);
@@ -3553,9 +3578,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
3553{ 3578{
3554 int ret; 3579 int ret;
3555 3580
3556 lock_kernel(); 3581 mutex_lock(&floppy_mutex);
3557 ret = fd_locked_ioctl(bdev, mode, cmd, param); 3582 ret = fd_locked_ioctl(bdev, mode, cmd, param);
3558 unlock_kernel(); 3583 mutex_unlock(&floppy_mutex);
3559 3584
3560 return ret; 3585 return ret;
3561} 3586}
@@ -3616,7 +3641,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
3616{ 3641{
3617 int drive = (long)disk->private_data; 3642 int drive = (long)disk->private_data;
3618 3643
3619 lock_kernel(); 3644 mutex_lock(&floppy_mutex);
3620 mutex_lock(&open_lock); 3645 mutex_lock(&open_lock);
3621 if (UDRS->fd_ref < 0) 3646 if (UDRS->fd_ref < 0)
3622 UDRS->fd_ref = 0; 3647 UDRS->fd_ref = 0;
@@ -3627,7 +3652,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
3627 if (!UDRS->fd_ref) 3652 if (!UDRS->fd_ref)
3628 opened_bdev[drive] = NULL; 3653 opened_bdev[drive] = NULL;
3629 mutex_unlock(&open_lock); 3654 mutex_unlock(&open_lock);
3630 unlock_kernel(); 3655 mutex_unlock(&floppy_mutex);
3631 3656
3632 return 0; 3657 return 0;
3633} 3658}
@@ -3645,7 +3670,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3645 int res = -EBUSY; 3670 int res = -EBUSY;
3646 char *tmp; 3671 char *tmp;
3647 3672
3648 lock_kernel(); 3673 mutex_lock(&floppy_mutex);
3649 mutex_lock(&open_lock); 3674 mutex_lock(&open_lock);
3650 old_dev = UDRS->fd_device; 3675 old_dev = UDRS->fd_device;
3651 if (opened_bdev[drive] && opened_bdev[drive] != bdev) 3676 if (opened_bdev[drive] && opened_bdev[drive] != bdev)
@@ -3722,7 +3747,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
3722 goto out; 3747 goto out;
3723 } 3748 }
3724 mutex_unlock(&open_lock); 3749 mutex_unlock(&open_lock);
3725 unlock_kernel(); 3750 mutex_unlock(&floppy_mutex);
3726 return 0; 3751 return 0;
3727out: 3752out:
3728 if (UDRS->fd_ref < 0) 3753 if (UDRS->fd_ref < 0)
@@ -3733,7 +3758,7 @@ out:
3733 opened_bdev[drive] = NULL; 3758 opened_bdev[drive] = NULL;
3734out2: 3759out2:
3735 mutex_unlock(&open_lock); 3760 mutex_unlock(&open_lock);
3736 unlock_kernel(); 3761 mutex_unlock(&floppy_mutex);
3737 return res; 3762 return res;
3738} 3763}
3739 3764
@@ -4165,6 +4190,13 @@ static int __init floppy_init(void)
4165 goto out_put_disk; 4190 goto out_put_disk;
4166 } 4191 }
4167 4192
4193 disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
4194 if (!disks[dr]->queue) {
4195 err = -ENOMEM;
4196 goto out_put_disk;
4197 }
4198
4199 blk_queue_max_hw_sectors(disks[dr]->queue, 64);
4168 disks[dr]->major = FLOPPY_MAJOR; 4200 disks[dr]->major = FLOPPY_MAJOR;
4169 disks[dr]->first_minor = TOMINOR(dr); 4201 disks[dr]->first_minor = TOMINOR(dr);
4170 disks[dr]->fops = &floppy_fops; 4202 disks[dr]->fops = &floppy_fops;
@@ -4183,13 +4215,6 @@ static int __init floppy_init(void)
4183 if (err) 4215 if (err)
4184 goto out_unreg_blkdev; 4216 goto out_unreg_blkdev;
4185 4217
4186 floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
4187 if (!floppy_queue) {
4188 err = -ENOMEM;
4189 goto out_unreg_driver;
4190 }
4191 blk_queue_max_hw_sectors(floppy_queue, 64);
4192
4193 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, 4218 blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
4194 floppy_find, NULL, NULL); 4219 floppy_find, NULL, NULL);
4195 4220
@@ -4317,7 +4342,6 @@ static int __init floppy_init(void)
4317 4342
4318 /* to be cleaned up... */ 4343 /* to be cleaned up... */
4319 disks[drive]->private_data = (void *)(long)drive; 4344 disks[drive]->private_data = (void *)(long)drive;
4320 disks[drive]->queue = floppy_queue;
4321 disks[drive]->flags |= GENHD_FL_REMOVABLE; 4345 disks[drive]->flags |= GENHD_FL_REMOVABLE;
4322 disks[drive]->driverfs_dev = &floppy_device[drive].dev; 4346 disks[drive]->driverfs_dev = &floppy_device[drive].dev;
4323 add_disk(disks[drive]); 4347 add_disk(disks[drive]);
@@ -4333,8 +4357,6 @@ out_flush_work:
4333 floppy_release_irq_and_dma(); 4357 floppy_release_irq_and_dma();
4334out_unreg_region: 4358out_unreg_region:
4335 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); 4359 blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
4336 blk_cleanup_queue(floppy_queue);
4337out_unreg_driver:
4338 platform_driver_unregister(&floppy_driver); 4360 platform_driver_unregister(&floppy_driver);
4339out_unreg_blkdev: 4361out_unreg_blkdev:
4340 unregister_blkdev(FLOPPY_MAJOR, "fd"); 4362 unregister_blkdev(FLOPPY_MAJOR, "fd");
@@ -4342,6 +4364,8 @@ out_put_disk:
4342 while (dr--) { 4364 while (dr--) {
4343 del_timer(&motor_off_timer[dr]); 4365 del_timer(&motor_off_timer[dr]);
4344 put_disk(disks[dr]); 4366 put_disk(disks[dr]);
4367 if (disks[dr]->queue)
4368 blk_cleanup_queue(disks[dr]->queue);
4345 } 4369 }
4346 return err; 4370 return err;
4347} 4371}
@@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void)
4550 platform_device_unregister(&floppy_device[drive]); 4574 platform_device_unregister(&floppy_device[drive]);
4551 } 4575 }
4552 put_disk(disks[drive]); 4576 put_disk(disks[drive]);
4577 blk_cleanup_queue(disks[drive]->queue);
4553 } 4578 }
4554 4579
4555 del_timer_sync(&fd_timeout); 4580 del_timer_sync(&fd_timeout);
4556 del_timer_sync(&fd_timer); 4581 del_timer_sync(&fd_timer);
4557 blk_cleanup_queue(floppy_queue);
4558 4582
4559 if (atomic_read(&usage_count)) 4583 if (atomic_read(&usage_count))
4560 floppy_release_irq_and_dma(); 4584 floppy_release_irq_and_dma();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 91797bbbe702..6c48b3545f84 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -67,16 +67,18 @@
67#include <linux/compat.h> 67#include <linux/compat.h>
68#include <linux/suspend.h> 68#include <linux/suspend.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/smp_lock.h> 70#include <linux/mutex.h>
71#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/buffer_head.h> /* for invalidate_bdev() */ 72#include <linux/buffer_head.h> /* for invalidate_bdev() */
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/highmem.h> 74#include <linux/highmem.h>
75#include <linux/kthread.h> 75#include <linux/kthread.h>
76#include <linux/splice.h> 76#include <linux/splice.h>
77#include <linux/sysfs.h>
77 78
78#include <asm/uaccess.h> 79#include <asm/uaccess.h>
79 80
81static DEFINE_MUTEX(loop_mutex);
80static LIST_HEAD(loop_devices); 82static LIST_HEAD(loop_devices);
81static DEFINE_MUTEX(loop_devices_mutex); 83static DEFINE_MUTEX(loop_devices_mutex);
82 84
@@ -477,17 +479,17 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; 479 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
478 480
479 if (bio_rw(bio) == WRITE) { 481 if (bio_rw(bio) == WRITE) {
480 bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
481 struct file *file = lo->lo_backing_file; 482 struct file *file = lo->lo_backing_file;
482 483
483 if (barrier) { 484 /* REQ_HARDBARRIER is deprecated */
484 if (unlikely(!file->f_op->fsync)) { 485 if (bio->bi_rw & REQ_HARDBARRIER) {
485 ret = -EOPNOTSUPP; 486 ret = -EOPNOTSUPP;
486 goto out; 487 goto out;
487 } 488 }
488 489
490 if (bio->bi_rw & REQ_FLUSH) {
489 ret = vfs_fsync(file, 0); 491 ret = vfs_fsync(file, 0);
490 if (unlikely(ret)) { 492 if (unlikely(ret && ret != -EINVAL)) {
491 ret = -EIO; 493 ret = -EIO;
492 goto out; 494 goto out;
493 } 495 }
@@ -495,9 +497,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
495 497
496 ret = lo_send(lo, bio, pos); 498 ret = lo_send(lo, bio, pos);
497 499
498 if (barrier && !ret) { 500 if ((bio->bi_rw & REQ_FUA) && !ret) {
499 ret = vfs_fsync(file, 0); 501 ret = vfs_fsync(file, 0);
500 if (unlikely(ret)) 502 if (unlikely(ret && ret != -EINVAL))
501 ret = -EIO; 503 ret = -EIO;
502 } 504 }
503 } else 505 } else
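
The new logic replaces the old barrier handling with the FLUSH/FUA convention: flush the backing file before the data when REQ_FLUSH is set, and again after the write when REQ_FUA is set, tolerating -EINVAL from files that cannot fsync. A hedged userspace analogue of that ordering (plain POSIX I/O; the file name and flag values are invented, this is not the driver code):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define REQ_FLUSH (1u << 0)             /* illustrative flag values only */
    #define REQ_FUA   (1u << 1)

    static int backed_write(int fd, const void *buf, size_t len, off_t pos,
                            unsigned int rw_flags)
    {
            if ((rw_flags & REQ_FLUSH) && fsync(fd) < 0)
                    return -1;              /* preceding data must be stable first */
            if (pwrite(fd, buf, len, pos) != (ssize_t)len)
                    return -1;
            if ((rw_flags & REQ_FUA) && fsync(fd) < 0)
                    return -1;              /* this write itself must be stable */
            return 0;
    }

    int main(void)
    {
            int fd = open("backing.img", O_RDWR | O_CREAT, 0644);
            const char data[] = "payload";

            if (fd < 0)
                    return 1;
            if (backed_write(fd, data, sizeof(data), 0, REQ_FLUSH | REQ_FUA))
                    perror("backed_write");
            close(fd);
            return 0;
    }
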
@@ -737,6 +739,103 @@ static inline int is_loop_device(struct file *file)
737 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; 739 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
738} 740}
739 741
742/* loop sysfs attributes */
743
744static ssize_t loop_attr_show(struct device *dev, char *page,
745 ssize_t (*callback)(struct loop_device *, char *))
746{
747 struct loop_device *l, *lo = NULL;
748
749 mutex_lock(&loop_devices_mutex);
750 list_for_each_entry(l, &loop_devices, lo_list)
751 if (disk_to_dev(l->lo_disk) == dev) {
752 lo = l;
753 break;
754 }
755 mutex_unlock(&loop_devices_mutex);
756
757 return lo ? callback(lo, page) : -EIO;
758}
759
760#define LOOP_ATTR_RO(_name) \
761static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
762static ssize_t loop_attr_do_show_##_name(struct device *d, \
763 struct device_attribute *attr, char *b) \
764{ \
765 return loop_attr_show(d, b, loop_attr_##_name##_show); \
766} \
767static struct device_attribute loop_attr_##_name = \
768 __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
769
770static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
771{
772 ssize_t ret;
773 char *p = NULL;
774
775 mutex_lock(&lo->lo_ctl_mutex);
776 if (lo->lo_backing_file)
777 p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
778 mutex_unlock(&lo->lo_ctl_mutex);
779
780 if (IS_ERR_OR_NULL(p))
781 ret = PTR_ERR(p);
782 else {
783 ret = strlen(p);
784 memmove(buf, p, ret);
785 buf[ret++] = '\n';
786 buf[ret] = 0;
787 }
788
789 return ret;
790}
791
792static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
793{
794 return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
795}
796
797static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
798{
799 return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
800}
801
802static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
803{
804 int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
805
806 return sprintf(buf, "%s\n", autoclear ? "1" : "0");
807}
808
809LOOP_ATTR_RO(backing_file);
810LOOP_ATTR_RO(offset);
811LOOP_ATTR_RO(sizelimit);
812LOOP_ATTR_RO(autoclear);
813
814static struct attribute *loop_attrs[] = {
815 &loop_attr_backing_file.attr,
816 &loop_attr_offset.attr,
817 &loop_attr_sizelimit.attr,
818 &loop_attr_autoclear.attr,
819 NULL,
820};
821
822static struct attribute_group loop_attribute_group = {
823 .name = "loop",
 824 .attrs = loop_attrs,
825};
826
827static int loop_sysfs_init(struct loop_device *lo)
828{
829 return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
830 &loop_attribute_group);
831}
832
833static void loop_sysfs_exit(struct loop_device *lo)
834{
835 sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
836 &loop_attribute_group);
837}
838
740static int loop_set_fd(struct loop_device *lo, fmode_t mode, 839static int loop_set_fd(struct loop_device *lo, fmode_t mode,
741 struct block_device *bdev, unsigned int arg) 840 struct block_device *bdev, unsigned int arg)
742{ 841{
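
For readability, this is roughly what LOOP_ATTR_RO(backing_file) expands to, written out by hand from the macro above (an illustrative expansion, not a separate patch hunk):

    static ssize_t loop_attr_backing_file_show(struct loop_device *, char *);
    static ssize_t loop_attr_do_show_backing_file(struct device *d,
                                    struct device_attribute *attr, char *b)
    {
            return loop_attr_show(d, b, loop_attr_backing_file_show);
    }
    static struct device_attribute loop_attr_backing_file =
            __ATTR(backing_file, S_IRUGO, loop_attr_do_show_backing_file, NULL);
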
@@ -832,10 +931,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
832 lo->lo_queue->unplug_fn = loop_unplug; 931 lo->lo_queue->unplug_fn = loop_unplug;
833 932
834 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 933 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
835 blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN); 934 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
836 935
837 set_capacity(lo->lo_disk, size); 936 set_capacity(lo->lo_disk, size);
838 bd_set_size(bdev, size << 9); 937 bd_set_size(bdev, size << 9);
938 loop_sysfs_init(lo);
839 /* let user-space know about the new size */ 939 /* let user-space know about the new size */
840 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 940 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
841 941
@@ -854,6 +954,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
854 return 0; 954 return 0;
855 955
856out_clr: 956out_clr:
957 loop_sysfs_exit(lo);
857 lo->lo_thread = NULL; 958 lo->lo_thread = NULL;
858 lo->lo_device = NULL; 959 lo->lo_device = NULL;
859 lo->lo_backing_file = NULL; 960 lo->lo_backing_file = NULL;
@@ -950,6 +1051,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
950 set_capacity(lo->lo_disk, 0); 1051 set_capacity(lo->lo_disk, 0);
951 if (bdev) { 1052 if (bdev) {
952 bd_set_size(bdev, 0); 1053 bd_set_size(bdev, 0);
1054 loop_sysfs_exit(lo);
953 /* let user-space know about this change */ 1055 /* let user-space know about this change */
954 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 1056 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
955 } 1057 }
@@ -1409,11 +1511,11 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
1409{ 1511{
1410 struct loop_device *lo = bdev->bd_disk->private_data; 1512 struct loop_device *lo = bdev->bd_disk->private_data;
1411 1513
1412 lock_kernel(); 1514 mutex_lock(&loop_mutex);
1413 mutex_lock(&lo->lo_ctl_mutex); 1515 mutex_lock(&lo->lo_ctl_mutex);
1414 lo->lo_refcnt++; 1516 lo->lo_refcnt++;
1415 mutex_unlock(&lo->lo_ctl_mutex); 1517 mutex_unlock(&lo->lo_ctl_mutex);
1416 unlock_kernel(); 1518 mutex_unlock(&loop_mutex);
1417 1519
1418 return 0; 1520 return 0;
1419} 1521}
@@ -1423,7 +1525,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
1423 struct loop_device *lo = disk->private_data; 1525 struct loop_device *lo = disk->private_data;
1424 int err; 1526 int err;
1425 1527
1426 lock_kernel(); 1528 mutex_lock(&loop_mutex);
1427 mutex_lock(&lo->lo_ctl_mutex); 1529 mutex_lock(&lo->lo_ctl_mutex);
1428 1530
1429 if (--lo->lo_refcnt) 1531 if (--lo->lo_refcnt)
@@ -1448,7 +1550,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
1448out: 1550out:
1449 mutex_unlock(&lo->lo_ctl_mutex); 1551 mutex_unlock(&lo->lo_ctl_mutex);
1450out_unlocked: 1552out_unlocked:
1451 lock_kernel(); 1553 mutex_unlock(&loop_mutex);
1452 return 0; 1554 return 0;
1453} 1555}
1454 1556
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0daa422aa281..a32fb41246f8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -24,7 +24,7 @@
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/file.h> 25#include <linux/file.h>
26#include <linux/ioctl.h> 26#include <linux/ioctl.h>
27#include <linux/smp_lock.h> 27#include <linux/mutex.h>
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/kernel.h> 30#include <linux/kernel.h>
@@ -53,6 +53,7 @@
53#define DBG_BLKDEV 0x0100 53#define DBG_BLKDEV 0x0100
54#define DBG_RX 0x0200 54#define DBG_RX 0x0200
55#define DBG_TX 0x0400 55#define DBG_TX 0x0400
56static DEFINE_MUTEX(nbd_mutex);
56static unsigned int debugflags; 57static unsigned int debugflags;
57#endif /* NDEBUG */ 58#endif /* NDEBUG */
58 59
@@ -717,11 +718,11 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
717 dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", 718 dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
718 lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); 719 lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
719 720
720 lock_kernel(); 721 mutex_lock(&nbd_mutex);
721 mutex_lock(&lo->tx_lock); 722 mutex_lock(&lo->tx_lock);
722 error = __nbd_ioctl(bdev, lo, cmd, arg); 723 error = __nbd_ioctl(bdev, lo, cmd, arg);
723 mutex_unlock(&lo->tx_lock); 724 mutex_unlock(&lo->tx_lock);
724 unlock_kernel(); 725 mutex_unlock(&nbd_mutex);
725 726
726 return error; 727 return error;
727} 728}
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c62..87311ebac0db 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
310 break; 310 break;
311 311
312 /* filter out block requests we don't understand */ 312 /* filter out block requests we don't understand */
313 if (rq->cmd_type != REQ_TYPE_FS && 313 if (rq->cmd_type != REQ_TYPE_FS) {
314 !(rq->cmd_flags & REQ_HARDBARRIER)) {
315 blk_end_request_all(rq, 0); 314 blk_end_request_all(rq, 0);
316 continue; 315 continue;
317 } 316 }
@@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
439 blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); 438 blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
440 439
441 blk_queue_prep_rq(q, blk_queue_start_tag); 440 blk_queue_prep_rq(q, blk_queue_start_tag);
442 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); 441 blk_queue_flush(q, REQ_FLUSH);
443 442
444 disk->queue = q; 443 disk->queue = q;
445 444
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 76f8565e1e8d..62cec6afd7ad 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,9 +138,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
138#include <linux/cdrom.h> 138#include <linux/cdrom.h>
139#include <linux/spinlock.h> 139#include <linux/spinlock.h>
140#include <linux/blkdev.h> 140#include <linux/blkdev.h>
141#include <linux/smp_lock.h> 141#include <linux/mutex.h>
142#include <asm/uaccess.h> 142#include <asm/uaccess.h>
143 143
144static DEFINE_MUTEX(pcd_mutex);
144static DEFINE_SPINLOCK(pcd_lock); 145static DEFINE_SPINLOCK(pcd_lock);
145 146
146module_param(verbose, bool, 0644); 147module_param(verbose, bool, 0644);
@@ -227,9 +228,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
227 struct pcd_unit *cd = bdev->bd_disk->private_data; 228 struct pcd_unit *cd = bdev->bd_disk->private_data;
228 int ret; 229 int ret;
229 230
230 lock_kernel(); 231 mutex_lock(&pcd_mutex);
231 ret = cdrom_open(&cd->info, bdev, mode); 232 ret = cdrom_open(&cd->info, bdev, mode);
232 unlock_kernel(); 233 mutex_unlock(&pcd_mutex);
233 234
234 return ret; 235 return ret;
235} 236}
@@ -237,9 +238,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
237static int pcd_block_release(struct gendisk *disk, fmode_t mode) 238static int pcd_block_release(struct gendisk *disk, fmode_t mode)
238{ 239{
239 struct pcd_unit *cd = disk->private_data; 240 struct pcd_unit *cd = disk->private_data;
240 lock_kernel(); 241 mutex_lock(&pcd_mutex);
241 cdrom_release(&cd->info, mode); 242 cdrom_release(&cd->info, mode);
242 unlock_kernel(); 243 mutex_unlock(&pcd_mutex);
243 return 0; 244 return 0;
244} 245}
245 246
@@ -249,9 +250,9 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
249 struct pcd_unit *cd = bdev->bd_disk->private_data; 250 struct pcd_unit *cd = bdev->bd_disk->private_data;
250 int ret; 251 int ret;
251 252
252 lock_kernel(); 253 mutex_lock(&pcd_mutex);
253 ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg); 254 ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
254 unlock_kernel(); 255 mutex_unlock(&pcd_mutex);
255 256
256 return ret; 257 return ret;
257} 258}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 985f0d4f1d1e..c0ee1558b9bb 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -153,10 +153,11 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
153#include <linux/blkdev.h> 153#include <linux/blkdev.h>
154#include <linux/blkpg.h> 154#include <linux/blkpg.h>
155#include <linux/kernel.h> 155#include <linux/kernel.h>
156#include <linux/smp_lock.h> 156#include <linux/mutex.h>
157#include <asm/uaccess.h> 157#include <asm/uaccess.h>
158#include <linux/workqueue.h> 158#include <linux/workqueue.h>
159 159
160static DEFINE_MUTEX(pd_mutex);
160static DEFINE_SPINLOCK(pd_lock); 161static DEFINE_SPINLOCK(pd_lock);
161 162
162module_param(verbose, bool, 0); 163module_param(verbose, bool, 0);
@@ -736,14 +737,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode)
736{ 737{
737 struct pd_unit *disk = bdev->bd_disk->private_data; 738 struct pd_unit *disk = bdev->bd_disk->private_data;
738 739
739 lock_kernel(); 740 mutex_lock(&pd_mutex);
740 disk->access++; 741 disk->access++;
741 742
742 if (disk->removable) { 743 if (disk->removable) {
743 pd_special_command(disk, pd_media_check); 744 pd_special_command(disk, pd_media_check);
744 pd_special_command(disk, pd_door_lock); 745 pd_special_command(disk, pd_door_lock);
745 } 746 }
746 unlock_kernel(); 747 mutex_unlock(&pd_mutex);
747 return 0; 748 return 0;
748} 749}
749 750
@@ -771,10 +772,10 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
771 772
772 switch (cmd) { 773 switch (cmd) {
773 case CDROMEJECT: 774 case CDROMEJECT:
774 lock_kernel(); 775 mutex_lock(&pd_mutex);
775 if (disk->access == 1) 776 if (disk->access == 1)
776 pd_special_command(disk, pd_eject); 777 pd_special_command(disk, pd_eject);
777 unlock_kernel(); 778 mutex_unlock(&pd_mutex);
778 return 0; 779 return 0;
779 default: 780 default:
780 return -EINVAL; 781 return -EINVAL;
@@ -785,10 +786,10 @@ static int pd_release(struct gendisk *p, fmode_t mode)
785{ 786{
786 struct pd_unit *disk = p->private_data; 787 struct pd_unit *disk = p->private_data;
787 788
788 lock_kernel(); 789 mutex_lock(&pd_mutex);
789 if (!--disk->access && disk->removable) 790 if (!--disk->access && disk->removable)
790 pd_special_command(disk, pd_door_unlock); 791 pd_special_command(disk, pd_door_unlock);
791 unlock_kernel(); 792 mutex_unlock(&pd_mutex);
792 793
793 return 0; 794 return 0;
794} 795}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 4457b494882a..635f25dd9e10 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -152,9 +152,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
152#include <linux/spinlock.h> 152#include <linux/spinlock.h>
153#include <linux/blkdev.h> 153#include <linux/blkdev.h>
154#include <linux/blkpg.h> 154#include <linux/blkpg.h>
155#include <linux/smp_lock.h> 155#include <linux/mutex.h>
156#include <asm/uaccess.h> 156#include <asm/uaccess.h>
157 157
158static DEFINE_MUTEX(pf_mutex);
158static DEFINE_SPINLOCK(pf_spin_lock); 159static DEFINE_SPINLOCK(pf_spin_lock);
159 160
160module_param(verbose, bool, 0644); 161module_param(verbose, bool, 0644);
@@ -302,7 +303,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
302 struct pf_unit *pf = bdev->bd_disk->private_data; 303 struct pf_unit *pf = bdev->bd_disk->private_data;
303 int ret; 304 int ret;
304 305
305 lock_kernel(); 306 mutex_lock(&pf_mutex);
306 pf_identify(pf); 307 pf_identify(pf);
307 308
308 ret = -ENODEV; 309 ret = -ENODEV;
@@ -318,7 +319,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
318 if (pf->removable) 319 if (pf->removable)
319 pf_lock(pf, 1); 320 pf_lock(pf, 1);
320out: 321out:
321 unlock_kernel(); 322 mutex_unlock(&pf_mutex);
322 return ret; 323 return ret;
323} 324}
324 325
@@ -349,9 +350,9 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
349 350
350 if (pf->access != 1) 351 if (pf->access != 1)
351 return -EBUSY; 352 return -EBUSY;
352 lock_kernel(); 353 mutex_lock(&pf_mutex);
353 pf_eject(pf); 354 pf_eject(pf);
354 unlock_kernel(); 355 mutex_unlock(&pf_mutex);
355 356
356 return 0; 357 return 0;
357} 358}
@@ -360,9 +361,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
360{ 361{
361 struct pf_unit *pf = disk->private_data; 362 struct pf_unit *pf = disk->private_data;
362 363
363 lock_kernel(); 364 mutex_lock(&pf_mutex);
364 if (pf->access <= 0) { 365 if (pf->access <= 0) {
365 unlock_kernel(); 366 mutex_unlock(&pf_mutex);
366 return -EINVAL; 367 return -EINVAL;
367 } 368 }
368 369
@@ -371,7 +372,7 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
371 if (!pf->access && pf->removable) 372 if (!pf->access && pf->removable)
372 pf_lock(pf, 0); 373 pf_lock(pf, 0);
373 374
374 unlock_kernel(); 375 mutex_unlock(&pf_mutex);
375 return 0; 376 return 0;
376 377
377} 378}
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3ddba9b..6b9a2000d56a 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -162,7 +162,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
162#include <linux/pg.h> 162#include <linux/pg.h>
163#include <linux/device.h> 163#include <linux/device.h>
164#include <linux/sched.h> /* current, TASK_* */ 164#include <linux/sched.h> /* current, TASK_* */
165#include <linux/smp_lock.h> 165#include <linux/mutex.h>
166#include <linux/jiffies.h> 166#include <linux/jiffies.h>
167 167
168#include <asm/uaccess.h> 168#include <asm/uaccess.h>
@@ -193,6 +193,7 @@ module_param_array(drive3, int, NULL, 0);
193 193
194#define ATAPI_IDENTIFY 0x12 194#define ATAPI_IDENTIFY 0x12
195 195
196static DEFINE_MUTEX(pg_mutex);
196static int pg_open(struct inode *inode, struct file *file); 197static int pg_open(struct inode *inode, struct file *file);
197static int pg_release(struct inode *inode, struct file *file); 198static int pg_release(struct inode *inode, struct file *file);
198static ssize_t pg_read(struct file *filp, char __user *buf, 199static ssize_t pg_read(struct file *filp, char __user *buf,
@@ -234,6 +235,7 @@ static const struct file_operations pg_fops = {
234 .write = pg_write, 235 .write = pg_write,
235 .open = pg_open, 236 .open = pg_open,
236 .release = pg_release, 237 .release = pg_release,
238 .llseek = noop_llseek,
237}; 239};
238 240
239static void pg_init_units(void) 241static void pg_init_units(void)
@@ -518,7 +520,7 @@ static int pg_open(struct inode *inode, struct file *file)
518 struct pg *dev = &devices[unit]; 520 struct pg *dev = &devices[unit];
519 int ret = 0; 521 int ret = 0;
520 522
521 lock_kernel(); 523 mutex_lock(&pg_mutex);
522 if ((unit >= PG_UNITS) || (!dev->present)) { 524 if ((unit >= PG_UNITS) || (!dev->present)) {
523 ret = -ENODEV; 525 ret = -ENODEV;
524 goto out; 526 goto out;
@@ -547,7 +549,7 @@ static int pg_open(struct inode *inode, struct file *file)
547 file->private_data = dev; 549 file->private_data = dev;
548 550
549out: 551out:
550 unlock_kernel(); 552 mutex_unlock(&pg_mutex);
551 return ret; 553 return ret;
552} 554}
553 555
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825fdeaab..7179f79d7468 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -146,7 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
146#include <linux/mtio.h> 146#include <linux/mtio.h>
147#include <linux/device.h> 147#include <linux/device.h>
148#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */ 148#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */
149#include <linux/smp_lock.h> 149#include <linux/mutex.h>
150 150
151#include <asm/uaccess.h> 151#include <asm/uaccess.h>
152 152
@@ -189,6 +189,7 @@ module_param_array(drive3, int, NULL, 0);
189#define ATAPI_MODE_SENSE 0x1a 189#define ATAPI_MODE_SENSE 0x1a
190#define ATAPI_LOG_SENSE 0x4d 190#define ATAPI_LOG_SENSE 0x4d
191 191
192static DEFINE_MUTEX(pt_mutex);
192static int pt_open(struct inode *inode, struct file *file); 193static int pt_open(struct inode *inode, struct file *file);
193static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 194static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
194static int pt_release(struct inode *inode, struct file *file); 195static int pt_release(struct inode *inode, struct file *file);
@@ -239,6 +240,7 @@ static const struct file_operations pt_fops = {
239 .unlocked_ioctl = pt_ioctl, 240 .unlocked_ioctl = pt_ioctl,
240 .open = pt_open, 241 .open = pt_open,
241 .release = pt_release, 242 .release = pt_release,
243 .llseek = noop_llseek,
242}; 244};
243 245
244/* sysfs class support */ 246/* sysfs class support */
@@ -650,9 +652,9 @@ static int pt_open(struct inode *inode, struct file *file)
650 struct pt_unit *tape = pt + unit; 652 struct pt_unit *tape = pt + unit;
651 int err; 653 int err;
652 654
653 lock_kernel(); 655 mutex_lock(&pt_mutex);
654 if (unit >= PT_UNITS || (!tape->present)) { 656 if (unit >= PT_UNITS || (!tape->present)) {
655 unlock_kernel(); 657 mutex_unlock(&pt_mutex);
656 return -ENODEV; 658 return -ENODEV;
657 } 659 }
658 660
@@ -681,12 +683,12 @@ static int pt_open(struct inode *inode, struct file *file)
681 } 683 }
682 684
683 file->private_data = tape; 685 file->private_data = tape;
684 unlock_kernel(); 686 mutex_unlock(&pt_mutex);
685 return 0; 687 return 0;
686 688
687out: 689out:
688 atomic_inc(&tape->available); 690 atomic_inc(&tape->available);
689 unlock_kernel(); 691 mutex_unlock(&pt_mutex);
690 return err; 692 return err;
691} 693}
692 694
@@ -704,15 +706,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
704 switch (mtop.mt_op) { 706 switch (mtop.mt_op) {
705 707
706 case MTREW: 708 case MTREW:
707 lock_kernel(); 709 mutex_lock(&pt_mutex);
708 pt_rewind(tape); 710 pt_rewind(tape);
709 unlock_kernel(); 711 mutex_unlock(&pt_mutex);
710 return 0; 712 return 0;
711 713
712 case MTWEOF: 714 case MTWEOF:
713 lock_kernel(); 715 mutex_lock(&pt_mutex);
714 pt_write_fm(tape); 716 pt_write_fm(tape);
715 unlock_kernel(); 717 mutex_unlock(&pt_mutex);
716 return 0; 718 return 0;
717 719
718 default: 720 default:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 37a2bb595076..19b3568e9326 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -57,7 +57,6 @@
57#include <linux/seq_file.h> 57#include <linux/seq_file.h>
58#include <linux/miscdevice.h> 58#include <linux/miscdevice.h>
59#include <linux/freezer.h> 59#include <linux/freezer.h>
60#include <linux/smp_lock.h>
61#include <linux/mutex.h> 60#include <linux/mutex.h>
62#include <linux/slab.h> 61#include <linux/slab.h>
63#include <scsi/scsi_cmnd.h> 62#include <scsi/scsi_cmnd.h>
@@ -86,6 +85,7 @@
86 85
87#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) 86#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
88 87
88static DEFINE_MUTEX(pktcdvd_mutex);
89static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; 89static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
90static struct proc_dir_entry *pkt_proc; 90static struct proc_dir_entry *pkt_proc;
91static int pktdev_major; 91static int pktdev_major;
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
753 753
754 rq->timeout = 60*HZ; 754 rq->timeout = 60*HZ;
755 rq->cmd_type = REQ_TYPE_BLOCK_PC; 755 rq->cmd_type = REQ_TYPE_BLOCK_PC;
756 rq->cmd_flags |= REQ_HARDBARRIER;
757 if (cgc->quiet) 756 if (cgc->quiet)
758 rq->cmd_flags |= REQ_QUIET; 757 rq->cmd_flags |= REQ_QUIET;
759 758
@@ -2383,7 +2382,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2383 2382
2384 VPRINTK(DRIVER_NAME": entering open\n"); 2383 VPRINTK(DRIVER_NAME": entering open\n");
2385 2384
2386 lock_kernel(); 2385 mutex_lock(&pktcdvd_mutex);
2387 mutex_lock(&ctl_mutex); 2386 mutex_lock(&ctl_mutex);
2388 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); 2387 pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
2389 if (!pd) { 2388 if (!pd) {
@@ -2411,7 +2410,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
2411 } 2410 }
2412 2411
2413 mutex_unlock(&ctl_mutex); 2412 mutex_unlock(&ctl_mutex);
2414 unlock_kernel(); 2413 mutex_unlock(&pktcdvd_mutex);
2415 return 0; 2414 return 0;
2416 2415
2417out_dec: 2416out_dec:
@@ -2419,7 +2418,7 @@ out_dec:
2419out: 2418out:
2420 VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); 2419 VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
2421 mutex_unlock(&ctl_mutex); 2420 mutex_unlock(&ctl_mutex);
2422 unlock_kernel(); 2421 mutex_unlock(&pktcdvd_mutex);
2423 return ret; 2422 return ret;
2424} 2423}
2425 2424
@@ -2428,7 +2427,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
2428 struct pktcdvd_device *pd = disk->private_data; 2427 struct pktcdvd_device *pd = disk->private_data;
2429 int ret = 0; 2428 int ret = 0;
2430 2429
2431 lock_kernel(); 2430 mutex_lock(&pktcdvd_mutex);
2432 mutex_lock(&ctl_mutex); 2431 mutex_lock(&ctl_mutex);
2433 pd->refcnt--; 2432 pd->refcnt--;
2434 BUG_ON(pd->refcnt < 0); 2433 BUG_ON(pd->refcnt < 0);
@@ -2437,7 +2436,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
2437 pkt_release_dev(pd, flush); 2436 pkt_release_dev(pd, flush);
2438 } 2437 }
2439 mutex_unlock(&ctl_mutex); 2438 mutex_unlock(&ctl_mutex);
2440 unlock_kernel(); 2439 mutex_unlock(&pktcdvd_mutex);
2441 return ret; 2440 return ret;
2442} 2441}
2443 2442
@@ -2773,7 +2772,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2773 VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, 2772 VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
2774 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); 2773 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2775 2774
2776 lock_kernel(); 2775 mutex_lock(&pktcdvd_mutex);
2777 switch (cmd) { 2776 switch (cmd) {
2778 case CDROMEJECT: 2777 case CDROMEJECT:
2779 /* 2778 /*
@@ -2798,7 +2797,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
2798 VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); 2797 VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
2799 ret = -ENOTTY; 2798 ret = -ENOTTY;
2800 } 2799 }
2801 unlock_kernel(); 2800 mutex_unlock(&pktcdvd_mutex);
2802 2801
2803 return ret; 2802 return ret;
2804} 2803}
@@ -3046,6 +3045,7 @@ static const struct file_operations pkt_ctl_fops = {
3046 .compat_ioctl = pkt_ctl_compat_ioctl, 3045 .compat_ioctl = pkt_ctl_compat_ioctl,
3047#endif 3046#endif
3048 .owner = THIS_MODULE, 3047 .owner = THIS_MODULE,
3048 .llseek = no_llseek,
3049}; 3049};
3050 3050
3051static struct miscdevice pkt_misc = { 3051static struct miscdevice pkt_misc = {
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 03688c2da319..8e1ce2e2916a 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
468 blk_queue_dma_alignment(queue, dev->blk_size-1); 468 blk_queue_dma_alignment(queue, dev->blk_size-1);
469 blk_queue_logical_block_size(queue, dev->blk_size); 469 blk_queue_logical_block_size(queue, dev->blk_size);
470 470
471 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH); 471 blk_queue_flush(queue, REQ_FLUSH);
472 472
473 blk_queue_max_segments(queue, -1); 473 blk_queue_max_segments(queue, -1);
474 blk_queue_max_segment_size(queue, dev->bounce_size); 474 blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24 Instructions for use
25 --------------------
26
27 1) Map a Linux block device to an existing rbd image.
28
29 Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
30
31 $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
32
33 The snapshot name can be "-" or omitted to map the image read/write.
34
35 2) List all active blkdev<->object mappings.
36
 37  In this example, we have performed step #1 once, creating one blkdev,
 38  mapped to a rados object in the rados rbd pool
39
40 $ cat /sys/class/rbd/list
41 #id major client_name pool name snap KB
42 0 254 client4143 rbd foo - 1024000
43
44 The columns, in order, are:
45 - blkdev unique id
46 - blkdev assigned major
47 - rados client id
48 - rados pool name
49 - rados block device name
50 - mapped snapshot ("-" if none)
51 - device size in KB
52
53
54 3) Create a snapshot.
55
56 Usage: <blkdev id> <snapname>
57
58 $ echo "0 mysnap" > /sys/class/rbd/snap_create
59
60
61 4) Listing a snapshot.
62
63 $ cat /sys/class/rbd/snaps_list
64 #id snap KB
65 0 - 1024000 (*)
66 0 foo 1024000
67
68 The columns, in order, are:
69 - blkdev unique id
70 - snapshot name, '-' means none (active read/write version)
71 - size of device at time of snapshot
72 - the (*) indicates this is the active version
73
74 5) Rollback to snapshot.
75
76 Usage: <blkdev id> <snapname>
77
78 $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
79
80
 81  6) Mapping an image using a snapshot.
82
 83  A snapshot mapping is read-only. This is done by passing
84 snap=<snapname> to the options when adding a device.
85
86 $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
87
88
89 7) Remove an active blkdev<->rbd image mapping.
90
91 In this example, we remove the mapping with blkdev unique id 1.
92
93 $ echo 1 > /sys/class/rbd/remove
94
95
96 NOTE: The actual creation and deletion of rados objects is outside the scope
97 of this driver.
98
99 */
100
101#include <linux/ceph/libceph.h>
102#include <linux/ceph/osd_client.h>
103#include <linux/ceph/mon_client.h>
104#include <linux/ceph/decode.h>
105
106#include <linux/kernel.h>
107#include <linux/device.h>
108#include <linux/module.h>
109#include <linux/fs.h>
110#include <linux/blkdev.h>
111
112#include "rbd_types.h"
113
114#define DRV_NAME "rbd"
115#define DRV_NAME_LONG "rbd (rados block device)"
116
117#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
118
119#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
120#define RBD_MAX_POOL_NAME_LEN 64
121#define RBD_MAX_SNAP_NAME_LEN 32
122#define RBD_MAX_OPT_LEN 1024
123
124#define RBD_SNAP_HEAD_NAME "-"
125
126#define DEV_NAME_LEN 32
127
128/*
129 * block device image metadata (in-memory version)
130 */
131struct rbd_image_header {
132 u64 image_size;
133 char block_name[32];
134 __u8 obj_order;
135 __u8 crypt_type;
136 __u8 comp_type;
137 struct rw_semaphore snap_rwsem;
138 struct ceph_snap_context *snapc;
139 size_t snap_names_len;
140 u64 snap_seq;
141 u32 total_snaps;
142
143 char *snap_names;
144 u64 *snap_sizes;
145};
146
147/*
148 * an instance of the client. multiple devices may share a client.
149 */
150struct rbd_client {
151 struct ceph_client *client;
152 struct kref kref;
153 struct list_head node;
154};
155
156/*
157 * a single io request
158 */
159struct rbd_request {
160 struct request *rq; /* blk layer request */
161 struct bio *bio; /* cloned bio */
162 struct page **pages; /* list of used pages */
163 u64 len;
164};
165
166/*
167 * a single device
168 */
169struct rbd_device {
170 int id; /* blkdev unique id */
171
172 int major; /* blkdev assigned major */
173 struct gendisk *disk; /* blkdev's gendisk and rq */
174 struct request_queue *q;
175
176 struct ceph_client *client;
177 struct rbd_client *rbd_client;
178
179 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
180
181 spinlock_t lock; /* queue lock */
182
183 struct rbd_image_header header;
184 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
185 int obj_len;
186 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
187 char pool_name[RBD_MAX_POOL_NAME_LEN];
188 int poolid;
189
190 char snap_name[RBD_MAX_SNAP_NAME_LEN];
191 u32 cur_snap; /* index+1 of current snapshot within snap context
192 0 - for the head */
193 int read_only;
194
195 struct list_head node;
196};
197
198static spinlock_t node_lock; /* protects client get/put */
199
200static struct class *class_rbd; /* /sys/class/rbd */
201static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
202static LIST_HEAD(rbd_dev_list); /* devices */
203static LIST_HEAD(rbd_client_list); /* clients */
204
205
206static int rbd_open(struct block_device *bdev, fmode_t mode)
207{
208 struct gendisk *disk = bdev->bd_disk;
209 struct rbd_device *rbd_dev = disk->private_data;
210
211 set_device_ro(bdev, rbd_dev->read_only);
212
213 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
214 return -EROFS;
215
216 return 0;
217}
218
219static const struct block_device_operations rbd_bd_ops = {
220 .owner = THIS_MODULE,
221 .open = rbd_open,
222};
223
224/*
225 * Initialize an rbd client instance.
226 * We own *opt.
227 */
228static struct rbd_client *rbd_client_create(struct ceph_options *opt)
229{
230 struct rbd_client *rbdc;
231 int ret = -ENOMEM;
232
233 dout("rbd_client_create\n");
234 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
235 if (!rbdc)
236 goto out_opt;
237
238 kref_init(&rbdc->kref);
239 INIT_LIST_HEAD(&rbdc->node);
240
241 rbdc->client = ceph_create_client(opt, rbdc);
242 if (IS_ERR(rbdc->client))
243 goto out_rbdc;
244 opt = NULL; /* Now rbdc->client is responsible for opt */
245
246 ret = ceph_open_session(rbdc->client);
247 if (ret < 0)
248 goto out_err;
249
250 spin_lock(&node_lock);
251 list_add_tail(&rbdc->node, &rbd_client_list);
252 spin_unlock(&node_lock);
253
254 dout("rbd_client_create created %p\n", rbdc);
255 return rbdc;
256
257out_err:
258 ceph_destroy_client(rbdc->client);
259out_rbdc:
260 kfree(rbdc);
261out_opt:
262 if (opt)
263 ceph_destroy_options(opt);
264 return ERR_PTR(ret);
265}
266
267/*
268 * Find a ceph client with specific addr and configuration.
269 */
270static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
271{
272 struct rbd_client *client_node;
273
274 if (opt->flags & CEPH_OPT_NOSHARE)
275 return NULL;
276
277 list_for_each_entry(client_node, &rbd_client_list, node)
278 if (ceph_compare_options(opt, client_node->client) == 0)
279 return client_node;
280 return NULL;
281}
282
283/*
284 * Get a ceph client with specific addr and configuration, if one does
285 * not exist create it.
286 */
287static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
288 char *options)
289{
290 struct rbd_client *rbdc;
291 struct ceph_options *opt;
292 int ret;
293
294 ret = ceph_parse_options(&opt, options, mon_addr,
295 mon_addr + strlen(mon_addr), NULL, NULL);
296 if (ret < 0)
297 return ret;
298
299 spin_lock(&node_lock);
300 rbdc = __rbd_client_find(opt);
301 if (rbdc) {
302 ceph_destroy_options(opt);
303
304 /* using an existing client */
305 kref_get(&rbdc->kref);
306 rbd_dev->rbd_client = rbdc;
307 rbd_dev->client = rbdc->client;
308 spin_unlock(&node_lock);
309 return 0;
310 }
311 spin_unlock(&node_lock);
312
313 rbdc = rbd_client_create(opt);
314 if (IS_ERR(rbdc))
315 return PTR_ERR(rbdc);
316
317 rbd_dev->rbd_client = rbdc;
318 rbd_dev->client = rbdc->client;
319 return 0;
320}
321
322/*
323 * Destroy ceph client
324 */
325static void rbd_client_release(struct kref *kref)
326{
327 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
328
329 dout("rbd_release_client %p\n", rbdc);
330 spin_lock(&node_lock);
331 list_del(&rbdc->node);
332 spin_unlock(&node_lock);
333
334 ceph_destroy_client(rbdc->client);
335 kfree(rbdc);
336}
337
338/*
339 * Drop reference to ceph client node. If it's not referenced anymore, release
340 * it.
341 */
342static void rbd_put_client(struct rbd_device *rbd_dev)
343{
344 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
345 rbd_dev->rbd_client = NULL;
346 rbd_dev->client = NULL;
347}
348
349
350/*
351 * Create a new header structure, translate header format from the on-disk
352 * header.
353 */
354static int rbd_header_from_disk(struct rbd_image_header *header,
355 struct rbd_image_header_ondisk *ondisk,
356 int allocated_snaps,
357 gfp_t gfp_flags)
358{
359 int i;
360 u32 snap_count = le32_to_cpu(ondisk->snap_count);
361 int ret = -ENOMEM;
362
363 init_rwsem(&header->snap_rwsem);
364
365 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
366 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
367 snap_count *
368 sizeof(struct rbd_image_snap_ondisk),
369 gfp_flags);
370 if (!header->snapc)
371 return -ENOMEM;
372 if (snap_count) {
373 header->snap_names = kmalloc(header->snap_names_len,
374 GFP_KERNEL);
375 if (!header->snap_names)
376 goto err_snapc;
377 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
378 GFP_KERNEL);
379 if (!header->snap_sizes)
380 goto err_names;
381 } else {
382 header->snap_names = NULL;
383 header->snap_sizes = NULL;
384 }
385 memcpy(header->block_name, ondisk->block_name,
386 sizeof(ondisk->block_name));
387
388 header->image_size = le64_to_cpu(ondisk->image_size);
389 header->obj_order = ondisk->options.order;
390 header->crypt_type = ondisk->options.crypt_type;
391 header->comp_type = ondisk->options.comp_type;
392
393 atomic_set(&header->snapc->nref, 1);
394 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
395 header->snapc->num_snaps = snap_count;
396 header->total_snaps = snap_count;
397
398 if (snap_count &&
399 allocated_snaps == snap_count) {
400 for (i = 0; i < snap_count; i++) {
401 header->snapc->snaps[i] =
402 le64_to_cpu(ondisk->snaps[i].id);
403 header->snap_sizes[i] =
404 le64_to_cpu(ondisk->snaps[i].image_size);
405 }
406
407 /* copy snapshot names */
408 memcpy(header->snap_names, &ondisk->snaps[i],
409 header->snap_names_len);
410 }
411
412 return 0;
413
414err_names:
415 kfree(header->snap_names);
416err_snapc:
417 kfree(header->snapc);
418 return ret;
419}
420
421static int snap_index(struct rbd_image_header *header, int snap_num)
422{
423 return header->total_snaps - snap_num;
424}
425
426static u64 cur_snap_id(struct rbd_device *rbd_dev)
427{
428 struct rbd_image_header *header = &rbd_dev->header;
429
430 if (!rbd_dev->cur_snap)
431 return 0;
432
433 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
434}
435
436static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
437 u64 *seq, u64 *size)
438{
439 int i;
440 char *p = header->snap_names;
441
442 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
443 if (strcmp(snap_name, p) == 0)
444 break;
445 }
446 if (i == header->total_snaps)
447 return -ENOENT;
448 if (seq)
449 *seq = header->snapc->snaps[i];
450
451 if (size)
452 *size = header->snap_sizes[i];
453
454 return i;
455}
456
457static int rbd_header_set_snap(struct rbd_device *dev,
458 const char *snap_name,
459 u64 *size)
460{
461 struct rbd_image_header *header = &dev->header;
462 struct ceph_snap_context *snapc = header->snapc;
463 int ret = -ENOENT;
464
465 down_write(&header->snap_rwsem);
466
467 if (!snap_name ||
468 !*snap_name ||
469 strcmp(snap_name, "-") == 0 ||
470 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
471 if (header->total_snaps)
472 snapc->seq = header->snap_seq;
473 else
474 snapc->seq = 0;
475 dev->cur_snap = 0;
476 dev->read_only = 0;
477 if (size)
478 *size = header->image_size;
479 } else {
480 ret = snap_by_name(header, snap_name, &snapc->seq, size);
481 if (ret < 0)
482 goto done;
483
484 dev->cur_snap = header->total_snaps - ret;
485 dev->read_only = 1;
486 }
487
488 ret = 0;
489done:
490 up_write(&header->snap_rwsem);
491 return ret;
492}
493
494static void rbd_header_free(struct rbd_image_header *header)
495{
496 kfree(header->snapc);
497 kfree(header->snap_names);
498 kfree(header->snap_sizes);
499}
500
501/*
502 * get the actual striped segment name, offset and length
503 */
504static u64 rbd_get_segment(struct rbd_image_header *header,
505 const char *block_name,
506 u64 ofs, u64 len,
507 char *seg_name, u64 *segofs)
508{
509 u64 seg = ofs >> header->obj_order;
510
511 if (seg_name)
512 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
513 "%s.%012llx", block_name, seg);
514
515 ofs = ofs & ((1 << header->obj_order) - 1);
516 len = min_t(u64, len, (1 << header->obj_order) - ofs);
517
518 if (segofs)
519 *segofs = ofs;
520
521 return len;
522}
523
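The striping arithmetic above is easiest to see with numbers. The following minimal user-space sketch (not part of the driver; the obj_order of 22, the image name "foo" and the request geometry are assumptions made purely for illustration) mirrors rbd_get_segment() for a 2 MiB request starting at byte offset 5 MiB:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const char *block_name = "foo";			/* assumed image name */
	int obj_order = 22;				/* assumed: 4 MiB objects */
	uint64_t ofs = 5ULL << 20;			/* request starts at 5 MiB */
	uint64_t len = 2ULL << 20;			/* request length: 2 MiB */

	uint64_t seg = ofs >> obj_order;			/* object index -> 1 */
	uint64_t seg_ofs = ofs & ((1ULL << obj_order) - 1);	/* offset in object -> 1 MiB */
	uint64_t room = (1ULL << obj_order) - seg_ofs;		/* space left -> 3 MiB */
	uint64_t seg_len = len < room ? len : room;		/* clipped length -> 2 MiB */

	/* prints: foo.000000000001 ofs=1048576 len=2097152 */
	printf("%s.%012llx ofs=%llu len=%llu\n", block_name,
	       (unsigned long long)seg, (unsigned long long)seg_ofs,
	       (unsigned long long)seg_len);
	return 0;
}

A request that crossed the 4 MiB object boundary would be clipped here, and rbd_rq_fn() below keeps calling rbd_get_segment() in a loop until the whole request has been issued, one object at a time.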
524/*
525 * bio helpers
526 */
527
528static void bio_chain_put(struct bio *chain)
529{
530 struct bio *tmp;
531
532 while (chain) {
533 tmp = chain;
534 chain = chain->bi_next;
535 bio_put(tmp);
536 }
537}
538
539/*
540 * zeros a bio chain, starting at specific offset
541 */
542static void zero_bio_chain(struct bio *chain, int start_ofs)
543{
544 struct bio_vec *bv;
545 unsigned long flags;
546 void *buf;
547 int i;
548 int pos = 0;
549
550 while (chain) {
551 bio_for_each_segment(bv, chain, i) {
552 if (pos + bv->bv_len > start_ofs) {
553 int remainder = max(start_ofs - pos, 0);
554 buf = bvec_kmap_irq(bv, &flags);
555 memset(buf + remainder, 0,
556 bv->bv_len - remainder);
557 bvec_kunmap_irq(buf, &flags);
558 }
559 pos += bv->bv_len;
560 }
561
562 chain = chain->bi_next;
563 }
564}
565
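For example, with start_ofs = 1024 and a chain whose first bio_vec covers bytes 0..4095, the first pass computes remainder = 1024 and clears only the last 3072 bytes of that segment; every later segment starts at pos >= start_ofs, so remainder is 0 and it is cleared in full. This is what allows a short or missing object read (see rbd_req_cb() below) to hand zero-filled data back to the block layer beyond the bytes actually returned.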
566/*
567 * bio_chain_clone - clone a chain of bios up to a certain length.
568 * might return a bio_pair that will need to be released.
569 */
570static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
571 struct bio_pair **bp,
572 int len, gfp_t gfpmask)
573{
574 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
575 int total = 0;
576
577 if (*bp) {
578 bio_pair_release(*bp);
579 *bp = NULL;
580 }
581
582 while (old_chain && (total < len)) {
583 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
584 if (!tmp)
585 goto err_out;
586
587 if (total + old_chain->bi_size > len) {
588 struct bio_pair *bp;
589
590 /*
 591			 * this split can only happen with a single-page bio;
 592			 * bio_split() will BUG_ON if this is not the case
593 */
594 dout("bio_chain_clone split! total=%d remaining=%d"
 595			     " bi_size=%d\n",
596 (int)total, (int)len-total,
597 (int)old_chain->bi_size);
598
599 /* split the bio. We'll release it either in the next
600 call, or it will have to be released outside */
601 bp = bio_split(old_chain, (len - total) / 512ULL);
602 if (!bp)
603 goto err_out;
604
605 __bio_clone(tmp, &bp->bio1);
606
607 *next = &bp->bio2;
608 } else {
609 __bio_clone(tmp, old_chain);
610 *next = old_chain->bi_next;
611 }
612
613 tmp->bi_bdev = NULL;
614 gfpmask &= ~__GFP_WAIT;
615 tmp->bi_next = NULL;
616
617 if (!new_chain) {
618 new_chain = tail = tmp;
619 } else {
620 tail->bi_next = tmp;
621 tail = tmp;
622 }
623 old_chain = old_chain->bi_next;
624
625 total += tmp->bi_size;
626 }
627
628 BUG_ON(total < len);
629
630 if (tail)
631 tail->bi_next = NULL;
632
633 *old = old_chain;
634
635 return new_chain;
636
637err_out:
638 dout("bio_chain_clone with err\n");
639 bio_chain_put(new_chain);
640 return NULL;
641}
642
643/*
644 * helpers for osd request op vectors.
645 */
646static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
647 int num_ops,
648 int opcode,
649 u32 payload_len)
650{
651 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
652 GFP_NOIO);
653 if (!*ops)
654 return -ENOMEM;
655 (*ops)[0].op = opcode;
656 /*
657 * op extent offset and length will be set later on
658 * in calc_raw_layout()
659 */
660 (*ops)[0].payload_len = payload_len;
661 return 0;
662}
663
664static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
665{
666 kfree(ops);
667}
668
669/*
670 * Send ceph osd request
671 */
672static int rbd_do_request(struct request *rq,
673 struct rbd_device *dev,
674 struct ceph_snap_context *snapc,
675 u64 snapid,
676 const char *obj, u64 ofs, u64 len,
677 struct bio *bio,
678 struct page **pages,
679 int num_pages,
680 int flags,
681 struct ceph_osd_req_op *ops,
682 int num_reply,
683 void (*rbd_cb)(struct ceph_osd_request *req,
684 struct ceph_msg *msg))
685{
686 struct ceph_osd_request *req;
687 struct ceph_file_layout *layout;
688 int ret;
689 u64 bno;
690 struct timespec mtime = CURRENT_TIME;
691 struct rbd_request *req_data;
692 struct ceph_osd_request_head *reqhead;
693 struct rbd_image_header *header = &dev->header;
694
695 ret = -ENOMEM;
696 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
697 if (!req_data)
698 goto done;
699
700 dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
701
702 down_read(&header->snap_rwsem);
703
704 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
705 snapc,
706 ops,
707 false,
708 GFP_NOIO, pages, bio);
709 if (IS_ERR(req)) {
710 up_read(&header->snap_rwsem);
711 ret = PTR_ERR(req);
712 goto done_pages;
713 }
714
715 req->r_callback = rbd_cb;
716
717 req_data->rq = rq;
718 req_data->bio = bio;
719 req_data->pages = pages;
720 req_data->len = len;
721
722 req->r_priv = req_data;
723
724 reqhead = req->r_request->front.iov_base;
725 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
726
727 strncpy(req->r_oid, obj, sizeof(req->r_oid));
728 req->r_oid_len = strlen(req->r_oid);
729
730 layout = &req->r_file_layout;
731 memset(layout, 0, sizeof(*layout));
732 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
733 layout->fl_stripe_count = cpu_to_le32(1);
734 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
735 layout->fl_pg_preferred = cpu_to_le32(-1);
736 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
737 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
738 ofs, &len, &bno, req, ops);
739
740 ceph_osdc_build_request(req, ofs, &len,
741 ops,
742 snapc,
743 &mtime,
744 req->r_oid, req->r_oid_len);
745 up_read(&header->snap_rwsem);
746
747 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
748 if (ret < 0)
749 goto done_err;
750
751 if (!rbd_cb) {
752 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
753 ceph_osdc_put_request(req);
754 }
755 return ret;
756
757done_err:
758 bio_chain_put(req_data->bio);
759 ceph_osdc_put_request(req);
760done_pages:
761 kfree(req_data);
762done:
763 if (rq)
764 blk_end_request(rq, ret, len);
765 return ret;
766}
767
768/*
769 * Ceph osd op callback
770 */
771static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
772{
773 struct rbd_request *req_data = req->r_priv;
774 struct ceph_osd_reply_head *replyhead;
775 struct ceph_osd_op *op;
776 __s32 rc;
777 u64 bytes;
778 int read_op;
779
780 /* parse reply */
781 replyhead = msg->front.iov_base;
782 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
783 op = (void *)(replyhead + 1);
784 rc = le32_to_cpu(replyhead->result);
785 bytes = le64_to_cpu(op->extent.length);
786 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
787
788 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
789
790 if (rc == -ENOENT && read_op) {
791 zero_bio_chain(req_data->bio, 0);
792 rc = 0;
793 } else if (rc == 0 && read_op && bytes < req_data->len) {
794 zero_bio_chain(req_data->bio, bytes);
795 bytes = req_data->len;
796 }
797
798 blk_end_request(req_data->rq, rc, bytes);
799
800 if (req_data->bio)
801 bio_chain_put(req_data->bio);
802
803 ceph_osdc_put_request(req);
804 kfree(req_data);
805}
806
807/*
808 * Do a synchronous ceph osd operation
809 */
810static int rbd_req_sync_op(struct rbd_device *dev,
811 struct ceph_snap_context *snapc,
812 u64 snapid,
813 int opcode,
814 int flags,
815 struct ceph_osd_req_op *orig_ops,
816 int num_reply,
817 const char *obj,
818 u64 ofs, u64 len,
819 char *buf)
820{
821 int ret;
822 struct page **pages;
823 int num_pages;
824 struct ceph_osd_req_op *ops = orig_ops;
825 u32 payload_len;
826
 827	num_pages = calc_pages_for(ofs, len);
828 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
829 if (IS_ERR(pages))
830 return PTR_ERR(pages);
831
832 if (!orig_ops) {
833 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
834 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
835 if (ret < 0)
836 goto done;
837
838 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
839 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
840 if (ret < 0)
841 goto done_ops;
842 }
843 }
844
845 ret = rbd_do_request(NULL, dev, snapc, snapid,
846 obj, ofs, len, NULL,
847 pages, num_pages,
848 flags,
849 ops,
850 2,
851 NULL);
852 if (ret < 0)
853 goto done_ops;
854
855 if ((flags & CEPH_OSD_FLAG_READ) && buf)
856 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
857
858done_ops:
859 if (!orig_ops)
860 rbd_destroy_ops(ops);
861done:
862 ceph_release_page_vector(pages, num_pages);
863 return ret;
864}
865
866/*
867 * Do an asynchronous ceph osd operation
868 */
869static int rbd_do_op(struct request *rq,
 870			  struct rbd_device *rbd_dev,
871 struct ceph_snap_context *snapc,
872 u64 snapid,
873 int opcode, int flags, int num_reply,
874 u64 ofs, u64 len,
875 struct bio *bio)
876{
877 char *seg_name;
878 u64 seg_ofs;
879 u64 seg_len;
880 int ret;
881 struct ceph_osd_req_op *ops;
882 u32 payload_len;
883
884 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
885 if (!seg_name)
886 return -ENOMEM;
887
888 seg_len = rbd_get_segment(&rbd_dev->header,
889 rbd_dev->header.block_name,
890 ofs, len,
891 seg_name, &seg_ofs);
892
893 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
894
895 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
896 if (ret < 0)
897 goto done;
898
899 /* we've taken care of segment sizes earlier when we
900 cloned the bios. We should never have a segment
901 truncated at this point */
902 BUG_ON(seg_len < len);
903
904 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
905 seg_name, seg_ofs, seg_len,
906 bio,
907 NULL, 0,
908 flags,
909 ops,
910 num_reply,
911 rbd_req_cb);
912done:
913 kfree(seg_name);
914 return ret;
915}
916
917/*
918 * Request async osd write
919 */
920static int rbd_req_write(struct request *rq,
921 struct rbd_device *rbd_dev,
922 struct ceph_snap_context *snapc,
923 u64 ofs, u64 len,
924 struct bio *bio)
925{
926 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
927 CEPH_OSD_OP_WRITE,
928 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
929 2,
930 ofs, len, bio);
931}
932
933/*
934 * Request async osd read
935 */
936static int rbd_req_read(struct request *rq,
937 struct rbd_device *rbd_dev,
938 u64 snapid,
939 u64 ofs, u64 len,
940 struct bio *bio)
941{
942 return rbd_do_op(rq, rbd_dev, NULL,
943 (snapid ? snapid : CEPH_NOSNAP),
944 CEPH_OSD_OP_READ,
945 CEPH_OSD_FLAG_READ,
946 2,
947 ofs, len, bio);
948}
949
950/*
951 * Request sync osd read
952 */
953static int rbd_req_sync_read(struct rbd_device *dev,
954 struct ceph_snap_context *snapc,
955 u64 snapid,
956 const char *obj,
957 u64 ofs, u64 len,
958 char *buf)
959{
960 return rbd_req_sync_op(dev, NULL,
961 (snapid ? snapid : CEPH_NOSNAP),
962 CEPH_OSD_OP_READ,
963 CEPH_OSD_FLAG_READ,
964 NULL,
965 1, obj, ofs, len, buf);
966}
967
968/*
 969 * Request sync osd rollback of an object to a given snapshot
970 */
971static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
972 u64 snapid,
973 const char *obj)
974{
975 struct ceph_osd_req_op *ops;
976 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
977 if (ret < 0)
978 return ret;
979
980 ops[0].snap.snapid = snapid;
981
982 ret = rbd_req_sync_op(dev, NULL,
983 CEPH_NOSNAP,
984 0,
985 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
986 ops,
987 1, obj, 0, 0, NULL);
988
989 rbd_destroy_ops(ops);
990
991 if (ret < 0)
992 return ret;
993
994 return ret;
995}
996
997/*
 998 * Request sync osd exec (call a method of an osd class)
999 */
1000static int rbd_req_sync_exec(struct rbd_device *dev,
1001 const char *obj,
1002 const char *cls,
1003 const char *method,
1004 const char *data,
1005 int len)
1006{
1007 struct ceph_osd_req_op *ops;
1008 int cls_len = strlen(cls);
1009 int method_len = strlen(method);
1010 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1011 cls_len + method_len + len);
1012 if (ret < 0)
1013 return ret;
1014
1015 ops[0].cls.class_name = cls;
1016 ops[0].cls.class_len = (__u8)cls_len;
1017 ops[0].cls.method_name = method;
1018 ops[0].cls.method_len = (__u8)method_len;
1019 ops[0].cls.argc = 0;
1020 ops[0].cls.indata = data;
1021 ops[0].cls.indata_len = len;
1022
1023 ret = rbd_req_sync_op(dev, NULL,
1024 CEPH_NOSNAP,
1025 0,
1026 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1027 ops,
1028 1, obj, 0, 0, NULL);
1029
1030 rbd_destroy_ops(ops);
1031
1032 dout("cls_exec returned %d\n", ret);
1033 return ret;
1034}
1035
1036/*
1037 * block device queue callback
1038 */
1039static void rbd_rq_fn(struct request_queue *q)
1040{
1041 struct rbd_device *rbd_dev = q->queuedata;
1042 struct request *rq;
1043 struct bio_pair *bp = NULL;
1044
1045 rq = blk_fetch_request(q);
1046
1047 while (1) {
1048 struct bio *bio;
1049 struct bio *rq_bio, *next_bio = NULL;
1050 bool do_write;
1051 int size, op_size = 0;
1052 u64 ofs;
1053
1054 /* peek at request from block layer */
1055 if (!rq)
1056 break;
1057
1058 dout("fetched request\n");
1059
1060 /* filter out block requests we don't understand */
1061 if ((rq->cmd_type != REQ_TYPE_FS)) {
1062 __blk_end_request_all(rq, 0);
1063 goto next;
1064 }
1065
1066 /* deduce our operation (read, write) */
1067 do_write = (rq_data_dir(rq) == WRITE);
1068
1069 size = blk_rq_bytes(rq);
1070 ofs = blk_rq_pos(rq) * 512ULL;
1071 rq_bio = rq->bio;
1072 if (do_write && rbd_dev->read_only) {
1073 __blk_end_request_all(rq, -EROFS);
1074 goto next;
1075 }
1076
1077 spin_unlock_irq(q->queue_lock);
1078
1079 dout("%s 0x%x bytes at 0x%llx\n",
1080 do_write ? "write" : "read",
1081 size, blk_rq_pos(rq) * 512ULL);
1082
1083 do {
1084 /* a bio clone to be passed down to OSD req */
1085 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1086 op_size = rbd_get_segment(&rbd_dev->header,
1087 rbd_dev->header.block_name,
1088 ofs, size,
1089 NULL, NULL);
1090 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1091 op_size, GFP_ATOMIC);
1092 if (!bio) {
1093 spin_lock_irq(q->queue_lock);
1094 __blk_end_request_all(rq, -ENOMEM);
1095 goto next;
1096 }
1097
1098 /* init OSD command: write or read */
1099 if (do_write)
1100 rbd_req_write(rq, rbd_dev,
1101 rbd_dev->header.snapc,
1102 ofs,
1103 op_size, bio);
1104 else
1105 rbd_req_read(rq, rbd_dev,
1106 cur_snap_id(rbd_dev),
1107 ofs,
1108 op_size, bio);
1109
1110 size -= op_size;
1111 ofs += op_size;
1112
1113 rq_bio = next_bio;
1114 } while (size > 0);
1115
1116 if (bp)
1117 bio_pair_release(bp);
1118
1119 spin_lock_irq(q->queue_lock);
1120next:
1121 rq = blk_fetch_request(q);
1122 }
1123}
1124
1125/*
1126 * a queue callback. Makes sure that we don't create a bio that spans across
1127 * multiple osd objects. One exception would be single-page bios,
1128 * which we handle later in bio_chain_clone()
1129 */
1130static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1131 struct bio_vec *bvec)
1132{
1133 struct rbd_device *rbd_dev = q->queuedata;
1134 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1135 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1136 unsigned int bio_sectors = bmd->bi_size >> 9;
1137 int max;
1138
1139 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1140 + bio_sectors)) << 9;
1141 if (max < 0)
1142 max = 0; /* bio_add cannot handle a negative return */
1143 if (max <= bvec->bv_len && bio_sectors == 0)
1144 return bvec->bv_len;
1145 return max;
1146}
1147
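Worked example (values assumed for illustration): with obj_order = 22, chunk_sectors is 1 << 13 = 8192. For a bio that starts 7936 sectors into its object and already holds 64 sectors, max = (8192 - (7936 + 64)) << 9 = 192 << 9 = 98304, so at most 96 KiB more may be merged into that bio before the object boundary forces the block layer to start a new one.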
1148static void rbd_free_disk(struct rbd_device *rbd_dev)
1149{
1150 struct gendisk *disk = rbd_dev->disk;
1151
1152 if (!disk)
1153 return;
1154
1155 rbd_header_free(&rbd_dev->header);
1156
1157 if (disk->flags & GENHD_FL_UP)
1158 del_gendisk(disk);
1159 if (disk->queue)
1160 blk_cleanup_queue(disk->queue);
1161 put_disk(disk);
1162}
1163
1164/*
1165 * reload the on-disk header
1166 */
1167static int rbd_read_header(struct rbd_device *rbd_dev,
1168 struct rbd_image_header *header)
1169{
1170 ssize_t rc;
1171 struct rbd_image_header_ondisk *dh;
1172 int snap_count = 0;
1173 u64 snap_names_len = 0;
1174
1175 while (1) {
1176 int len = sizeof(*dh) +
1177 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1178 snap_names_len;
1179
1180 rc = -ENOMEM;
1181 dh = kmalloc(len, GFP_KERNEL);
1182 if (!dh)
1183 return -ENOMEM;
1184
1185 rc = rbd_req_sync_read(rbd_dev,
1186 NULL, CEPH_NOSNAP,
1187 rbd_dev->obj_md_name,
1188 0, len,
1189 (char *)dh);
1190 if (rc < 0)
1191 goto out_dh;
1192
1193 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
1194 if (rc < 0)
1195 goto out_dh;
1196
1197 if (snap_count != header->total_snaps) {
1198 snap_count = header->total_snaps;
1199 snap_names_len = header->snap_names_len;
1200 rbd_header_free(header);
1201 kfree(dh);
1202 continue;
1203 }
1204 break;
1205 }
1206
1207out_dh:
1208 kfree(dh);
1209 return rc;
1210}
1211
1212/*
1213 * create a snapshot
1214 */
1215static int rbd_header_add_snap(struct rbd_device *dev,
1216 const char *snap_name,
1217 gfp_t gfp_flags)
1218{
1219 int name_len = strlen(snap_name);
1220 u64 new_snapid;
1221 int ret;
1222 void *data, *data_start, *data_end;
1223
1224 /* we should create a snapshot only if we're pointing at the head */
1225 if (dev->cur_snap)
1226 return -EINVAL;
1227
1228 ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1229 &new_snapid);
1230 dout("created snapid=%lld\n", new_snapid);
1231 if (ret < 0)
1232 return ret;
1233
1234 data = kmalloc(name_len + 16, gfp_flags);
1235 if (!data)
1236 return -ENOMEM;
1237
1238 data_start = data;
1239 data_end = data + name_len + 16;
1240
1241 ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
1242 ceph_encode_64_safe(&data, data_end, new_snapid, bad);
1243
1244 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1245 data_start, data - data_start);
1246
1247 kfree(data_start);
1248
1249 if (ret < 0)
1250 return ret;
1251
1252 dev->header.snapc->seq = new_snapid;
1253
1254 return 0;
1255bad:
1256 return -ERANGE;
1257}
1258
1259/*
1260 * only read the first part of the on-disk header, without the snaps info
1261 */
1262static int rbd_update_snaps(struct rbd_device *rbd_dev)
1263{
1264 int ret;
1265 struct rbd_image_header h;
1266 u64 snap_seq;
1267
1268 ret = rbd_read_header(rbd_dev, &h);
1269 if (ret < 0)
1270 return ret;
1271
1272 down_write(&rbd_dev->header.snap_rwsem);
1273
1274 snap_seq = rbd_dev->header.snapc->seq;
1275
1276 kfree(rbd_dev->header.snapc);
1277 kfree(rbd_dev->header.snap_names);
1278 kfree(rbd_dev->header.snap_sizes);
1279
1280 rbd_dev->header.total_snaps = h.total_snaps;
1281 rbd_dev->header.snapc = h.snapc;
1282 rbd_dev->header.snap_names = h.snap_names;
1283 rbd_dev->header.snap_sizes = h.snap_sizes;
1284 rbd_dev->header.snapc->seq = snap_seq;
1285
1286 up_write(&rbd_dev->header.snap_rwsem);
1287
1288 return 0;
1289}
1290
1291static int rbd_init_disk(struct rbd_device *rbd_dev)
1292{
1293 struct gendisk *disk;
1294 struct request_queue *q;
1295 int rc;
1296 u64 total_size = 0;
1297
1298 /* contact OSD, request size info about the object being mapped */
1299 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1300 if (rc)
1301 return rc;
1302
1303 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1304 if (rc)
1305 return rc;
1306
1307 /* create gendisk info */
1308 rc = -ENOMEM;
1309 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1310 if (!disk)
1311 goto out;
1312
1313 sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
1314 disk->major = rbd_dev->major;
1315 disk->first_minor = 0;
1316 disk->fops = &rbd_bd_ops;
1317 disk->private_data = rbd_dev;
1318
1319 /* init rq */
1320 rc = -ENOMEM;
1321 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1322 if (!q)
1323 goto out_disk;
1324 blk_queue_merge_bvec(q, rbd_merge_bvec);
1325 disk->queue = q;
1326
1327 q->queuedata = rbd_dev;
1328
1329 rbd_dev->disk = disk;
1330 rbd_dev->q = q;
1331
1332 /* finally, announce the disk to the world */
1333 set_capacity(disk, total_size / 512ULL);
1334 add_disk(disk);
1335
1336 pr_info("%s: added with size 0x%llx\n",
1337 disk->disk_name, (unsigned long long)total_size);
1338 return 0;
1339
1340out_disk:
1341 put_disk(disk);
1342out:
1343 return rc;
1344}
1345
1346/********************************************************************
1347 * /sys/class/rbd/
1348 * add map rados objects to blkdev
1349 * remove unmap rados objects
1350 * list show mappings
1351 *******************************************************************/
1352
1353static void class_rbd_release(struct class *cls)
1354{
1355 kfree(cls);
1356}
1357
1358static ssize_t class_rbd_list(struct class *c,
1359 struct class_attribute *attr,
1360 char *data)
1361{
1362 int n = 0;
1363 struct list_head *tmp;
1364 int max = PAGE_SIZE;
1365
1366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1367
1368 n += snprintf(data, max,
1369 "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
1370
1371 list_for_each(tmp, &rbd_dev_list) {
1372 struct rbd_device *rbd_dev;
1373
1374 rbd_dev = list_entry(tmp, struct rbd_device, node);
1375 n += snprintf(data+n, max-n,
1376 "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
1377 rbd_dev->id,
1378 rbd_dev->major,
1379 ceph_client_id(rbd_dev->client),
1380 rbd_dev->pool_name,
1381 rbd_dev->obj, rbd_dev->snap_name,
1382 rbd_dev->header.image_size >> 10);
1383 if (n == max)
1384 break;
1385 }
1386
1387 mutex_unlock(&ctl_mutex);
1388 return n;
1389}
1390
1391static ssize_t class_rbd_add(struct class *c,
1392 struct class_attribute *attr,
1393 const char *buf, size_t count)
1394{
1395 struct ceph_osd_client *osdc;
1396 struct rbd_device *rbd_dev;
1397 ssize_t rc = -ENOMEM;
1398 int irc, new_id = 0;
1399 struct list_head *tmp;
1400 char *mon_dev_name;
1401 char *options;
1402
1403 if (!try_module_get(THIS_MODULE))
1404 return -ENODEV;
1405
1406 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1407 if (!mon_dev_name)
1408 goto err_out_mod;
1409
1410 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1411 if (!options)
1412 goto err_mon_dev;
1413
1414 /* new rbd_device object */
1415 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
1416 if (!rbd_dev)
1417 goto err_out_opt;
1418
1419 /* static rbd_device initialization */
1420 spin_lock_init(&rbd_dev->lock);
1421 INIT_LIST_HEAD(&rbd_dev->node);
1422
1423 /* generate unique id: find highest unique id, add one */
1424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1425
1426 list_for_each(tmp, &rbd_dev_list) {
1427 struct rbd_device *rbd_dev;
1428
1429 rbd_dev = list_entry(tmp, struct rbd_device, node);
1430 if (rbd_dev->id >= new_id)
1431 new_id = rbd_dev->id + 1;
1432 }
1433
1434 rbd_dev->id = new_id;
1435
1436 /* add to global list */
1437 list_add_tail(&rbd_dev->node, &rbd_dev_list);
1438
1439 /* parse add command */
1440 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
1441 "%" __stringify(RBD_MAX_OPT_LEN) "s "
1442 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
1443 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
1444 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1445 mon_dev_name, options, rbd_dev->pool_name,
1446 rbd_dev->obj, rbd_dev->snap_name) < 4) {
1447 rc = -EINVAL;
1448 goto err_out_slot;
1449 }
1450
1451 if (rbd_dev->snap_name[0] == 0)
1452 rbd_dev->snap_name[0] = '-';
1453
1454 rbd_dev->obj_len = strlen(rbd_dev->obj);
1455 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
1456 rbd_dev->obj, RBD_SUFFIX);
1457
1458 /* initialize rest of new object */
1459 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
1460 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
1461 if (rc < 0)
1462 goto err_out_slot;
1463
1464 mutex_unlock(&ctl_mutex);
1465
1466 /* pick the pool */
1467 osdc = &rbd_dev->client->osdc;
1468 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
1469 if (rc < 0)
1470 goto err_out_client;
1471 rbd_dev->poolid = rc;
1472
1473 /* register our block device */
1474 irc = register_blkdev(0, rbd_dev->name);
1475 if (irc < 0) {
1476 rc = irc;
1477 goto err_out_client;
1478 }
1479 rbd_dev->major = irc;
1480
1481 /* set up and announce blkdev mapping */
1482 rc = rbd_init_disk(rbd_dev);
1483 if (rc)
1484 goto err_out_blkdev;
1485
1486 return count;
1487
1488err_out_blkdev:
1489 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1490err_out_client:
1491 rbd_put_client(rbd_dev);
1492 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1493err_out_slot:
1494 list_del_init(&rbd_dev->node);
1495 mutex_unlock(&ctl_mutex);
1496
1497 kfree(rbd_dev);
1498err_out_opt:
1499 kfree(options);
1500err_mon_dev:
1501 kfree(mon_dev_name);
1502err_out_mod:
1503 dout("Error adding device %s\n", buf);
1504 module_put(THIS_MODULE);
1505 return rc;
1506}
1507
1508static struct rbd_device *__rbd_get_dev(unsigned long id)
1509{
1510 struct list_head *tmp;
1511 struct rbd_device *rbd_dev;
1512
1513 list_for_each(tmp, &rbd_dev_list) {
1514 rbd_dev = list_entry(tmp, struct rbd_device, node);
1515 if (rbd_dev->id == id)
1516 return rbd_dev;
1517 }
1518 return NULL;
1519}
1520
1521static ssize_t class_rbd_remove(struct class *c,
1522 struct class_attribute *attr,
1523 const char *buf,
1524 size_t count)
1525{
1526 struct rbd_device *rbd_dev = NULL;
1527 int target_id, rc;
1528 unsigned long ul;
1529
1530 rc = strict_strtoul(buf, 10, &ul);
1531 if (rc)
1532 return rc;
1533
1534 /* convert to int; abort if we lost anything in the conversion */
1535 target_id = (int) ul;
1536 if (target_id != ul)
1537 return -EINVAL;
1538
1539 /* remove object from list immediately */
1540 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1541
1542 rbd_dev = __rbd_get_dev(target_id);
1543 if (rbd_dev)
1544 list_del_init(&rbd_dev->node);
1545
1546 mutex_unlock(&ctl_mutex);
1547
1548 if (!rbd_dev)
1549 return -ENOENT;
1550
1551 rbd_put_client(rbd_dev);
1552
1553 /* clean up and free blkdev */
1554 rbd_free_disk(rbd_dev);
1555 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1556 kfree(rbd_dev);
1557
1558 /* release module ref */
1559 module_put(THIS_MODULE);
1560
1561 return count;
1562}
1563
1564static ssize_t class_rbd_snaps_list(struct class *c,
1565 struct class_attribute *attr,
1566 char *data)
1567{
1568 struct rbd_device *rbd_dev = NULL;
1569 struct list_head *tmp;
1570 struct rbd_image_header *header;
1571 int i, n = 0, max = PAGE_SIZE;
1572 int ret;
1573
1574 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1575
1576 n += snprintf(data, max, "#id\tsnap\tKB\n");
1577
1578 list_for_each(tmp, &rbd_dev_list) {
1579 char *names, *p;
1580 struct ceph_snap_context *snapc;
1581
1582 rbd_dev = list_entry(tmp, struct rbd_device, node);
1583 header = &rbd_dev->header;
1584
1585 down_read(&header->snap_rwsem);
1586
1587 names = header->snap_names;
1588 snapc = header->snapc;
1589
1590 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1591 rbd_dev->id, RBD_SNAP_HEAD_NAME,
1592 header->image_size >> 10,
1593 (!rbd_dev->cur_snap ? " (*)" : ""));
1594 if (n == max)
1595 break;
1596
1597 p = names;
1598 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
1599 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1600 rbd_dev->id, p, header->snap_sizes[i] >> 10,
1601 (rbd_dev->cur_snap &&
1602 (snap_index(header, i) == rbd_dev->cur_snap) ?
1603 " (*)" : ""));
1604 if (n == max)
1605 break;
1606 }
1607
1608 up_read(&header->snap_rwsem);
1609 }
1610
1611
1612 ret = n;
1613 mutex_unlock(&ctl_mutex);
1614 return ret;
1615}
1616
1617static ssize_t class_rbd_snaps_refresh(struct class *c,
1618 struct class_attribute *attr,
1619 const char *buf,
1620 size_t count)
1621{
1622 struct rbd_device *rbd_dev = NULL;
1623 int target_id, rc;
1624 unsigned long ul;
1625 int ret = count;
1626
1627 rc = strict_strtoul(buf, 10, &ul);
1628 if (rc)
1629 return rc;
1630
1631 /* convert to int; abort if we lost anything in the conversion */
1632 target_id = (int) ul;
1633 if (target_id != ul)
1634 return -EINVAL;
1635
1636 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1637
1638 rbd_dev = __rbd_get_dev(target_id);
1639 if (!rbd_dev) {
1640 ret = -ENOENT;
1641 goto done;
1642 }
1643
1644 rc = rbd_update_snaps(rbd_dev);
1645 if (rc < 0)
1646 ret = rc;
1647
1648done:
1649 mutex_unlock(&ctl_mutex);
1650 return ret;
1651}
1652
1653static ssize_t class_rbd_snap_create(struct class *c,
1654 struct class_attribute *attr,
1655 const char *buf,
1656 size_t count)
1657{
1658 struct rbd_device *rbd_dev = NULL;
1659 int target_id, ret;
1660 char *name;
1661
1662 name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
1663 if (!name)
1664 return -ENOMEM;
1665
1666 /* parse snaps add command */
1667 if (sscanf(buf, "%d "
1668 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1669 &target_id,
1670 name) != 2) {
1671 ret = -EINVAL;
1672 goto done;
1673 }
1674
1675 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1676
1677 rbd_dev = __rbd_get_dev(target_id);
1678 if (!rbd_dev) {
1679 ret = -ENOENT;
1680 goto done_unlock;
1681 }
1682
1683 ret = rbd_header_add_snap(rbd_dev,
1684 name, GFP_KERNEL);
1685 if (ret < 0)
1686 goto done_unlock;
1687
1688 ret = rbd_update_snaps(rbd_dev);
1689 if (ret < 0)
1690 goto done_unlock;
1691
1692 ret = count;
1693done_unlock:
1694 mutex_unlock(&ctl_mutex);
1695done:
1696 kfree(name);
1697 return ret;
1698}
1699
1700static ssize_t class_rbd_rollback(struct class *c,
1701 struct class_attribute *attr,
1702 const char *buf,
1703 size_t count)
1704{
1705 struct rbd_device *rbd_dev = NULL;
1706 int target_id, ret;
1707 u64 snapid;
1708	char snap_name[RBD_MAX_SNAP_NAME_LEN + 1];
1709 u64 cur_ofs;
1710 char *seg_name;
1711
1712	/* parse snap rollback command */
1713 if (sscanf(buf, "%d "
1714 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1715 &target_id,
1716 snap_name) != 2) {
1717 return -EINVAL;
1718 }
1719
1720 ret = -ENOMEM;
1721 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1722 if (!seg_name)
1723 return ret;
1724
1725 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1726
1727 rbd_dev = __rbd_get_dev(target_id);
1728 if (!rbd_dev) {
1729 ret = -ENOENT;
1730 goto done_unlock;
1731 }
1732
1733 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
1734 if (ret < 0)
1735 goto done_unlock;
1736
1737 dout("snapid=%lld\n", snapid);
1738
1739 cur_ofs = 0;
1740 while (cur_ofs < rbd_dev->header.image_size) {
1741 cur_ofs += rbd_get_segment(&rbd_dev->header,
1742 rbd_dev->obj,
1743 cur_ofs, (u64)-1,
1744 seg_name, NULL);
1745 dout("seg_name=%s\n", seg_name);
1746
1747 ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
1748 if (ret < 0)
1749 pr_warning("could not roll back obj %s err=%d\n",
1750 seg_name, ret);
1751 }
1752
1753 ret = rbd_update_snaps(rbd_dev);
1754 if (ret < 0)
1755 goto done_unlock;
1756
1757 ret = count;
1758
1759done_unlock:
1760 mutex_unlock(&ctl_mutex);
1761 kfree(seg_name);
1762
1763 return ret;
1764}
1765
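The snapshot control files take similarly simple commands: per the sscanf() calls above, snap_create expects "<id> <name>", snaps_refresh just "<id>", and snap_rollback "<id> <snapname>". A minimal usage sketch, reusing the hypothetical rbd_ctl_write() helper from the start of this section (device id and snapshot name are examples only):

/* Take, refresh and roll back a snapshot on device id 0 (illustration only). */
static int rbd_snapshot_demo(void)
{
	if (rbd_ctl_write("snap_create", "0 backup1"))
		return -1;
	if (rbd_ctl_write("snaps_refresh", "0"))
		return -1;
	return rbd_ctl_write("snap_rollback", "0 backup1");
}
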
1766static struct class_attribute class_rbd_attrs[] = {
1767 __ATTR(add, 0200, NULL, class_rbd_add),
1768 __ATTR(remove, 0200, NULL, class_rbd_remove),
1769 __ATTR(list, 0444, class_rbd_list, NULL),
1770 __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
1771 __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
1772 __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
1773 __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
1774 __ATTR_NULL
1775};
1776
1777/*
1778 * create control files in sysfs
1779 * /sys/class/rbd/...
1780 */
1781static int rbd_sysfs_init(void)
1782{
1783 int ret = -ENOMEM;
1784
1785 class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
1786 if (!class_rbd)
1787 goto out;
1788
1789 class_rbd->name = DRV_NAME;
1790 class_rbd->owner = THIS_MODULE;
1791 class_rbd->class_release = class_rbd_release;
1792 class_rbd->class_attrs = class_rbd_attrs;
1793
1794 ret = class_register(class_rbd);
1795 if (ret)
1796 goto out_class;
1797 return 0;
1798
1799out_class:
1800 kfree(class_rbd);
1801 class_rbd = NULL;
1802 pr_err(DRV_NAME ": failed to create class rbd\n");
1803out:
1804 return ret;
1805}
1806
1807static void rbd_sysfs_cleanup(void)
1808{
1809 if (class_rbd)
1810 class_destroy(class_rbd);
1811 class_rbd = NULL;
1812}
1813
1814int __init rbd_init(void)
1815{
1816 int rc;
1817
1818 rc = rbd_sysfs_init();
1819 if (rc)
1820 return rc;
1821 spin_lock_init(&node_lock);
1822 pr_info("loaded " DRV_NAME_LONG "\n");
1823 return 0;
1824}
1825
1826void __exit rbd_exit(void)
1827{
1828 rbd_sysfs_cleanup();
1829}
1830
1831module_init(rbd_init);
1832module_exit(rbd_exit);
1833
1834MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1835MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1836MODULE_DESCRIPTION("rados block device");
1837
1838/* following authorship retained from original osdblk.c */
1839MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
1840
1841MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#ifndef CEPH_RBD_TYPES_H
14#define CEPH_RBD_TYPES_H
15
16#include <linux/types.h>
17
18/*
19 * rbd image 'foo' consists of objects
20 * foo.rbd - image metadata
21 * foo.00000000
22 * foo.00000001
23 * ... - data
24 */
25
26#define RBD_SUFFIX ".rbd"
27#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info"
29
30#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
31#define RBD_MIN_OBJ_ORDER 16
32#define RBD_MAX_OBJ_ORDER 30
33
34#define RBD_MAX_OBJ_NAME_LEN 96
35#define RBD_MAX_SEG_NAME_LEN 128
36
37#define RBD_COMP_NONE 0
38#define RBD_CRYPT_NONE 0
39
40#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
41#define RBD_HEADER_SIGNATURE "RBD"
42#define RBD_HEADER_VERSION "001.005"
43
44struct rbd_info {
45 __le64 max_id;
46} __attribute__ ((packed));
47
48struct rbd_image_snap_ondisk {
49 __le64 id;
50 __le64 image_size;
51} __attribute__((packed));
52
53struct rbd_image_header_ondisk {
54 char text[40];
55 char block_name[24];
56 char signature[4];
57 char version[8];
58 struct {
59 __u8 order;
60 __u8 crypt_type;
61 __u8 comp_type;
62 __u8 unused;
63 } __attribute__((packed)) options;
64 __le64 image_size;
65 __le64 snap_seq;
66 __le32 snap_count;
67 __le32 reserved;
68 __le64 snap_names_len;
69 struct rbd_image_snap_ondisk snaps[0];
70} __attribute__((packed));
71
72
73#endif
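
To make the on-disk layout above concrete: image data is striped over fixed-size objects of 2^order bytes (options.order, RBD_DEFAULT_OBJ_ORDER = 22, i.e. 4 MB), each named by appending a zero-padded segment number to the image's block_name, as in the "foo.00000000" example in the comment at the top of this header. The sketch below shows that mapping; the radix and padding width of the suffix are assumptions for illustration, not taken from this file.

#include <stdio.h>

/*
 * Illustration only: name of the data object holding byte offset "ofs"
 * of an image whose objects are 2^order bytes.  The "%08llx" suffix
 * format is an assumption; the driver's own segment-name formatting
 * may differ.
 */
static void rbd_segment_name(char *buf, size_t len, const char *block_name,
			     unsigned char order, unsigned long long ofs)
{
	snprintf(buf, len, "%s.%08llx", block_name, ofs >> order);
}

/* e.g. order 22 (4 MB objects): offset 0x01000000 falls in "<block_name>.00000004" */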
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 2e46815876df..75333d0a3327 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -20,7 +20,7 @@
20#include <linux/fd.h> 20#include <linux/fd.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/smp_lock.h> 23#include <linux/mutex.h>
24#include <linux/hdreg.h> 24#include <linux/hdreg.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/delay.h> 26#include <linux/delay.h>
@@ -222,6 +222,7 @@ extern int swim_read_sector_header(struct swim __iomem *base,
222extern int swim_read_sector_data(struct swim __iomem *base, 222extern int swim_read_sector_data(struct swim __iomem *base,
223 unsigned char *data); 223 unsigned char *data);
224 224
225static DEFINE_MUTEX(swim_mutex);
225static inline void set_swim_mode(struct swim __iomem *base, int enable) 226static inline void set_swim_mode(struct swim __iomem *base, int enable)
226{ 227{
227 struct iwm __iomem *iwm_base; 228 struct iwm __iomem *iwm_base;
@@ -666,9 +667,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
666{ 667{
667 int ret; 668 int ret;
668 669
669 lock_kernel(); 670 mutex_lock(&swim_mutex);
670 ret = floppy_open(bdev, mode); 671 ret = floppy_open(bdev, mode);
671 unlock_kernel(); 672 mutex_unlock(&swim_mutex);
672 673
673 return ret; 674 return ret;
674} 675}
@@ -678,7 +679,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
678 struct floppy_state *fs = disk->private_data; 679 struct floppy_state *fs = disk->private_data;
679 struct swim __iomem *base = fs->swd->base; 680 struct swim __iomem *base = fs->swd->base;
680 681
681 lock_kernel(); 682 mutex_lock(&swim_mutex);
682 if (fs->ref_count < 0) 683 if (fs->ref_count < 0)
683 fs->ref_count = 0; 684 fs->ref_count = 0;
684 else if (fs->ref_count > 0) 685 else if (fs->ref_count > 0)
@@ -686,7 +687,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
686 687
687 if (fs->ref_count == 0) 688 if (fs->ref_count == 0)
688 swim_motor(base, OFF); 689 swim_motor(base, OFF);
689 unlock_kernel(); 690 mutex_unlock(&swim_mutex);
690 691
691 return 0; 692 return 0;
692} 693}
@@ -704,9 +705,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
704 case FDEJECT: 705 case FDEJECT:
705 if (fs->ref_count != 1) 706 if (fs->ref_count != 1)
706 return -EBUSY; 707 return -EBUSY;
707 lock_kernel(); 708 mutex_lock(&swim_mutex);
708 err = floppy_eject(fs); 709 err = floppy_eject(fs);
709 unlock_kernel(); 710 mutex_unlock(&swim_mutex);
710 return err; 711 return err;
711 712
712 case FDGETPRM: 713 case FDGETPRM:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index cc6a3864822c..bf3a5b859299 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -25,7 +25,7 @@
25#include <linux/ioctl.h> 25#include <linux/ioctl.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/smp_lock.h> 28#include <linux/mutex.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <asm/io.h> 31#include <asm/io.h>
@@ -36,6 +36,7 @@
36#include <asm/machdep.h> 36#include <asm/machdep.h>
37#include <asm/pmac_feature.h> 37#include <asm/pmac_feature.h>
38 38
39static DEFINE_MUTEX(swim3_mutex);
39static struct request_queue *swim3_queue; 40static struct request_queue *swim3_queue;
40static struct gendisk *disks[2]; 41static struct gendisk *disks[2];
41static struct request *fd_req; 42static struct request *fd_req;
@@ -873,9 +874,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
873{ 874{
874 int ret; 875 int ret;
875 876
876 lock_kernel(); 877 mutex_lock(&swim3_mutex);
877 ret = floppy_locked_ioctl(bdev, mode, cmd, param); 878 ret = floppy_locked_ioctl(bdev, mode, cmd, param);
878 unlock_kernel(); 879 mutex_unlock(&swim3_mutex);
879 880
880 return ret; 881 return ret;
881} 882}
@@ -953,9 +954,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
953{ 954{
954 int ret; 955 int ret;
955 956
956 lock_kernel(); 957 mutex_lock(&swim3_mutex);
957 ret = floppy_open(bdev, mode); 958 ret = floppy_open(bdev, mode);
958 unlock_kernel(); 959 mutex_unlock(&swim3_mutex);
959 960
960 return ret; 961 return ret;
961} 962}
@@ -964,13 +965,13 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
964{ 965{
965 struct floppy_state *fs = disk->private_data; 966 struct floppy_state *fs = disk->private_data;
966 struct swim3 __iomem *sw = fs->swim3; 967 struct swim3 __iomem *sw = fs->swim3;
967 lock_kernel(); 968 mutex_lock(&swim3_mutex);
968 if (fs->ref_count > 0 && --fs->ref_count == 0) { 969 if (fs->ref_count > 0 && --fs->ref_count == 0) {
969 swim3_action(fs, MOTOR_OFF); 970 swim3_action(fs, MOTOR_OFF);
970 out_8(&sw->control_bic, 0xff); 971 out_8(&sw->control_bic, 0xff);
971 swim3_select(fs, RELAX); 972 swim3_select(fs, RELAX);
972 } 973 }
973 unlock_kernel(); 974 mutex_unlock(&swim3_mutex);
974 return 0; 975 return 0;
975} 976}
976 977
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index f3fd454e28f9..9ae3bb713286 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -28,7 +28,7 @@
28#include <linux/timer.h> 28#include <linux/timer.h>
29#include <linux/scatterlist.h> 29#include <linux/scatterlist.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/smp_lock.h> 31#include <linux/mutex.h>
32#include <scsi/scsi.h> 32#include <scsi/scsi.h>
33 33
34#define DRV_NAME "ub" 34#define DRV_NAME "ub"
@@ -248,6 +248,7 @@ struct ub_completion {
248 spinlock_t lock; 248 spinlock_t lock;
249}; 249};
250 250
251static DEFINE_MUTEX(ub_mutex);
251static inline void ub_init_completion(struct ub_completion *x) 252static inline void ub_init_completion(struct ub_completion *x)
252{ 253{
253 x->done = 0; 254 x->done = 0;
@@ -1715,9 +1716,9 @@ static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode)
1715{ 1716{
1716 int ret; 1717 int ret;
1717 1718
1718 lock_kernel(); 1719 mutex_lock(&ub_mutex);
1719 ret = ub_bd_open(bdev, mode); 1720 ret = ub_bd_open(bdev, mode);
1720 unlock_kernel(); 1721 mutex_unlock(&ub_mutex);
1721 1722
1722 return ret; 1723 return ret;
1723} 1724}
@@ -1730,9 +1731,9 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
1730 struct ub_lun *lun = disk->private_data; 1731 struct ub_lun *lun = disk->private_data;
1731 struct ub_dev *sc = lun->udev; 1732 struct ub_dev *sc = lun->udev;
1732 1733
1733 lock_kernel(); 1734 mutex_lock(&ub_mutex);
1734 ub_put(sc); 1735 ub_put(sc);
1735 unlock_kernel(); 1736 mutex_unlock(&ub_mutex);
1736 1737
1737 return 0; 1738 return 0;
1738} 1739}
@@ -1747,9 +1748,9 @@ static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
1747 void __user *usermem = (void __user *) arg; 1748 void __user *usermem = (void __user *) arg;
1748 int ret; 1749 int ret;
1749 1750
1750 lock_kernel(); 1751 mutex_lock(&ub_mutex);
1751 ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem); 1752 ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
1752 unlock_kernel(); 1753 mutex_unlock(&ub_mutex);
1753 1754
1754 return ret; 1755 return ret;
1755} 1756}
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index f651e51a3319..e2ff697697c2 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -41,7 +41,7 @@
41#include <linux/errno.h> 41#include <linux/errno.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/string.h> 43#include <linux/string.h>
44#include <linux/smp_lock.h> 44#include <linux/mutex.h>
45#include <linux/dma-mapping.h> 45#include <linux/dma-mapping.h>
46#include <linux/completion.h> 46#include <linux/completion.h>
47#include <linux/device.h> 47#include <linux/device.h>
@@ -73,6 +73,7 @@ enum {
73 MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name) 73 MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
74}; 74};
75 75
76static DEFINE_MUTEX(viodasd_mutex);
76static DEFINE_SPINLOCK(viodasd_spinlock); 77static DEFINE_SPINLOCK(viodasd_spinlock);
77 78
78#define VIOMAXREQ 16 79#define VIOMAXREQ 16
@@ -180,9 +181,9 @@ static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
180{ 181{
181 int ret; 182 int ret;
182 183
183 lock_kernel(); 184 mutex_lock(&viodasd_mutex);
184 ret = viodasd_open(bdev, mode); 185 ret = viodasd_open(bdev, mode);
185 unlock_kernel(); 186 mutex_unlock(&viodasd_mutex);
186 187
187 return ret; 188 return ret;
188} 189}
@@ -196,7 +197,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
196 struct viodasd_device *d = disk->private_data; 197 struct viodasd_device *d = disk->private_data;
197 HvLpEvent_Rc hvrc; 198 HvLpEvent_Rc hvrc;
198 199
199 lock_kernel(); 200 mutex_lock(&viodasd_mutex);
200 /* Send the event to OS/400. We DON'T expect a response */ 201 /* Send the event to OS/400. We DON'T expect a response */
201 hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, 202 hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
202 HvLpEvent_Type_VirtualIo, 203 HvLpEvent_Type_VirtualIo,
@@ -210,7 +211,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
210 if (hvrc != 0) 211 if (hvrc != 0)
211 pr_warning("HV close call failed %d\n", (int)hvrc); 212 pr_warning("HV close call failed %d\n", (int)hvrc);
212 213
213 unlock_kernel(); 214 mutex_unlock(&viodasd_mutex);
214 215
215 return 0; 216 return 0;
216} 217}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..6ecf89cdf006 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
2#include <linux/spinlock.h> 2#include <linux/spinlock.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/blkdev.h> 4#include <linux/blkdev.h>
5#include <linux/smp_lock.h>
6#include <linux/hdreg.h> 5#include <linux/hdreg.h>
7#include <linux/virtio.h> 6#include <linux/virtio.h>
8#include <linux/virtio_blk.h> 7#include <linux/virtio_blk.h>
@@ -128,9 +127,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
128 } 127 }
129 } 128 }
130 129
131 if (vbr->req->cmd_flags & REQ_HARDBARRIER)
132 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
133
134 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 130 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
135 131
136 /* 132 /*
@@ -222,8 +218,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
222 return err; 218 return err;
223} 219}
224 220
225static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, 221static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
226 unsigned cmd, unsigned long data) 222 unsigned int cmd, unsigned long data)
227{ 223{
228 struct gendisk *disk = bdev->bd_disk; 224 struct gendisk *disk = bdev->bd_disk;
229 struct virtio_blk *vblk = disk->private_data; 225 struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +234,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
238 (void __user *)data); 234 (void __user *)data);
239} 235}
240 236
241static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
242 unsigned int cmd, unsigned long param)
243{
244 int ret;
245
246 lock_kernel();
247 ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
248 unlock_kernel();
249
250 return ret;
251}
252
253/* We provide getgeo only to please some old bootloader/partitioning tools */ 237/* We provide getgeo only to please some old bootloader/partitioning tools */
254static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) 238static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
255{ 239{
@@ -392,31 +376,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
392 vblk->disk->driverfs_dev = &vdev->dev; 376 vblk->disk->driverfs_dev = &vdev->dev;
393 index++; 377 index++;
394 378
395 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) { 379 /* configure queue flush support */
396 /* 380 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
397 * If the FLUSH feature is supported we do have support for 381 blk_queue_flush(q, REQ_FLUSH);
398 * flushing a volatile write cache on the host. Use that
399 * to implement write barrier support.
400 */
401 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
402 } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
403 /*
404 * If the BARRIER feature is supported the host expects us
405 * to order request by tags. This implies there is not
406 * volatile write cache on the host, and that the host
407 * never re-orders outstanding I/O. This feature is not
408 * useful for real life scenarious and deprecated.
409 */
410 blk_queue_ordered(q, QUEUE_ORDERED_TAG);
411 } else {
412 /*
413 * If the FLUSH feature is not supported we must assume that
414 * the host does not perform any kind of volatile write
415 * caching. We still need to drain the queue to provider
416 * proper barrier semantics.
417 */
418 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
419 }
420 382
421 /* If disk is read-only in the host, the guest should obey */ 383 /* If disk is read-only in the host, the guest should obey */
422 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 384 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -535,9 +497,9 @@ static const struct virtio_device_id id_table[] = {
535}; 497};
536 498
537static unsigned int features[] = { 499static unsigned int features[] = {
538 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, 500 VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
539 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, 501 VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
540 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY 502 VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
541}; 503};
542 504
543/* 505/*
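
The virtio_blk hunks above (and the xen-blkfront ones below) replace the old blk_queue_ordered() barrier modes with the simpler cache-flush interface: a driver that advertises a volatile write cache via blk_queue_flush(q, REQ_FLUSH) then receives flushes as ordinary requests flagged REQ_FLUSH. A minimal sketch of how a request handler can recognise them under this 2.6.37-era API; the helper name is made up.

#include <linux/blkdev.h>

/* Illustration: with blk_queue_flush(q, REQ_FLUSH) set, an (empty)
 * request carrying REQ_FLUSH asks the driver to flush the device's
 * volatile write cache. */
static bool req_is_cache_flush(struct request *req)
{
	return (req->cmd_flags & REQ_FLUSH) != 0;
}
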
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d5a3cd750561..4abd2bcd20fb 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -46,7 +46,7 @@
46#include <linux/init.h> 46#include <linux/init.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/blkdev.h> 48#include <linux/blkdev.h>
49#include <linux/smp_lock.h> 49#include <linux/mutex.h>
50#include <linux/blkpg.h> 50#include <linux/blkpg.h>
51#include <linux/delay.h> 51#include <linux/delay.h>
52#include <linux/io.h> 52#include <linux/io.h>
@@ -58,6 +58,7 @@
58 58
59#include "xd.h" 59#include "xd.h"
60 60
61static DEFINE_MUTEX(xd_mutex);
61static void __init do_xd_setup (int *integers); 62static void __init do_xd_setup (int *integers);
62#ifdef MODULE 63#ifdef MODULE
63static int xd[5] = { -1,-1,-1,-1, }; 64static int xd[5] = { -1,-1,-1,-1, };
@@ -381,9 +382,9 @@ static int xd_ioctl(struct block_device *bdev, fmode_t mode,
381{ 382{
382 int ret; 383 int ret;
383 384
384 lock_kernel(); 385 mutex_lock(&xd_mutex);
385 ret = xd_locked_ioctl(bdev, mode, cmd, param); 386 ret = xd_locked_ioctl(bdev, mode, cmd, param);
386 unlock_kernel(); 387 mutex_unlock(&xd_mutex);
387 388
388 return ret; 389 return ret;
389} 390}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ab735a605cf3..4b33a18c32e0 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -41,7 +41,7 @@
41#include <linux/cdrom.h> 41#include <linux/cdrom.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/smp_lock.h> 44#include <linux/mutex.h>
45#include <linux/scatterlist.h> 45#include <linux/scatterlist.h>
46 46
47#include <xen/xen.h> 47#include <xen/xen.h>
@@ -69,6 +69,7 @@ struct blk_shadow {
69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
70}; 70};
71 71
72static DEFINE_MUTEX(blkfront_mutex);
72static const struct block_device_operations xlvbd_block_fops; 73static const struct block_device_operations xlvbd_block_fops;
73 74
74#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) 75#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
@@ -95,7 +96,7 @@ struct blkfront_info
95 struct gnttab_free_callback callback; 96 struct gnttab_free_callback callback;
96 struct blk_shadow shadow[BLK_RING_SIZE]; 97 struct blk_shadow shadow[BLK_RING_SIZE];
97 unsigned long shadow_free; 98 unsigned long shadow_free;
98 int feature_barrier; 99 unsigned int feature_flush;
99 int is_ready; 100 int is_ready;
100}; 101};
101 102
@@ -418,26 +419,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
418} 419}
419 420
420 421
421static int xlvbd_barrier(struct blkfront_info *info) 422static void xlvbd_flush(struct blkfront_info *info)
422{ 423{
423 int err; 424 blk_queue_flush(info->rq, info->feature_flush);
424 const char *barrier;
425
426 switch (info->feature_barrier) {
427 case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
428 case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
429 case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
430 default: return -EINVAL;
431 }
432
433 err = blk_queue_ordered(info->rq, info->feature_barrier);
434
435 if (err)
436 return err;
437
438 printk(KERN_INFO "blkfront: %s: barriers %s\n", 425 printk(KERN_INFO "blkfront: %s: barriers %s\n",
439 info->gd->disk_name, barrier); 426 info->gd->disk_name,
440 return 0; 427 info->feature_flush ? "enabled" : "disabled");
441} 428}
442 429
443 430
@@ -516,7 +503,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
516 info->rq = gd->queue; 503 info->rq = gd->queue;
517 info->gd = gd; 504 info->gd = gd;
518 505
519 xlvbd_barrier(info); 506 xlvbd_flush(info);
520 507
521 if (vdisk_info & VDISK_READONLY) 508 if (vdisk_info & VDISK_READONLY)
522 set_disk_ro(gd, 1); 509 set_disk_ro(gd, 1);
@@ -662,8 +649,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
662 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 649 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
663 info->gd->disk_name); 650 info->gd->disk_name);
664 error = -EOPNOTSUPP; 651 error = -EOPNOTSUPP;
665 info->feature_barrier = QUEUE_ORDERED_NONE; 652 info->feature_flush = 0;
666 xlvbd_barrier(info); 653 xlvbd_flush(info);
667 } 654 }
668 /* fall through */ 655 /* fall through */
669 case BLKIF_OP_READ: 656 case BLKIF_OP_READ:
@@ -1076,20 +1063,20 @@ static void blkfront_connect(struct blkfront_info *info)
1076 /* 1063 /*
1077 * If there's no "feature-barrier" defined, then it means 1064 * If there's no "feature-barrier" defined, then it means
1078 * we're dealing with a very old backend which writes 1065 * we're dealing with a very old backend which writes
1079 * synchronously; draining will do what needs to get done. 1066 * synchronously; nothing to do.
1080 * 1067 *
1081 * If there are barriers, then we can do full queued writes 1068 * If there are barriers, then we use flush.
1082 * with tagged barriers.
1083 *
1084 * If barriers are not supported, then there's no much we can
1085 * do, so just set ordering to NONE.
1086 */ 1069 */
1087 if (err) 1070 info->feature_flush = 0;
1088 info->feature_barrier = QUEUE_ORDERED_DRAIN; 1071
1089 else if (barrier) 1072 /*
1090 info->feature_barrier = QUEUE_ORDERED_TAG; 1073 * The driver doesn't properly handle empty flushes, so
1091 else 1074 * let's disable barrier support for now.
1092 info->feature_barrier = QUEUE_ORDERED_NONE; 1075 */
1076#if 0
1077 if (!err && barrier)
1078 info->feature_flush = REQ_FLUSH;
1079#endif
1093 1080
1094 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1081 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1095 if (err) { 1082 if (err) {
@@ -1201,7 +1188,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
1201 struct blkfront_info *info; 1188 struct blkfront_info *info;
1202 int err = 0; 1189 int err = 0;
1203 1190
1204 lock_kernel(); 1191 mutex_lock(&blkfront_mutex);
1205 1192
1206 info = disk->private_data; 1193 info = disk->private_data;
1207 if (!info) { 1194 if (!info) {
@@ -1219,7 +1206,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
1219 mutex_unlock(&info->mutex); 1206 mutex_unlock(&info->mutex);
1220 1207
1221out: 1208out:
1222 unlock_kernel(); 1209 mutex_unlock(&blkfront_mutex);
1223 return err; 1210 return err;
1224} 1211}
1225 1212
@@ -1229,7 +1216,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
1229 struct block_device *bdev; 1216 struct block_device *bdev;
1230 struct xenbus_device *xbdev; 1217 struct xenbus_device *xbdev;
1231 1218
1232 lock_kernel(); 1219 mutex_lock(&blkfront_mutex);
1233 1220
1234 bdev = bdget_disk(disk, 0); 1221 bdev = bdget_disk(disk, 0);
1235 bdput(bdev); 1222 bdput(bdev);
@@ -1263,7 +1250,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
1263 } 1250 }
1264 1251
1265out: 1252out:
1266 unlock_kernel(); 1253 mutex_unlock(&blkfront_mutex);
1267 return 0; 1254 return 0;
1268} 1255}
1269 1256
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 057413bb16e2..6e968cd4893c 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,7 +89,7 @@
89#include <linux/delay.h> 89#include <linux/delay.h>
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/blkdev.h> 91#include <linux/blkdev.h>
92#include <linux/smp_lock.h> 92#include <linux/mutex.h>
93#include <linux/ata.h> 93#include <linux/ata.h>
94#include <linux/hdreg.h> 94#include <linux/hdreg.h>
95#include <linux/platform_device.h> 95#include <linux/platform_device.h>
@@ -214,6 +214,7 @@ struct ace_device {
214 u16 cf_id[ATA_ID_WORDS]; 214 u16 cf_id[ATA_ID_WORDS];
215}; 215};
216 216
217static DEFINE_MUTEX(xsysace_mutex);
217static int ace_major; 218static int ace_major;
218 219
219/* --------------------------------------------------------------------- 220/* ---------------------------------------------------------------------
@@ -903,13 +904,13 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
903 904
904 dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1); 905 dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
905 906
906 lock_kernel(); 907 mutex_lock(&xsysace_mutex);
907 spin_lock_irqsave(&ace->lock, flags); 908 spin_lock_irqsave(&ace->lock, flags);
908 ace->users++; 909 ace->users++;
909 spin_unlock_irqrestore(&ace->lock, flags); 910 spin_unlock_irqrestore(&ace->lock, flags);
910 911
911 check_disk_change(bdev); 912 check_disk_change(bdev);
912 unlock_kernel(); 913 mutex_unlock(&xsysace_mutex);
913 914
914 return 0; 915 return 0;
915} 916}
@@ -922,7 +923,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
922 923
923 dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1); 924 dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
924 925
925 lock_kernel(); 926 mutex_lock(&xsysace_mutex);
926 spin_lock_irqsave(&ace->lock, flags); 927 spin_lock_irqsave(&ace->lock, flags);
927 ace->users--; 928 ace->users--;
928 if (ace->users == 0) { 929 if (ace->users == 0) {
@@ -930,7 +931,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
930 ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ); 931 ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
931 } 932 }
932 spin_unlock_irqrestore(&ace->lock, flags); 933 spin_unlock_irqrestore(&ace->lock, flags);
933 unlock_kernel(); 934 mutex_unlock(&xsysace_mutex);
934 return 0; 935 return 0;
935} 936}
936 937
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d75b2bb601ad..dcd4cfcf4126 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,7 +33,7 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/blkdev.h> 34#include <linux/blkdev.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/smp_lock.h> 36#include <linux/mutex.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38 38
39#include <asm/setup.h> 39#include <asm/setup.h>
@@ -57,6 +57,7 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
57 57
58#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 ) 58#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 )
59 59
60static DEFINE_MUTEX(z2ram_mutex);
60static u_long *z2ram_map = NULL; 61static u_long *z2ram_map = NULL;
61static u_long z2ram_size = 0; 62static u_long z2ram_size = 0;
62static int z2_count = 0; 63static int z2_count = 0;
@@ -154,7 +155,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
154 155
155 device = MINOR(bdev->bd_dev); 156 device = MINOR(bdev->bd_dev);
156 157
157 lock_kernel(); 158 mutex_lock(&z2ram_mutex);
158 if ( current_device != -1 && current_device != device ) 159 if ( current_device != -1 && current_device != device )
159 { 160 {
160 rc = -EBUSY; 161 rc = -EBUSY;
@@ -296,25 +297,25 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
296 set_capacity(z2ram_gendisk, z2ram_size >> 9); 297 set_capacity(z2ram_gendisk, z2ram_size >> 9);
297 } 298 }
298 299
299 unlock_kernel(); 300 mutex_unlock(&z2ram_mutex);
300 return 0; 301 return 0;
301 302
302err_out_kfree: 303err_out_kfree:
303 kfree(z2ram_map); 304 kfree(z2ram_map);
304err_out: 305err_out:
305 unlock_kernel(); 306 mutex_unlock(&z2ram_mutex);
306 return rc; 307 return rc;
307} 308}
308 309
309static int 310static int
310z2_release(struct gendisk *disk, fmode_t mode) 311z2_release(struct gendisk *disk, fmode_t mode)
311{ 312{
312 lock_kernel(); 313 mutex_lock(&z2ram_mutex);
313 if ( current_device == -1 ) { 314 if ( current_device == -1 ) {
314 unlock_kernel(); 315 mutex_unlock(&z2ram_mutex);
315 return 0; 316 return 0;
316 } 317 }
317 unlock_kernel(); 318 mutex_unlock(&z2ram_mutex);
318 /* 319 /*
319 * FIXME: unmap memory 320 * FIXME: unmap memory
320 */ 321 */