Diffstat (limited to 'drivers/block')
44 files changed, 13387 insertions, 11507 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a796407123c7..824e09c4d0d7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -131,6 +131,7 @@ config BLK_CPQ_DA | |||
131 | config BLK_CPQ_CISS_DA | 131 | config BLK_CPQ_CISS_DA |
132 | tristate "Compaq Smart Array 5xxx support" | 132 | tristate "Compaq Smart Array 5xxx support" |
133 | depends on PCI | 133 | depends on PCI |
134 | select CHECK_SIGNATURE | ||
134 | help | 135 | help |
135 | This is the driver for Compaq Smart Array 5xxx controllers. | 136 | This is the driver for Compaq Smart Array 5xxx controllers. |
136 | Everyone using these boards should say Y here. | 137 | Everyone using these boards should say Y here. |
@@ -166,8 +167,8 @@ config BLK_DEV_DAC960 | |||
166 | module will be called DAC960. | 167 | module will be called DAC960. |
167 | 168 | ||
168 | config BLK_DEV_UMEM | 169 | config BLK_DEV_UMEM |
169 | tristate "Micro Memory MM5415 Battery Backed RAM support (EXPERIMENTAL)" | 170 | tristate "Micro Memory MM5415 Battery Backed RAM support" |
170 | depends on PCI && EXPERIMENTAL | 171 | depends on PCI |
171 | ---help--- | 172 | ---help--- |
172 | Saying Y here will include support for the MM5415 family of | 173 | Saying Y here will include support for the MM5415 family of |
173 | battery backed (Non-volatile) RAM cards. | 174 | battery backed (Non-volatile) RAM cards. |
@@ -353,18 +354,6 @@ config BLK_DEV_SX8 | |||
353 | 354 | ||
354 | Use devices /dev/sx8/$N and /dev/sx8/$Np$M. | 355 | Use devices /dev/sx8/$N and /dev/sx8/$Np$M. |
355 | 356 | ||
356 | config BLK_DEV_UB | ||
357 | tristate "Low Performance USB Block driver (deprecated)" | ||
358 | depends on USB | ||
359 | help | ||
360 | This driver supports certain USB attached storage devices | ||
361 | such as flash keys. | ||
362 | |||
363 | If you enable this driver, it is recommended to avoid conflicts | ||
364 | with usb-storage by enabling USB_LIBUSUAL. | ||
365 | |||
366 | If unsure, say N. | ||
367 | |||
368 | config BLK_DEV_RAM | 357 | config BLK_DEV_RAM |
369 | tristate "RAM block device support" | 358 | tristate "RAM block device support" |
370 | ---help--- | 359 | ---help--- |
@@ -442,8 +431,8 @@ config CDROM_PKTCDVD_BUFFERS | |||
442 | a disc is opened for writing. | 431 | a disc is opened for writing. |
443 | 432 | ||
444 | config CDROM_PKTCDVD_WCACHE | 433 | config CDROM_PKTCDVD_WCACHE |
445 | bool "Enable write caching (EXPERIMENTAL)" | 434 | bool "Enable write caching" |
446 | depends on CDROM_PKTCDVD && EXPERIMENTAL | 435 | depends on CDROM_PKTCDVD |
447 | help | 436 | help |
448 | If enabled, write caching will be set for the CD-R/W device. For now | 437 | If enabled, write caching will be set for the CD-R/W device. For now |
449 | this option is dangerous unless the CD-RW media is known good, as we | 438 | this option is dangerous unless the CD-RW media is known good, as we |
@@ -520,8 +509,8 @@ config XEN_BLKDEV_BACKEND | |||
520 | 509 | ||
521 | 510 | ||
522 | config VIRTIO_BLK | 511 | config VIRTIO_BLK |
523 | tristate "Virtio block driver (EXPERIMENTAL)" | 512 | tristate "Virtio block driver" |
524 | depends on EXPERIMENTAL && VIRTIO | 513 | depends on VIRTIO |
525 | ---help--- | 514 | ---help--- |
526 | This is the virtual block driver for virtio. It can be used with | 515 | This is the virtual block driver for virtio. It can be used with |
527 | lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. | 516 | lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. |
@@ -540,7 +529,7 @@ config BLK_DEV_HD | |||
540 | 529 | ||
541 | config BLK_DEV_RBD | 530 | config BLK_DEV_RBD |
542 | tristate "Rados block device (RBD)" | 531 | tristate "Rados block device (RBD)" |
543 | depends on INET && EXPERIMENTAL && BLOCK | 532 | depends on INET && BLOCK |
544 | select CEPH_LIB | 533 | select CEPH_LIB |
545 | select LIBCRC32C | 534 | select LIBCRC32C |
546 | select CRYPTO_AES | 535 | select CRYPTO_AES |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 5b795059f8fb..17e82df3df74 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o | |||
33 | 33 | ||
34 | obj-$(CONFIG_VIODASD) += viodasd.o | 34 | obj-$(CONFIG_VIODASD) += viodasd.o |
35 | obj-$(CONFIG_BLK_DEV_SX8) += sx8.o | 35 | obj-$(CONFIG_BLK_DEV_SX8) += sx8.o |
36 | obj-$(CONFIG_BLK_DEV_UB) += ub.o | ||
37 | obj-$(CONFIG_BLK_DEV_HD) += hd.o | 36 | obj-$(CONFIG_BLK_DEV_HD) += hd.o |
38 | 37 | ||
39 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o | 38 | obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o |
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index db195abad698..175649468c95 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | #define VERSION "47" | 2 | #define VERSION "81" |
3 | #define AOE_MAJOR 152 | 3 | #define AOE_MAJOR 152 |
4 | #define DEVICE_NAME "aoe" | 4 | #define DEVICE_NAME "aoe" |
5 | 5 | ||
@@ -10,10 +10,7 @@ | |||
10 | #define AOE_PARTITIONS (16) | 10 | #define AOE_PARTITIONS (16) |
11 | #endif | 11 | #endif |
12 | 12 | ||
13 | #define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor)) | 13 | #define WHITESPACE " \t\v\f\n," |
14 | #define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF) | ||
15 | #define AOEMINOR(sysminor) ((sysminor) % NPERSHELF) | ||
16 | #define WHITESPACE " \t\v\f\n" | ||
17 | 14 | ||
18 | enum { | 15 | enum { |
19 | AOECMD_ATA, | 16 | AOECMD_ATA, |
@@ -75,101 +72,134 @@ enum { | |||
75 | DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */ | 72 | DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */ |
76 | DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ | 73 | DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ |
77 | DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ | 74 | DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ |
78 | DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */ | 75 | DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */ |
79 | DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */ | 76 | DEVFL_GD_NOW = (1<<4), /* allocating gendisk */ |
80 | DEVFL_KICKME = (1<<5), /* slow polling network card catch */ | 77 | DEVFL_KICKME = (1<<5), /* slow polling network card catch */ |
81 | DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ | 78 | DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ |
82 | 79 | DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */ | |
83 | BUFFL_FAIL = 1, | 80 | DEVFL_FREED = (1<<8), /* device has been cleaned up */ |
84 | }; | 81 | }; |
85 | 82 | ||
86 | enum { | 83 | enum { |
87 | DEFAULTBCNT = 2 * 512, /* 2 sectors */ | 84 | DEFAULTBCNT = 2 * 512, /* 2 sectors */ |
88 | NPERSHELF = 16, /* number of slots per shelf address */ | ||
89 | FREETAG = -1, | ||
90 | MIN_BUFS = 16, | 85 | MIN_BUFS = 16, |
91 | NTARGETS = 8, | 86 | NTARGETS = 4, |
92 | NAOEIFS = 8, | 87 | NAOEIFS = 8, |
93 | NSKBPOOLMAX = 128, | 88 | NSKBPOOLMAX = 256, |
89 | NFACTIVE = 61, | ||
94 | 90 | ||
95 | TIMERTICK = HZ / 10, | 91 | TIMERTICK = HZ / 10, |
96 | MINTIMER = HZ >> 2, | 92 | RTTSCALE = 8, |
97 | MAXTIMER = HZ << 1, | 93 | RTTDSCALE = 3, |
98 | HELPWAIT = 20, | 94 | RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE, |
95 | RTTDEV_INIT = RTTAVG_INIT / 4, | ||
96 | |||
97 | HARD_SCORN_SECS = 10, /* try another remote port after this */ | ||
98 | MAX_TAINT = 1000, /* cap on aoetgt taint */ | ||
99 | }; | 99 | }; |
100 | 100 | ||
101 | struct buf { | 101 | struct buf { |
102 | struct list_head bufs; | ||
103 | ulong stime; /* for disk stats */ | ||
104 | ulong flags; | ||
105 | ulong nframesout; | 102 | ulong nframesout; |
106 | ulong resid; | 103 | ulong resid; |
107 | ulong bv_resid; | 104 | ulong bv_resid; |
108 | ulong bv_off; | ||
109 | sector_t sector; | 105 | sector_t sector; |
110 | struct bio *bio; | 106 | struct bio *bio; |
111 | struct bio_vec *bv; | 107 | struct bio_vec *bv; |
108 | struct request *rq; | ||
109 | }; | ||
110 | |||
111 | enum frame_flags { | ||
112 | FFL_PROBE = 1, | ||
112 | }; | 113 | }; |
113 | 114 | ||
114 | struct frame { | 115 | struct frame { |
115 | int tag; | 116 | struct list_head head; |
117 | u32 tag; | ||
118 | struct timeval sent; /* high-res time packet was sent */ | ||
119 | u32 sent_jiffs; /* low-res jiffies-based sent time */ | ||
116 | ulong waited; | 120 | ulong waited; |
121 | ulong waited_total; | ||
122 | struct aoetgt *t; /* parent target I belong to */ | ||
123 | sector_t lba; | ||
124 | struct sk_buff *skb; /* command skb freed on module exit */ | ||
125 | struct sk_buff *r_skb; /* response skb for async processing */ | ||
117 | struct buf *buf; | 126 | struct buf *buf; |
118 | char *bufaddr; | 127 | struct bio_vec *bv; |
119 | ulong bcnt; | 128 | ulong bcnt; |
120 | sector_t lba; | 129 | ulong bv_off; |
121 | struct sk_buff *skb; | 130 | char flags; |
122 | }; | 131 | }; |
123 | 132 | ||
124 | struct aoeif { | 133 | struct aoeif { |
125 | struct net_device *nd; | 134 | struct net_device *nd; |
126 | unsigned char lost; | 135 | ulong lost; |
127 | unsigned char lostjumbo; | 136 | int bcnt; |
128 | ushort maxbcnt; | ||
129 | }; | 137 | }; |
130 | 138 | ||
131 | struct aoetgt { | 139 | struct aoetgt { |
132 | unsigned char addr[6]; | 140 | unsigned char addr[6]; |
133 | ushort nframes; | 141 | ushort nframes; /* cap on frames to use */ |
134 | struct frame *frames; | 142 | struct aoedev *d; /* parent device I belong to */ |
143 | struct list_head ffree; /* list of free frames */ | ||
135 | struct aoeif ifs[NAOEIFS]; | 144 | struct aoeif ifs[NAOEIFS]; |
136 | struct aoeif *ifp; /* current aoeif in use */ | 145 | struct aoeif *ifp; /* current aoeif in use */ |
137 | ushort nout; | 146 | ushort nout; /* number of AoE commands outstanding */ |
138 | ushort maxout; | 147 | ushort maxout; /* current value for max outstanding */ |
139 | u16 lasttag; /* last tag sent */ | 148 | ushort next_cwnd; /* incr maxout after decrementing to zero */ |
140 | u16 useme; | 149 | ushort ssthresh; /* slow start threshold */ |
141 | ulong lastwadj; /* last window adjustment */ | 150 | ulong falloc; /* number of allocated frames */ |
151 | int taint; /* how much we want to avoid this aoetgt */ | ||
152 | int minbcnt; | ||
142 | int wpkts, rpkts; | 153 | int wpkts, rpkts; |
143 | int dataref; | 154 | char nout_probes; |
144 | }; | 155 | }; |
145 | 156 | ||
146 | struct aoedev { | 157 | struct aoedev { |
147 | struct aoedev *next; | 158 | struct aoedev *next; |
148 | ulong sysminor; | 159 | ulong sysminor; |
149 | ulong aoemajor; | 160 | ulong aoemajor; |
161 | u32 rttavg; /* scaled AoE round trip time average */ | ||
162 | u32 rttdev; /* scaled round trip time mean deviation */ | ||
150 | u16 aoeminor; | 163 | u16 aoeminor; |
151 | u16 flags; | 164 | u16 flags; |
152 | u16 nopen; /* (bd_openers isn't available without sleeping) */ | 165 | u16 nopen; /* (bd_openers isn't available without sleeping) */ |
153 | u16 rttavg; /* round trip average of requests/responses */ | ||
154 | u16 mintimer; | ||
155 | u16 fw_ver; /* version of blade's firmware */ | 166 | u16 fw_ver; /* version of blade's firmware */ |
167 | u16 lasttag; /* last tag sent */ | ||
168 | u16 useme; | ||
169 | ulong ref; | ||
156 | struct work_struct work;/* disk create work struct */ | 170 | struct work_struct work;/* disk create work struct */ |
157 | struct gendisk *gd; | 171 | struct gendisk *gd; |
158 | struct request_queue *blkq; | 172 | struct request_queue *blkq; |
159 | struct hd_geometry geo; | 173 | struct hd_geometry geo; |
160 | sector_t ssize; | 174 | sector_t ssize; |
161 | struct timer_list timer; | 175 | struct timer_list timer; |
162 | spinlock_t lock; | 176 | spinlock_t lock; |
163 | struct sk_buff_head sendq; | ||
164 | struct sk_buff_head skbpool; | 177 | struct sk_buff_head skbpool; |
165 | mempool_t *bufpool; /* for deadlock-free Buf allocation */ | 178 | mempool_t *bufpool; /* for deadlock-free Buf allocation */ |
166 | struct list_head bufq; /* queue of bios to work on */ | 179 | struct { /* pointers to work in progress */ |
167 | struct buf *inprocess; /* the one we're currently working on */ | 180 | struct buf *buf; |
168 | struct aoetgt *targets[NTARGETS]; | 181 | struct bio *nxbio; |
182 | struct request *rq; | ||
183 | } ip; | ||
184 | ulong maxbcnt; | ||
185 | struct list_head factive[NFACTIVE]; /* hash of active frames */ | ||
186 | struct list_head rexmitq; /* deferred retransmissions */ | ||
187 | struct aoetgt **targets; | ||
188 | ulong ntargets; /* number of allocated aoetgt pointers */ | ||
169 | struct aoetgt **tgt; /* target in use when working */ | 189 | struct aoetgt **tgt; /* target in use when working */ |
170 | struct aoetgt **htgt; /* target needing rexmit assistance */ | 190 | ulong kicked; |
191 | char ident[512]; | ||
171 | }; | 192 | }; |
172 | 193 | ||
194 | /* kthread tracking */ | ||
195 | struct ktstate { | ||
196 | struct completion rendez; | ||
197 | struct task_struct *task; | ||
198 | wait_queue_head_t *waitq; | ||
199 | int (*fn) (void); | ||
200 | char *name; | ||
201 | spinlock_t *lock; | ||
202 | }; | ||
173 | 203 | ||
174 | int aoeblk_init(void); | 204 | int aoeblk_init(void); |
175 | void aoeblk_exit(void); | 205 | void aoeblk_exit(void); |
@@ -182,22 +212,30 @@ void aoechr_error(char *); | |||
182 | 212 | ||
183 | void aoecmd_work(struct aoedev *d); | 213 | void aoecmd_work(struct aoedev *d); |
184 | void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); | 214 | void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); |
185 | void aoecmd_ata_rsp(struct sk_buff *); | 215 | struct sk_buff *aoecmd_ata_rsp(struct sk_buff *); |
186 | void aoecmd_cfg_rsp(struct sk_buff *); | 216 | void aoecmd_cfg_rsp(struct sk_buff *); |
187 | void aoecmd_sleepwork(struct work_struct *); | 217 | void aoecmd_sleepwork(struct work_struct *); |
218 | void aoecmd_wreset(struct aoetgt *t); | ||
188 | void aoecmd_cleanslate(struct aoedev *); | 219 | void aoecmd_cleanslate(struct aoedev *); |
220 | void aoecmd_exit(void); | ||
221 | int aoecmd_init(void); | ||
189 | struct sk_buff *aoecmd_ata_id(struct aoedev *); | 222 | struct sk_buff *aoecmd_ata_id(struct aoedev *); |
223 | void aoe_freetframe(struct frame *); | ||
224 | void aoe_flush_iocq(void); | ||
225 | void aoe_end_request(struct aoedev *, struct request *, int); | ||
226 | int aoe_ktstart(struct ktstate *k); | ||
227 | void aoe_ktstop(struct ktstate *k); | ||
190 | 228 | ||
191 | int aoedev_init(void); | 229 | int aoedev_init(void); |
192 | void aoedev_exit(void); | 230 | void aoedev_exit(void); |
193 | struct aoedev *aoedev_by_aoeaddr(int maj, int min); | 231 | struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc); |
194 | struct aoedev *aoedev_by_sysminor_m(ulong sysminor); | ||
195 | void aoedev_downdev(struct aoedev *d); | 232 | void aoedev_downdev(struct aoedev *d); |
196 | int aoedev_flush(const char __user *str, size_t size); | 233 | int aoedev_flush(const char __user *str, size_t size); |
234 | void aoe_failbuf(struct aoedev *, struct buf *); | ||
235 | void aoedev_put(struct aoedev *); | ||
197 | 236 | ||
198 | int aoenet_init(void); | 237 | int aoenet_init(void); |
199 | void aoenet_exit(void); | 238 | void aoenet_exit(void); |
200 | void aoenet_xmit(struct sk_buff_head *); | 239 | void aoenet_xmit(struct sk_buff_head *); |
201 | int is_aoe_netif(struct net_device *ifp); | 240 | int is_aoe_netif(struct net_device *ifp); |
202 | int set_aoe_iflist(const char __user *str, size_t size); | 241 | int set_aoe_iflist(const char __user *str, size_t size); |
203 | |||
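[Editor's note] The reworked struct aoedev above drops the old per-target frame arrays in favor of a per-device hash of active frames (factive[NFACTIVE]) plus a per-target free list (ffree). As a reading aid, here is a minimal sketch, not the driver's exact code, of how those buckets are used by fhash() and getframe() in aoecmd.c later in this diff; the sketch_* names are illustrative and the types come from aoe.h above.

    /* Outstanding frames are hashed by tag, so a response is matched
     * with a short bucket walk instead of a scan over every allocated
     * frame.  The real getframe() also unlinks the frame from its
     * bucket before handing it back.
     */
    static void
    sketch_fhash(struct aoedev *d, struct frame *f)
    {
    	list_add_tail(&f->head, &d->factive[f->tag % NFACTIVE]);
    }

    static struct frame *
    sketch_getframe(struct aoedev *d, u32 tag)
    {
    	struct frame *f;

    	list_for_each_entry(f, &d->factive[tag % NFACTIVE], head)
    		if (f->tag == tag)
    			return f;
    	return NULL;
    }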
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 321de7b6c442..a129f8c8073d 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoeblk.c | 3 | * aoeblk.c |
4 | * block device routines | 4 | * block device routines |
@@ -16,11 +16,19 @@ | |||
16 | #include <linux/netdevice.h> | 16 | #include <linux/netdevice.h> |
17 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
18 | #include <linux/export.h> | 18 | #include <linux/export.h> |
19 | #include <linux/moduleparam.h> | ||
20 | #include <scsi/sg.h> | ||
19 | #include "aoe.h" | 21 | #include "aoe.h" |
20 | 22 | ||
21 | static DEFINE_MUTEX(aoeblk_mutex); | 23 | static DEFINE_MUTEX(aoeblk_mutex); |
22 | static struct kmem_cache *buf_pool_cache; | 24 | static struct kmem_cache *buf_pool_cache; |
23 | 25 | ||
26 | /* GPFS needs a larger value than the default. */ | ||
27 | static int aoe_maxsectors; | ||
28 | module_param(aoe_maxsectors, int, 0644); | ||
29 | MODULE_PARM_DESC(aoe_maxsectors, | ||
30 | "When nonzero, set the maximum number of sectors per I/O request"); | ||
31 | |||
24 | static ssize_t aoedisk_show_state(struct device *dev, | 32 | static ssize_t aoedisk_show_state(struct device *dev, |
25 | struct device_attribute *attr, char *page) | 33 | struct device_attribute *attr, char *page) |
26 | { | 34 | { |
@@ -59,7 +67,7 @@ static ssize_t aoedisk_show_netif(struct device *dev, | |||
59 | nd = nds; | 67 | nd = nds; |
60 | ne = nd + ARRAY_SIZE(nds); | 68 | ne = nd + ARRAY_SIZE(nds); |
61 | t = d->targets; | 69 | t = d->targets; |
62 | te = t + NTARGETS; | 70 | te = t + d->ntargets; |
63 | for (; t < te && *t; t++) { | 71 | for (; t < te && *t; t++) { |
64 | ifp = (*t)->ifs; | 72 | ifp = (*t)->ifs; |
65 | e = ifp + NAOEIFS; | 73 | e = ifp + NAOEIFS; |
@@ -91,6 +99,14 @@ static ssize_t aoedisk_show_fwver(struct device *dev, | |||
91 | 99 | ||
92 | return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver); | 100 | return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver); |
93 | } | 101 | } |
102 | static ssize_t aoedisk_show_payload(struct device *dev, | ||
103 | struct device_attribute *attr, char *page) | ||
104 | { | ||
105 | struct gendisk *disk = dev_to_disk(dev); | ||
106 | struct aoedev *d = disk->private_data; | ||
107 | |||
108 | return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); | ||
109 | } | ||
94 | 110 | ||
95 | static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); | 111 | static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); |
96 | static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); | 112 | static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); |
@@ -99,12 +115,14 @@ static struct device_attribute dev_attr_firmware_version = { | |||
99 | .attr = { .name = "firmware-version", .mode = S_IRUGO }, | 115 | .attr = { .name = "firmware-version", .mode = S_IRUGO }, |
100 | .show = aoedisk_show_fwver, | 116 | .show = aoedisk_show_fwver, |
101 | }; | 117 | }; |
118 | static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL); | ||
102 | 119 | ||
103 | static struct attribute *aoe_attrs[] = { | 120 | static struct attribute *aoe_attrs[] = { |
104 | &dev_attr_state.attr, | 121 | &dev_attr_state.attr, |
105 | &dev_attr_mac.attr, | 122 | &dev_attr_mac.attr, |
106 | &dev_attr_netif.attr, | 123 | &dev_attr_netif.attr, |
107 | &dev_attr_firmware_version.attr, | 124 | &dev_attr_firmware_version.attr, |
125 | &dev_attr_payload.attr, | ||
108 | NULL, | 126 | NULL, |
109 | }; | 127 | }; |
110 | 128 | ||
@@ -129,9 +147,18 @@ aoeblk_open(struct block_device *bdev, fmode_t mode) | |||
129 | struct aoedev *d = bdev->bd_disk->private_data; | 147 | struct aoedev *d = bdev->bd_disk->private_data; |
130 | ulong flags; | 148 | ulong flags; |
131 | 149 | ||
150 | if (!virt_addr_valid(d)) { | ||
151 | pr_crit("aoe: invalid device pointer in %s\n", | ||
152 | __func__); | ||
153 | WARN_ON(1); | ||
154 | return -ENODEV; | ||
155 | } | ||
156 | if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL) | ||
157 | return -ENODEV; | ||
158 | |||
132 | mutex_lock(&aoeblk_mutex); | 159 | mutex_lock(&aoeblk_mutex); |
133 | spin_lock_irqsave(&d->lock, flags); | 160 | spin_lock_irqsave(&d->lock, flags); |
134 | if (d->flags & DEVFL_UP) { | 161 | if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) { |
135 | d->nopen++; | 162 | d->nopen++; |
136 | spin_unlock_irqrestore(&d->lock, flags); | 163 | spin_unlock_irqrestore(&d->lock, flags); |
137 | mutex_unlock(&aoeblk_mutex); | 164 | mutex_unlock(&aoeblk_mutex); |
@@ -161,68 +188,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode) | |||
161 | } | 188 | } |
162 | 189 | ||
163 | static void | 190 | static void |
164 | aoeblk_make_request(struct request_queue *q, struct bio *bio) | 191 | aoeblk_request(struct request_queue *q) |
165 | { | 192 | { |
166 | struct sk_buff_head queue; | ||
167 | struct aoedev *d; | 193 | struct aoedev *d; |
168 | struct buf *buf; | 194 | struct request *rq; |
169 | ulong flags; | ||
170 | |||
171 | blk_queue_bounce(q, &bio); | ||
172 | |||
173 | if (bio == NULL) { | ||
174 | printk(KERN_ERR "aoe: bio is NULL\n"); | ||
175 | BUG(); | ||
176 | return; | ||
177 | } | ||
178 | d = bio->bi_bdev->bd_disk->private_data; | ||
179 | if (d == NULL) { | ||
180 | printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n"); | ||
181 | BUG(); | ||
182 | bio_endio(bio, -ENXIO); | ||
183 | return; | ||
184 | } else if (bio->bi_io_vec == NULL) { | ||
185 | printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); | ||
186 | BUG(); | ||
187 | bio_endio(bio, -ENXIO); | ||
188 | return; | ||
189 | } | ||
190 | buf = mempool_alloc(d->bufpool, GFP_NOIO); | ||
191 | if (buf == NULL) { | ||
192 | printk(KERN_INFO "aoe: buf allocation failure\n"); | ||
193 | bio_endio(bio, -ENOMEM); | ||
194 | return; | ||
195 | } | ||
196 | memset(buf, 0, sizeof(*buf)); | ||
197 | INIT_LIST_HEAD(&buf->bufs); | ||
198 | buf->stime = jiffies; | ||
199 | buf->bio = bio; | ||
200 | buf->resid = bio->bi_size; | ||
201 | buf->sector = bio->bi_sector; | ||
202 | buf->bv = &bio->bi_io_vec[bio->bi_idx]; | ||
203 | buf->bv_resid = buf->bv->bv_len; | ||
204 | WARN_ON(buf->bv_resid == 0); | ||
205 | buf->bv_off = buf->bv->bv_offset; | ||
206 | |||
207 | spin_lock_irqsave(&d->lock, flags); | ||
208 | 195 | ||
196 | d = q->queuedata; | ||
209 | if ((d->flags & DEVFL_UP) == 0) { | 197 | if ((d->flags & DEVFL_UP) == 0) { |
210 | pr_info_ratelimited("aoe: device %ld.%d is not up\n", | 198 | pr_info_ratelimited("aoe: device %ld.%d is not up\n", |
211 | d->aoemajor, d->aoeminor); | 199 | d->aoemajor, d->aoeminor); |
212 | spin_unlock_irqrestore(&d->lock, flags); | 200 | while ((rq = blk_peek_request(q))) { |
213 | mempool_free(buf, d->bufpool); | 201 | blk_start_request(rq); |
214 | bio_endio(bio, -ENXIO); | 202 | aoe_end_request(d, rq, 1); |
203 | } | ||
215 | return; | 204 | return; |
216 | } | 205 | } |
217 | |||
218 | list_add_tail(&buf->bufs, &d->bufq); | ||
219 | |||
220 | aoecmd_work(d); | 206 | aoecmd_work(d); |
221 | __skb_queue_head_init(&queue); | ||
222 | skb_queue_splice_init(&d->sendq, &queue); | ||
223 | |||
224 | spin_unlock_irqrestore(&d->lock, flags); | ||
225 | aoenet_xmit(&queue); | ||
226 | } | 207 | } |
227 | 208 | ||
228 | static int | 209 | static int |
@@ -241,9 +222,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |||
241 | return 0; | 222 | return 0; |
242 | } | 223 | } |
243 | 224 | ||
225 | static int | ||
226 | aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg) | ||
227 | { | ||
228 | struct aoedev *d; | ||
229 | |||
230 | if (!arg) | ||
231 | return -EINVAL; | ||
232 | |||
233 | d = bdev->bd_disk->private_data; | ||
234 | if ((d->flags & DEVFL_UP) == 0) { | ||
235 | pr_err("aoe: disk not up\n"); | ||
236 | return -ENODEV; | ||
237 | } | ||
238 | |||
239 | if (cmd == HDIO_GET_IDENTITY) { | ||
240 | if (!copy_to_user((void __user *) arg, &d->ident, | ||
241 | sizeof(d->ident))) | ||
242 | return 0; | ||
243 | return -EFAULT; | ||
244 | } | ||
245 | |||
246 | /* udev calls scsi_id, which uses SG_IO, resulting in noise */ | ||
247 | if (cmd != SG_IO) | ||
248 | pr_info("aoe: unknown ioctl 0x%x\n", cmd); | ||
249 | |||
250 | return -ENOTTY; | ||
251 | } | ||
252 | |||
244 | static const struct block_device_operations aoe_bdops = { | 253 | static const struct block_device_operations aoe_bdops = { |
245 | .open = aoeblk_open, | 254 | .open = aoeblk_open, |
246 | .release = aoeblk_release, | 255 | .release = aoeblk_release, |
256 | .ioctl = aoeblk_ioctl, | ||
247 | .getgeo = aoeblk_getgeo, | 257 | .getgeo = aoeblk_getgeo, |
248 | .owner = THIS_MODULE, | 258 | .owner = THIS_MODULE, |
249 | }; | 259 | }; |
@@ -254,41 +264,67 @@ aoeblk_gdalloc(void *vp) | |||
254 | { | 264 | { |
255 | struct aoedev *d = vp; | 265 | struct aoedev *d = vp; |
256 | struct gendisk *gd; | 266 | struct gendisk *gd; |
267 | mempool_t *mp; | ||
268 | struct request_queue *q; | ||
269 | enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; | ||
257 | ulong flags; | 270 | ulong flags; |
271 | int late = 0; | ||
272 | |||
273 | spin_lock_irqsave(&d->lock, flags); | ||
274 | if (d->flags & DEVFL_GDALLOC | ||
275 | && !(d->flags & DEVFL_TKILL) | ||
276 | && !(d->flags & DEVFL_GD_NOW)) | ||
277 | d->flags |= DEVFL_GD_NOW; | ||
278 | else | ||
279 | late = 1; | ||
280 | spin_unlock_irqrestore(&d->lock, flags); | ||
281 | if (late) | ||
282 | return; | ||
258 | 283 | ||
259 | gd = alloc_disk(AOE_PARTITIONS); | 284 | gd = alloc_disk(AOE_PARTITIONS); |
260 | if (gd == NULL) { | 285 | if (gd == NULL) { |
261 | printk(KERN_ERR | 286 | pr_err("aoe: cannot allocate disk structure for %ld.%d\n", |
262 | "aoe: cannot allocate disk structure for %ld.%d\n", | ||
263 | d->aoemajor, d->aoeminor); | 287 | d->aoemajor, d->aoeminor); |
264 | goto err; | 288 | goto err; |
265 | } | 289 | } |
266 | 290 | ||
267 | d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); | 291 | mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab, |
268 | if (d->bufpool == NULL) { | 292 | buf_pool_cache); |
293 | if (mp == NULL) { | ||
269 | printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", | 294 | printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", |
270 | d->aoemajor, d->aoeminor); | 295 | d->aoemajor, d->aoeminor); |
271 | goto err_disk; | 296 | goto err_disk; |
272 | } | 297 | } |
273 | 298 | q = blk_init_queue(aoeblk_request, &d->lock); | |
274 | d->blkq = blk_alloc_queue(GFP_KERNEL); | 299 | if (q == NULL) { |
275 | if (!d->blkq) | 300 | pr_err("aoe: cannot allocate block queue for %ld.%d\n", |
301 | d->aoemajor, d->aoeminor); | ||
276 | goto err_mempool; | 302 | goto err_mempool; |
277 | blk_queue_make_request(d->blkq, aoeblk_make_request); | 303 | } |
278 | d->blkq->backing_dev_info.name = "aoe"; | 304 | |
279 | if (bdi_init(&d->blkq->backing_dev_info)) | ||
280 | goto err_blkq; | ||
281 | spin_lock_irqsave(&d->lock, flags); | 305 | spin_lock_irqsave(&d->lock, flags); |
306 | WARN_ON(!(d->flags & DEVFL_GD_NOW)); | ||
307 | WARN_ON(!(d->flags & DEVFL_GDALLOC)); | ||
308 | WARN_ON(d->flags & DEVFL_TKILL); | ||
309 | WARN_ON(d->gd); | ||
310 | WARN_ON(d->flags & DEVFL_UP); | ||
311 | blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); | ||
312 | q->backing_dev_info.name = "aoe"; | ||
313 | q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; | ||
314 | d->bufpool = mp; | ||
315 | d->blkq = gd->queue = q; | ||
316 | q->queuedata = d; | ||
317 | d->gd = gd; | ||
318 | if (aoe_maxsectors) | ||
319 | blk_queue_max_hw_sectors(q, aoe_maxsectors); | ||
282 | gd->major = AOE_MAJOR; | 320 | gd->major = AOE_MAJOR; |
283 | gd->first_minor = d->sysminor * AOE_PARTITIONS; | 321 | gd->first_minor = d->sysminor; |
284 | gd->fops = &aoe_bdops; | 322 | gd->fops = &aoe_bdops; |
285 | gd->private_data = d; | 323 | gd->private_data = d; |
286 | set_capacity(gd, d->ssize); | 324 | set_capacity(gd, d->ssize); |
287 | snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", | 325 | snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", |
288 | d->aoemajor, d->aoeminor); | 326 | d->aoemajor, d->aoeminor); |
289 | 327 | ||
290 | gd->queue = d->blkq; | ||
291 | d->gd = gd; | ||
292 | d->flags &= ~DEVFL_GDALLOC; | 328 | d->flags &= ~DEVFL_GDALLOC; |
293 | d->flags |= DEVFL_UP; | 329 | d->flags |= DEVFL_UP; |
294 | 330 | ||
@@ -296,18 +332,21 @@ aoeblk_gdalloc(void *vp) | |||
296 | 332 | ||
297 | add_disk(gd); | 333 | add_disk(gd); |
298 | aoedisk_add_sysfs(d); | 334 | aoedisk_add_sysfs(d); |
335 | |||
336 | spin_lock_irqsave(&d->lock, flags); | ||
337 | WARN_ON(!(d->flags & DEVFL_GD_NOW)); | ||
338 | d->flags &= ~DEVFL_GD_NOW; | ||
339 | spin_unlock_irqrestore(&d->lock, flags); | ||
299 | return; | 340 | return; |
300 | 341 | ||
301 | err_blkq: | ||
302 | blk_cleanup_queue(d->blkq); | ||
303 | d->blkq = NULL; | ||
304 | err_mempool: | 342 | err_mempool: |
305 | mempool_destroy(d->bufpool); | 343 | mempool_destroy(mp); |
306 | err_disk: | 344 | err_disk: |
307 | put_disk(gd); | 345 | put_disk(gd); |
308 | err: | 346 | err: |
309 | spin_lock_irqsave(&d->lock, flags); | 347 | spin_lock_irqsave(&d->lock, flags); |
310 | d->flags &= ~DEVFL_GDALLOC; | 348 | d->flags &= ~DEVFL_GD_NOW; |
349 | schedule_work(&d->work); | ||
311 | spin_unlock_irqrestore(&d->lock, flags); | 350 | spin_unlock_irqrestore(&d->lock, flags); |
312 | } | 351 | } |
313 | 352 | ||
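[Editor's note] The structural change in aoeblk.c is the switch from the bio-based aoeblk_make_request() to a request_fn queue created with blk_init_queue(aoeblk_request, &d->lock). A request_fn is entered with that queue lock held, which is why the not-up path above drains with blk_peek_request()/blk_start_request() before completing each request. A minimal sketch of that shape, using only helpers visible in this diff (sketch_request_fn is an illustrative name):

    /* Runs with d->lock held, as arranged by blk_init_queue(). */
    static void
    sketch_request_fn(struct request_queue *q)
    {
    	struct aoedev *d = q->queuedata;
    	struct request *rq;

    	if ((d->flags & DEVFL_UP) == 0) {
    		/* device gone: dequeue and fail everything pending */
    		while ((rq = blk_peek_request(q)) != NULL) {
    			blk_start_request(rq);
    			aoe_end_request(d, rq, 1);
    		}
    		return;
    	}
    	/* otherwise the AoE command machinery pulls requests itself */
    	aoecmd_work(d);
    }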
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index e86d2062a164..42e67ad6bd20 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoechr.c | 3 | * aoechr.c |
4 | * AoE character device driver | 4 | * AoE character device driver |
@@ -39,6 +39,11 @@ struct ErrMsg { | |||
39 | }; | 39 | }; |
40 | 40 | ||
41 | static DEFINE_MUTEX(aoechr_mutex); | 41 | static DEFINE_MUTEX(aoechr_mutex); |
42 | |||
43 | /* A ring buffer of error messages, to be read through | ||
44 | * "/dev/etherd/err". When no messages are present, | ||
45 | * readers will block waiting for messages to appear. | ||
46 | */ | ||
42 | static struct ErrMsg emsgs[NMSG]; | 47 | static struct ErrMsg emsgs[NMSG]; |
43 | static int emsgs_head_idx, emsgs_tail_idx; | 48 | static int emsgs_head_idx, emsgs_tail_idx; |
44 | static struct completion emsgs_comp; | 49 | static struct completion emsgs_comp; |
@@ -86,34 +91,34 @@ revalidate(const char __user *str, size_t size) | |||
86 | if (copy_from_user(buf, str, size)) | 91 | if (copy_from_user(buf, str, size)) |
87 | return -EFAULT; | 92 | return -EFAULT; |
88 | 93 | ||
89 | /* should be e%d.%d format */ | ||
90 | n = sscanf(buf, "e%d.%d", &major, &minor); | 94 | n = sscanf(buf, "e%d.%d", &major, &minor); |
91 | if (n != 2) { | 95 | if (n != 2) { |
92 | printk(KERN_ERR "aoe: invalid device specification\n"); | 96 | pr_err("aoe: invalid device specification %s\n", buf); |
93 | return -EINVAL; | 97 | return -EINVAL; |
94 | } | 98 | } |
95 | d = aoedev_by_aoeaddr(major, minor); | 99 | d = aoedev_by_aoeaddr(major, minor, 0); |
96 | if (!d) | 100 | if (!d) |
97 | return -EINVAL; | 101 | return -EINVAL; |
98 | spin_lock_irqsave(&d->lock, flags); | 102 | spin_lock_irqsave(&d->lock, flags); |
99 | aoecmd_cleanslate(d); | 103 | aoecmd_cleanslate(d); |
104 | aoecmd_cfg(major, minor); | ||
100 | loop: | 105 | loop: |
101 | skb = aoecmd_ata_id(d); | 106 | skb = aoecmd_ata_id(d); |
102 | spin_unlock_irqrestore(&d->lock, flags); | 107 | spin_unlock_irqrestore(&d->lock, flags); |
103 | /* try again if we are able to sleep a bit, | 108 | /* try again if we are able to sleep a bit, |
104 | * otherwise give up this revalidation | 109 | * otherwise give up this revalidation |
105 | */ | 110 | */ |
106 | if (!skb && !msleep_interruptible(200)) { | 111 | if (!skb && !msleep_interruptible(250)) { |
107 | spin_lock_irqsave(&d->lock, flags); | 112 | spin_lock_irqsave(&d->lock, flags); |
108 | goto loop; | 113 | goto loop; |
109 | } | 114 | } |
115 | aoedev_put(d); | ||
110 | if (skb) { | 116 | if (skb) { |
111 | struct sk_buff_head queue; | 117 | struct sk_buff_head queue; |
112 | __skb_queue_head_init(&queue); | 118 | __skb_queue_head_init(&queue); |
113 | __skb_queue_tail(&queue, skb); | 119 | __skb_queue_tail(&queue, skb); |
114 | aoenet_xmit(&queue); | 120 | aoenet_xmit(&queue); |
115 | } | 121 | } |
116 | aoecmd_cfg(major, minor); | ||
117 | return 0; | 122 | return 0; |
118 | } | 123 | } |
119 | 124 | ||
@@ -174,6 +179,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp | |||
174 | break; | 179 | break; |
175 | case MINOR_FLUSH: | 180 | case MINOR_FLUSH: |
176 | ret = aoedev_flush(buf, cnt); | 181 | ret = aoedev_flush(buf, cnt); |
182 | break; | ||
177 | } | 183 | } |
178 | if (ret == 0) | 184 | if (ret == 0) |
179 | ret = cnt; | 185 | ret = cnt; |
@@ -281,7 +287,7 @@ aoechr_init(void) | |||
281 | int n, i; | 287 | int n, i; |
282 | 288 | ||
283 | n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops); | 289 | n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops); |
284 | if (n < 0) { | 290 | if (n < 0) { |
285 | printk(KERN_ERR "aoe: can't register char device\n"); | 291 | printk(KERN_ERR "aoe: can't register char device\n"); |
286 | return n; | 292 | return n; |
287 | } | 293 | } |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 887f68f6d79a..25ef5c014fca 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoecmd.c | 3 | * aoecmd.c |
4 | * Filesystem request handling methods | 4 | * Filesystem request handling methods |
@@ -12,19 +12,40 @@ | |||
12 | #include <linux/netdevice.h> | 12 | #include <linux/netdevice.h> |
13 | #include <linux/genhd.h> | 13 | #include <linux/genhd.h> |
14 | #include <linux/moduleparam.h> | 14 | #include <linux/moduleparam.h> |
15 | #include <linux/workqueue.h> | ||
16 | #include <linux/kthread.h> | ||
15 | #include <net/net_namespace.h> | 17 | #include <net/net_namespace.h> |
16 | #include <asm/unaligned.h> | 18 | #include <asm/unaligned.h> |
19 | #include <linux/uio.h> | ||
17 | #include "aoe.h" | 20 | #include "aoe.h" |
18 | 21 | ||
22 | #define MAXIOC (8192) /* default meant to avoid most soft lockups */ | ||
23 | |||
24 | static void ktcomplete(struct frame *, struct sk_buff *); | ||
25 | static int count_targets(struct aoedev *d, int *untainted); | ||
26 | |||
27 | static struct buf *nextbuf(struct aoedev *); | ||
28 | |||
19 | static int aoe_deadsecs = 60 * 3; | 29 | static int aoe_deadsecs = 60 * 3; |
20 | module_param(aoe_deadsecs, int, 0644); | 30 | module_param(aoe_deadsecs, int, 0644); |
21 | MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); | 31 | MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); |
22 | 32 | ||
23 | static int aoe_maxout = 16; | 33 | static int aoe_maxout = 64; |
24 | module_param(aoe_maxout, int, 0644); | 34 | module_param(aoe_maxout, int, 0644); |
25 | MODULE_PARM_DESC(aoe_maxout, | 35 | MODULE_PARM_DESC(aoe_maxout, |
26 | "Only aoe_maxout outstanding packets for every MAC on eX.Y."); | 36 | "Only aoe_maxout outstanding packets for every MAC on eX.Y."); |
27 | 37 | ||
38 | static wait_queue_head_t ktiowq; | ||
39 | static struct ktstate kts; | ||
40 | |||
41 | /* io completion queue */ | ||
42 | static struct { | ||
43 | struct list_head head; | ||
44 | spinlock_t lock; | ||
45 | } iocq; | ||
46 | |||
47 | static struct page *empty_page; | ||
48 | |||
28 | static struct sk_buff * | 49 | static struct sk_buff * |
29 | new_skb(ulong len) | 50 | new_skb(ulong len) |
30 | { | 51 | { |
@@ -41,15 +62,38 @@ new_skb(ulong len) | |||
41 | } | 62 | } |
42 | 63 | ||
43 | static struct frame * | 64 | static struct frame * |
44 | getframe(struct aoetgt *t, int tag) | 65 | getframe_deferred(struct aoedev *d, u32 tag) |
45 | { | 66 | { |
46 | struct frame *f, *e; | 67 | struct list_head *head, *pos, *nx; |
68 | struct frame *f; | ||
47 | 69 | ||
48 | f = t->frames; | 70 | head = &d->rexmitq; |
49 | e = f + t->nframes; | 71 | list_for_each_safe(pos, nx, head) { |
50 | for (; f<e; f++) | 72 | f = list_entry(pos, struct frame, head); |
51 | if (f->tag == tag) | 73 | if (f->tag == tag) { |
74 | list_del(pos); | ||
52 | return f; | 75 | return f; |
76 | } | ||
77 | } | ||
78 | return NULL; | ||
79 | } | ||
80 | |||
81 | static struct frame * | ||
82 | getframe(struct aoedev *d, u32 tag) | ||
83 | { | ||
84 | struct frame *f; | ||
85 | struct list_head *head, *pos, *nx; | ||
86 | u32 n; | ||
87 | |||
88 | n = tag % NFACTIVE; | ||
89 | head = &d->factive[n]; | ||
90 | list_for_each_safe(pos, nx, head) { | ||
91 | f = list_entry(pos, struct frame, head); | ||
92 | if (f->tag == tag) { | ||
93 | list_del(pos); | ||
94 | return f; | ||
95 | } | ||
96 | } | ||
53 | return NULL; | 97 | return NULL; |
54 | } | 98 | } |
55 | 99 | ||
@@ -59,18 +103,18 @@ getframe(struct aoetgt *t, int tag) | |||
59 | * This driver reserves tag -1 to mean "unused frame." | 103 | * This driver reserves tag -1 to mean "unused frame." |
60 | */ | 104 | */ |
61 | static int | 105 | static int |
62 | newtag(struct aoetgt *t) | 106 | newtag(struct aoedev *d) |
63 | { | 107 | { |
64 | register ulong n; | 108 | register ulong n; |
65 | 109 | ||
66 | n = jiffies & 0xffff; | 110 | n = jiffies & 0xffff; |
67 | return n |= (++t->lasttag & 0x7fff) << 16; | 111 | return n |= (++d->lasttag & 0x7fff) << 16; |
68 | } | 112 | } |
69 | 113 | ||
70 | static int | 114 | static u32 |
71 | aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) | 115 | aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) |
72 | { | 116 | { |
73 | u32 host_tag = newtag(t); | 117 | u32 host_tag = newtag(d); |
74 | 118 | ||
75 | memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); | 119 | memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); |
76 | memcpy(h->dst, t->addr, sizeof h->dst); | 120 | memcpy(h->dst, t->addr, sizeof h->dst); |
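[Editor's note] Tags are now allocated per device (d->lasttag) rather than per target. The layout is unchanged: the low 16 bits carry the send time in jiffies and the upper bits a 15-bit sequence counter, so a frame's age can later be estimated from the tag alone. A one-line illustration, equivalent in effect to newtag() above (the name is hypothetical):

    /* bits  0..15: jiffies & 0xffff        (send time, HZ resolution)
     * bits 16..30: ++d->lasttag & 0x7fff   (sequence counter)
     */
    static u32
    sketch_mktag(struct aoedev *d)
    {
    	return (jiffies & 0xffff) | ((u32)(++d->lasttag & 0x7fff) << 16);
    }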
@@ -95,16 +139,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba) | |||
95 | ah->lba5 = lba >>= 8; | 139 | ah->lba5 = lba >>= 8; |
96 | } | 140 | } |
97 | 141 | ||
98 | static void | 142 | static struct aoeif * |
99 | ifrotate(struct aoetgt *t) | 143 | ifrotate(struct aoetgt *t) |
100 | { | 144 | { |
101 | t->ifp++; | 145 | struct aoeif *ifp; |
102 | if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL) | 146 | |
103 | t->ifp = t->ifs; | 147 | ifp = t->ifp; |
104 | if (t->ifp->nd == NULL) { | 148 | ifp++; |
105 | printk(KERN_INFO "aoe: no interface to rotate to\n"); | 149 | if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL) |
106 | BUG(); | 150 | ifp = t->ifs; |
107 | } | 151 | if (ifp->nd == NULL) |
152 | return NULL; | ||
153 | return t->ifp = ifp; | ||
108 | } | 154 | } |
109 | 155 | ||
110 | static void | 156 | static void |
@@ -129,134 +175,182 @@ skb_pool_get(struct aoedev *d) | |||
129 | return NULL; | 175 | return NULL; |
130 | } | 176 | } |
131 | 177 | ||
132 | /* freeframe is where we do our load balancing so it's a little hairy. */ | 178 | void |
179 | aoe_freetframe(struct frame *f) | ||
180 | { | ||
181 | struct aoetgt *t; | ||
182 | |||
183 | t = f->t; | ||
184 | f->buf = NULL; | ||
185 | f->lba = 0; | ||
186 | f->bv = NULL; | ||
187 | f->r_skb = NULL; | ||
188 | f->flags = 0; | ||
189 | list_add(&f->head, &t->ffree); | ||
190 | } | ||
191 | |||
133 | static struct frame * | 192 | static struct frame * |
134 | freeframe(struct aoedev *d) | 193 | newtframe(struct aoedev *d, struct aoetgt *t) |
135 | { | 194 | { |
136 | struct frame *f, *e, *rf; | 195 | struct frame *f; |
137 | struct aoetgt **t; | ||
138 | struct sk_buff *skb; | 196 | struct sk_buff *skb; |
197 | struct list_head *pos; | ||
198 | |||
199 | if (list_empty(&t->ffree)) { | ||
200 | if (t->falloc >= NSKBPOOLMAX*2) | ||
201 | return NULL; | ||
202 | f = kcalloc(1, sizeof(*f), GFP_ATOMIC); | ||
203 | if (f == NULL) | ||
204 | return NULL; | ||
205 | t->falloc++; | ||
206 | f->t = t; | ||
207 | } else { | ||
208 | pos = t->ffree.next; | ||
209 | list_del(pos); | ||
210 | f = list_entry(pos, struct frame, head); | ||
211 | } | ||
212 | |||
213 | skb = f->skb; | ||
214 | if (skb == NULL) { | ||
215 | f->skb = skb = new_skb(ETH_ZLEN); | ||
216 | if (!skb) { | ||
217 | bail: aoe_freetframe(f); | ||
218 | return NULL; | ||
219 | } | ||
220 | } | ||
221 | |||
222 | if (atomic_read(&skb_shinfo(skb)->dataref) != 1) { | ||
223 | skb = skb_pool_get(d); | ||
224 | if (skb == NULL) | ||
225 | goto bail; | ||
226 | skb_pool_put(d, f->skb); | ||
227 | f->skb = skb; | ||
228 | } | ||
229 | |||
230 | skb->truesize -= skb->data_len; | ||
231 | skb_shinfo(skb)->nr_frags = skb->data_len = 0; | ||
232 | skb_trim(skb, 0); | ||
233 | return f; | ||
234 | } | ||
235 | |||
236 | static struct frame * | ||
237 | newframe(struct aoedev *d) | ||
238 | { | ||
239 | struct frame *f; | ||
240 | struct aoetgt *t, **tt; | ||
241 | int totout = 0; | ||
242 | int use_tainted; | ||
243 | int has_untainted; | ||
139 | 244 | ||
140 | if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */ | 245 | if (!d->targets || !d->targets[0]) { |
141 | printk(KERN_ERR "aoe: NULL TARGETS!\n"); | 246 | printk(KERN_ERR "aoe: NULL TARGETS!\n"); |
142 | return NULL; | 247 | return NULL; |
143 | } | 248 | } |
144 | t = d->tgt; | 249 | tt = d->tgt; /* last used target */ |
145 | t++; | 250 | for (use_tainted = 0, has_untainted = 0;;) { |
146 | if (t >= &d->targets[NTARGETS] || !*t) | 251 | tt++; |
147 | t = d->targets; | 252 | if (tt >= &d->targets[d->ntargets] || !*tt) |
148 | for (;;) { | 253 | tt = d->targets; |
149 | if ((*t)->nout < (*t)->maxout | 254 | t = *tt; |
150 | && t != d->htgt | 255 | if (!t->taint) { |
151 | && (*t)->ifp->nd) { | 256 | has_untainted = 1; |
152 | rf = NULL; | 257 | totout += t->nout; |
153 | f = (*t)->frames; | 258 | } |
154 | e = f + (*t)->nframes; | 259 | if (t->nout < t->maxout |
155 | for (; f < e; f++) { | 260 | && (use_tainted || !t->taint) |
156 | if (f->tag != FREETAG) | 261 | && t->ifp->nd) { |
157 | continue; | 262 | f = newtframe(d, t); |
158 | skb = f->skb; | 263 | if (f) { |
159 | if (!skb | 264 | ifrotate(t); |
160 | && !(f->skb = skb = new_skb(ETH_ZLEN))) | 265 | d->tgt = tt; |
161 | continue; | ||
162 | if (atomic_read(&skb_shinfo(skb)->dataref) | ||
163 | != 1) { | ||
164 | if (!rf) | ||
165 | rf = f; | ||
166 | continue; | ||
167 | } | ||
168 | gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0; | ||
169 | skb_trim(skb, 0); | ||
170 | d->tgt = t; | ||
171 | ifrotate(*t); | ||
172 | return f; | 266 | return f; |
173 | } | 267 | } |
174 | /* Work can be done, but the network layer is | ||
175 | holding our precious packets. Try to grab | ||
176 | one from the pool. */ | ||
177 | f = rf; | ||
178 | if (f == NULL) { /* more paranoia */ | ||
179 | printk(KERN_ERR | ||
180 | "aoe: freeframe: %s.\n", | ||
181 | "unexpected null rf"); | ||
182 | d->flags |= DEVFL_KICKME; | ||
183 | return NULL; | ||
184 | } | ||
185 | skb = skb_pool_get(d); | ||
186 | if (skb) { | ||
187 | skb_pool_put(d, f->skb); | ||
188 | f->skb = skb; | ||
189 | goto gotone; | ||
190 | } | ||
191 | (*t)->dataref++; | ||
192 | if ((*t)->nout == 0) | ||
193 | d->flags |= DEVFL_KICKME; | ||
194 | } | 268 | } |
195 | if (t == d->tgt) /* we've looped and found nada */ | 269 | if (tt == d->tgt) { /* we've looped and found nada */ |
196 | break; | 270 | if (!use_tainted && !has_untainted) |
197 | t++; | 271 | use_tainted = 1; |
198 | if (t >= &d->targets[NTARGETS] || !*t) | 272 | else |
199 | t = d->targets; | 273 | break; |
274 | } | ||
275 | } | ||
276 | if (totout == 0) { | ||
277 | d->kicked++; | ||
278 | d->flags |= DEVFL_KICKME; | ||
200 | } | 279 | } |
201 | return NULL; | 280 | return NULL; |
202 | } | 281 | } |
203 | 282 | ||
204 | static int | 283 | static void |
205 | aoecmd_ata_rw(struct aoedev *d) | 284 | skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt) |
206 | { | 285 | { |
207 | struct frame *f; | 286 | int frag = 0; |
287 | ulong fcnt; | ||
288 | loop: | ||
289 | fcnt = bv->bv_len - (off - bv->bv_offset); | ||
290 | if (fcnt > cnt) | ||
291 | fcnt = cnt; | ||
292 | skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt); | ||
293 | cnt -= fcnt; | ||
294 | if (cnt <= 0) | ||
295 | return; | ||
296 | bv++; | ||
297 | off = bv->bv_offset; | ||
298 | goto loop; | ||
299 | } | ||
300 | |||
301 | static void | ||
302 | fhash(struct frame *f) | ||
303 | { | ||
304 | struct aoedev *d = f->t->d; | ||
305 | u32 n; | ||
306 | |||
307 | n = f->tag % NFACTIVE; | ||
308 | list_add_tail(&f->head, &d->factive[n]); | ||
309 | } | ||
310 | |||
311 | static void | ||
312 | ata_rw_frameinit(struct frame *f) | ||
313 | { | ||
314 | struct aoetgt *t; | ||
208 | struct aoe_hdr *h; | 315 | struct aoe_hdr *h; |
209 | struct aoe_atahdr *ah; | 316 | struct aoe_atahdr *ah; |
210 | struct buf *buf; | ||
211 | struct bio_vec *bv; | ||
212 | struct aoetgt *t; | ||
213 | struct sk_buff *skb; | 317 | struct sk_buff *skb; |
214 | ulong bcnt; | ||
215 | char writebit, extbit; | 318 | char writebit, extbit; |
216 | 319 | ||
217 | writebit = 0x10; | ||
218 | extbit = 0x4; | ||
219 | |||
220 | f = freeframe(d); | ||
221 | if (f == NULL) | ||
222 | return 0; | ||
223 | t = *d->tgt; | ||
224 | buf = d->inprocess; | ||
225 | bv = buf->bv; | ||
226 | bcnt = t->ifp->maxbcnt; | ||
227 | if (bcnt == 0) | ||
228 | bcnt = DEFAULTBCNT; | ||
229 | if (bcnt > buf->bv_resid) | ||
230 | bcnt = buf->bv_resid; | ||
231 | /* initialize the headers & frame */ | ||
232 | skb = f->skb; | 320 | skb = f->skb; |
233 | h = (struct aoe_hdr *) skb_mac_header(skb); | 321 | h = (struct aoe_hdr *) skb_mac_header(skb); |
234 | ah = (struct aoe_atahdr *) (h+1); | 322 | ah = (struct aoe_atahdr *) (h + 1); |
235 | skb_put(skb, sizeof *h + sizeof *ah); | 323 | skb_put(skb, sizeof(*h) + sizeof(*ah)); |
236 | memset(h, 0, skb->len); | 324 | memset(h, 0, skb->len); |
237 | f->tag = aoehdr_atainit(d, t, h); | 325 | |
326 | writebit = 0x10; | ||
327 | extbit = 0x4; | ||
328 | |||
329 | t = f->t; | ||
330 | f->tag = aoehdr_atainit(t->d, t, h); | ||
331 | fhash(f); | ||
238 | t->nout++; | 332 | t->nout++; |
239 | f->waited = 0; | 333 | f->waited = 0; |
240 | f->buf = buf; | 334 | f->waited_total = 0; |
241 | f->bufaddr = page_address(bv->bv_page) + buf->bv_off; | 335 | if (f->buf) |
242 | f->bcnt = bcnt; | 336 | f->lba = f->buf->sector; |
243 | f->lba = buf->sector; | ||
244 | 337 | ||
245 | /* set up ata header */ | 338 | /* set up ata header */ |
246 | ah->scnt = bcnt >> 9; | 339 | ah->scnt = f->bcnt >> 9; |
247 | put_lba(ah, buf->sector); | 340 | put_lba(ah, f->lba); |
248 | if (d->flags & DEVFL_EXT) { | 341 | if (t->d->flags & DEVFL_EXT) { |
249 | ah->aflags |= AOEAFL_EXT; | 342 | ah->aflags |= AOEAFL_EXT; |
250 | } else { | 343 | } else { |
251 | extbit = 0; | 344 | extbit = 0; |
252 | ah->lba3 &= 0x0f; | 345 | ah->lba3 &= 0x0f; |
253 | ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ | 346 | ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ |
254 | } | 347 | } |
255 | if (bio_data_dir(buf->bio) == WRITE) { | 348 | if (f->buf && bio_data_dir(f->buf->bio) == WRITE) { |
256 | skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt); | 349 | skb_fillup(skb, f->bv, f->bv_off, f->bcnt); |
257 | ah->aflags |= AOEAFL_WRITE; | 350 | ah->aflags |= AOEAFL_WRITE; |
258 | skb->len += bcnt; | 351 | skb->len += f->bcnt; |
259 | skb->data_len = bcnt; | 352 | skb->data_len = f->bcnt; |
353 | skb->truesize += f->bcnt; | ||
260 | t->wpkts++; | 354 | t->wpkts++; |
261 | } else { | 355 | } else { |
262 | t->rpkts++; | 356 | t->rpkts++; |
@@ -264,26 +358,68 @@ aoecmd_ata_rw(struct aoedev *d) | |||
264 | } | 358 | } |
265 | 359 | ||
266 | ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; | 360 | ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; |
361 | skb->dev = t->ifp->nd; | ||
362 | } | ||
363 | |||
364 | static int | ||
365 | aoecmd_ata_rw(struct aoedev *d) | ||
366 | { | ||
367 | struct frame *f; | ||
368 | struct buf *buf; | ||
369 | struct aoetgt *t; | ||
370 | struct sk_buff *skb; | ||
371 | struct sk_buff_head queue; | ||
372 | ulong bcnt, fbcnt; | ||
373 | |||
374 | buf = nextbuf(d); | ||
375 | if (buf == NULL) | ||
376 | return 0; | ||
377 | f = newframe(d); | ||
378 | if (f == NULL) | ||
379 | return 0; | ||
380 | t = *d->tgt; | ||
381 | bcnt = d->maxbcnt; | ||
382 | if (bcnt == 0) | ||
383 | bcnt = DEFAULTBCNT; | ||
384 | if (bcnt > buf->resid) | ||
385 | bcnt = buf->resid; | ||
386 | fbcnt = bcnt; | ||
387 | f->bv = buf->bv; | ||
388 | f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid); | ||
389 | do { | ||
390 | if (fbcnt < buf->bv_resid) { | ||
391 | buf->bv_resid -= fbcnt; | ||
392 | buf->resid -= fbcnt; | ||
393 | break; | ||
394 | } | ||
395 | fbcnt -= buf->bv_resid; | ||
396 | buf->resid -= buf->bv_resid; | ||
397 | if (buf->resid == 0) { | ||
398 | d->ip.buf = NULL; | ||
399 | break; | ||
400 | } | ||
401 | buf->bv++; | ||
402 | buf->bv_resid = buf->bv->bv_len; | ||
403 | WARN_ON(buf->bv_resid == 0); | ||
404 | } while (fbcnt); | ||
405 | |||
406 | /* initialize the headers & frame */ | ||
407 | f->buf = buf; | ||
408 | f->bcnt = bcnt; | ||
409 | ata_rw_frameinit(f); | ||
267 | 410 | ||
268 | /* mark all tracking fields and load out */ | 411 | /* mark all tracking fields and load out */ |
269 | buf->nframesout += 1; | 412 | buf->nframesout += 1; |
270 | buf->bv_off += bcnt; | ||
271 | buf->bv_resid -= bcnt; | ||
272 | buf->resid -= bcnt; | ||
273 | buf->sector += bcnt >> 9; | 413 | buf->sector += bcnt >> 9; |
274 | if (buf->resid == 0) { | ||
275 | d->inprocess = NULL; | ||
276 | } else if (buf->bv_resid == 0) { | ||
277 | buf->bv = ++bv; | ||
278 | buf->bv_resid = bv->bv_len; | ||
279 | WARN_ON(buf->bv_resid == 0); | ||
280 | buf->bv_off = bv->bv_offset; | ||
281 | } | ||
282 | 414 | ||
283 | skb->dev = t->ifp->nd; | 415 | skb = skb_clone(f->skb, GFP_ATOMIC); |
284 | skb = skb_clone(skb, GFP_ATOMIC); | 416 | if (skb) { |
285 | if (skb) | 417 | do_gettimeofday(&f->sent); |
286 | __skb_queue_tail(&d->sendq, skb); | 418 | f->sent_jiffs = (u32) jiffies; |
419 | __skb_queue_head_init(&queue); | ||
420 | __skb_queue_tail(&queue, skb); | ||
421 | aoenet_xmit(&queue); | ||
422 | } | ||
287 | return 1; | 423 | return 1; |
288 | } | 424 | } |
289 | 425 | ||
@@ -330,60 +466,88 @@ cont: | |||
330 | } | 466 | } |
331 | 467 | ||
332 | static void | 468 | static void |
333 | resend(struct aoedev *d, struct aoetgt *t, struct frame *f) | 469 | resend(struct aoedev *d, struct frame *f) |
334 | { | 470 | { |
335 | struct sk_buff *skb; | 471 | struct sk_buff *skb; |
472 | struct sk_buff_head queue; | ||
336 | struct aoe_hdr *h; | 473 | struct aoe_hdr *h; |
337 | struct aoe_atahdr *ah; | 474 | struct aoe_atahdr *ah; |
475 | struct aoetgt *t; | ||
338 | char buf[128]; | 476 | char buf[128]; |
339 | u32 n; | 477 | u32 n; |
340 | 478 | ||
341 | ifrotate(t); | 479 | t = f->t; |
342 | n = newtag(t); | 480 | n = newtag(d); |
343 | skb = f->skb; | 481 | skb = f->skb; |
482 | if (ifrotate(t) == NULL) { | ||
483 | /* probably can't happen, but set it up to fail anyway */ | ||
484 | pr_info("aoe: resend: no interfaces to rotate to.\n"); | ||
485 | ktcomplete(f, NULL); | ||
486 | return; | ||
487 | } | ||
344 | h = (struct aoe_hdr *) skb_mac_header(skb); | 488 | h = (struct aoe_hdr *) skb_mac_header(skb); |
345 | ah = (struct aoe_atahdr *) (h+1); | 489 | ah = (struct aoe_atahdr *) (h+1); |
346 | 490 | ||
347 | snprintf(buf, sizeof buf, | 491 | if (!(f->flags & FFL_PROBE)) { |
348 | "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", | 492 | snprintf(buf, sizeof(buf), |
349 | "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n, | 493 | "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", |
350 | h->src, h->dst, t->nout); | 494 | "retransmit", d->aoemajor, d->aoeminor, |
351 | aoechr_error(buf); | 495 | f->tag, jiffies, n, |
496 | h->src, h->dst, t->nout); | ||
497 | aoechr_error(buf); | ||
498 | } | ||
352 | 499 | ||
353 | f->tag = n; | 500 | f->tag = n; |
501 | fhash(f); | ||
354 | h->tag = cpu_to_be32(n); | 502 | h->tag = cpu_to_be32(n); |
355 | memcpy(h->dst, t->addr, sizeof h->dst); | 503 | memcpy(h->dst, t->addr, sizeof h->dst); |
356 | memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); | 504 | memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); |
357 | 505 | ||
358 | switch (ah->cmdstat) { | ||
359 | default: | ||
360 | break; | ||
361 | case ATA_CMD_PIO_READ: | ||
362 | case ATA_CMD_PIO_READ_EXT: | ||
363 | case ATA_CMD_PIO_WRITE: | ||
364 | case ATA_CMD_PIO_WRITE_EXT: | ||
365 | put_lba(ah, f->lba); | ||
366 | |||
367 | n = f->bcnt; | ||
368 | if (n > DEFAULTBCNT) | ||
369 | n = DEFAULTBCNT; | ||
370 | ah->scnt = n >> 9; | ||
371 | if (ah->aflags & AOEAFL_WRITE) { | ||
372 | skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr), | ||
373 | offset_in_page(f->bufaddr), n); | ||
374 | skb->len = sizeof *h + sizeof *ah + n; | ||
375 | skb->data_len = n; | ||
376 | } | ||
377 | } | ||
378 | skb->dev = t->ifp->nd; | 506 | skb->dev = t->ifp->nd; |
379 | skb = skb_clone(skb, GFP_ATOMIC); | 507 | skb = skb_clone(skb, GFP_ATOMIC); |
380 | if (skb == NULL) | 508 | if (skb == NULL) |
381 | return; | 509 | return; |
382 | __skb_queue_tail(&d->sendq, skb); | 510 | do_gettimeofday(&f->sent); |
511 | f->sent_jiffs = (u32) jiffies; | ||
512 | __skb_queue_head_init(&queue); | ||
513 | __skb_queue_tail(&queue, skb); | ||
514 | aoenet_xmit(&queue); | ||
515 | } | ||
516 | |||
517 | static int | ||
518 | tsince_hr(struct frame *f) | ||
519 | { | ||
520 | struct timeval now; | ||
521 | int n; | ||
522 | |||
523 | do_gettimeofday(&now); | ||
524 | n = now.tv_usec - f->sent.tv_usec; | ||
525 | n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; | ||
526 | |||
527 | if (n < 0) | ||
528 | n = -n; | ||
529 | |||
530 | /* For relatively long periods, use jiffies to avoid | ||
531 | * discrepancies caused by updates to the system time. | ||
532 | * | ||
533 | * On system with HZ of 1000, 32-bits is over 49 days | ||
534 | * worth of jiffies, or over 71 minutes worth of usecs. | ||
535 | * | ||
536 | * Jiffies overflow is handled by subtraction of unsigned ints: | ||
537 | * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe | ||
538 | * $3 = 4 | ||
539 | * (gdb) | ||
540 | */ | ||
541 | if (n > USEC_PER_SEC / 4) { | ||
542 | n = ((u32) jiffies) - f->sent_jiffs; | ||
543 | n *= USEC_PER_SEC / HZ; | ||
544 | } | ||
545 | |||
546 | return n; | ||
383 | } | 547 | } |
384 | 548 | ||
385 | static int | 549 | static int |
386 | tsince(int tag) | 550 | tsince(u32 tag) |
387 | { | 551 | { |
388 | int n; | 552 | int n; |
389 | 553 | ||
@@ -391,7 +555,7 @@ tsince(int tag) | |||
391 | n -= tag & 0xffff; | 555 | n -= tag & 0xffff; |
392 | if (n < 0) | 556 | if (n < 0) |
393 | n += 1<<16; | 557 | n += 1<<16; |
394 | return n; | 558 | return jiffies_to_usecs(n + 1); |
395 | } | 559 | } |
396 | 560 | ||
397 | static struct aoeif * | 561 | static struct aoeif * |
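[Editor's note] tsince() now reports a frame's age in microseconds so it can be compared directly with the microsecond timeout computed by rto() below, while tsince_hr() gives the precise figure from do_gettimeofday() and falls back to sent_jiffs for long intervals, as its comment explains. A worked illustration of the 16-bit wraparound handling, mirroring tsince() (the name is hypothetical):

    /* Same computation as tsince() above, with an example spelled out:
     * at HZ=1000, a tag stamped at jiffies 0xfff0 and checked at jiffies
     * 0x0010 gives n = 0x0010 - 0xfff0 = -0xffe0; after the wraparound
     * correction n = 0x20 = 32 ticks, reported as
     * jiffies_to_usecs(33) = 33000 microseconds.
     */
    static int
    sketch_tag_age_usecs(u32 tag)
    {
    	int n;

    	n = (jiffies & 0xffff) - (tag & 0xffff);
    	if (n < 0)
    		n += 1 << 16;
    	return jiffies_to_usecs(n + 1);
    }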
@@ -407,195 +571,411 @@ getif(struct aoetgt *t, struct net_device *nd) | |||
407 | return NULL; | 571 | return NULL; |
408 | } | 572 | } |
409 | 573 | ||
410 | static struct aoeif * | ||
411 | addif(struct aoetgt *t, struct net_device *nd) | ||
412 | { | ||
413 | struct aoeif *p; | ||
414 | |||
415 | p = getif(t, NULL); | ||
416 | if (!p) | ||
417 | return NULL; | ||
418 | p->nd = nd; | ||
419 | p->maxbcnt = DEFAULTBCNT; | ||
420 | p->lost = 0; | ||
421 | p->lostjumbo = 0; | ||
422 | return p; | ||
423 | } | ||
424 | |||
425 | static void | 574 | static void |
426 | ejectif(struct aoetgt *t, struct aoeif *ifp) | 575 | ejectif(struct aoetgt *t, struct aoeif *ifp) |
427 | { | 576 | { |
428 | struct aoeif *e; | 577 | struct aoeif *e; |
578 | struct net_device *nd; | ||
429 | ulong n; | 579 | ulong n; |
430 | 580 | ||
581 | nd = ifp->nd; | ||
431 | e = t->ifs + NAOEIFS - 1; | 582 | e = t->ifs + NAOEIFS - 1; |
432 | n = (e - ifp) * sizeof *ifp; | 583 | n = (e - ifp) * sizeof *ifp; |
433 | memmove(ifp, ifp+1, n); | 584 | memmove(ifp, ifp+1, n); |
434 | e->nd = NULL; | 585 | e->nd = NULL; |
586 | dev_put(nd); | ||
435 | } | 587 | } |
436 | 588 | ||
437 | static int | 589 | static struct frame * |
438 | sthtith(struct aoedev *d) | 590 | reassign_frame(struct frame *f) |
591 | { | ||
592 | struct frame *nf; | ||
593 | struct sk_buff *skb; | ||
594 | |||
595 | nf = newframe(f->t->d); | ||
596 | if (!nf) | ||
597 | return NULL; | ||
598 | if (nf->t == f->t) { | ||
599 | aoe_freetframe(nf); | ||
600 | return NULL; | ||
601 | } | ||
602 | |||
603 | skb = nf->skb; | ||
604 | nf->skb = f->skb; | ||
605 | nf->buf = f->buf; | ||
606 | nf->bcnt = f->bcnt; | ||
607 | nf->lba = f->lba; | ||
608 | nf->bv = f->bv; | ||
609 | nf->bv_off = f->bv_off; | ||
610 | nf->waited = 0; | ||
611 | nf->waited_total = f->waited_total; | ||
612 | nf->sent = f->sent; | ||
613 | nf->sent_jiffs = f->sent_jiffs; | ||
614 | f->skb = skb; | ||
615 | |||
616 | return nf; | ||
617 | } | ||
618 | |||
619 | static void | ||
620 | probe(struct aoetgt *t) | ||
439 | { | 621 | { |
440 | struct frame *f, *e, *nf; | 622 | struct aoedev *d; |
623 | struct frame *f; | ||
441 | struct sk_buff *skb; | 624 | struct sk_buff *skb; |
442 | struct aoetgt *ht = *d->htgt; | 625 | struct sk_buff_head queue; |
626 | size_t n, m; | ||
627 | int frag; | ||
628 | |||
629 | d = t->d; | ||
630 | f = newtframe(d, t); | ||
631 | if (!f) { | ||
632 | pr_err("%s %pm for e%ld.%d: %s\n", | ||
633 | "aoe: cannot probe remote address", | ||
634 | t->addr, | ||
635 | (long) d->aoemajor, d->aoeminor, | ||
636 | "no frame available"); | ||
637 | return; | ||
638 | } | ||
639 | f->flags |= FFL_PROBE; | ||
640 | ifrotate(t); | ||
641 | f->bcnt = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT; | ||
642 | ata_rw_frameinit(f); | ||
643 | skb = f->skb; | ||
644 | for (frag = 0, n = f->bcnt; n > 0; ++frag, n -= m) { | ||
645 | if (n < PAGE_SIZE) | ||
646 | m = n; | ||
647 | else | ||
648 | m = PAGE_SIZE; | ||
649 | skb_fill_page_desc(skb, frag, empty_page, 0, m); | ||
650 | } | ||
651 | skb->len += f->bcnt; | ||
652 | skb->data_len = f->bcnt; | ||
653 | skb->truesize += f->bcnt; | ||
654 | |||
655 | skb = skb_clone(f->skb, GFP_ATOMIC); | ||
656 | if (skb) { | ||
657 | do_gettimeofday(&f->sent); | ||
658 | f->sent_jiffs = (u32) jiffies; | ||
659 | __skb_queue_head_init(&queue); | ||
660 | __skb_queue_tail(&queue, skb); | ||
661 | aoenet_xmit(&queue); | ||
662 | } | ||
663 | } | ||
443 | 664 | ||
444 | f = ht->frames; | 665 | static long |
445 | e = f + ht->nframes; | 666 | rto(struct aoedev *d) |
446 | for (; f < e; f++) { | 667 | { |
447 | if (f->tag == FREETAG) | 668 | long t; |
669 | |||
670 | t = 2 * d->rttavg >> RTTSCALE; | ||
671 | t += 8 * d->rttdev >> RTTDSCALE; | ||
672 | if (t == 0) | ||
673 | t = 1; | ||
674 | |||
675 | return t; | ||
676 | } | ||
677 | |||
678 | static void | ||
679 | rexmit_deferred(struct aoedev *d) | ||
680 | { | ||
681 | struct aoetgt *t; | ||
682 | struct frame *f; | ||
683 | struct frame *nf; | ||
684 | struct list_head *pos, *nx, *head; | ||
685 | int since; | ||
686 | int untainted; | ||
687 | |||
688 | count_targets(d, &untainted); | ||
689 | |||
690 | head = &d->rexmitq; | ||
691 | list_for_each_safe(pos, nx, head) { | ||
692 | f = list_entry(pos, struct frame, head); | ||
693 | t = f->t; | ||
694 | if (t->taint) { | ||
695 | if (!(f->flags & FFL_PROBE)) { | ||
696 | nf = reassign_frame(f); | ||
697 | if (nf) { | ||
698 | if (t->nout_probes == 0 | ||
699 | && untainted > 0) { | ||
700 | probe(t); | ||
701 | t->nout_probes++; | ||
702 | } | ||
703 | list_replace(&f->head, &nf->head); | ||
704 | pos = &nf->head; | ||
705 | aoe_freetframe(f); | ||
706 | f = nf; | ||
707 | t = f->t; | ||
708 | } | ||
709 | } else if (untainted < 1) { | ||
710 | /* don't probe w/o other untainted aoetgts */ | ||
711 | goto stop_probe; | ||
712 | } else if (tsince_hr(f) < t->taint * rto(d)) { | ||
713 | /* reprobe slowly when taint is high */ | ||
714 | continue; | ||
715 | } | ||
716 | } else if (f->flags & FFL_PROBE) { | ||
717 | stop_probe: /* don't probe untainted aoetgts */ | ||
718 | list_del(pos); | ||
719 | aoe_freetframe(f); | ||
720 | /* leaving d->kicked, because this is routine */ | ||
721 | f->t->d->flags |= DEVFL_KICKME; | ||
448 | continue; | 722 | continue; |
449 | nf = freeframe(d); | 723 | } |
450 | if (!nf) | 724 | if (t->nout >= t->maxout) |
451 | return 0; | 725 | continue; |
452 | skb = nf->skb; | 726 | list_del(pos); |
453 | *nf = *f; | 727 | t->nout++; |
454 | f->skb = skb; | 728 | if (f->flags & FFL_PROBE) |
455 | f->tag = FREETAG; | 729 | t->nout_probes++; |
456 | nf->waited = 0; | 730 | since = tsince_hr(f); |
457 | ht->nout--; | 731 | f->waited += since; |
458 | (*d->tgt)->nout++; | 732 | f->waited_total += since; |
459 | resend(d, *d->tgt, nf); | 733 | resend(d, f); |
460 | } | 734 | } |
461 | /* he's clean, he's useless. take away his interfaces */ | ||
462 | memset(ht->ifs, 0, sizeof ht->ifs); | ||
463 | d->htgt = NULL; | ||
464 | return 1; | ||
465 | } | 735 | } |
466 | 736 | ||
467 | static inline unsigned char | 737 | /* An aoetgt accumulates demerits quickly, and successful |
468 | ata_scnt(unsigned char *packet) { | 738 | * probing redeems the aoetgt slowly. |
469 | struct aoe_hdr *h; | 739 | */ |
470 | struct aoe_atahdr *ah; | 740 | static void |
741 | scorn(struct aoetgt *t) | ||
742 | { | ||
743 | int n; | ||
471 | 744 | ||
472 | h = (struct aoe_hdr *) packet; | 745 | n = t->taint++; |
473 | ah = (struct aoe_atahdr *) (h+1); | 746 | t->taint += t->taint * 2; |
474 | return ah->scnt; | 747 | if (n > t->taint) |
748 | t->taint = n; | ||
749 | if (t->taint > MAX_TAINT) | ||
750 | t->taint = MAX_TAINT; | ||
751 | } | ||
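To make the "demerits quickly, redeems slowly" behavior above concrete: scorn() roughly triples a target's taint on each failure and clamps it, rexmit_deferred() above backs off reprobing by taint * rto(d), and a successful completion (ktiocomplete(), later in this patch) redeems only one point at a time. A standalone sketch of the growth; MAX_TAINT is defined in aoe.h rather than in this hunk, so the 1000 used below is an assumed stand-in:

#include <stdio.h>

#define MAX_TAINT 1000		/* assumed value; the real one is in aoe.h */

struct tgt { int taint; };

/* Same arithmetic as scorn(): roughly triple the taint per failure,
 * guard against integer overflow, and clamp at MAX_TAINT. */
static void scorn(struct tgt *t)
{
	int n;

	n = t->taint++;
	t->taint += t->taint * 2;
	if (n > t->taint)
		t->taint = n;
	if (t->taint > MAX_TAINT)
		t->taint = MAX_TAINT;
}

int main(void)
{
	struct tgt t = { 0 };
	int i;

	for (i = 1; i <= 7; i++) {
		scorn(&t);
		printf("after failure %d: taint=%d\n", i, t.taint);
	}
	/* prints 3, 12, 39, 120, 363, 1000, 1000: six straight failures
	 * saturate the clamp, while recovery costs one successful
	 * response per point. */
	return 0;
}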
752 | |||
753 | static int | ||
754 | count_targets(struct aoedev *d, int *untainted) | ||
755 | { | ||
756 | int i, good; | ||
757 | |||
758 | for (i = good = 0; i < d->ntargets && d->targets[i]; ++i) | ||
759 | if (d->targets[i]->taint == 0) | ||
760 | good++; | ||
761 | |||
762 | if (untainted) | ||
763 | *untainted = good; | ||
764 | return i; | ||
475 | } | 765 | } |
476 | 766 | ||
477 | static void | 767 | static void |
478 | rexmit_timer(ulong vp) | 768 | rexmit_timer(ulong vp) |
479 | { | 769 | { |
480 | struct sk_buff_head queue; | ||
481 | struct aoedev *d; | 770 | struct aoedev *d; |
482 | struct aoetgt *t, **tt, **te; | 771 | struct aoetgt *t; |
483 | struct aoeif *ifp; | 772 | struct aoeif *ifp; |
484 | struct frame *f, *e; | 773 | struct frame *f; |
774 | struct list_head *head, *pos, *nx; | ||
775 | LIST_HEAD(flist); | ||
485 | register long timeout; | 776 | register long timeout; |
486 | ulong flags, n; | 777 | ulong flags, n; |
778 | int i; | ||
779 | int utgts; /* number of aoetgt descriptors (not slots) */ | ||
780 | int since; | ||
487 | 781 | ||
488 | d = (struct aoedev *) vp; | 782 | d = (struct aoedev *) vp; |
489 | 783 | ||
490 | /* timeout is always ~150% of the moving average */ | ||
491 | timeout = d->rttavg; | ||
492 | timeout += timeout >> 1; | ||
493 | |||
494 | spin_lock_irqsave(&d->lock, flags); | 784 | spin_lock_irqsave(&d->lock, flags); |
495 | 785 | ||
786 | /* timeout based on observed timings and variations */ | ||
787 | timeout = rto(d); | ||
788 | |||
789 | utgts = count_targets(d, NULL); | ||
790 | |||
496 | if (d->flags & DEVFL_TKILL) { | 791 | if (d->flags & DEVFL_TKILL) { |
497 | spin_unlock_irqrestore(&d->lock, flags); | 792 | spin_unlock_irqrestore(&d->lock, flags); |
498 | return; | 793 | return; |
499 | } | 794 | } |
500 | tt = d->targets; | ||
501 | te = tt + NTARGETS; | ||
502 | for (; tt < te && *tt; tt++) { | ||
503 | t = *tt; | ||
504 | f = t->frames; | ||
505 | e = f + t->nframes; | ||
506 | for (; f < e; f++) { | ||
507 | if (f->tag == FREETAG | ||
508 | || tsince(f->tag) < timeout) | ||
509 | continue; | ||
510 | n = f->waited += timeout; | ||
511 | n /= HZ; | ||
512 | if (n > aoe_deadsecs) { | ||
513 | /* waited too long. device failure. */ | ||
514 | aoedev_downdev(d); | ||
515 | break; | ||
516 | } | ||
517 | 795 | ||
518 | if (n > HELPWAIT /* see if another target can help */ | 796 | /* collect all frames to rexmit into flist */ |
519 | && (tt != d->targets || d->targets[1])) | 797 | for (i = 0; i < NFACTIVE; i++) { |
520 | d->htgt = tt; | 798 | head = &d->factive[i]; |
799 | list_for_each_safe(pos, nx, head) { | ||
800 | f = list_entry(pos, struct frame, head); | ||
801 | if (tsince_hr(f) < timeout) | ||
802 | break; /* end of expired frames */ | ||
803 | /* move to flist for later processing */ | ||
804 | list_move_tail(pos, &flist); | ||
805 | } | ||
806 | } | ||
521 | 807 | ||
522 | if (t->nout == t->maxout) { | 808 | /* process expired frames */ |
523 | if (t->maxout > 1) | 809 | while (!list_empty(&flist)) { |
524 | t->maxout--; | 810 | pos = flist.next; |
525 | t->lastwadj = jiffies; | 811 | f = list_entry(pos, struct frame, head); |
526 | } | 812 | since = tsince_hr(f); |
813 | n = f->waited_total + since; | ||
814 | n /= USEC_PER_SEC; | ||
815 | if (aoe_deadsecs | ||
816 | && n > aoe_deadsecs | ||
817 | && !(f->flags & FFL_PROBE)) { | ||
818 | /* Waited too long. Device failure. | ||
819 | * Hang all frames on first hash bucket for downdev | ||
820 | * to clean up. | ||
821 | */ | ||
822 | list_splice(&flist, &d->factive[0]); | ||
823 | aoedev_downdev(d); | ||
824 | goto out; | ||
825 | } | ||
826 | |||
827 | t = f->t; | ||
828 | n = f->waited + since; | ||
829 | n /= USEC_PER_SEC; | ||
830 | if (aoe_deadsecs && utgts > 0 | ||
831 | && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS)) | ||
832 | scorn(t); /* avoid this target */ | ||
833 | |||
834 | if (t->maxout != 1) { | ||
835 | t->ssthresh = t->maxout / 2; | ||
836 | t->maxout = 1; | ||
837 | } | ||
527 | 838 | ||
839 | if (f->flags & FFL_PROBE) { | ||
840 | t->nout_probes--; | ||
841 | } else { | ||
528 | ifp = getif(t, f->skb->dev); | 842 | ifp = getif(t, f->skb->dev); |
529 | if (ifp && ++ifp->lost > (t->nframes << 1) | 843 | if (ifp && ++ifp->lost > (t->nframes << 1) |
530 | && (ifp != t->ifs || t->ifs[1].nd)) { | 844 | && (ifp != t->ifs || t->ifs[1].nd)) { |
531 | ejectif(t, ifp); | 845 | ejectif(t, ifp); |
532 | ifp = NULL; | 846 | ifp = NULL; |
533 | } | 847 | } |
534 | |||
535 | if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512 | ||
536 | && ifp && ++ifp->lostjumbo > (t->nframes << 1) | ||
537 | && ifp->maxbcnt != DEFAULTBCNT) { | ||
538 | printk(KERN_INFO | ||
539 | "aoe: e%ld.%d: " | ||
540 | "too many lost jumbo on " | ||
541 | "%s:%pm - " | ||
542 | "falling back to %d frames.\n", | ||
543 | d->aoemajor, d->aoeminor, | ||
544 | ifp->nd->name, t->addr, | ||
545 | DEFAULTBCNT); | ||
546 | ifp->maxbcnt = 0; | ||
547 | } | ||
548 | resend(d, t, f); | ||
549 | } | ||
550 | |||
551 | /* window check */ | ||
552 | if (t->nout == t->maxout | ||
553 | && t->maxout < t->nframes | ||
554 | && (jiffies - t->lastwadj)/HZ > 10) { | ||
555 | t->maxout++; | ||
556 | t->lastwadj = jiffies; | ||
557 | } | 848 | } |
849 | list_move_tail(pos, &d->rexmitq); | ||
850 | t->nout--; | ||
558 | } | 851 | } |
852 | rexmit_deferred(d); | ||
559 | 853 | ||
560 | if (!skb_queue_empty(&d->sendq)) { | 854 | out: |
561 | n = d->rttavg <<= 1; | 855 | if ((d->flags & DEVFL_KICKME) && d->blkq) { |
562 | if (n > MAXTIMER) | ||
563 | d->rttavg = MAXTIMER; | ||
564 | } | ||
565 | |||
566 | if (d->flags & DEVFL_KICKME || d->htgt) { | ||
567 | d->flags &= ~DEVFL_KICKME; | 856 | d->flags &= ~DEVFL_KICKME; |
568 | aoecmd_work(d); | 857 | d->blkq->request_fn(d->blkq); |
569 | } | 858 | } |
570 | 859 | ||
571 | __skb_queue_head_init(&queue); | ||
572 | skb_queue_splice_init(&d->sendq, &queue); | ||
573 | |||
574 | d->timer.expires = jiffies + TIMERTICK; | 860 | d->timer.expires = jiffies + TIMERTICK; |
575 | add_timer(&d->timer); | 861 | add_timer(&d->timer); |
576 | 862 | ||
577 | spin_unlock_irqrestore(&d->lock, flags); | 863 | spin_unlock_irqrestore(&d->lock, flags); |
864 | } | ||
578 | 865 | ||
579 | aoenet_xmit(&queue); | 866 | static unsigned long |
867 | rqbiocnt(struct request *r) | ||
868 | { | ||
869 | struct bio *bio; | ||
870 | unsigned long n = 0; | ||
871 | |||
872 | __rq_for_each_bio(bio, r) | ||
873 | n++; | ||
874 | return n; | ||
875 | } | ||
876 | |||
877 | /* This can be removed if we are certain that no users of the block | ||
878 | * layer will ever use zero-count pages in bios. Otherwise we have to | ||
879 | * protect against the put_page sometimes done by the network layer. | ||
880 | * | ||
881 | * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for | ||
882 | * discussion. | ||
883 | * | ||
884 | * We cannot use get_page in the workaround, because it insists on a | ||
885 | * positive page count as a precondition. So we use _count directly. | ||
886 | */ | ||
887 | static void | ||
888 | bio_pageinc(struct bio *bio) | ||
889 | { | ||
890 | struct bio_vec *bv; | ||
891 | struct page *page; | ||
892 | int i; | ||
893 | |||
894 | bio_for_each_segment(bv, bio, i) { | ||
895 | page = bv->bv_page; | ||
896 | /* Non-zero page count for non-head members of | ||
897 | * compound pages is no longer allowed by the kernel, | ||
898 | * but this has never been seen here. | ||
899 | */ | ||
900 | if (unlikely(PageCompound(page))) | ||
901 | if (compound_trans_head(page) != page) { | ||
902 | pr_crit("page tail used for block I/O\n"); | ||
903 | BUG(); | ||
904 | } | ||
905 | atomic_inc(&page->_count); | ||
906 | } | ||
907 | } | ||
908 | |||
909 | static void | ||
910 | bio_pagedec(struct bio *bio) | ||
911 | { | ||
912 | struct bio_vec *bv; | ||
913 | int i; | ||
914 | |||
915 | bio_for_each_segment(bv, bio, i) | ||
916 | atomic_dec(&bv->bv_page->_count); | ||
917 | } | ||
918 | |||
919 | static void | ||
920 | bufinit(struct buf *buf, struct request *rq, struct bio *bio) | ||
921 | { | ||
922 | struct bio_vec *bv; | ||
923 | |||
924 | memset(buf, 0, sizeof(*buf)); | ||
925 | buf->rq = rq; | ||
926 | buf->bio = bio; | ||
927 | buf->resid = bio->bi_size; | ||
928 | buf->sector = bio->bi_sector; | ||
929 | bio_pageinc(bio); | ||
930 | buf->bv = bv = &bio->bi_io_vec[bio->bi_idx]; | ||
931 | buf->bv_resid = bv->bv_len; | ||
932 | WARN_ON(buf->bv_resid == 0); | ||
933 | } | ||
934 | |||
935 | static struct buf * | ||
936 | nextbuf(struct aoedev *d) | ||
937 | { | ||
938 | struct request *rq; | ||
939 | struct request_queue *q; | ||
940 | struct buf *buf; | ||
941 | struct bio *bio; | ||
942 | |||
943 | q = d->blkq; | ||
944 | if (q == NULL) | ||
945 | return NULL; /* initializing */ | ||
946 | if (d->ip.buf) | ||
947 | return d->ip.buf; | ||
948 | rq = d->ip.rq; | ||
949 | if (rq == NULL) { | ||
950 | rq = blk_peek_request(q); | ||
951 | if (rq == NULL) | ||
952 | return NULL; | ||
953 | blk_start_request(rq); | ||
954 | d->ip.rq = rq; | ||
955 | d->ip.nxbio = rq->bio; | ||
956 | rq->special = (void *) rqbiocnt(rq); | ||
957 | } | ||
958 | buf = mempool_alloc(d->bufpool, GFP_ATOMIC); | ||
959 | if (buf == NULL) { | ||
960 | pr_err("aoe: nextbuf: unable to mempool_alloc!\n"); | ||
961 | return NULL; | ||
962 | } | ||
963 | bio = d->ip.nxbio; | ||
964 | bufinit(buf, rq, bio); | ||
965 | bio = bio->bi_next; | ||
966 | d->ip.nxbio = bio; | ||
967 | if (bio == NULL) | ||
968 | d->ip.rq = NULL; | ||
969 | return d->ip.buf = buf; | ||
580 | } | 970 | } |
581 | 971 | ||
582 | /* enters with d->lock held */ | 972 | /* enters with d->lock held */ |
583 | void | 973 | void |
584 | aoecmd_work(struct aoedev *d) | 974 | aoecmd_work(struct aoedev *d) |
585 | { | 975 | { |
586 | struct buf *buf; | 976 | rexmit_deferred(d); |
587 | loop: | 977 | while (aoecmd_ata_rw(d)) |
588 | if (d->htgt && !sthtith(d)) | 978 | ; |
589 | return; | ||
590 | if (d->inprocess == NULL) { | ||
591 | if (list_empty(&d->bufq)) | ||
592 | return; | ||
593 | buf = container_of(d->bufq.next, struct buf, bufs); | ||
594 | list_del(d->bufq.next); | ||
595 | d->inprocess = buf; | ||
596 | } | ||
597 | if (aoecmd_ata_rw(d)) | ||
598 | goto loop; | ||
599 | } | 979 | } |
600 | 980 | ||
601 | /* this function performs work that has been deferred until sleeping is OK | 981 | /* this function performs work that has been deferred until sleeping is OK |
@@ -604,28 +984,36 @@ void | |||
604 | aoecmd_sleepwork(struct work_struct *work) | 984 | aoecmd_sleepwork(struct work_struct *work) |
605 | { | 985 | { |
606 | struct aoedev *d = container_of(work, struct aoedev, work); | 986 | struct aoedev *d = container_of(work, struct aoedev, work); |
987 | struct block_device *bd; | ||
988 | u64 ssize; | ||
607 | 989 | ||
608 | if (d->flags & DEVFL_GDALLOC) | 990 | if (d->flags & DEVFL_GDALLOC) |
609 | aoeblk_gdalloc(d); | 991 | aoeblk_gdalloc(d); |
610 | 992 | ||
611 | if (d->flags & DEVFL_NEWSIZE) { | 993 | if (d->flags & DEVFL_NEWSIZE) { |
612 | struct block_device *bd; | ||
613 | unsigned long flags; | ||
614 | u64 ssize; | ||
615 | |||
616 | ssize = get_capacity(d->gd); | 994 | ssize = get_capacity(d->gd); |
617 | bd = bdget_disk(d->gd, 0); | 995 | bd = bdget_disk(d->gd, 0); |
618 | |||
619 | if (bd) { | 996 | if (bd) { |
620 | mutex_lock(&bd->bd_inode->i_mutex); | 997 | mutex_lock(&bd->bd_inode->i_mutex); |
621 | i_size_write(bd->bd_inode, (loff_t)ssize<<9); | 998 | i_size_write(bd->bd_inode, (loff_t)ssize<<9); |
622 | mutex_unlock(&bd->bd_inode->i_mutex); | 999 | mutex_unlock(&bd->bd_inode->i_mutex); |
623 | bdput(bd); | 1000 | bdput(bd); |
624 | } | 1001 | } |
625 | spin_lock_irqsave(&d->lock, flags); | 1002 | spin_lock_irq(&d->lock); |
626 | d->flags |= DEVFL_UP; | 1003 | d->flags |= DEVFL_UP; |
627 | d->flags &= ~DEVFL_NEWSIZE; | 1004 | d->flags &= ~DEVFL_NEWSIZE; |
628 | spin_unlock_irqrestore(&d->lock, flags); | 1005 | spin_unlock_irq(&d->lock); |
1006 | } | ||
1007 | } | ||
1008 | |||
1009 | static void | ||
1010 | ata_ident_fixstring(u16 *id, int ns) | ||
1011 | { | ||
1012 | u16 s; | ||
1013 | |||
1014 | while (ns-- > 0) { | ||
1015 | s = *id; | ||
1016 | *id++ = s >> 8 | s << 8; | ||
629 | } | 1017 | } |
630 | } | 1018 | } |
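ata_ident_fixstring() byte-swaps each 16-bit word because ATA IDENTIFY strings carry two characters per word with the first character in the high byte, so the raw buffer reads pair-swapped on a little-endian host. A standalone illustration; the "Coraid" model text is hypothetical and the raw-byte view assumes a little-endian machine:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Same transform as ata_ident_fixstring(): swap the two bytes of each word. */
static void fixstring(uint16_t *id, int nwords)
{
	while (nwords-- > 0) {
		uint16_t s = *id;
		*id++ = (uint16_t) (s >> 8 | s << 8);
	}
}

int main(void)
{
	/* Hypothetical IDENTIFY model field: two ASCII characters per word,
	 * first character in the high byte, as the ATA spec lays it out. */
	uint16_t id[3] = {
		('C' << 8) | 'o',
		('r' << 8) | 'a',
		('i' << 8) | 'd',
	};
	char buf[7] = { 0 };

	memcpy(buf, id, 6);
	printf("raw little-endian byte view: %s\n", buf);	/* "oCardi" */

	fixstring(id, 3);
	memcpy(buf, id, 6);
	printf("after the word byte-swap:    %s\n", buf);	/* "Coraid" */
	return 0;
}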
631 | 1019 | ||
@@ -664,6 +1052,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) | |||
664 | d->geo.sectors = get_unaligned_le16(&id[56 << 1]); | 1052 | d->geo.sectors = get_unaligned_le16(&id[56 << 1]); |
665 | } | 1053 | } |
666 | 1054 | ||
1055 | ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */ | ||
1056 | ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */ | ||
1057 | ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */ | ||
1058 | memcpy(d->ident, id, sizeof(d->ident)); | ||
1059 | |||
667 | if (d->ssize != ssize) | 1060 | if (d->ssize != ssize) |
668 | printk(KERN_INFO | 1061 | printk(KERN_INFO |
669 | "aoe: %pm e%ld.%d v%04x has %llu sectors\n", | 1062 | "aoe: %pm e%ld.%d v%04x has %llu sectors\n", |
@@ -683,26 +1076,28 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) | |||
683 | } | 1076 | } |
684 | 1077 | ||
685 | static void | 1078 | static void |
686 | calc_rttavg(struct aoedev *d, int rtt) | 1079 | calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt) |
687 | { | 1080 | { |
688 | register long n; | 1081 | register long n; |
689 | 1082 | ||
690 | n = rtt; | 1083 | n = rtt; |
691 | if (n < 0) { | 1084 | |
692 | n = -rtt; | 1085 | /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */ |
693 | if (n < MINTIMER) | 1086 | n -= d->rttavg >> RTTSCALE; |
694 | n = MINTIMER; | 1087 | d->rttavg += n; |
695 | else if (n > MAXTIMER) | 1088 | if (n < 0) |
696 | n = MAXTIMER; | 1089 | n = -n; |
697 | d->mintimer += (n - d->mintimer) >> 1; | 1090 | n -= d->rttdev >> RTTDSCALE; |
698 | } else if (n < d->mintimer) | 1091 | d->rttdev += n; |
699 | n = d->mintimer; | 1092 | |
700 | else if (n > MAXTIMER) | 1093 | if (!t || t->maxout >= t->nframes) |
701 | n = MAXTIMER; | 1094 | return; |
702 | 1095 | if (t->maxout < t->ssthresh) | |
703 | /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */ | 1096 | t->maxout += 1; |
704 | n -= d->rttavg; | 1097 | else if (t->nout == t->maxout && t->next_cwnd-- == 0) { |
705 | d->rttavg += n >> 2; | 1098 | t->maxout += 1; |
1099 | t->next_cwnd = t->maxout; | ||
1100 | } | ||
706 | } | 1101 | } |
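calc_rttavg() and rto() together form the familiar Jacobson/Karels estimator: an exponentially weighted average of the round-trip samples, a matching average of the absolute deviation, and a timeout derived from both, all stored left-shifted so integer arithmetic keeps fractional precision. A standalone sketch of the mechanics; RTTSCALE, RTTDSCALE, and the RTTAVG_INIT/RTTDEV_INIT starting points live in aoe.h and are not part of this hunk, so the constants below are assumptions chosen only for illustration:

#include <stdio.h>

#define RTTSCALE  8		/* assumed scale factors; real values in aoe.h */
#define RTTDSCALE 3

struct dev { long rttavg, rttdev; };	/* both kept left-shifted */

/* Same update rule as calc_rttavg(): fold the new sample into the scaled
 * average, then fold its absolute error into the scaled deviation. */
static void calc_rttavg(struct dev *d, long rtt_us)
{
	long n = rtt_us;

	n -= d->rttavg >> RTTSCALE;
	d->rttavg += n;
	if (n < 0)
		n = -n;
	n -= d->rttdev >> RTTDSCALE;
	d->rttdev += n;
}

/* Same formula as rto(): twice the smoothed RTT plus eight times the
 * smoothed deviation, never reported as zero. */
static long rto(struct dev *d)
{
	long t = 2 * d->rttavg >> RTTSCALE;

	t += 8 * d->rttdev >> RTTDSCALE;
	return t ? t : 1;
}

int main(void)
{
	/* Assumed conservative starting point: a 250 ms average and a 50 ms
	 * deviation, standing in for RTTAVG_INIT/RTTDEV_INIT. */
	struct dev d = { 250000L << RTTSCALE, 50000L << RTTDSCALE };
	long samples[] = { 1200, 900, 1500, 1100 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		calc_rttavg(&d, samples[i]);
		printf("sample=%6ld us  avg=%6ld us  dev=%6ld us  rto=%7ld us\n",
		       samples[i], d.rttavg >> RTTSCALE,
		       d.rttdev >> RTTDSCALE, rto(&d));
	}
	/* With small gains (1/256 and 1/8 here), the estimates move only
	 * gradually from the conservative starting point toward the observed
	 * samples, so no single measurement collapses or inflates the timeout. */
	return 0;
}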
707 | 1102 | ||
708 | static struct aoetgt * | 1103 | static struct aoetgt * |
@@ -711,170 +1106,326 @@ gettgt(struct aoedev *d, char *addr) | |||
711 | struct aoetgt **t, **e; | 1106 | struct aoetgt **t, **e; |
712 | 1107 | ||
713 | t = d->targets; | 1108 | t = d->targets; |
714 | e = t + NTARGETS; | 1109 | e = t + d->ntargets; |
715 | for (; t < e && *t; t++) | 1110 | for (; t < e && *t; t++) |
716 | if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0) | 1111 | if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0) |
717 | return *t; | 1112 | return *t; |
718 | return NULL; | 1113 | return NULL; |
719 | } | 1114 | } |
720 | 1115 | ||
721 | static inline void | 1116 | static void |
722 | diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector) | 1117 | bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) |
723 | { | 1118 | { |
724 | unsigned long n_sect = bio->bi_size >> 9; | 1119 | ulong fcnt; |
725 | const int rw = bio_data_dir(bio); | 1120 | char *p; |
726 | struct hd_struct *part; | 1121 | int soff = 0; |
727 | int cpu; | 1122 | loop: |
728 | 1123 | fcnt = bv->bv_len - (off - bv->bv_offset); | |
729 | cpu = part_stat_lock(); | 1124 | if (fcnt > cnt) |
730 | part = disk_map_sector_rcu(disk, sector); | 1125 | fcnt = cnt; |
1126 | p = page_address(bv->bv_page) + off; | ||
1127 | skb_copy_bits(skb, soff, p, fcnt); | ||
1128 | soff += fcnt; | ||
1129 | cnt -= fcnt; | ||
1130 | if (cnt <= 0) | ||
1131 | return; | ||
1132 | bv++; | ||
1133 | off = bv->bv_offset; | ||
1134 | goto loop; | ||
1135 | } | ||
731 | 1136 | ||
732 | part_stat_inc(cpu, part, ios[rw]); | 1137 | void |
733 | part_stat_add(cpu, part, ticks[rw], duration); | 1138 | aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) |
734 | part_stat_add(cpu, part, sectors[rw], n_sect); | 1139 | { |
735 | part_stat_add(cpu, part, io_ticks, duration); | 1140 | struct bio *bio; |
1141 | int bok; | ||
1142 | struct request_queue *q; | ||
1143 | |||
1144 | q = d->blkq; | ||
1145 | if (rq == d->ip.rq) | ||
1146 | d->ip.rq = NULL; | ||
1147 | do { | ||
1148 | bio = rq->bio; | ||
1149 | bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1150 | } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size)); | ||
1151 | |||
1152 | /* cf. http://lkml.org/lkml/2006/10/31/28 */ | ||
1153 | if (!fastfail) | ||
1154 | __blk_run_queue(q); | ||
1155 | } | ||
736 | 1156 | ||
737 | part_stat_unlock(); | 1157 | static void |
1158 | aoe_end_buf(struct aoedev *d, struct buf *buf) | ||
1159 | { | ||
1160 | struct request *rq; | ||
1161 | unsigned long n; | ||
1162 | |||
1163 | if (buf == d->ip.buf) | ||
1164 | d->ip.buf = NULL; | ||
1165 | rq = buf->rq; | ||
1166 | bio_pagedec(buf->bio); | ||
1167 | mempool_free(buf, d->bufpool); | ||
1168 | n = (unsigned long) rq->special; | ||
1169 | rq->special = (void *) --n; | ||
1170 | if (n == 0) | ||
1171 | aoe_end_request(d, rq, 0); | ||
738 | } | 1172 | } |
739 | 1173 | ||
740 | void | 1174 | static void |
741 | aoecmd_ata_rsp(struct sk_buff *skb) | 1175 | ktiocomplete(struct frame *f) |
742 | { | 1176 | { |
743 | struct sk_buff_head queue; | ||
744 | struct aoedev *d; | ||
745 | struct aoe_hdr *hin, *hout; | 1177 | struct aoe_hdr *hin, *hout; |
746 | struct aoe_atahdr *ahin, *ahout; | 1178 | struct aoe_atahdr *ahin, *ahout; |
747 | struct frame *f; | ||
748 | struct buf *buf; | 1179 | struct buf *buf; |
1180 | struct sk_buff *skb; | ||
749 | struct aoetgt *t; | 1181 | struct aoetgt *t; |
750 | struct aoeif *ifp; | 1182 | struct aoeif *ifp; |
751 | register long n; | 1183 | struct aoedev *d; |
752 | ulong flags; | 1184 | long n; |
753 | char ebuf[128]; | 1185 | int untainted; |
754 | u16 aoemajor; | ||
755 | |||
756 | hin = (struct aoe_hdr *) skb_mac_header(skb); | ||
757 | aoemajor = get_unaligned_be16(&hin->major); | ||
758 | d = aoedev_by_aoeaddr(aoemajor, hin->minor); | ||
759 | if (d == NULL) { | ||
760 | snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " | ||
761 | "for unknown device %d.%d\n", | ||
762 | aoemajor, hin->minor); | ||
763 | aoechr_error(ebuf); | ||
764 | return; | ||
765 | } | ||
766 | |||
767 | spin_lock_irqsave(&d->lock, flags); | ||
768 | 1186 | ||
769 | n = get_unaligned_be32(&hin->tag); | 1187 | if (f == NULL) |
770 | t = gettgt(d, hin->src); | ||
771 | if (t == NULL) { | ||
772 | printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", | ||
773 | d->aoemajor, d->aoeminor, hin->src); | ||
774 | spin_unlock_irqrestore(&d->lock, flags); | ||
775 | return; | ||
776 | } | ||
777 | f = getframe(t, n); | ||
778 | if (f == NULL) { | ||
779 | calc_rttavg(d, -tsince(n)); | ||
780 | spin_unlock_irqrestore(&d->lock, flags); | ||
781 | snprintf(ebuf, sizeof ebuf, | ||
782 | "%15s e%d.%d tag=%08x@%08lx\n", | ||
783 | "unexpected rsp", | ||
784 | get_unaligned_be16(&hin->major), | ||
785 | hin->minor, | ||
786 | get_unaligned_be32(&hin->tag), | ||
787 | jiffies); | ||
788 | aoechr_error(ebuf); | ||
789 | return; | 1188 | return; |
790 | } | ||
791 | 1189 | ||
792 | calc_rttavg(d, tsince(f->tag)); | 1190 | t = f->t; |
1191 | d = t->d; | ||
1192 | skb = f->r_skb; | ||
1193 | buf = f->buf; | ||
1194 | if (f->flags & FFL_PROBE) | ||
1195 | goto out; | ||
1196 | if (!skb) /* just fail the buf. */ | ||
1197 | goto noskb; | ||
793 | 1198 | ||
794 | ahin = (struct aoe_atahdr *) (hin+1); | ||
795 | hout = (struct aoe_hdr *) skb_mac_header(f->skb); | 1199 | hout = (struct aoe_hdr *) skb_mac_header(f->skb); |
796 | ahout = (struct aoe_atahdr *) (hout+1); | 1200 | ahout = (struct aoe_atahdr *) (hout+1); |
797 | buf = f->buf; | ||
798 | 1201 | ||
1202 | hin = (struct aoe_hdr *) skb->data; | ||
1203 | skb_pull(skb, sizeof(*hin)); | ||
1204 | ahin = (struct aoe_atahdr *) skb->data; | ||
1205 | skb_pull(skb, sizeof(*ahin)); | ||
799 | if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ | 1206 | if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ |
800 | printk(KERN_ERR | 1207 | pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", |
801 | "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", | ||
802 | ahout->cmdstat, ahin->cmdstat, | 1208 | ahout->cmdstat, ahin->cmdstat, |
803 | d->aoemajor, d->aoeminor); | 1209 | d->aoemajor, d->aoeminor); |
804 | if (buf) | 1210 | noskb: if (buf) |
805 | buf->flags |= BUFFL_FAIL; | 1211 | clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); |
806 | } else { | 1212 | goto out; |
807 | if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */ | 1213 | } |
808 | d->htgt = NULL; | 1214 | |
809 | n = ahout->scnt << 9; | 1215 | n = ahout->scnt << 9; |
810 | switch (ahout->cmdstat) { | 1216 | switch (ahout->cmdstat) { |
811 | case ATA_CMD_PIO_READ: | 1217 | case ATA_CMD_PIO_READ: |
812 | case ATA_CMD_PIO_READ_EXT: | 1218 | case ATA_CMD_PIO_READ_EXT: |
813 | if (skb->len - sizeof *hin - sizeof *ahin < n) { | 1219 | if (skb->len < n) { |
814 | printk(KERN_ERR | 1220 | pr_err("%s e%ld.%d. skb->len=%d need=%ld\n", |
815 | "aoe: %s. skb->len=%d need=%ld\n", | 1221 | "aoe: runt data size in read from", |
816 | "runt data size in read", skb->len, n); | 1222 | (long) d->aoemajor, d->aoeminor, |
817 | /* fail frame f? just returning will rexmit. */ | 1223 | skb->len, n); |
818 | spin_unlock_irqrestore(&d->lock, flags); | 1224 | clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); |
819 | return; | ||
820 | } | ||
821 | memcpy(f->bufaddr, ahin+1, n); | ||
822 | case ATA_CMD_PIO_WRITE: | ||
823 | case ATA_CMD_PIO_WRITE_EXT: | ||
824 | ifp = getif(t, skb->dev); | ||
825 | if (ifp) { | ||
826 | ifp->lost = 0; | ||
827 | if (n > DEFAULTBCNT) | ||
828 | ifp->lostjumbo = 0; | ||
829 | } | ||
830 | if (f->bcnt -= n) { | ||
831 | f->lba += n >> 9; | ||
832 | f->bufaddr += n; | ||
833 | resend(d, t, f); | ||
834 | goto xmit; | ||
835 | } | ||
836 | break; | 1225 | break; |
837 | case ATA_CMD_ID_ATA: | 1226 | } |
838 | if (skb->len - sizeof *hin - sizeof *ahin < 512) { | 1227 | bvcpy(f->bv, f->bv_off, skb, n); |
839 | printk(KERN_INFO | 1228 | case ATA_CMD_PIO_WRITE: |
840 | "aoe: runt data size in ataid. skb->len=%d\n", | 1229 | case ATA_CMD_PIO_WRITE_EXT: |
841 | skb->len); | 1230 | spin_lock_irq(&d->lock); |
842 | spin_unlock_irqrestore(&d->lock, flags); | 1231 | ifp = getif(t, skb->dev); |
843 | return; | 1232 | if (ifp) |
844 | } | 1233 | ifp->lost = 0; |
845 | ataid_complete(d, t, (char *) (ahin+1)); | 1234 | spin_unlock_irq(&d->lock); |
1235 | break; | ||
1236 | case ATA_CMD_ID_ATA: | ||
1237 | if (skb->len < 512) { | ||
1238 | pr_info("%s e%ld.%d. skb->len=%d need=512\n", | ||
1239 | "aoe: runt data size in ataid from", | ||
1240 | (long) d->aoemajor, d->aoeminor, | ||
1241 | skb->len); | ||
1242 | break; | ||
1243 | } | ||
1244 | if (skb_linearize(skb)) | ||
846 | break; | 1245 | break; |
847 | default: | 1246 | spin_lock_irq(&d->lock); |
848 | printk(KERN_INFO | 1247 | ataid_complete(d, t, skb->data); |
849 | "aoe: unrecognized ata command %2.2Xh for %d.%d\n", | 1248 | spin_unlock_irq(&d->lock); |
850 | ahout->cmdstat, | 1249 | break; |
851 | get_unaligned_be16(&hin->major), | 1250 | default: |
852 | hin->minor); | 1251 | pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n", |
1252 | ahout->cmdstat, | ||
1253 | be16_to_cpu(get_unaligned(&hin->major)), | ||
1254 | hin->minor); | ||
1255 | } | ||
1256 | out: | ||
1257 | spin_lock_irq(&d->lock); | ||
1258 | if (t->taint > 0 | ||
1259 | && --t->taint > 0 | ||
1260 | && t->nout_probes == 0) { | ||
1261 | count_targets(d, &untainted); | ||
1262 | if (untainted > 0) { | ||
1263 | probe(t); | ||
1264 | t->nout_probes++; | ||
853 | } | 1265 | } |
854 | } | 1266 | } |
855 | 1267 | ||
856 | if (buf && --buf->nframesout == 0 && buf->resid == 0) { | 1268 | aoe_freetframe(f); |
857 | diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector); | 1269 | |
858 | if (buf->flags & BUFFL_FAIL) | 1270 | if (buf && --buf->nframesout == 0 && buf->resid == 0) |
859 | bio_endio(buf->bio, -EIO); | 1271 | aoe_end_buf(d, buf); |
860 | else { | 1272 | |
861 | bio_flush_dcache_pages(buf->bio); | 1273 | spin_unlock_irq(&d->lock); |
862 | bio_endio(buf->bio, 0); | 1274 | aoedev_put(d); |
1275 | dev_kfree_skb(skb); | ||
1276 | } | ||
1277 | |||
1278 | /* Enters with iocq.lock held. | ||
1279 | * Returns true iff responses needing processing remain. | ||
1280 | */ | ||
1281 | static int | ||
1282 | ktio(void) | ||
1283 | { | ||
1284 | struct frame *f; | ||
1285 | struct list_head *pos; | ||
1286 | int i; | ||
1287 | |||
1288 | for (i = 0; ; ++i) { | ||
1289 | if (i == MAXIOC) | ||
1290 | return 1; | ||
1291 | if (list_empty(&iocq.head)) | ||
1292 | return 0; | ||
1293 | pos = iocq.head.next; | ||
1294 | list_del(pos); | ||
1295 | spin_unlock_irq(&iocq.lock); | ||
1296 | f = list_entry(pos, struct frame, head); | ||
1297 | ktiocomplete(f); | ||
1298 | spin_lock_irq(&iocq.lock); | ||
1299 | } | ||
1300 | } | ||
1301 | |||
1302 | static int | ||
1303 | kthread(void *vp) | ||
1304 | { | ||
1305 | struct ktstate *k; | ||
1306 | DECLARE_WAITQUEUE(wait, current); | ||
1307 | int more; | ||
1308 | |||
1309 | k = vp; | ||
1310 | current->flags |= PF_NOFREEZE; | ||
1311 | set_user_nice(current, -10); | ||
1312 | complete(&k->rendez); /* tell spawner we're running */ | ||
1313 | do { | ||
1314 | spin_lock_irq(k->lock); | ||
1315 | more = k->fn(); | ||
1316 | if (!more) { | ||
1317 | add_wait_queue(k->waitq, &wait); | ||
1318 | __set_current_state(TASK_INTERRUPTIBLE); | ||
863 | } | 1319 | } |
864 | mempool_free(buf, d->bufpool); | 1320 | spin_unlock_irq(k->lock); |
1321 | if (!more) { | ||
1322 | schedule(); | ||
1323 | remove_wait_queue(k->waitq, &wait); | ||
1324 | } else | ||
1325 | cond_resched(); | ||
1326 | } while (!kthread_should_stop()); | ||
1327 | complete(&k->rendez); /* tell spawner we're stopping */ | ||
1328 | return 0; | ||
1329 | } | ||
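kthread() above is the stock worker-thread shape: run k->fn() under k->lock until it reports no more work, then queue itself on the wait queue before dropping the lock so a wake-up from ktcomplete() cannot be lost. The same loop in a userspace pthreads analogue (an assumed, simplified model rather than the kernel API; pthread_cond_wait releasing the mutex atomically stands in for the add_wait_queue-before-unlock ordering):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;	/* the "waitqueue" */
static int pending;					/* queued "responses" */
static bool stopping;

/* Plays the role of ktio(): drain a bounded batch under the lock and
 * report whether anything remains (a MAXIOC-style budget of 8). */
static int fn(void)
{
	int budget = 8;

	while (pending > 0 && budget-- > 0)
		pending--;
	return pending > 0;
}

/* Plays the role of kthread(): loop on fn(); sleep only when it reports
 * no more work, holding the lock right up to the (atomic) wait. */
static void *worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	while (!stopping)
		if (!fn())
			pthread_cond_wait(&wake, &lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t task;

	pthread_create(&task, NULL, worker, NULL);

	/* Like ktcomplete(): queue work under the lock, then wake the worker. */
	pthread_mutex_lock(&lock);
	pending += 20;
	pthread_cond_signal(&wake);
	while (pending > 0) {		/* crude wait for the queue to drain */
		pthread_mutex_unlock(&lock);
		sched_yield();
		pthread_mutex_lock(&lock);
	}
	stopping = true;		/* like aoe_ktstop() */
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	pthread_join(task, NULL);

	printf("queue drained; worker stopped\n");
	return 0;
}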
1330 | |||
1331 | void | ||
1332 | aoe_ktstop(struct ktstate *k) | ||
1333 | { | ||
1334 | kthread_stop(k->task); | ||
1335 | wait_for_completion(&k->rendez); | ||
1336 | } | ||
1337 | |||
1338 | int | ||
1339 | aoe_ktstart(struct ktstate *k) | ||
1340 | { | ||
1341 | struct task_struct *task; | ||
1342 | |||
1343 | init_completion(&k->rendez); | ||
1344 | task = kthread_run(kthread, k, k->name); | ||
1345 | if (task == NULL || IS_ERR(task)) | ||
1346 | return -ENOMEM; | ||
1347 | k->task = task; | ||
1348 | wait_for_completion(&k->rendez); /* allow kthread to start */ | ||
1349 | init_completion(&k->rendez); /* for waiting for exit later */ | ||
1350 | return 0; | ||
1351 | } | ||
1352 | |||
1353 | /* pass it off to kthreads for processing */ | ||
1354 | static void | ||
1355 | ktcomplete(struct frame *f, struct sk_buff *skb) | ||
1356 | { | ||
1357 | ulong flags; | ||
1358 | |||
1359 | f->r_skb = skb; | ||
1360 | spin_lock_irqsave(&iocq.lock, flags); | ||
1361 | list_add_tail(&f->head, &iocq.head); | ||
1362 | spin_unlock_irqrestore(&iocq.lock, flags); | ||
1363 | wake_up(&ktiowq); | ||
1364 | } | ||
1365 | |||
1366 | struct sk_buff * | ||
1367 | aoecmd_ata_rsp(struct sk_buff *skb) | ||
1368 | { | ||
1369 | struct aoedev *d; | ||
1370 | struct aoe_hdr *h; | ||
1371 | struct frame *f; | ||
1372 | u32 n; | ||
1373 | ulong flags; | ||
1374 | char ebuf[128]; | ||
1375 | u16 aoemajor; | ||
1376 | |||
1377 | h = (struct aoe_hdr *) skb->data; | ||
1378 | aoemajor = be16_to_cpu(get_unaligned(&h->major)); | ||
1379 | d = aoedev_by_aoeaddr(aoemajor, h->minor, 0); | ||
1380 | if (d == NULL) { | ||
1381 | snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " | ||
1382 | "for unknown device %d.%d\n", | ||
1383 | aoemajor, h->minor); | ||
1384 | aoechr_error(ebuf); | ||
1385 | return skb; | ||
865 | } | 1386 | } |
866 | 1387 | ||
867 | f->buf = NULL; | 1388 | spin_lock_irqsave(&d->lock, flags); |
868 | f->tag = FREETAG; | ||
869 | t->nout--; | ||
870 | 1389 | ||
1390 | n = be32_to_cpu(get_unaligned(&h->tag)); | ||
1391 | f = getframe(d, n); | ||
1392 | if (f) { | ||
1393 | calc_rttavg(d, f->t, tsince_hr(f)); | ||
1394 | f->t->nout--; | ||
1395 | if (f->flags & FFL_PROBE) | ||
1396 | f->t->nout_probes--; | ||
1397 | } else { | ||
1398 | f = getframe_deferred(d, n); | ||
1399 | if (f) { | ||
1400 | calc_rttavg(d, NULL, tsince_hr(f)); | ||
1401 | } else { | ||
1402 | calc_rttavg(d, NULL, tsince(n)); | ||
1403 | spin_unlock_irqrestore(&d->lock, flags); | ||
1404 | aoedev_put(d); | ||
1405 | snprintf(ebuf, sizeof(ebuf), | ||
1406 | "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n", | ||
1407 | "unexpected rsp", | ||
1408 | get_unaligned_be16(&h->major), | ||
1409 | h->minor, | ||
1410 | get_unaligned_be32(&h->tag), | ||
1411 | jiffies, | ||
1412 | h->src, | ||
1413 | h->dst); | ||
1414 | aoechr_error(ebuf); | ||
1415 | return skb; | ||
1416 | } | ||
1417 | } | ||
871 | aoecmd_work(d); | 1418 | aoecmd_work(d); |
872 | xmit: | ||
873 | __skb_queue_head_init(&queue); | ||
874 | skb_queue_splice_init(&d->sendq, &queue); | ||
875 | 1419 | ||
876 | spin_unlock_irqrestore(&d->lock, flags); | 1420 | spin_unlock_irqrestore(&d->lock, flags); |
877 | aoenet_xmit(&queue); | 1421 | |
1422 | ktcomplete(f, skb); | ||
1423 | |||
1424 | /* | ||
1425 | * Note here that we do not perform an aoedev_put, as we are | ||
1426 | * leaving this reference for the ktio to release. | ||
1427 | */ | ||
1428 | return NULL; | ||
878 | } | 1429 | } |
879 | 1430 | ||
880 | void | 1431 | void |
@@ -886,7 +1437,7 @@ aoecmd_cfg(ushort aoemajor, unsigned char aoeminor) | |||
886 | aoecmd_cfg_pkts(aoemajor, aoeminor, &queue); | 1437 | aoecmd_cfg_pkts(aoemajor, aoeminor, &queue); |
887 | aoenet_xmit(&queue); | 1438 | aoenet_xmit(&queue); |
888 | } | 1439 | } |
889 | 1440 | ||
890 | struct sk_buff * | 1441 | struct sk_buff * |
891 | aoecmd_ata_id(struct aoedev *d) | 1442 | aoecmd_ata_id(struct aoedev *d) |
892 | { | 1443 | { |
@@ -896,7 +1447,7 @@ aoecmd_ata_id(struct aoedev *d) | |||
896 | struct sk_buff *skb; | 1447 | struct sk_buff *skb; |
897 | struct aoetgt *t; | 1448 | struct aoetgt *t; |
898 | 1449 | ||
899 | f = freeframe(d); | 1450 | f = newframe(d); |
900 | if (f == NULL) | 1451 | if (f == NULL) |
901 | return NULL; | 1452 | return NULL; |
902 | 1453 | ||
@@ -909,8 +1460,10 @@ aoecmd_ata_id(struct aoedev *d) | |||
909 | skb_put(skb, sizeof *h + sizeof *ah); | 1460 | skb_put(skb, sizeof *h + sizeof *ah); |
910 | memset(h, 0, skb->len); | 1461 | memset(h, 0, skb->len); |
911 | f->tag = aoehdr_atainit(d, t, h); | 1462 | f->tag = aoehdr_atainit(d, t, h); |
1463 | fhash(f); | ||
912 | t->nout++; | 1464 | t->nout++; |
913 | f->waited = 0; | 1465 | f->waited = 0; |
1466 | f->waited_total = 0; | ||
914 | 1467 | ||
915 | /* set up ata header */ | 1468 | /* set up ata header */ |
916 | ah->scnt = 1; | 1469 | ah->scnt = 1; |
@@ -919,46 +1472,120 @@ aoecmd_ata_id(struct aoedev *d) | |||
919 | 1472 | ||
920 | skb->dev = t->ifp->nd; | 1473 | skb->dev = t->ifp->nd; |
921 | 1474 | ||
922 | d->rttavg = MAXTIMER; | 1475 | d->rttavg = RTTAVG_INIT; |
1476 | d->rttdev = RTTDEV_INIT; | ||
923 | d->timer.function = rexmit_timer; | 1477 | d->timer.function = rexmit_timer; |
924 | 1478 | ||
925 | return skb_clone(skb, GFP_ATOMIC); | 1479 | skb = skb_clone(skb, GFP_ATOMIC); |
1480 | if (skb) { | ||
1481 | do_gettimeofday(&f->sent); | ||
1482 | f->sent_jiffs = (u32) jiffies; | ||
1483 | } | ||
1484 | |||
1485 | return skb; | ||
1486 | } | ||
1487 | |||
1488 | static struct aoetgt ** | ||
1489 | grow_targets(struct aoedev *d) | ||
1490 | { | ||
1491 | ulong oldn, newn; | ||
1492 | struct aoetgt **tt; | ||
1493 | |||
1494 | oldn = d->ntargets; | ||
1495 | newn = oldn * 2; | ||
1496 | tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC); | ||
1497 | if (!tt) | ||
1498 | return NULL; | ||
1499 | memmove(tt, d->targets, sizeof(*d->targets) * oldn); | ||
1500 | d->tgt = tt + (d->tgt - d->targets); | ||
1501 | kfree(d->targets); | ||
1502 | d->targets = tt; | ||
1503 | d->ntargets = newn; | ||
1504 | |||
1505 | return &d->targets[oldn]; | ||
926 | } | 1506 | } |
927 | 1507 | ||
928 | static struct aoetgt * | 1508 | static struct aoetgt * |
929 | addtgt(struct aoedev *d, char *addr, ulong nframes) | 1509 | addtgt(struct aoedev *d, char *addr, ulong nframes) |
930 | { | 1510 | { |
931 | struct aoetgt *t, **tt, **te; | 1511 | struct aoetgt *t, **tt, **te; |
932 | struct frame *f, *e; | ||
933 | 1512 | ||
934 | tt = d->targets; | 1513 | tt = d->targets; |
935 | te = tt + NTARGETS; | 1514 | te = tt + d->ntargets; |
936 | for (; tt < te && *tt; tt++) | 1515 | for (; tt < te && *tt; tt++) |
937 | ; | 1516 | ; |
938 | 1517 | ||
939 | if (tt == te) { | 1518 | if (tt == te) { |
940 | printk(KERN_INFO | 1519 | tt = grow_targets(d); |
941 | "aoe: device addtgt failure; too many targets\n"); | 1520 | if (!tt) |
942 | return NULL; | 1521 | goto nomem; |
943 | } | ||
944 | t = kcalloc(1, sizeof *t, GFP_ATOMIC); | ||
945 | f = kcalloc(nframes, sizeof *f, GFP_ATOMIC); | ||
946 | if (!t || !f) { | ||
947 | kfree(f); | ||
948 | kfree(t); | ||
949 | printk(KERN_INFO "aoe: cannot allocate memory to add target\n"); | ||
950 | return NULL; | ||
951 | } | 1522 | } |
952 | 1523 | t = kzalloc(sizeof(*t), GFP_ATOMIC); | |
1524 | if (!t) | ||
1525 | goto nomem; | ||
953 | t->nframes = nframes; | 1526 | t->nframes = nframes; |
954 | t->frames = f; | 1527 | t->d = d; |
955 | e = f + nframes; | ||
956 | for (; f < e; f++) | ||
957 | f->tag = FREETAG; | ||
958 | memcpy(t->addr, addr, sizeof t->addr); | 1528 | memcpy(t->addr, addr, sizeof t->addr); |
959 | t->ifp = t->ifs; | 1529 | t->ifp = t->ifs; |
960 | t->maxout = t->nframes; | 1530 | aoecmd_wreset(t); |
1531 | t->maxout = t->nframes / 2; | ||
1532 | INIT_LIST_HEAD(&t->ffree); | ||
961 | return *tt = t; | 1533 | return *tt = t; |
1534 | |||
1535 | nomem: | ||
1536 | pr_info("aoe: cannot allocate memory to add target\n"); | ||
1537 | return NULL; | ||
1538 | } | ||
1539 | |||
1540 | static void | ||
1541 | setdbcnt(struct aoedev *d) | ||
1542 | { | ||
1543 | struct aoetgt **t, **e; | ||
1544 | int bcnt = 0; | ||
1545 | |||
1546 | t = d->targets; | ||
1547 | e = t + d->ntargets; | ||
1548 | for (; t < e && *t; t++) | ||
1549 | if (bcnt == 0 || bcnt > (*t)->minbcnt) | ||
1550 | bcnt = (*t)->minbcnt; | ||
1551 | if (bcnt != d->maxbcnt) { | ||
1552 | d->maxbcnt = bcnt; | ||
1553 | pr_info("aoe: e%ld.%d: setting %d byte data frames\n", | ||
1554 | d->aoemajor, d->aoeminor, bcnt); | ||
1555 | } | ||
1556 | } | ||
1557 | |||
1558 | static void | ||
1559 | setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt) | ||
1560 | { | ||
1561 | struct aoedev *d; | ||
1562 | struct aoeif *p, *e; | ||
1563 | int minbcnt; | ||
1564 | |||
1565 | d = t->d; | ||
1566 | minbcnt = bcnt; | ||
1567 | p = t->ifs; | ||
1568 | e = p + NAOEIFS; | ||
1569 | for (; p < e; p++) { | ||
1570 | if (p->nd == NULL) | ||
1571 | break; /* end of the valid interfaces */ | ||
1572 | if (p->nd == nd) { | ||
1573 | p->bcnt = bcnt; /* we're updating */ | ||
1574 | nd = NULL; | ||
1575 | } else if (minbcnt > p->bcnt) | ||
1576 | minbcnt = p->bcnt; /* find the min interface */ | ||
1577 | } | ||
1578 | if (nd) { | ||
1579 | if (p == e) { | ||
1580 | pr_err("aoe: device setifbcnt failure; too many interfaces.\n"); | ||
1581 | return; | ||
1582 | } | ||
1583 | dev_hold(nd); | ||
1584 | p->nd = nd; | ||
1585 | p->bcnt = bcnt; | ||
1586 | } | ||
1587 | t->minbcnt = minbcnt; | ||
1588 | setdbcnt(d); | ||
962 | } | 1589 | } |
963 | 1590 | ||
964 | void | 1591 | void |
@@ -968,11 +1595,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb) | |||
968 | struct aoe_hdr *h; | 1595 | struct aoe_hdr *h; |
969 | struct aoe_cfghdr *ch; | 1596 | struct aoe_cfghdr *ch; |
970 | struct aoetgt *t; | 1597 | struct aoetgt *t; |
971 | struct aoeif *ifp; | 1598 | ulong flags, aoemajor; |
972 | ulong flags, sysminor, aoemajor; | ||
973 | struct sk_buff *sl; | 1599 | struct sk_buff *sl; |
1600 | struct sk_buff_head queue; | ||
974 | u16 n; | 1601 | u16 n; |
975 | 1602 | ||
1603 | sl = NULL; | ||
976 | h = (struct aoe_hdr *) skb_mac_header(skb); | 1604 | h = (struct aoe_hdr *) skb_mac_header(skb); |
977 | ch = (struct aoe_cfghdr *) (h+1); | 1605 | ch = (struct aoe_cfghdr *) (h+1); |
978 | 1606 | ||
@@ -986,10 +1614,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb) | |||
986 | "Check shelf dip switches.\n"); | 1614 | "Check shelf dip switches.\n"); |
987 | return; | 1615 | return; |
988 | } | 1616 | } |
989 | 1617 | if (aoemajor == 0xffff) { | |
990 | sysminor = SYSMINOR(aoemajor, h->minor); | 1618 | pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n", |
991 | if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) { | 1619 | aoemajor, (int) h->minor); |
992 | printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n", | 1620 | return; |
1621 | } | ||
1622 | if (h->minor == 0xff) { | ||
1623 | pr_info("aoe: e%ld.%d: broadcast slot number invalid\n", | ||
993 | aoemajor, (int) h->minor); | 1624 | aoemajor, (int) h->minor); |
994 | return; | 1625 | return; |
995 | } | 1626 | } |
@@ -998,63 +1629,41 @@ aoecmd_cfg_rsp(struct sk_buff *skb) | |||
998 | if (n > aoe_maxout) /* keep it reasonable */ | 1629 | if (n > aoe_maxout) /* keep it reasonable */ |
999 | n = aoe_maxout; | 1630 | n = aoe_maxout; |
1000 | 1631 | ||
1001 | d = aoedev_by_sysminor_m(sysminor); | 1632 | d = aoedev_by_aoeaddr(aoemajor, h->minor, 1); |
1002 | if (d == NULL) { | 1633 | if (d == NULL) { |
1003 | printk(KERN_INFO "aoe: device sysminor_m failure\n"); | 1634 | pr_info("aoe: device allocation failure\n"); |
1004 | return; | 1635 | return; |
1005 | } | 1636 | } |
1006 | 1637 | ||
1007 | spin_lock_irqsave(&d->lock, flags); | 1638 | spin_lock_irqsave(&d->lock, flags); |
1008 | 1639 | ||
1009 | t = gettgt(d, h->src); | 1640 | t = gettgt(d, h->src); |
1010 | if (!t) { | 1641 | if (t) { |
1642 | t->nframes = n; | ||
1643 | if (n < t->maxout) | ||
1644 | aoecmd_wreset(t); | ||
1645 | } else { | ||
1011 | t = addtgt(d, h->src, n); | 1646 | t = addtgt(d, h->src, n); |
1012 | if (!t) { | 1647 | if (!t) |
1013 | spin_unlock_irqrestore(&d->lock, flags); | 1648 | goto bail; |
1014 | return; | ||
1015 | } | ||
1016 | } | ||
1017 | ifp = getif(t, skb->dev); | ||
1018 | if (!ifp) { | ||
1019 | ifp = addif(t, skb->dev); | ||
1020 | if (!ifp) { | ||
1021 | printk(KERN_INFO | ||
1022 | "aoe: device addif failure; " | ||
1023 | "too many interfaces?\n"); | ||
1024 | spin_unlock_irqrestore(&d->lock, flags); | ||
1025 | return; | ||
1026 | } | ||
1027 | } | ||
1028 | if (ifp->maxbcnt) { | ||
1029 | n = ifp->nd->mtu; | ||
1030 | n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr); | ||
1031 | n /= 512; | ||
1032 | if (n > ch->scnt) | ||
1033 | n = ch->scnt; | ||
1034 | n = n ? n * 512 : DEFAULTBCNT; | ||
1035 | if (n != ifp->maxbcnt) { | ||
1036 | printk(KERN_INFO | ||
1037 | "aoe: e%ld.%d: setting %d%s%s:%pm\n", | ||
1038 | d->aoemajor, d->aoeminor, n, | ||
1039 | " byte data frames on ", ifp->nd->name, | ||
1040 | t->addr); | ||
1041 | ifp->maxbcnt = n; | ||
1042 | } | ||
1043 | } | 1649 | } |
1650 | n = skb->dev->mtu; | ||
1651 | n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr); | ||
1652 | n /= 512; | ||
1653 | if (n > ch->scnt) | ||
1654 | n = ch->scnt; | ||
1655 | n = n ? n * 512 : DEFAULTBCNT; | ||
1656 | setifbcnt(t, skb->dev, n); | ||
1044 | 1657 | ||
1045 | /* don't change users' perspective */ | 1658 | /* don't change users' perspective */ |
1046 | if (d->nopen) { | 1659 | if (d->nopen == 0) { |
1047 | spin_unlock_irqrestore(&d->lock, flags); | 1660 | d->fw_ver = be16_to_cpu(ch->fwver); |
1048 | return; | 1661 | sl = aoecmd_ata_id(d); |
1049 | } | 1662 | } |
1050 | d->fw_ver = be16_to_cpu(ch->fwver); | 1663 | bail: |
1051 | |||
1052 | sl = aoecmd_ata_id(d); | ||
1053 | |||
1054 | spin_unlock_irqrestore(&d->lock, flags); | 1664 | spin_unlock_irqrestore(&d->lock, flags); |
1055 | 1665 | aoedev_put(d); | |
1056 | if (sl) { | 1666 | if (sl) { |
1057 | struct sk_buff_head queue; | ||
1058 | __skb_queue_head_init(&queue); | 1667 | __skb_queue_head_init(&queue); |
1059 | __skb_queue_tail(&queue, sl); | 1668 | __skb_queue_tail(&queue, sl); |
1060 | aoenet_xmit(&queue); | 1669 | aoenet_xmit(&queue); |
@@ -1062,23 +1671,97 @@ aoecmd_cfg_rsp(struct sk_buff *skb) | |||
1062 | } | 1671 | } |
1063 | 1672 | ||
1064 | void | 1673 | void |
1674 | aoecmd_wreset(struct aoetgt *t) | ||
1675 | { | ||
1676 | t->maxout = 1; | ||
1677 | t->ssthresh = t->nframes / 2; | ||
1678 | t->next_cwnd = t->nframes; | ||
1679 | } | ||
1680 | |||
1681 | void | ||
1065 | aoecmd_cleanslate(struct aoedev *d) | 1682 | aoecmd_cleanslate(struct aoedev *d) |
1066 | { | 1683 | { |
1067 | struct aoetgt **t, **te; | 1684 | struct aoetgt **t, **te; |
1068 | struct aoeif *p, *e; | ||
1069 | 1685 | ||
1070 | d->mintimer = MINTIMER; | 1686 | d->rttavg = RTTAVG_INIT; |
1687 | d->rttdev = RTTDEV_INIT; | ||
1688 | d->maxbcnt = 0; | ||
1071 | 1689 | ||
1072 | t = d->targets; | 1690 | t = d->targets; |
1073 | te = t + NTARGETS; | 1691 | te = t + d->ntargets; |
1074 | for (; t < te && *t; t++) { | 1692 | for (; t < te && *t; t++) |
1075 | (*t)->maxout = (*t)->nframes; | 1693 | aoecmd_wreset(*t); |
1076 | p = (*t)->ifs; | 1694 | } |
1077 | e = p + NAOEIFS; | 1695 | |
1078 | for (; p < e; p++) { | 1696 | void |
1079 | p->lostjumbo = 0; | 1697 | aoe_failbuf(struct aoedev *d, struct buf *buf) |
1080 | p->lost = 0; | 1698 | { |
1081 | p->maxbcnt = DEFAULTBCNT; | 1699 | if (buf == NULL) |
1700 | return; | ||
1701 | buf->resid = 0; | ||
1702 | clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); | ||
1703 | if (buf->nframesout == 0) | ||
1704 | aoe_end_buf(d, buf); | ||
1705 | } | ||
1706 | |||
1707 | void | ||
1708 | aoe_flush_iocq(void) | ||
1709 | { | ||
1710 | struct frame *f; | ||
1711 | struct aoedev *d; | ||
1712 | LIST_HEAD(flist); | ||
1713 | struct list_head *pos; | ||
1714 | struct sk_buff *skb; | ||
1715 | ulong flags; | ||
1716 | |||
1717 | spin_lock_irqsave(&iocq.lock, flags); | ||
1718 | list_splice_init(&iocq.head, &flist); | ||
1719 | spin_unlock_irqrestore(&iocq.lock, flags); | ||
1720 | while (!list_empty(&flist)) { | ||
1721 | pos = flist.next; | ||
1722 | list_del(pos); | ||
1723 | f = list_entry(pos, struct frame, head); | ||
1724 | d = f->t->d; | ||
1725 | skb = f->r_skb; | ||
1726 | spin_lock_irqsave(&d->lock, flags); | ||
1727 | if (f->buf) { | ||
1728 | f->buf->nframesout--; | ||
1729 | aoe_failbuf(d, f->buf); | ||
1082 | } | 1730 | } |
1731 | aoe_freetframe(f); | ||
1732 | spin_unlock_irqrestore(&d->lock, flags); | ||
1733 | dev_kfree_skb(skb); | ||
1734 | aoedev_put(d); | ||
1083 | } | 1735 | } |
1084 | } | 1736 | } |
1737 | |||
1738 | int __init | ||
1739 | aoecmd_init(void) | ||
1740 | { | ||
1741 | void *p; | ||
1742 | |||
1743 | /* get_zeroed_page returns page with ref count 1 */ | ||
1744 | p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); | ||
1745 | if (!p) | ||
1746 | return -ENOMEM; | ||
1747 | empty_page = virt_to_page(p); | ||
1748 | |||
1749 | INIT_LIST_HEAD(&iocq.head); | ||
1750 | spin_lock_init(&iocq.lock); | ||
1751 | init_waitqueue_head(&ktiowq); | ||
1752 | kts.name = "aoe_ktio"; | ||
1753 | kts.fn = ktio; | ||
1754 | kts.waitq = &ktiowq; | ||
1755 | kts.lock = &iocq.lock; | ||
1756 | return aoe_ktstart(&kts); | ||
1757 | } | ||
1758 | |||
1759 | void | ||
1760 | aoecmd_exit(void) | ||
1761 | { | ||
1762 | aoe_ktstop(&kts); | ||
1763 | aoe_flush_iocq(); | ||
1764 | |||
1765 | free_page((unsigned long) page_address(empty_page)); | ||
1766 | empty_page = NULL; | ||
1767 | } | ||
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 6b5110a47458..98f2965778b9 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoedev.c | 3 | * aoedev.c |
4 | * AoE device utility functions; maintains device list. | 4 | * AoE device utility functions; maintains device list. |
@@ -9,30 +9,139 @@ | |||
9 | #include <linux/netdevice.h> | 9 | #include <linux/netdevice.h> |
10 | #include <linux/delay.h> | 10 | #include <linux/delay.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/bitmap.h> | ||
13 | #include <linux/kdev_t.h> | ||
14 | #include <linux/moduleparam.h> | ||
12 | #include "aoe.h" | 15 | #include "aoe.h" |
13 | 16 | ||
14 | static void dummy_timer(ulong); | 17 | static void dummy_timer(ulong); |
15 | static void aoedev_freedev(struct aoedev *); | ||
16 | static void freetgt(struct aoedev *d, struct aoetgt *t); | 18 | static void freetgt(struct aoedev *d, struct aoetgt *t); |
17 | static void skbpoolfree(struct aoedev *d); | 19 | static void skbpoolfree(struct aoedev *d); |
18 | 20 | ||
21 | static int aoe_dyndevs = 1; | ||
22 | module_param(aoe_dyndevs, int, 0644); | ||
23 | MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); | ||
24 | |||
19 | static struct aoedev *devlist; | 25 | static struct aoedev *devlist; |
20 | static DEFINE_SPINLOCK(devlist_lock); | 26 | static DEFINE_SPINLOCK(devlist_lock); |
21 | 27 | ||
22 | struct aoedev * | 28 | /* Because some systems will have one, many, or no |
23 | aoedev_by_aoeaddr(int maj, int min) | 29 | * - partitions, |
30 | * - slots per shelf, | ||
31 | * - or shelves, | ||
32 | * we need some flexibility in the way the minor numbers | ||
33 | * are allocated. So they are dynamic. | ||
34 | */ | ||
35 | #define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) | ||
36 | |||
37 | static DEFINE_SPINLOCK(used_minors_lock); | ||
38 | static DECLARE_BITMAP(used_minors, N_DEVS); | ||
39 | |||
40 | static int | ||
41 | minor_get_dyn(ulong *sysminor) | ||
24 | { | 42 | { |
25 | struct aoedev *d; | ||
26 | ulong flags; | 43 | ulong flags; |
44 | ulong n; | ||
45 | int error = 0; | ||
46 | |||
47 | spin_lock_irqsave(&used_minors_lock, flags); | ||
48 | n = find_first_zero_bit(used_minors, N_DEVS); | ||
49 | if (n < N_DEVS) | ||
50 | set_bit(n, used_minors); | ||
51 | else | ||
52 | error = -1; | ||
53 | spin_unlock_irqrestore(&used_minors_lock, flags); | ||
54 | |||
55 | *sysminor = n * AOE_PARTITIONS; | ||
56 | return error; | ||
57 | } | ||
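Given the flexibility the comment above asks for, minor_get_dyn() simply hands out the first clear bit and scales it by AOE_PARTITIONS so each device keeps a contiguous block of partition minors. A userspace sketch of the same bookkeeping; the N_DEVS and AOE_PARTITIONS values below are assumptions, since the kernel derives them from MINORBITS and its partition configuration:

#include <stdio.h>

#define AOE_PARTITIONS 16	/* assumed partitions-per-device count */
#define N_DEVS 64		/* assumed number of device slots */

static unsigned char used[N_DEVS];	/* stands in for the used_minors bitmap */

/* Same idea as minor_get_dyn(): take the first free slot and map it to a
 * base minor, leaving AOE_PARTITIONS minors of room per device. */
static int minor_get_dyn(unsigned long *sysminor)
{
	unsigned long n;

	for (n = 0; n < N_DEVS; n++)
		if (!used[n]) {
			used[n] = 1;
			*sysminor = n * AOE_PARTITIONS;
			return 0;
		}
	return -1;	/* no free device slots */
}

/* Counterpart of minor_free(): clear the slot so it can be reused. */
static void minor_free(unsigned long sysminor)
{
	used[sysminor / AOE_PARTITIONS] = 0;
}

int main(void)
{
	unsigned long a = 0, b = 0;

	minor_get_dyn(&a);	/* first device: base minor 0 */
	minor_get_dyn(&b);	/* second device: base minor 16 */
	printf("base minors: %lu and %lu\n", a, b);

	minor_free(a);
	minor_get_dyn(&a);	/* slot 0 comes back after the free */
	printf("reused base minor: %lu\n", a);
	return 0;
}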
27 | 58 | ||
28 | spin_lock_irqsave(&devlist_lock, flags); | 59 | static int |
60 | minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) | ||
61 | { | ||
62 | ulong flags; | ||
63 | ulong n; | ||
64 | int error = 0; | ||
65 | enum { | ||
66 | /* for backwards compatibility when !aoe_dyndevs, | ||
67 | * a static number of supported slots per shelf */ | ||
68 | NPERSHELF = 16, | ||
69 | }; | ||
70 | |||
71 | if (aoemin >= NPERSHELF) { | ||
72 | pr_err("aoe: %s %d slots per shelf\n", | ||
73 | "static minor device numbers support only", | ||
74 | NPERSHELF); | ||
75 | error = -1; | ||
76 | goto out; | ||
77 | } | ||
29 | 78 | ||
30 | for (d=devlist; d; d=d->next) | 79 | n = aoemaj * NPERSHELF + aoemin; |
31 | if (d->aoemajor == maj && d->aoeminor == min) | 80 | if (n >= N_DEVS) { |
32 | break; | 81 | pr_err("aoe: %s with e%ld.%d\n", |
82 | "cannot use static minor device numbers", | ||
83 | aoemaj, aoemin); | ||
84 | error = -1; | ||
85 | goto out; | ||
86 | } | ||
87 | |||
88 | spin_lock_irqsave(&used_minors_lock, flags); | ||
89 | if (test_bit(n, used_minors)) { | ||
90 | pr_err("aoe: %s %lu\n", | ||
91 | "existing device already has static minor number", | ||
92 | n); | ||
93 | error = -1; | ||
94 | } else | ||
95 | set_bit(n, used_minors); | ||
96 | spin_unlock_irqrestore(&used_minors_lock, flags); | ||
97 | *sysminor = n * AOE_PARTITIONS; | ||
98 | out: | ||
99 | return error; | ||
100 | } | ||
101 | |||
102 | static int | ||
103 | minor_get(ulong *sysminor, ulong aoemaj, int aoemin) | ||
104 | { | ||
105 | if (aoe_dyndevs) | ||
106 | return minor_get_dyn(sysminor); | ||
107 | else | ||
108 | return minor_get_static(sysminor, aoemaj, aoemin); | ||
109 | } | ||
110 | |||
111 | static void | ||
112 | minor_free(ulong minor) | ||
113 | { | ||
114 | ulong flags; | ||
115 | |||
116 | minor /= AOE_PARTITIONS; | ||
117 | BUG_ON(minor >= N_DEVS); | ||
33 | 118 | ||
119 | spin_lock_irqsave(&used_minors_lock, flags); | ||
120 | BUG_ON(!test_bit(minor, used_minors)); | ||
121 | clear_bit(minor, used_minors); | ||
122 | spin_unlock_irqrestore(&used_minors_lock, flags); | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * Users who grab a pointer to the device with aoedev_by_aoeaddr | ||
127 | * automatically get a reference count and must be responsible | ||
128 | * for performing an aoedev_put. With the addition of async | ||
129 | * kthread processing I'm no longer confident that we can | ||
130 | * guarantee consistency in the face of device flushes. | ||
131 | * | ||
132 | * For the time being, we only bother to add extra references for | ||
133 | * frames sitting on the iocq. When the kthreads finish processing | ||
134 | * these frames, they will aoedev_put the device. | ||
135 | */ | ||
136 | |||
137 | void | ||
138 | aoedev_put(struct aoedev *d) | ||
139 | { | ||
140 | ulong flags; | ||
141 | |||
142 | spin_lock_irqsave(&devlist_lock, flags); | ||
143 | d->ref--; | ||
34 | spin_unlock_irqrestore(&devlist_lock, flags); | 144 | spin_unlock_irqrestore(&devlist_lock, flags); |
35 | return d; | ||
36 | } | 145 | } |
37 | 146 | ||
38 | static void | 147 | static void |
@@ -47,128 +156,250 @@ dummy_timer(ulong vp) | |||
47 | add_timer(&d->timer); | 156 | add_timer(&d->timer); |
48 | } | 157 | } |
49 | 158 | ||
159 | static void | ||
160 | aoe_failip(struct aoedev *d) | ||
161 | { | ||
162 | struct request *rq; | ||
163 | struct bio *bio; | ||
164 | unsigned long n; | ||
165 | |||
166 | aoe_failbuf(d, d->ip.buf); | ||
167 | |||
168 | rq = d->ip.rq; | ||
169 | if (rq == NULL) | ||
170 | return; | ||
171 | while ((bio = d->ip.nxbio)) { | ||
172 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
173 | d->ip.nxbio = bio->bi_next; | ||
174 | n = (unsigned long) rq->special; | ||
175 | rq->special = (void *) --n; | ||
176 | } | ||
177 | if ((unsigned long) rq->special == 0) | ||
178 | aoe_end_request(d, rq, 0); | ||
179 | } | ||
180 | |||
181 | static void | ||
182 | downdev_frame(struct list_head *pos) | ||
183 | { | ||
184 | struct frame *f; | ||
185 | |||
186 | f = list_entry(pos, struct frame, head); | ||
187 | list_del(pos); | ||
188 | if (f->buf) { | ||
189 | f->buf->nframesout--; | ||
190 | aoe_failbuf(f->t->d, f->buf); | ||
191 | } | ||
192 | aoe_freetframe(f); | ||
193 | } | ||
194 | |||
50 | void | 195 | void |
51 | aoedev_downdev(struct aoedev *d) | 196 | aoedev_downdev(struct aoedev *d) |
52 | { | 197 | { |
53 | struct aoetgt **t, **te; | 198 | struct aoetgt *t, **tt, **te; |
54 | struct frame *f, *e; | 199 | struct list_head *head, *pos, *nx; |
55 | struct buf *buf; | 200 | struct request *rq; |
56 | struct bio *bio; | 201 | int i; |
57 | 202 | ||
58 | t = d->targets; | 203 | d->flags &= ~DEVFL_UP; |
59 | te = t + NTARGETS; | 204 | |
60 | for (; t < te && *t; t++) { | 205 | /* clean out active and to-be-retransmitted buffers */ |
61 | f = (*t)->frames; | 206 | for (i = 0; i < NFACTIVE; i++) { |
62 | e = f + (*t)->nframes; | 207 | head = &d->factive[i]; |
63 | for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) { | 208 | list_for_each_safe(pos, nx, head) |
64 | if (f->tag == FREETAG || f->buf == NULL) | 209 | downdev_frame(pos); |
65 | continue; | ||
66 | buf = f->buf; | ||
67 | bio = buf->bio; | ||
68 | if (--buf->nframesout == 0 | ||
69 | && buf != d->inprocess) { | ||
70 | mempool_free(buf, d->bufpool); | ||
71 | bio_endio(bio, -EIO); | ||
72 | } | ||
73 | } | ||
74 | (*t)->maxout = (*t)->nframes; | ||
75 | (*t)->nout = 0; | ||
76 | } | 210 | } |
77 | buf = d->inprocess; | 211 | head = &d->rexmitq; |
78 | if (buf) { | 212 | list_for_each_safe(pos, nx, head) |
79 | bio = buf->bio; | 213 | downdev_frame(pos); |
80 | mempool_free(buf, d->bufpool); | 214 | |
81 | bio_endio(bio, -EIO); | 215 | /* reset window dressings */ |
216 | tt = d->targets; | ||
217 | te = tt + d->ntargets; | ||
218 | for (; tt < te && (t = *tt); tt++) { | ||
219 | aoecmd_wreset(t); | ||
220 | t->nout = 0; | ||
82 | } | 221 | } |
83 | d->inprocess = NULL; | 222 | |
84 | d->htgt = NULL; | 223 | /* clean out the in-process request (if any) */ |
85 | 224 | aoe_failip(d); | |
86 | while (!list_empty(&d->bufq)) { | 225 | |
87 | buf = container_of(d->bufq.next, struct buf, bufs); | 226 | /* fast fail all pending I/O */ |
88 | list_del(d->bufq.next); | 227 | if (d->blkq) { |
89 | bio = buf->bio; | 228 | while ((rq = blk_peek_request(d->blkq))) { |
90 | mempool_free(buf, d->bufpool); | 229 | blk_start_request(rq); |
91 | bio_endio(bio, -EIO); | 230 | aoe_end_request(d, rq, 1); |
231 | } | ||
92 | } | 232 | } |
93 | 233 | ||
94 | if (d->gd) | 234 | if (d->gd) |
95 | set_capacity(d->gd, 0); | 235 | set_capacity(d->gd, 0); |
236 | } | ||
96 | 237 | ||
97 | d->flags &= ~DEVFL_UP; | 238 | /* return whether the user asked for this particular |
239 | * device to be flushed | ||
240 | */ | ||
241 | static int | ||
242 | user_req(char *s, size_t slen, struct aoedev *d) | ||
243 | { | ||
244 | char *p; | ||
245 | size_t lim; | ||
246 | |||
247 | if (!d->gd) | ||
248 | return 0; | ||
249 | p = strrchr(d->gd->disk_name, '/'); | ||
250 | if (!p) | ||
251 | p = d->gd->disk_name; | ||
252 | else | ||
253 | p += 1; | ||
254 | lim = sizeof(d->gd->disk_name); | ||
255 | lim -= p - d->gd->disk_name; | ||
256 | if (slen < lim) | ||
257 | lim = slen; | ||
258 | |||
259 | return !strncmp(s, p, lim); | ||
98 | } | 260 | } |
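
user_req() above trims any directory prefix from gd->disk_name and then compares at most min(slen, space left in the name buffer) bytes against the user-supplied string. A stand-alone approximation of that comparison (hypothetical helper name, plain C string functions) is:

#include <stdio.h>
#include <string.h>

/* non-zero when "s" (of length slen) names the disk; the directory part
 * of disk_name is skipped first, as user_req() does with strrchr('/') */
static int name_matches(const char *s, size_t slen,
			const char *disk_name, size_t name_buf_len)
{
	const char *p = strrchr(disk_name, '/');
	size_t lim;

	p = p ? p + 1 : disk_name;		/* basename of disk_name */
	lim = name_buf_len - (p - disk_name);	/* room left in the buffer */
	if (slen < lim)
		lim = slen;
	return strncmp(s, p, lim) == 0;
}

int main(void)
{
	char disk_name[32] = "etherd/e0.0";	/* the shape aoe disk names take */

	printf("%d\n", name_matches("e0.0", 4, disk_name, sizeof(disk_name))); /* 1 */
	printf("%d\n", name_matches("e1.0", 4, disk_name, sizeof(disk_name))); /* 0 */
	return 0;
}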
99 | 261 | ||
100 | static void | 262 | static void |
101 | aoedev_freedev(struct aoedev *d) | 263 | freedev(struct aoedev *d) |
102 | { | 264 | { |
103 | struct aoetgt **t, **e; | 265 | struct aoetgt **t, **e; |
266 | int freeing = 0; | ||
267 | unsigned long flags; | ||
268 | |||
269 | spin_lock_irqsave(&d->lock, flags); | ||
270 | if (d->flags & DEVFL_TKILL | ||
271 | && !(d->flags & DEVFL_FREEING)) { | ||
272 | d->flags |= DEVFL_FREEING; | ||
273 | freeing = 1; | ||
274 | } | ||
275 | spin_unlock_irqrestore(&d->lock, flags); | ||
276 | if (!freeing) | ||
277 | return; | ||
104 | 278 | ||
105 | cancel_work_sync(&d->work); | 279 | del_timer_sync(&d->timer); |
106 | if (d->gd) { | 280 | if (d->gd) { |
107 | aoedisk_rm_sysfs(d); | 281 | aoedisk_rm_sysfs(d); |
108 | del_gendisk(d->gd); | 282 | del_gendisk(d->gd); |
109 | put_disk(d->gd); | 283 | put_disk(d->gd); |
284 | blk_cleanup_queue(d->blkq); | ||
110 | } | 285 | } |
111 | t = d->targets; | 286 | t = d->targets; |
112 | e = t + NTARGETS; | 287 | e = t + d->ntargets; |
113 | for (; t < e && *t; t++) | 288 | for (; t < e && *t; t++) |
114 | freetgt(d, *t); | 289 | freetgt(d, *t); |
115 | if (d->bufpool) | 290 | if (d->bufpool) |
116 | mempool_destroy(d->bufpool); | 291 | mempool_destroy(d->bufpool); |
117 | skbpoolfree(d); | 292 | skbpoolfree(d); |
118 | blk_cleanup_queue(d->blkq); | 293 | minor_free(d->sysminor); |
119 | kfree(d); | 294 | |
295 | spin_lock_irqsave(&d->lock, flags); | ||
296 | d->flags |= DEVFL_FREED; | ||
297 | spin_unlock_irqrestore(&d->lock, flags); | ||
120 | } | 298 | } |
121 | 299 | ||
122 | int | 300 | enum flush_parms { |
123 | aoedev_flush(const char __user *str, size_t cnt) | 301 | NOT_EXITING = 0, |
302 | EXITING = 1, | ||
303 | }; | ||
304 | |||
305 | static int | ||
306 | flush(const char __user *str, size_t cnt, int exiting) | ||
124 | { | 307 | { |
125 | ulong flags; | 308 | ulong flags; |
126 | struct aoedev *d, **dd; | 309 | struct aoedev *d, **dd; |
127 | struct aoedev *rmd = NULL; | ||
128 | char buf[16]; | 310 | char buf[16]; |
129 | int all = 0; | 311 | int all = 0; |
312 | int specified = 0; /* flush a specific device */ | ||
313 | unsigned int skipflags; | ||
314 | |||
315 | skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; | ||
130 | 316 | ||
131 | if (cnt >= 3) { | 317 | if (!exiting && cnt >= 3) { |
132 | if (cnt > sizeof buf) | 318 | if (cnt > sizeof buf) |
133 | cnt = sizeof buf; | 319 | cnt = sizeof buf; |
134 | if (copy_from_user(buf, str, cnt)) | 320 | if (copy_from_user(buf, str, cnt)) |
135 | return -EFAULT; | 321 | return -EFAULT; |
136 | all = !strncmp(buf, "all", 3); | 322 | all = !strncmp(buf, "all", 3); |
323 | if (!all) | ||
324 | specified = 1; | ||
137 | } | 325 | } |
138 | 326 | ||
327 | flush_scheduled_work(); | ||
328 | /* pass one: without sleeping, do aoedev_downdev */ | ||
139 | spin_lock_irqsave(&devlist_lock, flags); | 329 | spin_lock_irqsave(&devlist_lock, flags); |
140 | dd = &devlist; | 330 | for (d = devlist; d; d = d->next) { |
141 | while ((d = *dd)) { | ||
142 | spin_lock(&d->lock); | 331 | spin_lock(&d->lock); |
143 | if ((!all && (d->flags & DEVFL_UP)) | 332 | if (exiting) { |
144 | || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) | 333 | /* unconditionally take each device down */ |
145 | || d->nopen) { | 334 | } else if (specified) { |
146 | spin_unlock(&d->lock); | 335 | if (!user_req(buf, cnt, d)) |
147 | dd = &d->next; | 336 | goto cont; |
148 | continue; | 337 | } else if ((!all && (d->flags & DEVFL_UP)) |
149 | } | 338 | || d->flags & skipflags |
150 | *dd = d->next; | 339 | || d->nopen |
340 | || d->ref) | ||
341 | goto cont; | ||
342 | |||
151 | aoedev_downdev(d); | 343 | aoedev_downdev(d); |
152 | d->flags |= DEVFL_TKILL; | 344 | d->flags |= DEVFL_TKILL; |
345 | cont: | ||
153 | spin_unlock(&d->lock); | 346 | spin_unlock(&d->lock); |
154 | d->next = rmd; | ||
155 | rmd = d; | ||
156 | } | 347 | } |
157 | spin_unlock_irqrestore(&devlist_lock, flags); | 348 | spin_unlock_irqrestore(&devlist_lock, flags); |
158 | while ((d = rmd)) { | 349 | |
159 | rmd = d->next; | 350 | /* pass two: call freedev, which might sleep, |
160 | del_timer_sync(&d->timer); | 351 | * for aoedevs marked with DEVFL_TKILL |
161 | aoedev_freedev(d); /* must be able to sleep */ | 352 | */ |
353 | restart: | ||
354 | spin_lock_irqsave(&devlist_lock, flags); | ||
355 | for (d = devlist; d; d = d->next) { | ||
356 | spin_lock(&d->lock); | ||
357 | if (d->flags & DEVFL_TKILL | ||
358 | && !(d->flags & DEVFL_FREEING)) { | ||
359 | spin_unlock(&d->lock); | ||
360 | spin_unlock_irqrestore(&devlist_lock, flags); | ||
361 | freedev(d); | ||
362 | goto restart; | ||
363 | } | ||
364 | spin_unlock(&d->lock); | ||
162 | } | 365 | } |
366 | |||
367 | /* pass three: remove aoedevs marked with DEVFL_FREED */ | ||
368 | for (dd = &devlist, d = *dd; d; d = *dd) { | ||
369 | struct aoedev *doomed = NULL; | ||
370 | |||
371 | spin_lock(&d->lock); | ||
372 | if (d->flags & DEVFL_FREED) { | ||
373 | *dd = d->next; | ||
374 | doomed = d; | ||
375 | } else { | ||
376 | dd = &d->next; | ||
377 | } | ||
378 | spin_unlock(&d->lock); | ||
379 | if (doomed) | ||
380 | kfree(doomed->targets); | ||
381 | kfree(doomed); | ||
382 | } | ||
383 | spin_unlock_irqrestore(&devlist_lock, flags); | ||
384 | |||
163 | return 0; | 385 | return 0; |
164 | } | 386 | } |
165 | 387 | ||
166 | /* I'm not really sure that this is a realistic problem, but if the | 388 | int |
167 | network driver goes gonzo let's just leak memory after complaining. */ | 389 | aoedev_flush(const char __user *str, size_t cnt) |
390 | { | ||
391 | return flush(str, cnt, NOT_EXITING); | ||
392 | } | ||
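
flush() above is structured as three passes over devlist: pass one takes matching devices down and marks them DEVFL_TKILL without sleeping, pass two calls freedev() (which may sleep) for each marked device, and pass three unlinks and frees whatever reached DEVFL_FREED. A condensed user-space sketch of that mark / release / unlink shape on a singly linked list (hypothetical flag and field names, no locking) is:

#include <stdio.h>
#include <stdlib.h>

enum { FL_UP = 1, FL_TKILL = 2, FL_FREED = 4 };

struct node {
	struct node *next;
	int flags;
	int id;
};

int main(void)
{
	struct node *list = NULL, *n, **pp;
	int i;

	/* build a small device list; even ids stay "up" (still in use) */
	for (i = 0; i < 4; i++) {
		n = calloc(1, sizeof(*n));
		n->id = i;
		n->flags = (i % 2 == 0) ? FL_UP : 0;
		n->next = list;
		list = n;
	}

	/* pass one: mark idle devices for teardown (nothing that sleeps) */
	for (n = list; n; n = n->next)
		if (!(n->flags & FL_UP))
			n->flags |= FL_TKILL;

	/* pass two: release heavyweight resources of marked devices;
	 * in the driver this is freedev(), which may sleep */
	for (n = list; n; n = n->next)
		if (n->flags & FL_TKILL)
			n->flags |= FL_FREED;	/* teardown done */

	/* pass three: unlink and free everything that reached FL_FREED */
	for (pp = &list; (n = *pp); ) {
		if (n->flags & FL_FREED) {
			*pp = n->next;
			free(n);
		} else {
			pp = &n->next;
		}
	}

	for (n = list; n; n = n->next)
		printf("kept device %d\n", n->id);
	return 0;
}

Splitting the work this way lets the first pass run entirely under the spinlock while the sleeping teardown happens with the list lock dropped.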
393 | |||
394 | /* This has been confirmed to occur once with Tms=3*1000 due to the | ||
395 | * driver changing link and not processing its transmit ring. The | ||
396 | * problem is hard enough to solve by returning an error that I'm | ||
397 | * still punting on "solving" this. | ||
398 | */ | ||
168 | static void | 399 | static void |
169 | skbfree(struct sk_buff *skb) | 400 | skbfree(struct sk_buff *skb) |
170 | { | 401 | { |
171 | enum { Sms = 100, Tms = 3*1000}; | 402 | enum { Sms = 250, Tms = 30 * 1000}; |
172 | int i = Tms / Sms; | 403 | int i = Tms / Sms; |
173 | 404 | ||
174 | if (skb == NULL) | 405 | if (skb == NULL) |
@@ -182,6 +413,7 @@ skbfree(struct sk_buff *skb) | |||
182 | "cannot free skb -- memory leaked."); | 413 | "cannot free skb -- memory leaked."); |
183 | return; | 414 | return; |
184 | } | 415 | } |
416 | skb->truesize -= skb->data_len; | ||
185 | skb_shinfo(skb)->nr_frags = skb->data_len = 0; | 417 | skb_shinfo(skb)->nr_frags = skb->data_len = 0; |
186 | skb_trim(skb, 0); | 418 | skb_trim(skb, 0); |
187 | dev_kfree_skb(skb); | 419 | dev_kfree_skb(skb); |
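
skbfree() above now retries for up to Tms milliseconds in Sms-millisecond steps before giving up and deliberately leaking the skb. A stand-alone sketch of that bounded wait (usleep() in place of msleep(), and a hypothetical still_in_use() predicate standing in for the skb reference checks) is:

#include <stdio.h>
#include <unistd.h>

enum { Sms = 250, Tms = 30 * 1000 };	/* step and total budget, in ms */

/* hypothetical stand-in for "the skb still has other users" */
static int still_in_use(int *users)
{
	if (*users > 0)
		(*users)--;
	return *users > 0;
}

int main(void)
{
	int users = 3;		/* pretend the buffer has extra references */
	int i = Tms / Sms;	/* number of retries before giving up */

	while (still_in_use(&users) && i-- > 0)
		usleep(Sms * 1000);	/* msleep(Sms) in the driver */

	if (users > 0)
		printf("cannot free -- would be leaked\n");
	else
		printf("safe to free after the wait\n");
	return 0;
}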
@@ -198,26 +430,43 @@ skbpoolfree(struct aoedev *d) | |||
198 | __skb_queue_head_init(&d->skbpool); | 430 | __skb_queue_head_init(&d->skbpool); |
199 | } | 431 | } |
200 | 432 | ||
201 | /* find it or malloc it */ | 433 | /* find it or allocate it */ |
202 | struct aoedev * | 434 | struct aoedev * |
203 | aoedev_by_sysminor_m(ulong sysminor) | 435 | aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) |
204 | { | 436 | { |
205 | struct aoedev *d; | 437 | struct aoedev *d; |
438 | int i; | ||
206 | ulong flags; | 439 | ulong flags; |
440 | ulong sysminor = 0; | ||
207 | 441 | ||
208 | spin_lock_irqsave(&devlist_lock, flags); | 442 | spin_lock_irqsave(&devlist_lock, flags); |
209 | 443 | ||
210 | for (d=devlist; d; d=d->next) | 444 | for (d=devlist; d; d=d->next) |
211 | if (d->sysminor == sysminor) | 445 | if (d->aoemajor == maj && d->aoeminor == min) { |
446 | spin_lock(&d->lock); | ||
447 | if (d->flags & DEVFL_TKILL) { | ||
448 | spin_unlock(&d->lock); | ||
449 | d = NULL; | ||
450 | goto out; | ||
451 | } | ||
452 | d->ref++; | ||
453 | spin_unlock(&d->lock); | ||
212 | break; | 454 | break; |
213 | if (d) | 455 | } |
456 | if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) | ||
214 | goto out; | 457 | goto out; |
215 | d = kcalloc(1, sizeof *d, GFP_ATOMIC); | 458 | d = kcalloc(1, sizeof *d, GFP_ATOMIC); |
216 | if (!d) | 459 | if (!d) |
217 | goto out; | 460 | goto out; |
461 | d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); | ||
462 | if (!d->targets) { | ||
463 | kfree(d); | ||
464 | d = NULL; | ||
465 | goto out; | ||
466 | } | ||
467 | d->ntargets = NTARGETS; | ||
218 | INIT_WORK(&d->work, aoecmd_sleepwork); | 468 | INIT_WORK(&d->work, aoecmd_sleepwork); |
219 | spin_lock_init(&d->lock); | 469 | spin_lock_init(&d->lock); |
220 | skb_queue_head_init(&d->sendq); | ||
221 | skb_queue_head_init(&d->skbpool); | 470 | skb_queue_head_init(&d->skbpool); |
222 | init_timer(&d->timer); | 471 | init_timer(&d->timer); |
223 | d->timer.data = (ulong) d; | 472 | d->timer.data = (ulong) d; |
@@ -226,11 +475,15 @@ aoedev_by_sysminor_m(ulong sysminor) | |||
226 | add_timer(&d->timer); | 475 | add_timer(&d->timer); |
227 | d->bufpool = NULL; /* defer to aoeblk_gdalloc */ | 476 | d->bufpool = NULL; /* defer to aoeblk_gdalloc */ |
228 | d->tgt = d->targets; | 477 | d->tgt = d->targets; |
229 | INIT_LIST_HEAD(&d->bufq); | 478 | d->ref = 1; |
479 | for (i = 0; i < NFACTIVE; i++) | ||
480 | INIT_LIST_HEAD(&d->factive[i]); | ||
481 | INIT_LIST_HEAD(&d->rexmitq); | ||
230 | d->sysminor = sysminor; | 482 | d->sysminor = sysminor; |
231 | d->aoemajor = AOEMAJOR(sysminor); | 483 | d->aoemajor = maj; |
232 | d->aoeminor = AOEMINOR(sysminor); | 484 | d->aoeminor = min; |
233 | d->mintimer = MINTIMER; | 485 | d->rttavg = RTTAVG_INIT; |
486 | d->rttdev = RTTDEV_INIT; | ||
234 | d->next = devlist; | 487 | d->next = devlist; |
235 | devlist = d; | 488 | devlist = d; |
236 | out: | 489 | out: |
@@ -241,33 +494,32 @@ aoedev_by_sysminor_m(ulong sysminor) | |||
241 | static void | 494 | static void |
242 | freetgt(struct aoedev *d, struct aoetgt *t) | 495 | freetgt(struct aoedev *d, struct aoetgt *t) |
243 | { | 496 | { |
244 | struct frame *f, *e; | 497 | struct frame *f; |
498 | struct list_head *pos, *nx, *head; | ||
499 | struct aoeif *ifp; | ||
245 | 500 | ||
246 | f = t->frames; | 501 | for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { |
247 | e = f + t->nframes; | 502 | if (!ifp->nd) |
248 | for (; f < e; f++) | 503 | break; |
504 | dev_put(ifp->nd); | ||
505 | } | ||
506 | |||
507 | head = &t->ffree; | ||
508 | list_for_each_safe(pos, nx, head) { | ||
509 | list_del(pos); | ||
510 | f = list_entry(pos, struct frame, head); | ||
249 | skbfree(f->skb); | 511 | skbfree(f->skb); |
250 | kfree(t->frames); | 512 | kfree(f); |
513 | } | ||
251 | kfree(t); | 514 | kfree(t); |
252 | } | 515 | } |
253 | 516 | ||
254 | void | 517 | void |
255 | aoedev_exit(void) | 518 | aoedev_exit(void) |
256 | { | 519 | { |
257 | struct aoedev *d; | 520 | flush_scheduled_work(); |
258 | ulong flags; | 521 | aoe_flush_iocq(); |
259 | 522 | flush(NULL, 0, EXITING); | |
260 | while ((d = devlist)) { | ||
261 | devlist = d->next; | ||
262 | |||
263 | spin_lock_irqsave(&d->lock, flags); | ||
264 | aoedev_downdev(d); | ||
265 | d->flags |= DEVFL_TKILL; | ||
266 | spin_unlock_irqrestore(&d->lock, flags); | ||
267 | |||
268 | del_timer_sync(&d->timer); | ||
269 | aoedev_freedev(d); | ||
270 | } | ||
271 | } | 523 | } |
272 | 524 | ||
273 | int __init | 525 | int __init |
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 7f83ad90e76f..4b987c2fefbe 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoemain.c | 3 | * aoemain.c |
4 | * Module initialization routines, discover timer | 4 | * Module initialization routines, discover timer |
@@ -61,6 +61,7 @@ aoe_exit(void) | |||
61 | 61 | ||
62 | aoenet_exit(); | 62 | aoenet_exit(); |
63 | unregister_blkdev(AOE_MAJOR, DEVICE_NAME); | 63 | unregister_blkdev(AOE_MAJOR, DEVICE_NAME); |
64 | aoecmd_exit(); | ||
64 | aoechr_exit(); | 65 | aoechr_exit(); |
65 | aoedev_exit(); | 66 | aoedev_exit(); |
66 | aoeblk_exit(); /* free cache after de-allocating bufs */ | 67 | aoeblk_exit(); /* free cache after de-allocating bufs */ |
@@ -83,17 +84,20 @@ aoe_init(void) | |||
83 | ret = aoenet_init(); | 84 | ret = aoenet_init(); |
84 | if (ret) | 85 | if (ret) |
85 | goto net_fail; | 86 | goto net_fail; |
87 | ret = aoecmd_init(); | ||
88 | if (ret) | ||
89 | goto cmd_fail; | ||
86 | ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); | 90 | ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); |
87 | if (ret < 0) { | 91 | if (ret < 0) { |
88 | printk(KERN_ERR "aoe: can't register major\n"); | 92 | printk(KERN_ERR "aoe: can't register major\n"); |
89 | goto blkreg_fail; | 93 | goto blkreg_fail; |
90 | } | 94 | } |
91 | |||
92 | printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); | 95 | printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); |
93 | discover_timer(TINIT); | 96 | discover_timer(TINIT); |
94 | return 0; | 97 | return 0; |
95 | |||
96 | blkreg_fail: | 98 | blkreg_fail: |
99 | aoecmd_exit(); | ||
100 | cmd_fail: | ||
97 | aoenet_exit(); | 101 | aoenet_exit(); |
98 | net_fail: | 102 | net_fail: |
99 | aoeblk_exit(); | 103 | aoeblk_exit(); |
@@ -101,7 +105,7 @@ aoe_init(void) | |||
101 | aoechr_exit(); | 105 | aoechr_exit(); |
102 | chr_fail: | 106 | chr_fail: |
103 | aoedev_exit(); | 107 | aoedev_exit(); |
104 | 108 | ||
105 | printk(KERN_INFO "aoe: initialisation failure.\n"); | 109 | printk(KERN_INFO "aoe: initialisation failure.\n"); |
106 | return ret; | 110 | return ret; |
107 | } | 111 | } |
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 4d3bc0d49df5..71d3ea8d3006 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ | 1 | /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* | 2 | /* |
3 | * aoenet.c | 3 | * aoenet.c |
4 | * Ethernet portion of AoE driver | 4 | * Ethernet portion of AoE driver |
@@ -31,7 +31,10 @@ enum { | |||
31 | 31 | ||
32 | static char aoe_iflist[IFLISTSZ]; | 32 | static char aoe_iflist[IFLISTSZ]; |
33 | module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); | 33 | module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); |
34 | MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\""); | 34 | MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); |
35 | |||
36 | static wait_queue_head_t txwq; | ||
37 | static struct ktstate kts; | ||
35 | 38 | ||
36 | #ifndef MODULE | 39 | #ifndef MODULE |
37 | static int __init aoe_iflist_setup(char *str) | 40 | static int __init aoe_iflist_setup(char *str) |
@@ -44,6 +47,28 @@ static int __init aoe_iflist_setup(char *str) | |||
44 | __setup("aoe_iflist=", aoe_iflist_setup); | 47 | __setup("aoe_iflist=", aoe_iflist_setup); |
45 | #endif | 48 | #endif |
46 | 49 | ||
50 | static spinlock_t txlock; | ||
51 | static struct sk_buff_head skbtxq; | ||
52 | |||
53 | /* enters with txlock held */ | ||
54 | static int | ||
55 | tx(void) __must_hold(&txlock) | ||
56 | { | ||
57 | struct sk_buff *skb; | ||
58 | struct net_device *ifp; | ||
59 | |||
60 | while ((skb = skb_dequeue(&skbtxq))) { | ||
61 | spin_unlock_irq(&txlock); | ||
62 | ifp = skb->dev; | ||
63 | if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) | ||
64 | pr_warn("aoe: packet could not be sent on %s. %s\n", | ||
65 | ifp ? ifp->name : "netif", | ||
66 | "consider increasing tx_queue_len"); | ||
67 | spin_lock_irq(&txlock); | ||
68 | } | ||
69 | return 0; | ||
70 | } | ||
71 | |||
47 | int | 72 | int |
48 | is_aoe_netif(struct net_device *ifp) | 73 | is_aoe_netif(struct net_device *ifp) |
49 | { | 74 | { |
@@ -88,21 +113,27 @@ void | |||
88 | aoenet_xmit(struct sk_buff_head *queue) | 113 | aoenet_xmit(struct sk_buff_head *queue) |
89 | { | 114 | { |
90 | struct sk_buff *skb, *tmp; | 115 | struct sk_buff *skb, *tmp; |
116 | ulong flags; | ||
91 | 117 | ||
92 | skb_queue_walk_safe(queue, skb, tmp) { | 118 | skb_queue_walk_safe(queue, skb, tmp) { |
93 | __skb_unlink(skb, queue); | 119 | __skb_unlink(skb, queue); |
94 | dev_queue_xmit(skb); | 120 | spin_lock_irqsave(&txlock, flags); |
121 | skb_queue_tail(&skbtxq, skb); | ||
122 | spin_unlock_irqrestore(&txlock, flags); | ||
123 | wake_up(&txwq); | ||
95 | } | 124 | } |
96 | } | 125 | } |
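
aoenet_xmit() above no longer transmits directly: it appends the skbs to skbtxq under txlock and wakes txwq, and the aoe_tx kthread drains the queue in tx(), dropping the lock around each dev_queue_xmit(). A small pthread sketch of that hand-off (hypothetical names, a fixed-size ring in place of an sk_buff queue) is:

#include <pthread.h>
#include <stdio.h>

#define NMSG 5

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* txlock */
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;		/* txwq */
static int queue[NMSG], head, tail, done;

/* consumer: the role of the aoe_tx kthread running tx() */
static void *txworker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (head == tail && !done)
			pthread_cond_wait(&wq, &lock);
		if (head == tail && done)
			break;
		int pkt = queue[head % NMSG];	/* dequeue with lock held */
		head++;
		pthread_mutex_unlock(&lock);	/* transmit with it dropped */
		printf("xmit packet %d\n", pkt);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, txworker, NULL);

	/* producer: the role of aoenet_xmit() */
	for (int i = 0; i < NMSG; i++) {
		pthread_mutex_lock(&lock);
		queue[tail % NMSG] = i;		/* skb_queue_tail(&skbtxq, skb) */
		tail++;
		pthread_mutex_unlock(&lock);
		pthread_cond_signal(&wq);	/* wake_up(&txwq) */
	}

	pthread_mutex_lock(&lock);
	done = 1;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&wq);
	pthread_join(t, NULL);
	return 0;
}

(Compile with cc -pthread. Submitters only queue and wake; the actual dev_queue_xmit() calls then happen from the kthread's process context.)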
97 | 126 | ||
98 | /* | 127 | /* |
99 | * (1) len doesn't include the header by default. I want this. | 128 | * (1) len doesn't include the header by default. I want this. |
100 | */ | 129 | */ |
101 | static int | 130 | static int |
102 | aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) | 131 | aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) |
103 | { | 132 | { |
104 | struct aoe_hdr *h; | 133 | struct aoe_hdr *h; |
134 | struct aoe_atahdr *ah; | ||
105 | u32 n; | 135 | u32 n; |
136 | int sn; | ||
106 | 137 | ||
107 | if (dev_net(ifp) != &init_net) | 138 | if (dev_net(ifp) != &init_net) |
108 | goto exit; | 139 | goto exit; |
@@ -110,13 +141,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, | |||
110 | skb = skb_share_check(skb, GFP_ATOMIC); | 141 | skb = skb_share_check(skb, GFP_ATOMIC); |
111 | if (skb == NULL) | 142 | if (skb == NULL) |
112 | return 0; | 143 | return 0; |
113 | if (skb_linearize(skb)) | ||
114 | goto exit; | ||
115 | if (!is_aoe_netif(ifp)) | 144 | if (!is_aoe_netif(ifp)) |
116 | goto exit; | 145 | goto exit; |
117 | skb_push(skb, ETH_HLEN); /* (1) */ | 146 | skb_push(skb, ETH_HLEN); /* (1) */ |
118 | 147 | sn = sizeof(*h) + sizeof(*ah); | |
119 | h = (struct aoe_hdr *) skb_mac_header(skb); | 148 | if (skb->len >= sn) { |
149 | sn -= skb_headlen(skb); | ||
150 | if (sn > 0 && !__pskb_pull_tail(skb, sn)) | ||
151 | goto exit; | ||
152 | } | ||
153 | h = (struct aoe_hdr *) skb->data; | ||
120 | n = get_unaligned_be32(&h->tag); | 154 | n = get_unaligned_be32(&h->tag); |
121 | if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) | 155 | if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) |
122 | goto exit; | 156 | goto exit; |
@@ -137,7 +171,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, | |||
137 | 171 | ||
138 | switch (h->cmd) { | 172 | switch (h->cmd) { |
139 | case AOECMD_ATA: | 173 | case AOECMD_ATA: |
140 | aoecmd_ata_rsp(skb); | 174 | /* ata_rsp may keep skb for later processing or give it back */ |
175 | skb = aoecmd_ata_rsp(skb); | ||
141 | break; | 176 | break; |
142 | case AOECMD_CFG: | 177 | case AOECMD_CFG: |
143 | aoecmd_cfg_rsp(skb); | 178 | aoecmd_cfg_rsp(skb); |
@@ -145,8 +180,12 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, | |||
145 | default: | 180 | default: |
146 | if (h->cmd >= AOECMD_VEND_MIN) | 181 | if (h->cmd >= AOECMD_VEND_MIN) |
147 | break; /* don't complain about vendor commands */ | 182 | break; /* don't complain about vendor commands */ |
148 | printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); | 183 | pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); |
184 | break; | ||
149 | } | 185 | } |
186 | |||
187 | if (!skb) | ||
188 | return 0; | ||
150 | exit: | 189 | exit: |
151 | dev_kfree_skb(skb); | 190 | dev_kfree_skb(skb); |
152 | return 0; | 191 | return 0; |
@@ -160,6 +199,15 @@ static struct packet_type aoe_pt __read_mostly = { | |||
160 | int __init | 199 | int __init |
161 | aoenet_init(void) | 200 | aoenet_init(void) |
162 | { | 201 | { |
202 | skb_queue_head_init(&skbtxq); | ||
203 | init_waitqueue_head(&txwq); | ||
204 | spin_lock_init(&txlock); | ||
205 | kts.lock = &txlock; | ||
206 | kts.fn = tx; | ||
207 | kts.waitq = &txwq; | ||
208 | kts.name = "aoe_tx"; | ||
209 | if (aoe_ktstart(&kts)) | ||
210 | return -EAGAIN; | ||
163 | dev_add_pack(&aoe_pt); | 211 | dev_add_pack(&aoe_pt); |
164 | return 0; | 212 | return 0; |
165 | } | 213 | } |
@@ -167,6 +215,8 @@ aoenet_init(void) | |||
167 | void | 215 | void |
168 | aoenet_exit(void) | 216 | aoenet_exit(void) |
169 | { | 217 | { |
218 | aoe_ktstop(&kts); | ||
219 | skb_queue_purge(&skbtxq); | ||
170 | dev_remove_pack(&aoe_pt); | 220 | dev_remove_pack(&aoe_pt); |
171 | } | 221 | } |
172 | 222 | ||
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index b0f553b26d0f..6526157edafc 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c | |||
@@ -41,8 +41,9 @@ | |||
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
43 | #include <linux/mutex.h> | 43 | #include <linux/mutex.h> |
44 | #include <linux/bitmap.h> | ||
45 | #include <linux/io.h> | ||
44 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
45 | #include <asm/io.h> | ||
46 | 47 | ||
47 | #include <linux/dma-mapping.h> | 48 | #include <linux/dma-mapping.h> |
48 | #include <linux/blkdev.h> | 49 | #include <linux/blkdev.h> |
@@ -978,8 +979,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h) | |||
978 | i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); | 979 | i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); |
979 | if (i == h->nr_cmds) | 980 | if (i == h->nr_cmds) |
980 | return NULL; | 981 | return NULL; |
981 | } while (test_and_set_bit(i & (BITS_PER_LONG - 1), | 982 | } while (test_and_set_bit(i, h->cmd_pool_bits) != 0); |
982 | h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0); | ||
983 | c = h->cmd_pool + i; | 983 | c = h->cmd_pool + i; |
984 | memset(c, 0, sizeof(CommandList_struct)); | 984 | memset(c, 0, sizeof(CommandList_struct)); |
985 | cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); | 985 | cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); |
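
The cmd_alloc() hunk above drops the hand-rolled word/bit arithmetic: find_first_zero_bit() and test_and_set_bit() both take a bit index into the whole bitmap, so passing i directly is equivalent to the old i / BITS_PER_LONG and i & (BITS_PER_LONG - 1) pair. A non-atomic user-space model of that allocation loop (hypothetical helper names; the kernel versions are atomic) is:

#include <limits.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define NR_CMDS		70	/* example pool size; spans two longs */

static unsigned long bits[BITS_TO_LONGS(NR_CMDS)];

static int test_and_set(unsigned long *map, unsigned int nr)
{
	unsigned long mask = 1UL << (nr % BITS_PER_LONG);
	unsigned long *w = map + nr / BITS_PER_LONG;
	int old = (*w & mask) != 0;

	*w |= mask;
	return old;
}

static unsigned int first_zero(const unsigned long *map, unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		if (!(map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG))))
			return i;
	return size;	/* none free */
}

/* allocate one command slot, or -1 when the pool is exhausted */
static int slot_alloc(void)
{
	unsigned int i;

	do {
		i = first_zero(bits, NR_CMDS);
		if (i == NR_CMDS)
			return -1;
	} while (test_and_set(bits, i) != 0);
	return (int)i;
}

int main(void)
{
	memset(bits, 0, sizeof(bits));	/* what bitmap_zero() amounts to */
	printf("first slot:  %d\n", slot_alloc());	/* 0 */
	printf("second slot: %d\n", slot_alloc());	/* 1 */
	return 0;
}

The same BITS_TO_LONGS() sizing is what replaces the DIV_ROUND_UP() expression in cciss_allocate_cmd_pool() further down.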
@@ -1046,8 +1046,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c) | |||
1046 | int i; | 1046 | int i; |
1047 | 1047 | ||
1048 | i = c - h->cmd_pool; | 1048 | i = c - h->cmd_pool; |
1049 | clear_bit(i & (BITS_PER_LONG - 1), | 1049 | clear_bit(i, h->cmd_pool_bits); |
1050 | h->cmd_pool_bits + (i / BITS_PER_LONG)); | ||
1051 | h->nr_frees++; | 1050 | h->nr_frees++; |
1052 | } | 1051 | } |
1053 | 1052 | ||
@@ -4268,10 +4267,7 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h) | |||
4268 | 4267 | ||
4269 | static inline bool CISS_signature_present(ctlr_info_t *h) | 4268 | static inline bool CISS_signature_present(ctlr_info_t *h) |
4270 | { | 4269 | { |
4271 | if ((readb(&h->cfgtable->Signature[0]) != 'C') || | 4270 | if (!check_signature(h->cfgtable->Signature, "CISS", 4)) { |
4272 | (readb(&h->cfgtable->Signature[1]) != 'I') || | ||
4273 | (readb(&h->cfgtable->Signature[2]) != 'S') || | ||
4274 | (readb(&h->cfgtable->Signature[3]) != 'S')) { | ||
4275 | dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); | 4271 | dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); |
4276 | return false; | 4272 | return false; |
4277 | } | 4273 | } |
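
CISS_signature_present() now uses check_signature(), which reads the candidate bytes one at a time and compares them against the expected string, replacing the four explicit readb() comparisons. A plain-memory user-space analogue (hypothetical name; the real helper operates on I/O-mapped memory via readb()) is:

#include <stdbool.h>
#include <stdio.h>

/* compare "length" bytes at "io" against "signature", one byte at a time */
static bool signature_present(const volatile unsigned char *io,
			      const unsigned char *signature, int length)
{
	while (length--)
		if (*io++ != *signature++)
			return false;
	return true;
}

int main(void)
{
	unsigned char cfgtable[8] = { 'C', 'I', 'S', 'S' };

	if (signature_present(cfgtable, (const unsigned char *)"CISS", 4))
		printf("CISS signature found\n");
	else
		printf("not a valid CISS config table\n");
	return 0;
}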
@@ -4812,8 +4808,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) | |||
4812 | 4808 | ||
4813 | static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) | 4809 | static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) |
4814 | { | 4810 | { |
4815 | h->cmd_pool_bits = kmalloc( | 4811 | h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) * |
4816 | DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) * | ||
4817 | sizeof(unsigned long), GFP_KERNEL); | 4812 | sizeof(unsigned long), GFP_KERNEL); |
4818 | h->cmd_pool = pci_alloc_consistent(h->pdev, | 4813 | h->cmd_pool = pci_alloc_consistent(h->pdev, |
4819 | h->nr_cmds * sizeof(CommandList_struct), | 4814 | h->nr_cmds * sizeof(CommandList_struct), |
@@ -5068,9 +5063,7 @@ reinit_after_soft_reset: | |||
5068 | pci_set_drvdata(pdev, h); | 5063 | pci_set_drvdata(pdev, h); |
5069 | /* command and error info recs zeroed out before | 5064 | /* command and error info recs zeroed out before |
5070 | they are used */ | 5065 | they are used */ |
5071 | memset(h->cmd_pool_bits, 0, | 5066 | bitmap_zero(h->cmd_pool_bits, h->nr_cmds); |
5072 | DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) | ||
5073 | * sizeof(unsigned long)); | ||
5074 | 5067 | ||
5075 | h->num_luns = 0; | 5068 | h->num_luns = 0; |
5076 | h->highest_lun = -1; | 5069 | h->highest_lun = -1; |
@@ -5205,7 +5198,6 @@ static void cciss_shutdown(struct pci_dev *pdev) | |||
5205 | return; | 5198 | return; |
5206 | } | 5199 | } |
5207 | /* write all data in the battery backed cache to disk */ | 5200 | /* write all data in the battery backed cache to disk */ |
5208 | memset(flush_buf, 0, 4); | ||
5209 | return_code = sendcmd_withirq(h, CCISS_CACHE_FLUSH, flush_buf, | 5201 | return_code = sendcmd_withirq(h, CCISS_CACHE_FLUSH, flush_buf, |
5210 | 4, 0, CTLR_LUNID, TYPE_CMD); | 5202 | 4, 0, CTLR_LUNID, TYPE_CMD); |
5211 | kfree(flush_buf); | 5203 | kfree(flush_buf); |
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig index df0983787390..7845bd6ee414 100644 --- a/drivers/block/drbd/Kconfig +++ b/drivers/block/drbd/Kconfig | |||
@@ -2,13 +2,14 @@ | |||
2 | # DRBD device driver configuration | 2 | # DRBD device driver configuration |
3 | # | 3 | # |
4 | 4 | ||
5 | comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" | 5 | comment "DRBD disabled because PROC_FS or INET not selected" |
6 | depends on PROC_FS='n' || INET='n' || CONNECTOR='n' | 6 | depends on PROC_FS='n' || INET='n' |
7 | 7 | ||
8 | config BLK_DEV_DRBD | 8 | config BLK_DEV_DRBD |
9 | tristate "DRBD Distributed Replicated Block Device support" | 9 | tristate "DRBD Distributed Replicated Block Device support" |
10 | depends on PROC_FS && INET && CONNECTOR | 10 | depends on PROC_FS && INET |
11 | select LRU_CACHE | 11 | select LRU_CACHE |
12 | select LIBCRC32C | ||
12 | default n | 13 | default n |
13 | help | 14 | help |
14 | 15 | ||
@@ -58,7 +59,8 @@ config DRBD_FAULT_INJECTION | |||
58 | 32 data read | 59 | 32 data read |
59 | 64 read ahead | 60 | 64 read ahead |
60 | 128 kmalloc of bitmap | 61 | 128 kmalloc of bitmap |
61 | 256 allocation of EE (epoch_entries) | 62 | 256 allocation of peer_requests |
63 | 512 insert data corruption on receiving side | ||
62 | 64 | ||
63 | fault_devs: bitmask of minor numbers | 65 | fault_devs: bitmask of minor numbers |
64 | fault_rate: frequency in percent | 66 | fault_rate: frequency in percent |
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 0d3f337ff5ff..8b450338075e 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile | |||
@@ -1,5 +1,7 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | 1 | drbd-y := drbd_bitmap.o drbd_proc.o |
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | 2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o |
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | 3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o |
4 | drbd-y += drbd_interval.o drbd_state.o | ||
5 | drbd-y += drbd_nla.o | ||
4 | 6 | ||
5 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | 7 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 3fbef018ce55..92510f8ad013 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -24,21 +24,73 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/crc32c.h> | ||
27 | #include <linux/drbd.h> | 28 | #include <linux/drbd.h> |
29 | #include <linux/drbd_limits.h> | ||
30 | #include <linux/dynamic_debug.h> | ||
28 | #include "drbd_int.h" | 31 | #include "drbd_int.h" |
29 | #include "drbd_wrappers.h" | 32 | #include "drbd_wrappers.h" |
30 | 33 | ||
31 | /* We maintain a trivial checksum in our on disk activity log. | 34 | |
32 | * With that we can ensure correct operation even when the storage | 35 | enum al_transaction_types { |
33 | * device might do a partial (last) sector write while losing power. | 36 | AL_TR_UPDATE = 0, |
34 | */ | 37 | AL_TR_INITIALIZED = 0xffff |
35 | struct __packed al_transaction { | 38 | }; |
36 | u32 magic; | 39 | /* all fields on disc in big endian */ |
37 | u32 tr_number; | 40 | struct __packed al_transaction_on_disk { |
38 | struct __packed { | 41 | /* don't we all like magic */ |
39 | u32 pos; | 42 | __be32 magic; |
40 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | 43 | |
41 | u32 xor_sum; | 44 | /* to identify the most recent transaction block |
45 | * in the on disk ring buffer */ | ||
46 | __be32 tr_number; | ||
47 | |||
48 | /* checksum on the full 4k block, with this field set to 0. */ | ||
49 | __be32 crc32c; | ||
50 | |||
51 | /* type of transaction, special transaction types like: | ||
52 | * purge-all, set-all-idle, set-all-active, ... to-be-defined | ||
53 | * see also enum al_transaction_types */ | ||
54 | __be16 transaction_type; | ||
55 | |||
56 | /* we currently allow only a few thousand extents, | ||
57 | * so 16bit will be enough for the slot number. */ | ||
58 | |||
59 | /* how many updates in this transaction */ | ||
60 | __be16 n_updates; | ||
61 | |||
62 | /* maximum slot number, "al-extents" in drbd.conf speak. | ||
63 | * Having this in each transaction should make reconfiguration | ||
64 | * of that parameter easier. */ | ||
65 | __be16 context_size; | ||
66 | |||
67 | /* slot number the context starts with */ | ||
68 | __be16 context_start_slot_nr; | ||
69 | |||
70 | /* Some reserved bytes. Expected usage is a 64bit counter of | ||
71 | * sectors-written since device creation, and other data generation tag | ||
72 | * supporting usage */ | ||
73 | __be32 __reserved[4]; | ||
74 | |||
75 | /* --- 36 bytes used --- */ | ||
76 | |||
77 | /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes | ||
78 | * in one transaction, then use the remaining bytes in the 4k block for | ||
79 | * context information. "Flexible" number of updates per transaction | ||
80 | * does not help, as we have to account for the case when all update | ||
81 | * slots are used anyways, so it would only complicate code without | ||
82 | * additional benefit. | ||
83 | */ | ||
84 | __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; | ||
85 | |||
86 | /* but the extent number is 32bit, which at an extent size of 4 MiB | ||
87 | * allows covering device sizes of up to 2**54 bytes (16 PiB) */ | ||
88 | __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; | ||
89 | |||
90 | /* --- 420 bytes used (36 + 64*6) --- */ | ||
91 | |||
92 | /* 4096 - 420 = 3676 = 919 * 4 */ | ||
93 | __be32 context[AL_CONTEXT_PER_TRANSACTION]; | ||
42 | }; | 94 | }; |
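
The byte counts in the comments above can be checked mechanically: 36 bytes of fixed header, 420 bytes once the 64 update slot/extent pairs are added, and 919 context words filling the rest of the 4 KiB block. A host-side mirror of the layout (assuming AL_UPDATES_PER_TRANSACTION = 64 and AL_CONTEXT_PER_TRANSACTION = 919, as the "36 + 64*6" and "919 * 4" notes imply; gcc/clang packed attribute) is:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define AL_UPDATES_PER_TRANSACTION 64	/* per the "36 + 64*6" note */
#define AL_CONTEXT_PER_TRANSACTION 919	/* (4096 - 420) / 4 */

/* host-side mirror of al_transaction_on_disk; big-endian fields shown
 * simply as fixed-width integers, since only the sizes matter here */
struct __attribute__((packed)) al_tr_mirror {
	uint32_t magic;
	uint32_t tr_number;
	uint32_t crc32c;
	uint16_t transaction_type;
	uint16_t n_updates;
	uint16_t context_size;
	uint16_t context_start_slot_nr;
	uint32_t reserved[4];
	uint16_t update_slot_nr[AL_UPDATES_PER_TRANSACTION];
	uint32_t update_extent_nr[AL_UPDATES_PER_TRANSACTION];
	uint32_t context[AL_CONTEXT_PER_TRANSACTION];
};

int main(void)
{
	printf("header ends at byte %zu\n",
	       offsetof(struct al_tr_mirror, update_slot_nr));	/* 36 */
	printf("context starts at byte %zu\n",
	       offsetof(struct al_tr_mirror, context));		/* 420 */
	printf("total size %zu bytes\n",
	       sizeof(struct al_tr_mirror));			/* 4096 */
	return 0;
}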
43 | 95 | ||
44 | struct update_odbm_work { | 96 | struct update_odbm_work { |
@@ -48,22 +100,11 @@ struct update_odbm_work { | |||
48 | 100 | ||
49 | struct update_al_work { | 101 | struct update_al_work { |
50 | struct drbd_work w; | 102 | struct drbd_work w; |
51 | struct lc_element *al_ext; | ||
52 | struct completion event; | 103 | struct completion event; |
53 | unsigned int enr; | 104 | int err; |
54 | /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ | ||
55 | unsigned int old_enr; | ||
56 | }; | ||
57 | |||
58 | struct drbd_atodb_wait { | ||
59 | atomic_t count; | ||
60 | struct completion io_done; | ||
61 | struct drbd_conf *mdev; | ||
62 | int error; | ||
63 | }; | 105 | }; |
64 | 106 | ||
65 | 107 | static int al_write_transaction(struct drbd_conf *mdev); | |
66 | int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); | ||
67 | 108 | ||
68 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
69 | { | 110 | { |
@@ -82,22 +123,24 @@ void drbd_md_put_buffer(struct drbd_conf *mdev) | |||
82 | wake_up(&mdev->misc_wait); | 123 | wake_up(&mdev->misc_wait); |
83 | } | 124 | } |
84 | 125 | ||
85 | static bool md_io_allowed(struct drbd_conf *mdev) | 126 | void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
86 | { | ||
87 | enum drbd_disk_state ds = mdev->state.disk; | ||
88 | return ds >= D_NEGOTIATING || ds == D_ATTACHING; | ||
89 | } | ||
90 | |||
91 | void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | ||
92 | unsigned int *done) | 127 | unsigned int *done) |
93 | { | 128 | { |
94 | long dt = bdev->dc.disk_timeout * HZ / 10; | 129 | long dt; |
130 | |||
131 | rcu_read_lock(); | ||
132 | dt = rcu_dereference(bdev->disk_conf)->disk_timeout; | ||
133 | rcu_read_unlock(); | ||
134 | dt = dt * HZ / 10; | ||
95 | if (dt == 0) | 135 | if (dt == 0) |
96 | dt = MAX_SCHEDULE_TIMEOUT; | 136 | dt = MAX_SCHEDULE_TIMEOUT; |
97 | 137 | ||
98 | dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); | 138 | dt = wait_event_timeout(mdev->misc_wait, |
99 | if (dt == 0) | 139 | *done || test_bit(FORCE_DETACH, &mdev->flags), dt); |
140 | if (dt == 0) { | ||
100 | dev_err(DEV, "meta-data IO operation timed out\n"); | 141 | dev_err(DEV, "meta-data IO operation timed out\n"); |
142 | drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); | ||
143 | } | ||
101 | } | 144 | } |
102 | 145 | ||
103 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | 146 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, |
@@ -106,7 +149,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
106 | int rw, int size) | 149 | int rw, int size) |
107 | { | 150 | { |
108 | struct bio *bio; | 151 | struct bio *bio; |
109 | int ok; | 152 | int err; |
110 | 153 | ||
111 | mdev->md_io.done = 0; | 154 | mdev->md_io.done = 0; |
112 | mdev->md_io.error = -ENODEV; | 155 | mdev->md_io.error = -ENODEV; |
@@ -118,8 +161,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
118 | bio = bio_alloc_drbd(GFP_NOIO); | 161 | bio = bio_alloc_drbd(GFP_NOIO); |
119 | bio->bi_bdev = bdev->md_bdev; | 162 | bio->bi_bdev = bdev->md_bdev; |
120 | bio->bi_sector = sector; | 163 | bio->bi_sector = sector; |
121 | ok = (bio_add_page(bio, page, size, 0) == size); | 164 | err = -EIO; |
122 | if (!ok) | 165 | if (bio_add_page(bio, page, size, 0) != size) |
123 | goto out; | 166 | goto out; |
124 | bio->bi_private = &mdev->md_io; | 167 | bio->bi_private = &mdev->md_io; |
125 | bio->bi_end_io = drbd_md_io_complete; | 168 | bio->bi_end_io = drbd_md_io_complete; |
@@ -127,7 +170,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
127 | 170 | ||
128 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ |
129 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
130 | ok = 0; | 173 | err = -ENODEV; |
131 | goto out; | 174 | goto out; |
132 | } | 175 | } |
133 | 176 | ||
@@ -137,86 +180,47 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
137 | bio_endio(bio, -EIO); | 180 | bio_endio(bio, -EIO); |
138 | else | 181 | else |
139 | submit_bio(rw, bio); | 182 | submit_bio(rw, bio); |
140 | wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); | 183 | wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); |
141 | ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; | 184 | if (bio_flagged(bio, BIO_UPTODATE)) |
185 | err = mdev->md_io.error; | ||
142 | 186 | ||
143 | out: | 187 | out: |
144 | bio_put(bio); | 188 | bio_put(bio); |
145 | return ok; | 189 | return err; |
146 | } | 190 | } |
147 | 191 | ||
148 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 192 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
149 | sector_t sector, int rw) | 193 | sector_t sector, int rw) |
150 | { | 194 | { |
151 | int logical_block_size, mask, ok; | 195 | int err; |
152 | int offset = 0; | ||
153 | struct page *iop = mdev->md_io_page; | 196 | struct page *iop = mdev->md_io_page; |
154 | 197 | ||
155 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); | 198 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); |
156 | 199 | ||
157 | BUG_ON(!bdev->md_bdev); | 200 | BUG_ON(!bdev->md_bdev); |
158 | 201 | ||
159 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | 202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", |
160 | if (logical_block_size == 0) | 203 | current->comm, current->pid, __func__, |
161 | logical_block_size = MD_SECTOR_SIZE; | 204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
162 | |||
163 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
164 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
165 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
166 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
167 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
168 | offset = sector & mask; | ||
169 | sector = sector & ~mask; | ||
170 | iop = mdev->md_io_tmpp; | ||
171 | |||
172 | if (rw & WRITE) { | ||
173 | /* these are GFP_KERNEL pages, pre-allocated | ||
174 | * on device initialization */ | ||
175 | void *p = page_address(mdev->md_io_page); | ||
176 | void *hp = page_address(mdev->md_io_tmpp); | ||
177 | |||
178 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
179 | READ, logical_block_size); | ||
180 | |||
181 | if (unlikely(!ok)) { | ||
182 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
183 | "READ [logical_block_size!=512]) failed!\n", | ||
184 | (unsigned long long)sector); | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
189 | } | ||
190 | } | ||
191 | 205 | ||
192 | if (sector < drbd_md_first_sector(bdev) || | 206 | if (sector < drbd_md_first_sector(bdev) || |
193 | sector > drbd_md_last_sector(bdev)) | 207 | sector + 7 > drbd_md_last_sector(bdev)) |
194 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | 208 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", |
195 | current->comm, current->pid, __func__, | 209 | current->comm, current->pid, __func__, |
196 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
197 | 211 | ||
198 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | 212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); |
199 | if (unlikely(!ok)) { | 213 | if (err) { |
200 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | 214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
201 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
202 | return 0; | ||
203 | } | ||
204 | |||
205 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
206 | void *p = page_address(mdev->md_io_page); | ||
207 | void *hp = page_address(mdev->md_io_tmpp); | ||
208 | |||
209 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
210 | } | 216 | } |
211 | 217 | return err; | |
212 | return ok; | ||
213 | } | 218 | } |
214 | 219 | ||
215 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) |
216 | { | 221 | { |
217 | struct lc_element *al_ext; | 222 | struct lc_element *al_ext; |
218 | struct lc_element *tmp; | 223 | struct lc_element *tmp; |
219 | unsigned long al_flags = 0; | ||
220 | int wake; | 224 | int wake; |
221 | 225 | ||
222 | spin_lock_irq(&mdev->al_lock); | 226 | spin_lock_irq(&mdev->al_lock); |
@@ -231,76 +235,92 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | |||
231 | return NULL; | 235 | return NULL; |
232 | } | 236 | } |
233 | } | 237 | } |
234 | al_ext = lc_get(mdev->act_log, enr); | 238 | al_ext = lc_get(mdev->act_log, enr); |
235 | al_flags = mdev->act_log->flags; | ||
236 | spin_unlock_irq(&mdev->al_lock); | 239 | spin_unlock_irq(&mdev->al_lock); |
237 | |||
238 | /* | ||
239 | if (!al_ext) { | ||
240 | if (al_flags & LC_STARVING) | ||
241 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
242 | if (al_flags & LC_DIRTY) | ||
243 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
244 | } | ||
245 | */ | ||
246 | |||
247 | return al_ext; | 240 | return al_ext; |
248 | } | 241 | } |
249 | 242 | ||
250 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | 243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) |
251 | { | 244 | { |
252 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 245 | /* for bios crossing activity log extent boundaries, |
253 | struct lc_element *al_ext; | 246 | * we may need to activate two extents in one go */ |
254 | struct update_al_work al_work; | 247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
249 | unsigned enr; | ||
250 | bool locked = false; | ||
255 | 251 | ||
252 | |||
253 | D_ASSERT(first <= last); | ||
256 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
257 | 255 | ||
258 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | 256 | for (enr = first; enr <= last; enr++) |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | ||
258 | |||
259 | /* Serialize multiple transactions. | ||
260 | * This uses test_and_set_bit, memory barrier is implicit. | ||
261 | */ | ||
262 | wait_event(mdev->al_wait, | ||
263 | mdev->act_log->pending_changes == 0 || | ||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | ||
259 | 265 | ||
260 | if (al_ext->lc_number != enr) { | 266 | if (locked) { |
261 | /* drbd_al_write_transaction(mdev,al_ext,enr); | 267 | /* drbd_al_write_transaction(mdev,al_ext,enr); |
262 | * recurses into generic_make_request(), which | 268 | * recurses into generic_make_request(), which |
263 | * disallows recursion, bios being serialized on the | 269 | * disallows recursion, bios being serialized on the |
264 | * current->bio_tail list now. | 270 | * current->bio_tail list now. |
265 | * we have to delegate updates to the activity log | 271 | * we have to delegate updates to the activity log |
266 | * to the worker thread. */ | 272 | * to the worker thread. */ |
267 | init_completion(&al_work.event); | 273 | |
268 | al_work.al_ext = al_ext; | 274 | /* Double check: it may have been committed by someone else, |
269 | al_work.enr = enr; | 275 | * while we have been waiting for the lock. */ |
270 | al_work.old_enr = al_ext->lc_number; | 276 | if (mdev->act_log->pending_changes) { |
271 | al_work.w.cb = w_al_write_transaction; | 277 | bool write_al_updates; |
272 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | 278 | |
273 | wait_for_completion(&al_work.event); | 279 | rcu_read_lock(); |
274 | 280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | |
275 | mdev->al_writ_cnt++; | 281 | rcu_read_unlock(); |
276 | 282 | ||
277 | spin_lock_irq(&mdev->al_lock); | 283 | if (write_al_updates) { |
278 | lc_changed(mdev->act_log, al_ext); | 284 | al_write_transaction(mdev); |
279 | spin_unlock_irq(&mdev->al_lock); | 285 | mdev->al_writ_cnt++; |
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | ||
289 | /* FIXME | ||
290 | if (err) | ||
291 | we need an "lc_cancel" here; | ||
292 | */ | ||
293 | lc_committed(mdev->act_log); | ||
294 | spin_unlock_irq(&mdev->al_lock); | ||
295 | } | ||
296 | lc_unlock(mdev->act_log); | ||
280 | wake_up(&mdev->al_wait); | 297 | wake_up(&mdev->al_wait); |
281 | } | 298 | } |
282 | } | 299 | } |
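
Both drbd_al_begin_io() and drbd_al_complete_io() now map an interval to a range of activity-log extents with sector >> (AL_EXTENT_SHIFT - 9), so a request that crosses an extent boundary touches two extents. Assuming 4 MiB extents (AL_EXTENT_SHIFT = 22, consistent with the "extent size of 4 MiB" note earlier), the arithmetic behaves like this:

#include <stdio.h>

#define AL_EXTENT_SHIFT 22	/* 4 MiB activity-log extents (assumed) */

/* first/last AL extent touched by "size" bytes starting at 512-byte
 * "sector", mirroring the first/last computation in drbd_al_begin_io() */
static void extent_range(unsigned long long sector, unsigned int size,
			 unsigned *first, unsigned *last)
{
	*first = sector >> (AL_EXTENT_SHIFT - 9);
	*last = size == 0 ? *first :
		(sector + (size >> 9) - 1) >> (AL_EXTENT_SHIFT - 9);
}

int main(void)
{
	unsigned first, last;

	/* 8 KiB write entirely inside extent 0 */
	extent_range(0, 8192, &first, &last);
	printf("extents %u..%u\n", first, last);	/* 0..0 */

	/* 8 KiB write straddling the 4 MiB boundary: two extents */
	extent_range(8192 - 8, 8192, &first, &last);	/* 4 MiB = 8192 sectors */
	printf("extents %u..%u\n", first, last);	/* 0..1 */
	return 0;
}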
283 | 300 | ||
284 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | 301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
285 | { | 302 | { |
286 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 303 | /* for bios crossing activity log extent boundaries, |
304 | * we may need to activate two extents in one go */ | ||
305 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
306 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
307 | unsigned enr; | ||
287 | struct lc_element *extent; | 308 | struct lc_element *extent; |
288 | unsigned long flags; | 309 | unsigned long flags; |
289 | 310 | ||
311 | D_ASSERT(first <= last); | ||
290 | spin_lock_irqsave(&mdev->al_lock, flags); | 312 | spin_lock_irqsave(&mdev->al_lock, flags); |
291 | 313 | ||
292 | extent = lc_find(mdev->act_log, enr); | 314 | for (enr = first; enr <= last; enr++) { |
293 | 315 | extent = lc_find(mdev->act_log, enr); | |
294 | if (!extent) { | 316 | if (!extent) { |
295 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 317 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); |
296 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | 318 | continue; |
297 | return; | 319 | } |
320 | lc_put(mdev->act_log, extent); | ||
298 | } | 321 | } |
299 | |||
300 | if (lc_put(mdev->act_log, extent) == 0) | ||
301 | wake_up(&mdev->al_wait); | ||
302 | |||
303 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 322 | spin_unlock_irqrestore(&mdev->al_lock, flags); |
323 | wake_up(&mdev->al_wait); | ||
304 | } | 324 | } |
305 | 325 | ||
306 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) | 326 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) |
@@ -326,296 +346,148 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
326 | return rs_enr >> | 346 | return rs_enr >> |
327 | /* bit to page */ | 347 | /* bit to page */ |
328 | ((PAGE_SHIFT + 3) - | 348 | ((PAGE_SHIFT + 3) - |
329 | /* al extent number to bit */ | 349 | /* resync extent number to bit */ |
330 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
331 | } | 351 | } |
332 | 352 | ||
333 | int | 353 | static int |
334 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 354 | _al_write_transaction(struct drbd_conf *mdev) |
335 | { | 355 | { |
336 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | 356 | struct al_transaction_on_disk *buffer; |
337 | struct lc_element *updated = aw->al_ext; | 357 | struct lc_element *e; |
338 | const unsigned int new_enr = aw->enr; | ||
339 | const unsigned int evicted = aw->old_enr; | ||
340 | struct al_transaction *buffer; | ||
341 | sector_t sector; | 358 | sector_t sector; |
342 | int i, n, mx; | 359 | int i, mx; |
343 | unsigned int extent_nr; | 360 | unsigned extent_nr; |
344 | u32 xor_sum = 0; | 361 | unsigned crc = 0; |
362 | int err = 0; | ||
345 | 363 | ||
346 | if (!get_ldev(mdev)) { | 364 | if (!get_ldev(mdev)) { |
347 | dev_err(DEV, | 365 | dev_err(DEV, "disk is %s, cannot start al transaction\n", |
348 | "disk is %s, cannot start al transaction (-%d +%d)\n", | 366 | drbd_disk_str(mdev->state.disk)); |
349 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 367 | return -EIO; |
350 | complete(&((struct update_al_work *)w)->event); | ||
351 | return 1; | ||
352 | } | 368 | } |
353 | /* do we have to do a bitmap write, first? | ||
354 | * TODO reduce maximum latency: | ||
355 | * submit both bios, then wait for both, | ||
356 | * instead of doing two synchronous sector writes. | ||
357 | * For now, we must not write the transaction, | ||
358 | * if we cannot write out the bitmap of the evicted extent. */ | ||
359 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
360 | drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted)); | ||
361 | 369 | ||
362 | /* The bitmap write may have failed, causing a state change. */ | 370 | /* The bitmap write may have failed, causing a state change. */ |
363 | if (mdev->state.disk < D_INCONSISTENT) { | 371 | if (mdev->state.disk < D_INCONSISTENT) { |
364 | dev_err(DEV, | 372 | dev_err(DEV, |
365 | "disk is %s, cannot write al transaction (-%d +%d)\n", | 373 | "disk is %s, cannot write al transaction\n", |
366 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 374 | drbd_disk_str(mdev->state.disk)); |
367 | complete(&((struct update_al_work *)w)->event); | ||
368 | put_ldev(mdev); | 375 | put_ldev(mdev); |
369 | return 1; | 376 | return -EIO; |
370 | } | 377 | } |
371 | 378 | ||
372 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ | 379 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ |
373 | if (!buffer) { | 380 | if (!buffer) { |
374 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); | 381 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); |
375 | complete(&((struct update_al_work *)w)->event); | ||
376 | put_ldev(mdev); | 382 | put_ldev(mdev); |
377 | return 1; | 383 | return -ENODEV; |
378 | } | 384 | } |
379 | 385 | ||
380 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | 386 | memset(buffer, 0, sizeof(*buffer)); |
387 | buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); | ||
381 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | 388 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); |
382 | 389 | ||
383 | n = lc_index_of(mdev->act_log, updated); | 390 | i = 0; |
391 | |||
392 | /* Even though no one can start to change this list | ||
393 | * once we set the LC_LOCKED -- from drbd_al_begin_io(), | ||
394 | * lc_try_lock_for_transaction() --, someone may still | ||
395 | * be in the process of changing it. */ | ||
396 | spin_lock_irq(&mdev->al_lock); | ||
397 | list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { | ||
398 | if (i == AL_UPDATES_PER_TRANSACTION) { | ||
399 | i++; | ||
400 | break; | ||
401 | } | ||
402 | buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); | ||
403 | buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); | ||
404 | if (e->lc_number != LC_FREE) | ||
405 | drbd_bm_mark_for_writeout(mdev, | ||
406 | al_extent_to_bm_page(e->lc_number)); | ||
407 | i++; | ||
408 | } | ||
409 | spin_unlock_irq(&mdev->al_lock); | ||
410 | BUG_ON(i > AL_UPDATES_PER_TRANSACTION); | ||
384 | 411 | ||
385 | buffer->updates[0].pos = cpu_to_be32(n); | 412 | buffer->n_updates = cpu_to_be16(i); |
386 | buffer->updates[0].extent = cpu_to_be32(new_enr); | 413 | for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { |
414 | buffer->update_slot_nr[i] = cpu_to_be16(-1); | ||
415 | buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); | ||
416 | } | ||
387 | 417 | ||
388 | xor_sum ^= new_enr; | 418 | buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); |
419 | buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); | ||
389 | 420 | ||
390 | mx = min_t(int, AL_EXTENTS_PT, | 421 | mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, |
391 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | 422 | mdev->act_log->nr_elements - mdev->al_tr_cycle); |
392 | for (i = 0; i < mx; i++) { | 423 | for (i = 0; i < mx; i++) { |
393 | unsigned idx = mdev->al_tr_cycle + i; | 424 | unsigned idx = mdev->al_tr_cycle + i; |
394 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | 425 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; |
395 | buffer->updates[i+1].pos = cpu_to_be32(idx); | 426 | buffer->context[i] = cpu_to_be32(extent_nr); |
396 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
397 | xor_sum ^= extent_nr; | ||
398 | } | ||
399 | for (; i < AL_EXTENTS_PT; i++) { | ||
400 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
401 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
402 | xor_sum ^= LC_FREE; | ||
403 | } | 427 | } |
404 | mdev->al_tr_cycle += AL_EXTENTS_PT; | 428 | for (; i < AL_CONTEXT_PER_TRANSACTION; i++) |
429 | buffer->context[i] = cpu_to_be32(LC_FREE); | ||
430 | |||
431 | mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; | ||
405 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
406 | mdev->al_tr_cycle = 0; | 433 | mdev->al_tr_cycle = 0; |
407 | 434 | ||
408 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
409 | |||
410 | sector = mdev->ldev->md.md_offset | 435 | sector = mdev->ldev->md.md_offset |
411 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | 436 | + mdev->ldev->md.al_offset |
412 | 437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | |
413 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | ||
414 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
415 | 438 | ||
416 | if (++mdev->al_tr_pos > | 439 | crc = crc32c(0, buffer, 4096); |
417 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | 440 | buffer->crc32c = cpu_to_be32(crc); |
418 | mdev->al_tr_pos = 0; | ||
419 | 441 | ||
420 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | 442 | if (drbd_bm_write_hinted(mdev)) |
421 | mdev->al_tr_number++; | 443 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | ||
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
446 | err = -EIO; | ||
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
448 | } else { | ||
449 | /* advance ringbuffer position and transaction counter */ | ||
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | ||
451 | mdev->al_tr_number++; | ||
452 | } | ||
422 | 453 | ||
423 | drbd_md_put_buffer(mdev); | 454 | drbd_md_put_buffer(mdev); |
424 | |||
425 | complete(&((struct update_al_work *)w)->event); | ||
426 | put_ldev(mdev); | 455 | put_ldev(mdev); |
427 | 456 | ||
428 | return 1; | 457 | return err; |
429 | } | 458 | } |
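
_al_write_transaction() above fills the 4 KiB block with its crc32c field still zero (the whole buffer is memset first), computes crc32c(0, buffer, 4096), and only then stores the result in that field; a reader can verify by re-zeroing the field and recomputing. A self-contained sketch of that zero-the-field-then-checksum pattern (bitwise CRC32C with the Castagnoli polynomial; the exact seeding/inversion conventions of the kernel helper are glossed over, and the field offset of 8 follows the on-disk layout above) is:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bitwise reflected CRC32C update; illustrative, not byte-for-byte the
 * kernel's crc32c() -- the layout trick is what matters here */
static uint32_t crc32c_update(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

#define CRC_OFFSET 8	/* crc32c sits after magic and tr_number (4 + 4) */

int main(void)
{
	uint8_t block[4096];
	uint32_t crc, stored, check;

	/* writer: fill the block, leave the crc field zero, checksum the
	 * whole 4 KiB, then store the result in that field */
	memset(block, 0xab, sizeof(block));
	memset(block + CRC_OFFSET, 0, 4);
	crc = crc32c_update(0, block, sizeof(block));
	memcpy(block + CRC_OFFSET, &crc, 4);

	/* reader: extract the stored value, re-zero the field, recompute */
	memcpy(&stored, block + CRC_OFFSET, 4);
	memset(block + CRC_OFFSET, 0, 4);
	check = crc32c_update(0, block, sizeof(block));
	printf("crc %s\n", stored == check ? "matches" : "MISMATCH");
	return 0;
}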
430 | 459 | ||
431 | /** | ||
432 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
433 | * @mdev: DRBD device. | ||
434 | * @bdev: Block device to read form. | ||
435 | * @b: pointer to an al_transaction. | ||
436 | * @index: On disk slot of the transaction to read. | ||
437 | * | ||
438 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
439 | */ | ||
440 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
441 | struct drbd_backing_dev *bdev, | ||
442 | struct al_transaction *b, | ||
443 | int index) | ||
444 | { | ||
445 | sector_t sector; | ||
446 | int rv, i; | ||
447 | u32 xor_sum = 0; | ||
448 | |||
449 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
450 | |||
451 | /* Dont process error normally, | ||
452 | * as this is done before disk is attached! */ | ||
453 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
454 | return -1; | ||
455 | |||
456 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | ||
457 | |||
458 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
459 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
460 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
461 | 460 | ||
462 | return rv; | 461 | static int w_al_write_transaction(struct drbd_work *w, int unused) |
463 | } | ||
464 | |||
465 | /** | ||
466 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
467 | * @mdev: DRBD device. | ||
468 | * @bdev: Block device to read from. | ||
469 | * | ||
470 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
471 | */ | ||
472 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
473 | { | 462 | { |
474 | struct al_transaction *buffer; | 463 | struct update_al_work *aw = container_of(w, struct update_al_work, w); |
475 | int i; | 464 | struct drbd_conf *mdev = w->mdev; |
476 | int rv; | 465 | int err; |
477 | int mx; | ||
478 | int active_extents = 0; | ||
479 | int transactions = 0; | ||
480 | int found_valid = 0; | ||
481 | int from = 0; | ||
482 | int to = 0; | ||
483 | u32 from_tnr = 0; | ||
484 | u32 to_tnr = 0; | ||
485 | u32 cnr; | ||
486 | |||
487 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
488 | |||
489 | /* lock out all other meta data io for now, | ||
490 | * and make sure the page is mapped. | ||
491 | */ | ||
492 | buffer = drbd_md_get_buffer(mdev); | ||
493 | if (!buffer) | ||
494 | return 0; | ||
495 | |||
496 | /* Find the valid transaction in the log */ | ||
497 | for (i = 0; i <= mx; i++) { | ||
498 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
499 | if (rv == 0) | ||
500 | continue; | ||
501 | if (rv == -1) { | ||
502 | drbd_md_put_buffer(mdev); | ||
503 | return 0; | ||
504 | } | ||
505 | cnr = be32_to_cpu(buffer->tr_number); | ||
506 | |||
507 | if (++found_valid == 1) { | ||
508 | from = i; | ||
509 | to = i; | ||
510 | from_tnr = cnr; | ||
511 | to_tnr = cnr; | ||
512 | continue; | ||
513 | } | ||
514 | if ((int)cnr - (int)from_tnr < 0) { | ||
515 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
516 | from = i; | ||
517 | from_tnr = cnr; | ||
518 | } | ||
519 | if ((int)cnr - (int)to_tnr > 0) { | ||
520 | D_ASSERT(cnr - to_tnr == i - to); | ||
521 | to = i; | ||
522 | to_tnr = cnr; | ||
523 | } | ||
524 | } | ||
525 | |||
526 | if (!found_valid) { | ||
527 | dev_warn(DEV, "No usable activity log found.\n"); | ||
528 | drbd_md_put_buffer(mdev); | ||
529 | return 1; | ||
530 | } | ||
531 | |||
532 | /* Read the valid transactions. | ||
533 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
534 | i = from; | ||
535 | while (1) { | ||
536 | int j, pos; | ||
537 | unsigned int extent_nr; | ||
538 | unsigned int trn; | ||
539 | |||
540 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
541 | ERR_IF(rv == 0) goto cancel; | ||
542 | if (rv == -1) { | ||
543 | drbd_md_put_buffer(mdev); | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | trn = be32_to_cpu(buffer->tr_number); | ||
548 | |||
549 | spin_lock_irq(&mdev->al_lock); | ||
550 | |||
551 | /* This loop runs backwards because in the cyclic | ||
552 | elements there might be an old version of the | ||
553 | updated element (in slot 0). So the element in slot 0 | ||
554 | can overwrite old versions. */ | ||
555 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
556 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
557 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
558 | |||
559 | if (extent_nr == LC_FREE) | ||
560 | continue; | ||
561 | |||
562 | lc_set(mdev->act_log, extent_nr, pos); | ||
563 | active_extents++; | ||
564 | } | ||
565 | spin_unlock_irq(&mdev->al_lock); | ||
566 | |||
567 | transactions++; | ||
568 | |||
569 | cancel: | ||
570 | if (i == to) | ||
571 | break; | ||
572 | i++; | ||
573 | if (i > mx) | ||
574 | i = 0; | ||
575 | } | ||
576 | |||
577 | mdev->al_tr_number = to_tnr+1; | ||
578 | mdev->al_tr_pos = to; | ||
579 | if (++mdev->al_tr_pos > | ||
580 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
581 | mdev->al_tr_pos = 0; | ||
582 | |||
583 | /* ok, we are done with it */ | ||
584 | drbd_md_put_buffer(mdev); | ||
585 | 466 | ||
586 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | 467 | err = _al_write_transaction(mdev); |
587 | transactions, active_extents); | 468 | aw->err = err; |
469 | complete(&aw->event); | ||
588 | 470 | ||
589 | return 1; | 471 | return err != -EIO ? err : 0; |
590 | } | 472 | } |
591 | 473 | ||
592 | /** | 474 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
593 | * drbd_al_apply_to_bm() - Sets the bitmap to dirty(1) where covered by active AL extents | 475 | transaction directly. Others came through generic_make_request(),
594 | * @mdev: DRBD device. | 476 | those need to delegate it to the worker. */ |
595 | */ | 477 | static int al_write_transaction(struct drbd_conf *mdev) |
596 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
597 | { | 478 | { |
598 | unsigned int enr; | 479 | struct update_al_work al_work; |
599 | unsigned long add = 0; | ||
600 | char ppb[10]; | ||
601 | int i, tmp; | ||
602 | |||
603 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
604 | 480 | ||
605 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 481 | if (current == mdev->tconn->worker.task) |
606 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | 482 | return _al_write_transaction(mdev); |
607 | if (enr == LC_FREE) | ||
608 | continue; | ||
609 | tmp = drbd_bm_ALe_set_all(mdev, enr); | ||
610 | dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr); | ||
611 | add += tmp; | ||
612 | } | ||
613 | 483 | ||
614 | lc_unlock(mdev->act_log); | 484 | init_completion(&al_work.event); |
615 | wake_up(&mdev->al_wait); | 485 | al_work.w.cb = w_al_write_transaction; |
486 | al_work.w.mdev = mdev; | ||
487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
488 | wait_for_completion(&al_work.event); | ||
616 | 489 | ||
617 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | 490 | return al_work.err; |
618 | ppsize(ppb, Bit2KB(add))); | ||
619 | } | 491 | } |
620 | 492 | ||
621 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |
@@ -645,7 +517,7 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
645 | struct lc_element *al_ext; | 517 | struct lc_element *al_ext; |
646 | int i; | 518 | int i; |
647 | 519 | ||
648 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | 520 | D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); |
649 | 521 | ||
650 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 522 | for (i = 0; i < mdev->act_log->nr_elements; i++) { |
651 | al_ext = lc_element_by_index(mdev->act_log, i); | 523 | al_ext = lc_element_by_index(mdev->act_log, i); |
@@ -657,15 +529,17 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
657 | wake_up(&mdev->al_wait); | 529 | wake_up(&mdev->al_wait); |
658 | } | 530 | } |
659 | 531 | ||
660 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 532 | static int w_update_odbm(struct drbd_work *w, int unused) |
661 | { | 533 | { |
662 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | 534 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); |
535 | struct drbd_conf *mdev = w->mdev; | ||
536 | struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; | ||
663 | 537 | ||
664 | if (!get_ldev(mdev)) { | 538 | if (!get_ldev(mdev)) { |
665 | if (__ratelimit(&drbd_ratelimit_state)) | 539 | if (__ratelimit(&drbd_ratelimit_state)) |
666 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | 540 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); |
667 | kfree(udw); | 541 | kfree(udw); |
668 | return 1; | 542 | return 0; |
669 | } | 543 | } |
670 | 544 | ||
671 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); | 545 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); |
@@ -683,9 +557,9 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused | |||
683 | break; | 557 | break; |
684 | } | 558 | } |
685 | } | 559 | } |
686 | drbd_bcast_sync_progress(mdev); | 560 | drbd_bcast_event(mdev, &sib); |
687 | 561 | ||
688 | return 1; | 562 | return 0; |
689 | } | 563 | } |
690 | 564 | ||
691 | 565 | ||
@@ -755,7 +629,9 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
755 | } | 629 | } |
756 | ext->rs_left = rs_left; | 630 | ext->rs_left = rs_left; |
757 | ext->rs_failed = success ? 0 : count; | 631 | ext->rs_failed = success ? 0 : count; |
758 | lc_changed(mdev->resync, &ext->lce); | 632 | /* we don't keep a persistent log of the resync lru, |
633 | * we can commit any change right away. */ | ||
634 | lc_committed(mdev->resync); | ||
759 | } | 635 | } |
760 | lc_put(mdev->resync, &ext->lce); | 636 | lc_put(mdev->resync, &ext->lce); |
761 | /* no race, we are within the al_lock! */ | 637 | /* no race, we are within the al_lock! */ |
@@ -767,7 +643,8 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
767 | if (udw) { | 643 | if (udw) { |
768 | udw->enr = ext->lce.lc_number; | 644 | udw->enr = ext->lce.lc_number; |
769 | udw->w.cb = w_update_odbm; | 645 | udw->w.cb = w_update_odbm; |
770 | drbd_queue_work_front(&mdev->data.work, &udw->w); | 646 | udw->w.mdev = mdev; |
647 | drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w); | ||
771 | } else { | 648 | } else { |
772 | dev_warn(DEV, "Could not kmalloc an udw\n"); | 649 | dev_warn(DEV, "Could not kmalloc an udw\n"); |
773 | } | 650 | } |
@@ -813,16 +690,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
813 | int wake_up = 0; | 690 | int wake_up = 0; |
814 | unsigned long flags; | 691 | unsigned long flags; |
815 | 692 | ||
816 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 693 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
817 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | 694 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", |
818 | (unsigned long long)sector, size); | 695 | (unsigned long long)sector, size); |
819 | return; | 696 | return; |
820 | } | 697 | } |
698 | |||
699 | if (!get_ldev(mdev)) | ||
700 | return; /* no disk, no metadata, no bitmap to clear bits in */ | ||
701 | |||
821 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 702 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
822 | esector = sector + (size >> 9) - 1; | 703 | esector = sector + (size >> 9) - 1; |
823 | 704 | ||
824 | ERR_IF(sector >= nr_sectors) return; | 705 | if (!expect(sector < nr_sectors)) |
825 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 706 | goto out; |
707 | if (!expect(esector < nr_sectors)) | ||
708 | esector = nr_sectors - 1; | ||
826 | 709 | ||
827 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 710 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
828 | 711 | ||
@@ -830,7 +713,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
830 | * round up start sector, round down end sector. we make sure we only | 713 | * round up start sector, round down end sector. we make sure we only |
831 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | 714 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ |
832 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | 715 | if (unlikely(esector < BM_SECT_PER_BIT-1)) |
833 | return; | 716 | goto out; |
834 | if (unlikely(esector == (nr_sectors-1))) | 717 | if (unlikely(esector == (nr_sectors-1))) |
835 | ebnr = lbnr; | 718 | ebnr = lbnr; |
836 | else | 719 | else |
@@ -838,14 +721,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
838 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | 721 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); |
839 | 722 | ||
840 | if (sbnr > ebnr) | 723 | if (sbnr > ebnr) |
841 | return; | 724 | goto out; |
842 | 725 | ||
843 | /* | 726 | /* |
844 | * ok, (capacity & 7) != 0 sometimes, but who cares... | 727 | * ok, (capacity & 7) != 0 sometimes, but who cares... |
845 | * we count rs_{total,left} in bits, not sectors. | 728 | * we count rs_{total,left} in bits, not sectors. |
846 | */ | 729 | */ |
847 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | 730 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); |
848 | if (count && get_ldev(mdev)) { | 731 | if (count) { |
849 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); | 732 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); |
850 | spin_lock_irqsave(&mdev->al_lock, flags); | 733 | spin_lock_irqsave(&mdev->al_lock, flags); |
851 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); | 734 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); |
@@ -854,8 +737,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
854 | /* just wake_up unconditional now, various lc_chaged(), | 737 | /* just wake_up unconditional now, various lc_chaged(), |
855 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | 738 | * lc_put() in drbd_try_clear_on_disk_bm(). */ |
856 | wake_up = 1; | 739 | wake_up = 1; |
857 | put_ldev(mdev); | ||
858 | } | 740 | } |
741 | out: | ||
742 | put_ldev(mdev); | ||
859 | if (wake_up) | 743 | if (wake_up) |
860 | wake_up(&mdev->al_wait); | 744 | wake_up(&mdev->al_wait); |
861 | } | 745 | } |
@@ -871,7 +755,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
871 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | 755 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, |
872 | const char *file, const unsigned int line) | 756 | const char *file, const unsigned int line) |
873 | { | 757 | { |
874 | unsigned long sbnr, ebnr, lbnr, flags; | 758 | unsigned long sbnr, ebnr, flags; |
875 | sector_t esector, nr_sectors; | 759 | sector_t esector, nr_sectors; |
876 | unsigned int enr, count = 0; | 760 | unsigned int enr, count = 0; |
877 | struct lc_element *e; | 761 | struct lc_element *e; |
@@ -880,7 +764,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
880 | if (size == 0) | 764 | if (size == 0) |
881 | return 0; | 765 | return 0; |
882 | 766 | ||
883 | if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 767 | if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
884 | dev_err(DEV, "sector: %llus, size: %d\n", | 768 | dev_err(DEV, "sector: %llus, size: %d\n", |
885 | (unsigned long long)sector, size); | 769 | (unsigned long long)sector, size); |
886 | return 0; | 770 | return 0; |
@@ -892,12 +776,10 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
892 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 776 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
893 | esector = sector + (size >> 9) - 1; | 777 | esector = sector + (size >> 9) - 1; |
894 | 778 | ||
895 | ERR_IF(sector >= nr_sectors) | 779 | if (!expect(sector < nr_sectors)) |
896 | goto out; | 780 | goto out; |
897 | ERR_IF(esector >= nr_sectors) | 781 | if (!expect(esector < nr_sectors)) |
898 | esector = (nr_sectors-1); | 782 | esector = nr_sectors - 1; |
899 | |||
900 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
901 | 783 | ||
902 | /* we set it out of sync, | 784 | /* we set it out of sync, |
903 | * we do not need to round anything here */ | 785 | * we do not need to round anything here */ |
@@ -940,7 +822,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
940 | if (bm_ext->lce.lc_number != enr) { | 822 | if (bm_ext->lce.lc_number != enr) { |
941 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 823 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
942 | bm_ext->rs_failed = 0; | 824 | bm_ext->rs_failed = 0; |
943 | lc_changed(mdev->resync, &bm_ext->lce); | 825 | lc_committed(mdev->resync); |
944 | wakeup = 1; | 826 | wakeup = 1; |
945 | } | 827 | } |
946 | if (bm_ext->lce.refcnt == 1) | 828 | if (bm_ext->lce.refcnt == 1) |
@@ -956,7 +838,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
956 | if (rs_flags & LC_STARVING) | 838 | if (rs_flags & LC_STARVING) |
957 | dev_warn(DEV, "Have to wait for element" | 839 | dev_warn(DEV, "Have to wait for element" |
958 | " (resync LRU too small?)\n"); | 840 | " (resync LRU too small?)\n"); |
959 | BUG_ON(rs_flags & LC_DIRTY); | 841 | BUG_ON(rs_flags & LC_LOCKED); |
960 | } | 842 | } |
961 | 843 | ||
962 | return bm_ext; | 844 | return bm_ext; |
@@ -964,26 +846,12 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
964 | 846 | ||
965 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | 847 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) |
966 | { | 848 | { |
967 | struct lc_element *al_ext; | 849 | int rv; |
968 | int rv = 0; | ||
969 | 850 | ||
970 | spin_lock_irq(&mdev->al_lock); | 851 | spin_lock_irq(&mdev->al_lock); |
971 | if (unlikely(enr == mdev->act_log->new_number)) | 852 | rv = lc_is_used(mdev->act_log, enr); |
972 | rv = 1; | ||
973 | else { | ||
974 | al_ext = lc_find(mdev->act_log, enr); | ||
975 | if (al_ext) { | ||
976 | if (al_ext->refcnt) | ||
977 | rv = 1; | ||
978 | } | ||
979 | } | ||
980 | spin_unlock_irq(&mdev->al_lock); | 853 | spin_unlock_irq(&mdev->al_lock); |
981 | 854 | ||
982 | /* | ||
983 | if (unlikely(rv)) { | ||
984 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
985 | } | ||
986 | */ | ||
987 | return rv; | 855 | return rv; |
988 | } | 856 | } |
989 | 857 | ||
@@ -1113,13 +981,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1113 | if (rs_flags & LC_STARVING) | 981 | if (rs_flags & LC_STARVING) |
1114 | dev_warn(DEV, "Have to wait for element" | 982 | dev_warn(DEV, "Have to wait for element" |
1115 | " (resync LRU too small?)\n"); | 983 | " (resync LRU too small?)\n"); |
1116 | BUG_ON(rs_flags & LC_DIRTY); | 984 | BUG_ON(rs_flags & LC_LOCKED); |
1117 | goto try_again; | 985 | goto try_again; |
1118 | } | 986 | } |
1119 | if (bm_ext->lce.lc_number != enr) { | 987 | if (bm_ext->lce.lc_number != enr) { |
1120 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 988 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
1121 | bm_ext->rs_failed = 0; | 989 | bm_ext->rs_failed = 0; |
1122 | lc_changed(mdev->resync, &bm_ext->lce); | 990 | lc_committed(mdev->resync); |
1123 | wake_up(&mdev->al_wait); | 991 | wake_up(&mdev->al_wait); |
1124 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); | 992 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); |
1125 | } | 993 | } |
@@ -1130,8 +998,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1130 | } | 998 | } |
1131 | check_al: | 999 | check_al: |
1132 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | 1000 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { |
1133 | if (unlikely(al_enr+i == mdev->act_log->new_number)) | ||
1134 | goto try_again; | ||
1135 | if (lc_is_used(mdev->act_log, al_enr+i)) | 1001 | if (lc_is_used(mdev->act_log, al_enr+i)) |
1136 | goto try_again; | 1002 | goto try_again; |
1137 | } | 1003 | } |
@@ -1266,7 +1132,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1266 | sector_t esector, nr_sectors; | 1132 | sector_t esector, nr_sectors; |
1267 | int wake_up = 0; | 1133 | int wake_up = 0; |
1268 | 1134 | ||
1269 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 1135 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1270 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | 1136 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", |
1271 | (unsigned long long)sector, size); | 1137 | (unsigned long long)sector, size); |
1272 | return; | 1138 | return; |
@@ -1274,8 +1140,10 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1274 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 1140 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
1275 | esector = sector + (size >> 9) - 1; | 1141 | esector = sector + (size >> 9) - 1; |
1276 | 1142 | ||
1277 | ERR_IF(sector >= nr_sectors) return; | 1143 | if (!expect(sector < nr_sectors)) |
1278 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 1144 | return; |
1145 | if (!expect(esector < nr_sectors)) | ||
1146 | esector = nr_sectors - 1; | ||
1279 | 1147 | ||
1280 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 1148 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
1281 | 1149 | ||
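The drbd_actlog.c hunks above replace the old xor_sum-protected activity-log transaction with a single 4K block protected by crc32c, written into a small ring of slots (one slot per MD_BLOCK_SIZE block within the MD_AL_SECTORS area), and they delegate the actual write to the worker thread unless the caller already is the worker. The standalone sketch below is purely illustrative and not part of the patch: it only mimics the slot/checksum bookkeeping, with made-up constants and a toy checksum standing in for the kernel's MD_AL_SECTORS, MD_BLOCK_SIZE and crc32c().

/* Illustrative sketch only -- not from the patch.  Shows the ring-buffer
 * slot arithmetic used by _al_write_transaction() above. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TOY_MD_AL_SECTORS   64      /* assumed AL area size, in 512-byte sectors */
#define TOY_MD_BLOCK_SIZE   4096    /* one AL transaction per 4K meta-data block */

/* stand-in for crc32c(0, buffer, 4096) */
static uint32_t toy_crc(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0;
	while (len--)
		crc = (crc << 5) + crc + *buf++;
	return crc;
}

int main(void)
{
	const unsigned slots = TOY_MD_AL_SECTORS * 512 / TOY_MD_BLOCK_SIZE;
	unsigned al_tr_pos = 0;       /* next slot in the on-disk ring */
	uint32_t al_tr_number = 0;    /* monotonically increasing transaction id */
	uint8_t buffer[TOY_MD_BLOCK_SIZE];
	int i;

	for (i = 0; i < 10; i++) {
		memset(buffer, 0, sizeof(buffer));
		memcpy(buffer, &al_tr_number, sizeof(al_tr_number));

		printf("transaction %u -> slot %u (offset %u sectors), crc 0x%08x\n",
		       (unsigned)al_tr_number, al_tr_pos,
		       al_tr_pos * (TOY_MD_BLOCK_SIZE >> 9),
		       (unsigned)toy_crc(buffer, sizeof(buffer)));

		/* advance ringbuffer position and transaction counter,
		 * wrapping at the slot count like the patched code */
		al_tr_pos = (al_tr_pos + 1) % slots;
		al_tr_number++;
	}
	return 0;
}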
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index d84566496746..8dc29502dc08 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | |||
119 | if (!__ratelimit(&drbd_ratelimit_state)) | 119 | if (!__ratelimit(&drbd_ratelimit_state)) |
120 | return; | 120 | return; |
121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | 121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", |
122 | current == mdev->receiver.task ? "receiver" : | 122 | drbd_task_to_thread_name(mdev->tconn, current), |
123 | current == mdev->asender.task ? "asender" : | 123 | func, b->bm_why ?: "?", |
124 | current == mdev->worker.task ? "worker" : current->comm, | 124 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
125 | func, b->bm_why ?: "?", | ||
126 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
127 | b->bm_task == mdev->asender.task ? "asender" : | ||
128 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
129 | } | 125 | } |
130 | 126 | ||
131 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | 127 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) |
@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | |||
142 | 138 | ||
143 | if (trylock_failed) { | 139 | if (trylock_failed) { |
144 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | 140 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", |
145 | current == mdev->receiver.task ? "receiver" : | 141 | drbd_task_to_thread_name(mdev->tconn, current), |
146 | current == mdev->asender.task ? "asender" : | 142 | why, b->bm_why ?: "?", |
147 | current == mdev->worker.task ? "worker" : current->comm, | 143 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
148 | why, b->bm_why ?: "?", | ||
149 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
150 | b->bm_task == mdev->asender.task ? "asender" : | ||
151 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
152 | mutex_lock(&b->bm_change); | 144 | mutex_lock(&b->bm_change); |
153 | } | 145 | } |
154 | if (BM_LOCKED_MASK & b->bm_flags) | 146 | if (BM_LOCKED_MASK & b->bm_flags) |
@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev) | |||
196 | /* to mark for lazy writeout once syncer cleared all clearable bits, | 188 | /* to mark for lazy writeout once syncer cleared all clearable bits, |
197 | * we check if bits have been cleared since last IO. */ | 189 | * we check if bits have been cleared since last IO. */ |
198 | #define BM_PAGE_LAZY_WRITEOUT 28 | 190 | #define BM_PAGE_LAZY_WRITEOUT 28 |
191 | /* pages marked with this "HINT" will be considered for writeout | ||
192 | * on activity log transactions */ | ||
193 | #define BM_PAGE_HINT_WRITEOUT 27 | ||
199 | 194 | ||
200 | /* store_page_idx uses non-atomic assignment. It is only used directly after | 195 | /* store_page_idx uses non-atomic assignment. It is only used directly after |
201 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to | 196 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to |
@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr) | |||
227 | { | 222 | { |
228 | struct drbd_bitmap *b = mdev->bitmap; | 223 | struct drbd_bitmap *b = mdev->bitmap; |
229 | void *addr = &page_private(b->bm_pages[page_nr]); | 224 | void *addr = &page_private(b->bm_pages[page_nr]); |
230 | clear_bit(BM_PAGE_IO_LOCK, addr); | 225 | clear_bit_unlock(BM_PAGE_IO_LOCK, addr); |
231 | smp_mb__after_clear_bit(); | ||
232 | wake_up(&mdev->bitmap->bm_io_wait); | 226 | wake_up(&mdev->bitmap->bm_io_wait); |
233 | } | 227 | } |
234 | 228 | ||
@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page) | |||
246 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); | 240 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); |
247 | } | 241 | } |
248 | 242 | ||
243 | /** | ||
244 | * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout | ||
245 | * @mdev: DRBD device. | ||
246 | * @page_nr: the bitmap page to mark with the "hint" flag | ||
247 | * | ||
248 | * From within an activity log transaction, we mark a few pages with these | ||
249 | * hints, then call drbd_bm_write_hinted(), which will only write out changed | ||
250 | * pages which are flagged with this mark. | ||
251 | */ | ||
252 | void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr) | ||
253 | { | ||
254 | struct page *page; | ||
255 | if (page_nr >= mdev->bitmap->bm_number_of_pages) { | ||
256 | dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n", | ||
257 | page_nr, (int)mdev->bitmap->bm_number_of_pages); | ||
258 | return; | ||
259 | } | ||
260 | page = mdev->bitmap->bm_pages[page_nr]; | ||
261 | set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); | ||
262 | } | ||
263 | |||
249 | static int bm_test_page_unchanged(struct page *page) | 264 | static int bm_test_page_unchanged(struct page *page) |
250 | { | 265 | { |
251 | volatile const unsigned long *addr = &page_private(page); | 266 | volatile const unsigned long *addr = &page_private(page); |
@@ -373,14 +388,16 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | |||
373 | return old_pages; | 388 | return old_pages; |
374 | 389 | ||
375 | /* Trying kmalloc first, falling back to vmalloc. | 390 | /* Trying kmalloc first, falling back to vmalloc. |
376 | * GFP_KERNEL is ok, as this is done when a lower level disk is | 391 | * GFP_NOIO, as this is called while drbd IO is "suspended", |
377 | * "attached" to the drbd. Context is receiver thread or cqueue | 392 | * and during resize or attach on diskless Primary, |
378 | * thread. As we have no disk yet, we are not in the IO path, | 393 | * we must not block on IO to ourselves. |
379 | * not even the IO path of the peer. */ | 394 | * Context is receiver thread or dmsetup. */ |
380 | bytes = sizeof(struct page *)*want; | 395 | bytes = sizeof(struct page *)*want; |
381 | new_pages = kzalloc(bytes, GFP_KERNEL); | 396 | new_pages = kzalloc(bytes, GFP_NOIO); |
382 | if (!new_pages) { | 397 | if (!new_pages) { |
383 | new_pages = vzalloc(bytes); | 398 | new_pages = __vmalloc(bytes, |
399 | GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, | ||
400 | PAGE_KERNEL); | ||
384 | if (!new_pages) | 401 | if (!new_pages) |
385 | return NULL; | 402 | return NULL; |
386 | vmalloced = 1; | 403 | vmalloced = 1; |
@@ -390,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | |||
390 | for (i = 0; i < have; i++) | 407 | for (i = 0; i < have; i++) |
391 | new_pages[i] = old_pages[i]; | 408 | new_pages[i] = old_pages[i]; |
392 | for (; i < want; i++) { | 409 | for (; i < want; i++) { |
393 | page = alloc_page(GFP_HIGHUSER); | 410 | page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); |
394 | if (!page) { | 411 | if (!page) { |
395 | bm_free_pages(new_pages + have, i - have); | 412 | bm_free_pages(new_pages + have, i - have); |
396 | bm_vk_free(new_pages, vmalloced); | 413 | bm_vk_free(new_pages, vmalloced); |
@@ -439,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev) | |||
439 | 456 | ||
440 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | 457 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) |
441 | { | 458 | { |
442 | ERR_IF(!mdev->bitmap) return 0; | 459 | if (!expect(mdev->bitmap)) |
460 | return 0; | ||
443 | return mdev->bitmap->bm_dev_capacity; | 461 | return mdev->bitmap->bm_dev_capacity; |
444 | } | 462 | } |
445 | 463 | ||
@@ -447,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev) | |||
447 | */ | 465 | */ |
448 | void drbd_bm_cleanup(struct drbd_conf *mdev) | 466 | void drbd_bm_cleanup(struct drbd_conf *mdev) |
449 | { | 467 | { |
450 | ERR_IF (!mdev->bitmap) return; | 468 | if (!expect(mdev->bitmap)) |
469 | return; | ||
451 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | 470 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); |
452 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); | 471 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); |
453 | kfree(mdev->bitmap); | 472 | kfree(mdev->bitmap); |
@@ -610,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) | |||
610 | int err = 0, growing; | 629 | int err = 0, growing; |
611 | int opages_vmalloced; | 630 | int opages_vmalloced; |
612 | 631 | ||
613 | ERR_IF(!b) return -ENOMEM; | 632 | if (!expect(b)) |
633 | return -ENOMEM; | ||
614 | 634 | ||
615 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); | 635 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); |
616 | 636 | ||
@@ -732,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | |||
732 | unsigned long s; | 752 | unsigned long s; |
733 | unsigned long flags; | 753 | unsigned long flags; |
734 | 754 | ||
735 | ERR_IF(!b) return 0; | 755 | if (!expect(b)) |
736 | ERR_IF(!b->bm_pages) return 0; | 756 | return 0; |
757 | if (!expect(b->bm_pages)) | ||
758 | return 0; | ||
737 | 759 | ||
738 | spin_lock_irqsave(&b->bm_lock, flags); | 760 | spin_lock_irqsave(&b->bm_lock, flags); |
739 | s = b->bm_set; | 761 | s = b->bm_set; |
@@ -756,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | |||
756 | size_t drbd_bm_words(struct drbd_conf *mdev) | 778 | size_t drbd_bm_words(struct drbd_conf *mdev) |
757 | { | 779 | { |
758 | struct drbd_bitmap *b = mdev->bitmap; | 780 | struct drbd_bitmap *b = mdev->bitmap; |
759 | ERR_IF(!b) return 0; | 781 | if (!expect(b)) |
760 | ERR_IF(!b->bm_pages) return 0; | 782 | return 0; |
783 | if (!expect(b->bm_pages)) | ||
784 | return 0; | ||
761 | 785 | ||
762 | return b->bm_words; | 786 | return b->bm_words; |
763 | } | 787 | } |
@@ -765,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev) | |||
765 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | 789 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) |
766 | { | 790 | { |
767 | struct drbd_bitmap *b = mdev->bitmap; | 791 | struct drbd_bitmap *b = mdev->bitmap; |
768 | ERR_IF(!b) return 0; | 792 | if (!expect(b)) |
793 | return 0; | ||
769 | 794 | ||
770 | return b->bm_bits; | 795 | return b->bm_bits; |
771 | } | 796 | } |
@@ -786,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
786 | 811 | ||
787 | end = offset + number; | 812 | end = offset + number; |
788 | 813 | ||
789 | ERR_IF(!b) return; | 814 | if (!expect(b)) |
790 | ERR_IF(!b->bm_pages) return; | 815 | return; |
816 | if (!expect(b->bm_pages)) | ||
817 | return; | ||
791 | if (number == 0) | 818 | if (number == 0) |
792 | return; | 819 | return; |
793 | WARN_ON(offset >= b->bm_words); | 820 | WARN_ON(offset >= b->bm_words); |
@@ -831,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
831 | 858 | ||
832 | end = offset + number; | 859 | end = offset + number; |
833 | 860 | ||
834 | ERR_IF(!b) return; | 861 | if (!expect(b)) |
835 | ERR_IF(!b->bm_pages) return; | 862 | return; |
863 | if (!expect(b->bm_pages)) | ||
864 | return; | ||
836 | 865 | ||
837 | spin_lock_irq(&b->bm_lock); | 866 | spin_lock_irq(&b->bm_lock); |
838 | if ((offset >= b->bm_words) || | 867 | if ((offset >= b->bm_words) || |
@@ -860,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
860 | void drbd_bm_set_all(struct drbd_conf *mdev) | 889 | void drbd_bm_set_all(struct drbd_conf *mdev) |
861 | { | 890 | { |
862 | struct drbd_bitmap *b = mdev->bitmap; | 891 | struct drbd_bitmap *b = mdev->bitmap; |
863 | ERR_IF(!b) return; | 892 | if (!expect(b)) |
864 | ERR_IF(!b->bm_pages) return; | 893 | return; |
894 | if (!expect(b->bm_pages)) | ||
895 | return; | ||
865 | 896 | ||
866 | spin_lock_irq(&b->bm_lock); | 897 | spin_lock_irq(&b->bm_lock); |
867 | bm_memset(b, 0, 0xff, b->bm_words); | 898 | bm_memset(b, 0, 0xff, b->bm_words); |
@@ -874,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev) | |||
874 | void drbd_bm_clear_all(struct drbd_conf *mdev) | 905 | void drbd_bm_clear_all(struct drbd_conf *mdev) |
875 | { | 906 | { |
876 | struct drbd_bitmap *b = mdev->bitmap; | 907 | struct drbd_bitmap *b = mdev->bitmap; |
877 | ERR_IF(!b) return; | 908 | if (!expect(b)) |
878 | ERR_IF(!b->bm_pages) return; | 909 | return; |
910 | if (!expect(b->bm_pages)) | ||
911 | return; | ||
879 | 912 | ||
880 | spin_lock_irq(&b->bm_lock); | 913 | spin_lock_irq(&b->bm_lock); |
881 | bm_memset(b, 0, 0, b->bm_words); | 914 | bm_memset(b, 0, 0, b->bm_words); |
@@ -889,7 +922,8 @@ struct bm_aio_ctx { | |||
889 | unsigned int done; | 922 | unsigned int done; |
890 | unsigned flags; | 923 | unsigned flags; |
891 | #define BM_AIO_COPY_PAGES 1 | 924 | #define BM_AIO_COPY_PAGES 1 |
892 | #define BM_WRITE_ALL_PAGES 2 | 925 | #define BM_AIO_WRITE_HINTED 2 |
926 | #define BM_WRITE_ALL_PAGES 4 | ||
893 | int error; | 927 | int error; |
894 | struct kref kref; | 928 | struct kref kref; |
895 | }; | 929 | }; |
@@ -977,17 +1011,11 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must | |||
977 | bm_set_page_unchanged(b->bm_pages[page_nr]); | 1011 | bm_set_page_unchanged(b->bm_pages[page_nr]); |
978 | 1012 | ||
979 | if (ctx->flags & BM_AIO_COPY_PAGES) { | 1013 | if (ctx->flags & BM_AIO_COPY_PAGES) { |
980 | void *src, *dest; | ||
981 | page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); | 1014 | page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); |
982 | dest = kmap_atomic(page); | 1015 | copy_highpage(page, b->bm_pages[page_nr]); |
983 | src = kmap_atomic(b->bm_pages[page_nr]); | ||
984 | memcpy(dest, src, PAGE_SIZE); | ||
985 | kunmap_atomic(src); | ||
986 | kunmap_atomic(dest); | ||
987 | bm_store_page_idx(page, page_nr); | 1016 | bm_store_page_idx(page, page_nr); |
988 | } else | 1017 | } else |
989 | page = b->bm_pages[page_nr]; | 1018 | page = b->bm_pages[page_nr]; |
990 | |||
991 | bio->bi_bdev = mdev->ldev->md_bdev; | 1019 | bio->bi_bdev = mdev->ldev->md_bdev; |
992 | bio->bi_sector = on_disk_sector; | 1020 | bio->bi_sector = on_disk_sector; |
993 | /* bio_add_page of a single page to an empty bio will always succeed, | 1021 | /* bio_add_page of a single page to an empty bio will always succeed, |
@@ -1060,6 +1088,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1060 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) | 1088 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) |
1061 | break; | 1089 | break; |
1062 | if (rw & WRITE) { | 1090 | if (rw & WRITE) { |
1091 | if ((flags & BM_AIO_WRITE_HINTED) && | ||
1092 | !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, | ||
1093 | &page_private(b->bm_pages[i]))) | ||
1094 | continue; | ||
1095 | |||
1063 | if (!(flags & BM_WRITE_ALL_PAGES) && | 1096 | if (!(flags & BM_WRITE_ALL_PAGES) && |
1064 | bm_test_page_unchanged(b->bm_pages[i])) { | 1097 | bm_test_page_unchanged(b->bm_pages[i])) { |
1065 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); | 1098 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); |
@@ -1088,13 +1121,15 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1088 | * "in_flight reached zero, all done" event. | 1121 | * "in_flight reached zero, all done" event. |
1089 | */ | 1122 | */ |
1090 | if (!atomic_dec_and_test(&ctx->in_flight)) | 1123 | if (!atomic_dec_and_test(&ctx->in_flight)) |
1091 | wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); | 1124 | wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); |
1092 | else | 1125 | else |
1093 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1126 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1094 | 1127 | ||
1095 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", | 1128 | /* summary for global bitmap IO */ |
1096 | rw == WRITE ? "WRITE" : "READ", | 1129 | if (flags == 0) |
1097 | count, jiffies - now); | 1130 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", |
1131 | rw == WRITE ? "WRITE" : "READ", | ||
1132 | count, jiffies - now); | ||
1098 | 1133 | ||
1099 | if (ctx->error) { | 1134 | if (ctx->error) { |
1100 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); | 1135 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); |
@@ -1103,7 +1138,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1103 | } | 1138 | } |
1104 | 1139 | ||
1105 | if (atomic_read(&ctx->in_flight)) | 1140 | if (atomic_read(&ctx->in_flight)) |
1106 | err = -EIO; /* Disk failed during IO... */ | 1141 | err = -EIO; /* Disk timeout/force-detach during IO... */ |
1107 | 1142 | ||
1108 | now = jiffies; | 1143 | now = jiffies; |
1109 | if (rw == WRITE) { | 1144 | if (rw == WRITE) { |
@@ -1115,8 +1150,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1115 | } | 1150 | } |
1116 | now = b->bm_set; | 1151 | now = b->bm_set; |
1117 | 1152 | ||
1118 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", | 1153 | if (flags == 0) |
1119 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | 1154 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", |
1155 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | ||
1120 | 1156 | ||
1121 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1157 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1122 | return err; | 1158 | return err; |
@@ -1179,9 +1215,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) | |||
1179 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); | 1215 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); |
1180 | } | 1216 | } |
1181 | 1217 | ||
1218 | /** | ||
1219 | * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. | ||
1220 | * @mdev: DRBD device. | ||
1221 | */ | ||
1222 | int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local) | ||
1223 | { | ||
1224 | return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); | ||
1225 | } | ||
1182 | 1226 | ||
1183 | /** | 1227 | /** |
1184 | * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap | 1228 | * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap |
1185 | * @mdev: DRBD device. | 1229 | * @mdev: DRBD device. |
1186 | * @idx: bitmap page index | 1230 | * @idx: bitmap page index |
1187 | * | 1231 | * |
@@ -1222,11 +1266,11 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc | |||
1222 | } | 1266 | } |
1223 | 1267 | ||
1224 | bm_page_io_async(ctx, idx, WRITE_SYNC); | 1268 | bm_page_io_async(ctx, idx, WRITE_SYNC); |
1225 | wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); | 1269 | wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); |
1226 | 1270 | ||
1227 | if (ctx->error) | 1271 | if (ctx->error) |
1228 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 1272 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
1229 | /* that should force detach, so the in memory bitmap will be | 1273 | /* that causes us to detach, so the in memory bitmap will be |
1230 | * gone in a moment as well. */ | 1274 | * gone in a moment as well. */ |
1231 | 1275 | ||
1232 | mdev->bm_writ_cnt++; | 1276 | mdev->bm_writ_cnt++; |
@@ -1289,8 +1333,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, | |||
1289 | struct drbd_bitmap *b = mdev->bitmap; | 1333 | struct drbd_bitmap *b = mdev->bitmap; |
1290 | unsigned long i = DRBD_END_OF_BITMAP; | 1334 | unsigned long i = DRBD_END_OF_BITMAP; |
1291 | 1335 | ||
1292 | ERR_IF(!b) return i; | 1336 | if (!expect(b)) |
1293 | ERR_IF(!b->bm_pages) return i; | 1337 | return i; |
1338 | if (!expect(b->bm_pages)) | ||
1339 | return i; | ||
1294 | 1340 | ||
1295 | spin_lock_irq(&b->bm_lock); | 1341 | spin_lock_irq(&b->bm_lock); |
1296 | if (BM_DONT_TEST & b->bm_flags) | 1342 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1391,8 +1437,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | |||
1391 | struct drbd_bitmap *b = mdev->bitmap; | 1437 | struct drbd_bitmap *b = mdev->bitmap; |
1392 | int c = 0; | 1438 | int c = 0; |
1393 | 1439 | ||
1394 | ERR_IF(!b) return 1; | 1440 | if (!expect(b)) |
1395 | ERR_IF(!b->bm_pages) return 0; | 1441 | return 1; |
1442 | if (!expect(b->bm_pages)) | ||
1443 | return 0; | ||
1396 | 1444 | ||
1397 | spin_lock_irqsave(&b->bm_lock, flags); | 1445 | spin_lock_irqsave(&b->bm_lock, flags); |
1398 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) | 1446 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) |
@@ -1423,13 +1471,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | |||
1423 | { | 1471 | { |
1424 | int i; | 1472 | int i; |
1425 | int bits; | 1473 | int bits; |
1474 | int changed = 0; | ||
1426 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); | 1475 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); |
1427 | for (i = first_word; i < last_word; i++) { | 1476 | for (i = first_word; i < last_word; i++) { |
1428 | bits = hweight_long(paddr[i]); | 1477 | bits = hweight_long(paddr[i]); |
1429 | paddr[i] = ~0UL; | 1478 | paddr[i] = ~0UL; |
1430 | b->bm_set += BITS_PER_LONG - bits; | 1479 | changed += BITS_PER_LONG - bits; |
1431 | } | 1480 | } |
1432 | kunmap_atomic(paddr); | 1481 | kunmap_atomic(paddr); |
1482 | if (changed) { | ||
1483 | /* We only need lazy writeout, the information is still in the | ||
1484 | * remote bitmap as well, and is reconstructed during the next | ||
1485 | * bitmap exchange, if lost locally due to a crash. */ | ||
1486 | bm_set_page_lazy_writeout(b->bm_pages[page_nr]); | ||
1487 | b->bm_set += changed; | ||
1488 | } | ||
1433 | } | 1489 | } |
1434 | 1490 | ||
1435 | /* Same thing as drbd_bm_set_bits, | 1491 | /* Same thing as drbd_bm_set_bits, |
@@ -1524,8 +1580,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | |||
1524 | unsigned long *p_addr; | 1580 | unsigned long *p_addr; |
1525 | int i; | 1581 | int i; |
1526 | 1582 | ||
1527 | ERR_IF(!b) return 0; | 1583 | if (!expect(b)) |
1528 | ERR_IF(!b->bm_pages) return 0; | 1584 | return 0; |
1585 | if (!expect(b->bm_pages)) | ||
1586 | return 0; | ||
1529 | 1587 | ||
1530 | spin_lock_irqsave(&b->bm_lock, flags); | 1588 | spin_lock_irqsave(&b->bm_lock, flags); |
1531 | if (BM_DONT_TEST & b->bm_flags) | 1589 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1559,8 +1617,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1559 | * robust in case we screwed up elsewhere, in that case pretend there | 1617 | * robust in case we screwed up elsewhere, in that case pretend there |
1560 | * was one dirty bit in the requested area, so we won't try to do a | 1618 | * was one dirty bit in the requested area, so we won't try to do a |
1561 | * local read there (no bitmap probably implies no disk) */ | 1619 | * local read there (no bitmap probably implies no disk) */ |
1562 | ERR_IF(!b) return 1; | 1620 | if (!expect(b)) |
1563 | ERR_IF(!b->bm_pages) return 1; | 1621 | return 1; |
1622 | if (!expect(b->bm_pages)) | ||
1623 | return 1; | ||
1564 | 1624 | ||
1565 | spin_lock_irqsave(&b->bm_lock, flags); | 1625 | spin_lock_irqsave(&b->bm_lock, flags); |
1566 | if (BM_DONT_TEST & b->bm_flags) | 1626 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1573,11 +1633,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1573 | bm_unmap(p_addr); | 1633 | bm_unmap(p_addr); |
1574 | p_addr = bm_map_pidx(b, idx); | 1634 | p_addr = bm_map_pidx(b, idx); |
1575 | } | 1635 | } |
1576 | ERR_IF (bitnr >= b->bm_bits) { | 1636 | if (expect(bitnr < b->bm_bits)) |
1577 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1578 | } else { | ||
1579 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | 1637 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); |
1580 | } | 1638 | else |
1639 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1581 | } | 1640 | } |
1582 | if (p_addr) | 1641 | if (p_addr) |
1583 | bm_unmap(p_addr); | 1642 | bm_unmap(p_addr); |
@@ -1607,8 +1666,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1607 | unsigned long flags; | 1666 | unsigned long flags; |
1608 | unsigned long *p_addr, *bm; | 1667 | unsigned long *p_addr, *bm; |
1609 | 1668 | ||
1610 | ERR_IF(!b) return 0; | 1669 | if (!expect(b)) |
1611 | ERR_IF(!b->bm_pages) return 0; | 1670 | return 0; |
1671 | if (!expect(b->bm_pages)) | ||
1672 | return 0; | ||
1612 | 1673 | ||
1613 | spin_lock_irqsave(&b->bm_lock, flags); | 1674 | spin_lock_irqsave(&b->bm_lock, flags); |
1614 | if (BM_DONT_TEST & b->bm_flags) | 1675 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1630,47 +1691,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1630 | spin_unlock_irqrestore(&b->bm_lock, flags); | 1691 | spin_unlock_irqrestore(&b->bm_lock, flags); |
1631 | return count; | 1692 | return count; |
1632 | } | 1693 | } |
1633 | |||
1634 | /* Set all bits covered by the AL-extent al_enr. | ||
1635 | * Returns number of bits changed. */ | ||
1636 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1637 | { | ||
1638 | struct drbd_bitmap *b = mdev->bitmap; | ||
1639 | unsigned long *p_addr, *bm; | ||
1640 | unsigned long weight; | ||
1641 | unsigned long s, e; | ||
1642 | int count, i, do_now; | ||
1643 | ERR_IF(!b) return 0; | ||
1644 | ERR_IF(!b->bm_pages) return 0; | ||
1645 | |||
1646 | spin_lock_irq(&b->bm_lock); | ||
1647 | if (BM_DONT_SET & b->bm_flags) | ||
1648 | bm_print_lock_info(mdev); | ||
1649 | weight = b->bm_set; | ||
1650 | |||
1651 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1652 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1653 | /* assert that s and e are on the same page */ | ||
1654 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1655 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1656 | count = 0; | ||
1657 | if (s < b->bm_words) { | ||
1658 | i = do_now = e-s; | ||
1659 | p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); | ||
1660 | bm = p_addr + MLPP(s); | ||
1661 | while (i--) { | ||
1662 | count += hweight_long(*bm); | ||
1663 | *bm = -1UL; | ||
1664 | bm++; | ||
1665 | } | ||
1666 | bm_unmap(p_addr); | ||
1667 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1668 | if (e == b->bm_words) | ||
1669 | b->bm_set -= bm_clear_surplus(b); | ||
1670 | } else { | ||
1671 | dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s); | ||
1672 | } | ||
1673 | weight = b->bm_set - weight; | ||
1674 | spin_unlock_irq(&b->bm_lock); | ||
1675 | return weight; | ||
1676 | } | ||
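The drbd_bitmap.c hunks above introduce a per-page "hint" flag (BM_PAGE_HINT_WRITEOUT, kept in page_private() of each bitmap page): an activity-log transaction marks the pages it touched via drbd_bm_mark_for_writeout(), and drbd_bm_write_hinted() passes BM_AIO_WRITE_HINTED so that bm_rw() only writes pages whose hint bit a test-and-clear succeeds on. The sketch below merely illustrates that filtering pattern in plain user-space C, with simple bit operations standing in for the kernel's set_bit()/test_and_clear_bit(); it is not part of the patch.

/* Illustrative sketch only -- not from the patch. */
#include <stdio.h>

#define TOY_HINT_BIT 27    /* mirrors BM_PAGE_HINT_WRITEOUT */

struct toy_page {
	unsigned long private;            /* stand-in for page_private(page) */
};

static void mark_for_writeout(struct toy_page *p)
{
	p->private |= 1UL << TOY_HINT_BIT;        /* like set_bit() */
}

static int test_and_clear_hint(struct toy_page *p)
{
	int was_set = !!(p->private & (1UL << TOY_HINT_BIT));
	p->private &= ~(1UL << TOY_HINT_BIT);     /* like test_and_clear_bit() */
	return was_set;
}

int main(void)
{
	struct toy_page pages[8] = { { 0 } };
	int i;

	/* the AL transaction hints at the bitmap pages it modified */
	mark_for_writeout(&pages[2]);
	mark_for_writeout(&pages[5]);

	/* hinted writeout: skip every page without the hint, clear it otherwise */
	for (i = 0; i < 8; i++) {
		if (!test_and_clear_hint(&pages[i]))
			continue;
		printf("writing bitmap page %d\n", i);
	}
	return 0;
}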
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b953cc7c9c00..6b51afa1aae1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -39,9 +39,13 @@ | |||
39 | #include <linux/major.h> | 39 | #include <linux/major.h> |
40 | #include <linux/blkdev.h> | 40 | #include <linux/blkdev.h> |
41 | #include <linux/genhd.h> | 41 | #include <linux/genhd.h> |
42 | #include <linux/idr.h> | ||
42 | #include <net/tcp.h> | 43 | #include <net/tcp.h> |
43 | #include <linux/lru_cache.h> | 44 | #include <linux/lru_cache.h> |
44 | #include <linux/prefetch.h> | 45 | #include <linux/prefetch.h> |
46 | #include <linux/drbd_genl_api.h> | ||
47 | #include <linux/drbd.h> | ||
48 | #include "drbd_state.h" | ||
45 | 49 | ||
46 | #ifdef __CHECKER__ | 50 | #ifdef __CHECKER__ |
47 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | 51 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) |
@@ -61,7 +65,6 @@ | |||
61 | extern unsigned int minor_count; | 65 | extern unsigned int minor_count; |
62 | extern bool disable_sendpage; | 66 | extern bool disable_sendpage; |
63 | extern bool allow_oos; | 67 | extern bool allow_oos; |
64 | extern unsigned int cn_idx; | ||
65 | 68 | ||
66 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 69 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
67 | extern int enable_faults; | 70 | extern int enable_faults; |
@@ -86,34 +89,44 @@ extern char usermode_helper[]; | |||
86 | */ | 89 | */ |
87 | #define DRBD_SIGKILL SIGHUP | 90 | #define DRBD_SIGKILL SIGHUP |
88 | 91 | ||
89 | /* All EEs on the free list should have ID_VACANT (== 0) | ||
90 | * freshly allocated EEs get !ID_VACANT (== 1) | ||
91 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
92 | * it is most likely one of these :( */ | ||
93 | |||
94 | #define ID_IN_SYNC (4711ULL) | 92 | #define ID_IN_SYNC (4711ULL) |
95 | #define ID_OUT_OF_SYNC (4712ULL) | 93 | #define ID_OUT_OF_SYNC (4712ULL) |
96 | |||
97 | #define ID_SYNCER (-1ULL) | 94 | #define ID_SYNCER (-1ULL) |
98 | #define ID_VACANT 0 | 95 | |
99 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
100 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) | 96 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) |
101 | 97 | ||
102 | struct drbd_conf; | 98 | struct drbd_conf; |
99 | struct drbd_tconn; | ||
103 | 100 | ||
104 | 101 | ||
105 | /* to shorten dev_warn(DEV, "msg"); and relatives statements */ | 102 | /* to shorten dev_warn(DEV, "msg"); and relatives statements */ |
106 | #define DEV (disk_to_dev(mdev->vdisk)) | 103 | #define DEV (disk_to_dev(mdev->vdisk)) |
107 | 104 | ||
105 | #define conn_printk(LEVEL, TCONN, FMT, ARGS...) \ | ||
106 | printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS) | ||
107 | #define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS) | ||
108 | #define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS) | ||
109 | #define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS) | ||
110 | #define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS) | ||
111 | #define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS) | ||
112 | #define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS) | ||
113 | #define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS) | ||
114 | |||
108 | #define D_ASSERT(exp) if (!(exp)) \ | 115 | #define D_ASSERT(exp) if (!(exp)) \ |
109 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) | 116 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) |
110 | 117 | ||
111 | #define ERR_IF(exp) if (({ \ | 118 | /** |
112 | int _b = (exp) != 0; \ | 119 | * expect - Make an assertion |
113 | if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \ | 120 | * |
114 | __func__, #exp, __FILE__, __LINE__); \ | 121 | * Unlike the assert macro, this macro returns a boolean result. |
115 | _b; \ | 122 | */ |
116 | })) | 123 | #define expect(exp) ({ \ |
124 | bool _bool = (exp); \ | ||
125 | if (!_bool) \ | ||
126 | dev_err(DEV, "ASSERTION %s FAILED in %s\n", \ | ||
127 | #exp, __func__); \ | ||
128 | _bool; \ | ||
129 | }) | ||
117 | 130 | ||
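The expect() helper added above replaces the removed ERR_IF() macro: instead of wrapping the statement that follows the macro, it logs the failed assertion and evaluates to a boolean, so call sites become ordinary if conditions (compare the many "ERR_IF(!b) return 0;" to "if (!expect(b)) return 0;" conversions in drbd_bitmap.c earlier in this patch). A minimal standalone usage sketch follows, not part of the patch; dev_err() and DEV are stubbed out for user space, and gcc statement expressions are assumed, as in the kernel.

/* Illustrative sketch only -- not from the patch. */
#include <stdio.h>
#include <stdbool.h>

#define DEV NULL
#define dev_err(dev, fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

#define expect(exp) ({ \
	bool _bool = (exp); \
	if (!_bool) \
		dev_err(DEV, "ASSERTION %s FAILED in %s\n", \
			#exp, __func__); \
	_bool; \
})

static int toy_bm_words(const void *bitmap)
{
	if (!expect(bitmap))      /* logs and yields false, like the old ERR_IF */
		return 0;
	return 42;
}

int main(void)
{
	printf("%d\n", toy_bm_words(NULL));    /* prints 0 after the assertion log */
	printf("%d\n", toy_bm_words("ok"));    /* prints 42 */
	return 0;
}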
118 | /* Defines to control fault insertion */ | 131 | /* Defines to control fault insertion */ |
119 | enum { | 132 | enum { |
@@ -150,15 +163,12 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { | |||
150 | /* usual integer division */ | 163 | /* usual integer division */ |
151 | #define div_floor(A, B) ((A)/(B)) | 164 | #define div_floor(A, B) ((A)/(B)) |
152 | 165 | ||
153 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
154 | /* 4th incarnation of the disk layout. */ | ||
155 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
156 | |||
157 | extern struct drbd_conf **minor_table; | ||
158 | extern struct ratelimit_state drbd_ratelimit_state; | 166 | extern struct ratelimit_state drbd_ratelimit_state; |
167 | extern struct idr minors; /* RCU, updates: genl_lock() */ | ||
168 | extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */ | ||
159 | 169 | ||
160 | /* on the wire */ | 170 | /* on the wire */ |
161 | enum drbd_packets { | 171 | enum drbd_packet { |
162 | /* receiver (data socket) */ | 172 | /* receiver (data socket) */ |
163 | P_DATA = 0x00, | 173 | P_DATA = 0x00, |
164 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ | 174 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ |
@@ -186,7 +196,7 @@ enum drbd_packets { | |||
186 | P_RECV_ACK = 0x15, /* Used in protocol B */ | 196 | P_RECV_ACK = 0x15, /* Used in protocol B */ |
187 | P_WRITE_ACK = 0x16, /* Used in protocol C */ | 197 | P_WRITE_ACK = 0x16, /* Used in protocol C */ |
188 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ | 198 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ |
189 | P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ | 199 | P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ |
190 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ | 200 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ |
191 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ | 201 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ |
192 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ | 202 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ |
@@ -207,77 +217,23 @@ enum drbd_packets { | |||
207 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ | 217 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ |
208 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ | 218 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ |
209 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ | 219 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ |
220 | P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ | ||
221 | P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ | ||
222 | P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ | ||
223 | P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ | ||
210 | 224 | ||
211 | P_MAX_CMD = 0x2A, | ||
212 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ | 225 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ |
213 | P_MAX_OPT_CMD = 0x101, | 226 | P_MAX_OPT_CMD = 0x101, |
214 | 227 | ||
215 | /* special command ids for handshake */ | 228 | /* special command ids for handshake */ |
216 | 229 | ||
217 | P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ | 230 | P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ |
218 | P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ | 231 | P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ |
219 | 232 | ||
220 | P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ | 233 | P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ |
221 | }; | 234 | }; |
222 | 235 | ||
223 | static inline const char *cmdname(enum drbd_packets cmd) | 236 | extern const char *cmdname(enum drbd_packet cmd); |
224 | { | ||
225 | /* THINK may need to become several global tables | ||
226 | * when we want to support more than | ||
227 | * one PRO_VERSION */ | ||
228 | static const char *cmdnames[] = { | ||
229 | [P_DATA] = "Data", | ||
230 | [P_DATA_REPLY] = "DataReply", | ||
231 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
232 | [P_BARRIER] = "Barrier", | ||
233 | [P_BITMAP] = "ReportBitMap", | ||
234 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
235 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
236 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
237 | [P_DATA_REQUEST] = "DataRequest", | ||
238 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
239 | [P_SYNC_PARAM] = "SyncParam", | ||
240 | [P_SYNC_PARAM89] = "SyncParam89", | ||
241 | [P_PROTOCOL] = "ReportProtocol", | ||
242 | [P_UUIDS] = "ReportUUIDs", | ||
243 | [P_SIZES] = "ReportSizes", | ||
244 | [P_STATE] = "ReportState", | ||
245 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
246 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
247 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
248 | [P_PING] = "Ping", | ||
249 | [P_PING_ACK] = "PingAck", | ||
250 | [P_RECV_ACK] = "RecvAck", | ||
251 | [P_WRITE_ACK] = "WriteAck", | ||
252 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
253 | [P_DISCARD_ACK] = "DiscardAck", | ||
254 | [P_NEG_ACK] = "NegAck", | ||
255 | [P_NEG_DREPLY] = "NegDReply", | ||
256 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
257 | [P_BARRIER_ACK] = "BarrierAck", | ||
258 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
259 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
260 | [P_OV_REQUEST] = "OVRequest", | ||
261 | [P_OV_REPLY] = "OVReply", | ||
262 | [P_OV_RESULT] = "OVResult", | ||
263 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
264 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
265 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
266 | [P_DELAY_PROBE] = "DelayProbe", | ||
267 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
268 | [P_MAX_CMD] = NULL, | ||
269 | }; | ||
270 | |||
271 | if (cmd == P_HAND_SHAKE_M) | ||
272 | return "HandShakeM"; | ||
273 | if (cmd == P_HAND_SHAKE_S) | ||
274 | return "HandShakeS"; | ||
275 | if (cmd == P_HAND_SHAKE) | ||
276 | return "HandShake"; | ||
277 | if (cmd >= P_MAX_CMD) | ||
278 | return "Unknown"; | ||
279 | return cmdnames[cmd]; | ||
280 | } | ||
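The inline cmdname() above moves out of the header; only the extern declaration on the new side remains. A minimal sketch of the out-of-line form, using the renamed opcodes from the enum above (table abbreviated; the actual version in drbd_main.c may differ):

/* sketch only: out-of-line packet-name lookup, names taken from the enum above */
const char *cmdname(enum drbd_packet cmd)
{
	static const char *const names[] = {
		[P_DATA]	= "Data",
		[P_SUPERSEDED]	= "Superseded",
		[P_RETRY_WRITE]	= "RetryWrite",
		/* ... one entry per opcode ... */
	};

	if (cmd == P_INITIAL_META)
		return "InitialMeta";
	if (cmd == P_INITIAL_DATA)
		return "InitialData";
	if (cmd == P_CONNECTION_FEATURES)
		return "ConnectionFeatures";
	if (cmd >= ARRAY_SIZE(names) || !names[cmd])
		return "Unknown";
	return names[cmd];
}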
281 | 237 | ||
282 | /* for sending/receiving the bitmap, | 238 | /* for sending/receiving the bitmap, |
283 | * possibly in some encoding scheme */ | 239 | * possibly in some encoding scheme */ |
@@ -337,37 +293,24 @@ struct p_header80 { | |||
337 | u32 magic; | 293 | u32 magic; |
338 | u16 command; | 294 | u16 command; |
339 | u16 length; /* bytes of data after this header */ | 295 | u16 length; /* bytes of data after this header */ |
340 | u8 payload[0]; | ||
341 | } __packed; | 296 | } __packed; |
342 | 297 | ||
343 | /* Header for big packets, Used for data packets exceeding 64kB */ | 298 | /* Header for big packets, Used for data packets exceeding 64kB */ |
344 | struct p_header95 { | 299 | struct p_header95 { |
345 | u16 magic; /* use DRBD_MAGIC_BIG here */ | 300 | u16 magic; /* use DRBD_MAGIC_BIG here */ |
346 | u16 command; | 301 | u16 command; |
347 | u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ | 302 | u32 length; |
348 | u8 payload[0]; | ||
349 | } __packed; | 303 | } __packed; |
350 | 304 | ||
351 | union p_header { | 305 | struct p_header100 { |
352 | struct p_header80 h80; | 306 | u32 magic; |
353 | struct p_header95 h95; | 307 | u16 volume; |
354 | }; | 308 | u16 command; |
355 | 309 | u32 length; | |
356 | /* | 310 | u32 pad; |
357 | * short commands, packets without payload, plain p_header: | 311 | } __packed; |
358 | * P_PING | ||
359 | * P_PING_ACK | ||
360 | * P_BECOME_SYNC_TARGET | ||
361 | * P_BECOME_SYNC_SOURCE | ||
362 | * P_UNPLUG_REMOTE | ||
363 | */ | ||
364 | 312 | ||
365 | /* | 313 | extern unsigned int drbd_header_size(struct drbd_tconn *tconn); |
366 | * commands with out-of-struct payload: | ||
367 | * P_BITMAP (no additional fields) | ||
368 | * P_DATA, P_DATA_REPLY (see p_data) | ||
369 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
370 | */ | ||
371 | 314 | ||
372 | /* these defines must not be changed without changing the protocol version */ | 315 | /* these defines must not be changed without changing the protocol version */ |
373 | #define DP_HARDBARRIER 1 /* deprecated */ | 316 | #define DP_HARDBARRIER 1 /* deprecated */ |
@@ -377,9 +320,10 @@ union p_header { | |||
377 | #define DP_FUA 16 /* equals REQ_FUA */ | 320 | #define DP_FUA 16 /* equals REQ_FUA */ |
378 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ | 321 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ |
379 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ | 322 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ |
323 | #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ | ||
324 | #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ | ||
380 | 325 | ||
381 | struct p_data { | 326 | struct p_data { |
382 | union p_header head; | ||
383 | u64 sector; /* 64 bits sector number */ | 327 | u64 sector; /* 64 bits sector number */ |
384 | u64 block_id; /* to identify the request in protocol B&C */ | 328 | u64 block_id; /* to identify the request in protocol B&C */ |
385 | u32 seq_num; | 329 | u32 seq_num; |
@@ -390,21 +334,18 @@ struct p_data { | |||
390 | * commands which share a struct: | 334 | * commands which share a struct: |
391 | * p_block_ack: | 335 | * p_block_ack: |
392 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | 336 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), |
393 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | 337 | * P_SUPERSEDED (proto C, two-primaries conflict detection) |
394 | * p_block_req: | 338 | * p_block_req: |
395 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | 339 | * P_DATA_REQUEST, P_RS_DATA_REQUEST |
396 | */ | 340 | */ |
397 | struct p_block_ack { | 341 | struct p_block_ack { |
398 | struct p_header80 head; | ||
399 | u64 sector; | 342 | u64 sector; |
400 | u64 block_id; | 343 | u64 block_id; |
401 | u32 blksize; | 344 | u32 blksize; |
402 | u32 seq_num; | 345 | u32 seq_num; |
403 | } __packed; | 346 | } __packed; |
404 | 347 | ||
405 | |||
406 | struct p_block_req { | 348 | struct p_block_req { |
407 | struct p_header80 head; | ||
408 | u64 sector; | 349 | u64 sector; |
409 | u64 block_id; | 350 | u64 block_id; |
410 | u32 blksize; | 351 | u32 blksize; |
@@ -413,59 +354,52 @@ struct p_block_req { | |||
413 | 354 | ||
414 | /* | 355 | /* |
415 | * commands with their own struct for additional fields: | 356 | * commands with their own struct for additional fields: |
416 | * P_HAND_SHAKE | 357 | * P_CONNECTION_FEATURES |
417 | * P_BARRIER | 358 | * P_BARRIER |
418 | * P_BARRIER_ACK | 359 | * P_BARRIER_ACK |
419 | * P_SYNC_PARAM | 360 | * P_SYNC_PARAM |
420 | * ReportParams | 361 | * ReportParams |
421 | */ | 362 | */ |
422 | 363 | ||
423 | struct p_handshake { | 364 | struct p_connection_features { |
424 | struct p_header80 head; /* 8 bytes */ | ||
425 | u32 protocol_min; | 365 | u32 protocol_min; |
426 | u32 feature_flags; | 366 | u32 feature_flags; |
427 | u32 protocol_max; | 367 | u32 protocol_max; |
428 | 368 | ||
429 | /* should be more than enough for future enhancements | 369 | /* should be more than enough for future enhancements |
430 | * for now, feature_flags and the reserverd array shall be zero. | 370 | * for now, feature_flags and the reserved array shall be zero. |
431 | */ | 371 | */ |
432 | 372 | ||
433 | u32 _pad; | 373 | u32 _pad; |
434 | u64 reserverd[7]; | 374 | u64 reserved[7]; |
435 | } __packed; | 375 | } __packed; |
436 | /* 80 bytes, FIXED for the next century */ | ||
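The "80 bytes, FIXED for the next century" note that is dropped here referred to the old p_handshake including its 8-byte p_header80: the payload itself is 3*4 + 4 + 7*8 = 72 bytes, so the renamed feature packet keeps the historical 80-byte wire size when framed with the old header. A compile-time check along those lines (a sketch; placement is hypothetical, not part of the patch):

/* sketch: keep the connection-features packet at its fixed wire size */
static void __maybe_unused connection_features_size_check(void)
{
	BUILD_BUG_ON(sizeof(struct p_connection_features) != 72);
	BUILD_BUG_ON(sizeof(struct p_connection_features) +
		     sizeof(struct p_header80) != 80);
}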
437 | 376 | ||
438 | struct p_barrier { | 377 | struct p_barrier { |
439 | struct p_header80 head; | ||
440 | u32 barrier; /* barrier number _handle_ only */ | 378 | u32 barrier; /* barrier number _handle_ only */ |
441 | u32 pad; /* to multiple of 8 Byte */ | 379 | u32 pad; /* to multiple of 8 Byte */ |
442 | } __packed; | 380 | } __packed; |
443 | 381 | ||
444 | struct p_barrier_ack { | 382 | struct p_barrier_ack { |
445 | struct p_header80 head; | ||
446 | u32 barrier; | 383 | u32 barrier; |
447 | u32 set_size; | 384 | u32 set_size; |
448 | } __packed; | 385 | } __packed; |
449 | 386 | ||
450 | struct p_rs_param { | 387 | struct p_rs_param { |
451 | struct p_header80 head; | 388 | u32 resync_rate; |
452 | u32 rate; | ||
453 | 389 | ||
454 | /* Since protocol version 88 and higher. */ | 390 | /* Since protocol version 88 and higher. */ |
455 | char verify_alg[0]; | 391 | char verify_alg[0]; |
456 | } __packed; | 392 | } __packed; |
457 | 393 | ||
458 | struct p_rs_param_89 { | 394 | struct p_rs_param_89 { |
459 | struct p_header80 head; | 395 | u32 resync_rate; |
460 | u32 rate; | ||
461 | /* protocol version 89: */ | 396 | /* protocol version 89: */ |
462 | char verify_alg[SHARED_SECRET_MAX]; | 397 | char verify_alg[SHARED_SECRET_MAX]; |
463 | char csums_alg[SHARED_SECRET_MAX]; | 398 | char csums_alg[SHARED_SECRET_MAX]; |
464 | } __packed; | 399 | } __packed; |
465 | 400 | ||
466 | struct p_rs_param_95 { | 401 | struct p_rs_param_95 { |
467 | struct p_header80 head; | 402 | u32 resync_rate; |
468 | u32 rate; | ||
469 | char verify_alg[SHARED_SECRET_MAX]; | 403 | char verify_alg[SHARED_SECRET_MAX]; |
470 | char csums_alg[SHARED_SECRET_MAX]; | 404 | char csums_alg[SHARED_SECRET_MAX]; |
471 | u32 c_plan_ahead; | 405 | u32 c_plan_ahead; |
@@ -475,12 +409,11 @@ struct p_rs_param_95 { | |||
475 | } __packed; | 409 | } __packed; |
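The three p_rs_param variants grow strictly with the protocol version: 88 appends a NUL-terminated verify_alg, 89 adds csums_alg, 95 adds the dynamic resync-controller fields. A sketch of how a sender might size the packet per negotiated version (the cut-offs are assumptions, not quoted from the patch):

/* sketch: how much of the sync-parameter packet to put on the wire */
static int rs_param_size_sketch(int apv, const char *verify_alg)
{
	if (apv >= 95)
		return sizeof(struct p_rs_param_95);
	if (apv >= 89)
		return sizeof(struct p_rs_param_89);
	if (apv >= 88)	/* resync_rate plus trailing NUL-terminated verify_alg */
		return sizeof(struct p_rs_param) + strlen(verify_alg) + 1;
	return sizeof(struct p_rs_param);
}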
476 | 410 | ||
477 | enum drbd_conn_flags { | 411 | enum drbd_conn_flags { |
478 | CF_WANT_LOSE = 1, | 412 | CF_DISCARD_MY_DATA = 1, |
479 | CF_DRY_RUN = 2, | 413 | CF_DRY_RUN = 2, |
480 | }; | 414 | }; |
481 | 415 | ||
482 | struct p_protocol { | 416 | struct p_protocol { |
483 | struct p_header80 head; | ||
484 | u32 protocol; | 417 | u32 protocol; |
485 | u32 after_sb_0p; | 418 | u32 after_sb_0p; |
486 | u32 after_sb_1p; | 419 | u32 after_sb_1p; |
@@ -494,17 +427,14 @@ struct p_protocol { | |||
494 | } __packed; | 427 | } __packed; |
495 | 428 | ||
496 | struct p_uuids { | 429 | struct p_uuids { |
497 | struct p_header80 head; | ||
498 | u64 uuid[UI_EXTENDED_SIZE]; | 430 | u64 uuid[UI_EXTENDED_SIZE]; |
499 | } __packed; | 431 | } __packed; |
500 | 432 | ||
501 | struct p_rs_uuid { | 433 | struct p_rs_uuid { |
502 | struct p_header80 head; | ||
503 | u64 uuid; | 434 | u64 uuid; |
504 | } __packed; | 435 | } __packed; |
505 | 436 | ||
506 | struct p_sizes { | 437 | struct p_sizes { |
507 | struct p_header80 head; | ||
508 | u64 d_size; /* size of disk */ | 438 | u64 d_size; /* size of disk */ |
509 | u64 u_size; /* user requested size */ | 439 | u64 u_size; /* user requested size */ |
510 | u64 c_size; /* current exported size */ | 440 | u64 c_size; /* current exported size */ |
@@ -514,18 +444,15 @@ struct p_sizes { | |||
514 | } __packed; | 444 | } __packed; |
515 | 445 | ||
516 | struct p_state { | 446 | struct p_state { |
517 | struct p_header80 head; | ||
518 | u32 state; | 447 | u32 state; |
519 | } __packed; | 448 | } __packed; |
520 | 449 | ||
521 | struct p_req_state { | 450 | struct p_req_state { |
522 | struct p_header80 head; | ||
523 | u32 mask; | 451 | u32 mask; |
524 | u32 val; | 452 | u32 val; |
525 | } __packed; | 453 | } __packed; |
526 | 454 | ||
527 | struct p_req_state_reply { | 455 | struct p_req_state_reply { |
528 | struct p_header80 head; | ||
529 | u32 retcode; | 456 | u32 retcode; |
530 | } __packed; | 457 | } __packed; |
531 | 458 | ||
@@ -539,15 +466,7 @@ struct p_drbd06_param { | |||
539 | u32 bit_map_gen[5]; | 466 | u32 bit_map_gen[5]; |
540 | } __packed; | 467 | } __packed; |
541 | 468 | ||
542 | struct p_discard { | ||
543 | struct p_header80 head; | ||
544 | u64 block_id; | ||
545 | u32 seq_num; | ||
546 | u32 pad; | ||
547 | } __packed; | ||
548 | |||
549 | struct p_block_desc { | 469 | struct p_block_desc { |
550 | struct p_header80 head; | ||
551 | u64 sector; | 470 | u64 sector; |
552 | u32 blksize; | 471 | u32 blksize; |
553 | u32 pad; /* to multiple of 8 Byte */ | 472 | u32 pad; /* to multiple of 8 Byte */ |
@@ -563,7 +482,6 @@ enum drbd_bitmap_code { | |||
563 | }; | 482 | }; |
564 | 483 | ||
565 | struct p_compressed_bm { | 484 | struct p_compressed_bm { |
566 | struct p_header80 head; | ||
567 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | 485 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code |
568 | * (encoding & 0x80): polarity (set/unset) of first runlength | 486 | * (encoding & 0x80): polarity (set/unset) of first runlength |
569 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | 487 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits |
@@ -575,90 +493,22 @@ struct p_compressed_bm { | |||
575 | } __packed; | 493 | } __packed; |
576 | 494 | ||
577 | struct p_delay_probe93 { | 495 | struct p_delay_probe93 { |
578 | struct p_header80 head; | ||
579 | u32 seq_num; /* sequence number to match the two probe packets */ | 496 | u32 seq_num; /* sequence number to match the two probe packets */ |
580 | u32 offset; /* usecs the probe got sent after the reference time point */ | 497 | u32 offset; /* usecs the probe got sent after the reference time point */ |
581 | } __packed; | 498 | } __packed; |
582 | 499 | ||
583 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | 500 | /* |
584 | static inline enum drbd_bitmap_code | 501 | * Bitmap packets need to fit within a single page on the sender and receiver, |
585 | DCBP_get_code(struct p_compressed_bm *p) | 502 | * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). |
586 | { | ||
587 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
588 | } | ||
589 | |||
590 | static inline void | ||
591 | DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
592 | { | ||
593 | BUG_ON(code & ~0xf); | ||
594 | p->encoding = (p->encoding & ~0xf) | code; | ||
595 | } | ||
596 | |||
597 | static inline int | ||
598 | DCBP_get_start(struct p_compressed_bm *p) | ||
599 | { | ||
600 | return (p->encoding & 0x80) != 0; | ||
601 | } | ||
602 | |||
603 | static inline void | ||
604 | DCBP_set_start(struct p_compressed_bm *p, int set) | ||
605 | { | ||
606 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
607 | } | ||
608 | |||
609 | static inline int | ||
610 | DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
611 | { | ||
612 | return (p->encoding >> 4) & 0x7; | ||
613 | } | ||
614 | |||
615 | static inline void | ||
616 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
617 | { | ||
618 | BUG_ON(n & ~0x7); | ||
619 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
620 | } | ||
621 | |||
622 | /* one bitmap packet, including the p_header, | ||
623 | * should fit within one _architecture independend_ page. | ||
624 | * so we need to use the fixed size 4KiB page size | ||
625 | * most architectures have used for a long time. | ||
626 | */ | 503 | */ |
627 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) | 504 | #define DRBD_SOCKET_BUFFER_SIZE 4096 |
628 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) | ||
629 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) | ||
630 | #if (PAGE_SIZE < 4096) | ||
631 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
632 | #error "PAGE_SIZE too small" | ||
633 | #endif | ||
634 | |||
635 | union p_polymorph { | ||
636 | union p_header header; | ||
637 | struct p_handshake handshake; | ||
638 | struct p_data data; | ||
639 | struct p_block_ack block_ack; | ||
640 | struct p_barrier barrier; | ||
641 | struct p_barrier_ack barrier_ack; | ||
642 | struct p_rs_param_89 rs_param_89; | ||
643 | struct p_rs_param_95 rs_param_95; | ||
644 | struct p_protocol protocol; | ||
645 | struct p_sizes sizes; | ||
646 | struct p_uuids uuids; | ||
647 | struct p_state state; | ||
648 | struct p_req_state req_state; | ||
649 | struct p_req_state_reply req_state_reply; | ||
650 | struct p_block_req block_req; | ||
651 | struct p_delay_probe93 delay_probe93; | ||
652 | struct p_rs_uuid rs_uuid; | ||
653 | struct p_block_desc block_desc; | ||
654 | } __packed; | ||
655 | 505 | ||
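With the per-packet headers gone from the payload structs, the old BM_PACKET_* macros and union p_polymorph disappear; the 4 KiB limit is now expressed directly as DRBD_SOCKET_BUFFER_SIZE, and the usable bitmap payload has to subtract the protocol-dependent header at run time. A sketch of that computation, using the drbd_header_size() declared earlier (the helper name here is hypothetical):

/* sketch: how many bitmap words fit into one 4 KiB send buffer */
static unsigned int bm_words_per_packet_sketch(struct drbd_tconn *tconn)
{
	unsigned int payload = DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn);

	return payload / sizeof(unsigned long);
}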
656 | /**********************************************************************/ | 506 | /**********************************************************************/ |
657 | enum drbd_thread_state { | 507 | enum drbd_thread_state { |
658 | None, | 508 | NONE, |
659 | Running, | 509 | RUNNING, |
660 | Exiting, | 510 | EXITING, |
661 | Restarting | 511 | RESTARTING |
662 | }; | 512 | }; |
663 | 513 | ||
664 | struct drbd_thread { | 514 | struct drbd_thread { |
@@ -667,8 +517,9 @@ struct drbd_thread { | |||
667 | struct completion stop; | 517 | struct completion stop; |
668 | enum drbd_thread_state t_state; | 518 | enum drbd_thread_state t_state; |
669 | int (*function) (struct drbd_thread *); | 519 | int (*function) (struct drbd_thread *); |
670 | struct drbd_conf *mdev; | 520 | struct drbd_tconn *tconn; |
671 | int reset_cpu_mask; | 521 | int reset_cpu_mask; |
522 | char name[9]; | ||
672 | }; | 523 | }; |
673 | 524 | ||
674 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | 525 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) |
@@ -681,58 +532,54 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | |||
681 | return thi->t_state; | 532 | return thi->t_state; |
682 | } | 533 | } |
683 | 534 | ||
684 | struct drbd_work; | ||
685 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
686 | struct drbd_work { | 535 | struct drbd_work { |
687 | struct list_head list; | 536 | struct list_head list; |
688 | drbd_work_cb cb; | 537 | int (*cb)(struct drbd_work *, int cancel); |
538 | union { | ||
539 | struct drbd_conf *mdev; | ||
540 | struct drbd_tconn *tconn; | ||
541 | }; | ||
689 | }; | 542 | }; |
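The work callback loses its explicit struct drbd_conf * argument; the owning object is now reachable through the anonymous union inside struct drbd_work. A hypothetical callback showing the new shape (w_example is not a function from this patch):

/* hypothetical example of the new two-argument callback signature */
static int w_example(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;	/* or w->tconn for connection-wide work */

	if (cancel)
		return 0;
	return drbd_send_current_state(mdev);	/* example action; any per-device work goes here */
}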
690 | 543 | ||
691 | struct drbd_tl_epoch; | 544 | #include "drbd_interval.h" |
545 | |||
546 | extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *); | ||
547 | |||
692 | struct drbd_request { | 548 | struct drbd_request { |
693 | struct drbd_work w; | 549 | struct drbd_work w; |
694 | struct drbd_conf *mdev; | ||
695 | 550 | ||
696 | /* if local IO is not allowed, will be NULL. | 551 | /* if local IO is not allowed, will be NULL. |
697 | * if local IO _is_ allowed, holds the locally submitted bio clone, | 552 | * if local IO _is_ allowed, holds the locally submitted bio clone, |
698 | * or, after local IO completion, the ERR_PTR(error). | 553 | * or, after local IO completion, the ERR_PTR(error). |
699 | * see drbd_endio_pri(). */ | 554 | * see drbd_request_endio(). */ |
700 | struct bio *private_bio; | 555 | struct bio *private_bio; |
701 | 556 | ||
702 | struct hlist_node collision; | 557 | struct drbd_interval i; |
703 | sector_t sector; | ||
704 | unsigned int size; | ||
705 | unsigned int epoch; /* barrier_nr */ | ||
706 | 558 | ||
707 | /* barrier_nr: used to check on "completion" whether this req was in | 559 | /* epoch: used to check on "completion" whether this req was in |
708 | * the current epoch, and we therefore have to close it, | 560 | * the current epoch, and we therefore have to close it, |
709 | * starting a new epoch... | 561 | * causing a p_barrier packet to be send, starting a new epoch. |
562 | * | ||
563 | * This corresponds to "barrier" in struct p_barrier[_ack], | ||
564 | * and to "barrier_nr" in struct drbd_epoch (and various | ||
565 | * comments/function parameters/local variable names). | ||
710 | */ | 566 | */ |
567 | unsigned int epoch; | ||
711 | 568 | ||
712 | struct list_head tl_requests; /* ring list in the transfer log */ | 569 | struct list_head tl_requests; /* ring list in the transfer log */ |
713 | struct bio *master_bio; /* master bio pointer */ | 570 | struct bio *master_bio; /* master bio pointer */ |
714 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
715 | unsigned long start_time; | 571 | unsigned long start_time; |
716 | }; | ||
717 | |||
718 | struct drbd_tl_epoch { | ||
719 | struct drbd_work w; | ||
720 | struct list_head requests; /* requests before */ | ||
721 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | ||
722 | unsigned int br_number; /* the barriers identifier. */ | ||
723 | int n_writes; /* number of requests attached before this barrier */ | ||
724 | }; | ||
725 | 572 | ||
726 | struct drbd_request; | 573 | /* once it hits 0, we may complete the master_bio */ |
574 | atomic_t completion_ref; | ||
575 | /* once it hits 0, we may destroy this drbd_request object */ | ||
576 | struct kref kref; | ||
727 | 577 | ||
728 | /* These Tl_epoch_entries may be in one of 6 lists: | 578 | unsigned rq_state; /* see comments above _req_mod() */ |
729 | active_ee .. data packet being written | 579 | }; |
730 | sync_ee .. syncer block being written | ||
731 | done_ee .. block written, need to send P_WRITE_ACK | ||
732 | read_ee .. [RS]P_DATA_REQUEST being read | ||
733 | */ | ||
734 | 580 | ||
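struct drbd_request now carries two counters with distinct roles, as the comments above say: completion_ref gates completion of the master_bio, while kref gates destruction of the request object itself. A hedged sketch of that split (helper names are hypothetical; the real code uses a mempool and tracks more state):

/* sketch only: the two-stage life cycle of a drbd_request */
static void req_put_completion_ref_sketch(struct drbd_request *req)
{
	if (atomic_dec_and_test(&req->completion_ref))
		bio_endio(req->master_bio, 0);	/* complete towards the upper layers */
}

static void req_destroy_sketch(struct kref *kref)
{
	struct drbd_request *req = container_of(kref, struct drbd_request, kref);

	kfree(req);
}
/* callers then drop their object reference with kref_put(&req->kref, req_destroy_sketch) */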
735 | struct drbd_epoch { | 581 | struct drbd_epoch { |
582 | struct drbd_tconn *tconn; | ||
736 | struct list_head list; | 583 | struct list_head list; |
737 | unsigned int barrier_nr; | 584 | unsigned int barrier_nr; |
738 | atomic_t epoch_size; /* increased on every request added. */ | 585 | atomic_t epoch_size; /* increased on every request added. */ |
@@ -762,17 +609,14 @@ struct digest_info { | |||
762 | void *digest; | 609 | void *digest; |
763 | }; | 610 | }; |
764 | 611 | ||
765 | struct drbd_epoch_entry { | 612 | struct drbd_peer_request { |
766 | struct drbd_work w; | 613 | struct drbd_work w; |
767 | struct hlist_node collision; | ||
768 | struct drbd_epoch *epoch; /* for writes */ | 614 | struct drbd_epoch *epoch; /* for writes */ |
769 | struct drbd_conf *mdev; | ||
770 | struct page *pages; | 615 | struct page *pages; |
771 | atomic_t pending_bios; | 616 | atomic_t pending_bios; |
772 | unsigned int size; | 617 | struct drbd_interval i; |
773 | /* see comments on ee flag bits below */ | 618 | /* see comments on ee flag bits below */ |
774 | unsigned long flags; | 619 | unsigned long flags; |
775 | sector_t sector; | ||
776 | union { | 620 | union { |
777 | u64 block_id; | 621 | u64 block_id; |
778 | struct digest_info *digest; | 622 | struct digest_info *digest; |
@@ -793,31 +637,37 @@ enum { | |||
793 | * we need to resubmit without the barrier flag. */ | 637 | * we need to resubmit without the barrier flag. */ |
794 | __EE_RESUBMITTED, | 638 | __EE_RESUBMITTED, |
795 | 639 | ||
796 | /* we may have several bios per epoch entry. | 640 | /* we may have several bios per peer request. |
797 | * if any of those fail, we set this flag atomically | 641 | * if any of those fail, we set this flag atomically |
798 | * from the endio callback */ | 642 | * from the endio callback */ |
799 | __EE_WAS_ERROR, | 643 | __EE_WAS_ERROR, |
800 | 644 | ||
801 | /* This ee has a pointer to a digest instead of a block id */ | 645 | /* This ee has a pointer to a digest instead of a block id */ |
802 | __EE_HAS_DIGEST, | 646 | __EE_HAS_DIGEST, |
647 | |||
648 | /* Conflicting local requests need to be restarted after this request */ | ||
649 | __EE_RESTART_REQUESTS, | ||
650 | |||
651 | /* The peer wants a write ACK for this (wire proto C) */ | ||
652 | __EE_SEND_WRITE_ACK, | ||
653 | |||
654 | /* Is set when net_conf had two_primaries set while creating this peer_req */ | ||
655 | __EE_IN_INTERVAL_TREE, | ||
803 | }; | 656 | }; |
804 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | 657 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) |
805 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | 658 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) |
806 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) | 659 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) |
807 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) | 660 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) |
808 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) | 661 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) |
662 | #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) | ||
663 | #define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) | ||
664 | #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) | ||
809 | 665 | ||
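The new EE_SEND_WRITE_ACK flag lets a single peer request remember whether the peer expects a protocol C write ACK, instead of deriving that from connection-wide state; it pairs naturally with the DP_SEND_WRITE_ACK wire flag introduced above. A hedged sketch of that translation on the receiving side (details of the receive path assumed):

/* sketch: map the DP_* wire flag onto the per-request EE_* flag */
static void peer_req_flags_sketch(struct drbd_peer_request *peer_req, u32 dp_flags)
{
	if (dp_flags & DP_SEND_WRITE_ACK)
		peer_req->flags |= EE_SEND_WRITE_ACK;	/* protocol C: answer with P_WRITE_ACK */
}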
810 | /* global flag bits */ | 666 | /* flag bits per mdev */ |
811 | enum { | 667 | enum { |
812 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
813 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
814 | SEND_PING, /* whether asender should send a ping asap */ | ||
815 | |||
816 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | 668 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ |
817 | MD_DIRTY, /* current uuids and flags not yet on disk */ | 669 | MD_DIRTY, /* current uuids and flags not yet on disk */ |
818 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
819 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | 670 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ |
820 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
821 | CL_ST_CHG_SUCCESS, | 671 | CL_ST_CHG_SUCCESS, |
822 | CL_ST_CHG_FAIL, | 672 | CL_ST_CHG_FAIL, |
823 | CRASHED_PRIMARY, /* This node was a crashed primary. | 673 | CRASHED_PRIMARY, /* This node was a crashed primary. |
@@ -831,32 +681,18 @@ enum { | |||
831 | once no more io in flight, start bitmap io */ | 681 | once no more io in flight, start bitmap io */ |
832 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | 682 | BITMAP_IO_QUEUED, /* Started bitmap IO */ |
833 | GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ | 683 | GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ |
834 | WAS_IO_ERROR, /* Local disk failed returned IO error */ | 684 | WAS_IO_ERROR, /* Local disk failed, returned IO error */ |
685 | WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ | ||
835 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ | 686 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ |
836 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | 687 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ |
837 | NET_CONGESTED, /* The data socket is congested */ | ||
838 | |||
839 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
840 | * if set, also prevents the device from dying */ | ||
841 | DEVICE_DYING, /* device became unconfigured, | ||
842 | * but worker thread is still handling the cleanup. | ||
843 | * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, | ||
844 | * while this is set. */ | ||
845 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | 688 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from |
846 | * the peer, if it changed there as well. */ | 689 | * the peer, if it changed there as well. */ |
847 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
848 | GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ | ||
849 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ | 690 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ |
850 | AL_SUSPENDED, /* Activity logging is currently suspended. */ | 691 | AL_SUSPENDED, /* Activity logging is currently suspended. */ |
851 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ | 692 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ |
852 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | 693 | B_RS_H_DONE, /* Before resync handler done (already executed) */ |
853 | 694 | DISCARD_MY_DATA, /* discard_my_data flag per volume */ | |
854 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | 695 | READ_BALANCE_RR, |
855 | * pending, from drbd worker context. | ||
856 | * If set, bdi_write_congested() returns true, | ||
857 | * so shrink_page_list() would not recurse into, | ||
858 | * and potentially deadlock on, this drbd worker. | ||
859 | */ | ||
860 | }; | 696 | }; |
861 | 697 | ||
862 | struct drbd_bitmap; /* opaque for drbd_conf */ | 698 | struct drbd_bitmap; /* opaque for drbd_conf */ |
@@ -894,24 +730,24 @@ enum bm_flag { | |||
894 | 730 | ||
895 | struct drbd_work_queue { | 731 | struct drbd_work_queue { |
896 | struct list_head q; | 732 | struct list_head q; |
897 | struct semaphore s; /* producers up it, worker down()s it */ | ||
898 | spinlock_t q_lock; /* to protect the list. */ | 733 | spinlock_t q_lock; /* to protect the list. */ |
734 | wait_queue_head_t q_wait; | ||
899 | }; | 735 | }; |
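The work queue drops its counting semaphore in favour of a plain wait queue: producers append under q_lock and wake q_wait, the worker sleeps until the list is non-empty. A sketch of both sides (function names hypothetical; the real worker re-checks under the lock and also handles signals and timeouts):

/* sketch: enqueue/dequeue against the wait_queue_head_t based queue */
static void queue_work_sketch(struct drbd_work_queue *q, struct drbd_work *w)
{
	spin_lock_irq(&q->q_lock);
	list_add_tail(&w->list, &q->q);
	spin_unlock_irq(&q->q_lock);
	wake_up(&q->q_wait);
}

static struct drbd_work *dequeue_work_sketch(struct drbd_work_queue *q)
{
	struct drbd_work *w;

	wait_event(q->q_wait, !list_empty(&q->q));
	spin_lock_irq(&q->q_lock);
	w = list_first_entry(&q->q, struct drbd_work, list);
	list_del_init(&w->list);
	spin_unlock_irq(&q->q_lock);
	return w;
}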
900 | 736 | ||
901 | struct drbd_socket { | 737 | struct drbd_socket { |
902 | struct drbd_work_queue work; | ||
903 | struct mutex mutex; | 738 | struct mutex mutex; |
904 | struct socket *socket; | 739 | struct socket *socket; |
905 | /* this way we get our | 740 | /* this way we get our |
906 | * send/receive buffers off the stack */ | 741 | * send/receive buffers off the stack */ |
907 | union p_polymorph sbuf; | 742 | void *sbuf; |
908 | union p_polymorph rbuf; | 743 | void *rbuf; |
909 | }; | 744 | }; |
910 | 745 | ||
911 | struct drbd_md { | 746 | struct drbd_md { |
912 | u64 md_offset; /* sector offset to 'super' block */ | 747 | u64 md_offset; /* sector offset to 'super' block */ |
913 | 748 | ||
914 | u64 la_size_sect; /* last agreed size, unit sectors */ | 749 | u64 la_size_sect; /* last agreed size, unit sectors */ |
750 | spinlock_t uuid_lock; | ||
915 | u64 uuid[UI_SIZE]; | 751 | u64 uuid[UI_SIZE]; |
916 | u64 device_uuid; | 752 | u64 device_uuid; |
917 | u32 flags; | 753 | u32 flags; |
@@ -921,24 +757,16 @@ struct drbd_md { | |||
921 | s32 bm_offset; /* signed relative sector offset to bitmap */ | 757 | s32 bm_offset; /* signed relative sector offset to bitmap */ |
922 | 758 | ||
923 | /* u32 al_nr_extents; important for restoring the AL | 759 | /* u32 al_nr_extents; important for restoring the AL |
924 | * is stored into sync_conf.al_extents, which in turn | 760 | * is stored into ldev->dc.al_extents, which in turn |
925 | * gets applied to act_log->nr_elements | 761 | * gets applied to act_log->nr_elements |
926 | */ | 762 | */ |
927 | }; | 763 | }; |
928 | 764 | ||
929 | /* for sync_conf and other types... */ | ||
930 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
931 | #define NL_INTEGER(pn,pr,member) int member; | ||
932 | #define NL_INT64(pn,pr,member) __u64 member; | ||
933 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
934 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
935 | #include <linux/drbd_nl.h> | ||
936 | |||
937 | struct drbd_backing_dev { | 765 | struct drbd_backing_dev { |
938 | struct block_device *backing_bdev; | 766 | struct block_device *backing_bdev; |
939 | struct block_device *md_bdev; | 767 | struct block_device *md_bdev; |
940 | struct drbd_md md; | 768 | struct drbd_md md; |
941 | struct disk_conf dc; /* The user provided config... */ | 769 | struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */ |
942 | sector_t known_size; /* last known size of that backing device */ | 770 | sector_t known_size; /* last known size of that backing device */ |
943 | }; | 771 | }; |
944 | 772 | ||
@@ -962,18 +790,116 @@ enum write_ordering_e { | |||
962 | }; | 790 | }; |
963 | 791 | ||
964 | struct fifo_buffer { | 792 | struct fifo_buffer { |
965 | int *values; | ||
966 | unsigned int head_index; | 793 | unsigned int head_index; |
967 | unsigned int size; | 794 | unsigned int size; |
795 | int total; /* sum of all values */ | ||
796 | int values[0]; | ||
797 | }; | ||
798 | extern struct fifo_buffer *fifo_alloc(int fifo_size); | ||
799 | |||
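struct fifo_buffer now ends in a flexible array and carries a running total, so sizing and allocation happen in one place via fifo_alloc(). A plausible sketch of that allocator (the real one may differ in flags and details):

/* sketch: one allocation covering the header plus 'fifo_size' slots */
struct fifo_buffer *fifo_alloc_sketch(int fifo_size)
{
	struct fifo_buffer *fb;

	fb = kzalloc(sizeof(*fb) + fifo_size * sizeof(int), GFP_NOIO);
	if (!fb)
		return NULL;
	fb->size = fifo_size;	/* head_index, total and values[] start out zeroed */
	return fb;
}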
800 | /* flag bits per tconn */ | ||
801 | enum { | ||
802 | NET_CONGESTED, /* The data socket is congested */ | ||
803 | RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ | ||
804 | SEND_PING, /* whether asender should send a ping asap */ | ||
805 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
806 | GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ | ||
807 | CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ | ||
808 | CONN_WD_ST_CHG_OKAY, | ||
809 | CONN_WD_ST_CHG_FAIL, | ||
810 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
811 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
812 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | ||
813 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | ||
814 | * pending, from drbd worker context. | ||
815 | * If set, bdi_write_congested() returns true, | ||
816 | * so shrink_page_list() would not recurse into, | ||
817 | * and potentially deadlock on, this drbd worker. | ||
818 | */ | ||
819 | DISCONNECT_SENT, | ||
820 | }; | ||
821 | |||
822 | struct drbd_tconn { /* is a resource from the config file */ | ||
823 | char *name; /* Resource name */ | ||
824 | struct list_head all_tconn; /* linked on global drbd_tconns */ | ||
825 | struct kref kref; | ||
826 | struct idr volumes; /* <tconn, vnr> to mdev mapping */ | ||
827 | enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ | ||
828 | unsigned susp:1; /* IO suspended by user */ | ||
829 | unsigned susp_nod:1; /* IO suspended because no data */ | ||
830 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ | ||
831 | struct mutex cstate_mutex; /* Protects graceful disconnects */ | ||
832 | |||
833 | unsigned long flags; | ||
834 | struct net_conf *net_conf; /* content protected by rcu */ | ||
835 | struct mutex conf_update; /* mutex for read-copy-update of net_conf and disk_conf */ | ||
836 | wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ | ||
837 | struct res_opts res_opts; | ||
838 | |||
839 | struct sockaddr_storage my_addr; | ||
840 | int my_addr_len; | ||
841 | struct sockaddr_storage peer_addr; | ||
842 | int peer_addr_len; | ||
843 | |||
844 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
845 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
846 | int agreed_pro_version; /* actually used protocol version */ | ||
847 | unsigned long last_received; /* in jiffies, either socket */ | ||
848 | unsigned int ko_count; | ||
849 | |||
850 | spinlock_t req_lock; | ||
851 | |||
852 | struct list_head transfer_log; /* all requests not yet fully processed */ | ||
853 | |||
854 | struct crypto_hash *cram_hmac_tfm; | ||
855 | struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */ | ||
856 | struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ | ||
857 | struct crypto_hash *csums_tfm; | ||
858 | struct crypto_hash *verify_tfm; | ||
859 | void *int_dig_in; | ||
860 | void *int_dig_vv; | ||
861 | |||
862 | /* receiver side */ | ||
863 | struct drbd_epoch *current_epoch; | ||
864 | spinlock_t epoch_lock; | ||
865 | unsigned int epochs; | ||
866 | enum write_ordering_e write_ordering; | ||
867 | atomic_t current_tle_nr; /* transfer log epoch number */ | ||
868 | unsigned current_tle_writes; /* writes seen within this tl epoch */ | ||
869 | |||
870 | unsigned long last_reconnect_jif; | ||
871 | struct drbd_thread receiver; | ||
872 | struct drbd_thread worker; | ||
873 | struct drbd_thread asender; | ||
874 | cpumask_var_t cpu_mask; | ||
875 | |||
876 | /* sender side */ | ||
877 | struct drbd_work_queue sender_work; | ||
878 | |||
879 | struct { | ||
880 | /* whether this sender thread | ||
881 | * has processed a single write yet. */ | ||
882 | bool seen_any_write_yet; | ||
883 | |||
884 | /* Which barrier number to send with the next P_BARRIER */ | ||
885 | int current_epoch_nr; | ||
886 | |||
887 | /* how many write requests have been sent | ||
888 | * with req->epoch == current_epoch_nr. | ||
889 | * If none, no P_BARRIER will be sent. */ | ||
890 | unsigned current_epoch_writes; | ||
891 | } send; | ||
968 | }; | 892 | }; |
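A connection (struct drbd_tconn) now owns its volumes through an idr keyed by volume number, which is what vnr_to_mdev() further down builds on. A hedged sketch of iterating every volume of a connection (reference counting and RCU/locking requirements omitted):

/* sketch: walk every volume (mdev) attached to a connection */
static void for_each_volume_sketch(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr;

	idr_for_each_entry(&tconn->volumes, mdev, vnr)
		drbd_send_current_state(mdev);	/* example per-volume action */
}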
969 | 893 | ||
970 | struct drbd_conf { | 894 | struct drbd_conf { |
895 | struct drbd_tconn *tconn; | ||
896 | int vnr; /* volume number within the connection */ | ||
897 | struct kref kref; | ||
898 | |||
971 | /* things that are stored as / read from meta data on disk */ | 899 | /* things that are stored as / read from meta data on disk */ |
972 | unsigned long flags; | 900 | unsigned long flags; |
973 | 901 | ||
974 | /* configured by drbdsetup */ | 902 | /* configured by drbdsetup */ |
975 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
976 | struct syncer_conf sync_conf; | ||
977 | struct drbd_backing_dev *ldev __protected_by(local); | 903 | struct drbd_backing_dev *ldev __protected_by(local); |
978 | 904 | ||
979 | sector_t p_size; /* partner's disk size */ | 905 | sector_t p_size; /* partner's disk size */ |
@@ -981,11 +907,7 @@ struct drbd_conf { | |||
981 | struct block_device *this_bdev; | 907 | struct block_device *this_bdev; |
982 | struct gendisk *vdisk; | 908 | struct gendisk *vdisk; |
983 | 909 | ||
984 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | 910 | unsigned long last_reattach_jif; |
985 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
986 | int agreed_pro_version; /* actually used protocol version */ | ||
987 | unsigned long last_received; /* in jiffies, either socket */ | ||
988 | unsigned int ko_count; | ||
989 | struct drbd_work resync_work, | 911 | struct drbd_work resync_work, |
990 | unplug_work, | 912 | unplug_work, |
991 | go_diskless, | 913 | go_diskless, |
@@ -1005,10 +927,9 @@ struct drbd_conf { | |||
1005 | /* Used after attach while negotiating new disk state. */ | 927 | /* Used after attach while negotiating new disk state. */ |
1006 | union drbd_state new_state_tmp; | 928 | union drbd_state new_state_tmp; |
1007 | 929 | ||
1008 | union drbd_state state; | 930 | union drbd_dev_state state; |
1009 | wait_queue_head_t misc_wait; | 931 | wait_queue_head_t misc_wait; |
1010 | wait_queue_head_t state_wait; /* upon each state change. */ | 932 | wait_queue_head_t state_wait; /* upon each state change. */ |
1011 | wait_queue_head_t net_cnt_wait; | ||
1012 | unsigned int send_cnt; | 933 | unsigned int send_cnt; |
1013 | unsigned int recv_cnt; | 934 | unsigned int recv_cnt; |
1014 | unsigned int read_cnt; | 935 | unsigned int read_cnt; |
@@ -1018,17 +939,12 @@ struct drbd_conf { | |||
1018 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | 939 | atomic_t ap_bio_cnt; /* Requests we need to complete */ |
1019 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | 940 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ |
1020 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | 941 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ |
1021 | atomic_t unacked_cnt; /* Need to send replys for */ | 942 | atomic_t unacked_cnt; /* Need to send replies for */ |
1022 | atomic_t local_cnt; /* Waiting for local completion */ | 943 | atomic_t local_cnt; /* Waiting for local completion */ |
1023 | atomic_t net_cnt; /* Users of net_conf */ | 944 | |
1024 | spinlock_t req_lock; | 945 | /* Interval tree of pending local requests */ |
1025 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | 946 | struct rb_root read_requests; |
1026 | struct drbd_tl_epoch *newest_tle; | 947 | struct rb_root write_requests; |
1027 | struct drbd_tl_epoch *oldest_tle; | ||
1028 | struct list_head out_of_sequence_requests; | ||
1029 | struct list_head barrier_acked_requests; | ||
1030 | struct hlist_head *tl_hash; | ||
1031 | unsigned int tl_hash_s; | ||
1032 | 948 | ||
1033 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ | 949 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ |
1034 | unsigned long rs_total; | 950 | unsigned long rs_total; |
@@ -1048,9 +964,11 @@ struct drbd_conf { | |||
1048 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; | 964 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; |
1049 | /* current index into rs_mark_{left,time} */ | 965 | /* current index into rs_mark_{left,time} */ |
1050 | int rs_last_mark; | 966 | int rs_last_mark; |
967 | unsigned long rs_last_bcast; /* [unit jiffies] */ | ||
1051 | 968 | ||
1052 | /* where does the admin want us to start? (sector) */ | 969 | /* where does the admin want us to start? (sector) */ |
1053 | sector_t ov_start_sector; | 970 | sector_t ov_start_sector; |
971 | sector_t ov_stop_sector; | ||
1054 | /* where are we now? (sector) */ | 972 | /* where are we now? (sector) */ |
1055 | sector_t ov_position; | 973 | sector_t ov_position; |
1056 | /* Start sector of out of sync range (to merge printk reporting). */ | 974 | /* Start sector of out of sync range (to merge printk reporting). */ |
@@ -1058,14 +976,7 @@ struct drbd_conf { | |||
1058 | /* size of out-of-sync range in sectors. */ | 976 | /* size of out-of-sync range in sectors. */ |
1059 | sector_t ov_last_oos_size; | 977 | sector_t ov_last_oos_size; |
1060 | unsigned long ov_left; /* in bits */ | 978 | unsigned long ov_left; /* in bits */ |
1061 | struct crypto_hash *csums_tfm; | ||
1062 | struct crypto_hash *verify_tfm; | ||
1063 | 979 | ||
1064 | unsigned long last_reattach_jif; | ||
1065 | unsigned long last_reconnect_jif; | ||
1066 | struct drbd_thread receiver; | ||
1067 | struct drbd_thread worker; | ||
1068 | struct drbd_thread asender; | ||
1069 | struct drbd_bitmap *bitmap; | 980 | struct drbd_bitmap *bitmap; |
1070 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | 981 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ |
1071 | 982 | ||
@@ -1078,29 +989,19 @@ struct drbd_conf { | |||
1078 | 989 | ||
1079 | int open_cnt; | 990 | int open_cnt; |
1080 | u64 *p_uuid; | 991 | u64 *p_uuid; |
1081 | struct drbd_epoch *current_epoch; | 992 | |
1082 | spinlock_t epoch_lock; | ||
1083 | unsigned int epochs; | ||
1084 | enum write_ordering_e write_ordering; | ||
1085 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ | 993 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ |
1086 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ | 994 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ |
1087 | struct list_head done_ee; /* send ack */ | 995 | struct list_head done_ee; /* need to send P_WRITE_ACK */ |
1088 | struct list_head read_ee; /* IO in progress (any read) */ | 996 | struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ |
1089 | struct list_head net_ee; /* zero-copy network send in progress */ | 997 | struct list_head net_ee; /* zero-copy network send in progress */ |
1090 | struct hlist_head *ee_hash; /* is proteced by req_lock! */ | ||
1091 | unsigned int ee_hash_s; | ||
1092 | |||
1093 | /* this one is protected by ee_lock, single thread */ | ||
1094 | struct drbd_epoch_entry *last_write_w_barrier; | ||
1095 | 998 | ||
1096 | int next_barrier_nr; | 999 | int next_barrier_nr; |
1097 | struct hlist_head *app_reads_hash; /* is proteced by req_lock */ | ||
1098 | struct list_head resync_reads; | 1000 | struct list_head resync_reads; |
1099 | atomic_t pp_in_use; /* allocated from page pool */ | 1001 | atomic_t pp_in_use; /* allocated from page pool */ |
1100 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ | 1002 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ |
1101 | wait_queue_head_t ee_wait; | 1003 | wait_queue_head_t ee_wait; |
1102 | struct page *md_io_page; /* one page buffer for md_io */ | 1004 | struct page *md_io_page; /* one page buffer for md_io */ |
1103 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
1104 | struct drbd_md_io md_io; | 1005 | struct drbd_md_io md_io; |
1105 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ | 1006 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ |
1106 | spinlock_t al_lock; | 1007 | spinlock_t al_lock; |
@@ -1109,22 +1010,16 @@ struct drbd_conf { | |||
1109 | unsigned int al_tr_number; | 1010 | unsigned int al_tr_number; |
1110 | int al_tr_cycle; | 1011 | int al_tr_cycle; |
1111 | int al_tr_pos; /* position of the next transaction in the journal */ | 1012 | int al_tr_pos; /* position of the next transaction in the journal */ |
1112 | struct crypto_hash *cram_hmac_tfm; | ||
1113 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1114 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1115 | void *int_dig_out; | ||
1116 | void *int_dig_in; | ||
1117 | void *int_dig_vv; | ||
1118 | wait_queue_head_t seq_wait; | 1013 | wait_queue_head_t seq_wait; |
1119 | atomic_t packet_seq; | 1014 | atomic_t packet_seq; |
1120 | unsigned int peer_seq; | 1015 | unsigned int peer_seq; |
1121 | spinlock_t peer_seq_lock; | 1016 | spinlock_t peer_seq_lock; |
1122 | unsigned int minor; | 1017 | unsigned int minor; |
1123 | unsigned long comm_bm_set; /* communicated number of set bits. */ | 1018 | unsigned long comm_bm_set; /* communicated number of set bits. */ |
1124 | cpumask_var_t cpu_mask; | ||
1125 | struct bm_io_work bm_io_work; | 1019 | struct bm_io_work bm_io_work; |
1126 | u64 ed_uuid; /* UUID of the exposed data */ | 1020 | u64 ed_uuid; /* UUID of the exposed data */ |
1127 | struct mutex state_mutex; | 1021 | struct mutex own_state_mutex; |
1022 | struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */ | ||
1128 | char congestion_reason; /* Why we were congested... */ | 1023 | char congestion_reason; /* Why we were congested... */ |
1129 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ | 1024 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ |
1130 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ | 1025 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ |
@@ -1132,9 +1027,8 @@ struct drbd_conf { | |||
1132 | int rs_last_events; /* counter of read or write "events" (unit sectors) | 1027 | int rs_last_events; /* counter of read or write "events" (unit sectors) |
1133 | * on the lower level device when we last looked. */ | 1028 | * on the lower level device when we last looked. */ |
1134 | int c_sync_rate; /* current resync rate after syncer throttle magic */ | 1029 | int c_sync_rate; /* current resync rate after syncer throttle magic */ |
1135 | struct fifo_buffer rs_plan_s; /* correction values of resync planner */ | 1030 | struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conn_update) */ |
1136 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ | 1031 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ |
1137 | int rs_planed; /* resync sectors already planned */ | ||
1138 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ | 1032 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ |
1139 | unsigned int peer_max_bio_size; | 1033 | unsigned int peer_max_bio_size; |
1140 | unsigned int local_max_bio_size; | 1034 | unsigned int local_max_bio_size; |
@@ -1142,11 +1036,7 @@ struct drbd_conf { | |||
1142 | 1036 | ||
1143 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1037 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
1144 | { | 1038 | { |
1145 | struct drbd_conf *mdev; | 1039 | return (struct drbd_conf *)idr_find(&minors, minor); |
1146 | |||
1147 | mdev = minor < minor_count ? minor_table[minor] : NULL; | ||
1148 | |||
1149 | return mdev; | ||
1150 | } | 1040 | } |
1151 | 1041 | ||
1152 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | 1042 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) |
@@ -1154,29 +1044,9 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | |||
1154 | return mdev->minor; | 1044 | return mdev->minor; |
1155 | } | 1045 | } |
1156 | 1046 | ||
1157 | /* returns 1 if it was successful, | 1047 | static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) |
1158 | * returns 0 if there was no data socket. | ||
1159 | * so wherever you are going to use the data.socket, e.g. do | ||
1160 | * if (!drbd_get_data_sock(mdev)) | ||
1161 | * return 0; | ||
1162 | * CODE(); | ||
1163 | * drbd_put_data_sock(mdev); | ||
1164 | */ | ||
1165 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1166 | { | ||
1167 | mutex_lock(&mdev->data.mutex); | ||
1168 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1169 | * while we were waiting in down()... */ | ||
1170 | if (unlikely(mdev->data.socket == NULL)) { | ||
1171 | mutex_unlock(&mdev->data.mutex); | ||
1172 | return 0; | ||
1173 | } | ||
1174 | return 1; | ||
1175 | } | ||
1176 | |||
1177 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) | ||
1178 | { | 1048 | { |
1179 | mutex_unlock(&mdev->data.mutex); | 1049 | return (struct drbd_conf *)idr_find(&tconn->volumes, vnr); |
1180 | } | 1050 | } |
1181 | 1051 | ||
1182 | /* | 1052 | /* |
@@ -1185,106 +1055,77 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev) | |||
1185 | 1055 | ||
1186 | /* drbd_main.c */ | 1056 | /* drbd_main.c */ |
1187 | 1057 | ||
1188 | enum chg_state_flags { | ||
1189 | CS_HARD = 1, | ||
1190 | CS_VERBOSE = 2, | ||
1191 | CS_WAIT_COMPLETE = 4, | ||
1192 | CS_SERIALIZE = 8, | ||
1193 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
1194 | }; | ||
1195 | |||
1196 | enum dds_flags { | 1058 | enum dds_flags { |
1197 | DDSF_FORCED = 1, | 1059 | DDSF_FORCED = 1, |
1198 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ | 1060 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ |
1199 | }; | 1061 | }; |
1200 | 1062 | ||
1201 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | 1063 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); |
1202 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
1203 | enum chg_state_flags f, | ||
1204 | union drbd_state mask, | ||
1205 | union drbd_state val); | ||
1206 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1207 | union drbd_state); | ||
1208 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
1209 | union drbd_state, | ||
1210 | union drbd_state, | ||
1211 | enum chg_state_flags); | ||
1212 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1213 | enum chg_state_flags, | ||
1214 | struct completion *done); | ||
1215 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1216 | union drbd_state, int); | ||
1217 | extern int drbd_thread_start(struct drbd_thread *thi); | 1064 | extern int drbd_thread_start(struct drbd_thread *thi); |
1218 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | 1065 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); |
1066 | extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task); | ||
1219 | #ifdef CONFIG_SMP | 1067 | #ifdef CONFIG_SMP |
1220 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | 1068 | extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); |
1221 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | 1069 | extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn); |
1222 | #else | 1070 | #else |
1223 | #define drbd_thread_current_set_cpu(A) ({}) | 1071 | #define drbd_thread_current_set_cpu(A) ({}) |
1224 | #define drbd_calc_cpu_mask(A) ({}) | 1072 | #define drbd_calc_cpu_mask(A) ({}) |
1225 | #endif | 1073 | #endif |
1226 | extern void drbd_free_resources(struct drbd_conf *mdev); | 1074 | extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr, |
1227 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1228 | unsigned int set_size); | 1075 | unsigned int set_size); |
1229 | extern void tl_clear(struct drbd_conf *mdev); | 1076 | extern void tl_clear(struct drbd_tconn *); |
1230 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | 1077 | extern void drbd_free_sock(struct drbd_tconn *tconn); |
1231 | extern void drbd_free_sock(struct drbd_conf *mdev); | 1078 | extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
1232 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1079 | void *buf, size_t size, unsigned msg_flags); |
1233 | void *buf, size_t size, unsigned msg_flags); | 1080 | extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t, |
1234 | extern int drbd_send_protocol(struct drbd_conf *mdev); | 1081 | unsigned); |
1082 | |||
1083 | extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd); | ||
1084 | extern int drbd_send_protocol(struct drbd_tconn *tconn); | ||
1235 | extern int drbd_send_uuids(struct drbd_conf *mdev); | 1085 | extern int drbd_send_uuids(struct drbd_conf *mdev); |
1236 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | 1086 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); |
1237 | extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); | 1087 | extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); |
1238 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); | 1088 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); |
1239 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); | 1089 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); |
1240 | extern int drbd_send_current_state(struct drbd_conf *mdev); | 1090 | extern int drbd_send_current_state(struct drbd_conf *mdev); |
1241 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 1091 | extern int drbd_send_sync_param(struct drbd_conf *mdev); |
1242 | enum drbd_packets cmd, struct p_header80 *h, | 1092 | extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, |
1243 | size_t size, unsigned msg_flags); | 1093 | u32 set_size); |
1244 | #define USE_DATA_SOCKET 1 | 1094 | extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet, |
1245 | #define USE_META_SOCKET 0 | 1095 | struct drbd_peer_request *); |
1246 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | 1096 | extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1247 | enum drbd_packets cmd, struct p_header80 *h, | 1097 | struct p_block_req *rp); |
1248 | size_t size); | 1098 | extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1249 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | 1099 | struct p_data *dp, int data_size); |
1250 | char *data, size_t size); | 1100 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
1251 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1252 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1253 | u32 set_size); | ||
1254 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1255 | struct drbd_epoch_entry *e); | ||
1256 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1257 | struct p_block_req *rp); | ||
1258 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1259 | struct p_data *dp, int data_size); | ||
1260 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1261 | sector_t sector, int blksize, u64 block_id); | 1101 | sector_t sector, int blksize, u64 block_id); |
1262 | extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req); | 1102 | extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *); |
1263 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1103 | extern int drbd_send_block(struct drbd_conf *, enum drbd_packet, |
1264 | struct drbd_epoch_entry *e); | 1104 | struct drbd_peer_request *); |
1265 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | 1105 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); |
1266 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1106 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
1267 | sector_t sector, int size, u64 block_id); | 1107 | sector_t sector, int size, u64 block_id); |
1268 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1108 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, |
1269 | sector_t sector,int size, | 1109 | int size, void *digest, int digest_size, |
1270 | void *digest, int digest_size, | 1110 | enum drbd_packet cmd); |
1271 | enum drbd_packets cmd); | ||
1272 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); | 1111 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); |
1273 | 1112 | ||
1274 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | 1113 | extern int drbd_send_bitmap(struct drbd_conf *mdev); |
1275 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | 1114 | extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); |
1276 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); | 1115 | extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode); |
1277 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | 1116 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); |
1278 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | 1117 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); |
1279 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); | 1118 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); |
1280 | 1119 | ||
1120 | extern void conn_md_sync(struct drbd_tconn *tconn); | ||
1281 | extern void drbd_md_sync(struct drbd_conf *mdev); | 1121 | extern void drbd_md_sync(struct drbd_conf *mdev); |
1282 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | 1122 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); |
1283 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1123 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
1284 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1124 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
1285 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | 1125 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); |
1286 | extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1287 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); | 1126 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); |
1127 | extern void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local); | ||
1128 | extern void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1288 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); | 1129 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); |
1289 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); | 1130 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); |
1290 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); | 1131 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); |
@@ -1302,33 +1143,52 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
1302 | extern int drbd_bitmap_io(struct drbd_conf *mdev, | 1143 | extern int drbd_bitmap_io(struct drbd_conf *mdev, |
1303 | int (*io_fn)(struct drbd_conf *), | 1144 | int (*io_fn)(struct drbd_conf *), |
1304 | char *why, enum bm_flag flags); | 1145 | char *why, enum bm_flag flags); |
1146 | extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1147 | int (*io_fn)(struct drbd_conf *), | ||
1148 | char *why, enum bm_flag flags); | ||
1305 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | 1149 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); |
1306 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1150 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1307 | extern void drbd_go_diskless(struct drbd_conf *mdev); | 1151 | extern void drbd_go_diskless(struct drbd_conf *mdev); |
1308 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | 1152 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); |
1309 | 1153 | ||
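The bitmap-IO entry points above take a whole-bitmap callback plus a reason string and locking flags; a hedged sketch of a caller, using the set-and-write helper declared in the same block. The BM_LOCKED_MASK flag value is an assumption for illustration, not taken from this hunk:

static int example_full_sync_prepare(struct drbd_conf *mdev)
{
	/* Run the callback against the whole bitmap, with the reason
	 * string used for logging and the (assumed) locking flags. */
	return drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
			      "example: set all bits and write out",
			      BM_LOCKED_MASK /* assumed flag */);
}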
1310 | |||
1311 | /* Meta data layout | 1154 | /* Meta data layout |
1312 | We reserve a 128MB Block (4k aligned) | 1155 | We reserve a 128MB Block (4k aligned) |
1313 | * either at the end of the backing device | 1156 | * either at the end of the backing device |
1314 | * or on a separate meta data device. */ | 1157 | * or on a separate meta data device. */ |
1315 | 1158 | ||
1316 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | ||
1317 | /* The following numbers are sectors */ | 1159 | /* The following numbers are sectors */ |
1318 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | 1160 | /* Allows up to about 3.8TB, so if you want more, |
1319 | #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ | 1161 | * you need to use the "flexible" meta data format. */ |
1320 | /* Allows up to about 3.8TB */ | 1162 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ |
1321 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) | 1163 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ |
1322 | 1164 | #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ | |
1323 | /* Since the smalles IO unit is usually 512 byte */ | 1165 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) |
1324 | #define MD_SECTOR_SHIFT 9 | 1166 | |
1325 | #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) | 1167 | /* we do all meta data IO in 4k blocks */ |
1326 | 1168 | #define MD_BLOCK_SHIFT 12 | |
1327 | /* activity log */ | 1169 | #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) |
1328 | #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ | 1170 | |
1329 | #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ | 1171 | /* One activity log extent represents 4M of storage */ |
1172 | #define AL_EXTENT_SHIFT 22 | ||
1330 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) | 1173 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) |
1331 | 1174 | ||
1175 | /* We could make these currently hardcoded constants configurable | ||
1176 | * variables at create-md time (or even re-configurable at runtime?). | ||
1177 | * Which will require some more changes to the DRBD "super block" | ||
1178 | * and attach code. | ||
1179 | * | ||
1180 | * updates per transaction: | ||
1181 | * This many changes to the active set can be logged with one transaction. | ||
1182 | * This number is arbitrary. | ||
1183 | * context per transaction: | ||
1184 | * This many context extent numbers are logged with each transaction. | ||
1185 | * This number is resulting from the transaction block size (4k), the layout | ||
1186 | * of the transaction header, and the number of updates per transaction. | ||
1187 | * See drbd_actlog.c:struct al_transaction_on_disk | ||
1188 | * */ | ||
1189 | #define AL_UPDATES_PER_TRANSACTION 64 // arbitrary | ||
1190 | #define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4 | ||
1191 | |||
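A quick standalone check of the arithmetic in the comment above; the 36-byte header and 6 bytes per update slot are read off the formula given there (see drbd_actlog.c:struct al_transaction_on_disk for the authoritative layout):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	const int block   = 4096;	/* MD_BLOCK_SIZE: one 4k transaction block */
	const int header  = 36;		/* on-disk transaction header, per the formula above */
	const int updates = 64;		/* AL_UPDATES_PER_TRANSACTION */
	const int per_upd = 6;		/* bytes per update slot, per the formula above */

	int context = (block - header - updates * per_upd) / 4;
	assert(context == 919);		/* AL_CONTEXT_PER_TRANSACTION */

	/* Meta-area layout: activity log at sector 8, bitmap right after it. */
	printf("context extents per transaction: %d\n", context);
	printf("MD_BM_OFFSET = %d sectors\n", 8 + 64);
	return 0;
}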
1332 | #if BITS_PER_LONG == 32 | 1192 | #if BITS_PER_LONG == 32 |
1333 | #define LN2_BPL 5 | 1193 | #define LN2_BPL 5 |
1334 | #define cpu_to_lel(A) cpu_to_le32(A) | 1194 | #define cpu_to_lel(A) cpu_to_le32(A) |
@@ -1364,11 +1224,14 @@ struct bm_extent { | |||
1364 | 1224 | ||
1365 | #define SLEEP_TIME (HZ/10) | 1225 | #define SLEEP_TIME (HZ/10) |
1366 | 1226 | ||
1367 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | 1227 | /* We do bitmap IO in units of 4k blocks. |
1228 | * We also still have a hardcoded 4k per bit relation. */ | ||
1229 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1368 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | 1230 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) |
1369 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | 1231 | /* mostly arbitrarily set the represented size of one bitmap extent, |
1370 | * per sector of on disk bitmap */ | 1232 | * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap |
1371 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | 1233 | * at 4k per bit resolution) */ |
1234 | #define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */ | ||
1372 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | 1235 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) |
1373 | 1236 | ||
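The relation between the two shift values can be sanity-checked directly: at 4k per bit, one 16 MiB resync extent corresponds to 4096 bits, i.e. the 512 bytes of bitmap the comment mentions, and a single 4k bitmap page covers 128 MiB of storage. A standalone check:

#include <assert.h>

int main(void)
{
	const unsigned long bm_block = 1UL << 12;	/* BM_BLOCK_SIZE, 4k per bit */
	const unsigned long bm_ext   = 1UL << 24;	/* BM_EXT_SIZE, 16 MiB per resync extent */

	assert(bm_ext / bm_block == 4096);		/* bits per resync extent */
	assert(bm_ext / bm_block / 8 == 512);		/* bytes of bitmap per resync extent */
	assert(4096UL * 8 * bm_block == 128UL << 20);	/* storage covered by one 4k bitmap page */
	return 0;
}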
1374 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | 1237 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) |
@@ -1436,17 +1299,20 @@ struct bm_extent { | |||
1436 | #endif | 1299 | #endif |
1437 | #endif | 1300 | #endif |
1438 | 1301 | ||
1439 | /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. | 1302 | /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, |
1440 | * With a value of 8 all IO in one 128K block make it to the same slot of the | 1303 | * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. |
1441 | * hash table. */ | 1304 | * Since we may live in a mixed-platform cluster, |
1442 | #define HT_SHIFT 8 | 1305 | * we limit us to a platform agnostic constant here for now. |
1443 | #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) | 1306 | * A followup commit may allow even bigger BIO sizes, |
1307 | * once we thought that through. */ | ||
1308 | #define DRBD_MAX_BIO_SIZE (1U << 20) | ||
1309 | #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1310 | #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1311 | #endif | ||
1444 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ | 1312 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ |
1445 | 1313 | ||
1446 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ | 1314 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ |
1447 | 1315 | #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ | |
1448 | /* Number of elements in the app_reads_hash */ | ||
1449 | #define APP_R_HSIZE 15 | ||
1450 | 1316 | ||
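The size limits above hang together as follows; a small check under the 4k PAGE_CACHE_SIZE the comment assumes:

#include <assert.h>

int main(void)
{
	const unsigned page = 4096;		/* PAGE_CACHE_SIZE assumed by the comment */

	assert(256u * page == (1u << 20));	/* BIO_MAX_SIZE == DRBD_MAX_BIO_SIZE on 4k pages */
	assert((1u << 12) == page);		/* DRBD_MAX_BIO_SIZE_SAFE: one page always works */
	assert((1u << 15) == 32u * 1024);	/* header-80 peers: at most 32 KiB of data per packet */
	assert((1u << 17) == 128u * 1024);	/* protocol 95..99 peers: at most 128 KiB per bio */
	return 0;
}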
1451 | extern int drbd_bm_init(struct drbd_conf *mdev); | 1317 | extern int drbd_bm_init(struct drbd_conf *mdev); |
1452 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); | 1318 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); |
@@ -1468,11 +1334,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | |||
1468 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | 1334 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); |
1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); | 1335 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); |
1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | 1336 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); |
1337 | extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr); | ||
1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | 1338 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); |
1339 | extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local); | ||
1472 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); | 1340 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); |
1473 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); | 1341 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); |
1474 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1475 | unsigned long al_enr); | ||
1476 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | 1342 | extern size_t drbd_bm_words(struct drbd_conf *mdev); |
1477 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | 1343 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); |
1478 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | 1344 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); |
@@ -1497,7 +1363,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev); | |||
1497 | /* drbd_main.c */ | 1363 | /* drbd_main.c */ |
1498 | 1364 | ||
1499 | extern struct kmem_cache *drbd_request_cache; | 1365 | extern struct kmem_cache *drbd_request_cache; |
1500 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 1366 | extern struct kmem_cache *drbd_ee_cache; /* peer requests */ |
1501 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 1367 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
1502 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 1368 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
1503 | extern mempool_t *drbd_request_mempool; | 1369 | extern mempool_t *drbd_request_mempool; |
@@ -1537,12 +1403,22 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); | |||
1537 | 1403 | ||
1538 | extern rwlock_t global_state_lock; | 1404 | extern rwlock_t global_state_lock; |
1539 | 1405 | ||
1540 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | 1406 | extern int conn_lowest_minor(struct drbd_tconn *tconn); |
1541 | extern void drbd_free_mdev(struct drbd_conf *mdev); | 1407 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr); |
1408 | extern void drbd_minor_destroy(struct kref *kref); | ||
1409 | |||
1410 | extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts); | ||
1411 | extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts); | ||
1412 | extern void conn_destroy(struct kref *kref); | ||
1413 | struct drbd_tconn *conn_get_by_name(const char *name); | ||
1414 | extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
1415 | void *peer_addr, int peer_addr_len); | ||
1416 | extern void conn_free_crypto(struct drbd_tconn *tconn); | ||
1542 | 1417 | ||
1543 | extern int proc_details; | 1418 | extern int proc_details; |
1544 | 1419 | ||
1545 | /* drbd_req */ | 1420 | /* drbd_req */ |
1421 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); | ||
1546 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); | 1422 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); |
1547 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | 1423 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); |
1548 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | 1424 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); |
@@ -1550,10 +1426,11 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); | |||
1550 | 1426 | ||
1551 | 1427 | ||
1552 | /* drbd_nl.c */ | 1428 | /* drbd_nl.c */ |
1429 | extern int drbd_msg_put_info(const char *info); | ||
1553 | extern void drbd_suspend_io(struct drbd_conf *mdev); | 1430 | extern void drbd_suspend_io(struct drbd_conf *mdev); |
1554 | extern void drbd_resume_io(struct drbd_conf *mdev); | 1431 | extern void drbd_resume_io(struct drbd_conf *mdev); |
1555 | extern char *ppsize(char *buf, unsigned long long size); | 1432 | extern char *ppsize(char *buf, unsigned long long size); |
1556 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); | 1433 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); |
1557 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1434 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; |
1558 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); | 1435 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); |
1559 | extern void resync_after_online_grow(struct drbd_conf *); | 1436 | extern void resync_after_online_grow(struct drbd_conf *); |
@@ -1561,13 +1438,14 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); | |||
1561 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, | 1438 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, |
1562 | enum drbd_role new_role, | 1439 | enum drbd_role new_role, |
1563 | int force); | 1440 | int force); |
1564 | extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | 1441 | extern bool conn_try_outdate_peer(struct drbd_tconn *tconn); |
1565 | extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); | 1442 | extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn); |
1566 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | 1443 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); |
1567 | 1444 | ||
1568 | /* drbd_worker.c */ | 1445 | /* drbd_worker.c */ |
1569 | extern int drbd_worker(struct drbd_thread *thi); | 1446 | extern int drbd_worker(struct drbd_thread *thi); |
1570 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | 1447 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor); |
1448 | void drbd_resync_after_changed(struct drbd_conf *mdev); | ||
1571 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | 1449 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); |
1572 | extern void resume_next_sg(struct drbd_conf *mdev); | 1450 | extern void resume_next_sg(struct drbd_conf *mdev); |
1573 | extern void suspend_other_sg(struct drbd_conf *mdev); | 1451 | extern void suspend_other_sg(struct drbd_conf *mdev); |
@@ -1576,13 +1454,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev); | |||
1576 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); | 1454 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); |
1577 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); | 1455 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); |
1578 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | 1456 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, |
1579 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | 1457 | struct drbd_backing_dev *bdev, sector_t sector, int rw); |
1580 | extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 1458 | extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int); |
1581 | unsigned int *done); | 1459 | extern void wait_until_done_or_force_detached(struct drbd_conf *mdev, |
1582 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | 1460 | struct drbd_backing_dev *bdev, unsigned int *done); |
1583 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); | 1461 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); |
1584 | 1462 | ||
1585 | static inline void ov_oos_print(struct drbd_conf *mdev) | 1463 | static inline void ov_out_of_sync_print(struct drbd_conf *mdev) |
1586 | { | 1464 | { |
1587 | if (mdev->ov_last_oos_size) { | 1465 | if (mdev->ov_last_oos_size) { |
1588 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | 1466 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", |
@@ -1594,97 +1472,102 @@ static inline void ov_oos_print(struct drbd_conf *mdev) | |||
1594 | 1472 | ||
1595 | 1473 | ||
1596 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | 1474 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); |
1597 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); | 1475 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, |
1476 | struct drbd_peer_request *, void *); | ||
1598 | /* worker callbacks */ | 1477 | /* worker callbacks */ |
1599 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | 1478 | extern int w_e_end_data_req(struct drbd_work *, int); |
1600 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | 1479 | extern int w_e_end_rsdata_req(struct drbd_work *, int); |
1601 | extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); | 1480 | extern int w_e_end_csum_rs_req(struct drbd_work *, int); |
1602 | extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); | 1481 | extern int w_e_end_ov_reply(struct drbd_work *, int); |
1603 | extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); | 1482 | extern int w_e_end_ov_req(struct drbd_work *, int); |
1604 | extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); | 1483 | extern int w_ov_finished(struct drbd_work *, int); |
1605 | extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); | 1484 | extern int w_resync_timer(struct drbd_work *, int); |
1606 | extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); | 1485 | extern int w_send_write_hint(struct drbd_work *, int); |
1607 | extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int); | 1486 | extern int w_make_resync_request(struct drbd_work *, int); |
1608 | extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); | 1487 | extern int w_send_dblock(struct drbd_work *, int); |
1609 | extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); | 1488 | extern int w_send_read_req(struct drbd_work *, int); |
1610 | extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); | 1489 | extern int w_prev_work_done(struct drbd_work *, int); |
1611 | extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); | 1490 | extern int w_e_reissue(struct drbd_work *, int); |
1612 | extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); | 1491 | extern int w_restart_disk_io(struct drbd_work *, int); |
1613 | extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); | 1492 | extern int w_send_out_of_sync(struct drbd_work *, int); |
1614 | extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | 1493 | extern int w_start_resync(struct drbd_work *, int); |
1615 | extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); | ||
1616 | extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int); | ||
1617 | extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int); | ||
1618 | 1494 | ||
1619 | extern void resync_timer_fn(unsigned long data); | 1495 | extern void resync_timer_fn(unsigned long data); |
1620 | extern void start_resync_timer_fn(unsigned long data); | 1496 | extern void start_resync_timer_fn(unsigned long data); |
1621 | 1497 | ||
1622 | /* drbd_receiver.c */ | 1498 | /* drbd_receiver.c */ |
1623 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); | 1499 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); |
1624 | extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1500 | extern int drbd_submit_peer_request(struct drbd_conf *, |
1625 | const unsigned rw, const int fault_type); | 1501 | struct drbd_peer_request *, const unsigned, |
1626 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | 1502 | const int); |
1627 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 1503 | extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *); |
1628 | u64 id, | 1504 | extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64, |
1629 | sector_t sector, | 1505 | sector_t, unsigned int, |
1630 | unsigned int data_size, | 1506 | gfp_t) __must_hold(local); |
1631 | gfp_t gfp_mask) __must_hold(local); | 1507 | extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *, |
1632 | extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1508 | int); |
1633 | int is_net); | 1509 | #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) |
1634 | #define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) | 1510 | #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) |
1635 | #define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) | 1511 | extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool); |
1636 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1637 | struct list_head *head); | ||
1638 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1639 | struct list_head *head); | ||
1640 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | 1512 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); |
1641 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | 1513 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); |
1642 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | 1514 | extern void conn_flush_workqueue(struct drbd_tconn *tconn); |
1643 | extern void drbd_free_tl_hash(struct drbd_conf *mdev); | 1515 | extern int drbd_connected(struct drbd_conf *mdev); |
1516 | static inline void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
1517 | { | ||
1518 | conn_flush_workqueue(mdev->tconn); | ||
1519 | } | ||
1644 | 1520 | ||
1645 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | 1521 | /* Yes, there is kernel_setsockopt, but only since 2.6.18. |
1646 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | 1522 | * So we have our own copy of it here. */ |
1647 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | 1523 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, |
1648 | char __user *optval, int optlen) | 1524 | char *optval, int optlen) |
1649 | { | 1525 | { |
1526 | mm_segment_t oldfs = get_fs(); | ||
1527 | char __user *uoptval; | ||
1650 | int err; | 1528 | int err; |
1529 | |||
1530 | uoptval = (char __user __force *)optval; | ||
1531 | |||
1532 | set_fs(KERNEL_DS); | ||
1651 | if (level == SOL_SOCKET) | 1533 | if (level == SOL_SOCKET) |
1652 | err = sock_setsockopt(sock, level, optname, optval, optlen); | 1534 | err = sock_setsockopt(sock, level, optname, uoptval, optlen); |
1653 | else | 1535 | else |
1654 | err = sock->ops->setsockopt(sock, level, optname, optval, | 1536 | err = sock->ops->setsockopt(sock, level, optname, uoptval, |
1655 | optlen); | 1537 | optlen); |
1538 | set_fs(oldfs); | ||
1656 | return err; | 1539 | return err; |
1657 | } | 1540 | } |
1658 | 1541 | ||
1659 | static inline void drbd_tcp_cork(struct socket *sock) | 1542 | static inline void drbd_tcp_cork(struct socket *sock) |
1660 | { | 1543 | { |
1661 | int __user val = 1; | 1544 | int val = 1; |
1662 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1545 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1663 | (char __user *)&val, sizeof(val)); | 1546 | (char*)&val, sizeof(val)); |
1664 | } | 1547 | } |
1665 | 1548 | ||
1666 | static inline void drbd_tcp_uncork(struct socket *sock) | 1549 | static inline void drbd_tcp_uncork(struct socket *sock) |
1667 | { | 1550 | { |
1668 | int __user val = 0; | 1551 | int val = 0; |
1669 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1552 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1670 | (char __user *)&val, sizeof(val)); | 1553 | (char*)&val, sizeof(val)); |
1671 | } | 1554 | } |
1672 | 1555 | ||
1673 | static inline void drbd_tcp_nodelay(struct socket *sock) | 1556 | static inline void drbd_tcp_nodelay(struct socket *sock) |
1674 | { | 1557 | { |
1675 | int __user val = 1; | 1558 | int val = 1; |
1676 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | 1559 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, |
1677 | (char __user *)&val, sizeof(val)); | 1560 | (char*)&val, sizeof(val)); |
1678 | } | 1561 | } |
1679 | 1562 | ||
1680 | static inline void drbd_tcp_quickack(struct socket *sock) | 1563 | static inline void drbd_tcp_quickack(struct socket *sock) |
1681 | { | 1564 | { |
1682 | int __user val = 2; | 1565 | int val = 2; |
1683 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | 1566 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, |
1684 | (char __user *)&val, sizeof(val)); | 1567 | (char*)&val, sizeof(val)); |
1685 | } | 1568 | } |
1686 | 1569 | ||
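These wrappers only toggle socket options; how they are meant to be paired is easiest to see in a sketch (the real send path lives in drbd_main.c and batches differently, so treat this as illustrative only):

static void example_send_batch(struct socket *sock)
{
	drbd_tcp_cork(sock);
	/* ... emit several small packets (command header, payload, ...)
	 * without forcing one TCP segment per call ... */
	drbd_tcp_uncork(sock);	/* TCP may now flush the accumulated data */
}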
1687 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | 1570 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo); |
1688 | 1571 | ||
1689 | /* drbd_proc.c */ | 1572 | /* drbd_proc.c */ |
1690 | extern struct proc_dir_entry *drbd_proc; | 1573 | extern struct proc_dir_entry *drbd_proc; |
@@ -1693,8 +1576,8 @@ extern const char *drbd_conn_str(enum drbd_conns s); | |||
1693 | extern const char *drbd_role_str(enum drbd_role s); | 1576 | extern const char *drbd_role_str(enum drbd_role s); |
1694 | 1577 | ||
1695 | /* drbd_actlog.c */ | 1578 | /* drbd_actlog.c */ |
1696 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | 1579 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1697 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | 1580 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1698 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | 1581 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); |
1699 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1582 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
1700 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1583 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
@@ -1702,7 +1585,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | |||
1702 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | 1585 | extern int drbd_rs_del_all(struct drbd_conf *mdev); |
1703 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | 1586 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, |
1704 | sector_t sector, int size); | 1587 | sector_t sector, int size); |
1705 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1706 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); | 1588 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); |
1707 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | 1589 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, |
1708 | int size, const char *file, const unsigned int line); | 1590 | int size, const char *file, const unsigned int line); |
@@ -1712,73 +1594,24 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | |||
1712 | int size, const char *file, const unsigned int line); | 1594 | int size, const char *file, const unsigned int line); |
1713 | #define drbd_set_out_of_sync(mdev, sector, size) \ | 1595 | #define drbd_set_out_of_sync(mdev, sector, size) \ |
1714 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | 1596 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) |
1715 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1716 | extern void drbd_al_shrink(struct drbd_conf *mdev); | 1597 | extern void drbd_al_shrink(struct drbd_conf *mdev); |
1717 | 1598 | ||
1718 | |||
1719 | /* drbd_nl.c */ | 1599 | /* drbd_nl.c */ |
1720 | 1600 | /* state info broadcast */ | |
1721 | void drbd_nl_cleanup(void); | 1601 | struct sib_info { |
1722 | int __init drbd_nl_init(void); | 1602 | enum drbd_state_info_bcast_reason sib_reason; |
1723 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | 1603 | union { |
1724 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | 1604 | struct { |
1725 | void drbd_bcast_ee(struct drbd_conf *mdev, | 1605 | char *helper_name; |
1726 | const char *reason, const int dgs, | 1606 | unsigned helper_exit_code; |
1727 | const char* seen_hash, const char* calc_hash, | 1607 | }; |
1728 | const struct drbd_epoch_entry* e); | 1608 | struct { |
1729 | 1609 | union drbd_state os; | |
1730 | 1610 | union drbd_state ns; | |
1731 | /** | 1611 | }; |
1732 | * DOC: DRBD State macros | 1612 | }; |
1733 | * | 1613 | }; |
1734 | * These macros are used to express state changes in easily readable form. | 1614 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib); |
1735 | * | ||
1736 | * The NS macros expand to a mask and a value, that can be bit ored onto the | ||
1737 | * current state as soon as the spinlock (req_lock) was taken. | ||
1738 | * | ||
1739 | * The _NS macros are used for state functions that get called with the | ||
1740 | * spinlock. These macros expand directly to the new state value. | ||
1741 | * | ||
1742 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1743 | * to express state changes that affect more than one aspect of the state. | ||
1744 | * | ||
1745 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1746 | * Means that the network connection was established and that the peer | ||
1747 | * is in secondary role. | ||
1748 | */ | ||
1749 | #define role_MASK R_MASK | ||
1750 | #define peer_MASK R_MASK | ||
1751 | #define disk_MASK D_MASK | ||
1752 | #define pdsk_MASK D_MASK | ||
1753 | #define conn_MASK C_MASK | ||
1754 | #define susp_MASK 1 | ||
1755 | #define user_isp_MASK 1 | ||
1756 | #define aftr_isp_MASK 1 | ||
1757 | #define susp_nod_MASK 1 | ||
1758 | #define susp_fen_MASK 1 | ||
1759 | |||
1760 | #define NS(T, S) \ | ||
1761 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
1762 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
1763 | #define NS2(T1, S1, T2, S2) \ | ||
1764 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1765 | mask.T2 = T2##_MASK; mask; }), \ | ||
1766 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1767 | val.T2 = (S2); val; }) | ||
1768 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
1769 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1770 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
1771 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1772 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
1773 | |||
1774 | #define _NS(D, T, S) \ | ||
1775 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) | ||
1776 | #define _NS2(D, T1, S1, T2, S2) \ | ||
1777 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1778 | __ns.T2 = (S2); __ns; }) | ||
1779 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
1780 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1781 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
1782 | 1615 | ||
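The documentation block removed above is still the clearest description of the NS()/_NS() helpers that remain in use further down in this file (see the _NS(mdev, disk, ...) calls in the IO-error handling below). A toy reconstruction over a simplified union shows the statement-expression trick; the real macros use per-field *_MASK constants and the real union drbd_state layout:

union toy_state {
	struct {
		unsigned role:2;
		unsigned conn:5;
		unsigned disk:4;
	};
	unsigned int i;
};

/* Expands to TWO comma-separated arguments: a mask selecting only field T,
 * and a value carrying the new state S for that field. */
#define TOY_NS(T, S) \
	({ union toy_state mask; mask.i = 0; mask.T = ~0u; mask; }), \
	({ union toy_state val;  val.i  = 0; val.T  = (S); val;  })

/* toy_request_state(dev, TOY_NS(conn, 10)) then mirrors
 * drbd_request_state(mdev, NS(conn, C_CONNECTED)): merge val into the
 * current state wherever mask has bits set, under the request lock. */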
1783 | /* | 1616 | /* |
1784 | * inline helper functions | 1617 | * inline helper functions |
@@ -1795,9 +1628,10 @@ static inline struct page *page_chain_next(struct page *page) | |||
1795 | #define page_chain_for_each_safe(page, n) \ | 1628 | #define page_chain_for_each_safe(page, n) \ |
1796 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) | 1629 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) |
1797 | 1630 | ||
1798 | static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | 1631 | |
1632 | static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) | ||
1799 | { | 1633 | { |
1800 | struct page *page = e->pages; | 1634 | struct page *page = peer_req->pages; |
1801 | page_chain_for_each(page) { | 1635 | page_chain_for_each(page) { |
1802 | if (page_count(page) > 1) | 1636 | if (page_count(page) > 1) |
1803 | return 1; | 1637 | return 1; |
@@ -1805,18 +1639,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | |||
1805 | return 0; | 1639 | return 0; |
1806 | } | 1640 | } |
1807 | 1641 | ||
1808 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1809 | { | ||
1810 | wait_event(mdev->misc_wait, | ||
1811 | !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
1812 | } | ||
1813 | |||
1814 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1815 | { | ||
1816 | clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); | ||
1817 | wake_up(&mdev->misc_wait); | ||
1818 | } | ||
1819 | |||
1820 | static inline enum drbd_state_rv | 1642 | static inline enum drbd_state_rv |
1821 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | 1643 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, |
1822 | enum chg_state_flags flags, struct completion *done) | 1644 | enum chg_state_flags flags, struct completion *done) |
@@ -1830,48 +1652,71 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | |||
1830 | return rv; | 1652 | return rv; |
1831 | } | 1653 | } |
1832 | 1654 | ||
1833 | /** | 1655 | static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) |
1834 | * drbd_request_state() - Reqest a state change | ||
1835 | * @mdev: DRBD device. | ||
1836 | * @mask: mask of state bits to change. | ||
1837 | * @val: value of new state bits. | ||
1838 | * | ||
1839 | * This is the most graceful way of requesting a state change. It is verbose | ||
1840 | * quite verbose in case the state change is not possible, and all those | ||
1841 | * state changes are globally serialized. | ||
1842 | */ | ||
1843 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1844 | union drbd_state mask, | ||
1845 | union drbd_state val) | ||
1846 | { | 1656 | { |
1847 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | 1657 | union drbd_state rv; |
1658 | |||
1659 | rv.i = mdev->state.i; | ||
1660 | rv.susp = mdev->tconn->susp; | ||
1661 | rv.susp_nod = mdev->tconn->susp_nod; | ||
1662 | rv.susp_fen = mdev->tconn->susp_fen; | ||
1663 | |||
1664 | return rv; | ||
1848 | } | 1665 | } |
1849 | 1666 | ||
1850 | enum drbd_force_detach_flags { | 1667 | enum drbd_force_detach_flags { |
1851 | DRBD_IO_ERROR, | 1668 | DRBD_READ_ERROR, |
1669 | DRBD_WRITE_ERROR, | ||
1852 | DRBD_META_IO_ERROR, | 1670 | DRBD_META_IO_ERROR, |
1853 | DRBD_FORCE_DETACH, | 1671 | DRBD_FORCE_DETACH, |
1854 | }; | 1672 | }; |
1855 | 1673 | ||
1856 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) | 1674 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) |
1857 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, | 1675 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, |
1858 | enum drbd_force_detach_flags forcedetach, | 1676 | enum drbd_force_detach_flags df, |
1859 | const char *where) | 1677 | const char *where) |
1860 | { | 1678 | { |
1861 | switch (mdev->ldev->dc.on_io_error) { | 1679 | enum drbd_io_error_p ep; |
1862 | case EP_PASS_ON: | 1680 | |
1863 | if (forcedetach == DRBD_IO_ERROR) { | 1681 | rcu_read_lock(); |
1682 | ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1683 | rcu_read_unlock(); | ||
1684 | switch (ep) { | ||
1685 | case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ | ||
1686 | if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { | ||
1864 | if (__ratelimit(&drbd_ratelimit_state)) | 1687 | if (__ratelimit(&drbd_ratelimit_state)) |
1865 | dev_err(DEV, "Local IO failed in %s.\n", where); | 1688 | dev_err(DEV, "Local IO failed in %s.\n", where); |
1866 | if (mdev->state.disk > D_INCONSISTENT) | 1689 | if (mdev->state.disk > D_INCONSISTENT) |
1867 | _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); | 1690 | _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); |
1868 | break; | 1691 | break; |
1869 | } | 1692 | } |
1870 | /* NOTE fall through to detach case if forcedetach set */ | 1693 | /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ |
1871 | case EP_DETACH: | 1694 | case EP_DETACH: |
1872 | case EP_CALL_HELPER: | 1695 | case EP_CALL_HELPER: |
1696 | /* Remember whether we saw a READ or WRITE error. | ||
1697 | * | ||
1698 | * Recovery of the affected area for WRITE failure is covered | ||
1699 | * by the activity log. | ||
1700 | * READ errors may fall outside that area though. Certain READ | ||
1701 | * errors can be "healed" by writing good data to the affected | ||
1702 | * blocks, which triggers block re-allocation in lower layers. | ||
1703 | * | ||
1704 | * If we can not write the bitmap after a READ error, | ||
1705 | * we may need to trigger a full sync (see w_go_diskless()). | ||
1706 | * | ||
1707 | * Force-detach is not really an IO error, but rather a | ||
1708 | * desperate measure to try to deal with a completely | ||
1709 | * unresponsive lower level IO stack. | ||
1710 | * Still it should be treated as a WRITE error. | ||
1711 | * | ||
1712 | * Meta IO error is always WRITE error: | ||
1713 | * we read meta data only once during attach, | ||
1714 | * which will fail in case of errors. | ||
1715 | */ | ||
1873 | set_bit(WAS_IO_ERROR, &mdev->flags); | 1716 | set_bit(WAS_IO_ERROR, &mdev->flags); |
1874 | if (forcedetach == DRBD_FORCE_DETACH) | 1717 | if (df == DRBD_READ_ERROR) |
1718 | set_bit(WAS_READ_ERROR, &mdev->flags); | ||
1719 | if (df == DRBD_FORCE_DETACH) | ||
1875 | set_bit(FORCE_DETACH, &mdev->flags); | 1720 | set_bit(FORCE_DETACH, &mdev->flags); |
1876 | if (mdev->state.disk > D_FAILED) { | 1721 | if (mdev->state.disk > D_FAILED) { |
1877 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | 1722 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); |
@@ -1896,9 +1741,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1896 | { | 1741 | { |
1897 | if (error) { | 1742 | if (error) { |
1898 | unsigned long flags; | 1743 | unsigned long flags; |
1899 | spin_lock_irqsave(&mdev->req_lock, flags); | 1744 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
1900 | __drbd_chk_io_error_(mdev, forcedetach, where); | 1745 | __drbd_chk_io_error_(mdev, forcedetach, where); |
1901 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1746 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
1902 | } | 1747 | } |
1903 | } | 1748 | } |
1904 | 1749 | ||
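The policy switch above can be summarized by a small userspace model: only EP_PASS_ON combined with a plain READ or WRITE error keeps the disk attached (dropping it to D_INCONSISTENT); everything else, including meta-data errors and force-detach even under EP_PASS_ON, marks WAS_IO_ERROR and detaches. A toy model, not kernel code:

#include <stdio.h>

enum toy_df { TOY_READ_ERR, TOY_WRITE_ERR, TOY_META_IO_ERR, TOY_FORCE_DETACH };
enum toy_ep { TOY_EP_PASS_ON, TOY_EP_CALL_HELPER, TOY_EP_DETACH };

static const char *toy_react(enum toy_ep policy, enum toy_df df)
{
	if (policy == TOY_EP_PASS_ON &&
	    (df == TOY_READ_ERR || df == TOY_WRITE_ERR))
		return "keep disk attached, drop to D_INCONSISTENT";
	/* meta IO errors and force-detach fall through even for pass_on */
	return "set WAS_IO_ERROR (plus WAS_READ_ERROR/FORCE_DETACH as needed), go D_FAILED";
}

int main(void)
{
	printf("pass_on + write error : %s\n", toy_react(TOY_EP_PASS_ON, TOY_WRITE_ERR));
	printf("pass_on + meta error  : %s\n", toy_react(TOY_EP_PASS_ON, TOY_META_IO_ERR));
	printf("detach  + read error  : %s\n", toy_react(TOY_EP_DETACH, TOY_READ_ERR));
	return 0;
}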
@@ -1910,9 +1755,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1910 | * BTW, for internal meta data, this happens to be the maximum capacity | 1755 | * BTW, for internal meta data, this happens to be the maximum capacity |
1911 | * we could agree upon with our peer node. | 1756 | * we could agree upon with our peer node. |
1912 | */ | 1757 | */ |
1913 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | 1758 | static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) |
1914 | { | 1759 | { |
1915 | switch (bdev->dc.meta_dev_idx) { | 1760 | switch (meta_dev_idx) { |
1916 | case DRBD_MD_INDEX_INTERNAL: | 1761 | case DRBD_MD_INDEX_INTERNAL: |
1917 | case DRBD_MD_INDEX_FLEX_INT: | 1762 | case DRBD_MD_INDEX_FLEX_INT: |
1918 | return bdev->md.md_offset + bdev->md.bm_offset; | 1763 | return bdev->md.md_offset + bdev->md.bm_offset; |
@@ -1922,13 +1767,30 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | |||
1922 | } | 1767 | } |
1923 | } | 1768 | } |
1924 | 1769 | ||
1770 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1771 | { | ||
1772 | int meta_dev_idx; | ||
1773 | |||
1774 | rcu_read_lock(); | ||
1775 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1776 | rcu_read_unlock(); | ||
1777 | |||
1778 | return _drbd_md_first_sector(meta_dev_idx, bdev); | ||
1779 | } | ||
1780 | |||
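The helper above (and several more below) snapshots a disk_conf field inside an rcu_read_lock()/rcu_read_unlock() pair. A hedged sketch of the update side such readers rely on, not taken from this patch (sparse __rcu annotations and the exact locking omitted):

static void example_swap_disk_conf(struct drbd_backing_dev *bdev,
				   struct disk_conf *new_conf)
{
	struct disk_conf *old_conf = bdev->disk_conf;	/* caller holds the config lock */

	rcu_assign_pointer(bdev->disk_conf, new_conf);	/* publish the new configuration */
	synchronize_rcu();	/* wait for readers like drbd_md_first_sector() above */
	kfree(old_conf);	/* no reader can still be dereferencing it */
}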
1925 | /** | 1781 | /** |
1926 | * drbd_md_last_sector() - Return the last sector number of the meta data area | 1782 | * drbd_md_last_sector() - Return the last sector number of the meta data area |
1927 | * @bdev: Meta data block device. | 1783 | * @bdev: Meta data block device. |
1928 | */ | 1784 | */ |
1929 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | 1785 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) |
1930 | { | 1786 | { |
1931 | switch (bdev->dc.meta_dev_idx) { | 1787 | int meta_dev_idx; |
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1791 | rcu_read_unlock(); | ||
1792 | |||
1793 | switch (meta_dev_idx) { | ||
1932 | case DRBD_MD_INDEX_INTERNAL: | 1794 | case DRBD_MD_INDEX_INTERNAL: |
1933 | case DRBD_MD_INDEX_FLEX_INT: | 1795 | case DRBD_MD_INDEX_FLEX_INT: |
1934 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | 1796 | return bdev->md.md_offset + MD_AL_OFFSET - 1; |
@@ -1956,12 +1818,18 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) | |||
1956 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | 1818 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) |
1957 | { | 1819 | { |
1958 | sector_t s; | 1820 | sector_t s; |
1959 | switch (bdev->dc.meta_dev_idx) { | 1821 | int meta_dev_idx; |
1822 | |||
1823 | rcu_read_lock(); | ||
1824 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1825 | rcu_read_unlock(); | ||
1826 | |||
1827 | switch (meta_dev_idx) { | ||
1960 | case DRBD_MD_INDEX_INTERNAL: | 1828 | case DRBD_MD_INDEX_INTERNAL: |
1961 | case DRBD_MD_INDEX_FLEX_INT: | 1829 | case DRBD_MD_INDEX_FLEX_INT: |
1962 | s = drbd_get_capacity(bdev->backing_bdev) | 1830 | s = drbd_get_capacity(bdev->backing_bdev) |
1963 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | 1831 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, |
1964 | drbd_md_first_sector(bdev)) | 1832 | _drbd_md_first_sector(meta_dev_idx, bdev)) |
1965 | : 0; | 1833 | : 0; |
1966 | break; | 1834 | break; |
1967 | case DRBD_MD_INDEX_FLEX_EXT: | 1835 | case DRBD_MD_INDEX_FLEX_EXT: |
@@ -1987,9 +1855,15 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | |||
1987 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | 1855 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, |
1988 | struct drbd_backing_dev *bdev) | 1856 | struct drbd_backing_dev *bdev) |
1989 | { | 1857 | { |
1990 | switch (bdev->dc.meta_dev_idx) { | 1858 | int meta_dev_idx; |
1859 | |||
1860 | rcu_read_lock(); | ||
1861 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1862 | rcu_read_unlock(); | ||
1863 | |||
1864 | switch (meta_dev_idx) { | ||
1991 | default: /* external, some index */ | 1865 | default: /* external, some index */ |
1992 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | 1866 | return MD_RESERVED_SECT * meta_dev_idx; |
1993 | case DRBD_MD_INDEX_INTERNAL: | 1867 | case DRBD_MD_INDEX_INTERNAL: |
1994 | /* with drbd08, internal meta data is always "flexible" */ | 1868 | /* with drbd08, internal meta data is always "flexible" */ |
1995 | case DRBD_MD_INDEX_FLEX_INT: | 1869 | case DRBD_MD_INDEX_FLEX_INT: |
@@ -2015,9 +1889,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | |||
2015 | unsigned long flags; | 1889 | unsigned long flags; |
2016 | spin_lock_irqsave(&q->q_lock, flags); | 1890 | spin_lock_irqsave(&q->q_lock, flags); |
2017 | list_add(&w->list, &q->q); | 1891 | list_add(&w->list, &q->q); |
2018 | up(&q->s); /* within the spinlock, | ||
2019 | see comment near end of drbd_worker() */ | ||
2020 | spin_unlock_irqrestore(&q->q_lock, flags); | 1892 | spin_unlock_irqrestore(&q->q_lock, flags); |
1893 | wake_up(&q->q_wait); | ||
2021 | } | 1894 | } |
2022 | 1895 | ||
2023 | static inline void | 1896 | static inline void |
@@ -2026,41 +1899,35 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | |||
2026 | unsigned long flags; | 1899 | unsigned long flags; |
2027 | spin_lock_irqsave(&q->q_lock, flags); | 1900 | spin_lock_irqsave(&q->q_lock, flags); |
2028 | list_add_tail(&w->list, &q->q); | 1901 | list_add_tail(&w->list, &q->q); |
2029 | up(&q->s); /* within the spinlock, | ||
2030 | see comment near end of drbd_worker() */ | ||
2031 | spin_unlock_irqrestore(&q->q_lock, flags); | 1902 | spin_unlock_irqrestore(&q->q_lock, flags); |
1903 | wake_up(&q->q_wait); | ||
2032 | } | 1904 | } |
2033 | 1905 | ||
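The producers above now only wake q_wait after dropping q_lock, instead of up()-ing a semaphore inside it. A hedged sketch of the matching consumer side, assuming a single worker thread (the real loop is in drbd_worker.c and also handles signals and batching):

static struct drbd_work *example_dequeue_one(struct drbd_work_queue *q)
{
	struct drbd_work *w;

	/* Single consumer assumed: once woken, the list stays non-empty
	 * until we remove an entry ourselves. */
	wait_event(q->q_wait, !list_empty(&q->q));

	spin_lock_irq(&q->q_lock);
	w = list_first_entry(&q->q, struct drbd_work, list);
	list_del_init(&w->list);
	spin_unlock_irq(&q->q_lock);

	return w;
}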
2034 | static inline void wake_asender(struct drbd_conf *mdev) | 1906 | static inline void wake_asender(struct drbd_tconn *tconn) |
2035 | { | ||
2036 | if (test_bit(SIGNAL_ASENDER, &mdev->flags)) | ||
2037 | force_sig(DRBD_SIG, mdev->asender.task); | ||
2038 | } | ||
2039 | |||
2040 | static inline void request_ping(struct drbd_conf *mdev) | ||
2041 | { | 1907 | { |
2042 | set_bit(SEND_PING, &mdev->flags); | 1908 | if (test_bit(SIGNAL_ASENDER, &tconn->flags)) |
2043 | wake_asender(mdev); | 1909 | force_sig(DRBD_SIG, tconn->asender.task); |
2044 | } | 1910 | } |
2045 | 1911 | ||
2046 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | 1912 | static inline void request_ping(struct drbd_tconn *tconn) |
2047 | enum drbd_packets cmd) | ||
2048 | { | 1913 | { |
2049 | struct p_header80 h; | 1914 | set_bit(SEND_PING, &tconn->flags); |
2050 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | 1915 | wake_asender(tconn); |
2051 | } | 1916 | } |
2052 | 1917 | ||
2053 | static inline int drbd_send_ping(struct drbd_conf *mdev) | 1918 | extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *); |
2054 | { | 1919 | extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *); |
2055 | struct p_header80 h; | 1920 | extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *, |
2056 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | 1921 | enum drbd_packet, unsigned int, void *, |
2057 | } | 1922 | unsigned int); |
1923 | extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *, | ||
1924 | enum drbd_packet, unsigned int, void *, | ||
1925 | unsigned int); | ||
2058 | 1926 | ||
2059 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | 1927 | extern int drbd_send_ping(struct drbd_tconn *tconn); |
2060 | { | 1928 | extern int drbd_send_ping_ack(struct drbd_tconn *tconn); |
2061 | struct p_header80 h; | 1929 | extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); |
2062 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | 1930 | extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state); |
2063 | } | ||
2064 | 1931 | ||
2065 | static inline void drbd_thread_stop(struct drbd_thread *thi) | 1932 | static inline void drbd_thread_stop(struct drbd_thread *thi) |
2066 | { | 1933 | { |
@@ -2082,21 +1949,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | |||
2082 | * or implicit barrier packets as necessary. | 1949 | * or implicit barrier packets as necessary. |
2083 | * increased: | 1950 | * increased: |
2084 | * w_send_barrier | 1951 | * w_send_barrier |
2085 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | 1952 | * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); |
2086 | * it is much easier and equally valid to count what we queue for the | 1953 | * it is much easier and equally valid to count what we queue for the |
2087 | * worker, even before it actually was queued or send. | 1954 | * worker, even before it actually was queued or send. |
2088 | * (drbd_make_request_common; recovery path on read io-error) | 1955 | * (drbd_make_request_common; recovery path on read io-error) |
2089 | * decreased: | 1956 | * decreased: |
2090 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | 1957 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) |
2091 | * _req_mod(req, data_received) | 1958 | * _req_mod(req, DATA_RECEIVED) |
2092 | * [from receive_DataReply] | 1959 | * [from receive_DataReply] |
2093 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | 1960 | * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) |
2094 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | 1961 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] |
2095 | * for some reason it is NOT decreased in got_NegAck, | 1962 | * for some reason it is NOT decreased in got_NegAck, |
2096 | * but in the resulting cleanup code from report_params. | 1963 | * but in the resulting cleanup code from report_params. |
2097 | * we should try to remember the reason for that... | 1964 | * we should try to remember the reason for that... |
2098 | * _req_mod(req, send_failed or send_canceled) | 1965 | * _req_mod(req, SEND_FAILED or SEND_CANCELED) |
2099 | * _req_mod(req, connection_lost_while_pending) | 1966 | * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) |
2100 | * [from tl_clear_barrier] | 1967 | * [from tl_clear_barrier] |
2101 | */ | 1968 | */ |
2102 | static inline void inc_ap_pending(struct drbd_conf *mdev) | 1969 | static inline void inc_ap_pending(struct drbd_conf *mdev) |
@@ -2104,17 +1971,19 @@ static inline void inc_ap_pending(struct drbd_conf *mdev) | |||
2104 | atomic_inc(&mdev->ap_pending_cnt); | 1971 | atomic_inc(&mdev->ap_pending_cnt); |
2105 | } | 1972 | } |
2106 | 1973 | ||
2107 | #define ERR_IF_CNT_IS_NEGATIVE(which) \ | 1974 | #define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ |
2108 | if (atomic_read(&mdev->which) < 0) \ | 1975 | if (atomic_read(&mdev->which) < 0) \ |
2109 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ | 1976 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ |
2110 | __func__ , __LINE__ , \ | 1977 | func, line, \ |
2111 | atomic_read(&mdev->which)) | 1978 | atomic_read(&mdev->which)) |
2112 | 1979 | ||
2113 | #define dec_ap_pending(mdev) do { \ | 1980 | #define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__) |
2114 | typecheck(struct drbd_conf *, mdev); \ | 1981 | static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line) |
2115 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ | 1982 | { |
2116 | wake_up(&mdev->misc_wait); \ | 1983 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) |
2117 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) | 1984 | wake_up(&mdev->misc_wait); |
1985 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); | ||
1986 | } | ||
2118 | 1987 | ||
2119 | /* counts how many resync-related answers we still expect from the peer | 1988 | /* counts how many resync-related answers we still expect from the peer |
2120 | * increase decrease | 1989 | * increase decrease |
@@ -2127,10 +1996,12 @@ static inline void inc_rs_pending(struct drbd_conf *mdev) | |||
2127 | atomic_inc(&mdev->rs_pending_cnt); | 1996 | atomic_inc(&mdev->rs_pending_cnt); |
2128 | } | 1997 | } |
2129 | 1998 | ||
2130 | #define dec_rs_pending(mdev) do { \ | 1999 | #define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__) |
2131 | typecheck(struct drbd_conf *, mdev); \ | 2000 | static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line) |
2132 | atomic_dec(&mdev->rs_pending_cnt); \ | 2001 | { |
2133 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) | 2002 | atomic_dec(&mdev->rs_pending_cnt); |
2003 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); | ||
2004 | } | ||
2134 | 2005 | ||
2135 | /* counts how many answers we still need to send to the peer. | 2006 | /* counts how many answers we still need to send to the peer. |
2136 | * increased on | 2007 | * increased on |
@@ -2146,38 +2017,18 @@ static inline void inc_unacked(struct drbd_conf *mdev) | |||
2146 | atomic_inc(&mdev->unacked_cnt); | 2017 | atomic_inc(&mdev->unacked_cnt); |
2147 | } | 2018 | } |
2148 | 2019 | ||
2149 | #define dec_unacked(mdev) do { \ | 2020 | #define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__) |
2150 | typecheck(struct drbd_conf *, mdev); \ | 2021 | static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line) |
2151 | atomic_dec(&mdev->unacked_cnt); \ | ||
2152 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2153 | |||
2154 | #define sub_unacked(mdev, n) do { \ | ||
2155 | typecheck(struct drbd_conf *, mdev); \ | ||
2156 | atomic_sub(n, &mdev->unacked_cnt); \ | ||
2157 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2158 | |||
2159 | |||
2160 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
2161 | { | 2022 | { |
2162 | if (atomic_dec_and_test(&mdev->net_cnt)) | 2023 | atomic_dec(&mdev->unacked_cnt); |
2163 | wake_up(&mdev->net_cnt_wait); | 2024 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); |
2164 | } | 2025 | } |
2165 | 2026 | ||
2166 | /** | 2027 | #define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__) |
2167 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | 2028 | static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line) |
2168 | * @mdev: DRBD device. | ||
2169 | * | ||
2170 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
2171 | */ | ||
2172 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
2173 | { | 2029 | { |
2174 | int have_net_conf; | 2030 | atomic_sub(n, &mdev->unacked_cnt); |
2175 | 2031 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); | |
2176 | atomic_inc(&mdev->net_cnt); | ||
2177 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
2178 | if (!have_net_conf) | ||
2179 | put_net_conf(mdev); | ||
2180 | return have_net_conf; | ||
2181 | } | 2032 | } |
2182 | 2033 | ||
2183 | /** | 2034 | /** |
@@ -2281,17 +2132,20 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | |||
2281 | * maybe re-implement using semaphores? */ | 2132 | * maybe re-implement using semaphores? */ |
2282 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | 2133 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) |
2283 | { | 2134 | { |
2284 | int mxb = 1000000; /* arbitrary limit on open requests */ | 2135 | struct net_conf *nc; |
2285 | if (get_net_conf(mdev)) { | 2136 | int mxb; |
2286 | mxb = mdev->net_conf->max_buffers; | 2137 | |
2287 | put_net_conf(mdev); | 2138 | rcu_read_lock(); |
2288 | } | 2139 | nc = rcu_dereference(mdev->tconn->net_conf); |
2140 | mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ | ||
2141 | rcu_read_unlock(); | ||
2142 | |||
2289 | return mxb; | 2143 | return mxb; |
2290 | } | 2144 | } |
2291 | 2145 | ||
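drbd_get_max_buffers() now reads the connection's net_conf under rcu_read_lock() instead of taking and dropping a reference via get_net_conf()/put_net_conf(). A generic sketch of that read-mostly configuration pattern (made-up struct and field names, real RCU primitives, kernel context assumed, needs <linux/rcupdate.h>):

struct my_conf {
	int max_buffers;
};

static struct my_conf __rcu *active_conf;	/* updated rarely, read on every request */

static int sketch_get_max_buffers(void)
{
	struct my_conf *nc;
	int mxb;

	rcu_read_lock();			/* readers never block the updater */
	nc = rcu_dereference(active_conf);	/* pointer fetch with the needed barriers */
	mxb = nc ? nc->max_buffers : 1000000;	/* fall back if not configured */
	rcu_read_unlock();

	return mxb;
}

/* An updater would publish a new struct with rcu_assign_pointer() and free the
 * old one only after a grace period (synchronize_rcu() or kfree_rcu()). */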
2292 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) | 2146 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) |
2293 | { | 2147 | { |
2294 | union drbd_state s = mdev->state; | 2148 | union drbd_dev_state s = mdev->state; |
2295 | 2149 | ||
2296 | /* DO NOT add a default clause, we want the compiler to warn us | 2150 | /* DO NOT add a default clause, we want the compiler to warn us |
2297 | * for any newly introduced state we may have forgotten to add here */ | 2151 | * for any newly introduced state we may have forgotten to add here */ |
@@ -2325,7 +2179,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2325 | 2179 | ||
2326 | /* Allow IO in BM exchange states with new protocols */ | 2180 | /* Allow IO in BM exchange states with new protocols */ |
2327 | case C_WF_BITMAP_S: | 2181 | case C_WF_BITMAP_S: |
2328 | if (mdev->agreed_pro_version < 96) | 2182 | if (mdev->tconn->agreed_pro_version < 96) |
2329 | return 0; | 2183 | return 0; |
2330 | break; | 2184 | break; |
2331 | 2185 | ||
@@ -2347,7 +2201,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2347 | /* disk state is stable as well. */ | 2201 | /* disk state is stable as well. */ |
2348 | break; | 2202 | break; |
2349 | 2203 | ||
2350 | /* no new io accepted during tansitional states */ | 2204 | /* no new io accepted during transitional states */ |
2351 | case D_ATTACHING: | 2205 | case D_ATTACHING: |
2352 | case D_NEGOTIATING: | 2206 | case D_NEGOTIATING: |
2353 | case D_UNKNOWN: | 2207 | case D_UNKNOWN: |
@@ -2359,16 +2213,18 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2359 | return 1; | 2213 | return 1; |
2360 | } | 2214 | } |
2361 | 2215 | ||
2362 | static inline int is_susp(union drbd_state s) | 2216 | static inline int drbd_suspended(struct drbd_conf *mdev) |
2363 | { | 2217 | { |
2364 | return s.susp || s.susp_nod || s.susp_fen; | 2218 | struct drbd_tconn *tconn = mdev->tconn; |
2219 | |||
2220 | return tconn->susp || tconn->susp_fen || tconn->susp_nod; | ||
2365 | } | 2221 | } |
2366 | 2222 | ||
2367 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | 2223 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) |
2368 | { | 2224 | { |
2369 | int mxb = drbd_get_max_buffers(mdev); | 2225 | int mxb = drbd_get_max_buffers(mdev); |
2370 | 2226 | ||
2371 | if (is_susp(mdev->state)) | 2227 | if (drbd_suspended(mdev)) |
2372 | return false; | 2228 | return false; |
2373 | if (test_bit(SUSPEND_IO, &mdev->flags)) | 2229 | if (test_bit(SUSPEND_IO, &mdev->flags)) |
2374 | return false; | 2230 | return false; |
@@ -2390,30 +2246,30 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | |||
2390 | return true; | 2246 | return true; |
2391 | } | 2247 | } |
2392 | 2248 | ||
2393 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count) | 2249 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev) |
2394 | { | 2250 | { |
2395 | bool rv = false; | 2251 | bool rv = false; |
2396 | 2252 | ||
2397 | spin_lock_irq(&mdev->req_lock); | 2253 | spin_lock_irq(&mdev->tconn->req_lock); |
2398 | rv = may_inc_ap_bio(mdev); | 2254 | rv = may_inc_ap_bio(mdev); |
2399 | if (rv) | 2255 | if (rv) |
2400 | atomic_add(count, &mdev->ap_bio_cnt); | 2256 | atomic_inc(&mdev->ap_bio_cnt); |
2401 | spin_unlock_irq(&mdev->req_lock); | 2257 | spin_unlock_irq(&mdev->tconn->req_lock); |
2402 | 2258 | ||
2403 | return rv; | 2259 | return rv; |
2404 | } | 2260 | } |
2405 | 2261 | ||
2406 | static inline void inc_ap_bio(struct drbd_conf *mdev, int count) | 2262 | static inline void inc_ap_bio(struct drbd_conf *mdev) |
2407 | { | 2263 | { |
2408 | /* we wait here | 2264 | /* we wait here |
2409 | * as long as the device is suspended | 2265 | * as long as the device is suspended |
2410 | * until the bitmap is no longer on the fly during connection | 2266 | * until the bitmap is no longer on the fly during connection |
2411 | * handshake as long as we would exeed the max_buffer limit. | 2267 | * handshake as long as we would exceed the max_buffer limit. |
2412 | * | 2268 | * |
2413 | * to avoid races with the reconnect code, | 2269 | * to avoid races with the reconnect code, |
2414 | * we need to atomic_inc within the spinlock. */ | 2270 | * we need to atomic_inc within the spinlock. */ |
2415 | 2271 | ||
2416 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count)); | 2272 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev)); |
2417 | } | 2273 | } |
2418 | 2274 | ||
2419 | static inline void dec_ap_bio(struct drbd_conf *mdev) | 2275 | static inline void dec_ap_bio(struct drbd_conf *mdev) |
@@ -2425,7 +2281,7 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2425 | 2281 | ||
2426 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { | 2282 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { |
2427 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | 2283 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
2428 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 2284 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
2429 | } | 2285 | } |
2430 | 2286 | ||
2431 | /* this currently does wake_up for every dec_ap_bio! | 2287 | /* this currently does wake_up for every dec_ap_bio! |
@@ -2435,6 +2291,12 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2435 | wake_up(&mdev->misc_wait); | 2291 | wake_up(&mdev->misc_wait); |
2436 | } | 2292 | } |
2437 | 2293 | ||
2294 | static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev) | ||
2295 | { | ||
2296 | return mdev->tconn->agreed_pro_version >= 97 && | ||
2297 | mdev->tconn->agreed_pro_version != 100; | ||
2298 | } | ||
2299 | |||
2438 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | 2300 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) |
2439 | { | 2301 | { |
2440 | int changed = mdev->ed_uuid != val; | 2302 | int changed = mdev->ed_uuid != val; |
@@ -2442,40 +2304,6 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | |||
2442 | return changed; | 2304 | return changed; |
2443 | } | 2305 | } |
2444 | 2306 | ||
2445 | static inline int seq_cmp(u32 a, u32 b) | ||
2446 | { | ||
2447 | /* we assume wrap around at 32bit. | ||
2448 | * for wrap around at 24bit (old atomic_t), | ||
2449 | * we'd have to | ||
2450 | * a <<= 8; b <<= 8; | ||
2451 | */ | ||
2452 | return (s32)(a) - (s32)(b); | ||
2453 | } | ||
2454 | #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) | ||
2455 | #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) | ||
2456 | #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) | ||
2457 | #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) | ||
2458 | /* CAUTION: please no side effects in arguments! */ | ||
2459 | #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) | ||
2460 | |||
2461 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2462 | { | ||
2463 | unsigned int m; | ||
2464 | spin_lock(&mdev->peer_seq_lock); | ||
2465 | m = seq_max(mdev->peer_seq, new_seq); | ||
2466 | mdev->peer_seq = m; | ||
2467 | spin_unlock(&mdev->peer_seq_lock); | ||
2468 | if (m == new_seq) | ||
2469 | wake_up(&mdev->seq_wait); | ||
2470 | } | ||
2471 | |||
2472 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2473 | { | ||
2474 | struct sock *sk = mdev->data.socket->sk; | ||
2475 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2476 | set_bit(NET_CONGESTED, &mdev->flags); | ||
2477 | } | ||
2478 | |||
2479 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) | 2307 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) |
2480 | { | 2308 | { |
2481 | /* sorry, we currently have no working implementation | 2309 | /* sorry, we currently have no working implementation |
@@ -2490,10 +2318,15 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) | |||
2490 | { | 2318 | { |
2491 | int r; | 2319 | int r; |
2492 | 2320 | ||
2321 | if (mdev->ldev == NULL) { | ||
2322 | dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n"); | ||
2323 | return; | ||
2324 | } | ||
2325 | |||
2493 | if (test_bit(MD_NO_FUA, &mdev->flags)) | 2326 | if (test_bit(MD_NO_FUA, &mdev->flags)) |
2494 | return; | 2327 | return; |
2495 | 2328 | ||
2496 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); | 2329 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL); |
2497 | if (r) { | 2330 | if (r) { |
2498 | set_bit(MD_NO_FUA, &mdev->flags); | 2331 | set_bit(MD_NO_FUA, &mdev->flags); |
2499 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | 2332 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); |
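drbd_md_flush() gains a guard for a missing backing device and switches the flush allocation from GFP_KERNEL to GFP_NOIO, the safer choice on an I/O path. A condensed, hypothetical sketch of the resulting flow (same blkdev_issue_flush() signature as in the hunk above, kernel context assumed):

static void md_flush_sketch(struct block_device *md_bdev,
			    unsigned long *flags, int no_fua_bit)
{
	int r;

	if (!md_bdev)				/* backing device already detached */
		return;
	if (test_bit(no_fua_bit, flags))	/* flushes already known to fail */
		return;

	/* GFP_NOIO: we may be called from the I/O path, so the allocation done
	 * inside blkdev_issue_flush() must not recurse into fs/block I/O. */
	r = blkdev_issue_flush(md_bdev, GFP_NOIO, NULL);
	if (r)
		set_bit(no_fua_bit, flags);	/* give up on md flushes from now on */
}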
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000000..89c497c630b4 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.c | |||
@@ -0,0 +1,207 @@ | |||
1 | #include <asm/bug.h> | ||
2 | #include <linux/rbtree_augmented.h> | ||
3 | #include "drbd_interval.h" | ||
4 | |||
5 | /** | ||
6 | * interval_end - return end of @node | ||
7 | */ | ||
8 | static inline | ||
9 | sector_t interval_end(struct rb_node *node) | ||
10 | { | ||
11 | struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); | ||
12 | return this->end; | ||
13 | } | ||
14 | |||
15 | /** | ||
16 | * compute_subtree_last - compute end of @node | ||
17 | * | ||
18 | * The end of an interval is the highest (start + (size >> 9)) value of this | ||
19 | * node and of its children. Called for @node and its parents whenever the end | ||
20 | * may have changed. | ||
21 | */ | ||
22 | static inline sector_t | ||
23 | compute_subtree_last(struct drbd_interval *node) | ||
24 | { | ||
25 | sector_t max = node->sector + (node->size >> 9); | ||
26 | |||
27 | if (node->rb.rb_left) { | ||
28 | sector_t left = interval_end(node->rb.rb_left); | ||
29 | if (left > max) | ||
30 | max = left; | ||
31 | } | ||
32 | if (node->rb.rb_right) { | ||
33 | sector_t right = interval_end(node->rb.rb_right); | ||
34 | if (right > max) | ||
35 | max = right; | ||
36 | } | ||
37 | return max; | ||
38 | } | ||
39 | |||
40 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) | ||
41 | { | ||
42 | while (rb != stop) { | ||
43 | struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); | ||
44 | sector_t subtree_last = compute_subtree_last(node); | ||
45 | if (node->end == subtree_last) | ||
46 | break; | ||
47 | node->end = subtree_last; | ||
48 | rb = rb_parent(&node->rb); | ||
49 | } | ||
50 | } | ||
51 | |||
52 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) | ||
53 | { | ||
54 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
55 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
56 | |||
57 | new->end = old->end; | ||
58 | } | ||
59 | |||
60 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) | ||
61 | { | ||
62 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
63 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
64 | |||
65 | new->end = old->end; | ||
66 | old->end = compute_subtree_last(old); | ||
67 | } | ||
68 | |||
69 | static const struct rb_augment_callbacks augment_callbacks = { | ||
70 | augment_propagate, | ||
71 | augment_copy, | ||
72 | augment_rotate, | ||
73 | }; | ||
74 | |||
75 | /** | ||
76 | * drbd_insert_interval - insert a new interval into a tree | ||
77 | */ | ||
78 | bool | ||
79 | drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) | ||
80 | { | ||
81 | struct rb_node **new = &root->rb_node, *parent = NULL; | ||
82 | |||
83 | BUG_ON(!IS_ALIGNED(this->size, 512)); | ||
84 | |||
85 | while (*new) { | ||
86 | struct drbd_interval *here = | ||
87 | rb_entry(*new, struct drbd_interval, rb); | ||
88 | |||
89 | parent = *new; | ||
90 | if (this->sector < here->sector) | ||
91 | new = &(*new)->rb_left; | ||
92 | else if (this->sector > here->sector) | ||
93 | new = &(*new)->rb_right; | ||
94 | else if (this < here) | ||
95 | new = &(*new)->rb_left; | ||
96 | else if (this > here) | ||
97 | new = &(*new)->rb_right; | ||
98 | else | ||
99 | return false; | ||
100 | } | ||
101 | |||
102 | rb_link_node(&this->rb, parent, new); | ||
103 | rb_insert_augmented(&this->rb, root, &augment_callbacks); | ||
104 | return true; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * drbd_contains_interval - check if a tree contains a given interval | ||
109 | * @sector: start sector of @interval | ||
110 | * @interval: may not be a valid pointer | ||
111 | * | ||
112 | * Returns whether the tree contains the node @interval with start sector @sector. | ||

113 | * Does not dereference @interval until @interval is known to be a valid object | ||
114 | * in @tree. Returns %false if @interval is in the tree but with a different | ||
115 | * sector number. | ||
116 | */ | ||
117 | bool | ||
118 | drbd_contains_interval(struct rb_root *root, sector_t sector, | ||
119 | struct drbd_interval *interval) | ||
120 | { | ||
121 | struct rb_node *node = root->rb_node; | ||
122 | |||
123 | while (node) { | ||
124 | struct drbd_interval *here = | ||
125 | rb_entry(node, struct drbd_interval, rb); | ||
126 | |||
127 | if (sector < here->sector) | ||
128 | node = node->rb_left; | ||
129 | else if (sector > here->sector) | ||
130 | node = node->rb_right; | ||
131 | else if (interval < here) | ||
132 | node = node->rb_left; | ||
133 | else if (interval > here) | ||
134 | node = node->rb_right; | ||
135 | else | ||
136 | return true; | ||
137 | } | ||
138 | return false; | ||
139 | } | ||
140 | |||
141 | /** | ||
142 | * drbd_remove_interval - remove an interval from a tree | ||
143 | */ | ||
144 | void | ||
145 | drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) | ||
146 | { | ||
147 | rb_erase_augmented(&this->rb, root, &augment_callbacks); | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) | ||
152 | * @sector: start sector | ||
153 | * @size: size, aligned to 512 bytes | ||
154 | * | ||
155 | * Returns an interval overlapping with [sector, sector + size), or NULL if | ||
156 | * there is none. When there is more than one overlapping interval in the | ||
157 | * tree, the interval with the lowest start sector is returned, and all other | ||
158 | * overlapping intervals will be on the right side of the tree, reachable with | ||
159 | * rb_next(). | ||
160 | */ | ||
161 | struct drbd_interval * | ||
162 | drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) | ||
163 | { | ||
164 | struct rb_node *node = root->rb_node; | ||
165 | struct drbd_interval *overlap = NULL; | ||
166 | sector_t end = sector + (size >> 9); | ||
167 | |||
168 | BUG_ON(!IS_ALIGNED(size, 512)); | ||
169 | |||
170 | while (node) { | ||
171 | struct drbd_interval *here = | ||
172 | rb_entry(node, struct drbd_interval, rb); | ||
173 | |||
174 | if (node->rb_left && | ||
175 | sector < interval_end(node->rb_left)) { | ||
176 | /* Overlap if any must be on left side */ | ||
177 | node = node->rb_left; | ||
178 | } else if (here->sector < end && | ||
179 | sector < here->sector + (here->size >> 9)) { | ||
180 | overlap = here; | ||
181 | break; | ||
182 | } else if (sector >= here->sector) { | ||
183 | /* Overlap if any must be on right side */ | ||
184 | node = node->rb_right; | ||
185 | } else | ||
186 | break; | ||
187 | } | ||
188 | return overlap; | ||
189 | } | ||
190 | |||
191 | struct drbd_interval * | ||
192 | drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) | ||
193 | { | ||
194 | sector_t end = sector + (size >> 9); | ||
195 | struct rb_node *node; | ||
196 | |||
197 | for (;;) { | ||
198 | node = rb_next(&i->rb); | ||
199 | if (!node) | ||
200 | return NULL; | ||
201 | i = rb_entry(node, struct drbd_interval, rb); | ||
202 | if (i->sector >= end) | ||
203 | return NULL; | ||
204 | if (sector < i->sector + (i->size >> 9)) | ||
205 | return i; | ||
206 | } | ||
207 | } | ||
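The new drbd_interval tree keeps, in each node, the highest interval end of its subtree (the ->end field maintained by the augment callbacks), which is what lets drbd_find_overlap() prune whole subtrees. A hedged usage sketch of the API introduced above (hypothetical caller, kernel context assumed, locking and error handling omitted; sectors are 512-byte units, sizes are in bytes):

static void interval_tree_sketch(void)
{
	struct rb_root root = RB_ROOT;
	struct drbd_interval a = { .sector = 0,  .size = 8 * 512 };	/* sectors [0, 8)   */
	struct drbd_interval b = { .sector = 16, .size = 4 * 512 };	/* sectors [16, 20) */
	struct drbd_interval *conflict;

	drbd_clear_interval(&a);
	drbd_clear_interval(&b);
	drbd_insert_interval(&root, &a);
	drbd_insert_interval(&root, &b);

	/* anything overlapping sectors [6, 10)?  finds 'a' */
	conflict = drbd_find_overlap(&root, 6, 4 * 512);
	if (conflict)
		drbd_remove_interval(&root, conflict);
}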
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000000..f38fcb00c10d --- /dev/null +++ b/drivers/block/drbd/drbd_interval.h | |||
@@ -0,0 +1,40 @@ | |||
1 | #ifndef __DRBD_INTERVAL_H | ||
2 | #define __DRBD_INTERVAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <linux/rbtree.h> | ||
6 | |||
7 | struct drbd_interval { | ||
8 | struct rb_node rb; | ||
9 | sector_t sector; /* start sector of the interval */ | ||
10 | unsigned int size; /* size in bytes */ | ||
11 | sector_t end; /* highest interval end in subtree */ | ||
12 | int local:1 /* local or remote request? */; | ||
13 | int waiting:1; | ||
14 | }; | ||
15 | |||
16 | static inline void drbd_clear_interval(struct drbd_interval *i) | ||
17 | { | ||
18 | RB_CLEAR_NODE(&i->rb); | ||
19 | } | ||
20 | |||
21 | static inline bool drbd_interval_empty(struct drbd_interval *i) | ||
22 | { | ||
23 | return RB_EMPTY_NODE(&i->rb); | ||
24 | } | ||
25 | |||
26 | extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); | ||
27 | extern bool drbd_contains_interval(struct rb_root *, sector_t, | ||
28 | struct drbd_interval *); | ||
29 | extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); | ||
30 | extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, | ||
31 | unsigned int); | ||
32 | extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, | ||
33 | unsigned int); | ||
34 | |||
35 | #define drbd_for_each_overlap(i, root, sector, size) \ | ||
36 | for (i = drbd_find_overlap(root, sector, size); \ | ||
37 | i; \ | ||
38 | i = drbd_next_overlap(i, sector, size)) | ||
39 | |||
40 | #endif /* __DRBD_INTERVAL_H */ | ||
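drbd_for_each_overlap() simply chains drbd_find_overlap() and drbd_next_overlap(); a small hypothetical caller might use it like this (kernel context assumed, caller holds whatever lock protects the tree):

static bool range_has_waiter(struct rb_root *root, sector_t sector, unsigned int size)
{
	struct drbd_interval *i;

	drbd_for_each_overlap(i, root, sector, size) {
		if (i->waiting)		/* someone is already blocked on this range */
			return true;
	}
	return false;
}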
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f93a0320e952..8c13eeb83c53 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -56,14 +56,6 @@ | |||
56 | 56 | ||
57 | #include "drbd_vli.h" | 57 | #include "drbd_vli.h" |
58 | 58 | ||
59 | struct after_state_chg_work { | ||
60 | struct drbd_work w; | ||
61 | union drbd_state os; | ||
62 | union drbd_state ns; | ||
63 | enum chg_state_flags flags; | ||
64 | struct completion *done; | ||
65 | }; | ||
66 | |||
67 | static DEFINE_MUTEX(drbd_main_mutex); | 59 | static DEFINE_MUTEX(drbd_main_mutex); |
68 | int drbdd_init(struct drbd_thread *); | 60 | int drbdd_init(struct drbd_thread *); |
69 | int drbd_worker(struct drbd_thread *); | 61 | int drbd_worker(struct drbd_thread *); |
@@ -72,21 +64,17 @@ int drbd_asender(struct drbd_thread *); | |||
72 | int drbd_init(void); | 64 | int drbd_init(void); |
73 | static int drbd_open(struct block_device *bdev, fmode_t mode); | 65 | static int drbd_open(struct block_device *bdev, fmode_t mode); |
74 | static int drbd_release(struct gendisk *gd, fmode_t mode); | 66 | static int drbd_release(struct gendisk *gd, fmode_t mode); |
75 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 67 | static int w_md_sync(struct drbd_work *w, int unused); |
76 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
77 | union drbd_state ns, enum chg_state_flags flags); | ||
78 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
79 | static void md_sync_timer_fn(unsigned long data); | 68 | static void md_sync_timer_fn(unsigned long data); |
80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 69 | static int w_bitmap_io(struct drbd_work *w, int unused); |
81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 70 | static int w_go_diskless(struct drbd_work *w, int unused); |
82 | static void _tl_clear(struct drbd_conf *mdev); | ||
83 | 71 | ||
84 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | 72 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " |
85 | "Lars Ellenberg <lars@linbit.com>"); | 73 | "Lars Ellenberg <lars@linbit.com>"); |
86 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | 74 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); |
87 | MODULE_VERSION(REL_VERSION); | 75 | MODULE_VERSION(REL_VERSION); |
88 | MODULE_LICENSE("GPL"); | 76 | MODULE_LICENSE("GPL"); |
89 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (" | 77 | MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" |
90 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); | 78 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); |
91 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | 79 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); |
92 | 80 | ||
@@ -98,7 +86,6 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!"); | |||
98 | module_param(minor_count, uint, 0444); | 86 | module_param(minor_count, uint, 0444); |
99 | module_param(disable_sendpage, bool, 0644); | 87 | module_param(disable_sendpage, bool, 0644); |
100 | module_param(allow_oos, bool, 0); | 88 | module_param(allow_oos, bool, 0); |
101 | module_param(cn_idx, uint, 0444); | ||
102 | module_param(proc_details, int, 0644); | 89 | module_param(proc_details, int, 0644); |
103 | 90 | ||
104 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 91 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
@@ -120,7 +107,6 @@ module_param(fault_devs, int, 0644); | |||
120 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; | 107 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; |
121 | bool disable_sendpage; | 108 | bool disable_sendpage; |
122 | bool allow_oos; | 109 | bool allow_oos; |
123 | unsigned int cn_idx = CN_IDX_DRBD; | ||
124 | int proc_details; /* Detail level in proc drbd*/ | 110 | int proc_details; /* Detail level in proc drbd*/ |
125 | 111 | ||
126 | /* Module parameter for setting the user mode helper program | 112 | /* Module parameter for setting the user mode helper program |
@@ -132,10 +118,11 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 | |||
132 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | 118 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks |
133 | * as member "struct gendisk *vdisk;" | 119 | * as member "struct gendisk *vdisk;" |
134 | */ | 120 | */ |
135 | struct drbd_conf **minor_table; | 121 | struct idr minors; |
122 | struct list_head drbd_tconns; /* list of struct drbd_tconn */ | ||
136 | 123 | ||
137 | struct kmem_cache *drbd_request_cache; | 124 | struct kmem_cache *drbd_request_cache; |
138 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 125 | struct kmem_cache *drbd_ee_cache; /* peer requests */ |
139 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 126 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
140 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 127 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
141 | mempool_t *drbd_request_mempool; | 128 | mempool_t *drbd_request_mempool; |
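The fixed-size minor_table array is replaced by an idr plus a list of connections, so minor numbers can be sparse without preallocating a table. Only the lookup side is sketched here (idr_init()/idr_find() are long-standing APIs; the allocation helpers changed between kernel versions, so they are omitted):

#include <linux/idr.h>

static struct idr drbd_minors_sketch;	/* maps minor number -> struct drbd_conf * */

static void drbd_minors_sketch_init(void)
{
	idr_init(&drbd_minors_sketch);
}

static struct drbd_conf *minor_to_mdev_sketch(unsigned int minor)
{
	/* idr_find() returns NULL for unused ids; callers typically hold
	 * rcu_read_lock() or the lock that serializes idr updates. */
	return idr_find(&drbd_minors_sketch, minor);
}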
@@ -162,11 +149,6 @@ static const struct block_device_operations drbd_ops = { | |||
162 | .release = drbd_release, | 149 | .release = drbd_release, |
163 | }; | 150 | }; |
164 | 151 | ||
165 | static void bio_destructor_drbd(struct bio *bio) | ||
166 | { | ||
167 | bio_free(bio, drbd_md_io_bio_set); | ||
168 | } | ||
169 | |||
170 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) | 152 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) |
171 | { | 153 | { |
172 | struct bio *bio; | 154 | struct bio *bio; |
@@ -177,7 +159,6 @@ struct bio *bio_alloc_drbd(gfp_t gfp_mask) | |||
177 | bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); | 159 | bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); |
178 | if (!bio) | 160 | if (!bio) |
179 | return NULL; | 161 | return NULL; |
180 | bio->bi_destructor = bio_destructor_drbd; | ||
181 | return bio; | 162 | return bio; |
182 | } | 163 | } |
183 | 164 | ||
@@ -201,158 +182,87 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | |||
201 | #endif | 182 | #endif |
202 | 183 | ||
203 | /** | 184 | /** |
204 | * DOC: The transfer log | 185 | * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch |
205 | * | 186 | * @tconn: DRBD connection. |
206 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
207 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
208 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
209 | * | ||
210 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
211 | * attached. | ||
212 | */ | ||
213 | static int tl_init(struct drbd_conf *mdev) | ||
214 | { | ||
215 | struct drbd_tl_epoch *b; | ||
216 | |||
217 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
218 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
219 | if (!b) | ||
220 | return 0; | ||
221 | INIT_LIST_HEAD(&b->requests); | ||
222 | INIT_LIST_HEAD(&b->w.list); | ||
223 | b->next = NULL; | ||
224 | b->br_number = 4711; | ||
225 | b->n_writes = 0; | ||
226 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
227 | |||
228 | mdev->oldest_tle = b; | ||
229 | mdev->newest_tle = b; | ||
230 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
231 | INIT_LIST_HEAD(&mdev->barrier_acked_requests); | ||
232 | |||
233 | mdev->tl_hash = NULL; | ||
234 | mdev->tl_hash_s = 0; | ||
235 | |||
236 | return 1; | ||
237 | } | ||
238 | |||
239 | static void tl_cleanup(struct drbd_conf *mdev) | ||
240 | { | ||
241 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
242 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
243 | kfree(mdev->oldest_tle); | ||
244 | mdev->oldest_tle = NULL; | ||
245 | kfree(mdev->unused_spare_tle); | ||
246 | mdev->unused_spare_tle = NULL; | ||
247 | kfree(mdev->tl_hash); | ||
248 | mdev->tl_hash = NULL; | ||
249 | mdev->tl_hash_s = 0; | ||
250 | } | ||
251 | |||
252 | /** | ||
253 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
254 | * @mdev: DRBD device. | ||
255 | * @new: Barrier to be added before the current head of the TL. | ||
256 | * | ||
257 | * The caller must hold the req_lock. | ||
258 | */ | ||
259 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
260 | { | ||
261 | struct drbd_tl_epoch *newest_before; | ||
262 | |||
263 | INIT_LIST_HEAD(&new->requests); | ||
264 | INIT_LIST_HEAD(&new->w.list); | ||
265 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
266 | new->next = NULL; | ||
267 | new->n_writes = 0; | ||
268 | |||
269 | newest_before = mdev->newest_tle; | ||
270 | new->br_number = newest_before->br_number+1; | ||
271 | if (mdev->newest_tle != new) { | ||
272 | mdev->newest_tle->next = new; | ||
273 | mdev->newest_tle = new; | ||
274 | } | ||
275 | } | ||
276 | |||
277 | /** | ||
278 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
279 | * @mdev: DRBD device. | ||
280 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | 187 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. |
281 | * @set_size: Expected number of requests before that barrier. | 188 | * @set_size: Expected number of requests before that barrier. |
282 | * | 189 | * |
283 | * In case the passed barrier_nr or set_size does not match the oldest | 190 | * In case the passed barrier_nr or set_size does not match the oldest |
284 | * &struct drbd_tl_epoch objects this function will cause a termination | 191 | * epoch of not yet barrier-acked requests, this function will cause a |
285 | * of the connection. | 192 | * termination of the connection. |
286 | */ | 193 | */ |
287 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | 194 | void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, |
288 | unsigned int set_size) | 195 | unsigned int set_size) |
289 | { | 196 | { |
290 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
291 | struct list_head *le, *tle; | ||
292 | struct drbd_request *r; | 197 | struct drbd_request *r; |
293 | 198 | struct drbd_request *req = NULL; | |
294 | spin_lock_irq(&mdev->req_lock); | 199 | int expect_epoch = 0; |
295 | 200 | int expect_size = 0; | |
296 | b = mdev->oldest_tle; | 201 | |
202 | spin_lock_irq(&tconn->req_lock); | ||
203 | |||
204 | /* find oldest not yet barrier-acked write request, | ||
205 | * count writes in its epoch. */ | ||
206 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
207 | const unsigned s = r->rq_state; | ||
208 | if (!req) { | ||
209 | if (!(s & RQ_WRITE)) | ||
210 | continue; | ||
211 | if (!(s & RQ_NET_MASK)) | ||
212 | continue; | ||
213 | if (s & RQ_NET_DONE) | ||
214 | continue; | ||
215 | req = r; | ||
216 | expect_epoch = req->epoch; | ||
217 | expect_size ++; | ||
218 | } else { | ||
219 | if (r->epoch != expect_epoch) | ||
220 | break; | ||
221 | if (!(s & RQ_WRITE)) | ||
222 | continue; | ||
223 | /* if (s & RQ_DONE): not expected */ | ||
224 | /* if (!(s & RQ_NET_MASK)): not expected */ | ||
225 | expect_size++; | ||
226 | } | ||
227 | } | ||
297 | 228 | ||
298 | /* first some paranoia code */ | 229 | /* first some paranoia code */ |
299 | if (b == NULL) { | 230 | if (req == NULL) { |
300 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | 231 | conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", |
301 | barrier_nr); | 232 | barrier_nr); |
302 | goto bail; | 233 | goto bail; |
303 | } | 234 | } |
304 | if (b->br_number != barrier_nr) { | 235 | if (expect_epoch != barrier_nr) { |
305 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | 236 | conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", |
306 | barrier_nr, b->br_number); | 237 | barrier_nr, expect_epoch); |
307 | goto bail; | 238 | goto bail; |
308 | } | 239 | } |
309 | if (b->n_writes != set_size) { | 240 | |
310 | dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", | 241 | if (expect_size != set_size) { |
311 | barrier_nr, set_size, b->n_writes); | 242 | conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", |
243 | barrier_nr, set_size, expect_size); | ||
312 | goto bail; | 244 | goto bail; |
313 | } | 245 | } |
314 | 246 | ||
315 | /* Clean up list of requests processed during current epoch */ | 247 | /* Clean up list of requests processed during current epoch. */ |
316 | list_for_each_safe(le, tle, &b->requests) { | 248 | /* this extra list walk restart is paranoia, |
317 | r = list_entry(le, struct drbd_request, tl_requests); | 249 | * to catch requests being barrier-acked "unexpectedly". |
318 | _req_mod(r, barrier_acked); | 250 | * It usually should find the same req again, or some READ preceding it. */ |
319 | } | 251 | list_for_each_entry(req, &tconn->transfer_log, tl_requests) |
320 | /* There could be requests on the list waiting for completion | 252 | if (req->epoch == expect_epoch) |
321 | of the write to the local disk. To avoid corruptions of | 253 | break; |
322 | slab's data structures we have to remove the lists head. | 254 | list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) { |
323 | 255 | if (req->epoch != expect_epoch) | |
324 | Also there could have been a barrier ack out of sequence, overtaking | 256 | break; |
325 | the write acks - which would be a bug and violating write ordering. | 257 | _req_mod(req, BARRIER_ACKED); |
326 | To not deadlock in case we lose connection while such requests are | ||
327 | still pending, we need some way to find them for the | ||
328 | _req_mode(connection_lost_while_pending). | ||
329 | |||
330 | These have been list_move'd to the out_of_sequence_requests list in | ||
331 | _req_mod(, barrier_acked) above. | ||
332 | */ | ||
333 | list_splice_init(&b->requests, &mdev->barrier_acked_requests); | ||
334 | |||
335 | nob = b->next; | ||
336 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
337 | _tl_add_barrier(mdev, b); | ||
338 | if (nob) | ||
339 | mdev->oldest_tle = nob; | ||
340 | /* if nob == NULL b was the only barrier, and becomes the new | ||
341 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
342 | } else { | ||
343 | D_ASSERT(nob != NULL); | ||
344 | mdev->oldest_tle = nob; | ||
345 | kfree(b); | ||
346 | } | 258 | } |
347 | 259 | spin_unlock_irq(&tconn->req_lock); | |
348 | spin_unlock_irq(&mdev->req_lock); | ||
349 | dec_ap_pending(mdev); | ||
350 | 260 | ||
351 | return; | 261 | return; |
352 | 262 | ||
353 | bail: | 263 | bail: |
354 | spin_unlock_irq(&mdev->req_lock); | 264 | spin_unlock_irq(&tconn->req_lock); |
355 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 265 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
356 | } | 266 | } |
357 | 267 | ||
358 | 268 | ||
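The rewritten tl_release() above no longer walks per-epoch drbd_tl_epoch lists: one pass over the single transfer log finds the oldest not yet barrier-acked write and counts the writes of its epoch, and a second pass (restarted on purpose, as the in-code comment notes) marks exactly that epoch BARRIER_ACKED. A generic sketch of that two-pass list walk, using the same list_for_each_entry*() macros (kernel context assumed; 'struct item' is a made-up stand-in for struct drbd_request, and list_del() stands in for _req_mod()):

struct item {
	struct list_head list;
	int epoch;
};

static bool ack_oldest_epoch(struct list_head *log, int expect_size)
{
	struct item *it, *tmp, *first = NULL;
	int epoch = 0, count = 0;

	/* pass 1: find the first element of the oldest group and size it */
	list_for_each_entry(it, log, list) {
		if (!first) {
			first = it;
			epoch = it->epoch;
		}
		if (it->epoch != epoch)
			break;
		count++;
	}
	if (!first || count != expect_size)
		return false;		/* the caller treats this as a protocol error */

	/* pass 2: process exactly that group, resuming from 'first' */
	it = first;
	list_for_each_entry_safe_from(it, tmp, log, list) {
		if (it->epoch != epoch)
			break;
		list_del(&it->list);	/* stand-in for _req_mod(req, BARRIER_ACKED) */
	}
	return true;
}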
@@ -361,85 +271,24 @@ bail: | |||
361 | * @mdev: DRBD device. | 271 | * @mdev: DRBD device. |
362 | * @what: The action/event to perform with all request objects | 272 | * @what: The action/event to perform with all request objects |
363 | * | 273 | * |
364 | * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, | 274 | * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, |
365 | * restart_frozen_disk_io. | 275 | * RESTART_FROZEN_DISK_IO. |
366 | */ | 276 | */ |
367 | static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | 277 | /* must hold resource->req_lock */ |
368 | { | 278 | void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) |
369 | struct drbd_tl_epoch *b, *tmp, **pn; | 279 | { |
370 | struct list_head *le, *tle, carry_reads; | 280 | struct drbd_request *req, *r; |
371 | struct drbd_request *req; | ||
372 | int rv, n_writes, n_reads; | ||
373 | |||
374 | b = mdev->oldest_tle; | ||
375 | pn = &mdev->oldest_tle; | ||
376 | while (b) { | ||
377 | n_writes = 0; | ||
378 | n_reads = 0; | ||
379 | INIT_LIST_HEAD(&carry_reads); | ||
380 | list_for_each_safe(le, tle, &b->requests) { | ||
381 | req = list_entry(le, struct drbd_request, tl_requests); | ||
382 | rv = _req_mod(req, what); | ||
383 | |||
384 | n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; | ||
385 | n_reads += (rv & MR_READ) >> MR_READ_SHIFT; | ||
386 | } | ||
387 | tmp = b->next; | ||
388 | |||
389 | if (n_writes) { | ||
390 | if (what == resend) { | ||
391 | b->n_writes = n_writes; | ||
392 | if (b->w.cb == NULL) { | ||
393 | b->w.cb = w_send_barrier; | ||
394 | inc_ap_pending(mdev); | ||
395 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
396 | } | ||
397 | |||
398 | drbd_queue_work(&mdev->data.work, &b->w); | ||
399 | } | ||
400 | pn = &b->next; | ||
401 | } else { | ||
402 | if (n_reads) | ||
403 | list_add(&carry_reads, &b->requests); | ||
404 | /* there could still be requests on that ring list, | ||
405 | * in case local io is still pending */ | ||
406 | list_del(&b->requests); | ||
407 | |||
408 | /* dec_ap_pending corresponding to queue_barrier. | ||
409 | * the newest barrier may not have been queued yet, | ||
410 | * in which case w.cb is still NULL. */ | ||
411 | if (b->w.cb != NULL) | ||
412 | dec_ap_pending(mdev); | ||
413 | |||
414 | if (b == mdev->newest_tle) { | ||
415 | /* recycle, but reinit! */ | ||
416 | D_ASSERT(tmp == NULL); | ||
417 | INIT_LIST_HEAD(&b->requests); | ||
418 | list_splice(&carry_reads, &b->requests); | ||
419 | INIT_LIST_HEAD(&b->w.list); | ||
420 | b->w.cb = NULL; | ||
421 | b->br_number = net_random(); | ||
422 | b->n_writes = 0; | ||
423 | |||
424 | *pn = b; | ||
425 | break; | ||
426 | } | ||
427 | *pn = tmp; | ||
428 | kfree(b); | ||
429 | } | ||
430 | b = tmp; | ||
431 | list_splice(&carry_reads, &b->requests); | ||
432 | } | ||
433 | |||
434 | /* Actions operating on the disk state, also want to work on | ||
435 | requests that got barrier acked. */ | ||
436 | 281 | ||
437 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | 282 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) |
438 | req = list_entry(le, struct drbd_request, tl_requests); | ||
439 | _req_mod(req, what); | 283 | _req_mod(req, what); |
440 | } | ||
441 | } | 284 | } |
442 | 285 | ||
286 | void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) | ||
287 | { | ||
288 | spin_lock_irq(&tconn->req_lock); | ||
289 | _tl_restart(tconn, what); | ||
290 | spin_unlock_irq(&tconn->req_lock); | ||
291 | } | ||
443 | 292 | ||
444 | /** | 293 | /** |
445 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | 294 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL |
@@ -449,43 +298,9 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
449 | * by the requests on the transfer log gets marked as out of sync. Called from the | 298 | by the requests on the transfer log gets marked as out of sync. Called from the |
450 | * receiver thread and the worker thread. | 299 | * receiver thread and the worker thread. |
451 | */ | 300 | */ |
452 | void tl_clear(struct drbd_conf *mdev) | 301 | void tl_clear(struct drbd_tconn *tconn) |
453 | { | 302 | { |
454 | spin_lock_irq(&mdev->req_lock); | 303 | tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); |
455 | _tl_clear(mdev); | ||
456 | spin_unlock_irq(&mdev->req_lock); | ||
457 | } | ||
458 | |||
459 | static void _tl_clear(struct drbd_conf *mdev) | ||
460 | { | ||
461 | struct list_head *le, *tle; | ||
462 | struct drbd_request *r; | ||
463 | |||
464 | _tl_restart(mdev, connection_lost_while_pending); | ||
465 | |||
466 | /* we expect this list to be empty. */ | ||
467 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
468 | |||
469 | /* but just in case, clean it up anyways! */ | ||
470 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
471 | r = list_entry(le, struct drbd_request, tl_requests); | ||
472 | /* It would be nice to complete outside of spinlock. | ||
473 | * But this is easier for now. */ | ||
474 | _req_mod(r, connection_lost_while_pending); | ||
475 | } | ||
476 | |||
477 | /* ensure bit indicating barrier is required is clear */ | ||
478 | clear_bit(CREATE_BARRIER, &mdev->flags); | ||
479 | |||
480 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
481 | |||
482 | } | ||
483 | |||
484 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | ||
485 | { | ||
486 | spin_lock_irq(&mdev->req_lock); | ||
487 | _tl_restart(mdev, what); | ||
488 | spin_unlock_irq(&mdev->req_lock); | ||
489 | } | 304 | } |
490 | 305 | ||
491 | /** | 306 | /** |
@@ -494,1377 +309,131 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
494 | */ | 309 | */ |
495 | void tl_abort_disk_io(struct drbd_conf *mdev) | 310 | void tl_abort_disk_io(struct drbd_conf *mdev) |
496 | { | 311 | { |
497 | struct drbd_tl_epoch *b; | 312 | struct drbd_tconn *tconn = mdev->tconn; |
498 | struct list_head *le, *tle; | 313 | struct drbd_request *req, *r; |
499 | struct drbd_request *req; | ||
500 | |||
501 | spin_lock_irq(&mdev->req_lock); | ||
502 | b = mdev->oldest_tle; | ||
503 | while (b) { | ||
504 | list_for_each_safe(le, tle, &b->requests) { | ||
505 | req = list_entry(le, struct drbd_request, tl_requests); | ||
506 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | ||
507 | continue; | ||
508 | _req_mod(req, abort_disk_io); | ||
509 | } | ||
510 | b = b->next; | ||
511 | } | ||
512 | 314 | ||
513 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | 315 | spin_lock_irq(&tconn->req_lock); |
514 | req = list_entry(le, struct drbd_request, tl_requests); | 316 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { |
515 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | 317 | if (!(req->rq_state & RQ_LOCAL_PENDING)) |
516 | continue; | 318 | continue; |
517 | _req_mod(req, abort_disk_io); | 319 | if (req->w.mdev != mdev) |
518 | } | 320 | continue; |
519 | 321 | _req_mod(req, ABORT_DISK_IO); | |
520 | spin_unlock_irq(&mdev->req_lock); | ||
521 | } | ||
522 | |||
523 | /** | ||
524 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
525 | * @mdev: DRBD device. | ||
526 | * @os: old (current) state. | ||
527 | * @ns: new (wanted) state. | ||
528 | */ | ||
529 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
530 | union drbd_state os, union drbd_state ns) | ||
531 | { | ||
532 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
533 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
534 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
535 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
536 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
537 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
538 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
539 | } | ||
540 | |||
541 | enum drbd_state_rv | ||
542 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
543 | union drbd_state mask, union drbd_state val) | ||
544 | { | ||
545 | unsigned long flags; | ||
546 | union drbd_state os, ns; | ||
547 | enum drbd_state_rv rv; | ||
548 | |||
549 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
550 | os = mdev->state; | ||
551 | ns.i = (os.i & ~mask.i) | val.i; | ||
552 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
553 | ns = mdev->state; | ||
554 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
555 | |||
556 | return rv; | ||
557 | } | ||
558 | |||
559 | /** | ||
560 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
561 | * @mdev: DRBD device. | ||
562 | * @mask: mask of state bits to change. | ||
563 | * @val: value of new state bits. | ||
564 | */ | ||
565 | void drbd_force_state(struct drbd_conf *mdev, | ||
566 | union drbd_state mask, union drbd_state val) | ||
567 | { | ||
568 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
569 | } | ||
570 | |||
571 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
572 | static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, | ||
573 | union drbd_state, | ||
574 | union drbd_state); | ||
575 | enum sanitize_state_warnings { | ||
576 | NO_WARNING, | ||
577 | ABORTED_ONLINE_VERIFY, | ||
578 | ABORTED_RESYNC, | ||
579 | CONNECTION_LOST_NEGOTIATING, | ||
580 | IMPLICITLY_UPGRADED_DISK, | ||
581 | IMPLICITLY_UPGRADED_PDSK, | ||
582 | }; | ||
583 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
584 | union drbd_state ns, enum sanitize_state_warnings *warn); | ||
585 | int drbd_send_state_req(struct drbd_conf *, | ||
586 | union drbd_state, union drbd_state); | ||
587 | |||
588 | static enum drbd_state_rv | ||
589 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
590 | union drbd_state val) | ||
591 | { | ||
592 | union drbd_state os, ns; | ||
593 | unsigned long flags; | ||
594 | enum drbd_state_rv rv; | ||
595 | |||
596 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
597 | return SS_CW_SUCCESS; | ||
598 | |||
599 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
600 | return SS_CW_FAILED_BY_PEER; | ||
601 | |||
602 | rv = 0; | ||
603 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
604 | os = mdev->state; | ||
605 | ns.i = (os.i & ~mask.i) | val.i; | ||
606 | ns = sanitize_state(mdev, os, ns, NULL); | ||
607 | |||
608 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
609 | rv = SS_CW_NO_NEED; | ||
610 | if (!rv) { | ||
611 | rv = is_valid_state(mdev, ns); | ||
612 | if (rv == SS_SUCCESS) { | ||
613 | rv = is_valid_state_transition(mdev, ns, os); | ||
614 | if (rv == SS_SUCCESS) | ||
615 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
616 | } | ||
617 | } | ||
618 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
619 | |||
620 | return rv; | ||
621 | } | ||
622 | |||
623 | /** | ||
624 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
625 | * @mdev: DRBD device. | ||
626 | * @mask: mask of state bits to change. | ||
627 | * @val: value of new state bits. | ||
628 | * @f: flags | ||
629 | * | ||
630 | * Should not be called directly, use drbd_request_state() or | ||
631 | * _drbd_request_state(). | ||
632 | */ | ||
633 | static enum drbd_state_rv | ||
634 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
635 | union drbd_state val, enum chg_state_flags f) | ||
636 | { | ||
637 | struct completion done; | ||
638 | unsigned long flags; | ||
639 | union drbd_state os, ns; | ||
640 | enum drbd_state_rv rv; | ||
641 | |||
642 | init_completion(&done); | ||
643 | |||
644 | if (f & CS_SERIALIZE) | ||
645 | mutex_lock(&mdev->state_mutex); | ||
646 | |||
647 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
648 | os = mdev->state; | ||
649 | ns.i = (os.i & ~mask.i) | val.i; | ||
650 | ns = sanitize_state(mdev, os, ns, NULL); | ||
651 | |||
652 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
653 | rv = is_valid_state(mdev, ns); | ||
654 | if (rv == SS_SUCCESS) | ||
655 | rv = is_valid_state_transition(mdev, ns, os); | ||
656 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
657 | |||
658 | if (rv < SS_SUCCESS) { | ||
659 | if (f & CS_VERBOSE) | ||
660 | print_st_err(mdev, os, ns, rv); | ||
661 | goto abort; | ||
662 | } | ||
663 | |||
664 | drbd_state_lock(mdev); | ||
665 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
666 | drbd_state_unlock(mdev); | ||
667 | rv = SS_CW_FAILED_BY_PEER; | ||
668 | if (f & CS_VERBOSE) | ||
669 | print_st_err(mdev, os, ns, rv); | ||
670 | goto abort; | ||
671 | } | ||
672 | |||
673 | wait_event(mdev->state_wait, | ||
674 | (rv = _req_st_cond(mdev, mask, val))); | ||
675 | |||
676 | if (rv < SS_SUCCESS) { | ||
677 | drbd_state_unlock(mdev); | ||
678 | if (f & CS_VERBOSE) | ||
679 | print_st_err(mdev, os, ns, rv); | ||
680 | goto abort; | ||
681 | } | ||
682 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
683 | os = mdev->state; | ||
684 | ns.i = (os.i & ~mask.i) | val.i; | ||
685 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
686 | drbd_state_unlock(mdev); | ||
687 | } else { | ||
688 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
689 | } | ||
690 | |||
691 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
692 | |||
693 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
694 | D_ASSERT(current != mdev->worker.task); | ||
695 | wait_for_completion(&done); | ||
696 | } | ||
697 | |||
698 | abort: | ||
699 | if (f & CS_SERIALIZE) | ||
700 | mutex_unlock(&mdev->state_mutex); | ||
701 | |||
702 | return rv; | ||
703 | } | ||
704 | |||
705 | /** | ||
706 | * _drbd_request_state() - Request a state change (with flags) | ||
707 | * @mdev: DRBD device. | ||
708 | * @mask: mask of state bits to change. | ||
709 | * @val: value of new state bits. | ||
710 | * @f: flags | ||
711 | * | ||
712 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
713 | * flag, or when logging of failed state change requests is not desired. | ||
714 | */ | ||
715 | enum drbd_state_rv | ||
716 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
717 | union drbd_state val, enum chg_state_flags f) | ||
718 | { | ||
719 | enum drbd_state_rv rv; | ||
720 | |||
721 | wait_event(mdev->state_wait, | ||
722 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
723 | |||
724 | return rv; | ||
725 | } | ||
726 | |||
727 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
728 | { | ||
729 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
730 | name, | ||
731 | drbd_conn_str(ns.conn), | ||
732 | drbd_role_str(ns.role), | ||
733 | drbd_role_str(ns.peer), | ||
734 | drbd_disk_str(ns.disk), | ||
735 | drbd_disk_str(ns.pdsk), | ||
736 | is_susp(ns) ? 's' : 'r', | ||
737 | ns.aftr_isp ? 'a' : '-', | ||
738 | ns.peer_isp ? 'p' : '-', | ||
739 | ns.user_isp ? 'u' : '-' | ||
740 | ); | ||
741 | } | ||
742 | |||
743 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
744 | union drbd_state ns, enum drbd_state_rv err) | ||
745 | { | ||
746 | if (err == SS_IN_TRANSIENT_STATE) | ||
747 | return; | ||
748 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
749 | print_st(mdev, " state", os); | ||
750 | print_st(mdev, "wanted", ns); | ||
751 | } | ||
752 | |||
753 | |||
754 | /** | ||
755 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
756 | * @mdev: DRBD device. | ||
757 | * @ns: State to consider. | ||
758 | */ | ||
759 | static enum drbd_state_rv | ||
760 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
761 | { | ||
762 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
763 | |||
764 | enum drbd_fencing_p fp; | ||
765 | enum drbd_state_rv rv = SS_SUCCESS; | ||
766 | |||
767 | fp = FP_DONT_CARE; | ||
768 | if (get_ldev(mdev)) { | ||
769 | fp = mdev->ldev->dc.fencing; | ||
770 | put_ldev(mdev); | ||
771 | } | ||
772 | |||
773 | if (get_net_conf(mdev)) { | ||
774 | if (!mdev->net_conf->two_primaries && | ||
775 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
776 | rv = SS_TWO_PRIMARIES; | ||
777 | put_net_conf(mdev); | ||
778 | } | ||
779 | |||
780 | if (rv <= 0) | ||
781 | /* already found a reason to abort */; | ||
782 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
783 | rv = SS_DEVICE_IN_USE; | ||
784 | |||
785 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
786 | rv = SS_NO_UP_TO_DATE_DISK; | ||
787 | |||
788 | else if (fp >= FP_RESOURCE && | ||
789 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
790 | rv = SS_PRIMARY_NOP; | ||
791 | |||
792 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
793 | rv = SS_NO_UP_TO_DATE_DISK; | ||
794 | |||
795 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
796 | rv = SS_NO_LOCAL_DISK; | ||
797 | |||
798 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
799 | rv = SS_NO_REMOTE_DISK; | ||
800 | |||
801 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
802 | rv = SS_NO_UP_TO_DATE_DISK; | ||
803 | |||
804 | else if ((ns.conn == C_CONNECTED || | ||
805 | ns.conn == C_WF_BITMAP_S || | ||
806 | ns.conn == C_SYNC_SOURCE || | ||
807 | ns.conn == C_PAUSED_SYNC_S) && | ||
808 | ns.disk == D_OUTDATED) | ||
809 | rv = SS_CONNECTED_OUTDATES; | ||
810 | |||
811 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
812 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
813 | rv = SS_NO_VERIFY_ALG; | ||
814 | |||
815 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
816 | mdev->agreed_pro_version < 88) | ||
817 | rv = SS_NOT_SUPPORTED; | ||
818 | |||
819 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
820 | rv = SS_CONNECTED_OUTDATES; | ||
821 | |||
822 | return rv; | ||
823 | } | ||
824 | |||
825 | /** | ||
826 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
827 | * @mdev: DRBD device. | ||
828 | * @ns: new state. | ||
829 | * @os: old state. | ||
830 | */ | ||
831 | static enum drbd_state_rv | ||
832 | is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, | ||
833 | union drbd_state os) | ||
834 | { | ||
835 | enum drbd_state_rv rv = SS_SUCCESS; | ||
836 | |||
837 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
838 | os.conn > C_CONNECTED) | ||
839 | rv = SS_RESYNC_RUNNING; | ||
840 | |||
841 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
842 | rv = SS_ALREADY_STANDALONE; | ||
843 | |||
844 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
845 | rv = SS_IS_DISKLESS; | ||
846 | |||
847 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
848 | rv = SS_NO_NET_CONFIG; | ||
849 | |||
850 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
851 | rv = SS_LOWER_THAN_OUTDATED; | ||
852 | |||
853 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
854 | rv = SS_IN_TRANSIENT_STATE; | ||
855 | |||
856 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
857 | rv = SS_IN_TRANSIENT_STATE; | ||
858 | |||
859 | /* While establishing a connection only allow cstate to change. | ||
860 | Delay/refuse role changes, detach attach etc... */ | ||
861 | if (test_bit(STATE_SENT, &mdev->flags) && | ||
862 | !(os.conn == C_WF_REPORT_PARAMS || | ||
863 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
864 | rv = SS_IN_TRANSIENT_STATE; | ||
865 | |||
866 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
867 | rv = SS_NEED_CONNECTION; | ||
868 | |||
869 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
870 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
871 | rv = SS_RESYNC_RUNNING; | ||
872 | |||
873 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
874 | os.conn < C_CONNECTED) | ||
875 | rv = SS_NEED_CONNECTION; | ||
876 | |||
877 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
878 | && os.conn < C_WF_REPORT_PARAMS) | ||
879 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
880 | |||
881 | return rv; | ||
882 | } | ||
883 | |||
884 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
885 | { | ||
886 | static const char *msg_table[] = { | ||
887 | [NO_WARNING] = "", | ||
888 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
889 | [ABORTED_RESYNC] = "Resync aborted.", | ||
890 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
891 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
892 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
893 | }; | ||
894 | |||
895 | if (warn != NO_WARNING) | ||
896 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
897 | } | ||
898 | |||
899 | /** | ||
900 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
901 | * @mdev: DRBD device. | ||
902 | * @os: old state. | ||
903 | * @ns: new state. | ||
904 | * @warn_sync_abort: | ||
905 | * | ||
906 | * When we loose connection, we have to set the state of the peers disk (pdsk) | ||
907 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
908 | */ | ||
909 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
910 | union drbd_state ns, enum sanitize_state_warnings *warn) | ||
911 | { | ||
912 | enum drbd_fencing_p fp; | ||
913 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
914 | |||
915 | if (warn) | ||
916 | *warn = NO_WARNING; | ||
917 | |||
918 | fp = FP_DONT_CARE; | ||
919 | if (get_ldev(mdev)) { | ||
920 | fp = mdev->ldev->dc.fencing; | ||
921 | put_ldev(mdev); | ||
922 | } | ||
923 | |||
924 | /* Disallow Network errors to configure a device's network part */ | ||
925 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
926 | os.conn <= C_DISCONNECTING) | ||
927 | ns.conn = os.conn; | ||
928 | |||
929 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. | ||
930 | * If you try to go into some Sync* state, that shall fail (elsewhere). */ | ||
931 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
932 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) | ||
933 | ns.conn = os.conn; | ||
934 | |||
935 | /* we cannot fail (again) if we already detached */ | ||
936 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
937 | ns.disk = D_DISKLESS; | ||
938 | |||
939 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
940 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
941 | ns.conn = os.conn; | ||
942 | |||
943 | if (ns.conn < C_CONNECTED) { | ||
944 | ns.peer_isp = 0; | ||
945 | ns.peer = R_UNKNOWN; | ||
946 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
947 | ns.pdsk = D_UNKNOWN; | ||
948 | } | ||
949 | |||
950 | /* Clear the aftr_isp when becoming unconfigured */ | ||
951 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
952 | ns.aftr_isp = 0; | ||
953 | |||
954 | /* Abort resync if a disk fails/detaches */ | ||
955 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
956 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
957 | if (warn) | ||
958 | *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? | ||
959 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
960 | ns.conn = C_CONNECTED; | ||
961 | } | ||
962 | |||
963 | /* Connection breaks down before we finished "Negotiating" */ | ||
964 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
965 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
966 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
967 | ns.disk = mdev->new_state_tmp.disk; | ||
968 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
969 | } else { | ||
970 | if (warn) | ||
971 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
972 | ns.disk = D_DISKLESS; | ||
973 | ns.pdsk = D_UNKNOWN; | ||
974 | } | ||
975 | put_ldev(mdev); | ||
976 | } | ||
977 | |||
978 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
979 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
980 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
981 | ns.disk = D_UP_TO_DATE; | ||
982 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
983 | ns.pdsk = D_UP_TO_DATE; | ||
984 | } | ||
985 | |||
986 | /* Implications of the connection state on the disk states */ | ||
987 | disk_min = D_DISKLESS; | ||
988 | disk_max = D_UP_TO_DATE; | ||
989 | pdsk_min = D_INCONSISTENT; | ||
990 | pdsk_max = D_UNKNOWN; | ||
991 | switch ((enum drbd_conns)ns.conn) { | ||
992 | case C_WF_BITMAP_T: | ||
993 | case C_PAUSED_SYNC_T: | ||
994 | case C_STARTING_SYNC_T: | ||
995 | case C_WF_SYNC_UUID: | ||
996 | case C_BEHIND: | ||
997 | disk_min = D_INCONSISTENT; | ||
998 | disk_max = D_OUTDATED; | ||
999 | pdsk_min = D_UP_TO_DATE; | ||
1000 | pdsk_max = D_UP_TO_DATE; | ||
1001 | break; | ||
1002 | case C_VERIFY_S: | ||
1003 | case C_VERIFY_T: | ||
1004 | disk_min = D_UP_TO_DATE; | ||
1005 | disk_max = D_UP_TO_DATE; | ||
1006 | pdsk_min = D_UP_TO_DATE; | ||
1007 | pdsk_max = D_UP_TO_DATE; | ||
1008 | break; | ||
1009 | case C_CONNECTED: | ||
1010 | disk_min = D_DISKLESS; | ||
1011 | disk_max = D_UP_TO_DATE; | ||
1012 | pdsk_min = D_DISKLESS; | ||
1013 | pdsk_max = D_UP_TO_DATE; | ||
1014 | break; | ||
1015 | case C_WF_BITMAP_S: | ||
1016 | case C_PAUSED_SYNC_S: | ||
1017 | case C_STARTING_SYNC_S: | ||
1018 | case C_AHEAD: | ||
1019 | disk_min = D_UP_TO_DATE; | ||
1020 | disk_max = D_UP_TO_DATE; | ||
1021 | pdsk_min = D_INCONSISTENT; | ||
1022 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */ | ||
1023 | break; | ||
1024 | case C_SYNC_TARGET: | ||
1025 | disk_min = D_INCONSISTENT; | ||
1026 | disk_max = D_INCONSISTENT; | ||
1027 | pdsk_min = D_UP_TO_DATE; | ||
1028 | pdsk_max = D_UP_TO_DATE; | ||
1029 | break; | ||
1030 | case C_SYNC_SOURCE: | ||
1031 | disk_min = D_UP_TO_DATE; | ||
1032 | disk_max = D_UP_TO_DATE; | ||
1033 | pdsk_min = D_INCONSISTENT; | ||
1034 | pdsk_max = D_INCONSISTENT; | ||
1035 | break; | ||
1036 | case C_STANDALONE: | ||
1037 | case C_DISCONNECTING: | ||
1038 | case C_UNCONNECTED: | ||
1039 | case C_TIMEOUT: | ||
1040 | case C_BROKEN_PIPE: | ||
1041 | case C_NETWORK_FAILURE: | ||
1042 | case C_PROTOCOL_ERROR: | ||
1043 | case C_TEAR_DOWN: | ||
1044 | case C_WF_CONNECTION: | ||
1045 | case C_WF_REPORT_PARAMS: | ||
1046 | case C_MASK: | ||
1047 | break; | ||
1048 | } | ||
1049 | if (ns.disk > disk_max) | ||
1050 | ns.disk = disk_max; | ||
1051 | |||
1052 | if (ns.disk < disk_min) { | ||
1053 | if (warn) | ||
1054 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
1055 | ns.disk = disk_min; | ||
1056 | } | ||
1057 | if (ns.pdsk > pdsk_max) | ||
1058 | ns.pdsk = pdsk_max; | ||
1059 | |||
1060 | if (ns.pdsk < pdsk_min) { | ||
1061 | if (warn) | ||
1062 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
1063 | ns.pdsk = pdsk_min; | ||
1064 | } | ||
1065 | |||
1066 | if (fp == FP_STONITH && | ||
1067 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && | ||
1068 | !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) | ||
1069 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
1070 | |||
1071 | if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && | ||
1072 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && | ||
1073 | !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) | ||
1074 | ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */ | ||
1075 | |||
1076 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
1077 | if (ns.conn == C_SYNC_SOURCE) | ||
1078 | ns.conn = C_PAUSED_SYNC_S; | ||
1079 | if (ns.conn == C_SYNC_TARGET) | ||
1080 | ns.conn = C_PAUSED_SYNC_T; | ||
1081 | } else { | ||
1082 | if (ns.conn == C_PAUSED_SYNC_S) | ||
1083 | ns.conn = C_SYNC_SOURCE; | ||
1084 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1085 | ns.conn = C_SYNC_TARGET; | ||
1086 | } | ||
1087 | |||
1088 | return ns; | ||
1089 | } | ||
1090 | |||
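The switch above reduces to one rule: the connection state defines a [min, max] window for the local and peer disk states, downgrades are applied silently, and only an implicit upgrade is reported (IMPLICITLY_UPGRADED_DISK/_PDSK). A stand-alone sketch of that clamping, with a simplified enum that merely mimics the ordering of the real D_* values and is not driver code:

#include <stdio.h>

/* simplified stand-in for the drbd_disk_state ordering */
enum demo_disk { DISKLESS, INCONSISTENT, OUTDATED, CONSISTENT, UP_TO_DATE };

static const char *disk_name(enum demo_disk d)
{
	static const char *tab[] = {
		"Diskless", "Inconsistent", "Outdated", "Consistent", "UpToDate"
	};
	return tab[d];
}

/* clamp a disk state into the window allowed by the connection state */
static enum demo_disk clamp_disk(enum demo_disk d, enum demo_disk min,
				 enum demo_disk max, int *warned)
{
	if (d > max)
		d = max;	/* silent downgrade */
	if (d < min) {
		*warned = 1;	/* corresponds to IMPLICITLY_UPGRADED_DISK/_PDSK */
		d = min;
	}
	return d;
}

int main(void)
{
	int warned = 0;

	/* bitmap-exchange target states cap the local disk below UpToDate */
	enum demo_disk a = clamp_disk(UP_TO_DATE, INCONSISTENT, OUTDATED, &warned);
	/* an online-verify run requires UpToDate on both sides */
	enum demo_disk b = clamp_disk(OUTDATED, UP_TO_DATE, UP_TO_DATE, &warned);

	printf("a=%s b=%s warned=%d\n", disk_name(a), disk_name(b), warned);
	return 0;
}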
1091 | /* helper for __drbd_set_state */ | ||
1092 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
1093 | { | ||
1094 | if (mdev->agreed_pro_version < 90) | ||
1095 | mdev->ov_start_sector = 0; | ||
1096 | mdev->rs_total = drbd_bm_bits(mdev); | ||
1097 | mdev->ov_position = 0; | ||
1098 | if (cs == C_VERIFY_T) { | ||
1099 | /* starting online verify from an arbitrary position | ||
1100 | * does not fit well into the existing protocol. | ||
1101 | * on C_VERIFY_T, we initialize ov_left and friends | ||
1102 | * implicitly in receive_DataRequest once the | ||
1103 | * first P_OV_REQUEST is received */ | ||
1104 | mdev->ov_start_sector = ~(sector_t)0; | ||
1105 | } else { | ||
1106 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
1107 | if (bit >= mdev->rs_total) { | ||
1108 | mdev->ov_start_sector = | ||
1109 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
1110 | mdev->rs_total = 1; | ||
1111 | } else | ||
1112 | mdev->rs_total -= bit; | ||
1113 | mdev->ov_position = mdev->ov_start_sector; | ||
1114 | } | ||
1115 | mdev->ov_left = mdev->rs_total; | ||
1116 | } | ||
1117 | |||
1118 | static void drbd_resume_al(struct drbd_conf *mdev) | ||
1119 | { | ||
1120 | if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) | ||
1121 | dev_info(DEV, "Resumed AL updates\n"); | ||
1122 | } | ||
1123 | |||
1124 | /** | ||
1125 | * __drbd_set_state() - Set a new DRBD state | ||
1126 | * @mdev: DRBD device. | ||
1127 | * @ns: new state. | ||
1128 | * @flags: Flags | ||
1129 | * @done: Optional completion that is completed once after_state_ch() has finished | ||
1130 | * | ||
1131 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
1132 | */ | ||
1133 | enum drbd_state_rv | ||
1134 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
1135 | enum chg_state_flags flags, struct completion *done) | ||
1136 | { | ||
1137 | union drbd_state os; | ||
1138 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1139 | enum sanitize_state_warnings ssw; | ||
1140 | struct after_state_chg_work *ascw; | ||
1141 | |||
1142 | os = mdev->state; | ||
1143 | |||
1144 | ns = sanitize_state(mdev, os, ns, &ssw); | ||
1145 | |||
1146 | if (ns.i == os.i) | ||
1147 | return SS_NOTHING_TO_DO; | ||
1148 | |||
1149 | if (!(flags & CS_HARD)) { | ||
1150 | /* pre-state-change checks ; only look at ns */ | ||
1151 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
1152 | |||
1153 | rv = is_valid_state(mdev, ns); | ||
1154 | if (rv < SS_SUCCESS) { | ||
1155 | /* If the old state was illegal as well, then let | ||
1156 | this happen...*/ | ||
1157 | |||
1158 | if (is_valid_state(mdev, os) == rv) | ||
1159 | rv = is_valid_state_transition(mdev, ns, os); | ||
1160 | } else | ||
1161 | rv = is_valid_state_transition(mdev, ns, os); | ||
1162 | } | ||
1163 | |||
1164 | if (rv < SS_SUCCESS) { | ||
1165 | if (flags & CS_VERBOSE) | ||
1166 | print_st_err(mdev, os, ns, rv); | ||
1167 | return rv; | ||
1168 | } | ||
1169 | |||
1170 | print_sanitize_warnings(mdev, ssw); | ||
1171 | |||
1172 | { | ||
1173 | char *pbp, pb[300]; | ||
1174 | pbp = pb; | ||
1175 | *pbp = 0; | ||
1176 | if (ns.role != os.role) | ||
1177 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
1178 | drbd_role_str(os.role), | ||
1179 | drbd_role_str(ns.role)); | ||
1180 | if (ns.peer != os.peer) | ||
1181 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
1182 | drbd_role_str(os.peer), | ||
1183 | drbd_role_str(ns.peer)); | ||
1184 | if (ns.conn != os.conn) | ||
1185 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
1186 | drbd_conn_str(os.conn), | ||
1187 | drbd_conn_str(ns.conn)); | ||
1188 | if (ns.disk != os.disk) | ||
1189 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
1190 | drbd_disk_str(os.disk), | ||
1191 | drbd_disk_str(ns.disk)); | ||
1192 | if (ns.pdsk != os.pdsk) | ||
1193 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
1194 | drbd_disk_str(os.pdsk), | ||
1195 | drbd_disk_str(ns.pdsk)); | ||
1196 | if (is_susp(ns) != is_susp(os)) | ||
1197 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
1198 | is_susp(os), | ||
1199 | is_susp(ns)); | ||
1200 | if (ns.aftr_isp != os.aftr_isp) | ||
1201 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
1202 | os.aftr_isp, | ||
1203 | ns.aftr_isp); | ||
1204 | if (ns.peer_isp != os.peer_isp) | ||
1205 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
1206 | os.peer_isp, | ||
1207 | ns.peer_isp); | ||
1208 | if (ns.user_isp != os.user_isp) | ||
1209 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
1210 | os.user_isp, | ||
1211 | ns.user_isp); | ||
1212 | dev_info(DEV, "%s\n", pb); | ||
1213 | } | ||
1214 | |||
1215 | /* solve the race between becoming unconfigured, | ||
1216 | * worker doing the cleanup, and | ||
1217 | * admin reconfiguring us: | ||
1218 | * on (re)configure, first set CONFIG_PENDING, | ||
1219 | * then wait for a potentially exiting worker, | ||
1220 | * start the worker, and schedule one no_op. | ||
1221 | * then proceed with configuration. | ||
1222 | */ | ||
1223 | if (ns.disk == D_DISKLESS && | ||
1224 | ns.conn == C_STANDALONE && | ||
1225 | ns.role == R_SECONDARY && | ||
1226 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | ||
1227 | set_bit(DEVICE_DYING, &mdev->flags); | ||
1228 | |||
1229 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
1230 | * on the ldev here, to be sure the transition to D_DISKLESS (and thus | ||
1231 | * drbd_ldev_destroy()) won't happen before our corresponding | ||
1232 | * after_state_ch work has run, where we put_ldev again. */ | ||
1233 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
1234 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
1235 | atomic_inc(&mdev->local_cnt); | ||
1236 | |||
1237 | mdev->state = ns; | ||
1238 | |||
1239 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
1240 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
1241 | |||
1242 | wake_up(&mdev->misc_wait); | ||
1243 | wake_up(&mdev->state_wait); | ||
1244 | |||
1245 | /* aborted verify run. log the last position */ | ||
1246 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1247 | ns.conn < C_CONNECTED) { | ||
1248 | mdev->ov_start_sector = | ||
1249 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1250 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1251 | (unsigned long long)mdev->ov_start_sector); | ||
1252 | } | ||
1253 | |||
1254 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1255 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1256 | dev_info(DEV, "Syncer continues.\n"); | ||
1257 | mdev->rs_paused += (long)jiffies | ||
1258 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1259 | if (ns.conn == C_SYNC_TARGET) | ||
1260 | mod_timer(&mdev->resync_timer, jiffies); | ||
1261 | } | ||
1262 | |||
1263 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1264 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1265 | dev_info(DEV, "Resync suspended\n"); | ||
1266 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1267 | } | ||
1268 | |||
1269 | if (os.conn == C_CONNECTED && | ||
1270 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1271 | unsigned long now = jiffies; | ||
1272 | int i; | ||
1273 | |||
1274 | set_ov_position(mdev, ns.conn); | ||
1275 | mdev->rs_start = now; | ||
1276 | mdev->rs_last_events = 0; | ||
1277 | mdev->rs_last_sect_ev = 0; | ||
1278 | mdev->ov_last_oos_size = 0; | ||
1279 | mdev->ov_last_oos_start = 0; | ||
1280 | |||
1281 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1282 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1283 | mdev->rs_mark_time[i] = now; | ||
1284 | } | ||
1285 | |||
1286 | drbd_rs_controller_reset(mdev); | ||
1287 | |||
1288 | if (ns.conn == C_VERIFY_S) { | ||
1289 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1290 | (unsigned long long)mdev->ov_position); | ||
1291 | mod_timer(&mdev->resync_timer, jiffies); | ||
1292 | } | ||
1293 | } | ||
1294 | |||
1295 | if (get_ldev(mdev)) { | ||
1296 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1297 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1298 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1299 | |||
1300 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1301 | mdf |= MDF_CRASHED_PRIMARY; | ||
1302 | if (mdev->state.role == R_PRIMARY || | ||
1303 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1304 | mdf |= MDF_PRIMARY_IND; | ||
1305 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1306 | mdf |= MDF_CONNECTED_IND; | ||
1307 | if (mdev->state.disk > D_INCONSISTENT) | ||
1308 | mdf |= MDF_CONSISTENT; | ||
1309 | if (mdev->state.disk > D_OUTDATED) | ||
1310 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1311 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1312 | mdf |= MDF_PEER_OUT_DATED; | ||
1313 | if (mdf != mdev->ldev->md.flags) { | ||
1314 | mdev->ldev->md.flags = mdf; | ||
1315 | drbd_md_mark_dirty(mdev); | ||
1316 | } | ||
1317 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1318 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1319 | put_ldev(mdev); | ||
1320 | } | ||
1321 | |||
1322 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1323 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1324 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1325 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1326 | |||
1327 | /* Receiver should clean up itself */ | ||
1328 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1329 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1330 | |||
1331 | /* Now that the receiver has finished cleaning up after itself, it should die */ | ||
1332 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1333 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1334 | |||
1335 | /* Upon network failure, we need to restart the receiver. */ | ||
1336 | if (os.conn > C_WF_CONNECTION && | ||
1337 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1338 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1339 | |||
1340 | /* Resume AL writing if we get a connection */ | ||
1341 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1342 | drbd_resume_al(mdev); | ||
1343 | |||
1344 | /* remember last connect and attach times so request_timer_fn() won't | ||
1345 | * kill newly established sessions while we are still trying to thaw | ||
1346 | * previously frozen IO */ | ||
1347 | if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) | ||
1348 | mdev->last_reconnect_jif = jiffies; | ||
1349 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1350 | ns.disk > D_NEGOTIATING) | ||
1351 | mdev->last_reattach_jif = jiffies; | ||
1352 | |||
1353 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1354 | if (ascw) { | ||
1355 | ascw->os = os; | ||
1356 | ascw->ns = ns; | ||
1357 | ascw->flags = flags; | ||
1358 | ascw->w.cb = w_after_state_ch; | ||
1359 | ascw->done = done; | ||
1360 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1361 | } else { | ||
1362 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1363 | } | ||
1364 | |||
1365 | return rv; | ||
1366 | } | ||
1367 | |||
1368 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1369 | { | ||
1370 | struct after_state_chg_work *ascw = | ||
1371 | container_of(w, struct after_state_chg_work, w); | ||
1372 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1373 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1374 | D_ASSERT(ascw->done != NULL); | ||
1375 | complete(ascw->done); | ||
1376 | } | ||
1377 | kfree(ascw); | ||
1378 | |||
1379 | return 1; | ||
1380 | } | ||
1381 | |||
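__drbd_set_state() above commits the new state while holding req_lock and pushes everything that may sleep into a queued work item; w_after_state_ch() then runs those actions and, for CS_WAIT_COMPLETE callers, completes @done. A rough user-space sketch of that queue-then-complete pattern, with a pthread standing in for the worker and a condition variable standing in for the kernel completion (all names below are invented for the illustration):

#include <pthread.h>
#include <stdio.h>

/* invented stand-ins for the kernel completion and work item */
struct demo_completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

struct after_work {
	void (*cb)(void);
	struct demo_completion *done;	/* optional, like CS_WAIT_COMPLETE */
};

static void demo_complete(struct demo_completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void demo_wait_for_completion(struct demo_completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void after_state_actions(void)
{
	printf("running deferred (possibly sleeping) after-state actions\n");
}

/* the "worker": run the queued callback, then complete if somebody waits */
static void *worker(void *arg)
{
	struct after_work *w = arg;

	w->cb();
	if (w->done)
		demo_complete(w->done);
	return NULL;
}

int main(void)
{
	struct demo_completion done = { PTHREAD_MUTEX_INITIALIZER,
					PTHREAD_COND_INITIALIZER, 0 };
	struct after_work w = { after_state_actions, &done };
	pthread_t t;

	/* the state itself would be switched here, under a lock, without sleeping */
	pthread_create(&t, NULL, worker, &w);
	demo_wait_for_completion(&done);	/* caller blocks until the actions ran */
	pthread_join(t, NULL);
	return 0;
}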
1382 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1383 | { | ||
1384 | if (rv) { | ||
1385 | dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); | ||
1386 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1387 | return; | ||
1388 | } | ||
1389 | |||
1390 | switch (mdev->state.conn) { | ||
1391 | case C_STARTING_SYNC_T: | ||
1392 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1393 | break; | ||
1394 | case C_STARTING_SYNC_S: | ||
1395 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1396 | break; | ||
1397 | } | ||
1398 | } | ||
1399 | |||
1400 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1401 | int (*io_fn)(struct drbd_conf *), | ||
1402 | char *why, enum bm_flag flags) | ||
1403 | { | ||
1404 | int rv; | ||
1405 | |||
1406 | D_ASSERT(current == mdev->worker.task); | ||
1407 | |||
1408 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1409 | set_bit(SUSPEND_IO, &mdev->flags); | ||
1410 | |||
1411 | drbd_bm_lock(mdev, why, flags); | ||
1412 | rv = io_fn(mdev); | ||
1413 | drbd_bm_unlock(mdev); | ||
1414 | |||
1415 | drbd_resume_io(mdev); | ||
1416 | |||
1417 | return rv; | ||
1418 | } | ||
1419 | |||
1420 | /** | ||
1421 | * after_state_ch() - Perform after state change actions that may sleep | ||
1422 | * @mdev: DRBD device. | ||
1423 | * @os: old state. | ||
1424 | * @ns: new state. | ||
1425 | * @flags: Flags | ||
1426 | */ | ||
1427 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1428 | union drbd_state ns, enum chg_state_flags flags) | ||
1429 | { | ||
1430 | enum drbd_fencing_p fp; | ||
1431 | enum drbd_req_event what = nothing; | ||
1432 | union drbd_state nsm = (union drbd_state){ .i = -1 }; | ||
1433 | |||
1434 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1435 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1436 | if (mdev->p_uuid) | ||
1437 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1438 | } | ||
1439 | |||
1440 | fp = FP_DONT_CARE; | ||
1441 | if (get_ldev(mdev)) { | ||
1442 | fp = mdev->ldev->dc.fencing; | ||
1443 | put_ldev(mdev); | ||
1444 | } | ||
1445 | |||
1446 | /* Inform userspace about the change... */ | ||
1447 | drbd_bcast_state(mdev, ns); | ||
1448 | |||
1449 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1450 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1451 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1452 | |||
1453 | /* Here we have the actions that are performed after a | ||
1454 | state change. This function might sleep */ | ||
1455 | |||
1456 | if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) | ||
1457 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1458 | |||
1459 | nsm.i = -1; | ||
1460 | if (ns.susp_nod) { | ||
1461 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1462 | what = resend; | ||
1463 | |||
1464 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1465 | ns.disk > D_NEGOTIATING) | ||
1466 | what = restart_frozen_disk_io; | ||
1467 | |||
1468 | if (what != nothing) | ||
1469 | nsm.susp_nod = 0; | ||
1470 | } | ||
1471 | |||
1472 | if (ns.susp_fen) { | ||
1473 | /* case1: The outdate peer handler is successful: */ | ||
1474 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { | ||
1475 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
1476 | drbd_uuid_new_current(mdev); | ||
1477 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1478 | } | ||
1479 | spin_lock_irq(&mdev->req_lock); | ||
1480 | _tl_clear(mdev); | ||
1481 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); | ||
1482 | spin_unlock_irq(&mdev->req_lock); | ||
1483 | } | ||
1484 | /* case2: The connection was established again: */ | ||
1485 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { | ||
1486 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1487 | what = resend; | ||
1488 | nsm.susp_fen = 0; | ||
1489 | } | ||
1490 | } | ||
1491 | |||
1492 | if (what != nothing) { | ||
1493 | spin_lock_irq(&mdev->req_lock); | ||
1494 | _tl_restart(mdev, what); | ||
1495 | nsm.i &= mdev->state.i; | ||
1496 | _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); | ||
1497 | spin_unlock_irq(&mdev->req_lock); | ||
1498 | } | ||
1499 | |||
1500 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1501 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1502 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1503 | * which is unexpected. */ | ||
1504 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1505 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1506 | mdev->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1507 | drbd_gen_and_send_sync_uuid(mdev); | ||
1508 | put_ldev(mdev); | ||
1509 | } | ||
1510 | |||
1511 | /* Do not change the order of the if above and the two below... */ | ||
1512 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1513 | /* we probably will start a resync soon. | ||
1514 | * make sure those things are properly reset. */ | ||
1515 | mdev->rs_total = 0; | ||
1516 | mdev->rs_failed = 0; | ||
1517 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1518 | drbd_rs_cancel_all(mdev); | ||
1519 | |||
1520 | drbd_send_uuids(mdev); | ||
1521 | drbd_send_state(mdev, ns); | ||
1522 | } | ||
1523 | /* No point in queuing send_bitmap if we don't have a connection | ||
1524 | * anymore, so check also the _current_ state, not only the new state | ||
1525 | * at the time this work was queued. */ | ||
1526 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1527 | mdev->state.conn == C_WF_BITMAP_S) | ||
1528 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1529 | "send_bitmap (WFBitMapS)", | ||
1530 | BM_LOCKED_TEST_ALLOWED); | ||
1531 | |||
1532 | /* Lost contact to peer's copy of the data */ | ||
1533 | if ((os.pdsk >= D_INCONSISTENT && | ||
1534 | os.pdsk != D_UNKNOWN && | ||
1535 | os.pdsk != D_OUTDATED) | ||
1536 | && (ns.pdsk < D_INCONSISTENT || | ||
1537 | ns.pdsk == D_UNKNOWN || | ||
1538 | ns.pdsk == D_OUTDATED)) { | ||
1539 | if (get_ldev(mdev)) { | ||
1540 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1541 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1542 | if (is_susp(mdev->state)) { | ||
1543 | set_bit(NEW_CUR_UUID, &mdev->flags); | ||
1544 | } else { | ||
1545 | drbd_uuid_new_current(mdev); | ||
1546 | drbd_send_uuids(mdev); | ||
1547 | } | ||
1548 | } | ||
1549 | put_ldev(mdev); | ||
1550 | } | ||
1551 | } | ||
1552 | |||
1553 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1554 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1555 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1556 | drbd_uuid_new_current(mdev); | ||
1557 | drbd_send_uuids(mdev); | ||
1558 | } | ||
1559 | /* D_DISKLESS Peer becomes secondary */ | ||
1560 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1561 | /* We may still be Primary ourselves. | ||
1562 | * No harm done if the bitmap still changes, | ||
1563 | * redirtied pages will follow later. */ | ||
1564 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1565 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1566 | put_ldev(mdev); | ||
1567 | } | ||
1568 | |||
1569 | /* Write out all changed bits on demote. | ||
1570 | * Though, no need to do that just yet | ||
1571 | * if there is still a resync going on */ | ||
1572 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1573 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1574 | /* No changes to the bitmap expected this time, so assert that, | ||
1575 | * even though no harm was done if it did change. */ | ||
1576 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1577 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1578 | put_ldev(mdev); | ||
1579 | } | ||
1580 | |||
1581 | /* Last part of the attaching process ... */ | ||
1582 | if (ns.conn >= C_CONNECTED && | ||
1583 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1584 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1585 | drbd_send_uuids(mdev); | ||
1586 | drbd_send_state(mdev, ns); | ||
1587 | } | ||
1588 | |||
1589 | /* We want to pause/continue resync, tell peer. */ | ||
1590 | if (ns.conn >= C_CONNECTED && | ||
1591 | ((os.aftr_isp != ns.aftr_isp) || | ||
1592 | (os.user_isp != ns.user_isp))) | ||
1593 | drbd_send_state(mdev, ns); | ||
1594 | |||
1595 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1596 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1597 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1598 | suspend_other_sg(mdev); | ||
1599 | |||
1600 | /* Make sure the peer gets informed about any state | ||
1601 | changes (ISP bits) that happened while we were in WFReportParams. */ | ||
1602 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1603 | drbd_send_state(mdev, ns); | ||
1604 | |||
1605 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1606 | drbd_send_state(mdev, ns); | ||
1607 | |||
1608 | /* We are in the process of starting a full sync... */ | ||
1609 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1610 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1611 | /* no other bitmap changes expected during this phase */ | ||
1612 | drbd_queue_bitmap_io(mdev, | ||
1613 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1614 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1615 | |||
1616 | /* We are invalidating ourselves... */ | ||
1617 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1618 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1619 | /* other bitmap operation expected during this phase */ | ||
1620 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1621 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1622 | |||
1623 | /* first half of local IO error, failure to attach, | ||
1624 | * or administrative detach */ | ||
1625 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1626 | enum drbd_io_error_p eh = EP_PASS_ON; | ||
1627 | int was_io_error = 0; | ||
1628 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1629 | * our cleanup here with the transition to D_DISKLESS. | ||
1630 | * But it is still not safe to dereference ldev here, since | ||
1631 | * we might come from a failed Attach before ldev was set. */ | ||
1632 | if (mdev->ldev) { | ||
1633 | eh = mdev->ldev->dc.on_io_error; | ||
1634 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1635 | |||
1636 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1637 | drbd_khelper(mdev, "local-io-error"); | ||
1638 | |||
1639 | /* Immediately allow completion of all application IO, | ||
1640 | * that waits for completion from the local disk, | ||
1641 | * if this was a force-detach due to disk_timeout | ||
1642 | * or administrator request (drbdsetup detach --force). | ||
1643 | * Do NOT abort otherwise. | ||
1644 | * Aborting local requests may cause serious problems, | ||
1645 | * if requests are completed to upper layers already, | ||
1646 | * and then later the already submitted local bio completes. | ||
1647 | * This can cause DMA into former bio pages that meanwhile | ||
1648 | * have been re-used for other things. | ||
1649 | * So aborting local requests may cause crashes, | ||
1650 | * or even worse, silent data corruption. | ||
1651 | */ | ||
1652 | if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) | ||
1653 | tl_abort_disk_io(mdev); | ||
1654 | |||
1655 | /* current state still has to be D_FAILED, | ||
1656 | * there is only one way out: to D_DISKLESS, | ||
1657 | * and that may only happen after our put_ldev below. */ | ||
1658 | if (mdev->state.disk != D_FAILED) | ||
1659 | dev_err(DEV, | ||
1660 | "ASSERT FAILED: disk is %s during detach\n", | ||
1661 | drbd_disk_str(mdev->state.disk)); | ||
1662 | |||
1663 | if (ns.conn >= C_CONNECTED) | ||
1664 | drbd_send_state(mdev, ns); | ||
1665 | |||
1666 | drbd_rs_cancel_all(mdev); | ||
1667 | |||
1668 | /* In case we want to get something to stable storage still, | ||
1669 | * this may be the last chance. | ||
1670 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1671 | drbd_md_sync(mdev); | ||
1672 | } | ||
1673 | put_ldev(mdev); | ||
1674 | } | ||
1675 | |||
1676 | /* second half of local IO error, failure to attach, | ||
1677 | * or administrative detach, | ||
1678 | * after local_cnt references have reached zero again */ | ||
1679 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1680 | /* We must still be diskless, | ||
1681 | * re-attach has to be serialized with this! */ | ||
1682 | if (mdev->state.disk != D_DISKLESS) | ||
1683 | dev_err(DEV, | ||
1684 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1685 | drbd_disk_str(mdev->state.disk)); | ||
1686 | |||
1687 | if (ns.conn >= C_CONNECTED) | ||
1688 | drbd_send_state(mdev, ns); | ||
1689 | |||
1690 | /* corresponding get_ldev in __drbd_set_state | ||
1691 | * this may finally trigger drbd_ldev_destroy. */ | ||
1692 | put_ldev(mdev); | ||
1693 | } | ||
1694 | |||
1695 | /* Notify peer that I had a local IO error and did not detach. */ | ||
1696 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1697 | drbd_send_state(mdev, ns); | ||
1698 | |||
1699 | /* Disks got bigger while they were detached */ | ||
1700 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1701 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1702 | if (ns.conn == C_CONNECTED) | ||
1703 | resync_after_online_grow(mdev); | ||
1704 | } | ||
1705 | |||
1706 | /* A resync finished or aborted, wake paused devices... */ | ||
1707 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1708 | (os.peer_isp && !ns.peer_isp) || | ||
1709 | (os.user_isp && !ns.user_isp)) | ||
1710 | resume_next_sg(mdev); | ||
1711 | |||
1712 | /* sync target done with resync. Explicitly notify peer, even though | ||
1713 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1714 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1715 | drbd_send_state(mdev, ns); | ||
1716 | |||
1717 | /* Wake up role changes, that were delayed because of connection establishing */ | ||
1718 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { | ||
1719 | clear_bit(STATE_SENT, &mdev->flags); | ||
1720 | wake_up(&mdev->state_wait); | ||
1721 | } | ||
1722 | |||
1723 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1724 | * if the resync finished cleanly, or aborted because of peer disk | ||
1725 | * failure, or because of connection loss. | ||
1726 | * For resync aborted because of local disk failure, we cannot do | ||
1727 | * any bitmap writeout anymore. | ||
1728 | * No harm done if some bits change during this phase. | ||
1729 | */ | ||
1730 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1731 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1732 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1733 | put_ldev(mdev); | ||
1734 | } | ||
1735 | |||
1736 | /* free tl_hash if we got thawed and are C_STANDALONE */ | ||
1737 | if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) | ||
1738 | drbd_free_tl_hash(mdev); | ||
1739 | |||
1740 | /* Upon network connection, we need to start the receiver */ | ||
1741 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1742 | drbd_thread_start(&mdev->receiver); | ||
1743 | |||
1744 | /* Terminate worker thread if we are unconfigured - it will be | ||
1745 | restarted as needed... */ | ||
1746 | if (ns.disk == D_DISKLESS && | ||
1747 | ns.conn == C_STANDALONE && | ||
1748 | ns.role == R_SECONDARY) { | ||
1749 | if (os.aftr_isp != ns.aftr_isp) | ||
1750 | resume_next_sg(mdev); | ||
1751 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1752 | if (test_bit(DEVICE_DYING, &mdev->flags)) | ||
1753 | drbd_thread_stop_nowait(&mdev->worker); | ||
1754 | } | 322 | } |
1755 | 323 | spin_unlock_irq(&tconn->req_lock); | |
1756 | drbd_md_sync(mdev); | ||
1757 | } | 324 | } |
1758 | 325 | ||
1759 | |||
1760 | static int drbd_thread_setup(void *arg) | 326 | static int drbd_thread_setup(void *arg) |
1761 | { | 327 | { |
1762 | struct drbd_thread *thi = (struct drbd_thread *) arg; | 328 | struct drbd_thread *thi = (struct drbd_thread *) arg; |
1763 | struct drbd_conf *mdev = thi->mdev; | 329 | struct drbd_tconn *tconn = thi->tconn; |
1764 | unsigned long flags; | 330 | unsigned long flags; |
1765 | int retval; | 331 | int retval; |
1766 | 332 | ||
333 | snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", | ||
334 | thi->name[0], thi->tconn->name); | ||
335 | |||
1767 | restart: | 336 | restart: |
1768 | retval = thi->function(thi); | 337 | retval = thi->function(thi); |
1769 | 338 | ||
1770 | spin_lock_irqsave(&thi->t_lock, flags); | 339 | spin_lock_irqsave(&thi->t_lock, flags); |
1771 | 340 | ||
1772 | /* if the receiver has been "Exiting", the last thing it did | 341 | /* if the receiver has been "EXITING", the last thing it did |
1773 | * was set the conn state to "StandAlone", | 342 | * was set the conn state to "StandAlone", |
1774 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | 343 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, |
1775 | * and receiver thread will be "started". | 344 | * and receiver thread will be "started". |
1776 | * drbd_thread_start needs to set "Restarting" in that case. | 345 | * drbd_thread_start needs to set "RESTARTING" in that case. |
1777 | * t_state check and assignment needs to be within the same spinlock, | 346 | * t_state check and assignment needs to be within the same spinlock, |
1778 | * so either thread_start sees Exiting, and can remap to Restarting, | 347 | * so either thread_start sees EXITING, and can remap to RESTARTING, |
1779 | * or thread_start see None, and can proceed as normal. | 348 | * or thread_start see NONE, and can proceed as normal. |
1780 | */ | 349 | */ |
1781 | 350 | ||
1782 | if (thi->t_state == Restarting) { | 351 | if (thi->t_state == RESTARTING) { |
1783 | dev_info(DEV, "Restarting %s\n", current->comm); | 352 | conn_info(tconn, "Restarting %s thread\n", thi->name); |
1784 | thi->t_state = Running; | 353 | thi->t_state = RUNNING; |
1785 | spin_unlock_irqrestore(&thi->t_lock, flags); | 354 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1786 | goto restart; | 355 | goto restart; |
1787 | } | 356 | } |
1788 | 357 | ||
1789 | thi->task = NULL; | 358 | thi->task = NULL; |
1790 | thi->t_state = None; | 359 | thi->t_state = NONE; |
1791 | smp_mb(); | 360 | smp_mb(); |
1792 | complete(&thi->stop); | 361 | complete_all(&thi->stop); |
1793 | spin_unlock_irqrestore(&thi->t_lock, flags); | 362 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1794 | 363 | ||
1795 | dev_info(DEV, "Terminating %s\n", current->comm); | 364 | conn_info(tconn, "Terminating %s\n", current->comm); |
1796 | 365 | ||
1797 | /* Release mod reference taken when thread was started */ | 366 | /* Release mod reference taken when thread was started */ |
367 | |||
368 | kref_put(&tconn->kref, &conn_destroy); | ||
1798 | module_put(THIS_MODULE); | 369 | module_put(THIS_MODULE); |
1799 | return retval; | 370 | return retval; |
1800 | } | 371 | } |
1801 | 372 | ||
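The NONE/RUNNING/RESTARTING/EXITING handshake in drbd_thread_setup() only works because the state test and the transition happen under the same t_lock: a restart request that races with the exit path is never lost. A compressed pthread sketch of that restart loop (the sleep merely makes it likely that the request arrives while the first pass is still running; all names are invented for the illustration):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

enum thread_state { T_NONE, T_RUNNING, T_RESTARTING, T_EXITING };

struct demo_thread {
	pthread_mutex_t lock;
	enum thread_state state;
	int pass;
};

static void serve_once(struct demo_thread *thi)
{
	printf("worker pass %d\n", ++thi->pass);	/* stand-in for thi->function(thi) */
	usleep(100 * 1000);
}

static void *thread_setup(void *arg)
{
	struct demo_thread *thi = arg;

restart:
	serve_once(thi);

	/* test and transition under one lock, exactly like t_lock above */
	pthread_mutex_lock(&thi->lock);
	if (thi->state == T_RESTARTING) {
		thi->state = T_RUNNING;
		pthread_mutex_unlock(&thi->lock);
		goto restart;
	}
	thi->state = T_NONE;
	pthread_mutex_unlock(&thi->lock);
	return NULL;
}

int main(void)
{
	struct demo_thread thi = { PTHREAD_MUTEX_INITIALIZER, T_RUNNING, 0 };
	pthread_t t;

	pthread_create(&t, NULL, thread_setup, &thi);

	/* a "restart" request racing with the exit path */
	pthread_mutex_lock(&thi.lock);
	if (thi.state == T_RUNNING)
		thi.state = T_RESTARTING;
	pthread_mutex_unlock(&thi.lock);

	pthread_join(t, NULL);
	return 0;
}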
1802 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | 373 | static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi, |
1803 | int (*func) (struct drbd_thread *)) | 374 | int (*func) (struct drbd_thread *), char *name) |
1804 | { | 375 | { |
1805 | spin_lock_init(&thi->t_lock); | 376 | spin_lock_init(&thi->t_lock); |
1806 | thi->task = NULL; | 377 | thi->task = NULL; |
1807 | thi->t_state = None; | 378 | thi->t_state = NONE; |
1808 | thi->function = func; | 379 | thi->function = func; |
1809 | thi->mdev = mdev; | 380 | thi->tconn = tconn; |
381 | strncpy(thi->name, name, ARRAY_SIZE(thi->name)); | ||
1810 | } | 382 | } |
1811 | 383 | ||
1812 | int drbd_thread_start(struct drbd_thread *thi) | 384 | int drbd_thread_start(struct drbd_thread *thi) |
1813 | { | 385 | { |
1814 | struct drbd_conf *mdev = thi->mdev; | 386 | struct drbd_tconn *tconn = thi->tconn; |
1815 | struct task_struct *nt; | 387 | struct task_struct *nt; |
1816 | unsigned long flags; | 388 | unsigned long flags; |
1817 | 389 | ||
1818 | const char *me = | ||
1819 | thi == &mdev->receiver ? "receiver" : | ||
1820 | thi == &mdev->asender ? "asender" : | ||
1821 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1822 | |||
1823 | /* is used from state engine doing drbd_thread_stop_nowait, | 390 | /* is used from state engine doing drbd_thread_stop_nowait, |
1824 | * while holding the req lock irqsave */ | 391 | * while holding the req lock irqsave */ |
1825 | spin_lock_irqsave(&thi->t_lock, flags); | 392 | spin_lock_irqsave(&thi->t_lock, flags); |
1826 | 393 | ||
1827 | switch (thi->t_state) { | 394 | switch (thi->t_state) { |
1828 | case None: | 395 | case NONE: |
1829 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | 396 | conn_info(tconn, "Starting %s thread (from %s [%d])\n", |
1830 | me, current->comm, current->pid); | 397 | thi->name, current->comm, current->pid); |
1831 | 398 | ||
1832 | /* Get ref on module for thread - this is released when thread exits */ | 399 | /* Get ref on module for thread - this is released when thread exits */ |
1833 | if (!try_module_get(THIS_MODULE)) { | 400 | if (!try_module_get(THIS_MODULE)) { |
1834 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | 401 | conn_err(tconn, "Failed to get module reference in drbd_thread_start\n"); |
1835 | spin_unlock_irqrestore(&thi->t_lock, flags); | 402 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1836 | return false; | 403 | return false; |
1837 | } | 404 | } |
1838 | 405 | ||
406 | kref_get(&thi->tconn->kref); | ||
407 | |||
1839 | init_completion(&thi->stop); | 408 | init_completion(&thi->stop); |
1840 | D_ASSERT(thi->task == NULL); | ||
1841 | thi->reset_cpu_mask = 1; | 409 | thi->reset_cpu_mask = 1; |
1842 | thi->t_state = Running; | 410 | thi->t_state = RUNNING; |
1843 | spin_unlock_irqrestore(&thi->t_lock, flags); | 411 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1844 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | 412 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ |
1845 | 413 | ||
1846 | nt = kthread_create(drbd_thread_setup, (void *) thi, | 414 | nt = kthread_create(drbd_thread_setup, (void *) thi, |
1847 | "drbd%d_%s", mdev_to_minor(mdev), me); | 415 | "drbd_%c_%s", thi->name[0], thi->tconn->name); |
1848 | 416 | ||
1849 | if (IS_ERR(nt)) { | 417 | if (IS_ERR(nt)) { |
1850 | dev_err(DEV, "Couldn't start thread\n"); | 418 | conn_err(tconn, "Couldn't start thread\n"); |
1851 | 419 | ||
420 | kref_put(&tconn->kref, &conn_destroy); | ||
1852 | module_put(THIS_MODULE); | 421 | module_put(THIS_MODULE); |
1853 | return false; | 422 | return false; |
1854 | } | 423 | } |
1855 | spin_lock_irqsave(&thi->t_lock, flags); | 424 | spin_lock_irqsave(&thi->t_lock, flags); |
1856 | thi->task = nt; | 425 | thi->task = nt; |
1857 | thi->t_state = Running; | 426 | thi->t_state = RUNNING; |
1858 | spin_unlock_irqrestore(&thi->t_lock, flags); | 427 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1859 | wake_up_process(nt); | 428 | wake_up_process(nt); |
1860 | break; | 429 | break; |
1861 | case Exiting: | 430 | case EXITING: |
1862 | thi->t_state = Restarting; | 431 | thi->t_state = RESTARTING; |
1863 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | 432 | conn_info(tconn, "Restarting %s thread (from %s [%d])\n", |
1864 | me, current->comm, current->pid); | 433 | thi->name, current->comm, current->pid); |
1865 | /* fall through */ | 434 | /* fall through */ |
1866 | case Running: | 435 | case RUNNING: |
1867 | case Restarting: | 436 | case RESTARTING: |
1868 | default: | 437 | default: |
1869 | spin_unlock_irqrestore(&thi->t_lock, flags); | 438 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1870 | break; | 439 | break; |
@@ -1878,12 +447,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1878 | { | 447 | { |
1879 | unsigned long flags; | 448 | unsigned long flags; |
1880 | 449 | ||
1881 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | 450 | enum drbd_thread_state ns = restart ? RESTARTING : EXITING; |
1882 | 451 | ||
1883 | /* may be called from state engine, holding the req lock irqsave */ | 452 | /* may be called from state engine, holding the req lock irqsave */ |
1884 | spin_lock_irqsave(&thi->t_lock, flags); | 453 | spin_lock_irqsave(&thi->t_lock, flags); |
1885 | 454 | ||
1886 | if (thi->t_state == None) { | 455 | if (thi->t_state == NONE) { |
1887 | spin_unlock_irqrestore(&thi->t_lock, flags); | 456 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1888 | if (restart) | 457 | if (restart) |
1889 | drbd_thread_start(thi); | 458 | drbd_thread_start(thi); |
@@ -1901,7 +470,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1901 | init_completion(&thi->stop); | 470 | init_completion(&thi->stop); |
1902 | if (thi->task != current) | 471 | if (thi->task != current) |
1903 | force_sig(DRBD_SIGKILL, thi->task); | 472 | force_sig(DRBD_SIGKILL, thi->task); |
1904 | |||
1905 | } | 473 | } |
1906 | 474 | ||
1907 | spin_unlock_irqrestore(&thi->t_lock, flags); | 475 | spin_unlock_irqrestore(&thi->t_lock, flags); |
@@ -1910,6 +478,35 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1910 | wait_for_completion(&thi->stop); | 478 | wait_for_completion(&thi->stop); |
1911 | } | 479 | } |
1912 | 480 | ||
481 | static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task) | ||
482 | { | ||
483 | struct drbd_thread *thi = | ||
484 | task == tconn->receiver.task ? &tconn->receiver : | ||
485 | task == tconn->asender.task ? &tconn->asender : | ||
486 | task == tconn->worker.task ? &tconn->worker : NULL; | ||
487 | |||
488 | return thi; | ||
489 | } | ||
490 | |||
491 | char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task) | ||
492 | { | ||
493 | struct drbd_thread *thi = drbd_task_to_thread(tconn, task); | ||
494 | return thi ? thi->name : task->comm; | ||
495 | } | ||
496 | |||
497 | int conn_lowest_minor(struct drbd_tconn *tconn) | ||
498 | { | ||
499 | struct drbd_conf *mdev; | ||
500 | int vnr = 0, m; | ||
501 | |||
502 | rcu_read_lock(); | ||
503 | mdev = idr_get_next(&tconn->volumes, &vnr); | ||
504 | m = mdev ? mdev_to_minor(mdev) : -1; | ||
505 | rcu_read_unlock(); | ||
506 | |||
507 | return m; | ||
508 | } | ||
509 | |||
1913 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
1914 | /** | 511 | /** |
1915 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | 512 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs |
@@ -1918,238 +515,345 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1918 | * Forces all threads of a device onto the same CPU. This is beneficial for | 515 | * Forces all threads of a device onto the same CPU. This is beneficial for |
1919 | * DRBD's performance. May be overwritten by user's configuration. | 516 | * DRBD's performance. May be overwritten by user's configuration. |
1920 | */ | 517 | */ |
1921 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | 518 | void drbd_calc_cpu_mask(struct drbd_tconn *tconn) |
1922 | { | 519 | { |
1923 | int ord, cpu; | 520 | int ord, cpu; |
1924 | 521 | ||
1925 | /* user override. */ | 522 | /* user override. */ |
1926 | if (cpumask_weight(mdev->cpu_mask)) | 523 | if (cpumask_weight(tconn->cpu_mask)) |
1927 | return; | 524 | return; |
1928 | 525 | ||
1929 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | 526 | ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask); |
1930 | for_each_online_cpu(cpu) { | 527 | for_each_online_cpu(cpu) { |
1931 | if (ord-- == 0) { | 528 | if (ord-- == 0) { |
1932 | cpumask_set_cpu(cpu, mdev->cpu_mask); | 529 | cpumask_set_cpu(cpu, tconn->cpu_mask); |
1933 | return; | 530 | return; |
1934 | } | 531 | } |
1935 | } | 532 | } |
1936 | /* should not be reached */ | 533 | /* should not be reached */ |
1937 | cpumask_setall(mdev->cpu_mask); | 534 | cpumask_setall(tconn->cpu_mask); |
1938 | } | 535 | } |
1939 | 536 | ||
1940 | /** | 537 | /** |
1941 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | 538 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread |
1942 | * @mdev: DRBD device. | 539 | * @mdev: DRBD device. |
540 | * @thi: drbd_thread object | ||
1943 | * | 541 | * |
1944 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | 542 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die |
1945 | * prematurely. | 543 | * prematurely. |
1946 | */ | 544 | */ |
1947 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | 545 | void drbd_thread_current_set_cpu(struct drbd_thread *thi) |
1948 | { | 546 | { |
1949 | struct task_struct *p = current; | 547 | struct task_struct *p = current; |
1950 | struct drbd_thread *thi = | 548 | |
1951 | p == mdev->asender.task ? &mdev->asender : | ||
1952 | p == mdev->receiver.task ? &mdev->receiver : | ||
1953 | p == mdev->worker.task ? &mdev->worker : | ||
1954 | NULL; | ||
1955 | ERR_IF(thi == NULL) | ||
1956 | return; | ||
1957 | if (!thi->reset_cpu_mask) | 549 | if (!thi->reset_cpu_mask) |
1958 | return; | 550 | return; |
1959 | thi->reset_cpu_mask = 0; | 551 | thi->reset_cpu_mask = 0; |
1960 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | 552 | set_cpus_allowed_ptr(p, thi->tconn->cpu_mask); |
1961 | } | 553 | } |
1962 | #endif | 554 | #endif |
1963 | 555 | ||
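drbd_calc_cpu_mask() picks a single online CPU (lowest minor modulo the number of online CPUs) and drbd_thread_current_set_cpu() pins the calling thread to it. The same idea in user space, using the glibc affinity API instead of set_cpus_allowed_ptr(); the sketch is Linux-specific and assumes CPUs 0..N-1 are all online:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/* pick a CPU for "minor" and pin the calling thread to it */
static int pin_to_cpu_for_minor(int minor)
{
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	int cpu = minor % (ncpus > 0 ? (int)ncpus : 1);
	cpu_set_t mask;
	int err;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	err = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
	if (err) {
		fprintf(stderr, "setaffinity failed: %d\n", err);
		return -1;
	}
	return cpu;
}

int main(void)
{
	int cpu = pin_to_cpu_for_minor(5);

	if (cpu >= 0)
		printf("pinned to CPU %d\n", cpu);
	return 0;
}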
1964 | /* the appropriate socket mutex must be held already */ | 556 | /** |
1965 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 557 | * drbd_header_size - size of a packet header |
1966 | enum drbd_packets cmd, struct p_header80 *h, | 558 | * |
1967 | size_t size, unsigned msg_flags) | 559 | * The header size is a multiple of 8, so any payload following the header is |
560 | * word aligned on 64-bit architectures. (The bitmap send and receive code | ||
561 | * relies on this.) | ||
562 | */ | ||
563 | unsigned int drbd_header_size(struct drbd_tconn *tconn) | ||
1968 | { | 564 | { |
1969 | int sent, ok; | 565 | if (tconn->agreed_pro_version >= 100) { |
566 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); | ||
567 | return sizeof(struct p_header100); | ||
568 | } else { | ||
569 | BUILD_BUG_ON(sizeof(struct p_header80) != | ||
570 | sizeof(struct p_header95)); | ||
571 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); | ||
572 | return sizeof(struct p_header80); | ||
573 | } | ||
574 | } | ||
1970 | 575 | ||
1971 | ERR_IF(!h) return false; | 576 | static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) |
1972 | ERR_IF(!size) return false; | 577 | { |
578 | h->magic = cpu_to_be32(DRBD_MAGIC); | ||
579 | h->command = cpu_to_be16(cmd); | ||
580 | h->length = cpu_to_be16(size); | ||
581 | return sizeof(struct p_header80); | ||
582 | } | ||
1973 | 583 | ||
1974 | h->magic = BE_DRBD_MAGIC; | 584 | static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) |
585 | { | ||
586 | h->magic = cpu_to_be16(DRBD_MAGIC_BIG); | ||
1975 | h->command = cpu_to_be16(cmd); | 587 | h->command = cpu_to_be16(cmd); |
1976 | h->length = cpu_to_be16(size-sizeof(struct p_header80)); | 588 | h->length = cpu_to_be32(size); |
589 | return sizeof(struct p_header95); | ||
590 | } | ||
1977 | 591 | ||
1978 | sent = drbd_send(mdev, sock, h, size, msg_flags); | 592 | static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, |
593 | int size, int vnr) | ||
594 | { | ||
595 | h->magic = cpu_to_be32(DRBD_MAGIC_100); | ||
596 | h->volume = cpu_to_be16(vnr); | ||
597 | h->command = cpu_to_be16(cmd); | ||
598 | h->length = cpu_to_be32(size); | ||
599 | h->pad = 0; | ||
600 | return sizeof(struct p_header100); | ||
601 | } | ||
1979 | 602 | ||
1980 | ok = (sent == size); | 603 | static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr, |
1981 | if (!ok && !signal_pending(current)) | 604 | void *buffer, enum drbd_packet cmd, int size) |
1982 | dev_warn(DEV, "short sent %s size=%d sent=%d\n", | 605 | { |
1983 | cmdname(cmd), (int)size, sent); | 606 | if (tconn->agreed_pro_version >= 100) |
1984 | return ok; | 607 | return prepare_header100(buffer, cmd, size, vnr); |
608 | else if (tconn->agreed_pro_version >= 95 && | ||
609 | size > DRBD_MAX_SIZE_H80_PACKET) | ||
610 | return prepare_header95(buffer, cmd, size); | ||
611 | else | ||
612 | return prepare_header80(buffer, cmd, size); | ||
1985 | } | 613 | } |
1986 | 614 | ||
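To make the header rules above concrete: the on-the-wire header is packed big-endian and padded to a multiple of 8 bytes so that any payload that follows stays word aligned. A user-space sketch in the spirit of p_header100; the struct layout, magic value, and function name below are invented for the illustration and do not match the driver's real definitions byte for byte:

#include <arpa/inet.h>	/* htonl/htons: big-endian ("cpu_to_be") conversion */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* illustrative layout: 16 bytes, a multiple of 8 */
struct demo_header {
	uint32_t magic;
	uint16_t volume;
	uint16_t command;
	uint32_t length;	/* payload length, header not included */
	uint32_t pad;		/* keeps any following payload 8-byte aligned */
};

_Static_assert(sizeof(struct demo_header) % 8 == 0,
	       "payload after the header must stay word aligned");

static size_t prepare_demo_header(void *buf, uint16_t vnr, uint16_t cmd,
				  uint32_t size)
{
	struct demo_header h = {
		.magic   = htonl(0xD0D0CAFEu),	/* made-up magic for the demo */
		.volume  = htons(vnr),
		.command = htons(cmd),
		.length  = htonl(size),
		.pad     = 0,
	};

	memcpy(buf, &h, sizeof(h));
	return sizeof(h);
}

int main(void)
{
	unsigned char buf[64];
	size_t hlen = prepare_demo_header(buf, 0, 0x27, 4096);

	printf("header is %zu bytes; payload starts 8-byte aligned at offset %zu\n",
	       hlen, hlen);
	return 0;
}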
1987 | /* don't pass the socket. we may only look at it | 615 | static void *__conn_prepare_command(struct drbd_tconn *tconn, |
1988 | * when we hold the appropriate socket mutex. | 616 | struct drbd_socket *sock) |
1989 | */ | ||
1990 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1991 | enum drbd_packets cmd, struct p_header80 *h, size_t size) | ||
1992 | { | 617 | { |
1993 | int ok = 0; | 618 | if (!sock->socket) |
1994 | struct socket *sock; | 619 | return NULL; |
620 | return sock->sbuf + drbd_header_size(tconn); | ||
621 | } | ||
1995 | 622 | ||
1996 | if (use_data_socket) { | 623 | void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock) |
1997 | mutex_lock(&mdev->data.mutex); | 624 | { |
1998 | sock = mdev->data.socket; | 625 | void *p; |
1999 | } else { | ||
2000 | mutex_lock(&mdev->meta.mutex); | ||
2001 | sock = mdev->meta.socket; | ||
2002 | } | ||
2003 | 626 | ||
2004 | /* drbd_disconnect() could have called drbd_free_sock() | 627 | mutex_lock(&sock->mutex); |
2005 | * while we were waiting in down()... */ | 628 | p = __conn_prepare_command(tconn, sock); |
2006 | if (likely(sock != NULL)) | 629 | if (!p) |
2007 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | 630 | mutex_unlock(&sock->mutex); |
2008 | 631 | ||
2009 | if (use_data_socket) | 632 | return p; |
2010 | mutex_unlock(&mdev->data.mutex); | ||
2011 | else | ||
2012 | mutex_unlock(&mdev->meta.mutex); | ||
2013 | return ok; | ||
2014 | } | 633 | } |
2015 | 634 | ||
2016 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | 635 | void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock) |
2017 | size_t size) | ||
2018 | { | 636 | { |
2019 | struct p_header80 h; | 637 | return conn_prepare_command(mdev->tconn, sock); |
2020 | int ok; | 638 | } |
2021 | 639 | ||
2022 | h.magic = BE_DRBD_MAGIC; | 640 | static int __send_command(struct drbd_tconn *tconn, int vnr, |
2023 | h.command = cpu_to_be16(cmd); | 641 | struct drbd_socket *sock, enum drbd_packet cmd, |
2024 | h.length = cpu_to_be16(size); | 642 | unsigned int header_size, void *data, |
643 | unsigned int size) | ||
644 | { | ||
645 | int msg_flags; | ||
646 | int err; | ||
2025 | 647 | ||
2026 | if (!drbd_get_data_sock(mdev)) | 648 | /* |
2027 | return 0; | 649 | * Called with @data == NULL and the size of the data blocks in @size |
650 | * for commands that send data blocks. For those commands, omit the | ||
651 | * MSG_MORE flag: this will increase the likelihood that data blocks | ||
652 | * which are page aligned on the sender will end up page aligned on the | ||
653 | * receiver. | ||
654 | */ | ||
655 | msg_flags = data ? MSG_MORE : 0; | ||
656 | |||
657 | header_size += prepare_header(tconn, vnr, sock->sbuf, cmd, | ||
658 | header_size + size); | ||
659 | err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size, | ||
660 | msg_flags); | ||
661 | if (data && !err) | ||
662 | err = drbd_send_all(tconn, sock->socket, data, size, 0); | ||
663 | return err; | ||
664 | } | ||
2028 | 665 | ||
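__send_command() above sends the prepared header with MSG_MORE whenever an inline data buffer follows, so the transport may merge header and payload into one segment; for the big data-block commands (@data == NULL) the flag is omitted, giving page-aligned blocks a chance to stay page aligned on the receiver. A self-contained user-space sketch of that two-step send; the AF_UNIX pair only keeps it runnable, whereas on TCP MSG_MORE also briefly delays transmission of the header, much like a short-lived TCP_CORK:

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* send a small "header" with MSG_MORE when a payload follows, then the payload */
static int send_hdr_and_payload(int fd, const void *hdr, size_t hlen,
				const void *data, size_t dlen)
{
	if (send(fd, hdr, hlen, data ? MSG_MORE : 0) != (ssize_t)hlen)
		return -1;
	if (data && send(fd, data, dlen, 0) != (ssize_t)dlen)
		return -1;
	return 0;
}

int main(void)
{
	int sv[2];
	char hdr[16] = "HDR", payload[32] = "payload";
	char rx[64];

	/* a connected stream socket pair just to make the sketch runnable */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;
	if (send_hdr_and_payload(sv[0], hdr, sizeof(hdr), payload, sizeof(payload)))
		return 1;

	ssize_t n = recv(sv[1], rx, sizeof(rx), 0);
	printf("received %zd bytes (header + payload)\n", n);
	close(sv[0]);
	close(sv[1]);
	return 0;
}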
2029 | ok = (sizeof(h) == | 666 | static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
2030 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | 667 | enum drbd_packet cmd, unsigned int header_size, |
2031 | ok = ok && (size == | 668 | void *data, unsigned int size) |
2032 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | 669 | { |
670 | return __send_command(tconn, 0, sock, cmd, header_size, data, size); | ||
671 | } | ||
2033 | 672 | ||
2034 | drbd_put_data_sock(mdev); | 673 | int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
674 | enum drbd_packet cmd, unsigned int header_size, | ||
675 | void *data, unsigned int size) | ||
676 | { | ||
677 | int err; | ||
2035 | 678 | ||
2036 | return ok; | 679 | err = __conn_send_command(tconn, sock, cmd, header_size, data, size); |
680 | mutex_unlock(&sock->mutex); | ||
681 | return err; | ||
2037 | } | 682 | } |
2038 | 683 | ||
2039 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | 684 | int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock, |
685 | enum drbd_packet cmd, unsigned int header_size, | ||
686 | void *data, unsigned int size) | ||
2040 | { | 687 | { |
688 | int err; | ||
689 | |||
690 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size, | ||
691 | data, size); | ||
692 | mutex_unlock(&sock->mutex); | ||
693 | return err; | ||
694 | } | ||
695 | |||
696 | int drbd_send_ping(struct drbd_tconn *tconn) | ||
697 | { | ||
698 | struct drbd_socket *sock; | ||
699 | |||
700 | sock = &tconn->meta; | ||
701 | if (!conn_prepare_command(tconn, sock)) | ||
702 | return -EIO; | ||
703 | return conn_send_command(tconn, sock, P_PING, 0, NULL, 0); | ||
704 | } | ||
705 | |||
706 | int drbd_send_ping_ack(struct drbd_tconn *tconn) | ||
707 | { | ||
708 | struct drbd_socket *sock; | ||
709 | |||
710 | sock = &tconn->meta; | ||
711 | if (!conn_prepare_command(tconn, sock)) | ||
712 | return -EIO; | ||
713 | return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0); | ||
714 | } | ||
715 | |||
716 | int drbd_send_sync_param(struct drbd_conf *mdev) | ||
717 | { | ||
718 | struct drbd_socket *sock; | ||
2041 | struct p_rs_param_95 *p; | 719 | struct p_rs_param_95 *p; |
2042 | struct socket *sock; | 720 | int size; |
2043 | int size, rv; | 721 | const int apv = mdev->tconn->agreed_pro_version; |
2044 | const int apv = mdev->agreed_pro_version; | 722 | enum drbd_packet cmd; |
723 | struct net_conf *nc; | ||
724 | struct disk_conf *dc; | ||
725 | |||
726 | sock = &mdev->tconn->data; | ||
727 | p = drbd_prepare_command(mdev, sock); | ||
728 | if (!p) | ||
729 | return -EIO; | ||
730 | |||
731 | rcu_read_lock(); | ||
732 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2045 | 733 | ||
2046 | size = apv <= 87 ? sizeof(struct p_rs_param) | 734 | size = apv <= 87 ? sizeof(struct p_rs_param) |
2047 | : apv == 88 ? sizeof(struct p_rs_param) | 735 | : apv == 88 ? sizeof(struct p_rs_param) |
2048 | + strlen(mdev->sync_conf.verify_alg) + 1 | 736 | + strlen(nc->verify_alg) + 1 |
2049 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 737 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2050 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 738 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2051 | 739 | ||
2052 | /* used from admin command context and receiver/worker context. | 740 | cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; |
2053 | * to avoid kmalloc, grab the socket right here, | ||
2054 | * then use the pre-allocated sbuf there */ | ||
2055 | mutex_lock(&mdev->data.mutex); | ||
2056 | sock = mdev->data.socket; | ||
2057 | |||
2058 | if (likely(sock != NULL)) { | ||
2059 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
2060 | |||
2061 | p = &mdev->data.sbuf.rs_param_95; | ||
2062 | |||
2063 | /* initialize verify_alg and csums_alg */ | ||
2064 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | ||
2065 | |||
2066 | p->rate = cpu_to_be32(sc->rate); | ||
2067 | p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); | ||
2068 | p->c_delay_target = cpu_to_be32(sc->c_delay_target); | ||
2069 | p->c_fill_target = cpu_to_be32(sc->c_fill_target); | ||
2070 | p->c_max_rate = cpu_to_be32(sc->c_max_rate); | ||
2071 | 741 | ||
2072 | if (apv >= 88) | 742 | /* initialize verify_alg and csums_alg */ |
2073 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | 743 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); |
2074 | if (apv >= 89) | ||
2075 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | ||
2076 | 744 | ||
2077 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | 745 | if (get_ldev(mdev)) { |
2078 | } else | 746 | dc = rcu_dereference(mdev->ldev->disk_conf); |
2079 | rv = 0; /* not ok */ | 747 | p->resync_rate = cpu_to_be32(dc->resync_rate); |
748 | p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); | ||
749 | p->c_delay_target = cpu_to_be32(dc->c_delay_target); | ||
750 | p->c_fill_target = cpu_to_be32(dc->c_fill_target); | ||
751 | p->c_max_rate = cpu_to_be32(dc->c_max_rate); | ||
752 | put_ldev(mdev); | ||
753 | } else { | ||
754 | p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); | ||
755 | p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); | ||
756 | p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); | ||
757 | p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); | ||
758 | p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); | ||
759 | } | ||
2080 | 760 | ||
2081 | mutex_unlock(&mdev->data.mutex); | 761 | if (apv >= 88) |
762 | strcpy(p->verify_alg, nc->verify_alg); | ||
763 | if (apv >= 89) | ||
764 | strcpy(p->csums_alg, nc->csums_alg); | ||
765 | rcu_read_unlock(); | ||
2082 | 766 | ||
2083 | return rv; | 767 | return drbd_send_command(mdev, sock, cmd, size, NULL, 0); |
2084 | } | 768 | } |
2085 | 769 | ||
2086 | int drbd_send_protocol(struct drbd_conf *mdev) | 770 | int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd) |
2087 | { | 771 | { |
772 | struct drbd_socket *sock; | ||
2088 | struct p_protocol *p; | 773 | struct p_protocol *p; |
2089 | int size, cf, rv; | 774 | struct net_conf *nc; |
775 | int size, cf; | ||
2090 | 776 | ||
2091 | size = sizeof(struct p_protocol); | 777 | sock = &tconn->data; |
778 | p = __conn_prepare_command(tconn, sock); | ||
779 | if (!p) | ||
780 | return -EIO; | ||
2092 | 781 | ||
2093 | if (mdev->agreed_pro_version >= 87) | 782 | rcu_read_lock(); |
2094 | size += strlen(mdev->net_conf->integrity_alg) + 1; | 783 | nc = rcu_dereference(tconn->net_conf); |
2095 | 784 | ||
2096 | /* we must not recurse into our own queue, | 785 | if (nc->tentative && tconn->agreed_pro_version < 92) { |
2097 | * as that is blocked during handshake */ | 786 | rcu_read_unlock(); |
2098 | p = kmalloc(size, GFP_NOIO); | 787 | mutex_unlock(&sock->mutex); |
2099 | if (p == NULL) | 788 | conn_err(tconn, "--dry-run is not supported by peer"); |
2100 | return 0; | 789 | return -EOPNOTSUPP; |
790 | } | ||
2101 | 791 | ||
2102 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | 792 | size = sizeof(*p); |
2103 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | 793 | if (tconn->agreed_pro_version >= 87) |
2104 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | 794 | size += strlen(nc->integrity_alg) + 1; |
2105 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
2106 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
2107 | 795 | ||
796 | p->protocol = cpu_to_be32(nc->wire_protocol); | ||
797 | p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); | ||
798 | p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); | ||
799 | p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); | ||
800 | p->two_primaries = cpu_to_be32(nc->two_primaries); | ||
2108 | cf = 0; | 801 | cf = 0; |
2109 | if (mdev->net_conf->want_lose) | 802 | if (nc->discard_my_data) |
2110 | cf |= CF_WANT_LOSE; | 803 | cf |= CF_DISCARD_MY_DATA; |
2111 | if (mdev->net_conf->dry_run) { | 804 | if (nc->tentative) |
2112 | if (mdev->agreed_pro_version >= 92) | 805 | cf |= CF_DRY_RUN; |
2113 | cf |= CF_DRY_RUN; | ||
2114 | else { | ||
2115 | dev_err(DEV, "--dry-run is not supported by peer"); | ||
2116 | kfree(p); | ||
2117 | return -1; | ||
2118 | } | ||
2119 | } | ||
2120 | p->conn_flags = cpu_to_be32(cf); | 806 | p->conn_flags = cpu_to_be32(cf); |
2121 | 807 | ||
2122 | if (mdev->agreed_pro_version >= 87) | 808 | if (tconn->agreed_pro_version >= 87) |
2123 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | 809 | strcpy(p->integrity_alg, nc->integrity_alg); |
810 | rcu_read_unlock(); | ||
2124 | 811 | ||
2125 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | 812 | return __conn_send_command(tconn, sock, cmd, size, NULL, 0); |
2126 | (struct p_header80 *)p, size); | 813 | } |
2127 | kfree(p); | 814 | |
2128 | return rv; | 815 | int drbd_send_protocol(struct drbd_tconn *tconn) |
816 | { | ||
817 | int err; | ||
818 | |||
819 | mutex_lock(&tconn->data.mutex); | ||
820 | err = __drbd_send_protocol(tconn, P_PROTOCOL); | ||
821 | mutex_unlock(&tconn->data.mutex); | ||
822 | |||
823 | return err; | ||
2129 | } | 824 | } |
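__drbd_send_protocol() above also shows the other pattern that recurs throughout this patch: net_conf and disk_conf are now RCU-protected, so each sender brackets its reads with rcu_read_lock()/rcu_dereference()/rcu_read_unlock() and copies the values it needs into the packet before unlocking. A small sketch of that access pattern, using the names from this patch, for illustration only:

static u32 example_read_wire_protocol(struct drbd_tconn *tconn)
{
	struct net_conf *nc;
	u32 proto;

	rcu_read_lock();
	nc = rcu_dereference(tconn->net_conf);	/* only valid inside the read-side section */
	proto = nc->wire_protocol;		/* copy out what is needed before unlocking */
	rcu_read_unlock();
	return proto;
}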
2130 | 825 | ||
2131 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | 826 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) |
2132 | { | 827 | { |
2133 | struct p_uuids p; | 828 | struct drbd_socket *sock; |
829 | struct p_uuids *p; | ||
2134 | int i; | 830 | int i; |
2135 | 831 | ||
2136 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | 832 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) |
2137 | return 1; | 833 | return 0; |
2138 | 834 | ||
835 | sock = &mdev->tconn->data; | ||
836 | p = drbd_prepare_command(mdev, sock); | ||
837 | if (!p) { | ||
838 | put_ldev(mdev); | ||
839 | return -EIO; | ||
840 | } | ||
841 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
2139 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 842 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
2140 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | 843 | p->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
844 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
2141 | 845 | ||
2142 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | 846 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); |
2143 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | 847 | p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); |
2144 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | 848 | rcu_read_lock(); |
849 | uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0; | ||
850 | rcu_read_unlock(); | ||
2145 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | 851 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; |
2146 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | 852 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; |
2147 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | 853 | p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); |
2148 | 854 | ||
2149 | put_ldev(mdev); | 855 | put_ldev(mdev); |
2150 | 856 | return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0); | |
2151 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
2152 | (struct p_header80 *)&p, sizeof(p)); | ||
2153 | } | 857 | } |
2154 | 858 | ||
2155 | int drbd_send_uuids(struct drbd_conf *mdev) | 859 | int drbd_send_uuids(struct drbd_conf *mdev) |
@@ -2180,9 +884,10 @@ void drbd_print_uuids(struct drbd_conf *mdev, const char *text) | |||
2180 | } | 884 | } |
2181 | } | 885 | } |
2182 | 886 | ||
2183 | int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | 887 | void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) |
2184 | { | 888 | { |
2185 | struct p_rs_uuid p; | 889 | struct drbd_socket *sock; |
890 | struct p_rs_uuid *p; | ||
2186 | u64 uuid; | 891 | u64 uuid; |
2187 | 892 | ||
2188 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | 893 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); |
@@ -2195,24 +900,29 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | |||
2195 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | 900 | drbd_uuid_set(mdev, UI_BITMAP, uuid); |
2196 | drbd_print_uuids(mdev, "updated sync UUID"); | 901 | drbd_print_uuids(mdev, "updated sync UUID"); |
2197 | drbd_md_sync(mdev); | 902 | drbd_md_sync(mdev); |
2198 | p.uuid = cpu_to_be64(uuid); | ||
2199 | 903 | ||
2200 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | 904 | sock = &mdev->tconn->data; |
2201 | (struct p_header80 *)&p, sizeof(p)); | 905 | p = drbd_prepare_command(mdev, sock); |
906 | if (p) { | ||
907 | p->uuid = cpu_to_be64(uuid); | ||
908 | drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); | ||
909 | } | ||
2202 | } | 910 | } |
2203 | 911 | ||
2204 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) | 912 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) |
2205 | { | 913 | { |
2206 | struct p_sizes p; | 914 | struct drbd_socket *sock; |
915 | struct p_sizes *p; | ||
2207 | sector_t d_size, u_size; | 916 | sector_t d_size, u_size; |
2208 | int q_order_type; | 917 | int q_order_type; |
2209 | unsigned int max_bio_size; | 918 | unsigned int max_bio_size; |
2210 | int ok; | ||
2211 | 919 | ||
2212 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | 920 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { |
2213 | D_ASSERT(mdev->ldev->backing_bdev); | 921 | D_ASSERT(mdev->ldev->backing_bdev); |
2214 | d_size = drbd_get_max_capacity(mdev->ldev); | 922 | d_size = drbd_get_max_capacity(mdev->ldev); |
2215 | u_size = mdev->ldev->dc.disk_size; | 923 | rcu_read_lock(); |
924 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
925 | rcu_read_unlock(); | ||
2216 | q_order_type = drbd_queue_order_type(mdev); | 926 | q_order_type = drbd_queue_order_type(mdev); |
2217 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; | 927 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; |
2218 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); | 928 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); |
@@ -2224,20 +934,23 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2224 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ | 934 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ |
2225 | } | 935 | } |
2226 | 936 | ||
2227 | /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ | 937 | sock = &mdev->tconn->data; |
2228 | if (mdev->agreed_pro_version <= 94) | 938 | p = drbd_prepare_command(mdev, sock); |
2229 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 939 | if (!p) |
940 | return -EIO; | ||
2230 | 941 | ||
2231 | p.d_size = cpu_to_be64(d_size); | 942 | if (mdev->tconn->agreed_pro_version <= 94) |
2232 | p.u_size = cpu_to_be64(u_size); | 943 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
2233 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | 944 | else if (mdev->tconn->agreed_pro_version < 100) |
2234 | p.max_bio_size = cpu_to_be32(max_bio_size); | 945 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); |
2235 | p.queue_order_type = cpu_to_be16(q_order_type); | ||
2236 | p.dds_flags = cpu_to_be16(flags); | ||
2237 | 946 | ||
2238 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | 947 | p->d_size = cpu_to_be64(d_size); |
2239 | (struct p_header80 *)&p, sizeof(p)); | 948 | p->u_size = cpu_to_be64(u_size); |
2240 | return ok; | 949 | p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); |
950 | p->max_bio_size = cpu_to_be32(max_bio_size); | ||
951 | p->queue_order_type = cpu_to_be16(q_order_type); | ||
952 | p->dds_flags = cpu_to_be16(flags); | ||
953 | return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); | ||
2241 | } | 954 | } |
2242 | 955 | ||
2243 | /** | 956 | /** |
@@ -2246,34 +959,21 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2246 | */ | 959 | */ |
2247 | int drbd_send_current_state(struct drbd_conf *mdev) | 960 | int drbd_send_current_state(struct drbd_conf *mdev) |
2248 | { | 961 | { |
2249 | struct socket *sock; | 962 | struct drbd_socket *sock; |
2250 | struct p_state p; | 963 | struct p_state *p; |
2251 | int ok = 0; | ||
2252 | |||
2253 | /* Grab state lock so we won't send state if we're in the middle | ||
2254 | * of a cluster wide state change on another thread */ | ||
2255 | drbd_state_lock(mdev); | ||
2256 | |||
2257 | mutex_lock(&mdev->data.mutex); | ||
2258 | 964 | ||
2259 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | 965 | sock = &mdev->tconn->data; |
2260 | sock = mdev->data.socket; | 966 | p = drbd_prepare_command(mdev, sock); |
2261 | 967 | if (!p) | |
2262 | if (likely(sock != NULL)) { | 968 | return -EIO; |
2263 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | 969 | p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ |
2264 | (struct p_header80 *)&p, sizeof(p), 0); | 970 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); |
2265 | } | ||
2266 | |||
2267 | mutex_unlock(&mdev->data.mutex); | ||
2268 | |||
2269 | drbd_state_unlock(mdev); | ||
2270 | return ok; | ||
2271 | } | 971 | } |
2272 | 972 | ||
2273 | /** | 973 | /** |
2274 | * drbd_send_state() - After a state change, sends the new state to the peer | 974 | * drbd_send_state() - After a state change, sends the new state to the peer |
2275 | * @mdev: DRBD device. | 975 | * @mdev: DRBD device. |
2276 | * @state: the state to send, not necessarily the current state. | 976 | * @state: the state to send, not necessarily the current state. |
2277 | * | 977 | * |
2278 | * Each state change queues an "after_state_ch" work, which will eventually | 978 | * Each state change queues an "after_state_ch" work, which will eventually |
2279 | * send the resulting new state to the peer. If more state changes happen | 979 | * send the resulting new state to the peer. If more state changes happen |
@@ -2282,50 +982,95 @@ int drbd_send_current_state(struct drbd_conf *mdev) | |||
2282 | */ | 982 | */ |
2283 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) | 983 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) |
2284 | { | 984 | { |
2285 | struct socket *sock; | 985 | struct drbd_socket *sock; |
2286 | struct p_state p; | 986 | struct p_state *p; |
2287 | int ok = 0; | ||
2288 | 987 | ||
2289 | mutex_lock(&mdev->data.mutex); | 988 | sock = &mdev->tconn->data; |
989 | p = drbd_prepare_command(mdev, sock); | ||
990 | if (!p) | ||
991 | return -EIO; | ||
992 | p->state = cpu_to_be32(state.i); /* Within the send mutex */ | ||
993 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); | ||
994 | } | ||
2290 | 995 | ||
2291 | p.state = cpu_to_be32(state.i); | 996 | int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) |
2292 | sock = mdev->data.socket; | 997 | { |
998 | struct drbd_socket *sock; | ||
999 | struct p_req_state *p; | ||
2293 | 1000 | ||
2294 | if (likely(sock != NULL)) { | 1001 | sock = &mdev->tconn->data; |
2295 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | 1002 | p = drbd_prepare_command(mdev, sock); |
2296 | (struct p_header80 *)&p, sizeof(p), 0); | 1003 | if (!p) |
2297 | } | 1004 | return -EIO; |
1005 | p->mask = cpu_to_be32(mask.i); | ||
1006 | p->val = cpu_to_be32(val.i); | ||
1007 | return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); | ||
1008 | } | ||
2298 | 1009 | ||
2299 | mutex_unlock(&mdev->data.mutex); | 1010 | int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) |
1011 | { | ||
1012 | enum drbd_packet cmd; | ||
1013 | struct drbd_socket *sock; | ||
1014 | struct p_req_state *p; | ||
2300 | 1015 | ||
2301 | return ok; | 1016 | cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; |
1017 | sock = &tconn->data; | ||
1018 | p = conn_prepare_command(tconn, sock); | ||
1019 | if (!p) | ||
1020 | return -EIO; | ||
1021 | p->mask = cpu_to_be32(mask.i); | ||
1022 | p->val = cpu_to_be32(val.i); | ||
1023 | return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
2302 | } | 1024 | } |
2303 | 1025 | ||
2304 | int drbd_send_state_req(struct drbd_conf *mdev, | 1026 | void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) |
2305 | union drbd_state mask, union drbd_state val) | ||
2306 | { | 1027 | { |
2307 | struct p_req_state p; | 1028 | struct drbd_socket *sock; |
1029 | struct p_req_state_reply *p; | ||
1030 | |||
1031 | sock = &mdev->tconn->meta; | ||
1032 | p = drbd_prepare_command(mdev, sock); | ||
1033 | if (p) { | ||
1034 | p->retcode = cpu_to_be32(retcode); | ||
1035 | drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); | ||
1036 | } | ||
1037 | } | ||
2308 | 1038 | ||
2309 | p.mask = cpu_to_be32(mask.i); | 1039 | void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode) |
2310 | p.val = cpu_to_be32(val.i); | 1040 | { |
1041 | struct drbd_socket *sock; | ||
1042 | struct p_req_state_reply *p; | ||
1043 | enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; | ||
2311 | 1044 | ||
2312 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | 1045 | sock = &tconn->meta; |
2313 | (struct p_header80 *)&p, sizeof(p)); | 1046 | p = conn_prepare_command(tconn, sock); |
1047 | if (p) { | ||
1048 | p->retcode = cpu_to_be32(retcode); | ||
1049 | conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
1050 | } | ||
2314 | } | 1051 | } |
2315 | 1052 | ||
2316 | int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) | 1053 | static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) |
2317 | { | 1054 | { |
2318 | struct p_req_state_reply p; | 1055 | BUG_ON(code & ~0xf); |
1056 | p->encoding = (p->encoding & ~0xf) | code; | ||
1057 | } | ||
2319 | 1058 | ||
2320 | p.retcode = cpu_to_be32(retcode); | 1059 | static void dcbp_set_start(struct p_compressed_bm *p, int set) |
1060 | { | ||
1061 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
1062 | } | ||
2321 | 1063 | ||
2322 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | 1064 | static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) |
2323 | (struct p_header80 *)&p, sizeof(p)); | 1065 | { |
1066 | BUG_ON(n & ~0x7); | ||
1067 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
2324 | } | 1068 | } |
2325 | 1069 | ||
2326 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, | 1070 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, |
2327 | struct p_compressed_bm *p, | 1071 | struct p_compressed_bm *p, |
2328 | struct bm_xfer_ctx *c) | 1072 | unsigned int size, |
1073 | struct bm_xfer_ctx *c) | ||
2329 | { | 1074 | { |
2330 | struct bitstream bs; | 1075 | struct bitstream bs; |
2331 | unsigned long plain_bits; | 1076 | unsigned long plain_bits; |
@@ -2333,19 +1078,21 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2333 | unsigned long rl; | 1078 | unsigned long rl; |
2334 | unsigned len; | 1079 | unsigned len; |
2335 | unsigned toggle; | 1080 | unsigned toggle; |
2336 | int bits; | 1081 | int bits, use_rle; |
2337 | 1082 | ||
2338 | /* may we use this feature? */ | 1083 | /* may we use this feature? */ |
2339 | if ((mdev->sync_conf.use_rle == 0) || | 1084 | rcu_read_lock(); |
2340 | (mdev->agreed_pro_version < 90)) | 1085 | use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle; |
2341 | return 0; | 1086 | rcu_read_unlock(); |
1087 | if (!use_rle || mdev->tconn->agreed_pro_version < 90) | ||
1088 | return 0; | ||
2342 | 1089 | ||
2343 | if (c->bit_offset >= c->bm_bits) | 1090 | if (c->bit_offset >= c->bm_bits) |
2344 | return 0; /* nothing to do. */ | 1091 | return 0; /* nothing to do. */ |
2345 | 1092 | ||
2346 | /* use at most this many bytes */ | 1093 | /* use at most this many bytes */ |
2347 | bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); | 1094 | bitstream_init(&bs, p->code, size, 0); |
2348 | memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); | 1095 | memset(p->code, 0, size); |
2349 | /* plain bits covered in this code string */ | 1096 | /* plain bits covered in this code string */ |
2350 | plain_bits = 0; | 1097 | plain_bits = 0; |
2351 | 1098 | ||
@@ -2367,12 +1114,12 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2367 | if (rl == 0) { | 1114 | if (rl == 0) { |
2368 | /* the first checked bit was set, | 1115 | /* the first checked bit was set, |
2369 | * store start value, */ | 1116 | * store start value, */ |
2370 | DCBP_set_start(p, 1); | 1117 | dcbp_set_start(p, 1); |
2371 | /* but skip encoding of zero run length */ | 1118 | /* but skip encoding of zero run length */ |
2372 | toggle = !toggle; | 1119 | toggle = !toggle; |
2373 | continue; | 1120 | continue; |
2374 | } | 1121 | } |
2375 | DCBP_set_start(p, 0); | 1122 | dcbp_set_start(p, 0); |
2376 | } | 1123 | } |
2377 | 1124 | ||
2378 | /* paranoia: catch zero runlength. | 1125 | /* paranoia: catch zero runlength. |
@@ -2412,7 +1159,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2412 | bm_xfer_ctx_bit_to_word_offset(c); | 1159 | bm_xfer_ctx_bit_to_word_offset(c); |
2413 | 1160 | ||
2414 | /* store pad_bits */ | 1161 | /* store pad_bits */ |
2415 | DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); | 1162 | dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); |
2416 | 1163 | ||
2417 | return len; | 1164 | return len; |
2418 | } | 1165 | } |
@@ -2424,48 +1171,52 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2424 | * code upon failure. | 1171 | * code upon failure. |
2425 | */ | 1172 | */ |
2426 | static int | 1173 | static int |
2427 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, | 1174 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) |
2428 | struct p_header80 *h, struct bm_xfer_ctx *c) | ||
2429 | { | 1175 | { |
2430 | struct p_compressed_bm *p = (void*)h; | 1176 | struct drbd_socket *sock = &mdev->tconn->data; |
2431 | unsigned long num_words; | 1177 | unsigned int header_size = drbd_header_size(mdev->tconn); |
2432 | int len; | 1178 | struct p_compressed_bm *p = sock->sbuf + header_size; |
2433 | int ok; | 1179 | int len, err; |
2434 | |||
2435 | len = fill_bitmap_rle_bits(mdev, p, c); | ||
2436 | 1180 | ||
1181 | len = fill_bitmap_rle_bits(mdev, p, | ||
1182 | DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); | ||
2437 | if (len < 0) | 1183 | if (len < 0) |
2438 | return -EIO; | 1184 | return -EIO; |
2439 | 1185 | ||
2440 | if (len) { | 1186 | if (len) { |
2441 | DCBP_set_code(p, RLE_VLI_Bits); | 1187 | dcbp_set_code(p, RLE_VLI_Bits); |
2442 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, | 1188 | err = __send_command(mdev->tconn, mdev->vnr, sock, |
2443 | sizeof(*p) + len, 0); | 1189 | P_COMPRESSED_BITMAP, sizeof(*p) + len, |
2444 | 1190 | NULL, 0); | |
2445 | c->packets[0]++; | 1191 | c->packets[0]++; |
2446 | c->bytes[0] += sizeof(*p) + len; | 1192 | c->bytes[0] += header_size + sizeof(*p) + len; |
2447 | 1193 | ||
2448 | if (c->bit_offset >= c->bm_bits) | 1194 | if (c->bit_offset >= c->bm_bits) |
2449 | len = 0; /* DONE */ | 1195 | len = 0; /* DONE */ |
2450 | } else { | 1196 | } else { |
2451 | /* was not compressible. | 1197 | /* was not compressible. |
2452 | * send a buffer full of plain text bits instead. */ | 1198 | * send a buffer full of plain text bits instead. */ |
2453 | num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 1199 | unsigned int data_size; |
2454 | len = num_words * sizeof(long); | 1200 | unsigned long num_words; |
1201 | unsigned long *p = sock->sbuf + header_size; | ||
1202 | |||
1203 | data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; | ||
1204 | num_words = min_t(size_t, data_size / sizeof(*p), | ||
1205 | c->bm_words - c->word_offset); | ||
1206 | len = num_words * sizeof(*p); | ||
2455 | if (len) | 1207 | if (len) |
2456 | drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); | 1208 | drbd_bm_get_lel(mdev, c->word_offset, num_words, p); |
2457 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, | 1209 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0); |
2458 | h, sizeof(struct p_header80) + len, 0); | ||
2459 | c->word_offset += num_words; | 1210 | c->word_offset += num_words; |
2460 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 1211 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
2461 | 1212 | ||
2462 | c->packets[1]++; | 1213 | c->packets[1]++; |
2463 | c->bytes[1] += sizeof(struct p_header80) + len; | 1214 | c->bytes[1] += header_size + len; |
2464 | 1215 | ||
2465 | if (c->bit_offset > c->bm_bits) | 1216 | if (c->bit_offset > c->bm_bits) |
2466 | c->bit_offset = c->bm_bits; | 1217 | c->bit_offset = c->bm_bits; |
2467 | } | 1218 | } |
2468 | if (ok) { | 1219 | if (!err) { |
2469 | if (len == 0) { | 1220 | if (len == 0) { |
2470 | INFO_bm_xfer_stats(mdev, "send", c); | 1221 | INFO_bm_xfer_stats(mdev, "send", c); |
2471 | return 0; | 1222 | return 0; |
@@ -2476,21 +1227,13 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, | |||
2476 | } | 1227 | } |
2477 | 1228 | ||
2478 | /* See the comment at receive_bitmap() */ | 1229 | /* See the comment at receive_bitmap() */ |
2479 | int _drbd_send_bitmap(struct drbd_conf *mdev) | 1230 | static int _drbd_send_bitmap(struct drbd_conf *mdev) |
2480 | { | 1231 | { |
2481 | struct bm_xfer_ctx c; | 1232 | struct bm_xfer_ctx c; |
2482 | struct p_header80 *p; | ||
2483 | int err; | 1233 | int err; |
2484 | 1234 | ||
2485 | ERR_IF(!mdev->bitmap) return false; | 1235 | if (!expect(mdev->bitmap)) |
2486 | |||
2487 | /* maybe we should use some per thread scratch page, | ||
2488 | * and allocate that during initial device creation? */ | ||
2489 | p = (struct p_header80 *) __get_free_page(GFP_NOIO); | ||
2490 | if (!p) { | ||
2491 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
2492 | return false; | 1236 | return false; |
2493 | } | ||
2494 | 1237 | ||
2495 | if (get_ldev(mdev)) { | 1238 | if (get_ldev(mdev)) { |
2496 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1239 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { |
@@ -2515,37 +1258,39 @@ int _drbd_send_bitmap(struct drbd_conf *mdev) | |||
2515 | }; | 1258 | }; |
2516 | 1259 | ||
2517 | do { | 1260 | do { |
2518 | err = send_bitmap_rle_or_plain(mdev, p, &c); | 1261 | err = send_bitmap_rle_or_plain(mdev, &c); |
2519 | } while (err > 0); | 1262 | } while (err > 0); |
2520 | 1263 | ||
2521 | free_page((unsigned long) p); | ||
2522 | return err == 0; | 1264 | return err == 0; |
2523 | } | 1265 | } |
2524 | 1266 | ||
2525 | int drbd_send_bitmap(struct drbd_conf *mdev) | 1267 | int drbd_send_bitmap(struct drbd_conf *mdev) |
2526 | { | 1268 | { |
2527 | int err; | 1269 | struct drbd_socket *sock = &mdev->tconn->data; |
1270 | int err = -1; | ||
2528 | 1271 | ||
2529 | if (!drbd_get_data_sock(mdev)) | 1272 | mutex_lock(&sock->mutex); |
2530 | return -1; | 1273 | if (sock->socket) |
2531 | err = !_drbd_send_bitmap(mdev); | 1274 | err = !_drbd_send_bitmap(mdev); |
2532 | drbd_put_data_sock(mdev); | 1275 | mutex_unlock(&sock->mutex); |
2533 | return err; | 1276 | return err; |
2534 | } | 1277 | } |
2535 | 1278 | ||
2536 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | 1279 | void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size) |
2537 | { | 1280 | { |
2538 | int ok; | 1281 | struct drbd_socket *sock; |
2539 | struct p_barrier_ack p; | 1282 | struct p_barrier_ack *p; |
2540 | 1283 | ||
2541 | p.barrier = barrier_nr; | 1284 | if (tconn->cstate < C_WF_REPORT_PARAMS) |
2542 | p.set_size = cpu_to_be32(set_size); | 1285 | return; |
2543 | 1286 | ||
2544 | if (mdev->state.conn < C_CONNECTED) | 1287 | sock = &tconn->meta; |
2545 | return false; | 1288 | p = conn_prepare_command(tconn, sock); |
2546 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | 1289 | if (!p) |
2547 | (struct p_header80 *)&p, sizeof(p)); | 1290 | return; |
2548 | return ok; | 1291 | p->barrier = barrier_nr; |
1292 | p->set_size = cpu_to_be32(set_size); | ||
1293 | conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); | ||
2549 | } | 1294 | } |
2550 | 1295 | ||
2551 | /** | 1296 | /** |
@@ -2556,62 +1301,62 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | |||
2556 | * @blksize: size in bytes, needs to be in big endian byte order | 1301 | * @blksize: size in bytes, needs to be in big endian byte order |
2557 | * @block_id: Id, big endian byte order | 1302 | * @block_id: Id, big endian byte order |
2558 | */ | 1303 | */ |
2559 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | 1304 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2560 | u64 sector, | 1305 | u64 sector, u32 blksize, u64 block_id) |
2561 | u32 blksize, | ||
2562 | u64 block_id) | ||
2563 | { | 1306 | { |
2564 | int ok; | 1307 | struct drbd_socket *sock; |
2565 | struct p_block_ack p; | 1308 | struct p_block_ack *p; |
2566 | 1309 | ||
2567 | p.sector = sector; | 1310 | if (mdev->state.conn < C_CONNECTED) |
2568 | p.block_id = block_id; | 1311 | return -EIO; |
2569 | p.blksize = blksize; | ||
2570 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2571 | 1312 | ||
2572 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | 1313 | sock = &mdev->tconn->meta; |
2573 | return false; | 1314 | p = drbd_prepare_command(mdev, sock); |
2574 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | 1315 | if (!p) |
2575 | (struct p_header80 *)&p, sizeof(p)); | 1316 | return -EIO; |
2576 | return ok; | 1317 | p->sector = sector; |
1318 | p->block_id = block_id; | ||
1319 | p->blksize = blksize; | ||
1320 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
1321 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2577 | } | 1322 | } |
2578 | 1323 | ||
2579 | /* dp->sector and dp->block_id already/still in network byte order, | 1324 | /* dp->sector and dp->block_id already/still in network byte order, |
2580 | * data_size is payload size according to dp->head, | 1325 | * data_size is payload size according to dp->head, |
2581 | * and may need to be corrected for digest size. */ | 1326 | * and may need to be corrected for digest size. */ |
2582 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1327 | void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2583 | struct p_data *dp, int data_size) | 1328 | struct p_data *dp, int data_size) |
2584 | { | 1329 | { |
2585 | data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1330 | if (mdev->tconn->peer_integrity_tfm) |
2586 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1331 | data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
2587 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | 1332 | _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), |
2588 | dp->block_id); | 1333 | dp->block_id); |
2589 | } | 1334 | } |
2590 | 1335 | ||
2591 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1336 | void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2592 | struct p_block_req *rp) | 1337 | struct p_block_req *rp) |
2593 | { | 1338 | { |
2594 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | 1339 | _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); |
2595 | } | 1340 | } |
2596 | 1341 | ||
2597 | /** | 1342 | /** |
2598 | * drbd_send_ack() - Sends an ack packet | 1343 | * drbd_send_ack() - Sends an ack packet |
2599 | * @mdev: DRBD device. | 1344 | * @mdev: DRBD device |
2600 | * @cmd: Packet command code. | 1345 | * @cmd: packet command code |
2601 | * @e: Epoch entry. | 1346 | * @peer_req: peer request |
2602 | */ | 1347 | */ |
2603 | int drbd_send_ack(struct drbd_conf *mdev, | 1348 | int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2604 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | 1349 | struct drbd_peer_request *peer_req) |
2605 | { | 1350 | { |
2606 | return _drbd_send_ack(mdev, cmd, | 1351 | return _drbd_send_ack(mdev, cmd, |
2607 | cpu_to_be64(e->sector), | 1352 | cpu_to_be64(peer_req->i.sector), |
2608 | cpu_to_be32(e->size), | 1353 | cpu_to_be32(peer_req->i.size), |
2609 | e->block_id); | 1354 | peer_req->block_id); |
2610 | } | 1355 | } |
2611 | 1356 | ||
2612 | /* This function misuses the block_id field to signal if the blocks | 1357 | /* This function misuses the block_id field to signal if the blocks |
2613 | * are in sync or not. */ | 1358 | * are in sync or not. */ |
2614 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | 1359 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
2615 | sector_t sector, int blksize, u64 block_id) | 1360 | sector_t sector, int blksize, u64 block_id) |
2616 | { | 1361 | { |
2617 | return _drbd_send_ack(mdev, cmd, | 1362 | return _drbd_send_ack(mdev, cmd, |
@@ -2623,85 +1368,87 @@ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | |||
2623 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1368 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
2624 | sector_t sector, int size, u64 block_id) | 1369 | sector_t sector, int size, u64 block_id) |
2625 | { | 1370 | { |
2626 | int ok; | 1371 | struct drbd_socket *sock; |
2627 | struct p_block_req p; | 1372 | struct p_block_req *p; |
2628 | 1373 | ||
2629 | p.sector = cpu_to_be64(sector); | 1374 | sock = &mdev->tconn->data; |
2630 | p.block_id = block_id; | 1375 | p = drbd_prepare_command(mdev, sock); |
2631 | p.blksize = cpu_to_be32(size); | 1376 | if (!p) |
2632 | 1377 | return -EIO; | |
2633 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | 1378 | p->sector = cpu_to_be64(sector); |
2634 | (struct p_header80 *)&p, sizeof(p)); | 1379 | p->block_id = block_id; |
2635 | return ok; | 1380 | p->blksize = cpu_to_be32(size); |
1381 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2636 | } | 1382 | } |
2637 | 1383 | ||
2638 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1384 | int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size, |
2639 | sector_t sector, int size, | 1385 | void *digest, int digest_size, enum drbd_packet cmd) |
2640 | void *digest, int digest_size, | ||
2641 | enum drbd_packets cmd) | ||
2642 | { | 1386 | { |
2643 | int ok; | 1387 | struct drbd_socket *sock; |
2644 | struct p_block_req p; | 1388 | struct p_block_req *p; |
2645 | |||
2646 | p.sector = cpu_to_be64(sector); | ||
2647 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2648 | p.blksize = cpu_to_be32(size); | ||
2649 | 1389 | ||
2650 | p.head.magic = BE_DRBD_MAGIC; | 1390 | /* FIXME: Put the digest into the preallocated socket buffer. */ |
2651 | p.head.command = cpu_to_be16(cmd); | ||
2652 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); | ||
2653 | 1391 | ||
2654 | mutex_lock(&mdev->data.mutex); | 1392 | sock = &mdev->tconn->data; |
2655 | 1393 | p = drbd_prepare_command(mdev, sock); | |
2656 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | 1394 | if (!p) |
2657 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | 1395 | return -EIO; |
2658 | 1396 | p->sector = cpu_to_be64(sector); | |
2659 | mutex_unlock(&mdev->data.mutex); | 1397 | p->block_id = ID_SYNCER /* unused */; |
2660 | 1398 | p->blksize = cpu_to_be32(size); | |
2661 | return ok; | 1399 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), |
1400 | digest, digest_size); | ||
2662 | } | 1401 | } |
2663 | 1402 | ||
2664 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | 1403 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) |
2665 | { | 1404 | { |
2666 | int ok; | 1405 | struct drbd_socket *sock; |
2667 | struct p_block_req p; | 1406 | struct p_block_req *p; |
2668 | |||
2669 | p.sector = cpu_to_be64(sector); | ||
2670 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | ||
2671 | p.blksize = cpu_to_be32(size); | ||
2672 | 1407 | ||
2673 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | 1408 | sock = &mdev->tconn->data; |
2674 | (struct p_header80 *)&p, sizeof(p)); | 1409 | p = drbd_prepare_command(mdev, sock); |
2675 | return ok; | 1410 | if (!p) |
1411 | return -EIO; | ||
1412 | p->sector = cpu_to_be64(sector); | ||
1413 | p->block_id = ID_SYNCER /* unused */; | ||
1414 | p->blksize = cpu_to_be32(size); | ||
1415 | return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); | ||
2676 | } | 1416 | } |
2677 | 1417 | ||
2678 | /* called on sndtimeo | 1418 | /* called on sndtimeo |
2679 | * returns false if we should retry, | 1419 | * returns false if we should retry, |
2680 | * true if we think connection is dead | 1420 | * true if we think connection is dead |
2681 | */ | 1421 | */ |
2682 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | 1422 | static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock) |
2683 | { | 1423 | { |
2684 | int drop_it; | 1424 | int drop_it; |
2685 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | 1425 | /* long elapsed = (long)(jiffies - mdev->last_received); */ |
2686 | 1426 | ||
2687 | drop_it = mdev->meta.socket == sock | 1427 | drop_it = tconn->meta.socket == sock |
2688 | || !mdev->asender.task | 1428 | || !tconn->asender.task |
2689 | || get_t_state(&mdev->asender) != Running | 1429 | || get_t_state(&tconn->asender) != RUNNING |
2690 | || mdev->state.conn < C_CONNECTED; | 1430 | || tconn->cstate < C_WF_REPORT_PARAMS; |
2691 | 1431 | ||
2692 | if (drop_it) | 1432 | if (drop_it) |
2693 | return true; | 1433 | return true; |
2694 | 1434 | ||
2695 | drop_it = !--mdev->ko_count; | 1435 | drop_it = !--tconn->ko_count; |
2696 | if (!drop_it) { | 1436 | if (!drop_it) { |
2697 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | 1437 | conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n", |
2698 | current->comm, current->pid, mdev->ko_count); | 1438 | current->comm, current->pid, tconn->ko_count); |
2699 | request_ping(mdev); | 1439 | request_ping(tconn); |
2700 | } | 1440 | } |
2701 | 1441 | ||
2702 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | 1442 | return drop_it; /* && (mdev->state == R_PRIMARY) */; |
2703 | } | 1443 | } |
2704 | 1444 | ||
1445 | static void drbd_update_congested(struct drbd_tconn *tconn) | ||
1446 | { | ||
1447 | struct sock *sk = tconn->data.socket->sk; | ||
1448 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
1449 | set_bit(NET_CONGESTED, &tconn->flags); | ||
1450 | } | ||
1451 | |||
2705 | /* The idea of sendpage seems to be to put some kind of reference | 1452 | /* The idea of sendpage seems to be to put some kind of reference |
2706 | * to the page into the skb, and to hand it over to the NIC. In | 1453 | * to the page into the skb, and to hand it over to the NIC. In |
2707 | * this process get_page() gets called. | 1454 | * this process get_page() gets called. |
@@ -2724,21 +1471,28 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket * | |||
2724 | * with page_count == 0 or PageSlab. | 1471 | * with page_count == 0 or PageSlab. |
2725 | */ | 1472 | */ |
2726 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | 1473 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, |
2727 | int offset, size_t size, unsigned msg_flags) | 1474 | int offset, size_t size, unsigned msg_flags) |
2728 | { | 1475 | { |
2729 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); | 1476 | struct socket *socket; |
1477 | void *addr; | ||
1478 | int err; | ||
1479 | |||
1480 | socket = mdev->tconn->data.socket; | ||
1481 | addr = kmap(page) + offset; | ||
1482 | err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags); | ||
2730 | kunmap(page); | 1483 | kunmap(page); |
2731 | if (sent == size) | 1484 | if (!err) |
2732 | mdev->send_cnt += size>>9; | 1485 | mdev->send_cnt += size >> 9; |
2733 | return sent == size; | 1486 | return err; |
2734 | } | 1487 | } |
2735 | 1488 | ||
2736 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | 1489 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, |
2737 | int offset, size_t size, unsigned msg_flags) | 1490 | int offset, size_t size, unsigned msg_flags) |
2738 | { | 1491 | { |
1492 | struct socket *socket = mdev->tconn->data.socket; | ||
2739 | mm_segment_t oldfs = get_fs(); | 1493 | mm_segment_t oldfs = get_fs(); |
2740 | int sent, ok; | ||
2741 | int len = size; | 1494 | int len = size; |
1495 | int err = -EIO; | ||
2742 | 1496 | ||
2743 | /* e.g. XFS meta- & log-data is in slab pages, which have a | 1497 | /* e.g. XFS meta- & log-data is in slab pages, which have a |
2744 | * page_count of 0 and/or have PageSlab() set. | 1498 | * page_count of 0 and/or have PageSlab() set. |
@@ -2750,34 +1504,35 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | |||
2750 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); | 1504 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); |
2751 | 1505 | ||
2752 | msg_flags |= MSG_NOSIGNAL; | 1506 | msg_flags |= MSG_NOSIGNAL; |
2753 | drbd_update_congested(mdev); | 1507 | drbd_update_congested(mdev->tconn); |
2754 | set_fs(KERNEL_DS); | 1508 | set_fs(KERNEL_DS); |
2755 | do { | 1509 | do { |
2756 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | 1510 | int sent; |
2757 | offset, len, | 1511 | |
2758 | msg_flags); | 1512 | sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); |
2759 | if (sent == -EAGAIN) { | ||
2760 | if (we_should_drop_the_connection(mdev, | ||
2761 | mdev->data.socket)) | ||
2762 | break; | ||
2763 | else | ||
2764 | continue; | ||
2765 | } | ||
2766 | if (sent <= 0) { | 1513 | if (sent <= 0) { |
1514 | if (sent == -EAGAIN) { | ||
1515 | if (we_should_drop_the_connection(mdev->tconn, socket)) | ||
1516 | break; | ||
1517 | continue; | ||
1518 | } | ||
2767 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | 1519 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", |
2768 | __func__, (int)size, len, sent); | 1520 | __func__, (int)size, len, sent); |
1521 | if (sent < 0) | ||
1522 | err = sent; | ||
2769 | break; | 1523 | break; |
2770 | } | 1524 | } |
2771 | len -= sent; | 1525 | len -= sent; |
2772 | offset += sent; | 1526 | offset += sent; |
2773 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | 1527 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); |
2774 | set_fs(oldfs); | 1528 | set_fs(oldfs); |
2775 | clear_bit(NET_CONGESTED, &mdev->flags); | 1529 | clear_bit(NET_CONGESTED, &mdev->tconn->flags); |
2776 | 1530 | ||
2777 | ok = (len == 0); | 1531 | if (len == 0) { |
2778 | if (likely(ok)) | 1532 | err = 0; |
2779 | mdev->send_cnt += size>>9; | 1533 | mdev->send_cnt += size >> 9; |
2780 | return ok; | 1534 | } |
1535 | return err; | ||
2781 | } | 1536 | } |
2782 | 1537 | ||
2783 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | 1538 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2786,12 +1541,15 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2786 | int i; | 1541 | int i; |
2787 | /* hint all but last page with MSG_MORE */ | 1542 | /* hint all but last page with MSG_MORE */ |
2788 | bio_for_each_segment(bvec, bio, i) { | 1543 | bio_for_each_segment(bvec, bio, i) { |
2789 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | 1544 | int err; |
2790 | bvec->bv_offset, bvec->bv_len, | 1545 | |
2791 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1546 | err = _drbd_no_send_page(mdev, bvec->bv_page, |
2792 | return 0; | 1547 | bvec->bv_offset, bvec->bv_len, |
1548 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1549 | if (err) | ||
1550 | return err; | ||
2793 | } | 1551 | } |
2794 | return 1; | 1552 | return 0; |
2795 | } | 1553 | } |
2796 | 1554 | ||
2797 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | 1555 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2800,32 +1558,40 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2800 | int i; | 1558 | int i; |
2801 | /* hint all but last page with MSG_MORE */ | 1559 | /* hint all but last page with MSG_MORE */ |
2802 | bio_for_each_segment(bvec, bio, i) { | 1560 | bio_for_each_segment(bvec, bio, i) { |
2803 | if (!_drbd_send_page(mdev, bvec->bv_page, | 1561 | int err; |
2804 | bvec->bv_offset, bvec->bv_len, | 1562 | |
2805 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1563 | err = _drbd_send_page(mdev, bvec->bv_page, |
2806 | return 0; | 1564 | bvec->bv_offset, bvec->bv_len, |
1565 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1566 | if (err) | ||
1567 | return err; | ||
2807 | } | 1568 | } |
2808 | return 1; | 1569 | return 0; |
2809 | } | 1570 | } |
2810 | 1571 | ||
2811 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 1572 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, |
1573 | struct drbd_peer_request *peer_req) | ||
2812 | { | 1574 | { |
2813 | struct page *page = e->pages; | 1575 | struct page *page = peer_req->pages; |
2814 | unsigned len = e->size; | 1576 | unsigned len = peer_req->i.size; |
1577 | int err; | ||
1578 | |||
2815 | /* hint all but last page with MSG_MORE */ | 1579 | /* hint all but last page with MSG_MORE */ |
2816 | page_chain_for_each(page) { | 1580 | page_chain_for_each(page) { |
2817 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | 1581 | unsigned l = min_t(unsigned, len, PAGE_SIZE); |
2818 | if (!_drbd_send_page(mdev, page, 0, l, | 1582 | |
2819 | page_chain_next(page) ? MSG_MORE : 0)) | 1583 | err = _drbd_send_page(mdev, page, 0, l, |
2820 | return 0; | 1584 | page_chain_next(page) ? MSG_MORE : 0); |
1585 | if (err) | ||
1586 | return err; | ||
2821 | len -= l; | 1587 | len -= l; |
2822 | } | 1588 | } |
2823 | return 1; | 1589 | return 0; |
2824 | } | 1590 | } |
2825 | 1591 | ||
2826 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | 1592 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) |
2827 | { | 1593 | { |
2828 | if (mdev->agreed_pro_version >= 95) | 1594 | if (mdev->tconn->agreed_pro_version >= 95) |
2829 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | | 1595 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | |
2830 | (bi_rw & REQ_FUA ? DP_FUA : 0) | | 1596 | (bi_rw & REQ_FUA ? DP_FUA : 0) | |
2831 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | | 1597 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | |
@@ -2839,50 +1605,36 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | |||
2839 | */ | 1605 | */ |
2840 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | 1606 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) |
2841 | { | 1607 | { |
2842 | int ok = 1; | 1608 | struct drbd_socket *sock; |
2843 | struct p_data p; | 1609 | struct p_data *p; |
2844 | unsigned int dp_flags = 0; | 1610 | unsigned int dp_flags = 0; |
2845 | void *dgb; | ||
2846 | int dgs; | 1611 | int dgs; |
1612 | int err; | ||
2847 | 1613 | ||
2848 | if (!drbd_get_data_sock(mdev)) | 1614 | sock = &mdev->tconn->data; |
2849 | return 0; | 1615 | p = drbd_prepare_command(mdev, sock); |
2850 | 1616 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; | |
2851 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2852 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2853 | |||
2854 | if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { | ||
2855 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2856 | p.head.h80.command = cpu_to_be16(P_DATA); | ||
2857 | p.head.h80.length = | ||
2858 | cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2859 | } else { | ||
2860 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2861 | p.head.h95.command = cpu_to_be16(P_DATA); | ||
2862 | p.head.h95.length = | ||
2863 | cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2864 | } | ||
2865 | |||
2866 | p.sector = cpu_to_be64(req->sector); | ||
2867 | p.block_id = (unsigned long)req; | ||
2868 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2869 | 1617 | ||
1618 | if (!p) | ||
1619 | return -EIO; | ||
1620 | p->sector = cpu_to_be64(req->i.sector); | ||
1621 | p->block_id = (unsigned long)req; | ||
1622 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
2870 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); | 1623 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); |
2871 | |||
2872 | if (mdev->state.conn >= C_SYNC_SOURCE && | 1624 | if (mdev->state.conn >= C_SYNC_SOURCE && |
2873 | mdev->state.conn <= C_PAUSED_SYNC_T) | 1625 | mdev->state.conn <= C_PAUSED_SYNC_T) |
2874 | dp_flags |= DP_MAY_SET_IN_SYNC; | 1626 | dp_flags |= DP_MAY_SET_IN_SYNC; |
2875 | 1627 | if (mdev->tconn->agreed_pro_version >= 100) { | |
2876 | p.dp_flags = cpu_to_be32(dp_flags); | 1628 | if (req->rq_state & RQ_EXP_RECEIVE_ACK) |
2877 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 1629 | dp_flags |= DP_SEND_RECEIVE_ACK; |
2878 | ok = (sizeof(p) == | 1630 | if (req->rq_state & RQ_EXP_WRITE_ACK) |
2879 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); | 1631 | dp_flags |= DP_SEND_WRITE_ACK; |
2880 | if (ok && dgs) { | 1632 | } |
2881 | dgb = mdev->int_dig_out; | 1633 | p->dp_flags = cpu_to_be32(dp_flags); |
2882 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | 1634 | if (dgs) |
2883 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | 1635 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1); |
2884 | } | 1636 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); |
2885 | if (ok) { | 1637 | if (!err) { |
2886 | /* For protocol A, we have to memcpy the payload into | 1638 | /* For protocol A, we have to memcpy the payload into |
2887 | * socket buffers, as we may complete right away | 1639 | * socket buffers, as we may complete right away |
2888 | * as soon as we handed it over to tcp, at which point the data | 1640 | * as soon as we handed it over to tcp, at which point the data |
@@ -2894,92 +1646,76 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | |||
2894 | * out ok after sending on this side, but does not fit on the | 1646 | * out ok after sending on this side, but does not fit on the |
2895 | * receiving side, we sure have detected corruption elsewhere. | 1647 | * receiving side, we sure have detected corruption elsewhere. |
2896 | */ | 1648 | */ |
2897 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs) | 1649 | if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) |
2898 | ok = _drbd_send_bio(mdev, req->master_bio); | 1650 | err = _drbd_send_bio(mdev, req->master_bio); |
2899 | else | 1651 | else |
2900 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | 1652 | err = _drbd_send_zc_bio(mdev, req->master_bio); |
2901 | 1653 | ||
2902 | /* double check digest, sometimes buffers have been modified in flight. */ | 1654 | /* double check digest, sometimes buffers have been modified in flight. */ |
2903 | if (dgs > 0 && dgs <= 64) { | 1655 | if (dgs > 0 && dgs <= 64) { |
2904 | /* 64 byte, 512 bit, is the largest digest size | 1656 | /* 64 byte, 512 bit, is the largest digest size |
2905 | * currently supported in kernel crypto. */ | 1657 | * currently supported in kernel crypto. */ |
2906 | unsigned char digest[64]; | 1658 | unsigned char digest[64]; |
2907 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); | 1659 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest); |
2908 | if (memcmp(mdev->int_dig_out, digest, dgs)) { | 1660 | if (memcmp(p + 1, digest, dgs)) { |
2909 | dev_warn(DEV, | 1661 | dev_warn(DEV, |
2910 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", | 1662 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", |
2911 | (unsigned long long)req->sector, req->size); | 1663 | (unsigned long long)req->i.sector, req->i.size); |
2912 | } | 1664 | } |
2913 | } /* else if (dgs > 64) { | 1665 | } /* else if (dgs > 64) { |
2914 | ... Be noisy about digest too large ... | 1666 | ... Be noisy about digest too large ... |
2915 | } */ | 1667 | } */ |
2916 | } | 1668 | } |
1669 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ | ||
2917 | 1670 | ||
2918 | drbd_put_data_sock(mdev); | 1671 | return err; |
2919 | |||
2920 | return ok; | ||
2921 | } | 1672 | } |
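drbd_send_dblock() above is the most involved user of the new API: drbd_prepare_command() leaves sock->mutex held, the optional digest is computed straight into the send buffer behind the header (p + 1), __send_command() announces header + digest + payload in the packet length but only transmits the header and digest, the bio pages follow via _drbd_send_bio() or _drbd_send_zc_bio(), and the function releases the mutex itself at the end. A condensed sketch of that shape with the digest handling left out, illustrative only:

static int example_send_header_then_payload(struct drbd_conf *mdev,
					    struct drbd_request *req)
{
	struct drbd_socket *sock = &mdev->tconn->data;
	struct p_data *p;
	int err;

	p = drbd_prepare_command(mdev, sock);	/* takes sock->mutex */
	if (!p)
		return -EIO;
	p->sector = cpu_to_be64(req->i.sector);
	p->block_id = (unsigned long)req;
	p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
	p->dp_flags = 0;
	/* header claims sizeof(*p) + req->i.size, but only the header is sent here */
	err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA,
			     sizeof(*p), NULL, req->i.size);
	if (!err)
		err = _drbd_send_zc_bio(mdev, req->master_bio);
	mutex_unlock(&sock->mutex);	/* locked by drbd_prepare_command() */
	return err;
}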
2922 | 1673 | ||
2923 | /* answer packet, used to send data back for read requests: | 1674 | /* answer packet, used to send data back for read requests: |
2924 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | 1675 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) |
2925 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | 1676 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) |
2926 | */ | 1677 | */ |
2927 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1678 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd, |
2928 | struct drbd_epoch_entry *e) | 1679 | struct drbd_peer_request *peer_req) |
2929 | { | 1680 | { |
2930 | int ok; | 1681 | struct drbd_socket *sock; |
2931 | struct p_data p; | 1682 | struct p_data *p; |
2932 | void *dgb; | 1683 | int err; |
2933 | int dgs; | 1684 | int dgs; |
2934 | 1685 | ||
2935 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | 1686 | sock = &mdev->tconn->data; |
2936 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | 1687 | p = drbd_prepare_command(mdev, sock); |
2937 | |||
2938 | if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { | ||
2939 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2940 | p.head.h80.command = cpu_to_be16(cmd); | ||
2941 | p.head.h80.length = | ||
2942 | cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2943 | } else { | ||
2944 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2945 | p.head.h95.command = cpu_to_be16(cmd); | ||
2946 | p.head.h95.length = | ||
2947 | cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2948 | } | ||
2949 | |||
2950 | p.sector = cpu_to_be64(e->sector); | ||
2951 | p.block_id = e->block_id; | ||
2952 | /* p.seq_num = 0; No sequence numbers here.. */ | ||
2953 | 1688 | ||
2954 | /* Only called by our kernel thread. | 1689 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; |
2955 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | ||
2956 | * in response to admin command or module unload. | ||
2957 | */ | ||
2958 | if (!drbd_get_data_sock(mdev)) | ||
2959 | return 0; | ||
2960 | |||
2961 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); | ||
2962 | if (ok && dgs) { | ||
2963 | dgb = mdev->int_dig_out; | ||
2964 | drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); | ||
2965 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | ||
2966 | } | ||
2967 | if (ok) | ||
2968 | ok = _drbd_send_zc_ee(mdev, e); | ||
2969 | 1690 | ||
2970 | drbd_put_data_sock(mdev); | 1691 | if (!p) |
1692 | return -EIO; | ||
1693 | p->sector = cpu_to_be64(peer_req->i.sector); | ||
1694 | p->block_id = peer_req->block_id; | ||
1695 | p->seq_num = 0; /* unused */ | ||
1696 | p->dp_flags = 0; | ||
1697 | if (dgs) | ||
1698 | drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1); | ||
1699 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); | ||
1700 | if (!err) | ||
1701 | err = _drbd_send_zc_ee(mdev, peer_req); | ||
1702 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ | ||
2971 | 1703 | ||
2972 | return ok; | 1704 | return err; |
2973 | } | 1705 | } |
2974 | 1706 | ||
2975 | int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | 1707 | int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) |
2976 | { | 1708 | { |
2977 | struct p_block_desc p; | 1709 | struct drbd_socket *sock; |
1710 | struct p_block_desc *p; | ||
2978 | 1711 | ||
2979 | p.sector = cpu_to_be64(req->sector); | 1712 | sock = &mdev->tconn->data; |
2980 | p.blksize = cpu_to_be32(req->size); | 1713 | p = drbd_prepare_command(mdev, sock); |
2981 | 1714 | if (!p) | |
2982 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p)); | 1715 | return -EIO; |
1716 | p->sector = cpu_to_be64(req->i.sector); | ||
1717 | p->blksize = cpu_to_be32(req->i.size); | ||
1718 | return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); | ||
2983 | } | 1719 | } |
2984 | 1720 | ||
2985 | /* | 1721 | /* |
@@ -2998,7 +1734,7 @@ int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | |||
2998 | /* | 1734 | /* |
2999 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! | 1735 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! |
3000 | */ | 1736 | */ |
3001 | int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1737 | int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
3002 | void *buf, size_t size, unsigned msg_flags) | 1738 | void *buf, size_t size, unsigned msg_flags) |
3003 | { | 1739 | { |
3004 | struct kvec iov; | 1740 | struct kvec iov; |
@@ -3006,7 +1742,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3006 | int rv, sent = 0; | 1742 | int rv, sent = 0; |
3007 | 1743 | ||
3008 | if (!sock) | 1744 | if (!sock) |
3009 | return -1000; | 1745 | return -EBADR; |
3010 | 1746 | ||
3011 | /* THINK if (signal_pending) return ... ? */ | 1747 | /* THINK if (signal_pending) return ... ? */ |
3012 | 1748 | ||
@@ -3019,9 +1755,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3019 | msg.msg_controllen = 0; | 1755 | msg.msg_controllen = 0; |
3020 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; | 1756 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; |
3021 | 1757 | ||
3022 | if (sock == mdev->data.socket) { | 1758 | if (sock == tconn->data.socket) { |
3023 | mdev->ko_count = mdev->net_conf->ko_count; | 1759 | rcu_read_lock(); |
3024 | drbd_update_congested(mdev); | 1760 | tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count; |
1761 | rcu_read_unlock(); | ||
1762 | drbd_update_congested(tconn); | ||
3025 | } | 1763 | } |
3026 | do { | 1764 | do { |
3027 | /* STRANGE | 1765 | /* STRANGE |
@@ -3035,12 +1773,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3035 | */ | 1773 | */ |
3036 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); | 1774 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); |
3037 | if (rv == -EAGAIN) { | 1775 | if (rv == -EAGAIN) { |
3038 | if (we_should_drop_the_connection(mdev, sock)) | 1776 | if (we_should_drop_the_connection(tconn, sock)) |
3039 | break; | 1777 | break; |
3040 | else | 1778 | else |
3041 | continue; | 1779 | continue; |
3042 | } | 1780 | } |
3043 | D_ASSERT(rv != 0); | ||
3044 | if (rv == -EINTR) { | 1781 | if (rv == -EINTR) { |
3045 | flush_signals(current); | 1782 | flush_signals(current); |
3046 | rv = 0; | 1783 | rv = 0; |
@@ -3052,22 +1789,40 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3052 | iov.iov_len -= rv; | 1789 | iov.iov_len -= rv; |
3053 | } while (sent < size); | 1790 | } while (sent < size); |
3054 | 1791 | ||
3055 | if (sock == mdev->data.socket) | 1792 | if (sock == tconn->data.socket) |
3056 | clear_bit(NET_CONGESTED, &mdev->flags); | 1793 | clear_bit(NET_CONGESTED, &tconn->flags); |
3057 | 1794 | ||
3058 | if (rv <= 0) { | 1795 | if (rv <= 0) { |
3059 | if (rv != -EAGAIN) { | 1796 | if (rv != -EAGAIN) { |
3060 | dev_err(DEV, "%s_sendmsg returned %d\n", | 1797 | conn_err(tconn, "%s_sendmsg returned %d\n", |
3061 | sock == mdev->meta.socket ? "msock" : "sock", | 1798 | sock == tconn->meta.socket ? "msock" : "sock", |
3062 | rv); | 1799 | rv); |
3063 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 1800 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
3064 | } else | 1801 | } else |
3065 | drbd_force_state(mdev, NS(conn, C_TIMEOUT)); | 1802 | conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD); |
3066 | } | 1803 | } |
3067 | 1804 | ||
3068 | return sent; | 1805 | return sent; |
3069 | } | 1806 | } |
3070 | 1807 | ||
1808 | /** | ||
1809 | * drbd_send_all - Send an entire buffer | ||
1810 | * | ||
1811 | * Returns 0 upon success and a negative error value otherwise. | ||
1812 | */ | ||
1813 | int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer, | ||
1814 | size_t size, unsigned msg_flags) | ||
1815 | { | ||
1816 | int err; | ||
1817 | |||
1818 | err = drbd_send(tconn, sock, buffer, size, msg_flags); | ||
1819 | if (err < 0) | ||
1820 | return err; | ||
1821 | if (err != size) | ||
1822 | return -EIO; | ||
1823 | return 0; | ||
1824 | } | ||
1825 | |||
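The new drbd_send_all() wrapper above pins down the contract its callers rely on: drbd_send() may report a short send, and drbd_send_all() turns anything other than a complete transfer into a negative error. As a rough userspace analogue (plain POSIX sockets, not DRBD code; send_all() and its error handling are illustrative only), the same "loop until every byte is out, return 0 or a negative error" shape looks like this:

```c
#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Illustrative only: push the whole buffer out on a connected socket.
 * Returns 0 on success, -errno on failure, mirroring the
 * "0 or negative error" contract of drbd_send_all(). */
static int send_all(int fd, const void *buf, size_t size)
{
	const char *p = buf;
	size_t sent = 0;

	while (sent < size) {
		/* MSG_NOSIGNAL mirrors the kernel path's msg_flags | MSG_NOSIGNAL */
		ssize_t rv = send(fd, p + sent, size - sent, MSG_NOSIGNAL);

		if (rv < 0) {
			if (errno == EINTR)
				continue;	/* interrupted, just retry */
			return -errno;		/* real error */
		}
		if (rv == 0)
			return -EIO;		/* should not happen for size > 0 */
		sent += rv;			/* short send: keep going */
	}
	return 0;
}
```

The kernel version additionally has to cope with congestion signalling and connection-state changes, which is why drbd_send() keeps returning the byte count and drbd_send_all() layers the all-or-nothing check on top.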
3071 | static int drbd_open(struct block_device *bdev, fmode_t mode) | 1826 | static int drbd_open(struct block_device *bdev, fmode_t mode) |
3072 | { | 1827 | { |
3073 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | 1828 | struct drbd_conf *mdev = bdev->bd_disk->private_data; |
@@ -3075,7 +1830,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3075 | int rv = 0; | 1830 | int rv = 0; |
3076 | 1831 | ||
3077 | mutex_lock(&drbd_main_mutex); | 1832 | mutex_lock(&drbd_main_mutex); |
3078 | spin_lock_irqsave(&mdev->req_lock, flags); | 1833 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
3079 | /* to have a stable mdev->state.role | 1834 | /* to have a stable mdev->state.role |
3080 | * and no race with updating open_cnt */ | 1835 | * and no race with updating open_cnt */ |
3081 | 1836 | ||
@@ -3088,7 +1843,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3088 | 1843 | ||
3089 | if (!rv) | 1844 | if (!rv) |
3090 | mdev->open_cnt++; | 1845 | mdev->open_cnt++; |
3091 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1846 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
3092 | mutex_unlock(&drbd_main_mutex); | 1847 | mutex_unlock(&drbd_main_mutex); |
3093 | 1848 | ||
3094 | return rv; | 1849 | return rv; |
@@ -3105,35 +1860,14 @@ static int drbd_release(struct gendisk *gd, fmode_t mode) | |||
3105 | 1860 | ||
3106 | static void drbd_set_defaults(struct drbd_conf *mdev) | 1861 | static void drbd_set_defaults(struct drbd_conf *mdev) |
3107 | { | 1862 | { |
3108 | /* This way we get a compile error when sync_conf grows, | 1863 | /* Beware! The actual layout differs |
3109 | and we forgot to initialize it here */ | 1864 | * between big endian and little endian */ |
3110 | mdev->sync_conf = (struct syncer_conf) { | 1865 | mdev->state = (union drbd_dev_state) { |
3111 | /* .rate = */ DRBD_RATE_DEF, | ||
3112 | /* .after = */ DRBD_AFTER_DEF, | ||
3113 | /* .al_extents = */ DRBD_AL_EXTENTS_DEF, | ||
3114 | /* .verify_alg = */ {}, 0, | ||
3115 | /* .cpu_mask = */ {}, 0, | ||
3116 | /* .csums_alg = */ {}, 0, | ||
3117 | /* .use_rle = */ 0, | ||
3118 | /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, | ||
3119 | /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, | ||
3120 | /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, | ||
3121 | /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, | ||
3122 | /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, | ||
3123 | /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF | ||
3124 | }; | ||
3125 | |||
3126 | /* Have to use that way, because the layout differs between | ||
3127 | big endian and little endian */ | ||
3128 | mdev->state = (union drbd_state) { | ||
3129 | { .role = R_SECONDARY, | 1866 | { .role = R_SECONDARY, |
3130 | .peer = R_UNKNOWN, | 1867 | .peer = R_UNKNOWN, |
3131 | .conn = C_STANDALONE, | 1868 | .conn = C_STANDALONE, |
3132 | .disk = D_DISKLESS, | 1869 | .disk = D_DISKLESS, |
3133 | .pdsk = D_UNKNOWN, | 1870 | .pdsk = D_UNKNOWN, |
3134 | .susp = 0, | ||
3135 | .susp_nod = 0, | ||
3136 | .susp_fen = 0 | ||
3137 | } }; | 1871 | } }; |
3138 | } | 1872 | } |
3139 | 1873 | ||
@@ -3149,28 +1883,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3149 | atomic_set(&mdev->rs_pending_cnt, 0); | 1883 | atomic_set(&mdev->rs_pending_cnt, 0); |
3150 | atomic_set(&mdev->unacked_cnt, 0); | 1884 | atomic_set(&mdev->unacked_cnt, 0); |
3151 | atomic_set(&mdev->local_cnt, 0); | 1885 | atomic_set(&mdev->local_cnt, 0); |
3152 | atomic_set(&mdev->net_cnt, 0); | ||
3153 | atomic_set(&mdev->packet_seq, 0); | ||
3154 | atomic_set(&mdev->pp_in_use, 0); | ||
3155 | atomic_set(&mdev->pp_in_use_by_net, 0); | 1886 | atomic_set(&mdev->pp_in_use_by_net, 0); |
3156 | atomic_set(&mdev->rs_sect_in, 0); | 1887 | atomic_set(&mdev->rs_sect_in, 0); |
3157 | atomic_set(&mdev->rs_sect_ev, 0); | 1888 | atomic_set(&mdev->rs_sect_ev, 0); |
3158 | atomic_set(&mdev->ap_in_flight, 0); | 1889 | atomic_set(&mdev->ap_in_flight, 0); |
3159 | atomic_set(&mdev->md_io_in_use, 0); | 1890 | atomic_set(&mdev->md_io_in_use, 0); |
3160 | 1891 | ||
3161 | mutex_init(&mdev->data.mutex); | 1892 | mutex_init(&mdev->own_state_mutex); |
3162 | mutex_init(&mdev->meta.mutex); | 1893 | mdev->state_mutex = &mdev->own_state_mutex; |
3163 | sema_init(&mdev->data.work.s, 0); | ||
3164 | sema_init(&mdev->meta.work.s, 0); | ||
3165 | mutex_init(&mdev->state_mutex); | ||
3166 | |||
3167 | spin_lock_init(&mdev->data.work.q_lock); | ||
3168 | spin_lock_init(&mdev->meta.work.q_lock); | ||
3169 | 1894 | ||
3170 | spin_lock_init(&mdev->al_lock); | 1895 | spin_lock_init(&mdev->al_lock); |
3171 | spin_lock_init(&mdev->req_lock); | ||
3172 | spin_lock_init(&mdev->peer_seq_lock); | 1896 | spin_lock_init(&mdev->peer_seq_lock); |
3173 | spin_lock_init(&mdev->epoch_lock); | ||
3174 | 1897 | ||
3175 | INIT_LIST_HEAD(&mdev->active_ee); | 1898 | INIT_LIST_HEAD(&mdev->active_ee); |
3176 | INIT_LIST_HEAD(&mdev->sync_ee); | 1899 | INIT_LIST_HEAD(&mdev->sync_ee); |
@@ -3178,8 +1901,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3178 | INIT_LIST_HEAD(&mdev->read_ee); | 1901 | INIT_LIST_HEAD(&mdev->read_ee); |
3179 | INIT_LIST_HEAD(&mdev->net_ee); | 1902 | INIT_LIST_HEAD(&mdev->net_ee); |
3180 | INIT_LIST_HEAD(&mdev->resync_reads); | 1903 | INIT_LIST_HEAD(&mdev->resync_reads); |
3181 | INIT_LIST_HEAD(&mdev->data.work.q); | ||
3182 | INIT_LIST_HEAD(&mdev->meta.work.q); | ||
3183 | INIT_LIST_HEAD(&mdev->resync_work.list); | 1904 | INIT_LIST_HEAD(&mdev->resync_work.list); |
3184 | INIT_LIST_HEAD(&mdev->unplug_work.list); | 1905 | INIT_LIST_HEAD(&mdev->unplug_work.list); |
3185 | INIT_LIST_HEAD(&mdev->go_diskless.list); | 1906 | INIT_LIST_HEAD(&mdev->go_diskless.list); |
@@ -3193,6 +1914,14 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3193 | mdev->md_sync_work.cb = w_md_sync; | 1914 | mdev->md_sync_work.cb = w_md_sync; |
3194 | mdev->bm_io_work.w.cb = w_bitmap_io; | 1915 | mdev->bm_io_work.w.cb = w_bitmap_io; |
3195 | mdev->start_resync_work.cb = w_start_resync; | 1916 | mdev->start_resync_work.cb = w_start_resync; |
1917 | |||
1918 | mdev->resync_work.mdev = mdev; | ||
1919 | mdev->unplug_work.mdev = mdev; | ||
1920 | mdev->go_diskless.mdev = mdev; | ||
1921 | mdev->md_sync_work.mdev = mdev; | ||
1922 | mdev->bm_io_work.w.mdev = mdev; | ||
1923 | mdev->start_resync_work.mdev = mdev; | ||
1924 | |||
3196 | init_timer(&mdev->resync_timer); | 1925 | init_timer(&mdev->resync_timer); |
3197 | init_timer(&mdev->md_sync_timer); | 1926 | init_timer(&mdev->md_sync_timer); |
3198 | init_timer(&mdev->start_resync_timer); | 1927 | init_timer(&mdev->start_resync_timer); |
@@ -3208,17 +1937,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3208 | 1937 | ||
3209 | init_waitqueue_head(&mdev->misc_wait); | 1938 | init_waitqueue_head(&mdev->misc_wait); |
3210 | init_waitqueue_head(&mdev->state_wait); | 1939 | init_waitqueue_head(&mdev->state_wait); |
3211 | init_waitqueue_head(&mdev->net_cnt_wait); | ||
3212 | init_waitqueue_head(&mdev->ee_wait); | 1940 | init_waitqueue_head(&mdev->ee_wait); |
3213 | init_waitqueue_head(&mdev->al_wait); | 1941 | init_waitqueue_head(&mdev->al_wait); |
3214 | init_waitqueue_head(&mdev->seq_wait); | 1942 | init_waitqueue_head(&mdev->seq_wait); |
3215 | 1943 | ||
3216 | drbd_thread_init(mdev, &mdev->receiver, drbdd_init); | ||
3217 | drbd_thread_init(mdev, &mdev->worker, drbd_worker); | ||
3218 | drbd_thread_init(mdev, &mdev->asender, drbd_asender); | ||
3219 | |||
3220 | mdev->agreed_pro_version = PRO_VERSION_MAX; | ||
3221 | mdev->write_ordering = WO_bdev_flush; | ||
3222 | mdev->resync_wenr = LC_FREE; | 1944 | mdev->resync_wenr = LC_FREE; |
3223 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1945 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
3224 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1946 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
@@ -3227,13 +1949,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3227 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | 1949 | void drbd_mdev_cleanup(struct drbd_conf *mdev) |
3228 | { | 1950 | { |
3229 | int i; | 1951 | int i; |
3230 | if (mdev->receiver.t_state != None) | 1952 | if (mdev->tconn->receiver.t_state != NONE) |
3231 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | 1953 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", |
3232 | mdev->receiver.t_state); | 1954 | mdev->tconn->receiver.t_state); |
3233 | 1955 | ||
3234 | /* no need to lock it, I'm the only thread alive */ | ||
3235 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
3236 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
3237 | mdev->al_writ_cnt = | 1956 | mdev->al_writ_cnt = |
3238 | mdev->bm_writ_cnt = | 1957 | mdev->bm_writ_cnt = |
3239 | mdev->read_cnt = | 1958 | mdev->read_cnt = |
@@ -3250,7 +1969,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3250 | mdev->rs_mark_left[i] = 0; | 1969 | mdev->rs_mark_left[i] = 0; |
3251 | mdev->rs_mark_time[i] = 0; | 1970 | mdev->rs_mark_time[i] = 0; |
3252 | } | 1971 | } |
3253 | D_ASSERT(mdev->net_conf == NULL); | 1972 | D_ASSERT(mdev->tconn->net_conf == NULL); |
3254 | 1973 | ||
3255 | drbd_set_my_capacity(mdev, 0); | 1974 | drbd_set_my_capacity(mdev, 0); |
3256 | if (mdev->bitmap) { | 1975 | if (mdev->bitmap) { |
@@ -3259,21 +1978,18 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3259 | drbd_bm_cleanup(mdev); | 1978 | drbd_bm_cleanup(mdev); |
3260 | } | 1979 | } |
3261 | 1980 | ||
3262 | drbd_free_resources(mdev); | 1981 | drbd_free_bc(mdev->ldev); |
1982 | mdev->ldev = NULL; | ||
1983 | |||
3263 | clear_bit(AL_SUSPENDED, &mdev->flags); | 1984 | clear_bit(AL_SUSPENDED, &mdev->flags); |
3264 | 1985 | ||
3265 | /* | ||
3266 | * currently we drbd_init_ee only on module load, so | ||
3267 | * we may do drbd_release_ee only on module unload! | ||
3268 | */ | ||
3269 | D_ASSERT(list_empty(&mdev->active_ee)); | 1986 | D_ASSERT(list_empty(&mdev->active_ee)); |
3270 | D_ASSERT(list_empty(&mdev->sync_ee)); | 1987 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3271 | D_ASSERT(list_empty(&mdev->done_ee)); | 1988 | D_ASSERT(list_empty(&mdev->done_ee)); |
3272 | D_ASSERT(list_empty(&mdev->read_ee)); | 1989 | D_ASSERT(list_empty(&mdev->read_ee)); |
3273 | D_ASSERT(list_empty(&mdev->net_ee)); | 1990 | D_ASSERT(list_empty(&mdev->net_ee)); |
3274 | D_ASSERT(list_empty(&mdev->resync_reads)); | 1991 | D_ASSERT(list_empty(&mdev->resync_reads)); |
3275 | D_ASSERT(list_empty(&mdev->data.work.q)); | 1992 | D_ASSERT(list_empty(&mdev->tconn->sender_work.q)); |
3276 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
3277 | D_ASSERT(list_empty(&mdev->resync_work.list)); | 1993 | D_ASSERT(list_empty(&mdev->resync_work.list)); |
3278 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | 1994 | D_ASSERT(list_empty(&mdev->unplug_work.list)); |
3279 | D_ASSERT(list_empty(&mdev->go_diskless.list)); | 1995 | D_ASSERT(list_empty(&mdev->go_diskless.list)); |
@@ -3347,7 +2063,7 @@ static int drbd_create_mempools(void) | |||
3347 | goto Enomem; | 2063 | goto Enomem; |
3348 | 2064 | ||
3349 | drbd_ee_cache = kmem_cache_create( | 2065 | drbd_ee_cache = kmem_cache_create( |
3350 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | 2066 | "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); |
3351 | if (drbd_ee_cache == NULL) | 2067 | if (drbd_ee_cache == NULL) |
3352 | goto Enomem; | 2068 | goto Enomem; |
3353 | 2069 | ||
@@ -3362,11 +2078,9 @@ static int drbd_create_mempools(void) | |||
3362 | goto Enomem; | 2078 | goto Enomem; |
3363 | 2079 | ||
3364 | /* mempools */ | 2080 | /* mempools */ |
3365 | #ifdef COMPAT_HAVE_BIOSET_CREATE | ||
3366 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); | 2081 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); |
3367 | if (drbd_md_io_bio_set == NULL) | 2082 | if (drbd_md_io_bio_set == NULL) |
3368 | goto Enomem; | 2083 | goto Enomem; |
3369 | #endif | ||
3370 | 2084 | ||
3371 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); | 2085 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); |
3372 | if (drbd_md_io_page_pool == NULL) | 2086 | if (drbd_md_io_page_pool == NULL) |
@@ -3415,73 +2129,53 @@ static struct notifier_block drbd_notifier = { | |||
3415 | .notifier_call = drbd_notify_sys, | 2129 | .notifier_call = drbd_notify_sys, |
3416 | }; | 2130 | }; |
3417 | 2131 | ||
3418 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | 2132 | static void drbd_release_all_peer_reqs(struct drbd_conf *mdev) |
3419 | { | 2133 | { |
3420 | int rr; | 2134 | int rr; |
3421 | 2135 | ||
3422 | rr = drbd_release_ee(mdev, &mdev->active_ee); | 2136 | rr = drbd_free_peer_reqs(mdev, &mdev->active_ee); |
3423 | if (rr) | 2137 | if (rr) |
3424 | dev_err(DEV, "%d EEs in active list found!\n", rr); | 2138 | dev_err(DEV, "%d EEs in active list found!\n", rr); |
3425 | 2139 | ||
3426 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | 2140 | rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee); |
3427 | if (rr) | 2141 | if (rr) |
3428 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | 2142 | dev_err(DEV, "%d EEs in sync list found!\n", rr); |
3429 | 2143 | ||
3430 | rr = drbd_release_ee(mdev, &mdev->read_ee); | 2144 | rr = drbd_free_peer_reqs(mdev, &mdev->read_ee); |
3431 | if (rr) | 2145 | if (rr) |
3432 | dev_err(DEV, "%d EEs in read list found!\n", rr); | 2146 | dev_err(DEV, "%d EEs in read list found!\n", rr); |
3433 | 2147 | ||
3434 | rr = drbd_release_ee(mdev, &mdev->done_ee); | 2148 | rr = drbd_free_peer_reqs(mdev, &mdev->done_ee); |
3435 | if (rr) | 2149 | if (rr) |
3436 | dev_err(DEV, "%d EEs in done list found!\n", rr); | 2150 | dev_err(DEV, "%d EEs in done list found!\n", rr); |
3437 | 2151 | ||
3438 | rr = drbd_release_ee(mdev, &mdev->net_ee); | 2152 | rr = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3439 | if (rr) | 2153 | if (rr) |
3440 | dev_err(DEV, "%d EEs in net list found!\n", rr); | 2154 | dev_err(DEV, "%d EEs in net list found!\n", rr); |
3441 | } | 2155 | } |
3442 | 2156 | ||
3443 | /* caution. no locking. | 2157 | /* caution. no locking. */ |
3444 | * currently only used from module cleanup code. */ | 2158 | void drbd_minor_destroy(struct kref *kref) |
3445 | static void drbd_delete_device(unsigned int minor) | ||
3446 | { | 2159 | { |
3447 | struct drbd_conf *mdev = minor_to_mdev(minor); | 2160 | struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref); |
3448 | 2161 | struct drbd_tconn *tconn = mdev->tconn; | |
3449 | if (!mdev) | ||
3450 | return; | ||
3451 | 2162 | ||
3452 | del_timer_sync(&mdev->request_timer); | 2163 | del_timer_sync(&mdev->request_timer); |
3453 | 2164 | ||
3454 | /* paranoia asserts */ | 2165 | /* paranoia asserts */ |
3455 | if (mdev->open_cnt != 0) | 2166 | D_ASSERT(mdev->open_cnt == 0); |
3456 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
3457 | __FILE__ , __LINE__); | ||
3458 | |||
3459 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
3460 | struct list_head *lp; | ||
3461 | list_for_each(lp, &mdev->data.work.q) { | ||
3462 | dev_err(DEV, "lp = %p\n", lp); | ||
3463 | } | ||
3464 | }; | ||
3465 | /* end paranoia asserts */ | 2167 | /* end paranoia asserts */ |
3466 | 2168 | ||
3467 | del_gendisk(mdev->vdisk); | ||
3468 | |||
3469 | /* cleanup stuff that may have been allocated during | 2169 | /* cleanup stuff that may have been allocated during |
3470 | * device (re-)configuration or state changes */ | 2170 | * device (re-)configuration or state changes */ |
3471 | 2171 | ||
3472 | if (mdev->this_bdev) | 2172 | if (mdev->this_bdev) |
3473 | bdput(mdev->this_bdev); | 2173 | bdput(mdev->this_bdev); |
3474 | 2174 | ||
3475 | drbd_free_resources(mdev); | 2175 | drbd_free_bc(mdev->ldev); |
2176 | mdev->ldev = NULL; | ||
3476 | 2177 | ||
3477 | drbd_release_ee_lists(mdev); | 2178 | drbd_release_all_peer_reqs(mdev); |
3478 | |||
3479 | /* should be freed on disconnect? */ | ||
3480 | kfree(mdev->ee_hash); | ||
3481 | /* | ||
3482 | mdev->ee_hash_s = 0; | ||
3483 | mdev->ee_hash = NULL; | ||
3484 | */ | ||
3485 | 2179 | ||
3486 | lc_destroy(mdev->act_log); | 2180 | lc_destroy(mdev->act_log); |
3487 | lc_destroy(mdev->resync); | 2181 | lc_destroy(mdev->resync); |
@@ -3489,19 +2183,101 @@ static void drbd_delete_device(unsigned int minor) | |||
3489 | kfree(mdev->p_uuid); | 2183 | kfree(mdev->p_uuid); |
3490 | /* mdev->p_uuid = NULL; */ | 2184 | /* mdev->p_uuid = NULL; */ |
3491 | 2185 | ||
3492 | kfree(mdev->int_dig_out); | 2186 | if (mdev->bitmap) /* should no longer be there. */ |
3493 | kfree(mdev->int_dig_in); | 2187 | drbd_bm_cleanup(mdev); |
3494 | kfree(mdev->int_dig_vv); | 2188 | __free_page(mdev->md_io_page); |
2189 | put_disk(mdev->vdisk); | ||
2190 | blk_cleanup_queue(mdev->rq_queue); | ||
2191 | kfree(mdev->rs_plan_s); | ||
2192 | kfree(mdev); | ||
3495 | 2193 | ||
3496 | /* cleanup the rest that has been | 2194 | kref_put(&tconn->kref, &conn_destroy); |
3497 | * allocated from drbd_new_device | ||
3498 | * and actually free the mdev itself */ | ||
3499 | drbd_free_mdev(mdev); | ||
3500 | } | 2195 | } |
3501 | 2196 | ||
2197 | /* One global retry thread, if we need to push back some bio and have it | ||
2198 | * reinserted through our make request function. | ||
2199 | */ | ||
2200 | static struct retry_worker { | ||
2201 | struct workqueue_struct *wq; | ||
2202 | struct work_struct worker; | ||
2203 | |||
2204 | spinlock_t lock; | ||
2205 | struct list_head writes; | ||
2206 | } retry; | ||
2207 | |||
2208 | static void do_retry(struct work_struct *ws) | ||
2209 | { | ||
2210 | struct retry_worker *retry = container_of(ws, struct retry_worker, worker); | ||
2211 | LIST_HEAD(writes); | ||
2212 | struct drbd_request *req, *tmp; | ||
2213 | |||
2214 | spin_lock_irq(&retry->lock); | ||
2215 | list_splice_init(&retry->writes, &writes); | ||
2216 | spin_unlock_irq(&retry->lock); | ||
2217 | |||
2218 | list_for_each_entry_safe(req, tmp, &writes, tl_requests) { | ||
2219 | struct drbd_conf *mdev = req->w.mdev; | ||
2220 | struct bio *bio = req->master_bio; | ||
2221 | unsigned long start_time = req->start_time; | ||
2222 | bool expected; | ||
2223 | |||
2224 | expected = | ||
2225 | expect(atomic_read(&req->completion_ref) == 0) && | ||
2226 | expect(req->rq_state & RQ_POSTPONED) && | ||
2227 | expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || | ||
2228 | (req->rq_state & RQ_LOCAL_ABORTED) != 0); | ||
2229 | |||
2230 | if (!expected) | ||
2231 | dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n", | ||
2232 | req, atomic_read(&req->completion_ref), | ||
2233 | req->rq_state); | ||
2234 | |||
2235 | /* We still need to put one kref associated with the | ||
2236 | * "completion_ref" going zero in the code path that queued it | ||
2237 | * here. The request object may still be referenced by a | ||
2238 | * frozen local req->private_bio, in case we force-detached. | ||
2239 | */ | ||
2240 | kref_put(&req->kref, drbd_req_destroy); | ||
2241 | |||
2242 | /* A single suspended or otherwise blocking device may stall | ||
2243 | * all others as well. Fortunately, this code path is to | ||
2244 | * recover from a situation that "should not happen": | ||
2245 | * concurrent writes in multi-primary setup. | ||
2246 | * In a "normal" lifecycle, this workqueue is supposed to be | ||
2247 | * destroyed without ever doing anything. | ||
2248 | * If it turns out to be an issue anyways, we can do per | ||
2249 | * resource (replication group) or per device (minor) retry | ||
2250 | * workqueues instead. | ||
2251 | */ | ||
2252 | |||
2253 | /* We are not just doing generic_make_request(), | ||
2254 | * as we want to keep the start_time information. */ | ||
2255 | inc_ap_bio(mdev); | ||
2256 | __drbd_make_request(mdev, bio, start_time); | ||
2257 | } | ||
2258 | } | ||
2259 | |||
2260 | void drbd_restart_request(struct drbd_request *req) | ||
2261 | { | ||
2262 | unsigned long flags; | ||
2263 | spin_lock_irqsave(&retry.lock, flags); | ||
2264 | list_move_tail(&req->tl_requests, &retry.writes); | ||
2265 | spin_unlock_irqrestore(&retry.lock, flags); | ||
2266 | |||
2267 | /* Drop the extra reference that would otherwise | ||
2268 | * have been dropped by complete_master_bio. | ||
2269 | * do_retry() needs to grab a new one. */ | ||
2270 | dec_ap_bio(req->w.mdev); | ||
2271 | |||
2272 | queue_work(retry.wq, &retry.worker); | ||
2273 | } | ||
2274 | |||
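do_retry() above uses a standard kernel idiom: producers append requests to retry.writes under retry.lock (drbd_restart_request()), and the worker splices the whole list onto a private head with list_splice_init() so the actual resubmission runs without the lock held. A minimal userspace sketch of that splice-then-process pattern (hypothetical retry_item type, a pthread mutex standing in for the spinlock, and simplified LIFO ordering where the kernel code keeps FIFO order via list_move_tail()) might look like:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical retry item -- stands in for struct drbd_request. */
struct retry_item {
	struct retry_item *next;
	int id;
};

static struct retry_item *pending;	/* shared list head */
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

/* Producer side: queue one item for later retry (cf. drbd_restart_request()). */
static void queue_retry(struct retry_item *it)
{
	pthread_mutex_lock(&pending_lock);
	it->next = pending;
	pending = it;
	pthread_mutex_unlock(&pending_lock);
}

/* Worker side: steal everything in one go, then do the work outside
 * the lock (cf. list_splice_init() in do_retry()). */
static void do_retry_pass(void)
{
	struct retry_item *batch;

	pthread_mutex_lock(&pending_lock);
	batch = pending;
	pending = NULL;
	pthread_mutex_unlock(&pending_lock);

	while (batch) {
		struct retry_item *it = batch;

		batch = it->next;
		printf("retrying request %d\n", it->id);	/* real work here */
		free(it);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct retry_item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->id = i;
		queue_retry(it);
	}
	do_retry_pass();
	return 0;
}
```

The point, as in the kernel code, is that queue_retry() never blocks on per-request work, and do_retry_pass() holds the lock only long enough to steal the list.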
2275 | |||
3502 | static void drbd_cleanup(void) | 2276 | static void drbd_cleanup(void) |
3503 | { | 2277 | { |
3504 | unsigned int i; | 2278 | unsigned int i; |
2279 | struct drbd_conf *mdev; | ||
2280 | struct drbd_tconn *tconn, *tmp; | ||
3505 | 2281 | ||
3506 | unregister_reboot_notifier(&drbd_notifier); | 2282 | unregister_reboot_notifier(&drbd_notifier); |
3507 | 2283 | ||
@@ -3516,19 +2292,31 @@ static void drbd_cleanup(void) | |||
3516 | if (drbd_proc) | 2292 | if (drbd_proc) |
3517 | remove_proc_entry("drbd", NULL); | 2293 | remove_proc_entry("drbd", NULL); |
3518 | 2294 | ||
3519 | drbd_nl_cleanup(); | 2295 | if (retry.wq) |
2296 | destroy_workqueue(retry.wq); | ||
2297 | |||
2298 | drbd_genl_unregister(); | ||
3520 | 2299 | ||
3521 | if (minor_table) { | 2300 | idr_for_each_entry(&minors, mdev, i) { |
3522 | i = minor_count; | 2301 | idr_remove(&minors, mdev_to_minor(mdev)); |
3523 | while (i--) | 2302 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
3524 | drbd_delete_device(i); | 2303 | del_gendisk(mdev->vdisk); |
3525 | drbd_destroy_mempools(); | 2304 | /* synchronize_rcu(); No other threads running at this point */ |
2305 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
3526 | } | 2306 | } |
3527 | 2307 | ||
3528 | kfree(minor_table); | 2308 | /* not _rcu since, no other updater anymore. Genl already unregistered */ |
2309 | list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2310 | list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */ | ||
2311 | /* synchronize_rcu(); */ | ||
2312 | kref_put(&tconn->kref, &conn_destroy); | ||
2313 | } | ||
3529 | 2314 | ||
2315 | drbd_destroy_mempools(); | ||
3530 | unregister_blkdev(DRBD_MAJOR, "drbd"); | 2316 | unregister_blkdev(DRBD_MAJOR, "drbd"); |
3531 | 2317 | ||
2318 | idr_destroy(&minors); | ||
2319 | |||
3532 | printk(KERN_INFO "drbd: module cleanup done.\n"); | 2320 | printk(KERN_INFO "drbd: module cleanup done.\n"); |
3533 | } | 2321 | } |
3534 | 2322 | ||
@@ -3553,7 +2341,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3553 | goto out; | 2341 | goto out; |
3554 | } | 2342 | } |
3555 | 2343 | ||
3556 | if (test_bit(CALLBACK_PENDING, &mdev->flags)) { | 2344 | if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) { |
3557 | r |= (1 << BDI_async_congested); | 2345 | r |= (1 << BDI_async_congested); |
3558 | /* Without good local data, we would need to read from remote, | 2346 | /* Without good local data, we would need to read from remote, |
3559 | * and that would need the worker thread as well, which is | 2347 | * and that would need the worker thread as well, which is |
@@ -3577,7 +2365,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3577 | reason = 'b'; | 2365 | reason = 'b'; |
3578 | } | 2366 | } |
3579 | 2367 | ||
3580 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { | 2368 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) { |
3581 | r |= (1 << BDI_async_congested); | 2369 | r |= (1 << BDI_async_congested); |
3582 | reason = reason == 'b' ? 'a' : 'n'; | 2370 | reason = reason == 'b' ? 'a' : 'n'; |
3583 | } | 2371 | } |
@@ -3587,20 +2375,243 @@ out: | |||
3587 | return r; | 2375 | return r; |
3588 | } | 2376 | } |
3589 | 2377 | ||
3590 | struct drbd_conf *drbd_new_device(unsigned int minor) | 2378 | static void drbd_init_workqueue(struct drbd_work_queue* wq) |
2379 | { | ||
2380 | spin_lock_init(&wq->q_lock); | ||
2381 | INIT_LIST_HEAD(&wq->q); | ||
2382 | init_waitqueue_head(&wq->q_wait); | ||
2383 | } | ||
2384 | |||
2385 | struct drbd_tconn *conn_get_by_name(const char *name) | ||
2386 | { | ||
2387 | struct drbd_tconn *tconn; | ||
2388 | |||
2389 | if (!name || !name[0]) | ||
2390 | return NULL; | ||
2391 | |||
2392 | rcu_read_lock(); | ||
2393 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2394 | if (!strcmp(tconn->name, name)) { | ||
2395 | kref_get(&tconn->kref); | ||
2396 | goto found; | ||
2397 | } | ||
2398 | } | ||
2399 | tconn = NULL; | ||
2400 | found: | ||
2401 | rcu_read_unlock(); | ||
2402 | return tconn; | ||
2403 | } | ||
2404 | |||
2405 | struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
2406 | void *peer_addr, int peer_addr_len) | ||
2407 | { | ||
2408 | struct drbd_tconn *tconn; | ||
2409 | |||
2410 | rcu_read_lock(); | ||
2411 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2412 | if (tconn->my_addr_len == my_addr_len && | ||
2413 | tconn->peer_addr_len == peer_addr_len && | ||
2414 | !memcmp(&tconn->my_addr, my_addr, my_addr_len) && | ||
2415 | !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) { | ||
2416 | kref_get(&tconn->kref); | ||
2417 | goto found; | ||
2418 | } | ||
2419 | } | ||
2420 | tconn = NULL; | ||
2421 | found: | ||
2422 | rcu_read_unlock(); | ||
2423 | return tconn; | ||
2424 | } | ||
2425 | |||
2426 | static int drbd_alloc_socket(struct drbd_socket *socket) | ||
2427 | { | ||
2428 | socket->rbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2429 | if (!socket->rbuf) | ||
2430 | return -ENOMEM; | ||
2431 | socket->sbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2432 | if (!socket->sbuf) | ||
2433 | return -ENOMEM; | ||
2434 | return 0; | ||
2435 | } | ||
2436 | |||
2437 | static void drbd_free_socket(struct drbd_socket *socket) | ||
2438 | { | ||
2439 | free_page((unsigned long) socket->sbuf); | ||
2440 | free_page((unsigned long) socket->rbuf); | ||
2441 | } | ||
2442 | |||
2443 | void conn_free_crypto(struct drbd_tconn *tconn) | ||
2444 | { | ||
2445 | drbd_free_sock(tconn); | ||
2446 | |||
2447 | crypto_free_hash(tconn->csums_tfm); | ||
2448 | crypto_free_hash(tconn->verify_tfm); | ||
2449 | crypto_free_hash(tconn->cram_hmac_tfm); | ||
2450 | crypto_free_hash(tconn->integrity_tfm); | ||
2451 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
2452 | kfree(tconn->int_dig_in); | ||
2453 | kfree(tconn->int_dig_vv); | ||
2454 | |||
2455 | tconn->csums_tfm = NULL; | ||
2456 | tconn->verify_tfm = NULL; | ||
2457 | tconn->cram_hmac_tfm = NULL; | ||
2458 | tconn->integrity_tfm = NULL; | ||
2459 | tconn->peer_integrity_tfm = NULL; | ||
2460 | tconn->int_dig_in = NULL; | ||
2461 | tconn->int_dig_vv = NULL; | ||
2462 | } | ||
2463 | |||
2464 | int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts) | ||
2465 | { | ||
2466 | cpumask_var_t new_cpu_mask; | ||
2467 | int err; | ||
2468 | |||
2469 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) | ||
2470 | return -ENOMEM; | ||
2471 | /* | ||
2472 | retcode = ERR_NOMEM; | ||
2473 | drbd_msg_put_info("unable to allocate cpumask"); | ||
2474 | */ | ||
2475 | |||
2476 | /* silently ignore cpu mask on UP kernel */ | ||
2477 | if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { | ||
2478 | /* FIXME: Get rid of constant 32 here */ | ||
2479 | err = bitmap_parse(res_opts->cpu_mask, 32, | ||
2480 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
2481 | if (err) { | ||
2482 | conn_warn(tconn, "bitmap_parse() failed with %d\n", err); | ||
2483 | /* retcode = ERR_CPU_MASK_PARSE; */ | ||
2484 | goto fail; | ||
2485 | } | ||
2486 | } | ||
2487 | tconn->res_opts = *res_opts; | ||
2488 | if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) { | ||
2489 | cpumask_copy(tconn->cpu_mask, new_cpu_mask); | ||
2490 | drbd_calc_cpu_mask(tconn); | ||
2491 | tconn->receiver.reset_cpu_mask = 1; | ||
2492 | tconn->asender.reset_cpu_mask = 1; | ||
2493 | tconn->worker.reset_cpu_mask = 1; | ||
2494 | } | ||
2495 | err = 0; | ||
2496 | |||
2497 | fail: | ||
2498 | free_cpumask_var(new_cpu_mask); | ||
2499 | return err; | ||
2500 | |||
2501 | } | ||
2502 | |||
2503 | /* caller must be under genl_lock() */ | ||
2504 | struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts) | ||
2505 | { | ||
2506 | struct drbd_tconn *tconn; | ||
2507 | |||
2508 | tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL); | ||
2509 | if (!tconn) | ||
2510 | return NULL; | ||
2511 | |||
2512 | tconn->name = kstrdup(name, GFP_KERNEL); | ||
2513 | if (!tconn->name) | ||
2514 | goto fail; | ||
2515 | |||
2516 | if (drbd_alloc_socket(&tconn->data)) | ||
2517 | goto fail; | ||
2518 | if (drbd_alloc_socket(&tconn->meta)) | ||
2519 | goto fail; | ||
2520 | |||
2521 | if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL)) | ||
2522 | goto fail; | ||
2523 | |||
2524 | if (set_resource_options(tconn, res_opts)) | ||
2525 | goto fail; | ||
2526 | |||
2527 | tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
2528 | if (!tconn->current_epoch) | ||
2529 | goto fail; | ||
2530 | |||
2531 | INIT_LIST_HEAD(&tconn->transfer_log); | ||
2532 | |||
2533 | INIT_LIST_HEAD(&tconn->current_epoch->list); | ||
2534 | tconn->epochs = 1; | ||
2535 | spin_lock_init(&tconn->epoch_lock); | ||
2536 | tconn->write_ordering = WO_bdev_flush; | ||
2537 | |||
2538 | tconn->send.seen_any_write_yet = false; | ||
2539 | tconn->send.current_epoch_nr = 0; | ||
2540 | tconn->send.current_epoch_writes = 0; | ||
2541 | |||
2542 | tconn->cstate = C_STANDALONE; | ||
2543 | mutex_init(&tconn->cstate_mutex); | ||
2544 | spin_lock_init(&tconn->req_lock); | ||
2545 | mutex_init(&tconn->conf_update); | ||
2546 | init_waitqueue_head(&tconn->ping_wait); | ||
2547 | idr_init(&tconn->volumes); | ||
2548 | |||
2549 | drbd_init_workqueue(&tconn->sender_work); | ||
2550 | mutex_init(&tconn->data.mutex); | ||
2551 | mutex_init(&tconn->meta.mutex); | ||
2552 | |||
2553 | drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver"); | ||
2554 | drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker"); | ||
2555 | drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender"); | ||
2556 | |||
2557 | kref_init(&tconn->kref); | ||
2558 | list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns); | ||
2559 | |||
2560 | return tconn; | ||
2561 | |||
2562 | fail: | ||
2563 | kfree(tconn->current_epoch); | ||
2564 | free_cpumask_var(tconn->cpu_mask); | ||
2565 | drbd_free_socket(&tconn->meta); | ||
2566 | drbd_free_socket(&tconn->data); | ||
2567 | kfree(tconn->name); | ||
2568 | kfree(tconn); | ||
2569 | |||
2570 | return NULL; | ||
2571 | } | ||
2572 | |||
2573 | void conn_destroy(struct kref *kref) | ||
2574 | { | ||
2575 | struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref); | ||
2576 | |||
2577 | if (atomic_read(&tconn->current_epoch->epoch_size) != 0) | ||
2578 | conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size)); | ||
2579 | kfree(tconn->current_epoch); | ||
2580 | |||
2581 | idr_destroy(&tconn->volumes); | ||
2582 | |||
2583 | free_cpumask_var(tconn->cpu_mask); | ||
2584 | drbd_free_socket(&tconn->meta); | ||
2585 | drbd_free_socket(&tconn->data); | ||
2586 | kfree(tconn->name); | ||
2587 | kfree(tconn->int_dig_in); | ||
2588 | kfree(tconn->int_dig_vv); | ||
2589 | kfree(tconn); | ||
2590 | } | ||
2591 | |||
2592 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) | ||
3591 | { | 2593 | { |
3592 | struct drbd_conf *mdev; | 2594 | struct drbd_conf *mdev; |
3593 | struct gendisk *disk; | 2595 | struct gendisk *disk; |
3594 | struct request_queue *q; | 2596 | struct request_queue *q; |
2597 | int vnr_got = vnr; | ||
2598 | int minor_got = minor; | ||
2599 | enum drbd_ret_code err = ERR_NOMEM; | ||
2600 | |||
2601 | mdev = minor_to_mdev(minor); | ||
2602 | if (mdev) | ||
2603 | return ERR_MINOR_EXISTS; | ||
3595 | 2604 | ||
3596 | /* GFP_KERNEL, we are outside of all write-out paths */ | 2605 | /* GFP_KERNEL, we are outside of all write-out paths */ |
3597 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | 2606 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); |
3598 | if (!mdev) | 2607 | if (!mdev) |
3599 | return NULL; | 2608 | return ERR_NOMEM; |
3600 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | 2609 | |
3601 | goto out_no_cpumask; | 2610 | kref_get(&tconn->kref); |
2611 | mdev->tconn = tconn; | ||
3602 | 2612 | ||
3603 | mdev->minor = minor; | 2613 | mdev->minor = minor; |
2614 | mdev->vnr = vnr; | ||
3604 | 2615 | ||
3605 | drbd_init_set_defaults(mdev); | 2616 | drbd_init_set_defaults(mdev); |
3606 | 2617 | ||
@@ -3638,7 +2649,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3638 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); | 2649 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); |
3639 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | 2650 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); |
3640 | blk_queue_merge_bvec(q, drbd_merge_bvec); | 2651 | blk_queue_merge_bvec(q, drbd_merge_bvec); |
3641 | q->queue_lock = &mdev->req_lock; | 2652 | q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */ |
3642 | 2653 | ||
3643 | mdev->md_io_page = alloc_page(GFP_KERNEL); | 2654 | mdev->md_io_page = alloc_page(GFP_KERNEL); |
3644 | if (!mdev->md_io_page) | 2655 | if (!mdev->md_io_page) |
@@ -3646,30 +2657,44 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3646 | 2657 | ||
3647 | if (drbd_bm_init(mdev)) | 2658 | if (drbd_bm_init(mdev)) |
3648 | goto out_no_bitmap; | 2659 | goto out_no_bitmap; |
3649 | /* no need to lock access, we are still initializing this minor device. */ | 2660 | mdev->read_requests = RB_ROOT; |
3650 | if (!tl_init(mdev)) | 2661 | mdev->write_requests = RB_ROOT; |
3651 | goto out_no_tl; | 2662 | |
3652 | 2663 | if (!idr_pre_get(&minors, GFP_KERNEL)) | |
3653 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | 2664 | goto out_no_minor_idr; |
3654 | if (!mdev->app_reads_hash) | 2665 | if (idr_get_new_above(&minors, mdev, minor, &minor_got)) |
3655 | goto out_no_app_reads; | 2666 | goto out_no_minor_idr; |
3656 | 2667 | if (minor_got != minor) { | |
3657 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | 2668 | err = ERR_MINOR_EXISTS; |
3658 | if (!mdev->current_epoch) | 2669 | drbd_msg_put_info("requested minor exists already"); |
3659 | goto out_no_epoch; | 2670 | goto out_idr_remove_minor; |
3660 | 2671 | } | |
3661 | INIT_LIST_HEAD(&mdev->current_epoch->list); | 2672 | |
3662 | mdev->epochs = 1; | 2673 | if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) |
3663 | 2674 | goto out_idr_remove_minor; | |
3664 | return mdev; | 2675 | if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) |
3665 | 2676 | goto out_idr_remove_minor; | |
3666 | /* out_whatever_else: | 2677 | if (vnr_got != vnr) { |
3667 | kfree(mdev->current_epoch); */ | 2678 | err = ERR_INVALID_REQUEST; |
3668 | out_no_epoch: | 2679 | drbd_msg_put_info("requested volume exists already"); |
3669 | kfree(mdev->app_reads_hash); | 2680 | goto out_idr_remove_vol; |
3670 | out_no_app_reads: | 2681 | } |
3671 | tl_cleanup(mdev); | 2682 | add_disk(disk); |
3672 | out_no_tl: | 2683 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ |
2684 | |||
2685 | /* inherit the connection state */ | ||
2686 | mdev->state.conn = tconn->cstate; | ||
2687 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2688 | drbd_connected(mdev); | ||
2689 | |||
2690 | return NO_ERROR; | ||
2691 | |||
2692 | out_idr_remove_vol: | ||
2693 | idr_remove(&tconn->volumes, vnr_got); | ||
2694 | out_idr_remove_minor: | ||
2695 | idr_remove(&minors, minor_got); | ||
2696 | synchronize_rcu(); | ||
2697 | out_no_minor_idr: | ||
3673 | drbd_bm_cleanup(mdev); | 2698 | drbd_bm_cleanup(mdev); |
3674 | out_no_bitmap: | 2699 | out_no_bitmap: |
3675 | __free_page(mdev->md_io_page); | 2700 | __free_page(mdev->md_io_page); |
@@ -3678,55 +2703,25 @@ out_no_io_page: | |||
3678 | out_no_disk: | 2703 | out_no_disk: |
3679 | blk_cleanup_queue(q); | 2704 | blk_cleanup_queue(q); |
3680 | out_no_q: | 2705 | out_no_q: |
3681 | free_cpumask_var(mdev->cpu_mask); | ||
3682 | out_no_cpumask: | ||
3683 | kfree(mdev); | ||
3684 | return NULL; | ||
3685 | } | ||
3686 | |||
3687 | /* counterpart of drbd_new_device. | ||
3688 | * last part of drbd_delete_device. */ | ||
3689 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3690 | { | ||
3691 | kfree(mdev->current_epoch); | ||
3692 | kfree(mdev->app_reads_hash); | ||
3693 | tl_cleanup(mdev); | ||
3694 | if (mdev->bitmap) /* should no longer be there. */ | ||
3695 | drbd_bm_cleanup(mdev); | ||
3696 | __free_page(mdev->md_io_page); | ||
3697 | put_disk(mdev->vdisk); | ||
3698 | blk_cleanup_queue(mdev->rq_queue); | ||
3699 | free_cpumask_var(mdev->cpu_mask); | ||
3700 | drbd_free_tl_hash(mdev); | ||
3701 | kfree(mdev); | 2706 | kfree(mdev); |
2707 | kref_put(&tconn->kref, &conn_destroy); | ||
2708 | return err; | ||
3702 | } | 2709 | } |
3703 | 2710 | ||
3704 | |||
3705 | int __init drbd_init(void) | 2711 | int __init drbd_init(void) |
3706 | { | 2712 | { |
3707 | int err; | 2713 | int err; |
3708 | 2714 | ||
3709 | if (sizeof(struct p_handshake) != 80) { | ||
3710 | printk(KERN_ERR | ||
3711 | "drbd: never change the size or layout " | ||
3712 | "of the HandShake packet.\n"); | ||
3713 | return -EINVAL; | ||
3714 | } | ||
3715 | |||
3716 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { | 2715 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { |
3717 | printk(KERN_ERR | 2716 | printk(KERN_ERR |
3718 | "drbd: invalid minor_count (%d)\n", minor_count); | 2717 | "drbd: invalid minor_count (%d)\n", minor_count); |
3719 | #ifdef MODULE | 2718 | #ifdef MODULE |
3720 | return -EINVAL; | 2719 | return -EINVAL; |
3721 | #else | 2720 | #else |
3722 | minor_count = 8; | 2721 | minor_count = DRBD_MINOR_COUNT_DEF; |
3723 | #endif | 2722 | #endif |
3724 | } | 2723 | } |
3725 | 2724 | ||
3726 | err = drbd_nl_init(); | ||
3727 | if (err) | ||
3728 | return err; | ||
3729 | |||
3730 | err = register_blkdev(DRBD_MAJOR, "drbd"); | 2725 | err = register_blkdev(DRBD_MAJOR, "drbd"); |
3731 | if (err) { | 2726 | if (err) { |
3732 | printk(KERN_ERR | 2727 | printk(KERN_ERR |
@@ -3735,6 +2730,13 @@ int __init drbd_init(void) | |||
3735 | return err; | 2730 | return err; |
3736 | } | 2731 | } |
3737 | 2732 | ||
2733 | err = drbd_genl_register(); | ||
2734 | if (err) { | ||
2735 | printk(KERN_ERR "drbd: unable to register generic netlink family\n"); | ||
2736 | goto fail; | ||
2737 | } | ||
2738 | |||
2739 | |||
3738 | register_reboot_notifier(&drbd_notifier); | 2740 | register_reboot_notifier(&drbd_notifier); |
3739 | 2741 | ||
3740 | /* | 2742 | /* |
@@ -3745,22 +2747,29 @@ int __init drbd_init(void) | |||
3745 | init_waitqueue_head(&drbd_pp_wait); | 2747 | init_waitqueue_head(&drbd_pp_wait); |
3746 | 2748 | ||
3747 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2749 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
3748 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | 2750 | idr_init(&minors); |
3749 | GFP_KERNEL); | ||
3750 | if (!minor_table) | ||
3751 | goto Enomem; | ||
3752 | 2751 | ||
3753 | err = drbd_create_mempools(); | 2752 | err = drbd_create_mempools(); |
3754 | if (err) | 2753 | if (err) |
3755 | goto Enomem; | 2754 | goto fail; |
3756 | 2755 | ||
3757 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); | 2756 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
3758 | if (!drbd_proc) { | 2757 | if (!drbd_proc) { |
3759 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 2758 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
3760 | goto Enomem; | 2759 | goto fail; |
3761 | } | 2760 | } |
3762 | 2761 | ||
3763 | rwlock_init(&global_state_lock); | 2762 | rwlock_init(&global_state_lock); |
2763 | INIT_LIST_HEAD(&drbd_tconns); | ||
2764 | |||
2765 | retry.wq = create_singlethread_workqueue("drbd-reissue"); | ||
2766 | if (!retry.wq) { | ||
2767 | printk(KERN_ERR "drbd: unable to create retry workqueue\n"); | ||
2768 | goto fail; | ||
2769 | } | ||
2770 | INIT_WORK(&retry.worker, do_retry); | ||
2771 | spin_lock_init(&retry.lock); | ||
2772 | INIT_LIST_HEAD(&retry.writes); | ||
3764 | 2773 | ||
3765 | printk(KERN_INFO "drbd: initialized. " | 2774 | printk(KERN_INFO "drbd: initialized. " |
3766 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | 2775 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", |
@@ -3768,11 +2777,10 @@ int __init drbd_init(void) | |||
3768 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | 2777 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); |
3769 | printk(KERN_INFO "drbd: registered as block device major %d\n", | 2778 | printk(KERN_INFO "drbd: registered as block device major %d\n", |
3770 | DRBD_MAJOR); | 2779 | DRBD_MAJOR); |
3771 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3772 | 2780 | ||
3773 | return 0; /* Success! */ | 2781 | return 0; /* Success! */ |
3774 | 2782 | ||
3775 | Enomem: | 2783 | fail: |
3776 | drbd_cleanup(); | 2784 | drbd_cleanup(); |
3777 | if (err == -ENOMEM) | 2785 | if (err == -ENOMEM) |
3778 | /* currently always the case */ | 2786 | /* currently always the case */ |
@@ -3793,47 +2801,42 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) | |||
3793 | kfree(ldev); | 2801 | kfree(ldev); |
3794 | } | 2802 | } |
3795 | 2803 | ||
3796 | void drbd_free_sock(struct drbd_conf *mdev) | 2804 | void drbd_free_sock(struct drbd_tconn *tconn) |
3797 | { | 2805 | { |
3798 | if (mdev->data.socket) { | 2806 | if (tconn->data.socket) { |
3799 | mutex_lock(&mdev->data.mutex); | 2807 | mutex_lock(&tconn->data.mutex); |
3800 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | 2808 | kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR); |
3801 | sock_release(mdev->data.socket); | 2809 | sock_release(tconn->data.socket); |
3802 | mdev->data.socket = NULL; | 2810 | tconn->data.socket = NULL; |
3803 | mutex_unlock(&mdev->data.mutex); | 2811 | mutex_unlock(&tconn->data.mutex); |
3804 | } | 2812 | } |
3805 | if (mdev->meta.socket) { | 2813 | if (tconn->meta.socket) { |
3806 | mutex_lock(&mdev->meta.mutex); | 2814 | mutex_lock(&tconn->meta.mutex); |
3807 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | 2815 | kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR); |
3808 | sock_release(mdev->meta.socket); | 2816 | sock_release(tconn->meta.socket); |
3809 | mdev->meta.socket = NULL; | 2817 | tconn->meta.socket = NULL; |
3810 | mutex_unlock(&mdev->meta.mutex); | 2818 | mutex_unlock(&tconn->meta.mutex); |
3811 | } | 2819 | } |
3812 | } | 2820 | } |
3813 | 2821 | ||
2822 | /* meta data management */ | ||
3814 | 2823 | ||
3815 | void drbd_free_resources(struct drbd_conf *mdev) | 2824 | void conn_md_sync(struct drbd_tconn *tconn) |
3816 | { | 2825 | { |
3817 | crypto_free_hash(mdev->csums_tfm); | 2826 | struct drbd_conf *mdev; |
3818 | mdev->csums_tfm = NULL; | 2827 | int vnr; |
3819 | crypto_free_hash(mdev->verify_tfm); | ||
3820 | mdev->verify_tfm = NULL; | ||
3821 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3822 | mdev->cram_hmac_tfm = NULL; | ||
3823 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3824 | mdev->integrity_w_tfm = NULL; | ||
3825 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3826 | mdev->integrity_r_tfm = NULL; | ||
3827 | |||
3828 | drbd_free_sock(mdev); | ||
3829 | 2828 | ||
3830 | __no_warn(local, | 2829 | rcu_read_lock(); |
3831 | drbd_free_bc(mdev->ldev); | 2830 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
3832 | mdev->ldev = NULL;); | 2831 | kref_get(&mdev->kref); |
2832 | rcu_read_unlock(); | ||
2833 | drbd_md_sync(mdev); | ||
2834 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
2835 | rcu_read_lock(); | ||
2836 | } | ||
2837 | rcu_read_unlock(); | ||
3833 | } | 2838 | } |
3834 | 2839 | ||
3835 | /* meta data management */ | ||
3836 | |||
3837 | struct meta_data_on_disk { | 2840 | struct meta_data_on_disk { |
3838 | u64 la_size; /* last agreed size. */ | 2841 | u64 la_size; /* last agreed size. */ |
3839 | u64 uuid[UI_SIZE]; /* UUIDs. */ | 2842 | u64 uuid[UI_SIZE]; /* UUIDs. */ |
@@ -3844,7 +2847,7 @@ struct meta_data_on_disk { | |||
3844 | u32 md_size_sect; | 2847 | u32 md_size_sect; |
3845 | u32 al_offset; /* offset to this block */ | 2848 | u32 al_offset; /* offset to this block */ |
3846 | u32 al_nr_extents; /* important for restoring the AL */ | 2849 | u32 al_nr_extents; /* important for restoring the AL */ |
3847 | /* `-- act_log->nr_elements <-- sync_conf.al_extents */ | 2850 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ |
3848 | u32 bm_offset; /* offset to the bitmap, from here */ | 2851 | u32 bm_offset; /* offset to the bitmap, from here */ |
3849 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | 2852 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ |
3850 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ | 2853 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ |
@@ -3882,7 +2885,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3882 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2885 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3883 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | 2886 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
3884 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | 2887 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); |
3885 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | 2888 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); |
3886 | 2889 | ||
3887 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | 2890 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); |
3888 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | 2891 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); |
@@ -3896,7 +2899,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3896 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | 2899 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); |
3897 | sector = mdev->ldev->md.md_offset; | 2900 | sector = mdev->ldev->md.md_offset; |
3898 | 2901 | ||
3899 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 2902 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
3900 | /* this was a try anyways ... */ | 2903 | /* this was a try anyways ... */ |
3901 | dev_err(DEV, "meta data update failed!\n"); | 2904 | dev_err(DEV, "meta data update failed!\n"); |
3902 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 2905 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
@@ -3917,11 +2920,12 @@ out: | |||
3917 | * @bdev: Device from which the meta data should be read in. | 2920 | * @bdev: Device from which the meta data should be read in. |
3918 | * | 2921 | * |
3919 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case | 2922 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case |
3920 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | 2923 | * something goes wrong. |
3921 | */ | 2924 | */ |
3922 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | 2925 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) |
3923 | { | 2926 | { |
3924 | struct meta_data_on_disk *buffer; | 2927 | struct meta_data_on_disk *buffer; |
2928 | u32 magic, flags; | ||
3925 | int i, rv = NO_ERROR; | 2929 | int i, rv = NO_ERROR; |
3926 | 2930 | ||
3927 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 2931 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
@@ -3931,7 +2935,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3931 | if (!buffer) | 2935 | if (!buffer) |
3932 | goto out; | 2936 | goto out; |
3933 | 2937 | ||
3934 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | 2938 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { |
3935 | /* NOTE: can't do normal error processing here as this is | 2939 | /* NOTE: can't do normal error processing here as this is |
3936 | called BEFORE disk is attached */ | 2940 | called BEFORE disk is attached */ |
3937 | dev_err(DEV, "Error while reading metadata.\n"); | 2941 | dev_err(DEV, "Error while reading metadata.\n"); |
@@ -3939,8 +2943,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3939 | goto err; | 2943 | goto err; |
3940 | } | 2944 | } |
3941 | 2945 | ||
3942 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | 2946 | magic = be32_to_cpu(buffer->magic); |
3943 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | 2947 | flags = be32_to_cpu(buffer->flags); |
2948 | if (magic == DRBD_MD_MAGIC_84_UNCLEAN || | ||
2949 | (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { | ||
2950 | /* btw: that's Activity Log clean, not "all" clean. */ | ||
2951 | dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); | ||
2952 | rv = ERR_MD_UNCLEAN; | ||
2953 | goto err; | ||
2954 | } | ||
2955 | if (magic != DRBD_MD_MAGIC_08) { | ||
2956 | if (magic == DRBD_MD_MAGIC_07) | ||
2957 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); | ||
2958 | else | ||
2959 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); | ||
3944 | rv = ERR_MD_INVALID; | 2960 | rv = ERR_MD_INVALID; |
3945 | goto err; | 2961 | goto err; |
3946 | } | 2962 | } |
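The reworked magic handling in drbd_md_read() distinguishes a clean v08 superblock from unclean or v07 metadata by converting the big-endian on-disk magic and flags fields with be32_to_cpu() before comparing them. A userspace sketch of the same validate-on-read step (made-up magic value and flag bit, <endian.h> conversions in place of the kernel's cpu_to_be32()/be32_to_cpu() helpers) could look like:

```c
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical on-disk header: fields are stored big-endian,
 * like struct meta_data_on_disk in the driver. */
struct ondisk_header {
	uint32_t magic;
	uint32_t flags;
};

#define EXAMPLE_MAGIC	0x4d455441U	/* "META", made-up value */
#define FLAG_AL_CLEAN	0x01U		/* made-up flag bit */

/* Return 0 if the header is acceptable, -1 with a message otherwise. */
static int check_header(const struct ondisk_header *h)
{
	uint32_t magic = be32toh(h->magic);	/* cf. be32_to_cpu() */
	uint32_t flags = be32toh(h->flags);

	if (magic != EXAMPLE_MAGIC) {
		fprintf(stderr, "meta data magic not found\n");
		return -1;
	}
	if (!(flags & FLAG_AL_CLEAN)) {
		fprintf(stderr, "found unclean meta data\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	/* Build a header the way it would sit on disk: big-endian fields. */
	struct ondisk_header h = {
		.magic = htobe32(EXAMPLE_MAGIC),
		.flags = htobe32(FLAG_AL_CLEAN),
	};

	return check_header(&h) ? 1 : 0;
}
```

The real driver goes further, telling apart the old v07 magic and the "v08 but unclean" case so it can print targeted error messages, as the hunk above shows.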
@@ -3974,20 +2990,16 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3974 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2990 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3975 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | 2991 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); |
3976 | bdev->md.flags = be32_to_cpu(buffer->flags); | 2992 | bdev->md.flags = be32_to_cpu(buffer->flags); |
3977 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3978 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | 2993 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); |
3979 | 2994 | ||
3980 | spin_lock_irq(&mdev->req_lock); | 2995 | spin_lock_irq(&mdev->tconn->req_lock); |
3981 | if (mdev->state.conn < C_CONNECTED) { | 2996 | if (mdev->state.conn < C_CONNECTED) { |
3982 | unsigned int peer; | 2997 | unsigned int peer; |
3983 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); | 2998 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); |
3984 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); | 2999 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); |
3985 | mdev->peer_max_bio_size = peer; | 3000 | mdev->peer_max_bio_size = peer; |
3986 | } | 3001 | } |
3987 | spin_unlock_irq(&mdev->req_lock); | 3002 | spin_unlock_irq(&mdev->tconn->req_lock); |
3988 | |||
3989 | if (mdev->sync_conf.al_extents < 7) | ||
3990 | mdev->sync_conf.al_extents = 127; | ||
3991 | 3003 | ||
3992 | err: | 3004 | err: |
3993 | drbd_md_put_buffer(mdev); | 3005 | drbd_md_put_buffer(mdev); |
@@ -4022,7 +3034,7 @@ void drbd_md_mark_dirty(struct drbd_conf *mdev) | |||
4022 | } | 3034 | } |
4023 | #endif | 3035 | #endif |
4024 | 3036 | ||
4025 | static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | 3037 | void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) |
4026 | { | 3038 | { |
4027 | int i; | 3039 | int i; |
4028 | 3040 | ||
@@ -4030,7 +3042,7 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | |||
4030 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; | 3042 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; |
4031 | } | 3043 | } |
4032 | 3044 | ||
4033 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | 3045 | void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) |
4034 | { | 3046 | { |
4035 | if (idx == UI_CURRENT) { | 3047 | if (idx == UI_CURRENT) { |
4036 | if (mdev->state.role == R_PRIMARY) | 3048 | if (mdev->state.role == R_PRIMARY) |
@@ -4045,14 +3057,24 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | |||
4045 | drbd_md_mark_dirty(mdev); | 3057 | drbd_md_mark_dirty(mdev); |
4046 | } | 3058 | } |
4047 | 3059 | ||
3060 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3061 | { | ||
3062 | unsigned long flags; | ||
3063 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
3064 | __drbd_uuid_set(mdev, idx, val); | ||
3065 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
3066 | } | ||
4048 | 3067 | ||
4049 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | 3068 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) |
4050 | { | 3069 | { |
3070 | unsigned long flags; | ||
3071 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
4051 | if (mdev->ldev->md.uuid[idx]) { | 3072 | if (mdev->ldev->md.uuid[idx]) { |
4052 | drbd_uuid_move_history(mdev); | 3073 | drbd_uuid_move_history(mdev); |
4053 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; | 3074 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; |
4054 | } | 3075 | } |
4055 | _drbd_uuid_set(mdev, idx, val); | 3076 | __drbd_uuid_set(mdev, idx, val); |
3077 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
4056 | } | 3078 | } |
4057 | 3079 | ||
4058 | /** | 3080 | /** |
@@ -4065,15 +3087,20 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | |||
4065 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | 3087 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) |
4066 | { | 3088 | { |
4067 | u64 val; | 3089 | u64 val; |
4068 | unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; | 3090 | unsigned long long bm_uuid; |
3091 | |||
3092 | get_random_bytes(&val, sizeof(u64)); | ||
3093 | |||
3094 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
3095 | bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; | ||
4069 | 3096 | ||
4070 | if (bm_uuid) | 3097 | if (bm_uuid) |
4071 | dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); | 3098 | dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); |
4072 | 3099 | ||
4073 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; | 3100 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; |
3101 | __drbd_uuid_set(mdev, UI_CURRENT, val); | ||
3102 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
4074 | 3103 | ||
4075 | get_random_bytes(&val, sizeof(u64)); | ||
4076 | _drbd_uuid_set(mdev, UI_CURRENT, val); | ||
4077 | drbd_print_uuids(mdev, "new current UUID"); | 3104 | drbd_print_uuids(mdev, "new current UUID"); |
4078 | /* get it to stable storage _now_ */ | 3105 | /* get it to stable storage _now_ */ |
4079 | drbd_md_sync(mdev); | 3106 | drbd_md_sync(mdev); |
@@ -4081,9 +3108,11 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | |||
4081 | 3108 | ||
4082 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | 3109 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) |
4083 | { | 3110 | { |
3111 | unsigned long flags; | ||
4084 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) | 3112 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) |
4085 | return; | 3113 | return; |
4086 | 3114 | ||
3115 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
4087 | if (val == 0) { | 3116 | if (val == 0) { |
4088 | drbd_uuid_move_history(mdev); | 3117 | drbd_uuid_move_history(mdev); |
4089 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | 3118 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; |
@@ -4095,6 +3124,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | |||
4095 | 3124 | ||
4096 | mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); | 3125 | mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); |
4097 | } | 3126 | } |
3127 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
3128 | |||
4098 | drbd_md_mark_dirty(mdev); | 3129 | drbd_md_mark_dirty(mdev); |
4099 | } | 3130 | } |
4100 | 3131 | ||
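A pattern repeated across the UUID hunks above: each setter is split into a double-underscore helper that assumes md.uuid_lock is already held, plus a thin wrapper that takes the lock with spin_lock_irqsave(). A reduced sketch of that convention, with simplified names and structure rather than the actual DRBD types:

#include <linux/spinlock.h>
#include <linux/types.h>

struct uuid_md {
	spinlock_t uuid_lock;	/* owner initializes with spin_lock_init() */
	u64 uuid[4];
};

/* caller must hold md->uuid_lock */
static void __uuid_set(struct uuid_md *md, int idx, u64 val)
{
	md->uuid[idx] = val;
}

static void uuid_set(struct uuid_md *md, int idx, u64 val)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uuid_lock, flags);
	__uuid_set(md, idx, val);
	spin_unlock_irqrestore(&md->uuid_lock, flags);
}

Note how the diff uses the plain spin_lock_irq() form where the calling context is known (drbd_uuid_new_current()), and the irqsave form in the generic setters.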
@@ -4146,9 +3177,10 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | |||
4146 | return rv; | 3177 | return rv; |
4147 | } | 3178 | } |
4148 | 3179 | ||
4149 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3180 | static int w_bitmap_io(struct drbd_work *w, int unused) |
4150 | { | 3181 | { |
4151 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | 3182 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); |
3183 | struct drbd_conf *mdev = w->mdev; | ||
4152 | int rv = -EIO; | 3184 | int rv = -EIO; |
4153 | 3185 | ||
4154 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | 3186 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); |
@@ -4160,8 +3192,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
4160 | put_ldev(mdev); | 3192 | put_ldev(mdev); |
4161 | } | 3193 | } |
4162 | 3194 | ||
4163 | clear_bit(BITMAP_IO, &mdev->flags); | 3195 | clear_bit_unlock(BITMAP_IO, &mdev->flags); |
4164 | smp_mb__after_clear_bit(); | ||
4165 | wake_up(&mdev->misc_wait); | 3196 | wake_up(&mdev->misc_wait); |
4166 | 3197 | ||
4167 | if (work->done) | 3198 | if (work->done) |
@@ -4171,7 +3202,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
4171 | work->why = NULL; | 3202 | work->why = NULL; |
4172 | work->flags = 0; | 3203 | work->flags = 0; |
4173 | 3204 | ||
4174 | return 1; | 3205 | return 0; |
4175 | } | 3206 | } |
4176 | 3207 | ||
4177 | void drbd_ldev_destroy(struct drbd_conf *mdev) | 3208 | void drbd_ldev_destroy(struct drbd_conf *mdev) |
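In the w_bitmap_io() hunk, the open-coded clear_bit(); smp_mb__after_clear_bit(); pair becomes a single clear_bit_unlock(), which clears the bit with release semantics before the wake_up(). A hedged sketch of that idiom in isolation; the flag word and wait queue here are stand-ins, not DRBD fields:

#include <linux/bitops.h>
#include <linux/wait.h>

static void release_busy_bit(unsigned long *flags, int bit,
			     wait_queue_head_t *wq)
{
	/* stores done while the bit was set are visible before it clears */
	clear_bit_unlock(bit, flags);
	wake_up(wq);
}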
@@ -4184,29 +3215,51 @@ void drbd_ldev_destroy(struct drbd_conf *mdev) | |||
4184 | drbd_free_bc(mdev->ldev); | 3215 | drbd_free_bc(mdev->ldev); |
4185 | mdev->ldev = NULL;); | 3216 | mdev->ldev = NULL;); |
4186 | 3217 | ||
4187 | if (mdev->md_io_tmpp) { | ||
4188 | __free_page(mdev->md_io_tmpp); | ||
4189 | mdev->md_io_tmpp = NULL; | ||
4190 | } | ||
4191 | clear_bit(GO_DISKLESS, &mdev->flags); | 3218 | clear_bit(GO_DISKLESS, &mdev->flags); |
4192 | } | 3219 | } |
4193 | 3220 | ||
4194 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3221 | static int w_go_diskless(struct drbd_work *w, int unused) |
4195 | { | 3222 | { |
3223 | struct drbd_conf *mdev = w->mdev; | ||
3224 | |||
4196 | D_ASSERT(mdev->state.disk == D_FAILED); | 3225 | D_ASSERT(mdev->state.disk == D_FAILED); |
4197 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will | 3226 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will |
4198 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch | 3227 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch |
4199 | * the protected members anymore, though, so once put_ldev reaches zero | 3228 | * the protected members anymore, though, so once put_ldev reaches zero |
4200 | * again, it will be safe to free them. */ | 3229 | * again, it will be safe to free them. */ |
3230 | |||
3231 | /* Try to write changed bitmap pages, read errors may have just | ||
3232 | * set some bits outside the area covered by the activity log. | ||
3233 | * | ||
3234 | * If we have an IO error during the bitmap writeout, | ||
3235 | * we will want a full sync next time, just in case. | ||
3236 | * (Do we want a specific meta data flag for this?) | ||
3237 | * | ||
3238 | * If that does not make it to stable storage either, | ||
3239 | * we cannot do anything about that anymore. | ||
3240 | * | ||
3241 | * We still need to check if both bitmap and ldev are present, we may | ||
3242 | * end up here after a failed attach, before ldev was even assigned. | ||
3243 | */ | ||
3244 | if (mdev->bitmap && mdev->ldev) { | ||
3245 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, | ||
3246 | "detach", BM_LOCKED_MASK)) { | ||
3247 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { | ||
3248 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | ||
3249 | drbd_md_sync(mdev); | ||
3250 | } | ||
3251 | } | ||
3252 | } | ||
3253 | |||
4201 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | 3254 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
4202 | return 1; | 3255 | return 0; |
4203 | } | 3256 | } |
4204 | 3257 | ||
4205 | void drbd_go_diskless(struct drbd_conf *mdev) | 3258 | void drbd_go_diskless(struct drbd_conf *mdev) |
4206 | { | 3259 | { |
4207 | D_ASSERT(mdev->state.disk == D_FAILED); | 3260 | D_ASSERT(mdev->state.disk == D_FAILED); |
4208 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) | 3261 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
4209 | drbd_queue_work(&mdev->data.work, &mdev->go_diskless); | 3262 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); |
4210 | } | 3263 | } |
4211 | 3264 | ||
4212 | /** | 3265 | /** |
@@ -4226,7 +3279,7 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4226 | void (*done)(struct drbd_conf *, int), | 3279 | void (*done)(struct drbd_conf *, int), |
4227 | char *why, enum bm_flag flags) | 3280 | char *why, enum bm_flag flags) |
4228 | { | 3281 | { |
4229 | D_ASSERT(current == mdev->worker.task); | 3282 | D_ASSERT(current == mdev->tconn->worker.task); |
4230 | 3283 | ||
4231 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); | 3284 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); |
4232 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); | 3285 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); |
@@ -4240,13 +3293,13 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4240 | mdev->bm_io_work.why = why; | 3293 | mdev->bm_io_work.why = why; |
4241 | mdev->bm_io_work.flags = flags; | 3294 | mdev->bm_io_work.flags = flags; |
4242 | 3295 | ||
4243 | spin_lock_irq(&mdev->req_lock); | 3296 | spin_lock_irq(&mdev->tconn->req_lock); |
4244 | set_bit(BITMAP_IO, &mdev->flags); | 3297 | set_bit(BITMAP_IO, &mdev->flags); |
4245 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | 3298 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { |
4246 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | 3299 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
4247 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 3300 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
4248 | } | 3301 | } |
4249 | spin_unlock_irq(&mdev->req_lock); | 3302 | spin_unlock_irq(&mdev->tconn->req_lock); |
4250 | } | 3303 | } |
4251 | 3304 | ||
4252 | /** | 3305 | /** |
@@ -4263,7 +3316,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), | |||
4263 | { | 3316 | { |
4264 | int rv; | 3317 | int rv; |
4265 | 3318 | ||
4266 | D_ASSERT(current != mdev->worker.task); | 3319 | D_ASSERT(current != mdev->tconn->worker.task); |
4267 | 3320 | ||
4268 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) | 3321 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) |
4269 | drbd_suspend_io(mdev); | 3322 | drbd_suspend_io(mdev); |
@@ -4302,18 +3355,127 @@ static void md_sync_timer_fn(unsigned long data) | |||
4302 | { | 3355 | { |
4303 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 3356 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
4304 | 3357 | ||
4305 | drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); | 3358 | /* must not double-queue! */ |
3359 | if (list_empty(&mdev->md_sync_work.list)) | ||
3360 | drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work); | ||
4306 | } | 3361 | } |
4307 | 3362 | ||
4308 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3363 | static int w_md_sync(struct drbd_work *w, int unused) |
4309 | { | 3364 | { |
3365 | struct drbd_conf *mdev = w->mdev; | ||
3366 | |||
4310 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); | 3367 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); |
4311 | #ifdef DEBUG | 3368 | #ifdef DEBUG |
4312 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", | 3369 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", |
4313 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); | 3370 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); |
4314 | #endif | 3371 | #endif |
4315 | drbd_md_sync(mdev); | 3372 | drbd_md_sync(mdev); |
4316 | return 1; | 3373 | return 0; |
3374 | } | ||
3375 | |||
3376 | const char *cmdname(enum drbd_packet cmd) | ||
3377 | { | ||
3378 | /* THINK may need to become several global tables | ||
3379 | * when we want to support more than | ||
3380 | * one PRO_VERSION */ | ||
3381 | static const char *cmdnames[] = { | ||
3382 | [P_DATA] = "Data", | ||
3383 | [P_DATA_REPLY] = "DataReply", | ||
3384 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
3385 | [P_BARRIER] = "Barrier", | ||
3386 | [P_BITMAP] = "ReportBitMap", | ||
3387 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
3388 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
3389 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
3390 | [P_DATA_REQUEST] = "DataRequest", | ||
3391 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
3392 | [P_SYNC_PARAM] = "SyncParam", | ||
3393 | [P_SYNC_PARAM89] = "SyncParam89", | ||
3394 | [P_PROTOCOL] = "ReportProtocol", | ||
3395 | [P_UUIDS] = "ReportUUIDs", | ||
3396 | [P_SIZES] = "ReportSizes", | ||
3397 | [P_STATE] = "ReportState", | ||
3398 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
3399 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
3400 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
3401 | [P_PING] = "Ping", | ||
3402 | [P_PING_ACK] = "PingAck", | ||
3403 | [P_RECV_ACK] = "RecvAck", | ||
3404 | [P_WRITE_ACK] = "WriteAck", | ||
3405 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
3406 | [P_SUPERSEDED] = "Superseded", | ||
3407 | [P_NEG_ACK] = "NegAck", | ||
3408 | [P_NEG_DREPLY] = "NegDReply", | ||
3409 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
3410 | [P_BARRIER_ACK] = "BarrierAck", | ||
3411 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
3412 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
3413 | [P_OV_REQUEST] = "OVRequest", | ||
3414 | [P_OV_REPLY] = "OVReply", | ||
3415 | [P_OV_RESULT] = "OVResult", | ||
3416 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
3417 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
3418 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
3419 | [P_DELAY_PROBE] = "DelayProbe", | ||
3420 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
3421 | [P_RETRY_WRITE] = "RetryWrite", | ||
3422 | [P_RS_CANCEL] = "RSCancel", | ||
3423 | [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", | ||
3424 | [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", | ||
3425 | [P_RETRY_WRITE] = "retry_write", | ||
3426 | [P_PROTOCOL_UPDATE] = "protocol_update", | ||
3427 | |||
3428 | /* enum drbd_packet, but not commands - obsoleted flags: | ||
3429 | * P_MAY_IGNORE | ||
3430 | * P_MAX_OPT_CMD | ||
3431 | */ | ||
3432 | }; | ||
3433 | |||
3434 | /* too big for the array: 0xfffX */ | ||
3435 | if (cmd == P_INITIAL_META) | ||
3436 | return "InitialMeta"; | ||
3437 | if (cmd == P_INITIAL_DATA) | ||
3438 | return "InitialData"; | ||
3439 | if (cmd == P_CONNECTION_FEATURES) | ||
3440 | return "ConnectionFeatures"; | ||
3441 | if (cmd >= ARRAY_SIZE(cmdnames)) | ||
3442 | return "Unknown"; | ||
3443 | return cmdnames[cmd]; | ||
3444 | } | ||
3445 | |||
3446 | /** | ||
3447 | * drbd_wait_misc - wait for a request to make progress | ||
3448 | * @mdev: device associated with the request | ||
3449 | * @i: the struct drbd_interval embedded in struct drbd_request or | ||
3450 | * struct drbd_peer_request | ||
3451 | */ | ||
3452 | int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i) | ||
3453 | { | ||
3454 | struct net_conf *nc; | ||
3455 | DEFINE_WAIT(wait); | ||
3456 | long timeout; | ||
3457 | |||
3458 | rcu_read_lock(); | ||
3459 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
3460 | if (!nc) { | ||
3461 | rcu_read_unlock(); | ||
3462 | return -ETIMEDOUT; | ||
3463 | } | ||
3464 | timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; | ||
3465 | rcu_read_unlock(); | ||
3466 | |||
3467 | /* Indicate to wake up mdev->misc_wait on progress. */ | ||
3468 | i->waiting = true; | ||
3469 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); | ||
3470 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
3471 | timeout = schedule_timeout(timeout); | ||
3472 | finish_wait(&mdev->misc_wait, &wait); | ||
3473 | spin_lock_irq(&mdev->tconn->req_lock); | ||
3474 | if (!timeout || mdev->state.conn < C_CONNECTED) | ||
3475 | return -ETIMEDOUT; | ||
3476 | if (signal_pending(current)) | ||
3477 | return -ERESTARTSYS; | ||
3478 | return 0; | ||
4317 | } | 3479 | } |
4318 | 3480 | ||
4319 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 3481 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
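drbd_wait_misc(), added above, has a specific locking contract: it must be entered with tconn->req_lock held, drops the lock around the sleep, and re-acquires it before returning. A hypothetical caller shape is sketched below; it assumes the DRBD internal headers, and i_is_settled() is an invented predicate used only for illustration.

/* invented example; i_is_settled() is not a real DRBD helper */
static int wait_until_settled(struct drbd_conf *mdev, struct drbd_interval *i)
{
	int err = 0;

	spin_lock_irq(&mdev->tconn->req_lock);
	while (!i_is_settled(i)) {
		err = drbd_wait_misc(mdev, i);	/* drops and re-takes req_lock */
		if (err)			/* -ETIMEDOUT or -ERESTARTSYS */
			break;
	}
	spin_unlock_irq(&mdev->tconn->req_lock);
	return err;
}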
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index edb490aad8b4..2af26fc95280 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -29,159 +29,317 @@ | |||
29 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/connector.h> | ||
33 | #include <linux/blkpg.h> | 32 | #include <linux/blkpg.h> |
34 | #include <linux/cpumask.h> | 33 | #include <linux/cpumask.h> |
35 | #include "drbd_int.h" | 34 | #include "drbd_int.h" |
36 | #include "drbd_req.h" | 35 | #include "drbd_req.h" |
37 | #include "drbd_wrappers.h" | 36 | #include "drbd_wrappers.h" |
38 | #include <asm/unaligned.h> | 37 | #include <asm/unaligned.h> |
39 | #include <linux/drbd_tag_magic.h> | ||
40 | #include <linux/drbd_limits.h> | 38 | #include <linux/drbd_limits.h> |
41 | #include <linux/compiler.h> | ||
42 | #include <linux/kthread.h> | 39 | #include <linux/kthread.h> |
43 | 40 | ||
44 | static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); | 41 | #include <net/genetlink.h> |
45 | static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); | 42 | |
46 | static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); | 43 | /* .doit */ |
47 | 44 | // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); | |
48 | /* see get_sb_bdev and bd_claim */ | 45 | // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); |
46 | |||
47 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); | ||
48 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); | ||
49 | |||
50 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); | ||
51 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); | ||
52 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); | ||
53 | |||
54 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); | ||
55 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); | ||
56 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); | ||
57 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); | ||
58 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); | ||
59 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); | ||
60 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); | ||
61 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); | ||
62 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); | ||
63 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); | ||
64 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); | ||
65 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); | ||
66 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); | ||
67 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); | ||
68 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); | ||
69 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); | ||
70 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); | ||
71 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); | ||
72 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); | ||
73 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); | ||
74 | /* .dumpit */ | ||
75 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); | ||
76 | |||
77 | #include <linux/drbd_genl_api.h> | ||
78 | #include "drbd_nla.h" | ||
79 | #include <linux/genl_magic_func.h> | ||
80 | |||
81 | /* used blkdev_get_by_path, to claim our meta data device(s) */ | ||
49 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; | 82 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; |
50 | 83 | ||
51 | /* Generate the tag_list to struct functions */ | 84 | /* Configuration is strictly serialized, because generic netlink message |
52 | #define NL_PACKET(name, number, fields) \ | 85 | * processing is strictly serialized by the genl_lock(). |
53 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 86 | * Which means we can use one static global drbd_config_context struct. |
54 | unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ | 87 | */ |
55 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 88 | static struct drbd_config_context { |
56 | unsigned short *tags, struct name *arg) \ | 89 | /* assigned from drbd_genlmsghdr */ |
57 | { \ | 90 | unsigned int minor; |
58 | int tag; \ | 91 | /* assigned from request attributes, if present */ |
59 | int dlen; \ | 92 | unsigned int volume; |
60 | \ | 93 | #define VOLUME_UNSPECIFIED (-1U) |
61 | while ((tag = get_unaligned(tags++)) != TT_END) { \ | 94 | /* pointer into the request skb, |
62 | dlen = get_unaligned(tags++); \ | 95 | * limited lifetime! */ |
63 | switch (tag_number(tag)) { \ | 96 | char *resource_name; |
64 | fields \ | 97 | struct nlattr *my_addr; |
65 | default: \ | 98 | struct nlattr *peer_addr; |
66 | if (tag & T_MANDATORY) { \ | 99 | |
67 | dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ | 100 | /* reply buffer */ |
68 | return 0; \ | 101 | struct sk_buff *reply_skb; |
69 | } \ | 102 | /* pointer into reply buffer */ |
70 | } \ | 103 | struct drbd_genlmsghdr *reply_dh; |
71 | tags = (unsigned short *)((char *)tags + dlen); \ | 104 | /* resolved from attributes, if possible */ |
72 | } \ | 105 | struct drbd_conf *mdev; |
73 | return 1; \ | 106 | struct drbd_tconn *tconn; |
74 | } | 107 | } adm_ctx; |
75 | #define NL_INTEGER(pn, pr, member) \ | 108 | |
76 | case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ | 109 | static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) |
77 | arg->member = get_unaligned((int *)(tags)); \ | 110 | { |
78 | break; | 111 | genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); |
79 | #define NL_INT64(pn, pr, member) \ | 112 | if (genlmsg_reply(skb, info)) |
80 | case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ | 113 | printk(KERN_ERR "drbd: error sending genl reply\n"); |
81 | arg->member = get_unaligned((u64 *)(tags)); \ | 114 | } |
115 | |||
116 | /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only | ||
117 | * reason it could fail was no space in skb, and there are 4k available. */ | ||
118 | int drbd_msg_put_info(const char *info) | ||
119 | { | ||
120 | struct sk_buff *skb = adm_ctx.reply_skb; | ||
121 | struct nlattr *nla; | ||
122 | int err = -EMSGSIZE; | ||
123 | |||
124 | if (!info || !info[0]) | ||
125 | return 0; | ||
126 | |||
127 | nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); | ||
128 | if (!nla) | ||
129 | return err; | ||
130 | |||
131 | err = nla_put_string(skb, T_info_text, info); | ||
132 | if (err) { | ||
133 | nla_nest_cancel(skb, nla); | ||
134 | return err; | ||
135 | } else | ||
136 | nla_nest_end(skb, nla); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | /* This would be a good candidate for a "pre_doit" hook, | ||
141 | * and per-family private info->pointers. | ||
142 | * But we need to stay compatible with older kernels. | ||
143 | * If it returns successfully, adm_ctx members are valid. | ||
144 | */ | ||
145 | #define DRBD_ADM_NEED_MINOR 1 | ||
146 | #define DRBD_ADM_NEED_RESOURCE 2 | ||
147 | #define DRBD_ADM_NEED_CONNECTION 4 | ||
148 | static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, | ||
149 | unsigned flags) | ||
150 | { | ||
151 | struct drbd_genlmsghdr *d_in = info->userhdr; | ||
152 | const u8 cmd = info->genlhdr->cmd; | ||
153 | int err; | ||
154 | |||
155 | memset(&adm_ctx, 0, sizeof(adm_ctx)); | ||
156 | |||
157 | /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ | ||
158 | if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) | ||
159 | return -EPERM; | ||
160 | |||
161 | adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); | ||
162 | if (!adm_ctx.reply_skb) { | ||
163 | err = -ENOMEM; | ||
164 | goto fail; | ||
165 | } | ||
166 | |||
167 | adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, | ||
168 | info, &drbd_genl_family, 0, cmd); | ||
169 | /* put of a few bytes into a fresh skb of >= 4k will always succeed. | ||
170 | * but anyways */ | ||
171 | if (!adm_ctx.reply_dh) { | ||
172 | err = -ENOMEM; | ||
173 | goto fail; | ||
174 | } | ||
175 | |||
176 | adm_ctx.reply_dh->minor = d_in->minor; | ||
177 | adm_ctx.reply_dh->ret_code = NO_ERROR; | ||
178 | |||
179 | adm_ctx.volume = VOLUME_UNSPECIFIED; | ||
180 | if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { | ||
181 | struct nlattr *nla; | ||
182 | /* parse and validate only */ | ||
183 | err = drbd_cfg_context_from_attrs(NULL, info); | ||
184 | if (err) | ||
185 | goto fail; | ||
186 | |||
187 | /* It was present, and valid, | ||
188 | * copy it over to the reply skb. */ | ||
189 | err = nla_put_nohdr(adm_ctx.reply_skb, | ||
190 | info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, | ||
191 | info->attrs[DRBD_NLA_CFG_CONTEXT]); | ||
192 | if (err) | ||
193 | goto fail; | ||
194 | |||
195 | /* and assign stuff to the global adm_ctx */ | ||
196 | nla = nested_attr_tb[__nla_type(T_ctx_volume)]; | ||
197 | if (nla) | ||
198 | adm_ctx.volume = nla_get_u32(nla); | ||
199 | nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; | ||
200 | if (nla) | ||
201 | adm_ctx.resource_name = nla_data(nla); | ||
202 | adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; | ||
203 | adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; | ||
204 | if ((adm_ctx.my_addr && | ||
205 | nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || | ||
206 | (adm_ctx.peer_addr && | ||
207 | nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { | ||
208 | err = -EINVAL; | ||
209 | goto fail; | ||
210 | } | ||
211 | } | ||
212 | |||
213 | adm_ctx.minor = d_in->minor; | ||
214 | adm_ctx.mdev = minor_to_mdev(d_in->minor); | ||
215 | adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); | ||
216 | |||
217 | if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { | ||
218 | drbd_msg_put_info("unknown minor"); | ||
219 | return ERR_MINOR_INVALID; | ||
220 | } | ||
221 | if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { | ||
222 | drbd_msg_put_info("unknown resource"); | ||
223 | return ERR_INVALID_REQUEST; | ||
224 | } | ||
225 | |||
226 | if (flags & DRBD_ADM_NEED_CONNECTION) { | ||
227 | if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { | ||
228 | drbd_msg_put_info("no resource name expected"); | ||
229 | return ERR_INVALID_REQUEST; | ||
230 | } | ||
231 | if (adm_ctx.mdev) { | ||
232 | drbd_msg_put_info("no minor number expected"); | ||
233 | return ERR_INVALID_REQUEST; | ||
234 | } | ||
235 | if (adm_ctx.my_addr && adm_ctx.peer_addr) | ||
236 | adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), | ||
237 | nla_len(adm_ctx.my_addr), | ||
238 | nla_data(adm_ctx.peer_addr), | ||
239 | nla_len(adm_ctx.peer_addr)); | ||
240 | if (!adm_ctx.tconn) { | ||
241 | drbd_msg_put_info("unknown connection"); | ||
242 | return ERR_INVALID_REQUEST; | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /* some more paranoia, if the request was over-determined */ | ||
247 | if (adm_ctx.mdev && adm_ctx.tconn && | ||
248 | adm_ctx.mdev->tconn != adm_ctx.tconn) { | ||
249 | pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", | ||
250 | adm_ctx.minor, adm_ctx.resource_name, | ||
251 | adm_ctx.mdev->tconn->name); | ||
252 | drbd_msg_put_info("minor exists in different resource"); | ||
253 | return ERR_INVALID_REQUEST; | ||
254 | } | ||
255 | if (adm_ctx.mdev && | ||
256 | adm_ctx.volume != VOLUME_UNSPECIFIED && | ||
257 | adm_ctx.volume != adm_ctx.mdev->vnr) { | ||
258 | pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", | ||
259 | adm_ctx.minor, adm_ctx.volume, | ||
260 | adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); | ||
261 | drbd_msg_put_info("minor exists as different volume"); | ||
262 | return ERR_INVALID_REQUEST; | ||
263 | } | ||
264 | |||
265 | return NO_ERROR; | ||
266 | |||
267 | fail: | ||
268 | nlmsg_free(adm_ctx.reply_skb); | ||
269 | adm_ctx.reply_skb = NULL; | ||
270 | return err; | ||
271 | } | ||
272 | |||
273 | static int drbd_adm_finish(struct genl_info *info, int retcode) | ||
274 | { | ||
275 | if (adm_ctx.tconn) { | ||
276 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
277 | adm_ctx.tconn = NULL; | ||
278 | } | ||
279 | |||
280 | if (!adm_ctx.reply_skb) | ||
281 | return -ENOMEM; | ||
282 | |||
283 | adm_ctx.reply_dh->ret_code = retcode; | ||
284 | drbd_adm_send_reply(adm_ctx.reply_skb, info); | ||
285 | return 0; | ||
286 | } | ||
287 | |||
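drbd_adm_prepare() and drbd_adm_finish() above bracket every generic-netlink .doit handler in this file; drbd_adm_set_role() further down shows the full version. A condensed sketch of that handler shape, with an invented function name and the actual work and attribute parsing trimmed:

int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
{
	enum drbd_ret_code retcode;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;		/* could not even allocate a reply */
	if (retcode != NO_ERROR)
		goto out;

	/* ... operate on adm_ctx.mdev / adm_ctx.tconn ... */
out:
	drbd_adm_finish(info, retcode);	/* sends the reply, drops tconn ref */
	return 0;
}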
288 | static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) | ||
289 | { | ||
290 | char *afs; | ||
291 | |||
292 | /* FIXME: A future version will not allow this case. */ | ||
293 | if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) | ||
294 | return; | ||
295 | |||
296 | switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { | ||
297 | case AF_INET6: | ||
298 | afs = "ipv6"; | ||
299 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", | ||
300 | &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); | ||
82 | break; | 301 | break; |
83 | #define NL_BIT(pn, pr, member) \ | 302 | case AF_INET: |
84 | case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ | 303 | afs = "ipv4"; |
85 | arg->member = *(char *)(tags) ? 1 : 0; \ | 304 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
305 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); | ||
86 | break; | 306 | break; |
87 | #define NL_STRING(pn, pr, member, len) \ | 307 | default: |
88 | case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ | 308 | afs = "ssocks"; |
89 | if (dlen > len) { \ | 309 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
90 | dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ | 310 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); |
91 | #member, dlen, (unsigned int)len); \ | 311 | } |
92 | return 0; \ | 312 | snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); |
93 | } \ | 313 | } |
94 | arg->member ## _len = dlen; \ | ||
95 | memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ | ||
96 | break; | ||
97 | #include <linux/drbd_nl.h> | ||
98 | |||
99 | /* Generate the struct to tag_list functions */ | ||
100 | #define NL_PACKET(name, number, fields) \ | ||
101 | static unsigned short* \ | ||
102 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
103 | struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ | ||
104 | static unsigned short* \ | ||
105 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
106 | struct name *arg, unsigned short *tags) \ | ||
107 | { \ | ||
108 | fields \ | ||
109 | return tags; \ | ||
110 | } | ||
111 | |||
112 | #define NL_INTEGER(pn, pr, member) \ | ||
113 | put_unaligned(pn | pr | TT_INTEGER, tags++); \ | ||
114 | put_unaligned(sizeof(int), tags++); \ | ||
115 | put_unaligned(arg->member, (int *)tags); \ | ||
116 | tags = (unsigned short *)((char *)tags+sizeof(int)); | ||
117 | #define NL_INT64(pn, pr, member) \ | ||
118 | put_unaligned(pn | pr | TT_INT64, tags++); \ | ||
119 | put_unaligned(sizeof(u64), tags++); \ | ||
120 | put_unaligned(arg->member, (u64 *)tags); \ | ||
121 | tags = (unsigned short *)((char *)tags+sizeof(u64)); | ||
122 | #define NL_BIT(pn, pr, member) \ | ||
123 | put_unaligned(pn | pr | TT_BIT, tags++); \ | ||
124 | put_unaligned(sizeof(char), tags++); \ | ||
125 | *(char *)tags = arg->member; \ | ||
126 | tags = (unsigned short *)((char *)tags+sizeof(char)); | ||
127 | #define NL_STRING(pn, pr, member, len) \ | ||
128 | put_unaligned(pn | pr | TT_STRING, tags++); \ | ||
129 | put_unaligned(arg->member ## _len, tags++); \ | ||
130 | memcpy(tags, arg->member, arg->member ## _len); \ | ||
131 | tags = (unsigned short *)((char *)tags + arg->member ## _len); | ||
132 | #include <linux/drbd_nl.h> | ||
133 | |||
134 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); | ||
135 | void drbd_nl_send_reply(struct cn_msg *, int); | ||
136 | 314 | ||
137 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | 315 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) |
138 | { | 316 | { |
139 | char *envp[] = { "HOME=/", | 317 | char *envp[] = { "HOME=/", |
140 | "TERM=linux", | 318 | "TERM=linux", |
141 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | 319 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", |
142 | NULL, /* Will be set to address family */ | 320 | (char[20]) { }, /* address family */ |
143 | NULL, /* Will be set to address */ | 321 | (char[60]) { }, /* address */ |
144 | NULL }; | 322 | NULL }; |
145 | 323 | char mb[12]; | |
146 | char mb[12], af[20], ad[60], *afs; | ||
147 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | 324 | char *argv[] = {usermode_helper, cmd, mb, NULL }; |
325 | struct drbd_tconn *tconn = mdev->tconn; | ||
326 | struct sib_info sib; | ||
148 | int ret; | 327 | int ret; |
149 | 328 | ||
150 | if (current == mdev->worker.task) | 329 | if (current == tconn->worker.task) |
151 | set_bit(CALLBACK_PENDING, &mdev->flags); | 330 | set_bit(CALLBACK_PENDING, &tconn->flags); |
152 | 331 | ||
153 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | 332 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); |
154 | 333 | setup_khelper_env(tconn, envp); | |
155 | if (get_net_conf(mdev)) { | ||
156 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
157 | case AF_INET6: | ||
158 | afs = "ipv6"; | ||
159 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
160 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
161 | break; | ||
162 | case AF_INET: | ||
163 | afs = "ipv4"; | ||
164 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
165 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
166 | break; | ||
167 | default: | ||
168 | afs = "ssocks"; | ||
169 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
170 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
171 | } | ||
172 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
173 | envp[3]=af; | ||
174 | envp[4]=ad; | ||
175 | put_net_conf(mdev); | ||
176 | } | ||
177 | 334 | ||
178 | /* The helper may take some time. | 335 | /* The helper may take some time. |
179 | * write out any unsynced meta data changes now */ | 336 | * write out any unsynced meta data changes now */ |
180 | drbd_md_sync(mdev); | 337 | drbd_md_sync(mdev); |
181 | 338 | ||
182 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | 339 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); |
183 | 340 | sib.sib_reason = SIB_HELPER_PRE; | |
184 | drbd_bcast_ev_helper(mdev, cmd); | 341 | sib.helper_name = cmd; |
342 | drbd_bcast_event(mdev, &sib); | ||
185 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | 343 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); |
186 | if (ret) | 344 | if (ret) |
187 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 345 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
@@ -191,9 +349,46 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
191 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 349 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
192 | usermode_helper, cmd, mb, | 350 | usermode_helper, cmd, mb, |
193 | (ret >> 8) & 0xff, ret); | 351 | (ret >> 8) & 0xff, ret); |
352 | sib.sib_reason = SIB_HELPER_POST; | ||
353 | sib.helper_exit_code = ret; | ||
354 | drbd_bcast_event(mdev, &sib); | ||
355 | |||
356 | if (current == tconn->worker.task) | ||
357 | clear_bit(CALLBACK_PENDING, &tconn->flags); | ||
358 | |||
359 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
360 | ret = 0; | ||
361 | |||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | int conn_khelper(struct drbd_tconn *tconn, char *cmd) | ||
366 | { | ||
367 | char *envp[] = { "HOME=/", | ||
368 | "TERM=linux", | ||
369 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
370 | (char[20]) { }, /* address family */ | ||
371 | (char[60]) { }, /* address */ | ||
372 | NULL }; | ||
373 | char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; | ||
374 | int ret; | ||
375 | |||
376 | setup_khelper_env(tconn, envp); | ||
377 | conn_md_sync(tconn); | ||
194 | 378 | ||
195 | if (current == mdev->worker.task) | 379 | conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); |
196 | clear_bit(CALLBACK_PENDING, &mdev->flags); | 380 | /* TODO: conn_bcast_event() ?? */ |
381 | |||
382 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | ||
383 | if (ret) | ||
384 | conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
385 | usermode_helper, cmd, tconn->name, | ||
386 | (ret >> 8) & 0xff, ret); | ||
387 | else | ||
388 | conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
389 | usermode_helper, cmd, tconn->name, | ||
390 | (ret >> 8) & 0xff, ret); | ||
391 | /* TODO: conn_bcast_event() ?? */ | ||
197 | 392 | ||
198 | if (ret < 0) /* Ignore any ERRNOs we got. */ | 393 | if (ret < 0) /* Ignore any ERRNOs we got. */ |
199 | ret = 0; | 394 | ret = 0; |
@@ -201,116 +396,129 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
201 | return ret; | 396 | return ret; |
202 | } | 397 | } |
203 | 398 | ||
204 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | 399 | static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) |
205 | { | 400 | { |
401 | enum drbd_fencing_p fp = FP_NOT_AVAIL; | ||
402 | struct drbd_conf *mdev; | ||
403 | int vnr; | ||
404 | |||
405 | rcu_read_lock(); | ||
406 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
407 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
408 | fp = max_t(enum drbd_fencing_p, fp, | ||
409 | rcu_dereference(mdev->ldev->disk_conf)->fencing); | ||
410 | put_ldev(mdev); | ||
411 | } | ||
412 | } | ||
413 | rcu_read_unlock(); | ||
414 | |||
415 | return fp; | ||
416 | } | ||
417 | |||
418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) | ||
419 | { | ||
420 | union drbd_state mask = { }; | ||
421 | union drbd_state val = { }; | ||
422 | enum drbd_fencing_p fp; | ||
206 | char *ex_to_string; | 423 | char *ex_to_string; |
207 | int r; | 424 | int r; |
208 | enum drbd_disk_state nps; | ||
209 | enum drbd_fencing_p fp; | ||
210 | 425 | ||
211 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 426 | if (tconn->cstate >= C_WF_REPORT_PARAMS) { |
427 | conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n"); | ||
428 | return false; | ||
429 | } | ||
212 | 430 | ||
213 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | 431 | fp = highest_fencing_policy(tconn); |
214 | fp = mdev->ldev->dc.fencing; | 432 | switch (fp) { |
215 | put_ldev(mdev); | 433 | case FP_NOT_AVAIL: |
216 | } else { | 434 | conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n"); |
217 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
218 | nps = mdev->state.pdsk; | ||
219 | goto out; | 435 | goto out; |
436 | case FP_DONT_CARE: | ||
437 | return true; | ||
438 | default: ; | ||
220 | } | 439 | } |
221 | 440 | ||
222 | r = drbd_khelper(mdev, "fence-peer"); | 441 | r = conn_khelper(tconn, "fence-peer"); |
223 | 442 | ||
224 | switch ((r>>8) & 0xff) { | 443 | switch ((r>>8) & 0xff) { |
225 | case 3: /* peer is inconsistent */ | 444 | case 3: /* peer is inconsistent */ |
226 | ex_to_string = "peer is inconsistent or worse"; | 445 | ex_to_string = "peer is inconsistent or worse"; |
227 | nps = D_INCONSISTENT; | 446 | mask.pdsk = D_MASK; |
447 | val.pdsk = D_INCONSISTENT; | ||
228 | break; | 448 | break; |
229 | case 4: /* peer got outdated, or was already outdated */ | 449 | case 4: /* peer got outdated, or was already outdated */ |
230 | ex_to_string = "peer was fenced"; | 450 | ex_to_string = "peer was fenced"; |
231 | nps = D_OUTDATED; | 451 | mask.pdsk = D_MASK; |
452 | val.pdsk = D_OUTDATED; | ||
232 | break; | 453 | break; |
233 | case 5: /* peer was down */ | 454 | case 5: /* peer was down */ |
234 | if (mdev->state.disk == D_UP_TO_DATE) { | 455 | if (conn_highest_disk(tconn) == D_UP_TO_DATE) { |
235 | /* we will(have) create(d) a new UUID anyways... */ | 456 | /* we will(have) create(d) a new UUID anyways... */ |
236 | ex_to_string = "peer is unreachable, assumed to be dead"; | 457 | ex_to_string = "peer is unreachable, assumed to be dead"; |
237 | nps = D_OUTDATED; | 458 | mask.pdsk = D_MASK; |
459 | val.pdsk = D_OUTDATED; | ||
238 | } else { | 460 | } else { |
239 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | 461 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; |
240 | nps = mdev->state.pdsk; | ||
241 | } | 462 | } |
242 | break; | 463 | break; |
243 | case 6: /* Peer is primary, voluntarily outdate myself. | 464 | case 6: /* Peer is primary, voluntarily outdate myself. |
244 | * This is useful when an unconnected R_SECONDARY is asked to | 465 | * This is useful when an unconnected R_SECONDARY is asked to |
245 | * become R_PRIMARY, but finds the other peer being active. */ | 466 | * become R_PRIMARY, but finds the other peer being active. */ |
246 | ex_to_string = "peer is active"; | 467 | ex_to_string = "peer is active"; |
247 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | 468 | conn_warn(tconn, "Peer is primary, outdating myself.\n"); |
248 | nps = D_UNKNOWN; | 469 | mask.disk = D_MASK; |
249 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | 470 | val.disk = D_OUTDATED; |
250 | break; | 471 | break; |
251 | case 7: | 472 | case 7: |
252 | if (fp != FP_STONITH) | 473 | if (fp != FP_STONITH) |
253 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | 474 | conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n"); |
254 | ex_to_string = "peer was stonithed"; | 475 | ex_to_string = "peer was stonithed"; |
255 | nps = D_OUTDATED; | 476 | mask.pdsk = D_MASK; |
477 | val.pdsk = D_OUTDATED; | ||
256 | break; | 478 | break; |
257 | default: | 479 | default: |
258 | /* The script is broken ... */ | 480 | /* The script is broken ... */ |
259 | nps = D_UNKNOWN; | 481 | conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); |
260 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | 482 | return false; /* Eventually leave IO frozen */ |
261 | return nps; | ||
262 | } | 483 | } |
263 | 484 | ||
264 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | 485 | conn_info(tconn, "fence-peer helper returned %d (%s)\n", |
265 | (r>>8) & 0xff, ex_to_string); | 486 | (r>>8) & 0xff, ex_to_string); |
266 | 487 | ||
267 | out: | 488 | out: |
268 | if (mdev->state.susp_fen && nps >= D_UNKNOWN) { | ||
269 | /* The handler was not successful... unfreeze here, the | ||
270 | state engine can not unfreeze... */ | ||
271 | _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); | ||
272 | } | ||
273 | 489 | ||
274 | return nps; | 490 | /* Not using |
491 | conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
492 | here, because we might were able to re-establish the connection in the | ||
493 | meantime. */ | ||
494 | spin_lock_irq(&tconn->req_lock); | ||
495 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) | ||
496 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
497 | spin_unlock_irq(&tconn->req_lock); | ||
498 | |||
499 | return conn_highest_pdsk(tconn) <= D_OUTDATED; | ||
275 | } | 500 | } |
276 | 501 | ||
277 | static int _try_outdate_peer_async(void *data) | 502 | static int _try_outdate_peer_async(void *data) |
278 | { | 503 | { |
279 | struct drbd_conf *mdev = (struct drbd_conf *)data; | 504 | struct drbd_tconn *tconn = (struct drbd_tconn *)data; |
280 | enum drbd_disk_state nps; | ||
281 | union drbd_state ns; | ||
282 | 505 | ||
283 | nps = drbd_try_outdate_peer(mdev); | 506 | conn_try_outdate_peer(tconn); |
284 | |||
285 | /* Not using | ||
286 | drbd_request_state(mdev, NS(pdsk, nps)); | ||
287 | here, because we might were able to re-establish the connection | ||
288 | in the meantime. This can only partially be solved in the state's | ||
289 | engine is_valid_state() and is_valid_state_transition() | ||
290 | functions. | ||
291 | |||
292 | nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. | ||
293 | pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, | ||
294 | therefore we have to have the pre state change check here. | ||
295 | */ | ||
296 | spin_lock_irq(&mdev->req_lock); | ||
297 | ns = mdev->state; | ||
298 | if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { | ||
299 | ns.pdsk = nps; | ||
300 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
301 | } | ||
302 | spin_unlock_irq(&mdev->req_lock); | ||
303 | 507 | ||
508 | kref_put(&tconn->kref, &conn_destroy); | ||
304 | return 0; | 509 | return 0; |
305 | } | 510 | } |
306 | 511 | ||
307 | void drbd_try_outdate_peer_async(struct drbd_conf *mdev) | 512 | void conn_try_outdate_peer_async(struct drbd_tconn *tconn) |
308 | { | 513 | { |
309 | struct task_struct *opa; | 514 | struct task_struct *opa; |
310 | 515 | ||
311 | opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); | 516 | kref_get(&tconn->kref); |
312 | if (IS_ERR(opa)) | 517 | opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); |
313 | dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); | 518 | if (IS_ERR(opa)) { |
519 | conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); | ||
520 | kref_put(&tconn->kref, &conn_destroy); | ||
521 | } | ||
314 | } | 522 | } |
315 | 523 | ||
316 | enum drbd_state_rv | 524 | enum drbd_state_rv |
@@ -318,15 +526,15 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
318 | { | 526 | { |
319 | const int max_tries = 4; | 527 | const int max_tries = 4; |
320 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; | 528 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; |
529 | struct net_conf *nc; | ||
321 | int try = 0; | 530 | int try = 0; |
322 | int forced = 0; | 531 | int forced = 0; |
323 | union drbd_state mask, val; | 532 | union drbd_state mask, val; |
324 | enum drbd_disk_state nps; | ||
325 | 533 | ||
326 | if (new_role == R_PRIMARY) | 534 | if (new_role == R_PRIMARY) |
327 | request_ping(mdev); /* Detect a dead peer ASAP */ | 535 | request_ping(mdev->tconn); /* Detect a dead peer ASAP */ |
328 | 536 | ||
329 | mutex_lock(&mdev->state_mutex); | 537 | mutex_lock(mdev->state_mutex); |
330 | 538 | ||
331 | mask.i = 0; mask.role = R_MASK; | 539 | mask.i = 0; mask.role = R_MASK; |
332 | val.i = 0; val.role = new_role; | 540 | val.i = 0; val.role = new_role; |
@@ -354,38 +562,34 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
354 | if (rv == SS_NO_UP_TO_DATE_DISK && | 562 | if (rv == SS_NO_UP_TO_DATE_DISK && |
355 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | 563 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { |
356 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 564 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); |
357 | nps = drbd_try_outdate_peer(mdev); | ||
358 | 565 | ||
359 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | 566 | if (conn_try_outdate_peer(mdev->tconn)) { |
360 | val.disk = D_UP_TO_DATE; | 567 | val.disk = D_UP_TO_DATE; |
361 | mask.disk = D_MASK; | 568 | mask.disk = D_MASK; |
362 | } | 569 | } |
363 | |||
364 | val.pdsk = nps; | ||
365 | mask.pdsk = D_MASK; | ||
366 | |||
367 | continue; | 570 | continue; |
368 | } | 571 | } |
369 | 572 | ||
370 | if (rv == SS_NOTHING_TO_DO) | 573 | if (rv == SS_NOTHING_TO_DO) |
371 | goto fail; | 574 | goto out; |
372 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { | 575 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { |
373 | nps = drbd_try_outdate_peer(mdev); | 576 | if (!conn_try_outdate_peer(mdev->tconn) && force) { |
374 | |||
375 | if (force && nps > D_OUTDATED) { | ||
376 | dev_warn(DEV, "Forced into split brain situation!\n"); | 577 | dev_warn(DEV, "Forced into split brain situation!\n"); |
377 | nps = D_OUTDATED; | 578 | mask.pdsk = D_MASK; |
378 | } | 579 | val.pdsk = D_OUTDATED; |
379 | |||
380 | mask.pdsk = D_MASK; | ||
381 | val.pdsk = nps; | ||
382 | 580 | ||
581 | } | ||
383 | continue; | 582 | continue; |
384 | } | 583 | } |
385 | if (rv == SS_TWO_PRIMARIES) { | 584 | if (rv == SS_TWO_PRIMARIES) { |
386 | /* Maybe the peer is detected as dead very soon... | 585 | /* Maybe the peer is detected as dead very soon... |
387 | retry at most once more in this case. */ | 586 | retry at most once more in this case. */ |
388 | schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10); | 587 | int timeo; |
588 | rcu_read_lock(); | ||
589 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
590 | timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; | ||
591 | rcu_read_unlock(); | ||
592 | schedule_timeout_interruptible(timeo); | ||
389 | if (try < max_tries) | 593 | if (try < max_tries) |
390 | try = max_tries - 1; | 594 | try = max_tries - 1; |
391 | continue; | 595 | continue; |
@@ -394,13 +598,13 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
394 | rv = _drbd_request_state(mdev, mask, val, | 598 | rv = _drbd_request_state(mdev, mask, val, |
395 | CS_VERBOSE + CS_WAIT_COMPLETE); | 599 | CS_VERBOSE + CS_WAIT_COMPLETE); |
396 | if (rv < SS_SUCCESS) | 600 | if (rv < SS_SUCCESS) |
397 | goto fail; | 601 | goto out; |
398 | } | 602 | } |
399 | break; | 603 | break; |
400 | } | 604 | } |
401 | 605 | ||
402 | if (rv < SS_SUCCESS) | 606 | if (rv < SS_SUCCESS) |
403 | goto fail; | 607 | goto out; |
404 | 608 | ||
405 | if (forced) | 609 | if (forced) |
406 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | 610 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); |
@@ -408,6 +612,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
408 | /* Wait until nothing is on the fly :) */ | 612 | /* Wait until nothing is on the fly :) */ |
409 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | 613 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); |
410 | 614 | ||
615 | /* FIXME also wait for all pending P_BARRIER_ACK? */ | ||
616 | |||
411 | if (new_role == R_SECONDARY) { | 617 | if (new_role == R_SECONDARY) { |
412 | set_disk_ro(mdev->vdisk, true); | 618 | set_disk_ro(mdev->vdisk, true); |
413 | if (get_ldev(mdev)) { | 619 | if (get_ldev(mdev)) { |
@@ -415,10 +621,12 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
415 | put_ldev(mdev); | 621 | put_ldev(mdev); |
416 | } | 622 | } |
417 | } else { | 623 | } else { |
418 | if (get_net_conf(mdev)) { | 624 | mutex_lock(&mdev->tconn->conf_update); |
419 | mdev->net_conf->want_lose = 0; | 625 | nc = mdev->tconn->net_conf; |
420 | put_net_conf(mdev); | 626 | if (nc) |
421 | } | 627 | nc->discard_my_data = 0; /* without copy; single bit op is atomic */ |
628 | mutex_unlock(&mdev->tconn->conf_update); | ||
629 | |||
422 | set_disk_ro(mdev->vdisk, false); | 630 | set_disk_ro(mdev->vdisk, false); |
423 | if (get_ldev(mdev)) { | 631 | if (get_ldev(mdev)) { |
424 | if (((mdev->state.conn < C_CONNECTED || | 632 | if (((mdev->state.conn < C_CONNECTED || |
@@ -444,67 +652,47 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
444 | drbd_md_sync(mdev); | 652 | drbd_md_sync(mdev); |
445 | 653 | ||
446 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 654 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
447 | fail: | 655 | out: |
448 | mutex_unlock(&mdev->state_mutex); | 656 | mutex_unlock(mdev->state_mutex); |
449 | return rv; | 657 | return rv; |
450 | } | 658 | } |
451 | 659 | ||
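Throughout drbd_set_role() the old get_net_conf()/put_net_conf() refcounting is replaced by RCU: readers take rcu_read_lock(), rcu_dereference() tconn->net_conf, copy the fields they need, and drop the read lock. The timeout lookup from the SS_TWO_PRIMARIES branch above, pulled out as a stand-alone helper purely for illustration (no such helper exists in the driver; assumes the DRBD internal headers for struct drbd_tconn and struct net_conf):

#include <linux/rcupdate.h>
#include <linux/jiffies.h>

static int two_primaries_retry_timeout(struct drbd_tconn *tconn)
{
	struct net_conf *nc;
	int timeo;

	rcu_read_lock();
	nc = rcu_dereference(tconn->net_conf);
	timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
	rcu_read_unlock();

	return timeo;	/* jiffies to sleep before retrying the state change */
}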
452 | static struct drbd_conf *ensure_mdev(int minor, int create) | 660 | static const char *from_attrs_err_to_txt(int err) |
453 | { | 661 | { |
454 | struct drbd_conf *mdev; | 662 | return err == -ENOMSG ? "required attribute missing" : |
455 | 663 | err == -EOPNOTSUPP ? "unknown mandatory attribute" : | |
456 | if (minor >= minor_count) | 664 | err == -EEXIST ? "can not change invariant setting" : |
457 | return NULL; | 665 | "invalid attribute value"; |
458 | |||
459 | mdev = minor_to_mdev(minor); | ||
460 | |||
461 | if (!mdev && create) { | ||
462 | struct gendisk *disk = NULL; | ||
463 | mdev = drbd_new_device(minor); | ||
464 | |||
465 | spin_lock_irq(&drbd_pp_lock); | ||
466 | if (minor_table[minor] == NULL) { | ||
467 | minor_table[minor] = mdev; | ||
468 | disk = mdev->vdisk; | ||
469 | mdev = NULL; | ||
470 | } /* else: we lost the race */ | ||
471 | spin_unlock_irq(&drbd_pp_lock); | ||
472 | |||
473 | if (disk) /* we won the race above */ | ||
474 | /* in case we ever add a drbd_delete_device(), | ||
475 | * don't forget the del_gendisk! */ | ||
476 | add_disk(disk); | ||
477 | else /* we lost the race above */ | ||
478 | drbd_free_mdev(mdev); | ||
479 | |||
480 | mdev = minor_to_mdev(minor); | ||
481 | } | ||
482 | |||
483 | return mdev; | ||
484 | } | 666 | } |
485 | 667 | ||
486 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 668 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) |
487 | struct drbd_nl_cfg_reply *reply) | ||
488 | { | 669 | { |
489 | struct primary primary_args; | 670 | struct set_role_parms parms; |
490 | 671 | int err; | |
491 | memset(&primary_args, 0, sizeof(struct primary)); | 672 | enum drbd_ret_code retcode; |
492 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
493 | reply->ret_code = ERR_MANDATORY_TAG; | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | reply->ret_code = | ||
498 | drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); | ||
499 | 673 | ||
500 | return 0; | 674 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
501 | } | 675 | if (!adm_ctx.reply_skb) |
676 | return retcode; | ||
677 | if (retcode != NO_ERROR) | ||
678 | goto out; | ||
502 | 679 | ||
503 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 680 | memset(&parms, 0, sizeof(parms)); |
504 | struct drbd_nl_cfg_reply *reply) | 681 | if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { |
505 | { | 682 | err = set_role_parms_from_attrs(&parms, info); |
506 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | 683 | if (err) { |
684 | retcode = ERR_MANDATORY_TAG; | ||
685 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
686 | goto out; | ||
687 | } | ||
688 | } | ||
507 | 689 | ||
690 | if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) | ||
691 | retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate); | ||
692 | else | ||
693 | retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0); | ||
694 | out: | ||
695 | drbd_adm_finish(info, retcode); | ||
508 | return 0; | 696 | return 0; |
509 | } | 697 | } |
510 | 698 | ||
@@ -514,7 +702,12 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
514 | struct drbd_backing_dev *bdev) | 702 | struct drbd_backing_dev *bdev) |
515 | { | 703 | { |
516 | sector_t md_size_sect = 0; | 704 | sector_t md_size_sect = 0; |
517 | switch (bdev->dc.meta_dev_idx) { | 705 | int meta_dev_idx; |
706 | |||
707 | rcu_read_lock(); | ||
708 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
709 | |||
710 | switch (meta_dev_idx) { | ||
518 | default: | 711 | default: |
519 | /* v07 style fixed size indexed meta data */ | 712 | /* v07 style fixed size indexed meta data */ |
520 | bdev->md.md_size_sect = MD_RESERVED_SECT; | 713 | bdev->md.md_size_sect = MD_RESERVED_SECT; |
@@ -533,7 +726,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
533 | case DRBD_MD_INDEX_FLEX_INT: | 726 | case DRBD_MD_INDEX_FLEX_INT: |
534 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | 727 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); |
535 | /* al size is still fixed */ | 728 | /* al size is still fixed */ |
536 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | 729 | bdev->md.al_offset = -MD_AL_SECTORS; |
537 | /* we need (slightly less than) ~ this much bitmap sectors: */ | 730 | /* we need (slightly less than) ~ this much bitmap sectors: */ |
538 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | 731 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); |
539 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | 732 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); |
@@ -549,6 +742,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
549 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | 742 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; |
550 | break; | 743 | break; |
551 | } | 744 | } |
745 | rcu_read_unlock(); | ||
552 | } | 746 | } |
553 | 747 | ||
554 | /* input size is expected to be in KB */ | 748 | /* input size is expected to be in KB */ |
@@ -581,10 +775,16 @@ char *ppsize(char *buf, unsigned long long size) | |||
581 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | 775 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: |
582 | * peer may not initiate a resize. | 776 | * peer may not initiate a resize. |
583 | */ | 777 | */ |
778 | /* Note these are not to be confused with | ||
779 | * drbd_adm_suspend_io/drbd_adm_resume_io, | ||
780 | * which are (sub) state changes triggered by admin (drbdsetup), | ||
781 | * and can be long lived. | ||
782 | * This changes an mdev->flag, is triggered by drbd internals, | ||
783 | * and should be short-lived. */ | ||
584 | void drbd_suspend_io(struct drbd_conf *mdev) | 784 | void drbd_suspend_io(struct drbd_conf *mdev) |
585 | { | 785 | { |
586 | set_bit(SUSPEND_IO, &mdev->flags); | 786 | set_bit(SUSPEND_IO, &mdev->flags); |
587 | if (is_susp(mdev->state)) | 787 | if (drbd_suspended(mdev)) |
588 | return; | 788 | return; |
589 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | 789 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); |
590 | } | 790 | } |
@@ -605,7 +805,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
605 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 805 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
606 | { | 806 | { |
607 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 807 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
608 | sector_t la_size; | 808 | sector_t la_size, u_size; |
609 | sector_t size; | 809 | sector_t size; |
610 | char ppb[10]; | 810 | char ppb[10]; |
611 | 811 | ||
@@ -633,7 +833,10 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
633 | /* TODO: should only be some assert here, not (re)init... */ | 833 | /* TODO: should only be some assert here, not (re)init... */ |
634 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 834 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
635 | 835 | ||
636 | size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); | 836 | rcu_read_lock(); |
837 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
838 | rcu_read_unlock(); | ||
839 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); | ||
637 | 840 | ||
638 | if (drbd_get_capacity(mdev->this_bdev) != size || | 841 | if (drbd_get_capacity(mdev->this_bdev) != size || |
639 | drbd_bm_capacity(mdev) != size) { | 842 | drbd_bm_capacity(mdev) != size) { |
@@ -696,12 +899,12 @@ out: | |||
696 | } | 899 | } |
697 | 900 | ||
698 | sector_t | 901 | sector_t |
699 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) | 902 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
903 | sector_t u_size, int assume_peer_has_space) | ||
700 | { | 904 | { |
701 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | 905 | sector_t p_size = mdev->p_size; /* partner's disk size. */ |
702 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | 906 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ |
703 | sector_t m_size; /* my size */ | 907 | sector_t m_size; /* my size */ |
704 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
705 | sector_t size = 0; | 908 | sector_t size = 0; |
706 | 909 | ||
707 | m_size = drbd_get_max_capacity(bdev); | 910 | m_size = drbd_get_max_capacity(bdev); |
@@ -750,24 +953,21 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass | |||
750 | * failed, and 0 on success. You should call drbd_md_sync() after you called | 953 | * failed, and 0 on success. You should call drbd_md_sync() after you called |
751 | * this function. | 954 | * this function. |
752 | */ | 955 | */ |
753 | static int drbd_check_al_size(struct drbd_conf *mdev) | 956 | static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc) |
754 | { | 957 | { |
755 | struct lru_cache *n, *t; | 958 | struct lru_cache *n, *t; |
756 | struct lc_element *e; | 959 | struct lc_element *e; |
757 | unsigned int in_use; | 960 | unsigned int in_use; |
758 | int i; | 961 | int i; |
759 | 962 | ||
760 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
761 | mdev->sync_conf.al_extents = 127; | ||
762 | |||
763 | if (mdev->act_log && | 963 | if (mdev->act_log && |
764 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | 964 | mdev->act_log->nr_elements == dc->al_extents) |
765 | return 0; | 965 | return 0; |
766 | 966 | ||
767 | in_use = 0; | 967 | in_use = 0; |
768 | t = mdev->act_log; | 968 | t = mdev->act_log; |
769 | n = lc_create("act_log", drbd_al_ext_cache, | 969 | n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, |
770 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | 970 | dc->al_extents, sizeof(struct lc_element), 0); |
771 | 971 | ||
772 | if (n == NULL) { | 972 | if (n == NULL) { |
773 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | 973 | dev_err(DEV, "Cannot allocate act_log lru!\n"); |
@@ -808,7 +1008,9 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ | |||
808 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | 1008 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; |
809 | 1009 | ||
810 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); | 1010 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); |
811 | max_segments = mdev->ldev->dc.max_bio_bvecs; | 1011 | rcu_read_lock(); |
1012 | max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs; | ||
1013 | rcu_read_unlock(); | ||
812 | put_ldev(mdev); | 1014 | put_ldev(mdev); |
813 | } | 1015 | } |
814 | 1016 | ||
@@ -852,12 +1054,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
852 | Because new from 8.3.8 onwards the peer can use multiple | 1054 | Because new from 8.3.8 onwards the peer can use multiple |
853 | BIOs for a single peer_request */ | 1055 | BIOs for a single peer_request */ |
854 | if (mdev->state.conn >= C_CONNECTED) { | 1056 | if (mdev->state.conn >= C_CONNECTED) { |
855 | if (mdev->agreed_pro_version < 94) { | 1057 | if (mdev->tconn->agreed_pro_version < 94) |
856 | peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 1058 | peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
857 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ | 1059 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ |
858 | } else if (mdev->agreed_pro_version == 94) | 1060 | else if (mdev->tconn->agreed_pro_version == 94) |
859 | peer = DRBD_MAX_SIZE_H80_PACKET; | 1061 | peer = DRBD_MAX_SIZE_H80_PACKET; |
860 | else /* drbd 8.3.8 onwards */ | 1062 | else if (mdev->tconn->agreed_pro_version < 100) |
1063 | peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ | ||
1064 | else | ||
861 | peer = DRBD_MAX_BIO_SIZE; | 1065 | peer = DRBD_MAX_BIO_SIZE; |
862 | } | 1066 | } |
863 | 1067 | ||
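The if/else chain above derives the peer's maximum BIO size from the negotiated protocol version, clamping whatever very old peers report. A self-contained sketch of that selection; the numeric limits are illustrative placeholders, not the real DRBD_MAX_* constants (only the 32KiB pre-8.3.8 limit is taken from the comment above):

    #include <stdio.h>

    /* Illustrative placeholders only -- the real limits come from the drbd headers. */
    #define MAX_SIZE_H80_PACKET   (32u * 1024)    /* pre-8.3.8 peers: 32KiB          */
    #define MAX_BIO_SIZE_P95      (128u * 1024)   /* assumed value for 8.3.8..8.4.0  */
    #define MAX_BIO_SIZE          (1024u * 1024)  /* assumed value for current peers */

    static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

    /* mirrors the version checks in drbd_reconsider_max_bio_size() */
    static unsigned int peer_max_bio_size(int agreed_pro_version, unsigned int reported)
    {
        if (agreed_pro_version < 94)
            return min_u(reported, MAX_SIZE_H80_PACKET);  /* old peers may over-report */
        if (agreed_pro_version == 94)
            return MAX_SIZE_H80_PACKET;
        if (agreed_pro_version < 100)
            return MAX_BIO_SIZE_P95;
        return MAX_BIO_SIZE;
    }

    int main(void)
    {
        printf("%u\n", peer_max_bio_size(91, 65536));     /* clamped to 32KiB  */
        printf("%u\n", peer_max_bio_size(96, 65536));     /* placeholder 128KiB */
        printf("%u\n", peer_max_bio_size(101, 65536));    /* placeholder 1MiB   */
        return 0;
    }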
@@ -872,36 +1076,27 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
872 | drbd_setup_queue_param(mdev, new); | 1076 | drbd_setup_queue_param(mdev, new); |
873 | } | 1077 | } |
874 | 1078 | ||
875 | /* serialize deconfig (worker exiting, doing cleanup) | 1079 | /* Starts the worker thread */ |
876 | * and reconfig (drbdsetup disk, drbdsetup net) | 1080 | static void conn_reconfig_start(struct drbd_tconn *tconn) |
877 | * | ||
878 | * Wait for a potentially exiting worker, then restart it, | ||
879 | * or start a new one. Flush any pending work, there may still be an | ||
880 | * after_state_change queued. | ||
881 | */ | ||
882 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
883 | { | 1081 | { |
884 | wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); | 1082 | drbd_thread_start(&tconn->worker); |
885 | wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); | 1083 | conn_flush_workqueue(tconn); |
886 | drbd_thread_start(&mdev->worker); | ||
887 | drbd_flush_workqueue(mdev); | ||
888 | } | 1084 | } |
889 | 1085 | ||
890 | /* if still unconfigured, stops worker again. | 1086 | /* if still unconfigured, stops worker again. */ |
891 | * if configured now, clears CONFIG_PENDING. | 1087 | static void conn_reconfig_done(struct drbd_tconn *tconn) |
892 | * wakes potential waiters */ | ||
893 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
894 | { | 1088 | { |
895 | spin_lock_irq(&mdev->req_lock); | 1089 | bool stop_threads; |
896 | if (mdev->state.disk == D_DISKLESS && | 1090 | spin_lock_irq(&tconn->req_lock); |
897 | mdev->state.conn == C_STANDALONE && | 1091 | stop_threads = conn_all_vols_unconf(tconn) && |
898 | mdev->state.role == R_SECONDARY) { | 1092 | tconn->cstate == C_STANDALONE; |
899 | set_bit(DEVICE_DYING, &mdev->flags); | 1093 | spin_unlock_irq(&tconn->req_lock); |
900 | drbd_thread_stop_nowait(&mdev->worker); | 1094 | if (stop_threads) { |
901 | } else | 1095 | /* asender is implicitly stopped by receiver |
902 | clear_bit(CONFIG_PENDING, &mdev->flags); | 1096 | * in conn_disconnect() */ |
903 | spin_unlock_irq(&mdev->req_lock); | 1097 | drbd_thread_stop(&tconn->receiver); |
904 | wake_up(&mdev->state_wait); | 1098 | drbd_thread_stop(&tconn->worker); |
1099 | } | ||
905 | } | 1100 | } |
906 | 1101 | ||
907 | /* Make sure IO is suspended before calling this function(). */ | 1102 | /* Make sure IO is suspended before calling this function(). */ |
@@ -909,42 +1104,187 @@ static void drbd_suspend_al(struct drbd_conf *mdev) | |||
909 | { | 1104 | { |
910 | int s = 0; | 1105 | int s = 0; |
911 | 1106 | ||
912 | if (lc_try_lock(mdev->act_log)) { | 1107 | if (!lc_try_lock(mdev->act_log)) { |
913 | drbd_al_shrink(mdev); | ||
914 | lc_unlock(mdev->act_log); | ||
915 | } else { | ||
916 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); | 1108 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); |
917 | return; | 1109 | return; |
918 | } | 1110 | } |
919 | 1111 | ||
920 | spin_lock_irq(&mdev->req_lock); | 1112 | drbd_al_shrink(mdev); |
1113 | spin_lock_irq(&mdev->tconn->req_lock); | ||
921 | if (mdev->state.conn < C_CONNECTED) | 1114 | if (mdev->state.conn < C_CONNECTED) |
922 | s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); | 1115 | s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); |
923 | 1116 | spin_unlock_irq(&mdev->tconn->req_lock); | |
924 | spin_unlock_irq(&mdev->req_lock); | 1117 | lc_unlock(mdev->act_log); |
925 | 1118 | ||
926 | if (s) | 1119 | if (s) |
927 | dev_info(DEV, "Suspended AL updates\n"); | 1120 | dev_info(DEV, "Suspended AL updates\n"); |
928 | } | 1121 | } |
929 | 1122 | ||
930 | /* does always return 0; | 1123 | |
931 | * interesting return code is in reply->ret_code */ | 1124 | static bool should_set_defaults(struct genl_info *info) |
932 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1125 | { |
933 | struct drbd_nl_cfg_reply *reply) | 1126 | unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; |
1127 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); | ||
1128 | } | ||
1129 | |||
1130 | static void enforce_disk_conf_limits(struct disk_conf *dc) | ||
1131 | { | ||
1132 | if (dc->al_extents < DRBD_AL_EXTENTS_MIN) | ||
1133 | dc->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1134 | if (dc->al_extents > DRBD_AL_EXTENTS_MAX) | ||
1135 | dc->al_extents = DRBD_AL_EXTENTS_MAX; | ||
1136 | |||
1137 | if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | ||
1138 | dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1139 | } | ||
1140 | |||
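enforce_disk_conf_limits() silently clamps user-supplied values into range instead of rejecting the request. The clamp pattern in isolation, as a runnable sketch (the bounds are example values, not the DRBD_AL_EXTENTS_MIN/MAX constants):

    #include <stdio.h>

    /* example bounds only; the driver uses DRBD_AL_EXTENTS_MIN/MAX etc. */
    #define AL_EXTENTS_MIN   7
    #define AL_EXTENTS_MAX   65534

    static unsigned int clamp_uint(unsigned int v, unsigned int lo, unsigned int hi)
    {
        if (v < lo)
            return lo;
        if (v > hi)
            return hi;
        return v;
    }

    int main(void)
    {
        printf("%u\n", clamp_uint(3, AL_EXTENTS_MIN, AL_EXTENTS_MAX));      /* 7     */
        printf("%u\n", clamp_uint(257, AL_EXTENTS_MIN, AL_EXTENTS_MAX));    /* 257   */
        printf("%u\n", clamp_uint(900000, AL_EXTENTS_MIN, AL_EXTENTS_MAX)); /* 65534 */
        return 0;
    }

Both drbd_adm_disk_opts() and drbd_adm_attach() below run the incoming configuration through this helper before it is published.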
1141 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | ||
934 | { | 1142 | { |
935 | enum drbd_ret_code retcode; | 1143 | enum drbd_ret_code retcode; |
1144 | struct drbd_conf *mdev; | ||
1145 | struct disk_conf *new_disk_conf, *old_disk_conf; | ||
1146 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
1147 | int err, fifo_size; | ||
1148 | |||
1149 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
1150 | if (!adm_ctx.reply_skb) | ||
1151 | return retcode; | ||
1152 | if (retcode != NO_ERROR) | ||
1153 | goto out; | ||
1154 | |||
1155 | mdev = adm_ctx.mdev; | ||
1156 | |||
1157 | /* we also need a disk | ||
1158 | * to change the options on */ | ||
1159 | if (!get_ldev(mdev)) { | ||
1160 | retcode = ERR_NO_DISK; | ||
1161 | goto out; | ||
1162 | } | ||
1163 | |||
1164 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
1165 | if (!new_disk_conf) { | ||
1166 | retcode = ERR_NOMEM; | ||
1167 | goto fail; | ||
1168 | } | ||
1169 | |||
1170 | mutex_lock(&mdev->tconn->conf_update); | ||
1171 | old_disk_conf = mdev->ldev->disk_conf; | ||
1172 | *new_disk_conf = *old_disk_conf; | ||
1173 | if (should_set_defaults(info)) | ||
1174 | set_disk_conf_defaults(new_disk_conf); | ||
1175 | |||
1176 | err = disk_conf_from_attrs_for_change(new_disk_conf, info); | ||
1177 | if (err && err != -ENOMSG) { | ||
1178 | retcode = ERR_MANDATORY_TAG; | ||
1179 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1180 | } | ||
1181 | |||
1182 | if (!expect(new_disk_conf->resync_rate >= 1)) | ||
1183 | new_disk_conf->resync_rate = 1; | ||
1184 | |||
1185 | enforce_disk_conf_limits(new_disk_conf); | ||
1186 | |||
1187 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | ||
1188 | if (fifo_size != mdev->rs_plan_s->size) { | ||
1189 | new_plan = fifo_alloc(fifo_size); | ||
1190 | if (!new_plan) { | ||
1191 | dev_err(DEV, "kmalloc of fifo_buffer failed\n"); | ||
1192 | retcode = ERR_NOMEM; | ||
1193 | goto fail_unlock; | ||
1194 | } | ||
1195 | } | ||
1196 | |||
1197 | drbd_suspend_io(mdev); | ||
1198 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1199 | drbd_al_shrink(mdev); | ||
1200 | err = drbd_check_al_size(mdev, new_disk_conf); | ||
1201 | lc_unlock(mdev->act_log); | ||
1202 | wake_up(&mdev->al_wait); | ||
1203 | drbd_resume_io(mdev); | ||
1204 | |||
1205 | if (err) { | ||
1206 | retcode = ERR_NOMEM; | ||
1207 | goto fail_unlock; | ||
1208 | } | ||
1209 | |||
1210 | write_lock_irq(&global_state_lock); | ||
1211 | retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); | ||
1212 | if (retcode == NO_ERROR) { | ||
1213 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
1214 | drbd_resync_after_changed(mdev); | ||
1215 | } | ||
1216 | write_unlock_irq(&global_state_lock); | ||
1217 | |||
1218 | if (retcode != NO_ERROR) | ||
1219 | goto fail_unlock; | ||
1220 | |||
1221 | if (new_plan) { | ||
1222 | old_plan = mdev->rs_plan_s; | ||
1223 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
1224 | } | ||
1225 | |||
1226 | mutex_unlock(&mdev->tconn->conf_update); | ||
1227 | |||
1228 | if (new_disk_conf->al_updates) | ||
1229 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1230 | else | ||
1231 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1232 | |||
1233 | if (new_disk_conf->md_flushes) | ||
1234 | clear_bit(MD_NO_FUA, &mdev->flags); | ||
1235 | else | ||
1236 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1237 | |||
1238 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); | ||
1239 | |||
1240 | drbd_md_sync(mdev); | ||
1241 | |||
1242 | if (mdev->state.conn >= C_CONNECTED) | ||
1243 | drbd_send_sync_param(mdev); | ||
1244 | |||
1245 | synchronize_rcu(); | ||
1246 | kfree(old_disk_conf); | ||
1247 | kfree(old_plan); | ||
1248 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1249 | goto success; | ||
1250 | |||
1251 | fail_unlock: | ||
1252 | mutex_unlock(&mdev->tconn->conf_update); | ||
1253 | fail: | ||
1254 | kfree(new_disk_conf); | ||
1255 | kfree(new_plan); | ||
1256 | success: | ||
1257 | put_ldev(mdev); | ||
1258 | out: | ||
1259 | drbd_adm_finish(info, retcode); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | |||
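drbd_adm_disk_opts() updates the disk configuration the RCU way: build a complete new struct, publish it with a single pointer assignment while holding conf_update, and free the old copy only after synchronize_rcu(). A plain, single-threaded C sketch of that copy/publish/free sequence; the struct and helper names are invented, and the comments note where the driver uses rcu_assign_pointer()/rcu_dereference() and the grace period instead:

    #include <stdlib.h>
    #include <stdio.h>

    struct disk_opts {                 /* invented stand-in for struct disk_conf */
        unsigned int al_extents;
        unsigned int resync_rate;
    };

    static struct disk_opts *current_opts;   /* readers only ever load this pointer */

    static int update_opts(unsigned int new_al_extents)
    {
        struct disk_opts *old = current_opts;
        struct disk_opts *new = malloc(sizeof(*new));

        if (!new)
            return -1;                        /* the ERR_NOMEM path above */
        *new = *old;                          /* start from the current settings */
        new->al_extents = new_al_extents;     /* apply the requested change */

        current_opts = new;                   /* driver: rcu_assign_pointer() under conf_update */
        /* driver: synchronize_rcu() here, so no reader still sees 'old';
         * in this single-threaded sketch freeing immediately is safe */
        free(old);                            /* kfree(old_disk_conf) */
        return 0;
    }

    int main(void)
    {
        current_opts = calloc(1, sizeof(*current_opts));
        current_opts->al_extents = 127;
        current_opts->resync_rate = 250;

        if (update_opts(1024) == 0)
            printf("al_extents=%u resync_rate=%u\n",
                   current_opts->al_extents, current_opts->resync_rate);

        free(current_opts);
        return 0;
    }

In the driver the whole sequence runs with conf_update held, so there is only ever one updater, while readers never block.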
1263 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | ||
1264 | { | ||
1265 | struct drbd_conf *mdev; | ||
1266 | int err; | ||
1267 | enum drbd_ret_code retcode; | ||
936 | enum determine_dev_size dd; | 1268 | enum determine_dev_size dd; |
937 | sector_t max_possible_sectors; | 1269 | sector_t max_possible_sectors; |
938 | sector_t min_md_device_sectors; | 1270 | sector_t min_md_device_sectors; |
939 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | 1271 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ |
1272 | struct disk_conf *new_disk_conf = NULL; | ||
940 | struct block_device *bdev; | 1273 | struct block_device *bdev; |
941 | struct lru_cache *resync_lru = NULL; | 1274 | struct lru_cache *resync_lru = NULL; |
1275 | struct fifo_buffer *new_plan = NULL; | ||
942 | union drbd_state ns, os; | 1276 | union drbd_state ns, os; |
943 | enum drbd_state_rv rv; | 1277 | enum drbd_state_rv rv; |
944 | int cp_discovered = 0; | 1278 | struct net_conf *nc; |
945 | int logical_block_size; | ||
946 | 1279 | ||
947 | drbd_reconfig_start(mdev); | 1280 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1281 | if (!adm_ctx.reply_skb) | ||
1282 | return retcode; | ||
1283 | if (retcode != NO_ERROR) | ||
1284 | goto finish; | ||
1285 | |||
1286 | mdev = adm_ctx.mdev; | ||
1287 | conn_reconfig_start(mdev->tconn); | ||
948 | 1288 | ||
949 | /* if you want to reconfigure, please tear down first */ | 1289 | /* if you want to reconfigure, please tear down first */ |
950 | if (mdev->state.disk > D_DISKLESS) { | 1290 | if (mdev->state.disk > D_DISKLESS) { |
@@ -959,47 +1299,65 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
959 | 1299 | ||
960 | /* make sure there is no leftover from previous force-detach attempts */ | 1300 | /* make sure there is no leftover from previous force-detach attempts */ |
961 | clear_bit(FORCE_DETACH, &mdev->flags); | 1301 | clear_bit(FORCE_DETACH, &mdev->flags); |
1302 | clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1303 | clear_bit(WAS_READ_ERROR, &mdev->flags); | ||
962 | 1304 | ||
963 | /* and no leftover from previously aborted resync or verify, either */ | 1305 | /* and no leftover from previously aborted resync or verify, either */ |
964 | mdev->rs_total = 0; | 1306 | mdev->rs_total = 0; |
965 | mdev->rs_failed = 0; | 1307 | mdev->rs_failed = 0; |
966 | atomic_set(&mdev->rs_pending_cnt, 0); | 1308 | atomic_set(&mdev->rs_pending_cnt, 0); |
967 | 1309 | ||
968 | /* allocation not in the IO path, cqueue thread context */ | 1310 | /* allocation not in the IO path, drbdsetup context */ |
969 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | 1311 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); |
970 | if (!nbc) { | 1312 | if (!nbc) { |
971 | retcode = ERR_NOMEM; | 1313 | retcode = ERR_NOMEM; |
972 | goto fail; | 1314 | goto fail; |
973 | } | 1315 | } |
1316 | spin_lock_init(&nbc->md.uuid_lock); | ||
974 | 1317 | ||
975 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | 1318 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); |
976 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | 1319 | if (!new_disk_conf) { |
977 | nbc->dc.fencing = DRBD_FENCING_DEF; | 1320 | retcode = ERR_NOMEM; |
978 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | 1321 | goto fail; |
1322 | } | ||
1323 | nbc->disk_conf = new_disk_conf; | ||
979 | 1324 | ||
980 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | 1325 | set_disk_conf_defaults(new_disk_conf); |
1326 | err = disk_conf_from_attrs(new_disk_conf, info); | ||
1327 | if (err) { | ||
981 | retcode = ERR_MANDATORY_TAG; | 1328 | retcode = ERR_MANDATORY_TAG; |
1329 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
982 | goto fail; | 1330 | goto fail; |
983 | } | 1331 | } |
984 | 1332 | ||
985 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | 1333 | enforce_disk_conf_limits(new_disk_conf); |
1334 | |||
1335 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); | ||
1336 | if (!new_plan) { | ||
1337 | retcode = ERR_NOMEM; | ||
1338 | goto fail; | ||
1339 | } | ||
1340 | |||
1341 | if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
986 | retcode = ERR_MD_IDX_INVALID; | 1342 | retcode = ERR_MD_IDX_INVALID; |
987 | goto fail; | 1343 | goto fail; |
988 | } | 1344 | } |
989 | 1345 | ||
990 | if (get_net_conf(mdev)) { | 1346 | rcu_read_lock(); |
991 | int prot = mdev->net_conf->wire_protocol; | 1347 | nc = rcu_dereference(mdev->tconn->net_conf); |
992 | put_net_conf(mdev); | 1348 | if (nc) { |
993 | if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { | 1349 | if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { |
1350 | rcu_read_unlock(); | ||
994 | retcode = ERR_STONITH_AND_PROT_A; | 1351 | retcode = ERR_STONITH_AND_PROT_A; |
995 | goto fail; | 1352 | goto fail; |
996 | } | 1353 | } |
997 | } | 1354 | } |
1355 | rcu_read_unlock(); | ||
998 | 1356 | ||
999 | bdev = blkdev_get_by_path(nbc->dc.backing_dev, | 1357 | bdev = blkdev_get_by_path(new_disk_conf->backing_dev, |
1000 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); | 1358 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); |
1001 | if (IS_ERR(bdev)) { | 1359 | if (IS_ERR(bdev)) { |
1002 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | 1360 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, |
1003 | PTR_ERR(bdev)); | 1361 | PTR_ERR(bdev)); |
1004 | retcode = ERR_OPEN_DISK; | 1362 | retcode = ERR_OPEN_DISK; |
1005 | goto fail; | 1363 | goto fail; |
@@ -1014,12 +1372,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1014 | * should check it for you already; but if you don't, or | 1372 | * should check it for you already; but if you don't, or |
1015 | * someone fooled it, we need to double check here) | 1373 | * someone fooled it, we need to double check here) |
1016 | */ | 1374 | */ |
1017 | bdev = blkdev_get_by_path(nbc->dc.meta_dev, | 1375 | bdev = blkdev_get_by_path(new_disk_conf->meta_dev, |
1018 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, | 1376 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1019 | (nbc->dc.meta_dev_idx < 0) ? | 1377 | (new_disk_conf->meta_dev_idx < 0) ? |
1020 | (void *)mdev : (void *)drbd_m_holder); | 1378 | (void *)mdev : (void *)drbd_m_holder); |
1021 | if (IS_ERR(bdev)) { | 1379 | if (IS_ERR(bdev)) { |
1022 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | 1380 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, |
1023 | PTR_ERR(bdev)); | 1381 | PTR_ERR(bdev)); |
1024 | retcode = ERR_OPEN_MD_DISK; | 1382 | retcode = ERR_OPEN_MD_DISK; |
1025 | goto fail; | 1383 | goto fail; |
@@ -1027,14 +1385,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1027 | nbc->md_bdev = bdev; | 1385 | nbc->md_bdev = bdev; |
1028 | 1386 | ||
1029 | if ((nbc->backing_bdev == nbc->md_bdev) != | 1387 | if ((nbc->backing_bdev == nbc->md_bdev) != |
1030 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | 1388 | (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || |
1031 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | 1389 | new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { |
1032 | retcode = ERR_MD_IDX_INVALID; | 1390 | retcode = ERR_MD_IDX_INVALID; |
1033 | goto fail; | 1391 | goto fail; |
1034 | } | 1392 | } |
1035 | 1393 | ||
1036 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | 1394 | resync_lru = lc_create("resync", drbd_bm_ext_cache, |
1037 | 61, sizeof(struct bm_extent), | 1395 | 1, 61, sizeof(struct bm_extent), |
1038 | offsetof(struct bm_extent, lce)); | 1396 | offsetof(struct bm_extent, lce)); |
1039 | if (!resync_lru) { | 1397 | if (!resync_lru) { |
1040 | retcode = ERR_NOMEM; | 1398 | retcode = ERR_NOMEM; |
@@ -1044,21 +1402,21 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1044 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | 1402 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ |
1045 | drbd_md_set_sector_offsets(mdev, nbc); | 1403 | drbd_md_set_sector_offsets(mdev, nbc); |
1046 | 1404 | ||
1047 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | 1405 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { |
1048 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | 1406 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", |
1049 | (unsigned long long) drbd_get_max_capacity(nbc), | 1407 | (unsigned long long) drbd_get_max_capacity(nbc), |
1050 | (unsigned long long) nbc->dc.disk_size); | 1408 | (unsigned long long) new_disk_conf->disk_size); |
1051 | retcode = ERR_DISK_TOO_SMALL; | 1409 | retcode = ERR_DISK_TOO_SMALL; |
1052 | goto fail; | 1410 | goto fail; |
1053 | } | 1411 | } |
1054 | 1412 | ||
1055 | if (nbc->dc.meta_dev_idx < 0) { | 1413 | if (new_disk_conf->meta_dev_idx < 0) { |
1056 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | 1414 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; |
1057 | /* at least one MB, otherwise it does not make sense */ | 1415 | /* at least one MB, otherwise it does not make sense */ |
1058 | min_md_device_sectors = (2<<10); | 1416 | min_md_device_sectors = (2<<10); |
1059 | } else { | 1417 | } else { |
1060 | max_possible_sectors = DRBD_MAX_SECTORS; | 1418 | max_possible_sectors = DRBD_MAX_SECTORS; |
1061 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | 1419 | min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); |
1062 | } | 1420 | } |
1063 | 1421 | ||
1064 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | 1422 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { |
@@ -1083,14 +1441,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1083 | dev_warn(DEV, "==> truncating very big lower level device " | 1441 | dev_warn(DEV, "==> truncating very big lower level device " |
1084 | "to currently maximum possible %llu sectors <==\n", | 1442 | "to currently maximum possible %llu sectors <==\n", |
1085 | (unsigned long long) max_possible_sectors); | 1443 | (unsigned long long) max_possible_sectors); |
1086 | if (nbc->dc.meta_dev_idx >= 0) | 1444 | if (new_disk_conf->meta_dev_idx >= 0) |
1087 | dev_warn(DEV, "==>> using internal or flexible " | 1445 | dev_warn(DEV, "==>> using internal or flexible " |
1088 | "meta data may help <<==\n"); | 1446 | "meta data may help <<==\n"); |
1089 | } | 1447 | } |
1090 | 1448 | ||
1091 | drbd_suspend_io(mdev); | 1449 | drbd_suspend_io(mdev); |
1092 | /* also wait for the last barrier ack. */ | 1450 | /* also wait for the last barrier ack. */ |
1093 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); | 1451 | /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 |
1452 | * We need a way to either ignore barrier acks for barriers sent before a device | ||
1453 | * was attached, or a way to wait for all pending barrier acks to come in. | ||
1454 | * As barriers are counted per resource, | ||
1455 | * we'd need to suspend io on all devices of a resource. | ||
1456 | */ | ||
1457 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev)); | ||
1094 | /* and for any other previously queued work */ | 1458 | /* and for any other previously queued work */ |
1095 | drbd_flush_workqueue(mdev); | 1459 | drbd_flush_workqueue(mdev); |
1096 | 1460 | ||
@@ -1105,25 +1469,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1105 | 1469 | ||
1106 | drbd_md_set_sector_offsets(mdev, nbc); | 1470 | drbd_md_set_sector_offsets(mdev, nbc); |
1107 | 1471 | ||
1108 | /* allocate a second IO page if logical_block_size != 512 */ | ||
1109 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
1110 | if (logical_block_size == 0) | ||
1111 | logical_block_size = MD_SECTOR_SIZE; | ||
1112 | |||
1113 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
1114 | if (!mdev->md_io_tmpp) { | ||
1115 | struct page *page = alloc_page(GFP_NOIO); | ||
1116 | if (!page) | ||
1117 | goto force_diskless_dec; | ||
1118 | |||
1119 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
1120 | logical_block_size, MD_SECTOR_SIZE); | ||
1121 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
1122 | |||
1123 | mdev->md_io_tmpp = page; | ||
1124 | } | ||
1125 | } | ||
1126 | |||
1127 | if (!mdev->bitmap) { | 1472 | if (!mdev->bitmap) { |
1128 | if (drbd_bm_init(mdev)) { | 1473 | if (drbd_bm_init(mdev)) { |
1129 | retcode = ERR_NOMEM; | 1474 | retcode = ERR_NOMEM; |
@@ -1145,30 +1490,25 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1145 | } | 1490 | } |
1146 | 1491 | ||
1147 | /* Since we are diskless, fix the activity log first... */ | 1492 | /* Since we are diskless, fix the activity log first... */ |
1148 | if (drbd_check_al_size(mdev)) { | 1493 | if (drbd_check_al_size(mdev, new_disk_conf)) { |
1149 | retcode = ERR_NOMEM; | 1494 | retcode = ERR_NOMEM; |
1150 | goto force_diskless_dec; | 1495 | goto force_diskless_dec; |
1151 | } | 1496 | } |
1152 | 1497 | ||
1153 | /* Prevent shrinking of consistent devices ! */ | 1498 | /* Prevent shrinking of consistent devices ! */ |
1154 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | 1499 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && |
1155 | drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { | 1500 | drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { |
1156 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | 1501 | dev_warn(DEV, "refusing to truncate a consistent device\n"); |
1157 | retcode = ERR_DISK_TOO_SMALL; | 1502 | retcode = ERR_DISK_TOO_SMALL; |
1158 | goto force_diskless_dec; | 1503 | goto force_diskless_dec; |
1159 | } | 1504 | } |
1160 | 1505 | ||
1161 | if (!drbd_al_read_log(mdev, nbc)) { | ||
1162 | retcode = ERR_IO_MD_DISK; | ||
1163 | goto force_diskless_dec; | ||
1164 | } | ||
1165 | |||
1166 | /* Reset the "barriers don't work" bits here, then force meta data to | 1506 | /* Reset the "barriers don't work" bits here, then force meta data to |
1167 | * be written, to ensure we determine if barriers are supported. */ | 1507 | * be written, to ensure we determine if barriers are supported. */ |
1168 | if (nbc->dc.no_md_flush) | 1508 | if (new_disk_conf->md_flushes) |
1169 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1170 | else | ||
1171 | clear_bit(MD_NO_FUA, &mdev->flags); | 1509 | clear_bit(MD_NO_FUA, &mdev->flags); |
1510 | else | ||
1511 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1172 | 1512 | ||
1173 | /* Point of no return reached. | 1513 | /* Point of no return reached. |
1174 | * Devices and memory are no longer released by error cleanup below. | 1514 | * Devices and memory are no longer released by error cleanup below. |
@@ -1177,11 +1517,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1177 | D_ASSERT(mdev->ldev == NULL); | 1517 | D_ASSERT(mdev->ldev == NULL); |
1178 | mdev->ldev = nbc; | 1518 | mdev->ldev = nbc; |
1179 | mdev->resync = resync_lru; | 1519 | mdev->resync = resync_lru; |
1520 | mdev->rs_plan_s = new_plan; | ||
1180 | nbc = NULL; | 1521 | nbc = NULL; |
1181 | resync_lru = NULL; | 1522 | resync_lru = NULL; |
1523 | new_disk_conf = NULL; | ||
1524 | new_plan = NULL; | ||
1182 | 1525 | ||
1183 | mdev->write_ordering = WO_bdev_flush; | 1526 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); |
1184 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
1185 | 1527 | ||
1186 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | 1528 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) |
1187 | set_bit(CRASHED_PRIMARY, &mdev->flags); | 1529 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
@@ -1189,10 +1531,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1189 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | 1531 | clear_bit(CRASHED_PRIMARY, &mdev->flags); |
1190 | 1532 | ||
1191 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | 1533 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && |
1192 | !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { | 1534 | !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) |
1193 | set_bit(CRASHED_PRIMARY, &mdev->flags); | 1535 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
1194 | cp_discovered = 1; | ||
1195 | } | ||
1196 | 1536 | ||
1197 | mdev->send_cnt = 0; | 1537 | mdev->send_cnt = 0; |
1198 | mdev->recv_cnt = 0; | 1538 | mdev->recv_cnt = 0; |
@@ -1228,7 +1568,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1228 | } else if (dd == grew) | 1568 | } else if (dd == grew) |
1229 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | 1569 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
1230 | 1570 | ||
1231 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1571 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || |
1572 | (test_bit(CRASHED_PRIMARY, &mdev->flags) && | ||
1573 | drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) { | ||
1232 | dev_info(DEV, "Assuming that all blocks are out of sync " | 1574 | dev_info(DEV, "Assuming that all blocks are out of sync " |
1233 | "(aka FullSync)\n"); | 1575 | "(aka FullSync)\n"); |
1234 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, | 1576 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, |
@@ -1238,16 +1580,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1238 | } | 1580 | } |
1239 | } else { | 1581 | } else { |
1240 | if (drbd_bitmap_io(mdev, &drbd_bm_read, | 1582 | if (drbd_bitmap_io(mdev, &drbd_bm_read, |
1241 | "read from attaching", BM_LOCKED_MASK) < 0) { | 1583 | "read from attaching", BM_LOCKED_MASK)) { |
1242 | retcode = ERR_IO_MD_DISK; | ||
1243 | goto force_diskless_dec; | ||
1244 | } | ||
1245 | } | ||
1246 | |||
1247 | if (cp_discovered) { | ||
1248 | drbd_al_apply_to_bm(mdev); | ||
1249 | if (drbd_bitmap_io(mdev, &drbd_bm_write, | ||
1250 | "crashed primary apply AL", BM_LOCKED_MASK)) { | ||
1251 | retcode = ERR_IO_MD_DISK; | 1584 | retcode = ERR_IO_MD_DISK; |
1252 | goto force_diskless_dec; | 1585 | goto force_diskless_dec; |
1253 | } | 1586 | } |
@@ -1256,9 +1589,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1256 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) | 1589 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) |
1257 | drbd_suspend_al(mdev); /* IO is still suspended here... */ | 1590 | drbd_suspend_al(mdev); /* IO is still suspended here... */ |
1258 | 1591 | ||
1259 | spin_lock_irq(&mdev->req_lock); | 1592 | spin_lock_irq(&mdev->tconn->req_lock); |
1260 | os = mdev->state; | 1593 | os = drbd_read_state(mdev); |
1261 | ns.i = os.i; | 1594 | ns = os; |
1262 | /* If MDF_CONSISTENT is not set go into inconsistent state, | 1595 | /* If MDF_CONSISTENT is not set go into inconsistent state, |
1263 | otherwise investigate MDF_WasUpToDate... | 1596 | otherwise investigate MDF_WasUpToDate... |
1264 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | 1597 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, |
@@ -1276,8 +1609,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1276 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | 1609 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) |
1277 | ns.pdsk = D_OUTDATED; | 1610 | ns.pdsk = D_OUTDATED; |
1278 | 1611 | ||
1279 | if ( ns.disk == D_CONSISTENT && | 1612 | rcu_read_lock(); |
1280 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | 1613 | if (ns.disk == D_CONSISTENT && |
1614 | (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE)) | ||
1281 | ns.disk = D_UP_TO_DATE; | 1615 | ns.disk = D_UP_TO_DATE; |
1282 | 1616 | ||
1283 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | 1617 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, |
@@ -1285,6 +1619,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1285 | this point, because drbd_request_state() modifies these | 1619 | this point, because drbd_request_state() modifies these |
1286 | flags. */ | 1620 | flags. */ |
1287 | 1621 | ||
1622 | if (rcu_dereference(mdev->ldev->disk_conf)->al_updates) | ||
1623 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1624 | else | ||
1625 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1626 | |||
1627 | rcu_read_unlock(); | ||
1628 | |||
1288 | /* In case we are C_CONNECTED postpone any decision on the new disk | 1629 | /* In case we are C_CONNECTED postpone any decision on the new disk |
1289 | state after the negotiation phase. */ | 1630 | state after the negotiation phase. */ |
1290 | if (mdev->state.conn == C_CONNECTED) { | 1631 | if (mdev->state.conn == C_CONNECTED) { |
@@ -1300,12 +1641,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1300 | } | 1641 | } |
1301 | 1642 | ||
1302 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1643 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1303 | ns = mdev->state; | 1644 | spin_unlock_irq(&mdev->tconn->req_lock); |
1304 | spin_unlock_irq(&mdev->req_lock); | ||
1305 | 1645 | ||
1306 | if (rv < SS_SUCCESS) | 1646 | if (rv < SS_SUCCESS) |
1307 | goto force_diskless_dec; | 1647 | goto force_diskless_dec; |
1308 | 1648 | ||
1649 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1650 | |||
1309 | if (mdev->state.role == R_PRIMARY) | 1651 | if (mdev->state.role == R_PRIMARY) |
1310 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | 1652 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; |
1311 | else | 1653 | else |
@@ -1316,16 +1658,17 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1316 | 1658 | ||
1317 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 1659 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
1318 | put_ldev(mdev); | 1660 | put_ldev(mdev); |
1319 | reply->ret_code = retcode; | 1661 | conn_reconfig_done(mdev->tconn); |
1320 | drbd_reconfig_done(mdev); | 1662 | drbd_adm_finish(info, retcode); |
1321 | return 0; | 1663 | return 0; |
1322 | 1664 | ||
1323 | force_diskless_dec: | 1665 | force_diskless_dec: |
1324 | put_ldev(mdev); | 1666 | put_ldev(mdev); |
1325 | force_diskless: | 1667 | force_diskless: |
1326 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1668 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
1327 | drbd_md_sync(mdev); | 1669 | drbd_md_sync(mdev); |
1328 | fail: | 1670 | fail: |
1671 | conn_reconfig_done(mdev->tconn); | ||
1329 | if (nbc) { | 1672 | if (nbc) { |
1330 | if (nbc->backing_bdev) | 1673 | if (nbc->backing_bdev) |
1331 | blkdev_put(nbc->backing_bdev, | 1674 | blkdev_put(nbc->backing_bdev, |
@@ -1335,34 +1678,24 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1335 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1678 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1336 | kfree(nbc); | 1679 | kfree(nbc); |
1337 | } | 1680 | } |
1681 | kfree(new_disk_conf); | ||
1338 | lc_destroy(resync_lru); | 1682 | lc_destroy(resync_lru); |
1683 | kfree(new_plan); | ||
1339 | 1684 | ||
1340 | reply->ret_code = retcode; | 1685 | finish: |
1341 | drbd_reconfig_done(mdev); | 1686 | drbd_adm_finish(info, retcode); |
1342 | return 0; | 1687 | return 0; |
1343 | } | 1688 | } |
1344 | 1689 | ||
1345 | /* Detaching the disk is a process in multiple stages. First we need to lock | 1690 | static int adm_detach(struct drbd_conf *mdev, int force) |
1346 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. | ||
1347 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1348 | * internal references as well. | ||
1349 | * Only then we have finally detached. */ | ||
1350 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1351 | struct drbd_nl_cfg_reply *reply) | ||
1352 | { | 1691 | { |
1353 | enum drbd_ret_code retcode; | 1692 | enum drbd_state_rv retcode; |
1354 | int ret; | 1693 | int ret; |
1355 | struct detach dt = {}; | ||
1356 | 1694 | ||
1357 | if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { | 1695 | if (force) { |
1358 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1359 | goto out; | ||
1360 | } | ||
1361 | |||
1362 | if (dt.detach_force) { | ||
1363 | set_bit(FORCE_DETACH, &mdev->flags); | 1696 | set_bit(FORCE_DETACH, &mdev->flags); |
1364 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1697 | drbd_force_state(mdev, NS(disk, D_FAILED)); |
1365 | reply->ret_code = SS_SUCCESS; | 1698 | retcode = SS_SUCCESS; |
1366 | goto out; | 1699 | goto out; |
1367 | } | 1700 | } |
1368 | 1701 | ||
@@ -1374,326 +1707,529 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1374 | ret = wait_event_interruptible(mdev->misc_wait, | 1707 | ret = wait_event_interruptible(mdev->misc_wait, |
1375 | mdev->state.disk != D_FAILED); | 1708 | mdev->state.disk != D_FAILED); |
1376 | drbd_resume_io(mdev); | 1709 | drbd_resume_io(mdev); |
1377 | |||
1378 | if ((int)retcode == (int)SS_IS_DISKLESS) | 1710 | if ((int)retcode == (int)SS_IS_DISKLESS) |
1379 | retcode = SS_NOTHING_TO_DO; | 1711 | retcode = SS_NOTHING_TO_DO; |
1380 | if (ret) | 1712 | if (ret) |
1381 | retcode = ERR_INTR; | 1713 | retcode = ERR_INTR; |
1382 | reply->ret_code = retcode; | ||
1383 | out: | 1714 | out: |
1384 | return 0; | 1715 | return retcode; |
1385 | } | 1716 | } |
1386 | 1717 | ||
1387 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1718 | /* Detaching the disk is a process in multiple stages. First we need to lock |
1388 | struct drbd_nl_cfg_reply *reply) | 1719 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. |
1720 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1721 | * internal references as well. | ||
1722 | * Only then we have finally detached. */ | ||
1723 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) | ||
1389 | { | 1724 | { |
1390 | int i, ns; | ||
1391 | enum drbd_ret_code retcode; | 1725 | enum drbd_ret_code retcode; |
1392 | struct net_conf *new_conf = NULL; | 1726 | struct detach_parms parms = { }; |
1393 | struct crypto_hash *tfm = NULL; | 1727 | int err; |
1394 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1395 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1396 | struct hlist_head *new_tl_hash = NULL; | ||
1397 | struct hlist_head *new_ee_hash = NULL; | ||
1398 | struct drbd_conf *odev; | ||
1399 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1400 | void *int_dig_out = NULL; | ||
1401 | void *int_dig_in = NULL; | ||
1402 | void *int_dig_vv = NULL; | ||
1403 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1404 | 1728 | ||
1405 | drbd_reconfig_start(mdev); | 1729 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1730 | if (!adm_ctx.reply_skb) | ||
1731 | return retcode; | ||
1732 | if (retcode != NO_ERROR) | ||
1733 | goto out; | ||
1406 | 1734 | ||
1407 | if (mdev->state.conn > C_STANDALONE) { | 1735 | if (info->attrs[DRBD_NLA_DETACH_PARMS]) { |
1408 | retcode = ERR_NET_CONFIGURED; | 1736 | err = detach_parms_from_attrs(&parms, info); |
1409 | goto fail; | 1737 | if (err) { |
1738 | retcode = ERR_MANDATORY_TAG; | ||
1739 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1740 | goto out; | ||
1741 | } | ||
1410 | } | 1742 | } |
1411 | 1743 | ||
1412 | /* allocation not in the IO path, cqueue thread context */ | 1744 | retcode = adm_detach(adm_ctx.mdev, parms.force_detach); |
1745 | out: | ||
1746 | drbd_adm_finish(info, retcode); | ||
1747 | return 0; | ||
1748 | } | ||
1749 | |||
1750 | static bool conn_resync_running(struct drbd_tconn *tconn) | ||
1751 | { | ||
1752 | struct drbd_conf *mdev; | ||
1753 | bool rv = false; | ||
1754 | int vnr; | ||
1755 | |||
1756 | rcu_read_lock(); | ||
1757 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1758 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
1759 | mdev->state.conn == C_SYNC_TARGET || | ||
1760 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1761 | mdev->state.conn == C_PAUSED_SYNC_T) { | ||
1762 | rv = true; | ||
1763 | break; | ||
1764 | } | ||
1765 | } | ||
1766 | rcu_read_unlock(); | ||
1767 | |||
1768 | return rv; | ||
1769 | } | ||
1770 | |||
1771 | static bool conn_ov_running(struct drbd_tconn *tconn) | ||
1772 | { | ||
1773 | struct drbd_conf *mdev; | ||
1774 | bool rv = false; | ||
1775 | int vnr; | ||
1776 | |||
1777 | rcu_read_lock(); | ||
1778 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1779 | if (mdev->state.conn == C_VERIFY_S || | ||
1780 | mdev->state.conn == C_VERIFY_T) { | ||
1781 | rv = true; | ||
1782 | break; | ||
1783 | } | ||
1784 | } | ||
1785 | rcu_read_unlock(); | ||
1786 | |||
1787 | return rv; | ||
1788 | } | ||
1789 | |||
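conn_resync_running() and conn_ov_running() above are simple any-of scans over the volumes of a connection, done under rcu_read_lock(). The same shape over a plain array, with invented state names, purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    enum vol_state { STANDALONE, CONNECTED, SYNC_SOURCE, SYNC_TARGET };

    /* is any volume currently resynchronising? (mirrors conn_resync_running) */
    static bool any_resync_running(const enum vol_state *vols, int n)
    {
        for (int i = 0; i < n; i++)
            if (vols[i] == SYNC_SOURCE || vols[i] == SYNC_TARGET)
                return true;
        return false;
    }

    int main(void)
    {
        enum vol_state vols[] = { CONNECTED, SYNC_TARGET, CONNECTED };
        printf("%d\n", any_resync_running(vols, 3));   /* 1 */
        return 0;
    }

drbd_adm_net_opts() below uses the two answers to decide whether the csums/verify algorithms may be swapped while a resync or online verify is in flight.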
1790 | static enum drbd_ret_code | ||
1791 | _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) | ||
1792 | { | ||
1793 | struct drbd_conf *mdev; | ||
1794 | int i; | ||
1795 | |||
1796 | if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { | ||
1797 | if (new_conf->wire_protocol != old_conf->wire_protocol) | ||
1798 | return ERR_NEED_APV_100; | ||
1799 | |||
1800 | if (new_conf->two_primaries != old_conf->two_primaries) | ||
1801 | return ERR_NEED_APV_100; | ||
1802 | |||
1803 | if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) | ||
1804 | return ERR_NEED_APV_100; | ||
1805 | } | ||
1806 | |||
1807 | if (!new_conf->two_primaries && | ||
1808 | conn_highest_role(tconn) == R_PRIMARY && | ||
1809 | conn_highest_peer(tconn) == R_PRIMARY) | ||
1810 | return ERR_NEED_ALLOW_TWO_PRI; | ||
1811 | |||
1812 | if (new_conf->two_primaries && | ||
1813 | (new_conf->wire_protocol != DRBD_PROT_C)) | ||
1814 | return ERR_NOT_PROTO_C; | ||
1815 | |||
1816 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1817 | if (get_ldev(mdev)) { | ||
1818 | enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
1819 | put_ldev(mdev); | ||
1820 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) | ||
1821 | return ERR_STONITH_AND_PROT_A; | ||
1822 | } | ||
1823 | if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) | ||
1824 | return ERR_DISCARD_IMPOSSIBLE; | ||
1825 | } | ||
1826 | |||
1827 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) | ||
1828 | return ERR_CONG_NOT_PROTO_A; | ||
1829 | |||
1830 | return NO_ERROR; | ||
1831 | } | ||
1832 | |||
1833 | static enum drbd_ret_code | ||
1834 | check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) | ||
1835 | { | ||
1836 | enum drbd_ret_code rv; | ||
1837 | struct drbd_conf *mdev; | ||
1838 | int i; | ||
1839 | |||
1840 | rcu_read_lock(); | ||
1841 | rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); | ||
1842 | rcu_read_unlock(); | ||
1843 | |||
1844 | /* tconn->volumes protected by genl_lock() here */ | ||
1845 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1846 | if (!mdev->bitmap) { | ||
1847 | if (drbd_bm_init(mdev)) | ||
1848 | return ERR_NOMEM; | ||
1849 | } | ||
1850 | } | ||
1851 | |||
1852 | return rv; | ||
1853 | } | ||
1854 | |||
1855 | struct crypto { | ||
1856 | struct crypto_hash *verify_tfm; | ||
1857 | struct crypto_hash *csums_tfm; | ||
1858 | struct crypto_hash *cram_hmac_tfm; | ||
1859 | struct crypto_hash *integrity_tfm; | ||
1860 | }; | ||
1861 | |||
1862 | static int | ||
1863 | alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) | ||
1864 | { | ||
1865 | if (!tfm_name[0]) | ||
1866 | return NO_ERROR; | ||
1867 | |||
1868 | *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); | ||
1869 | if (IS_ERR(*tfm)) { | ||
1870 | *tfm = NULL; | ||
1871 | return err_alg; | ||
1872 | } | ||
1873 | |||
1874 | return NO_ERROR; | ||
1875 | } | ||
1876 | |||
1877 | static enum drbd_ret_code | ||
1878 | alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) | ||
1879 | { | ||
1880 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1881 | enum drbd_ret_code rv; | ||
1882 | |||
1883 | rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, | ||
1884 | ERR_CSUMS_ALG); | ||
1885 | if (rv != NO_ERROR) | ||
1886 | return rv; | ||
1887 | rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, | ||
1888 | ERR_VERIFY_ALG); | ||
1889 | if (rv != NO_ERROR) | ||
1890 | return rv; | ||
1891 | rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, | ||
1892 | ERR_INTEGRITY_ALG); | ||
1893 | if (rv != NO_ERROR) | ||
1894 | return rv; | ||
1895 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1896 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1897 | new_conf->cram_hmac_alg); | ||
1898 | |||
1899 | rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, | ||
1900 | ERR_AUTH_ALG); | ||
1901 | } | ||
1902 | |||
1903 | return rv; | ||
1904 | } | ||
1905 | |||
1906 | static void free_crypto(struct crypto *crypto) | ||
1907 | { | ||
1908 | crypto_free_hash(crypto->cram_hmac_tfm); | ||
1909 | crypto_free_hash(crypto->integrity_tfm); | ||
1910 | crypto_free_hash(crypto->csums_tfm); | ||
1911 | crypto_free_hash(crypto->verify_tfm); | ||
1912 | } | ||
1913 | |||
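alloc_crypto() tries each transform in turn, stops at the first failure, and relies on a single free_crypto() that can safely be handed a half-filled struct, since crypto_free_hash() tolerates NULL pointers. The same all-or-nothing allocation shape with plain malloc/free; the struct and helpers below are invented stand-ins:

    #include <stdlib.h>
    #include <stdio.h>

    struct resources {            /* stand-in for struct crypto */
        void *a;
        void *b;
        void *c;
    };

    /* free() tolerates NULL, just as crypto_free_hash() does */
    static void free_resources(struct resources *r)
    {
        free(r->c);
        free(r->b);
        free(r->a);
    }

    static int alloc_resources(struct resources *r)
    {
        r->a = malloc(16);
        if (!r->a)
            return -1;
        r->b = malloc(16);
        if (!r->b)
            return -1;
        r->c = malloc(16);
        if (!r->c)
            return -1;
        return 0;
    }

    int main(void)
    {
        struct resources r = { 0 };   /* all NULL, like struct crypto crypto = { }; */

        if (alloc_resources(&r) != 0)
            fprintf(stderr, "allocation failed\n");
        free_resources(&r);           /* safe in both the success and the error case */
        return 0;
    }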
1914 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) | ||
1915 | { | ||
1916 | enum drbd_ret_code retcode; | ||
1917 | struct drbd_tconn *tconn; | ||
1918 | struct net_conf *old_conf, *new_conf = NULL; | ||
1919 | int err; | ||
1920 | int ovr; /* online verify running */ | ||
1921 | int rsr; /* re-sync running */ | ||
1922 | struct crypto crypto = { }; | ||
1923 | |||
1924 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); | ||
1925 | if (!adm_ctx.reply_skb) | ||
1926 | return retcode; | ||
1927 | if (retcode != NO_ERROR) | ||
1928 | goto out; | ||
1929 | |||
1930 | tconn = adm_ctx.tconn; | ||
1931 | |||
1413 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); | 1932 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
1414 | if (!new_conf) { | 1933 | if (!new_conf) { |
1415 | retcode = ERR_NOMEM; | 1934 | retcode = ERR_NOMEM; |
1935 | goto out; | ||
1936 | } | ||
1937 | |||
1938 | conn_reconfig_start(tconn); | ||
1939 | |||
1940 | mutex_lock(&tconn->data.mutex); | ||
1941 | mutex_lock(&tconn->conf_update); | ||
1942 | old_conf = tconn->net_conf; | ||
1943 | |||
1944 | if (!old_conf) { | ||
1945 | drbd_msg_put_info("net conf missing, try connect"); | ||
1946 | retcode = ERR_INVALID_REQUEST; | ||
1416 | goto fail; | 1947 | goto fail; |
1417 | } | 1948 | } |
1418 | 1949 | ||
1419 | new_conf->timeout = DRBD_TIMEOUT_DEF; | 1950 | *new_conf = *old_conf; |
1420 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | 1951 | if (should_set_defaults(info)) |
1421 | new_conf->ping_int = DRBD_PING_INT_DEF; | 1952 | set_net_conf_defaults(new_conf); |
1422 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | 1953 | |
1423 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | 1954 | err = net_conf_from_attrs_for_change(new_conf, info); |
1424 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | 1955 | if (err && err != -ENOMSG) { |
1425 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1426 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1427 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1428 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1429 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1430 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1431 | new_conf->want_lose = 0; | ||
1432 | new_conf->two_primaries = 0; | ||
1433 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1434 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1435 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1436 | new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; | ||
1437 | new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; | ||
1438 | |||
1439 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1440 | retcode = ERR_MANDATORY_TAG; | 1956 | retcode = ERR_MANDATORY_TAG; |
1957 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1441 | goto fail; | 1958 | goto fail; |
1442 | } | 1959 | } |
1443 | 1960 | ||
1444 | if (new_conf->two_primaries | 1961 | retcode = check_net_options(tconn, new_conf); |
1445 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | 1962 | if (retcode != NO_ERROR) |
1446 | retcode = ERR_NOT_PROTO_C; | ||
1447 | goto fail; | 1963 | goto fail; |
1448 | } | ||
1449 | 1964 | ||
1450 | if (get_ldev(mdev)) { | 1965 | /* re-sync running */ |
1451 | enum drbd_fencing_p fp = mdev->ldev->dc.fencing; | 1966 | rsr = conn_resync_running(tconn); |
1452 | put_ldev(mdev); | 1967 | if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { |
1453 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { | 1968 | retcode = ERR_CSUMS_RESYNC_RUNNING; |
1454 | retcode = ERR_STONITH_AND_PROT_A; | 1969 | goto fail; |
1455 | goto fail; | ||
1456 | } | ||
1457 | } | 1970 | } |
1458 | 1971 | ||
1459 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { | 1972 | /* online verify running */ |
1460 | retcode = ERR_CONG_NOT_PROTO_A; | 1973 | ovr = conn_ov_running(tconn); |
1974 | if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) { | ||
1975 | retcode = ERR_VERIFY_RUNNING; | ||
1461 | goto fail; | 1976 | goto fail; |
1462 | } | 1977 | } |
1463 | 1978 | ||
1464 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | 1979 | retcode = alloc_crypto(&crypto, new_conf); |
1465 | retcode = ERR_DISCARD; | 1980 | if (retcode != NO_ERROR) |
1466 | goto fail; | 1981 | goto fail; |
1467 | } | ||
1468 | 1982 | ||
1469 | retcode = NO_ERROR; | 1983 | rcu_assign_pointer(tconn->net_conf, new_conf); |
1470 | 1984 | ||
1471 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | 1985 | if (!rsr) { |
1472 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | 1986 | crypto_free_hash(tconn->csums_tfm); |
1473 | for (i = 0; i < minor_count; i++) { | 1987 | tconn->csums_tfm = crypto.csums_tfm; |
1474 | odev = minor_to_mdev(i); | 1988 | crypto.csums_tfm = NULL; |
1475 | if (!odev || odev == mdev) | 1989 | } |
1476 | continue; | 1990 | if (!ovr) { |
1477 | if (get_net_conf(odev)) { | 1991 | crypto_free_hash(tconn->verify_tfm); |
1478 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | 1992 | tconn->verify_tfm = crypto.verify_tfm; |
1479 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | 1993 | crypto.verify_tfm = NULL; |
1480 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1481 | retcode = ERR_LOCAL_ADDR; | ||
1482 | |||
1483 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1484 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1485 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1486 | retcode = ERR_PEER_ADDR; | ||
1487 | |||
1488 | put_net_conf(odev); | ||
1489 | if (retcode != NO_ERROR) | ||
1490 | goto fail; | ||
1491 | } | ||
1492 | } | 1994 | } |
1493 | 1995 | ||
1494 | if (new_conf->cram_hmac_alg[0] != 0) { | 1996 | crypto_free_hash(tconn->integrity_tfm); |
1495 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | 1997 | tconn->integrity_tfm = crypto.integrity_tfm; |
1496 | new_conf->cram_hmac_alg); | 1998 | if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) |
1497 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | 1999 | /* Do this without trying to take tconn->data.mutex again. */ |
1498 | if (IS_ERR(tfm)) { | 2000 | __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); |
1499 | tfm = NULL; | ||
1500 | retcode = ERR_AUTH_ALG; | ||
1501 | goto fail; | ||
1502 | } | ||
1503 | 2001 | ||
1504 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | 2002 | crypto_free_hash(tconn->cram_hmac_tfm); |
1505 | retcode = ERR_AUTH_ALG_ND; | 2003 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
1506 | goto fail; | ||
1507 | } | ||
1508 | } | ||
1509 | 2004 | ||
1510 | if (new_conf->integrity_alg[0]) { | 2005 | mutex_unlock(&tconn->conf_update); |
1511 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2006 | mutex_unlock(&tconn->data.mutex); |
1512 | if (IS_ERR(integrity_w_tfm)) { | 2007 | synchronize_rcu(); |
1513 | integrity_w_tfm = NULL; | 2008 | kfree(old_conf); |
1514 | retcode=ERR_INTEGRITY_ALG; | ||
1515 | goto fail; | ||
1516 | } | ||
1517 | 2009 | ||
1518 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | 2010 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1519 | retcode=ERR_INTEGRITY_ALG_ND; | 2011 | drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); |
1520 | goto fail; | ||
1521 | } | ||
1522 | 2012 | ||
1523 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2013 | goto done; |
1524 | if (IS_ERR(integrity_r_tfm)) { | 2014 | |
1525 | integrity_r_tfm = NULL; | 2015 | fail: |
1526 | retcode=ERR_INTEGRITY_ALG; | 2016 | mutex_unlock(&tconn->conf_update); |
1527 | goto fail; | 2017 | mutex_unlock(&tconn->data.mutex); |
1528 | } | 2018 | free_crypto(&crypto); |
2019 | kfree(new_conf); | ||
2020 | done: | ||
2021 | conn_reconfig_done(tconn); | ||
2022 | out: | ||
2023 | drbd_adm_finish(info, retcode); | ||
2024 | return 0; | ||
2025 | } | ||
2026 | |||
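The tail of the net-options handler above shows the RCU retire sequence for tconn->net_conf: the new structure is published with rcu_assign_pointer(), the locks are dropped, and only after synchronize_rcu() has waited out all pre-existing readers is the old copy kfree()d. The matching reader side is not part of this hunk; a minimal sketch of what such a reader has to look like follows. The helper name is made up, and it only assumes that net_conf carries the two_primaries flag referenced elsewhere in this patch.

	/* Illustrative reader-side sketch, not from this patch.
	 * Assumes <linux/rcupdate.h>; tconn->net_conf may be NULL while the
	 * connection is unconfigured, exactly as the handlers above allow. */
	static bool example_two_primaries(struct drbd_tconn *tconn)
	{
		struct net_conf *nc;
		bool two_primaries = false;

		rcu_read_lock();
		nc = rcu_dereference(tconn->net_conf);
		if (nc)
			two_primaries = nc->two_primaries;
		rcu_read_unlock();	/* nc must not be used past this point */

		return two_primaries;
	}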
2027 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) | ||
2028 | { | ||
2029 | struct drbd_conf *mdev; | ||
2030 | struct net_conf *old_conf, *new_conf = NULL; | ||
2031 | struct crypto crypto = { }; | ||
2032 | struct drbd_tconn *tconn; | ||
2033 | enum drbd_ret_code retcode; | ||
2034 | int i; | ||
2035 | int err; | ||
2036 | |||
2037 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); | ||
2038 | |||
2039 | if (!adm_ctx.reply_skb) | ||
2040 | return retcode; | ||
2041 | if (retcode != NO_ERROR) | ||
2042 | goto out; | ||
2043 | if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { | ||
2044 | drbd_msg_put_info("connection endpoint(s) missing"); | ||
2045 | retcode = ERR_INVALID_REQUEST; | ||
2046 | goto out; | ||
1529 | } | 2047 | } |
1530 | 2048 | ||
1531 | ns = new_conf->max_epoch_size/8; | 2049 | /* No need for _rcu here. All reconfiguration is |
1532 | if (mdev->tl_hash_s != ns) { | 2050 | * strictly serialized on genl_lock(). We are protected against |
1533 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2051 | * concurrent reconfiguration/addition/deletion */ |
1534 | if (!new_tl_hash) { | 2052 | list_for_each_entry(tconn, &drbd_tconns, all_tconn) { |
1535 | retcode = ERR_NOMEM; | 2053 | if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && |
1536 | goto fail; | 2054 | !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { |
2055 | retcode = ERR_LOCAL_ADDR; | ||
2056 | goto out; | ||
1537 | } | 2057 | } |
1538 | } | ||
1539 | 2058 | ||
1540 | ns = new_conf->max_buffers/8; | 2059 | if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && |
1541 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | 2060 | !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { |
1542 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2061 | retcode = ERR_PEER_ADDR; |
1543 | if (!new_ee_hash) { | 2062 | goto out; |
1544 | retcode = ERR_NOMEM; | ||
1545 | goto fail; | ||
1546 | } | 2063 | } |
1547 | } | 2064 | } |
1548 | 2065 | ||
1549 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | 2066 | tconn = adm_ctx.tconn; |
2067 | conn_reconfig_start(tconn); | ||
1550 | 2068 | ||
1551 | if (integrity_w_tfm) { | 2069 | if (tconn->cstate > C_STANDALONE) { |
1552 | i = crypto_hash_digestsize(integrity_w_tfm); | 2070 | retcode = ERR_NET_CONFIGURED; |
1553 | int_dig_out = kmalloc(i, GFP_KERNEL); | 2071 | goto fail; |
1554 | if (!int_dig_out) { | ||
1555 | retcode = ERR_NOMEM; | ||
1556 | goto fail; | ||
1557 | } | ||
1558 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1559 | if (!int_dig_in) { | ||
1560 | retcode = ERR_NOMEM; | ||
1561 | goto fail; | ||
1562 | } | ||
1563 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1564 | if (!int_dig_vv) { | ||
1565 | retcode = ERR_NOMEM; | ||
1566 | goto fail; | ||
1567 | } | ||
1568 | } | 2072 | } |
1569 | 2073 | ||
1570 | if (!mdev->bitmap) { | 2074 | /* allocation not in the IO path, drbdsetup / netlink process context */ |
1571 | if(drbd_bm_init(mdev)) { | 2075 | new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); |
1572 | retcode = ERR_NOMEM; | 2076 | if (!new_conf) { |
1573 | goto fail; | 2077 | retcode = ERR_NOMEM; |
1574 | } | 2078 | goto fail; |
1575 | } | 2079 | } |
1576 | 2080 | ||
1577 | drbd_flush_workqueue(mdev); | 2081 | set_net_conf_defaults(new_conf); |
1578 | spin_lock_irq(&mdev->req_lock); | 2082 | |
1579 | if (mdev->net_conf != NULL) { | 2083 | err = net_conf_from_attrs(new_conf, info); |
1580 | retcode = ERR_NET_CONFIGURED; | 2084 | if (err && err != -ENOMSG) { |
1581 | spin_unlock_irq(&mdev->req_lock); | 2085 | retcode = ERR_MANDATORY_TAG; |
2086 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1582 | goto fail; | 2087 | goto fail; |
1583 | } | 2088 | } |
1584 | mdev->net_conf = new_conf; | ||
1585 | 2089 | ||
1586 | mdev->send_cnt = 0; | 2090 | retcode = check_net_options(tconn, new_conf); |
1587 | mdev->recv_cnt = 0; | 2091 | if (retcode != NO_ERROR) |
2092 | goto fail; | ||
1588 | 2093 | ||
1589 | if (new_tl_hash) { | 2094 | retcode = alloc_crypto(&crypto, new_conf); |
1590 | kfree(mdev->tl_hash); | 2095 | if (retcode != NO_ERROR) |
1591 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | 2096 | goto fail; |
1592 | mdev->tl_hash = new_tl_hash; | 2097 | |
1593 | } | 2098 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; |
2099 | |||
2100 | conn_flush_workqueue(tconn); | ||
1594 | 2101 | ||
1595 | if (new_ee_hash) { | 2102 | mutex_lock(&tconn->conf_update); |
1596 | kfree(mdev->ee_hash); | 2103 | old_conf = tconn->net_conf; |
1597 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | 2104 | if (old_conf) { |
1598 | mdev->ee_hash = new_ee_hash; | 2105 | retcode = ERR_NET_CONFIGURED; |
2106 | mutex_unlock(&tconn->conf_update); | ||
2107 | goto fail; | ||
1599 | } | 2108 | } |
2109 | rcu_assign_pointer(tconn->net_conf, new_conf); | ||
1600 | 2110 | ||
1601 | crypto_free_hash(mdev->cram_hmac_tfm); | 2111 | conn_free_crypto(tconn); |
1602 | mdev->cram_hmac_tfm = tfm; | 2112 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
2113 | tconn->integrity_tfm = crypto.integrity_tfm; | ||
2114 | tconn->csums_tfm = crypto.csums_tfm; | ||
2115 | tconn->verify_tfm = crypto.verify_tfm; | ||
1603 | 2116 | ||
1604 | crypto_free_hash(mdev->integrity_w_tfm); | 2117 | tconn->my_addr_len = nla_len(adm_ctx.my_addr); |
1605 | mdev->integrity_w_tfm = integrity_w_tfm; | 2118 | memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); |
2119 | tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); | ||
2120 | memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); | ||
1606 | 2121 | ||
1607 | crypto_free_hash(mdev->integrity_r_tfm); | 2122 | mutex_unlock(&tconn->conf_update); |
1608 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1609 | 2123 | ||
1610 | kfree(mdev->int_dig_out); | 2124 | rcu_read_lock(); |
1611 | kfree(mdev->int_dig_in); | 2125 | idr_for_each_entry(&tconn->volumes, mdev, i) { |
1612 | kfree(mdev->int_dig_vv); | 2126 | mdev->send_cnt = 0; |
1613 | mdev->int_dig_out=int_dig_out; | 2127 | mdev->recv_cnt = 0; |
1614 | mdev->int_dig_in=int_dig_in; | 2128 | } |
1615 | mdev->int_dig_vv=int_dig_vv; | 2129 | rcu_read_unlock(); |
1616 | retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); | ||
1617 | spin_unlock_irq(&mdev->req_lock); | ||
1618 | 2130 | ||
1619 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 2131 | retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); |
1620 | reply->ret_code = retcode; | 2132 | |
1621 | drbd_reconfig_done(mdev); | 2133 | conn_reconfig_done(tconn); |
2134 | drbd_adm_finish(info, retcode); | ||
1622 | return 0; | 2135 | return 0; |
1623 | 2136 | ||
1624 | fail: | 2137 | fail: |
1625 | kfree(int_dig_out); | 2138 | free_crypto(&crypto); |
1626 | kfree(int_dig_in); | ||
1627 | kfree(int_dig_vv); | ||
1628 | crypto_free_hash(tfm); | ||
1629 | crypto_free_hash(integrity_w_tfm); | ||
1630 | crypto_free_hash(integrity_r_tfm); | ||
1631 | kfree(new_tl_hash); | ||
1632 | kfree(new_ee_hash); | ||
1633 | kfree(new_conf); | 2139 | kfree(new_conf); |
1634 | 2140 | ||
1635 | reply->ret_code = retcode; | 2141 | conn_reconfig_done(tconn); |
1636 | drbd_reconfig_done(mdev); | 2142 | out: |
2143 | drbd_adm_finish(info, retcode); | ||
1637 | return 0; | 2144 | return 0; |
1638 | } | 2145 | } |
1639 | 2146 | ||
1640 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2147 | static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) |
1641 | struct drbd_nl_cfg_reply *reply) | ||
1642 | { | 2148 | { |
1643 | int retcode; | 2149 | enum drbd_state_rv rv; |
1644 | struct disconnect dc; | ||
1645 | |||
1646 | memset(&dc, 0, sizeof(struct disconnect)); | ||
1647 | if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) { | ||
1648 | retcode = ERR_MANDATORY_TAG; | ||
1649 | goto fail; | ||
1650 | } | ||
1651 | |||
1652 | if (dc.force) { | ||
1653 | spin_lock_irq(&mdev->req_lock); | ||
1654 | if (mdev->state.conn >= C_WF_CONNECTION) | ||
1655 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL); | ||
1656 | spin_unlock_irq(&mdev->req_lock); | ||
1657 | goto done; | ||
1658 | } | ||
1659 | 2150 | ||
1660 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | 2151 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
2152 | force ? CS_HARD : 0); | ||
1661 | 2153 | ||
1662 | if (retcode == SS_NOTHING_TO_DO) | 2154 | switch (rv) { |
1663 | goto done; | 2155 | case SS_NOTHING_TO_DO: |
1664 | else if (retcode == SS_ALREADY_STANDALONE) | 2156 | break; |
1665 | goto done; | 2157 | case SS_ALREADY_STANDALONE: |
1666 | else if (retcode == SS_PRIMARY_NOP) { | 2158 | return SS_SUCCESS; |
1667 | /* Our statche checking code wants to see the peer outdated. */ | 2159 | case SS_PRIMARY_NOP: |
1668 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2160 | /* Our state checking code wants to see the peer outdated. */ |
1669 | pdsk, D_OUTDATED)); | 2161 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1670 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | 2162 | pdsk, D_OUTDATED), CS_VERBOSE); |
2163 | break; | ||
2164 | case SS_CW_FAILED_BY_PEER: | ||
1671 | /* The peer probably wants to see us outdated. */ | 2165 | /* The peer probably wants to see us outdated. */ |
1672 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2166 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1673 | disk, D_OUTDATED), | 2167 | disk, D_OUTDATED), 0); |
1674 | CS_ORDERED); | 2168 | if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { |
1675 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | 2169 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
1676 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 2170 | CS_HARD); |
1677 | retcode = SS_SUCCESS; | ||
1678 | } | 2171 | } |
2172 | break; | ||
2173 | default:; | ||
2174 | /* no special handling necessary */ | ||
2175 | } | ||
2176 | |||
2177 | if (rv >= SS_SUCCESS) { | ||
2178 | enum drbd_state_rv rv2; | ||
2179 | /* No one else can reconfigure the network while I am here. | ||
2180 | * The state handling only uses drbd_thread_stop_nowait(), | ||
2181 | * we want to really wait here until the receiver is no more. | ||
2182 | */ | ||
2183 | drbd_thread_stop(&adm_ctx.tconn->receiver); | ||
2184 | |||
2185 | /* Race breaker. This additional state change request may be | ||
2186 | * necessary, if this was a forced disconnect during a receiver | ||
2187 | * restart. We may have "killed" the receiver thread just | ||
2188 | * after drbdd_init() returned. Typically, we should be | ||
2189 | * C_STANDALONE already, now, and this becomes a no-op. | ||
2190 | */ | ||
2191 | rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), | ||
2192 | CS_VERBOSE | CS_HARD); | ||
2193 | if (rv2 < SS_SUCCESS) | ||
2194 | conn_err(tconn, | ||
2195 | "unexpected rv2=%d in conn_try_disconnect()\n", | ||
2196 | rv2); | ||
1679 | } | 2197 | } |
2198 | return rv; | ||
2199 | } | ||
1680 | 2200 | ||
1681 | if (retcode < SS_SUCCESS) | 2201 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) |
1682 | goto fail; | 2202 | { |
2203 | struct disconnect_parms parms; | ||
2204 | struct drbd_tconn *tconn; | ||
2205 | enum drbd_state_rv rv; | ||
2206 | enum drbd_ret_code retcode; | ||
2207 | int err; | ||
1683 | 2208 | ||
1684 | if (wait_event_interruptible(mdev->state_wait, | 2209 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); |
1685 | mdev->state.conn != C_DISCONNECTING)) { | 2210 | if (!adm_ctx.reply_skb) |
1686 | /* Do not test for mdev->state.conn == C_STANDALONE, since | 2211 | return retcode; |
1687 | someone else might connect us in the mean time! */ | 2212 | if (retcode != NO_ERROR) |
1688 | retcode = ERR_INTR; | ||
1689 | goto fail; | 2213 | goto fail; |
2214 | |||
2215 | tconn = adm_ctx.tconn; | ||
2216 | memset(&parms, 0, sizeof(parms)); | ||
2217 | if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { | ||
2218 | err = disconnect_parms_from_attrs(&parms, info); | ||
2219 | if (err) { | ||
2220 | retcode = ERR_MANDATORY_TAG; | ||
2221 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2222 | goto fail; | ||
2223 | } | ||
1690 | } | 2224 | } |
1691 | 2225 | ||
1692 | done: | 2226 | rv = conn_try_disconnect(tconn, parms.force_disconnect); |
1693 | retcode = NO_ERROR; | 2227 | if (rv < SS_SUCCESS) |
2228 | retcode = rv; /* FIXME: Type mismatch. */ | ||
2229 | else | ||
2230 | retcode = NO_ERROR; | ||
1694 | fail: | 2231 | fail: |
1695 | drbd_md_sync(mdev); | 2232 | drbd_adm_finish(info, retcode); |
1696 | reply->ret_code = retcode; | ||
1697 | return 0; | 2233 | return 0; |
1698 | } | 2234 | } |
1699 | 2235 | ||
@@ -1705,7 +2241,7 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1705 | if (mdev->state.role != mdev->state.peer) | 2241 | if (mdev->state.role != mdev->state.peer) |
1706 | iass = (mdev->state.role == R_PRIMARY); | 2242 | iass = (mdev->state.role == R_PRIMARY); |
1707 | else | 2243 | else |
1708 | iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); | 2244 | iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
1709 | 2245 | ||
1710 | if (iass) | 2246 | if (iass) |
1711 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 2247 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
@@ -1713,20 +2249,34 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1713 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | 2249 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); |
1714 | } | 2250 | } |
1715 | 2251 | ||
1716 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2252 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) |
1717 | struct drbd_nl_cfg_reply *reply) | ||
1718 | { | 2253 | { |
1719 | struct resize rs; | 2254 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; |
1720 | int retcode = NO_ERROR; | 2255 | struct resize_parms rs; |
2256 | struct drbd_conf *mdev; | ||
2257 | enum drbd_ret_code retcode; | ||
1721 | enum determine_dev_size dd; | 2258 | enum determine_dev_size dd; |
1722 | enum dds_flags ddsf; | 2259 | enum dds_flags ddsf; |
2260 | sector_t u_size; | ||
2261 | int err; | ||
1723 | 2262 | ||
1724 | memset(&rs, 0, sizeof(struct resize)); | 2263 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1725 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | 2264 | if (!adm_ctx.reply_skb) |
1726 | retcode = ERR_MANDATORY_TAG; | 2265 | return retcode; |
2266 | if (retcode != NO_ERROR) | ||
1727 | goto fail; | 2267 | goto fail; |
2268 | |||
2269 | memset(&rs, 0, sizeof(struct resize_parms)); | ||
2270 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { | ||
2271 | err = resize_parms_from_attrs(&rs, info); | ||
2272 | if (err) { | ||
2273 | retcode = ERR_MANDATORY_TAG; | ||
2274 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2275 | goto fail; | ||
2276 | } | ||
1728 | } | 2277 | } |
1729 | 2278 | ||
2279 | mdev = adm_ctx.mdev; | ||
1730 | if (mdev->state.conn > C_CONNECTED) { | 2280 | if (mdev->state.conn > C_CONNECTED) { |
1731 | retcode = ERR_RESIZE_RESYNC; | 2281 | retcode = ERR_RESIZE_RESYNC; |
1732 | goto fail; | 2282 | goto fail; |
@@ -1743,15 +2293,36 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1743 | goto fail; | 2293 | goto fail; |
1744 | } | 2294 | } |
1745 | 2295 | ||
1746 | if (rs.no_resync && mdev->agreed_pro_version < 93) { | 2296 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { |
1747 | retcode = ERR_NEED_APV_93; | 2297 | retcode = ERR_NEED_APV_93; |
1748 | goto fail_ldev; | 2298 | goto fail_ldev; |
1749 | } | 2299 | } |
1750 | 2300 | ||
2301 | rcu_read_lock(); | ||
2302 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
2303 | rcu_read_unlock(); | ||
2304 | if (u_size != (sector_t)rs.resize_size) { | ||
2305 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
2306 | if (!new_disk_conf) { | ||
2307 | retcode = ERR_NOMEM; | ||
2308 | goto fail_ldev; | ||
2309 | } | ||
2310 | } | ||
2311 | |||
1751 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | 2312 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) |
1752 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 2313 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
1753 | 2314 | ||
1754 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | 2315 | if (new_disk_conf) { |
2316 | mutex_lock(&mdev->tconn->conf_update); | ||
2317 | old_disk_conf = mdev->ldev->disk_conf; | ||
2318 | *new_disk_conf = *old_disk_conf; | ||
2319 | new_disk_conf->disk_size = (sector_t)rs.resize_size; | ||
2320 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
2321 | mutex_unlock(&mdev->tconn->conf_update); | ||
2322 | synchronize_rcu(); | ||
2323 | kfree(old_disk_conf); | ||
2324 | } | ||
2325 | |||
1755 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); | 2326 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
1756 | dd = drbd_determine_dev_size(mdev, ddsf); | 2327 | dd = drbd_determine_dev_size(mdev, ddsf); |
1757 | drbd_md_sync(mdev); | 2328 | drbd_md_sync(mdev); |
@@ -1770,7 +2341,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1770 | } | 2341 | } |
1771 | 2342 | ||
1772 | fail: | 2343 | fail: |
1773 | reply->ret_code = retcode; | 2344 | drbd_adm_finish(info, retcode); |
1774 | return 0; | 2345 | return 0; |
1775 | 2346 | ||
1776 | fail_ldev: | 2347 | fail_ldev: |
@@ -1778,204 +2349,55 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1778 | goto fail; | 2349 | goto fail; |
1779 | } | 2350 | } |
1780 | 2351 | ||
1781 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2352 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) |
1782 | struct drbd_nl_cfg_reply *reply) | ||
1783 | { | 2353 | { |
1784 | int retcode = NO_ERROR; | 2354 | enum drbd_ret_code retcode; |
2355 | struct drbd_tconn *tconn; | ||
2356 | struct res_opts res_opts; | ||
1785 | int err; | 2357 | int err; |
1786 | int ovr; /* online verify running */ | ||
1787 | int rsr; /* re-sync running */ | ||
1788 | struct crypto_hash *verify_tfm = NULL; | ||
1789 | struct crypto_hash *csums_tfm = NULL; | ||
1790 | struct syncer_conf sc; | ||
1791 | cpumask_var_t new_cpu_mask; | ||
1792 | int *rs_plan_s = NULL; | ||
1793 | int fifo_size; | ||
1794 | |||
1795 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1796 | retcode = ERR_NOMEM; | ||
1797 | goto fail; | ||
1798 | } | ||
1799 | 2358 | ||
1800 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | 2359 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
1801 | memset(&sc, 0, sizeof(struct syncer_conf)); | 2360 | if (!adm_ctx.reply_skb) |
1802 | sc.rate = DRBD_RATE_DEF; | 2361 | return retcode; |
1803 | sc.after = DRBD_AFTER_DEF; | ||
1804 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1805 | sc.on_no_data = DRBD_ON_NO_DATA_DEF; | ||
1806 | sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; | ||
1807 | sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; | ||
1808 | sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; | ||
1809 | sc.c_max_rate = DRBD_C_MAX_RATE_DEF; | ||
1810 | sc.c_min_rate = DRBD_C_MIN_RATE_DEF; | ||
1811 | } else | ||
1812 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1813 | |||
1814 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | ||
1815 | retcode = ERR_MANDATORY_TAG; | ||
1816 | goto fail; | ||
1817 | } | ||
1818 | |||
1819 | /* re-sync running */ | ||
1820 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1821 | mdev->state.conn == C_SYNC_TARGET || | ||
1822 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1823 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1824 | |||
1825 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1826 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1827 | goto fail; | ||
1828 | } | ||
1829 | |||
1830 | if (!rsr && sc.csums_alg[0]) { | ||
1831 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1832 | if (IS_ERR(csums_tfm)) { | ||
1833 | csums_tfm = NULL; | ||
1834 | retcode = ERR_CSUMS_ALG; | ||
1835 | goto fail; | ||
1836 | } | ||
1837 | |||
1838 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1839 | retcode = ERR_CSUMS_ALG_ND; | ||
1840 | goto fail; | ||
1841 | } | ||
1842 | } | ||
1843 | |||
1844 | /* online verify running */ | ||
1845 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1846 | |||
1847 | if (ovr) { | ||
1848 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1849 | retcode = ERR_VERIFY_RUNNING; | ||
1850 | goto fail; | ||
1851 | } | ||
1852 | } | ||
1853 | |||
1854 | if (!ovr && sc.verify_alg[0]) { | ||
1855 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1856 | if (IS_ERR(verify_tfm)) { | ||
1857 | verify_tfm = NULL; | ||
1858 | retcode = ERR_VERIFY_ALG; | ||
1859 | goto fail; | ||
1860 | } | ||
1861 | |||
1862 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1863 | retcode = ERR_VERIFY_ALG_ND; | ||
1864 | goto fail; | ||
1865 | } | ||
1866 | } | ||
1867 | |||
1868 | /* silently ignore cpu mask on UP kernel */ | ||
1869 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1870 | err = bitmap_parse(sc.cpu_mask, 32, | ||
1871 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1872 | if (err) { | ||
1873 | dev_warn(DEV, "bitmap_parse() failed with %d\n", err); | ||
1874 | retcode = ERR_CPU_MASK_PARSE; | ||
1875 | goto fail; | ||
1876 | } | ||
1877 | } | ||
1878 | |||
1879 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1880 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1881 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1882 | if (sc.al_extents > AL_MAX) { | ||
1883 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1884 | sc.al_extents = AL_MAX; | ||
1885 | } | ||
1886 | #undef AL_MAX | ||
1887 | |||
1888 | /* to avoid spurious errors when configuring minors before configuring | ||
1889 | * the minors they depend on: if necessary, first create the minor we | ||
1890 | * depend on */ | ||
1891 | if (sc.after >= 0) | ||
1892 | ensure_mdev(sc.after, 1); | ||
1893 | |||
1894 | /* most sanity checks done, try to assign the new sync-after | ||
1895 | * dependency. need to hold the global lock in there, | ||
1896 | * to avoid a race in the dependency loop check. */ | ||
1897 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1898 | if (retcode != NO_ERROR) | 2362 | if (retcode != NO_ERROR) |
1899 | goto fail; | 2363 | goto fail; |
2364 | tconn = adm_ctx.tconn; | ||
1900 | 2365 | ||
1901 | fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 2366 | res_opts = tconn->res_opts; |
1902 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 2367 | if (should_set_defaults(info)) |
1903 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); | 2368 | set_res_opts_defaults(&res_opts); |
1904 | if (!rs_plan_s) { | ||
1905 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | ||
1906 | retcode = ERR_NOMEM; | ||
1907 | goto fail; | ||
1908 | } | ||
1909 | } | ||
1910 | 2369 | ||
1911 | /* ok, assign the rest of it as well. | 2370 | err = res_opts_from_attrs(&res_opts, info); |
1912 | * lock against receive_SyncParam() */ | 2371 | if (err && err != -ENOMSG) { |
1913 | spin_lock(&mdev->peer_seq_lock); | 2372 | retcode = ERR_MANDATORY_TAG; |
1914 | mdev->sync_conf = sc; | 2373 | drbd_msg_put_info(from_attrs_err_to_txt(err)); |
1915 | 2374 | goto fail; | |
1916 | if (!rsr) { | ||
1917 | crypto_free_hash(mdev->csums_tfm); | ||
1918 | mdev->csums_tfm = csums_tfm; | ||
1919 | csums_tfm = NULL; | ||
1920 | } | ||
1921 | |||
1922 | if (!ovr) { | ||
1923 | crypto_free_hash(mdev->verify_tfm); | ||
1924 | mdev->verify_tfm = verify_tfm; | ||
1925 | verify_tfm = NULL; | ||
1926 | } | ||
1927 | |||
1928 | if (fifo_size != mdev->rs_plan_s.size) { | ||
1929 | kfree(mdev->rs_plan_s.values); | ||
1930 | mdev->rs_plan_s.values = rs_plan_s; | ||
1931 | mdev->rs_plan_s.size = fifo_size; | ||
1932 | mdev->rs_planed = 0; | ||
1933 | rs_plan_s = NULL; | ||
1934 | } | 2375 | } |
1935 | 2376 | ||
1936 | spin_unlock(&mdev->peer_seq_lock); | 2377 | err = set_resource_options(tconn, &res_opts); |
1937 | 2378 | if (err) { | |
1938 | if (get_ldev(mdev)) { | 2379 | retcode = ERR_INVALID_REQUEST; |
1939 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 2380 | if (err == -ENOMEM) |
1940 | drbd_al_shrink(mdev); | ||
1941 | err = drbd_check_al_size(mdev); | ||
1942 | lc_unlock(mdev->act_log); | ||
1943 | wake_up(&mdev->al_wait); | ||
1944 | |||
1945 | put_ldev(mdev); | ||
1946 | drbd_md_sync(mdev); | ||
1947 | |||
1948 | if (err) { | ||
1949 | retcode = ERR_NOMEM; | 2381 | retcode = ERR_NOMEM; |
1950 | goto fail; | ||
1951 | } | ||
1952 | } | 2382 | } |
1953 | 2383 | ||
1954 | if (mdev->state.conn >= C_CONNECTED) | ||
1955 | drbd_send_sync_param(mdev, &sc); | ||
1956 | |||
1957 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1958 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1959 | drbd_calc_cpu_mask(mdev); | ||
1960 | mdev->receiver.reset_cpu_mask = 1; | ||
1961 | mdev->asender.reset_cpu_mask = 1; | ||
1962 | mdev->worker.reset_cpu_mask = 1; | ||
1963 | } | ||
1964 | |||
1965 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1966 | fail: | 2384 | fail: |
1967 | kfree(rs_plan_s); | 2385 | drbd_adm_finish(info, retcode); |
1968 | free_cpumask_var(new_cpu_mask); | ||
1969 | crypto_free_hash(csums_tfm); | ||
1970 | crypto_free_hash(verify_tfm); | ||
1971 | reply->ret_code = retcode; | ||
1972 | return 0; | 2386 | return 0; |
1973 | } | 2387 | } |
1974 | 2388 | ||
1975 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2389 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) |
1976 | struct drbd_nl_cfg_reply *reply) | ||
1977 | { | 2390 | { |
1978 | int retcode; | 2391 | struct drbd_conf *mdev; |
2392 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2393 | |||
2394 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2395 | if (!adm_ctx.reply_skb) | ||
2396 | return retcode; | ||
2397 | if (retcode != NO_ERROR) | ||
2398 | goto out; | ||
2399 | |||
2400 | mdev = adm_ctx.mdev; | ||
1979 | 2401 | ||
1980 | /* If there is still bitmap IO pending, probably because of a previous | 2402 | /* If there is still bitmap IO pending, probably because of a previous |
1981 | * resync just being finished, wait for it before requesting a new resync. | 2403 | * resync just being finished, wait for it before requesting a new resync. |
@@ -1990,10 +2412,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
1990 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2412 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); |
1991 | 2413 | ||
1992 | while (retcode == SS_NEED_CONNECTION) { | 2414 | while (retcode == SS_NEED_CONNECTION) { |
1993 | spin_lock_irq(&mdev->req_lock); | 2415 | spin_lock_irq(&mdev->tconn->req_lock); |
1994 | if (mdev->state.conn < C_CONNECTED) | 2416 | if (mdev->state.conn < C_CONNECTED) |
1995 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | 2417 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); |
1996 | spin_unlock_irq(&mdev->req_lock); | 2418 | spin_unlock_irq(&mdev->tconn->req_lock); |
1997 | 2419 | ||
1998 | if (retcode != SS_NEED_CONNECTION) | 2420 | if (retcode != SS_NEED_CONNECTION) |
1999 | break; | 2421 | break; |
@@ -2002,7 +2424,25 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2002 | } | 2424 | } |
2003 | drbd_resume_io(mdev); | 2425 | drbd_resume_io(mdev); |
2004 | 2426 | ||
2005 | reply->ret_code = retcode; | 2427 | out: |
2428 | drbd_adm_finish(info, retcode); | ||
2429 | return 0; | ||
2430 | } | ||
2431 | |||
2432 | static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, | ||
2433 | union drbd_state mask, union drbd_state val) | ||
2434 | { | ||
2435 | enum drbd_ret_code retcode; | ||
2436 | |||
2437 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2438 | if (!adm_ctx.reply_skb) | ||
2439 | return retcode; | ||
2440 | if (retcode != NO_ERROR) | ||
2441 | goto out; | ||
2442 | |||
2443 | retcode = drbd_request_state(adm_ctx.mdev, mask, val); | ||
2444 | out: | ||
2445 | drbd_adm_finish(info, retcode); | ||
2006 | return 0; | 2446 | return 0; |
2007 | } | 2447 | } |
2008 | 2448 | ||
@@ -2015,10 +2455,18 @@ static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) | |||
2015 | return rv; | 2455 | return rv; |
2016 | } | 2456 | } |
2017 | 2457 | ||
2018 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2458 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) |
2019 | struct drbd_nl_cfg_reply *reply) | ||
2020 | { | 2459 | { |
2021 | int retcode; | 2460 | int retcode; /* drbd_ret_code, drbd_state_rv */ |
2461 | struct drbd_conf *mdev; | ||
2462 | |||
2463 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2464 | if (!adm_ctx.reply_skb) | ||
2465 | return retcode; | ||
2466 | if (retcode != NO_ERROR) | ||
2467 | goto out; | ||
2468 | |||
2469 | mdev = adm_ctx.mdev; | ||
2022 | 2470 | ||
2023 | /* If there is still bitmap IO pending, probably because of a previous | 2471 | /* If there is still bitmap IO pending, probably because of a previous |
2024 | * resync just being finished, wait for it before requesting a new resync. | 2472 | * resync just being finished, wait for it before requesting a new resync. |
@@ -2028,16 +2476,15 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re | |||
2028 | drbd_flush_workqueue(mdev); | 2476 | drbd_flush_workqueue(mdev); |
2029 | 2477 | ||
2030 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); | 2478 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); |
2031 | |||
2032 | if (retcode < SS_SUCCESS) { | 2479 | if (retcode < SS_SUCCESS) { |
2033 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { | 2480 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { |
2034 | /* The peer will get a resync upon connect anyways. Just make that | 2481 | /* The peer will get a resync upon connect anyways. |
2035 | into a full resync. */ | 2482 | * Just make that into a full resync. */ |
2036 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); | 2483 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); |
2037 | if (retcode >= SS_SUCCESS) { | 2484 | if (retcode >= SS_SUCCESS) { |
2038 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, | 2485 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, |
2039 | "set_n_write from invalidate_peer", | 2486 | "set_n_write from invalidate_peer", |
2040 | BM_LOCKED_SET_ALLOWED)) | 2487 | BM_LOCKED_SET_ALLOWED)) |
2041 | retcode = ERR_IO_MD_DISK; | 2488 | retcode = ERR_IO_MD_DISK; |
2042 | } | 2489 | } |
2043 | } else | 2490 | } else |
@@ -2045,30 +2492,41 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re | |||
2045 | } | 2492 | } |
2046 | drbd_resume_io(mdev); | 2493 | drbd_resume_io(mdev); |
2047 | 2494 | ||
2048 | reply->ret_code = retcode; | 2495 | out: |
2496 | drbd_adm_finish(info, retcode); | ||
2049 | return 0; | 2497 | return 0; |
2050 | } | 2498 | } |
2051 | 2499 | ||
2052 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2500 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) |
2053 | struct drbd_nl_cfg_reply *reply) | ||
2054 | { | 2501 | { |
2055 | int retcode = NO_ERROR; | 2502 | enum drbd_ret_code retcode; |
2056 | 2503 | ||
2057 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | 2504 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2058 | retcode = ERR_PAUSE_IS_SET; | 2505 | if (!adm_ctx.reply_skb) |
2506 | return retcode; | ||
2507 | if (retcode != NO_ERROR) | ||
2508 | goto out; | ||
2059 | 2509 | ||
2060 | reply->ret_code = retcode; | 2510 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) |
2511 | retcode = ERR_PAUSE_IS_SET; | ||
2512 | out: | ||
2513 | drbd_adm_finish(info, retcode); | ||
2061 | return 0; | 2514 | return 0; |
2062 | } | 2515 | } |
2063 | 2516 | ||
2064 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2517 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) |
2065 | struct drbd_nl_cfg_reply *reply) | ||
2066 | { | 2518 | { |
2067 | int retcode = NO_ERROR; | 2519 | union drbd_dev_state s; |
2068 | union drbd_state s; | 2520 | enum drbd_ret_code retcode; |
2069 | 2521 | ||
2070 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { | 2522 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2071 | s = mdev->state; | 2523 | if (!adm_ctx.reply_skb) |
2524 | return retcode; | ||
2525 | if (retcode != NO_ERROR) | ||
2526 | goto out; | ||
2527 | |||
2528 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { | ||
2529 | s = adm_ctx.mdev->state; | ||
2072 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { | 2530 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { |
2073 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : | 2531 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : |
2074 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; | 2532 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; |
@@ -2077,172 +2535,482 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n | |||
2077 | } | 2535 | } |
2078 | } | 2536 | } |
2079 | 2537 | ||
2080 | reply->ret_code = retcode; | 2538 | out: |
2539 | drbd_adm_finish(info, retcode); | ||
2081 | return 0; | 2540 | return 0; |
2082 | } | 2541 | } |
2083 | 2542 | ||
2084 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2543 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) |
2085 | struct drbd_nl_cfg_reply *reply) | ||
2086 | { | 2544 | { |
2087 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | 2545 | return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); |
2088 | |||
2089 | return 0; | ||
2090 | } | 2546 | } |
2091 | 2547 | ||
2092 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2548 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) |
2093 | struct drbd_nl_cfg_reply *reply) | ||
2094 | { | 2549 | { |
2550 | struct drbd_conf *mdev; | ||
2551 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2552 | |||
2553 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2554 | if (!adm_ctx.reply_skb) | ||
2555 | return retcode; | ||
2556 | if (retcode != NO_ERROR) | ||
2557 | goto out; | ||
2558 | |||
2559 | mdev = adm_ctx.mdev; | ||
2095 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | 2560 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { |
2096 | drbd_uuid_new_current(mdev); | 2561 | drbd_uuid_new_current(mdev); |
2097 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 2562 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
2098 | } | 2563 | } |
2099 | drbd_suspend_io(mdev); | 2564 | drbd_suspend_io(mdev); |
2100 | reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); | 2565 | retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); |
2101 | if (reply->ret_code == SS_SUCCESS) { | 2566 | if (retcode == SS_SUCCESS) { |
2102 | if (mdev->state.conn < C_CONNECTED) | 2567 | if (mdev->state.conn < C_CONNECTED) |
2103 | tl_clear(mdev); | 2568 | tl_clear(mdev->tconn); |
2104 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) | 2569 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) |
2105 | tl_restart(mdev, fail_frozen_disk_io); | 2570 | tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); |
2106 | } | 2571 | } |
2107 | drbd_resume_io(mdev); | 2572 | drbd_resume_io(mdev); |
2108 | 2573 | ||
2574 | out: | ||
2575 | drbd_adm_finish(info, retcode); | ||
2109 | return 0; | 2576 | return 0; |
2110 | } | 2577 | } |
2111 | 2578 | ||
2112 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2579 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) |
2113 | struct drbd_nl_cfg_reply *reply) | ||
2114 | { | 2580 | { |
2115 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | 2581 | return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); |
2116 | return 0; | ||
2117 | } | 2582 | } |
2118 | 2583 | ||
2119 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2584 | int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) |
2120 | struct drbd_nl_cfg_reply *reply) | ||
2121 | { | 2585 | { |
2122 | unsigned short *tl; | 2586 | struct nlattr *nla; |
2587 | nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); | ||
2588 | if (!nla) | ||
2589 | goto nla_put_failure; | ||
2590 | if (vnr != VOLUME_UNSPECIFIED && | ||
2591 | nla_put_u32(skb, T_ctx_volume, vnr)) | ||
2592 | goto nla_put_failure; | ||
2593 | if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) | ||
2594 | goto nla_put_failure; | ||
2595 | if (tconn->my_addr_len && | ||
2596 | nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) | ||
2597 | goto nla_put_failure; | ||
2598 | if (tconn->peer_addr_len && | ||
2599 | nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) | ||
2600 | goto nla_put_failure; | ||
2601 | nla_nest_end(skb, nla); | ||
2602 | return 0; | ||
2123 | 2603 | ||
2124 | tl = reply->tag_list; | 2604 | nla_put_failure: |
2605 | if (nla) | ||
2606 | nla_nest_cancel(skb, nla); | ||
2607 | return -EMSGSIZE; | ||
2608 | } | ||
2125 | 2609 | ||
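nla_put_drbd_cfg_context() above packs the resource name, volume number and both endpoint addresses into one nested DRBD_NLA_CFG_CONTEXT attribute, using nla_nest_start()/nla_nest_end() and cancelling the whole nest on overflow. For orientation, a sketch of how a receiver could unpack such a nest with the generic netlink helpers; it leans on the same drbd_cfg_context_nl_policy and __nla_type() helpers that the dump path further down uses, and the function name is invented.

	/* Illustrative consumer, not part of this patch (assumes <net/netlink.h>). */
	static int example_parse_cfg_context(struct nlattr *ctx)
	{
		struct nlattr *tb[ARRAY_SIZE(drbd_cfg_context_nl_policy)];
		int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
		int err;

		err = nla_parse_nested(tb, maxtype, ctx, drbd_cfg_context_nl_policy);
		if (err)
			return err;

		if (tb[__nla_type(T_ctx_resource_name)])
			pr_info("resource: %s\n",
				(char *)nla_data(tb[__nla_type(T_ctx_resource_name)]));
		if (tb[__nla_type(T_ctx_volume)])
			pr_info("volume: %u\n",
				nla_get_u32(tb[__nla_type(T_ctx_volume)]));
		return 0;
	}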
2126 | if (get_ldev(mdev)) { | 2610 | int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, |
2127 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | 2611 | const struct sib_info *sib) |
2128 | put_ldev(mdev); | 2612 | { |
2129 | } | 2613 | struct state_info *si = NULL; /* for sizeof(si->member); */ |
2614 | struct net_conf *nc; | ||
2615 | struct nlattr *nla; | ||
2616 | int got_ldev; | ||
2617 | int err = 0; | ||
2618 | int exclude_sensitive; | ||
2619 | |||
2620 | /* If sib != NULL, this is drbd_bcast_event, which anyone can listen | ||
2621 | * to. So we better exclude_sensitive information. | ||
2622 | * | ||
2623 | * If sib == NULL, this is drbd_adm_get_status, executed synchronously | ||
2624 | * in the context of the requesting user process. Exclude sensitive | ||
2625 | * information, unless current has superuser. | ||
2626 | * | ||
2627 | * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and | ||
2628 | * relies on the current implementation of netlink_dump(), which | ||
2629 | * executes the dump callback successively from netlink_recvmsg(), | ||
2630 | * always in the context of the receiving process */ | ||
2631 | exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); | ||
2632 | |||
2633 | got_ldev = get_ldev(mdev); | ||
2634 | |||
2635 | /* We need to add connection name and volume number information still. | ||
2636 | * Minor number is in drbd_genlmsghdr. */ | ||
2637 | if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) | ||
2638 | goto nla_put_failure; | ||
2639 | |||
2640 | if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) | ||
2641 | goto nla_put_failure; | ||
2642 | |||
2643 | rcu_read_lock(); | ||
2644 | if (got_ldev) | ||
2645 | if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) | ||
2646 | goto nla_put_failure; | ||
2647 | |||
2648 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2649 | if (nc) | ||
2650 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | ||
2651 | rcu_read_unlock(); | ||
2652 | if (err) | ||
2653 | goto nla_put_failure; | ||
2654 | |||
2655 | nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); | ||
2656 | if (!nla) | ||
2657 | goto nla_put_failure; | ||
2658 | if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || | ||
2659 | nla_put_u32(skb, T_current_state, mdev->state.i) || | ||
2660 | nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || | ||
2661 | nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || | ||
2662 | nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || | ||
2663 | nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || | ||
2664 | nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || | ||
2665 | nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || | ||
2666 | nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || | ||
2667 | nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || | ||
2668 | nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || | ||
2669 | nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || | ||
2670 | nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) | ||
2671 | goto nla_put_failure; | ||
2672 | |||
2673 | if (got_ldev) { | ||
2674 | int err; | ||
2130 | 2675 | ||
2131 | if (get_net_conf(mdev)) { | 2676 | spin_lock_irq(&mdev->ldev->md.uuid_lock); |
2132 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | 2677 | err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); |
2133 | put_net_conf(mdev); | 2678 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); |
2679 | |||
2680 | if (err) | ||
2681 | goto nla_put_failure; | ||
2682 | |||
2683 | if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || | ||
2684 | nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || | ||
2685 | nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) | ||
2686 | goto nla_put_failure; | ||
2687 | if (C_SYNC_SOURCE <= mdev->state.conn && | ||
2688 | C_PAUSED_SYNC_T >= mdev->state.conn) { | ||
2689 | if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || | ||
2690 | nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) | ||
2691 | goto nla_put_failure; | ||
2692 | } | ||
2134 | } | 2693 | } |
2135 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
2136 | 2694 | ||
2137 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2695 | if (sib) { |
2696 | switch(sib->sib_reason) { | ||
2697 | case SIB_SYNC_PROGRESS: | ||
2698 | case SIB_GET_STATUS_REPLY: | ||
2699 | break; | ||
2700 | case SIB_STATE_CHANGE: | ||
2701 | if (nla_put_u32(skb, T_prev_state, sib->os.i) || | ||
2702 | nla_put_u32(skb, T_new_state, sib->ns.i)) | ||
2703 | goto nla_put_failure; | ||
2704 | break; | ||
2705 | case SIB_HELPER_POST: | ||
2706 | if (nla_put_u32(skb, T_helper_exit_code, | ||
2707 | sib->helper_exit_code)) | ||
2708 | goto nla_put_failure; | ||
2709 | /* fall through */ | ||
2710 | case SIB_HELPER_PRE: | ||
2711 | if (nla_put_string(skb, T_helper, sib->helper_name)) | ||
2712 | goto nla_put_failure; | ||
2713 | break; | ||
2714 | } | ||
2715 | } | ||
2716 | nla_nest_end(skb, nla); | ||
2138 | 2717 | ||
2139 | return (int)((char *)tl - (char *)reply->tag_list); | 2718 | if (0) |
2719 | nla_put_failure: | ||
2720 | err = -EMSGSIZE; | ||
2721 | if (got_ldev) | ||
2722 | put_ldev(mdev); | ||
2723 | return err; | ||
2140 | } | 2724 | } |
2141 | 2725 | ||
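The "if (0) nla_put_failure:" construct at the end of nla_put_status_info() above is a compact kernel idiom: the label is only reachable through the goto, so on success the error assignment is skipped while both paths share the trailing put_ldev() and return. Stripped to its shape (names and the attribute type are placeholders):

	/* Illustrative shape of the "if (0) label:" idiom, not from this patch.
	 * Assumes <net/netlink.h> for nla_put_u32(). */
	static int example_fill(struct sk_buff *skb)
	{
		int err = 0;

		if (nla_put_u32(skb, 1 /* placeholder attr type */, 42))
			goto nla_put_failure;
		/* ... more nla_put_*() calls, each jumping to the label on overflow ... */

		if (0)
	nla_put_failure:
			err = -EMSGSIZE;

		/* shared cleanup for success and failure would go here */
		return err;
	}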
2142 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2726 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) |
2143 | struct drbd_nl_cfg_reply *reply) | ||
2144 | { | 2727 | { |
2145 | unsigned short *tl = reply->tag_list; | 2728 | enum drbd_ret_code retcode; |
2146 | union drbd_state s = mdev->state; | 2729 | int err; |
2147 | unsigned long rs_left; | ||
2148 | unsigned int res; | ||
2149 | 2730 | ||
2150 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | 2731 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2732 | if (!adm_ctx.reply_skb) | ||
2733 | return retcode; | ||
2734 | if (retcode != NO_ERROR) | ||
2735 | goto out; | ||
2151 | 2736 | ||
2152 | /* no local ref, no bitmap, no syncer progress. */ | 2737 | err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); |
2153 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | 2738 | if (err) { |
2154 | if (get_ldev(mdev)) { | 2739 | nlmsg_free(adm_ctx.reply_skb); |
2155 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 2740 | return err; |
2156 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2157 | put_ldev(mdev); | ||
2158 | } | ||
2159 | } | 2741 | } |
2160 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2742 | out: |
2161 | 2743 | drbd_adm_finish(info, retcode); | |
2162 | return (int)((char *)tl - (char *)reply->tag_list); | 2744 | return 0; |
2163 | } | 2745 | } |
2164 | 2746 | ||
2165 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2747 | int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) |
2166 | struct drbd_nl_cfg_reply *reply) | ||
2167 | { | 2748 | { |
2168 | unsigned short *tl; | 2749 | struct drbd_conf *mdev; |
2169 | 2750 | struct drbd_genlmsghdr *dh; | |
2170 | tl = reply->tag_list; | 2751 | struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; |
2752 | struct drbd_tconn *tconn = NULL; | ||
2753 | struct drbd_tconn *tmp; | ||
2754 | unsigned volume = cb->args[1]; | ||
2755 | |||
2756 | /* Open coded, deferred, iteration: | ||
2757 | * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2758 | * idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
2759 | * ... | ||
2760 | * } | ||
2761 | * } | ||
2762 | * where tconn is cb->args[0]; | ||
2763 | * and i is cb->args[1]; | ||
2764 | * | ||
2765 | * cb->args[2] indicates if we shall loop over all resources, | ||
2766 | * or just dump all volumes of a single resource. | ||
2767 | * | ||
2768 | * This may miss entries inserted after this dump started, | ||
2769 | * or entries deleted before they are reached. | ||
2770 | * | ||
2771 | * We need to make sure the mdev won't disappear while | ||
2772 | * we are looking at it, and revalidate our iterators | ||
2773 | * on each iteration. | ||
2774 | */ | ||
2171 | 2775 | ||
2172 | if (get_ldev(mdev)) { | 2776 | /* synchronize with conn_create()/conn_destroy() */ |
2173 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | 2777 | rcu_read_lock(); |
2174 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | 2778 | /* revalidate iterator position */ |
2175 | put_ldev(mdev); | 2779 | list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { |
2780 | if (pos == NULL) { | ||
2781 | /* first iteration */ | ||
2782 | pos = tmp; | ||
2783 | tconn = pos; | ||
2784 | break; | ||
2785 | } | ||
2786 | if (tmp == pos) { | ||
2787 | tconn = pos; | ||
2788 | break; | ||
2789 | } | ||
2176 | } | 2790 | } |
2177 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2791 | if (tconn) { |
2792 | next_tconn: | ||
2793 | mdev = idr_get_next(&tconn->volumes, &volume); | ||
2794 | if (!mdev) { | ||
2795 | /* No more volumes to dump on this tconn. | ||
2796 | * Advance tconn iterator. */ | ||
2797 | pos = list_entry_rcu(tconn->all_tconn.next, | ||
2798 | struct drbd_tconn, all_tconn); | ||
2799 | /* Did we dump any volume on this tconn yet? */ | ||
2800 | if (volume != 0) { | ||
2801 | /* If we reached the end of the list, | ||
2802 | * or only a single resource dump was requested, | ||
2803 | * we are done. */ | ||
2804 | if (&pos->all_tconn == &drbd_tconns || cb->args[2]) | ||
2805 | goto out; | ||
2806 | volume = 0; | ||
2807 | tconn = pos; | ||
2808 | goto next_tconn; | ||
2809 | } | ||
2810 | } | ||
2811 | |||
2812 | dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, | ||
2813 | cb->nlh->nlmsg_seq, &drbd_genl_family, | ||
2814 | NLM_F_MULTI, DRBD_ADM_GET_STATUS); | ||
2815 | if (!dh) | ||
2816 | goto out; | ||
2817 | |||
2818 | if (!mdev) { | ||
2819 | /* This is a tconn without a single volume. | ||
2820 | * Surprisingly enough, it may have a network | ||
2821 | * configuration. */ | ||
2822 | struct net_conf *nc; | ||
2823 | dh->minor = -1U; | ||
2824 | dh->ret_code = NO_ERROR; | ||
2825 | if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) | ||
2826 | goto cancel; | ||
2827 | nc = rcu_dereference(tconn->net_conf); | ||
2828 | if (nc && net_conf_to_skb(skb, nc, 1) != 0) | ||
2829 | goto cancel; | ||
2830 | goto done; | ||
2831 | } | ||
2832 | |||
2833 | D_ASSERT(mdev->vnr == volume); | ||
2834 | D_ASSERT(mdev->tconn == tconn); | ||
2835 | |||
2836 | dh->minor = mdev_to_minor(mdev); | ||
2837 | dh->ret_code = NO_ERROR; | ||
2178 | 2838 | ||
2179 | return (int)((char *)tl - (char *)reply->tag_list); | 2839 | if (nla_put_status_info(skb, mdev, NULL)) { |
2840 | cancel: | ||
2841 | genlmsg_cancel(skb, dh); | ||
2842 | goto out; | ||
2843 | } | ||
2844 | done: | ||
2845 | genlmsg_end(skb, dh); | ||
2846 | } | ||
2847 | |||
2848 | out: | ||
2849 | rcu_read_unlock(); | ||
2850 | /* where to start the next iteration */ | ||
2851 | cb->args[0] = (long)pos; | ||
2852 | cb->args[1] = (pos == tconn) ? volume + 1 : 0; | ||
2853 | |||
2854 | /* No more tconns/volumes/minors found results in an empty skb. | ||
2855 | * Which will terminate the dump. */ | ||
2856 | return skb->len; | ||
2180 | } | 2857 | } |
2181 | 2858 | ||
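get_one_status() above is a netlink dump callback: netlink_recvmsg() invokes it repeatedly until it returns an empty skb, and the only state that survives between invocations is cb->args[], which is why the current tconn pointer and volume index are stashed there before returning. A generic sketch of that resumption pattern, reduced to a plain counter; the item count and per-item payload are placeholders, only the genl family and command are borrowed from this patch.

	/* Illustrative dump callback, not from this patch: resume from the
	 * index saved in cb->args[0] by the previous invocation. */
	static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
	{
		unsigned long i;

		for (i = cb->args[0]; i < 16 /* placeholder item count */; i++) {
			void *dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
					       cb->nlh->nlmsg_seq, &drbd_genl_family,
					       NLM_F_MULTI, DRBD_ADM_GET_STATUS);
			if (!dh)
				break;	/* skb full; next call resumes at i */
			/* ... fill in per-item attributes here ... */
			genlmsg_end(skb, dh);
		}
		cb->args[0] = i;	/* where the next invocation starts */
		return skb->len;	/* an empty skb (0) terminates the dump */
	}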
2182 | /** | 2859 | /* |
2183 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | 2860 | * Request status of all resources, or of all volumes within a single resource. |
2184 | * @mdev: DRBD device. | 2861 | * |
2185 | * @nlp: Netlink/connector packet from drbdsetup | 2862 | * This is a dump, as the answer may not fit in a single reply skb otherwise. |
2186 | * @reply: Reply packet for drbdsetup | 2863 | * Which means we cannot use the family->attrbuf or other such members, because |
2864 | * dump is NOT protected by the genl_lock(). During dump, we only have access | ||
2865 | * to the incoming skb, and need to opencode "parsing" of the nlattr payload. | ||
2866 | * | ||
2867 | * Once things are setup properly, we call into get_one_status(). | ||
2187 | */ | 2868 | */ |
2188 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2869 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) |
2189 | struct drbd_nl_cfg_reply *reply) | ||
2190 | { | 2870 | { |
2191 | unsigned short *tl; | 2871 | const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; |
2192 | char rv; | 2872 | struct nlattr *nla; |
2873 | const char *resource_name; | ||
2874 | struct drbd_tconn *tconn; | ||
2875 | int maxtype; | ||
2876 | |||
2877 | /* Is this a followup call? */ | ||
2878 | if (cb->args[0]) { | ||
2879 | /* ... of a single resource dump, | ||
2880 | * and the resource iterator has been advanced already? */ | ||
2881 | if (cb->args[2] && cb->args[2] != cb->args[0]) | ||
2882 | return 0; /* DONE. */ | ||
2883 | goto dump; | ||
2884 | } | ||
2885 | |||
2886 | /* First call (from netlink_dump_start). We need to figure out | ||
2887 | * which resource(s) the user wants us to dump. */ | ||
2888 | nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), | ||
2889 | nlmsg_attrlen(cb->nlh, hdrlen), | ||
2890 | DRBD_NLA_CFG_CONTEXT); | ||
2891 | |||
2892 | /* No explicit context given. Dump all. */ | ||
2893 | if (!nla) | ||
2894 | goto dump; | ||
2895 | maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; | ||
2896 | nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); | ||
2897 | if (IS_ERR(nla)) | ||
2898 | return PTR_ERR(nla); | ||
2899 | /* context given, but no name present? */ | ||
2900 | if (!nla) | ||
2901 | return -EINVAL; | ||
2902 | resource_name = nla_data(nla); | ||
2903 | tconn = conn_get_by_name(resource_name); | ||
2904 | |||
2905 | if (!tconn) | ||
2906 | return -ENODEV; | ||
2907 | |||
2908 | kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ | ||
2909 | |||
2910 | /* prime iterators, and set "filter" mode mark: | ||
2911 | * only dump this tconn. */ | ||
2912 | cb->args[0] = (long)tconn; | ||
2913 | /* cb->args[1] = 0; passed in this way. */ | ||
2914 | cb->args[2] = (long)tconn; | ||
2915 | |||
2916 | dump: | ||
2917 | return get_one_status(skb, cb); | ||
2918 | } | ||
2193 | 2919 | ||
2194 | tl = reply->tag_list; | 2920 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) |
2921 | { | ||
2922 | enum drbd_ret_code retcode; | ||
2923 | struct timeout_parms tp; | ||
2924 | int err; | ||
2195 | 2925 | ||
2196 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | 2926 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2197 | test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; | 2927 | if (!adm_ctx.reply_skb) |
2928 | return retcode; | ||
2929 | if (retcode != NO_ERROR) | ||
2930 | goto out; | ||
2198 | 2931 | ||
2199 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | 2932 | tp.timeout_type = |
2200 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2933 | adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : |
2934 | test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED : | ||
2935 | UT_DEFAULT; | ||
2201 | 2936 | ||
2202 | return (int)((char *)tl - (char *)reply->tag_list); | 2937 | err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); |
2938 | if (err) { | ||
2939 | nlmsg_free(adm_ctx.reply_skb); | ||
2940 | return err; | ||
2941 | } | ||
2942 | out: | ||
2943 | drbd_adm_finish(info, retcode); | ||
2944 | return 0; | ||
2203 | } | 2945 | } |
2204 | 2946 | ||
2205 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2947 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) |
2206 | struct drbd_nl_cfg_reply *reply) | ||
2207 | { | 2948 | { |
2208 | /* default to resume from last known position, if possible */ | 2949 | struct drbd_conf *mdev; |
2209 | struct start_ov args = | 2950 | enum drbd_ret_code retcode; |
2210 | { .start_sector = mdev->ov_start_sector }; | 2951 | struct start_ov_parms parms; |
2211 | 2952 | ||
2212 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | 2953 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2213 | reply->ret_code = ERR_MANDATORY_TAG; | 2954 | if (!adm_ctx.reply_skb) |
2214 | return 0; | 2955 | return retcode; |
2956 | if (retcode != NO_ERROR) | ||
2957 | goto out; | ||
2958 | |||
2959 | mdev = adm_ctx.mdev; | ||
2960 | |||
2961 | /* resume from last known position, if possible */ | ||
2962 | parms.ov_start_sector = mdev->ov_start_sector; | ||
2963 | parms.ov_stop_sector = ULLONG_MAX; | ||
2964 | if (info->attrs[DRBD_NLA_START_OV_PARMS]) { | ||
2965 | int err = start_ov_parms_from_attrs(&parms, info); | ||
2966 | if (err) { | ||
2967 | retcode = ERR_MANDATORY_TAG; | ||
2968 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2969 | goto out; | ||
2970 | } | ||
2215 | } | 2971 | } |
2972 | /* w_make_ov_request expects position to be aligned */ | ||
2973 | mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); | ||
2974 | mdev->ov_stop_sector = parms.ov_stop_sector; | ||
2216 | 2975 | ||
2217 | /* If there is still bitmap IO pending, e.g. previous resync or verify | 2976 | /* If there is still bitmap IO pending, e.g. previous resync or verify |
2218 | * just being finished, wait for it before requesting a new resync. */ | 2977 | * just being finished, wait for it before requesting a new resync. */ |
2219 | drbd_suspend_io(mdev); | 2978 | drbd_suspend_io(mdev); |
2220 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2979 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2221 | 2980 | retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | |
2222 | /* w_make_ov_request expects position to be aligned */ | ||
2223 | mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; | ||
2224 | reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | ||
2225 | drbd_resume_io(mdev); | 2981 | drbd_resume_io(mdev); |
2982 | out: | ||
2983 | drbd_adm_finish(info, retcode); | ||
2226 | return 0; | 2984 | return 0; |
2227 | } | 2985 | } |
2228 | 2986 | ||
2229 | 2987 | ||
2230 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2988 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) |
2231 | struct drbd_nl_cfg_reply *reply) | ||
2232 | { | 2989 | { |
2233 | int retcode = NO_ERROR; | 2990 | struct drbd_conf *mdev; |
2991 | enum drbd_ret_code retcode; | ||
2234 | int skip_initial_sync = 0; | 2992 | int skip_initial_sync = 0; |
2235 | int err; | 2993 | int err; |
2994 | struct new_c_uuid_parms args; | ||
2236 | 2995 | ||
2237 | struct new_c_uuid args; | 2996 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2997 | if (!adm_ctx.reply_skb) | ||
2998 | return retcode; | ||
2999 | if (retcode != NO_ERROR) | ||
3000 | goto out_nolock; | ||
2238 | 3001 | ||
2239 | memset(&args, 0, sizeof(struct new_c_uuid)); | 3002 | mdev = adm_ctx.mdev; |
2240 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | 3003 | memset(&args, 0, sizeof(args)); |
2241 | reply->ret_code = ERR_MANDATORY_TAG; | 3004 | if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { |
2242 | return 0; | 3005 | err = new_c_uuid_parms_from_attrs(&args, info); |
3006 | if (err) { | ||
3007 | retcode = ERR_MANDATORY_TAG; | ||
3008 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3009 | goto out_nolock; | ||
3010 | } | ||
2243 | } | 3011 | } |
2244 | 3012 | ||
2245 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | 3013 | mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */ |
2246 | 3014 | ||
2247 | if (!get_ldev(mdev)) { | 3015 | if (!get_ldev(mdev)) { |
2248 | retcode = ERR_NO_DISK; | 3016 | retcode = ERR_NO_DISK; |
@@ -2250,7 +3018,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2250 | } | 3018 | } |
2251 | 3019 | ||
2252 | /* this is "skip initial sync", assume to be clean */ | 3020 | /* this is "skip initial sync", assume to be clean */ |
2253 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | 3021 | if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && |
2254 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | 3022 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { |
2255 | dev_info(DEV, "Preparing to skip initial sync\n"); | 3023 | dev_info(DEV, "Preparing to skip initial sync\n"); |
2256 | skip_initial_sync = 1; | 3024 | skip_initial_sync = 1; |
@@ -2273,10 +3041,10 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2273 | drbd_send_uuids_skip_initial_sync(mdev); | 3041 | drbd_send_uuids_skip_initial_sync(mdev); |
2274 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | 3042 | _drbd_uuid_set(mdev, UI_BITMAP, 0); |
2275 | drbd_print_uuids(mdev, "cleared bitmap UUID"); | 3043 | drbd_print_uuids(mdev, "cleared bitmap UUID"); |
2276 | spin_lock_irq(&mdev->req_lock); | 3044 | spin_lock_irq(&mdev->tconn->req_lock); |
2277 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | 3045 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), |
2278 | CS_VERBOSE, NULL); | 3046 | CS_VERBOSE, NULL); |
2279 | spin_unlock_irq(&mdev->req_lock); | 3047 | spin_unlock_irq(&mdev->tconn->req_lock); |
2280 | } | 3048 | } |
2281 | } | 3049 | } |
2282 | 3050 | ||
@@ -2284,416 +3052,284 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2284 | out_dec: | 3052 | out_dec: |
2285 | put_ldev(mdev); | 3053 | put_ldev(mdev); |
2286 | out: | 3054 | out: |
2287 | mutex_unlock(&mdev->state_mutex); | 3055 | mutex_unlock(mdev->state_mutex); |
2288 | 3056 | out_nolock: | |
2289 | reply->ret_code = retcode; | 3057 | drbd_adm_finish(info, retcode); |
2290 | return 0; | 3058 | return 0; |
2291 | } | 3059 | } |
2292 | 3060 | ||
2293 | struct cn_handler_struct { | 3061 | static enum drbd_ret_code |
2294 | int (*function)(struct drbd_conf *, | 3062 | drbd_check_resource_name(const char *name) |
2295 | struct drbd_nl_cfg_req *, | ||
2296 | struct drbd_nl_cfg_reply *); | ||
2297 | int reply_body_size; | ||
2298 | }; | ||
2299 | |||
2300 | static struct cn_handler_struct cnd_table[] = { | ||
2301 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
2302 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
2303 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
2304 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
2305 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
2306 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
2307 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
2308 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
2309 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
2310 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
2311 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
2312 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
2313 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
2314 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
2315 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
2316 | [ P_get_config ] = { &drbd_nl_get_config, | ||
2317 | sizeof(struct syncer_conf_tag_len_struct) + | ||
2318 | sizeof(struct disk_conf_tag_len_struct) + | ||
2319 | sizeof(struct net_conf_tag_len_struct) }, | ||
2320 | [ P_get_state ] = { &drbd_nl_get_state, | ||
2321 | sizeof(struct get_state_tag_len_struct) + | ||
2322 | sizeof(struct sync_progress_tag_len_struct) }, | ||
2323 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
2324 | sizeof(struct get_uuids_tag_len_struct) }, | ||
2325 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
2326 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
2327 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2328 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2329 | }; | ||
2330 | |||
2331 | static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) | ||
2332 | { | 3063 | { |
2333 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | 3064 | if (!name || !name[0]) { |
2334 | struct cn_handler_struct *cm; | 3065 | drbd_msg_put_info("resource name missing"); |
2335 | struct cn_msg *cn_reply; | 3066 | return ERR_MANDATORY_TAG; |
2336 | struct drbd_nl_cfg_reply *reply; | ||
2337 | struct drbd_conf *mdev; | ||
2338 | int retcode, rr; | ||
2339 | int reply_size = sizeof(struct cn_msg) | ||
2340 | + sizeof(struct drbd_nl_cfg_reply) | ||
2341 | + sizeof(short int); | ||
2342 | |||
2343 | if (!try_module_get(THIS_MODULE)) { | ||
2344 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2345 | return; | ||
2346 | } | 3067 | } |
2347 | 3068 | /* if we want to use these in sysfs/configfs/debugfs some day, | |
2348 | if (!capable(CAP_SYS_ADMIN)) { | 3069 | * we must not allow slashes */ |
2349 | retcode = ERR_PERM; | 3070 | if (strchr(name, '/')) { |
2350 | goto fail; | 3071 | drbd_msg_put_info("invalid resource name"); |
2351 | } | 3072 | return ERR_INVALID_REQUEST; |
2352 | |||
2353 | mdev = ensure_mdev(nlp->drbd_minor, | ||
2354 | (nlp->flags & DRBD_NL_CREATE_DEVICE)); | ||
2355 | if (!mdev) { | ||
2356 | retcode = ERR_MINOR_INVALID; | ||
2357 | goto fail; | ||
2358 | } | 3073 | } |
3074 | return NO_ERROR; | ||
3075 | } | ||
2359 | 3076 | ||
2360 | if (nlp->packet_type >= P_nl_after_last_packet || | 3077 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) |
2361 | nlp->packet_type == P_return_code_only) { | 3078 | { |
2362 | retcode = ERR_PACKET_NR; | 3079 | enum drbd_ret_code retcode; |
2363 | goto fail; | 3080 | struct res_opts res_opts; |
2364 | } | 3081 | int err; |
2365 | 3082 | ||
2366 | cm = cnd_table + nlp->packet_type; | 3083 | retcode = drbd_adm_prepare(skb, info, 0); |
3084 | if (!adm_ctx.reply_skb) | ||
3085 | return retcode; | ||
3086 | if (retcode != NO_ERROR) | ||
3087 | goto out; | ||
2367 | 3088 | ||
2368 | /* This may happen if packet number is 0: */ | 3089 | set_res_opts_defaults(&res_opts); |
2369 | if (cm->function == NULL) { | 3090 | err = res_opts_from_attrs(&res_opts, info); |
2370 | retcode = ERR_PACKET_NR; | 3091 | if (err && err != -ENOMSG) { |
2371 | goto fail; | 3092 | retcode = ERR_MANDATORY_TAG; |
3093 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3094 | goto out; | ||
2372 | } | 3095 | } |
2373 | 3096 | ||
2374 | reply_size += cm->reply_body_size; | 3097 | retcode = drbd_check_resource_name(adm_ctx.resource_name); |
3098 | if (retcode != NO_ERROR) | ||
3099 | goto out; | ||
2375 | 3100 | ||
2376 | /* allocation not in the IO path, cqueue thread context */ | 3101 | if (adm_ctx.tconn) { |
2377 | cn_reply = kzalloc(reply_size, GFP_KERNEL); | 3102 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { |
2378 | if (!cn_reply) { | 3103 | retcode = ERR_INVALID_REQUEST; |
2379 | retcode = ERR_NOMEM; | 3104 | drbd_msg_put_info("resource exists"); |
2380 | goto fail; | 3105 | } |
3106 | /* else: still NO_ERROR */ | ||
3107 | goto out; | ||
2381 | } | 3108 | } |
2382 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2383 | |||
2384 | reply->packet_type = | ||
2385 | cm->reply_body_size ? nlp->packet_type : P_return_code_only; | ||
2386 | reply->minor = nlp->drbd_minor; | ||
2387 | reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ | ||
2388 | /* reply->tag_list; might be modified by cm->function. */ | ||
2389 | |||
2390 | rr = cm->function(mdev, nlp, reply); | ||
2391 | |||
2392 | cn_reply->id = req->id; | ||
2393 | cn_reply->seq = req->seq; | ||
2394 | cn_reply->ack = req->ack + 1; | ||
2395 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2396 | cn_reply->flags = 0; | ||
2397 | |||
2398 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | ||
2399 | if (rr && rr != -ESRCH) | ||
2400 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2401 | 3109 | ||
2402 | kfree(cn_reply); | 3110 | if (!conn_create(adm_ctx.resource_name, &res_opts)) |
2403 | module_put(THIS_MODULE); | 3111 | retcode = ERR_NOMEM; |
2404 | return; | 3112 | out: |
2405 | fail: | 3113 | drbd_adm_finish(info, retcode); |
2406 | drbd_nl_send_reply(req, retcode); | 3114 | return 0; |
2407 | module_put(THIS_MODULE); | ||
2408 | } | 3115 | } |
2409 | 3116 | ||
2410 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | 3117 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) |
2411 | |||
2412 | static unsigned short * | ||
2413 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2414 | unsigned short len, int nul_terminated) | ||
2415 | { | 3118 | { |
2416 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | 3119 | struct drbd_genlmsghdr *dh = info->userhdr; |
2417 | len = (len < l) ? len : l; | 3120 | enum drbd_ret_code retcode; |
2418 | put_unaligned(tag, tl++); | ||
2419 | put_unaligned(len, tl++); | ||
2420 | memcpy(tl, data, len); | ||
2421 | tl = (unsigned short*)((char*)tl + len); | ||
2422 | if (nul_terminated) | ||
2423 | *((char*)tl - 1) = 0; | ||
2424 | return tl; | ||
2425 | } | ||
2426 | 3121 | ||
2427 | static unsigned short * | 3122 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2428 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | 3123 | if (!adm_ctx.reply_skb) |
2429 | { | 3124 | return retcode; |
2430 | return __tl_add_blob(tl, tag, data, len, 0); | 3125 | if (retcode != NO_ERROR) |
2431 | } | 3126 | goto out; |
2432 | 3127 | ||
2433 | static unsigned short * | 3128 | if (dh->minor > MINORMASK) { |
2434 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | 3129 | drbd_msg_put_info("requested minor out of range"); |
2435 | { | 3130 | retcode = ERR_INVALID_REQUEST; |
2436 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | 3131 | goto out; |
2437 | } | 3132 | } |
3133 | if (adm_ctx.volume > DRBD_VOLUME_MAX) { | ||
3134 | drbd_msg_put_info("requested volume id out of range"); | ||
3135 | retcode = ERR_INVALID_REQUEST; | ||
3136 | goto out; | ||
3137 | } | ||
2438 | 3138 | ||
2439 | static unsigned short * | 3139 | /* drbd_adm_prepare made sure already |
2440 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | 3140 | * that mdev->tconn and mdev->vnr match the request. */ |
2441 | { | 3141 | if (adm_ctx.mdev) { |
2442 | put_unaligned(tag, tl++); | 3142 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) |
2443 | switch(tag_type(tag)) { | 3143 | retcode = ERR_MINOR_EXISTS; |
2444 | case TT_INTEGER: | 3144 | /* else: still NO_ERROR */ |
2445 | put_unaligned(sizeof(int), tl++); | 3145 | goto out; |
2446 | put_unaligned(*(int *)val, (int *)tl); | ||
2447 | tl = (unsigned short*)((char*)tl+sizeof(int)); | ||
2448 | break; | ||
2449 | case TT_INT64: | ||
2450 | put_unaligned(sizeof(u64), tl++); | ||
2451 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2452 | tl = (unsigned short*)((char*)tl+sizeof(u64)); | ||
2453 | break; | ||
2454 | default: | ||
2455 | /* someone did something stupid. */ | ||
2456 | ; | ||
2457 | } | 3146 | } |
2458 | return tl; | 3147 | |
3148 | retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); | ||
3149 | out: | ||
3150 | drbd_adm_finish(info, retcode); | ||
3151 | return 0; | ||
2459 | } | 3152 | } |
2460 | 3153 | ||
2461 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | 3154 | static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) |
2462 | { | 3155 | { |
2463 | char buffer[sizeof(struct cn_msg)+ | 3156 | if (mdev->state.disk == D_DISKLESS && |
2464 | sizeof(struct drbd_nl_cfg_reply)+ | 3157 | /* no need to be mdev->state.conn == C_STANDALONE && |
2465 | sizeof(struct get_state_tag_len_struct)+ | 3158 | * we may want to delete a minor from a live replication group. |
2466 | sizeof(short int)]; | 3159 | */ |
2467 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | 3160 | mdev->state.role == R_SECONDARY) { |
2468 | struct drbd_nl_cfg_reply *reply = | 3161 | _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), |
2469 | (struct drbd_nl_cfg_reply *)cn_reply->data; | 3162 | CS_VERBOSE + CS_WAIT_COMPLETE); |
2470 | unsigned short *tl = reply->tag_list; | 3163 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
2471 | 3164 | idr_remove(&minors, mdev_to_minor(mdev)); | |
2472 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | 3165 | del_gendisk(mdev->vdisk); |
2473 | 3166 | synchronize_rcu(); | |
2474 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | 3167 | kref_put(&mdev->kref, &drbd_minor_destroy); |
2475 | 3168 | return NO_ERROR; | |
2476 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3169 | } else |
2477 | 3170 | return ERR_MINOR_CONFIGURED; | |
2478 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2479 | cn_reply->id.val = CN_VAL_DRBD; | ||
2480 | |||
2481 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2482 | cn_reply->ack = 0; /* not used here. */ | ||
2483 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2484 | (int)((char *)tl - (char *)reply->tag_list); | ||
2485 | cn_reply->flags = 0; | ||
2486 | |||
2487 | reply->packet_type = P_get_state; | ||
2488 | reply->minor = mdev_to_minor(mdev); | ||
2489 | reply->ret_code = NO_ERROR; | ||
2490 | |||
2491 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2492 | } | 3171 | } |
2493 | 3172 | ||
2494 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | 3173 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) |
2495 | { | 3174 | { |
2496 | char buffer[sizeof(struct cn_msg)+ | 3175 | enum drbd_ret_code retcode; |
2497 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2498 | sizeof(struct call_helper_tag_len_struct)+ | ||
2499 | sizeof(short int)]; | ||
2500 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2501 | struct drbd_nl_cfg_reply *reply = | ||
2502 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2503 | unsigned short *tl = reply->tag_list; | ||
2504 | |||
2505 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2506 | |||
2507 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2508 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2509 | |||
2510 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2511 | cn_reply->id.val = CN_VAL_DRBD; | ||
2512 | |||
2513 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2514 | cn_reply->ack = 0; /* not used here. */ | ||
2515 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2516 | (int)((char *)tl - (char *)reply->tag_list); | ||
2517 | cn_reply->flags = 0; | ||
2518 | 3176 | ||
2519 | reply->packet_type = P_call_helper; | 3177 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2520 | reply->minor = mdev_to_minor(mdev); | 3178 | if (!adm_ctx.reply_skb) |
2521 | reply->ret_code = NO_ERROR; | 3179 | return retcode; |
3180 | if (retcode != NO_ERROR) | ||
3181 | goto out; | ||
2522 | 3182 | ||
2523 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3183 | retcode = adm_delete_minor(adm_ctx.mdev); |
3184 | out: | ||
3185 | drbd_adm_finish(info, retcode); | ||
3186 | return 0; | ||
2524 | } | 3187 | } |
2525 | 3188 | ||
2526 | void drbd_bcast_ee(struct drbd_conf *mdev, | 3189 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) |
2527 | const char *reason, const int dgs, | ||
2528 | const char* seen_hash, const char* calc_hash, | ||
2529 | const struct drbd_epoch_entry* e) | ||
2530 | { | 3190 | { |
2531 | struct cn_msg *cn_reply; | 3191 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ |
2532 | struct drbd_nl_cfg_reply *reply; | 3192 | struct drbd_conf *mdev; |
2533 | unsigned short *tl; | 3193 | unsigned i; |
2534 | struct page *page; | ||
2535 | unsigned len; | ||
2536 | 3194 | ||
2537 | if (!e) | 3195 | retcode = drbd_adm_prepare(skb, info, 0); |
2538 | return; | 3196 | if (!adm_ctx.reply_skb) |
2539 | if (!reason || !reason[0]) | 3197 | return retcode; |
2540 | return; | 3198 | if (retcode != NO_ERROR) |
3199 | goto out; | ||
2541 | 3200 | ||
2542 | /* apparently we have to memcpy twice, first to prepare the data for the | 3201 | if (!adm_ctx.tconn) { |
2543 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | 3202 | retcode = ERR_RES_NOT_KNOWN; |
2544 | * netlink skb. */ | 3203 | goto out; |
2545 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2546 | * but may be in the writeout path of the _other_ node. | ||
2547 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2548 | cn_reply = kzalloc( | ||
2549 | sizeof(struct cn_msg)+ | ||
2550 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2551 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2552 | sizeof(short int), | ||
2553 | GFP_NOIO); | ||
2554 | |||
2555 | if (!cn_reply) { | ||
2556 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2557 | (unsigned long long)e->sector, e->size); | ||
2558 | return; | ||
2559 | } | 3204 | } |
2560 | 3205 | ||
2561 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | 3206 | /* demote */ |
2562 | tl = reply->tag_list; | 3207 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2563 | 3208 | retcode = drbd_set_role(mdev, R_SECONDARY, 0); | |
2564 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | 3209 | if (retcode < SS_SUCCESS) { |
2565 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | 3210 | drbd_msg_put_info("failed to demote"); |
2566 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | 3211 | goto out; |
2567 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | 3212 | } |
2568 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2569 | |||
2570 | /* dump the first 32k */ | ||
2571 | len = min_t(unsigned, e->size, 32 << 10); | ||
2572 | put_unaligned(T_ee_data, tl++); | ||
2573 | put_unaligned(len, tl++); | ||
2574 | |||
2575 | page = e->pages; | ||
2576 | page_chain_for_each(page) { | ||
2577 | void *d = kmap_atomic(page); | ||
2578 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | ||
2579 | memcpy(tl, d, l); | ||
2580 | kunmap_atomic(d); | ||
2581 | tl = (unsigned short*)((char*)tl + l); | ||
2582 | len -= l; | ||
2583 | if (len == 0) | ||
2584 | break; | ||
2585 | } | 3213 | } |
2586 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2587 | |||
2588 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2589 | cn_reply->id.val = CN_VAL_DRBD; | ||
2590 | |||
2591 | cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); | ||
2592 | cn_reply->ack = 0; // not used here. | ||
2593 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2594 | (int)((char*)tl - (char*)reply->tag_list); | ||
2595 | cn_reply->flags = 0; | ||
2596 | |||
2597 | reply->packet_type = P_dump_ee; | ||
2598 | reply->minor = mdev_to_minor(mdev); | ||
2599 | reply->ret_code = NO_ERROR; | ||
2600 | 3214 | ||
2601 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3215 | retcode = conn_try_disconnect(adm_ctx.tconn, 0); |
2602 | kfree(cn_reply); | 3216 | if (retcode < SS_SUCCESS) { |
2603 | } | 3217 | drbd_msg_put_info("failed to disconnect"); |
2604 | 3218 | goto out; | |
2605 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | 3219 | } |
2606 | { | ||
2607 | char buffer[sizeof(struct cn_msg)+ | ||
2608 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2609 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2610 | sizeof(short int)]; | ||
2611 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2612 | struct drbd_nl_cfg_reply *reply = | ||
2613 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2614 | unsigned short *tl = reply->tag_list; | ||
2615 | unsigned long rs_left; | ||
2616 | unsigned int res; | ||
2617 | 3220 | ||
2618 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | 3221 | /* detach */ |
2619 | if (!get_ldev(mdev)) | 3222 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2620 | return; | 3223 | retcode = adm_detach(mdev, 0); |
2621 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 3224 | if (retcode < SS_SUCCESS || retcode > NO_ERROR) { |
2622 | put_ldev(mdev); | 3225 | drbd_msg_put_info("failed to detach"); |
3226 | goto out; | ||
3227 | } | ||
3228 | } | ||
2623 | 3229 | ||
2624 | tl = tl_add_int(tl, T_sync_progress, &res); | 3230 | /* If we reach this, all volumes (of this tconn) are Secondary, |
2625 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3231 | * Disconnected, Diskless, aka Unconfigured. Make sure all threads have |
3232 | * actually stopped, state handling only does drbd_thread_stop_nowait(). */ | ||
3233 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
2626 | 3234 | ||
2627 | cn_reply->id.idx = CN_IDX_DRBD; | 3235 | /* Now, nothing can fail anymore */ |
2628 | cn_reply->id.val = CN_VAL_DRBD; | ||
2629 | 3236 | ||
2630 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | 3237 | /* delete volumes */ |
2631 | cn_reply->ack = 0; /* not used here. */ | 3238 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2632 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | 3239 | retcode = adm_delete_minor(mdev); |
2633 | (int)((char *)tl - (char *)reply->tag_list); | 3240 | if (retcode != NO_ERROR) { |
2634 | cn_reply->flags = 0; | 3241 | /* "can not happen" */ |
3242 | drbd_msg_put_info("failed to delete volume"); | ||
3243 | goto out; | ||
3244 | } | ||
3245 | } | ||
2635 | 3246 | ||
2636 | reply->packet_type = P_sync_progress; | 3247 | /* delete connection */ |
2637 | reply->minor = mdev_to_minor(mdev); | 3248 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2638 | reply->ret_code = NO_ERROR; | 3249 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
3250 | synchronize_rcu(); | ||
3251 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
2639 | 3252 | ||
2640 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3253 | retcode = NO_ERROR; |
3254 | } else { | ||
3255 | /* "can not happen" */ | ||
3256 | retcode = ERR_RES_IN_USE; | ||
3257 | drbd_msg_put_info("failed to delete connection"); | ||
3258 | } | ||
3259 | goto out; | ||
3260 | out: | ||
3261 | drbd_adm_finish(info, retcode); | ||
3262 | return 0; | ||
2641 | } | 3263 | } |
2642 | 3264 | ||
2643 | int __init drbd_nl_init(void) | 3265 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) |
2644 | { | 3266 | { |
2645 | static struct cb_id cn_id_drbd; | 3267 | enum drbd_ret_code retcode; |
2646 | int err, try=10; | ||
2647 | 3268 | ||
2648 | cn_id_drbd.val = CN_VAL_DRBD; | 3269 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2649 | do { | 3270 | if (!adm_ctx.reply_skb) |
2650 | cn_id_drbd.idx = cn_idx; | 3271 | return retcode; |
2651 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | 3272 | if (retcode != NO_ERROR) |
2652 | if (!err) | 3273 | goto out; |
2653 | break; | ||
2654 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2655 | } while (try--); | ||
2656 | 3274 | ||
2657 | if (err) { | 3275 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2658 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | 3276 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
2659 | return err; | 3277 | synchronize_rcu(); |
3278 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
3279 | |||
3280 | retcode = NO_ERROR; | ||
3281 | } else { | ||
3282 | retcode = ERR_RES_IN_USE; | ||
2660 | } | 3283 | } |
2661 | 3284 | ||
3285 | if (retcode == NO_ERROR) | ||
3286 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
3287 | out: | ||
3288 | drbd_adm_finish(info, retcode); | ||
2662 | return 0; | 3289 | return 0; |
2663 | } | 3290 | } |
2664 | 3291 | ||
2665 | void drbd_nl_cleanup(void) | 3292 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) |
2666 | { | 3293 | { |
2667 | static struct cb_id cn_id_drbd; | 3294 | static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ |
2668 | 3295 | struct sk_buff *msg; | |
2669 | cn_id_drbd.idx = cn_idx; | 3296 | struct drbd_genlmsghdr *d_out; |
2670 | cn_id_drbd.val = CN_VAL_DRBD; | 3297 | unsigned seq; |
2671 | 3298 | int err = -ENOMEM; | |
2672 | cn_del_callback(&cn_id_drbd); | 3299 | |
2673 | } | 3300 | if (sib->sib_reason == SIB_SYNC_PROGRESS) { |
3301 | if (time_after(jiffies, mdev->rs_last_bcast + HZ)) | ||
3302 | mdev->rs_last_bcast = jiffies; | ||
3303 | else | ||
3304 | return; | ||
3305 | } | ||
2674 | 3306 | ||
2675 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | 3307 | seq = atomic_inc_return(&drbd_genl_seq); |
2676 | { | 3308 | msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); |
2677 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | 3309 | if (!msg) |
2678 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | 3310 | goto failed; |
2679 | struct drbd_nl_cfg_reply *reply = | ||
2680 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2681 | int rr; | ||
2682 | 3311 | ||
2683 | memset(buffer, 0, sizeof(buffer)); | 3312 | err = -EMSGSIZE; |
2684 | cn_reply->id = req->id; | 3313 | d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); |
3314 | if (!d_out) /* cannot happen, but anyways. */ | ||
3315 | goto nla_put_failure; | ||
3316 | d_out->minor = mdev_to_minor(mdev); | ||
3317 | d_out->ret_code = NO_ERROR; | ||
2685 | 3318 | ||
2686 | cn_reply->seq = req->seq; | 3319 | if (nla_put_status_info(msg, mdev, sib)) |
2687 | cn_reply->ack = req->ack + 1; | 3320 | goto nla_put_failure; |
2688 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | 3321 | genlmsg_end(msg, d_out); |
2689 | cn_reply->flags = 0; | 3322 | err = drbd_genl_multicast_events(msg, 0); |
3323 | /* msg has been consumed or freed in netlink_broadcast() */ | ||
3324 | if (err && err != -ESRCH) | ||
3325 | goto failed; | ||
2690 | 3326 | ||
2691 | reply->packet_type = P_return_code_only; | 3327 | return; |
2692 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2693 | reply->ret_code = ret_code; | ||
2694 | 3328 | ||
2695 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3329 | nla_put_failure: |
2696 | if (rr && rr != -ESRCH) | 3330 | nlmsg_free(msg); |
2697 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | 3331 | failed: |
3332 | dev_err(DEV, "Error %d while broadcasting event. " | ||
3333 | "Event seq:%u sib_reason:%u\n", | ||
3334 | err, seq, sib->sib_reason); | ||
2698 | } | 3335 | } |
2699 | |||
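Every converted handler in drbd_nl.c above follows the same generic-netlink shape: drbd_adm_prepare() resolves the minor or resource named in the request and fills in adm_ctx, the handler does its work, and drbd_adm_finish() sends the reply and drops the references. A minimal sketch of that skeleton, using only names that appear in the patch; the handler name and the placeholder body are hypothetical:

	int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
	{
		enum drbd_ret_code retcode;

		/* resolve adm_ctx.mdev / adm_ctx.tconn from the request,
		 * allocate adm_ctx.reply_skb */
		retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
		if (!adm_ctx.reply_skb)
			return retcode;
		if (retcode != NO_ERROR)
			goto out;

		/* ... handler-specific work, setting retcode on failure ... */

	out:
		drbd_adm_finish(info, retcode);	/* send reply, release adm_ctx */
		return 0;
	}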
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000000..fa672b6df8d6 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.c | |||
@@ -0,0 +1,55 @@ | |||
1 | #include "drbd_wrappers.h" | ||
2 | #include <linux/kernel.h> | ||
3 | #include <net/netlink.h> | ||
4 | #include <linux/drbd_genl_api.h> | ||
5 | #include "drbd_nla.h" | ||
6 | |||
7 | static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) | ||
8 | { | ||
9 | struct nlattr *head = nla_data(nla); | ||
10 | int len = nla_len(nla); | ||
11 | int rem; | ||
12 | |||
13 | /* | ||
14 | * validate_nla (called from nla_parse_nested) ignores attributes | ||
15 | * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. | ||
16 | * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY | ||
17 | * flag set also, check and remove that flag before calling | ||
18 | * nla_parse_nested. | ||
19 | */ | ||
20 | |||
21 | nla_for_each_attr(nla, head, len, rem) { | ||
22 | if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { | ||
23 | nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; | ||
24 | if (nla_type(nla) > maxtype) | ||
25 | return -EOPNOTSUPP; | ||
26 | } | ||
27 | } | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
32 | const struct nla_policy *policy) | ||
33 | { | ||
34 | int err; | ||
35 | |||
36 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
37 | if (!err) | ||
38 | err = nla_parse_nested(tb, maxtype, nla, policy); | ||
39 | |||
40 | return err; | ||
41 | } | ||
42 | |||
43 | struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) | ||
44 | { | ||
45 | int err; | ||
46 | /* | ||
47 | * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and | ||
48 | * we don't know about that attribute, reject all the nested | ||
49 | * attributes. | ||
50 | */ | ||
51 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
52 | if (err) | ||
53 | return ERR_PTR(err); | ||
54 | return nla_find_nested(nla, attrtype); | ||
55 | } | ||
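The two helpers above exist because the stock netlink parser does not understand DRBD_GENLA_F_MANDATORY: the flag is stripped (or, for unknown attribute types, turned into an error) before nla_parse_nested()/nla_find_nested() see the attributes. A hedged usage sketch, modelled on the resource-name lookup in the status dump path earlier in this patch; the wrapper function and its ctx argument are hypothetical:

	static int example_find_resource_name(struct nlattr *ctx)
	{
		int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
		struct nlattr *nla;

		nla = drbd_nla_find_nested(maxtype, ctx,
					   __nla_type(T_ctx_resource_name));
		if (IS_ERR(nla))	/* unknown mandatory attribute present */
			return PTR_ERR(nla);
		if (!nla)		/* context given, but no name in it */
			return -EINVAL;
		/* nla_data(nla) now points at the resource name string */
		return 0;
	}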
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000000..679c2d5b4535 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.h | |||
@@ -0,0 +1,8 @@ | |||
1 | #ifndef __DRBD_NLA_H | ||
2 | #define __DRBD_NLA_H | ||
3 | |||
4 | extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
5 | const struct nla_policy *policy); | ||
6 | extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); | ||
7 | |||
8 | #endif /* __DRBD_NLA_H */ | ||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 5496104f90b9..56672a61eb94 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -167,18 +167,24 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | |||
167 | * we convert to sectors in the display below. */ | 167 | * we convert to sectors in the display below. */ |
168 | unsigned long bm_bits = drbd_bm_bits(mdev); | 168 | unsigned long bm_bits = drbd_bm_bits(mdev); |
169 | unsigned long bit_pos; | 169 | unsigned long bit_pos; |
170 | unsigned long long stop_sector = 0; | ||
170 | if (mdev->state.conn == C_VERIFY_S || | 171 | if (mdev->state.conn == C_VERIFY_S || |
171 | mdev->state.conn == C_VERIFY_T) | 172 | mdev->state.conn == C_VERIFY_T) { |
172 | bit_pos = bm_bits - mdev->ov_left; | 173 | bit_pos = bm_bits - mdev->ov_left; |
173 | else | 174 | if (verify_can_do_stop_sector(mdev)) |
175 | stop_sector = mdev->ov_stop_sector; | ||
176 | } else | ||
174 | bit_pos = mdev->bm_resync_fo; | 177 | bit_pos = mdev->bm_resync_fo; |
175 | /* Total sectors may be slightly off for oddly | 178 | /* Total sectors may be slightly off for oddly |
176 | * sized devices. So what. */ | 179 | * sized devices. So what. */ |
177 | seq_printf(seq, | 180 | seq_printf(seq, |
178 | "\t%3d%% sector pos: %llu/%llu\n", | 181 | "\t%3d%% sector pos: %llu/%llu", |
179 | (int)(bit_pos / (bm_bits/100+1)), | 182 | (int)(bit_pos / (bm_bits/100+1)), |
180 | (unsigned long long)bit_pos * BM_SECT_PER_BIT, | 183 | (unsigned long long)bit_pos * BM_SECT_PER_BIT, |
181 | (unsigned long long)bm_bits * BM_SECT_PER_BIT); | 184 | (unsigned long long)bm_bits * BM_SECT_PER_BIT); |
185 | if (stop_sector != 0 && stop_sector != ULLONG_MAX) | ||
186 | seq_printf(seq, " stop sector: %llu", stop_sector); | ||
187 | seq_printf(seq, "\n"); | ||
182 | } | 188 | } |
183 | } | 189 | } |
184 | 190 | ||
@@ -194,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | |||
194 | 200 | ||
195 | static int drbd_seq_show(struct seq_file *seq, void *v) | 201 | static int drbd_seq_show(struct seq_file *seq, void *v) |
196 | { | 202 | { |
197 | int i, hole = 0; | 203 | int i, prev_i = -1; |
198 | const char *sn; | 204 | const char *sn; |
199 | struct drbd_conf *mdev; | 205 | struct drbd_conf *mdev; |
206 | struct net_conf *nc; | ||
207 | char wp; | ||
200 | 208 | ||
201 | static char write_ordering_chars[] = { | 209 | static char write_ordering_chars[] = { |
202 | [WO_none] = 'n', | 210 | [WO_none] = 'n', |
@@ -227,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
227 | oos .. known out-of-sync kB | 235 | oos .. known out-of-sync kB |
228 | */ | 236 | */ |
229 | 237 | ||
230 | for (i = 0; i < minor_count; i++) { | 238 | rcu_read_lock(); |
231 | mdev = minor_to_mdev(i); | 239 | idr_for_each_entry(&minors, mdev, i) { |
232 | if (!mdev) { | 240 | if (prev_i != i - 1) |
233 | hole = 1; | ||
234 | continue; | ||
235 | } | ||
236 | if (hole) { | ||
237 | hole = 0; | ||
238 | seq_printf(seq, "\n"); | 241 | seq_printf(seq, "\n"); |
239 | } | 242 | prev_i = i; |
240 | 243 | ||
241 | sn = drbd_conn_str(mdev->state.conn); | 244 | sn = drbd_conn_str(mdev->state.conn); |
242 | 245 | ||
@@ -248,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
248 | /* reset mdev->congestion_reason */ | 251 | /* reset mdev->congestion_reason */ |
249 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); | 252 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); |
250 | 253 | ||
254 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
255 | wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; | ||
251 | seq_printf(seq, | 256 | seq_printf(seq, |
252 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" | 257 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" |
253 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | 258 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " |
@@ -257,9 +262,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
257 | drbd_role_str(mdev->state.peer), | 262 | drbd_role_str(mdev->state.peer), |
258 | drbd_disk_str(mdev->state.disk), | 263 | drbd_disk_str(mdev->state.disk), |
259 | drbd_disk_str(mdev->state.pdsk), | 264 | drbd_disk_str(mdev->state.pdsk), |
260 | (mdev->net_conf == NULL ? ' ' : | 265 | wp, |
261 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | 266 | drbd_suspended(mdev) ? 's' : 'r', |
262 | is_susp(mdev->state) ? 's' : 'r', | ||
263 | mdev->state.aftr_isp ? 'a' : '-', | 267 | mdev->state.aftr_isp ? 'a' : '-', |
264 | mdev->state.peer_isp ? 'p' : '-', | 268 | mdev->state.peer_isp ? 'p' : '-', |
265 | mdev->state.user_isp ? 'u' : '-', | 269 | mdev->state.user_isp ? 'u' : '-', |
@@ -276,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
276 | atomic_read(&mdev->rs_pending_cnt), | 280 | atomic_read(&mdev->rs_pending_cnt), |
277 | atomic_read(&mdev->unacked_cnt), | 281 | atomic_read(&mdev->unacked_cnt), |
278 | atomic_read(&mdev->ap_bio_cnt), | 282 | atomic_read(&mdev->ap_bio_cnt), |
279 | mdev->epochs, | 283 | mdev->tconn->epochs, |
280 | write_ordering_chars[mdev->write_ordering] | 284 | write_ordering_chars[mdev->tconn->write_ordering] |
281 | ); | 285 | ); |
282 | seq_printf(seq, " oos:%llu\n", | 286 | seq_printf(seq, " oos:%llu\n", |
283 | Bit2KB((unsigned long long) | 287 | Bit2KB((unsigned long long) |
@@ -302,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
302 | } | 306 | } |
303 | } | 307 | } |
304 | } | 308 | } |
309 | rcu_read_unlock(); | ||
305 | 310 | ||
306 | return 0; | 311 | return 0; |
307 | } | 312 | } |
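The /proc/drbd changes above replace the old 0..minor_count probe loop, with its "hole" bookkeeping, by walking the global minors IDR under rcu_read_lock(). A minimal sketch of that iteration pattern, assuming only the minors IDR and struct drbd_conf from the patch; the loop body is a placeholder:

	struct drbd_conf *mdev;
	int i;

	rcu_read_lock();
	idr_for_each_entry(&minors, mdev, i) {
		/* one /proc/drbd stanza per configured minor */
	}
	rcu_read_unlock();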
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c74ca2df7431..a9eccfc6079b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -48,17 +48,25 @@ | |||
48 | 48 | ||
49 | #include "drbd_vli.h" | 49 | #include "drbd_vli.h" |
50 | 50 | ||
51 | struct packet_info { | ||
52 | enum drbd_packet cmd; | ||
53 | unsigned int size; | ||
54 | unsigned int vnr; | ||
55 | void *data; | ||
56 | }; | ||
57 | |||
51 | enum finish_epoch { | 58 | enum finish_epoch { |
52 | FE_STILL_LIVE, | 59 | FE_STILL_LIVE, |
53 | FE_DESTROYED, | 60 | FE_DESTROYED, |
54 | FE_RECYCLED, | 61 | FE_RECYCLED, |
55 | }; | 62 | }; |
56 | 63 | ||
57 | static int drbd_do_handshake(struct drbd_conf *mdev); | 64 | static int drbd_do_features(struct drbd_tconn *tconn); |
58 | static int drbd_do_auth(struct drbd_conf *mdev); | 65 | static int drbd_do_auth(struct drbd_tconn *tconn); |
66 | static int drbd_disconnected(struct drbd_conf *mdev); | ||
59 | 67 | ||
60 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); | 68 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event); |
61 | static int e_end_block(struct drbd_conf *, struct drbd_work *, int); | 69 | static int e_end_block(struct drbd_work *, int); |
62 | 70 | ||
63 | 71 | ||
64 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 72 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
@@ -142,11 +150,12 @@ static void page_chain_add(struct page **head, | |||
142 | *head = chain_first; | 150 | *head = chain_first; |
143 | } | 151 | } |
144 | 152 | ||
145 | static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) | 153 | static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, |
154 | unsigned int number) | ||
146 | { | 155 | { |
147 | struct page *page = NULL; | 156 | struct page *page = NULL; |
148 | struct page *tmp = NULL; | 157 | struct page *tmp = NULL; |
149 | int i = 0; | 158 | unsigned int i = 0; |
150 | 159 | ||
151 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | 160 | /* Yes, testing drbd_pp_vacant outside the lock is racy. |
152 | * So what. It saves a spin_lock. */ | 161 | * So what. It saves a spin_lock. */ |
@@ -175,7 +184,7 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
175 | return page; | 184 | return page; |
176 | 185 | ||
177 | /* Not enough pages immediately available this time. | 186 | /* Not enough pages immediately available this time. |
178 | * No need to jump around here, drbd_pp_alloc will retry this | 187 | * No need to jump around here, drbd_alloc_pages will retry this |
179 | * function "soon". */ | 188 | * function "soon". */ |
180 | if (page) { | 189 | if (page) { |
181 | tmp = page_chain_tail(page, NULL); | 190 | tmp = page_chain_tail(page, NULL); |
@@ -187,9 +196,10 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
187 | return NULL; | 196 | return NULL; |
188 | } | 197 | } |
189 | 198 | ||
190 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | 199 | static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, |
200 | struct list_head *to_be_freed) | ||
191 | { | 201 | { |
192 | struct drbd_epoch_entry *e; | 202 | struct drbd_peer_request *peer_req; |
193 | struct list_head *le, *tle; | 203 | struct list_head *le, *tle; |
194 | 204 | ||
195 | /* The EEs are always appended to the end of the list. Since | 205 | /* The EEs are always appended to the end of the list. Since |
@@ -198,8 +208,8 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
198 | stop to examine the list... */ | 208 | stop to examine the list... */ |
199 | 209 | ||
200 | list_for_each_safe(le, tle, &mdev->net_ee) { | 210 | list_for_each_safe(le, tle, &mdev->net_ee) { |
201 | e = list_entry(le, struct drbd_epoch_entry, w.list); | 211 | peer_req = list_entry(le, struct drbd_peer_request, w.list); |
202 | if (drbd_ee_has_active_page(e)) | 212 | if (drbd_peer_req_has_active_page(peer_req)) |
203 | break; | 213 | break; |
204 | list_move(le, to_be_freed); | 214 | list_move(le, to_be_freed); |
205 | } | 215 | } |
@@ -208,18 +218,18 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
208 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | 218 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) |
209 | { | 219 | { |
210 | LIST_HEAD(reclaimed); | 220 | LIST_HEAD(reclaimed); |
211 | struct drbd_epoch_entry *e, *t; | 221 | struct drbd_peer_request *peer_req, *t; |
212 | 222 | ||
213 | spin_lock_irq(&mdev->req_lock); | 223 | spin_lock_irq(&mdev->tconn->req_lock); |
214 | reclaim_net_ee(mdev, &reclaimed); | 224 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
215 | spin_unlock_irq(&mdev->req_lock); | 225 | spin_unlock_irq(&mdev->tconn->req_lock); |
216 | 226 | ||
217 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 227 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
218 | drbd_free_net_ee(mdev, e); | 228 | drbd_free_net_peer_req(mdev, peer_req); |
219 | } | 229 | } |
220 | 230 | ||
221 | /** | 231 | /** |
222 | * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) | 232 | * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) |
223 | * @mdev: DRBD device. | 233 | * @mdev: DRBD device. |
224 | * @number: number of pages requested | 234 | * @number: number of pages requested |
225 | * @retry: whether to retry, if not enough pages are available right now | 235 | * @retry: whether to retry, if not enough pages are available right now |
@@ -230,23 +240,31 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | |||
230 | * | 240 | * |
231 | * Returns a page chain linked via page->private. | 241 | * Returns a page chain linked via page->private. |
232 | */ | 242 | */ |
233 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) | 243 | struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, |
244 | bool retry) | ||
234 | { | 245 | { |
235 | struct page *page = NULL; | 246 | struct page *page = NULL; |
247 | struct net_conf *nc; | ||
236 | DEFINE_WAIT(wait); | 248 | DEFINE_WAIT(wait); |
249 | int mxb; | ||
237 | 250 | ||
238 | /* Yes, we may run up to @number over max_buffers. If we | 251 | /* Yes, we may run up to @number over max_buffers. If we |
239 | * follow it strictly, the admin will get it wrong anyways. */ | 252 | * follow it strictly, the admin will get it wrong anyways. */ |
240 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) | 253 | rcu_read_lock(); |
241 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 254 | nc = rcu_dereference(mdev->tconn->net_conf); |
255 | mxb = nc ? nc->max_buffers : 1000000; | ||
256 | rcu_read_unlock(); | ||
257 | |||
258 | if (atomic_read(&mdev->pp_in_use) < mxb) | ||
259 | page = __drbd_alloc_pages(mdev, number); | ||
242 | 260 | ||
243 | while (page == NULL) { | 261 | while (page == NULL) { |
244 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | 262 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); |
245 | 263 | ||
246 | drbd_kick_lo_and_reclaim_net(mdev); | 264 | drbd_kick_lo_and_reclaim_net(mdev); |
247 | 265 | ||
248 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | 266 | if (atomic_read(&mdev->pp_in_use) < mxb) { |
249 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 267 | page = __drbd_alloc_pages(mdev, number); |
250 | if (page) | 268 | if (page) |
251 | break; | 269 | break; |
252 | } | 270 | } |
@@ -255,7 +273,7 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
255 | break; | 273 | break; |
256 | 274 | ||
257 | if (signal_pending(current)) { | 275 | if (signal_pending(current)) { |
258 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | 276 | dev_warn(DEV, "drbd_alloc_pages interrupted!\n"); |
259 | break; | 277 | break; |
260 | } | 278 | } |
261 | 279 | ||
@@ -268,11 +286,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
268 | return page; | 286 | return page; |
269 | } | 287 | } |
270 | 288 | ||
271 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | 289 | /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. |
272 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock); | 290 | * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock); |
273 | * Either links the page chain back to the global pool, | 291 | * Either links the page chain back to the global pool, |
274 | * or returns all pages to the system. */ | 292 | * or returns all pages to the system. */ |
275 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | 293 | static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net) |
276 | { | 294 | { |
277 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; | 295 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; |
278 | int i; | 296 | int i; |
@@ -280,7 +298,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | |||
280 | if (page == NULL) | 298 | if (page == NULL) |
281 | return; | 299 | return; |
282 | 300 | ||
283 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) | 301 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) |
284 | i = page_chain_free(page); | 302 | i = page_chain_free(page); |
285 | else { | 303 | else { |
286 | struct page *tmp; | 304 | struct page *tmp; |
@@ -302,127 +320,130 @@ You need to hold the req_lock: | |||
302 | _drbd_wait_ee_list_empty() | 320 | _drbd_wait_ee_list_empty() |
303 | 321 | ||
304 | You must not have the req_lock: | 322 | You must not have the req_lock: |
305 | drbd_free_ee() | 323 | drbd_free_peer_req() |
306 | drbd_alloc_ee() | 324 | drbd_alloc_peer_req() |
307 | drbd_init_ee() | 325 | drbd_free_peer_reqs() |
308 | drbd_release_ee() | ||
309 | drbd_ee_fix_bhs() | 326 | drbd_ee_fix_bhs() |
310 | drbd_process_done_ee() | 327 | drbd_finish_peer_reqs() |
311 | drbd_clear_done_ee() | 328 | drbd_clear_done_ee() |
312 | drbd_wait_ee_list_empty() | 329 | drbd_wait_ee_list_empty() |
313 | */ | 330 | */ |
314 | 331 | ||
315 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 332 | struct drbd_peer_request * |
316 | u64 id, | 333 | drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector, |
317 | sector_t sector, | 334 | unsigned int data_size, gfp_t gfp_mask) __must_hold(local) |
318 | unsigned int data_size, | ||
319 | gfp_t gfp_mask) __must_hold(local) | ||
320 | { | 335 | { |
321 | struct drbd_epoch_entry *e; | 336 | struct drbd_peer_request *peer_req; |
322 | struct page *page = NULL; | 337 | struct page *page = NULL; |
323 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; | 338 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; |
324 | 339 | ||
325 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) | 340 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) |
326 | return NULL; | 341 | return NULL; |
327 | 342 | ||
328 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | 343 | peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); |
329 | if (!e) { | 344 | if (!peer_req) { |
330 | if (!(gfp_mask & __GFP_NOWARN)) | 345 | if (!(gfp_mask & __GFP_NOWARN)) |
331 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | 346 | dev_err(DEV, "%s: allocation failed\n", __func__); |
332 | return NULL; | 347 | return NULL; |
333 | } | 348 | } |
334 | 349 | ||
335 | if (data_size) { | 350 | if (data_size) { |
336 | page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); | 351 | page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); |
337 | if (!page) | 352 | if (!page) |
338 | goto fail; | 353 | goto fail; |
339 | } | 354 | } |
340 | 355 | ||
341 | INIT_HLIST_NODE(&e->collision); | 356 | drbd_clear_interval(&peer_req->i); |
342 | e->epoch = NULL; | 357 | peer_req->i.size = data_size; |
343 | e->mdev = mdev; | 358 | peer_req->i.sector = sector; |
344 | e->pages = page; | 359 | peer_req->i.local = false; |
345 | atomic_set(&e->pending_bios, 0); | 360 | peer_req->i.waiting = false; |
346 | e->size = data_size; | 361 | |
347 | e->flags = 0; | 362 | peer_req->epoch = NULL; |
348 | e->sector = sector; | 363 | peer_req->w.mdev = mdev; |
349 | e->block_id = id; | 364 | peer_req->pages = page; |
365 | atomic_set(&peer_req->pending_bios, 0); | ||
366 | peer_req->flags = 0; | ||
367 | /* | ||
368 | * The block_id is opaque to the receiver. It is not endianness | ||
369 | * converted, and sent back to the sender unchanged. | ||
370 | */ | ||
371 | peer_req->block_id = id; | ||
350 | 372 | ||
351 | return e; | 373 | return peer_req; |
352 | 374 | ||
353 | fail: | 375 | fail: |
354 | mempool_free(e, drbd_ee_mempool); | 376 | mempool_free(peer_req, drbd_ee_mempool); |
355 | return NULL; | 377 | return NULL; |
356 | } | 378 | } |
357 | 379 | ||
358 | void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) | 380 | void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req, |
381 | int is_net) | ||
359 | { | 382 | { |
360 | if (e->flags & EE_HAS_DIGEST) | 383 | if (peer_req->flags & EE_HAS_DIGEST) |
361 | kfree(e->digest); | 384 | kfree(peer_req->digest); |
362 | drbd_pp_free(mdev, e->pages, is_net); | 385 | drbd_free_pages(mdev, peer_req->pages, is_net); |
363 | D_ASSERT(atomic_read(&e->pending_bios) == 0); | 386 | D_ASSERT(atomic_read(&peer_req->pending_bios) == 0); |
364 | D_ASSERT(hlist_unhashed(&e->collision)); | 387 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
365 | mempool_free(e, drbd_ee_mempool); | 388 | mempool_free(peer_req, drbd_ee_mempool); |
366 | } | 389 | } |
367 | 390 | ||
368 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | 391 | int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list) |
369 | { | 392 | { |
370 | LIST_HEAD(work_list); | 393 | LIST_HEAD(work_list); |
371 | struct drbd_epoch_entry *e, *t; | 394 | struct drbd_peer_request *peer_req, *t; |
372 | int count = 0; | 395 | int count = 0; |
373 | int is_net = list == &mdev->net_ee; | 396 | int is_net = list == &mdev->net_ee; |
374 | 397 | ||
375 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
376 | list_splice_init(list, &work_list); | 399 | list_splice_init(list, &work_list); |
377 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
378 | 401 | ||
379 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 402 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
380 | drbd_free_some_ee(mdev, e, is_net); | 403 | __drbd_free_peer_req(mdev, peer_req, is_net); |
381 | count++; | 404 | count++; |
382 | } | 405 | } |
383 | return count; | 406 | return count; |
384 | } | 407 | } |
385 | 408 | ||
386 | |||
387 | /* | 409 | /* |
388 | * This function is called from _asender only_ | 410 | * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. |
389 | * but see also comments in _req_mod(,barrier_acked) | ||
390 | * and receive_Barrier. | ||
391 | * | ||
392 | * Move entries from net_ee to done_ee, if ready. | ||
393 | * Grab done_ee, call all callbacks, free the entries. | ||
394 | * The callbacks typically send out ACKs. | ||
395 | */ | 411 | */ |
396 | static int drbd_process_done_ee(struct drbd_conf *mdev) | 412 | static int drbd_finish_peer_reqs(struct drbd_conf *mdev) |
397 | { | 413 | { |
398 | LIST_HEAD(work_list); | 414 | LIST_HEAD(work_list); |
399 | LIST_HEAD(reclaimed); | 415 | LIST_HEAD(reclaimed); |
400 | struct drbd_epoch_entry *e, *t; | 416 | struct drbd_peer_request *peer_req, *t; |
401 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | 417 | int err = 0; |
402 | 418 | ||
403 | spin_lock_irq(&mdev->req_lock); | 419 | spin_lock_irq(&mdev->tconn->req_lock); |
404 | reclaim_net_ee(mdev, &reclaimed); | 420 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
405 | list_splice_init(&mdev->done_ee, &work_list); | 421 | list_splice_init(&mdev->done_ee, &work_list); |
406 | spin_unlock_irq(&mdev->req_lock); | 422 | spin_unlock_irq(&mdev->tconn->req_lock); |
407 | 423 | ||
408 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 424 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
409 | drbd_free_net_ee(mdev, e); | 425 | drbd_free_net_peer_req(mdev, peer_req); |
410 | 426 | ||
411 | /* possible callbacks here: | 427 | /* possible callbacks here: |
412 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | 428 | * e_end_block, and e_end_resync_block, e_send_superseded. |
413 | * all ignore the last argument. | 429 | * all ignore the last argument. |
414 | */ | 430 | */ |
415 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 431 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
432 | int err2; | ||
433 | |||
416 | /* list_del not necessary, next/prev members not touched */ | 434 | /* list_del not necessary, next/prev members not touched */ |
417 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | 435 | err2 = peer_req->w.cb(&peer_req->w, !!err); |
418 | drbd_free_ee(mdev, e); | 436 | if (!err) |
437 | err = err2; | ||
438 | drbd_free_peer_req(mdev, peer_req); | ||
419 | } | 439 | } |
420 | wake_up(&mdev->ee_wait); | 440 | wake_up(&mdev->ee_wait); |
421 | 441 | ||
422 | return ok; | 442 | return err; |
423 | } | 443 | } |
424 | 444 | ||
425 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 445 | static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
446 | struct list_head *head) | ||
426 | { | 447 | { |
427 | DEFINE_WAIT(wait); | 448 | DEFINE_WAIT(wait); |
428 | 449 | ||
@@ -430,55 +451,22 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | |||
430 | * and calling prepare_to_wait in the fast path */ | 451 | * and calling prepare_to_wait in the fast path */ |
431 | while (!list_empty(head)) { | 452 | while (!list_empty(head)) { |
432 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | 453 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); |
433 | spin_unlock_irq(&mdev->req_lock); | 454 | spin_unlock_irq(&mdev->tconn->req_lock); |
434 | io_schedule(); | 455 | io_schedule(); |
435 | finish_wait(&mdev->ee_wait, &wait); | 456 | finish_wait(&mdev->ee_wait, &wait); |
436 | spin_lock_irq(&mdev->req_lock); | 457 | spin_lock_irq(&mdev->tconn->req_lock); |
437 | } | 458 | } |
438 | } | 459 | } |
439 | 460 | ||
440 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 461 | static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
462 | struct list_head *head) | ||
441 | { | 463 | { |
442 | spin_lock_irq(&mdev->req_lock); | 464 | spin_lock_irq(&mdev->tconn->req_lock); |
443 | _drbd_wait_ee_list_empty(mdev, head); | 465 | _drbd_wait_ee_list_empty(mdev, head); |
444 | spin_unlock_irq(&mdev->req_lock); | 466 | spin_unlock_irq(&mdev->tconn->req_lock); |
445 | } | ||
446 | |||
447 | /* see also kernel_accept; which is only present since 2.6.18. | ||
448 | * also we want to log which part of it failed, exactly */ | ||
449 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
450 | struct socket *sock, struct socket **newsock) | ||
451 | { | ||
452 | struct sock *sk = sock->sk; | ||
453 | int err = 0; | ||
454 | |||
455 | *what = "listen"; | ||
456 | err = sock->ops->listen(sock, 5); | ||
457 | if (err < 0) | ||
458 | goto out; | ||
459 | |||
460 | *what = "sock_create_lite"; | ||
461 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
462 | newsock); | ||
463 | if (err < 0) | ||
464 | goto out; | ||
465 | |||
466 | *what = "accept"; | ||
467 | err = sock->ops->accept(sock, *newsock, 0); | ||
468 | if (err < 0) { | ||
469 | sock_release(*newsock); | ||
470 | *newsock = NULL; | ||
471 | goto out; | ||
472 | } | ||
473 | (*newsock)->ops = sock->ops; | ||
474 | __module_get((*newsock)->ops->owner); | ||
475 | |||
476 | out: | ||
477 | return err; | ||
478 | } | 467 | } |
479 | 468 | ||
480 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | 469 | static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) |
481 | void *buf, size_t size, int flags) | ||
482 | { | 470 | { |
483 | mm_segment_t oldfs; | 471 | mm_segment_t oldfs; |
484 | struct kvec iov = { | 472 | struct kvec iov = { |
@@ -500,59 +488,62 @@ static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | |||
500 | return rv; | 488 | return rv; |
501 | } | 489 | } |
502 | 490 | ||
503 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | 491 | static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) |
504 | { | 492 | { |
505 | mm_segment_t oldfs; | ||
506 | struct kvec iov = { | ||
507 | .iov_base = buf, | ||
508 | .iov_len = size, | ||
509 | }; | ||
510 | struct msghdr msg = { | ||
511 | .msg_iovlen = 1, | ||
512 | .msg_iov = (struct iovec *)&iov, | ||
513 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
514 | }; | ||
515 | int rv; | 493 | int rv; |
516 | 494 | ||
517 | oldfs = get_fs(); | 495 | rv = drbd_recv_short(tconn->data.socket, buf, size, 0); |
518 | set_fs(KERNEL_DS); | ||
519 | 496 | ||
520 | for (;;) { | 497 | if (rv < 0) { |
521 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | 498 | if (rv == -ECONNRESET) |
522 | if (rv == size) | 499 | conn_info(tconn, "sock was reset by peer\n"); |
523 | break; | 500 | else if (rv != -ERESTARTSYS) |
501 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); | ||
502 | } else if (rv == 0) { | ||
503 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { | ||
504 | long t; | ||
505 | rcu_read_lock(); | ||
506 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; | ||
507 | rcu_read_unlock(); | ||
524 | 508 | ||
525 | /* Note: | 509 | t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t); |
526 | * ECONNRESET other side closed the connection | ||
527 | * ERESTARTSYS (on sock) we got a signal | ||
528 | */ | ||
529 | 510 | ||
530 | if (rv < 0) { | 511 | if (t) |
531 | if (rv == -ECONNRESET) | 512 | goto out; |
532 | dev_info(DEV, "sock was reset by peer\n"); | ||
533 | else if (rv != -ERESTARTSYS) | ||
534 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
535 | break; | ||
536 | } else if (rv == 0) { | ||
537 | dev_info(DEV, "sock was shut down by peer\n"); | ||
538 | break; | ||
539 | } else { | ||
540 | /* signal came in, or peer/link went down, | ||
541 | * after we read a partial message | ||
542 | */ | ||
543 | /* D_ASSERT(signal_pending(current)); */ | ||
544 | break; | ||
545 | } | 513 | } |
546 | }; | 514 | conn_info(tconn, "sock was shut down by peer\n"); |
547 | 515 | } | |
548 | set_fs(oldfs); | ||
549 | 516 | ||
550 | if (rv != size) | 517 | if (rv != size) |
551 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 518 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
552 | 519 | ||
520 | out: | ||
553 | return rv; | 521 | return rv; |
554 | } | 522 | } |
555 | 523 | ||
524 | static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size) | ||
525 | { | ||
526 | int err; | ||
527 | |||
528 | err = drbd_recv(tconn, buf, size); | ||
529 | if (err != size) { | ||
530 | if (err >= 0) | ||
531 | err = -EIO; | ||
532 | } else | ||
533 | err = 0; | ||
534 | return err; | ||
535 | } | ||
536 | |||
537 | static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size) | ||
538 | { | ||
539 | int err; | ||
540 | |||
541 | err = drbd_recv_all(tconn, buf, size); | ||
542 | if (err && !signal_pending(current)) | ||
543 | conn_warn(tconn, "short read (expected size %d)\n", (int)size); | ||
544 | return err; | ||
545 | } | ||
546 | |||
556 | /* quoting tcp(7): | 547 | /* quoting tcp(7): |
557 | * On individual connections, the socket buffer size must be set prior to the | 548 | * On individual connections, the socket buffer size must be set prior to the |
558 | * listen(2) or connect(2) calls in order to have it take effect. | 549 | * listen(2) or connect(2) calls in order to have it take effect. |
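The comment above quotes tcp(7): socket buffer sizes only take effect if they are applied before listen(2) or connect(2), which is why drbd_setbufsize() runs right after socket creation in the connect and listen paths below. A small userspace sketch of the same ordering rule, using plain setsockopt() rather than the in-kernel socket API; the address and port are examples only:

/* Sketch: set SO_SNDBUF/SO_RCVBUF before connect(), as tcp(7) requires.
 * Userspace analogue only; the driver sets the equivalent sock->sk fields. */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int snd = 128 * 1024, rcv = 128 * 1024;   /* example sizes */
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(7789) }; /* example port */

	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr); /* example peer */

	/* Must happen before connect() so the window can be negotiated. */
	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv));

	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");
	close(fd);
	return 0;
}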
@@ -572,29 +563,50 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd, | |||
572 | } | 563 | } |
573 | } | 564 | } |
574 | 565 | ||
575 | static struct socket *drbd_try_connect(struct drbd_conf *mdev) | 566 | static struct socket *drbd_try_connect(struct drbd_tconn *tconn) |
576 | { | 567 | { |
577 | const char *what; | 568 | const char *what; |
578 | struct socket *sock; | 569 | struct socket *sock; |
579 | struct sockaddr_in6 src_in6; | 570 | struct sockaddr_in6 src_in6; |
580 | int err; | 571 | struct sockaddr_in6 peer_in6; |
572 | struct net_conf *nc; | ||
573 | int err, peer_addr_len, my_addr_len; | ||
574 | int sndbuf_size, rcvbuf_size, connect_int; | ||
581 | int disconnect_on_error = 1; | 575 | int disconnect_on_error = 1; |
582 | 576 | ||
583 | if (!get_net_conf(mdev)) | 577 | rcu_read_lock(); |
578 | nc = rcu_dereference(tconn->net_conf); | ||
579 | if (!nc) { | ||
580 | rcu_read_unlock(); | ||
584 | return NULL; | 581 | return NULL; |
582 | } | ||
583 | sndbuf_size = nc->sndbuf_size; | ||
584 | rcvbuf_size = nc->rcvbuf_size; | ||
585 | connect_int = nc->connect_int; | ||
586 | rcu_read_unlock(); | ||
587 | |||
588 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6)); | ||
589 | memcpy(&src_in6, &tconn->my_addr, my_addr_len); | ||
590 | |||
591 | if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6) | ||
592 | src_in6.sin6_port = 0; | ||
593 | else | ||
594 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
595 | |||
596 | peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6)); | ||
597 | memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len); | ||
585 | 598 | ||
586 | what = "sock_create_kern"; | 599 | what = "sock_create_kern"; |
587 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 600 | err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, |
588 | SOCK_STREAM, IPPROTO_TCP, &sock); | 601 | SOCK_STREAM, IPPROTO_TCP, &sock); |
589 | if (err < 0) { | 602 | if (err < 0) { |
590 | sock = NULL; | 603 | sock = NULL; |
591 | goto out; | 604 | goto out; |
592 | } | 605 | } |
593 | 606 | ||
594 | sock->sk->sk_rcvtimeo = | 607 | sock->sk->sk_rcvtimeo = |
595 | sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; | 608 | sock->sk->sk_sndtimeo = connect_int * HZ; |
596 | drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, | 609 | drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); |
597 | mdev->net_conf->rcvbuf_size); | ||
598 | 610 | ||
599 | /* explicitly bind to the configured IP as source IP | 611 | /* explicitly bind to the configured IP as source IP |
600 | * for the outgoing connections. | 612 | * for the outgoing connections. |
@@ -603,17 +615,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
603 | * Make sure to use 0 as port number, so linux selects | 615 | * Make sure to use 0 as port number, so linux selects |
604 | * a free one dynamically. | 616 | * a free one dynamically. |
605 | */ | 617 | */ |
606 | memcpy(&src_in6, mdev->net_conf->my_addr, | ||
607 | min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); | ||
608 | if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) | ||
609 | src_in6.sin6_port = 0; | ||
610 | else | ||
611 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
612 | |||
613 | what = "bind before connect"; | 618 | what = "bind before connect"; |
614 | err = sock->ops->bind(sock, | 619 | err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); |
615 | (struct sockaddr *) &src_in6, | ||
616 | mdev->net_conf->my_addr_len); | ||
617 | if (err < 0) | 620 | if (err < 0) |
618 | goto out; | 621 | goto out; |
619 | 622 | ||
@@ -621,9 +624,7 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
621 | * stay C_WF_CONNECTION, don't go Disconnecting! */ | 624 | * stay C_WF_CONNECTION, don't go Disconnecting! */ |
622 | disconnect_on_error = 0; | 625 | disconnect_on_error = 0; |
623 | what = "connect"; | 626 | what = "connect"; |
624 | err = sock->ops->connect(sock, | 627 | err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); |
625 | (struct sockaddr *)mdev->net_conf->peer_addr, | ||
626 | mdev->net_conf->peer_addr_len, 0); | ||
627 | 628 | ||
628 | out: | 629 | out: |
629 | if (err < 0) { | 630 | if (err < 0) { |
@@ -641,91 +642,174 @@ out: | |||
641 | disconnect_on_error = 0; | 642 | disconnect_on_error = 0; |
642 | break; | 643 | break; |
643 | default: | 644 | default: |
644 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 645 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
645 | } | 646 | } |
646 | if (disconnect_on_error) | 647 | if (disconnect_on_error) |
647 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 648 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
648 | } | 649 | } |
649 | put_net_conf(mdev); | 650 | |
650 | return sock; | 651 | return sock; |
651 | } | 652 | } |
652 | 653 | ||
653 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | 654 | struct accept_wait_data { |
655 | struct drbd_tconn *tconn; | ||
656 | struct socket *s_listen; | ||
657 | struct completion door_bell; | ||
658 | void (*original_sk_state_change)(struct sock *sk); | ||
659 | |||
660 | }; | ||
661 | |||
662 | static void drbd_incoming_connection(struct sock *sk) | ||
654 | { | 663 | { |
655 | int timeo, err; | 664 | struct accept_wait_data *ad = sk->sk_user_data; |
656 | struct socket *s_estab = NULL, *s_listen; | 665 | void (*state_change)(struct sock *sk); |
666 | |||
667 | state_change = ad->original_sk_state_change; | ||
668 | if (sk->sk_state == TCP_ESTABLISHED) | ||
669 | complete(&ad->door_bell); | ||
670 | state_change(sk); | ||
671 | } | ||
672 | |||
673 | static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad) | ||
674 | { | ||
675 | int err, sndbuf_size, rcvbuf_size, my_addr_len; | ||
676 | struct sockaddr_in6 my_addr; | ||
677 | struct socket *s_listen; | ||
678 | struct net_conf *nc; | ||
657 | const char *what; | 679 | const char *what; |
658 | 680 | ||
659 | if (!get_net_conf(mdev)) | 681 | rcu_read_lock(); |
660 | return NULL; | 682 | nc = rcu_dereference(tconn->net_conf); |
683 | if (!nc) { | ||
684 | rcu_read_unlock(); | ||
685 | return -EIO; | ||
686 | } | ||
687 | sndbuf_size = nc->sndbuf_size; | ||
688 | rcvbuf_size = nc->rcvbuf_size; | ||
689 | rcu_read_unlock(); | ||
690 | |||
691 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6)); | ||
692 | memcpy(&my_addr, &tconn->my_addr, my_addr_len); | ||
661 | 693 | ||
662 | what = "sock_create_kern"; | 694 | what = "sock_create_kern"; |
663 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 695 | err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, |
664 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | 696 | SOCK_STREAM, IPPROTO_TCP, &s_listen); |
665 | if (err) { | 697 | if (err) { |
666 | s_listen = NULL; | 698 | s_listen = NULL; |
667 | goto out; | 699 | goto out; |
668 | } | 700 | } |
669 | 701 | ||
670 | timeo = mdev->net_conf->try_connect_int * HZ; | 702 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
671 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | 703 | drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); |
672 | |||
673 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | ||
674 | s_listen->sk->sk_rcvtimeo = timeo; | ||
675 | s_listen->sk->sk_sndtimeo = timeo; | ||
676 | drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, | ||
677 | mdev->net_conf->rcvbuf_size); | ||
678 | 704 | ||
679 | what = "bind before listen"; | 705 | what = "bind before listen"; |
680 | err = s_listen->ops->bind(s_listen, | 706 | err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); |
681 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
682 | mdev->net_conf->my_addr_len); | ||
683 | if (err < 0) | 707 | if (err < 0) |
684 | goto out; | 708 | goto out; |
685 | 709 | ||
686 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | 710 | ad->s_listen = s_listen; |
711 | write_lock_bh(&s_listen->sk->sk_callback_lock); | ||
712 | ad->original_sk_state_change = s_listen->sk->sk_state_change; | ||
713 | s_listen->sk->sk_state_change = drbd_incoming_connection; | ||
714 | s_listen->sk->sk_user_data = ad; | ||
715 | write_unlock_bh(&s_listen->sk->sk_callback_lock); | ||
716 | |||
717 | what = "listen"; | ||
718 | err = s_listen->ops->listen(s_listen, 5); | ||
719 | if (err < 0) | ||
720 | goto out; | ||
687 | 721 | ||
722 | return 0; | ||
688 | out: | 723 | out: |
689 | if (s_listen) | 724 | if (s_listen) |
690 | sock_release(s_listen); | 725 | sock_release(s_listen); |
691 | if (err < 0) { | 726 | if (err < 0) { |
692 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | 727 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { |
693 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 728 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
694 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 729 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
695 | } | 730 | } |
696 | } | 731 | } |
697 | put_net_conf(mdev); | ||
698 | 732 | ||
699 | return s_estab; | 733 | return -EIO; |
700 | } | 734 | } |
701 | 735 | ||
702 | static int drbd_send_fp(struct drbd_conf *mdev, | 736 | static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) |
703 | struct socket *sock, enum drbd_packets cmd) | ||
704 | { | 737 | { |
705 | struct p_header80 *h = &mdev->data.sbuf.header.h80; | 738 | write_lock_bh(&sk->sk_callback_lock); |
706 | 739 | sk->sk_state_change = ad->original_sk_state_change; | |
707 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | 740 | sk->sk_user_data = NULL; |
741 | write_unlock_bh(&sk->sk_callback_lock); | ||
708 | } | 742 | } |
709 | 743 | ||
710 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | 744 | static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad) |
711 | { | 745 | { |
712 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 746 | int timeo, connect_int, err = 0; |
713 | int rr; | 747 | struct socket *s_estab = NULL; |
748 | struct net_conf *nc; | ||
749 | |||
750 | rcu_read_lock(); | ||
751 | nc = rcu_dereference(tconn->net_conf); | ||
752 | if (!nc) { | ||
753 | rcu_read_unlock(); | ||
754 | return NULL; | ||
755 | } | ||
756 | connect_int = nc->connect_int; | ||
757 | rcu_read_unlock(); | ||
758 | |||
759 | timeo = connect_int * HZ; | ||
760 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
761 | |||
762 | err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); | ||
763 | if (err <= 0) | ||
764 | return NULL; | ||
765 | |||
766 | err = kernel_accept(ad->s_listen, &s_estab, 0); | ||
767 | if (err < 0) { | ||
768 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
769 | conn_err(tconn, "accept failed, err = %d\n", err); | ||
770 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
771 | } | ||
772 | } | ||
773 | |||
774 | if (s_estab) | ||
775 | unregister_state_change(s_estab->sk, ad); | ||
776 | |||
777 | return s_estab; | ||
778 | } | ||
714 | 779 | ||
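drbd_wait_for_connect() above hooks the listening socket's sk_state_change callback so an incoming connection completes a "door bell" completion, then waits with a jittered timeout before calling kernel_accept(). In userspace the closest analogue is polling the listening fd with a timeout and accepting when it becomes readable; the sketch below shows that analogue only (port and timeout are placeholders, and the jitter is left out):

/* Sketch: wait for an incoming connection with a timeout, then accept.
 * Userspace analogue of the sk_state_change + completion scheme above. */
#include <stdio.h>
#include <poll.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(7789),          /* example port */
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	int one = 1, lfd = socket(AF_INET, SOCK_STREAM, 0);
	int timeout_ms = 10 * 1000;   /* stands in for connect_int, no jitter */
	struct pollfd pfd;

	setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 5);

	pfd.fd = lfd;
	pfd.events = POLLIN;          /* the "door bell": a connection is pending */
	if (poll(&pfd, 1, timeout_ms) > 0) {
		int cfd = accept(lfd, NULL, NULL);
		if (cfd >= 0) {
			puts("peer connected");
			close(cfd);
		}
	} else {
		puts("no connection within timeout");
	}
	close(lfd);
	return 0;
}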
715 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | 780 | static int decode_header(struct drbd_tconn *, void *, struct packet_info *); |
716 | 781 | ||
717 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | 782 | static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock, |
718 | return be16_to_cpu(h->command); | 783 | enum drbd_packet cmd) |
784 | { | ||
785 | if (!conn_prepare_command(tconn, sock)) | ||
786 | return -EIO; | ||
787 | return conn_send_command(tconn, sock, cmd, 0, NULL, 0); | ||
788 | } | ||
719 | 789 | ||
720 | return 0xffff; | 790 | static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock) |
791 | { | ||
792 | unsigned int header_size = drbd_header_size(tconn); | ||
793 | struct packet_info pi; | ||
794 | int err; | ||
795 | |||
796 | err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0); | ||
797 | if (err != header_size) { | ||
798 | if (err >= 0) | ||
799 | err = -EIO; | ||
800 | return err; | ||
801 | } | ||
802 | err = decode_header(tconn, tconn->data.rbuf, &pi); | ||
803 | if (err) | ||
804 | return err; | ||
805 | return pi.cmd; | ||
721 | } | 806 | } |
722 | 807 | ||
723 | /** | 808 | /** |
724 | * drbd_socket_okay() - Free the socket if its connection is not okay | 809 | * drbd_socket_okay() - Free the socket if its connection is not okay |
725 | * @mdev: DRBD device. | ||
726 | * @sock: pointer to the pointer to the socket. | 810 | * @sock: pointer to the pointer to the socket. |
727 | */ | 811 | */ |
728 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | 812 | static int drbd_socket_okay(struct socket **sock) |
729 | { | 813 | { |
730 | int rr; | 814 | int rr; |
731 | char tb[4]; | 815 | char tb[4]; |
@@ -733,7 +817,7 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
733 | if (!*sock) | 817 | if (!*sock) |
734 | return false; | 818 | return false; |
735 | 819 | ||
736 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | 820 | rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); |
737 | 821 | ||
738 | if (rr > 0 || rr == -EAGAIN) { | 822 | if (rr > 0 || rr == -EAGAIN) { |
739 | return true; | 823 | return true; |
@@ -743,6 +827,31 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
743 | return false; | 827 | return false; |
744 | } | 828 | } |
745 | } | 829 | } |
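drbd_socket_okay() above decides whether an already established socket is still usable by peeking at it without blocking: pending data or -EAGAIN means the connection is alive, while 0 (orderly shutdown) or any other error means it should be released. A compact userspace helper using the same MSG_DONTWAIT | MSG_PEEK probe, meant to be dropped into a test program with an already connected fd:

/* Sketch: non-blocking peek to test whether a TCP socket is still alive.
 * Same idea as drbd_socket_okay(): data or EAGAIN => keep it,
 * 0 (orderly shutdown) or another error => drop it. */
#include <errno.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>

static bool socket_okay(int fd)
{
	char tb[4];
	ssize_t rr = recv(fd, tb, sizeof(tb), MSG_DONTWAIT | MSG_PEEK);

	if (rr > 0)
		return true;    /* data waiting: connection is fine */
	if (rr < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return true;    /* merely idle: also fine */
	return false;           /* EOF or a real error */
}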
830 | /* Gets called if a connection is established, or if a new minor gets created | ||
831 | in a connection */ | ||
832 | int drbd_connected(struct drbd_conf *mdev) | ||
833 | { | ||
834 | int err; | ||
835 | |||
836 | atomic_set(&mdev->packet_seq, 0); | ||
837 | mdev->peer_seq = 0; | ||
838 | |||
839 | mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ? | ||
840 | &mdev->tconn->cstate_mutex : | ||
841 | &mdev->own_state_mutex; | ||
842 | |||
843 | err = drbd_send_sync_param(mdev); | ||
844 | if (!err) | ||
845 | err = drbd_send_sizes(mdev, 0, 0); | ||
846 | if (!err) | ||
847 | err = drbd_send_uuids(mdev); | ||
848 | if (!err) | ||
849 | err = drbd_send_current_state(mdev); | ||
850 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
851 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
852 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
853 | return err; | ||
854 | } | ||
746 | 855 | ||
747 | /* | 856 | /* |
748 | * return values: | 857 | * return values: |
@@ -752,232 +861,315 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
752 | * no point in trying again, please go standalone. | 861 | * no point in trying again, please go standalone. |
753 | * -2 We do not have a network config... | 862 | * -2 We do not have a network config... |
754 | */ | 863 | */ |
755 | static int drbd_connect(struct drbd_conf *mdev) | 864 | static int conn_connect(struct drbd_tconn *tconn) |
756 | { | 865 | { |
757 | struct socket *s, *sock, *msock; | 866 | struct drbd_socket sock, msock; |
758 | int try, h, ok; | 867 | struct drbd_conf *mdev; |
868 | struct net_conf *nc; | ||
869 | int vnr, timeout, h, ok; | ||
870 | bool discard_my_data; | ||
759 | enum drbd_state_rv rv; | 871 | enum drbd_state_rv rv; |
872 | struct accept_wait_data ad = { | ||
873 | .tconn = tconn, | ||
874 | .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), | ||
875 | }; | ||
760 | 876 | ||
761 | D_ASSERT(!mdev->data.socket); | 877 | clear_bit(DISCONNECT_SENT, &tconn->flags); |
762 | 878 | if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) | |
763 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
764 | return -2; | 879 | return -2; |
765 | 880 | ||
766 | clear_bit(DISCARD_CONCURRENT, &mdev->flags); | 881 | mutex_init(&sock.mutex); |
882 | sock.sbuf = tconn->data.sbuf; | ||
883 | sock.rbuf = tconn->data.rbuf; | ||
884 | sock.socket = NULL; | ||
885 | mutex_init(&msock.mutex); | ||
886 | msock.sbuf = tconn->meta.sbuf; | ||
887 | msock.rbuf = tconn->meta.rbuf; | ||
888 | msock.socket = NULL; | ||
889 | |||
890 | /* Assume that the peer only understands protocol 80 until we know better. */ | ||
891 | tconn->agreed_pro_version = 80; | ||
767 | 892 | ||
768 | sock = NULL; | 893 | if (prepare_listen_socket(tconn, &ad)) |
769 | msock = NULL; | 894 | return 0; |
770 | 895 | ||
771 | do { | 896 | do { |
772 | for (try = 0;;) { | 897 | struct socket *s; |
773 | /* 3 tries, this should take less than a second! */ | ||
774 | s = drbd_try_connect(mdev); | ||
775 | if (s || ++try >= 3) | ||
776 | break; | ||
777 | /* give the other side time to call bind() & listen() */ | ||
778 | schedule_timeout_interruptible(HZ / 10); | ||
779 | } | ||
780 | 898 | ||
899 | s = drbd_try_connect(tconn); | ||
781 | if (s) { | 900 | if (s) { |
782 | if (!sock) { | 901 | if (!sock.socket) { |
783 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | 902 | sock.socket = s; |
784 | sock = s; | 903 | send_first_packet(tconn, &sock, P_INITIAL_DATA); |
785 | s = NULL; | 904 | } else if (!msock.socket) { |
786 | } else if (!msock) { | 905 | clear_bit(RESOLVE_CONFLICTS, &tconn->flags); |
787 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | 906 | msock.socket = s; |
788 | msock = s; | 907 | send_first_packet(tconn, &msock, P_INITIAL_META); |
789 | s = NULL; | ||
790 | } else { | 908 | } else { |
791 | dev_err(DEV, "Logic error in drbd_connect()\n"); | 909 | conn_err(tconn, "Logic error in conn_connect()\n"); |
792 | goto out_release_sockets; | 910 | goto out_release_sockets; |
793 | } | 911 | } |
794 | } | 912 | } |
795 | 913 | ||
796 | if (sock && msock) { | 914 | if (sock.socket && msock.socket) { |
797 | schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); | 915 | rcu_read_lock(); |
798 | ok = drbd_socket_okay(mdev, &sock); | 916 | nc = rcu_dereference(tconn->net_conf); |
799 | ok = drbd_socket_okay(mdev, &msock) && ok; | 917 | timeout = nc->ping_timeo * HZ / 10; |
918 | rcu_read_unlock(); | ||
919 | schedule_timeout_interruptible(timeout); | ||
920 | ok = drbd_socket_okay(&sock.socket); | ||
921 | ok = drbd_socket_okay(&msock.socket) && ok; | ||
800 | if (ok) | 922 | if (ok) |
801 | break; | 923 | break; |
802 | } | 924 | } |
803 | 925 | ||
804 | retry: | 926 | retry: |
805 | s = drbd_wait_for_connect(mdev); | 927 | s = drbd_wait_for_connect(tconn, &ad); |
806 | if (s) { | 928 | if (s) { |
807 | try = drbd_recv_fp(mdev, s); | 929 | int fp = receive_first_packet(tconn, s); |
808 | drbd_socket_okay(mdev, &sock); | 930 | drbd_socket_okay(&sock.socket); |
809 | drbd_socket_okay(mdev, &msock); | 931 | drbd_socket_okay(&msock.socket); |
810 | switch (try) { | 932 | switch (fp) { |
811 | case P_HAND_SHAKE_S: | 933 | case P_INITIAL_DATA: |
812 | if (sock) { | 934 | if (sock.socket) { |
813 | dev_warn(DEV, "initial packet S crossed\n"); | 935 | conn_warn(tconn, "initial packet S crossed\n"); |
814 | sock_release(sock); | 936 | sock_release(sock.socket); |
937 | sock.socket = s; | ||
938 | goto randomize; | ||
815 | } | 939 | } |
816 | sock = s; | 940 | sock.socket = s; |
817 | break; | 941 | break; |
818 | case P_HAND_SHAKE_M: | 942 | case P_INITIAL_META: |
819 | if (msock) { | 943 | set_bit(RESOLVE_CONFLICTS, &tconn->flags); |
820 | dev_warn(DEV, "initial packet M crossed\n"); | 944 | if (msock.socket) { |
821 | sock_release(msock); | 945 | conn_warn(tconn, "initial packet M crossed\n"); |
946 | sock_release(msock.socket); | ||
947 | msock.socket = s; | ||
948 | goto randomize; | ||
822 | } | 949 | } |
823 | msock = s; | 950 | msock.socket = s; |
824 | set_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
825 | break; | 951 | break; |
826 | default: | 952 | default: |
827 | dev_warn(DEV, "Error receiving initial packet\n"); | 953 | conn_warn(tconn, "Error receiving initial packet\n"); |
828 | sock_release(s); | 954 | sock_release(s); |
955 | randomize: | ||
829 | if (random32() & 1) | 956 | if (random32() & 1) |
830 | goto retry; | 957 | goto retry; |
831 | } | 958 | } |
832 | } | 959 | } |
833 | 960 | ||
834 | if (mdev->state.conn <= C_DISCONNECTING) | 961 | if (tconn->cstate <= C_DISCONNECTING) |
835 | goto out_release_sockets; | 962 | goto out_release_sockets; |
836 | if (signal_pending(current)) { | 963 | if (signal_pending(current)) { |
837 | flush_signals(current); | 964 | flush_signals(current); |
838 | smp_rmb(); | 965 | smp_rmb(); |
839 | if (get_t_state(&mdev->receiver) == Exiting) | 966 | if (get_t_state(&tconn->receiver) == EXITING) |
840 | goto out_release_sockets; | 967 | goto out_release_sockets; |
841 | } | 968 | } |
842 | 969 | ||
843 | if (sock && msock) { | 970 | ok = drbd_socket_okay(&sock.socket); |
844 | ok = drbd_socket_okay(mdev, &sock); | 971 | ok = drbd_socket_okay(&msock.socket) && ok; |
845 | ok = drbd_socket_okay(mdev, &msock) && ok; | 972 | } while (!ok); |
846 | if (ok) | 973 | |
847 | break; | 974 | if (ad.s_listen) |
848 | } | 975 | sock_release(ad.s_listen); |
849 | } while (1); | ||
850 | 976 | ||
851 | msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 977 | sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
852 | sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 978 | msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
853 | 979 | ||
854 | sock->sk->sk_allocation = GFP_NOIO; | 980 | sock.socket->sk->sk_allocation = GFP_NOIO; |
855 | msock->sk->sk_allocation = GFP_NOIO; | 981 | msock.socket->sk->sk_allocation = GFP_NOIO; |
856 | 982 | ||
857 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | 983 | sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; |
858 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | 984 | msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; |
859 | 985 | ||
860 | /* NOT YET ... | 986 | /* NOT YET ... |
861 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 987 | * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; |
862 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 988 | * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
863 | * first set it to the P_HAND_SHAKE timeout, | 989 | * first set it to the P_CONNECTION_FEATURES timeout, |
864 | * which we set to 4x the configured ping_timeout. */ | 990 | * which we set to 4x the configured ping_timeout. */ |
865 | sock->sk->sk_sndtimeo = | 991 | rcu_read_lock(); |
866 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | 992 | nc = rcu_dereference(tconn->net_conf); |
867 | 993 | ||
868 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 994 | sock.socket->sk->sk_sndtimeo = |
869 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | 995 | sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; |
996 | |||
997 | msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; | ||
998 | timeout = nc->timeout * HZ / 10; | ||
999 | discard_my_data = nc->discard_my_data; | ||
1000 | rcu_read_unlock(); | ||
1001 | |||
1002 | msock.socket->sk->sk_sndtimeo = timeout; | ||
870 | 1003 | ||
871 | /* we don't want delays. | 1004 | /* we don't want delays. |
872 | * we use TCP_CORK where appropriate, though */ | 1005 | * we use TCP_CORK where appropriate, though */ |
873 | drbd_tcp_nodelay(sock); | 1006 | drbd_tcp_nodelay(sock.socket); |
874 | drbd_tcp_nodelay(msock); | 1007 | drbd_tcp_nodelay(msock.socket); |
875 | |||
876 | mdev->data.socket = sock; | ||
877 | mdev->meta.socket = msock; | ||
878 | mdev->last_received = jiffies; | ||
879 | 1008 | ||
880 | D_ASSERT(mdev->asender.task == NULL); | 1009 | tconn->data.socket = sock.socket; |
1010 | tconn->meta.socket = msock.socket; | ||
1011 | tconn->last_received = jiffies; | ||
881 | 1012 | ||
882 | h = drbd_do_handshake(mdev); | 1013 | h = drbd_do_features(tconn); |
883 | if (h <= 0) | 1014 | if (h <= 0) |
884 | return h; | 1015 | return h; |
885 | 1016 | ||
886 | if (mdev->cram_hmac_tfm) { | 1017 | if (tconn->cram_hmac_tfm) { |
887 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | 1018 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ |
888 | switch (drbd_do_auth(mdev)) { | 1019 | switch (drbd_do_auth(tconn)) { |
889 | case -1: | 1020 | case -1: |
890 | dev_err(DEV, "Authentication of peer failed\n"); | 1021 | conn_err(tconn, "Authentication of peer failed\n"); |
891 | return -1; | 1022 | return -1; |
892 | case 0: | 1023 | case 0: |
893 | dev_err(DEV, "Authentication of peer failed, trying again.\n"); | 1024 | conn_err(tconn, "Authentication of peer failed, trying again.\n"); |
894 | return 0; | 1025 | return 0; |
895 | } | 1026 | } |
896 | } | 1027 | } |
897 | 1028 | ||
898 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 1029 | tconn->data.socket->sk->sk_sndtimeo = timeout; |
899 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 1030 | tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
900 | 1031 | ||
901 | atomic_set(&mdev->packet_seq, 0); | 1032 | if (drbd_send_protocol(tconn) == -EOPNOTSUPP) |
902 | mdev->peer_seq = 0; | ||
903 | |||
904 | if (drbd_send_protocol(mdev) == -1) | ||
905 | return -1; | 1033 | return -1; |
906 | set_bit(STATE_SENT, &mdev->flags); | ||
907 | drbd_send_sync_param(mdev, &mdev->sync_conf); | ||
908 | drbd_send_sizes(mdev, 0, 0); | ||
909 | drbd_send_uuids(mdev); | ||
910 | drbd_send_current_state(mdev); | ||
911 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
912 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
913 | 1034 | ||
914 | spin_lock_irq(&mdev->req_lock); | 1035 | set_bit(STATE_SENT, &tconn->flags); |
915 | rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); | 1036 | |
916 | if (mdev->state.conn != C_WF_REPORT_PARAMS) | 1037 | rcu_read_lock(); |
917 | clear_bit(STATE_SENT, &mdev->flags); | 1038 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
918 | spin_unlock_irq(&mdev->req_lock); | 1039 | kref_get(&mdev->kref); |
1040 | /* Prevent a race between resync-handshake and | ||
1041 | * being promoted to Primary. | ||
1042 | * | ||
1043 | * Grab and release the state mutex, so we know that any current | ||
1044 | * drbd_set_role() is finished, and any incoming drbd_set_role | ||
1045 | * will see the STATE_SENT flag, and wait for it to be cleared. | ||
1046 | */ | ||
1047 | mutex_lock(mdev->state_mutex); | ||
1048 | mutex_unlock(mdev->state_mutex); | ||
1049 | |||
1050 | rcu_read_unlock(); | ||
1051 | |||
1052 | if (discard_my_data) | ||
1053 | set_bit(DISCARD_MY_DATA, &mdev->flags); | ||
1054 | else | ||
1055 | clear_bit(DISCARD_MY_DATA, &mdev->flags); | ||
1056 | |||
1057 | drbd_connected(mdev); | ||
1058 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1059 | rcu_read_lock(); | ||
1060 | } | ||
1061 | rcu_read_unlock(); | ||
919 | 1062 | ||
920 | if (rv < SS_SUCCESS) | 1063 | rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); |
1064 | if (rv < SS_SUCCESS || tconn->cstate != C_WF_REPORT_PARAMS) { | ||
1065 | clear_bit(STATE_SENT, &tconn->flags); | ||
921 | return 0; | 1066 | return 0; |
1067 | } | ||
922 | 1068 | ||
923 | drbd_thread_start(&mdev->asender); | 1069 | drbd_thread_start(&tconn->asender); |
924 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
925 | 1070 | ||
926 | return 1; | 1071 | mutex_lock(&tconn->conf_update); |
1072 | /* The discard_my_data flag is a single-shot modifier to the next | ||
1073 | * connection attempt, the handshake of which is now well underway. | ||
1074 | * No need for rcu style copying of the whole struct | ||
1075 | * just to clear a single value. */ | ||
1076 | tconn->net_conf->discard_my_data = 0; | ||
1077 | mutex_unlock(&tconn->conf_update); | ||
1078 | |||
1079 | return h; | ||
927 | 1080 | ||
928 | out_release_sockets: | 1081 | out_release_sockets: |
929 | if (sock) | 1082 | if (ad.s_listen) |
930 | sock_release(sock); | 1083 | sock_release(ad.s_listen); |
931 | if (msock) | 1084 | if (sock.socket) |
932 | sock_release(msock); | 1085 | sock_release(sock.socket); |
1086 | if (msock.socket) | ||
1087 | sock_release(msock.socket); | ||
933 | return -1; | 1088 | return -1; |
934 | } | 1089 | } |
935 | 1090 | ||
936 | static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) | 1091 | static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi) |
937 | { | 1092 | { |
938 | union p_header *h = &mdev->data.rbuf.header; | 1093 | unsigned int header_size = drbd_header_size(tconn); |
939 | int r; | 1094 | |
940 | 1095 | if (header_size == sizeof(struct p_header100) && | |
941 | r = drbd_recv(mdev, h, sizeof(*h)); | 1096 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { |
942 | if (unlikely(r != sizeof(*h))) { | 1097 | struct p_header100 *h = header; |
943 | if (!signal_pending(current)) | 1098 | if (h->pad != 0) { |
944 | dev_warn(DEV, "short read expecting header on sock: r=%d\n", r); | 1099 | conn_err(tconn, "Header padding is not zero\n"); |
945 | return false; | 1100 | return -EINVAL; |
946 | } | 1101 | } |
947 | 1102 | pi->vnr = be16_to_cpu(h->volume); | |
948 | if (likely(h->h80.magic == BE_DRBD_MAGIC)) { | 1103 | pi->cmd = be16_to_cpu(h->command); |
949 | *cmd = be16_to_cpu(h->h80.command); | 1104 | pi->size = be32_to_cpu(h->length); |
950 | *packet_size = be16_to_cpu(h->h80.length); | 1105 | } else if (header_size == sizeof(struct p_header95) && |
951 | } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { | 1106 | *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { |
952 | *cmd = be16_to_cpu(h->h95.command); | 1107 | struct p_header95 *h = header; |
953 | *packet_size = be32_to_cpu(h->h95.length); | 1108 | pi->cmd = be16_to_cpu(h->command); |
1109 | pi->size = be32_to_cpu(h->length); | ||
1110 | pi->vnr = 0; | ||
1111 | } else if (header_size == sizeof(struct p_header80) && | ||
1112 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { | ||
1113 | struct p_header80 *h = header; | ||
1114 | pi->cmd = be16_to_cpu(h->command); | ||
1115 | pi->size = be16_to_cpu(h->length); | ||
1116 | pi->vnr = 0; | ||
954 | } else { | 1117 | } else { |
955 | dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n", | 1118 | conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n", |
956 | be32_to_cpu(h->h80.magic), | 1119 | be32_to_cpu(*(__be32 *)header), |
957 | be16_to_cpu(h->h80.command), | 1120 | tconn->agreed_pro_version); |
958 | be16_to_cpu(h->h80.length)); | 1121 | return -EINVAL; |
959 | return false; | ||
960 | } | 1122 | } |
961 | mdev->last_received = jiffies; | 1123 | pi->data = header + header_size; |
1124 | return 0; | ||
1125 | } | ||
962 | 1126 | ||
963 | return true; | 1127 | static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi) |
1128 | { | ||
1129 | void *buffer = tconn->data.rbuf; | ||
1130 | int err; | ||
1131 | |||
1132 | err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn)); | ||
1133 | if (err) | ||
1134 | return err; | ||
1135 | |||
1136 | err = decode_header(tconn, buffer, pi); | ||
1137 | tconn->last_received = jiffies; | ||
1138 | |||
1139 | return err; | ||
964 | } | 1140 | } |
965 | 1141 | ||
966 | static void drbd_flush(struct drbd_conf *mdev) | 1142 | static void drbd_flush(struct drbd_tconn *tconn) |
967 | { | 1143 | { |
968 | int rv; | 1144 | int rv; |
1145 | struct drbd_conf *mdev; | ||
1146 | int vnr; | ||
969 | 1147 | ||
970 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | 1148 | if (tconn->write_ordering >= WO_bdev_flush) { |
971 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, | 1149 | rcu_read_lock(); |
972 | NULL); | 1150 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
973 | if (rv) { | 1151 | if (!get_ldev(mdev)) |
974 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | 1152 | continue; |
975 | /* would rather check on EOPNOTSUPP, but that is not reliable. | 1153 | kref_get(&mdev->kref); |
976 | * don't try again for ANY return value != 0 | 1154 | rcu_read_unlock(); |
977 | * if (rv == -EOPNOTSUPP) */ | 1155 | |
978 | drbd_bump_write_ordering(mdev, WO_drain_io); | 1156 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, |
1157 | GFP_NOIO, NULL); | ||
1158 | if (rv) { | ||
1159 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | ||
1160 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
1161 | * don't try again for ANY return value != 0 | ||
1162 | * if (rv == -EOPNOTSUPP) */ | ||
1163 | drbd_bump_write_ordering(tconn, WO_drain_io); | ||
1164 | } | ||
1165 | put_ldev(mdev); | ||
1166 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1167 | |||
1168 | rcu_read_lock(); | ||
1169 | if (rv) | ||
1170 | break; | ||
979 | } | 1171 | } |
980 | put_ldev(mdev); | 1172 | rcu_read_unlock(); |
981 | } | 1173 | } |
982 | } | 1174 | } |
983 | 1175 | ||
@@ -987,7 +1179,7 @@ static void drbd_flush(struct drbd_conf *mdev) | |||
987 | * @epoch: Epoch object. | 1179 | * @epoch: Epoch object. |
988 | * @ev: Epoch event. | 1180 | * @ev: Epoch event. |
989 | */ | 1181 | */ |
990 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | 1182 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, |
991 | struct drbd_epoch *epoch, | 1183 | struct drbd_epoch *epoch, |
992 | enum epoch_event ev) | 1184 | enum epoch_event ev) |
993 | { | 1185 | { |
@@ -995,7 +1187,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
995 | struct drbd_epoch *next_epoch; | 1187 | struct drbd_epoch *next_epoch; |
996 | enum finish_epoch rv = FE_STILL_LIVE; | 1188 | enum finish_epoch rv = FE_STILL_LIVE; |
997 | 1189 | ||
998 | spin_lock(&mdev->epoch_lock); | 1190 | spin_lock(&tconn->epoch_lock); |
999 | do { | 1191 | do { |
1000 | next_epoch = NULL; | 1192 | next_epoch = NULL; |
1001 | 1193 | ||
@@ -1017,18 +1209,22 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1017 | atomic_read(&epoch->active) == 0 && | 1209 | atomic_read(&epoch->active) == 0 && |
1018 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { | 1210 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { |
1019 | if (!(ev & EV_CLEANUP)) { | 1211 | if (!(ev & EV_CLEANUP)) { |
1020 | spin_unlock(&mdev->epoch_lock); | 1212 | spin_unlock(&tconn->epoch_lock); |
1021 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | 1213 | drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size); |
1022 | spin_lock(&mdev->epoch_lock); | 1214 | spin_lock(&tconn->epoch_lock); |
1023 | } | 1215 | } |
1216 | #if 0 | ||
1217 | /* FIXME: dec unacked on connection, once we have | ||
1218 | * something to count pending connection packets in. */ | ||
1024 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) | 1219 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) |
1025 | dec_unacked(mdev); | 1220 | dec_unacked(epoch->tconn); |
1221 | #endif | ||
1026 | 1222 | ||
1027 | if (mdev->current_epoch != epoch) { | 1223 | if (tconn->current_epoch != epoch) { |
1028 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | 1224 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); |
1029 | list_del(&epoch->list); | 1225 | list_del(&epoch->list); |
1030 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | 1226 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); |
1031 | mdev->epochs--; | 1227 | tconn->epochs--; |
1032 | kfree(epoch); | 1228 | kfree(epoch); |
1033 | 1229 | ||
1034 | if (rv == FE_STILL_LIVE) | 1230 | if (rv == FE_STILL_LIVE) |
@@ -1039,7 +1235,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1039 | /* atomic_set(&epoch->active, 0); is already zero */ | 1235 | /* atomic_set(&epoch->active, 0); is already zero */ |
1040 | if (rv == FE_STILL_LIVE) | 1236 | if (rv == FE_STILL_LIVE) |
1041 | rv = FE_RECYCLED; | 1237 | rv = FE_RECYCLED; |
1042 | wake_up(&mdev->ee_wait); | ||
1043 | } | 1238 | } |
1044 | } | 1239 | } |
1045 | 1240 | ||
@@ -1049,40 +1244,52 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1049 | epoch = next_epoch; | 1244 | epoch = next_epoch; |
1050 | } while (1); | 1245 | } while (1); |
1051 | 1246 | ||
1052 | spin_unlock(&mdev->epoch_lock); | 1247 | spin_unlock(&tconn->epoch_lock); |
1053 | 1248 | ||
1054 | return rv; | 1249 | return rv; |
1055 | } | 1250 | } |
1056 | 1251 | ||
1057 | /** | 1252 | /** |
1058 | * drbd_bump_write_ordering() - Fall back to another write ordering method | 1253 | * drbd_bump_write_ordering() - Fall back to another write ordering method |
1059 | * @mdev: DRBD device. | 1254 | * @tconn: DRBD connection. |
1060 | * @wo: Write ordering method to try. | 1255 | * @wo: Write ordering method to try. |
1061 | */ | 1256 | */ |
1062 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | 1257 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo) |
1063 | { | 1258 | { |
1259 | struct disk_conf *dc; | ||
1260 | struct drbd_conf *mdev; | ||
1064 | enum write_ordering_e pwo; | 1261 | enum write_ordering_e pwo; |
1262 | int vnr; | ||
1065 | static char *write_ordering_str[] = { | 1263 | static char *write_ordering_str[] = { |
1066 | [WO_none] = "none", | 1264 | [WO_none] = "none", |
1067 | [WO_drain_io] = "drain", | 1265 | [WO_drain_io] = "drain", |
1068 | [WO_bdev_flush] = "flush", | 1266 | [WO_bdev_flush] = "flush", |
1069 | }; | 1267 | }; |
1070 | 1268 | ||
1071 | pwo = mdev->write_ordering; | 1269 | pwo = tconn->write_ordering; |
1072 | wo = min(pwo, wo); | 1270 | wo = min(pwo, wo); |
1073 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | 1271 | rcu_read_lock(); |
1074 | wo = WO_drain_io; | 1272 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
1075 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | 1273 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
1076 | wo = WO_none; | 1274 | continue; |
1077 | mdev->write_ordering = wo; | 1275 | dc = rcu_dereference(mdev->ldev->disk_conf); |
1078 | if (pwo != mdev->write_ordering || wo == WO_bdev_flush) | 1276 | |
1079 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | 1277 | if (wo == WO_bdev_flush && !dc->disk_flushes) |
1278 | wo = WO_drain_io; | ||
1279 | if (wo == WO_drain_io && !dc->disk_drain) | ||
1280 | wo = WO_none; | ||
1281 | put_ldev(mdev); | ||
1282 | } | ||
1283 | rcu_read_unlock(); | ||
1284 | tconn->write_ordering = wo; | ||
1285 | if (pwo != tconn->write_ordering || wo == WO_bdev_flush) | ||
1286 | conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]); | ||
1080 | } | 1287 | } |
1081 | 1288 | ||
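drbd_bump_write_ordering() above only ever moves to a weaker method: the requested ordering is clamped against the current one with min(), then lowered further if any attached disk has flushes or drains disabled in its configuration. A small standalone sketch of that monotone downgrade; the enum and config fields mirror the idea but are illustrative, not the driver's types:

/* Sketch: monotone downgrade of the write-ordering method.
 * Enum order encodes strength; we only ever step down, never back up. */
#include <stdio.h>
#include <stdbool.h>

enum write_ordering { WO_NONE, WO_DRAIN_IO, WO_BDEV_FLUSH };

struct disk_cfg { bool disk_flushes; bool disk_drain; };

static enum write_ordering
bump_write_ordering(enum write_ordering cur, enum write_ordering want,
		    const struct disk_cfg *disks, int n)
{
	enum write_ordering wo = want < cur ? want : cur; /* min(): never upgrade */
	int i;

	for (i = 0; i < n; i++) {
		if (wo == WO_BDEV_FLUSH && !disks[i].disk_flushes)
			wo = WO_DRAIN_IO;
		if (wo == WO_DRAIN_IO && !disks[i].disk_drain)
			wo = WO_NONE;
	}
	return wo;
}

int main(void)
{
	struct disk_cfg disks[] = { { .disk_flushes = false, .disk_drain = true } };
	static const char *names[] = { "none", "drain", "flush" };

	printf("method: %s\n",
	       names[bump_write_ordering(WO_BDEV_FLUSH, WO_BDEV_FLUSH, disks, 1)]);
	/* prints "drain": this disk has flushes disabled */
	return 0;
}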
1082 | /** | 1289 | /** |
1083 | * drbd_submit_ee() | 1290 | * drbd_submit_peer_request() |
1084 | * @mdev: DRBD device. | 1291 | * @mdev: DRBD device. |
1085 | * @e: epoch entry | 1292 | * @peer_req: peer request |
1086 | * @rw: flag field, see bio->bi_rw | 1293 | * @rw: flag field, see bio->bi_rw |
1087 | * | 1294 | * |
1088 | * May spread the pages to multiple bios, | 1295 | * May spread the pages to multiple bios, |
@@ -1096,14 +1303,15 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1096 | * on certain Xen deployments. | 1303 | * on certain Xen deployments. |
1097 | */ | 1304 | */ |
1098 | /* TODO allocate from our own bio_set. */ | 1305 | /* TODO allocate from our own bio_set. */ |
1099 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1306 | int drbd_submit_peer_request(struct drbd_conf *mdev, |
1100 | const unsigned rw, const int fault_type) | 1307 | struct drbd_peer_request *peer_req, |
1308 | const unsigned rw, const int fault_type) | ||
1101 | { | 1309 | { |
1102 | struct bio *bios = NULL; | 1310 | struct bio *bios = NULL; |
1103 | struct bio *bio; | 1311 | struct bio *bio; |
1104 | struct page *page = e->pages; | 1312 | struct page *page = peer_req->pages; |
1105 | sector_t sector = e->sector; | 1313 | sector_t sector = peer_req->i.sector; |
1106 | unsigned ds = e->size; | 1314 | unsigned ds = peer_req->i.size; |
1107 | unsigned n_bios = 0; | 1315 | unsigned n_bios = 0; |
1108 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; | 1316 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; |
1109 | int err = -ENOMEM; | 1317 | int err = -ENOMEM; |
@@ -1122,12 +1330,12 @@ next_bio: | |||
1122 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); | 1330 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); |
1123 | goto fail; | 1331 | goto fail; |
1124 | } | 1332 | } |
1125 | /* > e->sector, unless this is the first bio */ | 1333 | /* > peer_req->i.sector, unless this is the first bio */ |
1126 | bio->bi_sector = sector; | 1334 | bio->bi_sector = sector; |
1127 | bio->bi_bdev = mdev->ldev->backing_bdev; | 1335 | bio->bi_bdev = mdev->ldev->backing_bdev; |
1128 | bio->bi_rw = rw; | 1336 | bio->bi_rw = rw; |
1129 | bio->bi_private = e; | 1337 | bio->bi_private = peer_req; |
1130 | bio->bi_end_io = drbd_endio_sec; | 1338 | bio->bi_end_io = drbd_peer_request_endio; |
1131 | 1339 | ||
1132 | bio->bi_next = bios; | 1340 | bio->bi_next = bios; |
1133 | bios = bio; | 1341 | bios = bio; |
@@ -1156,7 +1364,7 @@ next_bio: | |||
1156 | D_ASSERT(page == NULL); | 1364 | D_ASSERT(page == NULL); |
1157 | D_ASSERT(ds == 0); | 1365 | D_ASSERT(ds == 0); |
1158 | 1366 | ||
1159 | atomic_set(&e->pending_bios, n_bios); | 1367 | atomic_set(&peer_req->pending_bios, n_bios); |
1160 | do { | 1368 | do { |
1161 | bio = bios; | 1369 | bio = bios; |
1162 | bios = bios->bi_next; | 1370 | bios = bios->bi_next; |
@@ -1175,26 +1383,57 @@ fail: | |||
1175 | return err; | 1383 | return err; |
1176 | } | 1384 | } |
1177 | 1385 | ||
1178 | static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1386 | static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev, |
1387 | struct drbd_peer_request *peer_req) | ||
1388 | { | ||
1389 | struct drbd_interval *i = &peer_req->i; | ||
1390 | |||
1391 | drbd_remove_interval(&mdev->write_requests, i); | ||
1392 | drbd_clear_interval(i); | ||
1393 | |||
1394 | /* Wake up any processes waiting for this peer request to complete. */ | ||
1395 | if (i->waiting) | ||
1396 | wake_up(&mdev->misc_wait); | ||
1397 | } | ||
1398 | |||
1399 | void conn_wait_active_ee_empty(struct drbd_tconn *tconn) | ||
1400 | { | ||
1401 | struct drbd_conf *mdev; | ||
1402 | int vnr; | ||
1403 | |||
1404 | rcu_read_lock(); | ||
1405 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1406 | kref_get(&mdev->kref); | ||
1407 | rcu_read_unlock(); | ||
1408 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1409 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1410 | rcu_read_lock(); | ||
1411 | } | ||
1412 | rcu_read_unlock(); | ||
1413 | } | ||
1414 | |||
1415 | static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1179 | { | 1416 | { |
1180 | int rv; | 1417 | int rv; |
1181 | struct p_barrier *p = &mdev->data.rbuf.barrier; | 1418 | struct p_barrier *p = pi->data; |
1182 | struct drbd_epoch *epoch; | 1419 | struct drbd_epoch *epoch; |
1183 | 1420 | ||
1184 | inc_unacked(mdev); | 1421 | /* FIXME these are unacked on connection, |
1185 | 1422 | * not a specific (peer)device. | |
1186 | mdev->current_epoch->barrier_nr = p->barrier; | 1423 | */ |
1187 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | 1424 | tconn->current_epoch->barrier_nr = p->barrier; |
1425 | tconn->current_epoch->tconn = tconn; | ||
1426 | rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR); | ||
1188 | 1427 | ||
1189 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | 1428 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from |
1190 | * the activity log, which means it would not be resynced in case the | 1429 | * the activity log, which means it would not be resynced in case the |
1191 | * R_PRIMARY crashes now. | 1430 | * R_PRIMARY crashes now. |
1192 | * Therefore we must send the barrier_ack after the barrier request was | 1431 | * Therefore we must send the barrier_ack after the barrier request was |
1193 | * completed. */ | 1432 | * completed. */ |
1194 | switch (mdev->write_ordering) { | 1433 | switch (tconn->write_ordering) { |
1195 | case WO_none: | 1434 | case WO_none: |
1196 | if (rv == FE_RECYCLED) | 1435 | if (rv == FE_RECYCLED) |
1197 | return true; | 1436 | return 0; |
1198 | 1437 | ||
1199 | /* receiver context, in the writeout path of the other node. | 1438 | /* receiver context, in the writeout path of the other node. |
1200 | * avoid potential distributed deadlock */ | 1439 | * avoid potential distributed deadlock */ |
@@ -1202,81 +1441,75 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign | |||
1202 | if (epoch) | 1441 | if (epoch) |
1203 | break; | 1442 | break; |
1204 | else | 1443 | else |
1205 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | 1444 | conn_warn(tconn, "Allocation of an epoch failed, slowing down\n"); |
1206 | /* Fall through */ | 1445 | /* Fall through */ |
1207 | 1446 | ||
1208 | case WO_bdev_flush: | 1447 | case WO_bdev_flush: |
1209 | case WO_drain_io: | 1448 | case WO_drain_io: |
1210 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 1449 | conn_wait_active_ee_empty(tconn); |
1211 | drbd_flush(mdev); | 1450 | drbd_flush(tconn); |
1212 | 1451 | ||
1213 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1452 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1214 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | 1453 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); |
1215 | if (epoch) | 1454 | if (epoch) |
1216 | break; | 1455 | break; |
1217 | } | 1456 | } |
1218 | 1457 | ||
1219 | epoch = mdev->current_epoch; | 1458 | return 0; |
1220 | wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0); | ||
1221 | |||
1222 | D_ASSERT(atomic_read(&epoch->active) == 0); | ||
1223 | D_ASSERT(epoch->flags == 0); | ||
1224 | |||
1225 | return true; | ||
1226 | default: | 1459 | default: |
1227 | dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); | 1460 | conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering); |
1228 | return false; | 1461 | return -EIO; |
1229 | } | 1462 | } |
1230 | 1463 | ||
1231 | epoch->flags = 0; | 1464 | epoch->flags = 0; |
1232 | atomic_set(&epoch->epoch_size, 0); | 1465 | atomic_set(&epoch->epoch_size, 0); |
1233 | atomic_set(&epoch->active, 0); | 1466 | atomic_set(&epoch->active, 0); |
1234 | 1467 | ||
1235 | spin_lock(&mdev->epoch_lock); | 1468 | spin_lock(&tconn->epoch_lock); |
1236 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1469 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1237 | list_add(&epoch->list, &mdev->current_epoch->list); | 1470 | list_add(&epoch->list, &tconn->current_epoch->list); |
1238 | mdev->current_epoch = epoch; | 1471 | tconn->current_epoch = epoch; |
1239 | mdev->epochs++; | 1472 | tconn->epochs++; |
1240 | } else { | 1473 | } else { |
1241 | /* The current_epoch got recycled while we allocated this one... */ | 1474 | /* The current_epoch got recycled while we allocated this one... */ |
1242 | kfree(epoch); | 1475 | kfree(epoch); |
1243 | } | 1476 | } |
1244 | spin_unlock(&mdev->epoch_lock); | 1477 | spin_unlock(&tconn->epoch_lock); |
1245 | 1478 | ||
1246 | return true; | 1479 | return 0; |
1247 | } | 1480 | } |
1248 | 1481 | ||
1249 | /* used from receive_RSDataReply (recv_resync_read) | 1482 | /* used from receive_RSDataReply (recv_resync_read) |
1250 | * and from receive_Data */ | 1483 | * and from receive_Data */ |
1251 | static struct drbd_epoch_entry * | 1484 | static struct drbd_peer_request * |
1252 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | 1485 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, |
1486 | int data_size) __must_hold(local) | ||
1253 | { | 1487 | { |
1254 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 1488 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
1255 | struct drbd_epoch_entry *e; | 1489 | struct drbd_peer_request *peer_req; |
1256 | struct page *page; | 1490 | struct page *page; |
1257 | int dgs, ds, rr; | 1491 | int dgs, ds, err; |
1258 | void *dig_in = mdev->int_dig_in; | 1492 | void *dig_in = mdev->tconn->int_dig_in; |
1259 | void *dig_vv = mdev->int_dig_vv; | 1493 | void *dig_vv = mdev->tconn->int_dig_vv; |
1260 | unsigned long *data; | 1494 | unsigned long *data; |
1261 | 1495 | ||
1262 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1496 | dgs = 0; |
1263 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1497 | if (mdev->tconn->peer_integrity_tfm) { |
1264 | 1498 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); | |
1265 | if (dgs) { | 1499 | /* |
1266 | rr = drbd_recv(mdev, dig_in, dgs); | 1500 | * FIXME: Receive the incoming digest into the receive buffer |
1267 | if (rr != dgs) { | 1501 | * here, together with its struct p_data? |
1268 | if (!signal_pending(current)) | 1502 | */ |
1269 | dev_warn(DEV, | 1503 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1270 | "short read receiving data digest: read %d expected %d\n", | 1504 | if (err) |
1271 | rr, dgs); | ||
1272 | return NULL; | 1505 | return NULL; |
1273 | } | 1506 | data_size -= dgs; |
1274 | } | 1507 | } |
1275 | 1508 | ||
1276 | data_size -= dgs; | 1509 | if (!expect(IS_ALIGNED(data_size, 512))) |
1277 | 1510 | return NULL; | |
1278 | ERR_IF(data_size & 0x1ff) return NULL; | 1511 | if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) |
1279 | ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; | 1512 | return NULL; |
1280 | 1513 | ||
1281 | /* even though we trust our peer, | 1514 | /* even though we trust our peer, |
1282 | * we sometimes have to double check. */ | 1515 | * we sometimes have to double check. */ |
@@ -1291,47 +1524,42 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1291 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 1524 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
1292 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 1525 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
1293 | * which in turn might block on the other node at this very place. */ | 1526 | * which in turn might block on the other node at this very place. */ |
1294 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | 1527 | peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO); |
1295 | if (!e) | 1528 | if (!peer_req) |
1296 | return NULL; | 1529 | return NULL; |
1297 | 1530 | ||
1298 | if (!data_size) | 1531 | if (!data_size) |
1299 | return e; | 1532 | return peer_req; |
1300 | 1533 | ||
1301 | ds = data_size; | 1534 | ds = data_size; |
1302 | page = e->pages; | 1535 | page = peer_req->pages; |
1303 | page_chain_for_each(page) { | 1536 | page_chain_for_each(page) { |
1304 | unsigned len = min_t(int, ds, PAGE_SIZE); | 1537 | unsigned len = min_t(int, ds, PAGE_SIZE); |
1305 | data = kmap(page); | 1538 | data = kmap(page); |
1306 | rr = drbd_recv(mdev, data, len); | 1539 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1307 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { | 1540 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { |
1308 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); | 1541 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); |
1309 | data[0] = data[0] ^ (unsigned long)-1; | 1542 | data[0] = data[0] ^ (unsigned long)-1; |
1310 | } | 1543 | } |
1311 | kunmap(page); | 1544 | kunmap(page); |
1312 | if (rr != len) { | 1545 | if (err) { |
1313 | drbd_free_ee(mdev, e); | 1546 | drbd_free_peer_req(mdev, peer_req); |
1314 | if (!signal_pending(current)) | ||
1315 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1316 | rr, len); | ||
1317 | return NULL; | 1547 | return NULL; |
1318 | } | 1548 | } |
1319 | ds -= rr; | 1549 | ds -= len; |
1320 | } | 1550 | } |
1321 | 1551 | ||
1322 | if (dgs) { | 1552 | if (dgs) { |
1323 | drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); | 1553 | drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv); |
1324 | if (memcmp(dig_in, dig_vv, dgs)) { | 1554 | if (memcmp(dig_in, dig_vv, dgs)) { |
1325 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", | 1555 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", |
1326 | (unsigned long long)sector, data_size); | 1556 | (unsigned long long)sector, data_size); |
1327 | drbd_bcast_ee(mdev, "digest failed", | 1557 | drbd_free_peer_req(mdev, peer_req); |
1328 | dgs, dig_in, dig_vv, e); | ||
1329 | drbd_free_ee(mdev, e); | ||
1330 | return NULL; | 1558 | return NULL; |
1331 | } | 1559 | } |
1332 | } | 1560 | } |
1333 | mdev->recv_cnt += data_size>>9; | 1561 | mdev->recv_cnt += data_size>>9; |
1334 | return e; | 1562 | return peer_req; |
1335 | } | 1563 | } |
1336 | 1564 | ||
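The rewritten read_in_block() above settles into a fixed receive pattern: optionally pull the peer's digest off the wire, pull the payload in page-sized chunks, then recompute the digest over what actually arrived and compare. A minimal userspace sketch of that pattern, assuming a hypothetical recv_all() helper and a trivial XOR checksum as a stand-in for the kernel crypto hash:

#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: keep reading until 'len' bytes arrived or an error. */
static int recv_all(int fd, void *buf, size_t len)
{
        char *p = buf;
        while (len) {
                ssize_t n = read(fd, p, len);
                if (n <= 0)
                        return -1;      /* short read or error: caller gives up */
                p += n;
                len -= (size_t)n;
        }
        return 0;
}

/* Stand-in for the crypto hash: XOR all payload bytes together. */
static uint8_t toy_digest(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint8_t d = 0;
        while (len--)
                d ^= *p++;
        return d;
}

/* Receive digest first, then payload, then verify, as read_in_block() does. */
static int recv_and_verify(int fd, void *payload, size_t size)
{
        uint8_t dig_in, dig_vv;

        if (recv_all(fd, &dig_in, sizeof(dig_in)))
                return -1;
        if (recv_all(fd, payload, size))
                return -1;
        dig_vv = toy_digest(payload, size);
        return memcmp(&dig_in, &dig_vv, sizeof(dig_in)) ? -1 : 0;
}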
1337 | /* drbd_drain_block() just takes a data block | 1565 | /* drbd_drain_block() just takes a data block |
@@ -1340,30 +1568,26 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1340 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | 1568 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) |
1341 | { | 1569 | { |
1342 | struct page *page; | 1570 | struct page *page; |
1343 | int rr, rv = 1; | 1571 | int err = 0; |
1344 | void *data; | 1572 | void *data; |
1345 | 1573 | ||
1346 | if (!data_size) | 1574 | if (!data_size) |
1347 | return true; | 1575 | return 0; |
1348 | 1576 | ||
1349 | page = drbd_pp_alloc(mdev, 1, 1); | 1577 | page = drbd_alloc_pages(mdev, 1, 1); |
1350 | 1578 | ||
1351 | data = kmap(page); | 1579 | data = kmap(page); |
1352 | while (data_size) { | 1580 | while (data_size) { |
1353 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | 1581 | unsigned int len = min_t(int, data_size, PAGE_SIZE); |
1354 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | 1582 | |
1355 | rv = 0; | 1583 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1356 | if (!signal_pending(current)) | 1584 | if (err) |
1357 | dev_warn(DEV, | ||
1358 | "short read receiving data: read %d expected %d\n", | ||
1359 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1360 | break; | 1585 | break; |
1361 | } | 1586 | data_size -= len; |
1362 | data_size -= rr; | ||
1363 | } | 1587 | } |
1364 | kunmap(page); | 1588 | kunmap(page); |
1365 | drbd_pp_free(mdev, page, 0); | 1589 | drbd_free_pages(mdev, page, 0); |
1366 | return rv; | 1590 | return err; |
1367 | } | 1591 | } |
1368 | 1592 | ||
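drbd_drain_block() above throws away a payload it cannot store, reusing a single scratch page rather than sizing a buffer to the whole block. The same shape as a hedged userspace sketch, where plain read() on a file descriptor stands in for drbd_recv_all_warn():

#include <stddef.h>
#include <unistd.h>

#define SCRATCH_SIZE 4096       /* one "page" of throw-away buffer */

/* Read and discard exactly 'size' bytes from fd; 0 on success, -1 on error. */
static int drain_bytes(int fd, size_t size)
{
        char scratch[SCRATCH_SIZE];

        while (size) {
                size_t want = size < SCRATCH_SIZE ? size : SCRATCH_SIZE;
                ssize_t got = read(fd, scratch, want);
                if (got <= 0)
                        return -1;
                size -= (size_t)got;    /* data is simply thrown away */
        }
        return 0;
}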
1369 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | 1593 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, |
@@ -1371,26 +1595,19 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1371 | { | 1595 | { |
1372 | struct bio_vec *bvec; | 1596 | struct bio_vec *bvec; |
1373 | struct bio *bio; | 1597 | struct bio *bio; |
1374 | int dgs, rr, i, expect; | 1598 | int dgs, err, i, expect; |
1375 | void *dig_in = mdev->int_dig_in; | 1599 | void *dig_in = mdev->tconn->int_dig_in; |
1376 | void *dig_vv = mdev->int_dig_vv; | 1600 | void *dig_vv = mdev->tconn->int_dig_vv; |
1377 | |||
1378 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1379 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1380 | 1601 | ||
1381 | if (dgs) { | 1602 | dgs = 0; |
1382 | rr = drbd_recv(mdev, dig_in, dgs); | 1603 | if (mdev->tconn->peer_integrity_tfm) { |
1383 | if (rr != dgs) { | 1604 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
1384 | if (!signal_pending(current)) | 1605 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1385 | dev_warn(DEV, | 1606 | if (err) |
1386 | "short read receiving data reply digest: read %d expected %d\n", | 1607 | return err; |
1387 | rr, dgs); | 1608 | data_size -= dgs; |
1388 | return 0; | ||
1389 | } | ||
1390 | } | 1609 | } |
1391 | 1610 | ||
1392 | data_size -= dgs; | ||
1393 | |||
1394 | /* optimistically update recv_cnt. if receiving fails below, | 1611 | /* optimistically update recv_cnt. if receiving fails below, |
1395 | * we disconnect anyways, and counters will be reset. */ | 1612 | * we disconnect anyways, and counters will be reset. */ |
1396 | mdev->recv_cnt += data_size>>9; | 1613 | mdev->recv_cnt += data_size>>9; |
@@ -1399,63 +1616,61 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1399 | D_ASSERT(sector == bio->bi_sector); | 1616 | D_ASSERT(sector == bio->bi_sector); |
1400 | 1617 | ||
1401 | bio_for_each_segment(bvec, bio, i) { | 1618 | bio_for_each_segment(bvec, bio, i) { |
1619 | void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; | ||
1402 | expect = min_t(int, data_size, bvec->bv_len); | 1620 | expect = min_t(int, data_size, bvec->bv_len); |
1403 | rr = drbd_recv(mdev, | 1621 | err = drbd_recv_all_warn(mdev->tconn, mapped, expect); |
1404 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1405 | expect); | ||
1406 | kunmap(bvec->bv_page); | 1622 | kunmap(bvec->bv_page); |
1407 | if (rr != expect) { | 1623 | if (err) |
1408 | if (!signal_pending(current)) | 1624 | return err; |
1409 | dev_warn(DEV, "short read receiving data reply: " | 1625 | data_size -= expect; |
1410 | "read %d expected %d\n", | ||
1411 | rr, expect); | ||
1412 | return 0; | ||
1413 | } | ||
1414 | data_size -= rr; | ||
1415 | } | 1626 | } |
1416 | 1627 | ||
1417 | if (dgs) { | 1628 | if (dgs) { |
1418 | drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); | 1629 | drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv); |
1419 | if (memcmp(dig_in, dig_vv, dgs)) { | 1630 | if (memcmp(dig_in, dig_vv, dgs)) { |
1420 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | 1631 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); |
1421 | return 0; | 1632 | return -EINVAL; |
1422 | } | 1633 | } |
1423 | } | 1634 | } |
1424 | 1635 | ||
1425 | D_ASSERT(data_size == 0); | 1636 | D_ASSERT(data_size == 0); |
1426 | return 1; | 1637 | return 0; |
1427 | } | 1638 | } |
1428 | 1639 | ||
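A change that runs through this whole hunk is the return convention: recv_dless_read(), drbd_drain_block() and the e_end_*() callbacks used to return a boolean "ok" and now return 0 on success or a negative errno, so callers can propagate the cause instead of guessing it. A tiny illustration with made-up helper names:

#include <errno.h>

/* Old style: 1 = ok, 0 = failed; the reason for the failure is lost. */
static int old_style_read(int have_data)
{
        return have_data ? 1 : 0;
}

/* New style: 0 = ok, negative errno = failed; the cause travels with it. */
static int new_style_read(int have_data)
{
        return have_data ? 0 : -EIO;
}

static int old_caller(int have_data)
{
        if (!old_style_read(have_data))
                return -EIO;    /* best guess: the real reason was lost */
        return 0;
}

static int new_caller(int have_data)
{
        int err = new_style_read(have_data);
        if (err)
                return err;     /* just propagate, no translation needed */
        return 0;
}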
1429 | /* e_end_resync_block() is called via | 1640 | /* |
1430 | * drbd_process_done_ee() by asender only */ | 1641 | * e_end_resync_block() is called in asender context via |
1431 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1642 | * drbd_finish_peer_reqs(). |
1643 | */ | ||
1644 | static int e_end_resync_block(struct drbd_work *w, int unused) | ||
1432 | { | 1645 | { |
1433 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1646 | struct drbd_peer_request *peer_req = |
1434 | sector_t sector = e->sector; | 1647 | container_of(w, struct drbd_peer_request, w); |
1435 | int ok; | 1648 | struct drbd_conf *mdev = w->mdev; |
1649 | sector_t sector = peer_req->i.sector; | ||
1650 | int err; | ||
1436 | 1651 | ||
1437 | D_ASSERT(hlist_unhashed(&e->collision)); | 1652 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
1438 | 1653 | ||
1439 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1654 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1440 | drbd_set_in_sync(mdev, sector, e->size); | 1655 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1441 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | 1656 | err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req); |
1442 | } else { | 1657 | } else { |
1443 | /* Record failure to sync */ | 1658 | /* Record failure to sync */ |
1444 | drbd_rs_failed_io(mdev, sector, e->size); | 1659 | drbd_rs_failed_io(mdev, sector, peer_req->i.size); |
1445 | 1660 | ||
1446 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1661 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1447 | } | 1662 | } |
1448 | dec_unacked(mdev); | 1663 | dec_unacked(mdev); |
1449 | 1664 | ||
1450 | return ok; | 1665 | return err; |
1451 | } | 1666 | } |
1452 | 1667 | ||
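e_end_resync_block() used to cast the work pointer straight to struct drbd_epoch_entry; the new code recovers the surrounding struct drbd_peer_request from its embedded work member with container_of(). A self-contained sketch of that idiom with toy struct names (not the DRBD ones):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work {
        int (*cb)(struct work *w);
};

struct peer_request {
        unsigned long sector;
        struct work w;          /* embedded, not necessarily the first member */
};

static int done(struct work *w)
{
        /* Recover the surrounding object from the member pointer. */
        struct peer_request *req = container_of(w, struct peer_request, w);

        printf("completed request at sector %lu\n", req->sector);
        return 0;
}

int main(void)
{
        struct peer_request req = { .sector = 2048, .w = { .cb = done } };

        return req.w.cb(&req.w);
}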
1453 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | 1668 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) |
1454 | { | 1669 | { |
1455 | struct drbd_epoch_entry *e; | 1670 | struct drbd_peer_request *peer_req; |
1456 | 1671 | ||
1457 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | 1672 | peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size); |
1458 | if (!e) | 1673 | if (!peer_req) |
1459 | goto fail; | 1674 | goto fail; |
1460 | 1675 | ||
1461 | dec_rs_pending(mdev); | 1676 | dec_rs_pending(mdev); |
@@ -1464,64 +1679,88 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si | |||
1464 | /* corresponding dec_unacked() in e_end_resync_block() | 1679 | /* corresponding dec_unacked() in e_end_resync_block() |
1465 | * respective _drbd_clear_done_ee */ | 1680 | * respective _drbd_clear_done_ee */ |
1466 | 1681 | ||
1467 | e->w.cb = e_end_resync_block; | 1682 | peer_req->w.cb = e_end_resync_block; |
1468 | 1683 | ||
1469 | spin_lock_irq(&mdev->req_lock); | 1684 | spin_lock_irq(&mdev->tconn->req_lock); |
1470 | list_add(&e->w.list, &mdev->sync_ee); | 1685 | list_add(&peer_req->w.list, &mdev->sync_ee); |
1471 | spin_unlock_irq(&mdev->req_lock); | 1686 | spin_unlock_irq(&mdev->tconn->req_lock); |
1472 | 1687 | ||
1473 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); | 1688 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); |
1474 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) | 1689 | if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) |
1475 | return true; | 1690 | return 0; |
1476 | 1691 | ||
1477 | /* don't care for the reason here */ | 1692 | /* don't care for the reason here */ |
1478 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 1693 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1479 | spin_lock_irq(&mdev->req_lock); | 1694 | spin_lock_irq(&mdev->tconn->req_lock); |
1480 | list_del(&e->w.list); | 1695 | list_del(&peer_req->w.list); |
1481 | spin_unlock_irq(&mdev->req_lock); | 1696 | spin_unlock_irq(&mdev->tconn->req_lock); |
1482 | 1697 | ||
1483 | drbd_free_ee(mdev, e); | 1698 | drbd_free_peer_req(mdev, peer_req); |
1484 | fail: | 1699 | fail: |
1485 | put_ldev(mdev); | 1700 | put_ldev(mdev); |
1486 | return false; | 1701 | return -EIO; |
1487 | } | 1702 | } |
1488 | 1703 | ||
1489 | static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1704 | static struct drbd_request * |
1705 | find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id, | ||
1706 | sector_t sector, bool missing_ok, const char *func) | ||
1490 | { | 1707 | { |
1491 | struct drbd_request *req; | 1708 | struct drbd_request *req; |
1709 | |||
1710 | /* Request object according to our peer */ | ||
1711 | req = (struct drbd_request *)(unsigned long)id; | ||
1712 | if (drbd_contains_interval(root, sector, &req->i) && req->i.local) | ||
1713 | return req; | ||
1714 | if (!missing_ok) { | ||
1715 | dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func, | ||
1716 | (unsigned long)id, (unsigned long long)sector); | ||
1717 | } | ||
1718 | return NULL; | ||
1719 | } | ||
1720 | |||
1721 | static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1722 | { | ||
1723 | struct drbd_conf *mdev; | ||
1724 | struct drbd_request *req; | ||
1492 | sector_t sector; | 1725 | sector_t sector; |
1493 | int ok; | 1726 | int err; |
1494 | struct p_data *p = &mdev->data.rbuf.data; | 1727 | struct p_data *p = pi->data; |
1728 | |||
1729 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1730 | if (!mdev) | ||
1731 | return -EIO; | ||
1495 | 1732 | ||
1496 | sector = be64_to_cpu(p->sector); | 1733 | sector = be64_to_cpu(p->sector); |
1497 | 1734 | ||
1498 | spin_lock_irq(&mdev->req_lock); | 1735 | spin_lock_irq(&mdev->tconn->req_lock); |
1499 | req = _ar_id_to_req(mdev, p->block_id, sector); | 1736 | req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__); |
1500 | spin_unlock_irq(&mdev->req_lock); | 1737 | spin_unlock_irq(&mdev->tconn->req_lock); |
1501 | if (unlikely(!req)) { | 1738 | if (unlikely(!req)) |
1502 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | 1739 | return -EIO; |
1503 | return false; | ||
1504 | } | ||
1505 | 1740 | ||
1506 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid | 1741 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid |
1507 | * special casing it there for the various failure cases. | 1742 | * special casing it there for the various failure cases. |
1508 | * still no race with drbd_fail_pending_reads */ | 1743 | * still no race with drbd_fail_pending_reads */ |
1509 | ok = recv_dless_read(mdev, req, sector, data_size); | 1744 | err = recv_dless_read(mdev, req, sector, pi->size); |
1510 | 1745 | if (!err) | |
1511 | if (ok) | 1746 | req_mod(req, DATA_RECEIVED); |
1512 | req_mod(req, data_received); | ||
1513 | /* else: nothing. handled from drbd_disconnect... | 1747 | /* else: nothing. handled from drbd_disconnect... |
1514 | * I don't think we may complete this just yet | 1748 | * I don't think we may complete this just yet |
1515 | * in case we are "on-disconnect: freeze" */ | 1749 | * in case we are "on-disconnect: freeze" */ |
1516 | 1750 | ||
1517 | return ok; | 1751 | return err; |
1518 | } | 1752 | } |
1519 | 1753 | ||
1520 | static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1754 | static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) |
1521 | { | 1755 | { |
1756 | struct drbd_conf *mdev; | ||
1522 | sector_t sector; | 1757 | sector_t sector; |
1523 | int ok; | 1758 | int err; |
1524 | struct p_data *p = &mdev->data.rbuf.data; | 1759 | struct p_data *p = pi->data; |
1760 | |||
1761 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1762 | if (!mdev) | ||
1763 | return -EIO; | ||
1525 | 1764 | ||
1526 | sector = be64_to_cpu(p->sector); | 1765 | sector = be64_to_cpu(p->sector); |
1527 | D_ASSERT(p->block_id == ID_SYNCER); | 1766 | D_ASSERT(p->block_id == ID_SYNCER); |
@@ -1529,42 +1768,63 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
1529 | if (get_ldev(mdev)) { | 1768 | if (get_ldev(mdev)) { |
1530 | /* data is submitted to disk within recv_resync_read. | 1769 | /* data is submitted to disk within recv_resync_read. |
1531 | * corresponding put_ldev done below on error, | 1770 | * corresponding put_ldev done below on error, |
1532 | * or in drbd_endio_write_sec. */ | 1771 | * or in drbd_peer_request_endio. */ |
1533 | ok = recv_resync_read(mdev, sector, data_size); | 1772 | err = recv_resync_read(mdev, sector, pi->size); |
1534 | } else { | 1773 | } else { |
1535 | if (__ratelimit(&drbd_ratelimit_state)) | 1774 | if (__ratelimit(&drbd_ratelimit_state)) |
1536 | dev_err(DEV, "Can not write resync data to local disk.\n"); | 1775 | dev_err(DEV, "Can not write resync data to local disk.\n"); |
1537 | 1776 | ||
1538 | ok = drbd_drain_block(mdev, data_size); | 1777 | err = drbd_drain_block(mdev, pi->size); |
1539 | 1778 | ||
1540 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 1779 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); |
1541 | } | 1780 | } |
1542 | 1781 | ||
1543 | atomic_add(data_size >> 9, &mdev->rs_sect_in); | 1782 | atomic_add(pi->size >> 9, &mdev->rs_sect_in); |
1544 | 1783 | ||
1545 | return ok; | 1784 | return err; |
1546 | } | 1785 | } |
1547 | 1786 | ||
1548 | /* e_end_block() is called via drbd_process_done_ee(). | 1787 | static void restart_conflicting_writes(struct drbd_conf *mdev, |
1549 | * this means this function only runs in the asender thread | 1788 | sector_t sector, int size) |
1550 | */ | ||
1551 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1552 | { | 1789 | { |
1553 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1790 | struct drbd_interval *i; |
1554 | sector_t sector = e->sector; | 1791 | struct drbd_request *req; |
1555 | int ok = 1, pcmd; | ||
1556 | 1792 | ||
1557 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | 1793 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { |
1558 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1794 | if (!i->local) |
1795 | continue; | ||
1796 | req = container_of(i, struct drbd_request, i); | ||
1797 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
1798 | !(req->rq_state & RQ_POSTPONED)) | ||
1799 | continue; | ||
1800 | /* as it is RQ_POSTPONED, this will cause it to | ||
1801 | * be queued on the retry workqueue. */ | ||
1802 | __req_mod(req, CONFLICT_RESOLVED, NULL); | ||
1803 | } | ||
1804 | } | ||
1805 | |||
1806 | /* | ||
1807 | * e_end_block() is called in asender context via drbd_finish_peer_reqs(). | ||
1808 | */ | ||
1809 | static int e_end_block(struct drbd_work *w, int cancel) | ||
1810 | { | ||
1811 | struct drbd_peer_request *peer_req = | ||
1812 | container_of(w, struct drbd_peer_request, w); | ||
1813 | struct drbd_conf *mdev = w->mdev; | ||
1814 | sector_t sector = peer_req->i.sector; | ||
1815 | int err = 0, pcmd; | ||
1816 | |||
1817 | if (peer_req->flags & EE_SEND_WRITE_ACK) { | ||
1818 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { | ||
1559 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | 1819 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && |
1560 | mdev->state.conn <= C_PAUSED_SYNC_T && | 1820 | mdev->state.conn <= C_PAUSED_SYNC_T && |
1561 | e->flags & EE_MAY_SET_IN_SYNC) ? | 1821 | peer_req->flags & EE_MAY_SET_IN_SYNC) ? |
1562 | P_RS_WRITE_ACK : P_WRITE_ACK; | 1822 | P_RS_WRITE_ACK : P_WRITE_ACK; |
1563 | ok &= drbd_send_ack(mdev, pcmd, e); | 1823 | err = drbd_send_ack(mdev, pcmd, peer_req); |
1564 | if (pcmd == P_RS_WRITE_ACK) | 1824 | if (pcmd == P_RS_WRITE_ACK) |
1565 | drbd_set_in_sync(mdev, sector, e->size); | 1825 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1566 | } else { | 1826 | } else { |
1567 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1827 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1568 | /* we expect it to be marked out of sync anyways... | 1828 | /* we expect it to be marked out of sync anyways... |
1569 | * maybe assert this? */ | 1829 | * maybe assert this? */ |
1570 | } | 1830 | } |
@@ -1572,52 +1832,115 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1572 | } | 1832 | } |
1573 | /* we delete from the conflict detection hash _after_ we sent out the | 1833 | /* we delete from the conflict detection hash _after_ we sent out the |
1574 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | 1834 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ |
1575 | if (mdev->net_conf->two_primaries) { | 1835 | if (peer_req->flags & EE_IN_INTERVAL_TREE) { |
1576 | spin_lock_irq(&mdev->req_lock); | 1836 | spin_lock_irq(&mdev->tconn->req_lock); |
1577 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1837 | D_ASSERT(!drbd_interval_empty(&peer_req->i)); |
1578 | hlist_del_init(&e->collision); | 1838 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1579 | spin_unlock_irq(&mdev->req_lock); | 1839 | if (peer_req->flags & EE_RESTART_REQUESTS) |
1580 | } else { | 1840 | restart_conflicting_writes(mdev, sector, peer_req->i.size); |
1581 | D_ASSERT(hlist_unhashed(&e->collision)); | 1841 | spin_unlock_irq(&mdev->tconn->req_lock); |
1582 | } | 1842 | } else |
1843 | D_ASSERT(drbd_interval_empty(&peer_req->i)); | ||
1583 | 1844 | ||
1584 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | 1845 | drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); |
1585 | 1846 | ||
1586 | return ok; | 1847 | return err; |
1587 | } | 1848 | } |
1588 | 1849 | ||
1589 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1850 | static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) |
1590 | { | 1851 | { |
1591 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1852 | struct drbd_conf *mdev = w->mdev; |
1592 | int ok = 1; | 1853 | struct drbd_peer_request *peer_req = |
1854 | container_of(w, struct drbd_peer_request, w); | ||
1855 | int err; | ||
1593 | 1856 | ||
1594 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 1857 | err = drbd_send_ack(mdev, ack, peer_req); |
1595 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | 1858 | dec_unacked(mdev); |
1596 | 1859 | ||
1597 | spin_lock_irq(&mdev->req_lock); | 1860 | return err; |
1598 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1861 | } |
1599 | hlist_del_init(&e->collision); | ||
1600 | spin_unlock_irq(&mdev->req_lock); | ||
1601 | 1862 | ||
1602 | dec_unacked(mdev); | 1863 | static int e_send_superseded(struct drbd_work *w, int unused) |
1864 | { | ||
1865 | return e_send_ack(w, P_SUPERSEDED); | ||
1866 | } | ||
1867 | |||
1868 | static int e_send_retry_write(struct drbd_work *w, int unused) | ||
1869 | { | ||
1870 | struct drbd_tconn *tconn = w->mdev->tconn; | ||
1871 | |||
1872 | return e_send_ack(w, tconn->agreed_pro_version >= 100 ? | ||
1873 | P_RETRY_WRITE : P_SUPERSEDED); | ||
1874 | } | ||
1875 | |||
1876 | static bool seq_greater(u32 a, u32 b) | ||
1877 | { | ||
1878 | /* | ||
1879 | * We assume 32-bit wrap-around here. | ||
1880 | * For 24-bit wrap-around, we would have to shift: | ||
1881 | * a <<= 8; b <<= 8; | ||
1882 | */ | ||
1883 | return (s32)a - (s32)b > 0; | ||
1884 | } | ||
1885 | |||
1886 | static u32 seq_max(u32 a, u32 b) | ||
1887 | { | ||
1888 | return seq_greater(a, b) ? a : b; | ||
1889 | } | ||
1890 | |||
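seq_greater() compares 32-bit sequence numbers modulo wrap-around by checking the sign of the difference, so a counter that has just wrapped still compares as newer. A quick standalone check of that property, written with a well-defined unsigned subtraction that behaves the same as the patch's signed cast in practice:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool seq_greater(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;    /* same trick as in the patch */
}

int main(void)
{
        /* Plain ordering. */
        printf("%d\n", seq_greater(10, 5));             /* 1 */
        /* 2 is "greater" than 0xfffffffe: the counter wrapped recently. */
        printf("%d\n", seq_greater(2, 0xfffffffeu));    /* 1 */
        /* Ordering only breaks down when the distance reaches 2^31. */
        printf("%d\n", seq_greater(0x80000000u, 0));    /* 0 */
        return 0;
}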
1891 | static bool need_peer_seq(struct drbd_conf *mdev) | ||
1892 | { | ||
1893 | struct drbd_tconn *tconn = mdev->tconn; | ||
1894 | int tp; | ||
1603 | 1895 | ||
1604 | return ok; | 1896 | /* |
1897 | * We only need to keep track of the last packet_seq number of our peer | ||
1898 | * if we are in dual-primary mode and we have the resolve-conflicts flag set; see | ||
1899 | * handle_write_conflicts(). | ||
1900 | */ | ||
1901 | |||
1902 | rcu_read_lock(); | ||
1903 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; | ||
1904 | rcu_read_unlock(); | ||
1905 | |||
1906 | return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
1605 | } | 1907 | } |
1606 | 1908 | ||
1607 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) | 1909 | static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) |
1608 | { | 1910 | { |
1911 | unsigned int newest_peer_seq; | ||
1609 | 1912 | ||
1610 | struct drbd_epoch_entry *rs_e; | 1913 | if (need_peer_seq(mdev)) { |
1914 | spin_lock(&mdev->peer_seq_lock); | ||
1915 | newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); | ||
1916 | mdev->peer_seq = newest_peer_seq; | ||
1917 | spin_unlock(&mdev->peer_seq_lock); | ||
1918 | /* wake up only if we actually changed mdev->peer_seq */ | ||
1919 | if (peer_seq == newest_peer_seq) | ||
1920 | wake_up(&mdev->seq_wait); | ||
1921 | } | ||
1922 | } | ||
1923 | |||
1924 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
1925 | { | ||
1926 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
1927 | } | ||
1928 | |||
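overlaps() above treats each (sector, byte length) pair as a half-open interval of 512-byte sectors. A tiny standalone check of the arithmetic, assuming byte lengths as in the DRBD code:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Same test as in the patch: half-open intervals [s, s + l/512). */
static int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
        return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

int main(void)
{
        /* 4 KiB at sector 8 spans sectors [8, 16). */
        printf("%d\n", overlaps(8, 4096, 10, 4096));    /* 1: shares [10, 16) */
        printf("%d\n", overlaps(8, 4096, 16, 4096));    /* 0: they only touch */
        return 0;
}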
1929 | /* maybe change sync_ee into interval trees as well? */ | ||
1930 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) | ||
1931 | { | ||
1932 | struct drbd_peer_request *rs_req; | ||
1611 | bool rv = 0; | 1933 | bool rv = 0; |
1612 | 1934 | ||
1613 | spin_lock_irq(&mdev->req_lock); | 1935 | spin_lock_irq(&mdev->tconn->req_lock); |
1614 | list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { | 1936 | list_for_each_entry(rs_req, &mdev->sync_ee, w.list) { |
1615 | if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { | 1937 | if (overlaps(peer_req->i.sector, peer_req->i.size, |
1938 | rs_req->i.sector, rs_req->i.size)) { | ||
1616 | rv = 1; | 1939 | rv = 1; |
1617 | break; | 1940 | break; |
1618 | } | 1941 | } |
1619 | } | 1942 | } |
1620 | spin_unlock_irq(&mdev->req_lock); | 1943 | spin_unlock_irq(&mdev->tconn->req_lock); |
1621 | 1944 | ||
1622 | return rv; | 1945 | return rv; |
1623 | } | 1946 | } |
@@ -1643,35 +1966,41 @@ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_e | |||
1643 | * | 1966 | * |
1644 | * returns 0 if we may process the packet, | 1967 | * returns 0 if we may process the packet, |
1645 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | 1968 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ |
1646 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | 1969 | static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq) |
1647 | { | 1970 | { |
1648 | DEFINE_WAIT(wait); | 1971 | DEFINE_WAIT(wait); |
1649 | unsigned int p_seq; | ||
1650 | long timeout; | 1972 | long timeout; |
1651 | int ret = 0; | 1973 | int ret; |
1974 | |||
1975 | if (!need_peer_seq(mdev)) | ||
1976 | return 0; | ||
1977 | |||
1652 | spin_lock(&mdev->peer_seq_lock); | 1978 | spin_lock(&mdev->peer_seq_lock); |
1653 | for (;;) { | 1979 | for (;;) { |
1654 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | 1980 | if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { |
1655 | if (seq_le(packet_seq, mdev->peer_seq+1)) | 1981 | mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); |
1982 | ret = 0; | ||
1656 | break; | 1983 | break; |
1984 | } | ||
1657 | if (signal_pending(current)) { | 1985 | if (signal_pending(current)) { |
1658 | ret = -ERESTARTSYS; | 1986 | ret = -ERESTARTSYS; |
1659 | break; | 1987 | break; |
1660 | } | 1988 | } |
1661 | p_seq = mdev->peer_seq; | 1989 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); |
1662 | spin_unlock(&mdev->peer_seq_lock); | 1990 | spin_unlock(&mdev->peer_seq_lock); |
1663 | timeout = schedule_timeout(30*HZ); | 1991 | rcu_read_lock(); |
1992 | timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10; | ||
1993 | rcu_read_unlock(); | ||
1994 | timeout = schedule_timeout(timeout); | ||
1664 | spin_lock(&mdev->peer_seq_lock); | 1995 | spin_lock(&mdev->peer_seq_lock); |
1665 | if (timeout == 0 && p_seq == mdev->peer_seq) { | 1996 | if (!timeout) { |
1666 | ret = -ETIMEDOUT; | 1997 | ret = -ETIMEDOUT; |
1667 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | 1998 | dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n"); |
1668 | break; | 1999 | break; |
1669 | } | 2000 | } |
1670 | } | 2001 | } |
1671 | finish_wait(&mdev->seq_wait, &wait); | ||
1672 | if (mdev->peer_seq+1 == packet_seq) | ||
1673 | mdev->peer_seq++; | ||
1674 | spin_unlock(&mdev->peer_seq_lock); | 2002 | spin_unlock(&mdev->peer_seq_lock); |
2003 | finish_wait(&mdev->seq_wait, &wait); | ||
1675 | return ret; | 2004 | return ret; |
1676 | } | 2005 | } |
1677 | 2006 | ||
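The loop condition in wait_for_and_update_peer_seq() admits a data packet once its sequence number is at most one ahead of the last one accounted for; anything further ahead means acks are still in flight and the receiver must keep waiting. The admission test in isolation, with a hypothetical may_process() name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool seq_greater(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

/* A packet may be processed once peer_seq is at most one ahead of what we
 * have already seen, i.e. once no earlier ack is still outstanding. */
static bool may_process(uint32_t packet_seq, uint32_t last_seen_seq)
{
        return !seq_greater(packet_seq - 1, last_seen_seq);
}

int main(void)
{
        printf("%d\n", may_process(11, 10));    /* 1: next in order */
        printf("%d\n", may_process(10, 10));    /* 1: old/duplicate is fine too */
        printf("%d\n", may_process(13, 10));    /* 0: packets 11 and 12 missing */
        return 0;
}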
@@ -1686,233 +2015,277 @@ static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf) | |||
1686 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); | 2015 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); |
1687 | } | 2016 | } |
1688 | 2017 | ||
2018 | static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector, | ||
2019 | unsigned int size) | ||
2020 | { | ||
2021 | struct drbd_interval *i; | ||
2022 | |||
2023 | repeat: | ||
2024 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2025 | struct drbd_request *req; | ||
2026 | struct bio_and_error m; | ||
2027 | |||
2028 | if (!i->local) | ||
2029 | continue; | ||
2030 | req = container_of(i, struct drbd_request, i); | ||
2031 | if (!(req->rq_state & RQ_POSTPONED)) | ||
2032 | continue; | ||
2033 | req->rq_state &= ~RQ_POSTPONED; | ||
2034 | __req_mod(req, NEG_ACKED, &m); | ||
2035 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
2036 | if (m.bio) | ||
2037 | complete_master_bio(mdev, &m); | ||
2038 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2039 | goto repeat; | ||
2040 | } | ||
2041 | } | ||
2042 | |||
2043 | static int handle_write_conflicts(struct drbd_conf *mdev, | ||
2044 | struct drbd_peer_request *peer_req) | ||
2045 | { | ||
2046 | struct drbd_tconn *tconn = mdev->tconn; | ||
2047 | bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
2048 | sector_t sector = peer_req->i.sector; | ||
2049 | const unsigned int size = peer_req->i.size; | ||
2050 | struct drbd_interval *i; | ||
2051 | bool equal; | ||
2052 | int err; | ||
2053 | |||
2054 | /* | ||
2055 | * Inserting the peer request into the write_requests tree will prevent | ||
2056 | * new conflicting local requests from being added. | ||
2057 | */ | ||
2058 | drbd_insert_interval(&mdev->write_requests, &peer_req->i); | ||
2059 | |||
2060 | repeat: | ||
2061 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2062 | if (i == &peer_req->i) | ||
2063 | continue; | ||
2064 | |||
2065 | if (!i->local) { | ||
2066 | /* | ||
2067 | * Our peer has sent a conflicting remote request; this | ||
2068 | * should not happen in a two-node setup. Wait for the | ||
2069 | * earlier peer request to complete. | ||
2070 | */ | ||
2071 | err = drbd_wait_misc(mdev, i); | ||
2072 | if (err) | ||
2073 | goto out; | ||
2074 | goto repeat; | ||
2075 | } | ||
2076 | |||
2077 | equal = i->sector == sector && i->size == size; | ||
2078 | if (resolve_conflicts) { | ||
2079 | /* | ||
2080 | * If the peer request is fully contained within the | ||
2081 | * overlapping request, it can be considered overwritten | ||
2082 | * and thus superseded; otherwise, it will be retried | ||
2083 | * once all overlapping requests have completed. | ||
2084 | */ | ||
2085 | bool superseded = i->sector <= sector && i->sector + | ||
2086 | (i->size >> 9) >= sector + (size >> 9); | ||
2087 | |||
2088 | if (!equal) | ||
2089 | dev_alert(DEV, "Concurrent writes detected: " | ||
2090 | "local=%llus +%u, remote=%llus +%u, " | ||
2091 | "assuming %s came first\n", | ||
2092 | (unsigned long long)i->sector, i->size, | ||
2093 | (unsigned long long)sector, size, | ||
2094 | superseded ? "local" : "remote"); | ||
2095 | |||
2096 | inc_unacked(mdev); | ||
2097 | peer_req->w.cb = superseded ? e_send_superseded : | ||
2098 | e_send_retry_write; | ||
2099 | list_add_tail(&peer_req->w.list, &mdev->done_ee); | ||
2100 | wake_asender(mdev->tconn); | ||
2101 | |||
2102 | err = -ENOENT; | ||
2103 | goto out; | ||
2104 | } else { | ||
2105 | struct drbd_request *req = | ||
2106 | container_of(i, struct drbd_request, i); | ||
2107 | |||
2108 | if (!equal) | ||
2109 | dev_alert(DEV, "Concurrent writes detected: " | ||
2110 | "local=%llus +%u, remote=%llus +%u\n", | ||
2111 | (unsigned long long)i->sector, i->size, | ||
2112 | (unsigned long long)sector, size); | ||
2113 | |||
2114 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
2115 | !(req->rq_state & RQ_POSTPONED)) { | ||
2116 | /* | ||
2117 | * Wait for the node with the discard flag to | ||
2118 | * decide if this request has been superseded | ||
2119 | * or needs to be retried. | ||
2120 | * Requests that have been superseded will | ||
2121 | * disappear from the write_requests tree. | ||
2122 | * | ||
2123 | * In addition, wait for the conflicting | ||
2124 | * request to finish locally before submitting | ||
2125 | * the conflicting peer request. | ||
2126 | */ | ||
2127 | err = drbd_wait_misc(mdev, &req->i); | ||
2128 | if (err) { | ||
2129 | _conn_request_state(mdev->tconn, | ||
2130 | NS(conn, C_TIMEOUT), | ||
2131 | CS_HARD); | ||
2132 | fail_postponed_requests(mdev, sector, size); | ||
2133 | goto out; | ||
2134 | } | ||
2135 | goto repeat; | ||
2136 | } | ||
2137 | /* | ||
2138 | * Remember to restart the conflicting requests after | ||
2139 | * the new peer request has completed. | ||
2140 | */ | ||
2141 | peer_req->flags |= EE_RESTART_REQUESTS; | ||
2142 | } | ||
2143 | } | ||
2144 | err = 0; | ||
2145 | |||
2146 | out: | ||
2147 | if (err) | ||
2148 | drbd_remove_epoch_entry_interval(mdev, peer_req); | ||
2149 | return err; | ||
2150 | } | ||
2151 | |||
1689 | /* mirrored write */ | 2152 | /* mirrored write */ |
1690 | static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 2153 | static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) |
1691 | { | 2154 | { |
2155 | struct drbd_conf *mdev; | ||
1692 | sector_t sector; | 2156 | sector_t sector; |
1693 | struct drbd_epoch_entry *e; | 2157 | struct drbd_peer_request *peer_req; |
1694 | struct p_data *p = &mdev->data.rbuf.data; | 2158 | struct p_data *p = pi->data; |
2159 | u32 peer_seq = be32_to_cpu(p->seq_num); | ||
1695 | int rw = WRITE; | 2160 | int rw = WRITE; |
1696 | u32 dp_flags; | 2161 | u32 dp_flags; |
2162 | int err, tp; | ||
1697 | 2163 | ||
1698 | if (!get_ldev(mdev)) { | 2164 | mdev = vnr_to_mdev(tconn, pi->vnr); |
1699 | spin_lock(&mdev->peer_seq_lock); | 2165 | if (!mdev) |
1700 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | 2166 | return -EIO; |
1701 | mdev->peer_seq++; | ||
1702 | spin_unlock(&mdev->peer_seq_lock); | ||
1703 | 2167 | ||
1704 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 2168 | if (!get_ldev(mdev)) { |
1705 | atomic_inc(&mdev->current_epoch->epoch_size); | 2169 | int err2; |
1706 | return drbd_drain_block(mdev, data_size); | 2170 | |
2171 | err = wait_for_and_update_peer_seq(mdev, peer_seq); | ||
2172 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); | ||
2173 | atomic_inc(&tconn->current_epoch->epoch_size); | ||
2174 | err2 = drbd_drain_block(mdev, pi->size); | ||
2175 | if (!err) | ||
2176 | err = err2; | ||
2177 | return err; | ||
1707 | } | 2178 | } |
1708 | 2179 | ||
1709 | /* get_ldev(mdev) successful. | 2180 | /* |
1710 | * Corresponding put_ldev done either below (on various errors), | 2181 | * Corresponding put_ldev done either below (on various errors), or in |
1711 | * or in drbd_endio_write_sec, if we successfully submit the data at | 2182 | * drbd_peer_request_endio, if we successfully submit the data at the |
1712 | * the end of this function. */ | 2183 | * end of this function. |
2184 | */ | ||
1713 | 2185 | ||
1714 | sector = be64_to_cpu(p->sector); | 2186 | sector = be64_to_cpu(p->sector); |
1715 | e = read_in_block(mdev, p->block_id, sector, data_size); | 2187 | peer_req = read_in_block(mdev, p->block_id, sector, pi->size); |
1716 | if (!e) { | 2188 | if (!peer_req) { |
1717 | put_ldev(mdev); | 2189 | put_ldev(mdev); |
1718 | return false; | 2190 | return -EIO; |
1719 | } | 2191 | } |
1720 | 2192 | ||
1721 | e->w.cb = e_end_block; | 2193 | peer_req->w.cb = e_end_block; |
1722 | 2194 | ||
1723 | dp_flags = be32_to_cpu(p->dp_flags); | 2195 | dp_flags = be32_to_cpu(p->dp_flags); |
1724 | rw |= wire_flags_to_bio(mdev, dp_flags); | 2196 | rw |= wire_flags_to_bio(mdev, dp_flags); |
1725 | if (e->pages == NULL) { | 2197 | if (peer_req->pages == NULL) { |
1726 | D_ASSERT(e->size == 0); | 2198 | D_ASSERT(peer_req->i.size == 0); |
1727 | D_ASSERT(dp_flags & DP_FLUSH); | 2199 | D_ASSERT(dp_flags & DP_FLUSH); |
1728 | } | 2200 | } |
1729 | 2201 | ||
1730 | if (dp_flags & DP_MAY_SET_IN_SYNC) | 2202 | if (dp_flags & DP_MAY_SET_IN_SYNC) |
1731 | e->flags |= EE_MAY_SET_IN_SYNC; | 2203 | peer_req->flags |= EE_MAY_SET_IN_SYNC; |
1732 | 2204 | ||
1733 | spin_lock(&mdev->epoch_lock); | 2205 | spin_lock(&tconn->epoch_lock); |
1734 | e->epoch = mdev->current_epoch; | 2206 | peer_req->epoch = tconn->current_epoch; |
1735 | atomic_inc(&e->epoch->epoch_size); | 2207 | atomic_inc(&peer_req->epoch->epoch_size); |
1736 | atomic_inc(&e->epoch->active); | 2208 | atomic_inc(&peer_req->epoch->active); |
1737 | spin_unlock(&mdev->epoch_lock); | 2209 | spin_unlock(&tconn->epoch_lock); |
1738 | 2210 | ||
1739 | /* I'm the receiver, I do hold a net_cnt reference. */ | 2211 | rcu_read_lock(); |
1740 | if (!mdev->net_conf->two_primaries) { | 2212 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; |
1741 | spin_lock_irq(&mdev->req_lock); | 2213 | rcu_read_unlock(); |
1742 | } else { | 2214 | if (tp) { |
1743 | /* don't get the req_lock yet, | 2215 | peer_req->flags |= EE_IN_INTERVAL_TREE; |
1744 | * we may sleep in drbd_wait_peer_seq */ | 2216 | err = wait_for_and_update_peer_seq(mdev, peer_seq); |
1745 | const int size = e->size; | 2217 | if (err) |
1746 | const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1747 | DEFINE_WAIT(wait); | ||
1748 | struct drbd_request *i; | ||
1749 | struct hlist_node *n; | ||
1750 | struct hlist_head *slot; | ||
1751 | int first; | ||
1752 | |||
1753 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1754 | BUG_ON(mdev->ee_hash == NULL); | ||
1755 | BUG_ON(mdev->tl_hash == NULL); | ||
1756 | |||
1757 | /* conflict detection and handling: | ||
1758 | * 1. wait on the sequence number, | ||
1759 | * in case this data packet overtook ACK packets. | ||
1760 | * 2. check our hash tables for conflicting requests. | ||
1761 | * we only need to walk the tl_hash, since an ee cannot | ||
1762 | * have a conflict with another ee: on the submitting | ||
1763 | * node, the corresponding req had already been conflicting, | ||
1764 | * and a conflicting req is never sent. | ||
1765 | * | ||
1766 | * Note: for two_primaries, we are protocol C, | ||
1767 | * so there cannot be any request that is DONE | ||
1768 | * but still on the transfer log. | ||
1769 | * | ||
1770 | * unconditionally add to the ee_hash. | ||
1771 | * | ||
1772 | * if no conflicting request is found: | ||
1773 | * submit. | ||
1774 | * | ||
1775 | * if any conflicting request is found | ||
1776 | * that has not yet been acked, | ||
1777 | * AND I have the "discard concurrent writes" flag: | ||
1778 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1779 | * | ||
1780 | * if any conflicting request is found: | ||
1781 | * block the receiver, waiting on misc_wait | ||
1782 | * until no more conflicting requests are there, | ||
1783 | * or we get interrupted (disconnect). | ||
1784 | * | ||
1785 | * we do not just write after local io completion of those | ||
1786 | * requests, but only after req is done completely, i.e. | ||
1787 | * we wait for the P_DISCARD_ACK to arrive! | ||
1788 | * | ||
1789 | * then proceed normally, i.e. submit. | ||
1790 | */ | ||
1791 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1792 | goto out_interrupted; | 2218 | goto out_interrupted; |
1793 | 2219 | spin_lock_irq(&mdev->tconn->req_lock); | |
1794 | spin_lock_irq(&mdev->req_lock); | 2220 | err = handle_write_conflicts(mdev, peer_req); |
1795 | 2221 | if (err) { | |
1796 | hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); | 2222 | spin_unlock_irq(&mdev->tconn->req_lock); |
1797 | 2223 | if (err == -ENOENT) { | |
1798 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1799 | slot = tl_hash_slot(mdev, sector); | ||
1800 | first = 1; | ||
1801 | for (;;) { | ||
1802 | int have_unacked = 0; | ||
1803 | int have_conflict = 0; | ||
1804 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1805 | TASK_INTERRUPTIBLE); | ||
1806 | hlist_for_each_entry(i, n, slot, collision) { | ||
1807 | if (OVERLAPS) { | ||
1808 | /* only ALERT on first iteration, | ||
1809 | * we may be woken up early... */ | ||
1810 | if (first) | ||
1811 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1812 | " new: %llus +%u; pending: %llus +%u\n", | ||
1813 | current->comm, current->pid, | ||
1814 | (unsigned long long)sector, size, | ||
1815 | (unsigned long long)i->sector, i->size); | ||
1816 | if (i->rq_state & RQ_NET_PENDING) | ||
1817 | ++have_unacked; | ||
1818 | ++have_conflict; | ||
1819 | } | ||
1820 | } | ||
1821 | #undef OVERLAPS | ||
1822 | if (!have_conflict) | ||
1823 | break; | ||
1824 | |||
1825 | /* Discard Ack only for the _first_ iteration */ | ||
1826 | if (first && discard && have_unacked) { | ||
1827 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1828 | (unsigned long long)sector); | ||
1829 | inc_unacked(mdev); | ||
1830 | e->w.cb = e_send_discard_ack; | ||
1831 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1832 | |||
1833 | spin_unlock_irq(&mdev->req_lock); | ||
1834 | |||
1835 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1836 | * but I don't like the receiver using the msock */ | ||
1837 | |||
1838 | put_ldev(mdev); | 2224 | put_ldev(mdev); |
1839 | wake_asender(mdev); | 2225 | return 0; |
1840 | finish_wait(&mdev->misc_wait, &wait); | ||
1841 | return true; | ||
1842 | } | 2226 | } |
2227 | goto out_interrupted; | ||
2228 | } | ||
2229 | } else | ||
2230 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2231 | list_add(&peer_req->w.list, &mdev->active_ee); | ||
2232 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
1843 | 2233 | ||
1844 | if (signal_pending(current)) { | 2234 | if (mdev->state.conn == C_SYNC_TARGET) |
1845 | hlist_del_init(&e->collision); | 2235 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req)); |
1846 | |||
1847 | spin_unlock_irq(&mdev->req_lock); | ||
1848 | |||
1849 | finish_wait(&mdev->misc_wait, &wait); | ||
1850 | goto out_interrupted; | ||
1851 | } | ||
1852 | 2236 | ||
1853 | spin_unlock_irq(&mdev->req_lock); | 2237 | if (mdev->tconn->agreed_pro_version < 100) { |
1854 | if (first) { | 2238 | rcu_read_lock(); |
1855 | first = 0; | 2239 | switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) { |
1856 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | 2240 | case DRBD_PROT_C: |
1857 | "sec=%llus\n", (unsigned long long)sector); | 2241 | dp_flags |= DP_SEND_WRITE_ACK; |
1858 | } else if (discard) { | 2242 | break; |
1859 | /* we had none on the first iteration. | 2243 | case DRBD_PROT_B: |
1860 | * there must be none now. */ | 2244 | dp_flags |= DP_SEND_RECEIVE_ACK; |
1861 | D_ASSERT(have_unacked == 0); | 2245 | break; |
1862 | } | ||
1863 | schedule(); | ||
1864 | spin_lock_irq(&mdev->req_lock); | ||
1865 | } | 2246 | } |
1866 | finish_wait(&mdev->misc_wait, &wait); | 2247 | rcu_read_unlock(); |
1867 | } | 2248 | } |
1868 | 2249 | ||
1869 | list_add(&e->w.list, &mdev->active_ee); | 2250 | if (dp_flags & DP_SEND_WRITE_ACK) { |
1870 | spin_unlock_irq(&mdev->req_lock); | 2251 | peer_req->flags |= EE_SEND_WRITE_ACK; |
1871 | |||
1872 | if (mdev->state.conn == C_SYNC_TARGET) | ||
1873 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); | ||
1874 | |||
1875 | switch (mdev->net_conf->wire_protocol) { | ||
1876 | case DRBD_PROT_C: | ||
1877 | inc_unacked(mdev); | 2252 | inc_unacked(mdev); |
1878 | /* corresponding dec_unacked() in e_end_block() | 2253 | /* corresponding dec_unacked() in e_end_block() |
1879 | * respective _drbd_clear_done_ee */ | 2254 | * respective _drbd_clear_done_ee */ |
1880 | break; | 2255 | } |
1881 | case DRBD_PROT_B: | 2256 | |
2257 | if (dp_flags & DP_SEND_RECEIVE_ACK) { | ||
1882 | /* I really don't like it that the receiver thread | 2258 | /* I really don't like it that the receiver thread |
1883 | * sends on the msock, but anyways */ | 2259 | * sends on the msock, but anyways */ |
1884 | drbd_send_ack(mdev, P_RECV_ACK, e); | 2260 | drbd_send_ack(mdev, P_RECV_ACK, peer_req); |
1885 | break; | ||
1886 | case DRBD_PROT_A: | ||
1887 | /* nothing to do */ | ||
1888 | break; | ||
1889 | } | 2261 | } |
1890 | 2262 | ||
1891 | if (mdev->state.pdsk < D_INCONSISTENT) { | 2263 | if (mdev->state.pdsk < D_INCONSISTENT) { |
1892 | /* In case we have the only disk of the cluster, */ | 2264 | /* In case we have the only disk of the cluster, */ |
1893 | drbd_set_out_of_sync(mdev, e->sector, e->size); | 2265 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1894 | e->flags |= EE_CALL_AL_COMPLETE_IO; | 2266 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; |
1895 | e->flags &= ~EE_MAY_SET_IN_SYNC; | 2267 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; |
1896 | drbd_al_begin_io(mdev, e->sector); | 2268 | drbd_al_begin_io(mdev, &peer_req->i); |
1897 | } | 2269 | } |
1898 | 2270 | ||
1899 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) | 2271 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); |
1900 | return true; | 2272 | if (!err) |
2273 | return 0; | ||
1901 | 2274 | ||
1902 | /* don't care for the reason here */ | 2275 | /* don't care for the reason here */ |
1903 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2276 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1904 | spin_lock_irq(&mdev->req_lock); | 2277 | spin_lock_irq(&mdev->tconn->req_lock); |
1905 | list_del(&e->w.list); | 2278 | list_del(&peer_req->w.list); |
1906 | hlist_del_init(&e->collision); | 2279 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1907 | spin_unlock_irq(&mdev->req_lock); | 2280 | spin_unlock_irq(&mdev->tconn->req_lock); |
1908 | if (e->flags & EE_CALL_AL_COMPLETE_IO) | 2281 | if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) |
1909 | drbd_al_complete_io(mdev, e->sector); | 2282 | drbd_al_complete_io(mdev, &peer_req->i); |
1910 | 2283 | ||
1911 | out_interrupted: | 2284 | out_interrupted: |
1912 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP); | 2285 | drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); |
1913 | put_ldev(mdev); | 2286 | put_ldev(mdev); |
1914 | drbd_free_ee(mdev, e); | 2287 | drbd_free_peer_req(mdev, peer_req); |
1915 | return false; | 2288 | return err; |
1916 | } | 2289 | } |
1917 | 2290 | ||
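For peers older than protocol 100, receive_Data() now folds the legacy wire-protocol setting into DP_SEND_WRITE_ACK / DP_SEND_RECEIVE_ACK once, so the rest of the function only branches on flags. A hedged sketch of that translate-then-branch shape with stand-in constants; the real values live in the DRBD headers:

#include <stdio.h>

enum { PROT_A = 1, PROT_B = 2, PROT_C = 3 };    /* stand-in protocol ids */
#define DP_SEND_RECEIVE_ACK  (1u << 0)          /* protocol B: ack on receive */
#define DP_SEND_WRITE_ACK    (1u << 1)          /* protocol C: ack after write */

static unsigned legacy_protocol_to_flags(int wire_protocol)
{
        switch (wire_protocol) {
        case PROT_C:
                return DP_SEND_WRITE_ACK;
        case PROT_B:
                return DP_SEND_RECEIVE_ACK;
        default:                /* protocol A: no ack at all */
                return 0;
        }
}

int main(void)
{
        unsigned dp_flags = legacy_protocol_to_flags(PROT_B);

        /* The receive path then branches on flags only, never on the protocol. */
        if (dp_flags & DP_SEND_WRITE_ACK)
                puts("queue write-ack after the write completes");
        if (dp_flags & DP_SEND_RECEIVE_ACK)
                puts("send receive-ack right away");
        return 0;
}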
1918 | /* We may throttle resync, if the lower device seems to be busy, | 2291 | /* We may throttle resync, if the lower device seems to be busy, |
@@ -1933,9 +2306,14 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1933 | struct lc_element *tmp; | 2306 | struct lc_element *tmp; |
1934 | int curr_events; | 2307 | int curr_events; |
1935 | int throttle = 0; | 2308 | int throttle = 0; |
2309 | unsigned int c_min_rate; | ||
2310 | |||
2311 | rcu_read_lock(); | ||
2312 | c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate; | ||
2313 | rcu_read_unlock(); | ||
1936 | 2314 | ||
1937 | /* feature disabled? */ | 2315 | /* feature disabled? */ |
1938 | if (mdev->sync_conf.c_min_rate == 0) | 2316 | if (c_min_rate == 0) |
1939 | return 0; | 2317 | return 0; |
1940 | 2318 | ||
1941 | spin_lock_irq(&mdev->al_lock); | 2319 | spin_lock_irq(&mdev->al_lock); |
@@ -1975,40 +2353,46 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1975 | db = mdev->rs_mark_left[i] - rs_left; | 2353 | db = mdev->rs_mark_left[i] - rs_left; |
1976 | dbdt = Bit2KB(db/dt); | 2354 | dbdt = Bit2KB(db/dt); |
1977 | 2355 | ||
1978 | if (dbdt > mdev->sync_conf.c_min_rate) | 2356 | if (dbdt > c_min_rate) |
1979 | throttle = 1; | 2357 | throttle = 1; |
1980 | } | 2358 | } |
1981 | return throttle; | 2359 | return throttle; |
1982 | } | 2360 | } |
1983 | 2361 | ||
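The throttle check above estimates the ongoing resync rate from the bitmap marks: db is the number of bitmap bits cleared since the mark, dt the elapsed seconds, and Bit2KB(db/dt) the resulting KiB/s compared against the configured c_min_rate. A rough worked example of the arithmetic, assuming DRBD's default of 4 KiB per bitmap bit:

#include <stdio.h>

#define BIT2KB(bits)    ((bits) << 2)   /* assumption: one bitmap bit covers 4 KiB */

int main(void)
{
        unsigned long rs_mark_left = 100000;    /* bits still dirty at the mark */
        unsigned long rs_left      = 70000;     /* bits still dirty now */
        unsigned long dt           = 3;         /* seconds since the mark */
        unsigned long c_min_rate   = 4000;      /* configured minimum, in KiB/s */

        unsigned long db   = rs_mark_left - rs_left;    /* 30000 bits resynced */
        unsigned long dbdt = BIT2KB(db / dt);           /* 10000 bits/s -> 40000 KiB/s */

        printf("estimated resync rate: %lu KiB/s\n", dbdt);
        if (dbdt > c_min_rate)
                puts("resync is fast enough -> throttle further resync requests");
        else
                puts("resync is below c-min-rate -> do not throttle");
        return 0;
}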
1984 | 2362 | ||
1985 | static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) | 2363 | static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) |
1986 | { | 2364 | { |
2365 | struct drbd_conf *mdev; | ||
1987 | sector_t sector; | 2366 | sector_t sector; |
1988 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 2367 | sector_t capacity; |
1989 | struct drbd_epoch_entry *e; | 2368 | struct drbd_peer_request *peer_req; |
1990 | struct digest_info *di = NULL; | 2369 | struct digest_info *di = NULL; |
1991 | int size, verb; | 2370 | int size, verb; |
1992 | unsigned int fault_type; | 2371 | unsigned int fault_type; |
1993 | struct p_block_req *p = &mdev->data.rbuf.block_req; | 2372 | struct p_block_req *p = pi->data; |
2373 | |||
2374 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
2375 | if (!mdev) | ||
2376 | return -EIO; | ||
2377 | capacity = drbd_get_capacity(mdev->this_bdev); | ||
1994 | 2378 | ||
1995 | sector = be64_to_cpu(p->sector); | 2379 | sector = be64_to_cpu(p->sector); |
1996 | size = be32_to_cpu(p->blksize); | 2380 | size = be32_to_cpu(p->blksize); |
1997 | 2381 | ||
1998 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 2382 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1999 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2383 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
2000 | (unsigned long long)sector, size); | 2384 | (unsigned long long)sector, size); |
2001 | return false; | 2385 | return -EINVAL; |
2002 | } | 2386 | } |
2003 | if (sector + (size>>9) > capacity) { | 2387 | if (sector + (size>>9) > capacity) { |
2004 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2388 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
2005 | (unsigned long long)sector, size); | 2389 | (unsigned long long)sector, size); |
2006 | return false; | 2390 | return -EINVAL; |
2007 | } | 2391 | } |
2008 | 2392 | ||
2009 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | 2393 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { |
2010 | verb = 1; | 2394 | verb = 1; |
2011 | switch (cmd) { | 2395 | switch (pi->cmd) { |
2012 | case P_DATA_REQUEST: | 2396 | case P_DATA_REQUEST: |
2013 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); | 2397 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); |
2014 | break; | 2398 | break; |
@@ -2023,35 +2407,34 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2023 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); | 2407 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); |
2024 | break; | 2408 | break; |
2025 | default: | 2409 | default: |
2026 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2410 | BUG(); |
2027 | cmdname(cmd)); | ||
2028 | } | 2411 | } |
2029 | if (verb && __ratelimit(&drbd_ratelimit_state)) | 2412 | if (verb && __ratelimit(&drbd_ratelimit_state)) |
2030 | dev_err(DEV, "Can not satisfy peer's read request, " | 2413 | dev_err(DEV, "Can not satisfy peer's read request, " |
2031 | "no local data.\n"); | 2414 | "no local data.\n"); |
2032 | 2415 | ||
2033 | /* drain the payload, if any */ | 2416 | /* drain the payload, if any */ |
2034 | return drbd_drain_block(mdev, digest_size); | 2417 | return drbd_drain_block(mdev, pi->size); |
2035 | } | 2418 | } |
2036 | 2419 | ||
2037 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 2420 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
2038 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 2421 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
2039 | * which in turn might block on the other node at this very place. */ | 2422 | * which in turn might block on the other node at this very place. */ |
2040 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | 2423 | peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO); |
2041 | if (!e) { | 2424 | if (!peer_req) { |
2042 | put_ldev(mdev); | 2425 | put_ldev(mdev); |
2043 | return false; | 2426 | return -ENOMEM; |
2044 | } | 2427 | } |
2045 | 2428 | ||
2046 | switch (cmd) { | 2429 | switch (pi->cmd) { |
2047 | case P_DATA_REQUEST: | 2430 | case P_DATA_REQUEST: |
2048 | e->w.cb = w_e_end_data_req; | 2431 | peer_req->w.cb = w_e_end_data_req; |
2049 | fault_type = DRBD_FAULT_DT_RD; | 2432 | fault_type = DRBD_FAULT_DT_RD; |
2050 | /* application IO, don't drbd_rs_begin_io */ | 2433 | /* application IO, don't drbd_rs_begin_io */ |
2051 | goto submit; | 2434 | goto submit; |
2052 | 2435 | ||
2053 | case P_RS_DATA_REQUEST: | 2436 | case P_RS_DATA_REQUEST: |
2054 | e->w.cb = w_e_end_rsdata_req; | 2437 | peer_req->w.cb = w_e_end_rsdata_req; |
2055 | fault_type = DRBD_FAULT_RS_RD; | 2438 | fault_type = DRBD_FAULT_RS_RD; |
2056 | /* used in the sector offset progress display */ | 2439 | /* used in the sector offset progress display */ |
2057 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2440 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -2060,28 +2443,28 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2060 | case P_OV_REPLY: | 2443 | case P_OV_REPLY: |
2061 | case P_CSUM_RS_REQUEST: | 2444 | case P_CSUM_RS_REQUEST: |
2062 | fault_type = DRBD_FAULT_RS_RD; | 2445 | fault_type = DRBD_FAULT_RS_RD; |
2063 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | 2446 | di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); |
2064 | if (!di) | 2447 | if (!di) |
2065 | goto out_free_e; | 2448 | goto out_free_e; |
2066 | 2449 | ||
2067 | di->digest_size = digest_size; | 2450 | di->digest_size = pi->size; |
2068 | di->digest = (((char *)di)+sizeof(struct digest_info)); | 2451 | di->digest = (((char *)di)+sizeof(struct digest_info)); |
2069 | 2452 | ||
2070 | e->digest = di; | 2453 | peer_req->digest = di; |
2071 | e->flags |= EE_HAS_DIGEST; | 2454 | peer_req->flags |= EE_HAS_DIGEST; |
2072 | 2455 | ||
2073 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | 2456 | if (drbd_recv_all(mdev->tconn, di->digest, pi->size)) |
2074 | goto out_free_e; | 2457 | goto out_free_e; |
2075 | 2458 | ||
2076 | if (cmd == P_CSUM_RS_REQUEST) { | 2459 | if (pi->cmd == P_CSUM_RS_REQUEST) { |
2077 | D_ASSERT(mdev->agreed_pro_version >= 89); | 2460 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); |
2078 | e->w.cb = w_e_end_csum_rs_req; | 2461 | peer_req->w.cb = w_e_end_csum_rs_req; |
2079 | /* used in the sector offset progress display */ | 2462 | /* used in the sector offset progress display */ |
2080 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2463 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
2081 | } else if (cmd == P_OV_REPLY) { | 2464 | } else if (pi->cmd == P_OV_REPLY) { |
2082 | /* track progress, we may need to throttle */ | 2465 | /* track progress, we may need to throttle */ |
2083 | atomic_add(size >> 9, &mdev->rs_sect_in); | 2466 | atomic_add(size >> 9, &mdev->rs_sect_in); |
2084 | e->w.cb = w_e_end_ov_reply; | 2467 | peer_req->w.cb = w_e_end_ov_reply; |
2085 | dec_rs_pending(mdev); | 2468 | dec_rs_pending(mdev); |
2086 | /* drbd_rs_begin_io done when we sent this request, | 2469 | /* drbd_rs_begin_io done when we sent this request, |
2087 | * but accounting still needs to be done. */ | 2470 | * but accounting still needs to be done. */ |
@@ -2091,7 +2474,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2091 | 2474 | ||
2092 | case P_OV_REQUEST: | 2475 | case P_OV_REQUEST: |
2093 | if (mdev->ov_start_sector == ~(sector_t)0 && | 2476 | if (mdev->ov_start_sector == ~(sector_t)0 && |
2094 | mdev->agreed_pro_version >= 90) { | 2477 | mdev->tconn->agreed_pro_version >= 90) { |
2095 | unsigned long now = jiffies; | 2478 | unsigned long now = jiffies; |
2096 | int i; | 2479 | int i; |
2097 | mdev->ov_start_sector = sector; | 2480 | mdev->ov_start_sector = sector; |
@@ -2105,15 +2488,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2105 | dev_info(DEV, "Online Verify start sector: %llu\n", | 2488 | dev_info(DEV, "Online Verify start sector: %llu\n", |
2106 | (unsigned long long)sector); | 2489 | (unsigned long long)sector); |
2107 | } | 2490 | } |
2108 | e->w.cb = w_e_end_ov_req; | 2491 | peer_req->w.cb = w_e_end_ov_req; |
2109 | fault_type = DRBD_FAULT_RS_RD; | 2492 | fault_type = DRBD_FAULT_RS_RD; |
2110 | break; | 2493 | break; |
2111 | 2494 | ||
2112 | default: | 2495 | default: |
2113 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2496 | BUG(); |
2114 | cmdname(cmd)); | ||
2115 | fault_type = DRBD_FAULT_MAX; | ||
2116 | goto out_free_e; | ||
2117 | } | 2497 | } |
2118 | 2498 | ||
2119 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous | 2499 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous |
@@ -2148,30 +2528,31 @@ submit_for_resync: | |||
2148 | 2528 | ||
2149 | submit: | 2529 | submit: |
2150 | inc_unacked(mdev); | 2530 | inc_unacked(mdev); |
2151 | spin_lock_irq(&mdev->req_lock); | 2531 | spin_lock_irq(&mdev->tconn->req_lock); |
2152 | list_add_tail(&e->w.list, &mdev->read_ee); | 2532 | list_add_tail(&peer_req->w.list, &mdev->read_ee); |
2153 | spin_unlock_irq(&mdev->req_lock); | 2533 | spin_unlock_irq(&mdev->tconn->req_lock); |
2154 | 2534 | ||
2155 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) | 2535 | if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0) |
2156 | return true; | 2536 | return 0; |
2157 | 2537 | ||
2158 | /* don't care for the reason here */ | 2538 | /* don't care for the reason here */ |
2159 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2539 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
2160 | spin_lock_irq(&mdev->req_lock); | 2540 | spin_lock_irq(&mdev->tconn->req_lock); |
2161 | list_del(&e->w.list); | 2541 | list_del(&peer_req->w.list); |
2162 | spin_unlock_irq(&mdev->req_lock); | 2542 | spin_unlock_irq(&mdev->tconn->req_lock); |
2163 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ | 2543 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ |
2164 | 2544 | ||
2165 | out_free_e: | 2545 | out_free_e: |
2166 | put_ldev(mdev); | 2546 | put_ldev(mdev); |
2167 | drbd_free_ee(mdev, e); | 2547 | drbd_free_peer_req(mdev, peer_req); |
2168 | return false; | 2548 | return -EIO; |
2169 | } | 2549 | } |
2170 | 2550 | ||
2171 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | 2551 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) |
2172 | { | 2552 | { |
2173 | int self, peer, rv = -100; | 2553 | int self, peer, rv = -100; |
2174 | unsigned long ch_self, ch_peer; | 2554 | unsigned long ch_self, ch_peer; |
2555 | enum drbd_after_sb_p after_sb_0p; | ||
2175 | 2556 | ||
2176 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | 2557 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; |
2177 | peer = mdev->p_uuid[UI_BITMAP] & 1; | 2558 | peer = mdev->p_uuid[UI_BITMAP] & 1; |
@@ -2179,10 +2560,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2179 | ch_peer = mdev->p_uuid[UI_SIZE]; | 2560 | ch_peer = mdev->p_uuid[UI_SIZE]; |
2180 | ch_self = mdev->comm_bm_set; | 2561 | ch_self = mdev->comm_bm_set; |
2181 | 2562 | ||
2182 | switch (mdev->net_conf->after_sb_0p) { | 2563 | rcu_read_lock(); |
2564 | after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p; | ||
2565 | rcu_read_unlock(); | ||
2566 | switch (after_sb_0p) { | ||
2183 | case ASB_CONSENSUS: | 2567 | case ASB_CONSENSUS: |
2184 | case ASB_DISCARD_SECONDARY: | 2568 | case ASB_DISCARD_SECONDARY: |
2185 | case ASB_CALL_HELPER: | 2569 | case ASB_CALL_HELPER: |
2570 | case ASB_VIOLENTLY: | ||
2186 | dev_err(DEV, "Configuration error.\n"); | 2571 | dev_err(DEV, "Configuration error.\n"); |
2187 | break; | 2572 | break; |
2188 | case ASB_DISCONNECT: | 2573 | case ASB_DISCONNECT: |
@@ -2211,14 +2596,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2211 | "Using discard-least-changes instead\n"); | 2596 | "Using discard-least-changes instead\n"); |
2212 | case ASB_DISCARD_ZERO_CHG: | 2597 | case ASB_DISCARD_ZERO_CHG: |
2213 | if (ch_peer == 0 && ch_self == 0) { | 2598 | if (ch_peer == 0 && ch_self == 0) { |
2214 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | 2599 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2215 | ? -1 : 1; | 2600 | ? -1 : 1; |
2216 | break; | 2601 | break; |
2217 | } else { | 2602 | } else { |
2218 | if (ch_peer == 0) { rv = 1; break; } | 2603 | if (ch_peer == 0) { rv = 1; break; } |
2219 | if (ch_self == 0) { rv = -1; break; } | 2604 | if (ch_self == 0) { rv = -1; break; } |
2220 | } | 2605 | } |
2221 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | 2606 | if (after_sb_0p == ASB_DISCARD_ZERO_CHG) |
2222 | break; | 2607 | break; |
2223 | case ASB_DISCARD_LEAST_CHG: | 2608 | case ASB_DISCARD_LEAST_CHG: |
2224 | if (ch_self < ch_peer) | 2609 | if (ch_self < ch_peer) |
@@ -2227,7 +2612,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2227 | rv = 1; | 2612 | rv = 1; |
2228 | else /* ( ch_self == ch_peer ) */ | 2613 | else /* ( ch_self == ch_peer ) */ |
2229 | /* Well, then use something else. */ | 2614 | /* Well, then use something else. */ |
2230 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | 2615 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2231 | ? -1 : 1; | 2616 | ? -1 : 1; |
2232 | break; | 2617 | break; |
2233 | case ASB_DISCARD_LOCAL: | 2618 | case ASB_DISCARD_LOCAL: |
@@ -2243,13 +2628,18 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2243 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | 2628 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) |
2244 | { | 2629 | { |
2245 | int hg, rv = -100; | 2630 | int hg, rv = -100; |
2631 | enum drbd_after_sb_p after_sb_1p; | ||
2246 | 2632 | ||
2247 | switch (mdev->net_conf->after_sb_1p) { | 2633 | rcu_read_lock(); |
2634 | after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p; | ||
2635 | rcu_read_unlock(); | ||
2636 | switch (after_sb_1p) { | ||
2248 | case ASB_DISCARD_YOUNGER_PRI: | 2637 | case ASB_DISCARD_YOUNGER_PRI: |
2249 | case ASB_DISCARD_OLDER_PRI: | 2638 | case ASB_DISCARD_OLDER_PRI: |
2250 | case ASB_DISCARD_LEAST_CHG: | 2639 | case ASB_DISCARD_LEAST_CHG: |
2251 | case ASB_DISCARD_LOCAL: | 2640 | case ASB_DISCARD_LOCAL: |
2252 | case ASB_DISCARD_REMOTE: | 2641 | case ASB_DISCARD_REMOTE: |
2642 | case ASB_DISCARD_ZERO_CHG: | ||
2253 | dev_err(DEV, "Configuration error.\n"); | 2643 | dev_err(DEV, "Configuration error.\n"); |
2254 | break; | 2644 | break; |
2255 | case ASB_DISCONNECT: | 2645 | case ASB_DISCONNECT: |
@@ -2292,8 +2682,12 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | |||
2292 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | 2682 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) |
2293 | { | 2683 | { |
2294 | int hg, rv = -100; | 2684 | int hg, rv = -100; |
2685 | enum drbd_after_sb_p after_sb_2p; | ||
2295 | 2686 | ||
2296 | switch (mdev->net_conf->after_sb_2p) { | 2687 | rcu_read_lock(); |
2688 | after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p; | ||
2689 | rcu_read_unlock(); | ||
2690 | switch (after_sb_2p) { | ||
2297 | case ASB_DISCARD_YOUNGER_PRI: | 2691 | case ASB_DISCARD_YOUNGER_PRI: |
2298 | case ASB_DISCARD_OLDER_PRI: | 2692 | case ASB_DISCARD_OLDER_PRI: |
2299 | case ASB_DISCARD_LEAST_CHG: | 2693 | case ASB_DISCARD_LEAST_CHG: |
@@ -2301,6 +2695,7 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | |||
2301 | case ASB_DISCARD_REMOTE: | 2695 | case ASB_DISCARD_REMOTE: |
2302 | case ASB_CONSENSUS: | 2696 | case ASB_CONSENSUS: |
2303 | case ASB_DISCARD_SECONDARY: | 2697 | case ASB_DISCARD_SECONDARY: |
2698 | case ASB_DISCARD_ZERO_CHG: | ||
2304 | dev_err(DEV, "Configuration error.\n"); | 2699 | dev_err(DEV, "Configuration error.\n"); |
2305 | break; | 2700 | break; |
2306 | case ASB_VIOLENTLY: | 2701 | case ASB_VIOLENTLY: |
@@ -2386,13 +2781,15 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2386 | 2781 | ||
2387 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | 2782 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { |
2388 | 2783 | ||
2389 | if (mdev->agreed_pro_version < 91) | 2784 | if (mdev->tconn->agreed_pro_version < 91) |
2390 | return -1091; | 2785 | return -1091; |
2391 | 2786 | ||
2392 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | 2787 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && |
2393 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { | 2788 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { |
2394 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); | 2789 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); |
2395 | drbd_uuid_set_bm(mdev, 0UL); | 2790 | drbd_uuid_move_history(mdev); |
2791 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | ||
2792 | mdev->ldev->md.uuid[UI_BITMAP] = 0; | ||
2396 | 2793 | ||
2397 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | 2794 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, |
2398 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | 2795 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); |
@@ -2407,7 +2804,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2407 | 2804 | ||
2408 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | 2805 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { |
2409 | 2806 | ||
2410 | if (mdev->agreed_pro_version < 91) | 2807 | if (mdev->tconn->agreed_pro_version < 91) |
2411 | return -1091; | 2808 | return -1091; |
2412 | 2809 | ||
2413 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | 2810 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && |
@@ -2440,7 +2837,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2440 | case 1: /* self_pri && !peer_pri */ return 1; | 2837 | case 1: /* self_pri && !peer_pri */ return 1; |
2441 | case 2: /* !self_pri && peer_pri */ return -1; | 2838 | case 2: /* !self_pri && peer_pri */ return -1; |
2442 | case 3: /* self_pri && peer_pri */ | 2839 | case 3: /* self_pri && peer_pri */ |
2443 | dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); | 2840 | dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
2444 | return dc ? -1 : 1; | 2841 | return dc ? -1 : 1; |
2445 | } | 2842 | } |
2446 | } | 2843 | } |
@@ -2453,14 +2850,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2453 | *rule_nr = 51; | 2850 | *rule_nr = 51; |
2454 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | 2851 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); |
2455 | if (self == peer) { | 2852 | if (self == peer) { |
2456 | if (mdev->agreed_pro_version < 96 ? | 2853 | if (mdev->tconn->agreed_pro_version < 96 ? |
2457 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == | 2854 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == |
2458 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : | 2855 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : |
2459 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { | 2856 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { |
2460 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2857 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2461 | resync as sync source modifications of the peer's UUIDs. */ | 2858 | resync as sync source modifications of the peer's UUIDs. */ |
2462 | 2859 | ||
2463 | if (mdev->agreed_pro_version < 91) | 2860 | if (mdev->tconn->agreed_pro_version < 91) |
2464 | return -1091; | 2861 | return -1091; |
2465 | 2862 | ||
2466 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | 2863 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; |
@@ -2490,18 +2887,18 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2490 | *rule_nr = 71; | 2887 | *rule_nr = 71; |
2491 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | 2888 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); |
2492 | if (self == peer) { | 2889 | if (self == peer) { |
2493 | if (mdev->agreed_pro_version < 96 ? | 2890 | if (mdev->tconn->agreed_pro_version < 96 ? |
2494 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == | 2891 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == |
2495 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : | 2892 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : |
2496 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { | 2893 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { |
2497 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2894 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2498 | resync as sync source modifications of our UUIDs. */ | 2895 | resync as sync source modifications of our UUIDs. */ |
2499 | 2896 | ||
2500 | if (mdev->agreed_pro_version < 91) | 2897 | if (mdev->tconn->agreed_pro_version < 91) |
2501 | return -1091; | 2898 | return -1091; |
2502 | 2899 | ||
2503 | _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | 2900 | __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); |
2504 | _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); | 2901 | __drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); |
2505 | 2902 | ||
2506 | dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); | 2903 | dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); |
2507 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | 2904 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, |
@@ -2545,20 +2942,24 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2545 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | 2942 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, |
2546 | enum drbd_disk_state peer_disk) __must_hold(local) | 2943 | enum drbd_disk_state peer_disk) __must_hold(local) |
2547 | { | 2944 | { |
2548 | int hg, rule_nr; | ||
2549 | enum drbd_conns rv = C_MASK; | 2945 | enum drbd_conns rv = C_MASK; |
2550 | enum drbd_disk_state mydisk; | 2946 | enum drbd_disk_state mydisk; |
2947 | struct net_conf *nc; | ||
2948 | int hg, rule_nr, rr_conflict, tentative; | ||
2551 | 2949 | ||
2552 | mydisk = mdev->state.disk; | 2950 | mydisk = mdev->state.disk; |
2553 | if (mydisk == D_NEGOTIATING) | 2951 | if (mydisk == D_NEGOTIATING) |
2554 | mydisk = mdev->new_state_tmp.disk; | 2952 | mydisk = mdev->new_state_tmp.disk; |
2555 | 2953 | ||
2556 | dev_info(DEV, "drbd_sync_handshake:\n"); | 2954 | dev_info(DEV, "drbd_sync_handshake:\n"); |
2955 | |||
2956 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
2557 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); | 2957 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); |
2558 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, | 2958 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, |
2559 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | 2959 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); |
2560 | 2960 | ||
2561 | hg = drbd_uuid_compare(mdev, &rule_nr); | 2961 | hg = drbd_uuid_compare(mdev, &rule_nr); |
2962 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
2562 | 2963 | ||
2563 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); | 2964 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); |
2564 | 2965 | ||
@@ -2584,7 +2985,10 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2584 | if (abs(hg) == 100) | 2985 | if (abs(hg) == 100) |
2585 | drbd_khelper(mdev, "initial-split-brain"); | 2986 | drbd_khelper(mdev, "initial-split-brain"); |
2586 | 2987 | ||
2587 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | 2988 | rcu_read_lock(); |
2989 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2990 | |||
2991 | if (hg == 100 || (hg == -100 && nc->always_asbp)) { | ||
2588 | int pcount = (mdev->state.role == R_PRIMARY) | 2992 | int pcount = (mdev->state.role == R_PRIMARY) |
2589 | + (peer_role == R_PRIMARY); | 2993 | + (peer_role == R_PRIMARY); |
2590 | int forced = (hg == -100); | 2994 | int forced = (hg == -100); |
@@ -2613,9 +3017,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2613 | } | 3017 | } |
2614 | 3018 | ||
2615 | if (hg == -100) { | 3019 | if (hg == -100) { |
2616 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | 3020 | if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1)) |
2617 | hg = -1; | 3021 | hg = -1; |
2618 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | 3022 | if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1)) |
2619 | hg = 1; | 3023 | hg = 1; |
2620 | 3024 | ||
2621 | if (abs(hg) < 100) | 3025 | if (abs(hg) < 100) |
@@ -2623,6 +3027,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2623 | "Sync from %s node\n", | 3027 | "Sync from %s node\n", |
2624 | (hg < 0) ? "peer" : "this"); | 3028 | (hg < 0) ? "peer" : "this"); |
2625 | } | 3029 | } |
3030 | rr_conflict = nc->rr_conflict; | ||
3031 | tentative = nc->tentative; | ||
3032 | rcu_read_unlock(); | ||
2626 | 3033 | ||
2627 | if (hg == -100) { | 3034 | if (hg == -100) { |
2628 | /* FIXME this log message is not correct if we end up here | 3035 | /* FIXME this log message is not correct if we end up here |
@@ -2641,7 +3048,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2641 | 3048 | ||
2642 | if (hg < 0 && /* by intention we do not use mydisk here. */ | 3049 | if (hg < 0 && /* by intention we do not use mydisk here. */ |
2643 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | 3050 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { |
2644 | switch (mdev->net_conf->rr_conflict) { | 3051 | switch (rr_conflict) { |
2645 | case ASB_CALL_HELPER: | 3052 | case ASB_CALL_HELPER: |
2646 | drbd_khelper(mdev, "pri-lost"); | 3053 | drbd_khelper(mdev, "pri-lost"); |
2647 | /* fall through */ | 3054 | /* fall through */ |
@@ -2654,7 +3061,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2654 | } | 3061 | } |
2655 | } | 3062 | } |
2656 | 3063 | ||
2657 | if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) { | 3064 | if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) { |
2658 | if (hg == 0) | 3065 | if (hg == 0) |
2659 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); | 3066 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); |
2660 | else | 3067 | else |
@@ -2686,33 +3093,29 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2686 | return rv; | 3093 | return rv; |
2687 | } | 3094 | } |
2688 | 3095 | ||
2689 | /* returns 1 if invalid */ | 3096 | static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) |
2690 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2691 | { | 3097 | { |
2692 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | 3098 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ |
2693 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | 3099 | if (peer == ASB_DISCARD_REMOTE) |
2694 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | 3100 | return ASB_DISCARD_LOCAL; |
2695 | return 0; | ||
2696 | 3101 | ||
2697 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | 3102 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ |
2698 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | 3103 | if (peer == ASB_DISCARD_LOCAL) |
2699 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | 3104 | return ASB_DISCARD_REMOTE; |
2700 | return 1; | ||
2701 | 3105 | ||
2702 | /* everything else is valid if they are equal on both sides. */ | 3106 | /* everything else is valid if they are equal on both sides. */ |
2703 | if (peer == self) | 3107 | return peer; |
2704 | return 0; | ||
2705 | |||
2706 | /* everything else is invalid. */ | ||
2707 | return 1; | ||
2708 | } | 3108 | } |
2709 | 3109 | ||
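Aside: the deleted cmp_after_sb() answered "are these two after-split-brain settings compatible?", returning 1 for invalid combinations. The new convert_after_sb() instead maps the peer's setting into the local point of view, so the receive_protocol() hunks further down can use a plain equality test. A minimal sketch of the equivalent predicate (the helper name is invented for this illustration, not taken from the patch):

	/* Sketch only: the peer's DISCARD_REMOTE pairs with our DISCARD_LOCAL
	 * and vice versa; every other policy has to match literally. */
	static bool after_sb_settings_compatible(enum drbd_after_sb_p peer,
						 enum drbd_after_sb_p self)
	{
		return convert_after_sb(peer) == self;
	}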
2710 | static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3110 | static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) |
2711 | { | 3111 | { |
2712 | struct p_protocol *p = &mdev->data.rbuf.protocol; | 3112 | struct p_protocol *p = pi->data; |
2713 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | 3113 | enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; |
2714 | int p_want_lose, p_two_primaries, cf; | 3114 | int p_proto, p_discard_my_data, p_two_primaries, cf; |
2715 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | 3115 | struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; |
3116 | char integrity_alg[SHARED_SECRET_MAX] = ""; | ||
3117 | struct crypto_hash *peer_integrity_tfm = NULL; | ||
3118 | void *int_dig_in = NULL, *int_dig_vv = NULL; | ||
2716 | 3119 | ||
2717 | p_proto = be32_to_cpu(p->protocol); | 3120 | p_proto = be32_to_cpu(p->protocol); |
2718 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | 3121 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); |
@@ -2720,63 +3123,138 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig | |||
2720 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | 3123 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); |
2721 | p_two_primaries = be32_to_cpu(p->two_primaries); | 3124 | p_two_primaries = be32_to_cpu(p->two_primaries); |
2722 | cf = be32_to_cpu(p->conn_flags); | 3125 | cf = be32_to_cpu(p->conn_flags); |
2723 | p_want_lose = cf & CF_WANT_LOSE; | 3126 | p_discard_my_data = cf & CF_DISCARD_MY_DATA; |
2724 | |||
2725 | clear_bit(CONN_DRY_RUN, &mdev->flags); | ||
2726 | 3127 | ||
2727 | if (cf & CF_DRY_RUN) | 3128 | if (tconn->agreed_pro_version >= 87) { |
2728 | set_bit(CONN_DRY_RUN, &mdev->flags); | 3129 | int err; |
2729 | 3130 | ||
2730 | if (p_proto != mdev->net_conf->wire_protocol) { | 3131 | if (pi->size > sizeof(integrity_alg)) |
2731 | dev_err(DEV, "incompatible communication protocols\n"); | 3132 | return -EIO; |
2732 | goto disconnect; | 3133 | err = drbd_recv_all(tconn, integrity_alg, pi->size); |
3134 | if (err) | ||
3135 | return err; | ||
3136 | integrity_alg[SHARED_SECRET_MAX - 1] = 0; | ||
2733 | } | 3137 | } |
2734 | 3138 | ||
2735 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | 3139 | if (pi->cmd != P_PROTOCOL_UPDATE) { |
2736 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | 3140 | clear_bit(CONN_DRY_RUN, &tconn->flags); |
2737 | goto disconnect; | ||
2738 | } | ||
2739 | 3141 | ||
2740 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | 3142 | if (cf & CF_DRY_RUN) |
2741 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | 3143 | set_bit(CONN_DRY_RUN, &tconn->flags); |
2742 | goto disconnect; | ||
2743 | } | ||
2744 | 3144 | ||
2745 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | 3145 | rcu_read_lock(); |
2746 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | 3146 | nc = rcu_dereference(tconn->net_conf); |
2747 | goto disconnect; | ||
2748 | } | ||
2749 | 3147 | ||
2750 | if (p_want_lose && mdev->net_conf->want_lose) { | 3148 | if (p_proto != nc->wire_protocol) { |
2751 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | 3149 | conn_err(tconn, "incompatible %s settings\n", "protocol"); |
2752 | goto disconnect; | 3150 | goto disconnect_rcu_unlock; |
2753 | } | 3151 | } |
2754 | 3152 | ||
2755 | if (p_two_primaries != mdev->net_conf->two_primaries) { | 3153 | if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { |
2756 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | 3154 | conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri"); |
2757 | goto disconnect; | 3155 | goto disconnect_rcu_unlock; |
3156 | } | ||
3157 | |||
3158 | if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { | ||
3159 | conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri"); | ||
3160 | goto disconnect_rcu_unlock; | ||
3161 | } | ||
3162 | |||
3163 | if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { | ||
3164 | conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri"); | ||
3165 | goto disconnect_rcu_unlock; | ||
3166 | } | ||
3167 | |||
3168 | if (p_discard_my_data && nc->discard_my_data) { | ||
3169 | conn_err(tconn, "incompatible %s settings\n", "discard-my-data"); | ||
3170 | goto disconnect_rcu_unlock; | ||
3171 | } | ||
3172 | |||
3173 | if (p_two_primaries != nc->two_primaries) { | ||
3174 | conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries"); | ||
3175 | goto disconnect_rcu_unlock; | ||
3176 | } | ||
3177 | |||
3178 | if (strcmp(integrity_alg, nc->integrity_alg)) { | ||
3179 | conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg"); | ||
3180 | goto disconnect_rcu_unlock; | ||
3181 | } | ||
3182 | |||
3183 | rcu_read_unlock(); | ||
2758 | } | 3184 | } |
2759 | 3185 | ||
2760 | if (mdev->agreed_pro_version >= 87) { | 3186 | if (integrity_alg[0]) { |
2761 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | 3187 | int hash_size; |
2762 | 3188 | ||
2763 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | 3189 | /* |
2764 | return false; | 3190 | * We can only change the peer data integrity algorithm |
3191 | * here. Changing our own data integrity algorithm | ||
3192 | * requires that we send a P_PROTOCOL_UPDATE packet at | ||
3193 | * the same time; otherwise, the peer has no way to | ||
3194 | * tell between which packets the algorithm should | ||
3195 | * change. | ||
3196 | */ | ||
2765 | 3197 | ||
2766 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | 3198 | peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); |
2767 | if (strcmp(p_integrity_alg, my_alg)) { | 3199 | if (!peer_integrity_tfm) { |
2768 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | 3200 | conn_err(tconn, "peer data-integrity-alg %s not supported\n", |
3201 | integrity_alg); | ||
2769 | goto disconnect; | 3202 | goto disconnect; |
2770 | } | 3203 | } |
2771 | dev_info(DEV, "data-integrity-alg: %s\n", | 3204 | |
2772 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | 3205 | hash_size = crypto_hash_digestsize(peer_integrity_tfm); |
3206 | int_dig_in = kmalloc(hash_size, GFP_KERNEL); | ||
3207 | int_dig_vv = kmalloc(hash_size, GFP_KERNEL); | ||
3208 | if (!(int_dig_in && int_dig_vv)) { | ||
3209 | conn_err(tconn, "Allocation of buffers for data integrity checking failed\n"); | ||
3210 | goto disconnect; | ||
3211 | } | ||
3212 | } | ||
3213 | |||
3214 | new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | ||
3215 | if (!new_net_conf) { | ||
3216 | conn_err(tconn, "Allocation of new net_conf failed\n"); | ||
3217 | goto disconnect; | ||
2773 | } | 3218 | } |
2774 | 3219 | ||
2775 | return true; | 3220 | mutex_lock(&tconn->data.mutex); |
3221 | mutex_lock(&tconn->conf_update); | ||
3222 | old_net_conf = tconn->net_conf; | ||
3223 | *new_net_conf = *old_net_conf; | ||
3224 | |||
3225 | new_net_conf->wire_protocol = p_proto; | ||
3226 | new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); | ||
3227 | new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); | ||
3228 | new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); | ||
3229 | new_net_conf->two_primaries = p_two_primaries; | ||
2776 | 3230 | ||
3231 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
3232 | mutex_unlock(&tconn->conf_update); | ||
3233 | mutex_unlock(&tconn->data.mutex); | ||
3234 | |||
3235 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
3236 | kfree(tconn->int_dig_in); | ||
3237 | kfree(tconn->int_dig_vv); | ||
3238 | tconn->peer_integrity_tfm = peer_integrity_tfm; | ||
3239 | tconn->int_dig_in = int_dig_in; | ||
3240 | tconn->int_dig_vv = int_dig_vv; | ||
3241 | |||
3242 | if (strcmp(old_net_conf->integrity_alg, integrity_alg)) | ||
3243 | conn_info(tconn, "peer data-integrity-alg: %s\n", | ||
3244 | integrity_alg[0] ? integrity_alg : "(none)"); | ||
3245 | |||
3246 | synchronize_rcu(); | ||
3247 | kfree(old_net_conf); | ||
3248 | return 0; | ||
3249 | |||
3250 | disconnect_rcu_unlock: | ||
3251 | rcu_read_unlock(); | ||
2777 | disconnect: | 3252 | disconnect: |
2778 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3253 | crypto_free_hash(peer_integrity_tfm); |
2779 | return false; | 3254 | kfree(int_dig_in); |
3255 | kfree(int_dig_vv); | ||
3256 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
3257 | return -EIO; | ||
2780 | } | 3258 | } |
2781 | 3259 | ||
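Aside: receive_protocol() above swaps in the connection's net_conf with the standard RCU publish pattern: allocate a new object, copy the old one while holding tconn->conf_update, patch the negotiated fields, publish with rcu_assign_pointer(), and free the old object only after synchronize_rcu(), so readers inside rcu_read_lock()/rcu_read_unlock() never see freed memory. A condensed, self-contained sketch of that pattern (example_conf, cur_conf and example_lock are stand-ins invented for this illustration, not symbols from the patch):

	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_conf {
		int wire_protocol;
		/* ... more negotiated settings ... */
	};

	static struct example_conf __rcu *cur_conf;
	static DEFINE_MUTEX(example_lock);

	static int update_example_conf(int p_proto)
	{
		struct example_conf *new_c, *old_c;

		new_c = kmalloc(sizeof(*new_c), GFP_KERNEL);
		if (!new_c)
			return -ENOMEM;

		mutex_lock(&example_lock);		/* serialize configuration writers */
		old_c = rcu_dereference_protected(cur_conf,
						  lockdep_is_held(&example_lock));
		*new_c = *old_c;			/* start from the current settings */
		new_c->wire_protocol = p_proto;		/* apply the negotiated values */
		rcu_assign_pointer(cur_conf, new_c);	/* publish to rcu_read_lock() readers */
		mutex_unlock(&example_lock);

		synchronize_rcu();			/* wait until no reader can still hold old_c */
		kfree(old_c);
		return 0;
	}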
2782 | /* helper function | 3260 | /* helper function |
@@ -2798,24 +3276,64 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | |||
2798 | alg, name, PTR_ERR(tfm)); | 3276 | alg, name, PTR_ERR(tfm)); |
2799 | return tfm; | 3277 | return tfm; |
2800 | } | 3278 | } |
2801 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2802 | crypto_free_hash(tfm); | ||
2803 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2804 | return ERR_PTR(-EINVAL); | ||
2805 | } | ||
2806 | return tfm; | 3279 | return tfm; |
2807 | } | 3280 | } |
2808 | 3281 | ||
2809 | static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) | 3282 | static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi) |
3283 | { | ||
3284 | void *buffer = tconn->data.rbuf; | ||
3285 | int size = pi->size; | ||
3286 | |||
3287 | while (size) { | ||
3288 | int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); | ||
3289 | s = drbd_recv(tconn, buffer, s); | ||
3290 | if (s <= 0) { | ||
3291 | if (s < 0) | ||
3292 | return s; | ||
3293 | break; | ||
3294 | } | ||
3295 | size -= s; | ||
3296 | } | ||
3297 | if (size) | ||
3298 | return -EIO; | ||
3299 | return 0; | ||
3300 | } | ||
3301 | |||
3302 | /* | ||
3303 | * config_unknown_volume - device configuration command for unknown volume | ||
3304 | * | ||
3305 | * When a device is added to an existing connection, the node on which the | ||
3306 | * device is added first will send configuration commands to its peer but the | ||
3307 | * peer will not know about the device yet. It will warn and ignore these | ||
3308 | * commands. Once the device is added on the second node, the second node will | ||
3309 | * send the same device configuration commands, but in the other direction. | ||
3310 | * | ||
3311 | * (We can also end up here if drbd is misconfigured.) | ||
3312 | */ | ||
3313 | static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi) | ||
2810 | { | 3314 | { |
2811 | int ok = true; | 3315 | conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n", |
2812 | struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; | 3316 | cmdname(pi->cmd), pi->vnr); |
3317 | return ignore_remaining_packet(tconn, pi); | ||
3318 | } | ||
3319 | |||
3320 | static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3321 | { | ||
3322 | struct drbd_conf *mdev; | ||
3323 | struct p_rs_param_95 *p; | ||
2813 | unsigned int header_size, data_size, exp_max_sz; | 3324 | unsigned int header_size, data_size, exp_max_sz; |
2814 | struct crypto_hash *verify_tfm = NULL; | 3325 | struct crypto_hash *verify_tfm = NULL; |
2815 | struct crypto_hash *csums_tfm = NULL; | 3326 | struct crypto_hash *csums_tfm = NULL; |
2816 | const int apv = mdev->agreed_pro_version; | 3327 | struct net_conf *old_net_conf, *new_net_conf = NULL; |
2817 | int *rs_plan_s = NULL; | 3328 | struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; |
3329 | const int apv = tconn->agreed_pro_version; | ||
3330 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
2818 | int fifo_size = 0; | 3331 | int fifo_size = 0; |
3332 | int err; | ||
3333 | |||
3334 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3335 | if (!mdev) | ||
3336 | return config_unknown_volume(tconn, pi); | ||
2819 | 3337 | ||
2820 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | 3338 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) |
2821 | : apv == 88 ? sizeof(struct p_rs_param) | 3339 | : apv == 88 ? sizeof(struct p_rs_param) |
@@ -2823,32 +3341,49 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2823 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 3341 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2824 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 3342 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2825 | 3343 | ||
2826 | if (packet_size > exp_max_sz) { | 3344 | if (pi->size > exp_max_sz) { |
2827 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | 3345 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", |
2828 | packet_size, exp_max_sz); | 3346 | pi->size, exp_max_sz); |
2829 | return false; | 3347 | return -EIO; |
2830 | } | 3348 | } |
2831 | 3349 | ||
2832 | if (apv <= 88) { | 3350 | if (apv <= 88) { |
2833 | header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); | 3351 | header_size = sizeof(struct p_rs_param); |
2834 | data_size = packet_size - header_size; | 3352 | data_size = pi->size - header_size; |
2835 | } else if (apv <= 94) { | 3353 | } else if (apv <= 94) { |
2836 | header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); | 3354 | header_size = sizeof(struct p_rs_param_89); |
2837 | data_size = packet_size - header_size; | 3355 | data_size = pi->size - header_size; |
2838 | D_ASSERT(data_size == 0); | 3356 | D_ASSERT(data_size == 0); |
2839 | } else { | 3357 | } else { |
2840 | header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); | 3358 | header_size = sizeof(struct p_rs_param_95); |
2841 | data_size = packet_size - header_size; | 3359 | data_size = pi->size - header_size; |
2842 | D_ASSERT(data_size == 0); | 3360 | D_ASSERT(data_size == 0); |
2843 | } | 3361 | } |
2844 | 3362 | ||
2845 | /* initialize verify_alg and csums_alg */ | 3363 | /* initialize verify_alg and csums_alg */ |
3364 | p = pi->data; | ||
2846 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | 3365 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); |
2847 | 3366 | ||
2848 | if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) | 3367 | err = drbd_recv_all(mdev->tconn, p, header_size); |
2849 | return false; | 3368 | if (err) |
3369 | return err; | ||
2850 | 3370 | ||
2851 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3371 | mutex_lock(&mdev->tconn->conf_update); |
3372 | old_net_conf = mdev->tconn->net_conf; | ||
3373 | if (get_ldev(mdev)) { | ||
3374 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3375 | if (!new_disk_conf) { | ||
3376 | put_ldev(mdev); | ||
3377 | mutex_unlock(&mdev->tconn->conf_update); | ||
3378 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3379 | return -ENOMEM; | ||
3380 | } | ||
3381 | |||
3382 | old_disk_conf = mdev->ldev->disk_conf; | ||
3383 | *new_disk_conf = *old_disk_conf; | ||
3384 | |||
3385 | new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); | ||
3386 | } | ||
2852 | 3387 | ||
2853 | if (apv >= 88) { | 3388 | if (apv >= 88) { |
2854 | if (apv == 88) { | 3389 | if (apv == 88) { |
@@ -2856,12 +3391,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2856 | dev_err(DEV, "verify-alg of wrong size, " | 3391 | dev_err(DEV, "verify-alg of wrong size, " |
2857 | "peer wants %u, accepting only up to %u byte\n", | 3392 | "peer wants %u, accepting only up to %u byte\n", |
2858 | data_size, SHARED_SECRET_MAX); | 3393 | data_size, SHARED_SECRET_MAX); |
2859 | return false; | 3394 | err = -EIO; |
3395 | goto reconnect; | ||
2860 | } | 3396 | } |
2861 | 3397 | ||
2862 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | 3398 | err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size); |
2863 | return false; | 3399 | if (err) |
2864 | 3400 | goto reconnect; | |
2865 | /* we expect NUL terminated string */ | 3401 | /* we expect NUL terminated string */ |
2866 | /* but just in case someone tries to be evil */ | 3402 | /* but just in case someone tries to be evil */ |
2867 | D_ASSERT(p->verify_alg[data_size-1] == 0); | 3403 | D_ASSERT(p->verify_alg[data_size-1] == 0); |
@@ -2876,10 +3412,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2876 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | 3412 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; |
2877 | } | 3413 | } |
2878 | 3414 | ||
2879 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | 3415 | if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { |
2880 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3416 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2881 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | 3417 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", |
2882 | mdev->sync_conf.verify_alg, p->verify_alg); | 3418 | old_net_conf->verify_alg, p->verify_alg); |
2883 | goto disconnect; | 3419 | goto disconnect; |
2884 | } | 3420 | } |
2885 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3421 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2890,10 +3426,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2890 | } | 3426 | } |
2891 | } | 3427 | } |
2892 | 3428 | ||
2893 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | 3429 | if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { |
2894 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3430 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2895 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | 3431 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", |
2896 | mdev->sync_conf.csums_alg, p->csums_alg); | 3432 | old_net_conf->csums_alg, p->csums_alg); |
2897 | goto disconnect; | 3433 | goto disconnect; |
2898 | } | 3434 | } |
2899 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3435 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2904,57 +3440,91 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2904 | } | 3440 | } |
2905 | } | 3441 | } |
2906 | 3442 | ||
2907 | if (apv > 94) { | 3443 | if (apv > 94 && new_disk_conf) { |
2908 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3444 | new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); |
2909 | mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); | 3445 | new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); |
2910 | mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); | 3446 | new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); |
2911 | mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); | 3447 | new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); |
2912 | mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); | 3448 | |
2913 | 3449 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | |
2914 | fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 3450 | if (fifo_size != mdev->rs_plan_s->size) { |
2915 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 3451 | new_plan = fifo_alloc(fifo_size); |
2916 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); | 3452 | if (!new_plan) { |
2917 | if (!rs_plan_s) { | ||
2918 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | 3453 | dev_err(DEV, "kmalloc of fifo_buffer failed"); |
3454 | put_ldev(mdev); | ||
2919 | goto disconnect; | 3455 | goto disconnect; |
2920 | } | 3456 | } |
2921 | } | 3457 | } |
2922 | } | 3458 | } |
2923 | 3459 | ||
2924 | spin_lock(&mdev->peer_seq_lock); | 3460 | if (verify_tfm || csums_tfm) { |
2925 | /* lock against drbd_nl_syncer_conf() */ | 3461 | new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
2926 | if (verify_tfm) { | 3462 | if (!new_net_conf) { |
2927 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | 3463 | dev_err(DEV, "Allocation of new net_conf failed\n"); |
2928 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | 3464 | goto disconnect; |
2929 | crypto_free_hash(mdev->verify_tfm); | 3465 | } |
2930 | mdev->verify_tfm = verify_tfm; | 3466 | |
2931 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | 3467 | *new_net_conf = *old_net_conf; |
2932 | } | 3468 | |
2933 | if (csums_tfm) { | 3469 | if (verify_tfm) { |
2934 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | 3470 | strcpy(new_net_conf->verify_alg, p->verify_alg); |
2935 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | 3471 | new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; |
2936 | crypto_free_hash(mdev->csums_tfm); | 3472 | crypto_free_hash(mdev->tconn->verify_tfm); |
2937 | mdev->csums_tfm = csums_tfm; | 3473 | mdev->tconn->verify_tfm = verify_tfm; |
2938 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | 3474 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); |
2939 | } | 3475 | } |
2940 | if (fifo_size != mdev->rs_plan_s.size) { | 3476 | if (csums_tfm) { |
2941 | kfree(mdev->rs_plan_s.values); | 3477 | strcpy(new_net_conf->csums_alg, p->csums_alg); |
2942 | mdev->rs_plan_s.values = rs_plan_s; | 3478 | new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; |
2943 | mdev->rs_plan_s.size = fifo_size; | 3479 | crypto_free_hash(mdev->tconn->csums_tfm); |
2944 | mdev->rs_planed = 0; | 3480 | mdev->tconn->csums_tfm = csums_tfm; |
3481 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
3482 | } | ||
3483 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
2945 | } | 3484 | } |
2946 | spin_unlock(&mdev->peer_seq_lock); | ||
2947 | } | 3485 | } |
2948 | 3486 | ||
2949 | return ok; | 3487 | if (new_disk_conf) { |
3488 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3489 | put_ldev(mdev); | ||
3490 | } | ||
3491 | |||
3492 | if (new_plan) { | ||
3493 | old_plan = mdev->rs_plan_s; | ||
3494 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
3495 | } | ||
3496 | |||
3497 | mutex_unlock(&mdev->tconn->conf_update); | ||
3498 | synchronize_rcu(); | ||
3499 | if (new_net_conf) | ||
3500 | kfree(old_net_conf); | ||
3501 | kfree(old_disk_conf); | ||
3502 | kfree(old_plan); | ||
3503 | |||
3504 | return 0; | ||
3505 | |||
3506 | reconnect: | ||
3507 | if (new_disk_conf) { | ||
3508 | put_ldev(mdev); | ||
3509 | kfree(new_disk_conf); | ||
3510 | } | ||
3511 | mutex_unlock(&mdev->tconn->conf_update); | ||
3512 | return -EIO; | ||
3513 | |||
2950 | disconnect: | 3514 | disconnect: |
3515 | kfree(new_plan); | ||
3516 | if (new_disk_conf) { | ||
3517 | put_ldev(mdev); | ||
3518 | kfree(new_disk_conf); | ||
3519 | } | ||
3520 | mutex_unlock(&mdev->tconn->conf_update); | ||
2951 | /* just for completeness: actually not needed, | 3521 | /* just for completeness: actually not needed, |
2952 | * as this is not reached if csums_tfm was ok. */ | 3522 | * as this is not reached if csums_tfm was ok. */ |
2953 | crypto_free_hash(csums_tfm); | 3523 | crypto_free_hash(csums_tfm); |
2954 | /* but free the verify_tfm again, if csums_tfm did not work out */ | 3524 | /* but free the verify_tfm again, if csums_tfm did not work out */ |
2955 | crypto_free_hash(verify_tfm); | 3525 | crypto_free_hash(verify_tfm); |
2956 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3526 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
2957 | return false; | 3527 | return -EIO; |
2958 | } | 3528 | } |
2959 | 3529 | ||
2960 | /* warn if the arguments differ by more than 12.5% */ | 3530 | /* warn if the arguments differ by more than 12.5% */ |
@@ -2970,59 +3540,77 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, | |||
2970 | (unsigned long long)a, (unsigned long long)b); | 3540 | (unsigned long long)a, (unsigned long long)b); |
2971 | } | 3541 | } |
2972 | 3542 | ||
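Aside: only the tail of warn_if_differ_considerably() is visible in the hunk above. Judging from its "more than 12.5%" comment, the check presumably compares the difference against one eighth of either argument, roughly as sketched below; the guard conditions and message text are assumptions for illustration, not copied from the patch:

	static void warn_if_differ_considerably(struct drbd_conf *mdev,
						const char *s, sector_t a, sector_t b)
	{
		sector_t d;

		if (a == 0 || b == 0)
			return;				/* nothing meaningful to compare */
		d = (a > b) ? a - b : b - a;
		if (d > (a >> 3) || d > (b >> 3))	/* difference exceeds 1/8 = 12.5% */
			dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n",
				 s, (unsigned long long)a, (unsigned long long)b);
	}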
2973 | static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3543 | static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) |
2974 | { | 3544 | { |
2975 | struct p_sizes *p = &mdev->data.rbuf.sizes; | 3545 | struct drbd_conf *mdev; |
3546 | struct p_sizes *p = pi->data; | ||
2976 | enum determine_dev_size dd = unchanged; | 3547 | enum determine_dev_size dd = unchanged; |
2977 | sector_t p_size, p_usize, my_usize; | 3548 | sector_t p_size, p_usize, my_usize; |
2978 | int ldsc = 0; /* local disk size changed */ | 3549 | int ldsc = 0; /* local disk size changed */ |
2979 | enum dds_flags ddsf; | 3550 | enum dds_flags ddsf; |
2980 | 3551 | ||
3552 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3553 | if (!mdev) | ||
3554 | return config_unknown_volume(tconn, pi); | ||
3555 | |||
2981 | p_size = be64_to_cpu(p->d_size); | 3556 | p_size = be64_to_cpu(p->d_size); |
2982 | p_usize = be64_to_cpu(p->u_size); | 3557 | p_usize = be64_to_cpu(p->u_size); |
2983 | 3558 | ||
2984 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2985 | dev_err(DEV, "some backing storage is needed\n"); | ||
2986 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2987 | return false; | ||
2988 | } | ||
2989 | |||
2990 | /* just store the peer's disk size for now. | 3559 | /* just store the peer's disk size for now. |
2991 | * we still need to figure out whether we accept that. */ | 3560 | * we still need to figure out whether we accept that. */ |
2992 | mdev->p_size = p_size; | 3561 | mdev->p_size = p_size; |
2993 | 3562 | ||
2994 | if (get_ldev(mdev)) { | 3563 | if (get_ldev(mdev)) { |
3564 | rcu_read_lock(); | ||
3565 | my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
3566 | rcu_read_unlock(); | ||
3567 | |||
2995 | warn_if_differ_considerably(mdev, "lower level device sizes", | 3568 | warn_if_differ_considerably(mdev, "lower level device sizes", |
2996 | p_size, drbd_get_max_capacity(mdev->ldev)); | 3569 | p_size, drbd_get_max_capacity(mdev->ldev)); |
2997 | warn_if_differ_considerably(mdev, "user requested size", | 3570 | warn_if_differ_considerably(mdev, "user requested size", |
2998 | p_usize, mdev->ldev->dc.disk_size); | 3571 | p_usize, my_usize); |
2999 | 3572 | ||
3000 | /* if this is the first connect, or an otherwise expected | 3573 | /* if this is the first connect, or an otherwise expected |
3001 | * param exchange, choose the minimum */ | 3574 | * param exchange, choose the minimum */ |
3002 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | 3575 | if (mdev->state.conn == C_WF_REPORT_PARAMS) |
3003 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | 3576 | p_usize = min_not_zero(my_usize, p_usize); |
3004 | p_usize); | ||
3005 | |||
3006 | my_usize = mdev->ldev->dc.disk_size; | ||
3007 | |||
3008 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
3009 | mdev->ldev->dc.disk_size = p_usize; | ||
3010 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3011 | (unsigned long)mdev->ldev->dc.disk_size); | ||
3012 | } | ||
3013 | 3577 | ||
3014 | /* Never shrink a device with usable data during connect. | 3578 | /* Never shrink a device with usable data during connect. |
3015 | But allow online shrinking if we are connected. */ | 3579 | But allow online shrinking if we are connected. */ |
3016 | if (drbd_new_dev_size(mdev, mdev->ldev, 0) < | 3580 | if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) < |
3017 | drbd_get_capacity(mdev->this_bdev) && | 3581 | drbd_get_capacity(mdev->this_bdev) && |
3018 | mdev->state.disk >= D_OUTDATED && | 3582 | mdev->state.disk >= D_OUTDATED && |
3019 | mdev->state.conn < C_CONNECTED) { | 3583 | mdev->state.conn < C_CONNECTED) { |
3020 | dev_err(DEV, "The peer's disk size is too small!\n"); | 3584 | dev_err(DEV, "The peer's disk size is too small!\n"); |
3021 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3585 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3022 | mdev->ldev->dc.disk_size = my_usize; | ||
3023 | put_ldev(mdev); | 3586 | put_ldev(mdev); |
3024 | return false; | 3587 | return -EIO; |
3588 | } | ||
3589 | |||
3590 | if (my_usize != p_usize) { | ||
3591 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; | ||
3592 | |||
3593 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3594 | if (!new_disk_conf) { | ||
3595 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3596 | put_ldev(mdev); | ||
3597 | return -ENOMEM; | ||
3598 | } | ||
3599 | |||
3600 | mutex_lock(&mdev->tconn->conf_update); | ||
3601 | old_disk_conf = mdev->ldev->disk_conf; | ||
3602 | *new_disk_conf = *old_disk_conf; | ||
3603 | new_disk_conf->disk_size = p_usize; | ||
3604 | |||
3605 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3606 | mutex_unlock(&mdev->tconn->conf_update); | ||
3607 | synchronize_rcu(); | ||
3608 | kfree(old_disk_conf); | ||
3609 | |||
3610 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3611 | (unsigned long)my_usize); | ||
3025 | } | 3612 | } |
3613 | |||
3026 | put_ldev(mdev); | 3614 | put_ldev(mdev); |
3027 | } | 3615 | } |
3028 | 3616 | ||
@@ -3031,7 +3619,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3031 | dd = drbd_determine_dev_size(mdev, ddsf); | 3619 | dd = drbd_determine_dev_size(mdev, ddsf); |
3032 | put_ldev(mdev); | 3620 | put_ldev(mdev); |
3033 | if (dd == dev_size_error) | 3621 | if (dd == dev_size_error) |
3034 | return false; | 3622 | return -EIO; |
3035 | drbd_md_sync(mdev); | 3623 | drbd_md_sync(mdev); |
3036 | } else { | 3624 | } else { |
3037 | /* I am diskless, need to accept the peer's size. */ | 3625 | /* I am diskless, need to accept the peer's size. */ |
@@ -3070,16 +3658,25 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3070 | } | 3658 | } |
3071 | } | 3659 | } |
3072 | 3660 | ||
3073 | return true; | 3661 | return 0; |
3074 | } | 3662 | } |
3075 | 3663 | ||
3076 | static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3664 | static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) |
3077 | { | 3665 | { |
3078 | struct p_uuids *p = &mdev->data.rbuf.uuids; | 3666 | struct drbd_conf *mdev; |
3667 | struct p_uuids *p = pi->data; | ||
3079 | u64 *p_uuid; | 3668 | u64 *p_uuid; |
3080 | int i, updated_uuids = 0; | 3669 | int i, updated_uuids = 0; |
3081 | 3670 | ||
3671 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3672 | if (!mdev) | ||
3673 | return config_unknown_volume(tconn, pi); | ||
3674 | |||
3082 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | 3675 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); |
3676 | if (!p_uuid) { | ||
3677 | dev_err(DEV, "kmalloc of p_uuid failed\n"); | ||
3678 | return -ENOMEM; | ||
3679 | } | ||
3083 | 3680 | ||
3084 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | 3681 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) |
3085 | p_uuid[i] = be64_to_cpu(p->uuid[i]); | 3682 | p_uuid[i] = be64_to_cpu(p->uuid[i]); |
@@ -3093,14 +3690,14 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3093 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | 3690 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { |
3094 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | 3691 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", |
3095 | (unsigned long long)mdev->ed_uuid); | 3692 | (unsigned long long)mdev->ed_uuid); |
3096 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3693 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3097 | return false; | 3694 | return -EIO; |
3098 | } | 3695 | } |
3099 | 3696 | ||
3100 | if (get_ldev(mdev)) { | 3697 | if (get_ldev(mdev)) { |
3101 | int skip_initial_sync = | 3698 | int skip_initial_sync = |
3102 | mdev->state.conn == C_CONNECTED && | 3699 | mdev->state.conn == C_CONNECTED && |
3103 | mdev->agreed_pro_version >= 90 && | 3700 | mdev->tconn->agreed_pro_version >= 90 && |
3104 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | 3701 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && |
3105 | (p_uuid[UI_FLAGS] & 8); | 3702 | (p_uuid[UI_FLAGS] & 8); |
3106 | if (skip_initial_sync) { | 3703 | if (skip_initial_sync) { |
@@ -3127,14 +3724,15 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3127 | ongoing cluster wide state change is finished. That is important if | 3724 | ongoing cluster wide state change is finished. That is important if |
3128 | we are primary and are detaching from our disk. We need to see the | 3725 | we are primary and are detaching from our disk. We need to see the |
3129 | new disk state... */ | 3726 | new disk state... */ |
3130 | wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | 3727 | mutex_lock(mdev->state_mutex); |
3728 | mutex_unlock(mdev->state_mutex); | ||
3131 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | 3729 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) |
3132 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | 3730 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); |
3133 | 3731 | ||
3134 | if (updated_uuids) | 3732 | if (updated_uuids) |
3135 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); | 3733 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); |
3136 | 3734 | ||
3137 | return true; | 3735 | return 0; |
3138 | } | 3736 | } |
3139 | 3737 | ||
3140 | /** | 3738 | /** |
@@ -3146,6 +3744,7 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3146 | union drbd_state ms; | 3744 | union drbd_state ms; |
3147 | 3745 | ||
3148 | static enum drbd_conns c_tab[] = { | 3746 | static enum drbd_conns c_tab[] = { |
3747 | [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, | ||
3149 | [C_CONNECTED] = C_CONNECTED, | 3748 | [C_CONNECTED] = C_CONNECTED, |
3150 | 3749 | ||
3151 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | 3750 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, |
@@ -3167,40 +3766,74 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3167 | return ms; | 3766 | return ms; |
3168 | } | 3767 | } |
3169 | 3768 | ||
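Aside: convert_state() translates a state value from the sender's point of view into the receiver's. Symmetric states map to themselves (C_CONNECTED, and now C_WF_REPORT_PARAMS), while role-specific ones are mirrored, as the C_STARTING_SYNC_S -> C_STARTING_SYNC_T entry above shows. A small illustration of what that means for the request-state handlers below (illustration only, not from the patch):

	union drbd_state peer_view, local_view;

	peer_view.i = 0;
	peer_view.conn = C_STARTING_SYNC_S;	/* peer says: "I start a resync as SyncSource" */
	local_view = convert_state(peer_view);	/* locally that reads C_STARTING_SYNC_T --
						 * we are the SyncTarget of that same resync */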
3170 | static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3769 | static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3171 | { | 3770 | { |
3172 | struct p_req_state *p = &mdev->data.rbuf.req_state; | 3771 | struct drbd_conf *mdev; |
3772 | struct p_req_state *p = pi->data; | ||
3173 | union drbd_state mask, val; | 3773 | union drbd_state mask, val; |
3174 | enum drbd_state_rv rv; | 3774 | enum drbd_state_rv rv; |
3175 | 3775 | ||
3776 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3777 | if (!mdev) | ||
3778 | return -EIO; | ||
3779 | |||
3176 | mask.i = be32_to_cpu(p->mask); | 3780 | mask.i = be32_to_cpu(p->mask); |
3177 | val.i = be32_to_cpu(p->val); | 3781 | val.i = be32_to_cpu(p->val); |
3178 | 3782 | ||
3179 | if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && | 3783 | if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) && |
3180 | test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { | 3784 | mutex_is_locked(mdev->state_mutex)) { |
3181 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | 3785 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); |
3182 | return true; | 3786 | return 0; |
3183 | } | 3787 | } |
3184 | 3788 | ||
3185 | mask = convert_state(mask); | 3789 | mask = convert_state(mask); |
3186 | val = convert_state(val); | 3790 | val = convert_state(val); |
3187 | 3791 | ||
3188 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | 3792 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); |
3189 | |||
3190 | drbd_send_sr_reply(mdev, rv); | 3793 | drbd_send_sr_reply(mdev, rv); |
3794 | |||
3191 | drbd_md_sync(mdev); | 3795 | drbd_md_sync(mdev); |
3192 | 3796 | ||
3193 | return true; | 3797 | return 0; |
3194 | } | 3798 | } |
3195 | 3799 | ||
3196 | static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3800 | static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3197 | { | 3801 | { |
3198 | struct p_state *p = &mdev->data.rbuf.state; | 3802 | struct p_req_state *p = pi->data; |
3803 | union drbd_state mask, val; | ||
3804 | enum drbd_state_rv rv; | ||
3805 | |||
3806 | mask.i = be32_to_cpu(p->mask); | ||
3807 | val.i = be32_to_cpu(p->val); | ||
3808 | |||
3809 | if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) && | ||
3810 | mutex_is_locked(&tconn->cstate_mutex)) { | ||
3811 | conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG); | ||
3812 | return 0; | ||
3813 | } | ||
3814 | |||
3815 | mask = convert_state(mask); | ||
3816 | val = convert_state(val); | ||
3817 | |||
3818 | rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); | ||
3819 | conn_send_sr_reply(tconn, rv); | ||
3820 | |||
3821 | return 0; | ||
3822 | } | ||
3823 | |||
3824 | static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3825 | { | ||
3826 | struct drbd_conf *mdev; | ||
3827 | struct p_state *p = pi->data; | ||
3199 | union drbd_state os, ns, peer_state; | 3828 | union drbd_state os, ns, peer_state; |
3200 | enum drbd_disk_state real_peer_disk; | 3829 | enum drbd_disk_state real_peer_disk; |
3201 | enum chg_state_flags cs_flags; | 3830 | enum chg_state_flags cs_flags; |
3202 | int rv; | 3831 | int rv; |
3203 | 3832 | ||
3833 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3834 | if (!mdev) | ||
3835 | return config_unknown_volume(tconn, pi); | ||
3836 | |||
3204 | peer_state.i = be32_to_cpu(p->state); | 3837 | peer_state.i = be32_to_cpu(p->state); |
3205 | 3838 | ||
3206 | real_peer_disk = peer_state.disk; | 3839 | real_peer_disk = peer_state.disk; |
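[Editor's note] The request-state handlers above decode a mask/val pair from the wire: mask selects which fields of the packed state the peer wants to change, val carries the new values, and convert_state() mirrors direction-dependent fields so the request is expressed from the receiver's point of view. A minimal userspace sketch of the mask/val idea follows; the 32-bit field layout here is hypothetical, not DRBD's real union drbd_state bitfields.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical packed state word: low byte = connection state,
 * next byte = disk state. */
#define CONN_SHIFT 0
#define CONN_MASK  (0xffu << CONN_SHIFT)
#define DISK_SHIFT 8
#define DISK_MASK  (0xffu << DISK_SHIFT)

/* A mask/val request changes only the fields selected by mask. */
static uint32_t apply_state_change(uint32_t old, uint32_t mask, uint32_t val)
{
	return (old & ~mask) | (val & mask);
}

int main(void)
{
	uint32_t state = (3u << DISK_SHIFT) | (1u << CONN_SHIFT);
	/* Peer asks to change only the connection field to 5. */
	uint32_t mask = CONN_MASK;
	uint32_t val  = 5u << CONN_SHIFT;

	state = apply_state_change(state, mask, val);
	printf("conn=%u disk=%u\n",
	       (unsigned)((state & CONN_MASK) >> CONN_SHIFT),
	       (unsigned)((state & DISK_MASK) >> DISK_SHIFT));
	return 0;
}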
@@ -3209,16 +3842,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3209 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); | 3842 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); |
3210 | } | 3843 | } |
3211 | 3844 | ||
3212 | spin_lock_irq(&mdev->req_lock); | 3845 | spin_lock_irq(&mdev->tconn->req_lock); |
3213 | retry: | 3846 | retry: |
3214 | os = ns = mdev->state; | 3847 | os = ns = drbd_read_state(mdev); |
3215 | spin_unlock_irq(&mdev->req_lock); | 3848 | spin_unlock_irq(&mdev->tconn->req_lock); |
3216 | 3849 | ||
3217 | /* If some other part of the code (asender thread, timeout) | 3850 | /* If some other part of the code (asender thread, timeout) |
3218 | * already decided to close the connection again, | 3851 | * already decided to close the connection again, |
3219 | * we must not "re-establish" it here. */ | 3852 | * we must not "re-establish" it here. */ |
3220 | if (os.conn <= C_TEAR_DOWN) | 3853 | if (os.conn <= C_TEAR_DOWN) |
3221 | return false; | 3854 | return -ECONNRESET; |
3222 | 3855 | ||
3223 | /* If this is the "end of sync" confirmation, usually the peer disk | 3856 | /* If this is the "end of sync" confirmation, usually the peer disk |
3224 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits | 3857 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits |
@@ -3246,10 +3879,18 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3246 | peer_state.conn == C_CONNECTED) { | 3879 | peer_state.conn == C_CONNECTED) { |
3247 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) | 3880 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) |
3248 | drbd_resync_finished(mdev); | 3881 | drbd_resync_finished(mdev); |
3249 | return true; | 3882 | return 0; |
3250 | } | 3883 | } |
3251 | } | 3884 | } |
3252 | 3885 | ||
3886 | /* explicit verify finished notification, stop sector reached. */ | ||
3887 | if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && | ||
3888 | peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { | ||
3889 | ov_out_of_sync_print(mdev); | ||
3890 | drbd_resync_finished(mdev); | ||
3891 | return 0; | ||
3892 | } | ||
3893 | |||
3253 | /* peer says his disk is inconsistent, while we think it is uptodate, | 3894 | /* peer says his disk is inconsistent, while we think it is uptodate, |
3254 | * and this happens while the peer still thinks we have a sync going on, | 3895 | * and this happens while the peer still thinks we have a sync going on, |
3255 | * but we think we are already done with the sync. | 3896 | * but we think we are already done with the sync. |
@@ -3298,17 +3939,17 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3298 | peer_state.disk = D_DISKLESS; | 3939 | peer_state.disk = D_DISKLESS; |
3299 | real_peer_disk = D_DISKLESS; | 3940 | real_peer_disk = D_DISKLESS; |
3300 | } else { | 3941 | } else { |
3301 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) | 3942 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags)) |
3302 | return false; | 3943 | return -EIO; |
3303 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); | 3944 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); |
3304 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3945 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3305 | return false; | 3946 | return -EIO; |
3306 | } | 3947 | } |
3307 | } | 3948 | } |
3308 | } | 3949 | } |
3309 | 3950 | ||
3310 | spin_lock_irq(&mdev->req_lock); | 3951 | spin_lock_irq(&mdev->tconn->req_lock); |
3311 | if (mdev->state.i != os.i) | 3952 | if (os.i != drbd_read_state(mdev).i) |
3312 | goto retry; | 3953 | goto retry; |
3313 | clear_bit(CONSIDER_RESYNC, &mdev->flags); | 3954 | clear_bit(CONSIDER_RESYNC, &mdev->flags); |
3314 | ns.peer = peer_state.role; | 3955 | ns.peer = peer_state.role; |
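[Editor's note] receive_state() above samples the device state under req_lock, drops the lock while it validates the peer's report, then re-takes the lock and jumps back to the retry label if the state changed underneath it. A rough userspace sketch of that optimistic snapshot-and-retry pattern, with a pthread spinlock standing in for req_lock and a plain integer standing in for drbd_read_state():

#include <stdio.h>
#include <pthread.h>

static pthread_spinlock_t lock;  /* stands in for mdev->tconn->req_lock */
static unsigned int state;       /* stands in for drbd_read_state(mdev).i */

static void update_state(unsigned int new_val)
{
	unsigned int os;

	pthread_spin_lock(&lock);
retry:
	os = state;                        /* snapshot under the lock */
	pthread_spin_unlock(&lock);

	/* ... long-running checks based on 'os' happen unlocked ... */

	pthread_spin_lock(&lock);
	if (os != state)                   /* somebody changed it meanwhile */
		goto retry;
	state = new_val;                   /* commit while still consistent */
	pthread_spin_unlock(&lock);
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	update_state(42);
	printf("state=%u\n", state);
	pthread_spin_destroy(&lock);
	return 0;
}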
@@ -3317,25 +3958,25 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3317 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) | 3958 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) |
3318 | ns.disk = mdev->new_state_tmp.disk; | 3959 | ns.disk = mdev->new_state_tmp.disk; |
3319 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); | 3960 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); |
3320 | if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && | 3961 | if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && |
3321 | test_bit(NEW_CUR_UUID, &mdev->flags)) { | 3962 | test_bit(NEW_CUR_UUID, &mdev->flags)) { |
3322 | /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this | 3963 | /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this |
3323 | for temporary network outages! | 3964 | for temporary network outages! |
3324 | spin_unlock_irq(&mdev->req_lock); | 3965 | spin_unlock_irq(&mdev->tconn->req_lock); |
3325 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); | 3966 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); |
3326 | tl_clear(mdev); | 3967 | tl_clear(mdev->tconn); |
3327 | drbd_uuid_new_current(mdev); | 3968 | drbd_uuid_new_current(mdev); |
3328 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 3969 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
3329 | drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); | 3970 | conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); |
3330 | return false; | 3971 | return -EIO; |
3331 | } | 3972 | } |
3332 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); | 3973 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); |
3333 | ns = mdev->state; | 3974 | ns = drbd_read_state(mdev); |
3334 | spin_unlock_irq(&mdev->req_lock); | 3975 | spin_unlock_irq(&mdev->tconn->req_lock); |
3335 | 3976 | ||
3336 | if (rv < SS_SUCCESS) { | 3977 | if (rv < SS_SUCCESS) { |
3337 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3978 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3338 | return false; | 3979 | return -EIO; |
3339 | } | 3980 | } |
3340 | 3981 | ||
3341 | if (os.conn > C_WF_REPORT_PARAMS) { | 3982 | if (os.conn > C_WF_REPORT_PARAMS) { |
@@ -3349,16 +3990,21 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3349 | } | 3990 | } |
3350 | } | 3991 | } |
3351 | 3992 | ||
3352 | mdev->net_conf->want_lose = 0; | 3993 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
3353 | 3994 | ||
3354 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | 3995 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ |
3355 | 3996 | ||
3356 | return true; | 3997 | return 0; |
3357 | } | 3998 | } |
3358 | 3999 | ||
3359 | static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4000 | static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) |
3360 | { | 4001 | { |
3361 | struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; | 4002 | struct drbd_conf *mdev; |
4003 | struct p_rs_uuid *p = pi->data; | ||
4004 | |||
4005 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4006 | if (!mdev) | ||
4007 | return -EIO; | ||
3362 | 4008 | ||
3363 | wait_event(mdev->misc_wait, | 4009 | wait_event(mdev->misc_wait, |
3364 | mdev->state.conn == C_WF_SYNC_UUID || | 4010 | mdev->state.conn == C_WF_SYNC_UUID || |
@@ -3381,7 +4027,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3381 | } else | 4027 | } else |
3382 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | 4028 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); |
3383 | 4029 | ||
3384 | return true; | 4030 | return 0; |
3385 | } | 4031 | } |
3386 | 4032 | ||
3387 | /** | 4033 | /** |
@@ -3391,27 +4037,27 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3391 | * code upon failure. | 4037 | * code upon failure. |
3392 | */ | 4038 | */ |
3393 | static int | 4039 | static int |
3394 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | 4040 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, |
3395 | unsigned long *buffer, struct bm_xfer_ctx *c) | 4041 | unsigned long *p, struct bm_xfer_ctx *c) |
3396 | { | 4042 | { |
3397 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 4043 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - |
3398 | unsigned want = num_words * sizeof(long); | 4044 | drbd_header_size(mdev->tconn); |
4045 | unsigned int num_words = min_t(size_t, data_size / sizeof(*p), | ||
4046 | c->bm_words - c->word_offset); | ||
4047 | unsigned int want = num_words * sizeof(*p); | ||
3399 | int err; | 4048 | int err; |
3400 | 4049 | ||
3401 | if (want != data_size) { | 4050 | if (want != size) { |
3402 | dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); | 4051 | dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size); |
3403 | return -EIO; | 4052 | return -EIO; |
3404 | } | 4053 | } |
3405 | if (want == 0) | 4054 | if (want == 0) |
3406 | return 0; | 4055 | return 0; |
3407 | err = drbd_recv(mdev, buffer, want); | 4056 | err = drbd_recv_all(mdev->tconn, p, want); |
3408 | if (err != want) { | 4057 | if (err) |
3409 | if (err >= 0) | ||
3410 | err = -EIO; | ||
3411 | return err; | 4058 | return err; |
3412 | } | ||
3413 | 4059 | ||
3414 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | 4060 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, p); |
3415 | 4061 | ||
3416 | c->word_offset += num_words; | 4062 | c->word_offset += num_words; |
3417 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 4063 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
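[Editor's note] receive_bitmap_plain() now derives the per-packet payload from the socket buffer size minus the protocol-dependent header size instead of the old fixed BM_PACKET_WORDS, and only accepts a packet whose size matches the whole number of longs it expects. A rough sketch of that sizing arithmetic, with made-up constants standing in for DRBD_SOCKET_BUFFER_SIZE and drbd_header_size():

#include <stdio.h>
#include <stddef.h>

#define SOCKET_BUFFER_SIZE 4096u   /* stands in for DRBD_SOCKET_BUFFER_SIZE */
#define HEADER_SIZE        16u     /* stands in for drbd_header_size(tconn) */

static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

int main(void)
{
	size_t bm_words = 10000;      /* total bitmap words to transfer */
	size_t word_offset = 0;
	size_t data_size = SOCKET_BUFFER_SIZE - HEADER_SIZE;

	while (word_offset < bm_words) {
		size_t num_words = min_size(data_size / sizeof(unsigned long),
					    bm_words - word_offset);
		size_t want = num_words * sizeof(unsigned long);

		/* A real receiver would check want == pi->size here and
		 * then pull in exactly that many bytes. */
		printf("packet: %zu words (%zu bytes) at offset %zu\n",
		       num_words, want, word_offset);
		word_offset += num_words;
	}
	return 0;
}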
@@ -3421,6 +4067,21 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3421 | return 1; | 4067 | return 1; |
3422 | } | 4068 | } |
3423 | 4069 | ||
4070 | static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) | ||
4071 | { | ||
4072 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
4073 | } | ||
4074 | |||
4075 | static int dcbp_get_start(struct p_compressed_bm *p) | ||
4076 | { | ||
4077 | return (p->encoding & 0x80) != 0; | ||
4078 | } | ||
4079 | |||
4080 | static int dcbp_get_pad_bits(struct p_compressed_bm *p) | ||
4081 | { | ||
4082 | return (p->encoding >> 4) & 0x7; | ||
4083 | } | ||
4084 | |||
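[Editor's note] The new dcbp_get_*() helpers replace the old DCBP_* macros; all three pick fields out of the single "encoding" byte of p_compressed_bm: bits 0-3 carry the bitmap code, bits 4-6 the pad-bit count, bit 7 the start-toggle flag. A toy encoder/decoder for that byte, matching the accessors above:

#include <stdint.h>
#include <stdio.h>

/* Pack the compressed-bitmap "encoding" byte the way the accessors read it:
 * code in bits 0-3, pad bits in bits 4-6, start flag in bit 7. */
static uint8_t dcbp_pack(unsigned code, unsigned pad_bits, int start)
{
	return (uint8_t)((code & 0x0f) |
			 ((pad_bits & 0x07) << 4) |
			 (start ? 0x80 : 0x00));
}

int main(void)
{
	uint8_t enc = dcbp_pack(2 /* e.g. RLE_VLI_Bits */, 5, 1);

	printf("code=%u pad=%u start=%d\n",
	       (unsigned)(enc & 0x0f),
	       (unsigned)((enc >> 4) & 0x7),
	       (enc & 0x80) != 0);
	return 0;
}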
3424 | /** | 4085 | /** |
3425 | * recv_bm_rle_bits | 4086 | * recv_bm_rle_bits |
3426 | * | 4087 | * |
@@ -3430,7 +4091,8 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3430 | static int | 4091 | static int |
3431 | recv_bm_rle_bits(struct drbd_conf *mdev, | 4092 | recv_bm_rle_bits(struct drbd_conf *mdev, |
3432 | struct p_compressed_bm *p, | 4093 | struct p_compressed_bm *p, |
3433 | struct bm_xfer_ctx *c) | 4094 | struct bm_xfer_ctx *c, |
4095 | unsigned int len) | ||
3434 | { | 4096 | { |
3435 | struct bitstream bs; | 4097 | struct bitstream bs; |
3436 | u64 look_ahead; | 4098 | u64 look_ahead; |
@@ -3438,12 +4100,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3438 | u64 tmp; | 4100 | u64 tmp; |
3439 | unsigned long s = c->bit_offset; | 4101 | unsigned long s = c->bit_offset; |
3440 | unsigned long e; | 4102 | unsigned long e; |
3441 | int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); | 4103 | int toggle = dcbp_get_start(p); |
3442 | int toggle = DCBP_get_start(p); | ||
3443 | int have; | 4104 | int have; |
3444 | int bits; | 4105 | int bits; |
3445 | 4106 | ||
3446 | bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); | 4107 | bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); |
3447 | 4108 | ||
3448 | bits = bitstream_get_bits(&bs, &look_ahead, 64); | 4109 | bits = bitstream_get_bits(&bs, &look_ahead, 64); |
3449 | if (bits < 0) | 4110 | if (bits < 0) |
@@ -3495,17 +4156,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3495 | static int | 4156 | static int |
3496 | decode_bitmap_c(struct drbd_conf *mdev, | 4157 | decode_bitmap_c(struct drbd_conf *mdev, |
3497 | struct p_compressed_bm *p, | 4158 | struct p_compressed_bm *p, |
3498 | struct bm_xfer_ctx *c) | 4159 | struct bm_xfer_ctx *c, |
4160 | unsigned int len) | ||
3499 | { | 4161 | { |
3500 | if (DCBP_get_code(p) == RLE_VLI_Bits) | 4162 | if (dcbp_get_code(p) == RLE_VLI_Bits) |
3501 | return recv_bm_rle_bits(mdev, p, c); | 4163 | return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p)); |
3502 | 4164 | ||
3503 | /* other variants had been implemented for evaluation, | 4165 | /* other variants had been implemented for evaluation, |
3504 | * but have been dropped as this one turned out to be "best" | 4166 | * but have been dropped as this one turned out to be "best" |
3505 | * during all our tests. */ | 4167 | * during all our tests. */ |
3506 | 4168 | ||
3507 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); | 4169 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); |
3508 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 4170 | conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3509 | return -EIO; | 4171 | return -EIO; |
3510 | } | 4172 | } |
3511 | 4173 | ||
@@ -3513,11 +4175,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3513 | const char *direction, struct bm_xfer_ctx *c) | 4175 | const char *direction, struct bm_xfer_ctx *c) |
3514 | { | 4176 | { |
3515 | /* what would it take to transfer it "plaintext" */ | 4177 | /* what would it take to transfer it "plaintext" */ |
3516 | unsigned plain = sizeof(struct p_header80) * | 4178 | unsigned int header_size = drbd_header_size(mdev->tconn); |
3517 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | 4179 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; |
3518 | + c->bm_words * sizeof(long); | 4180 | unsigned int plain = |
3519 | unsigned total = c->bytes[0] + c->bytes[1]; | 4181 | header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + |
3520 | unsigned r; | 4182 | c->bm_words * sizeof(unsigned long); |
4183 | unsigned int total = c->bytes[0] + c->bytes[1]; | ||
4184 | unsigned int r; | ||
3521 | 4185 | ||
3522 | /* total can not be zero. but just in case: */ | 4186 | /* total can not be zero. but just in case: */ |
3523 | if (total == 0) | 4187 | if (total == 0) |
@@ -3551,67 +4215,63 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3551 | in order to be agnostic to the 32 vs 64 bits issue. | 4215 | in order to be agnostic to the 32 vs 64 bits issue. |
3552 | 4216 | ||
3553 | returns 0 on failure, 1 if we successfully received it. */ | 4217 | returns 0 on failure, 1 if we successfully received it. */ |
3554 | static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4218 | static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) |
3555 | { | 4219 | { |
4220 | struct drbd_conf *mdev; | ||
3556 | struct bm_xfer_ctx c; | 4221 | struct bm_xfer_ctx c; |
3557 | void *buffer; | ||
3558 | int err; | 4222 | int err; |
3559 | int ok = false; | 4223 | |
3560 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 4224 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4225 | if (!mdev) | ||
4226 | return -EIO; | ||
3561 | 4227 | ||
3562 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); | 4228 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); |
3563 | /* you are supposed to send additional out-of-sync information | 4229 | /* you are supposed to send additional out-of-sync information |
3564 | * if you actually set bits during this phase */ | 4230 | * if you actually set bits during this phase */ |
3565 | 4231 | ||
3566 | /* maybe we should use some per thread scratch page, | ||
3567 | * and allocate that during initial device creation? */ | ||
3568 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3569 | if (!buffer) { | ||
3570 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3571 | goto out; | ||
3572 | } | ||
3573 | |||
3574 | c = (struct bm_xfer_ctx) { | 4232 | c = (struct bm_xfer_ctx) { |
3575 | .bm_bits = drbd_bm_bits(mdev), | 4233 | .bm_bits = drbd_bm_bits(mdev), |
3576 | .bm_words = drbd_bm_words(mdev), | 4234 | .bm_words = drbd_bm_words(mdev), |
3577 | }; | 4235 | }; |
3578 | 4236 | ||
3579 | for(;;) { | 4237 | for(;;) { |
3580 | if (cmd == P_BITMAP) { | 4238 | if (pi->cmd == P_BITMAP) |
3581 | err = receive_bitmap_plain(mdev, data_size, buffer, &c); | 4239 | err = receive_bitmap_plain(mdev, pi->size, pi->data, &c); |
3582 | } else if (cmd == P_COMPRESSED_BITMAP) { | 4240 | else if (pi->cmd == P_COMPRESSED_BITMAP) { |
3583 | /* MAYBE: sanity check that we speak proto >= 90, | 4241 | /* MAYBE: sanity check that we speak proto >= 90, |
3584 | * and the feature is enabled! */ | 4242 | * and the feature is enabled! */ |
3585 | struct p_compressed_bm *p; | 4243 | struct p_compressed_bm *p = pi->data; |
3586 | 4244 | ||
3587 | if (data_size > BM_PACKET_PAYLOAD_BYTES) { | 4245 | if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) { |
3588 | dev_err(DEV, "ReportCBitmap packet too large\n"); | 4246 | dev_err(DEV, "ReportCBitmap packet too large\n"); |
4247 | err = -EIO; | ||
3589 | goto out; | 4248 | goto out; |
3590 | } | 4249 | } |
3591 | /* use the page buff */ | 4250 | if (pi->size <= sizeof(*p)) { |
3592 | p = buffer; | 4251 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size); |
3593 | memcpy(p, h, sizeof(*h)); | 4252 | err = -EIO; |
3594 | if (drbd_recv(mdev, p->head.payload, data_size) != data_size) | ||
3595 | goto out; | ||
3596 | if (data_size <= (sizeof(*p) - sizeof(p->head))) { | ||
3597 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); | ||
3598 | goto out; | 4253 | goto out; |
3599 | } | 4254 | } |
3600 | err = decode_bitmap_c(mdev, p, &c); | 4255 | err = drbd_recv_all(mdev->tconn, p, pi->size); |
4256 | if (err) | ||
4257 | goto out; | ||
4258 | err = decode_bitmap_c(mdev, p, &c, pi->size); | ||
3601 | } else { | 4259 | } else { |
3602 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); | 4260 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); |
4261 | err = -EIO; | ||
3603 | goto out; | 4262 | goto out; |
3604 | } | 4263 | } |
3605 | 4264 | ||
3606 | c.packets[cmd == P_BITMAP]++; | 4265 | c.packets[pi->cmd == P_BITMAP]++; |
3607 | c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; | 4266 | c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size; |
3608 | 4267 | ||
3609 | if (err <= 0) { | 4268 | if (err <= 0) { |
3610 | if (err < 0) | 4269 | if (err < 0) |
3611 | goto out; | 4270 | goto out; |
3612 | break; | 4271 | break; |
3613 | } | 4272 | } |
3614 | if (!drbd_recv_header(mdev, &cmd, &data_size)) | 4273 | err = drbd_recv_header(mdev->tconn, pi); |
4274 | if (err) | ||
3615 | goto out; | 4275 | goto out; |
3616 | } | 4276 | } |
3617 | 4277 | ||
@@ -3620,8 +4280,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3620 | if (mdev->state.conn == C_WF_BITMAP_T) { | 4280 | if (mdev->state.conn == C_WF_BITMAP_T) { |
3621 | enum drbd_state_rv rv; | 4281 | enum drbd_state_rv rv; |
3622 | 4282 | ||
3623 | ok = !drbd_send_bitmap(mdev); | 4283 | err = drbd_send_bitmap(mdev); |
3624 | if (!ok) | 4284 | if (err) |
3625 | goto out; | 4285 | goto out; |
3626 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | 4286 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ |
3627 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | 4287 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); |
@@ -3632,47 +4292,40 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3632 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | 4292 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", |
3633 | drbd_conn_str(mdev->state.conn)); | 4293 | drbd_conn_str(mdev->state.conn)); |
3634 | } | 4294 | } |
4295 | err = 0; | ||
3635 | 4296 | ||
3636 | ok = true; | ||
3637 | out: | 4297 | out: |
3638 | drbd_bm_unlock(mdev); | 4298 | drbd_bm_unlock(mdev); |
3639 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | 4299 | if (!err && mdev->state.conn == C_WF_BITMAP_S) |
3640 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 4300 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
3641 | free_page((unsigned long) buffer); | 4301 | return err; |
3642 | return ok; | ||
3643 | } | 4302 | } |
3644 | 4303 | ||
3645 | static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4304 | static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi) |
3646 | { | 4305 | { |
3647 | /* TODO zero copy sink :) */ | 4306 | conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n", |
3648 | static char sink[128]; | 4307 | pi->cmd, pi->size); |
3649 | int size, want, r; | ||
3650 | 4308 | ||
3651 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | 4309 | return ignore_remaining_packet(tconn, pi); |
3652 | cmd, data_size); | ||
3653 | |||
3654 | size = data_size; | ||
3655 | while (size > 0) { | ||
3656 | want = min_t(int, size, sizeof(sink)); | ||
3657 | r = drbd_recv(mdev, sink, want); | ||
3658 | ERR_IF(r <= 0) break; | ||
3659 | size -= r; | ||
3660 | } | ||
3661 | return size == 0; | ||
3662 | } | 4310 | } |
3663 | 4311 | ||
3664 | static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4312 | static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi) |
3665 | { | 4313 | { |
3666 | /* Make sure we've acked all the TCP data associated | 4314 | /* Make sure we've acked all the TCP data associated |
3667 | * with the data requests being unplugged */ | 4315 | * with the data requests being unplugged */ |
3668 | drbd_tcp_quickack(mdev->data.socket); | 4316 | drbd_tcp_quickack(tconn->data.socket); |
3669 | 4317 | ||
3670 | return true; | 4318 | return 0; |
3671 | } | 4319 | } |
3672 | 4320 | ||
3673 | static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4321 | static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi) |
3674 | { | 4322 | { |
3675 | struct p_block_desc *p = &mdev->data.rbuf.block_desc; | 4323 | struct drbd_conf *mdev; |
4324 | struct p_block_desc *p = pi->data; | ||
4325 | |||
4326 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4327 | if (!mdev) | ||
4328 | return -EIO; | ||
3676 | 4329 | ||
3677 | switch (mdev->state.conn) { | 4330 | switch (mdev->state.conn) { |
3678 | case C_WF_SYNC_UUID: | 4331 | case C_WF_SYNC_UUID: |
@@ -3686,15 +4339,13 @@ static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
3686 | 4339 | ||
3687 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); | 4340 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); |
3688 | 4341 | ||
3689 | return true; | 4342 | return 0; |
3690 | } | 4343 | } |
3691 | 4344 | ||
3692 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); | ||
3693 | |||
3694 | struct data_cmd { | 4345 | struct data_cmd { |
3695 | int expect_payload; | 4346 | int expect_payload; |
3696 | size_t pkt_size; | 4347 | size_t pkt_size; |
3697 | drbd_cmd_handler_f function; | 4348 | int (*fn)(struct drbd_tconn *, struct packet_info *); |
3698 | }; | 4349 | }; |
3699 | 4350 | ||
3700 | static struct data_cmd drbd_cmd_handler[] = { | 4351 | static struct data_cmd drbd_cmd_handler[] = { |
@@ -3702,13 +4353,13 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3702 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, | 4353 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, |
3703 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , | 4354 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , |
3704 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , | 4355 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , |
3705 | [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4356 | [P_BITMAP] = { 1, 0, receive_bitmap } , |
3706 | [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4357 | [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , |
3707 | [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, | 4358 | [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, |
3708 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4359 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3709 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4360 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3710 | [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4361 | [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, |
3711 | [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4362 | [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, |
3712 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, | 4363 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3713 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, | 4364 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, |
3714 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, | 4365 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, |
@@ -3720,124 +4371,75 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3720 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, | 4371 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, |
3721 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, | 4372 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, |
3722 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, | 4373 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, |
3723 | /* anything missing from this table is in | 4374 | [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, |
3724 | * the asender_tbl, see get_asender_cmd */ | 4375 | [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3725 | [P_MAX_CMD] = { 0, 0, NULL }, | ||
3726 | }; | 4376 | }; |
3727 | 4377 | ||
3728 | /* All handler functions that expect a sub-header get that sub-heder in | 4378 | static void drbdd(struct drbd_tconn *tconn) |
3729 | mdev->data.rbuf.header.head.payload. | ||
3730 | |||
3731 | Usually in mdev->data.rbuf.header.head the callback can find the usual | ||
3732 | p_header, but they may not rely on that. Since there is also p_header95 ! | ||
3733 | */ | ||
3734 | |||
3735 | static void drbdd(struct drbd_conf *mdev) | ||
3736 | { | 4379 | { |
3737 | union p_header *header = &mdev->data.rbuf.header; | 4380 | struct packet_info pi; |
3738 | unsigned int packet_size; | ||
3739 | enum drbd_packets cmd; | ||
3740 | size_t shs; /* sub header size */ | 4381 | size_t shs; /* sub header size */ |
3741 | int rv; | 4382 | int err; |
4383 | |||
4384 | while (get_t_state(&tconn->receiver) == RUNNING) { | ||
4385 | struct data_cmd *cmd; | ||
3742 | 4386 | ||
3743 | while (get_t_state(&mdev->receiver) == Running) { | 4387 | drbd_thread_current_set_cpu(&tconn->receiver); |
3744 | drbd_thread_current_set_cpu(mdev); | 4388 | if (drbd_recv_header(tconn, &pi)) |
3745 | if (!drbd_recv_header(mdev, &cmd, &packet_size)) | ||
3746 | goto err_out; | 4389 | goto err_out; |
3747 | 4390 | ||
3748 | if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { | 4391 | cmd = &drbd_cmd_handler[pi.cmd]; |
3749 | dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); | 4392 | if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { |
4393 | conn_err(tconn, "Unexpected data packet %s (0x%04x)", | ||
4394 | cmdname(pi.cmd), pi.cmd); | ||
3750 | goto err_out; | 4395 | goto err_out; |
3751 | } | 4396 | } |
3752 | 4397 | ||
3753 | shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); | 4398 | shs = cmd->pkt_size; |
3754 | if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { | 4399 | if (pi.size > shs && !cmd->expect_payload) { |
3755 | dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); | 4400 | conn_err(tconn, "No payload expected %s l:%d\n", |
4401 | cmdname(pi.cmd), pi.size); | ||
3756 | goto err_out; | 4402 | goto err_out; |
3757 | } | 4403 | } |
3758 | 4404 | ||
3759 | if (shs) { | 4405 | if (shs) { |
3760 | rv = drbd_recv(mdev, &header->h80.payload, shs); | 4406 | err = drbd_recv_all_warn(tconn, pi.data, shs); |
3761 | if (unlikely(rv != shs)) { | 4407 | if (err) |
3762 | if (!signal_pending(current)) | ||
3763 | dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv); | ||
3764 | goto err_out; | 4408 | goto err_out; |
3765 | } | 4409 | pi.size -= shs; |
3766 | } | 4410 | } |
3767 | 4411 | ||
3768 | rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); | 4412 | err = cmd->fn(tconn, &pi); |
3769 | 4413 | if (err) { | |
3770 | if (unlikely(!rv)) { | 4414 | conn_err(tconn, "error receiving %s, e: %d l: %d!\n", |
3771 | dev_err(DEV, "error receiving %s, l: %d!\n", | 4415 | cmdname(pi.cmd), err, pi.size); |
3772 | cmdname(cmd), packet_size); | ||
3773 | goto err_out; | 4416 | goto err_out; |
3774 | } | 4417 | } |
3775 | } | 4418 | } |
4419 | return; | ||
3776 | 4420 | ||
3777 | if (0) { | 4421 | err_out: |
3778 | err_out: | 4422 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3779 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3780 | } | ||
3781 | /* If we leave here, we probably want to update at least the | ||
3782 | * "Connected" indicator on stable storage. Do so explicitly here. */ | ||
3783 | drbd_md_sync(mdev); | ||
3784 | } | 4423 | } |
3785 | 4424 | ||
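[Editor's note] drbdd() is now a straight table-driven loop: read a header into packet_info, look the command up in drbd_cmd_handler[], optionally pull in the fixed-size sub-header, and call the handler, which returns 0 or a negative errno. A stripped-down sketch of the same dispatch pattern; the packet names, sizes and handlers here are invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct packet_info { unsigned cmd; unsigned size; void *data; };

struct data_cmd {
	int expect_payload;
	size_t pkt_size;
	int (*fn)(struct packet_info *);
};

static int handle_ping(struct packet_info *pi)  { printf("ping, %u bytes\n", pi->size); return 0; }
static int handle_state(struct packet_info *pi) { printf("state, %u bytes\n", pi->size); return 0; }

enum { P_PING, P_STATE, P_MAX };

static const struct data_cmd handlers[] = {
	[P_PING]  = { 0, 0, handle_ping },
	[P_STATE] = { 0, 4, handle_state },
};

static int dispatch(struct packet_info *pi)
{
	const struct data_cmd *cmd;

	if (pi->cmd >= P_MAX || !handlers[pi->cmd].fn)
		return -1;                      /* unknown packet: protocol error */
	cmd = &handlers[pi->cmd];
	if (pi->size > cmd->pkt_size && !cmd->expect_payload)
		return -1;                      /* unexpected payload */
	return cmd->fn(pi);
}

int main(void)
{
	struct packet_info pi = { .cmd = P_STATE, .size = 4, .data = NULL };
	return dispatch(&pi) ? 1 : 0;
}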
3786 | void drbd_flush_workqueue(struct drbd_conf *mdev) | 4425 | void conn_flush_workqueue(struct drbd_tconn *tconn) |
3787 | { | 4426 | { |
3788 | struct drbd_wq_barrier barr; | 4427 | struct drbd_wq_barrier barr; |
3789 | 4428 | ||
3790 | barr.w.cb = w_prev_work_done; | 4429 | barr.w.cb = w_prev_work_done; |
4430 | barr.w.tconn = tconn; | ||
3791 | init_completion(&barr.done); | 4431 | init_completion(&barr.done); |
3792 | drbd_queue_work(&mdev->data.work, &barr.w); | 4432 | drbd_queue_work(&tconn->sender_work, &barr.w); |
3793 | wait_for_completion(&barr.done); | 4433 | wait_for_completion(&barr.done); |
3794 | } | 4434 | } |
3795 | 4435 | ||
3796 | void drbd_free_tl_hash(struct drbd_conf *mdev) | 4436 | static void conn_disconnect(struct drbd_tconn *tconn) |
3797 | { | ||
3798 | struct hlist_head *h; | ||
3799 | |||
3800 | spin_lock_irq(&mdev->req_lock); | ||
3801 | |||
3802 | if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { | ||
3803 | spin_unlock_irq(&mdev->req_lock); | ||
3804 | return; | ||
3805 | } | ||
3806 | /* paranoia code */ | ||
3807 | for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) | ||
3808 | if (h->first) | ||
3809 | dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", | ||
3810 | (int)(h - mdev->ee_hash), h->first); | ||
3811 | kfree(mdev->ee_hash); | ||
3812 | mdev->ee_hash = NULL; | ||
3813 | mdev->ee_hash_s = 0; | ||
3814 | |||
3815 | /* We may not have had the chance to wait for all locally pending | ||
3816 | * application requests. The hlist_add_fake() prevents access after | ||
3817 | * free on master bio completion. */ | ||
3818 | for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { | ||
3819 | struct drbd_request *req; | ||
3820 | struct hlist_node *pos, *n; | ||
3821 | hlist_for_each_entry_safe(req, pos, n, h, collision) { | ||
3822 | hlist_del_init(&req->collision); | ||
3823 | hlist_add_fake(&req->collision); | ||
3824 | } | ||
3825 | } | ||
3826 | |||
3827 | kfree(mdev->tl_hash); | ||
3828 | mdev->tl_hash = NULL; | ||
3829 | mdev->tl_hash_s = 0; | ||
3830 | spin_unlock_irq(&mdev->req_lock); | ||
3831 | } | ||
3832 | |||
3833 | static void drbd_disconnect(struct drbd_conf *mdev) | ||
3834 | { | 4437 | { |
3835 | enum drbd_fencing_p fp; | 4438 | struct drbd_conf *mdev; |
3836 | union drbd_state os, ns; | 4439 | enum drbd_conns oc; |
3837 | int rv = SS_UNKNOWN_ERROR; | 4440 | int vnr; |
3838 | unsigned int i; | ||
3839 | 4441 | ||
3840 | if (mdev->state.conn == C_STANDALONE) | 4442 | if (tconn->cstate == C_STANDALONE) |
3841 | return; | 4443 | return; |
3842 | 4444 | ||
3843 | /* We are about to start the cleanup after connection loss. | 4445 | /* We are about to start the cleanup after connection loss. |
@@ -3845,18 +4447,54 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3845 | * Usually we should be in some network failure state already, | 4447 | * Usually we should be in some network failure state already, |
3846 | * but just in case we are not, we fix it up here. | 4448 | * but just in case we are not, we fix it up here. |
3847 | */ | 4449 | */ |
3848 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 4450 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
3849 | 4451 | ||
3850 | /* asender does not clean up anything. it must not interfere, either */ | 4452 | /* asender does not clean up anything. it must not interfere, either */ |
3851 | drbd_thread_stop(&mdev->asender); | 4453 | drbd_thread_stop(&tconn->asender); |
3852 | drbd_free_sock(mdev); | 4454 | drbd_free_sock(tconn); |
4455 | |||
4456 | rcu_read_lock(); | ||
4457 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
4458 | kref_get(&mdev->kref); | ||
4459 | rcu_read_unlock(); | ||
4460 | drbd_disconnected(mdev); | ||
4461 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
4462 | rcu_read_lock(); | ||
4463 | } | ||
4464 | rcu_read_unlock(); | ||
4465 | |||
4466 | if (!list_empty(&tconn->current_epoch->list)) | ||
4467 | conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n"); | ||
4468 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | ||
4469 | atomic_set(&tconn->current_epoch->epoch_size, 0); | ||
4470 | tconn->send.seen_any_write_yet = false; | ||
4471 | |||
4472 | conn_info(tconn, "Connection closed\n"); | ||
4473 | |||
4474 | if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN) | ||
4475 | conn_try_outdate_peer_async(tconn); | ||
4476 | |||
4477 | spin_lock_irq(&tconn->req_lock); | ||
4478 | oc = tconn->cstate; | ||
4479 | if (oc >= C_UNCONNECTED) | ||
4480 | _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
4481 | |||
4482 | spin_unlock_irq(&tconn->req_lock); | ||
4483 | |||
4484 | if (oc == C_DISCONNECTING) | ||
4485 | conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); | ||
4486 | } | ||
4487 | |||
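[Editor's note] conn_disconnect() above walks the connection's volumes under rcu_read_lock() but must drop the RCU lock around drbd_disconnected(), which sleeps; the kref_get()/kref_put() pair keeps each device alive across that window. The same "take a reference, drop the lock, do the slow work, re-take the lock" shape in plain userspace terms, with a pthread mutex standing in for RCU and a bare counter standing in for the kref:

#include <pthread.h>
#include <stdio.h>

struct volume {
	int refcount;               /* stands in for struct kref */
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for rcu_read_lock() */
static struct volume volumes[] = { { 1, 0 }, { 1, 1 }, { 1, 2 } };

static void slow_cleanup(struct volume *v)
{
	/* sleeping work: in DRBD this is drbd_disconnected() */
	printf("cleaning up volume %d\n", v->id);
}

int main(void)
{
	size_t i;

	pthread_mutex_lock(&list_lock);
	for (i = 0; i < sizeof(volumes) / sizeof(volumes[0]); i++) {
		struct volume *v = &volumes[i];

		v->refcount++;                      /* kref_get() */
		pthread_mutex_unlock(&list_lock);   /* the callee may sleep now */
		slow_cleanup(v);
		v->refcount--;                      /* kref_put() */
		pthread_mutex_lock(&list_lock);     /* re-acquire before the next lookup */
	}
	pthread_mutex_unlock(&list_lock);
	return 0;
}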
4488 | static int drbd_disconnected(struct drbd_conf *mdev) | ||
4489 | { | ||
4490 | unsigned int i; | ||
3853 | 4491 | ||
3854 | /* wait for current activity to cease. */ | 4492 | /* wait for current activity to cease. */ |
3855 | spin_lock_irq(&mdev->req_lock); | 4493 | spin_lock_irq(&mdev->tconn->req_lock); |
3856 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 4494 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); |
3857 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); | 4495 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); |
3858 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); | 4496 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); |
3859 | spin_unlock_irq(&mdev->req_lock); | 4497 | spin_unlock_irq(&mdev->tconn->req_lock); |
3860 | 4498 | ||
3861 | /* We do not have data structures that would allow us to | 4499 | /* We do not have data structures that would allow us to |
3862 | * get the rs_pending_cnt down to 0 again. | 4500 | * get the rs_pending_cnt down to 0 again. |
@@ -3874,7 +4512,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3874 | atomic_set(&mdev->rs_pending_cnt, 0); | 4512 | atomic_set(&mdev->rs_pending_cnt, 0); |
3875 | wake_up(&mdev->misc_wait); | 4513 | wake_up(&mdev->misc_wait); |
3876 | 4514 | ||
3877 | /* make sure syncer is stopped and w_resume_next_sg queued */ | ||
3878 | del_timer_sync(&mdev->resync_timer); | 4515 | del_timer_sync(&mdev->resync_timer); |
3879 | resync_timer_fn((unsigned long)mdev); | 4516 | resync_timer_fn((unsigned long)mdev); |
3880 | 4517 | ||
@@ -3883,50 +4520,25 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3883 | * to be "canceled" */ | 4520 | * to be "canceled" */ |
3884 | drbd_flush_workqueue(mdev); | 4521 | drbd_flush_workqueue(mdev); |
3885 | 4522 | ||
3886 | /* This also does reclaim_net_ee(). If we do this too early, we might | 4523 | drbd_finish_peer_reqs(mdev); |
3887 | * miss some resync ee and pages.*/ | 4524 | |
3888 | drbd_process_done_ee(mdev); | 4525 | /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() |
4526 | might have queued a work item again. The one before drbd_finish_peer_reqs() is | ||
4527 | necessary to reclaim net_ee in drbd_finish_peer_reqs(). */ | ||
4528 | drbd_flush_workqueue(mdev); | ||
4529 | |||
4530 | /* need to do it again, drbd_finish_peer_reqs() may have populated it | ||
4531 | * again via drbd_try_clear_on_disk_bm(). */ | ||
4532 | drbd_rs_cancel_all(mdev); | ||
3889 | 4533 | ||
3890 | kfree(mdev->p_uuid); | 4534 | kfree(mdev->p_uuid); |
3891 | mdev->p_uuid = NULL; | 4535 | mdev->p_uuid = NULL; |
3892 | 4536 | ||
3893 | if (!is_susp(mdev->state)) | 4537 | if (!drbd_suspended(mdev)) |
3894 | tl_clear(mdev); | 4538 | tl_clear(mdev->tconn); |
3895 | |||
3896 | dev_info(DEV, "Connection closed\n"); | ||
3897 | 4539 | ||
3898 | drbd_md_sync(mdev); | 4540 | drbd_md_sync(mdev); |
3899 | 4541 | ||
3900 | fp = FP_DONT_CARE; | ||
3901 | if (get_ldev(mdev)) { | ||
3902 | fp = mdev->ldev->dc.fencing; | ||
3903 | put_ldev(mdev); | ||
3904 | } | ||
3905 | |||
3906 | if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) | ||
3907 | drbd_try_outdate_peer_async(mdev); | ||
3908 | |||
3909 | spin_lock_irq(&mdev->req_lock); | ||
3910 | os = mdev->state; | ||
3911 | if (os.conn >= C_UNCONNECTED) { | ||
3912 | /* Do not restart in case we are C_DISCONNECTING */ | ||
3913 | ns = os; | ||
3914 | ns.conn = C_UNCONNECTED; | ||
3915 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
3916 | } | ||
3917 | spin_unlock_irq(&mdev->req_lock); | ||
3918 | |||
3919 | if (os.conn == C_DISCONNECTING) { | ||
3920 | wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); | ||
3921 | |||
3922 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3923 | mdev->cram_hmac_tfm = NULL; | ||
3924 | |||
3925 | kfree(mdev->net_conf); | ||
3926 | mdev->net_conf = NULL; | ||
3927 | drbd_request_state(mdev, NS(conn, C_STANDALONE)); | ||
3928 | } | ||
3929 | |||
3930 | /* serialize with bitmap writeout triggered by the state change, | 4542 | /* serialize with bitmap writeout triggered by the state change, |
3931 | * if any. */ | 4543 | * if any. */ |
3932 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 4544 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
@@ -3938,7 +4550,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3938 | * Actually we don't care for exactly when the network stack does its | 4550 | * Actually we don't care for exactly when the network stack does its |
3939 | * put_page(), but release our reference on these pages right here. | 4551 | * put_page(), but release our reference on these pages right here. |
3940 | */ | 4552 | */ |
3941 | i = drbd_release_ee(mdev, &mdev->net_ee); | 4553 | i = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3942 | if (i) | 4554 | if (i) |
3943 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | 4555 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); |
3944 | i = atomic_read(&mdev->pp_in_use_by_net); | 4556 | i = atomic_read(&mdev->pp_in_use_by_net); |
@@ -3953,9 +4565,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3953 | D_ASSERT(list_empty(&mdev->sync_ee)); | 4565 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3954 | D_ASSERT(list_empty(&mdev->done_ee)); | 4566 | D_ASSERT(list_empty(&mdev->done_ee)); |
3955 | 4567 | ||
3956 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | 4568 | return 0; |
3957 | atomic_set(&mdev->current_epoch->epoch_size, 0); | ||
3958 | D_ASSERT(list_empty(&mdev->current_epoch->list)); | ||
3959 | } | 4569 | } |
3960 | 4570 | ||
3961 | /* | 4571 | /* |
@@ -3967,29 +4577,19 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3967 | * | 4577 | * |
3968 | * for now, they are expected to be zero, but ignored. | 4578 | * for now, they are expected to be zero, but ignored. |
3969 | */ | 4579 | */ |
3970 | static int drbd_send_handshake(struct drbd_conf *mdev) | 4580 | static int drbd_send_features(struct drbd_tconn *tconn) |
3971 | { | 4581 | { |
3972 | /* ASSERT current == mdev->receiver ... */ | 4582 | struct drbd_socket *sock; |
3973 | struct p_handshake *p = &mdev->data.sbuf.handshake; | 4583 | struct p_connection_features *p; |
3974 | int ok; | ||
3975 | |||
3976 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3977 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3978 | return 0; /* interrupted. not ok. */ | ||
3979 | } | ||
3980 | |||
3981 | if (mdev->data.socket == NULL) { | ||
3982 | mutex_unlock(&mdev->data.mutex); | ||
3983 | return 0; | ||
3984 | } | ||
3985 | 4584 | ||
4585 | sock = &tconn->data; | ||
4586 | p = conn_prepare_command(tconn, sock); | ||
4587 | if (!p) | ||
4588 | return -EIO; | ||
3986 | memset(p, 0, sizeof(*p)); | 4589 | memset(p, 0, sizeof(*p)); |
3987 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | 4590 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); |
3988 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | 4591 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); |
3989 | ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, | 4592 | return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); |
3990 | (struct p_header80 *)p, sizeof(*p), 0 ); | ||
3991 | mutex_unlock(&mdev->data.mutex); | ||
3992 | return ok; | ||
3993 | } | 4593 | } |
3994 | 4594 | ||
3995 | /* | 4595 | /* |
@@ -3999,42 +4599,38 @@ static int drbd_send_handshake(struct drbd_conf *mdev) | |||
3999 | * -1 peer talks different language, | 4599 | * -1 peer talks different language, |
4000 | * no point in trying again, please go standalone. | 4600 | * no point in trying again, please go standalone. |
4001 | */ | 4601 | */ |
4002 | static int drbd_do_handshake(struct drbd_conf *mdev) | 4602 | static int drbd_do_features(struct drbd_tconn *tconn) |
4003 | { | 4603 | { |
4004 | /* ASSERT current == mdev->receiver ... */ | 4604 | /* ASSERT current == tconn->receiver ... */ |
4005 | struct p_handshake *p = &mdev->data.rbuf.handshake; | 4605 | struct p_connection_features *p; |
4006 | const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); | 4606 | const int expect = sizeof(struct p_connection_features); |
4007 | unsigned int length; | 4607 | struct packet_info pi; |
4008 | enum drbd_packets cmd; | 4608 | int err; |
4009 | int rv; | ||
4010 | 4609 | ||
4011 | rv = drbd_send_handshake(mdev); | 4610 | err = drbd_send_features(tconn); |
4012 | if (!rv) | 4611 | if (err) |
4013 | return 0; | 4612 | return 0; |
4014 | 4613 | ||
4015 | rv = drbd_recv_header(mdev, &cmd, &length); | 4614 | err = drbd_recv_header(tconn, &pi); |
4016 | if (!rv) | 4615 | if (err) |
4017 | return 0; | 4616 | return 0; |
4018 | 4617 | ||
4019 | if (cmd != P_HAND_SHAKE) { | 4618 | if (pi.cmd != P_CONNECTION_FEATURES) { |
4020 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | 4619 | conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", |
4021 | cmdname(cmd), cmd); | 4620 | cmdname(pi.cmd), pi.cmd); |
4022 | return -1; | 4621 | return -1; |
4023 | } | 4622 | } |
4024 | 4623 | ||
4025 | if (length != expect) { | 4624 | if (pi.size != expect) { |
4026 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | 4625 | conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n", |
4027 | expect, length); | 4626 | expect, pi.size); |
4028 | return -1; | 4627 | return -1; |
4029 | } | 4628 | } |
4030 | 4629 | ||
4031 | rv = drbd_recv(mdev, &p->head.payload, expect); | 4630 | p = pi.data; |
4032 | 4631 | err = drbd_recv_all_warn(tconn, p, expect); | |
4033 | if (rv != expect) { | 4632 | if (err) |
4034 | if (!signal_pending(current)) | ||
4035 | dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
4036 | return 0; | 4633 | return 0; |
4037 | } | ||
4038 | 4634 | ||
4039 | p->protocol_min = be32_to_cpu(p->protocol_min); | 4635 | p->protocol_min = be32_to_cpu(p->protocol_min); |
4040 | p->protocol_max = be32_to_cpu(p->protocol_max); | 4636 | p->protocol_max = be32_to_cpu(p->protocol_max); |
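[Editor's note] drbd_do_features() (the renamed handshake) exchanges p_connection_features packets and then, in the following hunk, settles on a common protocol version: the connection is refused when [PRO_VERSION_MIN, PRO_VERSION_MAX] and the peer's range do not overlap, otherwise agreed_pro_version becomes the smaller of PRO_VERSION_MAX and the peer's maximum. A tiny sketch of that negotiation, with invented version bounds:

#include <stdio.h>

#define PRO_VERSION_MIN 86   /* invented bounds, for illustration only */
#define PRO_VERSION_MAX 101

/* Returns the agreed version, or -1 if the ranges do not overlap. */
static int negotiate(int peer_min, int peer_max)
{
	if (PRO_VERSION_MAX < peer_min || PRO_VERSION_MIN > peer_max)
		return -1;
	return PRO_VERSION_MAX < peer_max ? PRO_VERSION_MAX : peer_max;
}

int main(void)
{
	printf("peer 90-100  -> %d\n", negotiate(90, 100));   /* agrees on 100 */
	printf("peer 102-110 -> %d\n", negotiate(102, 110));  /* incompatible  */
	return 0;
}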
@@ -4045,15 +4641,15 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4045 | PRO_VERSION_MIN > p->protocol_max) | 4641 | PRO_VERSION_MIN > p->protocol_max) |
4046 | goto incompat; | 4642 | goto incompat; |
4047 | 4643 | ||
4048 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | 4644 | tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); |
4049 | 4645 | ||
4050 | dev_info(DEV, "Handshake successful: " | 4646 | conn_info(tconn, "Handshake successful: " |
4051 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | 4647 | "Agreed network protocol version %d\n", tconn->agreed_pro_version); |
4052 | 4648 | ||
4053 | return 1; | 4649 | return 1; |
4054 | 4650 | ||
4055 | incompat: | 4651 | incompat: |
4056 | dev_err(DEV, "incompatible DRBD dialects: " | 4652 | conn_err(tconn, "incompatible DRBD dialects: " |
4057 | "I support %d-%d, peer supports %d-%d\n", | 4653 | "I support %d-%d, peer supports %d-%d\n", |
4058 | PRO_VERSION_MIN, PRO_VERSION_MAX, | 4654 | PRO_VERSION_MIN, PRO_VERSION_MAX, |
4059 | p->protocol_min, p->protocol_max); | 4655 | p->protocol_min, p->protocol_max); |
@@ -4061,7 +4657,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4061 | } | 4657 | } |
4062 | 4658 | ||
4063 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | 4659 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) |
4064 | static int drbd_do_auth(struct drbd_conf *mdev) | 4660 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4065 | { | 4661 | { |
4066 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); | 4662 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); |
4067 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | 4663 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); |
@@ -4076,121 +4672,139 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4076 | -1 - auth failed, don't try again. | 4672 | -1 - auth failed, don't try again. |
4077 | */ | 4673 | */ |
4078 | 4674 | ||
4079 | static int drbd_do_auth(struct drbd_conf *mdev) | 4675 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4080 | { | 4676 | { |
4677 | struct drbd_socket *sock; | ||
4081 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | 4678 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ |
4082 | struct scatterlist sg; | 4679 | struct scatterlist sg; |
4083 | char *response = NULL; | 4680 | char *response = NULL; |
4084 | char *right_response = NULL; | 4681 | char *right_response = NULL; |
4085 | char *peers_ch = NULL; | 4682 | char *peers_ch = NULL; |
4086 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | 4683 | unsigned int key_len; |
4684 | char secret[SHARED_SECRET_MAX]; /* 64 byte */ | ||
4087 | unsigned int resp_size; | 4685 | unsigned int resp_size; |
4088 | struct hash_desc desc; | 4686 | struct hash_desc desc; |
4089 | enum drbd_packets cmd; | 4687 | struct packet_info pi; |
4090 | unsigned int length; | 4688 | struct net_conf *nc; |
4091 | int rv; | 4689 | int err, rv; |
4690 | |||
4691 | /* FIXME: Put the challenge/response into the preallocated socket buffer. */ | ||
4092 | 4692 | ||
4093 | desc.tfm = mdev->cram_hmac_tfm; | 4693 | rcu_read_lock(); |
4694 | nc = rcu_dereference(tconn->net_conf); | ||
4695 | key_len = strlen(nc->shared_secret); | ||
4696 | memcpy(secret, nc->shared_secret, key_len); | ||
4697 | rcu_read_unlock(); | ||
4698 | |||
4699 | desc.tfm = tconn->cram_hmac_tfm; | ||
4094 | desc.flags = 0; | 4700 | desc.flags = 0; |
4095 | 4701 | ||
4096 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | 4702 | rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len); |
4097 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
4098 | if (rv) { | 4703 | if (rv) { |
4099 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | 4704 | conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv); |
4100 | rv = -1; | 4705 | rv = -1; |
4101 | goto fail; | 4706 | goto fail; |
4102 | } | 4707 | } |
4103 | 4708 | ||
4104 | get_random_bytes(my_challenge, CHALLENGE_LEN); | 4709 | get_random_bytes(my_challenge, CHALLENGE_LEN); |
4105 | 4710 | ||
4106 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | 4711 | sock = &tconn->data; |
4712 | if (!conn_prepare_command(tconn, sock)) { | ||
4713 | rv = 0; | ||
4714 | goto fail; | ||
4715 | } | ||
4716 | rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0, | ||
4717 | my_challenge, CHALLENGE_LEN); | ||
4107 | if (!rv) | 4718 | if (!rv) |
4108 | goto fail; | 4719 | goto fail; |
4109 | 4720 | ||
4110 | rv = drbd_recv_header(mdev, &cmd, &length); | 4721 | err = drbd_recv_header(tconn, &pi); |
4111 | if (!rv) | 4722 | if (err) { |
4723 | rv = 0; | ||
4112 | goto fail; | 4724 | goto fail; |
4725 | } | ||
4113 | 4726 | ||
4114 | if (cmd != P_AUTH_CHALLENGE) { | 4727 | if (pi.cmd != P_AUTH_CHALLENGE) { |
4115 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | 4728 | conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n", |
4116 | cmdname(cmd), cmd); | 4729 | cmdname(pi.cmd), pi.cmd); |
4117 | rv = 0; | 4730 | rv = 0; |
4118 | goto fail; | 4731 | goto fail; |
4119 | } | 4732 | } |
4120 | 4733 | ||
4121 | if (length > CHALLENGE_LEN * 2) { | 4734 | if (pi.size > CHALLENGE_LEN * 2) { |
4122 | dev_err(DEV, "expected AuthChallenge payload too big.\n"); | 4735 | conn_err(tconn, "expected AuthChallenge payload too big.\n"); |
4123 | rv = -1; | 4736 | rv = -1; |
4124 | goto fail; | 4737 | goto fail; |
4125 | } | 4738 | } |
4126 | 4739 | ||
4127 | peers_ch = kmalloc(length, GFP_NOIO); | 4740 | peers_ch = kmalloc(pi.size, GFP_NOIO); |
4128 | if (peers_ch == NULL) { | 4741 | if (peers_ch == NULL) { |
4129 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | 4742 | conn_err(tconn, "kmalloc of peers_ch failed\n"); |
4130 | rv = -1; | 4743 | rv = -1; |
4131 | goto fail; | 4744 | goto fail; |
4132 | } | 4745 | } |
4133 | 4746 | ||
4134 | rv = drbd_recv(mdev, peers_ch, length); | 4747 | err = drbd_recv_all_warn(tconn, peers_ch, pi.size); |
4135 | 4748 | if (err) { | |
4136 | if (rv != length) { | ||
4137 | if (!signal_pending(current)) | ||
4138 | dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
4139 | rv = 0; | 4749 | rv = 0; |
4140 | goto fail; | 4750 | goto fail; |
4141 | } | 4751 | } |
4142 | 4752 | ||
4143 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | 4753 | resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm); |
4144 | response = kmalloc(resp_size, GFP_NOIO); | 4754 | response = kmalloc(resp_size, GFP_NOIO); |
4145 | if (response == NULL) { | 4755 | if (response == NULL) { |
4146 | dev_err(DEV, "kmalloc of response failed\n"); | 4756 | conn_err(tconn, "kmalloc of response failed\n"); |
4147 | rv = -1; | 4757 | rv = -1; |
4148 | goto fail; | 4758 | goto fail; |
4149 | } | 4759 | } |
4150 | 4760 | ||
4151 | sg_init_table(&sg, 1); | 4761 | sg_init_table(&sg, 1); |
4152 | sg_set_buf(&sg, peers_ch, length); | 4762 | sg_set_buf(&sg, peers_ch, pi.size); |
4153 | 4763 | ||
4154 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | 4764 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); |
4155 | if (rv) { | 4765 | if (rv) { |
4156 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4766 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4157 | rv = -1; | 4767 | rv = -1; |
4158 | goto fail; | 4768 | goto fail; |
4159 | } | 4769 | } |
4160 | 4770 | ||
4161 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | 4771 | if (!conn_prepare_command(tconn, sock)) { |
4162 | if (!rv) | 4772 | rv = 0; |
4163 | goto fail; | 4773 | goto fail; |
4164 | 4774 | } | |
4165 | rv = drbd_recv_header(mdev, &cmd, &length); | 4775 | rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0, |
4776 | response, resp_size); | ||
4166 | if (!rv) | 4777 | if (!rv) |
4167 | goto fail; | 4778 | goto fail; |
4168 | 4779 | ||
4169 | if (cmd != P_AUTH_RESPONSE) { | 4780 | err = drbd_recv_header(tconn, &pi); |
4170 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | 4781 | if (err) { |
4171 | cmdname(cmd), cmd); | ||
4172 | rv = 0; | 4782 | rv = 0; |
4173 | goto fail; | 4783 | goto fail; |
4174 | } | 4784 | } |
4175 | 4785 | ||
4176 | if (length != resp_size) { | 4786 | if (pi.cmd != P_AUTH_RESPONSE) { |
4177 | dev_err(DEV, "expected AuthResponse payload of wrong size\n"); | 4787 | conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n", |
4788 | cmdname(pi.cmd), pi.cmd); | ||
4178 | rv = 0; | 4789 | rv = 0; |
4179 | goto fail; | 4790 | goto fail; |
4180 | } | 4791 | } |
4181 | 4792 | ||
4182 | rv = drbd_recv(mdev, response , resp_size); | 4793 | if (pi.size != resp_size) { |
4794 | conn_err(tconn, "expected AuthResponse payload of wrong size\n"); | ||
4795 | rv = 0; | ||
4796 | goto fail; | ||
4797 | } | ||
4183 | 4798 | ||
4184 | if (rv != resp_size) { | 4799 | err = drbd_recv_all_warn(tconn, response , resp_size); |
4185 | if (!signal_pending(current)) | 4800 | if (err) { |
4186 | dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
4187 | rv = 0; | 4801 | rv = 0; |
4188 | goto fail; | 4802 | goto fail; |
4189 | } | 4803 | } |
4190 | 4804 | ||
4191 | right_response = kmalloc(resp_size, GFP_NOIO); | 4805 | right_response = kmalloc(resp_size, GFP_NOIO); |
4192 | if (right_response == NULL) { | 4806 | if (right_response == NULL) { |
4193 | dev_err(DEV, "kmalloc of right_response failed\n"); | 4807 | conn_err(tconn, "kmalloc of right_response failed\n"); |
4194 | rv = -1; | 4808 | rv = -1; |
4195 | goto fail; | 4809 | goto fail; |
4196 | } | 4810 | } |
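[Editor's note] drbd_do_auth() implements a symmetric challenge-response: each side sends a random challenge, HMACs the challenge it received with the shared secret, sends that back as the response, and independently recomputes the expected value ("right_response") to compare against what the peer returned. The sketch below shows only the compare-the-digests shape of that exchange; the hash is a deliberately fake stand-in, whereas the driver uses the configured cram-hmac algorithm via the kernel crypto API:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define CHALLENGE_LEN 8
#define DIGEST_LEN    8

/* Toy keyed "digest" -- NOT a real HMAC, just enough to show the flow. */
static void toy_mac(const char *key, const uint8_t *msg, size_t len,
		    uint8_t out[DIGEST_LEN])
{
	size_t i, klen = strlen(key);

	for (i = 0; i < DIGEST_LEN; i++)
		out[i] = (uint8_t)(key[i % klen] ^ (i < len ? msg[i] : 0x5a));
}

int main(void)
{
	const char *secret = "shared-secret";
	uint8_t my_challenge[CHALLENGE_LEN] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t peer_response[DIGEST_LEN];
	uint8_t right_response[DIGEST_LEN];

	/* The peer computes its response over the challenge we sent ... */
	toy_mac(secret, my_challenge, sizeof(my_challenge), peer_response);
	/* ... and we recompute the expected value locally and compare. */
	toy_mac(secret, my_challenge, sizeof(my_challenge), right_response);

	if (memcmp(peer_response, right_response, DIGEST_LEN) == 0)
		printf("peer authenticated\n");
	else
		printf("authentication failed\n");
	return 0;
}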
@@ -4199,7 +4813,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4199 | 4813 | ||
4200 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | 4814 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); |
4201 | if (rv) { | 4815 | if (rv) { |
4202 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4816 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4203 | rv = -1; | 4817 | rv = -1; |
4204 | goto fail; | 4818 | goto fail; |
4205 | } | 4819 | } |
@@ -4207,8 +4821,8 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4207 | rv = !memcmp(response, right_response, resp_size); | 4821 | rv = !memcmp(response, right_response, resp_size); |
4208 | 4822 | ||
4209 | if (rv) | 4823 | if (rv) |
4210 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | 4824 | conn_info(tconn, "Peer authenticated using %d bytes HMAC\n", |
4211 | resp_size, mdev->net_conf->cram_hmac_alg); | 4825 | resp_size); |
4212 | else | 4826 | else |
4213 | rv = -1; | 4827 | rv = -1; |
4214 | 4828 | ||
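For orientation, the authentication path above is a CRAM-style challenge/response: each side holds the shared secret from the net config, answers the peer's challenge with HMAC(secret, challenge) in a P_AUTH_RESPONSE packet, then recomputes the digest locally ("right_response") and memcmp()s it against what the peer sent back. Below is a minimal user-space sketch of just that verification step; it uses OpenSSL's one-shot HMAC() instead of the kernel crypto_hash API shown in the diff, SHA-1 stands in for whatever cram-hmac-alg is configured, and the function and parameter names are illustrative, not the driver's.

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <string.h>

/* Verify a peer's response to our challenge.
 * shared_secret/challenge/peer_response are illustrative parameters;
 * the driver derives them from its net config and the AuthChallenge packet. */
static int verify_auth_response(const unsigned char *shared_secret, int secret_len,
                                const unsigned char *challenge, size_t challenge_len,
                                const unsigned char *peer_response, unsigned int resp_size)
{
	unsigned char right_response[EVP_MAX_MD_SIZE];
	unsigned int right_len = 0;

	/* HMAC(secret, challenge) -- the user-space counterpart of the
	 * crypto_hash_digest() call that fills right_response above */
	if (!HMAC(EVP_sha1(), shared_secret, secret_len,
		  challenge, challenge_len, right_response, &right_len))
		return -1;

	/* the peer is authenticated iff the digests match,
	 * mirroring the memcmp(response, right_response, resp_size) above */
	if (right_len != resp_size ||
	    memcmp(peer_response, right_response, resp_size) != 0)
		return 0;
	return 1;
}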
@@ -4223,82 +4837,106 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4223 | 4837 | ||
4224 | int drbdd_init(struct drbd_thread *thi) | 4838 | int drbdd_init(struct drbd_thread *thi) |
4225 | { | 4839 | { |
4226 | struct drbd_conf *mdev = thi->mdev; | 4840 | struct drbd_tconn *tconn = thi->tconn; |
4227 | unsigned int minor = mdev_to_minor(mdev); | ||
4228 | int h; | 4841 | int h; |
4229 | 4842 | ||
4230 | sprintf(current->comm, "drbd%d_receiver", minor); | 4843 | conn_info(tconn, "receiver (re)started\n"); |
4231 | |||
4232 | dev_info(DEV, "receiver (re)started\n"); | ||
4233 | 4844 | ||
4234 | do { | 4845 | do { |
4235 | h = drbd_connect(mdev); | 4846 | h = conn_connect(tconn); |
4236 | if (h == 0) { | 4847 | if (h == 0) { |
4237 | drbd_disconnect(mdev); | 4848 | conn_disconnect(tconn); |
4238 | schedule_timeout_interruptible(HZ); | 4849 | schedule_timeout_interruptible(HZ); |
4239 | } | 4850 | } |
4240 | if (h == -1) { | 4851 | if (h == -1) { |
4241 | dev_warn(DEV, "Discarding network configuration.\n"); | 4852 | conn_warn(tconn, "Discarding network configuration.\n"); |
4242 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 4853 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4243 | } | 4854 | } |
4244 | } while (h == 0); | 4855 | } while (h == 0); |
4245 | 4856 | ||
4246 | if (h > 0) { | 4857 | if (h > 0) |
4247 | if (get_net_conf(mdev)) { | 4858 | drbdd(tconn); |
4248 | drbdd(mdev); | ||
4249 | put_net_conf(mdev); | ||
4250 | } | ||
4251 | } | ||
4252 | 4859 | ||
4253 | drbd_disconnect(mdev); | 4860 | conn_disconnect(tconn); |
4254 | 4861 | ||
4255 | dev_info(DEV, "receiver terminated\n"); | 4862 | conn_info(tconn, "receiver terminated\n"); |
4256 | return 0; | 4863 | return 0; |
4257 | } | 4864 | } |
4258 | 4865 | ||
4259 | /* ********* acknowledge sender ******** */ | 4866 | /* ********* acknowledge sender ******** */ |
4260 | 4867 | ||
4261 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) | 4868 | static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4262 | { | 4869 | { |
4263 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | 4870 | struct p_req_state_reply *p = pi->data; |
4871 | int retcode = be32_to_cpu(p->retcode); | ||
4872 | |||
4873 | if (retcode >= SS_SUCCESS) { | ||
4874 | set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags); | ||
4875 | } else { | ||
4876 | set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags); | ||
4877 | conn_err(tconn, "Requested state change failed by peer: %s (%d)\n", | ||
4878 | drbd_set_st_err_str(retcode), retcode); | ||
4879 | } | ||
4880 | wake_up(&tconn->ping_wait); | ||
4881 | |||
4882 | return 0; | ||
4883 | } | ||
4264 | 4884 | ||
4885 | static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) | ||
4886 | { | ||
4887 | struct drbd_conf *mdev; | ||
4888 | struct p_req_state_reply *p = pi->data; | ||
4265 | int retcode = be32_to_cpu(p->retcode); | 4889 | int retcode = be32_to_cpu(p->retcode); |
4266 | 4890 | ||
4891 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4892 | if (!mdev) | ||
4893 | return -EIO; | ||
4894 | |||
4895 | if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) { | ||
4896 | D_ASSERT(tconn->agreed_pro_version < 100); | ||
4897 | return got_conn_RqSReply(tconn, pi); | ||
4898 | } | ||
4899 | |||
4267 | if (retcode >= SS_SUCCESS) { | 4900 | if (retcode >= SS_SUCCESS) { |
4268 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); | 4901 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); |
4269 | } else { | 4902 | } else { |
4270 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); | 4903 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); |
4271 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | 4904 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", |
4272 | drbd_set_st_err_str(retcode), retcode); | 4905 | drbd_set_st_err_str(retcode), retcode); |
4273 | } | 4906 | } |
4274 | wake_up(&mdev->state_wait); | 4907 | wake_up(&mdev->state_wait); |
4275 | 4908 | ||
4276 | return true; | 4909 | return 0; |
4277 | } | 4910 | } |
4278 | 4911 | ||
4279 | static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) | 4912 | static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi) |
4280 | { | 4913 | { |
4281 | return drbd_send_ping_ack(mdev); | 4914 | return drbd_send_ping_ack(tconn); |
4282 | 4915 | ||
4283 | } | 4916 | } |
4284 | 4917 | ||
4285 | static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) | 4918 | static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4286 | { | 4919 | { |
4287 | /* restore idle timeout */ | 4920 | /* restore idle timeout */ |
4288 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | 4921 | tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ; |
4289 | if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) | 4922 | if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags)) |
4290 | wake_up(&mdev->misc_wait); | 4923 | wake_up(&tconn->ping_wait); |
4291 | 4924 | ||
4292 | return true; | 4925 | return 0; |
4293 | } | 4926 | } |
4294 | 4927 | ||
4295 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | 4928 | static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) |
4296 | { | 4929 | { |
4297 | struct p_block_ack *p = (struct p_block_ack *)h; | 4930 | struct drbd_conf *mdev; |
4931 | struct p_block_ack *p = pi->data; | ||
4298 | sector_t sector = be64_to_cpu(p->sector); | 4932 | sector_t sector = be64_to_cpu(p->sector); |
4299 | int blksize = be32_to_cpu(p->blksize); | 4933 | int blksize = be32_to_cpu(p->blksize); |
4300 | 4934 | ||
4301 | D_ASSERT(mdev->agreed_pro_version >= 89); | 4935 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4936 | if (!mdev) | ||
4937 | return -EIO; | ||
4938 | |||
4939 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); | ||
4302 | 4940 | ||
4303 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4941 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4304 | 4942 | ||
@@ -4312,162 +4950,139 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | |||
4312 | dec_rs_pending(mdev); | 4950 | dec_rs_pending(mdev); |
4313 | atomic_add(blksize >> 9, &mdev->rs_sect_in); | 4951 | atomic_add(blksize >> 9, &mdev->rs_sect_in); |
4314 | 4952 | ||
4315 | return true; | 4953 | return 0; |
4316 | } | ||
4317 | |||
4318 | /* when we receive the ACK for a write request, | ||
4319 | * verify that we actually know about it */ | ||
4320 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4321 | u64 id, sector_t sector) | ||
4322 | { | ||
4323 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4324 | struct hlist_node *n; | ||
4325 | struct drbd_request *req; | ||
4326 | |||
4327 | hlist_for_each_entry(req, n, slot, collision) { | ||
4328 | if ((unsigned long)req == (unsigned long)id) { | ||
4329 | if (req->sector != sector) { | ||
4330 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4331 | "wrong sector (%llus versus %llus)\n", req, | ||
4332 | (unsigned long long)req->sector, | ||
4333 | (unsigned long long)sector); | ||
4334 | break; | ||
4335 | } | ||
4336 | return req; | ||
4337 | } | ||
4338 | } | ||
4339 | return NULL; | ||
4340 | } | 4954 | } |
4341 | 4955 | ||
4342 | typedef struct drbd_request *(req_validator_fn) | 4956 | static int |
4343 | (struct drbd_conf *mdev, u64 id, sector_t sector); | 4957 | validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector, |
4344 | 4958 | struct rb_root *root, const char *func, | |
4345 | static int validate_req_change_req_state(struct drbd_conf *mdev, | 4959 | enum drbd_req_event what, bool missing_ok) |
4346 | u64 id, sector_t sector, req_validator_fn validator, | ||
4347 | const char *func, enum drbd_req_event what) | ||
4348 | { | 4960 | { |
4349 | struct drbd_request *req; | 4961 | struct drbd_request *req; |
4350 | struct bio_and_error m; | 4962 | struct bio_and_error m; |
4351 | 4963 | ||
4352 | spin_lock_irq(&mdev->req_lock); | 4964 | spin_lock_irq(&mdev->tconn->req_lock); |
4353 | req = validator(mdev, id, sector); | 4965 | req = find_request(mdev, root, id, sector, missing_ok, func); |
4354 | if (unlikely(!req)) { | 4966 | if (unlikely(!req)) { |
4355 | spin_unlock_irq(&mdev->req_lock); | 4967 | spin_unlock_irq(&mdev->tconn->req_lock); |
4356 | 4968 | return -EIO; | |
4357 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func, | ||
4358 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4359 | return false; | ||
4360 | } | 4969 | } |
4361 | __req_mod(req, what, &m); | 4970 | __req_mod(req, what, &m); |
4362 | spin_unlock_irq(&mdev->req_lock); | 4971 | spin_unlock_irq(&mdev->tconn->req_lock); |
4363 | 4972 | ||
4364 | if (m.bio) | 4973 | if (m.bio) |
4365 | complete_master_bio(mdev, &m); | 4974 | complete_master_bio(mdev, &m); |
4366 | return true; | 4975 | return 0; |
4367 | } | 4976 | } |
4368 | 4977 | ||
4369 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) | 4978 | static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4370 | { | 4979 | { |
4371 | struct p_block_ack *p = (struct p_block_ack *)h; | 4980 | struct drbd_conf *mdev; |
4981 | struct p_block_ack *p = pi->data; | ||
4372 | sector_t sector = be64_to_cpu(p->sector); | 4982 | sector_t sector = be64_to_cpu(p->sector); |
4373 | int blksize = be32_to_cpu(p->blksize); | 4983 | int blksize = be32_to_cpu(p->blksize); |
4374 | enum drbd_req_event what; | 4984 | enum drbd_req_event what; |
4375 | 4985 | ||
4986 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4987 | if (!mdev) | ||
4988 | return -EIO; | ||
4989 | |||
4376 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4990 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4377 | 4991 | ||
4378 | if (is_syncer_block_id(p->block_id)) { | 4992 | if (p->block_id == ID_SYNCER) { |
4379 | drbd_set_in_sync(mdev, sector, blksize); | 4993 | drbd_set_in_sync(mdev, sector, blksize); |
4380 | dec_rs_pending(mdev); | 4994 | dec_rs_pending(mdev); |
4381 | return true; | 4995 | return 0; |
4382 | } | 4996 | } |
4383 | switch (be16_to_cpu(h->command)) { | 4997 | switch (pi->cmd) { |
4384 | case P_RS_WRITE_ACK: | 4998 | case P_RS_WRITE_ACK: |
4385 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 4999 | what = WRITE_ACKED_BY_PEER_AND_SIS; |
4386 | what = write_acked_by_peer_and_sis; | ||
4387 | break; | 5000 | break; |
4388 | case P_WRITE_ACK: | 5001 | case P_WRITE_ACK: |
4389 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 5002 | what = WRITE_ACKED_BY_PEER; |
4390 | what = write_acked_by_peer; | ||
4391 | break; | 5003 | break; |
4392 | case P_RECV_ACK: | 5004 | case P_RECV_ACK: |
4393 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | 5005 | what = RECV_ACKED_BY_PEER; |
4394 | what = recv_acked_by_peer; | ||
4395 | break; | 5006 | break; |
4396 | case P_DISCARD_ACK: | 5007 | case P_SUPERSEDED: |
4397 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 5008 | what = CONFLICT_RESOLVED; |
4398 | what = conflict_discarded_by_peer; | 5009 | break; |
5010 | case P_RETRY_WRITE: | ||
5011 | what = POSTPONE_WRITE; | ||
4399 | break; | 5012 | break; |
4400 | default: | 5013 | default: |
4401 | D_ASSERT(0); | 5014 | BUG(); |
4402 | return false; | ||
4403 | } | 5015 | } |
4404 | 5016 | ||
4405 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5017 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4406 | _ack_id_to_req, __func__ , what); | 5018 | &mdev->write_requests, __func__, |
5019 | what, false); | ||
4407 | } | 5020 | } |
4408 | 5021 | ||
4409 | static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) | 5022 | static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4410 | { | 5023 | { |
4411 | struct p_block_ack *p = (struct p_block_ack *)h; | 5024 | struct drbd_conf *mdev; |
5025 | struct p_block_ack *p = pi->data; | ||
4412 | sector_t sector = be64_to_cpu(p->sector); | 5026 | sector_t sector = be64_to_cpu(p->sector); |
4413 | int size = be32_to_cpu(p->blksize); | 5027 | int size = be32_to_cpu(p->blksize); |
4414 | struct drbd_request *req; | 5028 | int err; |
4415 | struct bio_and_error m; | 5029 | |
5030 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5031 | if (!mdev) | ||
5032 | return -EIO; | ||
4416 | 5033 | ||
4417 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5034 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4418 | 5035 | ||
4419 | if (is_syncer_block_id(p->block_id)) { | 5036 | if (p->block_id == ID_SYNCER) { |
4420 | dec_rs_pending(mdev); | 5037 | dec_rs_pending(mdev); |
4421 | drbd_rs_failed_io(mdev, sector, size); | 5038 | drbd_rs_failed_io(mdev, sector, size); |
4422 | return true; | 5039 | return 0; |
4423 | } | 5040 | } |
4424 | 5041 | ||
4425 | spin_lock_irq(&mdev->req_lock); | 5042 | err = validate_req_change_req_state(mdev, p->block_id, sector, |
4426 | req = _ack_id_to_req(mdev, p->block_id, sector); | 5043 | &mdev->write_requests, __func__, |
4427 | if (!req) { | 5044 | NEG_ACKED, true); |
4428 | spin_unlock_irq(&mdev->req_lock); | 5045 | if (err) { |
4429 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || | 5046 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. |
4430 | mdev->net_conf->wire_protocol == DRBD_PROT_B) { | 5047 | The master bio might already be completed, therefore the |
4431 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. | 5048 | request is no longer in the collision hash. */ |
4432 | The master bio might already be completed, therefore the | 5049 | /* In Protocol B we might already have got a P_RECV_ACK |
4433 | request is no longer in the collision hash. | 5050 | but then get a P_NEG_ACK afterwards. */ |
4434 | => Do not try to validate block_id as request. */ | 5051 | drbd_set_out_of_sync(mdev, sector, size); |
4435 | /* In Protocol B we might already have got a P_RECV_ACK | ||
4436 | but then get a P_NEG_ACK after wards. */ | ||
4437 | drbd_set_out_of_sync(mdev, sector, size); | ||
4438 | return true; | ||
4439 | } else { | ||
4440 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__, | ||
4441 | (void *)(unsigned long)p->block_id, (unsigned long long)sector); | ||
4442 | return false; | ||
4443 | } | ||
4444 | } | 5052 | } |
4445 | __req_mod(req, neg_acked, &m); | 5053 | return 0; |
4446 | spin_unlock_irq(&mdev->req_lock); | ||
4447 | |||
4448 | if (m.bio) | ||
4449 | complete_master_bio(mdev, &m); | ||
4450 | return true; | ||
4451 | } | 5054 | } |
4452 | 5055 | ||
4453 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5056 | static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4454 | { | 5057 | { |
4455 | struct p_block_ack *p = (struct p_block_ack *)h; | 5058 | struct drbd_conf *mdev; |
5059 | struct p_block_ack *p = pi->data; | ||
4456 | sector_t sector = be64_to_cpu(p->sector); | 5060 | sector_t sector = be64_to_cpu(p->sector); |
4457 | 5061 | ||
5062 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5063 | if (!mdev) | ||
5064 | return -EIO; | ||
5065 | |||
4458 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5066 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4459 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | 5067 | |
5068 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n", | ||
4460 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | 5069 | (unsigned long long)sector, be32_to_cpu(p->blksize)); |
4461 | 5070 | ||
4462 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5071 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4463 | _ar_id_to_req, __func__ , neg_acked); | 5072 | &mdev->read_requests, __func__, |
5073 | NEG_ACKED, false); | ||
4464 | } | 5074 | } |
4465 | 5075 | ||
4466 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5076 | static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4467 | { | 5077 | { |
5078 | struct drbd_conf *mdev; | ||
4468 | sector_t sector; | 5079 | sector_t sector; |
4469 | int size; | 5080 | int size; |
4470 | struct p_block_ack *p = (struct p_block_ack *)h; | 5081 | struct p_block_ack *p = pi->data; |
5082 | |||
5083 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5084 | if (!mdev) | ||
5085 | return -EIO; | ||
4471 | 5086 | ||
4472 | sector = be64_to_cpu(p->sector); | 5087 | sector = be64_to_cpu(p->sector); |
4473 | size = be32_to_cpu(p->blksize); | 5088 | size = be32_to_cpu(p->blksize); |
@@ -4478,57 +5093,66 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | |||
4478 | 5093 | ||
4479 | if (get_ldev_if_state(mdev, D_FAILED)) { | 5094 | if (get_ldev_if_state(mdev, D_FAILED)) { |
4480 | drbd_rs_complete_io(mdev, sector); | 5095 | drbd_rs_complete_io(mdev, sector); |
4481 | switch (be16_to_cpu(h->command)) { | 5096 | switch (pi->cmd) { |
4482 | case P_NEG_RS_DREPLY: | 5097 | case P_NEG_RS_DREPLY: |
4483 | drbd_rs_failed_io(mdev, sector, size); | 5098 | drbd_rs_failed_io(mdev, sector, size); |
4484 | case P_RS_CANCEL: | 5099 | case P_RS_CANCEL: |
4485 | break; | 5100 | break; |
4486 | default: | 5101 | default: |
4487 | D_ASSERT(0); | 5102 | BUG(); |
4488 | put_ldev(mdev); | ||
4489 | return false; | ||
4490 | } | 5103 | } |
4491 | put_ldev(mdev); | 5104 | put_ldev(mdev); |
4492 | } | 5105 | } |
4493 | 5106 | ||
4494 | return true; | 5107 | return 0; |
4495 | } | 5108 | } |
4496 | 5109 | ||
4497 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) | 5110 | static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4498 | { | 5111 | { |
4499 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | 5112 | struct p_barrier_ack *p = pi->data; |
4500 | 5113 | struct drbd_conf *mdev; | |
4501 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | 5114 | int vnr; |
4502 | 5115 | ||
4503 | if (mdev->state.conn == C_AHEAD && | 5116 | tl_release(tconn, p->barrier, be32_to_cpu(p->set_size)); |
4504 | atomic_read(&mdev->ap_in_flight) == 0 && | 5117 | |
4505 | !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { | 5118 | rcu_read_lock(); |
4506 | mdev->start_resync_timer.expires = jiffies + HZ; | 5119 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
4507 | add_timer(&mdev->start_resync_timer); | 5120 | if (mdev->state.conn == C_AHEAD && |
5121 | atomic_read(&mdev->ap_in_flight) == 0 && | ||
5122 | !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { | ||
5123 | mdev->start_resync_timer.expires = jiffies + HZ; | ||
5124 | add_timer(&mdev->start_resync_timer); | ||
5125 | } | ||
4508 | } | 5126 | } |
5127 | rcu_read_unlock(); | ||
4509 | 5128 | ||
4510 | return true; | 5129 | return 0; |
4511 | } | 5130 | } |
4512 | 5131 | ||
4513 | static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | 5132 | static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi) |
4514 | { | 5133 | { |
4515 | struct p_block_ack *p = (struct p_block_ack *)h; | 5134 | struct drbd_conf *mdev; |
5135 | struct p_block_ack *p = pi->data; | ||
4516 | struct drbd_work *w; | 5136 | struct drbd_work *w; |
4517 | sector_t sector; | 5137 | sector_t sector; |
4518 | int size; | 5138 | int size; |
4519 | 5139 | ||
5140 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5141 | if (!mdev) | ||
5142 | return -EIO; | ||
5143 | |||
4520 | sector = be64_to_cpu(p->sector); | 5144 | sector = be64_to_cpu(p->sector); |
4521 | size = be32_to_cpu(p->blksize); | 5145 | size = be32_to_cpu(p->blksize); |
4522 | 5146 | ||
4523 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5147 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4524 | 5148 | ||
4525 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | 5149 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) |
4526 | drbd_ov_oos_found(mdev, sector, size); | 5150 | drbd_ov_out_of_sync_found(mdev, sector, size); |
4527 | else | 5151 | else |
4528 | ov_oos_print(mdev); | 5152 | ov_out_of_sync_print(mdev); |
4529 | 5153 | ||
4530 | if (!get_ldev(mdev)) | 5154 | if (!get_ldev(mdev)) |
4531 | return true; | 5155 | return 0; |
4532 | 5156 | ||
4533 | drbd_rs_complete_io(mdev, sector); | 5157 | drbd_rs_complete_io(mdev, sector); |
4534 | dec_rs_pending(mdev); | 5158 | dec_rs_pending(mdev); |
@@ -4543,114 +5167,137 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | |||
4543 | w = kmalloc(sizeof(*w), GFP_NOIO); | 5167 | w = kmalloc(sizeof(*w), GFP_NOIO); |
4544 | if (w) { | 5168 | if (w) { |
4545 | w->cb = w_ov_finished; | 5169 | w->cb = w_ov_finished; |
4546 | drbd_queue_work_front(&mdev->data.work, w); | 5170 | w->mdev = mdev; |
5171 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
4547 | } else { | 5172 | } else { |
4548 | dev_err(DEV, "kmalloc(w) failed."); | 5173 | dev_err(DEV, "kmalloc(w) failed."); |
4549 | ov_oos_print(mdev); | 5174 | ov_out_of_sync_print(mdev); |
4550 | drbd_resync_finished(mdev); | 5175 | drbd_resync_finished(mdev); |
4551 | } | 5176 | } |
4552 | } | 5177 | } |
4553 | put_ldev(mdev); | 5178 | put_ldev(mdev); |
4554 | return true; | 5179 | return 0; |
5180 | } | ||
5181 | |||
5182 | static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi) | ||
5183 | { | ||
5184 | return 0; | ||
4555 | } | 5185 | } |
4556 | 5186 | ||
4557 | static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) | 5187 | static int tconn_finish_peer_reqs(struct drbd_tconn *tconn) |
4558 | { | 5188 | { |
4559 | return true; | 5189 | struct drbd_conf *mdev; |
5190 | int vnr, not_empty = 0; | ||
5191 | |||
5192 | do { | ||
5193 | clear_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5194 | flush_signals(current); | ||
5195 | |||
5196 | rcu_read_lock(); | ||
5197 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5198 | kref_get(&mdev->kref); | ||
5199 | rcu_read_unlock(); | ||
5200 | if (drbd_finish_peer_reqs(mdev)) { | ||
5201 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5202 | return 1; | ||
5203 | } | ||
5204 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5205 | rcu_read_lock(); | ||
5206 | } | ||
5207 | set_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5208 | |||
5209 | spin_lock_irq(&tconn->req_lock); | ||
5210 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5211 | not_empty = !list_empty(&mdev->done_ee); | ||
5212 | if (not_empty) | ||
5213 | break; | ||
5214 | } | ||
5215 | spin_unlock_irq(&tconn->req_lock); | ||
5216 | rcu_read_unlock(); | ||
5217 | } while (not_empty); | ||
5218 | |||
5219 | return 0; | ||
4560 | } | 5220 | } |
4561 | 5221 | ||
4562 | struct asender_cmd { | 5222 | struct asender_cmd { |
4563 | size_t pkt_size; | 5223 | size_t pkt_size; |
4564 | int (*process)(struct drbd_conf *mdev, struct p_header80 *h); | 5224 | int (*fn)(struct drbd_tconn *tconn, struct packet_info *); |
4565 | }; | 5225 | }; |
4566 | 5226 | ||
4567 | static struct asender_cmd *get_asender_cmd(int cmd) | 5227 | static struct asender_cmd asender_tbl[] = { |
4568 | { | 5228 | [P_PING] = { 0, got_Ping }, |
4569 | static struct asender_cmd asender_tbl[] = { | 5229 | [P_PING_ACK] = { 0, got_PingAck }, |
4570 | /* anything missing from this table is in | ||
4571 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4572 | * see the beginning of drbdd() */ | ||
4573 | [P_PING] = { sizeof(struct p_header80), got_Ping }, | ||
4574 | [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck }, | ||
4575 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5230 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4576 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5231 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4577 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5232 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4578 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5233 | [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, |
4579 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | 5234 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, |
4580 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | 5235 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, |
4581 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5236 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4582 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | 5237 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, |
4583 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | 5238 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, |
4584 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | 5239 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, |
4585 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | 5240 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, |
4586 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, | 5241 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, |
4587 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5242 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4588 | [P_MAX_CMD] = { 0, NULL }, | 5243 | [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, |
4589 | }; | 5244 | [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, |
4590 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | 5245 | }; |
4591 | return NULL; | ||
4592 | return &asender_tbl[cmd]; | ||
4593 | } | ||
4594 | 5246 | ||
4595 | int drbd_asender(struct drbd_thread *thi) | 5247 | int drbd_asender(struct drbd_thread *thi) |
4596 | { | 5248 | { |
4597 | struct drbd_conf *mdev = thi->mdev; | 5249 | struct drbd_tconn *tconn = thi->tconn; |
4598 | struct p_header80 *h = &mdev->meta.rbuf.header.h80; | ||
4599 | struct asender_cmd *cmd = NULL; | 5250 | struct asender_cmd *cmd = NULL; |
4600 | 5251 | struct packet_info pi; | |
4601 | int rv, len; | 5252 | int rv; |
4602 | void *buf = h; | 5253 | void *buf = tconn->meta.rbuf; |
4603 | int received = 0; | 5254 | int received = 0; |
4604 | int expect = sizeof(struct p_header80); | 5255 | unsigned int header_size = drbd_header_size(tconn); |
4605 | int empty; | 5256 | int expect = header_size; |
4606 | int ping_timeout_active = 0; | 5257 | bool ping_timeout_active = false; |
4607 | 5258 | struct net_conf *nc; | |
4608 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | 5259 | int ping_timeo, tcp_cork, ping_int; |
4609 | 5260 | ||
4610 | current->policy = SCHED_RR; /* Make this a realtime task! */ | 5261 | current->policy = SCHED_RR; /* Make this a realtime task! */ |
4611 | current->rt_priority = 2; /* more important than all other tasks */ | 5262 | current->rt_priority = 2; /* more important than all other tasks */ |
4612 | 5263 | ||
4613 | while (get_t_state(thi) == Running) { | 5264 | while (get_t_state(thi) == RUNNING) { |
4614 | drbd_thread_current_set_cpu(mdev); | 5265 | drbd_thread_current_set_cpu(thi); |
4615 | if (test_and_clear_bit(SEND_PING, &mdev->flags)) { | ||
4616 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4617 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4618 | mdev->net_conf->ping_timeo*HZ/10; | ||
4619 | ping_timeout_active = 1; | ||
4620 | } | ||
4621 | 5266 | ||
4622 | /* conditionally cork; | 5267 | rcu_read_lock(); |
4623 | * it may hurt latency if we cork without much to send */ | 5268 | nc = rcu_dereference(tconn->net_conf); |
4624 | if (!mdev->net_conf->no_cork && | 5269 | ping_timeo = nc->ping_timeo; |
4625 | 3 < atomic_read(&mdev->unacked_cnt)) | 5270 | tcp_cork = nc->tcp_cork; |
4626 | drbd_tcp_cork(mdev->meta.socket); | 5271 | ping_int = nc->ping_int; |
4627 | while (1) { | 5272 | rcu_read_unlock(); |
4628 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | 5273 | |
4629 | flush_signals(current); | 5274 | if (test_and_clear_bit(SEND_PING, &tconn->flags)) { |
4630 | if (!drbd_process_done_ee(mdev)) | 5275 | if (drbd_send_ping(tconn)) { |
5276 | conn_err(tconn, "drbd_send_ping has failed\n"); | ||
4631 | goto reconnect; | 5277 | goto reconnect; |
4632 | /* to avoid race with newly queued ACKs */ | 5278 | } |
4633 | set_bit(SIGNAL_ASENDER, &mdev->flags); | 5279 | tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; |
4634 | spin_lock_irq(&mdev->req_lock); | 5280 | ping_timeout_active = true; |
4635 | empty = list_empty(&mdev->done_ee); | 5281 | } |
4636 | spin_unlock_irq(&mdev->req_lock); | 5282 | |
4637 | /* new ack may have been queued right here, | 5283 | /* TODO: conditionally cork; it may hurt latency if we cork without |
4638 | * but then there is also a signal pending, | 5284 | much to send */ |
4639 | * and we start over... */ | 5285 | if (tcp_cork) |
4640 | if (empty) | 5286 | drbd_tcp_cork(tconn->meta.socket); |
4641 | break; | 5287 | if (tconn_finish_peer_reqs(tconn)) { |
5288 | conn_err(tconn, "tconn_finish_peer_reqs() failed\n"); | ||
5289 | goto reconnect; | ||
4642 | } | 5290 | } |
4643 | /* but unconditionally uncork unless disabled */ | 5291 | /* but unconditionally uncork unless disabled */ |
4644 | if (!mdev->net_conf->no_cork) | 5292 | if (tcp_cork) |
4645 | drbd_tcp_uncork(mdev->meta.socket); | 5293 | drbd_tcp_uncork(tconn->meta.socket); |
4646 | 5294 | ||
4647 | /* short circuit, recv_msg would return EINTR anyways. */ | 5295 | /* short circuit, recv_msg would return EINTR anyways. */ |
4648 | if (signal_pending(current)) | 5296 | if (signal_pending(current)) |
4649 | continue; | 5297 | continue; |
4650 | 5298 | ||
4651 | rv = drbd_recv_short(mdev, mdev->meta.socket, | 5299 | rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0); |
4652 | buf, expect-received, 0); | 5300 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4653 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4654 | 5301 | ||
4655 | flush_signals(current); | 5302 | flush_signals(current); |
4656 | 5303 | ||
@@ -4668,80 +5315,91 @@ int drbd_asender(struct drbd_thread *thi) | |||
4668 | received += rv; | 5315 | received += rv; |
4669 | buf += rv; | 5316 | buf += rv; |
4670 | } else if (rv == 0) { | 5317 | } else if (rv == 0) { |
4671 | dev_err(DEV, "meta connection shut down by peer.\n"); | 5318 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { |
5319 | long t; | ||
5320 | rcu_read_lock(); | ||
5321 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; | ||
5322 | rcu_read_unlock(); | ||
5323 | |||
5324 | t = wait_event_timeout(tconn->ping_wait, | ||
5325 | tconn->cstate < C_WF_REPORT_PARAMS, | ||
5326 | t); | ||
5327 | if (t) | ||
5328 | break; | ||
5329 | } | ||
5330 | conn_err(tconn, "meta connection shut down by peer.\n"); | ||
4672 | goto reconnect; | 5331 | goto reconnect; |
4673 | } else if (rv == -EAGAIN) { | 5332 | } else if (rv == -EAGAIN) { |
4674 | /* If the data socket received something meanwhile, | 5333 | /* If the data socket received something meanwhile, |
4675 | * that is good enough: peer is still alive. */ | 5334 | * that is good enough: peer is still alive. */ |
4676 | if (time_after(mdev->last_received, | 5335 | if (time_after(tconn->last_received, |
4677 | jiffies - mdev->meta.socket->sk->sk_rcvtimeo)) | 5336 | jiffies - tconn->meta.socket->sk->sk_rcvtimeo)) |
4678 | continue; | 5337 | continue; |
4679 | if (ping_timeout_active) { | 5338 | if (ping_timeout_active) { |
4680 | dev_err(DEV, "PingAck did not arrive in time.\n"); | 5339 | conn_err(tconn, "PingAck did not arrive in time.\n"); |
4681 | goto reconnect; | 5340 | goto reconnect; |
4682 | } | 5341 | } |
4683 | set_bit(SEND_PING, &mdev->flags); | 5342 | set_bit(SEND_PING, &tconn->flags); |
4684 | continue; | 5343 | continue; |
4685 | } else if (rv == -EINTR) { | 5344 | } else if (rv == -EINTR) { |
4686 | continue; | 5345 | continue; |
4687 | } else { | 5346 | } else { |
4688 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | 5347 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); |
4689 | goto reconnect; | 5348 | goto reconnect; |
4690 | } | 5349 | } |
4691 | 5350 | ||
4692 | if (received == expect && cmd == NULL) { | 5351 | if (received == expect && cmd == NULL) { |
4693 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | 5352 | if (decode_header(tconn, tconn->meta.rbuf, &pi)) |
4694 | dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", | ||
4695 | be32_to_cpu(h->magic), | ||
4696 | be16_to_cpu(h->command), | ||
4697 | be16_to_cpu(h->length)); | ||
4698 | goto reconnect; | 5353 | goto reconnect; |
4699 | } | 5354 | cmd = &asender_tbl[pi.cmd]; |
4700 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | 5355 | if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { |
4701 | len = be16_to_cpu(h->length); | 5356 | conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n", |
4702 | if (unlikely(cmd == NULL)) { | 5357 | cmdname(pi.cmd), pi.cmd); |
4703 | dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n", | ||
4704 | be32_to_cpu(h->magic), | ||
4705 | be16_to_cpu(h->command), | ||
4706 | be16_to_cpu(h->length)); | ||
4707 | goto disconnect; | 5358 | goto disconnect; |
4708 | } | 5359 | } |
4709 | expect = cmd->pkt_size; | 5360 | expect = header_size + cmd->pkt_size; |
4710 | ERR_IF(len != expect-sizeof(struct p_header80)) | 5361 | if (pi.size != expect - header_size) { |
5362 | conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n", | ||
5363 | pi.cmd, pi.size); | ||
4711 | goto reconnect; | 5364 | goto reconnect; |
5365 | } | ||
4712 | } | 5366 | } |
4713 | if (received == expect) { | 5367 | if (received == expect) { |
4714 | mdev->last_received = jiffies; | 5368 | bool err; |
4715 | D_ASSERT(cmd != NULL); | 5369 | |
4716 | if (!cmd->process(mdev, h)) | 5370 | err = cmd->fn(tconn, &pi); |
5371 | if (err) { | ||
5372 | conn_err(tconn, "%pf failed\n", cmd->fn); | ||
4717 | goto reconnect; | 5373 | goto reconnect; |
5374 | } | ||
5375 | |||
5376 | tconn->last_received = jiffies; | ||
4718 | 5377 | ||
4719 | /* the idle_timeout (ping-int) | 5378 | if (cmd == &asender_tbl[P_PING_ACK]) { |
4720 | * has been restored in got_PingAck() */ | 5379 | /* restore idle timeout */ |
4721 | if (cmd == get_asender_cmd(P_PING_ACK)) | 5380 | tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; |
4722 | ping_timeout_active = 0; | 5381 | ping_timeout_active = false; |
5382 | } | ||
4723 | 5383 | ||
4724 | buf = h; | 5384 | buf = tconn->meta.rbuf; |
4725 | received = 0; | 5385 | received = 0; |
4726 | expect = sizeof(struct p_header80); | 5386 | expect = header_size; |
4727 | cmd = NULL; | 5387 | cmd = NULL; |
4728 | } | 5388 | } |
4729 | } | 5389 | } |
4730 | 5390 | ||
4731 | if (0) { | 5391 | if (0) { |
4732 | reconnect: | 5392 | reconnect: |
4733 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 5393 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
4734 | drbd_md_sync(mdev); | 5394 | conn_md_sync(tconn); |
4735 | } | 5395 | } |
4736 | if (0) { | 5396 | if (0) { |
4737 | disconnect: | 5397 | disconnect: |
4738 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 5398 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4739 | drbd_md_sync(mdev); | ||
4740 | } | 5399 | } |
4741 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | 5400 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4742 | 5401 | ||
4743 | D_ASSERT(mdev->state.conn < C_CONNECTED); | 5402 | conn_info(tconn, "asender terminated\n"); |
4744 | dev_info(DEV, "asender terminated\n"); | ||
4745 | 5403 | ||
4746 | return 0; | 5404 | return 0; |
4747 | } | 5405 | } |
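The reworked asender above drops get_asender_cmd() in favour of directly indexing asender_tbl[]: each entry pairs the expected payload size with a handler taking (tconn, packet_info), and the receive loop bounds-checks pi.cmd and pi.size before dispatching. The stand-alone sketch below shows the same table-driven dispatch pattern with toy packet types and handlers; none of these names are the driver's own.

#include <stddef.h>
#include <stdio.h>

struct packet_info { unsigned int cmd; unsigned int size; };

enum { P_PING, P_PING_ACK, P_BLOCK_ACK };

struct handler {
	size_t pkt_size;                       /* expected payload size */
	int (*fn)(struct packet_info *pi);     /* returns 0 on success */
};

static int got_ping(struct packet_info *pi)      { (void)pi; printf("ping\n"); return 0; }
static int got_ping_ack(struct packet_info *pi)  { (void)pi; printf("ping ack\n"); return 0; }
static int got_block_ack(struct packet_info *pi) { printf("block ack, %u bytes\n", pi->size); return 0; }

/* designated initializers keep the table indexable by command value,
 * the same layout idea as asender_tbl[] above */
static const struct handler tbl[] = {
	[P_PING]      = { 0, got_ping },
	[P_PING_ACK]  = { 0, got_ping_ack },
	[P_BLOCK_ACK] = { 24, got_block_ack },
};

static int dispatch(struct packet_info *pi)
{
	/* reject unknown commands and wrong payload sizes before calling the
	 * handler, mirroring the pi.cmd >= ARRAY_SIZE() and pi.size checks above */
	if (pi->cmd >= sizeof(tbl) / sizeof(tbl[0]) || !tbl[pi->cmd].fn)
		return -1;
	if (pi->size != tbl[pi->cmd].pkt_size)
		return -1;
	return tbl[pi->cmd].fn(pi);
}

int main(void)
{
	struct packet_info pi = { .cmd = P_BLOCK_ACK, .size = 24 };
	return dispatch(&pi);
}

Because the command space can be sparse, both the bounds check and the NULL-handler check matter; an unknown or malformed packet is rejected before any handler runs.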
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 01b2ac641c7b..f58a4a4b4dfb 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include "drbd_req.h" | 31 | #include "drbd_req.h" |
32 | 32 | ||
33 | 33 | ||
34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); | ||
35 | |||
34 | /* Update disk stats at start of I/O request */ | 36 | /* Update disk stats at start of I/O request */ |
35 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | 37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) |
36 | { | 38 | { |
@@ -40,6 +42,8 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req | |||
40 | part_round_stats(cpu, &mdev->vdisk->part0); | 42 | part_round_stats(cpu, &mdev->vdisk->part0); |
41 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | 43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); |
42 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | 44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); |
45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like | ||
46 | the compiler warning about cpu only assigned but never used... */ | ||
43 | part_inc_in_flight(&mdev->vdisk->part0, rw); | 47 | part_inc_in_flight(&mdev->vdisk->part0, rw); |
44 | part_stat_unlock(); | 48 | part_stat_unlock(); |
45 | } | 49 | } |
@@ -57,9 +61,51 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | |||
57 | part_stat_unlock(); | 61 | part_stat_unlock(); |
58 | } | 62 | } |
59 | 63 | ||
60 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | 64 | static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, |
65 | struct bio *bio_src) | ||
66 | { | ||
67 | struct drbd_request *req; | ||
68 | |||
69 | req = mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
70 | if (!req) | ||
71 | return NULL; | ||
72 | |||
73 | drbd_req_make_private_bio(req, bio_src); | ||
74 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
75 | req->w.mdev = mdev; | ||
76 | req->master_bio = bio_src; | ||
77 | req->epoch = 0; | ||
78 | |||
79 | drbd_clear_interval(&req->i); | ||
80 | req->i.sector = bio_src->bi_sector; | ||
81 | req->i.size = bio_src->bi_size; | ||
82 | req->i.local = true; | ||
83 | req->i.waiting = false; | ||
84 | |||
85 | INIT_LIST_HEAD(&req->tl_requests); | ||
86 | INIT_LIST_HEAD(&req->w.list); | ||
87 | |||
88 | /* one reference to be put by __drbd_make_request */ | ||
89 | atomic_set(&req->completion_ref, 1); | ||
90 | /* one kref as long as completion_ref > 0 */ | ||
91 | kref_init(&req->kref); | ||
92 | return req; | ||
93 | } | ||
94 | |||
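drbd_req_new() above gives every request one completion_ref (put later by __drbd_make_request) plus one kref that stays held as long as completion_ref is non-zero; drbd_req_destroy() below only runs once the final kref is dropped. A stand-alone sketch of that two-counter lifetime follows, using C11 atomics in place of the kernel's atomic_t/kref and with invented toy names; the real put path in drbd_req_put_completion_ref() and mod_rq_state() further down additionally folds the rq_state bits into the decision.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_req {
	atomic_int completion_ref;   /* outstanding reasons the master bio is not done */
	atomic_int kref;             /* object lifetime; freed when it reaches zero */
};

static struct toy_req *toy_req_new(void)
{
	struct toy_req *req = malloc(sizeof(*req));
	if (!req)
		return NULL;
	atomic_init(&req->completion_ref, 1);  /* one ref, put by the submitter */
	atomic_init(&req->kref, 1);            /* held while completion_ref > 0 */
	return req;
}

static void toy_req_put_kref(struct toy_req *req)
{
	if (atomic_fetch_sub(&req->kref, 1) == 1) {
		printf("destroy request object\n");    /* drbd_req_destroy() analogue */
		free(req);
	}
}

static void toy_req_put_completion_ref(struct toy_req *req, int put)
{
	/* old value == put means the counter just hit zero */
	if (atomic_fetch_sub(&req->completion_ref, put) == put) {
		printf("complete master bio\n");       /* drbd_req_complete() analogue */
		toy_req_put_kref(req);                 /* drop the kref tied to completion_ref */
	}
}

int main(void)
{
	struct toy_req *req = toy_req_new();
	if (!req)
		return 1;
	toy_req_put_completion_ref(req, 1);
	return 0;
}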
95 | void drbd_req_destroy(struct kref *kref) | ||
61 | { | 96 | { |
62 | const unsigned long s = req->rq_state; | 97 | struct drbd_request *req = container_of(kref, struct drbd_request, kref); |
98 | struct drbd_conf *mdev = req->w.mdev; | ||
99 | const unsigned s = req->rq_state; | ||
100 | |||
101 | if ((req->master_bio && !(s & RQ_POSTPONED)) || | ||
102 | atomic_read(&req->completion_ref) || | ||
103 | (s & RQ_LOCAL_PENDING) || | ||
104 | ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { | ||
105 | dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", | ||
106 | s, atomic_read(&req->completion_ref)); | ||
107 | return; | ||
108 | } | ||
63 | 109 | ||
64 | /* remove it from the transfer log. | 110 | /* remove it from the transfer log. |
65 | * well, only if it had been there in the first | 111 | * well, only if it had been there in the first |
@@ -67,24 +113,33 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
67 | * and never sent), it should still be "empty" as | 113 | * and never sent), it should still be "empty" as |
68 | * initialized in drbd_req_new(), so we can list_del() it | 114 | * initialized in drbd_req_new(), so we can list_del() it |
69 | * here unconditionally */ | 115 | * here unconditionally */ |
70 | list_del(&req->tl_requests); | 116 | list_del_init(&req->tl_requests); |
71 | 117 | ||
72 | /* if it was a write, we may have to set the corresponding | 118 | /* if it was a write, we may have to set the corresponding |
73 | * bit(s) out-of-sync first. If it had a local part, we need to | 119 | * bit(s) out-of-sync first. If it had a local part, we need to |
74 | * release the reference to the activity log. */ | 120 | * release the reference to the activity log. */ |
75 | if (rw == WRITE) { | 121 | if (s & RQ_WRITE) { |
76 | /* Set out-of-sync unless both OK flags are set | 122 | /* Set out-of-sync unless both OK flags are set |
77 | * (local only or remote failed). | 123 | * (local only or remote failed). |
78 | * Other places where we set out-of-sync: | 124 | * Other places where we set out-of-sync: |
79 | * READ with local io-error */ | 125 | * READ with local io-error */ |
80 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
81 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
82 | 126 | ||
83 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | 127 | /* There is a special case: |
84 | drbd_set_in_sync(mdev, req->sector, req->size); | 128 | * we may notice late that IO was suspended, |
129 | * and postpone, or schedule for retry, a write, | ||
130 | * before it even was submitted or sent. | ||
131 | * In that case we do not want to touch the bitmap at all. | ||
132 | */ | ||
133 | if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { | ||
134 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
135 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); | ||
136 | |||
137 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
138 | drbd_set_in_sync(mdev, req->i.sector, req->i.size); | ||
139 | } | ||
85 | 140 | ||
86 | /* one might be tempted to move the drbd_al_complete_io | 141 | /* one might be tempted to move the drbd_al_complete_io |
87 | * to the local io completion callback drbd_endio_pri. | 142 | * to the local io completion callback drbd_request_endio. |
88 | * but, if this was a mirror write, we may only | 143 | * but, if this was a mirror write, we may only |
89 | * drbd_al_complete_io after this is RQ_NET_DONE, | 144 | * drbd_al_complete_io after this is RQ_NET_DONE, |
90 | * otherwise the extent could be dropped from the al | 145 | * otherwise the extent could be dropped from the al |
@@ -93,109 +148,35 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
93 | * but after the extent has been dropped from the al, | 148 | * but after the extent has been dropped from the al, |
94 | * we would forget to resync the corresponding extent. | 149 | * we would forget to resync the corresponding extent. |
95 | */ | 150 | */ |
96 | if (s & RQ_LOCAL_MASK) { | 151 | if (s & RQ_IN_ACT_LOG) { |
97 | if (get_ldev_if_state(mdev, D_FAILED)) { | 152 | if (get_ldev_if_state(mdev, D_FAILED)) { |
98 | if (s & RQ_IN_ACT_LOG) | 153 | drbd_al_complete_io(mdev, &req->i); |
99 | drbd_al_complete_io(mdev, req->sector); | ||
100 | put_ldev(mdev); | 154 | put_ldev(mdev); |
101 | } else if (__ratelimit(&drbd_ratelimit_state)) { | 155 | } else if (__ratelimit(&drbd_ratelimit_state)) { |
102 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | 156 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), " |
103 | "but my Disk seems to have failed :(\n", | 157 | "but my Disk seems to have failed :(\n", |
104 | (unsigned long long) req->sector); | 158 | (unsigned long long) req->i.sector, req->i.size); |
105 | } | 159 | } |
106 | } | 160 | } |
107 | } | 161 | } |
108 | 162 | ||
109 | drbd_req_free(req); | 163 | mempool_free(req, drbd_request_mempool); |
110 | } | 164 | } |
111 | 165 | ||
112 | static void queue_barrier(struct drbd_conf *mdev) | 166 | static void wake_all_senders(struct drbd_tconn *tconn) { |
113 | { | 167 | wake_up(&tconn->sender_work.q_wait); |
114 | struct drbd_tl_epoch *b; | ||
115 | |||
116 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
117 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
118 | * barrier/epoch object is added. This is the only place this bit is | ||
119 | * set. It indicates that the barrier for this epoch is already queued, | ||
120 | * and no new epoch has been created yet. */ | ||
121 | if (test_bit(CREATE_BARRIER, &mdev->flags)) | ||
122 | return; | ||
123 | |||
124 | b = mdev->newest_tle; | ||
125 | b->w.cb = w_send_barrier; | ||
126 | /* inc_ap_pending done here, so we won't | ||
127 | * get imbalanced on connection loss. | ||
128 | * dec_ap_pending will be done in got_BarrierAck | ||
129 | * or (on connection loss) in tl_clear. */ | ||
130 | inc_ap_pending(mdev); | ||
131 | drbd_queue_work(&mdev->data.work, &b->w); | ||
132 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
133 | } | 168 | } |
134 | 169 | ||
135 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | 170 | /* must hold resource->req_lock */ |
136 | struct drbd_request *req) | 171 | static void start_new_tl_epoch(struct drbd_tconn *tconn) |
137 | { | 172 | { |
138 | const unsigned long s = req->rq_state; | 173 | /* no point closing an epoch, if it is empty, anyways. */ |
139 | struct drbd_request *i; | 174 | if (tconn->current_tle_writes == 0) |
140 | struct drbd_epoch_entry *e; | 175 | return; |
141 | struct hlist_node *n; | ||
142 | struct hlist_head *slot; | ||
143 | |||
144 | /* Before we can signal completion to the upper layers, | ||
145 | * we may need to close the current epoch. | ||
146 | * We can skip this, if this request has not even been sent, because we | ||
147 | * did not have a fully established connection yet/anymore, during | ||
148 | * bitmap exchange, or while we are C_AHEAD due to congestion policy. | ||
149 | */ | ||
150 | if (mdev->state.conn >= C_CONNECTED && | ||
151 | (s & RQ_NET_SENT) != 0 && | ||
152 | req->epoch == mdev->newest_tle->br_number) | ||
153 | queue_barrier(mdev); | ||
154 | |||
155 | /* we need to do the conflict detection stuff, | ||
156 | * if we have the ee_hash (two_primaries) and | ||
157 | * this has been on the network */ | ||
158 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
159 | const sector_t sector = req->sector; | ||
160 | const int size = req->size; | ||
161 | |||
162 | /* ASSERT: | ||
163 | * there must be no conflicting requests, since | ||
164 | * they must have been failed on the spot */ | ||
165 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
166 | slot = tl_hash_slot(mdev, sector); | ||
167 | hlist_for_each_entry(i, n, slot, collision) { | ||
168 | if (OVERLAPS) { | ||
169 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
170 | "other: %p %llus +%u\n", | ||
171 | req, (unsigned long long)sector, size, | ||
172 | i, (unsigned long long)i->sector, i->size); | ||
173 | } | ||
174 | } | ||
175 | 176 | ||
176 | /* maybe "wake" those conflicting epoch entries | 177 | tconn->current_tle_writes = 0; |
177 | * that wait for this request to finish. | 178 | atomic_inc(&tconn->current_tle_nr); |
178 | * | 179 | wake_all_senders(tconn); |
179 | * currently, there can be only _one_ such ee | ||
180 | * (well, or some more, which would be pending | ||
181 | * P_DISCARD_ACK not yet sent by the asender...), | ||
182 | * since we block the receiver thread upon the | ||
183 | * first conflict detection, which will wait on | ||
184 | * misc_wait. maybe we want to assert that? | ||
185 | * | ||
186 | * anyways, if we found one, | ||
187 | * we just have to do a wake_up. */ | ||
188 | #undef OVERLAPS | ||
189 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
190 | slot = ee_hash_slot(mdev, req->sector); | ||
191 | hlist_for_each_entry(e, n, slot, collision) { | ||
192 | if (OVERLAPS) { | ||
193 | wake_up(&mdev->misc_wait); | ||
194 | break; | ||
195 | } | ||
196 | } | ||
197 | } | ||
198 | #undef OVERLAPS | ||
199 | } | 180 | } |
200 | 181 | ||
201 | void complete_master_bio(struct drbd_conf *mdev, | 182 | void complete_master_bio(struct drbd_conf *mdev, |
@@ -205,17 +186,33 @@ void complete_master_bio(struct drbd_conf *mdev, | |||
205 | dec_ap_bio(mdev); | 186 | dec_ap_bio(mdev); |
206 | } | 187 | } |
207 | 188 | ||
189 | |||
190 | static void drbd_remove_request_interval(struct rb_root *root, | ||
191 | struct drbd_request *req) | ||
192 | { | ||
193 | struct drbd_conf *mdev = req->w.mdev; | ||
194 | struct drbd_interval *i = &req->i; | ||
195 | |||
196 | drbd_remove_interval(root, i); | ||
197 | |||
198 | /* Wake up any processes waiting for this request to complete. */ | ||
199 | if (i->waiting) | ||
200 | wake_up(&mdev->misc_wait); | ||
201 | } | ||
202 | |||
208 | /* Helper for __req_mod(). | 203 | /* Helper for __req_mod(). |
209 | * Set m->bio to the master bio, if it is fit to be completed, | 204 | * Set m->bio to the master bio, if it is fit to be completed, |
210 | * or leave it alone (it is initialized to NULL in __req_mod), | 205 | * or leave it alone (it is initialized to NULL in __req_mod), |
211 | * if it has already been completed, or cannot be completed yet. | 206 | * if it has already been completed, or cannot be completed yet. |
212 | * If m->bio is set, the error status to be returned is placed in m->error. | 207 | * If m->bio is set, the error status to be returned is placed in m->error. |
213 | */ | 208 | */ |
214 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | 209 | static |
210 | void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) | ||
215 | { | 211 | { |
216 | const unsigned long s = req->rq_state; | 212 | const unsigned s = req->rq_state; |
217 | struct drbd_conf *mdev = req->mdev; | 213 | struct drbd_conf *mdev = req->w.mdev; |
218 | int rw = req->rq_state & RQ_WRITE ? WRITE : READ; | 214 | int rw; |
215 | int error, ok; | ||
219 | 216 | ||
220 | /* we must not complete the master bio, while it is | 217 | /* we must not complete the master bio, while it is |
221 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | 218 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) |
@@ -226,165 +223,220 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | |||
226 | * the receiver, | 223 | * the receiver, |
227 | * the bio_endio completion callbacks. | 224 | * the bio_endio completion callbacks. |
228 | */ | 225 | */ |
229 | if (s & RQ_NET_QUEUED) | 226 | if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || |
230 | return; | 227 | (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || |
231 | if (s & RQ_NET_PENDING) | 228 | (s & RQ_COMPLETION_SUSP)) { |
229 | dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); | ||
232 | return; | 230 | return; |
233 | if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) | 231 | } |
232 | |||
233 | if (!req->master_bio) { | ||
234 | dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); | ||
234 | return; | 235 | return; |
236 | } | ||
235 | 237 | ||
236 | if (req->master_bio) { | 238 | rw = bio_rw(req->master_bio); |
237 | /* this is data_received (remote read) | ||
238 | * or protocol C P_WRITE_ACK | ||
239 | * or protocol B P_RECV_ACK | ||
240 | * or protocol A "handed_over_to_network" (SendAck) | ||
241 | * or canceled or failed, | ||
242 | * or killed from the transfer log due to connection loss. | ||
243 | */ | ||
244 | 239 | ||
245 | /* | 240 | /* |
246 | * figure out whether to report success or failure. | 241 | * figure out whether to report success or failure. |
247 | * | 242 | * |
248 | * report success when at least one of the operations succeeded. | 243 | * report success when at least one of the operations succeeded. |
249 | * or, to put the other way, | 244 | * or, to put the other way, |
250 | * only report failure, when both operations failed. | 245 | * only report failure, when both operations failed. |
251 | * | 246 | * |
252 | * what to do about the failures is handled elsewhere. | 247 | * what to do about the failures is handled elsewhere. |
253 | * what we need to do here is just: complete the master_bio. | 248 | * what we need to do here is just: complete the master_bio. |
254 | * | 249 | * |
255 | * local completion error, if any, has been stored as ERR_PTR | 250 | * local completion error, if any, has been stored as ERR_PTR |
256 | * in private_bio within drbd_endio_pri. | 251 | * in private_bio within drbd_request_endio. |
257 | */ | 252 | */ |
258 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | 253 | ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); |
259 | int error = PTR_ERR(req->private_bio); | 254 | error = PTR_ERR(req->private_bio); |
260 | 255 | ||
261 | /* remove the request from the conflict detection | 256 | /* remove the request from the conflict detection |
262 | * respective block_id verification hash */ | 257 | * respective block_id verification hash */ |
263 | if (!hlist_unhashed(&req->collision)) | 258 | if (!drbd_interval_empty(&req->i)) { |
264 | hlist_del(&req->collision); | 259 | struct rb_root *root; |
265 | else | ||
266 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
267 | 260 | ||
268 | /* for writes we need to do some extra housekeeping */ | ||
269 | if (rw == WRITE) | 261 | if (rw == WRITE) |
270 | _about_to_complete_local_write(mdev, req); | 262 | root = &mdev->write_requests; |
263 | else | ||
264 | root = &mdev->read_requests; | ||
265 | drbd_remove_request_interval(root, req); | ||
266 | } else if (!(s & RQ_POSTPONED)) | ||
267 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
271 | 268 | ||
272 | /* Update disk stats */ | 269 | /* Before we can signal completion to the upper layers, |
273 | _drbd_end_io_acct(mdev, req); | 270 | * we may need to close the current transfer log epoch. |
271 | * We are within the request lock, so we can simply compare | ||
272 | * the request epoch number with the current transfer log | ||
273 | * epoch number. If they match, increase the current_tle_nr, | ||
274 | * and reset the transfer log epoch write_cnt. | ||
275 | */ | ||
276 | if (rw == WRITE && | ||
277 | req->epoch == atomic_read(&mdev->tconn->current_tle_nr)) | ||
278 | start_new_tl_epoch(mdev->tconn); | ||
279 | |||
280 | /* Update disk stats */ | ||
281 | _drbd_end_io_acct(mdev, req); | ||
282 | |||
283 | /* If READ failed, | ||
284 | * have it be pushed back to the retry work queue, | ||
285 | * so it will re-enter __drbd_make_request(), | ||
286 | * and be re-assigned to a suitable local or remote path, | ||
287 | * or failed if we do not have access to good data anymore. | ||
288 | * | ||
289 | * Unless it was failed early by __drbd_make_request(), | ||
290 | * because no path was available, in which case | ||
291 | * it was not even added to the transfer_log. | ||
292 | * | ||
293 | * READA may fail, and will not be retried. | ||
294 | * | ||
295 | * WRITE should have used all available paths already. | ||
296 | */ | ||
297 | if (!ok && rw == READ && !list_empty(&req->tl_requests)) | ||
298 | req->rq_state |= RQ_POSTPONED; | ||
274 | 299 | ||
300 | if (!(req->rq_state & RQ_POSTPONED)) { | ||
275 | m->error = ok ? 0 : (error ?: -EIO); | 301 | m->error = ok ? 0 : (error ?: -EIO); |
276 | m->bio = req->master_bio; | 302 | m->bio = req->master_bio; |
277 | req->master_bio = NULL; | 303 | req->master_bio = NULL; |
278 | } | 304 | } |
305 | } | ||
279 | 306 | ||
280 | if (s & RQ_LOCAL_PENDING) | 307 | static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) |
281 | return; | 308 | { |
309 | struct drbd_conf *mdev = req->w.mdev; | ||
310 | D_ASSERT(m || (req->rq_state & RQ_POSTPONED)); | ||
311 | |||
312 | if (!atomic_sub_and_test(put, &req->completion_ref)) | ||
313 | return 0; | ||
282 | 314 | ||
283 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | 315 | drbd_req_complete(req, m); |
284 | /* this is disconnected (local only) operation, | 316 | |
285 | * or protocol C P_WRITE_ACK, | 317 | if (req->rq_state & RQ_POSTPONED) { |
286 | * or protocol A or B P_BARRIER_ACK, | 318 | /* don't destroy the req object just yet, |
287 | * or killed from the transfer log due to connection loss. */ | 319 | * but queue it for retry */ |
288 | _req_is_done(mdev, req, rw); | 320 | drbd_restart_request(req); |
321 | return 0; | ||
289 | } | 322 | } |
290 | /* else: network part and not DONE yet. that is | 323 | |
291 | * protocol A or B, barrier ack still pending... */ | 324 | return 1; |
292 | } | 325 | } |
293 | 326 | ||
294 | static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) | 327 | /* I'd like this to be the only place that manipulates |
328 | * req->completion_ref and req->kref. */ | ||
329 | static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, | ||
330 | int clear, int set) | ||
295 | { | 331 | { |
296 | struct drbd_conf *mdev = req->mdev; | 332 | struct drbd_conf *mdev = req->w.mdev; |
333 | unsigned s = req->rq_state; | ||
334 | int c_put = 0; | ||
335 | int k_put = 0; | ||
297 | 336 | ||
298 | if (!is_susp(mdev->state)) | 337 | if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP)) |
299 | _req_may_be_done(req, m); | 338 | set |= RQ_COMPLETION_SUSP; |
300 | } | ||
301 | 339 | ||
302 | /* | 340 | /* apply */ |
303 | * checks whether there was an overlapping request | ||
304 | * or ee already registered. | ||
305 | * | ||
306 | * if so, return 1, in which case this request is completed on the spot, | ||
307 | * without ever being submitted or send. | ||
308 | * | ||
309 | * return 0 if it is ok to submit this request. | ||
310 | * | ||
311 | * NOTE: | ||
312 | * paranoia: assume something above us is broken, and issues different write | ||
313 | * requests for the same block simultaneously... | ||
314 | * | ||
315 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
316 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
317 | * to happen, but this is the rationale why we also have to check for | ||
318 | * conflicting requests with local origin, and why we have to do so regardless | ||
319 | * of whether we allowed multiple primaries. | ||
320 | * | ||
321 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
322 | * second hlist_for_each_entry becomes a noop. This is even simpler than to | ||
323 | * grab a reference on the net_conf, and check for the two_primaries flag... | ||
324 | */ | ||
325 | static int _req_conflicts(struct drbd_request *req) | ||
326 | { | ||
327 | struct drbd_conf *mdev = req->mdev; | ||
328 | const sector_t sector = req->sector; | ||
329 | const int size = req->size; | ||
330 | struct drbd_request *i; | ||
331 | struct drbd_epoch_entry *e; | ||
332 | struct hlist_node *n; | ||
333 | struct hlist_head *slot; | ||
334 | 341 | ||
335 | D_ASSERT(hlist_unhashed(&req->collision)); | 342 | req->rq_state &= ~clear; |
343 | req->rq_state |= set; | ||
336 | 344 | ||
337 | if (!get_net_conf(mdev)) | 345 | /* no change? */ |
338 | return 0; | 346 | if (req->rq_state == s) |
347 | return; | ||
339 | 348 | ||
340 | /* BUG_ON */ | 349 | /* intent: get references */ |
341 | ERR_IF (mdev->tl_hash_s == 0) | 350 | |
342 | goto out_no_conflict; | 351 | if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) |
343 | BUG_ON(mdev->tl_hash == NULL); | 352 | atomic_inc(&req->completion_ref); |
344 | 353 | ||
345 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | 354 | if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { |
346 | slot = tl_hash_slot(mdev, sector); | 355 | inc_ap_pending(mdev); |
347 | hlist_for_each_entry(i, n, slot, collision) { | 356 | atomic_inc(&req->completion_ref); |
348 | if (OVERLAPS) { | ||
349 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
350 | "[DISCARD L] new: %llus +%u; " | ||
351 | "pending: %llus +%u\n", | ||
352 | current->comm, current->pid, | ||
353 | (unsigned long long)sector, size, | ||
354 | (unsigned long long)i->sector, i->size); | ||
355 | goto out_conflict; | ||
356 | } | ||
357 | } | 357 | } |
358 | 358 | ||
359 | if (mdev->ee_hash_s) { | 359 | if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) |
360 | /* now, check for overlapping requests with remote origin */ | 360 | atomic_inc(&req->completion_ref); |
361 | BUG_ON(mdev->ee_hash == NULL); | 361 | |
362 | #undef OVERLAPS | 362 | if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) |
363 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | 363 | kref_get(&req->kref); /* wait for the DONE */ |
364 | slot = ee_hash_slot(mdev, sector); | 364 | |
365 | hlist_for_each_entry(e, n, slot, collision) { | 365 | if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) |
366 | if (OVERLAPS) { | 366 | atomic_add(req->i.size >> 9, &mdev->ap_in_flight); |
367 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | 367 | |
368 | " [DISCARD L] new: %llus +%u; " | 368 | if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) |
369 | "pending: %llus +%u\n", | 369 | atomic_inc(&req->completion_ref); |
370 | current->comm, current->pid, | 370 | |
371 | (unsigned long long)sector, size, | 371 | /* progress: put references */ |
372 | (unsigned long long)e->sector, e->size); | 372 | |
373 | goto out_conflict; | 373 | if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) |
374 | } | 374 | ++c_put; |
375 | } | 375 | |
376 | if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { | ||
377 | D_ASSERT(req->rq_state & RQ_LOCAL_PENDING); | ||
378 | /* local completion may still come in later, | ||
379 | * we need to keep the req object around. */ | ||
380 | kref_get(&req->kref); | ||
381 | ++c_put; | ||
382 | } | ||
383 | |||
384 | if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { | ||
385 | if (req->rq_state & RQ_LOCAL_ABORTED) | ||
386 | ++k_put; | ||
387 | else | ||
388 | ++c_put; | ||
376 | } | 389 | } |
377 | #undef OVERLAPS | ||
378 | 390 | ||
379 | out_no_conflict: | 391 | if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { |
380 | /* this is like it should be, and what we expected. | 392 | dec_ap_pending(mdev); |
381 | * our users do behave after all... */ | 393 | ++c_put; |
382 | put_net_conf(mdev); | 394 | } |
383 | return 0; | ||
384 | 395 | ||
385 | out_conflict: | 396 | if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) |
386 | put_net_conf(mdev); | 397 | ++c_put; |
387 | return 1; | 398 | |
399 | if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { | ||
400 | if (req->rq_state & RQ_NET_SENT) | ||
401 | atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); | ||
402 | ++k_put; | ||
403 | } | ||
404 | |||
405 | /* potentially complete and destroy */ | ||
406 | |||
407 | if (k_put || c_put) { | ||
408 | /* Completion does its own kref_put. If we are going to | ||
409 | * kref_sub below, we need req to be still around then. */ | ||
410 | int at_least = k_put + !!c_put; | ||
411 | int refcount = atomic_read(&req->kref.refcount); | ||
412 | if (refcount < at_least) | ||
413 | dev_err(DEV, | ||
414 | "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", | ||
415 | s, req->rq_state, refcount, at_least); | ||
416 | } | ||
417 | |||
418 | /* If we made progress, retry conflicting peer requests, if any. */ | ||
419 | if (req->i.waiting) | ||
420 | wake_up(&mdev->misc_wait); | ||
421 | |||
422 | if (c_put) | ||
423 | k_put += drbd_req_put_completion_ref(req, m, c_put); | ||
424 | if (k_put) | ||
425 | kref_sub(&req->kref, k_put, drbd_req_destroy); | ||
426 | } | ||
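mod_rq_state() above centralizes one rule: take a reference for every tracked bit that becomes set, drop one for every tracked bit that becomes cleared. A reduced userspace model of that bookkeeping, with a made-up three-flag state word rather than the real RQ_* flags:

        /* Userspace sketch of the "intent: get references / progress: put
         * references" pattern in mod_rq_state().  The flag set is reduced and
         * the names are invented; only the old-vs-new bit comparison is the
         * same shape as the code above. */
        #include <stdio.h>

        #define F_LOCAL_PENDING (1u << 0)
        #define F_NET_PENDING   (1u << 1)
        #define F_NET_QUEUED    (1u << 2)

        static unsigned mod_state(unsigned s, unsigned clear, unsigned set, int *refs)
        {
                unsigned n = (s & ~clear) | set;

                /* bit not set before, set now => take a reference */
                if (!(s & F_LOCAL_PENDING) && (n & F_LOCAL_PENDING)) (*refs)++;
                if (!(s & F_NET_PENDING)   && (n & F_NET_PENDING))   (*refs)++;
                if (!(s & F_NET_QUEUED)    && (n & F_NET_QUEUED))    (*refs)++;

                /* bit set before, cleared now => drop a reference */
                if ((s & F_LOCAL_PENDING) && !(n & F_LOCAL_PENDING)) (*refs)--;
                if ((s & F_NET_PENDING)   && !(n & F_NET_PENDING))   (*refs)--;
                if ((s & F_NET_QUEUED)    && !(n & F_NET_QUEUED))    (*refs)--;

                return n;
        }

        int main(void)
        {
                int refs = 0;
                unsigned s = 0;

                s = mod_state(s, 0, F_LOCAL_PENDING | F_NET_PENDING, &refs); /* submit */
                s = mod_state(s, F_LOCAL_PENDING, 0, &refs);                 /* local done */
                s = mod_state(s, F_NET_PENDING, 0, &refs);                   /* ack received */
                printf("state=%#x refs=%d (0 => request may complete)\n", s, refs);
                return 0;
        }

Walking one request through submit, local completion and network ack ends with zero references, which is the point at which the real code is allowed to complete the request.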
427 | |||
428 | static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req) | ||
429 | { | ||
430 | char b[BDEVNAME_SIZE]; | ||
431 | |||
432 | if (!__ratelimit(&drbd_ratelimit_state)) | ||
433 | return; | ||
434 | |||
435 | dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n", | ||
436 | (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", | ||
437 | (unsigned long long)req->i.sector, | ||
438 | req->i.size >> 9, | ||
439 | bdevname(mdev->ldev->backing_bdev, b)); | ||
388 | } | 440 | } |
389 | 441 | ||
390 | /* obviously this could be coded as many single functions | 442 | /* obviously this could be coded as many single functions |
@@ -402,9 +454,12 @@ out_conflict: | |||
402 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 454 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
403 | struct bio_and_error *m) | 455 | struct bio_and_error *m) |
404 | { | 456 | { |
405 | struct drbd_conf *mdev = req->mdev; | 457 | struct drbd_conf *mdev = req->w.mdev; |
406 | int rv = 0; | 458 | struct net_conf *nc; |
407 | m->bio = NULL; | 459 | int p, rv = 0; |
460 | |||
461 | if (m) | ||
462 | m->bio = NULL; | ||
408 | 463 | ||
409 | switch (what) { | 464 | switch (what) { |
410 | default: | 465 | default: |
@@ -413,116 +468,91 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
413 | 468 | ||
414 | /* does not happen... | 469 | /* does not happen... |
415 | * initialization done in drbd_req_new | 470 | * initialization done in drbd_req_new |
416 | case created: | 471 | case CREATED: |
417 | break; | 472 | break; |
418 | */ | 473 | */ |
419 | 474 | ||
420 | case to_be_send: /* via network */ | 475 | case TO_BE_SENT: /* via network */ |
421 | /* reached via drbd_make_request_common | 476 | /* reached via __drbd_make_request |
422 | * and from w_read_retry_remote */ | 477 | * and from w_read_retry_remote */ |
423 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | 478 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); |
424 | req->rq_state |= RQ_NET_PENDING; | 479 | rcu_read_lock(); |
425 | inc_ap_pending(mdev); | 480 | nc = rcu_dereference(mdev->tconn->net_conf); |
481 | p = nc->wire_protocol; | ||
482 | rcu_read_unlock(); | ||
483 | req->rq_state |= | ||
484 | p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : | ||
485 | p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; | ||
486 | mod_rq_state(req, m, 0, RQ_NET_PENDING); | ||
426 | break; | 487 | break; |
427 | 488 | ||
428 | case to_be_submitted: /* locally */ | 489 | case TO_BE_SUBMITTED: /* locally */ |
429 | /* reached via drbd_make_request_common */ | 490 | /* reached via __drbd_make_request */ |
430 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | 491 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); |
431 | req->rq_state |= RQ_LOCAL_PENDING; | 492 | mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); |
432 | break; | 493 | break; |
433 | 494 | ||
434 | case completed_ok: | 495 | case COMPLETED_OK: |
435 | if (req->rq_state & RQ_WRITE) | 496 | if (req->rq_state & RQ_WRITE) |
436 | mdev->writ_cnt += req->size>>9; | 497 | mdev->writ_cnt += req->i.size >> 9; |
437 | else | 498 | else |
438 | mdev->read_cnt += req->size>>9; | 499 | mdev->read_cnt += req->i.size >> 9; |
439 | 500 | ||
440 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | 501 | mod_rq_state(req, m, RQ_LOCAL_PENDING, |
441 | req->rq_state &= ~RQ_LOCAL_PENDING; | 502 | RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); |
442 | |||
443 | _req_may_be_done_not_susp(req, m); | ||
444 | break; | 503 | break; |
445 | 504 | ||
446 | case abort_disk_io: | 505 | case ABORT_DISK_IO: |
447 | req->rq_state |= RQ_LOCAL_ABORTED; | 506 | mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); |
448 | if (req->rq_state & RQ_WRITE) | ||
449 | _req_may_be_done_not_susp(req, m); | ||
450 | else | ||
451 | goto goto_queue_for_net_read; | ||
452 | break; | 507 | break; |
453 | 508 | ||
454 | case write_completed_with_error: | 509 | case WRITE_COMPLETED_WITH_ERROR: |
455 | req->rq_state |= RQ_LOCAL_COMPLETED; | 510 | drbd_report_io_error(mdev, req); |
456 | req->rq_state &= ~RQ_LOCAL_PENDING; | 511 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
457 | 512 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | |
458 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | ||
459 | _req_may_be_done_not_susp(req, m); | ||
460 | break; | 513 | break; |
461 | 514 | ||
462 | case read_ahead_completed_with_error: | 515 | case READ_COMPLETED_WITH_ERROR: |
463 | /* it is legal to fail READA */ | 516 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); |
464 | req->rq_state |= RQ_LOCAL_COMPLETED; | 517 | drbd_report_io_error(mdev, req); |
465 | req->rq_state &= ~RQ_LOCAL_PENDING; | 518 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
466 | _req_may_be_done_not_susp(req, m); | 519 | /* fall through. */ |
520 | case READ_AHEAD_COMPLETED_WITH_ERROR: | ||
521 | /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ | ||
522 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | ||
467 | break; | 523 | break; |
468 | 524 | ||
469 | case read_completed_with_error: | 525 | case QUEUE_FOR_NET_READ: |
470 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
471 | |||
472 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
473 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
474 | |||
475 | if (req->rq_state & RQ_LOCAL_ABORTED) { | ||
476 | _req_may_be_done(req, m); | ||
477 | break; | ||
478 | } | ||
479 | |||
480 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | ||
481 | |||
482 | goto_queue_for_net_read: | ||
483 | |||
484 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
485 | |||
486 | /* no point in retrying if there is no good remote data, | ||
487 | * or we have no connection. */ | ||
488 | if (mdev->state.pdsk != D_UP_TO_DATE) { | ||
489 | _req_may_be_done_not_susp(req, m); | ||
490 | break; | ||
491 | } | ||
492 | |||
493 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
494 | req->rq_state |= RQ_NET_PENDING; | ||
495 | inc_ap_pending(mdev); | ||
496 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
497 | |||
498 | case queue_for_net_read: | ||
499 | /* READ or READA, and | 526 | /* READ or READA, and |
500 | * no local disk, | 527 | * no local disk, |
501 | * or target area marked as invalid, | 528 | * or target area marked as invalid, |
502 | * or just got an io-error. */ | 529 | * or just got an io-error. */ |
503 | /* from drbd_make_request_common | 530 | /* from __drbd_make_request |
504 | * or from bio_endio during read io-error recovery */ | 531 | * or from bio_endio during read io-error recovery */ |
505 | 532 | ||
506 | /* so we can verify the handle in the answer packet | 533 | /* So we can verify the handle in the answer packet. |
507 | * corresponding hlist_del is in _req_may_be_done() */ | 534 | * Corresponding drbd_remove_request_interval is in |
508 | hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); | 535 | * drbd_req_complete() */ |
536 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
537 | drbd_insert_interval(&mdev->read_requests, &req->i); | ||
509 | 538 | ||
510 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 539 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
511 | 540 | ||
512 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 541 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
513 | req->rq_state |= RQ_NET_QUEUED; | 542 | D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0); |
514 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | 543 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
515 | ? w_read_retry_remote | 544 | req->w.cb = w_send_read_req; |
516 | : w_send_read_req; | 545 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
517 | drbd_queue_work(&mdev->data.work, &req->w); | ||
518 | break; | 546 | break; |
519 | 547 | ||
520 | case queue_for_net_write: | 548 | case QUEUE_FOR_NET_WRITE: |
521 | /* assert something? */ | 549 | /* assert something? */ |
522 | /* from drbd_make_request_common only */ | 550 | /* from __drbd_make_request only */ |
523 | 551 | ||
524 | hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); | 552 | /* Corresponding drbd_remove_request_interval is in |
525 | /* corresponding hlist_del is in _req_may_be_done() */ | 553 | * drbd_req_complete() */ |
554 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
555 | drbd_insert_interval(&mdev->write_requests, &req->i); | ||
526 | 556 | ||
527 | /* NOTE | 557 | /* NOTE |
528 | * In case the req ended up on the transfer log before being | 558 | * In case the req ended up on the transfer log before being |
@@ -533,7 +563,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
533 | * | 563 | * |
534 | * _req_add_to_epoch(req); this has to be after the | 564 | * _req_add_to_epoch(req); this has to be after the |
535 | * _maybe_start_new_epoch(req); which happened in | 565 | * _maybe_start_new_epoch(req); which happened in |
536 | * drbd_make_request_common, because we now may set the bit | 566 | * __drbd_make_request, because we now may set the bit |
537 | * again ourselves to close the current epoch. | 567 | * again ourselves to close the current epoch. |
538 | * | 568 | * |
539 | * Add req to the (now) current epoch (barrier). */ | 569 | * Add req to the (now) current epoch (barrier). */ |
@@ -543,202 +573,187 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
543 | * hurting performance. */ | 573 | * hurting performance. */ |
544 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 574 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
545 | 575 | ||
546 | /* see drbd_make_request_common, | ||
547 | * just after it grabs the req_lock */ | ||
548 | D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); | ||
549 | |||
550 | req->epoch = mdev->newest_tle->br_number; | ||
551 | |||
552 | /* increment size of current epoch */ | ||
553 | mdev->newest_tle->n_writes++; | ||
554 | |||
555 | /* queue work item to send data */ | 576 | /* queue work item to send data */ |
556 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 577 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
557 | req->rq_state |= RQ_NET_QUEUED; | 578 | mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); |
558 | req->w.cb = w_send_dblock; | 579 | req->w.cb = w_send_dblock; |
559 | drbd_queue_work(&mdev->data.work, &req->w); | 580 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
560 | 581 | ||
561 | /* close the epoch, in case it outgrew the limit */ | 582 | /* close the epoch, in case it outgrew the limit */ |
562 | if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) | 583 | rcu_read_lock(); |
563 | queue_barrier(mdev); | 584 | nc = rcu_dereference(mdev->tconn->net_conf); |
585 | p = nc->max_epoch_size; | ||
586 | rcu_read_unlock(); | ||
587 | if (mdev->tconn->current_tle_writes >= p) | ||
588 | start_new_tl_epoch(mdev->tconn); | ||
564 | 589 | ||
565 | break; | 590 | break; |
566 | 591 | ||
567 | case queue_for_send_oos: | 592 | case QUEUE_FOR_SEND_OOS: |
568 | req->rq_state |= RQ_NET_QUEUED; | 593 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
569 | req->w.cb = w_send_oos; | 594 | req->w.cb = w_send_out_of_sync; |
570 | drbd_queue_work(&mdev->data.work, &req->w); | 595 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
571 | break; | 596 | break; |
572 | 597 | ||
573 | case read_retry_remote_canceled: | 598 | case READ_RETRY_REMOTE_CANCELED: |
574 | case send_canceled: | 599 | case SEND_CANCELED: |
575 | case send_failed: | 600 | case SEND_FAILED: |
576 | /* real cleanup will be done from tl_clear. just update flags | 601 | /* real cleanup will be done from tl_clear. just update flags |
577 | * so it is no longer marked as on the worker queue */ | 602 | * so it is no longer marked as on the worker queue */ |
578 | req->rq_state &= ~RQ_NET_QUEUED; | 603 | mod_rq_state(req, m, RQ_NET_QUEUED, 0); |
579 | /* if we did it right, tl_clear should be scheduled only after | ||
580 | * this, so this should not be necessary! */ | ||
581 | _req_may_be_done_not_susp(req, m); | ||
582 | break; | 604 | break; |
583 | 605 | ||
584 | case handed_over_to_network: | 606 | case HANDED_OVER_TO_NETWORK: |
585 | /* assert something? */ | 607 | /* assert something? */ |
586 | if (bio_data_dir(req->master_bio) == WRITE) | ||
587 | atomic_add(req->size>>9, &mdev->ap_in_flight); | ||
588 | |||
589 | if (bio_data_dir(req->master_bio) == WRITE && | 608 | if (bio_data_dir(req->master_bio) == WRITE && |
590 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | 609 | !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { |
591 | /* this is what is dangerous about protocol A: | 610 | /* this is what is dangerous about protocol A: |
592 | * pretend it was successfully written on the peer. */ | 611 | * pretend it was successfully written on the peer. */ |
593 | if (req->rq_state & RQ_NET_PENDING) { | 612 | if (req->rq_state & RQ_NET_PENDING) |
594 | dec_ap_pending(mdev); | 613 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
595 | req->rq_state &= ~RQ_NET_PENDING; | 614 | /* else: neg-ack was faster... */ |
596 | req->rq_state |= RQ_NET_OK; | ||
597 | } /* else: neg-ack was faster... */ | ||
598 | /* it is still not yet RQ_NET_DONE until the | 615 | /* it is still not yet RQ_NET_DONE until the |
599 | * corresponding epoch barrier got acked as well, | 616 | * corresponding epoch barrier got acked as well, |
600 | * so we know what to dirty on connection loss */ | 617 | * so we know what to dirty on connection loss */ |
601 | } | 618 | } |
602 | req->rq_state &= ~RQ_NET_QUEUED; | 619 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); |
603 | req->rq_state |= RQ_NET_SENT; | ||
604 | _req_may_be_done_not_susp(req, m); | ||
605 | break; | 620 | break; |
606 | 621 | ||
607 | case oos_handed_to_network: | 622 | case OOS_HANDED_TO_NETWORK: |
608 | /* Was not set PENDING, no longer QUEUED, so is now DONE | 623 | /* Was not set PENDING, no longer QUEUED, so is now DONE |
609 | * as far as this connection is concerned. */ | 624 | * as far as this connection is concerned. */ |
610 | req->rq_state &= ~RQ_NET_QUEUED; | 625 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); |
611 | req->rq_state |= RQ_NET_DONE; | ||
612 | _req_may_be_done_not_susp(req, m); | ||
613 | break; | 626 | break; |
614 | 627 | ||
615 | case connection_lost_while_pending: | 628 | case CONNECTION_LOST_WHILE_PENDING: |
616 | /* transfer log cleanup after connection loss */ | 629 | /* transfer log cleanup after connection loss */ |
617 | /* assert something? */ | 630 | mod_rq_state(req, m, |
618 | if (req->rq_state & RQ_NET_PENDING) | 631 | RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, |
619 | dec_ap_pending(mdev); | 632 | RQ_NET_DONE); |
620 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
621 | req->rq_state |= RQ_NET_DONE; | ||
622 | if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE) | ||
623 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
624 | |||
625 | /* if it is still queued, we may not complete it here. | ||
626 | * it will be canceled soon. */ | ||
627 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
628 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
629 | break; | 633 | break; |
630 | 634 | ||
631 | case conflict_discarded_by_peer: | 635 | case CONFLICT_RESOLVED: |
632 | /* for discarded conflicting writes of multiple primaries, | 636 | /* for superseded conflicting writes of multiple primaries, |
633 | * there is no need to keep anything in the tl, potential | 637 | * there is no need to keep anything in the tl, potential |
634 | * node crashes are covered by the activity log. */ | 638 | * node crashes are covered by the activity log. |
635 | if (what == conflict_discarded_by_peer) | 639 | * |
636 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | 640 | * If this request had been marked as RQ_POSTPONED before, |
637 | " DRBD is not a random data generator!\n", | 641 | * it will actually not be completed, but "restarted", |
638 | (unsigned long long)req->sector, req->size); | 642 | * resubmitted from the retry worker context. */ |
639 | req->rq_state |= RQ_NET_DONE; | 643 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
640 | /* fall through */ | 644 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
641 | case write_acked_by_peer_and_sis: | 645 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); |
642 | case write_acked_by_peer: | 646 | break; |
643 | if (what == write_acked_by_peer_and_sis) | 647 | |
644 | req->rq_state |= RQ_NET_SIS; | 648 | case WRITE_ACKED_BY_PEER_AND_SIS: |
649 | req->rq_state |= RQ_NET_SIS; | ||
650 | case WRITE_ACKED_BY_PEER: | ||
651 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); | ||
645 | /* protocol C; successfully written on peer. | 652 | /* protocol C; successfully written on peer. |
646 | * Nothing more to do here. | 653 | * Nothing more to do here. |
647 | * We want to keep the tl in place for all protocols, to cater | 654 | * We want to keep the tl in place for all protocols, to cater |
648 | * for volatile write-back caches on lower level devices. */ | 655 | * for volatile write-back caches on lower level devices. */ |
649 | 656 | ||
650 | case recv_acked_by_peer: | 657 | goto ack_common; |
658 | case RECV_ACKED_BY_PEER: | ||
659 | D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK); | ||
651 | /* protocol B; pretends to be successfully written on peer. | 660 | /* protocol B; pretends to be successfully written on peer. |
652 | * see also notes above in handed_over_to_network about | 661 | * see also notes above in HANDED_OVER_TO_NETWORK about |
653 | * protocol != C */ | 662 | * protocol != C */ |
654 | req->rq_state |= RQ_NET_OK; | 663 | ack_common: |
655 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 664 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
656 | dec_ap_pending(mdev); | 665 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
657 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
658 | req->rq_state &= ~RQ_NET_PENDING; | ||
659 | _req_may_be_done_not_susp(req, m); | ||
660 | break; | 666 | break; |
661 | 667 | ||
662 | case neg_acked: | 668 | case POSTPONE_WRITE: |
663 | /* assert something? */ | 669 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
664 | if (req->rq_state & RQ_NET_PENDING) { | 670 | /* If this node has already detected the write conflict, the |
665 | dec_ap_pending(mdev); | 671 | * worker will be waiting on misc_wait. Wake it up once this |
666 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 672 | * request has completed locally. |
667 | } | 673 | */ |
668 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | 674 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
675 | req->rq_state |= RQ_POSTPONED; | ||
676 | if (req->i.waiting) | ||
677 | wake_up(&mdev->misc_wait); | ||
678 | /* Do not clear RQ_NET_PENDING. This request will make further | ||
679 | * progress via restart_conflicting_writes() or | ||
680 | * fail_postponed_requests(). Hopefully. */ | ||
681 | break; | ||
669 | 682 | ||
670 | req->rq_state |= RQ_NET_DONE; | 683 | case NEG_ACKED: |
671 | _req_may_be_done_not_susp(req, m); | 684 | mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); |
672 | /* else: done by handed_over_to_network */ | ||
673 | break; | 685 | break; |
674 | 686 | ||
675 | case fail_frozen_disk_io: | 687 | case FAIL_FROZEN_DISK_IO: |
676 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 688 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
677 | break; | 689 | break; |
678 | 690 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); | |
679 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
680 | break; | 691 | break; |
681 | 692 | ||
682 | case restart_frozen_disk_io: | 693 | case RESTART_FROZEN_DISK_IO: |
683 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 694 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
684 | break; | 695 | break; |
685 | 696 | ||
686 | req->rq_state &= ~RQ_LOCAL_COMPLETED; | 697 | mod_rq_state(req, m, |
698 | RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, | ||
699 | RQ_LOCAL_PENDING); | ||
687 | 700 | ||
688 | rv = MR_READ; | 701 | rv = MR_READ; |
689 | if (bio_data_dir(req->master_bio) == WRITE) | 702 | if (bio_data_dir(req->master_bio) == WRITE) |
690 | rv = MR_WRITE; | 703 | rv = MR_WRITE; |
691 | 704 | ||
692 | get_ldev(mdev); | 705 | get_ldev(mdev); /* always succeeds in this call path */ |
693 | req->w.cb = w_restart_disk_io; | 706 | req->w.cb = w_restart_disk_io; |
694 | drbd_queue_work(&mdev->data.work, &req->w); | 707 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
695 | break; | 708 | break; |
696 | 709 | ||
697 | case resend: | 710 | case RESEND: |
698 | /* Simply complete (local only) READs. */ | 711 | /* Simply complete (local only) READs. */ |
699 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { | 712 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { |
700 | _req_may_be_done(req, m); | 713 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); |
701 | break; | 714 | break; |
702 | } | 715 | } |
703 | 716 | ||
704 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK | 717 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK |
705 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. | 718 | before the connection loss (B&C only); only P_BARRIER_ACK |
706 | Trowing them out of the TL here by pretending we got a BARRIER_ACK | 719 | (or the local completion?) was missing when we suspended. |
707 | We ensure that the peer was not rebooted */ | 720 | Throwing them out of the TL here by pretending we got a BARRIER_ACK. |
721 | During connection handshake, we ensure that the peer was not rebooted. */ | ||
708 | if (!(req->rq_state & RQ_NET_OK)) { | 722 | if (!(req->rq_state & RQ_NET_OK)) { |
723 | /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync? | ||
724 | * in that case we must not set RQ_NET_PENDING. */ | ||
725 | |||
726 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); | ||
709 | if (req->w.cb) { | 727 | if (req->w.cb) { |
710 | drbd_queue_work(&mdev->data.work, &req->w); | 728 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
711 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; | 729 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; |
712 | } | 730 | } /* else: FIXME can this happen? */ |
713 | break; | 731 | break; |
714 | } | 732 | } |
715 | /* else, fall through to barrier_acked */ | 733 | /* else, fall through to BARRIER_ACKED */ |
716 | 734 | ||
717 | case barrier_acked: | 735 | case BARRIER_ACKED: |
736 | /* barrier ack for READ requests does not make sense */ | ||
718 | if (!(req->rq_state & RQ_WRITE)) | 737 | if (!(req->rq_state & RQ_WRITE)) |
719 | break; | 738 | break; |
720 | 739 | ||
721 | if (req->rq_state & RQ_NET_PENDING) { | 740 | if (req->rq_state & RQ_NET_PENDING) { |
722 | /* barrier came in before all requests have been acked. | 741 | /* barrier came in before all requests were acked. |
723 | * this is bad, because if the connection is lost now, | 742 | * this is bad, because if the connection is lost now, |
724 | * we won't be able to clean them up... */ | 743 | * we won't be able to clean them up... */ |
725 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | 744 | dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n"); |
726 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
727 | } | 745 | } |
728 | if ((req->rq_state & RQ_NET_MASK) != 0) { | 746 | /* Allowed to complete requests, even while suspended. |
729 | req->rq_state |= RQ_NET_DONE; | 747 | * As this is called for all requests within a matching epoch, |
730 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | 748 | * we need to filter, and only set RQ_NET_DONE for those that |
731 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 749 | * have actually been on the wire. */ |
732 | } | 750 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, |
733 | _req_may_be_done(req, m); /* Allowed while state.susp */ | 751 | (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); |
734 | break; | 752 | break; |
735 | 753 | ||
736 | case data_received: | 754 | case DATA_RECEIVED: |
737 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 755 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
738 | dec_ap_pending(mdev); | 756 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); |
739 | req->rq_state &= ~RQ_NET_PENDING; | ||
740 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
741 | _req_may_be_done_not_susp(req, m); | ||
742 | break; | 757 | break; |
743 | }; | 758 | }; |
744 | 759 | ||
@@ -752,75 +767,265 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
752 | * since size may be bigger than BM_BLOCK_SIZE, | 767 | * since size may be bigger than BM_BLOCK_SIZE, |
753 | * we may need to check several bits. | 768 | * we may need to check several bits. |
754 | */ | 769 | */ |
755 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | 770 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) |
756 | { | 771 | { |
757 | unsigned long sbnr, ebnr; | 772 | unsigned long sbnr, ebnr; |
758 | sector_t esector, nr_sectors; | 773 | sector_t esector, nr_sectors; |
759 | 774 | ||
760 | if (mdev->state.disk == D_UP_TO_DATE) | 775 | if (mdev->state.disk == D_UP_TO_DATE) |
761 | return 1; | 776 | return true; |
762 | if (mdev->state.disk >= D_OUTDATED) | 777 | if (mdev->state.disk != D_INCONSISTENT) |
763 | return 0; | 778 | return false; |
764 | if (mdev->state.disk < D_INCONSISTENT) | ||
765 | return 0; | ||
766 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
767 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
768 | esector = sector + (size >> 9) - 1; | 779 | esector = sector + (size >> 9) - 1; |
769 | 780 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | |
770 | D_ASSERT(sector < nr_sectors); | 781 | D_ASSERT(sector < nr_sectors); |
771 | D_ASSERT(esector < nr_sectors); | 782 | D_ASSERT(esector < nr_sectors); |
772 | 783 | ||
773 | sbnr = BM_SECT_TO_BIT(sector); | 784 | sbnr = BM_SECT_TO_BIT(sector); |
774 | ebnr = BM_SECT_TO_BIT(esector); | 785 | ebnr = BM_SECT_TO_BIT(esector); |
775 | 786 | ||
776 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | 787 | return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0; |
788 | } | ||
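drbd_may_do_local_read() converts the request's sector range into a range of bitmap bits and reads locally only if none of them is set. A worked example of that mapping, assuming the usual 4 KiB bitmap granularity (8 sectors of 512 bytes per bit); if BM_BLOCK_SIZE differs, only the divisor changes:

        /* Worked example for the sector -> bitmap-bit mapping used above.
         * Plain userspace C with assumed constants, not DRBD code. */
        #include <stdio.h>

        #define SECTS_PER_BIT 8UL                       /* 4096 / 512 */
        #define SECT_TO_BIT(s) ((s) / SECTS_PER_BIT)

        int main(void)
        {
                unsigned long sector = 1000;            /* start of the request */
                unsigned size = 16384;                  /* 16 KiB request */
                unsigned long esector = sector + (size >> 9) - 1;       /* last sector */

                /* A 16 KiB read starting at sector 1000 covers sectors
                 * 1000..1031, i.e. bitmap bits 125..128.  A local read is
                 * allowed only if none of those bits is set. */
                printf("bits %lu..%lu must be clean\n",
                       SECT_TO_BIT(sector), SECT_TO_BIT(esector));
                return 0;
        }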
789 | |||
790 | static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector, | ||
791 | enum drbd_read_balancing rbm) | ||
792 | { | ||
793 | struct backing_dev_info *bdi; | ||
794 | int stripe_shift; | ||
795 | |||
796 | switch (rbm) { | ||
797 | case RB_CONGESTED_REMOTE: | ||
798 | bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info; | ||
799 | return bdi_read_congested(bdi); | ||
800 | case RB_LEAST_PENDING: | ||
801 | return atomic_read(&mdev->local_cnt) > | ||
802 | atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt); | ||
803 | case RB_32K_STRIPING: /* stripe_shift = 15 */ | ||
804 | case RB_64K_STRIPING: | ||
805 | case RB_128K_STRIPING: | ||
806 | case RB_256K_STRIPING: | ||
807 | case RB_512K_STRIPING: | ||
808 | case RB_1M_STRIPING: /* stripe_shift = 20 */ | ||
809 | stripe_shift = (rbm - RB_32K_STRIPING + 15); | ||
810 | return (sector >> (stripe_shift - 9)) & 1; | ||
811 | case RB_ROUND_ROBIN: | ||
812 | return test_and_change_bit(READ_BALANCE_RR, &mdev->flags); | ||
813 | case RB_PREFER_REMOTE: | ||
814 | return true; | ||
815 | case RB_PREFER_LOCAL: | ||
816 | default: | ||
817 | return false; | ||
818 | } | ||
819 | } | ||
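For the striping cases above, stripe_shift runs from 15 (32K) to 20 (1M), and bit (stripe_shift - 9) of the sector number picks local versus remote, so the decision alternates once per stripe. A small standalone check of the 64 KiB case (sketch only, not DRBD code):

        #include <stdio.h>

        int main(void)
        {
                int stripe_shift = 16;                  /* 64 KiB stripes */
                unsigned long long sectors[] = { 0, 127, 128, 255, 256 };

                for (int i = 0; i < 5; i++) {
                        int remote = (sectors[i] >> (stripe_shift - 9)) & 1;
                        printf("sector %llu -> %s\n", sectors[i],
                               remote ? "remote" : "local");
                }
                return 0;
        }

Sectors 0..127 (the first 64 KiB) stay local, 128..255 go remote, and so on, which is what spreads reads across the two stripes.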
820 | |||
821 | /* | ||
822 | * complete_conflicting_writes - wait for any conflicting write requests | ||
823 | * | ||
824 | * The write_requests tree contains all active write requests which we | ||
825 | * currently know about. Wait for any requests to complete which conflict with | ||
826 | * the new one. | ||
827 | * | ||
828 | * Only way out: remove the conflicting intervals from the tree. | ||
829 | */ | ||
830 | static void complete_conflicting_writes(struct drbd_request *req) | ||
831 | { | ||
832 | DEFINE_WAIT(wait); | ||
833 | struct drbd_conf *mdev = req->w.mdev; | ||
834 | struct drbd_interval *i; | ||
835 | sector_t sector = req->i.sector; | ||
836 | int size = req->i.size; | ||
837 | |||
838 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
839 | if (!i) | ||
840 | return; | ||
841 | |||
842 | for (;;) { | ||
843 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
844 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
845 | if (!i) | ||
846 | break; | ||
847 | /* Indicate to wake up device->misc_wait on progress. */ | ||
848 | i->waiting = true; | ||
849 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
850 | schedule(); | ||
851 | spin_lock_irq(&mdev->tconn->req_lock); | ||
852 | } | ||
853 | finish_wait(&mdev->misc_wait, &wait); | ||
777 | } | 854 | } |
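complete_conflicting_writes() above sleeps until no overlapping write interval is left, dropping req_lock while waiting and re-taking it before it checks again. A userspace analogue of that loop using a pthread condition variable (invented names; build with cc -pthread):

        #include <pthread.h>
        #include <stdio.h>
        #include <unistd.h>

        static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  misc_wait = PTHREAD_COND_INITIALIZER;
        static int conflicting_writes = 1;      /* protected by req_lock */

        /* Caller holds req_lock, like complete_conflicting_writes() above. */
        static void wait_for_conflicts_locked(void)
        {
                while (conflicting_writes > 0)
                        /* releases req_lock while sleeping and re-acquires it
                         * before returning, the same shape as the
                         * unlock/schedule/lock sequence in the kernel code */
                        pthread_cond_wait(&misc_wait, &req_lock);
        }

        static void *peer(void *arg)
        {
                sleep(1);                       /* the conflicting write completes */
                pthread_mutex_lock(&req_lock);
                conflicting_writes = 0;
                pthread_cond_broadcast(&misc_wait);
                pthread_mutex_unlock(&req_lock);
                return arg;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, peer, NULL);
                pthread_mutex_lock(&req_lock);
                wait_for_conflicts_locked();    /* returns once the conflict is gone */
                pthread_mutex_unlock(&req_lock);
                pthread_join(t, NULL);
                printf("no conflicting writes left, submit the new one\n");
                return 0;
        }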
778 | 855 | ||
856 | /* called within req_lock and rcu_read_lock() */ | ||
779 | static void maybe_pull_ahead(struct drbd_conf *mdev) | 857 | static void maybe_pull_ahead(struct drbd_conf *mdev) |
780 | { | 858 | { |
781 | int congested = 0; | 859 | struct drbd_tconn *tconn = mdev->tconn; |
860 | struct net_conf *nc; | ||
861 | bool congested = false; | ||
862 | enum drbd_on_congestion on_congestion; | ||
863 | |||
864 | nc = rcu_dereference(tconn->net_conf); | ||
865 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; | ||
866 | if (on_congestion == OC_BLOCK || | ||
867 | tconn->agreed_pro_version < 96) | ||
868 | return; | ||
782 | 869 | ||
783 | /* If I don't even have good local storage, we can not reasonably try | 870 | /* If I don't even have good local storage, we can not reasonably try |
784 | * to pull ahead of the peer. We also need the local reference to make | 871 | * to pull ahead of the peer. We also need the local reference to make |
785 | * sure mdev->act_log is there. | 872 | * sure mdev->act_log is there. |
786 | * Note: caller has to make sure that net_conf is there. | ||
787 | */ | 873 | */ |
788 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) | 874 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) |
789 | return; | 875 | return; |
790 | 876 | ||
791 | if (mdev->net_conf->cong_fill && | 877 | if (nc->cong_fill && |
792 | atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { | 878 | atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) { |
793 | dev_info(DEV, "Congestion-fill threshold reached\n"); | 879 | dev_info(DEV, "Congestion-fill threshold reached\n"); |
794 | congested = 1; | 880 | congested = true; |
795 | } | 881 | } |
796 | 882 | ||
797 | if (mdev->act_log->used >= mdev->net_conf->cong_extents) { | 883 | if (mdev->act_log->used >= nc->cong_extents) { |
798 | dev_info(DEV, "Congestion-extents threshold reached\n"); | 884 | dev_info(DEV, "Congestion-extents threshold reached\n"); |
799 | congested = 1; | 885 | congested = true; |
800 | } | 886 | } |
801 | 887 | ||
802 | if (congested) { | 888 | if (congested) { |
803 | queue_barrier(mdev); /* last barrier, after mirrored writes */ | 889 | /* start a new epoch for non-mirrored writes */ |
890 | start_new_tl_epoch(mdev->tconn); | ||
804 | 891 | ||
805 | if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) | 892 | if (on_congestion == OC_PULL_AHEAD) |
806 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); | 893 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); |
807 | else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ | 894 | else /*nc->on_congestion == OC_DISCONNECT */ |
808 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); | 895 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); |
809 | } | 896 | } |
810 | put_ldev(mdev); | 897 | put_ldev(mdev); |
811 | } | 898 | } |
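maybe_pull_ahead() above declares congestion when either the amount of data in flight reaches cong-fill or the number of hot activity-log extents reaches cong-extents, and only then goes Ahead or disconnects. The sketch below isolates those two threshold checks; the numbers are made-up examples, not DRBD defaults:

        #include <stdbool.h>
        #include <stdio.h>

        struct cong_conf {
                unsigned cong_fill;     /* sectors allowed in flight, 0 = disabled */
                unsigned cong_extents;  /* activity-log extents allowed to be hot */
        };

        static bool congested(const struct cong_conf *nc,
                              unsigned ap_in_flight, unsigned al_used)
        {
                if (nc->cong_fill && ap_in_flight >= nc->cong_fill)
                        return true;    /* congestion-fill threshold reached */
                if (al_used >= nc->cong_extents)
                        return true;    /* congestion-extents threshold reached */
                return false;
        }

        int main(void)
        {
                struct cong_conf nc = { .cong_fill = 2048, .cong_extents = 127 };

                printf("%d\n", congested(&nc, 4096, 10));       /* 1: fill exceeded */
                printf("%d\n", congested(&nc,  100, 10));       /* 0: below both */
                return 0;
        }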
812 | 899 | ||
813 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | 900 | /* If this returns false, and req->private_bio is still set, |
901 | * this should be submitted locally. | ||
902 | * | ||
903 | * If it returns false, but req->private_bio is not set, | ||
904 | * we do not have access to good data :( | ||
905 | * | ||
906 | * Otherwise, this destroys req->private_bio, if any, | ||
907 | * and returns true. | ||
908 | */ | ||
909 | static bool do_remote_read(struct drbd_request *req) | ||
910 | { | ||
911 | struct drbd_conf *mdev = req->w.mdev; | ||
912 | enum drbd_read_balancing rbm; | ||
913 | |||
914 | if (req->private_bio) { | ||
915 | if (!drbd_may_do_local_read(mdev, | ||
916 | req->i.sector, req->i.size)) { | ||
917 | bio_put(req->private_bio); | ||
918 | req->private_bio = NULL; | ||
919 | put_ldev(mdev); | ||
920 | } | ||
921 | } | ||
922 | |||
923 | if (mdev->state.pdsk != D_UP_TO_DATE) | ||
924 | return false; | ||
925 | |||
926 | if (req->private_bio == NULL) | ||
927 | return true; | ||
928 | |||
929 | /* TODO: improve read balancing decisions, take into account drbd | ||
930 | * protocol, pending requests etc. */ | ||
931 | |||
932 | rcu_read_lock(); | ||
933 | rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing; | ||
934 | rcu_read_unlock(); | ||
935 | |||
936 | if (rbm == RB_PREFER_LOCAL && req->private_bio) | ||
937 | return false; /* submit locally */ | ||
938 | |||
939 | if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) { | ||
940 | if (req->private_bio) { | ||
941 | bio_put(req->private_bio); | ||
942 | req->private_bio = NULL; | ||
943 | put_ldev(mdev); | ||
944 | } | ||
945 | return true; | ||
946 | } | ||
947 | |||
948 | return false; | ||
949 | } | ||
950 | |||
951 | /* returns number of connections (== 1, for drbd 8.4) | ||
952 | * expected to actually write this data, | ||
953 | * which does NOT include those that we are L_AHEAD for. */ | ||
954 | static int drbd_process_write_request(struct drbd_request *req) | ||
955 | { | ||
956 | struct drbd_conf *mdev = req->w.mdev; | ||
957 | int remote, send_oos; | ||
958 | |||
959 | rcu_read_lock(); | ||
960 | remote = drbd_should_do_remote(mdev->state); | ||
961 | if (remote) { | ||
962 | maybe_pull_ahead(mdev); | ||
963 | remote = drbd_should_do_remote(mdev->state); | ||
964 | } | ||
965 | send_oos = drbd_should_send_out_of_sync(mdev->state); | ||
966 | rcu_read_unlock(); | ||
967 | |||
968 | /* Need to replicate writes. Unless it is an empty flush, | ||
969 | * which is better mapped to a DRBD P_BARRIER packet, | ||
970 | * also for drbd wire protocol compatibility reasons. | ||
971 | * If this was a flush, just start a new epoch. | ||
972 | * Unless the current epoch was empty anyways, or we are not currently | ||
973 | * replicating, in which case there is no point. */ | ||
974 | if (unlikely(req->i.size == 0)) { | ||
975 | /* The only size==0 bios we expect are empty flushes. */ | ||
976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); | ||
977 | if (remote) | ||
978 | start_new_tl_epoch(mdev->tconn); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | if (!remote && !send_oos) | ||
983 | return 0; | ||
984 | |||
985 | D_ASSERT(!(remote && send_oos)); | ||
986 | |||
987 | if (remote) { | ||
988 | _req_mod(req, TO_BE_SENT); | ||
989 | _req_mod(req, QUEUE_FOR_NET_WRITE); | ||
990 | } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size)) | ||
991 | _req_mod(req, QUEUE_FOR_SEND_OOS); | ||
992 | |||
993 | return remote; | ||
994 | } | ||
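drbd_process_write_request() above boils down to a small decision table: empty flushes at most open a new epoch, replicated writes are queued for the peer, and writes we cannot replicate right now are only marked out of sync. A reduced standalone version of that decision (it omits the drbd_set_out_of_sync() result check and uses invented names):

        #include <stdio.h>

        enum action { NOTHING, NEW_EPOCH_ONLY, QUEUE_FOR_PEER, SEND_OUT_OF_SYNC };

        static enum action process_write(unsigned size, int remote, int send_oos)
        {
                if (size == 0)                  /* empty flush */
                        return remote ? NEW_EPOCH_ONLY : NOTHING;
                if (remote)
                        return QUEUE_FOR_PEER;  /* TO_BE_SENT + QUEUE_FOR_NET_WRITE */
                if (send_oos)
                        return SEND_OUT_OF_SYNC;        /* QUEUE_FOR_SEND_OOS */
                return NOTHING;                 /* local only */
        }

        int main(void)
        {
                printf("%d %d %d %d\n",
                       process_write(0, 1, 0),          /* flush while connected */
                       process_write(4096, 1, 0),       /* normal replicated write */
                       process_write(4096, 0, 1),       /* peer is Ahead/Behind */
                       process_write(4096, 0, 0));      /* standalone */
                return 0;
        }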
995 | |||
996 | static void | ||
997 | drbd_submit_req_private_bio(struct drbd_request *req) | ||
998 | { | ||
999 | struct drbd_conf *mdev = req->w.mdev; | ||
1000 | struct bio *bio = req->private_bio; | ||
1001 | const int rw = bio_rw(bio); | ||
1002 | |||
1003 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1004 | |||
1005 | /* State may have changed since we grabbed our reference on the | ||
1006 | * ->ldev member. Double check, and short-circuit to endio. | ||
1007 | * In case the last activity log transaction failed to get on | ||
1008 | * stable storage, and this is a WRITE, we may not even submit | ||
1009 | * this bio. */ | ||
1010 | if (get_ldev(mdev)) { | ||
1011 | if (drbd_insert_fault(mdev, | ||
1012 | rw == WRITE ? DRBD_FAULT_DT_WR | ||
1013 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1014 | : DRBD_FAULT_DT_RA)) | ||
1015 | bio_endio(bio, -EIO); | ||
1016 | else | ||
1017 | generic_make_request(bio); | ||
1018 | put_ldev(mdev); | ||
1019 | } else | ||
1020 | bio_endio(bio, -EIO); | ||
1021 | } | ||
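drbd_submit_req_private_bio() above re-checks that the backing device is still attached before submitting, and may complete the bio with -EIO instead, either because the disk went away or because fault injection asked for it. A loose userspace model of that guard, with invented names and a simple random fault rate standing in for drbd_insert_fault():

        #include <stdbool.h>
        #include <stdio.h>
        #include <stdlib.h>

        static bool have_ldev = true;           /* models get_ldev() succeeding */
        static int fault_rate_percent = 10;     /* models the configured fault rate */

        static bool insert_fault(void)
        {
                return (rand() % 100) < fault_rate_percent;
        }

        static void submit_private_bio(unsigned long long sector)
        {
                if (!have_ldev) {               /* disk detached since we queued */
                        printf("sector %llu: complete with -EIO\n", sector);
                        return;
                }
                if (insert_fault())             /* simulated media error */
                        printf("sector %llu: injected fault, -EIO\n", sector);
                else
                        printf("sector %llu: submitted to backing device\n", sector);
        }

        int main(void)
        {
                for (unsigned long long s = 0; s < 8; s++)
                        submit_private_bio(s * 8);
                return 0;
        }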
1022 | |||
1023 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
814 | { | 1024 | { |
815 | const int rw = bio_rw(bio); | 1025 | const int rw = bio_rw(bio); |
816 | const int size = bio->bi_size; | 1026 | struct bio_and_error m = { NULL, }; |
817 | const sector_t sector = bio->bi_sector; | ||
818 | struct drbd_tl_epoch *b = NULL; | ||
819 | struct drbd_request *req; | 1027 | struct drbd_request *req; |
820 | int local, remote, send_oos = 0; | 1028 | bool no_remote = false; |
821 | int err = -EIO; | ||
822 | int ret = 0; | ||
823 | union drbd_state s; | ||
824 | 1029 | ||
825 | /* allocate outside of all locks; */ | 1030 | /* allocate outside of all locks; */ |
826 | req = drbd_req_new(mdev, bio); | 1031 | req = drbd_req_new(mdev, bio); |
@@ -830,55 +1035,14 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
830 | * if user cannot handle io errors, that's not our business. */ | 1035 | * if user cannot handle io errors, that's not our business. */ |
831 | dev_err(DEV, "could not kmalloc() req\n"); | 1036 | dev_err(DEV, "could not kmalloc() req\n"); |
832 | bio_endio(bio, -ENOMEM); | 1037 | bio_endio(bio, -ENOMEM); |
833 | return 0; | 1038 | return; |
834 | } | 1039 | } |
835 | req->start_time = start_time; | 1040 | req->start_time = start_time; |
836 | 1041 | ||
837 | local = get_ldev(mdev); | 1042 | if (!get_ldev(mdev)) { |
838 | if (!local) { | 1043 | bio_put(req->private_bio); |
839 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
840 | req->private_bio = NULL; | 1044 | req->private_bio = NULL; |
841 | } | 1045 | } |
842 | if (rw == WRITE) { | ||
843 | /* Need to replicate writes. Unless it is an empty flush, | ||
844 | * which is better mapped to a DRBD P_BARRIER packet, | ||
845 | * also for drbd wire protocol compatibility reasons. */ | ||
846 | if (unlikely(size == 0)) { | ||
847 | /* The only size==0 bios we expect are empty flushes. */ | ||
848 | D_ASSERT(bio->bi_rw & REQ_FLUSH); | ||
849 | remote = 0; | ||
850 | } else | ||
851 | remote = 1; | ||
852 | } else { | ||
853 | /* READ || READA */ | ||
854 | if (local) { | ||
855 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
856 | /* we could kick the syncer to | ||
857 | * sync this extent asap, wait for | ||
858 | * it, then continue locally. | ||
859 | * Or just issue the request remotely. | ||
860 | */ | ||
861 | local = 0; | ||
862 | bio_put(req->private_bio); | ||
863 | req->private_bio = NULL; | ||
864 | put_ldev(mdev); | ||
865 | } | ||
866 | } | ||
867 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
868 | } | ||
869 | |||
870 | /* If we have a disk, but a READA request is mapped to remote, | ||
871 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
872 | * Just fail that READA request right here. | ||
873 | * | ||
874 | * THINK: maybe fail all READA when not local? | ||
875 | * or make this configurable... | ||
876 | * if network is slow, READA won't do any good. | ||
877 | */ | ||
878 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
879 | err = -EWOULDBLOCK; | ||
880 | goto fail_and_free_req; | ||
881 | } | ||
882 | 1046 | ||
883 | /* For WRITES going to the local disk, grab a reference on the target | 1047 | /* For WRITES going to the local disk, grab a reference on the target |
884 | * extent. This waits for any resync activity in the corresponding | 1048 | * extent. This waits for any resync activity in the corresponding |
@@ -887,348 +1051,131 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
887 | * of transactional on-disk meta data updates. | 1051 | * of transactional on-disk meta data updates. |
888 | * Empty flushes don't need to go into the activity log, they can only | 1052 | * Empty flushes don't need to go into the activity log, they can only |
889 | * flush data for pending writes which are already in there. */ | 1053 | * flush data for pending writes which are already in there. */ |
890 | if (rw == WRITE && local && size | 1054 | if (rw == WRITE && req->private_bio && req->i.size |
891 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 1055 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { |
892 | req->rq_state |= RQ_IN_ACT_LOG; | 1056 | req->rq_state |= RQ_IN_ACT_LOG; |
893 | drbd_al_begin_io(mdev, sector); | 1057 | drbd_al_begin_io(mdev, &req->i); |
894 | } | ||
895 | |||
896 | s = mdev->state; | ||
897 | remote = remote && drbd_should_do_remote(s); | ||
898 | send_oos = rw == WRITE && drbd_should_send_oos(s); | ||
899 | D_ASSERT(!(remote && send_oos)); | ||
900 | |||
901 | if (!(local || remote) && !is_susp(mdev->state)) { | ||
902 | if (__ratelimit(&drbd_ratelimit_state)) | ||
903 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
904 | goto fail_free_complete; | ||
905 | } | 1058 | } |
906 | 1059 | ||
907 | /* For WRITE request, we have to make sure that we have an | 1060 | spin_lock_irq(&mdev->tconn->req_lock); |
908 | * unused_spare_tle, in case we need to start a new epoch. | 1061 | if (rw == WRITE) { |
909 | * I try to be smart and avoid to pre-allocate always "just in case", | 1062 | /* This may temporarily give up the req_lock, |
910 | * but there is a race between testing the bit and pointer outside the | 1063 | * but will re-acquire it before it returns here. |
911 | * spinlock, and grabbing the spinlock. | 1064 | * Needs to be before the check on drbd_suspended() */ |
912 | * if we lost that race, we retry. */ | 1065 | complete_conflicting_writes(req); |
913 | if (rw == WRITE && (remote || send_oos) && | ||
914 | mdev->unused_spare_tle == NULL && | ||
915 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
916 | allocate_barrier: | ||
917 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
918 | if (!b) { | ||
919 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
920 | err = -ENOMEM; | ||
921 | goto fail_free_complete; | ||
922 | } | ||
923 | } | 1066 | } |
924 | 1067 | ||
925 | /* GOOD, everything prepared, grab the spin_lock */ | 1068 | /* no more giving up req_lock from now on! */ |
926 | spin_lock_irq(&mdev->req_lock); | ||
927 | |||
928 | if (is_susp(mdev->state)) { | ||
929 | /* If we got suspended, use the retry mechanism of | ||
930 | drbd_make_request() to restart processing of this | ||
931 | bio. In the next call to drbd_make_request | ||
932 | we sleep in inc_ap_bio() */ | ||
933 | ret = 1; | ||
934 | spin_unlock_irq(&mdev->req_lock); | ||
935 | goto fail_free_complete; | ||
936 | } | ||
937 | 1069 | ||
938 | if (remote || send_oos) { | 1070 | if (drbd_suspended(mdev)) { |
939 | remote = drbd_should_do_remote(mdev->state); | 1071 | /* push back and retry: */ |
940 | send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); | 1072 | req->rq_state |= RQ_POSTPONED; |
941 | D_ASSERT(!(remote && send_oos)); | 1073 | if (req->private_bio) { |
942 | 1074 | bio_put(req->private_bio); | |
943 | if (!(remote || send_oos)) | 1075 | req->private_bio = NULL; |
944 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | 1076 | put_ldev(mdev); |
945 | if (!(local || remote)) { | ||
946 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
947 | spin_unlock_irq(&mdev->req_lock); | ||
948 | goto fail_free_complete; | ||
949 | } | 1077 | } |
1078 | goto out; | ||
950 | } | 1079 | } |
951 | 1080 | ||
952 | if (b && mdev->unused_spare_tle == NULL) { | ||
953 | mdev->unused_spare_tle = b; | ||
954 | b = NULL; | ||
955 | } | ||
956 | if (rw == WRITE && (remote || send_oos) && | ||
957 | mdev->unused_spare_tle == NULL && | ||
958 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
959 | /* someone closed the current epoch | ||
960 | * while we were grabbing the spinlock */ | ||
961 | spin_unlock_irq(&mdev->req_lock); | ||
962 | goto allocate_barrier; | ||
963 | } | ||
964 | |||
965 | |||
966 | /* Update disk stats */ | 1081 | /* Update disk stats */ |
967 | _drbd_start_io_acct(mdev, req, bio); | 1082 | _drbd_start_io_acct(mdev, req, bio); |
968 | 1083 | ||
969 | /* _maybe_start_new_epoch(mdev); | 1084 | /* We fail READ/READA early, if we can not serve it. |
970 | * If we need to generate a write barrier packet, we have to add the | 1085 | * We must do this before req is registered on any lists. |
971 | * new epoch (barrier) object, and queue the barrier packet for sending, | 1086 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ |
972 | * and queue the req's data after it _within the same lock_, otherwise | 1087 | if (rw != WRITE) { |
973 | * we have race conditions were the reorder domains could be mixed up. | 1088 | if (!do_remote_read(req) && !req->private_bio) |
974 | * | 1089 | goto nodata; |
975 | * Even read requests may start a new epoch and queue the corresponding | ||
976 | * barrier packet. To get the write ordering right, we only have to | ||
977 | * make sure that, if this is a write request and it triggered a | ||
978 | * barrier packet, this request is queued within the same spinlock. */ | ||
979 | if ((remote || send_oos) && mdev->unused_spare_tle && | ||
980 | test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
981 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
982 | mdev->unused_spare_tle = NULL; | ||
983 | } else { | ||
984 | D_ASSERT(!(remote && rw == WRITE && | ||
985 | test_bit(CREATE_BARRIER, &mdev->flags))); | ||
986 | } | 1090 | } |
987 | 1091 | ||
988 | /* NOTE | 1092 | /* which transfer log epoch does this belong to? */ |
989 | * Actually, 'local' may be wrong here already, since we may have failed | 1093 | req->epoch = atomic_read(&mdev->tconn->current_tle_nr); |
990 | * to write to the meta data, and may become wrong anytime because of | ||
991 | * local io-error for some other request, which would lead to us | ||
992 | * "detaching" the local disk. | ||
993 | * | ||
994 | * 'remote' may become wrong any time because the network could fail. | ||
995 | * | ||
996 | * This is a harmless race condition, though, since it is handled | ||
997 | * correctly at the appropriate places; so it just defers the failure | ||
998 | * of the respective operation. | ||
999 | */ | ||
1000 | |||
1001 | /* mark them early for readability. | ||
1002 | * this just sets some state flags. */ | ||
1003 | if (remote) | ||
1004 | _req_mod(req, to_be_send); | ||
1005 | if (local) | ||
1006 | _req_mod(req, to_be_submitted); | ||
1007 | |||
1008 | /* check this request on the collision detection hash tables. | ||
1009 | * if we have a conflict, just complete it here. | ||
1010 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
1011 | if (rw == WRITE && _req_conflicts(req)) | ||
1012 | goto fail_conflicting; | ||
1013 | 1094 | ||
1014 | /* no point in adding empty flushes to the transfer log, | 1095 | /* no point in adding empty flushes to the transfer log, |
1015 | * they are mapped to drbd barriers already. */ | 1096 | * they are mapped to drbd barriers already. */ |
1016 | if (likely(size!=0)) | 1097 | if (likely(req->i.size!=0)) { |
1017 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | 1098 | if (rw == WRITE) |
1099 | mdev->tconn->current_tle_writes++; | ||
1018 | 1100 | ||
1019 | /* NOTE remote first: to get the concurrent write detection right, | 1101 | list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log); |
1020 | * we must register the request before start of local IO. */ | ||
1021 | if (remote) { | ||
1022 | /* either WRITE and C_CONNECTED, | ||
1023 | * or READ, and no local disk, | ||
1024 | * or READ, but not in sync. | ||
1025 | */ | ||
1026 | _req_mod(req, (rw == WRITE) | ||
1027 | ? queue_for_net_write | ||
1028 | : queue_for_net_read); | ||
1029 | } | 1102 | } |
1030 | if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) | ||
1031 | _req_mod(req, queue_for_send_oos); | ||
1032 | 1103 | ||
1033 | if (remote && | 1104 | if (rw == WRITE) { |
1034 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) | 1105 | if (!drbd_process_write_request(req)) |
1035 | maybe_pull_ahead(mdev); | 1106 | no_remote = true; |
1036 | 1107 | } else { | |
1037 | /* If this was a flush, queue a drbd barrier/start a new epoch. | 1108 | /* We either have a private_bio, or we can read from remote. |
1038 | * Unless the current epoch was empty anyways, or we are not currently | 1109 | * Otherwise we had done the goto nodata above. */ |
1039 | * replicating, in which case there is no point. */ | 1110 | if (req->private_bio == NULL) { |
1040 | if (unlikely(bio->bi_rw & REQ_FLUSH) | 1111 | _req_mod(req, TO_BE_SENT); |
1041 | && mdev->newest_tle->n_writes | 1112 | _req_mod(req, QUEUE_FOR_NET_READ); |
1042 | && drbd_should_do_remote(mdev->state)) | ||
1043 | queue_barrier(mdev); | ||
1044 | |||
1045 | spin_unlock_irq(&mdev->req_lock); | ||
1046 | kfree(b); /* if someone else has beaten us to it... */ | ||
1047 | |||
1048 | if (local) { | ||
1049 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1050 | |||
1051 | /* State may have changed since we grabbed our reference on the | ||
1052 | * mdev->ldev member. Double check, and short-circuit to endio. | ||
1053 | * In case the last activity log transaction failed to get on | ||
1054 | * stable storage, and this is a WRITE, we may not even submit | ||
1055 | * this bio. */ | ||
1056 | if (get_ldev(mdev)) { | ||
1057 | if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
1058 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1059 | : DRBD_FAULT_DT_RA)) | ||
1060 | bio_endio(req->private_bio, -EIO); | ||
1061 | else | ||
1062 | generic_make_request(req->private_bio); | ||
1063 | put_ldev(mdev); | ||
1064 | } else | 1113 | } else |
1065 | bio_endio(req->private_bio, -EIO); | 1114 | no_remote = true; |
1066 | } | 1115 | } |
1067 | 1116 | ||
1068 | return 0; | 1117 | if (req->private_bio) { |
1069 | 1118 | /* needs to be marked within the same spinlock */ | |
1070 | fail_conflicting: | 1119 | _req_mod(req, TO_BE_SUBMITTED); |
1071 | /* this is a conflicting request. | 1120 | /* but we need to give up the spinlock to submit */ |
1072 | * even though it may have been only _partially_ | 1121 | spin_unlock_irq(&mdev->tconn->req_lock); |
1073 | * overlapping with one of the currently pending requests, | 1122 | drbd_submit_req_private_bio(req); |
1074 | * without even submitting or sending it, we will | 1123 | spin_lock_irq(&mdev->tconn->req_lock); |
1075 | * pretend that it was successfully served right now. | 1124 | } else if (no_remote) { |
1076 | */ | 1125 | nodata: |
1077 | _drbd_end_io_acct(mdev, req); | 1126 | if (__ratelimit(&drbd_ratelimit_state)) |
1078 | spin_unlock_irq(&mdev->req_lock); | 1127 | dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", |
1079 | if (remote) | 1128 | (unsigned long long)req->i.sector, req->i.size >> 9); |
1080 | dec_ap_pending(mdev); | 1129 | /* A write may have been queued for send_oos, however. |
1081 | /* THINK: do we want to fail it (-EIO), or pretend success? | 1130 | * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ |
1082 | * this pretends success. */ | ||
1083 | err = 0; | ||
1084 | |||
1085 | fail_free_complete: | ||
1086 | if (req->rq_state & RQ_IN_ACT_LOG) | ||
1087 | drbd_al_complete_io(mdev, sector); | ||
1088 | fail_and_free_req: | ||
1089 | if (local) { | ||
1090 | bio_put(req->private_bio); | ||
1091 | req->private_bio = NULL; | ||
1092 | put_ldev(mdev); | ||
1093 | } | 1131 | } |
1094 | if (!ret) | ||
1095 | bio_endio(bio, err); | ||
1096 | |||
1097 | drbd_req_free(req); | ||
1098 | dec_ap_bio(mdev); | ||
1099 | kfree(b); | ||
1100 | |||
1101 | return ret; | ||
1102 | } | ||
1103 | 1132 | ||
1104 | /* helper function for drbd_make_request | 1133 | out: |
1105 | * if we can determine just by the mdev (state) that this request will fail, | 1134 | if (drbd_req_put_completion_ref(req, &m, 1)) |
1106 | * return 1 | 1135 | kref_put(&req->kref, drbd_req_destroy); |
1107 | * otherwise return 0 | 1136 | spin_unlock_irq(&mdev->tconn->req_lock); |
1108 | */ | ||
1109 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
1110 | { | ||
1111 | if (mdev->state.role != R_PRIMARY && | ||
1112 | (!allow_oos || is_write)) { | ||
1113 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1114 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
1115 | "since we are not in Primary state, " | ||
1116 | "we cannot allow this\n", | ||
1117 | current->comm, current->pid, | ||
1118 | is_write ? "WRITE" : "READ"); | ||
1119 | } | ||
1120 | return 1; | ||
1121 | } | ||
1122 | 1137 | ||
1123 | return 0; | 1138 | if (m.bio) |
1139 | complete_master_bio(mdev, &m); | ||
1140 | return; | ||
1124 | } | 1141 | } |
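The tail of __drbd_make_request() above keeps two separate counters per request: completion_ref counts the reasons the master bio cannot be completed yet, while the kref counts owners of the request object itself; only when drbd_req_put_completion_ref() drops the last completion reference does the caller also drop the matching kref. A minimal stand-alone sketch of that pattern, with hypothetical names and deliberately simplified rules (not the driver's real helpers):

#include <stdio.h>
#include <stdlib.h>

/* hypothetical, heavily simplified request; not struct drbd_request */
struct sketch_req {
	int obj_ref;		/* owners of the object itself (the kref role) */
	int completion_ref;	/* reasons the master bio cannot complete yet */
};

static struct sketch_req *req_new(void)
{
	struct sketch_req *r = calloc(1, sizeof(*r));
	r->obj_ref = 1;		/* the submitter */
	r->completion_ref = 1;	/* dropped again at the end of the submit path */
	return r;
}

/* one call per asynchronous path the request is queued on */
static void req_get(struct sketch_req *r)
{
	r->obj_ref++;
	r->completion_ref++;
}

static void req_put(struct sketch_req *r)
{
	if (--r->completion_ref == 0)
		printf("complete master bio\n");	/* complete_master_bio() role */
	if (--r->obj_ref == 0) {
		printf("free request\n");		/* drbd_req_destroy() role */
		free(r);
	}
}

int main(void)
{
	struct sketch_req *r = req_new();

	req_get(r);	/* queued for local submission */
	req_get(r);	/* queued for the peer */

	req_put(r);	/* end of the submit path (the "out:" label above) */
	req_put(r);	/* local I/O completed */
	req_put(r);	/* peer acknowledged */
	return 0;
}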
1125 | 1142 | ||
1126 | void drbd_make_request(struct request_queue *q, struct bio *bio) | 1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) |
1127 | { | 1144 | { |
1128 | unsigned int s_enr, e_enr; | ||
1129 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1145 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1130 | unsigned long start_time; | 1146 | unsigned long start_time; |
1131 | 1147 | ||
1132 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1133 | bio_endio(bio, -EPERM); | ||
1134 | return; | ||
1135 | } | ||
1136 | |||
1137 | start_time = jiffies; | 1148 | start_time = jiffies; |
1138 | 1149 | ||
1139 | /* | 1150 | /* |
1140 | * what we "blindly" assume: | 1151 | * what we "blindly" assume: |
1141 | */ | 1152 | */ |
1142 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | 1153 | D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); |
1143 | |||
1144 | /* to make some things easier, force alignment of requests within the | ||
1145 | * granularity of our hash tables */ | ||
1146 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1147 | e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; | ||
1148 | |||
1149 | if (likely(s_enr == e_enr)) { | ||
1150 | do { | ||
1151 | inc_ap_bio(mdev, 1); | ||
1152 | } while (drbd_make_request_common(mdev, bio, start_time)); | ||
1153 | return; | ||
1154 | } | ||
1155 | |||
1156 | /* can this bio be split generically? | ||
1157 | * Maybe add our own split-arbitrary-bios function. */ | ||
1158 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) { | ||
1159 | /* rather error out here than BUG in bio_split */ | ||
1160 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1161 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1162 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1163 | (unsigned long long)bio->bi_sector); | ||
1164 | bio_endio(bio, -EINVAL); | ||
1165 | } else { | ||
1166 | /* This bio crosses some boundary, so we have to split it. */ | ||
1167 | struct bio_pair *bp; | ||
1168 | /* works for the "do not cross hash slot boundaries" case | ||
1169 | * e.g. sector 262269, size 4096 | ||
1170 | * s_enr = 262269 >> 6 = 4097 | ||
1171 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1172 | * HT_SHIFT = 6 | ||
1173 | * sps = 64, mask = 63 | ||
1174 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1175 | */ | ||
1176 | const sector_t sect = bio->bi_sector; | ||
1177 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1178 | const int mask = sps - 1; | ||
1179 | const sector_t first_sectors = sps - (sect & mask); | ||
1180 | bp = bio_split(bio, first_sectors); | ||
1181 | 1154 | ||
1182 | /* we need to get a "reference count" (ap_bio_cnt) | 1155 | inc_ap_bio(mdev); |
1183 | * to avoid races with the disconnect/reconnect/suspend code. | 1156 | __drbd_make_request(mdev, bio, start_time); |
1184 | * In case we need to split the bio here, we need to get three references | ||
1185 | * atomically, otherwise we might deadlock when trying to submit the | ||
1186 | * second one! */ | ||
1187 | inc_ap_bio(mdev, 3); | ||
1188 | |||
1189 | D_ASSERT(e_enr == s_enr + 1); | ||
1190 | |||
1191 | while (drbd_make_request_common(mdev, &bp->bio1, start_time)) | ||
1192 | inc_ap_bio(mdev, 1); | ||
1193 | |||
1194 | while (drbd_make_request_common(mdev, &bp->bio2, start_time)) | ||
1195 | inc_ap_bio(mdev, 1); | ||
1196 | |||
1197 | dec_ap_bio(mdev); | ||
1198 | |||
1199 | bio_pair_release(bp); | ||
1200 | } | ||
1201 | } | 1157 | } |
1202 | 1158 | ||
1203 | /* This is called by bio_add_page(). With this function we reduce | 1159 | /* This is called by bio_add_page(). |
1204 | * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs | 1160 | * |
1205 | * units (was AL_EXTENTs). | 1161 | * q->max_hw_sectors and other global limits are already enforced there. |
1206 | * | 1162 | * |
1207 | * we do the calculation within the lower 32bit of the byte offsets, | 1163 | * We need to call down to our lower level device, |
1208 | * since we don't care for actual offset, but only check whether it | 1164 | * in case it has special restrictions. |
1209 | * would cross "activity log extent" boundaries. | 1165 | * |
1166 | * We also may need to enforce configured max-bio-bvecs limits. | ||
1210 | * | 1167 | * |
1211 | * As long as the BIO is empty we have to allow at least one bvec, | 1168 | * As long as the BIO is empty we have to allow at least one bvec, |
1212 | * regardless of size and offset. so the resulting bio may still | 1169 | * regardless of size and offset, so no need to ask lower levels. |
1213 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1214 | * drbd_make_request. | ||
1215 | */ | 1170 | */ |
1216 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | 1171 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) |
1217 | { | 1172 | { |
1218 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1173 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1219 | unsigned int bio_offset = | ||
1220 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1221 | unsigned int bio_size = bvm->bi_size; | 1174 | unsigned int bio_size = bvm->bi_size; |
1222 | int limit, backing_limit; | 1175 | int limit = DRBD_MAX_BIO_SIZE; |
1223 | 1176 | int backing_limit; | |
1224 | limit = DRBD_MAX_BIO_SIZE | 1177 | |
1225 | - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size); | 1178 | if (bio_size && get_ldev(mdev)) { |
1226 | if (limit < 0) | ||
1227 | limit = 0; | ||
1228 | if (bio_size == 0) { | ||
1229 | if (limit <= bvec->bv_len) | ||
1230 | limit = bvec->bv_len; | ||
1231 | } else if (limit && get_ldev(mdev)) { | ||
1232 | struct request_queue * const b = | 1179 | struct request_queue * const b = |
1233 | mdev->ldev->backing_bdev->bd_disk->queue; | 1180 | mdev->ldev->backing_bdev->bd_disk->queue; |
1234 | if (b->merge_bvec_fn) { | 1181 | if (b->merge_bvec_fn) { |
@@ -1240,24 +1187,38 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1240 | return limit; | 1187 | return limit; |
1241 | } | 1188 | } |
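The rewritten drbd_merge_bvec() above no longer cares about hash-slot boundaries: it only enforces DRBD_MAX_BIO_SIZE and, if a backing device is attached and has its own merge_bvec_fn, takes the smaller of the two answers. A stand-alone sketch of that clamping idea, with made-up types, names and limits rather than the real block-layer callback signature:

#include <stdio.h>

#define MAX_BIO_SIZE	(1024u * 1024u)	/* stand-in for DRBD_MAX_BIO_SIZE */

/* hypothetical lower-device callback: how many of add_len bytes it accepts */
typedef unsigned int (*lower_merge_fn)(unsigned int bio_size, unsigned int add_len);

static unsigned int lower_256k_cap(unsigned int bio_size, unsigned int add_len)
{
	unsigned int max = 256u * 1024u;

	if (bio_size >= max)
		return 0;
	return bio_size + add_len <= max ? add_len : max - bio_size;
}

static unsigned int may_add(unsigned int bio_size, unsigned int add_len,
			    lower_merge_fn lower)
{
	unsigned int limit;

	/* an empty bio must always be allowed at least one segment */
	if (bio_size == 0)
		return add_len;

	limit = bio_size >= MAX_BIO_SIZE ? 0 : MAX_BIO_SIZE - bio_size;

	/* the backing device may be more restrictive than we are */
	if (lower) {
		unsigned int backing = lower(bio_size, add_len);
		if (backing < limit)
			limit = backing;
	}
	return limit < add_len ? limit : add_len;
}

int main(void)
{
	/* 200 KiB queued already, caller wants to add 64 KiB more: our limit
	 * would allow it, the 256 KiB backing device cuts it down to 56 KiB */
	printf("%u bytes may be added\n",
	       may_add(200u * 1024u, 64u * 1024u, lower_256k_cap));
	return 0;
}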
1242 | 1189 | ||
1190 | struct drbd_request *find_oldest_request(struct drbd_tconn *tconn) | ||
1191 | { | ||
1192 | /* Walk the transfer log, | ||
1193 | * and find the oldest not yet completed request */ | ||
1194 | struct drbd_request *r; | ||
1195 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
1196 | if (atomic_read(&r->completion_ref)) | ||
1197 | return r; | ||
1198 | } | ||
1199 | return NULL; | ||
1200 | } | ||
1201 | |||
1243 | void request_timer_fn(unsigned long data) | 1202 | void request_timer_fn(unsigned long data) |
1244 | { | 1203 | { |
1245 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 1204 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
1205 | struct drbd_tconn *tconn = mdev->tconn; | ||
1246 | struct drbd_request *req; /* oldest request */ | 1206 | struct drbd_request *req; /* oldest request */ |
1247 | struct list_head *le; | 1207 | struct net_conf *nc; |
1248 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ | 1208 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ |
1249 | unsigned long now; | 1209 | unsigned long now; |
1250 | 1210 | ||
1251 | if (get_net_conf(mdev)) { | 1211 | rcu_read_lock(); |
1252 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) | 1212 | nc = rcu_dereference(tconn->net_conf); |
1253 | ent = mdev->net_conf->timeout*HZ/10 | 1213 | if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS) |
1254 | * mdev->net_conf->ko_count; | 1214 | ent = nc->timeout * HZ/10 * nc->ko_count; |
1255 | put_net_conf(mdev); | 1215 | |
1256 | } | ||
1257 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ | 1216 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ |
1258 | dt = mdev->ldev->dc.disk_timeout * HZ / 10; | 1217 | dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10; |
1259 | put_ldev(mdev); | 1218 | put_ldev(mdev); |
1260 | } | 1219 | } |
1220 | rcu_read_unlock(); | ||
1221 | |||
1261 | et = min_not_zero(dt, ent); | 1222 | et = min_not_zero(dt, ent); |
1262 | 1223 | ||
1263 | if (!et) | 1224 | if (!et) |
@@ -1265,17 +1226,14 @@ void request_timer_fn(unsigned long data) | |||
1265 | 1226 | ||
1266 | now = jiffies; | 1227 | now = jiffies; |
1267 | 1228 | ||
1268 | spin_lock_irq(&mdev->req_lock); | 1229 | spin_lock_irq(&tconn->req_lock); |
1269 | le = &mdev->oldest_tle->requests; | 1230 | req = find_oldest_request(tconn); |
1270 | if (list_empty(le)) { | 1231 | if (!req) { |
1271 | spin_unlock_irq(&mdev->req_lock); | 1232 | spin_unlock_irq(&tconn->req_lock); |
1272 | mod_timer(&mdev->request_timer, now + et); | 1233 | mod_timer(&mdev->request_timer, now + et); |
1273 | return; | 1234 | return; |
1274 | } | 1235 | } |
1275 | 1236 | ||
1276 | le = le->prev; | ||
1277 | req = list_entry(le, struct drbd_request, tl_requests); | ||
1278 | |||
1279 | /* The request is considered timed out, if | 1237 | /* The request is considered timed out, if |
1280 | * - we have some effective timeout from the configuration, | 1238 | * - we have some effective timeout from the configuration, |
1281 | * with above state restrictions applied, | 1239 | * with above state restrictions applied, |
@@ -1294,17 +1252,17 @@ void request_timer_fn(unsigned long data) | |||
1294 | */ | 1252 | */ |
1295 | if (ent && req->rq_state & RQ_NET_PENDING && | 1253 | if (ent && req->rq_state & RQ_NET_PENDING && |
1296 | time_after(now, req->start_time + ent) && | 1254 | time_after(now, req->start_time + ent) && |
1297 | !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { | 1255 | !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) { |
1298 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); | 1256 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); |
1299 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); | 1257 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); |
1300 | } | 1258 | } |
1301 | if (dt && req->rq_state & RQ_LOCAL_PENDING && | 1259 | if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev && |
1302 | time_after(now, req->start_time + dt) && | 1260 | time_after(now, req->start_time + dt) && |
1303 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { | 1261 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { |
1304 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); | 1262 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); |
1305 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); | 1263 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); |
1306 | } | 1264 | } |
1307 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; | 1265 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; |
1308 | spin_unlock_irq(&mdev->req_lock); | 1266 | spin_unlock_irq(&tconn->req_lock); |
1309 | mod_timer(&mdev->request_timer, nt); | 1267 | mod_timer(&mdev->request_timer, nt); |
1310 | } | 1268 | } |
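request_timer_fn() above derives an effective timeout: the network side allows the peer ko-count * timeout to answer, the disk side allows disk-timeout for local completion, and the smaller non-zero of the two wins; the configured values are in tenths of a second, hence the * HZ / 10 conversions. A small user-space sketch of that arithmetic, with assumed example values for HZ and the options:

#include <stdio.h>

#define HZ 250	/* assumed tick rate, for illustration only */

static unsigned long min_not_zero(unsigned long a, unsigned long b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	/* assumed configuration values, in tenths of a second */
	unsigned long timeout = 60;		/* net timeout: 6 s */
	unsigned long ko_count = 7;		/* peer may miss ko_count timeouts */
	unsigned long disk_timeout = 0;		/* 0: no local disk timeout set */

	unsigned long ent = timeout * HZ / 10 * ko_count;	/* 42 s in jiffies */
	unsigned long dt  = disk_timeout * HZ / 10;
	unsigned long et  = min_not_zero(dt, ent);		/* effective timeout */

	printf("ent=%lu dt=%lu et=%lu jiffies (%lu s)\n", ent, dt, et, et / HZ);
	return 0;
}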
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 3d2111919486..016de6b8bb57 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -77,40 +77,41 @@ | |||
77 | */ | 77 | */ |
78 | 78 | ||
79 | enum drbd_req_event { | 79 | enum drbd_req_event { |
80 | created, | 80 | CREATED, |
81 | to_be_send, | 81 | TO_BE_SENT, |
82 | to_be_submitted, | 82 | TO_BE_SUBMITTED, |
83 | 83 | ||
84 | /* XXX yes, now I am inconsistent... | 84 | /* XXX yes, now I am inconsistent... |
85 | * these are not "events" but "actions" | 85 | * these are not "events" but "actions" |
86 | * oh, well... */ | 86 | * oh, well... */ |
87 | queue_for_net_write, | 87 | QUEUE_FOR_NET_WRITE, |
88 | queue_for_net_read, | 88 | QUEUE_FOR_NET_READ, |
89 | queue_for_send_oos, | 89 | QUEUE_FOR_SEND_OOS, |
90 | 90 | ||
91 | send_canceled, | 91 | SEND_CANCELED, |
92 | send_failed, | 92 | SEND_FAILED, |
93 | handed_over_to_network, | 93 | HANDED_OVER_TO_NETWORK, |
94 | oos_handed_to_network, | 94 | OOS_HANDED_TO_NETWORK, |
95 | connection_lost_while_pending, | 95 | CONNECTION_LOST_WHILE_PENDING, |
96 | read_retry_remote_canceled, | 96 | READ_RETRY_REMOTE_CANCELED, |
97 | recv_acked_by_peer, | 97 | RECV_ACKED_BY_PEER, |
98 | write_acked_by_peer, | 98 | WRITE_ACKED_BY_PEER, |
99 | write_acked_by_peer_and_sis, /* and set_in_sync */ | 99 | WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ |
100 | conflict_discarded_by_peer, | 100 | CONFLICT_RESOLVED, |
101 | neg_acked, | 101 | POSTPONE_WRITE, |
102 | barrier_acked, /* in protocol A and B */ | 102 | NEG_ACKED, |
103 | data_received, /* (remote read) */ | 103 | BARRIER_ACKED, /* in protocol A and B */ |
104 | 104 | DATA_RECEIVED, /* (remote read) */ | |
105 | read_completed_with_error, | 105 | |
106 | read_ahead_completed_with_error, | 106 | READ_COMPLETED_WITH_ERROR, |
107 | write_completed_with_error, | 107 | READ_AHEAD_COMPLETED_WITH_ERROR, |
108 | abort_disk_io, | 108 | WRITE_COMPLETED_WITH_ERROR, |
109 | completed_ok, | 109 | ABORT_DISK_IO, |
110 | resend, | 110 | COMPLETED_OK, |
111 | fail_frozen_disk_io, | 111 | RESEND, |
112 | restart_frozen_disk_io, | 112 | FAIL_FROZEN_DISK_IO, |
113 | nothing, /* for tracing only */ | 113 | RESTART_FROZEN_DISK_IO, |
114 | NOTHING, | ||
114 | }; | 115 | }; |
115 | 116 | ||
116 | /* encoding of request states for now. we don't actually need that many bits. | 117 | /* encoding of request states for now. we don't actually need that many bits. |
@@ -142,8 +143,8 @@ enum drbd_req_state_bits { | |||
142 | * recv_ack (B) or implicit "ack" (A), | 143 | * recv_ack (B) or implicit "ack" (A), |
143 | * still waiting for the barrier ack. | 144 | * still waiting for the barrier ack. |
144 | * master_bio may already be completed and invalidated. | 145 | * master_bio may already be completed and invalidated. |
145 | * 11100: write_acked (C), | 146 | * 11100: write acked (C), |
146 | * data_received (for remote read, any protocol) | 147 | * data received (for remote read, any protocol) |
147 | * or finally the barrier ack has arrived (B,A)... | 148 | * or finally the barrier ack has arrived (B,A)... |
148 | * request can be freed | 149 | * request can be freed |
149 | * 01100: neg-acked (write, protocol C) | 150 | * 01100: neg-acked (write, protocol C) |
@@ -198,6 +199,22 @@ enum drbd_req_state_bits { | |||
198 | 199 | ||
199 | /* Should call drbd_al_complete_io() for this request... */ | 200 | /* Should call drbd_al_complete_io() for this request... */ |
200 | __RQ_IN_ACT_LOG, | 201 | __RQ_IN_ACT_LOG, |
202 | |||
203 | /* The peer has sent a retry ACK */ | ||
204 | __RQ_POSTPONED, | ||
205 | |||
206 | /* would have been completed, | ||
207 | * but was not, because of drbd_suspended() */ | ||
208 | __RQ_COMPLETION_SUSP, | ||
209 | |||
210 | /* We expect a receive ACK (wire proto B) */ | ||
211 | __RQ_EXP_RECEIVE_ACK, | ||
212 | |||
213 | /* We expect a write ACK (wire proto C) */ | ||
214 | __RQ_EXP_WRITE_ACK, | ||
215 | |||
216 | /* waiting for a barrier ack, did an extra kref_get */ | ||
217 | __RQ_EXP_BARR_ACK, | ||
201 | }; | 218 | }; |
202 | 219 | ||
203 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | 220 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) |
@@ -219,56 +236,16 @@ enum drbd_req_state_bits { | |||
219 | 236 | ||
220 | #define RQ_WRITE (1UL << __RQ_WRITE) | 237 | #define RQ_WRITE (1UL << __RQ_WRITE) |
221 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) | 238 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) |
239 | #define RQ_POSTPONED (1UL << __RQ_POSTPONED) | ||
240 | #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) | ||
241 | #define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) | ||
242 | #define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) | ||
243 | #define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) | ||
222 | 244 | ||
223 | /* For waking up the frozen transfer log mod_req() has to return if the request | 245 | /* For waking up the frozen transfer log mod_req() has to return if the request |
224 | should be counted in the epoch object*/ | 246 | should be counted in the epoch object*/ |
225 | #define MR_WRITE_SHIFT 0 | 247 | #define MR_WRITE 1 |
226 | #define MR_WRITE (1 << MR_WRITE_SHIFT) | 248 | #define MR_READ 2 |
227 | #define MR_READ_SHIFT 1 | ||
228 | #define MR_READ (1 << MR_READ_SHIFT) | ||
229 | |||
230 | /* epoch entries */ | ||
231 | static inline | ||
232 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
233 | { | ||
234 | BUG_ON(mdev->ee_hash_s == 0); | ||
235 | return mdev->ee_hash + | ||
236 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
237 | } | ||
238 | |||
239 | /* transfer log (drbd_request objects) */ | ||
240 | static inline | ||
241 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
242 | { | ||
243 | BUG_ON(mdev->tl_hash_s == 0); | ||
244 | return mdev->tl_hash + | ||
245 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
246 | } | ||
247 | |||
248 | /* application reads (drbd_request objects) */ | ||
249 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
250 | { | ||
251 | return mdev->app_reads_hash | ||
252 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
253 | } | ||
254 | |||
255 | /* when we receive the answer for a read request, | ||
256 | * verify that we actually know about it */ | ||
257 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
258 | u64 id, sector_t sector) | ||
259 | { | ||
260 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
261 | struct hlist_node *n; | ||
262 | struct drbd_request *req; | ||
263 | |||
264 | hlist_for_each_entry(req, n, slot, collision) { | ||
265 | if ((unsigned long)req == (unsigned long)id) { | ||
266 | D_ASSERT(req->sector == sector); | ||
267 | return req; | ||
268 | } | ||
269 | } | ||
270 | return NULL; | ||
271 | } | ||
272 | 249 | ||
273 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) | 250 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) |
274 | { | 251 | { |
@@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi | |||
278 | req->private_bio = bio; | 255 | req->private_bio = bio; |
279 | 256 | ||
280 | bio->bi_private = req; | 257 | bio->bi_private = req; |
281 | bio->bi_end_io = drbd_endio_pri; | 258 | bio->bi_end_io = drbd_request_endio; |
282 | bio->bi_next = NULL; | 259 | bio->bi_next = NULL; |
283 | } | 260 | } |
284 | 261 | ||
285 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
286 | struct bio *bio_src) | ||
287 | { | ||
288 | struct drbd_request *req = | ||
289 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
290 | if (likely(req)) { | ||
291 | drbd_req_make_private_bio(req, bio_src); | ||
292 | |||
293 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
294 | req->mdev = mdev; | ||
295 | req->master_bio = bio_src; | ||
296 | req->epoch = 0; | ||
297 | req->sector = bio_src->bi_sector; | ||
298 | req->size = bio_src->bi_size; | ||
299 | INIT_HLIST_NODE(&req->collision); | ||
300 | INIT_LIST_HEAD(&req->tl_requests); | ||
301 | INIT_LIST_HEAD(&req->w.list); | ||
302 | } | ||
303 | return req; | ||
304 | } | ||
305 | |||
306 | static inline void drbd_req_free(struct drbd_request *req) | ||
307 | { | ||
308 | mempool_free(req, drbd_request_mempool); | ||
309 | } | ||
310 | |||
311 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
312 | { | ||
313 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
314 | } | ||
315 | |||
316 | /* Short lived temporary struct on the stack. | 262 | /* Short lived temporary struct on the stack. |
317 | * We could squirrel the error to be returned into | 263 | * We could squirrel the error to be returned into |
318 | * bio->bi_size, or similar. But that would be too ugly. */ | 264 | * bio->bi_size, or similar. But that would be too ugly. */ |
@@ -321,6 +267,7 @@ struct bio_and_error { | |||
321 | int error; | 267 | int error; |
322 | }; | 268 | }; |
323 | 269 | ||
270 | extern void drbd_req_destroy(struct kref *kref); | ||
324 | extern void _req_may_be_done(struct drbd_request *req, | 271 | extern void _req_may_be_done(struct drbd_request *req, |
325 | struct bio_and_error *m); | 272 | struct bio_and_error *m); |
326 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 273 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
@@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
328 | extern void complete_master_bio(struct drbd_conf *mdev, | 275 | extern void complete_master_bio(struct drbd_conf *mdev, |
329 | struct bio_and_error *m); | 276 | struct bio_and_error *m); |
330 | extern void request_timer_fn(unsigned long data); | 277 | extern void request_timer_fn(unsigned long data); |
331 | extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); | 278 | extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); |
279 | extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); | ||
280 | |||
281 | /* this is in drbd_main.c */ | ||
282 | extern void drbd_restart_request(struct drbd_request *req); | ||
332 | 283 | ||
333 | /* use this if you don't want to deal with calling complete_master_bio() | 284 | /* use this if you don't want to deal with calling complete_master_bio() |
334 | * outside the spinlock, e.g. when walking some list on cleanup. */ | 285 | * outside the spinlock, e.g. when walking some list on cleanup. */ |
335 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) | 286 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) |
336 | { | 287 | { |
337 | struct drbd_conf *mdev = req->mdev; | 288 | struct drbd_conf *mdev = req->w.mdev; |
338 | struct bio_and_error m; | 289 | struct bio_and_error m; |
339 | int rv; | 290 | int rv; |
340 | 291 | ||
@@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req, | |||
354 | enum drbd_req_event what) | 305 | enum drbd_req_event what) |
355 | { | 306 | { |
356 | unsigned long flags; | 307 | unsigned long flags; |
357 | struct drbd_conf *mdev = req->mdev; | 308 | struct drbd_conf *mdev = req->w.mdev; |
358 | struct bio_and_error m; | 309 | struct bio_and_error m; |
359 | int rv; | 310 | int rv; |
360 | 311 | ||
361 | spin_lock_irqsave(&mdev->req_lock, flags); | 312 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
362 | rv = __req_mod(req, what, &m); | 313 | rv = __req_mod(req, what, &m); |
363 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 314 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
364 | 315 | ||
365 | if (m.bio) | 316 | if (m.bio) |
366 | complete_master_bio(mdev, &m); | 317 | complete_master_bio(mdev, &m); |
@@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req, | |||
368 | return rv; | 319 | return rv; |
369 | } | 320 | } |
370 | 321 | ||
371 | static inline bool drbd_should_do_remote(union drbd_state s) | 322 | static inline bool drbd_should_do_remote(union drbd_dev_state s) |
372 | { | 323 | { |
373 | return s.pdsk == D_UP_TO_DATE || | 324 | return s.pdsk == D_UP_TO_DATE || |
374 | (s.pdsk >= D_INCONSISTENT && | 325 | (s.pdsk >= D_INCONSISTENT && |
@@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s) | |||
378 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* | 329 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* |
379 | states. */ | 330 | states. */ |
380 | } | 331 | } |
381 | static inline bool drbd_should_send_oos(union drbd_state s) | 332 | static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) |
382 | { | 333 | { |
383 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; | 334 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; |
384 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary | 335 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000000..53bf6182bac4 --- /dev/null +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -0,0 +1,1856 @@ | |||
1 | /* | ||
2 | drbd_state.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | */ | ||
27 | |||
28 | #include <linux/drbd_limits.h> | ||
29 | #include "drbd_int.h" | ||
30 | #include "drbd_req.h" | ||
31 | |||
32 | /* in drbd_main.c */ | ||
33 | extern void tl_abort_disk_io(struct drbd_conf *mdev); | ||
34 | |||
35 | struct after_state_chg_work { | ||
36 | struct drbd_work w; | ||
37 | union drbd_state os; | ||
38 | union drbd_state ns; | ||
39 | enum chg_state_flags flags; | ||
40 | struct completion *done; | ||
41 | }; | ||
42 | |||
43 | enum sanitize_state_warnings { | ||
44 | NO_WARNING, | ||
45 | ABORTED_ONLINE_VERIFY, | ||
46 | ABORTED_RESYNC, | ||
47 | CONNECTION_LOST_NEGOTIATING, | ||
48 | IMPLICITLY_UPGRADED_DISK, | ||
49 | IMPLICITLY_UPGRADED_PDSK, | ||
50 | }; | ||
51 | |||
52 | static int w_after_state_ch(struct drbd_work *w, int unused); | ||
53 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
54 | union drbd_state ns, enum chg_state_flags flags); | ||
55 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
56 | static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *); | ||
57 | static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); | ||
58 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
59 | enum sanitize_state_warnings *warn); | ||
60 | |||
61 | static inline bool is_susp(union drbd_state s) | ||
62 | { | ||
63 | return s.susp || s.susp_nod || s.susp_fen; | ||
64 | } | ||
65 | |||
66 | bool conn_all_vols_unconf(struct drbd_tconn *tconn) | ||
67 | { | ||
68 | struct drbd_conf *mdev; | ||
69 | bool rv = true; | ||
70 | int vnr; | ||
71 | |||
72 | rcu_read_lock(); | ||
73 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
74 | if (mdev->state.disk != D_DISKLESS || | ||
75 | mdev->state.conn != C_STANDALONE || | ||
76 | mdev->state.role != R_SECONDARY) { | ||
77 | rv = false; | ||
78 | break; | ||
79 | } | ||
80 | } | ||
81 | rcu_read_unlock(); | ||
82 | |||
83 | return rv; | ||
84 | } | ||
85 | |||
86 | /* Unfortunately the states were not correctly ordered when | ||
87 | they were defined; therefore we cannot use max_t() here. */ | ||
88 | static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) | ||
89 | { | ||
90 | if (role1 == R_PRIMARY || role2 == R_PRIMARY) | ||
91 | return R_PRIMARY; | ||
92 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
93 | return R_SECONDARY; | ||
94 | return R_UNKNOWN; | ||
95 | } | ||
96 | static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) | ||
97 | { | ||
98 | if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) | ||
99 | return R_UNKNOWN; | ||
100 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
101 | return R_SECONDARY; | ||
102 | return R_PRIMARY; | ||
103 | } | ||
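max_role()/min_role() above exist because the numeric order of enum drbd_role does not match the logical order: with the usual include/linux/drbd.h numbering (R_UNKNOWN = 0, R_PRIMARY = 1, R_SECONDARY = 2) a plain max_t() would prefer Secondary over Primary. A stand-alone check of that point, with the enum values shown for illustration only:

#include <assert.h>

enum role { R_UNKNOWN = 0, R_PRIMARY = 1, R_SECONDARY = 2 };

static enum role sketch_max_role(enum role a, enum role b)
{
	if (a == R_PRIMARY || b == R_PRIMARY)
		return R_PRIMARY;
	if (a == R_SECONDARY || b == R_SECONDARY)
		return R_SECONDARY;
	return R_UNKNOWN;
}

int main(void)
{
	/* a numeric max would yield R_SECONDARY (2) here, which is wrong */
	assert(sketch_max_role(R_PRIMARY, R_SECONDARY) == R_PRIMARY);
	assert(sketch_max_role(R_UNKNOWN, R_SECONDARY) == R_SECONDARY);
	return 0;
}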
104 | |||
105 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn) | ||
106 | { | ||
107 | enum drbd_role role = R_UNKNOWN; | ||
108 | struct drbd_conf *mdev; | ||
109 | int vnr; | ||
110 | |||
111 | rcu_read_lock(); | ||
112 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
113 | role = max_role(role, mdev->state.role); | ||
114 | rcu_read_unlock(); | ||
115 | |||
116 | return role; | ||
117 | } | ||
118 | |||
119 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn) | ||
120 | { | ||
121 | enum drbd_role peer = R_UNKNOWN; | ||
122 | struct drbd_conf *mdev; | ||
123 | int vnr; | ||
124 | |||
125 | rcu_read_lock(); | ||
126 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
127 | peer = max_role(peer, mdev->state.peer); | ||
128 | rcu_read_unlock(); | ||
129 | |||
130 | return peer; | ||
131 | } | ||
132 | |||
133 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn) | ||
134 | { | ||
135 | enum drbd_disk_state ds = D_DISKLESS; | ||
136 | struct drbd_conf *mdev; | ||
137 | int vnr; | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
141 | ds = max_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
142 | rcu_read_unlock(); | ||
143 | |||
144 | return ds; | ||
145 | } | ||
146 | |||
147 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn) | ||
148 | { | ||
149 | enum drbd_disk_state ds = D_MASK; | ||
150 | struct drbd_conf *mdev; | ||
151 | int vnr; | ||
152 | |||
153 | rcu_read_lock(); | ||
154 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
155 | ds = min_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
156 | rcu_read_unlock(); | ||
157 | |||
158 | return ds; | ||
159 | } | ||
160 | |||
161 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn) | ||
162 | { | ||
163 | enum drbd_disk_state ds = D_DISKLESS; | ||
164 | struct drbd_conf *mdev; | ||
165 | int vnr; | ||
166 | |||
167 | rcu_read_lock(); | ||
168 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
169 | ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk); | ||
170 | rcu_read_unlock(); | ||
171 | |||
172 | return ds; | ||
173 | } | ||
174 | |||
175 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn) | ||
176 | { | ||
177 | enum drbd_conns conn = C_MASK; | ||
178 | struct drbd_conf *mdev; | ||
179 | int vnr; | ||
180 | |||
181 | rcu_read_lock(); | ||
182 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
183 | conn = min_t(enum drbd_conns, conn, mdev->state.conn); | ||
184 | rcu_read_unlock(); | ||
185 | |||
186 | return conn; | ||
187 | } | ||
188 | |||
189 | static bool no_peer_wf_report_params(struct drbd_tconn *tconn) | ||
190 | { | ||
191 | struct drbd_conf *mdev; | ||
192 | int vnr; | ||
193 | bool rv = true; | ||
194 | |||
195 | rcu_read_lock(); | ||
196 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
197 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
198 | rv = false; | ||
199 | break; | ||
200 | } | ||
201 | rcu_read_unlock(); | ||
202 | |||
203 | return rv; | ||
204 | } | ||
205 | |||
206 | |||
207 | /** | ||
208 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
209 | * @mdev: DRBD device. | ||
210 | * @os: old (current) state. | ||
211 | * @ns: new (wanted) state. | ||
212 | */ | ||
213 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
214 | union drbd_state os, union drbd_state ns) | ||
215 | { | ||
216 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
217 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
218 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
219 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
220 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
221 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
222 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || | ||
223 | (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); | ||
224 | } | ||
225 | |||
226 | static union drbd_state | ||
227 | apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) | ||
228 | { | ||
229 | union drbd_state ns; | ||
230 | ns.i = (os.i & ~mask.i) | val.i; | ||
231 | return ns; | ||
232 | } | ||
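apply_mask_val() above implements the mask/val encoding used throughout the state machine: a change request carries a mask of the state fields to modify and their new values, and everything outside the mask is copied from the old state. A worked example with plain integers standing in for union drbd_state.i and an invented field layout:

#include <assert.h>

static unsigned int sketch_apply_mask_val(unsigned int os, unsigned int mask,
					  unsigned int val)
{
	return (os & ~mask) | val;
}

int main(void)
{
	/* invented layout: bits 0-1 hold the role field, bits 2-3 the connection */
	unsigned int os   = 0x9;	/* role = 1, conn = 2 */
	unsigned int mask = 0x3;	/* request changes only the role field */
	unsigned int val  = 0x2;	/* new role value */

	/* the connection bits are taken from the old state unchanged */
	assert(sketch_apply_mask_val(os, mask, val) == 0xa);
	return 0;
}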
233 | |||
234 | enum drbd_state_rv | ||
235 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
236 | union drbd_state mask, union drbd_state val) | ||
237 | { | ||
238 | unsigned long flags; | ||
239 | union drbd_state ns; | ||
240 | enum drbd_state_rv rv; | ||
241 | |||
242 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
243 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
244 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
245 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
246 | |||
247 | return rv; | ||
248 | } | ||
249 | |||
250 | /** | ||
251 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
252 | * @mdev: DRBD device. | ||
253 | * @mask: mask of state bits to change. | ||
254 | * @val: value of new state bits. | ||
255 | */ | ||
256 | void drbd_force_state(struct drbd_conf *mdev, | ||
257 | union drbd_state mask, union drbd_state val) | ||
258 | { | ||
259 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
260 | } | ||
261 | |||
262 | static enum drbd_state_rv | ||
263 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
264 | union drbd_state val) | ||
265 | { | ||
266 | union drbd_state os, ns; | ||
267 | unsigned long flags; | ||
268 | enum drbd_state_rv rv; | ||
269 | |||
270 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
271 | return SS_CW_SUCCESS; | ||
272 | |||
273 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
274 | return SS_CW_FAILED_BY_PEER; | ||
275 | |||
276 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
277 | os = drbd_read_state(mdev); | ||
278 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
279 | rv = is_valid_transition(os, ns); | ||
280 | if (rv >= SS_SUCCESS) | ||
281 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
282 | |||
283 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
284 | rv = SS_CW_NO_NEED; | ||
285 | if (rv == SS_UNKNOWN_ERROR) { | ||
286 | rv = is_valid_state(mdev, ns); | ||
287 | if (rv >= SS_SUCCESS) { | ||
288 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
289 | if (rv >= SS_SUCCESS) | ||
290 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
291 | } | ||
292 | } | ||
293 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
294 | |||
295 | return rv; | ||
296 | } | ||
297 | |||
298 | /** | ||
299 | * drbd_req_state() - Perform a possibly cluster-wide state change | ||
300 | * @mdev: DRBD device. | ||
301 | * @mask: mask of state bits to change. | ||
302 | * @val: value of new state bits. | ||
303 | * @f: flags | ||
304 | * | ||
305 | * Should not be called directly, use drbd_request_state() or | ||
306 | * _drbd_request_state(). | ||
307 | */ | ||
308 | static enum drbd_state_rv | ||
309 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
310 | union drbd_state val, enum chg_state_flags f) | ||
311 | { | ||
312 | struct completion done; | ||
313 | unsigned long flags; | ||
314 | union drbd_state os, ns; | ||
315 | enum drbd_state_rv rv; | ||
316 | |||
317 | init_completion(&done); | ||
318 | |||
319 | if (f & CS_SERIALIZE) | ||
320 | mutex_lock(mdev->state_mutex); | ||
321 | |||
322 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
323 | os = drbd_read_state(mdev); | ||
324 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
325 | rv = is_valid_transition(os, ns); | ||
326 | if (rv < SS_SUCCESS) { | ||
327 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
328 | goto abort; | ||
329 | } | ||
330 | |||
331 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
332 | rv = is_valid_state(mdev, ns); | ||
333 | if (rv == SS_SUCCESS) | ||
334 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
335 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
336 | |||
337 | if (rv < SS_SUCCESS) { | ||
338 | if (f & CS_VERBOSE) | ||
339 | print_st_err(mdev, os, ns, rv); | ||
340 | goto abort; | ||
341 | } | ||
342 | |||
343 | if (drbd_send_state_req(mdev, mask, val)) { | ||
344 | rv = SS_CW_FAILED_BY_PEER; | ||
345 | if (f & CS_VERBOSE) | ||
346 | print_st_err(mdev, os, ns, rv); | ||
347 | goto abort; | ||
348 | } | ||
349 | |||
350 | wait_event(mdev->state_wait, | ||
351 | (rv = _req_st_cond(mdev, mask, val))); | ||
352 | |||
353 | if (rv < SS_SUCCESS) { | ||
354 | if (f & CS_VERBOSE) | ||
355 | print_st_err(mdev, os, ns, rv); | ||
356 | goto abort; | ||
357 | } | ||
358 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
359 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
360 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
361 | } else { | ||
362 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
363 | } | ||
364 | |||
365 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
366 | |||
367 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
368 | D_ASSERT(current != mdev->tconn->worker.task); | ||
369 | wait_for_completion(&done); | ||
370 | } | ||
371 | |||
372 | abort: | ||
373 | if (f & CS_SERIALIZE) | ||
374 | mutex_unlock(mdev->state_mutex); | ||
375 | |||
376 | return rv; | ||
377 | } | ||
378 | |||
379 | /** | ||
380 | * _drbd_request_state() - Request a state change (with flags) | ||
381 | * @mdev: DRBD device. | ||
382 | * @mask: mask of state bits to change. | ||
383 | * @val: value of new state bits. | ||
384 | * @f: flags | ||
385 | * | ||
386 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
387 | * flag, or when logging of failed state change requests is not desired. | ||
388 | */ | ||
389 | enum drbd_state_rv | ||
390 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
391 | union drbd_state val, enum chg_state_flags f) | ||
392 | { | ||
393 | enum drbd_state_rv rv; | ||
394 | |||
395 | wait_event(mdev->state_wait, | ||
396 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
397 | |||
398 | return rv; | ||
399 | } | ||
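_drbd_request_state() above retries the whole request as long as drbd_req_state() reports a transient state, sleeping on state_wait between attempts. In user-space form the retry loop looks roughly like this (names and return values are illustrative, and the sleep is omitted):

#include <stdio.h>

/* illustrative return values, not the real SS_* constants */
enum sketch_rv { RV_SUCCESS = 1, RV_IN_TRANSIENT_STATE = -2 };

static int attempts;

static enum sketch_rv try_state_change(void)
{
	/* pretend the resource is busy for the first two attempts */
	return ++attempts < 3 ? RV_IN_TRANSIENT_STATE : RV_SUCCESS;
}

int main(void)
{
	enum sketch_rv rv;

	do {
		rv = try_state_change();
		/* the kernel sleeps on mdev->state_wait here and is woken
		 * by the next state change, instead of spinning */
	} while (rv == RV_IN_TRANSIENT_STATE);

	printf("state change finished after %d attempts\n", attempts);
	return 0;
}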
400 | |||
401 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
402 | { | ||
403 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", | ||
404 | name, | ||
405 | drbd_conn_str(ns.conn), | ||
406 | drbd_role_str(ns.role), | ||
407 | drbd_role_str(ns.peer), | ||
408 | drbd_disk_str(ns.disk), | ||
409 | drbd_disk_str(ns.pdsk), | ||
410 | is_susp(ns) ? 's' : 'r', | ||
411 | ns.aftr_isp ? 'a' : '-', | ||
412 | ns.peer_isp ? 'p' : '-', | ||
413 | ns.user_isp ? 'u' : '-', | ||
414 | ns.susp_fen ? 'F' : '-', | ||
415 | ns.susp_nod ? 'N' : '-' | ||
416 | ); | ||
417 | } | ||
418 | |||
419 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
420 | union drbd_state ns, enum drbd_state_rv err) | ||
421 | { | ||
422 | if (err == SS_IN_TRANSIENT_STATE) | ||
423 | return; | ||
424 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
425 | print_st(mdev, " state", os); | ||
426 | print_st(mdev, "wanted", ns); | ||
427 | } | ||
428 | |||
429 | static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, | ||
430 | enum chg_state_flags flags) | ||
431 | { | ||
432 | char *pbp; | ||
433 | pbp = pb; | ||
434 | *pbp = 0; | ||
435 | |||
436 | if (ns.role != os.role && flags & CS_DC_ROLE) | ||
437 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
438 | drbd_role_str(os.role), | ||
439 | drbd_role_str(ns.role)); | ||
440 | if (ns.peer != os.peer && flags & CS_DC_PEER) | ||
441 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
442 | drbd_role_str(os.peer), | ||
443 | drbd_role_str(ns.peer)); | ||
444 | if (ns.conn != os.conn && flags & CS_DC_CONN) | ||
445 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
446 | drbd_conn_str(os.conn), | ||
447 | drbd_conn_str(ns.conn)); | ||
448 | if (ns.disk != os.disk && flags & CS_DC_DISK) | ||
449 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
450 | drbd_disk_str(os.disk), | ||
451 | drbd_disk_str(ns.disk)); | ||
452 | if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) | ||
453 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
454 | drbd_disk_str(os.pdsk), | ||
455 | drbd_disk_str(ns.pdsk)); | ||
456 | |||
457 | return pbp - pb; | ||
458 | } | ||
459 | |||
460 | static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, | ||
461 | enum chg_state_flags flags) | ||
462 | { | ||
463 | char pb[300]; | ||
464 | char *pbp = pb; | ||
465 | |||
466 | pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); | ||
467 | |||
468 | if (ns.aftr_isp != os.aftr_isp) | ||
469 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
470 | os.aftr_isp, | ||
471 | ns.aftr_isp); | ||
472 | if (ns.peer_isp != os.peer_isp) | ||
473 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
474 | os.peer_isp, | ||
475 | ns.peer_isp); | ||
476 | if (ns.user_isp != os.user_isp) | ||
477 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
478 | os.user_isp, | ||
479 | ns.user_isp); | ||
480 | |||
481 | if (pbp != pb) | ||
482 | dev_info(DEV, "%s\n", pb); | ||
483 | } | ||
484 | |||
485 | static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns, | ||
486 | enum chg_state_flags flags) | ||
487 | { | ||
488 | char pb[300]; | ||
489 | char *pbp = pb; | ||
490 | |||
491 | pbp += print_state_change(pbp, os, ns, flags); | ||
492 | |||
493 | if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) | ||
494 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
495 | is_susp(os), | ||
496 | is_susp(ns)); | ||
497 | |||
498 | if (pbp != pb) | ||
499 | conn_info(tconn, "%s\n", pb); | ||
500 | } | ||
501 | |||
502 | |||
503 | /** | ||
504 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
505 | * @mdev: DRBD device. | ||
506 | * @ns: State to consider. | ||
507 | */ | ||
508 | static enum drbd_state_rv | ||
509 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
510 | { | ||
511 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
512 | |||
513 | enum drbd_fencing_p fp; | ||
514 | enum drbd_state_rv rv = SS_SUCCESS; | ||
515 | struct net_conf *nc; | ||
516 | |||
517 | rcu_read_lock(); | ||
518 | fp = FP_DONT_CARE; | ||
519 | if (get_ldev(mdev)) { | ||
520 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
521 | put_ldev(mdev); | ||
522 | } | ||
523 | |||
524 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
525 | if (nc) { | ||
526 | if (!nc->two_primaries && ns.role == R_PRIMARY) { | ||
527 | if (ns.peer == R_PRIMARY) | ||
528 | rv = SS_TWO_PRIMARIES; | ||
529 | else if (conn_highest_peer(mdev->tconn) == R_PRIMARY) | ||
530 | rv = SS_O_VOL_PEER_PRI; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | if (rv <= 0) | ||
535 | /* already found a reason to abort */; | ||
536 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
537 | rv = SS_DEVICE_IN_USE; | ||
538 | |||
539 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
540 | rv = SS_NO_UP_TO_DATE_DISK; | ||
541 | |||
542 | else if (fp >= FP_RESOURCE && | ||
543 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
544 | rv = SS_PRIMARY_NOP; | ||
545 | |||
546 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
547 | rv = SS_NO_UP_TO_DATE_DISK; | ||
548 | |||
549 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
550 | rv = SS_NO_LOCAL_DISK; | ||
551 | |||
552 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
553 | rv = SS_NO_REMOTE_DISK; | ||
554 | |||
555 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
556 | rv = SS_NO_UP_TO_DATE_DISK; | ||
557 | |||
558 | else if ((ns.conn == C_CONNECTED || | ||
559 | ns.conn == C_WF_BITMAP_S || | ||
560 | ns.conn == C_SYNC_SOURCE || | ||
561 | ns.conn == C_PAUSED_SYNC_S) && | ||
562 | ns.disk == D_OUTDATED) | ||
563 | rv = SS_CONNECTED_OUTDATES; | ||
564 | |||
565 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
566 | (nc->verify_alg[0] == 0)) | ||
567 | rv = SS_NO_VERIFY_ALG; | ||
568 | |||
569 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
570 | mdev->tconn->agreed_pro_version < 88) | ||
571 | rv = SS_NOT_SUPPORTED; | ||
572 | |||
573 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
574 | rv = SS_CONNECTED_OUTDATES; | ||
575 | |||
576 | rcu_read_unlock(); | ||
577 | |||
578 | return rv; | ||
579 | } | ||
580 | |||
581 | /** | ||
582 | * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible | ||
583 | * This function limits state transitions that may be declined by DRBD. I.e. | ||
584 | * user requests (aka soft transitions). | ||
585 | * @tconn: DRBD connection. | ||
586 | * @ns: new state. | ||
587 | * @os: old state. | ||
588 | */ | ||
589 | static enum drbd_state_rv | ||
590 | is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn) | ||
591 | { | ||
592 | enum drbd_state_rv rv = SS_SUCCESS; | ||
593 | |||
594 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
595 | os.conn > C_CONNECTED) | ||
596 | rv = SS_RESYNC_RUNNING; | ||
597 | |||
598 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
599 | rv = SS_ALREADY_STANDALONE; | ||
600 | |||
601 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
602 | rv = SS_IS_DISKLESS; | ||
603 | |||
604 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
605 | rv = SS_NO_NET_CONFIG; | ||
606 | |||
607 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
608 | rv = SS_LOWER_THAN_OUTDATED; | ||
609 | |||
610 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
611 | rv = SS_IN_TRANSIENT_STATE; | ||
612 | |||
613 | /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
614 | rv = SS_IN_TRANSIENT_STATE; */ | ||
615 | |||
616 | /* While establishing a connection, only allow cstate to change. | ||
617 | Delay/refuse role changes, detach/attach, etc. */ | ||
618 | if (test_bit(STATE_SENT, &tconn->flags) && | ||
619 | !(os.conn == C_WF_REPORT_PARAMS || | ||
620 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
621 | rv = SS_IN_TRANSIENT_STATE; | ||
622 | |||
623 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
624 | rv = SS_NEED_CONNECTION; | ||
625 | |||
626 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
627 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
628 | rv = SS_RESYNC_RUNNING; | ||
629 | |||
630 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
631 | os.conn < C_CONNECTED) | ||
632 | rv = SS_NEED_CONNECTION; | ||
633 | |||
634 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
635 | && os.conn < C_WF_REPORT_PARAMS) | ||
636 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
637 | |||
638 | return rv; | ||
639 | } | ||
640 | |||
641 | static enum drbd_state_rv | ||
642 | is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) | ||
643 | { | ||
644 | /* no change -> nothing to do, at least for the connection part */ | ||
645 | if (oc == nc) | ||
646 | return SS_NOTHING_TO_DO; | ||
647 | |||
648 | /* disconnect of an unconfigured connection does not make sense */ | ||
649 | if (oc == C_STANDALONE && nc == C_DISCONNECTING) | ||
650 | return SS_ALREADY_STANDALONE; | ||
651 | |||
652 | /* from C_STANDALONE, we start with C_UNCONNECTED */ | ||
653 | if (oc == C_STANDALONE && nc != C_UNCONNECTED) | ||
654 | return SS_NEED_CONNECTION; | ||
655 | |||
656 | /* When establishing a connection we need to go through WF_REPORT_PARAMS! | ||
657 | Necessary to do the right thing upon invalidate-remote on a disconnected resource */ | ||
658 | if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) | ||
659 | return SS_NEED_CONNECTION; | ||
660 | |||
661 | /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ | ||
662 | if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) | ||
663 | return SS_IN_TRANSIENT_STATE; | ||
664 | |||
665 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
666 | if (oc == C_DISCONNECTING && nc != C_STANDALONE) | ||
667 | return SS_IN_TRANSIENT_STATE; | ||
668 | |||
669 | return SS_SUCCESS; | ||
670 | } | ||
671 | |||
672 | |||
673 | /** | ||
674 | * is_valid_transition() - Returns an SS_ error code if the state transition is not possible | ||
675 | * This limits hard state transitions. Hard state transitions are facts that are | ||
676 | * imposed on DRBD by the environment. E.g. disk broke or network broke down. | ||
677 | * But those hard state transitions are still not allowed to do everything. | ||
678 | * @ns: new state. | ||
679 | * @os: old state. | ||
680 | */ | ||
681 | static enum drbd_state_rv | ||
682 | is_valid_transition(union drbd_state os, union drbd_state ns) | ||
683 | { | ||
684 | enum drbd_state_rv rv; | ||
685 | |||
686 | rv = is_valid_conn_transition(os.conn, ns.conn); | ||
687 | |||
688 | /* we cannot fail (again) if we already detached */ | ||
689 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
690 | rv = SS_IS_DISKLESS; | ||
691 | |||
692 | return rv; | ||
693 | } | ||
694 | |||
695 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
696 | { | ||
697 | static const char *msg_table[] = { | ||
698 | [NO_WARNING] = "", | ||
699 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
700 | [ABORTED_RESYNC] = "Resync aborted.", | ||
701 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
702 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
703 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
704 | }; | ||
705 | |||
706 | if (warn != NO_WARNING) | ||
707 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
708 | } | ||
709 | |||
710 | /** | ||
711 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
712 | * @mdev: DRBD device. | ||
713 | * @os: old state. | ||
714 | * @ns: new state. | ||
715 | * @warn: where an optional warning for the log is returned. | ||
716 | * | ||
717 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
718 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
719 | */ | ||
720 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
721 | enum sanitize_state_warnings *warn) | ||
722 | { | ||
723 | enum drbd_fencing_p fp; | ||
724 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
725 | |||
726 | if (warn) | ||
727 | *warn = NO_WARNING; | ||
728 | |||
729 | fp = FP_DONT_CARE; | ||
730 | if (get_ldev(mdev)) { | ||
731 | rcu_read_lock(); | ||
732 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
733 | rcu_read_unlock(); | ||
734 | put_ldev(mdev); | ||
735 | } | ||
736 | |||
737 | /* Implications from connection to peer and peer_isp */ | ||
738 | if (ns.conn < C_CONNECTED) { | ||
739 | ns.peer_isp = 0; | ||
740 | ns.peer = R_UNKNOWN; | ||
741 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
742 | ns.pdsk = D_UNKNOWN; | ||
743 | } | ||
744 | |||
745 | /* Clear the aftr_isp when becoming unconfigured */ | ||
746 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
747 | ns.aftr_isp = 0; | ||
748 | |||
749 | /* An implication of the disk states onto the connection state */ | ||
750 | /* Abort resync if a disk fails/detaches */ | ||
751 | if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
752 | if (warn) | ||
753 | *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? | ||
754 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
755 | ns.conn = C_CONNECTED; | ||
756 | } | ||
757 | |||
758 | /* Connection breaks down before we finished "Negotiating" */ | ||
759 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
760 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
761 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
762 | ns.disk = mdev->new_state_tmp.disk; | ||
763 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
764 | } else { | ||
765 | if (warn) | ||
766 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
767 | ns.disk = D_DISKLESS; | ||
768 | ns.pdsk = D_UNKNOWN; | ||
769 | } | ||
770 | put_ldev(mdev); | ||
771 | } | ||
772 | |||
773 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
774 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
775 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
776 | ns.disk = D_UP_TO_DATE; | ||
777 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
778 | ns.pdsk = D_UP_TO_DATE; | ||
779 | } | ||
780 | |||
781 | /* Implications of the connection state on the disk states */ | ||
782 | disk_min = D_DISKLESS; | ||
783 | disk_max = D_UP_TO_DATE; | ||
784 | pdsk_min = D_INCONSISTENT; | ||
785 | pdsk_max = D_UNKNOWN; | ||
786 | switch ((enum drbd_conns)ns.conn) { | ||
787 | case C_WF_BITMAP_T: | ||
788 | case C_PAUSED_SYNC_T: | ||
789 | case C_STARTING_SYNC_T: | ||
790 | case C_WF_SYNC_UUID: | ||
791 | case C_BEHIND: | ||
792 | disk_min = D_INCONSISTENT; | ||
793 | disk_max = D_OUTDATED; | ||
794 | pdsk_min = D_UP_TO_DATE; | ||
795 | pdsk_max = D_UP_TO_DATE; | ||
796 | break; | ||
797 | case C_VERIFY_S: | ||
798 | case C_VERIFY_T: | ||
799 | disk_min = D_UP_TO_DATE; | ||
800 | disk_max = D_UP_TO_DATE; | ||
801 | pdsk_min = D_UP_TO_DATE; | ||
802 | pdsk_max = D_UP_TO_DATE; | ||
803 | break; | ||
804 | case C_CONNECTED: | ||
805 | disk_min = D_DISKLESS; | ||
806 | disk_max = D_UP_TO_DATE; | ||
807 | pdsk_min = D_DISKLESS; | ||
808 | pdsk_max = D_UP_TO_DATE; | ||
809 | break; | ||
810 | case C_WF_BITMAP_S: | ||
811 | case C_PAUSED_SYNC_S: | ||
812 | case C_STARTING_SYNC_S: | ||
813 | case C_AHEAD: | ||
814 | disk_min = D_UP_TO_DATE; | ||
815 | disk_max = D_UP_TO_DATE; | ||
816 | pdsk_min = D_INCONSISTENT; | ||
817 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ | ||
818 | break; | ||
819 | case C_SYNC_TARGET: | ||
820 | disk_min = D_INCONSISTENT; | ||
821 | disk_max = D_INCONSISTENT; | ||
822 | pdsk_min = D_UP_TO_DATE; | ||
823 | pdsk_max = D_UP_TO_DATE; | ||
824 | break; | ||
825 | case C_SYNC_SOURCE: | ||
826 | disk_min = D_UP_TO_DATE; | ||
827 | disk_max = D_UP_TO_DATE; | ||
828 | pdsk_min = D_INCONSISTENT; | ||
829 | pdsk_max = D_INCONSISTENT; | ||
830 | break; | ||
831 | case C_STANDALONE: | ||
832 | case C_DISCONNECTING: | ||
833 | case C_UNCONNECTED: | ||
834 | case C_TIMEOUT: | ||
835 | case C_BROKEN_PIPE: | ||
836 | case C_NETWORK_FAILURE: | ||
837 | case C_PROTOCOL_ERROR: | ||
838 | case C_TEAR_DOWN: | ||
839 | case C_WF_CONNECTION: | ||
840 | case C_WF_REPORT_PARAMS: | ||
841 | case C_MASK: | ||
842 | break; | ||
843 | } | ||
844 | if (ns.disk > disk_max) | ||
845 | ns.disk = disk_max; | ||
846 | |||
847 | if (ns.disk < disk_min) { | ||
848 | if (warn) | ||
849 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
850 | ns.disk = disk_min; | ||
851 | } | ||
852 | if (ns.pdsk > pdsk_max) | ||
853 | ns.pdsk = pdsk_max; | ||
854 | |||
855 | if (ns.pdsk < pdsk_min) { | ||
856 | if (warn) | ||
857 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
858 | ns.pdsk = pdsk_min; | ||
859 | } | ||
860 | |||
861 | if (fp == FP_STONITH && | ||
862 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) | ||
863 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
864 | |||
865 | if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO && | ||
866 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
867 | ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible, locally or on the peer */ | ||
868 | |||
869 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
870 | if (ns.conn == C_SYNC_SOURCE) | ||
871 | ns.conn = C_PAUSED_SYNC_S; | ||
872 | if (ns.conn == C_SYNC_TARGET) | ||
873 | ns.conn = C_PAUSED_SYNC_T; | ||
874 | } else { | ||
875 | if (ns.conn == C_PAUSED_SYNC_S) | ||
876 | ns.conn = C_SYNC_SOURCE; | ||
877 | if (ns.conn == C_PAUSED_SYNC_T) | ||
878 | ns.conn = C_SYNC_TARGET; | ||
879 | } | ||
880 | |||
881 | return ns; | ||
882 | } | ||
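/* A minimal sketch of the clamping pattern used in sanitize_state() above:
 * the connection state defines an allowed [min, max] window for disk/pdsk,
 * and the requested value is pulled back into that window.  The helper name
 * and the 'upgraded' out-parameter are illustrative assumptions, not DRBD code.
 */
static inline enum drbd_disk_state
clamp_disk_state(enum drbd_disk_state want,
		 enum drbd_disk_state min, enum drbd_disk_state max,
		 bool *upgraded)
{
	if (want > max)
		return max;		/* silently lowered, as for disk_max / pdsk_max */
	if (want < min) {
		if (upgraded)
			*upgraded = true;	/* mirrors the IMPLICITLY_UPGRADED_{DISK,PDSK} warnings */
		return min;
	}
	return want;
}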
883 | |||
884 | void drbd_resume_al(struct drbd_conf *mdev) | ||
885 | { | ||
886 | if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) | ||
887 | dev_info(DEV, "Resumed AL updates\n"); | ||
888 | } | ||
889 | |||
890 | /* helper for __drbd_set_state */ | ||
891 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
892 | { | ||
893 | if (mdev->tconn->agreed_pro_version < 90) | ||
894 | mdev->ov_start_sector = 0; | ||
895 | mdev->rs_total = drbd_bm_bits(mdev); | ||
896 | mdev->ov_position = 0; | ||
897 | if (cs == C_VERIFY_T) { | ||
898 | /* starting online verify from an arbitrary position | ||
899 | * does not fit well into the existing protocol. | ||
900 | * on C_VERIFY_T, we initialize ov_left and friends | ||
901 | * implicitly in receive_DataRequest once the | ||
902 | * first P_OV_REQUEST is received */ | ||
903 | mdev->ov_start_sector = ~(sector_t)0; | ||
904 | } else { | ||
905 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
906 | if (bit >= mdev->rs_total) { | ||
907 | mdev->ov_start_sector = | ||
908 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
909 | mdev->rs_total = 1; | ||
910 | } else | ||
911 | mdev->rs_total -= bit; | ||
912 | mdev->ov_position = mdev->ov_start_sector; | ||
913 | } | ||
914 | mdev->ov_left = mdev->rs_total; | ||
915 | } | ||
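/* A worked example of the sector-to-bit arithmetic above, assuming the usual
 * 4 KiB bitmap granularity (one bitmap bit per 8 sectors of 512 bytes); the
 * concrete numbers are made up for illustration only:
 *
 *	ov_start_sector = 4096       =>  bit = BM_SECT_TO_BIT(4096) = 512
 *	drbd_bm_bits()  = 10000      =>  rs_total = 10000 - 512 = 9488
 *	ov_position = 4096, ov_left = rs_total = 9488
 */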
916 | |||
917 | /** | ||
918 | * __drbd_set_state() - Set a new DRBD state | ||
919 | * @mdev: DRBD device. | ||
920 | * @ns: new state. | ||
921 | * @flags: Flags | ||
922 | * @done: Optional completion; it is completed after after_state_ch() has finished | ||
923 | * | ||
924 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
925 | */ | ||
926 | enum drbd_state_rv | ||
927 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
928 | enum chg_state_flags flags, struct completion *done) | ||
929 | { | ||
930 | union drbd_state os; | ||
931 | enum drbd_state_rv rv = SS_SUCCESS; | ||
932 | enum sanitize_state_warnings ssw; | ||
933 | struct after_state_chg_work *ascw; | ||
934 | |||
935 | os = drbd_read_state(mdev); | ||
936 | |||
937 | ns = sanitize_state(mdev, ns, &ssw); | ||
938 | if (ns.i == os.i) | ||
939 | return SS_NOTHING_TO_DO; | ||
940 | |||
941 | rv = is_valid_transition(os, ns); | ||
942 | if (rv < SS_SUCCESS) | ||
943 | return rv; | ||
944 | |||
945 | if (!(flags & CS_HARD)) { | ||
946 | /* pre-state-change checks; only look at ns */ | ||
947 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
948 | |||
949 | rv = is_valid_state(mdev, ns); | ||
950 | if (rv < SS_SUCCESS) { | ||
951 | /* If the old state was illegal as well, then let | ||
952 | this happen...*/ | ||
953 | |||
954 | if (is_valid_state(mdev, os) == rv) | ||
955 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
956 | } else | ||
957 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
958 | } | ||
959 | |||
960 | if (rv < SS_SUCCESS) { | ||
961 | if (flags & CS_VERBOSE) | ||
962 | print_st_err(mdev, os, ns, rv); | ||
963 | return rv; | ||
964 | } | ||
965 | |||
966 | print_sanitize_warnings(mdev, ssw); | ||
967 | |||
968 | drbd_pr_state_change(mdev, os, ns, flags); | ||
969 | |||
970 | /* Display changes to the susp* flags that were caused by the call to | ||
971 | sanitize_state(). Only display them here if we were not called from | ||
972 | _conn_request_state() */ | ||
973 | if (!(flags & CS_DC_SUSP)) | ||
974 | conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); | ||
975 | |||
976 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
977 | * on the ldev here, to be sure the transition to D_DISKLESS, and with it | ||
978 | * drbd_ldev_destroy(), won't happen before our corresponding | ||
979 | * after_state_ch work has run, where we put_ldev again. */ | ||
980 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
981 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
982 | atomic_inc(&mdev->local_cnt); | ||
983 | |||
984 | mdev->state.i = ns.i; | ||
985 | mdev->tconn->susp = ns.susp; | ||
986 | mdev->tconn->susp_nod = ns.susp_nod; | ||
987 | mdev->tconn->susp_fen = ns.susp_fen; | ||
988 | |||
989 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
990 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
991 | |||
992 | /* Wake up role changes, that were delayed because of connection establishing */ | ||
993 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && | ||
994 | no_peer_wf_report_params(mdev->tconn)) | ||
995 | clear_bit(STATE_SENT, &mdev->tconn->flags); | ||
996 | |||
997 | wake_up(&mdev->misc_wait); | ||
998 | wake_up(&mdev->state_wait); | ||
999 | wake_up(&mdev->tconn->ping_wait); | ||
1000 | |||
1001 | /* Aborted verify run, or we reached the stop sector. | ||
1002 | * Log the last position, unless end-of-device. */ | ||
1003 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1004 | ns.conn <= C_CONNECTED) { | ||
1005 | mdev->ov_start_sector = | ||
1006 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1007 | if (mdev->ov_left) | ||
1008 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1009 | (unsigned long long)mdev->ov_start_sector); | ||
1010 | } | ||
1011 | |||
1012 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1013 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1014 | dev_info(DEV, "Syncer continues.\n"); | ||
1015 | mdev->rs_paused += (long)jiffies | ||
1016 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1017 | if (ns.conn == C_SYNC_TARGET) | ||
1018 | mod_timer(&mdev->resync_timer, jiffies); | ||
1019 | } | ||
1020 | |||
1021 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1022 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1023 | dev_info(DEV, "Resync suspended\n"); | ||
1024 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1025 | } | ||
1026 | |||
1027 | if (os.conn == C_CONNECTED && | ||
1028 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1029 | unsigned long now = jiffies; | ||
1030 | int i; | ||
1031 | |||
1032 | set_ov_position(mdev, ns.conn); | ||
1033 | mdev->rs_start = now; | ||
1034 | mdev->rs_last_events = 0; | ||
1035 | mdev->rs_last_sect_ev = 0; | ||
1036 | mdev->ov_last_oos_size = 0; | ||
1037 | mdev->ov_last_oos_start = 0; | ||
1038 | |||
1039 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1040 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1041 | mdev->rs_mark_time[i] = now; | ||
1042 | } | ||
1043 | |||
1044 | drbd_rs_controller_reset(mdev); | ||
1045 | |||
1046 | if (ns.conn == C_VERIFY_S) { | ||
1047 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1048 | (unsigned long long)mdev->ov_position); | ||
1049 | mod_timer(&mdev->resync_timer, jiffies); | ||
1050 | } | ||
1051 | } | ||
1052 | |||
1053 | if (get_ldev(mdev)) { | ||
1054 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1055 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1056 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1057 | |||
1058 | mdf &= ~MDF_AL_CLEAN; | ||
1059 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1060 | mdf |= MDF_CRASHED_PRIMARY; | ||
1061 | if (mdev->state.role == R_PRIMARY || | ||
1062 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1063 | mdf |= MDF_PRIMARY_IND; | ||
1064 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1065 | mdf |= MDF_CONNECTED_IND; | ||
1066 | if (mdev->state.disk > D_INCONSISTENT) | ||
1067 | mdf |= MDF_CONSISTENT; | ||
1068 | if (mdev->state.disk > D_OUTDATED) | ||
1069 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1070 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1071 | mdf |= MDF_PEER_OUT_DATED; | ||
1072 | if (mdf != mdev->ldev->md.flags) { | ||
1073 | mdev->ldev->md.flags = mdf; | ||
1074 | drbd_md_mark_dirty(mdev); | ||
1075 | } | ||
1076 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1077 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1078 | put_ldev(mdev); | ||
1079 | } | ||
1080 | |||
1081 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider a resync */ | ||
1082 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1083 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1084 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1085 | |||
1086 | /* Receiver should clean up itself */ | ||
1087 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1088 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1089 | |||
1090 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1091 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1092 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1093 | |||
1094 | /* Upon network failure, we need to restart the receiver. */ | ||
1095 | if (os.conn > C_WF_CONNECTION && | ||
1096 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1097 | drbd_thread_restart_nowait(&mdev->tconn->receiver); | ||
1098 | |||
1099 | /* Resume AL writing if we get a connection */ | ||
1100 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1101 | drbd_resume_al(mdev); | ||
1102 | |||
1103 | /* remember last attach time so request_timer_fn() won't | ||
1104 | * kill newly established sessions while we are still trying to thaw | ||
1105 | * previously frozen IO */ | ||
1106 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1107 | ns.disk > D_NEGOTIATING) | ||
1108 | mdev->last_reattach_jif = jiffies; | ||
1109 | |||
1110 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1111 | if (ascw) { | ||
1112 | ascw->os = os; | ||
1113 | ascw->ns = ns; | ||
1114 | ascw->flags = flags; | ||
1115 | ascw->w.cb = w_after_state_ch; | ||
1116 | ascw->w.mdev = mdev; | ||
1117 | ascw->done = done; | ||
1118 | drbd_queue_work(&mdev->tconn->sender_work, &ascw->w); | ||
1119 | } else { | ||
1120 | dev_err(DEV, "Could not kmalloc an ascw\n"); | ||
1121 | } | ||
1122 | |||
1123 | return rv; | ||
1124 | } | ||
1125 | |||
1126 | static int w_after_state_ch(struct drbd_work *w, int unused) | ||
1127 | { | ||
1128 | struct after_state_chg_work *ascw = | ||
1129 | container_of(w, struct after_state_chg_work, w); | ||
1130 | struct drbd_conf *mdev = w->mdev; | ||
1131 | |||
1132 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1133 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1134 | D_ASSERT(ascw->done != NULL); | ||
1135 | complete(ascw->done); | ||
1136 | } | ||
1137 | kfree(ascw); | ||
1138 | |||
1139 | return 0; | ||
1140 | } | ||
1141 | |||
1142 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1143 | { | ||
1144 | if (rv) { | ||
1145 | dev_err(DEV, "Writing the bitmap failed, not starting resync.\n"); | ||
1146 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1147 | return; | ||
1148 | } | ||
1149 | |||
1150 | switch (mdev->state.conn) { | ||
1151 | case C_STARTING_SYNC_T: | ||
1152 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1153 | break; | ||
1154 | case C_STARTING_SYNC_S: | ||
1155 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1156 | break; | ||
1157 | } | ||
1158 | } | ||
1159 | |||
1160 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1161 | int (*io_fn)(struct drbd_conf *), | ||
1162 | char *why, enum bm_flag flags) | ||
1163 | { | ||
1164 | int rv; | ||
1165 | |||
1166 | D_ASSERT(current == mdev->tconn->worker.task); | ||
1167 | |||
1168 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1169 | set_bit(SUSPEND_IO, &mdev->flags); | ||
1170 | |||
1171 | drbd_bm_lock(mdev, why, flags); | ||
1172 | rv = io_fn(mdev); | ||
1173 | drbd_bm_unlock(mdev); | ||
1174 | |||
1175 | drbd_resume_io(mdev); | ||
1176 | |||
1177 | return rv; | ||
1178 | } | ||
1179 | |||
1180 | /** | ||
1181 | * after_state_ch() - Perform after state change actions that may sleep | ||
1182 | * @mdev: DRBD device. | ||
1183 | * @os: old state. | ||
1184 | * @ns: new state. | ||
1185 | * @flags: Flags | ||
1186 | */ | ||
1187 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1188 | union drbd_state ns, enum chg_state_flags flags) | ||
1189 | { | ||
1190 | struct sib_info sib; | ||
1191 | |||
1192 | sib.sib_reason = SIB_STATE_CHANGE; | ||
1193 | sib.os = os; | ||
1194 | sib.ns = ns; | ||
1195 | |||
1196 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1197 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1198 | if (mdev->p_uuid) | ||
1199 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1200 | } | ||
1201 | |||
1202 | /* Inform userspace about the change... */ | ||
1203 | drbd_bcast_event(mdev, &sib); | ||
1204 | |||
1205 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1206 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1207 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1208 | |||
1209 | /* Here we have the actions that are performed after a | ||
1210 | state change. This function might sleep */ | ||
1211 | |||
1212 | if (ns.susp_nod) { | ||
1213 | struct drbd_tconn *tconn = mdev->tconn; | ||
1214 | enum drbd_req_event what = NOTHING; | ||
1215 | |||
1216 | spin_lock_irq(&tconn->req_lock); | ||
1217 | if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED) | ||
1218 | what = RESEND; | ||
1219 | |||
1220 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1221 | conn_lowest_disk(tconn) > D_NEGOTIATING) | ||
1222 | what = RESTART_FROZEN_DISK_IO; | ||
1223 | |||
1224 | if (tconn->susp_nod && what != NOTHING) { | ||
1225 | _tl_restart(tconn, what); | ||
1226 | _conn_request_state(tconn, | ||
1227 | (union drbd_state) { { .susp_nod = 1 } }, | ||
1228 | (union drbd_state) { { .susp_nod = 0 } }, | ||
1229 | CS_VERBOSE); | ||
1230 | } | ||
1231 | spin_unlock_irq(&tconn->req_lock); | ||
1232 | } | ||
1233 | |||
1234 | if (ns.susp_fen) { | ||
1235 | struct drbd_tconn *tconn = mdev->tconn; | ||
1236 | |||
1237 | spin_lock_irq(&tconn->req_lock); | ||
1238 | if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) { | ||
1239 | /* case2: The connection was established again: */ | ||
1240 | struct drbd_conf *odev; | ||
1241 | int vnr; | ||
1242 | |||
1243 | rcu_read_lock(); | ||
1244 | idr_for_each_entry(&tconn->volumes, odev, vnr) | ||
1245 | clear_bit(NEW_CUR_UUID, &odev->flags); | ||
1246 | rcu_read_unlock(); | ||
1247 | _tl_restart(tconn, RESEND); | ||
1248 | _conn_request_state(tconn, | ||
1249 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1250 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1251 | CS_VERBOSE); | ||
1252 | } | ||
1253 | spin_unlock_irq(&tconn->req_lock); | ||
1254 | } | ||
1255 | |||
1256 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1257 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1258 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1259 | * which is unexpected. */ | ||
1260 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1261 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1262 | mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1263 | drbd_gen_and_send_sync_uuid(mdev); | ||
1264 | put_ldev(mdev); | ||
1265 | } | ||
1266 | |||
1267 | /* Do not change the order of the if above and the two below... */ | ||
1268 | if (os.pdsk == D_DISKLESS && | ||
1269 | ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ | ||
1270 | /* we probably will start a resync soon. | ||
1271 | * make sure those things are properly reset. */ | ||
1272 | mdev->rs_total = 0; | ||
1273 | mdev->rs_failed = 0; | ||
1274 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1275 | drbd_rs_cancel_all(mdev); | ||
1276 | |||
1277 | drbd_send_uuids(mdev); | ||
1278 | drbd_send_state(mdev, ns); | ||
1279 | } | ||
1280 | /* No point in queuing send_bitmap if we don't have a connection | ||
1281 | * anymore, so check also the _current_ state, not only the new state | ||
1282 | * at the time this work was queued. */ | ||
1283 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1284 | mdev->state.conn == C_WF_BITMAP_S) | ||
1285 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1286 | "send_bitmap (WFBitMapS)", | ||
1287 | BM_LOCKED_TEST_ALLOWED); | ||
1288 | |||
1289 | /* Lost contact to peer's copy of the data */ | ||
1290 | if ((os.pdsk >= D_INCONSISTENT && | ||
1291 | os.pdsk != D_UNKNOWN && | ||
1292 | os.pdsk != D_OUTDATED) | ||
1293 | && (ns.pdsk < D_INCONSISTENT || | ||
1294 | ns.pdsk == D_UNKNOWN || | ||
1295 | ns.pdsk == D_OUTDATED)) { | ||
1296 | if (get_ldev(mdev)) { | ||
1297 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1298 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1299 | if (drbd_suspended(mdev)) { | ||
1300 | set_bit(NEW_CUR_UUID, &mdev->flags); | ||
1301 | } else { | ||
1302 | drbd_uuid_new_current(mdev); | ||
1303 | drbd_send_uuids(mdev); | ||
1304 | } | ||
1305 | } | ||
1306 | put_ldev(mdev); | ||
1307 | } | ||
1308 | } | ||
1309 | |||
1310 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1311 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1312 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1313 | drbd_uuid_new_current(mdev); | ||
1314 | drbd_send_uuids(mdev); | ||
1315 | } | ||
1316 | /* Diskless peer becomes secondary */ | ||
1317 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1318 | /* We may still be Primary ourselves. | ||
1319 | * No harm done if the bitmap still changes, | ||
1320 | * redirtied pages will follow later. */ | ||
1321 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1322 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1323 | put_ldev(mdev); | ||
1324 | } | ||
1325 | |||
1326 | /* Write out all changed bits on demote. | ||
1327 | * Though, there is no need to do that just yet | ||
1328 | * if a resync is still going on */ | ||
1329 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1330 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1331 | /* No changes to the bitmap expected this time, so assert that, | ||
1332 | * even though no harm was done if it did change. */ | ||
1333 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1334 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1335 | put_ldev(mdev); | ||
1336 | } | ||
1337 | |||
1338 | /* Last part of the attaching process ... */ | ||
1339 | if (ns.conn >= C_CONNECTED && | ||
1340 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1341 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1342 | drbd_send_uuids(mdev); | ||
1343 | drbd_send_state(mdev, ns); | ||
1344 | } | ||
1345 | |||
1346 | /* We want to pause/continue resync, tell peer. */ | ||
1347 | if (ns.conn >= C_CONNECTED && | ||
1348 | ((os.aftr_isp != ns.aftr_isp) || | ||
1349 | (os.user_isp != ns.user_isp))) | ||
1350 | drbd_send_state(mdev, ns); | ||
1351 | |||
1352 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1353 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1354 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1355 | suspend_other_sg(mdev); | ||
1356 | |||
1357 | /* Make sure the peer gets informed about possible state | ||
1358 | changes (ISP bits) that happened while we were in WFReportParams. */ | ||
1359 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1360 | drbd_send_state(mdev, ns); | ||
1361 | |||
1362 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1363 | drbd_send_state(mdev, ns); | ||
1364 | |||
1365 | /* We are in the process of starting a full sync... */ | ||
1366 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1367 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1368 | /* no other bitmap changes expected during this phase */ | ||
1369 | drbd_queue_bitmap_io(mdev, | ||
1370 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1371 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1372 | |||
1373 | /* We are invalidating ourselves... */ | ||
1374 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1375 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1376 | /* other bitmap operation expected during this phase */ | ||
1377 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1378 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1379 | |||
1380 | /* first half of local IO error, failure to attach, | ||
1381 | * or administrative detach */ | ||
1382 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1383 | enum drbd_io_error_p eh = EP_PASS_ON; | ||
1384 | int was_io_error = 0; | ||
1385 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1386 | * our cleanup here with the transition to D_DISKLESS. | ||
1387 | * But it is still not safe to dereference ldev here, since | ||
1388 | * we might come from a failed Attach before ldev was set. */ | ||
1389 | if (mdev->ldev) { | ||
1390 | rcu_read_lock(); | ||
1391 | eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1392 | rcu_read_unlock(); | ||
1393 | |||
1394 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1395 | |||
1396 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1397 | drbd_khelper(mdev, "local-io-error"); | ||
1398 | |||
1399 | /* Immediately allow completion of all application IO, | ||
1400 | * that waits for completion from the local disk, | ||
1401 | * if this was a force-detach due to disk_timeout | ||
1402 | * or administrator request (drbdsetup detach --force). | ||
1403 | * Do NOT abort otherwise. | ||
1404 | * Aborting local requests may cause serious problems, | ||
1405 | * if requests are completed to upper layers already, | ||
1406 | * and then later the already submitted local bio completes. | ||
1407 | * This can cause DMA into former bio pages that meanwhile | ||
1408 | * have been re-used for other things. | ||
1409 | * So aborting local requests may cause crashes, | ||
1410 | * or even worse, silent data corruption. | ||
1411 | */ | ||
1412 | if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) | ||
1413 | tl_abort_disk_io(mdev); | ||
1414 | |||
1415 | /* current state still has to be D_FAILED, | ||
1416 | * there is only one way out: to D_DISKLESS, | ||
1417 | * and that may only happen after our put_ldev below. */ | ||
1418 | if (mdev->state.disk != D_FAILED) | ||
1419 | dev_err(DEV, | ||
1420 | "ASSERT FAILED: disk is %s during detach\n", | ||
1421 | drbd_disk_str(mdev->state.disk)); | ||
1422 | |||
1423 | if (ns.conn >= C_CONNECTED) | ||
1424 | drbd_send_state(mdev, ns); | ||
1425 | |||
1426 | drbd_rs_cancel_all(mdev); | ||
1427 | |||
1428 | /* In case we want to get something to stable storage still, | ||
1429 | * this may be the last chance. | ||
1430 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1431 | drbd_md_sync(mdev); | ||
1432 | } | ||
1433 | put_ldev(mdev); | ||
1434 | } | ||
1435 | |||
1436 | /* second half of local IO error, failure to attach, | ||
1437 | * or administrative detach, | ||
1438 | * after local_cnt references have reached zero again */ | ||
1439 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1440 | /* We must still be diskless, | ||
1441 | * re-attach has to be serialized with this! */ | ||
1442 | if (mdev->state.disk != D_DISKLESS) | ||
1443 | dev_err(DEV, | ||
1444 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1445 | drbd_disk_str(mdev->state.disk)); | ||
1446 | |||
1447 | if (ns.conn >= C_CONNECTED) | ||
1448 | drbd_send_state(mdev, ns); | ||
1449 | /* corresponding get_ldev in __drbd_set_state | ||
1450 | * this may finally trigger drbd_ldev_destroy. */ | ||
1451 | put_ldev(mdev); | ||
1452 | } | ||
1453 | |||
1454 | /* Notify peer that I had a local IO error and did not detach. */ | ||
1455 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1456 | drbd_send_state(mdev, ns); | ||
1457 | |||
1458 | /* Disks got bigger while they were detached */ | ||
1459 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1460 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1461 | if (ns.conn == C_CONNECTED) | ||
1462 | resync_after_online_grow(mdev); | ||
1463 | } | ||
1464 | |||
1465 | /* A resync finished or aborted, wake paused devices... */ | ||
1466 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1467 | (os.peer_isp && !ns.peer_isp) || | ||
1468 | (os.user_isp && !ns.user_isp)) | ||
1469 | resume_next_sg(mdev); | ||
1470 | |||
1471 | /* sync target done with resync. Explicitly notify peer, even though | ||
1472 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1473 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1474 | drbd_send_state(mdev, ns); | ||
1475 | |||
1476 | /* Verify finished, or reached stop sector. Peer did not know about | ||
1477 | * the stop sector, and we may even have changed the stop sector during | ||
1478 | * verify to interrupt/stop early. Send the new state. */ | ||
1479 | if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED | ||
1480 | && verify_can_do_stop_sector(mdev)) | ||
1481 | drbd_send_state(mdev, ns); | ||
1482 | |||
1483 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1484 | * if the resync finished cleanly, or aborted because of peer disk | ||
1485 | * failure, or because of connection loss. | ||
1486 | * For resync aborted because of local disk failure, we cannot do | ||
1487 | * any bitmap writeout anymore. | ||
1488 | * No harm done if some bits change during this phase. | ||
1489 | */ | ||
1490 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1491 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1492 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1493 | put_ldev(mdev); | ||
1494 | } | ||
1495 | |||
1496 | if (ns.disk == D_DISKLESS && | ||
1497 | ns.conn == C_STANDALONE && | ||
1498 | ns.role == R_SECONDARY) { | ||
1499 | if (os.aftr_isp != ns.aftr_isp) | ||
1500 | resume_next_sg(mdev); | ||
1501 | } | ||
1502 | |||
1503 | drbd_md_sync(mdev); | ||
1504 | } | ||
1505 | |||
1506 | struct after_conn_state_chg_work { | ||
1507 | struct drbd_work w; | ||
1508 | enum drbd_conns oc; | ||
1509 | union drbd_state ns_min; | ||
1510 | union drbd_state ns_max; /* new, max state, over all mdevs */ | ||
1511 | enum chg_state_flags flags; | ||
1512 | }; | ||
1513 | |||
1514 | static int w_after_conn_state_ch(struct drbd_work *w, int unused) | ||
1515 | { | ||
1516 | struct after_conn_state_chg_work *acscw = | ||
1517 | container_of(w, struct after_conn_state_chg_work, w); | ||
1518 | struct drbd_tconn *tconn = w->tconn; | ||
1519 | enum drbd_conns oc = acscw->oc; | ||
1520 | union drbd_state ns_max = acscw->ns_max; | ||
1521 | struct drbd_conf *mdev; | ||
1522 | int vnr; | ||
1523 | |||
1524 | kfree(acscw); | ||
1525 | |||
1526 | /* Upon network configuration, we need to start the receiver */ | ||
1527 | if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) | ||
1528 | drbd_thread_start(&tconn->receiver); | ||
1529 | |||
1530 | if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { | ||
1531 | struct net_conf *old_conf; | ||
1532 | |||
1533 | mutex_lock(&tconn->conf_update); | ||
1534 | old_conf = tconn->net_conf; | ||
1535 | tconn->my_addr_len = 0; | ||
1536 | tconn->peer_addr_len = 0; | ||
1537 | rcu_assign_pointer(tconn->net_conf, NULL); | ||
1538 | conn_free_crypto(tconn); | ||
1539 | mutex_unlock(&tconn->conf_update); | ||
1540 | |||
1541 | synchronize_rcu(); | ||
1542 | kfree(old_conf); | ||
1543 | } | ||
1544 | |||
1545 | if (ns_max.susp_fen) { | ||
1546 | /* case1: The outdate peer handler is successful: */ | ||
1547 | if (ns_max.pdsk <= D_OUTDATED) { | ||
1548 | rcu_read_lock(); | ||
1549 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1550 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
1551 | drbd_uuid_new_current(mdev); | ||
1552 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1553 | } | ||
1554 | } | ||
1555 | rcu_read_unlock(); | ||
1556 | spin_lock_irq(&tconn->req_lock); | ||
1557 | _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); | ||
1558 | _conn_request_state(tconn, | ||
1559 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1560 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1561 | CS_VERBOSE); | ||
1562 | spin_unlock_irq(&tconn->req_lock); | ||
1563 | } | ||
1564 | } | ||
1565 | kref_put(&tconn->kref, &conn_destroy); | ||
1566 | |||
1567 | conn_md_sync(tconn); | ||
1568 | |||
1569 | return 0; | ||
1570 | } | ||
1571 | |||
1572 | void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf) | ||
1573 | { | ||
1574 | enum chg_state_flags flags = ~0; | ||
1575 | struct drbd_conf *mdev; | ||
1576 | int vnr, first_vol = 1; | ||
1577 | union drbd_dev_state os, cs = { | ||
1578 | { .role = R_SECONDARY, | ||
1579 | .peer = R_UNKNOWN, | ||
1580 | .conn = tconn->cstate, | ||
1581 | .disk = D_DISKLESS, | ||
1582 | .pdsk = D_UNKNOWN, | ||
1583 | } }; | ||
1584 | |||
1585 | rcu_read_lock(); | ||
1586 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1587 | os = mdev->state; | ||
1588 | |||
1589 | if (first_vol) { | ||
1590 | cs = os; | ||
1591 | first_vol = 0; | ||
1592 | continue; | ||
1593 | } | ||
1594 | |||
1595 | if (cs.role != os.role) | ||
1596 | flags &= ~CS_DC_ROLE; | ||
1597 | |||
1598 | if (cs.peer != os.peer) | ||
1599 | flags &= ~CS_DC_PEER; | ||
1600 | |||
1601 | if (cs.conn != os.conn) | ||
1602 | flags &= ~CS_DC_CONN; | ||
1603 | |||
1604 | if (cs.disk != os.disk) | ||
1605 | flags &= ~CS_DC_DISK; | ||
1606 | |||
1607 | if (cs.pdsk != os.pdsk) | ||
1608 | flags &= ~CS_DC_PDSK; | ||
1609 | } | ||
1610 | rcu_read_unlock(); | ||
1611 | |||
1612 | *pf |= CS_DC_MASK; | ||
1613 | *pf &= flags; | ||
1614 | (*pcs).i = cs.i; | ||
1615 | } | ||
1616 | |||
1617 | static enum drbd_state_rv | ||
1618 | conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1619 | enum chg_state_flags flags) | ||
1620 | { | ||
1621 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1622 | union drbd_state ns, os; | ||
1623 | struct drbd_conf *mdev; | ||
1624 | int vnr; | ||
1625 | |||
1626 | rcu_read_lock(); | ||
1627 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1628 | os = drbd_read_state(mdev); | ||
1629 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
1630 | |||
1631 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1632 | ns.disk = os.disk; | ||
1633 | |||
1634 | if (ns.i == os.i) | ||
1635 | continue; | ||
1636 | |||
1637 | rv = is_valid_transition(os, ns); | ||
1638 | if (rv < SS_SUCCESS) | ||
1639 | break; | ||
1640 | |||
1641 | if (!(flags & CS_HARD)) { | ||
1642 | rv = is_valid_state(mdev, ns); | ||
1643 | if (rv < SS_SUCCESS) { | ||
1644 | if (is_valid_state(mdev, os) == rv) | ||
1645 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1646 | } else | ||
1647 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1648 | } | ||
1649 | if (rv < SS_SUCCESS) | ||
1650 | break; | ||
1651 | } | ||
1652 | rcu_read_unlock(); | ||
1653 | |||
1654 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) | ||
1655 | print_st_err(mdev, os, ns, rv); | ||
1656 | |||
1657 | return rv; | ||
1658 | } | ||
1659 | |||
1660 | void | ||
1661 | conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1662 | union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) | ||
1663 | { | ||
1664 | union drbd_state ns, os, ns_max = { }; | ||
1665 | union drbd_state ns_min = { | ||
1666 | { .role = R_MASK, | ||
1667 | .peer = R_MASK, | ||
1668 | .conn = val.conn, | ||
1669 | .disk = D_MASK, | ||
1670 | .pdsk = D_MASK | ||
1671 | } }; | ||
1672 | struct drbd_conf *mdev; | ||
1673 | enum drbd_state_rv rv; | ||
1674 | int vnr, number_of_volumes = 0; | ||
1675 | |||
1676 | if (mask.conn == C_MASK) { | ||
1677 | /* remember last connect time so request_timer_fn() won't | ||
1678 | * kill newly established sessions while we are still trying to thaw | ||
1679 | * previously frozen IO */ | ||
1680 | if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) | ||
1681 | tconn->last_reconnect_jif = jiffies; | ||
1682 | |||
1683 | tconn->cstate = val.conn; | ||
1684 | } | ||
1685 | |||
1686 | rcu_read_lock(); | ||
1687 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1688 | number_of_volumes++; | ||
1689 | os = drbd_read_state(mdev); | ||
1690 | ns = apply_mask_val(os, mask, val); | ||
1691 | ns = sanitize_state(mdev, ns, NULL); | ||
1692 | |||
1693 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1694 | ns.disk = os.disk; | ||
1695 | |||
1696 | rv = __drbd_set_state(mdev, ns, flags, NULL); | ||
1697 | if (rv < SS_SUCCESS) | ||
1698 | BUG(); | ||
1699 | |||
1700 | ns.i = mdev->state.i; | ||
1701 | ns_max.role = max_role(ns.role, ns_max.role); | ||
1702 | ns_max.peer = max_role(ns.peer, ns_max.peer); | ||
1703 | ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); | ||
1704 | ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); | ||
1705 | ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); | ||
1706 | |||
1707 | ns_min.role = min_role(ns.role, ns_min.role); | ||
1708 | ns_min.peer = min_role(ns.peer, ns_min.peer); | ||
1709 | ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); | ||
1710 | ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); | ||
1711 | ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); | ||
1712 | } | ||
1713 | rcu_read_unlock(); | ||
1714 | |||
1715 | if (number_of_volumes == 0) { | ||
1716 | ns_min = ns_max = (union drbd_state) { { | ||
1717 | .role = R_SECONDARY, | ||
1718 | .peer = R_UNKNOWN, | ||
1719 | .conn = val.conn, | ||
1720 | .disk = D_DISKLESS, | ||
1721 | .pdsk = D_UNKNOWN | ||
1722 | } }; | ||
1723 | } | ||
1724 | |||
1725 | ns_min.susp = ns_max.susp = tconn->susp; | ||
1726 | ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod; | ||
1727 | ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen; | ||
1728 | |||
1729 | *pns_min = ns_min; | ||
1730 | *pns_max = ns_max; | ||
1731 | } | ||
1732 | |||
1733 | static enum drbd_state_rv | ||
1734 | _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) | ||
1735 | { | ||
1736 | enum drbd_state_rv rv; | ||
1737 | |||
1738 | if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags)) | ||
1739 | return SS_CW_SUCCESS; | ||
1740 | |||
1741 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) | ||
1742 | return SS_CW_FAILED_BY_PEER; | ||
1743 | |||
1744 | rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; | ||
1745 | |||
1746 | if (rv == SS_UNKNOWN_ERROR) | ||
1747 | rv = conn_is_valid_transition(tconn, mask, val, 0); | ||
1748 | |||
1749 | if (rv == SS_SUCCESS) | ||
1750 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
1751 | |||
1752 | return rv; | ||
1753 | } | ||
1754 | |||
1755 | enum drbd_state_rv | ||
1756 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1757 | enum chg_state_flags flags) | ||
1758 | { | ||
1759 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1760 | struct after_conn_state_chg_work *acscw; | ||
1761 | enum drbd_conns oc = tconn->cstate; | ||
1762 | union drbd_state ns_max, ns_min, os; | ||
1763 | bool have_mutex = false; | ||
1764 | |||
1765 | if (mask.conn) { | ||
1766 | rv = is_valid_conn_transition(oc, val.conn); | ||
1767 | if (rv < SS_SUCCESS) | ||
1768 | goto abort; | ||
1769 | } | ||
1770 | |||
1771 | rv = conn_is_valid_transition(tconn, mask, val, flags); | ||
1772 | if (rv < SS_SUCCESS) | ||
1773 | goto abort; | ||
1774 | |||
1775 | if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && | ||
1776 | !(flags & (CS_LOCAL_ONLY | CS_HARD))) { | ||
1777 | |||
1778 | /* This will be a cluster-wide state change. | ||
1779 | * Need to give up the spinlock, grab the mutex, | ||
1780 | * then send the state change request, ... */ | ||
1781 | spin_unlock_irq(&tconn->req_lock); | ||
1782 | mutex_lock(&tconn->cstate_mutex); | ||
1783 | have_mutex = true; | ||
1784 | |||
1785 | set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1786 | if (conn_send_state_req(tconn, mask, val)) { | ||
1787 | /* sending failed. */ | ||
1788 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1789 | rv = SS_CW_FAILED_BY_PEER; | ||
1790 | /* need to re-acquire the spinlock, though */ | ||
1791 | goto abort_unlocked; | ||
1792 | } | ||
1793 | |||
1794 | if (val.conn == C_DISCONNECTING) | ||
1795 | set_bit(DISCONNECT_SENT, &tconn->flags); | ||
1796 | |||
1797 | /* ... and re-acquire the spinlock. | ||
1798 | * If _conn_rq_cond() returned >= SS_SUCCESS, we must call | ||
1799 | * conn_set_state() within the same spinlock. */ | ||
1800 | spin_lock_irq(&tconn->req_lock); | ||
1801 | wait_event_lock_irq(tconn->ping_wait, | ||
1802 | (rv = _conn_rq_cond(tconn, mask, val)), | ||
1803 | tconn->req_lock); | ||
1804 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1805 | if (rv < SS_SUCCESS) | ||
1806 | goto abort; | ||
1807 | } | ||
1808 | |||
1809 | conn_old_common_state(tconn, &os, &flags); | ||
1810 | flags |= CS_DC_SUSP; | ||
1811 | conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags); | ||
1812 | conn_pr_state_change(tconn, os, ns_max, flags); | ||
1813 | |||
1814 | acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); | ||
1815 | if (acscw) { | ||
1816 | acscw->oc = os.conn; | ||
1817 | acscw->ns_min = ns_min; | ||
1818 | acscw->ns_max = ns_max; | ||
1819 | acscw->flags = flags; | ||
1820 | acscw->w.cb = w_after_conn_state_ch; | ||
1821 | kref_get(&tconn->kref); | ||
1822 | acscw->w.tconn = tconn; | ||
1823 | drbd_queue_work(&tconn->sender_work, &acscw->w); | ||
1824 | } else { | ||
1825 | conn_err(tconn, "Could not kmalloc an acscw\n"); | ||
1826 | } | ||
1827 | |||
1828 | abort: | ||
1829 | if (have_mutex) { | ||
1830 | /* mutex_unlock() "... must not be used in interrupt context.", | ||
1831 | * so give up the spinlock, then re-acquire it */ | ||
1832 | spin_unlock_irq(&tconn->req_lock); | ||
1833 | abort_unlocked: | ||
1834 | mutex_unlock(&tconn->cstate_mutex); | ||
1835 | spin_lock_irq(&tconn->req_lock); | ||
1836 | } | ||
1837 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) { | ||
1838 | conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv)); | ||
1839 | conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i); | ||
1840 | conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); | ||
1841 | } | ||
1842 | return rv; | ||
1843 | } | ||
1844 | |||
1845 | enum drbd_state_rv | ||
1846 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1847 | enum chg_state_flags flags) | ||
1848 | { | ||
1849 | enum drbd_state_rv rv; | ||
1850 | |||
1851 | spin_lock_irq(&tconn->req_lock); | ||
1852 | rv = _conn_request_state(tconn, mask, val, flags); | ||
1853 | spin_unlock_irq(&tconn->req_lock); | ||
1854 | |||
1855 | return rv; | ||
1856 | } | ||
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000000..a3c361bbc4b6 --- /dev/null +++ b/drivers/block/drbd/drbd_state.h | |||
@@ -0,0 +1,161 @@ | |||
1 | #ifndef DRBD_STATE_H | ||
2 | #define DRBD_STATE_H | ||
3 | |||
4 | struct drbd_conf; | ||
5 | struct drbd_tconn; | ||
6 | |||
7 | /** | ||
8 | * DOC: DRBD State macros | ||
9 | * | ||
10 | * These macros are used to express state changes in easily readable form. | ||
11 | * | ||
12 | * The NS macros expand to a mask and a value that can be bit-ored onto the | ||
13 | * current state once the spinlock (req_lock) has been taken. | ||
14 | * | ||
15 | * The _NS macros are used for state functions that get called with the | ||
16 | * spinlock held. These macros expand directly to the new state value. | ||
17 | * | ||
18 | * Besides the basic forms NS() and _NS(), additional _?NS[23] variants are defined | ||
19 | * to express state changes that affect more than one aspect of the state. | ||
20 | * | ||
21 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
22 | * means that the network connection is established and that the peer | ||
23 | * is in the secondary role. | ||
24 | */ | ||
25 | #define role_MASK R_MASK | ||
26 | #define peer_MASK R_MASK | ||
27 | #define disk_MASK D_MASK | ||
28 | #define pdsk_MASK D_MASK | ||
29 | #define conn_MASK C_MASK | ||
30 | #define susp_MASK 1 | ||
31 | #define user_isp_MASK 1 | ||
32 | #define aftr_isp_MASK 1 | ||
33 | #define susp_nod_MASK 1 | ||
34 | #define susp_fen_MASK 1 | ||
35 | |||
36 | #define NS(T, S) \ | ||
37 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
38 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
39 | #define NS2(T1, S1, T2, S2) \ | ||
40 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
41 | mask.T2 = T2##_MASK; mask; }), \ | ||
42 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
43 | val.T2 = (S2); val; }) | ||
44 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
45 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
46 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
47 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
48 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
49 | |||
50 | #define _NS(D, T, S) \ | ||
51 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) | ||
52 | #define _NS2(D, T1, S1, T2, S2) \ | ||
53 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
54 | __ns.T2 = (S2); __ns; }) | ||
55 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
56 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
57 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
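/* A usage sketch: because NS(), NS2() and NS3() expand to a (mask, value)
 * argument pair, a state change request reads, for example, like
 *
 *	drbd_request_state(mdev, NS(disk, D_OUTDATED));
 *	drbd_request_state(mdev, NS2(conn, C_CONNECTED, peer, R_SECONDARY));
 *
 * The real call sites live in other DRBD files and are not shown in this
 * patch; the lines above only illustrate how the macros plug into the
 * request-state functions declared further down in this header.
 */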
58 | |||
59 | enum chg_state_flags { | ||
60 | CS_HARD = 1 << 0, | ||
61 | CS_VERBOSE = 1 << 1, | ||
62 | CS_WAIT_COMPLETE = 1 << 2, | ||
63 | CS_SERIALIZE = 1 << 3, | ||
64 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
65 | CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ | ||
66 | CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ | ||
67 | CS_DC_PEER = 1 << 6, | ||
68 | CS_DC_CONN = 1 << 7, | ||
69 | CS_DC_DISK = 1 << 8, | ||
70 | CS_DC_PDSK = 1 << 9, | ||
71 | CS_DC_SUSP = 1 << 10, | ||
72 | CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, | ||
73 | CS_IGN_OUTD_FAIL = 1 << 11, | ||
74 | }; | ||
75 | |||
76 | /* drbd_dev_state and drbd_state are different types. This is to stress the | ||
77 | small difference. There is no suspended flag (.susp), and no suspended- | ||
78 | while-fence-handler-runs flag (.susp_fen). */ | ||
79 | union drbd_dev_state { | ||
80 | struct { | ||
81 | #if defined(__LITTLE_ENDIAN_BITFIELD) | ||
82 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
83 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
84 | unsigned conn:5 ; /* 17/32 cstates */ | ||
85 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
86 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
87 | unsigned _unused:1 ; | ||
88 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
89 | unsigned peer_isp:1 ; | ||
90 | unsigned user_isp:1 ; | ||
91 | unsigned _pad:11; /* 0 unused */ | ||
92 | #elif defined(__BIG_ENDIAN_BITFIELD) | ||
93 | unsigned _pad:11; | ||
94 | unsigned user_isp:1 ; | ||
95 | unsigned peer_isp:1 ; | ||
96 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
97 | unsigned _unused:1 ; | ||
98 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
99 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
100 | unsigned conn:5 ; /* 17/32 cstates */ | ||
101 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
102 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
103 | #else | ||
104 | # error "this endianness is not supported" | ||
105 | #endif | ||
106 | }; | ||
107 | unsigned int i; | ||
108 | }; | ||
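/* Note: the 'i' member overlays the whole bitfield word (union drbd_state
 * uses the same trick), which is why state values elsewhere in this patch
 * are copied and compared as plain integers, e.g.
 * "if (ns.i == os.i) return SS_NOTHING_TO_DO;" in __drbd_set_state()
 * and "(*pcs).i = cs.i;" in conn_old_common_state().
 */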
109 | |||
110 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
111 | enum chg_state_flags f, | ||
112 | union drbd_state mask, | ||
113 | union drbd_state val); | ||
114 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
115 | union drbd_state); | ||
116 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
117 | union drbd_state, | ||
118 | union drbd_state, | ||
119 | enum chg_state_flags); | ||
120 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
121 | enum chg_state_flags, | ||
122 | struct completion *done); | ||
123 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
124 | union drbd_state, int); | ||
125 | |||
126 | enum drbd_state_rv | ||
127 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
128 | enum chg_state_flags flags); | ||
129 | |||
130 | enum drbd_state_rv | ||
131 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
132 | enum chg_state_flags flags); | ||
133 | |||
134 | extern void drbd_resume_al(struct drbd_conf *mdev); | ||
135 | extern bool conn_all_vols_unconf(struct drbd_tconn *tconn); | ||
136 | |||
137 | /** | ||
138 | * drbd_request_state() - Request a state change | ||
139 | * @mdev: DRBD device. | ||
140 | * @mask: mask of state bits to change. | ||
141 | * @val: value of new state bits. | ||
142 | * | ||
143 | * This is the most graceful way of requesting a state change. It is quite | ||
144 | * verbose in case the state change is not possible, and all those | ||
145 | * state changes are globally serialized. | ||
146 | */ | ||
147 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
148 | union drbd_state mask, | ||
149 | union drbd_state val) | ||
150 | { | ||
151 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
152 | } | ||
153 | |||
154 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn); | ||
155 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn); | ||
156 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn); | ||
157 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn); | ||
158 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn); | ||
159 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn); | ||
160 | |||
161 | #endif | ||
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index c44a2a602772..9a664bd27404 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { | |||
89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | 89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", |
90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | 90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", |
91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | 91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", |
92 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", | ||
92 | }; | 93 | }; |
93 | 94 | ||
94 | const char *drbd_conn_str(enum drbd_conns s) | 95 | const char *drbd_conn_str(enum drbd_conns s) |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 6bce2cc179d4..424dc7bdf9b7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -38,16 +38,13 @@ | |||
38 | #include "drbd_int.h" | 38 | #include "drbd_int.h" |
39 | #include "drbd_req.h" | 39 | #include "drbd_req.h" |
40 | 40 | ||
41 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); | 41 | static int w_make_ov_request(struct drbd_work *w, int cancel); |
42 | static int w_make_resync_request(struct drbd_conf *mdev, | ||
43 | struct drbd_work *w, int cancel); | ||
44 | |||
45 | 42 | ||
46 | 43 | ||
47 | /* endio handlers: | 44 | /* endio handlers: |
48 | * drbd_md_io_complete (defined here) | 45 | * drbd_md_io_complete (defined here) |
49 | * drbd_endio_pri (defined here) | 46 | * drbd_request_endio (defined here) |
50 | * drbd_endio_sec (defined here) | 47 | * drbd_peer_request_endio (defined here) |
51 | * bm_async_io_complete (defined in drbd_bitmap.c) | 48 | * bm_async_io_complete (defined in drbd_bitmap.c) |
52 | * | 49 | * |
53 | * For all these callbacks, note the following: | 50 | * For all these callbacks, note the following: |
@@ -60,7 +57,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
60 | 57 | ||
61 | /* About the global_state_lock | 58 | /* About the global_state_lock |
62 | Each state transition on a device holds a read lock. In case we have | 59 | Each state transition on a device holds a read lock. In case we have |
63 | to evaluate the sync after dependencies, we grab a write lock, because | 60 | to evaluate the resync after dependencies, we grab a write lock, because |
64 | we need stable states on all devices for that. */ | 61 | we need stable states on all devices for that. */ |
65 | rwlock_t global_state_lock; | 62 | rwlock_t global_state_lock; |
66 | 63 | ||
@@ -98,97 +95,93 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
98 | /* reads on behalf of the partner, | 95 | /* reads on behalf of the partner, |
99 | * "submitted" by the receiver | 96 | * "submitted" by the receiver |
100 | */ | 97 | */ |
101 | void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) | 98 | void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
102 | { | 99 | { |
103 | unsigned long flags = 0; | 100 | unsigned long flags = 0; |
104 | struct drbd_conf *mdev = e->mdev; | 101 | struct drbd_conf *mdev = peer_req->w.mdev; |
105 | |||
106 | D_ASSERT(e->block_id != ID_VACANT); | ||
107 | 102 | ||
108 | spin_lock_irqsave(&mdev->req_lock, flags); | 103 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
109 | mdev->read_cnt += e->size >> 9; | 104 | mdev->read_cnt += peer_req->i.size >> 9; |
110 | list_del(&e->w.list); | 105 | list_del(&peer_req->w.list); |
111 | if (list_empty(&mdev->read_ee)) | 106 | if (list_empty(&mdev->read_ee)) |
112 | wake_up(&mdev->ee_wait); | 107 | wake_up(&mdev->ee_wait); |
113 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 108 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
114 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | 109 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
115 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 110 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
116 | 111 | ||
117 | drbd_queue_work(&mdev->data.work, &e->w); | 112 | drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w); |
118 | put_ldev(mdev); | 113 | put_ldev(mdev); |
119 | } | 114 | } |
120 | 115 | ||
121 | /* writes on behalf of the partner, or resync writes, | 116 | /* writes on behalf of the partner, or resync writes, |
122 | * "submitted" by the receiver, final stage. */ | 117 | * "submitted" by the receiver, final stage. */ |
123 | static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) | 118 | static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
124 | { | 119 | { |
125 | unsigned long flags = 0; | 120 | unsigned long flags = 0; |
126 | struct drbd_conf *mdev = e->mdev; | 121 | struct drbd_conf *mdev = peer_req->w.mdev; |
127 | sector_t e_sector; | 122 | struct drbd_interval i; |
128 | int do_wake; | 123 | int do_wake; |
129 | int is_syncer_req; | 124 | u64 block_id; |
130 | int do_al_complete_io; | 125 | int do_al_complete_io; |
131 | 126 | ||
132 | D_ASSERT(e->block_id != ID_VACANT); | 127 | /* after we moved peer_req to done_ee, |
133 | |||
134 | /* after we moved e to done_ee, | ||
135 | * we may no longer access it, | 128 | * we may no longer access it, |
136 | * it may be freed/reused already! | 129 | * it may be freed/reused already! |
137 | * (as soon as we release the req_lock) */ | 130 | * (as soon as we release the req_lock) */ |
138 | e_sector = e->sector; | 131 | i = peer_req->i; |
139 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | 132 | do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; |
140 | is_syncer_req = is_syncer_block_id(e->block_id); | 133 | block_id = peer_req->block_id; |
141 | 134 | ||
142 | spin_lock_irqsave(&mdev->req_lock, flags); | 135 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
143 | mdev->writ_cnt += e->size >> 9; | 136 | mdev->writ_cnt += peer_req->i.size >> 9; |
144 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | 137 | list_move_tail(&peer_req->w.list, &mdev->done_ee); |
145 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
146 | 138 | ||
147 | /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, | 139 | /* |
148 | * neither did we wake possibly waiting conflicting requests. | 140 | * Do not remove from the write_requests tree here: we did not send the |
149 | * done from "drbd_process_done_ee" within the appropriate w.cb | 141 | * Ack yet and did not wake possibly waiting conflicting requests. |
150 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | 142 | * Removed from the tree from "drbd_process_done_ee" within the |
143 | * appropriate w.cb (e_end_block/e_end_resync_block) or from | ||
144 | * _drbd_clear_done_ee. | ||
145 | */ | ||
151 | 146 | ||
152 | do_wake = is_syncer_req | 147 | do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); |
153 | ? list_empty(&mdev->sync_ee) | ||
154 | : list_empty(&mdev->active_ee); | ||
155 | 148 | ||
156 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 149 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
157 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | 150 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
158 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 151 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
159 | 152 | ||
160 | if (is_syncer_req) | 153 | if (block_id == ID_SYNCER) |
161 | drbd_rs_complete_io(mdev, e_sector); | 154 | drbd_rs_complete_io(mdev, i.sector); |
162 | 155 | ||
163 | if (do_wake) | 156 | if (do_wake) |
164 | wake_up(&mdev->ee_wait); | 157 | wake_up(&mdev->ee_wait); |
165 | 158 | ||
166 | if (do_al_complete_io) | 159 | if (do_al_complete_io) |
167 | drbd_al_complete_io(mdev, e_sector); | 160 | drbd_al_complete_io(mdev, &i); |
168 | 161 | ||
169 | wake_asender(mdev); | 162 | wake_asender(mdev->tconn); |
170 | put_ldev(mdev); | 163 | put_ldev(mdev); |
171 | } | 164 | } |
172 | 165 | ||
173 | /* writes on behalf of the partner, or resync writes, | 166 | /* writes on behalf of the partner, or resync writes, |
174 | * "submitted" by the receiver. | 167 | * "submitted" by the receiver. |
175 | */ | 168 | */ |
176 | void drbd_endio_sec(struct bio *bio, int error) | 169 | void drbd_peer_request_endio(struct bio *bio, int error) |
177 | { | 170 | { |
178 | struct drbd_epoch_entry *e = bio->bi_private; | 171 | struct drbd_peer_request *peer_req = bio->bi_private; |
179 | struct drbd_conf *mdev = e->mdev; | 172 | struct drbd_conf *mdev = peer_req->w.mdev; |
180 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 173 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
181 | int is_write = bio_data_dir(bio) == WRITE; | 174 | int is_write = bio_data_dir(bio) == WRITE; |
182 | 175 | ||
183 | if (error && __ratelimit(&drbd_ratelimit_state)) | 176 | if (error && __ratelimit(&drbd_ratelimit_state)) |
184 | dev_warn(DEV, "%s: error=%d s=%llus\n", | 177 | dev_warn(DEV, "%s: error=%d s=%llus\n", |
185 | is_write ? "write" : "read", error, | 178 | is_write ? "write" : "read", error, |
186 | (unsigned long long)e->sector); | 179 | (unsigned long long)peer_req->i.sector); |
187 | if (!error && !uptodate) { | 180 | if (!error && !uptodate) { |
188 | if (__ratelimit(&drbd_ratelimit_state)) | 181 | if (__ratelimit(&drbd_ratelimit_state)) |
189 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", | 182 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", |
190 | is_write ? "write" : "read", | 183 | is_write ? "write" : "read", |
191 | (unsigned long long)e->sector); | 184 | (unsigned long long)peer_req->i.sector); |
192 | /* strange behavior of some lower level drivers... | 185 | /* strange behavior of some lower level drivers... |
193 | * fail the request by clearing the uptodate flag, | 186 | * fail the request by clearing the uptodate flag, |
194 | * but do not return any error?! */ | 187 | * but do not return any error?! */ |
@@ -196,24 +189,24 @@ void drbd_endio_sec(struct bio *bio, int error) | |||
196 | } | 189 | } |
197 | 190 | ||
198 | if (error) | 191 | if (error) |
199 | set_bit(__EE_WAS_ERROR, &e->flags); | 192 | set_bit(__EE_WAS_ERROR, &peer_req->flags); |
200 | 193 | ||
201 | bio_put(bio); /* no need for the bio anymore */ | 194 | bio_put(bio); /* no need for the bio anymore */ |
202 | if (atomic_dec_and_test(&e->pending_bios)) { | 195 | if (atomic_dec_and_test(&peer_req->pending_bios)) { |
203 | if (is_write) | 196 | if (is_write) |
204 | drbd_endio_write_sec_final(e); | 197 | drbd_endio_write_sec_final(peer_req); |
205 | else | 198 | else |
206 | drbd_endio_read_sec_final(e); | 199 | drbd_endio_read_sec_final(peer_req); |
207 | } | 200 | } |
208 | } | 201 | } |
209 | 202 | ||
210 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | 203 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request |
211 | */ | 204 | */ |
212 | void drbd_endio_pri(struct bio *bio, int error) | 205 | void drbd_request_endio(struct bio *bio, int error) |
213 | { | 206 | { |
214 | unsigned long flags; | 207 | unsigned long flags; |
215 | struct drbd_request *req = bio->bi_private; | 208 | struct drbd_request *req = bio->bi_private; |
216 | struct drbd_conf *mdev = req->mdev; | 209 | struct drbd_conf *mdev = req->w.mdev; |
217 | struct bio_and_error m; | 210 | struct bio_and_error m; |
218 | enum drbd_req_event what; | 211 | enum drbd_req_event what; |
219 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 212 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
@@ -227,53 +220,72 @@ void drbd_endio_pri(struct bio *bio, int error) | |||
227 | error = -EIO; | 220 | error = -EIO; |
228 | } | 221 | } |
229 | 222 | ||
223 | |||
224 | /* If this request was aborted locally before, | ||
225 | * but now was completed "successfully", | ||
226 | * chances are that this caused arbitrary data corruption. | ||
227 | * | ||
228 | * "aborting" requests, or force-detaching the disk, is intended for | ||
229 | * completely blocked/hung local backing devices which no longer | ||
230 | * complete requests at all, not even error completions. In this | ||
231 | * situation, usually a hard-reset and failover is the only way out. | ||
232 | * | ||
233 | * By "aborting", basically faking a local error-completion, | ||
234 | * we allow for a more graceful switchover by cleanly migrating services. | ||
235 | * Still the affected node has to be rebooted "soon". | ||
236 | * | ||
237 | * By completing these requests, we allow the upper layers to re-use | ||
238 | * the associated data pages. | ||
239 | * | ||
240 | * If later the local backing device "recovers", and now DMAs some data | ||
241 | * from disk into the original request pages, in the best case it will | ||
242 | * just put random data into unused pages; but typically it will corrupt | ||
243 | * meanwhile completely unrelated data, causing all sorts of damage. | ||
244 | * | ||
245 | * Which means delayed successful completion, | ||
246 | * especially for READ requests, | ||
247 | * is a reason to panic(). | ||
248 | * | ||
249 | * We assume that a delayed *error* completion is OK, | ||
250 | * though we still will complain noisily about it. | ||
251 | */ | ||
252 | if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { | ||
253 | if (__ratelimit(&drbd_ratelimit_state)) | ||
254 | dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); | ||
255 | |||
256 | if (!error) | ||
257 | panic("possible random memory corruption caused by delayed completion of aborted local request\n"); | ||
258 | } | ||
259 | |||
230 | /* to avoid recursion in __req_mod */ | 260 | /* to avoid recursion in __req_mod */ |
231 | if (unlikely(error)) { | 261 | if (unlikely(error)) { |
232 | what = (bio_data_dir(bio) == WRITE) | 262 | what = (bio_data_dir(bio) == WRITE) |
233 | ? write_completed_with_error | 263 | ? WRITE_COMPLETED_WITH_ERROR |
234 | : (bio_rw(bio) == READ) | 264 | : (bio_rw(bio) == READ) |
235 | ? read_completed_with_error | 265 | ? READ_COMPLETED_WITH_ERROR |
236 | : read_ahead_completed_with_error; | 266 | : READ_AHEAD_COMPLETED_WITH_ERROR; |
237 | } else | 267 | } else |
238 | what = completed_ok; | 268 | what = COMPLETED_OK; |
239 | 269 | ||
240 | bio_put(req->private_bio); | 270 | bio_put(req->private_bio); |
241 | req->private_bio = ERR_PTR(error); | 271 | req->private_bio = ERR_PTR(error); |
242 | 272 | ||
243 | /* not req_mod(), we need irqsave here! */ | 273 | /* not req_mod(), we need irqsave here! */ |
244 | spin_lock_irqsave(&mdev->req_lock, flags); | 274 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
245 | __req_mod(req, what, &m); | 275 | __req_mod(req, what, &m); |
246 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 276 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
247 | put_ldev(mdev); | 277 | put_ldev(mdev); |
248 | 278 | ||
249 | if (m.bio) | 279 | if (m.bio) |
250 | complete_master_bio(mdev, &m); | 280 | complete_master_bio(mdev, &m); |
251 | } | 281 | } |
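The comment block and guard added above amount to a small policy for completions that arrive after the request was already aborted locally: a late error completion is tolerated with a loud warning, while a late "successful" completion is treated as fatal, because the request's pages may already have been handed back to upper layers and reused. A minimal stand-alone sketch of that policy (plain user-space C; names are illustrative, not the driver's):

#include <stdio.h>
#include <stdlib.h>

/* Simplified restatement of the check in drbd_request_endio():
 * tolerate a late error completion of an aborted request,
 * but treat a late "successful" one as unrecoverable. */
static void late_completion_policy(int was_aborted, int error)
{
	if (!was_aborted)
		return;				/* normal completion path */

	fprintf(stderr, "delayed completion of aborted local request\n");
	if (!error) {
		/* the request's pages may already be reused elsewhere */
		fprintf(stderr, "possible random memory corruption\n");
		abort();			/* the driver calls panic() here */
	}
}

int main(void)
{
	late_completion_policy(1, -5);		/* late error completion: warn only */
	late_completion_policy(0, 0);		/* normal completion: nothing to do */
	return 0;
}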
252 | 282 | ||
253 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 283 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, |
254 | { | 284 | struct drbd_peer_request *peer_req, void *digest) |
255 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
256 | |||
257 | /* We should not detach for read io-error, | ||
258 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
259 | * to give the disk the chance to relocate that block */ | ||
260 | |||
261 | spin_lock_irq(&mdev->req_lock); | ||
262 | if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { | ||
263 | _req_mod(req, read_retry_remote_canceled); | ||
264 | spin_unlock_irq(&mdev->req_lock); | ||
265 | return 1; | ||
266 | } | ||
267 | spin_unlock_irq(&mdev->req_lock); | ||
268 | |||
269 | return w_send_read_req(mdev, w, 0); | ||
270 | } | ||
271 | |||
272 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) | ||
273 | { | 285 | { |
274 | struct hash_desc desc; | 286 | struct hash_desc desc; |
275 | struct scatterlist sg; | 287 | struct scatterlist sg; |
276 | struct page *page = e->pages; | 288 | struct page *page = peer_req->pages; |
277 | struct page *tmp; | 289 | struct page *tmp; |
278 | unsigned len; | 290 | unsigned len; |
279 | 291 | ||
@@ -290,7 +302,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e | |||
290 | page = tmp; | 302 | page = tmp; |
291 | } | 303 | } |
292 | /* and now the last, possibly only partially used page */ | 304 | /* and now the last, possibly only partially used page */ |
293 | len = e->size & (PAGE_SIZE - 1); | 305 | len = peer_req->i.size & (PAGE_SIZE - 1); |
294 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); | 306 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); |
295 | crypto_hash_update(&desc, &sg, sg.length); | 307 | crypto_hash_update(&desc, &sg, sg.length); |
296 | crypto_hash_final(&desc, digest); | 308 | crypto_hash_final(&desc, digest); |
@@ -316,59 +328,58 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * | |||
316 | crypto_hash_final(&desc, digest); | 328 | crypto_hash_final(&desc, digest); |
317 | } | 329 | } |
318 | 330 | ||
319 | /* TODO merge common code with w_e_end_ov_req */ | 331 | /* MAYBE merge common code with w_e_end_ov_req */ |
320 | int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 332 | static int w_e_send_csum(struct drbd_work *w, int cancel) |
321 | { | 333 | { |
322 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 334 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
335 | struct drbd_conf *mdev = w->mdev; | ||
323 | int digest_size; | 336 | int digest_size; |
324 | void *digest; | 337 | void *digest; |
325 | int ok = 1; | 338 | int err = 0; |
326 | |||
327 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
328 | 339 | ||
329 | if (unlikely(cancel)) | 340 | if (unlikely(cancel)) |
330 | goto out; | 341 | goto out; |
331 | 342 | ||
332 | if (likely((e->flags & EE_WAS_ERROR) != 0)) | 343 | if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) |
333 | goto out; | 344 | goto out; |
334 | 345 | ||
335 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 346 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
336 | digest = kmalloc(digest_size, GFP_NOIO); | 347 | digest = kmalloc(digest_size, GFP_NOIO); |
337 | if (digest) { | 348 | if (digest) { |
338 | sector_t sector = e->sector; | 349 | sector_t sector = peer_req->i.sector; |
339 | unsigned int size = e->size; | 350 | unsigned int size = peer_req->i.size; |
340 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 351 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
341 | /* Free e and pages before send. | 352 | /* Free peer_req and pages before send. |
342 | * In case we block on congestion, we could otherwise run into | 353 | * In case we block on congestion, we could otherwise run into |
343 | * some distributed deadlock, if the other side blocks on | 354 | * some distributed deadlock, if the other side blocks on |
344 | * congestion as well, because our receiver blocks in | 355 | * congestion as well, because our receiver blocks in |
345 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 356 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
346 | drbd_free_ee(mdev, e); | 357 | drbd_free_peer_req(mdev, peer_req); |
347 | e = NULL; | 358 | peer_req = NULL; |
348 | inc_rs_pending(mdev); | 359 | inc_rs_pending(mdev); |
349 | ok = drbd_send_drequest_csum(mdev, sector, size, | 360 | err = drbd_send_drequest_csum(mdev, sector, size, |
350 | digest, digest_size, | 361 | digest, digest_size, |
351 | P_CSUM_RS_REQUEST); | 362 | P_CSUM_RS_REQUEST); |
352 | kfree(digest); | 363 | kfree(digest); |
353 | } else { | 364 | } else { |
354 | dev_err(DEV, "kmalloc() of digest failed.\n"); | 365 | dev_err(DEV, "kmalloc() of digest failed.\n"); |
355 | ok = 0; | 366 | err = -ENOMEM; |
356 | } | 367 | } |
357 | 368 | ||
358 | out: | 369 | out: |
359 | if (e) | 370 | if (peer_req) |
360 | drbd_free_ee(mdev, e); | 371 | drbd_free_peer_req(mdev, peer_req); |
361 | 372 | ||
362 | if (unlikely(!ok)) | 373 | if (unlikely(err)) |
363 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | 374 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); |
364 | return ok; | 375 | return err; |
365 | } | 376 | } |
366 | 377 | ||
367 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 378 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
368 | 379 | ||
369 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | 380 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) |
370 | { | 381 | { |
371 | struct drbd_epoch_entry *e; | 382 | struct drbd_peer_request *peer_req; |
372 | 383 | ||
373 | if (!get_ldev(mdev)) | 384 | if (!get_ldev(mdev)) |
374 | return -EIO; | 385 | return -EIO; |
@@ -378,45 +389,47 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | |||
378 | 389 | ||
379 | /* GFP_TRY, because if there is no memory available right now, this may | 390 | /* GFP_TRY, because if there is no memory available right now, this may |
380 | * be rescheduled for later. It is "only" background resync, after all. */ | 391 | * be rescheduled for later. It is "only" background resync, after all. */ |
381 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | 392 | peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector, |
382 | if (!e) | 393 | size, GFP_TRY); |
394 | if (!peer_req) | ||
383 | goto defer; | 395 | goto defer; |
384 | 396 | ||
385 | e->w.cb = w_e_send_csum; | 397 | peer_req->w.cb = w_e_send_csum; |
386 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
387 | list_add(&e->w.list, &mdev->read_ee); | 399 | list_add(&peer_req->w.list, &mdev->read_ee); |
388 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
389 | 401 | ||
390 | atomic_add(size >> 9, &mdev->rs_sect_ev); | 402 | atomic_add(size >> 9, &mdev->rs_sect_ev); |
391 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) | 403 | if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0) |
392 | return 0; | 404 | return 0; |
393 | 405 | ||
394 | /* If it failed because of ENOMEM, retry should help. If it failed | 406 | /* If it failed because of ENOMEM, retry should help. If it failed |
395 | * because bio_add_page failed (probably broken lower level driver), | 407 | * because bio_add_page failed (probably broken lower level driver), |
396 | * retry may or may not help. | 408 | * retry may or may not help. |
397 | * If it does not, you may need to force disconnect. */ | 409 | * If it does not, you may need to force disconnect. */ |
398 | spin_lock_irq(&mdev->req_lock); | 410 | spin_lock_irq(&mdev->tconn->req_lock); |
399 | list_del(&e->w.list); | 411 | list_del(&peer_req->w.list); |
400 | spin_unlock_irq(&mdev->req_lock); | 412 | spin_unlock_irq(&mdev->tconn->req_lock); |
401 | 413 | ||
402 | drbd_free_ee(mdev, e); | 414 | drbd_free_peer_req(mdev, peer_req); |
403 | defer: | 415 | defer: |
404 | put_ldev(mdev); | 416 | put_ldev(mdev); |
405 | return -EAGAIN; | 417 | return -EAGAIN; |
406 | } | 418 | } |
407 | 419 | ||
408 | int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 420 | int w_resync_timer(struct drbd_work *w, int cancel) |
409 | { | 421 | { |
422 | struct drbd_conf *mdev = w->mdev; | ||
410 | switch (mdev->state.conn) { | 423 | switch (mdev->state.conn) { |
411 | case C_VERIFY_S: | 424 | case C_VERIFY_S: |
412 | w_make_ov_request(mdev, w, cancel); | 425 | w_make_ov_request(w, cancel); |
413 | break; | 426 | break; |
414 | case C_SYNC_TARGET: | 427 | case C_SYNC_TARGET: |
415 | w_make_resync_request(mdev, w, cancel); | 428 | w_make_resync_request(w, cancel); |
416 | break; | 429 | break; |
417 | } | 430 | } |
418 | 431 | ||
419 | return 1; | 432 | return 0; |
420 | } | 433 | } |
421 | 434 | ||
422 | void resync_timer_fn(unsigned long data) | 435 | void resync_timer_fn(unsigned long data) |
@@ -424,7 +437,7 @@ void resync_timer_fn(unsigned long data) | |||
424 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 437 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
425 | 438 | ||
426 | if (list_empty(&mdev->resync_work.list)) | 439 | if (list_empty(&mdev->resync_work.list)) |
427 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | 440 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work); |
428 | } | 441 | } |
429 | 442 | ||
430 | static void fifo_set(struct fifo_buffer *fb, int value) | 443 | static void fifo_set(struct fifo_buffer *fb, int value) |
@@ -456,8 +469,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value) | |||
456 | fb->values[i] += value; | 469 | fb->values[i] += value; |
457 | } | 470 | } |
458 | 471 | ||
472 | struct fifo_buffer *fifo_alloc(int fifo_size) | ||
473 | { | ||
474 | struct fifo_buffer *fb; | ||
475 | |||
476 | fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); | ||
477 | if (!fb) | ||
478 | return NULL; | ||
479 | |||
480 | fb->head_index = 0; | ||
481 | fb->size = fifo_size; | ||
482 | fb->total = 0; | ||
483 | |||
484 | return fb; | ||
485 | } | ||
486 | |||
459 | static int drbd_rs_controller(struct drbd_conf *mdev) | 487 | static int drbd_rs_controller(struct drbd_conf *mdev) |
460 | { | 488 | { |
489 | struct disk_conf *dc; | ||
461 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ | 490 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ |
462 | unsigned int want; /* The number of sectors we want in the proxy */ | 491 | unsigned int want; /* The number of sectors we want in the proxy */ |
463 | int req_sect; /* Number of sectors to request in this turn */ | 492 | int req_sect; /* Number of sectors to request in this turn */ |
@@ -466,38 +495,39 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
466 | int steps; /* Number of time steps to plan ahead */ | 495 | int steps; /* Number of time steps to plan ahead */ |
467 | int curr_corr; | 496 | int curr_corr; |
468 | int max_sect; | 497 | int max_sect; |
498 | struct fifo_buffer *plan; | ||
469 | 499 | ||
470 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ | 500 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ |
471 | mdev->rs_in_flight -= sect_in; | 501 | mdev->rs_in_flight -= sect_in; |
472 | 502 | ||
473 | spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ | 503 | dc = rcu_dereference(mdev->ldev->disk_conf); |
504 | plan = rcu_dereference(mdev->rs_plan_s); | ||
474 | 505 | ||
475 | steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ | 506 | steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ |
476 | 507 | ||
477 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ | 508 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ |
478 | want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; | 509 | want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; |
479 | } else { /* normal path */ | 510 | } else { /* normal path */ |
480 | want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target : | 511 | want = dc->c_fill_target ? dc->c_fill_target : |
481 | sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); | 512 | sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); |
482 | } | 513 | } |
483 | 514 | ||
484 | correction = want - mdev->rs_in_flight - mdev->rs_planed; | 515 | correction = want - mdev->rs_in_flight - plan->total; |
485 | 516 | ||
486 | /* Plan ahead */ | 517 | /* Plan ahead */ |
487 | cps = correction / steps; | 518 | cps = correction / steps; |
488 | fifo_add_val(&mdev->rs_plan_s, cps); | 519 | fifo_add_val(plan, cps); |
489 | mdev->rs_planed += cps * steps; | 520 | plan->total += cps * steps; |
490 | 521 | ||
491 | /* What we do in this step */ | 522 | /* What we do in this step */ |
492 | curr_corr = fifo_push(&mdev->rs_plan_s, 0); | 523 | curr_corr = fifo_push(plan, 0); |
493 | spin_unlock(&mdev->peer_seq_lock); | 524 | plan->total -= curr_corr; |
494 | mdev->rs_planed -= curr_corr; | ||
495 | 525 | ||
496 | req_sect = sect_in + curr_corr; | 526 | req_sect = sect_in + curr_corr; |
497 | if (req_sect < 0) | 527 | if (req_sect < 0) |
498 | req_sect = 0; | 528 | req_sect = 0; |
499 | 529 | ||
500 | max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; | 530 | max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; |
501 | if (req_sect > max_sect) | 531 | if (req_sect > max_sect) |
502 | req_sect = max_sect; | 532 | req_sect = max_sect; |
503 | 533 | ||
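drbd_rs_controller() above is, in essence, a proportional regulator with a plan-ahead FIFO: each turn it compares how many sectors it wants queued towards the peer against what is in flight plus what is already planned, spreads the correction evenly over the plan window, and requests this turn's share on top of what just came back; drbd_rs_number_requests() then turns that sector count into bitmap-block-sized requests by shifting with (BM_BLOCK_SHIFT - 9). The stand-alone sketch below models only that core loop, with a fixed fill target and its own tiny ring buffer (no delay-target or start-of-resync branches; all names and numbers are illustrative, not DRBD's):

#include <stdio.h>

#define STEPS 10	/* plan-ahead slots; stands in for plan->size */

/* toy plan-ahead buffer; stands in for struct fifo_buffer */
static int plan[STEPS];
static int plan_head;
static int plan_total;

static int plan_push(int value)		/* pop the oldest slot, append a new one */
{
	int out = plan[plan_head];

	plan[plan_head] = value;
	plan_head = (plan_head + 1) % STEPS;
	return out;
}

/* One controller turn: decide how many sectors to request next, given how
 * many sectors of replies (sect_in) arrived since the last turn. */
static int controller_step(int sect_in, int *rs_in_flight,
			   int fill_target, int max_sect)
{
	int want, correction, cps, curr_corr, req_sect, i;

	*rs_in_flight -= sect_in;

	want = fill_target;		/* sectors we want "in the pipe" */
	correction = want - *rs_in_flight - plan_total;

	/* spread the correction over the plan-ahead window */
	cps = correction / STEPS;
	for (i = 0; i < STEPS; i++)
		plan[i] += cps;
	plan_total += cps * STEPS;

	/* take this turn's share out of the plan */
	curr_corr = plan_push(0);
	plan_total -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;
	if (req_sect > max_sect)
		req_sect = max_sect;

	*rs_in_flight += req_sect;
	return req_sect;
}

int main(void)
{
	int in_flight = 0, turn;

	/* aim for 2000 sectors in flight, cap a single turn at 4000 sectors */
	for (turn = 0; turn < 5; turn++) {
		int replied = in_flight / 2;	/* pretend half got answered */
		int req = controller_step(replied, &in_flight, 2000, 4000);

		printf("turn %d: request %d sectors, %d now in flight\n",
		       turn, req, in_flight);
	}
	return 0;
}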
@@ -513,22 +543,25 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
513 | static int drbd_rs_number_requests(struct drbd_conf *mdev) | 543 | static int drbd_rs_number_requests(struct drbd_conf *mdev) |
514 | { | 544 | { |
515 | int number; | 545 | int number; |
516 | if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ | 546 | |
547 | rcu_read_lock(); | ||
548 | if (rcu_dereference(mdev->rs_plan_s)->size) { | ||
517 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); | 549 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); |
518 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; | 550 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; |
519 | } else { | 551 | } else { |
520 | mdev->c_sync_rate = mdev->sync_conf.rate; | 552 | mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate; |
521 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); | 553 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); |
522 | } | 554 | } |
555 | rcu_read_unlock(); | ||
523 | 556 | ||
524 | /* ignore the amount of pending requests, the resync controller should | 557 | /* ignore the amount of pending requests, the resync controller should |
525 | * throttle down to incoming reply rate soon enough anyways. */ | 558 | * throttle down to incoming reply rate soon enough anyways. */ |
526 | return number; | 559 | return number; |
527 | } | 560 | } |
528 | 561 | ||
529 | static int w_make_resync_request(struct drbd_conf *mdev, | 562 | int w_make_resync_request(struct drbd_work *w, int cancel) |
530 | struct drbd_work *w, int cancel) | ||
531 | { | 563 | { |
564 | struct drbd_conf *mdev = w->mdev; | ||
532 | unsigned long bit; | 565 | unsigned long bit; |
533 | sector_t sector; | 566 | sector_t sector; |
534 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 567 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
@@ -538,12 +571,12 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
538 | int i = 0; | 571 | int i = 0; |
539 | 572 | ||
540 | if (unlikely(cancel)) | 573 | if (unlikely(cancel)) |
541 | return 1; | 574 | return 0; |
542 | 575 | ||
543 | if (mdev->rs_total == 0) { | 576 | if (mdev->rs_total == 0) { |
544 | /* empty resync? */ | 577 | /* empty resync? */ |
545 | drbd_resync_finished(mdev); | 578 | drbd_resync_finished(mdev); |
546 | return 1; | 579 | return 0; |
547 | } | 580 | } |
548 | 581 | ||
549 | if (!get_ldev(mdev)) { | 582 | if (!get_ldev(mdev)) { |
@@ -552,7 +585,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
552 | to continue resync with a broken disk makes no sense at | 585 | to continue resync with a broken disk makes no sense at |
553 | all */ | 586 | all */ |
554 | dev_err(DEV, "Disk broke down during resync!\n"); | 587 | dev_err(DEV, "Disk broke down during resync!\n"); |
555 | return 1; | 588 | return 0; |
556 | } | 589 | } |
557 | 590 | ||
558 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; | 591 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; |
@@ -562,15 +595,15 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
562 | 595 | ||
563 | for (i = 0; i < number; i++) { | 596 | for (i = 0; i < number; i++) { |
564 | /* Stop generating RS requests, when half of the send buffer is filled */ | 597 | /* Stop generating RS requests, when half of the send buffer is filled */ |
565 | mutex_lock(&mdev->data.mutex); | 598 | mutex_lock(&mdev->tconn->data.mutex); |
566 | if (mdev->data.socket) { | 599 | if (mdev->tconn->data.socket) { |
567 | queued = mdev->data.socket->sk->sk_wmem_queued; | 600 | queued = mdev->tconn->data.socket->sk->sk_wmem_queued; |
568 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | 601 | sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf; |
569 | } else { | 602 | } else { |
570 | queued = 1; | 603 | queued = 1; |
571 | sndbuf = 0; | 604 | sndbuf = 0; |
572 | } | 605 | } |
573 | mutex_unlock(&mdev->data.mutex); | 606 | mutex_unlock(&mdev->tconn->data.mutex); |
574 | if (queued > sndbuf / 2) | 607 | if (queued > sndbuf / 2) |
575 | goto requeue; | 608 | goto requeue; |
576 | 609 | ||
@@ -581,7 +614,7 @@ next_sector: | |||
581 | if (bit == DRBD_END_OF_BITMAP) { | 614 | if (bit == DRBD_END_OF_BITMAP) { |
582 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | 615 | mdev->bm_resync_fo = drbd_bm_bits(mdev); |
583 | put_ldev(mdev); | 616 | put_ldev(mdev); |
584 | return 1; | 617 | return 0; |
585 | } | 618 | } |
586 | 619 | ||
587 | sector = BM_BIT_TO_SECT(bit); | 620 | sector = BM_BIT_TO_SECT(bit); |
@@ -640,11 +673,11 @@ next_sector: | |||
640 | /* adjust very last sectors, in case we are oddly sized */ | 673 | /* adjust very last sectors, in case we are oddly sized */ |
641 | if (sector + (size>>9) > capacity) | 674 | if (sector + (size>>9) > capacity) |
642 | size = (capacity-sector)<<9; | 675 | size = (capacity-sector)<<9; |
643 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | 676 | if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) { |
644 | switch (read_for_csum(mdev, sector, size)) { | 677 | switch (read_for_csum(mdev, sector, size)) { |
645 | case -EIO: /* Disk failure */ | 678 | case -EIO: /* Disk failure */ |
646 | put_ldev(mdev); | 679 | put_ldev(mdev); |
647 | return 0; | 680 | return -EIO; |
648 | case -EAGAIN: /* allocation failed, or ldev busy */ | 681 | case -EAGAIN: /* allocation failed, or ldev busy */ |
649 | drbd_rs_complete_io(mdev, sector); | 682 | drbd_rs_complete_io(mdev, sector); |
650 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 683 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -657,13 +690,16 @@ next_sector: | |||
657 | BUG(); | 690 | BUG(); |
658 | } | 691 | } |
659 | } else { | 692 | } else { |
693 | int err; | ||
694 | |||
660 | inc_rs_pending(mdev); | 695 | inc_rs_pending(mdev); |
661 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | 696 | err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST, |
662 | sector, size, ID_SYNCER)) { | 697 | sector, size, ID_SYNCER); |
698 | if (err) { | ||
663 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | 699 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); |
664 | dec_rs_pending(mdev); | 700 | dec_rs_pending(mdev); |
665 | put_ldev(mdev); | 701 | put_ldev(mdev); |
666 | return 0; | 702 | return err; |
667 | } | 703 | } |
668 | } | 704 | } |
669 | } | 705 | } |
@@ -676,21 +712,23 @@ next_sector: | |||
676 | * until then resync "work" is "inactive" ... | 712 | * until then resync "work" is "inactive" ... |
677 | */ | 713 | */ |
678 | put_ldev(mdev); | 714 | put_ldev(mdev); |
679 | return 1; | 715 | return 0; |
680 | } | 716 | } |
681 | 717 | ||
682 | requeue: | 718 | requeue: |
683 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); | 719 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); |
684 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | 720 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); |
685 | put_ldev(mdev); | 721 | put_ldev(mdev); |
686 | return 1; | 722 | return 0; |
687 | } | 723 | } |
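One detail of w_make_resync_request() above is the tail adjustment for oddly sized devices: a request that would run past the last sector gets trimmed with size = (capacity - sector) << 9. A tiny stand-alone illustration with made-up numbers (user-space C, not driver code):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;	/* 512-byte sectors, as in the kernel */

/* Trim a resync request so it does not run past the end of the device:
 * size is in bytes, sector and capacity are in sectors. */
static unsigned int clamp_to_capacity(sector_t sector, unsigned int size,
				      sector_t capacity)
{
	if (sector + (size >> 9) > capacity)
		size = (capacity - sector) << 9;
	return size;
}

int main(void)
{
	/* capacity 1000 sectors: a 64 KiB (128-sector) request at sector 960
	 * would overshoot by 88 sectors, so it is trimmed to 40 sectors. */
	printf("%u bytes\n", clamp_to_capacity(960, 128 << 9, 1000));	/* 20480 */
	return 0;
}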
688 | 724 | ||
689 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 725 | static int w_make_ov_request(struct drbd_work *w, int cancel) |
690 | { | 726 | { |
727 | struct drbd_conf *mdev = w->mdev; | ||
691 | int number, i, size; | 728 | int number, i, size; |
692 | sector_t sector; | 729 | sector_t sector; |
693 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 730 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
731 | bool stop_sector_reached = false; | ||
694 | 732 | ||
695 | if (unlikely(cancel)) | 733 | if (unlikely(cancel)) |
696 | return 1; | 734 | return 1; |
@@ -699,9 +737,17 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
699 | 737 | ||
700 | sector = mdev->ov_position; | 738 | sector = mdev->ov_position; |
701 | for (i = 0; i < number; i++) { | 739 | for (i = 0; i < number; i++) { |
702 | if (sector >= capacity) { | 740 | if (sector >= capacity) |
703 | return 1; | 741 | return 1; |
704 | } | 742 | |
743 | /* We check for "finished" only in the reply path: | ||
744 | * w_e_end_ov_reply(). | ||
745 | * We need to send at least one request out. */ | ||
746 | stop_sector_reached = i > 0 | ||
747 | && verify_can_do_stop_sector(mdev) | ||
748 | && sector >= mdev->ov_stop_sector; | ||
749 | if (stop_sector_reached) | ||
750 | break; | ||
705 | 751 | ||
706 | size = BM_BLOCK_SIZE; | 752 | size = BM_BLOCK_SIZE; |
707 | 753 | ||
@@ -715,7 +761,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
715 | size = (capacity-sector)<<9; | 761 | size = (capacity-sector)<<9; |
716 | 762 | ||
717 | inc_rs_pending(mdev); | 763 | inc_rs_pending(mdev); |
718 | if (!drbd_send_ov_request(mdev, sector, size)) { | 764 | if (drbd_send_ov_request(mdev, sector, size)) { |
719 | dec_rs_pending(mdev); | 765 | dec_rs_pending(mdev); |
720 | return 0; | 766 | return 0; |
721 | } | 767 | } |
@@ -725,56 +771,39 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
725 | 771 | ||
726 | requeue: | 772 | requeue: |
727 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); | 773 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); |
728 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | 774 | if (i == 0 || !stop_sector_reached) |
729 | return 1; | 775 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); |
730 | } | ||
731 | |||
732 | |||
733 | void start_resync_timer_fn(unsigned long data) | ||
734 | { | ||
735 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
736 | |||
737 | drbd_queue_work(&mdev->data.work, &mdev->start_resync_work); | ||
738 | } | ||
739 | |||
740 | int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
741 | { | ||
742 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
743 | dev_warn(DEV, "w_start_resync later...\n"); | ||
744 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
745 | add_timer(&mdev->start_resync_timer); | ||
746 | return 1; | ||
747 | } | ||
748 | |||
749 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
750 | clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); | ||
751 | return 1; | 776 | return 1; |
752 | } | 777 | } |
753 | 778 | ||
754 | int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 779 | int w_ov_finished(struct drbd_work *w, int cancel) |
755 | { | 780 | { |
781 | struct drbd_conf *mdev = w->mdev; | ||
756 | kfree(w); | 782 | kfree(w); |
757 | ov_oos_print(mdev); | 783 | ov_out_of_sync_print(mdev); |
758 | drbd_resync_finished(mdev); | 784 | drbd_resync_finished(mdev); |
759 | 785 | ||
760 | return 1; | 786 | return 0; |
761 | } | 787 | } |
762 | 788 | ||
763 | static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 789 | static int w_resync_finished(struct drbd_work *w, int cancel) |
764 | { | 790 | { |
791 | struct drbd_conf *mdev = w->mdev; | ||
765 | kfree(w); | 792 | kfree(w); |
766 | 793 | ||
767 | drbd_resync_finished(mdev); | 794 | drbd_resync_finished(mdev); |
768 | 795 | ||
769 | return 1; | 796 | return 0; |
770 | } | 797 | } |
771 | 798 | ||
772 | static void ping_peer(struct drbd_conf *mdev) | 799 | static void ping_peer(struct drbd_conf *mdev) |
773 | { | 800 | { |
774 | clear_bit(GOT_PING_ACK, &mdev->flags); | 801 | struct drbd_tconn *tconn = mdev->tconn; |
775 | request_ping(mdev); | 802 | |
776 | wait_event(mdev->misc_wait, | 803 | clear_bit(GOT_PING_ACK, &tconn->flags); |
777 | test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); | 804 | request_ping(tconn); |
805 | wait_event(tconn->ping_wait, | ||
806 | test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED); | ||
778 | } | 807 | } |
779 | 808 | ||
780 | int drbd_resync_finished(struct drbd_conf *mdev) | 809 | int drbd_resync_finished(struct drbd_conf *mdev) |
@@ -799,7 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
799 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | 828 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); |
800 | if (w) { | 829 | if (w) { |
801 | w->cb = w_resync_finished; | 830 | w->cb = w_resync_finished; |
802 | drbd_queue_work(&mdev->data.work, w); | 831 | w->mdev = mdev; |
832 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
803 | return 1; | 833 | return 1; |
804 | } | 834 | } |
805 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | 835 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); |
@@ -808,7 +838,12 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
808 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | 838 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; |
809 | if (dt <= 0) | 839 | if (dt <= 0) |
810 | dt = 1; | 840 | dt = 1; |
841 | |||
811 | db = mdev->rs_total; | 842 | db = mdev->rs_total; |
843 | /* adjust for verify start and stop sectors, respective reached position */ | ||
844 | if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) | ||
845 | db -= mdev->ov_left; | ||
846 | |||
812 | dbdt = Bit2KB(db/dt); | 847 | dbdt = Bit2KB(db/dt); |
813 | mdev->rs_paused /= HZ; | 848 | mdev->rs_paused /= HZ; |
814 | 849 | ||
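The statistics block above computes the average resync/verify throughput as dbdt = Bit2KB(db/dt): db is the number of bitmap bits covered, dt the active (non-paused) seconds. Assuming the usual 4 KiB bitmap granularity, the bit-to-KiB conversion is a multiply by four; a small worked example with illustrative values:

#include <stdio.h>

/* Each resync-bitmap bit covers one 4 KiB block (BM_BLOCK_SIZE assumed
 * to be 4096 here), so bits convert to KiB by multiplying with 4. */
#define BIT2KB(bits)	((bits) * 4UL)

int main(void)
{
	unsigned long rs_total = 262144;	/* bits covered: 1 GiB worth of data */
	unsigned long ov_left  = 0;		/* nothing left over */
	unsigned long dt = 20;			/* active seconds, paused time excluded */
	unsigned long db = rs_total - ov_left;

	/* mirrors dbdt = Bit2KB(db/dt) in drbd_resync_finished() */
	printf("%lu K/sec\n", BIT2KB(db / dt));	/* 52428 K/sec, ~51 MiB/s */
	return 0;
}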
@@ -817,8 +852,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
817 | 852 | ||
818 | ping_peer(mdev); | 853 | ping_peer(mdev); |
819 | 854 | ||
820 | spin_lock_irq(&mdev->req_lock); | 855 | spin_lock_irq(&mdev->tconn->req_lock); |
821 | os = mdev->state; | 856 | os = drbd_read_state(mdev); |
822 | 857 | ||
823 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); | 858 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); |
824 | 859 | ||
@@ -831,7 +866,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
831 | ns.conn = C_CONNECTED; | 866 | ns.conn = C_CONNECTED; |
832 | 867 | ||
833 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", | 868 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", |
834 | verify_done ? "Online verify " : "Resync", | 869 | verify_done ? "Online verify" : "Resync", |
835 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); | 870 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); |
836 | 871 | ||
837 | n_oos = drbd_bm_total_weight(mdev); | 872 | n_oos = drbd_bm_total_weight(mdev); |
@@ -848,7 +883,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
848 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | 883 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) |
849 | khelper_cmd = "after-resync-target"; | 884 | khelper_cmd = "after-resync-target"; |
850 | 885 | ||
851 | if (mdev->csums_tfm && mdev->rs_total) { | 886 | if (mdev->tconn->csums_tfm && mdev->rs_total) { |
852 | const unsigned long s = mdev->rs_same_csum; | 887 | const unsigned long s = mdev->rs_same_csum; |
853 | const unsigned long t = mdev->rs_total; | 888 | const unsigned long t = mdev->rs_total; |
854 | const int ratio = | 889 | const int ratio = |
@@ -906,13 +941,15 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
906 | 941 | ||
907 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 942 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
908 | out_unlock: | 943 | out_unlock: |
909 | spin_unlock_irq(&mdev->req_lock); | 944 | spin_unlock_irq(&mdev->tconn->req_lock); |
910 | put_ldev(mdev); | 945 | put_ldev(mdev); |
911 | out: | 946 | out: |
912 | mdev->rs_total = 0; | 947 | mdev->rs_total = 0; |
913 | mdev->rs_failed = 0; | 948 | mdev->rs_failed = 0; |
914 | mdev->rs_paused = 0; | 949 | mdev->rs_paused = 0; |
915 | if (verify_done) | 950 | |
951 | /* reset start sector, if we reached end of device */ | ||
952 | if (verify_done && mdev->ov_left == 0) | ||
916 | mdev->ov_start_sector = 0; | 953 | mdev->ov_start_sector = 0; |
917 | 954 | ||
918 | drbd_md_sync(mdev); | 955 | drbd_md_sync(mdev); |
@@ -924,19 +961,19 @@ out: | |||
924 | } | 961 | } |
925 | 962 | ||
926 | /* helper */ | 963 | /* helper */ |
927 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 964 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) |
928 | { | 965 | { |
929 | if (drbd_ee_has_active_page(e)) { | 966 | if (drbd_peer_req_has_active_page(peer_req)) { |
930 | /* This might happen if sendpage() has not finished */ | 967 | /* This might happen if sendpage() has not finished */ |
931 | int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; | 968 | int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; |
932 | atomic_add(i, &mdev->pp_in_use_by_net); | 969 | atomic_add(i, &mdev->pp_in_use_by_net); |
933 | atomic_sub(i, &mdev->pp_in_use); | 970 | atomic_sub(i, &mdev->pp_in_use); |
934 | spin_lock_irq(&mdev->req_lock); | 971 | spin_lock_irq(&mdev->tconn->req_lock); |
935 | list_add_tail(&e->w.list, &mdev->net_ee); | 972 | list_add_tail(&peer_req->w.list, &mdev->net_ee); |
936 | spin_unlock_irq(&mdev->req_lock); | 973 | spin_unlock_irq(&mdev->tconn->req_lock); |
937 | wake_up(&drbd_pp_wait); | 974 | wake_up(&drbd_pp_wait); |
938 | } else | 975 | } else |
939 | drbd_free_ee(mdev, e); | 976 | drbd_free_peer_req(mdev, peer_req); |
940 | } | 977 | } |
941 | 978 | ||
942 | /** | 979 | /** |
@@ -945,174 +982,177 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent | |||
945 | * @w: work object. | 982 | * @w: work object. |
946 | * @cancel: The connection will be closed anyways | 983 | * @cancel: The connection will be closed anyways |
947 | */ | 984 | */ |
948 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 985 | int w_e_end_data_req(struct drbd_work *w, int cancel) |
949 | { | 986 | { |
950 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 987 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
951 | int ok; | 988 | struct drbd_conf *mdev = w->mdev; |
989 | int err; | ||
952 | 990 | ||
953 | if (unlikely(cancel)) { | 991 | if (unlikely(cancel)) { |
954 | drbd_free_ee(mdev, e); | 992 | drbd_free_peer_req(mdev, peer_req); |
955 | dec_unacked(mdev); | 993 | dec_unacked(mdev); |
956 | return 1; | 994 | return 0; |
957 | } | 995 | } |
958 | 996 | ||
959 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 997 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
960 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | 998 | err = drbd_send_block(mdev, P_DATA_REPLY, peer_req); |
961 | } else { | 999 | } else { |
962 | if (__ratelimit(&drbd_ratelimit_state)) | 1000 | if (__ratelimit(&drbd_ratelimit_state)) |
963 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | 1001 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", |
964 | (unsigned long long)e->sector); | 1002 | (unsigned long long)peer_req->i.sector); |
965 | 1003 | ||
966 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | 1004 | err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req); |
967 | } | 1005 | } |
968 | 1006 | ||
969 | dec_unacked(mdev); | 1007 | dec_unacked(mdev); |
970 | 1008 | ||
971 | move_to_net_ee_or_free(mdev, e); | 1009 | move_to_net_ee_or_free(mdev, peer_req); |
972 | 1010 | ||
973 | if (unlikely(!ok)) | 1011 | if (unlikely(err)) |
974 | dev_err(DEV, "drbd_send_block() failed\n"); | 1012 | dev_err(DEV, "drbd_send_block() failed\n"); |
975 | return ok; | 1013 | return err; |
976 | } | 1014 | } |
977 | 1015 | ||
978 | /** | 1016 | /** |
979 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS | 1017 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST |
980 | * @mdev: DRBD device. | 1018 | * @mdev: DRBD device. |
981 | * @w: work object. | 1019 | * @w: work object. |
982 | * @cancel: The connection will be closed anyways | 1020 | * @cancel: The connection will be closed anyways |
983 | */ | 1021 | */ |
984 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1022 | int w_e_end_rsdata_req(struct drbd_work *w, int cancel) |
985 | { | 1023 | { |
986 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1024 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
987 | int ok; | 1025 | struct drbd_conf *mdev = w->mdev; |
1026 | int err; | ||
988 | 1027 | ||
989 | if (unlikely(cancel)) { | 1028 | if (unlikely(cancel)) { |
990 | drbd_free_ee(mdev, e); | 1029 | drbd_free_peer_req(mdev, peer_req); |
991 | dec_unacked(mdev); | 1030 | dec_unacked(mdev); |
992 | return 1; | 1031 | return 0; |
993 | } | 1032 | } |
994 | 1033 | ||
995 | if (get_ldev_if_state(mdev, D_FAILED)) { | 1034 | if (get_ldev_if_state(mdev, D_FAILED)) { |
996 | drbd_rs_complete_io(mdev, e->sector); | 1035 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
997 | put_ldev(mdev); | 1036 | put_ldev(mdev); |
998 | } | 1037 | } |
999 | 1038 | ||
1000 | if (mdev->state.conn == C_AHEAD) { | 1039 | if (mdev->state.conn == C_AHEAD) { |
1001 | ok = drbd_send_ack(mdev, P_RS_CANCEL, e); | 1040 | err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req); |
1002 | } else if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1041 | } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1003 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | 1042 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { |
1004 | inc_rs_pending(mdev); | 1043 | inc_rs_pending(mdev); |
1005 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1044 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1006 | } else { | 1045 | } else { |
1007 | if (__ratelimit(&drbd_ratelimit_state)) | 1046 | if (__ratelimit(&drbd_ratelimit_state)) |
1008 | dev_err(DEV, "Not sending RSDataReply, " | 1047 | dev_err(DEV, "Not sending RSDataReply, " |
1009 | "partner DISKLESS!\n"); | 1048 | "partner DISKLESS!\n"); |
1010 | ok = 1; | 1049 | err = 0; |
1011 | } | 1050 | } |
1012 | } else { | 1051 | } else { |
1013 | if (__ratelimit(&drbd_ratelimit_state)) | 1052 | if (__ratelimit(&drbd_ratelimit_state)) |
1014 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | 1053 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", |
1015 | (unsigned long long)e->sector); | 1054 | (unsigned long long)peer_req->i.sector); |
1016 | 1055 | ||
1017 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1056 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1018 | 1057 | ||
1019 | /* update resync data with failure */ | 1058 | /* update resync data with failure */ |
1020 | drbd_rs_failed_io(mdev, e->sector, e->size); | 1059 | drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size); |
1021 | } | 1060 | } |
1022 | 1061 | ||
1023 | dec_unacked(mdev); | 1062 | dec_unacked(mdev); |
1024 | 1063 | ||
1025 | move_to_net_ee_or_free(mdev, e); | 1064 | move_to_net_ee_or_free(mdev, peer_req); |
1026 | 1065 | ||
1027 | if (unlikely(!ok)) | 1066 | if (unlikely(err)) |
1028 | dev_err(DEV, "drbd_send_block() failed\n"); | 1067 | dev_err(DEV, "drbd_send_block() failed\n"); |
1029 | return ok; | 1068 | return err; |
1030 | } | 1069 | } |
1031 | 1070 | ||
1032 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1071 | int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) |
1033 | { | 1072 | { |
1034 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1073 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1074 | struct drbd_conf *mdev = w->mdev; | ||
1035 | struct digest_info *di; | 1075 | struct digest_info *di; |
1036 | int digest_size; | 1076 | int digest_size; |
1037 | void *digest = NULL; | 1077 | void *digest = NULL; |
1038 | int ok, eq = 0; | 1078 | int err, eq = 0; |
1039 | 1079 | ||
1040 | if (unlikely(cancel)) { | 1080 | if (unlikely(cancel)) { |
1041 | drbd_free_ee(mdev, e); | 1081 | drbd_free_peer_req(mdev, peer_req); |
1042 | dec_unacked(mdev); | 1082 | dec_unacked(mdev); |
1043 | return 1; | 1083 | return 0; |
1044 | } | 1084 | } |
1045 | 1085 | ||
1046 | if (get_ldev(mdev)) { | 1086 | if (get_ldev(mdev)) { |
1047 | drbd_rs_complete_io(mdev, e->sector); | 1087 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1048 | put_ldev(mdev); | 1088 | put_ldev(mdev); |
1049 | } | 1089 | } |
1050 | 1090 | ||
1051 | di = e->digest; | 1091 | di = peer_req->digest; |
1052 | 1092 | ||
1053 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1093 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1054 | /* quick hack to try to avoid a race against reconfiguration. | 1094 | /* quick hack to try to avoid a race against reconfiguration. |
1055 | * a real fix would be much more involved, | 1095 | * a real fix would be much more involved, |
1056 | * introducing more locking mechanisms */ | 1096 | * introducing more locking mechanisms */ |
1057 | if (mdev->csums_tfm) { | 1097 | if (mdev->tconn->csums_tfm) { |
1058 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 1098 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
1059 | D_ASSERT(digest_size == di->digest_size); | 1099 | D_ASSERT(digest_size == di->digest_size); |
1060 | digest = kmalloc(digest_size, GFP_NOIO); | 1100 | digest = kmalloc(digest_size, GFP_NOIO); |
1061 | } | 1101 | } |
1062 | if (digest) { | 1102 | if (digest) { |
1063 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 1103 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
1064 | eq = !memcmp(digest, di->digest, digest_size); | 1104 | eq = !memcmp(digest, di->digest, digest_size); |
1065 | kfree(digest); | 1105 | kfree(digest); |
1066 | } | 1106 | } |
1067 | 1107 | ||
1068 | if (eq) { | 1108 | if (eq) { |
1069 | drbd_set_in_sync(mdev, e->sector, e->size); | 1109 | drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1070 | /* rs_same_csums unit is BM_BLOCK_SIZE */ | 1110 | /* rs_same_csums unit is BM_BLOCK_SIZE */ |
1071 | mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; | 1111 | mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; |
1072 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | 1112 | err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req); |
1073 | } else { | 1113 | } else { |
1074 | inc_rs_pending(mdev); | 1114 | inc_rs_pending(mdev); |
1075 | e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ | 1115 | peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ |
1076 | e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ | 1116 | peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ |
1077 | kfree(di); | 1117 | kfree(di); |
1078 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1118 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1079 | } | 1119 | } |
1080 | } else { | 1120 | } else { |
1081 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1121 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1082 | if (__ratelimit(&drbd_ratelimit_state)) | 1122 | if (__ratelimit(&drbd_ratelimit_state)) |
1083 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | 1123 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); |
1084 | } | 1124 | } |
1085 | 1125 | ||
1086 | dec_unacked(mdev); | 1126 | dec_unacked(mdev); |
1087 | move_to_net_ee_or_free(mdev, e); | 1127 | move_to_net_ee_or_free(mdev, peer_req); |
1088 | 1128 | ||
1089 | if (unlikely(!ok)) | 1129 | if (unlikely(err)) |
1090 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | 1130 | dev_err(DEV, "drbd_send_block/ack() failed\n"); |
1091 | return ok; | 1131 | return err; |
1092 | } | 1132 | } |
1093 | 1133 | ||
1094 | /* TODO merge common code with w_e_send_csum */ | 1134 | int w_e_end_ov_req(struct drbd_work *w, int cancel) |
1095 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1096 | { | 1135 | { |
1097 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1136 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1098 | sector_t sector = e->sector; | 1137 | struct drbd_conf *mdev = w->mdev; |
1099 | unsigned int size = e->size; | 1138 | sector_t sector = peer_req->i.sector; |
1139 | unsigned int size = peer_req->i.size; | ||
1100 | int digest_size; | 1140 | int digest_size; |
1101 | void *digest; | 1141 | void *digest; |
1102 | int ok = 1; | 1142 | int err = 0; |
1103 | 1143 | ||
1104 | if (unlikely(cancel)) | 1144 | if (unlikely(cancel)) |
1105 | goto out; | 1145 | goto out; |
1106 | 1146 | ||
1107 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1147 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1108 | digest = kmalloc(digest_size, GFP_NOIO); | 1148 | digest = kmalloc(digest_size, GFP_NOIO); |
1109 | if (!digest) { | 1149 | if (!digest) { |
1110 | ok = 0; /* terminate the connection in case the allocation failed */ | 1150 | err = 1; /* terminate the connection in case the allocation failed */ |
1111 | goto out; | 1151 | goto out; |
1112 | } | 1152 | } |
1113 | 1153 | ||
1114 | if (likely(!(e->flags & EE_WAS_ERROR))) | 1154 | if (likely(!(peer_req->flags & EE_WAS_ERROR))) |
1115 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1155 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1116 | else | 1156 | else |
1117 | memset(digest, 0, digest_size); | 1157 | memset(digest, 0, digest_size); |
1118 | 1158 | ||
@@ -1120,25 +1160,23 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1120 | * In case we block on congestion, we could otherwise run into | 1160 | * In case we block on congestion, we could otherwise run into |
1121 | * some distributed deadlock, if the other side blocks on | 1161 | * some distributed deadlock, if the other side blocks on |
1122 | * congestion as well, because our receiver blocks in | 1162 | * congestion as well, because our receiver blocks in |
1123 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1163 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1124 | drbd_free_ee(mdev, e); | 1164 | drbd_free_peer_req(mdev, peer_req); |
1125 | e = NULL; | 1165 | peer_req = NULL; |
1126 | inc_rs_pending(mdev); | 1166 | inc_rs_pending(mdev); |
1127 | ok = drbd_send_drequest_csum(mdev, sector, size, | 1167 | err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY); |
1128 | digest, digest_size, | 1168 | if (err) |
1129 | P_OV_REPLY); | ||
1130 | if (!ok) | ||
1131 | dec_rs_pending(mdev); | 1169 | dec_rs_pending(mdev); |
1132 | kfree(digest); | 1170 | kfree(digest); |
1133 | 1171 | ||
1134 | out: | 1172 | out: |
1135 | if (e) | 1173 | if (peer_req) |
1136 | drbd_free_ee(mdev, e); | 1174 | drbd_free_peer_req(mdev, peer_req); |
1137 | dec_unacked(mdev); | 1175 | dec_unacked(mdev); |
1138 | return ok; | 1176 | return err; |
1139 | } | 1177 | } |
1140 | 1178 | ||
1141 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | 1179 | void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size) |
1142 | { | 1180 | { |
1143 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | 1181 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { |
1144 | mdev->ov_last_oos_size += size>>9; | 1182 | mdev->ov_last_oos_size += size>>9; |
@@ -1149,36 +1187,38 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | |||
1149 | drbd_set_out_of_sync(mdev, sector, size); | 1187 | drbd_set_out_of_sync(mdev, sector, size); |
1150 | } | 1188 | } |
1151 | 1189 | ||
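drbd_ov_out_of_sync_found() above coalesces consecutive out-of-sync hits into one running extent, so only the boundaries of each run are recorded and reported. A stand-alone sketch of that bookkeeping (simplified and illustrative; the driver keeps the state in mdev->ov_last_oos_start and mdev->ov_last_oos_size):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

static sector_t last_start;	/* start of the current out-of-sync run */
static sector_t last_size;	/* length of the run, in 512-byte sectors */

static void oos_found(sector_t sector, unsigned int size_bytes)
{
	if (last_size && last_start + last_size == sector) {
		last_size += size_bytes >> 9;		/* extend the current run */
	} else {
		if (last_size)				/* close the previous run */
			printf("run: start=%llu, %llu sectors\n",
			       (unsigned long long)last_start,
			       (unsigned long long)last_size);
		last_start = sector;			/* begin a new run */
		last_size = size_bytes >> 9;
	}
}

int main(void)
{
	oos_found(0, 4096);	/* sectors 0..7 */
	oos_found(8, 4096);	/* contiguous: extends the first run to 16 sectors */
	oos_found(64, 4096);	/* gap: prints the first run, starts a new one */
	return 0;
}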
1152 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1190 | int w_e_end_ov_reply(struct drbd_work *w, int cancel) |
1153 | { | 1191 | { |
1154 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1192 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1193 | struct drbd_conf *mdev = w->mdev; | ||
1155 | struct digest_info *di; | 1194 | struct digest_info *di; |
1156 | void *digest; | 1195 | void *digest; |
1157 | sector_t sector = e->sector; | 1196 | sector_t sector = peer_req->i.sector; |
1158 | unsigned int size = e->size; | 1197 | unsigned int size = peer_req->i.size; |
1159 | int digest_size; | 1198 | int digest_size; |
1160 | int ok, eq = 0; | 1199 | int err, eq = 0; |
1200 | bool stop_sector_reached = false; | ||
1161 | 1201 | ||
1162 | if (unlikely(cancel)) { | 1202 | if (unlikely(cancel)) { |
1163 | drbd_free_ee(mdev, e); | 1203 | drbd_free_peer_req(mdev, peer_req); |
1164 | dec_unacked(mdev); | 1204 | dec_unacked(mdev); |
1165 | return 1; | 1205 | return 0; |
1166 | } | 1206 | } |
1167 | 1207 | ||
1168 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all | 1208 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all |
1169 | * the resync lru has been cleaned up already */ | 1209 | * the resync lru has been cleaned up already */ |
1170 | if (get_ldev(mdev)) { | 1210 | if (get_ldev(mdev)) { |
1171 | drbd_rs_complete_io(mdev, e->sector); | 1211 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1172 | put_ldev(mdev); | 1212 | put_ldev(mdev); |
1173 | } | 1213 | } |
1174 | 1214 | ||
1175 | di = e->digest; | 1215 | di = peer_req->digest; |
1176 | 1216 | ||
1177 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1217 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1178 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1218 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1179 | digest = kmalloc(digest_size, GFP_NOIO); | 1219 | digest = kmalloc(digest_size, GFP_NOIO); |
1180 | if (digest) { | 1220 | if (digest) { |
1181 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1221 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1182 | 1222 | ||
1183 | D_ASSERT(digest_size == di->digest_size); | 1223 | D_ASSERT(digest_size == di->digest_size); |
1184 | eq = !memcmp(digest, di->digest, digest_size); | 1224 | eq = !memcmp(digest, di->digest, digest_size); |
@@ -1186,19 +1226,19 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1186 | } | 1226 | } |
1187 | } | 1227 | } |
1188 | 1228 | ||
1189 | /* Free e and pages before send. | 1229 | /* Free peer_req and pages before send. |
1190 | * In case we block on congestion, we could otherwise run into | 1230 | * In case we block on congestion, we could otherwise run into |
1191 | * some distributed deadlock, if the other side blocks on | 1231 | * some distributed deadlock, if the other side blocks on |
1192 | * congestion as well, because our receiver blocks in | 1232 | * congestion as well, because our receiver blocks in |
1193 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1233 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1194 | drbd_free_ee(mdev, e); | 1234 | drbd_free_peer_req(mdev, peer_req); |
1195 | if (!eq) | 1235 | if (!eq) |
1196 | drbd_ov_oos_found(mdev, sector, size); | 1236 | drbd_ov_out_of_sync_found(mdev, sector, size); |
1197 | else | 1237 | else |
1198 | ov_oos_print(mdev); | 1238 | ov_out_of_sync_print(mdev); |
1199 | 1239 | ||
1200 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, | 1240 | err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, |
1201 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | 1241 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); |
1202 | 1242 | ||
1203 | dec_unacked(mdev); | 1243 | dec_unacked(mdev); |
1204 | 1244 | ||
@@ -1208,73 +1248,102 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1208 | if ((mdev->ov_left & 0x200) == 0x200) | 1248 | if ((mdev->ov_left & 0x200) == 0x200) |
1209 | drbd_advance_rs_marks(mdev, mdev->ov_left); | 1249 | drbd_advance_rs_marks(mdev, mdev->ov_left); |
1210 | 1250 | ||
1211 | if (mdev->ov_left == 0) { | 1251 | stop_sector_reached = verify_can_do_stop_sector(mdev) && |
1212 | ov_oos_print(mdev); | 1252 | (sector + (size>>9)) >= mdev->ov_stop_sector; |
1253 | |||
1254 | if (mdev->ov_left == 0 || stop_sector_reached) { | ||
1255 | ov_out_of_sync_print(mdev); | ||
1213 | drbd_resync_finished(mdev); | 1256 | drbd_resync_finished(mdev); |
1214 | } | 1257 | } |
1215 | 1258 | ||
1216 | return ok; | 1259 | return err; |
1217 | } | 1260 | } |
1218 | 1261 | ||
1219 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1262 | int w_prev_work_done(struct drbd_work *w, int cancel) |
1220 | { | 1263 | { |
1221 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | 1264 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); |
1265 | |||
1222 | complete(&b->done); | 1266 | complete(&b->done); |
1223 | return 1; | 1267 | return 0; |
1224 | } | 1268 | } |
1225 | 1269 | ||
1226 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1270 | /* FIXME |
1271 | * We need to track the number of pending barrier acks, | ||
1272 | * and to be able to wait for them. | ||
1273 | * See also comment in drbd_adm_attach before drbd_suspend_io. | ||
1274 | */ | ||
1275 | int drbd_send_barrier(struct drbd_tconn *tconn) | ||
1227 | { | 1276 | { |
1228 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | 1277 | struct p_barrier *p; |
1229 | struct p_barrier *p = &mdev->data.sbuf.barrier; | 1278 | struct drbd_socket *sock; |
1230 | int ok = 1; | ||
1231 | |||
1232 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1233 | * just before it was reassigned and re-queued, so double check that. | ||
1234 | * actually, this race was harmless, since we only try to send the | ||
1235 | * barrier packet here, and otherwise do nothing with the object. | ||
1236 | * but compare with the head of w_clear_epoch */ | ||
1237 | spin_lock_irq(&mdev->req_lock); | ||
1238 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1239 | cancel = 1; | ||
1240 | spin_unlock_irq(&mdev->req_lock); | ||
1241 | if (cancel) | ||
1242 | return 1; | ||
1243 | 1279 | ||
1244 | if (!drbd_get_data_sock(mdev)) | 1280 | sock = &tconn->data; |
1245 | return 0; | 1281 | p = conn_prepare_command(tconn, sock); |
1246 | p->barrier = b->br_number; | 1282 | if (!p) |
1247 | /* inc_ap_pending was done where this was queued. | 1283 | return -EIO; |
1248 | * dec_ap_pending will be done in got_BarrierAck | 1284 | p->barrier = tconn->send.current_epoch_nr; |
1249 | * or (on connection loss) in w_clear_epoch. */ | 1285 | p->pad = 0; |
1250 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | 1286 | tconn->send.current_epoch_writes = 0; |
1251 | (struct p_header80 *)p, sizeof(*p), 0); | 1287 | |
1252 | drbd_put_data_sock(mdev); | 1288 | return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0); |
1253 | |||
1254 | return ok; | ||
1255 | } | 1289 | } |
1256 | 1290 | ||
1257 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1291 | int w_send_write_hint(struct drbd_work *w, int cancel) |
1258 | { | 1292 | { |
1293 | struct drbd_conf *mdev = w->mdev; | ||
1294 | struct drbd_socket *sock; | ||
1295 | |||
1259 | if (cancel) | 1296 | if (cancel) |
1260 | return 1; | 1297 | return 0; |
1261 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | 1298 | sock = &mdev->tconn->data; |
1299 | if (!drbd_prepare_command(mdev, sock)) | ||
1300 | return -EIO; | ||
1301 | return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0); | ||
1262 | } | 1302 | } |
1263 | 1303 | ||
1264 | int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1304 | static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch) |
1305 | { | ||
1306 | if (!tconn->send.seen_any_write_yet) { | ||
1307 | tconn->send.seen_any_write_yet = true; | ||
1308 | tconn->send.current_epoch_nr = epoch; | ||
1309 | tconn->send.current_epoch_writes = 0; | ||
1310 | } | ||
1311 | } | ||
1312 | |||
1313 | static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch) | ||
1314 | { | ||
1315 | /* re-init if first write on this connection */ | ||
1316 | if (!tconn->send.seen_any_write_yet) | ||
1317 | return; | ||
1318 | if (tconn->send.current_epoch_nr != epoch) { | ||
1319 | if (tconn->send.current_epoch_writes) | ||
1320 | drbd_send_barrier(tconn); | ||
1321 | tconn->send.current_epoch_nr = epoch; | ||
1322 | } | ||
1323 | } | ||
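Editor's note: the two helpers above implement the per-connection epoch bookkeeping for the send path; the first write ever initializes the counters, and a P_BARRIER is emitted only when a request from a new epoch is about to go out and the previous epoch actually carried writes. A standalone toy re-enactment of that decision follows (illustration only; the names and the printf stand in for the real tconn->send fields and drbd_send_barrier()).

#include <stdbool.h>
#include <stdio.h>

struct send_state {                     /* mirrors tconn->send.* for the toy */
        bool seen_any_write_yet;
        unsigned int current_epoch_nr;
        unsigned int current_epoch_writes;
};

/* Combined sketch of re_init_if_first_write() + maybe_send_barrier() plus
 * the current_epoch_writes++ done in w_send_dblock(). */
static void toy_send_write(struct send_state *s, unsigned int req_epoch)
{
        if (!s->seen_any_write_yet) {
                s->seen_any_write_yet = true;
                s->current_epoch_nr = req_epoch;
                s->current_epoch_writes = 0;
        } else if (s->current_epoch_nr != req_epoch) {
                if (s->current_epoch_writes) {
                        printf("P_BARRIER closes epoch %u\n", s->current_epoch_nr);
                        s->current_epoch_writes = 0;    /* done by drbd_send_barrier() */
                }
                s->current_epoch_nr = req_epoch;
        }
        s->current_epoch_writes++;
        printf("P_DATA in epoch %u\n", req_epoch);
}

int main(void)
{
        struct send_state s = { false, 0, 0 };

        toy_send_write(&s, 5);  /* first write: init only, no barrier */
        toy_send_write(&s, 5);  /* same epoch: no barrier             */
        toy_send_write(&s, 6);  /* new epoch: P_BARRIER for epoch 5   */
        return 0;
}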
1324 | |||
1325 | int w_send_out_of_sync(struct drbd_work *w, int cancel) | ||
1265 | { | 1326 | { |
1266 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1327 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1267 | int ok; | 1328 | struct drbd_conf *mdev = w->mdev; |
1329 | struct drbd_tconn *tconn = mdev->tconn; | ||
1330 | int err; | ||
1268 | 1331 | ||
1269 | if (unlikely(cancel)) { | 1332 | if (unlikely(cancel)) { |
1270 | req_mod(req, send_canceled); | 1333 | req_mod(req, SEND_CANCELED); |
1271 | return 1; | 1334 | return 0; |
1272 | } | 1335 | } |
1273 | 1336 | ||
1274 | ok = drbd_send_oos(mdev, req); | 1337 | /* this time, no tconn->send.current_epoch_writes++; |
1275 | req_mod(req, oos_handed_to_network); | 1338 | * If it was sent, it was the closing barrier for the last |
1339 | * replicated epoch, before we went into AHEAD mode. | ||
1340 | * No more barriers will be sent, until we leave AHEAD mode again. */ | ||
1341 | maybe_send_barrier(tconn, req->epoch); | ||
1342 | |||
1343 | err = drbd_send_out_of_sync(mdev, req); | ||
1344 | req_mod(req, OOS_HANDED_TO_NETWORK); | ||
1276 | 1345 | ||
1277 | return ok; | 1346 | return err; |
1278 | } | 1347 | } |
1279 | 1348 | ||
1280 | /** | 1349 | /** |
@@ -1283,20 +1352,26 @@ int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1283 | * @w: work object. | 1352 | * @w: work object. |
1284 | * @cancel: The connection will be closed anyways | 1353 | * @cancel: The connection will be closed anyways |
1285 | */ | 1354 | */ |
1286 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1355 | int w_send_dblock(struct drbd_work *w, int cancel) |
1287 | { | 1356 | { |
1288 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1357 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1289 | int ok; | 1358 | struct drbd_conf *mdev = w->mdev; |
1359 | struct drbd_tconn *tconn = mdev->tconn; | ||
1360 | int err; | ||
1290 | 1361 | ||
1291 | if (unlikely(cancel)) { | 1362 | if (unlikely(cancel)) { |
1292 | req_mod(req, send_canceled); | 1363 | req_mod(req, SEND_CANCELED); |
1293 | return 1; | 1364 | return 0; |
1294 | } | 1365 | } |
1295 | 1366 | ||
1296 | ok = drbd_send_dblock(mdev, req); | 1367 | re_init_if_first_write(tconn, req->epoch); |
1297 | req_mod(req, ok ? handed_over_to_network : send_failed); | 1368 | maybe_send_barrier(tconn, req->epoch); |
1369 | tconn->send.current_epoch_writes++; | ||
1370 | |||
1371 | err = drbd_send_dblock(mdev, req); | ||
1372 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); | ||
1298 | 1373 | ||
1299 | return ok; | 1374 | return err; |
1300 | } | 1375 | } |
1301 | 1376 | ||
1302 | /** | 1377 | /** |
@@ -1305,57 +1380,61 @@ int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1305 | * @w: work object. | 1380 | * @w: work object. |
1306 | * @cancel: The connection will be closed anyways | 1381 | * @cancel: The connection will be closed anyways |
1307 | */ | 1382 | */ |
1308 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1383 | int w_send_read_req(struct drbd_work *w, int cancel) |
1309 | { | 1384 | { |
1310 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1385 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1311 | int ok; | 1386 | struct drbd_conf *mdev = w->mdev; |
1387 | struct drbd_tconn *tconn = mdev->tconn; | ||
1388 | int err; | ||
1312 | 1389 | ||
1313 | if (unlikely(cancel)) { | 1390 | if (unlikely(cancel)) { |
1314 | req_mod(req, send_canceled); | 1391 | req_mod(req, SEND_CANCELED); |
1315 | return 1; | 1392 | return 0; |
1316 | } | 1393 | } |
1317 | 1394 | ||
1318 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | 1395 | /* Even read requests may close a write epoch, |
1319 | (unsigned long)req); | 1396 | * if there was any yet. */ |
1397 | maybe_send_barrier(tconn, req->epoch); | ||
1320 | 1398 | ||
1321 | if (!ok) { | 1399 | err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size, |
1322 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | 1400 | (unsigned long)req); |
1323 | * so this is probably redundant */ | 1401 | |
1324 | if (mdev->state.conn >= C_CONNECTED) | 1402 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); |
1325 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1326 | } | ||
1327 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1328 | 1403 | ||
1329 | return ok; | 1404 | return err; |
1330 | } | 1405 | } |
1331 | 1406 | ||
1332 | int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1407 | int w_restart_disk_io(struct drbd_work *w, int cancel) |
1333 | { | 1408 | { |
1334 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1409 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1410 | struct drbd_conf *mdev = w->mdev; | ||
1335 | 1411 | ||
1336 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) | 1412 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) |
1337 | drbd_al_begin_io(mdev, req->sector); | 1413 | drbd_al_begin_io(mdev, &req->i); |
1338 | /* Calling drbd_al_begin_io() out of the worker might deadlock | ||
1339 | theoretically. Practically it cannot deadlock, since this is | ||
1340 | only used when unfreezing IOs. All the extents of the requests | ||
1341 | that made it into the TL are already active */ | ||
1342 | 1414 | ||
1343 | drbd_req_make_private_bio(req, req->master_bio); | 1415 | drbd_req_make_private_bio(req, req->master_bio); |
1344 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | 1416 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; |
1345 | generic_make_request(req->private_bio); | 1417 | generic_make_request(req->private_bio); |
1346 | 1418 | ||
1347 | return 1; | 1419 | return 0; |
1348 | } | 1420 | } |
1349 | 1421 | ||
1350 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | 1422 | static int _drbd_may_sync_now(struct drbd_conf *mdev) |
1351 | { | 1423 | { |
1352 | struct drbd_conf *odev = mdev; | 1424 | struct drbd_conf *odev = mdev; |
1425 | int resync_after; | ||
1353 | 1426 | ||
1354 | while (1) { | 1427 | while (1) { |
1355 | if (odev->sync_conf.after == -1) | 1428 | if (!odev->ldev) |
1429 | return 1; | ||
1430 | rcu_read_lock(); | ||
1431 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1432 | rcu_read_unlock(); | ||
1433 | if (resync_after == -1) | ||
1434 | return 1; | ||
1435 | odev = minor_to_mdev(resync_after); | ||
1436 | if (!expect(odev)) | ||
1356 | return 1; | 1437 | return 1; |
1357 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1358 | ERR_IF(!odev) return 1; | ||
1359 | if ((odev->state.conn >= C_SYNC_SOURCE && | 1438 | if ((odev->state.conn >= C_SYNC_SOURCE && |
1360 | odev->state.conn <= C_PAUSED_SYNC_T) || | 1439 | odev->state.conn <= C_PAUSED_SYNC_T) || |
1361 | odev->state.aftr_isp || odev->state.peer_isp || | 1440 | odev->state.aftr_isp || odev->state.peer_isp || |
@@ -1375,16 +1454,15 @@ static int _drbd_pause_after(struct drbd_conf *mdev) | |||
1375 | struct drbd_conf *odev; | 1454 | struct drbd_conf *odev; |
1376 | int i, rv = 0; | 1455 | int i, rv = 0; |
1377 | 1456 | ||
1378 | for (i = 0; i < minor_count; i++) { | 1457 | rcu_read_lock(); |
1379 | odev = minor_to_mdev(i); | 1458 | idr_for_each_entry(&minors, odev, i) { |
1380 | if (!odev) | ||
1381 | continue; | ||
1382 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1459 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1383 | continue; | 1460 | continue; |
1384 | if (!_drbd_may_sync_now(odev)) | 1461 | if (!_drbd_may_sync_now(odev)) |
1385 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | 1462 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) |
1386 | != SS_NOTHING_TO_DO); | 1463 | != SS_NOTHING_TO_DO); |
1387 | } | 1464 | } |
1465 | rcu_read_unlock(); | ||
1388 | 1466 | ||
1389 | return rv; | 1467 | return rv; |
1390 | } | 1468 | } |
@@ -1400,10 +1478,8 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1400 | struct drbd_conf *odev; | 1478 | struct drbd_conf *odev; |
1401 | int i, rv = 0; | 1479 | int i, rv = 0; |
1402 | 1480 | ||
1403 | for (i = 0; i < minor_count; i++) { | 1481 | rcu_read_lock(); |
1404 | odev = minor_to_mdev(i); | 1482 | idr_for_each_entry(&minors, odev, i) { |
1405 | if (!odev) | ||
1406 | continue; | ||
1407 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1483 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1408 | continue; | 1484 | continue; |
1409 | if (odev->state.aftr_isp) { | 1485 | if (odev->state.aftr_isp) { |
@@ -1413,6 +1489,7 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1413 | != SS_NOTHING_TO_DO) ; | 1489 | != SS_NOTHING_TO_DO) ; |
1414 | } | 1490 | } |
1415 | } | 1491 | } |
1492 | rcu_read_unlock(); | ||
1416 | return rv; | 1493 | return rv; |
1417 | } | 1494 | } |
1418 | 1495 | ||
@@ -1430,57 +1507,86 @@ void suspend_other_sg(struct drbd_conf *mdev) | |||
1430 | write_unlock_irq(&global_state_lock); | 1507 | write_unlock_irq(&global_state_lock); |
1431 | } | 1508 | } |
1432 | 1509 | ||
1433 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | 1510 | /* caller must hold global_state_lock */ |
1511 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | ||
1434 | { | 1512 | { |
1435 | struct drbd_conf *odev; | 1513 | struct drbd_conf *odev; |
1514 | int resync_after; | ||
1436 | 1515 | ||
1437 | if (o_minor == -1) | 1516 | if (o_minor == -1) |
1438 | return NO_ERROR; | 1517 | return NO_ERROR; |
1439 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | 1518 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) |
1440 | return ERR_SYNC_AFTER; | 1519 | return ERR_RESYNC_AFTER; |
1441 | 1520 | ||
1442 | /* check for loops */ | 1521 | /* check for loops */ |
1443 | odev = minor_to_mdev(o_minor); | 1522 | odev = minor_to_mdev(o_minor); |
1444 | while (1) { | 1523 | while (1) { |
1445 | if (odev == mdev) | 1524 | if (odev == mdev) |
1446 | return ERR_SYNC_AFTER_CYCLE; | 1525 | return ERR_RESYNC_AFTER_CYCLE; |
1447 | 1526 | ||
1527 | rcu_read_lock(); | ||
1528 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1529 | rcu_read_unlock(); | ||
1448 | /* dependency chain ends here, no cycles. */ | 1530 | /* dependency chain ends here, no cycles. */ |
1449 | if (odev->sync_conf.after == -1) | 1531 | if (resync_after == -1) |
1450 | return NO_ERROR; | 1532 | return NO_ERROR; |
1451 | 1533 | ||
1452 | /* follow the dependency chain */ | 1534 | /* follow the dependency chain */ |
1453 | odev = minor_to_mdev(odev->sync_conf.after); | 1535 | odev = minor_to_mdev(resync_after); |
1454 | } | 1536 | } |
1455 | } | 1537 | } |
1456 | 1538 | ||
1457 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | 1539 | /* caller must hold global_state_lock */ |
1540 | void drbd_resync_after_changed(struct drbd_conf *mdev) | ||
1458 | { | 1541 | { |
1459 | int changes; | 1542 | int changes; |
1460 | int retcode; | ||
1461 | 1543 | ||
1462 | write_lock_irq(&global_state_lock); | 1544 | do { |
1463 | retcode = sync_after_error(mdev, na); | 1545 | changes = _drbd_pause_after(mdev); |
1464 | if (retcode == NO_ERROR) { | 1546 | changes |= _drbd_resume_next(mdev); |
1465 | mdev->sync_conf.after = na; | 1547 | } while (changes); |
1466 | do { | ||
1467 | changes = _drbd_pause_after(mdev); | ||
1468 | changes |= _drbd_resume_next(mdev); | ||
1469 | } while (changes); | ||
1470 | } | ||
1471 | write_unlock_irq(&global_state_lock); | ||
1472 | return retcode; | ||
1473 | } | 1548 | } |
1474 | 1549 | ||
1475 | void drbd_rs_controller_reset(struct drbd_conf *mdev) | 1550 | void drbd_rs_controller_reset(struct drbd_conf *mdev) |
1476 | { | 1551 | { |
1552 | struct fifo_buffer *plan; | ||
1553 | |||
1477 | atomic_set(&mdev->rs_sect_in, 0); | 1554 | atomic_set(&mdev->rs_sect_in, 0); |
1478 | atomic_set(&mdev->rs_sect_ev, 0); | 1555 | atomic_set(&mdev->rs_sect_ev, 0); |
1479 | mdev->rs_in_flight = 0; | 1556 | mdev->rs_in_flight = 0; |
1480 | mdev->rs_planed = 0; | 1557 | |
1481 | spin_lock(&mdev->peer_seq_lock); | 1558 | /* Updating the RCU protected object in place is necessary since |
1482 | fifo_set(&mdev->rs_plan_s, 0); | 1559 | this function gets called from atomic context. |
1483 | spin_unlock(&mdev->peer_seq_lock); | 1560 | It is valid since all other updates also lead to a completely |
1561 | empty fifo */ | ||
1562 | rcu_read_lock(); | ||
1563 | plan = rcu_dereference(mdev->rs_plan_s); | ||
1564 | plan->total = 0; | ||
1565 | fifo_set(plan, 0); | ||
1566 | rcu_read_unlock(); | ||
1567 | } | ||
1568 | |||
1569 | void start_resync_timer_fn(unsigned long data) | ||
1570 | { | ||
1571 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
1572 | |||
1573 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work); | ||
1574 | } | ||
1575 | |||
1576 | int w_start_resync(struct drbd_work *w, int cancel) | ||
1577 | { | ||
1578 | struct drbd_conf *mdev = w->mdev; | ||
1579 | |||
1580 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
1581 | dev_warn(DEV, "w_start_resync later...\n"); | ||
1582 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
1583 | add_timer(&mdev->start_resync_timer); | ||
1584 | return 0; | ||
1585 | } | ||
1586 | |||
1587 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1588 | clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); | ||
1589 | return 0; | ||
1484 | } | 1590 | } |
1485 | 1591 | ||
1486 | /** | 1592 | /** |
@@ -1501,43 +1607,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1501 | return; | 1607 | return; |
1502 | } | 1608 | } |
1503 | 1609 | ||
1504 | if (side == C_SYNC_TARGET) { | 1610 | if (!test_bit(B_RS_H_DONE, &mdev->flags)) { |
1505 | /* Since application IO was locked out during C_WF_BITMAP_T and | 1611 | if (side == C_SYNC_TARGET) { |
1506 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | 1612 | /* Since application IO was locked out during C_WF_BITMAP_T and |
1507 | we check that we might make the data inconsistent. */ | 1613 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET |
1508 | r = drbd_khelper(mdev, "before-resync-target"); | 1614 | we check that we might make the data inconsistent. */ |
1509 | r = (r >> 8) & 0xff; | 1615 | r = drbd_khelper(mdev, "before-resync-target"); |
1510 | if (r > 0) { | 1616 | r = (r >> 8) & 0xff; |
1511 | dev_info(DEV, "before-resync-target handler returned %d, " | 1617 | if (r > 0) { |
1512 | "dropping connection.\n", r); | 1618 | dev_info(DEV, "before-resync-target handler returned %d, " |
1513 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1514 | return; | ||
1515 | } | ||
1516 | } else /* C_SYNC_SOURCE */ { | ||
1517 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1518 | r = (r >> 8) & 0xff; | ||
1519 | if (r > 0) { | ||
1520 | if (r == 3) { | ||
1521 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1522 | "ignoring. Old userland tools?", r); | ||
1523 | } else { | ||
1524 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1525 | "dropping connection.\n", r); | 1619 | "dropping connection.\n", r); |
1526 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 1620 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
1527 | return; | 1621 | return; |
1528 | } | 1622 | } |
1623 | } else /* C_SYNC_SOURCE */ { | ||
1624 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1625 | r = (r >> 8) & 0xff; | ||
1626 | if (r > 0) { | ||
1627 | if (r == 3) { | ||
1628 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1629 | "ignoring. Old userland tools?", r); | ||
1630 | } else { | ||
1631 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1632 | "dropping connection.\n", r); | ||
1633 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
1634 | return; | ||
1635 | } | ||
1636 | } | ||
1529 | } | 1637 | } |
1530 | } | 1638 | } |
1531 | 1639 | ||
1532 | drbd_state_lock(mdev); | 1640 | if (current == mdev->tconn->worker.task) { |
1641 | /* The worker should not sleep waiting for state_mutex, | ||
1642 | as that can take a long time */ | ||
1643 | if (!mutex_trylock(mdev->state_mutex)) { | ||
1644 | set_bit(B_RS_H_DONE, &mdev->flags); | ||
1645 | mdev->start_resync_timer.expires = jiffies + HZ/5; | ||
1646 | add_timer(&mdev->start_resync_timer); | ||
1647 | return; | ||
1648 | } | ||
1649 | } else { | ||
1650 | mutex_lock(mdev->state_mutex); | ||
1651 | } | ||
1652 | clear_bit(B_RS_H_DONE, &mdev->flags); | ||
1653 | |||
1533 | write_lock_irq(&global_state_lock); | 1654 | write_lock_irq(&global_state_lock); |
1534 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | 1655 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { |
1535 | write_unlock_irq(&global_state_lock); | 1656 | write_unlock_irq(&global_state_lock); |
1536 | drbd_state_unlock(mdev); | 1657 | mutex_unlock(mdev->state_mutex); |
1537 | return; | 1658 | return; |
1538 | } | 1659 | } |
1539 | 1660 | ||
1540 | ns.i = mdev->state.i; | 1661 | ns = drbd_read_state(mdev); |
1541 | 1662 | ||
1542 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | 1663 | ns.aftr_isp = !_drbd_may_sync_now(mdev); |
1543 | 1664 | ||
@@ -1549,7 +1670,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1549 | ns.pdsk = D_INCONSISTENT; | 1670 | ns.pdsk = D_INCONSISTENT; |
1550 | 1671 | ||
1551 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1672 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1552 | ns = mdev->state; | 1673 | ns = drbd_read_state(mdev); |
1553 | 1674 | ||
1554 | if (ns.conn < C_CONNECTED) | 1675 | if (ns.conn < C_CONNECTED) |
1555 | r = SS_UNKNOWN_ERROR; | 1676 | r = SS_UNKNOWN_ERROR; |
@@ -1575,6 +1696,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1575 | write_unlock_irq(&global_state_lock); | 1696 | write_unlock_irq(&global_state_lock); |
1576 | 1697 | ||
1577 | if (r == SS_SUCCESS) { | 1698 | if (r == SS_SUCCESS) { |
1699 | /* reset rs_last_bcast when a resync or verify is started, | ||
1700 | * to deal with potential jiffies wrap. */ | ||
1701 | mdev->rs_last_bcast = jiffies - HZ; | ||
1702 | |||
1578 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | 1703 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", |
1579 | drbd_conn_str(ns.conn), | 1704 | drbd_conn_str(ns.conn), |
1580 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | 1705 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), |
@@ -1589,10 +1714,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1589 | * drbd_resync_finished from here in that case. | 1714 | * drbd_resync_finished from here in that case. |
1590 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, | 1715 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, |
1591 | * and from after_state_ch otherwise. */ | 1716 | * and from after_state_ch otherwise. */ |
1592 | if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96) | 1717 | if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96) |
1593 | drbd_gen_and_send_sync_uuid(mdev); | 1718 | drbd_gen_and_send_sync_uuid(mdev); |
1594 | 1719 | ||
1595 | if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { | 1720 | if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) { |
1596 | /* This still has a race (about when exactly the peers | 1721 | /* This still has a race (about when exactly the peers |
1597 | * detect connection loss) that can lead to a full sync | 1722 | * detect connection loss) that can lead to a full sync |
1598 | * on next handshake. In 8.3.9 we fixed this with explicit | 1723 | * on next handshake. In 8.3.9 we fixed this with explicit |
@@ -1603,10 +1728,16 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1603 | * detect connection loss, then waiting for a ping | 1728 | * detect connection loss, then waiting for a ping |
1604 | * response (implicit in drbd_resync_finished) reduces | 1729 | * response (implicit in drbd_resync_finished) reduces |
1605 | * the race considerably, but does not solve it. */ | 1730 | * the race considerably, but does not solve it. */ |
1606 | if (side == C_SYNC_SOURCE) | 1731 | if (side == C_SYNC_SOURCE) { |
1607 | schedule_timeout_interruptible( | 1732 | struct net_conf *nc; |
1608 | mdev->net_conf->ping_int * HZ + | 1733 | int timeo; |
1609 | mdev->net_conf->ping_timeo*HZ/9); | 1734 | |
1735 | rcu_read_lock(); | ||
1736 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
1737 | timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; | ||
1738 | rcu_read_unlock(); | ||
1739 | schedule_timeout_interruptible(timeo); | ||
1740 | } | ||
1610 | drbd_resync_finished(mdev); | 1741 | drbd_resync_finished(mdev); |
1611 | } | 1742 | } |
1612 | 1743 | ||
@@ -1621,114 +1752,180 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1621 | drbd_md_sync(mdev); | 1752 | drbd_md_sync(mdev); |
1622 | } | 1753 | } |
1623 | put_ldev(mdev); | 1754 | put_ldev(mdev); |
1624 | drbd_state_unlock(mdev); | 1755 | mutex_unlock(mdev->state_mutex); |
1625 | } | 1756 | } |
1626 | 1757 | ||
1627 | int drbd_worker(struct drbd_thread *thi) | 1758 | /* If the resource already closed the current epoch, but we did not |
1759 | * (because we have not yet seen new requests), we should send the | ||
1760 | * corresponding barrier now. Must be checked within the same spinlock | ||
1761 | * that is used to check for new requests. */ | ||
1762 | bool need_to_send_barrier(struct drbd_tconn *connection) | ||
1628 | { | 1763 | { |
1629 | struct drbd_conf *mdev = thi->mdev; | 1764 | if (!connection->send.seen_any_write_yet) |
1630 | struct drbd_work *w = NULL; | 1765 | return false; |
1631 | LIST_HEAD(work_list); | 1766 | |
1632 | int intr = 0, i; | 1767 | /* Skip barriers that do not contain any writes. |
1768 | * This may happen during AHEAD mode. */ | ||
1769 | if (!connection->send.current_epoch_writes) | ||
1770 | return false; | ||
1771 | |||
1772 | /* ->req_lock is held when requests are queued on | ||
1773 | * connection->sender_work, and put into ->transfer_log. | ||
1774 | * It is also held when ->current_tle_nr is increased. | ||
1775 | * So either there are already new requests queued, | ||
1776 | * and corresponding barriers will be sent there. | ||
1777 | * Or nothing new is queued yet, so the difference will be 1. | ||
1778 | */ | ||
1779 | if (atomic_read(&connection->current_tle_nr) != | ||
1780 | connection->send.current_epoch_nr + 1) | ||
1781 | return false; | ||
1782 | |||
1783 | return true; | ||
1784 | } | ||
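Editor's note: the epoch-number comparison above is easy to misread, so here is a small worked example; the struct and values are made up and only the relationships from the function above matter (current_tle_nr is an atomic_t in the real code).

#include <assert.h>
#include <stdbool.h>

struct toy_conn {
        bool seen_any_write_yet;
        unsigned int send_epoch_nr;      /* tconn->send.current_epoch_nr     */
        unsigned int send_epoch_writes;  /* tconn->send.current_epoch_writes */
        unsigned int current_tle_nr;     /* atomic_t in the real code        */
};

static bool toy_need_to_send_barrier(const struct toy_conn *c)
{
        if (!c->seen_any_write_yet)
                return false;
        if (!c->send_epoch_writes)       /* empty epoch, e.g. AHEAD mode */
                return false;
        /* Exactly one epoch ahead: the resource closed our epoch and nothing
         * newer is queued, so the barrier must be sent here.  Two or more
         * ahead: queued requests will carry their own barriers. */
        return c->current_tle_nr == c->send_epoch_nr + 1;
}

int main(void)
{
        struct toy_conn c = { true, 7, 3, 8 };

        assert(toy_need_to_send_barrier(&c));    /* 8 == 7 + 1: send now      */
        c.current_tle_nr = 7;
        assert(!toy_need_to_send_barrier(&c));   /* epoch 7 not closed yet    */
        c.current_tle_nr = 9;
        assert(!toy_need_to_send_barrier(&c));   /* newer work already queued */
        return 0;
}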
1785 | |||
1786 | bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) | ||
1787 | { | ||
1788 | spin_lock_irq(&queue->q_lock); | ||
1789 | list_splice_init(&queue->q, work_list); | ||
1790 | spin_unlock_irq(&queue->q_lock); | ||
1791 | return !list_empty(work_list); | ||
1792 | } | ||
1633 | 1793 | ||
1634 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | 1794 | bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list) |
1795 | { | ||
1796 | spin_lock_irq(&queue->q_lock); | ||
1797 | if (!list_empty(&queue->q)) | ||
1798 | list_move(queue->q.next, work_list); | ||
1799 | spin_unlock_irq(&queue->q_lock); | ||
1800 | return !list_empty(work_list); | ||
1801 | } | ||
1635 | 1802 | ||
1636 | while (get_t_state(thi) == Running) { | 1803 | void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list) |
1637 | drbd_thread_current_set_cpu(mdev); | 1804 | { |
1805 | DEFINE_WAIT(wait); | ||
1806 | struct net_conf *nc; | ||
1807 | int uncork, cork; | ||
1638 | 1808 | ||
1639 | if (down_trylock(&mdev->data.work.s)) { | 1809 | dequeue_work_item(&connection->sender_work, work_list); |
1640 | mutex_lock(&mdev->data.mutex); | 1810 | if (!list_empty(work_list)) |
1641 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1811 | return; |
1642 | drbd_tcp_uncork(mdev->data.socket); | ||
1643 | mutex_unlock(&mdev->data.mutex); | ||
1644 | 1812 | ||
1645 | intr = down_interruptible(&mdev->data.work.s); | 1813 | /* Still nothing to do? |
1814 | * Maybe we still need to close the current epoch, | ||
1815 | * even if no new requests are queued yet. | ||
1816 | * | ||
1817 | * Also, poke TCP, just in case. | ||
1818 | * Then wait for new work (or signal). */ | ||
1819 | rcu_read_lock(); | ||
1820 | nc = rcu_dereference(connection->net_conf); | ||
1821 | uncork = nc ? nc->tcp_cork : 0; | ||
1822 | rcu_read_unlock(); | ||
1823 | if (uncork) { | ||
1824 | mutex_lock(&connection->data.mutex); | ||
1825 | if (connection->data.socket) | ||
1826 | drbd_tcp_uncork(connection->data.socket); | ||
1827 | mutex_unlock(&connection->data.mutex); | ||
1828 | } | ||
1646 | 1829 | ||
1647 | mutex_lock(&mdev->data.mutex); | 1830 | for (;;) { |
1648 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1831 | int send_barrier; |
1649 | drbd_tcp_cork(mdev->data.socket); | 1832 | prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); |
1650 | mutex_unlock(&mdev->data.mutex); | 1833 | spin_lock_irq(&connection->req_lock); |
1834 | spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1835 | /* dequeue single item only, | ||
1836 | * we still use drbd_queue_work_front() in some places */ | ||
1837 | if (!list_empty(&connection->sender_work.q)) | ||
1838 | list_move(connection->sender_work.q.next, work_list); | ||
1839 | spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1840 | if (!list_empty(work_list) || signal_pending(current)) { | ||
1841 | spin_unlock_irq(&connection->req_lock); | ||
1842 | break; | ||
1651 | } | 1843 | } |
1844 | send_barrier = need_to_send_barrier(connection); | ||
1845 | spin_unlock_irq(&connection->req_lock); | ||
1846 | if (send_barrier) { | ||
1847 | drbd_send_barrier(connection); | ||
1848 | connection->send.current_epoch_nr++; | ||
1849 | } | ||
1850 | schedule(); | ||
1851 | /* may be woken up for things other than new work, too, | ||
1852 | * e.g. if the current epoch got closed. | ||
1853 | * In which case we send the barrier above. */ | ||
1854 | } | ||
1855 | finish_wait(&connection->sender_work.q_wait, &wait); | ||
1856 | |||
1857 | /* someone may have changed the config while we have been waiting above. */ | ||
1858 | rcu_read_lock(); | ||
1859 | nc = rcu_dereference(connection->net_conf); | ||
1860 | cork = nc ? nc->tcp_cork : 0; | ||
1861 | rcu_read_unlock(); | ||
1862 | mutex_lock(&connection->data.mutex); | ||
1863 | if (connection->data.socket) { | ||
1864 | if (cork) | ||
1865 | drbd_tcp_cork(connection->data.socket); | ||
1866 | else if (!uncork) | ||
1867 | drbd_tcp_uncork(connection->data.socket); | ||
1868 | } | ||
1869 | mutex_unlock(&connection->data.mutex); | ||
1870 | } | ||
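Editor's note: wait_for_work() brackets each batch of sender work with cork/uncork calls so that many small DRBD packets leave as few TCP segments as possible. Assuming drbd_tcp_cork()/drbd_tcp_uncork() simply toggle the Linux TCP_CORK socket option (an assumption, not shown in this hunk), the underlying mechanism looks roughly like this userspace sketch:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* While corked, the kernel holds back partial frames and coalesces
 * successive send()s; clearing the option flushes whatever is queued. */
static int set_cork(int fd, int on)
{
        return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

/* Batched-send pattern mirroring the worker above:
 *
 *      set_cork(fd, 1);              // start of a work batch
 *      send(fd, hdr, hdr_len, 0);    // several small protocol packets...
 *      send(fd, payload, len, 0);
 *      set_cork(fd, 0);              // batch done: flush coalesced data
 */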
1652 | 1871 | ||
1653 | if (intr) { | 1872 | int drbd_worker(struct drbd_thread *thi) |
1654 | D_ASSERT(intr == -EINTR); | 1873 | { |
1874 | struct drbd_tconn *tconn = thi->tconn; | ||
1875 | struct drbd_work *w = NULL; | ||
1876 | struct drbd_conf *mdev; | ||
1877 | LIST_HEAD(work_list); | ||
1878 | int vnr; | ||
1879 | |||
1880 | while (get_t_state(thi) == RUNNING) { | ||
1881 | drbd_thread_current_set_cpu(thi); | ||
1882 | |||
1883 | /* as long as we use drbd_queue_work_front(), | ||
1884 | * we may only dequeue single work items here, not batches. */ | ||
1885 | if (list_empty(&work_list)) | ||
1886 | wait_for_work(tconn, &work_list); | ||
1887 | |||
1888 | if (signal_pending(current)) { | ||
1655 | flush_signals(current); | 1889 | flush_signals(current); |
1656 | ERR_IF (get_t_state(thi) == Running) | 1890 | if (get_t_state(thi) == RUNNING) { |
1891 | conn_warn(tconn, "Worker got an unexpected signal\n"); | ||
1657 | continue; | 1892 | continue; |
1893 | } | ||
1658 | break; | 1894 | break; |
1659 | } | 1895 | } |
1660 | 1896 | ||
1661 | if (get_t_state(thi) != Running) | 1897 | if (get_t_state(thi) != RUNNING) |
1662 | break; | 1898 | break; |
1663 | /* With this break, we have done a down() but not consumed | 1899 | |
1664 | the entry from the list. The cleanup code takes care of | 1900 | while (!list_empty(&work_list)) { |
1665 | this... */ | 1901 | w = list_first_entry(&work_list, struct drbd_work, list); |
1666 | 1902 | list_del_init(&w->list); | |
1667 | w = NULL; | 1903 | if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0) |
1668 | spin_lock_irq(&mdev->data.work.q_lock); | 1904 | continue; |
1669 | ERR_IF(list_empty(&mdev->data.work.q)) { | 1905 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1670 | /* something terribly wrong in our logic. | 1906 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
1671 | * we were able to down() the semaphore, | ||
1672 | * but the list is empty... doh. | ||
1673 | * | ||
1674 | * what is the best thing to do now? | ||
1675 | * try again from scratch, restarting the receiver, | ||
1676 | * asender, whatnot? could break even more ugly, | ||
1677 | * e.g. when we are primary, but no good local data. | ||
1678 | * | ||
1679 | * I'll try to get away just starting over this loop. | ||
1680 | */ | ||
1681 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1682 | continue; | ||
1683 | } | ||
1684 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1685 | list_del_init(&w->list); | ||
1686 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1687 | |||
1688 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1689 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1690 | if (mdev->state.conn >= C_CONNECTED) | ||
1691 | drbd_force_state(mdev, | ||
1692 | NS(conn, C_NETWORK_FAILURE)); | ||
1693 | } | 1907 | } |
1694 | } | 1908 | } |
1695 | D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); | ||
1696 | D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); | ||
1697 | |||
1698 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1699 | i = 0; | ||
1700 | while (!list_empty(&mdev->data.work.q)) { | ||
1701 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1702 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1703 | 1909 | ||
1910 | do { | ||
1704 | while (!list_empty(&work_list)) { | 1911 | while (!list_empty(&work_list)) { |
1705 | w = list_entry(work_list.next, struct drbd_work, list); | 1912 | w = list_first_entry(&work_list, struct drbd_work, list); |
1706 | list_del_init(&w->list); | 1913 | list_del_init(&w->list); |
1707 | w->cb(mdev, w, 1); | 1914 | w->cb(w, 1); |
1708 | i++; /* dead debugging code */ | ||
1709 | } | 1915 | } |
1710 | 1916 | dequeue_work_batch(&tconn->sender_work, &work_list); | |
1711 | spin_lock_irq(&mdev->data.work.q_lock); | 1917 | } while (!list_empty(&work_list)); |
1918 | |||
1919 | rcu_read_lock(); | ||
1920 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1921 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1922 | kref_get(&mdev->kref); | ||
1923 | rcu_read_unlock(); | ||
1924 | drbd_mdev_cleanup(mdev); | ||
1925 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1926 | rcu_read_lock(); | ||
1712 | } | 1927 | } |
1713 | sema_init(&mdev->data.work.s, 0); | 1928 | rcu_read_unlock(); |
1714 | /* DANGEROUS race: if someone did queue his work within the spinlock, | ||
1715 | * but up() ed outside the spinlock, we could get an up() on the | ||
1716 | * semaphore without corresponding list entry. | ||
1717 | * So don't do that. | ||
1718 | */ | ||
1719 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1720 | |||
1721 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1722 | /* _drbd_set_state only uses stop_nowait. | ||
1723 | * wait here for the Exiting receiver. */ | ||
1724 | drbd_thread_stop(&mdev->receiver); | ||
1725 | drbd_mdev_cleanup(mdev); | ||
1726 | |||
1727 | dev_info(DEV, "worker terminated\n"); | ||
1728 | |||
1729 | clear_bit(DEVICE_DYING, &mdev->flags); | ||
1730 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
1731 | wake_up(&mdev->state_wait); | ||
1732 | 1929 | ||
1733 | return 0; | 1930 | return 0; |
1734 | } | 1931 | } |
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index 151f1a37478f..328f18e4b4ee 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/ctype.h> | 4 | #include <linux/ctype.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include "drbd_int.h" | ||
6 | 7 | ||
7 | /* see get_sb_bdev and bd_claim */ | 8 | /* see get_sb_bdev and bd_claim */ |
8 | extern char *drbd_sec_holder; | 9 | extern char *drbd_sec_holder; |
@@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | |||
20 | 21 | ||
21 | /* bi_end_io handlers */ | 22 | /* bi_end_io handlers */ |
22 | extern void drbd_md_io_complete(struct bio *bio, int error); | 23 | extern void drbd_md_io_complete(struct bio *bio, int error); |
23 | extern void drbd_endio_sec(struct bio *bio, int error); | 24 | extern void drbd_peer_request_endio(struct bio *bio, int error); |
24 | extern void drbd_endio_pri(struct bio *bio, int error); | 25 | extern void drbd_request_endio(struct bio *bio, int error); |
25 | 26 | ||
26 | /* | 27 | /* |
27 | * used to submit our private bio | 28 | * used to submit our private bio |
@@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev, | |||
45 | generic_make_request(bio); | 46 | generic_make_request(bio); |
46 | } | 47 | } |
47 | 48 | ||
48 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
49 | { | ||
50 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
51 | == CRYPTO_ALG_TYPE_HASH; | ||
52 | } | ||
53 | |||
54 | #ifndef __CHECKER__ | 49 | #ifndef __CHECKER__ |
55 | # undef __cond_lock | 50 | # undef __cond_lock |
56 | # define __cond_lock(x,c) (c) | 51 | # define __cond_lock(x,c) (c) |
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a7d6347aaa79..2ddd64a9ffde 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c | |||
@@ -672,7 +672,6 @@ static void __reschedule_timeout(int drive, const char *message) | |||
672 | 672 | ||
673 | if (drive == current_reqD) | 673 | if (drive == current_reqD) |
674 | drive = current_drive; | 674 | drive = current_drive; |
675 | __cancel_delayed_work(&fd_timeout); | ||
676 | 675 | ||
677 | if (drive < 0 || drive >= N_DRIVE) { | 676 | if (drive < 0 || drive >= N_DRIVE) { |
678 | delay = 20UL * HZ; | 677 | delay = 20UL * HZ; |
@@ -680,7 +679,7 @@ static void __reschedule_timeout(int drive, const char *message) | |||
680 | } else | 679 | } else |
681 | delay = UDP->timeout; | 680 | delay = UDP->timeout; |
682 | 681 | ||
683 | queue_delayed_work(floppy_wq, &fd_timeout, delay); | 682 | mod_delayed_work(floppy_wq, &fd_timeout, delay); |
684 | if (UDP->flags & FD_DEBUG) | 683 | if (UDP->flags & FD_DEBUG) |
685 | DPRINT("reschedule timeout %s\n", message); | 684 | DPRINT("reschedule timeout %s\n", message); |
686 | timeout_message = message; | 685 | timeout_message = message; |
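Editor's note: the switch from __cancel_delayed_work() + queue_delayed_work() to a single mod_delayed_work() relies on the latter's documented behaviour: if the delayed work is already pending, its timer is re-armed to the new delay; otherwise it is queued fresh, and either way at most one timer stays armed. A rough, non-atomic approximation of that net effect (illustration only, not the real workqueue implementation):

#include <linux/workqueue.h>

/* Approximation of mod_delayed_work(): re-arm if pending, queue otherwise.
 * The real function does this atomically; only the net effect is shown,
 * which is why the separate cancel call above became unnecessary. */
static bool approx_mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        bool was_pending = cancel_delayed_work(dwork);

        queue_delayed_work(wq, dwork, delay);
        return was_pending;
}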
@@ -891,7 +890,7 @@ static void unlock_fdc(void) | |||
891 | 890 | ||
892 | raw_cmd = NULL; | 891 | raw_cmd = NULL; |
893 | command_status = FD_COMMAND_NONE; | 892 | command_status = FD_COMMAND_NONE; |
894 | __cancel_delayed_work(&fd_timeout); | 893 | cancel_delayed_work(&fd_timeout); |
895 | do_floppy = NULL; | 894 | do_floppy = NULL; |
896 | cont = NULL; | 895 | cont = NULL; |
897 | clear_bit(0, &fdc_busy); | 896 | clear_bit(0, &fdc_busy); |
@@ -4110,12 +4109,19 @@ static struct platform_driver floppy_driver = { | |||
4110 | 4109 | ||
4111 | static struct platform_device floppy_device[N_DRIVE]; | 4110 | static struct platform_device floppy_device[N_DRIVE]; |
4112 | 4111 | ||
4112 | static bool floppy_available(int drive) | ||
4113 | { | ||
4114 | if (!(allowed_drive_mask & (1 << drive))) | ||
4115 | return false; | ||
4116 | if (fdc_state[FDC(drive)].version == FDC_NONE) | ||
4117 | return false; | ||
4118 | return true; | ||
4119 | } | ||
4120 | |||
4113 | static struct kobject *floppy_find(dev_t dev, int *part, void *data) | 4121 | static struct kobject *floppy_find(dev_t dev, int *part, void *data) |
4114 | { | 4122 | { |
4115 | int drive = (*part & 3) | ((*part & 0x80) >> 5); | 4123 | int drive = (*part & 3) | ((*part & 0x80) >> 5); |
4116 | if (drive >= N_DRIVE || | 4124 | if (drive >= N_DRIVE || !floppy_available(drive)) |
4117 | !(allowed_drive_mask & (1 << drive)) || | ||
4118 | fdc_state[FDC(drive)].version == FDC_NONE) | ||
4119 | return NULL; | 4125 | return NULL; |
4120 | if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) | 4126 | if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) |
4121 | return NULL; | 4127 | return NULL; |
@@ -4125,8 +4131,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) | |||
4125 | 4131 | ||
4126 | static int __init do_floppy_init(void) | 4132 | static int __init do_floppy_init(void) |
4127 | { | 4133 | { |
4128 | int i, unit, drive; | 4134 | int i, unit, drive, err; |
4129 | int err, dr; | ||
4130 | 4135 | ||
4131 | set_debugt(); | 4136 | set_debugt(); |
4132 | interruptjiffies = resultjiffies = jiffies; | 4137 | interruptjiffies = resultjiffies = jiffies; |
@@ -4138,34 +4143,32 @@ static int __init do_floppy_init(void) | |||
4138 | 4143 | ||
4139 | raw_cmd = NULL; | 4144 | raw_cmd = NULL; |
4140 | 4145 | ||
4141 | for (dr = 0; dr < N_DRIVE; dr++) { | 4146 | floppy_wq = alloc_ordered_workqueue("floppy", 0); |
4142 | disks[dr] = alloc_disk(1); | 4147 | if (!floppy_wq) |
4143 | if (!disks[dr]) { | 4148 | return -ENOMEM; |
4144 | err = -ENOMEM; | ||
4145 | goto out_put_disk; | ||
4146 | } | ||
4147 | 4149 | ||
4148 | floppy_wq = alloc_ordered_workqueue("floppy", 0); | 4150 | for (drive = 0; drive < N_DRIVE; drive++) { |
4149 | if (!floppy_wq) { | 4151 | disks[drive] = alloc_disk(1); |
4152 | if (!disks[drive]) { | ||
4150 | err = -ENOMEM; | 4153 | err = -ENOMEM; |
4151 | goto out_put_disk; | 4154 | goto out_put_disk; |
4152 | } | 4155 | } |
4153 | 4156 | ||
4154 | disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); | 4157 | disks[drive]->queue = blk_init_queue(do_fd_request, &floppy_lock); |
4155 | if (!disks[dr]->queue) { | 4158 | if (!disks[drive]->queue) { |
4156 | err = -ENOMEM; | 4159 | err = -ENOMEM; |
4157 | goto out_destroy_workq; | 4160 | goto out_put_disk; |
4158 | } | 4161 | } |
4159 | 4162 | ||
4160 | blk_queue_max_hw_sectors(disks[dr]->queue, 64); | 4163 | blk_queue_max_hw_sectors(disks[drive]->queue, 64); |
4161 | disks[dr]->major = FLOPPY_MAJOR; | 4164 | disks[drive]->major = FLOPPY_MAJOR; |
4162 | disks[dr]->first_minor = TOMINOR(dr); | 4165 | disks[drive]->first_minor = TOMINOR(drive); |
4163 | disks[dr]->fops = &floppy_fops; | 4166 | disks[drive]->fops = &floppy_fops; |
4164 | sprintf(disks[dr]->disk_name, "fd%d", dr); | 4167 | sprintf(disks[drive]->disk_name, "fd%d", drive); |
4165 | 4168 | ||
4166 | init_timer(&motor_off_timer[dr]); | 4169 | init_timer(&motor_off_timer[drive]); |
4167 | motor_off_timer[dr].data = dr; | 4170 | motor_off_timer[drive].data = drive; |
4168 | motor_off_timer[dr].function = motor_off_callback; | 4171 | motor_off_timer[drive].function = motor_off_callback; |
4169 | } | 4172 | } |
4170 | 4173 | ||
4171 | err = register_blkdev(FLOPPY_MAJOR, "fd"); | 4174 | err = register_blkdev(FLOPPY_MAJOR, "fd"); |
@@ -4283,9 +4286,7 @@ static int __init do_floppy_init(void) | |||
4283 | } | 4286 | } |
4284 | 4287 | ||
4285 | for (drive = 0; drive < N_DRIVE; drive++) { | 4288 | for (drive = 0; drive < N_DRIVE; drive++) { |
4286 | if (!(allowed_drive_mask & (1 << drive))) | 4289 | if (!floppy_available(drive)) |
4287 | continue; | ||
4288 | if (fdc_state[FDC(drive)].version == FDC_NONE) | ||
4289 | continue; | 4290 | continue; |
4290 | 4291 | ||
4291 | floppy_device[drive].name = floppy_device_name; | 4292 | floppy_device[drive].name = floppy_device_name; |
@@ -4294,7 +4295,7 @@ static int __init do_floppy_init(void) | |||
4294 | 4295 | ||
4295 | err = platform_device_register(&floppy_device[drive]); | 4296 | err = platform_device_register(&floppy_device[drive]); |
4296 | if (err) | 4297 | if (err) |
4297 | goto out_release_dma; | 4298 | goto out_remove_drives; |
4298 | 4299 | ||
4299 | err = device_create_file(&floppy_device[drive].dev, | 4300 | err = device_create_file(&floppy_device[drive].dev, |
4300 | &dev_attr_cmos); | 4301 | &dev_attr_cmos); |
@@ -4312,28 +4313,33 @@ static int __init do_floppy_init(void) | |||
4312 | 4313 | ||
4313 | out_unreg_platform_dev: | 4314 | out_unreg_platform_dev: |
4314 | platform_device_unregister(&floppy_device[drive]); | 4315 | platform_device_unregister(&floppy_device[drive]); |
4316 | out_remove_drives: | ||
4317 | while (drive--) { | ||
4318 | if (floppy_available(drive)) { | ||
4319 | del_gendisk(disks[drive]); | ||
4320 | device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); | ||
4321 | platform_device_unregister(&floppy_device[drive]); | ||
4322 | } | ||
4323 | } | ||
4315 | out_release_dma: | 4324 | out_release_dma: |
4316 | if (atomic_read(&usage_count)) | 4325 | if (atomic_read(&usage_count)) |
4317 | floppy_release_irq_and_dma(); | 4326 | floppy_release_irq_and_dma(); |
4318 | out_unreg_region: | 4327 | out_unreg_region: |
4319 | blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); | 4328 | blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); |
4320 | platform_driver_unregister(&floppy_driver); | 4329 | platform_driver_unregister(&floppy_driver); |
4321 | out_destroy_workq: | ||
4322 | destroy_workqueue(floppy_wq); | ||
4323 | out_unreg_blkdev: | 4330 | out_unreg_blkdev: |
4324 | unregister_blkdev(FLOPPY_MAJOR, "fd"); | 4331 | unregister_blkdev(FLOPPY_MAJOR, "fd"); |
4325 | out_put_disk: | 4332 | out_put_disk: |
4326 | while (dr--) { | 4333 | destroy_workqueue(floppy_wq); |
4327 | del_timer_sync(&motor_off_timer[dr]); | 4334 | for (drive = 0; drive < N_DRIVE; drive++) { |
4328 | if (disks[dr]->queue) { | 4335 | if (!disks[drive]) |
4329 | blk_cleanup_queue(disks[dr]->queue); | 4336 | break; |
4330 | /* | 4337 | if (disks[drive]->queue) { |
4331 | * put_disk() is not paired with add_disk() and | 4338 | del_timer_sync(&motor_off_timer[drive]); |
4332 | * will put queue reference one extra time. fix it. | 4339 | blk_cleanup_queue(disks[drive]->queue); |
4333 | */ | 4340 | disks[drive]->queue = NULL; |
4334 | disks[dr]->queue = NULL; | ||
4335 | } | 4341 | } |
4336 | put_disk(disks[dr]); | 4342 | put_disk(disks[drive]); |
4337 | } | 4343 | } |
4338 | return err; | 4344 | return err; |
4339 | } | 4345 | } |
@@ -4549,11 +4555,12 @@ static void __exit floppy_module_exit(void) | |||
4549 | unregister_blkdev(FLOPPY_MAJOR, "fd"); | 4555 | unregister_blkdev(FLOPPY_MAJOR, "fd"); |
4550 | platform_driver_unregister(&floppy_driver); | 4556 | platform_driver_unregister(&floppy_driver); |
4551 | 4557 | ||
4558 | destroy_workqueue(floppy_wq); | ||
4559 | |||
4552 | for (drive = 0; drive < N_DRIVE; drive++) { | 4560 | for (drive = 0; drive < N_DRIVE; drive++) { |
4553 | del_timer_sync(&motor_off_timer[drive]); | 4561 | del_timer_sync(&motor_off_timer[drive]); |
4554 | 4562 | ||
4555 | if ((allowed_drive_mask & (1 << drive)) && | 4563 | if (floppy_available(drive)) { |
4556 | fdc_state[FDC(drive)].version != FDC_NONE) { | ||
4557 | del_gendisk(disks[drive]); | 4564 | del_gendisk(disks[drive]); |
4558 | device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); | 4565 | device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); |
4559 | platform_device_unregister(&floppy_device[drive]); | 4566 | platform_device_unregister(&floppy_device[drive]); |
@@ -4573,7 +4580,6 @@ static void __exit floppy_module_exit(void) | |||
4573 | 4580 | ||
4574 | cancel_delayed_work_sync(&fd_timeout); | 4581 | cancel_delayed_work_sync(&fd_timeout); |
4575 | cancel_delayed_work_sync(&fd_timer); | 4582 | cancel_delayed_work_sync(&fd_timer); |
4576 | destroy_workqueue(floppy_wq); | ||
4577 | 4583 | ||
4578 | if (atomic_read(&usage_count)) | 4584 | if (atomic_read(&usage_count)) |
4579 | floppy_release_irq_and_dma(); | 4585 | floppy_release_irq_and_dma(); |
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3bba65510d23..ae1251270624 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c | |||
@@ -463,6 +463,7 @@ out: | |||
463 | */ | 463 | */ |
464 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) | 464 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) |
465 | { | 465 | { |
466 | lo->lo_bio_count++; | ||
466 | bio_list_add(&lo->lo_bio_list, bio); | 467 | bio_list_add(&lo->lo_bio_list, bio); |
467 | } | 468 | } |
468 | 469 | ||
@@ -471,6 +472,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio) | |||
471 | */ | 472 | */ |
472 | static struct bio *loop_get_bio(struct loop_device *lo) | 473 | static struct bio *loop_get_bio(struct loop_device *lo) |
473 | { | 474 | { |
475 | lo->lo_bio_count--; | ||
474 | return bio_list_pop(&lo->lo_bio_list); | 476 | return bio_list_pop(&lo->lo_bio_list); |
475 | } | 477 | } |
476 | 478 | ||
@@ -489,6 +491,10 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) | |||
489 | goto out; | 491 | goto out; |
490 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) | 492 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) |
491 | goto out; | 493 | goto out; |
494 | if (lo->lo_bio_count >= q->nr_congestion_on) | ||
495 | wait_event_lock_irq(lo->lo_req_wait, | ||
496 | lo->lo_bio_count < q->nr_congestion_off, | ||
497 | lo->lo_lock); | ||
492 | loop_add_bio(lo, old_bio); | 498 | loop_add_bio(lo, old_bio); |
493 | wake_up(&lo->lo_event); | 499 | wake_up(&lo->lo_event); |
494 | spin_unlock_irq(&lo->lo_lock); | 500 | spin_unlock_irq(&lo->lo_lock); |
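Editor's note: the two thresholds give the bio queue hysteresis: submitters block once nr_congestion_on bios are in flight and are only released after the worker thread has drained the backlog below nr_congestion_off, which keeps producer and consumer from thrashing around a single limit. A stripped-down sketch of the same pattern, using a hypothetical my_dev structure but the same wait_event_lock_irq()/wake_up() calls as the hunk above:

#include <linux/bio.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct my_dev {                          /* hypothetical, for illustration */
        spinlock_t lock;
        wait_queue_head_t req_wait;
        struct bio_list bios;
        unsigned int bio_count;
        unsigned int high_wm, low_wm;    /* like nr_congestion_on/off */
};

/* Producer: block while the backlog is at or above the high watermark;
 * the condition is re-evaluated with ->lock held, as in loop_make_request(). */
static void my_submit(struct my_dev *dev, struct bio *bio)
{
        spin_lock_irq(&dev->lock);
        if (dev->bio_count >= dev->high_wm)
                wait_event_lock_irq(dev->req_wait,
                                    dev->bio_count < dev->low_wm,
                                    dev->lock);
        dev->bio_count++;
        bio_list_add(&dev->bios, bio);
        spin_unlock_irq(&dev->lock);
}

/* Consumer: wake blocked submitters only once the backlog has fallen
 * below the low watermark, as in loop_thread(). */
static struct bio *my_dequeue(struct my_dev *dev)
{
        struct bio *bio;

        spin_lock_irq(&dev->lock);
        bio = bio_list_pop(&dev->bios);
        if (bio)
                dev->bio_count--;
        if (dev->bio_count < dev->low_wm)
                wake_up(&dev->req_wait);
        spin_unlock_irq(&dev->lock);
        return bio;
}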
@@ -546,6 +552,8 @@ static int loop_thread(void *data) | |||
546 | continue; | 552 | continue; |
547 | spin_lock_irq(&lo->lo_lock); | 553 | spin_lock_irq(&lo->lo_lock); |
548 | bio = loop_get_bio(lo); | 554 | bio = loop_get_bio(lo); |
555 | if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) | ||
556 | wake_up(&lo->lo_req_wait); | ||
549 | spin_unlock_irq(&lo->lo_lock); | 557 | spin_unlock_irq(&lo->lo_lock); |
550 | 558 | ||
551 | BUG_ON(!bio); | 559 | BUG_ON(!bio); |
@@ -873,6 +881,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, | |||
873 | lo->transfer = transfer_none; | 881 | lo->transfer = transfer_none; |
874 | lo->ioctl = NULL; | 882 | lo->ioctl = NULL; |
875 | lo->lo_sizelimit = 0; | 883 | lo->lo_sizelimit = 0; |
884 | lo->lo_bio_count = 0; | ||
876 | lo->old_gfp_mask = mapping_gfp_mask(mapping); | 885 | lo->old_gfp_mask = mapping_gfp_mask(mapping); |
877 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); | 886 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); |
878 | 887 | ||
@@ -976,8 +985,21 @@ static int loop_clr_fd(struct loop_device *lo) | |||
976 | if (lo->lo_state != Lo_bound) | 985 | if (lo->lo_state != Lo_bound) |
977 | return -ENXIO; | 986 | return -ENXIO; |
978 | 987 | ||
979 | if (lo->lo_refcnt > 1) /* we needed one fd for the ioctl */ | 988 | /* |
980 | return -EBUSY; | 989 | * If we've explicitly asked to tear down the loop device, |
990 | * and it has an elevated reference count, set it for auto-teardown when | ||
991 | * the last reference goes away. This stops $!~#$@ udev from | ||
992 | * preventing teardown because it decided that it needs to run blkid on | ||
992 | * the loopback device whenever it appears. xfstests is notorious for | ||
993 | * failing tests because blkid via udev races with a losetup | ||
994 | * <dev>; mkfs; losetup -d <dev> sequence, causing the losetup -d | ||
996 | * command to fail with EBUSY. | ||
997 | */ | ||
998 | if (lo->lo_refcnt > 1) { | ||
999 | lo->lo_flags |= LO_FLAGS_AUTOCLEAR; | ||
1000 | mutex_unlock(&lo->lo_ctl_mutex); | ||
1001 | return 0; | ||
1002 | } | ||
981 | 1003 | ||
982 | if (filp == NULL) | 1004 | if (filp == NULL) |
983 | return -EINVAL; | 1005 | return -EINVAL; |
@@ -1038,10 +1060,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) | |||
1038 | { | 1060 | { |
1039 | int err; | 1061 | int err; |
1040 | struct loop_func_table *xfer; | 1062 | struct loop_func_table *xfer; |
1041 | uid_t uid = current_uid(); | 1063 | kuid_t uid = current_uid(); |
1042 | 1064 | ||
1043 | if (lo->lo_encrypt_key_size && | 1065 | if (lo->lo_encrypt_key_size && |
1044 | lo->lo_key_owner != uid && | 1066 | !uid_eq(lo->lo_key_owner, uid) && |
1045 | !capable(CAP_SYS_ADMIN)) | 1067 | !capable(CAP_SYS_ADMIN)) |
1046 | return -EPERM; | 1068 | return -EPERM; |
1047 | if (lo->lo_state != Lo_bound) | 1069 | if (lo->lo_state != Lo_bound) |
@@ -1660,6 +1682,7 @@ static int loop_add(struct loop_device **l, int i) | |||
1660 | lo->lo_number = i; | 1682 | lo->lo_number = i; |
1661 | lo->lo_thread = NULL; | 1683 | lo->lo_thread = NULL; |
1662 | init_waitqueue_head(&lo->lo_event); | 1684 | init_waitqueue_head(&lo->lo_event); |
1685 | init_waitqueue_head(&lo->lo_req_wait); | ||
1663 | spin_lock_init(&lo->lo_lock); | 1686 | spin_lock_init(&lo->lo_lock); |
1664 | disk->major = LOOP_MAJOR; | 1687 | disk->major = LOOP_MAJOR; |
1665 | disk->first_minor = i << part_shift; | 1688 | disk->first_minor = i << part_shift; |
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f946d31d6917..9694dd99bbbc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
@@ -559,7 +559,7 @@ static void mtip_timeout_function(unsigned long int data) | |||
559 | struct mtip_cmd *command; | 559 | struct mtip_cmd *command; |
560 | int tag, cmdto_cnt = 0; | 560 | int tag, cmdto_cnt = 0; |
561 | unsigned int bit, group; | 561 | unsigned int bit, group; |
562 | unsigned int num_command_slots = port->dd->slot_groups * 32; | 562 | unsigned int num_command_slots; |
563 | unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; | 563 | unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; |
564 | 564 | ||
565 | if (unlikely(!port)) | 565 | if (unlikely(!port)) |
@@ -572,6 +572,7 @@ static void mtip_timeout_function(unsigned long int data) | |||
572 | } | 572 | } |
573 | /* clear the tag accumulator */ | 573 | /* clear the tag accumulator */ |
574 | memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); | 574 | memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); |
575 | num_command_slots = port->dd->slot_groups * 32; | ||
575 | 576 | ||
576 | for (tag = 0; tag < num_command_slots; tag++) { | 577 | for (tag = 0; tag < num_command_slots; tag++) { |
577 | /* | 578 | /* |
@@ -2035,8 +2036,9 @@ static unsigned int implicit_sector(unsigned char command, | |||
2035 | } | 2036 | } |
2036 | return rv; | 2037 | return rv; |
2037 | } | 2038 | } |
2038 | 2039 | static void mtip_set_timeout(struct driver_data *dd, | |
2039 | static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout) | 2040 | struct host_to_dev_fis *fis, |
2041 | unsigned int *timeout, u8 erasemode) | ||
2040 | { | 2042 | { |
2041 | switch (fis->command) { | 2043 | switch (fis->command) { |
2042 | case ATA_CMD_DOWNLOAD_MICRO: | 2044 | case ATA_CMD_DOWNLOAD_MICRO: |
@@ -2044,7 +2046,10 @@ static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout) | |||
2044 | break; | 2046 | break; |
2045 | case ATA_CMD_SEC_ERASE_UNIT: | 2047 | case ATA_CMD_SEC_ERASE_UNIT: |
2046 | case 0xFC: | 2048 | case 0xFC: |
2047 | *timeout = 240000; /* 4 minutes */ | 2049 | if (erasemode) |
2050 | *timeout = ((*(dd->port->identify + 90) * 2) * 60000); | ||
2051 | else | ||
2052 | *timeout = ((*(dd->port->identify + 89) * 2) * 60000); | ||
2048 | break; | 2053 | break; |
2049 | case ATA_CMD_STANDBYNOW1: | 2054 | case ATA_CMD_STANDBYNOW1: |
2050 | *timeout = 120000; /* 2 minutes */ | 2055 | *timeout = 120000; /* 2 minutes */ |
@@ -2087,6 +2092,7 @@ static int exec_drive_taskfile(struct driver_data *dd, | |||
2087 | unsigned int transfer_size; | 2092 | unsigned int transfer_size; |
2088 | unsigned long task_file_data; | 2093 | unsigned long task_file_data; |
2089 | int intotal = outtotal + req_task->out_size; | 2094 | int intotal = outtotal + req_task->out_size; |
2095 | int erasemode = 0; | ||
2090 | 2096 | ||
2091 | taskout = req_task->out_size; | 2097 | taskout = req_task->out_size; |
2092 | taskin = req_task->in_size; | 2098 | taskin = req_task->in_size; |
@@ -2212,7 +2218,13 @@ static int exec_drive_taskfile(struct driver_data *dd, | |||
2212 | fis.lba_hi, | 2218 | fis.lba_hi, |
2213 | fis.device); | 2219 | fis.device); |
2214 | 2220 | ||
2215 | mtip_set_timeout(&fis, &timeout); | 2221 | /* check for erase mode support during secure erase.*/ |
2222 | if ((fis.command == ATA_CMD_SEC_ERASE_UNIT) && outbuf && | ||
2223 | (outbuf[0] & MTIP_SEC_ERASE_MODE)) { | ||
2224 | erasemode = 1; | ||
2225 | } | ||
2226 | |||
2227 | mtip_set_timeout(dd, &fis, &timeout, erasemode); | ||
2216 | 2228 | ||
2217 | /* Determine the correct transfer size.*/ | 2229 | /* Determine the correct transfer size.*/ |
2218 | if (force_single_sector) | 2230 | if (force_single_sector) |
@@ -2428,7 +2440,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, | |||
2428 | * return value | 2440 | * return value |
2429 | * None | 2441 | * None |
2430 | */ | 2442 | */ |
2431 | static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, | 2443 | static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, |
2432 | int nsect, int nents, int tag, void *callback, | 2444 | int nsect, int nents, int tag, void *callback, |
2433 | void *data, int dir) | 2445 | void *data, int dir) |
2434 | { | 2446 | { |
@@ -2436,6 +2448,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, | |||
2436 | struct mtip_port *port = dd->port; | 2448 | struct mtip_port *port = dd->port; |
2437 | struct mtip_cmd *command = &port->commands[tag]; | 2449 | struct mtip_cmd *command = &port->commands[tag]; |
2438 | int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; | 2450 | int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; |
2451 | u64 start = sector; | ||
2439 | 2452 | ||
2440 | /* Map the scatter list for DMA access */ | 2453 | /* Map the scatter list for DMA access */ |
2441 | nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); | 2454 | nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); |
@@ -2454,8 +2467,12 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, | |||
2454 | fis->opts = 1 << 7; | 2467 | fis->opts = 1 << 7; |
2455 | fis->command = | 2468 | fis->command = |
2456 | (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); | 2469 | (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); |
2457 | *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF); | 2470 | fis->lba_low = start & 0xFF; |
2458 | *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF); | 2471 | fis->lba_mid = (start >> 8) & 0xFF; |
2472 | fis->lba_hi = (start >> 16) & 0xFF; | ||
2473 | fis->lba_low_ex = (start >> 24) & 0xFF; | ||
2474 | fis->lba_mid_ex = (start >> 32) & 0xFF; | ||
2475 | fis->lba_hi_ex = (start >> 40) & 0xFF; | ||
2459 | fis->device = 1 << 6; | 2476 | fis->device = 1 << 6; |
2460 | fis->features = nsect & 0xFF; | 2477 | fis->features = nsect & 0xFF; |
2461 | fis->features_ex = (nsect >> 8) & 0xFF; | 2478 | fis->features_ex = (nsect >> 8) & 0xFF; |
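Two calculations in the mtip32xx.c hunks above are easy to check in isolation: the 48-bit LBA is now written into the six FIS byte fields explicitly instead of through unaligned 24-bit casts, and the secure-erase timeout is derived from IDENTIFY words 89/90, which hold the erase time in units of two minutes. The stand-alone sketch below reproduces both calculations; the struct and function names are illustrative, not the driver's, and the sample IDENTIFY values are made up.

/* Stand-alone illustration of the LBA split and erase-timeout math above. */
#include <stdint.h>
#include <stdio.h>

struct fis_lba {			/* just the six LBA byte fields */
	uint8_t lba_low, lba_mid, lba_hi;
	uint8_t lba_low_ex, lba_mid_ex, lba_hi_ex;
};

static void split_lba48(uint64_t sector, struct fis_lba *f)
{
	f->lba_low    = sector & 0xFF;
	f->lba_mid    = (sector >> 8) & 0xFF;
	f->lba_hi     = (sector >> 16) & 0xFF;
	f->lba_low_ex = (sector >> 24) & 0xFF;
	f->lba_mid_ex = (sector >> 32) & 0xFF;
	f->lba_hi_ex  = (sector >> 40) & 0xFF;
}

/* identify[] is the 256-word IDENTIFY DEVICE buffer; word 89 holds the
 * normal erase time and word 90 the enhanced erase time, in 2-minute units. */
static unsigned int erase_timeout_ms(const uint16_t *identify, int enhanced)
{
	uint16_t word = identify[enhanced ? 90 : 89];

	return (word * 2) * 60000;	/* 2-minute units -> milliseconds */
}

int main(void)
{
	struct fis_lba f;
	uint16_t id[256] = { [89] = 30, [90] = 60 };	/* sample values */

	split_lba48(0x0123456789ABULL, &f);
	printf("low=%02x mid=%02x hi=%02x low_ex=%02x mid_ex=%02x hi_ex=%02x\n",
	       f.lba_low, f.lba_mid, f.lba_hi,
	       f.lba_low_ex, f.lba_mid_ex, f.lba_hi_ex);
	printf("normal erase timeout: %u ms\n", erase_timeout_ms(id, 0));
	return 0;
}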
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 18627a1d04c5..b1742640556a 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h | |||
@@ -33,6 +33,9 @@ | |||
33 | /* offset of Device Control register in PCIe extended capabilities space */ | 33 | /* offset of Device Control register in PCIe extended capabilities space */ |
34 | #define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 | 34 | #define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 |
35 | 35 | ||
36 | /* check for erase mode support during secure erase */ | ||
37 | #define MTIP_SEC_ERASE_MODE 0x2 | ||
38 | |||
36 | /* # of times to retry timed out/failed IOs */ | 39 | /* # of times to retry timed out/failed IOs */ |
37 | #define MTIP_MAX_RETRIES 2 | 40 | #define MTIP_MAX_RETRIES 2 |
38 | 41 | ||
@@ -152,14 +155,14 @@ enum { | |||
152 | MTIP_DDF_REBUILD_FAILED_BIT = 8, | 155 | MTIP_DDF_REBUILD_FAILED_BIT = 8, |
153 | }; | 156 | }; |
154 | 157 | ||
155 | __packed struct smart_attr{ | 158 | struct smart_attr { |
156 | u8 attr_id; | 159 | u8 attr_id; |
157 | u16 flags; | 160 | u16 flags; |
158 | u8 cur; | 161 | u8 cur; |
159 | u8 worst; | 162 | u8 worst; |
160 | u32 data; | 163 | u32 data; |
161 | u8 res[3]; | 164 | u8 res[3]; |
162 | }; | 165 | } __packed; |
163 | 166 | ||
164 | /* Register Frame Information Structure (FIS), host to device. */ | 167 | /* Register Frame Information Structure (FIS), host to device. */ |
165 | struct host_to_dev_fis { | 168 | struct host_to_dev_fis { |
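The mtip32xx.h hunk moves __packed from before the struct keyword to after the closing brace. In the prefix position gcc does not attach the attribute to the type (it typically warns that the attribute is ignored), so smart_attr was padded to its natural alignment rather than matching the intended 12-byte layout. The user-space demonstration below shows the size difference; it mirrors the field layout of smart_attr but is not the driver header.

/* Demonstrates why the placement of the packed attribute matters. */
#include <stdint.h>
#include <stdio.h>

struct smart_attr_padded {		/* natural alignment: pads to 16 bytes */
	uint8_t  attr_id;
	uint16_t flags;
	uint8_t  cur;
	uint8_t  worst;
	uint32_t data;
	uint8_t  res[3];
};

struct smart_attr_packed {		/* attribute attached to the type: 12 bytes */
	uint8_t  attr_id;
	uint16_t flags;
	uint8_t  cur;
	uint8_t  worst;
	uint32_t data;
	uint8_t  res[3];
} __attribute__((packed));

int main(void)
{
	printf("padded: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct smart_attr_padded),
	       sizeof(struct smart_attr_packed));
	return 0;
}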
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0c03411c59eb..043ddcca4abf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c | |||
@@ -78,6 +78,8 @@ static const char *ioctl_cmd_to_ascii(int cmd) | |||
78 | case NBD_SET_SOCK: return "set-sock"; | 78 | case NBD_SET_SOCK: return "set-sock"; |
79 | case NBD_SET_BLKSIZE: return "set-blksize"; | 79 | case NBD_SET_BLKSIZE: return "set-blksize"; |
80 | case NBD_SET_SIZE: return "set-size"; | 80 | case NBD_SET_SIZE: return "set-size"; |
81 | case NBD_SET_TIMEOUT: return "set-timeout"; | ||
82 | case NBD_SET_FLAGS: return "set-flags"; | ||
81 | case NBD_DO_IT: return "do-it"; | 83 | case NBD_DO_IT: return "do-it"; |
82 | case NBD_CLEAR_SOCK: return "clear-sock"; | 84 | case NBD_CLEAR_SOCK: return "clear-sock"; |
83 | case NBD_CLEAR_QUE: return "clear-que"; | 85 | case NBD_CLEAR_QUE: return "clear-que"; |
@@ -96,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd) | |||
96 | case NBD_CMD_READ: return "read"; | 98 | case NBD_CMD_READ: return "read"; |
97 | case NBD_CMD_WRITE: return "write"; | 99 | case NBD_CMD_WRITE: return "write"; |
98 | case NBD_CMD_DISC: return "disconnect"; | 100 | case NBD_CMD_DISC: return "disconnect"; |
101 | case NBD_CMD_TRIM: return "trim/discard"; | ||
99 | } | 102 | } |
100 | return "invalid"; | 103 | return "invalid"; |
101 | } | 104 | } |
@@ -467,8 +470,12 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) | |||
467 | 470 | ||
468 | nbd_cmd(req) = NBD_CMD_READ; | 471 | nbd_cmd(req) = NBD_CMD_READ; |
469 | if (rq_data_dir(req) == WRITE) { | 472 | if (rq_data_dir(req) == WRITE) { |
470 | nbd_cmd(req) = NBD_CMD_WRITE; | 473 | if ((req->cmd_flags & REQ_DISCARD)) { |
471 | if (nbd->flags & NBD_READ_ONLY) { | 474 | WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM)); |
475 | nbd_cmd(req) = NBD_CMD_TRIM; | ||
476 | } else | ||
477 | nbd_cmd(req) = NBD_CMD_WRITE; | ||
478 | if (nbd->flags & NBD_FLAG_READ_ONLY) { | ||
472 | dev_err(disk_to_dev(nbd->disk), | 479 | dev_err(disk_to_dev(nbd->disk), |
473 | "Write on read-only\n"); | 480 | "Write on read-only\n"); |
474 | goto error_out; | 481 | goto error_out; |
@@ -651,6 +658,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, | |||
651 | nbd->xmit_timeout = arg * HZ; | 658 | nbd->xmit_timeout = arg * HZ; |
652 | return 0; | 659 | return 0; |
653 | 660 | ||
661 | case NBD_SET_FLAGS: | ||
662 | nbd->flags = arg; | ||
663 | return 0; | ||
664 | |||
654 | case NBD_SET_SIZE_BLOCKS: | 665 | case NBD_SET_SIZE_BLOCKS: |
655 | nbd->bytesize = ((u64) arg) * nbd->blksize; | 666 | nbd->bytesize = ((u64) arg) * nbd->blksize; |
656 | bdev->bd_inode->i_size = nbd->bytesize; | 667 | bdev->bd_inode->i_size = nbd->bytesize; |
@@ -670,6 +681,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, | |||
670 | 681 | ||
671 | mutex_unlock(&nbd->tx_lock); | 682 | mutex_unlock(&nbd->tx_lock); |
672 | 683 | ||
684 | if (nbd->flags & NBD_FLAG_SEND_TRIM) | ||
685 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, | ||
686 | nbd->disk->queue); | ||
687 | |||
673 | thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); | 688 | thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); |
674 | if (IS_ERR(thread)) { | 689 | if (IS_ERR(thread)) { |
675 | mutex_lock(&nbd->tx_lock); | 690 | mutex_lock(&nbd->tx_lock); |
@@ -687,6 +702,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, | |||
687 | nbd->file = NULL; | 702 | nbd->file = NULL; |
688 | nbd_clear_que(nbd); | 703 | nbd_clear_que(nbd); |
689 | dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); | 704 | dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); |
705 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); | ||
690 | if (file) | 706 | if (file) |
691 | fput(file); | 707 | fput(file); |
692 | nbd->bytesize = 0; | 708 | nbd->bytesize = 0; |
@@ -805,6 +821,9 @@ static int __init nbd_init(void) | |||
805 | * Tell the block layer that we are not a rotational device | 821 | * Tell the block layer that we are not a rotational device |
806 | */ | 822 | */ |
807 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); | 823 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); |
824 | disk->queue->limits.discard_granularity = 512; | ||
825 | disk->queue->limits.max_discard_sectors = UINT_MAX; | ||
826 | disk->queue->limits.discard_zeroes_data = 0; | ||
808 | } | 827 | } |
809 | 828 | ||
810 | if (register_blkdev(NBD_MAJOR, "nbd")) { | 829 | if (register_blkdev(NBD_MAJOR, "nbd")) { |
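The nbd.c hunks add the kernel half of discard support: userspace passes the export flags it negotiated with the server down via NBD_SET_FLAGS, and when NBD_FLAG_SEND_TRIM is among them the driver marks the queue discard-capable and maps REQ_DISCARD requests to NBD_CMD_TRIM. A rough user-space fragment of the client side is sketched below; it assumes the flag negotiation happened elsewhere, relies on the ioctl and flag names from <linux/nbd.h>, and keeps error handling minimal.

/* Illustrative client-side fragment, not from nbd-client. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nbd.h>

int set_export_flags(const char *dev, unsigned long server_flags)
{
	int nbd = open(dev, O_RDWR);

	if (nbd < 0) {
		perror("open");
		return -1;
	}

	/* The driver stores these flags; with NBD_FLAG_SEND_TRIM set it
	 * enables QUEUE_FLAG_DISCARD and will emit NBD_CMD_TRIM requests. */
	if (ioctl(nbd, NBD_SET_FLAGS, server_flags) < 0)
		perror("NBD_SET_FLAGS");

	close(nbd);
	return 0;
}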
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index ad16c68c8645..931769e133e5 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c | |||
@@ -1726,7 +1726,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev) | |||
1726 | #define nvme_suspend NULL | 1726 | #define nvme_suspend NULL |
1727 | #define nvme_resume NULL | 1727 | #define nvme_resume NULL |
1728 | 1728 | ||
1729 | static struct pci_error_handlers nvme_err_handler = { | 1729 | static const struct pci_error_handlers nvme_err_handler = { |
1730 | .error_detected = nvme_error_detected, | 1730 | .error_detected = nvme_error_detected, |
1731 | .mmio_enabled = nvme_dump_registers, | 1731 | .mmio_enabled = nvme_dump_registers, |
1732 | .link_reset = nvme_link_reset, | 1732 | .link_reset = nvme_link_reset, |
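The nvme.c change is part of a tree-wide constification: a PCI error-handler table is never modified at runtime, so it can be declared const and placed in read-only data, which works because the matching PCI core change made the driver's .err_handler field a pointer to const. A minimal sketch of the pattern, with placeholder callback names rather than the nvme driver's:

/* Constified error-handler table; callback names are placeholders. */
#include <linux/pci.h>

static pci_ers_result_t demo_error_detected(struct pci_dev *pdev,
					    enum pci_channel_state state)
{
	return PCI_ERS_RESULT_NEED_RESET;
}

static const struct pci_error_handlers demo_err_handler = {
	.error_detected	= demo_error_detected,
};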
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 87311ebac0db..1bbc681688e4 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c | |||
@@ -266,11 +266,10 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) | |||
266 | struct bio *tmp, *new_chain = NULL, *tail = NULL; | 266 | struct bio *tmp, *new_chain = NULL, *tail = NULL; |
267 | 267 | ||
268 | while (old_chain) { | 268 | while (old_chain) { |
269 | tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); | 269 | tmp = bio_clone_kmalloc(old_chain, gfpmask); |
270 | if (!tmp) | 270 | if (!tmp) |
271 | goto err_out; | 271 | goto err_out; |
272 | 272 | ||
273 | __bio_clone(tmp, old_chain); | ||
274 | tmp->bi_bdev = NULL; | 273 | tmp->bi_bdev = NULL; |
275 | gfpmask &= ~__GFP_WAIT; | 274 | gfpmask &= ~__GFP_WAIT; |
276 | tmp->bi_next = NULL; | 275 | tmp->bi_next = NULL; |
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ba66e4445f41..2e7de7a59bfc 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c | |||
@@ -522,38 +522,6 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) | |||
522 | } | 522 | } |
523 | } | 523 | } |
524 | 524 | ||
525 | static void pkt_bio_destructor(struct bio *bio) | ||
526 | { | ||
527 | kfree(bio->bi_io_vec); | ||
528 | kfree(bio); | ||
529 | } | ||
530 | |||
531 | static struct bio *pkt_bio_alloc(int nr_iovecs) | ||
532 | { | ||
533 | struct bio_vec *bvl = NULL; | ||
534 | struct bio *bio; | ||
535 | |||
536 | bio = kmalloc(sizeof(struct bio), GFP_KERNEL); | ||
537 | if (!bio) | ||
538 | goto no_bio; | ||
539 | bio_init(bio); | ||
540 | |||
541 | bvl = kcalloc(nr_iovecs, sizeof(struct bio_vec), GFP_KERNEL); | ||
542 | if (!bvl) | ||
543 | goto no_bvl; | ||
544 | |||
545 | bio->bi_max_vecs = nr_iovecs; | ||
546 | bio->bi_io_vec = bvl; | ||
547 | bio->bi_destructor = pkt_bio_destructor; | ||
548 | |||
549 | return bio; | ||
550 | |||
551 | no_bvl: | ||
552 | kfree(bio); | ||
553 | no_bio: | ||
554 | return NULL; | ||
555 | } | ||
556 | |||
557 | /* | 525 | /* |
558 | * Allocate a packet_data struct | 526 | * Allocate a packet_data struct |
559 | */ | 527 | */ |
@@ -567,7 +535,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames) | |||
567 | goto no_pkt; | 535 | goto no_pkt; |
568 | 536 | ||
569 | pkt->frames = frames; | 537 | pkt->frames = frames; |
570 | pkt->w_bio = pkt_bio_alloc(frames); | 538 | pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames); |
571 | if (!pkt->w_bio) | 539 | if (!pkt->w_bio) |
572 | goto no_bio; | 540 | goto no_bio; |
573 | 541 | ||
@@ -581,9 +549,10 @@ static struct packet_data *pkt_alloc_packet_data(int frames) | |||
581 | bio_list_init(&pkt->orig_bios); | 549 | bio_list_init(&pkt->orig_bios); |
582 | 550 | ||
583 | for (i = 0; i < frames; i++) { | 551 | for (i = 0; i < frames; i++) { |
584 | struct bio *bio = pkt_bio_alloc(1); | 552 | struct bio *bio = bio_kmalloc(GFP_KERNEL, 1); |
585 | if (!bio) | 553 | if (!bio) |
586 | goto no_rd_bio; | 554 | goto no_rd_bio; |
555 | |||
587 | pkt->r_bios[i] = bio; | 556 | pkt->r_bios[i] = bio; |
588 | } | 557 | } |
589 | 558 | ||
@@ -1111,21 +1080,17 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1111 | * Schedule reads for missing parts of the packet. | 1080 | * Schedule reads for missing parts of the packet. |
1112 | */ | 1081 | */ |
1113 | for (f = 0; f < pkt->frames; f++) { | 1082 | for (f = 0; f < pkt->frames; f++) { |
1114 | struct bio_vec *vec; | ||
1115 | |||
1116 | int p, offset; | 1083 | int p, offset; |
1084 | |||
1117 | if (written[f]) | 1085 | if (written[f]) |
1118 | continue; | 1086 | continue; |
1087 | |||
1119 | bio = pkt->r_bios[f]; | 1088 | bio = pkt->r_bios[f]; |
1120 | vec = bio->bi_io_vec; | 1089 | bio_reset(bio); |
1121 | bio_init(bio); | ||
1122 | bio->bi_max_vecs = 1; | ||
1123 | bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); | 1090 | bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); |
1124 | bio->bi_bdev = pd->bdev; | 1091 | bio->bi_bdev = pd->bdev; |
1125 | bio->bi_end_io = pkt_end_io_read; | 1092 | bio->bi_end_io = pkt_end_io_read; |
1126 | bio->bi_private = pkt; | 1093 | bio->bi_private = pkt; |
1127 | bio->bi_io_vec = vec; | ||
1128 | bio->bi_destructor = pkt_bio_destructor; | ||
1129 | 1094 | ||
1130 | p = (f * CD_FRAMESIZE) / PAGE_SIZE; | 1095 | p = (f * CD_FRAMESIZE) / PAGE_SIZE; |
1131 | offset = (f * CD_FRAMESIZE) % PAGE_SIZE; | 1096 | offset = (f * CD_FRAMESIZE) % PAGE_SIZE; |
@@ -1418,14 +1383,11 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) | |||
1418 | } | 1383 | } |
1419 | 1384 | ||
1420 | /* Start the write request */ | 1385 | /* Start the write request */ |
1421 | bio_init(pkt->w_bio); | 1386 | bio_reset(pkt->w_bio); |
1422 | pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE; | ||
1423 | pkt->w_bio->bi_sector = pkt->sector; | 1387 | pkt->w_bio->bi_sector = pkt->sector; |
1424 | pkt->w_bio->bi_bdev = pd->bdev; | 1388 | pkt->w_bio->bi_bdev = pd->bdev; |
1425 | pkt->w_bio->bi_end_io = pkt_end_io_packet_write; | 1389 | pkt->w_bio->bi_end_io = pkt_end_io_packet_write; |
1426 | pkt->w_bio->bi_private = pkt; | 1390 | pkt->w_bio->bi_private = pkt; |
1427 | pkt->w_bio->bi_io_vec = bvec; | ||
1428 | pkt->w_bio->bi_destructor = pkt_bio_destructor; | ||
1429 | for (f = 0; f < pkt->frames; f++) | 1391 | for (f = 0; f < pkt->frames; f++) |
1430 | if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) | 1392 | if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) |
1431 | BUG(); | 1393 | BUG(); |
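The pktcdvd hunks retire the driver's private bio allocator and destructor: write and read bios are now obtained once with bio_kmalloc() and recycled with bio_reset(), which clears per-submission state while preserving the vector storage set up at allocation time, so there is nothing left for a custom destructor to track. A kernel-style sketch of that allocate-once, reset-before-reuse pattern follows; the demo_* names are illustrative, and the field names match the bio API of this kernel generation (bi_sector rather than the later bi_iter).

/* Illustrative sketch of the bio reuse pattern, not driver code. */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/errno.h>

struct demo_packet {
	struct bio *w_bio;		/* long-lived, reused for every write */
};

static int demo_alloc(struct demo_packet *pkt, int frames)
{
	pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames);
	return pkt->w_bio ? 0 : -ENOMEM;
}

static void demo_start_write(struct demo_packet *pkt, struct block_device *bdev,
			     sector_t sector, bio_end_io_t *end_io, void *priv)
{
	/* bio_reset() wipes the previous submission's state but keeps the
	 * bvec array allocated by bio_kmalloc(), so no destructor is needed. */
	bio_reset(pkt->w_bio);
	pkt->w_bio->bi_sector  = sector;
	pkt->w_bio->bi_bdev    = bdev;
	pkt->w_bio->bi_end_io  = end_io;
	pkt->w_bio->bi_private = priv;
}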
diff --git a/drivers/block/ub.c b/drivers/block/ub.c deleted file mode 100644 index fcec0225ac76..000000000000 --- a/drivers/block/ub.c +++ /dev/null | |||
@@ -1,2474 +0,0 @@ | |||
1 | /* | ||
2 | * The low performance USB storage driver (ub). | ||
3 | * | ||
4 | * Copyright (c) 1999, 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net) | ||
5 | * Copyright (C) 2004 Pete Zaitcev (zaitcev@yahoo.com) | ||
6 | * | ||
7 | * This work is a part of Linux kernel, is derived from it, | ||
8 | * and is not licensed separately. See file COPYING for details. | ||
9 | * | ||
10 | * TODO (sorted by decreasing priority) | ||
11 | * -- Return sense now that rq allows it (we always auto-sense anyway). | ||
12 | * -- set readonly flag for CDs, set removable flag for CF readers | ||
13 | * -- do inquiry and verify we got a disk and not a tape (for LUN mismatch) | ||
14 | * -- verify the 13 conditions and do bulk resets | ||
15 | * -- highmem | ||
16 | * -- move top_sense and work_bcs into separate allocations (if they survive) | ||
17 | * for cache purists and esoteric architectures. | ||
18 | * -- Allocate structure for LUN 0 before the first ub_sync_tur, avoid NULL. ? | ||
19 | * -- prune comments, they are too voluminous | ||
20 | * -- Resolve XXX's | ||
21 | * -- CLEAR, CLR2STS, CLRRS seem to be ripe for refactoring. | ||
22 | */ | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/usb.h> | ||
26 | #include <linux/usb_usual.h> | ||
27 | #include <linux/blkdev.h> | ||
28 | #include <linux/timer.h> | ||
29 | #include <linux/scatterlist.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/mutex.h> | ||
32 | #include <scsi/scsi.h> | ||
33 | |||
34 | #define DRV_NAME "ub" | ||
35 | |||
36 | #define UB_MAJOR 180 | ||
37 | |||
38 | /* | ||
39 | * The command state machine is the key model for understanding of this driver. | ||
40 | * | ||
41 | * The general rule is that all transitions are done towards the bottom | ||
42 | * of the diagram, thus preventing any loops. | ||
43 | * | ||
44 | * An exception to that is how the STAT state is handled. A counter allows it | ||
45 | * to be re-entered along the path marked with [C]. | ||
46 | * | ||
47 | * +--------+ | ||
48 | * ! INIT ! | ||
49 | * +--------+ | ||
50 | * ! | ||
51 | * ub_scsi_cmd_start fails ->--------------------------------------\ | ||
52 | * ! ! | ||
53 | * V ! | ||
54 | * +--------+ ! | ||
55 | * ! CMD ! ! | ||
56 | * +--------+ ! | ||
57 | * ! +--------+ ! | ||
58 | * was -EPIPE -->-------------------------------->! CLEAR ! ! | ||
59 | * ! +--------+ ! | ||
60 | * ! ! ! | ||
61 | * was error -->------------------------------------- ! --------->\ | ||
62 | * ! ! ! | ||
63 | * /--<-- cmd->dir == NONE ? ! ! | ||
64 | * ! ! ! ! | ||
65 | * ! V ! ! | ||
66 | * ! +--------+ ! ! | ||
67 | * ! ! DATA ! ! ! | ||
68 | * ! +--------+ ! ! | ||
69 | * ! ! +---------+ ! ! | ||
70 | * ! was -EPIPE -->--------------->! CLR2STS ! ! ! | ||
71 | * ! ! +---------+ ! ! | ||
72 | * ! ! ! ! ! | ||
73 | * ! ! was error -->---- ! --------->\ | ||
74 | * ! was error -->--------------------- ! ------------- ! --------->\ | ||
75 | * ! ! ! ! ! | ||
76 | * ! V ! ! ! | ||
77 | * \--->+--------+ ! ! ! | ||
78 | * ! STAT !<--------------------------/ ! ! | ||
79 | * /--->+--------+ ! ! | ||
80 | * ! ! ! ! | ||
81 | * [C] was -EPIPE -->-----------\ ! ! | ||
82 | * ! ! ! ! ! | ||
83 | * +<---- len == 0 ! ! ! | ||
84 | * ! ! ! ! ! | ||
85 | * ! was error -->--------------------------------------!---------->\ | ||
86 | * ! ! ! ! ! | ||
87 | * +<---- bad CSW ! ! ! | ||
88 | * +<---- bad tag ! ! ! | ||
89 | * ! ! V ! ! | ||
90 | * ! ! +--------+ ! ! | ||
91 | * ! ! ! CLRRS ! ! ! | ||
92 | * ! ! +--------+ ! ! | ||
93 | * ! ! ! ! ! | ||
94 | * \------- ! --------------------[C]--------\ ! ! | ||
95 | * ! ! ! ! | ||
96 | * cmd->error---\ +--------+ ! ! | ||
97 | * ! +--------------->! SENSE !<----------/ ! | ||
98 | * STAT_FAIL----/ +--------+ ! | ||
99 | * ! ! V | ||
100 | * ! V +--------+ | ||
101 | * \--------------------------------\--------------------->! DONE ! | ||
102 | * +--------+ | ||
103 | */ | ||
104 | |||
105 | /* | ||
106 | * This many LUNs per USB device. | ||
107 | * Every one of them takes a host, see UB_MAX_HOSTS. | ||
108 | */ | ||
109 | #define UB_MAX_LUNS 9 | ||
110 | |||
111 | /* | ||
112 | */ | ||
113 | |||
114 | #define UB_PARTS_PER_LUN 8 | ||
115 | |||
116 | #define UB_MAX_CDB_SIZE 16 /* Corresponds to Bulk */ | ||
117 | |||
118 | #define UB_SENSE_SIZE 18 | ||
119 | |||
120 | /* | ||
121 | */ | ||
122 | struct ub_dev; | ||
123 | |||
124 | #define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */ | ||
125 | #define UB_MAX_SECTORS 64 | ||
126 | |||
127 | /* | ||
128 | * A second is more than enough for a 32K transfer (UB_MAX_SECTORS) | ||
129 | * even if a webcam hogs the bus, but some devices need time to spin up. | ||
130 | */ | ||
131 | #define UB_URB_TIMEOUT (HZ*2) | ||
132 | #define UB_DATA_TIMEOUT (HZ*5) /* ZIP does spin-ups in the data phase */ | ||
133 | #define UB_STAT_TIMEOUT (HZ*5) /* Same spinups and eject for a dataless cmd. */ | ||
134 | #define UB_CTRL_TIMEOUT (HZ/2) /* 500ms ought to be enough to clear a stall */ | ||
135 | |||
136 | /* | ||
137 | * An instance of a SCSI command in transit. | ||
138 | */ | ||
139 | #define UB_DIR_NONE 0 | ||
140 | #define UB_DIR_READ 1 | ||
141 | #define UB_DIR_ILLEGAL2 2 | ||
142 | #define UB_DIR_WRITE 3 | ||
143 | |||
144 | #define UB_DIR_CHAR(c) (((c)==UB_DIR_WRITE)? 'w': \ | ||
145 | (((c)==UB_DIR_READ)? 'r': 'n')) | ||
146 | |||
147 | enum ub_scsi_cmd_state { | ||
148 | UB_CMDST_INIT, /* Initial state */ | ||
149 | UB_CMDST_CMD, /* Command submitted */ | ||
150 | UB_CMDST_DATA, /* Data phase */ | ||
151 | UB_CMDST_CLR2STS, /* Clearing before requesting status */ | ||
152 | UB_CMDST_STAT, /* Status phase */ | ||
153 | UB_CMDST_CLEAR, /* Clearing a stall (halt, actually) */ | ||
154 | UB_CMDST_CLRRS, /* Clearing before retrying status */ | ||
155 | UB_CMDST_SENSE, /* Sending Request Sense */ | ||
156 | UB_CMDST_DONE /* Final state */ | ||
157 | }; | ||
158 | |||
159 | struct ub_scsi_cmd { | ||
160 | unsigned char cdb[UB_MAX_CDB_SIZE]; | ||
161 | unsigned char cdb_len; | ||
162 | |||
163 | unsigned char dir; /* 0 - none, 1 - read, 3 - write. */ | ||
164 | enum ub_scsi_cmd_state state; | ||
165 | unsigned int tag; | ||
166 | struct ub_scsi_cmd *next; | ||
167 | |||
168 | int error; /* Return code - valid upon done */ | ||
169 | unsigned int act_len; /* Return size */ | ||
170 | unsigned char key, asc, ascq; /* May be valid if error==-EIO */ | ||
171 | |||
172 | int stat_count; /* Retries getting status. */ | ||
173 | unsigned int timeo; /* jiffies until rq->timeout changes */ | ||
174 | |||
175 | unsigned int len; /* Requested length */ | ||
176 | unsigned int current_sg; | ||
177 | unsigned int nsg; /* sgv[nsg] */ | ||
178 | struct scatterlist sgv[UB_MAX_REQ_SG]; | ||
179 | |||
180 | struct ub_lun *lun; | ||
181 | void (*done)(struct ub_dev *, struct ub_scsi_cmd *); | ||
182 | void *back; | ||
183 | }; | ||
184 | |||
185 | struct ub_request { | ||
186 | struct request *rq; | ||
187 | unsigned int current_try; | ||
188 | unsigned int nsg; /* sgv[nsg] */ | ||
189 | struct scatterlist sgv[UB_MAX_REQ_SG]; | ||
190 | }; | ||
191 | |||
192 | /* | ||
193 | */ | ||
194 | struct ub_capacity { | ||
195 | unsigned long nsec; /* Linux size - 512 byte sectors */ | ||
196 | unsigned int bsize; /* Linux hardsect_size */ | ||
197 | unsigned int bshift; /* Shift between 512 and hard sects */ | ||
198 | }; | ||
199 | |||
200 | /* | ||
201 | * This is a direct take-off from linux/include/completion.h | ||
202 | * The difference is that I do not wait on this thing, just poll. | ||
203 | * When I want to wait (ub_probe), I just use the stock completion. | ||
204 | * | ||
205 | * Note that INIT_COMPLETION takes no lock. It is correct. But why | ||
206 | * in the bloody hell that thing takes struct instead of pointer to struct | ||
207 | * is quite beyond me. I just copied it from the stock completion. | ||
208 | */ | ||
209 | struct ub_completion { | ||
210 | unsigned int done; | ||
211 | spinlock_t lock; | ||
212 | }; | ||
213 | |||
214 | static DEFINE_MUTEX(ub_mutex); | ||
215 | static inline void ub_init_completion(struct ub_completion *x) | ||
216 | { | ||
217 | x->done = 0; | ||
218 | spin_lock_init(&x->lock); | ||
219 | } | ||
220 | |||
221 | #define UB_INIT_COMPLETION(x) ((x).done = 0) | ||
222 | |||
223 | static void ub_complete(struct ub_completion *x) | ||
224 | { | ||
225 | unsigned long flags; | ||
226 | |||
227 | spin_lock_irqsave(&x->lock, flags); | ||
228 | x->done++; | ||
229 | spin_unlock_irqrestore(&x->lock, flags); | ||
230 | } | ||
231 | |||
232 | static int ub_is_completed(struct ub_completion *x) | ||
233 | { | ||
234 | unsigned long flags; | ||
235 | int ret; | ||
236 | |||
237 | spin_lock_irqsave(&x->lock, flags); | ||
238 | ret = x->done; | ||
239 | spin_unlock_irqrestore(&x->lock, flags); | ||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | */ | ||
245 | struct ub_scsi_cmd_queue { | ||
246 | int qlen, qmax; | ||
247 | struct ub_scsi_cmd *head, *tail; | ||
248 | }; | ||
249 | |||
250 | /* | ||
251 | * The block device instance (one per LUN). | ||
252 | */ | ||
253 | struct ub_lun { | ||
254 | struct ub_dev *udev; | ||
255 | struct list_head link; | ||
256 | struct gendisk *disk; | ||
257 | int id; /* Host index */ | ||
258 | int num; /* LUN number */ | ||
259 | char name[16]; | ||
260 | |||
261 | int changed; /* Media was changed */ | ||
262 | int removable; | ||
263 | int readonly; | ||
264 | |||
265 | struct ub_request urq; | ||
266 | |||
267 | /* Use Ingo's mempool if or when we have more than one command. */ | ||
268 | /* | ||
269 | * Currently we never need more than one command for the whole device. | ||
270 | * However, giving every LUN a command is a cheap and automatic way | ||
271 | * to enforce fairness between them. | ||
272 | */ | ||
273 | int cmda[1]; | ||
274 | struct ub_scsi_cmd cmdv[1]; | ||
275 | |||
276 | struct ub_capacity capacity; | ||
277 | }; | ||
278 | |||
279 | /* | ||
280 | * The USB device instance. | ||
281 | */ | ||
282 | struct ub_dev { | ||
283 | spinlock_t *lock; | ||
284 | atomic_t poison; /* The USB device is disconnected */ | ||
285 | int openc; /* protected by ub_lock! */ | ||
286 | /* kref is too implicit for our taste */ | ||
287 | int reset; /* Reset is running */ | ||
288 | int bad_resid; | ||
289 | unsigned int tagcnt; | ||
290 | char name[12]; | ||
291 | struct usb_device *dev; | ||
292 | struct usb_interface *intf; | ||
293 | |||
294 | struct list_head luns; | ||
295 | |||
296 | unsigned int send_bulk_pipe; /* cached pipe values */ | ||
297 | unsigned int recv_bulk_pipe; | ||
298 | unsigned int send_ctrl_pipe; | ||
299 | unsigned int recv_ctrl_pipe; | ||
300 | |||
301 | struct tasklet_struct tasklet; | ||
302 | |||
303 | struct ub_scsi_cmd_queue cmd_queue; | ||
304 | struct ub_scsi_cmd top_rqs_cmd; /* REQUEST SENSE */ | ||
305 | unsigned char top_sense[UB_SENSE_SIZE]; | ||
306 | |||
307 | struct ub_completion work_done; | ||
308 | struct urb work_urb; | ||
309 | struct timer_list work_timer; | ||
310 | int last_pipe; /* What might need clearing */ | ||
311 | __le32 signature; /* Learned signature */ | ||
312 | struct bulk_cb_wrap work_bcb; | ||
313 | struct bulk_cs_wrap work_bcs; | ||
314 | struct usb_ctrlrequest work_cr; | ||
315 | |||
316 | struct work_struct reset_work; | ||
317 | wait_queue_head_t reset_wait; | ||
318 | }; | ||
319 | |||
320 | /* | ||
321 | */ | ||
322 | static void ub_cleanup(struct ub_dev *sc); | ||
323 | static int ub_request_fn_1(struct ub_lun *lun, struct request *rq); | ||
324 | static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, | ||
325 | struct ub_scsi_cmd *cmd, struct ub_request *urq); | ||
326 | static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, | ||
327 | struct ub_scsi_cmd *cmd, struct ub_request *urq); | ||
328 | static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
329 | static void ub_end_rq(struct request *rq, unsigned int status); | ||
330 | static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun, | ||
331 | struct ub_request *urq, struct ub_scsi_cmd *cmd); | ||
332 | static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
333 | static void ub_urb_complete(struct urb *urb); | ||
334 | static void ub_scsi_action(unsigned long _dev); | ||
335 | static void ub_scsi_dispatch(struct ub_dev *sc); | ||
336 | static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
337 | static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
338 | static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc); | ||
339 | static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
340 | static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
341 | static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
342 | static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd); | ||
343 | static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd, | ||
344 | int stalled_pipe); | ||
345 | static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd); | ||
346 | static void ub_reset_enter(struct ub_dev *sc, int try); | ||
347 | static void ub_reset_task(struct work_struct *work); | ||
348 | static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun); | ||
349 | static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun, | ||
350 | struct ub_capacity *ret); | ||
351 | static int ub_sync_reset(struct ub_dev *sc); | ||
352 | static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe); | ||
353 | static int ub_probe_lun(struct ub_dev *sc, int lnum); | ||
354 | |||
355 | /* | ||
356 | */ | ||
357 | #ifdef CONFIG_USB_LIBUSUAL | ||
358 | |||
359 | #define ub_usb_ids usb_storage_usb_ids | ||
360 | #else | ||
361 | |||
362 | static const struct usb_device_id ub_usb_ids[] = { | ||
363 | { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) }, | ||
364 | { } | ||
365 | }; | ||
366 | |||
367 | MODULE_DEVICE_TABLE(usb, ub_usb_ids); | ||
368 | #endif /* CONFIG_USB_LIBUSUAL */ | ||
369 | |||
370 | /* | ||
371 | * Find me a way to identify "next free minor" for add_disk(), | ||
372 | * and the array disappears the next day. However, the number of | ||
373 | * hosts has something to do with the naming and /proc/partitions. | ||
374 | * This has to be thought out in detail before changing. | ||
375 | * If UB_MAX_HOSTS were 1000, we'd use a bitmap. Or a better data structure. | ||
376 | */ | ||
377 | #define UB_MAX_HOSTS 26 | ||
378 | static char ub_hostv[UB_MAX_HOSTS]; | ||
379 | |||
380 | #define UB_QLOCK_NUM 5 | ||
381 | static spinlock_t ub_qlockv[UB_QLOCK_NUM]; | ||
382 | static int ub_qlock_next = 0; | ||
383 | |||
384 | static DEFINE_SPINLOCK(ub_lock); /* Locks globals and ->openc */ | ||
385 | |||
386 | /* | ||
387 | * The id allocator. | ||
388 | * | ||
389 | * This also stores the host for indexing by minor, which is somewhat dirty. | ||
390 | */ | ||
391 | static int ub_id_get(void) | ||
392 | { | ||
393 | unsigned long flags; | ||
394 | int i; | ||
395 | |||
396 | spin_lock_irqsave(&ub_lock, flags); | ||
397 | for (i = 0; i < UB_MAX_HOSTS; i++) { | ||
398 | if (ub_hostv[i] == 0) { | ||
399 | ub_hostv[i] = 1; | ||
400 | spin_unlock_irqrestore(&ub_lock, flags); | ||
401 | return i; | ||
402 | } | ||
403 | } | ||
404 | spin_unlock_irqrestore(&ub_lock, flags); | ||
405 | return -1; | ||
406 | } | ||
407 | |||
408 | static void ub_id_put(int id) | ||
409 | { | ||
410 | unsigned long flags; | ||
411 | |||
412 | if (id < 0 || id >= UB_MAX_HOSTS) { | ||
413 | printk(KERN_ERR DRV_NAME ": bad host ID %d\n", id); | ||
414 | return; | ||
415 | } | ||
416 | |||
417 | spin_lock_irqsave(&ub_lock, flags); | ||
418 | if (ub_hostv[id] == 0) { | ||
419 | spin_unlock_irqrestore(&ub_lock, flags); | ||
420 | printk(KERN_ERR DRV_NAME ": freeing free host ID %d\n", id); | ||
421 | return; | ||
422 | } | ||
423 | ub_hostv[id] = 0; | ||
424 | spin_unlock_irqrestore(&ub_lock, flags); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * This is necessitated by the fact that blk_cleanup_queue does not | ||
429 | * necessarily destroy the queue. Instead, it may merely decrease q->refcnt. | ||
430 | * Since our blk_init_queue() passes a spinlock common with ub_dev, | ||
431 | * we have lifetime issues when ub_cleanup frees ub_dev. | ||
432 | */ | ||
433 | static spinlock_t *ub_next_lock(void) | ||
434 | { | ||
435 | unsigned long flags; | ||
436 | spinlock_t *ret; | ||
437 | |||
438 | spin_lock_irqsave(&ub_lock, flags); | ||
439 | ret = &ub_qlockv[ub_qlock_next]; | ||
440 | ub_qlock_next = (ub_qlock_next + 1) % UB_QLOCK_NUM; | ||
441 | spin_unlock_irqrestore(&ub_lock, flags); | ||
442 | return ret; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * Downcount for deallocation. This rides on two assumptions: | ||
447 | * - once something is poisoned, its refcount cannot grow | ||
448 | * - opens cannot happen at this time (del_gendisk was done) | ||
449 | * If the above is true, we can drop the lock, which we need for | ||
450 | * blk_cleanup_queue(): the silly thing may attempt to sleep. | ||
451 | * [Actually, it never needs to sleep for us, but it calls might_sleep()] | ||
452 | */ | ||
453 | static void ub_put(struct ub_dev *sc) | ||
454 | { | ||
455 | unsigned long flags; | ||
456 | |||
457 | spin_lock_irqsave(&ub_lock, flags); | ||
458 | --sc->openc; | ||
459 | if (sc->openc == 0 && atomic_read(&sc->poison)) { | ||
460 | spin_unlock_irqrestore(&ub_lock, flags); | ||
461 | ub_cleanup(sc); | ||
462 | } else { | ||
463 | spin_unlock_irqrestore(&ub_lock, flags); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * Final cleanup and deallocation. | ||
469 | */ | ||
470 | static void ub_cleanup(struct ub_dev *sc) | ||
471 | { | ||
472 | struct list_head *p; | ||
473 | struct ub_lun *lun; | ||
474 | struct request_queue *q; | ||
475 | |||
476 | while (!list_empty(&sc->luns)) { | ||
477 | p = sc->luns.next; | ||
478 | lun = list_entry(p, struct ub_lun, link); | ||
479 | list_del(p); | ||
480 | |||
481 | /* I don't think queue can be NULL. But... Stolen from sx8.c */ | ||
482 | if ((q = lun->disk->queue) != NULL) | ||
483 | blk_cleanup_queue(q); | ||
484 | /* | ||
485 | * If we zero disk->private_data BEFORE put_disk, we have | ||
486 | * to check for NULL all over the place in open, release, | ||
487 | * check_media and revalidate, because the block level | ||
488 | * semaphore is well inside the put_disk. | ||
489 | * But we cannot zero after the call, because *disk is gone. | ||
490 | * The sd.c is blatantly racy in this area. | ||
491 | */ | ||
492 | /* disk->private_data = NULL; */ | ||
493 | put_disk(lun->disk); | ||
494 | lun->disk = NULL; | ||
495 | |||
496 | ub_id_put(lun->id); | ||
497 | kfree(lun); | ||
498 | } | ||
499 | |||
500 | usb_set_intfdata(sc->intf, NULL); | ||
501 | usb_put_intf(sc->intf); | ||
502 | usb_put_dev(sc->dev); | ||
503 | kfree(sc); | ||
504 | } | ||
505 | |||
506 | /* | ||
507 | * The "command allocator". | ||
508 | */ | ||
509 | static struct ub_scsi_cmd *ub_get_cmd(struct ub_lun *lun) | ||
510 | { | ||
511 | struct ub_scsi_cmd *ret; | ||
512 | |||
513 | if (lun->cmda[0]) | ||
514 | return NULL; | ||
515 | ret = &lun->cmdv[0]; | ||
516 | lun->cmda[0] = 1; | ||
517 | return ret; | ||
518 | } | ||
519 | |||
520 | static void ub_put_cmd(struct ub_lun *lun, struct ub_scsi_cmd *cmd) | ||
521 | { | ||
522 | if (cmd != &lun->cmdv[0]) { | ||
523 | printk(KERN_WARNING "%s: releasing a foreign cmd %p\n", | ||
524 | lun->name, cmd); | ||
525 | return; | ||
526 | } | ||
527 | if (!lun->cmda[0]) { | ||
528 | printk(KERN_WARNING "%s: releasing a free cmd\n", lun->name); | ||
529 | return; | ||
530 | } | ||
531 | lun->cmda[0] = 0; | ||
532 | } | ||
533 | |||
534 | /* | ||
535 | * The command queue. | ||
536 | */ | ||
537 | static void ub_cmdq_add(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
538 | { | ||
539 | struct ub_scsi_cmd_queue *t = &sc->cmd_queue; | ||
540 | |||
541 | if (t->qlen++ == 0) { | ||
542 | t->head = cmd; | ||
543 | t->tail = cmd; | ||
544 | } else { | ||
545 | t->tail->next = cmd; | ||
546 | t->tail = cmd; | ||
547 | } | ||
548 | |||
549 | if (t->qlen > t->qmax) | ||
550 | t->qmax = t->qlen; | ||
551 | } | ||
552 | |||
553 | static void ub_cmdq_insert(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
554 | { | ||
555 | struct ub_scsi_cmd_queue *t = &sc->cmd_queue; | ||
556 | |||
557 | if (t->qlen++ == 0) { | ||
558 | t->head = cmd; | ||
559 | t->tail = cmd; | ||
560 | } else { | ||
561 | cmd->next = t->head; | ||
562 | t->head = cmd; | ||
563 | } | ||
564 | |||
565 | if (t->qlen > t->qmax) | ||
566 | t->qmax = t->qlen; | ||
567 | } | ||
568 | |||
569 | static struct ub_scsi_cmd *ub_cmdq_pop(struct ub_dev *sc) | ||
570 | { | ||
571 | struct ub_scsi_cmd_queue *t = &sc->cmd_queue; | ||
572 | struct ub_scsi_cmd *cmd; | ||
573 | |||
574 | if (t->qlen == 0) | ||
575 | return NULL; | ||
576 | if (--t->qlen == 0) | ||
577 | t->tail = NULL; | ||
578 | cmd = t->head; | ||
579 | t->head = cmd->next; | ||
580 | cmd->next = NULL; | ||
581 | return cmd; | ||
582 | } | ||
583 | |||
584 | #define ub_cmdq_peek(sc) ((sc)->cmd_queue.head) | ||
585 | |||
586 | /* | ||
587 | * The request function is our main entry point | ||
588 | */ | ||
589 | |||
590 | static void ub_request_fn(struct request_queue *q) | ||
591 | { | ||
592 | struct ub_lun *lun = q->queuedata; | ||
593 | struct request *rq; | ||
594 | |||
595 | while ((rq = blk_peek_request(q)) != NULL) { | ||
596 | if (ub_request_fn_1(lun, rq) != 0) { | ||
597 | blk_stop_queue(q); | ||
598 | break; | ||
599 | } | ||
600 | } | ||
601 | } | ||
602 | |||
603 | static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) | ||
604 | { | ||
605 | struct ub_dev *sc = lun->udev; | ||
606 | struct ub_scsi_cmd *cmd; | ||
607 | struct ub_request *urq; | ||
608 | int n_elem; | ||
609 | |||
610 | if (atomic_read(&sc->poison)) { | ||
611 | blk_start_request(rq); | ||
612 | ub_end_rq(rq, DID_NO_CONNECT << 16); | ||
613 | return 0; | ||
614 | } | ||
615 | |||
616 | if (lun->changed && rq->cmd_type != REQ_TYPE_BLOCK_PC) { | ||
617 | blk_start_request(rq); | ||
618 | ub_end_rq(rq, SAM_STAT_CHECK_CONDITION); | ||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | if (lun->urq.rq != NULL) | ||
623 | return -1; | ||
624 | if ((cmd = ub_get_cmd(lun)) == NULL) | ||
625 | return -1; | ||
626 | memset(cmd, 0, sizeof(struct ub_scsi_cmd)); | ||
627 | |||
628 | blk_start_request(rq); | ||
629 | |||
630 | urq = &lun->urq; | ||
631 | memset(urq, 0, sizeof(struct ub_request)); | ||
632 | urq->rq = rq; | ||
633 | |||
634 | /* | ||
635 | * get scatterlist from block layer | ||
636 | */ | ||
637 | sg_init_table(&urq->sgv[0], UB_MAX_REQ_SG); | ||
638 | n_elem = blk_rq_map_sg(lun->disk->queue, rq, &urq->sgv[0]); | ||
639 | if (n_elem < 0) { | ||
640 | /* Impossible, because blk_rq_map_sg should not hit ENOMEM. */ | ||
641 | printk(KERN_INFO "%s: failed request map (%d)\n", | ||
642 | lun->name, n_elem); | ||
643 | goto drop; | ||
644 | } | ||
645 | if (n_elem > UB_MAX_REQ_SG) { /* Paranoia */ | ||
646 | printk(KERN_WARNING "%s: request with %d segments\n", | ||
647 | lun->name, n_elem); | ||
648 | goto drop; | ||
649 | } | ||
650 | urq->nsg = n_elem; | ||
651 | |||
652 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | ||
653 | ub_cmd_build_packet(sc, lun, cmd, urq); | ||
654 | } else { | ||
655 | ub_cmd_build_block(sc, lun, cmd, urq); | ||
656 | } | ||
657 | cmd->state = UB_CMDST_INIT; | ||
658 | cmd->lun = lun; | ||
659 | cmd->done = ub_rw_cmd_done; | ||
660 | cmd->back = urq; | ||
661 | |||
662 | cmd->tag = sc->tagcnt++; | ||
663 | if (ub_submit_scsi(sc, cmd) != 0) | ||
664 | goto drop; | ||
665 | |||
666 | return 0; | ||
667 | |||
668 | drop: | ||
669 | ub_put_cmd(lun, cmd); | ||
670 | ub_end_rq(rq, DID_ERROR << 16); | ||
671 | return 0; | ||
672 | } | ||
673 | |||
674 | static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, | ||
675 | struct ub_scsi_cmd *cmd, struct ub_request *urq) | ||
676 | { | ||
677 | struct request *rq = urq->rq; | ||
678 | unsigned int block, nblks; | ||
679 | |||
680 | if (rq_data_dir(rq) == WRITE) | ||
681 | cmd->dir = UB_DIR_WRITE; | ||
682 | else | ||
683 | cmd->dir = UB_DIR_READ; | ||
684 | |||
685 | cmd->nsg = urq->nsg; | ||
686 | memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg); | ||
687 | |||
688 | /* | ||
689 | * build the command | ||
690 | * | ||
691 | * The call to blk_queue_logical_block_size() guarantees that request | ||
692 | * is aligned, but it is given in terms of 512 byte units, always. | ||
693 | */ | ||
694 | block = blk_rq_pos(rq) >> lun->capacity.bshift; | ||
695 | nblks = blk_rq_sectors(rq) >> lun->capacity.bshift; | ||
696 | |||
697 | cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10; | ||
698 | /* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */ | ||
699 | cmd->cdb[2] = block >> 24; | ||
700 | cmd->cdb[3] = block >> 16; | ||
701 | cmd->cdb[4] = block >> 8; | ||
702 | cmd->cdb[5] = block; | ||
703 | cmd->cdb[7] = nblks >> 8; | ||
704 | cmd->cdb[8] = nblks; | ||
705 | cmd->cdb_len = 10; | ||
706 | |||
707 | cmd->len = blk_rq_bytes(rq); | ||
708 | } | ||
709 | |||
710 | static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, | ||
711 | struct ub_scsi_cmd *cmd, struct ub_request *urq) | ||
712 | { | ||
713 | struct request *rq = urq->rq; | ||
714 | |||
715 | if (blk_rq_bytes(rq) == 0) { | ||
716 | cmd->dir = UB_DIR_NONE; | ||
717 | } else { | ||
718 | if (rq_data_dir(rq) == WRITE) | ||
719 | cmd->dir = UB_DIR_WRITE; | ||
720 | else | ||
721 | cmd->dir = UB_DIR_READ; | ||
722 | } | ||
723 | |||
724 | cmd->nsg = urq->nsg; | ||
725 | memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg); | ||
726 | |||
727 | memcpy(&cmd->cdb, rq->cmd, rq->cmd_len); | ||
728 | cmd->cdb_len = rq->cmd_len; | ||
729 | |||
730 | cmd->len = blk_rq_bytes(rq); | ||
731 | |||
732 | /* | ||
733 | * To reapply this to every URB is not as incorrect as it looks. | ||
734 | * In return, we avoid any complicated tracking calculations. | ||
735 | */ | ||
736 | cmd->timeo = rq->timeout; | ||
737 | } | ||
738 | |||
739 | static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
740 | { | ||
741 | struct ub_lun *lun = cmd->lun; | ||
742 | struct ub_request *urq = cmd->back; | ||
743 | struct request *rq; | ||
744 | unsigned int scsi_status; | ||
745 | |||
746 | rq = urq->rq; | ||
747 | |||
748 | if (cmd->error == 0) { | ||
749 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | ||
750 | if (cmd->act_len >= rq->resid_len) | ||
751 | rq->resid_len = 0; | ||
752 | else | ||
753 | rq->resid_len -= cmd->act_len; | ||
754 | scsi_status = 0; | ||
755 | } else { | ||
756 | if (cmd->act_len != cmd->len) { | ||
757 | scsi_status = SAM_STAT_CHECK_CONDITION; | ||
758 | } else { | ||
759 | scsi_status = 0; | ||
760 | } | ||
761 | } | ||
762 | } else { | ||
763 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | ||
764 | /* UB_SENSE_SIZE is smaller than SCSI_SENSE_BUFFERSIZE */ | ||
765 | memcpy(rq->sense, sc->top_sense, UB_SENSE_SIZE); | ||
766 | rq->sense_len = UB_SENSE_SIZE; | ||
767 | if (sc->top_sense[0] != 0) | ||
768 | scsi_status = SAM_STAT_CHECK_CONDITION; | ||
769 | else | ||
770 | scsi_status = DID_ERROR << 16; | ||
771 | } else { | ||
772 | if (cmd->error == -EIO && | ||
773 | (cmd->key == 0 || | ||
774 | cmd->key == MEDIUM_ERROR || | ||
775 | cmd->key == UNIT_ATTENTION)) { | ||
776 | if (ub_rw_cmd_retry(sc, lun, urq, cmd) == 0) | ||
777 | return; | ||
778 | } | ||
779 | scsi_status = SAM_STAT_CHECK_CONDITION; | ||
780 | } | ||
781 | } | ||
782 | |||
783 | urq->rq = NULL; | ||
784 | |||
785 | ub_put_cmd(lun, cmd); | ||
786 | ub_end_rq(rq, scsi_status); | ||
787 | blk_start_queue(lun->disk->queue); | ||
788 | } | ||
789 | |||
790 | static void ub_end_rq(struct request *rq, unsigned int scsi_status) | ||
791 | { | ||
792 | int error; | ||
793 | |||
794 | if (scsi_status == 0) { | ||
795 | error = 0; | ||
796 | } else { | ||
797 | error = -EIO; | ||
798 | rq->errors = scsi_status; | ||
799 | } | ||
800 | __blk_end_request_all(rq, error); | ||
801 | } | ||
802 | |||
803 | static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun, | ||
804 | struct ub_request *urq, struct ub_scsi_cmd *cmd) | ||
805 | { | ||
806 | |||
807 | if (atomic_read(&sc->poison)) | ||
808 | return -ENXIO; | ||
809 | |||
810 | ub_reset_enter(sc, urq->current_try); | ||
811 | |||
812 | if (urq->current_try >= 3) | ||
813 | return -EIO; | ||
814 | urq->current_try++; | ||
815 | |||
816 | /* Remove this if anyone complains of flooding. */ | ||
817 | printk(KERN_DEBUG "%s: dir %c len/act %d/%d " | ||
818 | "[sense %x %02x %02x] retry %d\n", | ||
819 | sc->name, UB_DIR_CHAR(cmd->dir), cmd->len, cmd->act_len, | ||
820 | cmd->key, cmd->asc, cmd->ascq, urq->current_try); | ||
821 | |||
822 | memset(cmd, 0, sizeof(struct ub_scsi_cmd)); | ||
823 | ub_cmd_build_block(sc, lun, cmd, urq); | ||
824 | |||
825 | cmd->state = UB_CMDST_INIT; | ||
826 | cmd->lun = lun; | ||
827 | cmd->done = ub_rw_cmd_done; | ||
828 | cmd->back = urq; | ||
829 | |||
830 | cmd->tag = sc->tagcnt++; | ||
831 | |||
832 | #if 0 /* Wasteful */ | ||
833 | return ub_submit_scsi(sc, cmd); | ||
834 | #else | ||
835 | ub_cmdq_add(sc, cmd); | ||
836 | return 0; | ||
837 | #endif | ||
838 | } | ||
839 | |||
840 | /* | ||
841 | * Submit a regular SCSI operation (not an auto-sense). | ||
842 | * | ||
843 | * The Iron Law of Good Submit Routine is: | ||
844 | * Zero return - callback is done, Nonzero return - callback is not done. | ||
845 | * No exceptions. | ||
846 | * | ||
847 | * Host is assumed locked. | ||
848 | */ | ||
849 | static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
850 | { | ||
851 | |||
852 | if (cmd->state != UB_CMDST_INIT || | ||
853 | (cmd->dir != UB_DIR_NONE && cmd->len == 0)) { | ||
854 | return -EINVAL; | ||
855 | } | ||
856 | |||
857 | ub_cmdq_add(sc, cmd); | ||
858 | /* | ||
859 | * We can call ub_scsi_dispatch(sc) right away here, but it's a little | ||
860 | * safer to jump to a tasklet, in case upper layers do something silly. | ||
861 | */ | ||
862 | tasklet_schedule(&sc->tasklet); | ||
863 | return 0; | ||
864 | } | ||
865 | |||
866 | /* | ||
867 | * Submit the first URB for the queued command. | ||
868 | * This function does not deal with queueing in any way. | ||
869 | */ | ||
870 | static int ub_scsi_cmd_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
871 | { | ||
872 | struct bulk_cb_wrap *bcb; | ||
873 | int rc; | ||
874 | |||
875 | bcb = &sc->work_bcb; | ||
876 | |||
877 | /* | ||
878 | * ``If the allocation length is eighteen or greater, and a device | ||
879 | * server returns less than eighteen bytes of data, the application | ||
880 | * client should assume that the bytes not transferred would have been | ||
881 | * zeroes had the device server returned those bytes.'' | ||
882 | * | ||
883 | * We zero sense for all commands so that when a packet request | ||
884 | * fails it does not return a stale sense. | ||
885 | */ | ||
886 | memset(&sc->top_sense, 0, UB_SENSE_SIZE); | ||
887 | |||
888 | /* set up the command wrapper */ | ||
889 | bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN); | ||
890 | bcb->Tag = cmd->tag; /* Endianness is not important */ | ||
891 | bcb->DataTransferLength = cpu_to_le32(cmd->len); | ||
892 | bcb->Flags = (cmd->dir == UB_DIR_READ) ? 0x80 : 0; | ||
893 | bcb->Lun = (cmd->lun != NULL) ? cmd->lun->num : 0; | ||
894 | bcb->Length = cmd->cdb_len; | ||
895 | |||
896 | /* copy the command payload */ | ||
897 | memcpy(bcb->CDB, cmd->cdb, UB_MAX_CDB_SIZE); | ||
898 | |||
899 | UB_INIT_COMPLETION(sc->work_done); | ||
900 | |||
901 | sc->last_pipe = sc->send_bulk_pipe; | ||
902 | usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->send_bulk_pipe, | ||
903 | bcb, US_BULK_CB_WRAP_LEN, ub_urb_complete, sc); | ||
904 | |||
905 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { | ||
906 | /* XXX Clear stalls */ | ||
907 | ub_complete(&sc->work_done); | ||
908 | return rc; | ||
909 | } | ||
910 | |||
911 | sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; | ||
912 | add_timer(&sc->work_timer); | ||
913 | |||
914 | cmd->state = UB_CMDST_CMD; | ||
915 | return 0; | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * Timeout handler. | ||
920 | */ | ||
921 | static void ub_urb_timeout(unsigned long arg) | ||
922 | { | ||
923 | struct ub_dev *sc = (struct ub_dev *) arg; | ||
924 | unsigned long flags; | ||
925 | |||
926 | spin_lock_irqsave(sc->lock, flags); | ||
927 | if (!ub_is_completed(&sc->work_done)) | ||
928 | usb_unlink_urb(&sc->work_urb); | ||
929 | spin_unlock_irqrestore(sc->lock, flags); | ||
930 | } | ||
931 | |||
932 | /* | ||
933 | * Completion routine for the work URB. | ||
934 | * | ||
935 | * This can be called directly from usb_submit_urb (while we have | ||
936 | * the sc->lock taken) and from an interrupt (while we do NOT have | ||
937 | * the sc->lock taken). Therefore, bounce this off to a tasklet. | ||
938 | */ | ||
939 | static void ub_urb_complete(struct urb *urb) | ||
940 | { | ||
941 | struct ub_dev *sc = urb->context; | ||
942 | |||
943 | ub_complete(&sc->work_done); | ||
944 | tasklet_schedule(&sc->tasklet); | ||
945 | } | ||
946 | |||
947 | static void ub_scsi_action(unsigned long _dev) | ||
948 | { | ||
949 | struct ub_dev *sc = (struct ub_dev *) _dev; | ||
950 | unsigned long flags; | ||
951 | |||
952 | spin_lock_irqsave(sc->lock, flags); | ||
953 | ub_scsi_dispatch(sc); | ||
954 | spin_unlock_irqrestore(sc->lock, flags); | ||
955 | } | ||
956 | |||
957 | static void ub_scsi_dispatch(struct ub_dev *sc) | ||
958 | { | ||
959 | struct ub_scsi_cmd *cmd; | ||
960 | int rc; | ||
961 | |||
962 | while (!sc->reset && (cmd = ub_cmdq_peek(sc)) != NULL) { | ||
963 | if (cmd->state == UB_CMDST_DONE) { | ||
964 | ub_cmdq_pop(sc); | ||
965 | (*cmd->done)(sc, cmd); | ||
966 | } else if (cmd->state == UB_CMDST_INIT) { | ||
967 | if ((rc = ub_scsi_cmd_start(sc, cmd)) == 0) | ||
968 | break; | ||
969 | cmd->error = rc; | ||
970 | cmd->state = UB_CMDST_DONE; | ||
971 | } else { | ||
972 | if (!ub_is_completed(&sc->work_done)) | ||
973 | break; | ||
974 | del_timer(&sc->work_timer); | ||
975 | ub_scsi_urb_compl(sc, cmd); | ||
976 | } | ||
977 | } | ||
978 | } | ||
979 | |||
980 | static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
981 | { | ||
982 | struct urb *urb = &sc->work_urb; | ||
983 | struct bulk_cs_wrap *bcs; | ||
984 | int endp; | ||
985 | int len; | ||
986 | int rc; | ||
987 | |||
988 | if (atomic_read(&sc->poison)) { | ||
989 | ub_state_done(sc, cmd, -ENODEV); | ||
990 | return; | ||
991 | } | ||
992 | |||
993 | endp = usb_pipeendpoint(sc->last_pipe); | ||
994 | if (usb_pipein(sc->last_pipe)) | ||
995 | endp |= USB_DIR_IN; | ||
996 | |||
997 | if (cmd->state == UB_CMDST_CLEAR) { | ||
998 | if (urb->status == -EPIPE) { | ||
999 | /* | ||
1000 | * STALL while clearing STALL. | ||
1001 | * The control pipe clears itself - nothing to do. | ||
1002 | */ | ||
1003 | printk(KERN_NOTICE "%s: stall on control pipe\n", | ||
1004 | sc->name); | ||
1005 | goto Bad_End; | ||
1006 | } | ||
1007 | |||
1008 | /* | ||
1009 | * We ignore the result for the halt clear. | ||
1010 | */ | ||
1011 | |||
1012 | usb_reset_endpoint(sc->dev, endp); | ||
1013 | |||
1014 | ub_state_sense(sc, cmd); | ||
1015 | |||
1016 | } else if (cmd->state == UB_CMDST_CLR2STS) { | ||
1017 | if (urb->status == -EPIPE) { | ||
1018 | printk(KERN_NOTICE "%s: stall on control pipe\n", | ||
1019 | sc->name); | ||
1020 | goto Bad_End; | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * We ignore the result for the halt clear. | ||
1025 | */ | ||
1026 | |||
1027 | usb_reset_endpoint(sc->dev, endp); | ||
1028 | |||
1029 | ub_state_stat(sc, cmd); | ||
1030 | |||
1031 | } else if (cmd->state == UB_CMDST_CLRRS) { | ||
1032 | if (urb->status == -EPIPE) { | ||
1033 | printk(KERN_NOTICE "%s: stall on control pipe\n", | ||
1034 | sc->name); | ||
1035 | goto Bad_End; | ||
1036 | } | ||
1037 | |||
1038 | /* | ||
1039 | * We ignore the result for the halt clear. | ||
1040 | */ | ||
1041 | |||
1042 | usb_reset_endpoint(sc->dev, endp); | ||
1043 | |||
1044 | ub_state_stat_counted(sc, cmd); | ||
1045 | |||
1046 | } else if (cmd->state == UB_CMDST_CMD) { | ||
1047 | switch (urb->status) { | ||
1048 | case 0: | ||
1049 | break; | ||
1050 | case -EOVERFLOW: | ||
1051 | goto Bad_End; | ||
1052 | case -EPIPE: | ||
1053 | rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe); | ||
1054 | if (rc != 0) { | ||
1055 | printk(KERN_NOTICE "%s: " | ||
1056 | "unable to submit clear (%d)\n", | ||
1057 | sc->name, rc); | ||
1058 | /* | ||
1059 | * This is typically ENOMEM or some other such shit. | ||
1060 | * Retrying is pointless. Just do Bad End on it... | ||
1061 | */ | ||
1062 | ub_state_done(sc, cmd, rc); | ||
1063 | return; | ||
1064 | } | ||
1065 | cmd->state = UB_CMDST_CLEAR; | ||
1066 | return; | ||
1067 | case -ESHUTDOWN: /* unplug */ | ||
1068 | case -EILSEQ: /* unplug timeout on uhci */ | ||
1069 | ub_state_done(sc, cmd, -ENODEV); | ||
1070 | return; | ||
1071 | default: | ||
1072 | goto Bad_End; | ||
1073 | } | ||
1074 | if (urb->actual_length != US_BULK_CB_WRAP_LEN) { | ||
1075 | goto Bad_End; | ||
1076 | } | ||
1077 | |||
1078 | if (cmd->dir == UB_DIR_NONE || cmd->nsg < 1) { | ||
1079 | ub_state_stat(sc, cmd); | ||
1080 | return; | ||
1081 | } | ||
1082 | |||
1083 | // udelay(125); // usb-storage has this | ||
1084 | ub_data_start(sc, cmd); | ||
1085 | |||
1086 | } else if (cmd->state == UB_CMDST_DATA) { | ||
1087 | if (urb->status == -EPIPE) { | ||
1088 | rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe); | ||
1089 | if (rc != 0) { | ||
1090 | printk(KERN_NOTICE "%s: " | ||
1091 | "unable to submit clear (%d)\n", | ||
1092 | sc->name, rc); | ||
1093 | ub_state_done(sc, cmd, rc); | ||
1094 | return; | ||
1095 | } | ||
1096 | cmd->state = UB_CMDST_CLR2STS; | ||
1097 | return; | ||
1098 | } | ||
1099 | if (urb->status == -EOVERFLOW) { | ||
1100 | /* | ||
1101 | * A babble? Failure, but we must transfer CSW now. | ||
1102 | */ | ||
1103 | cmd->error = -EOVERFLOW; /* A cheap trick... */ | ||
1104 | ub_state_stat(sc, cmd); | ||
1105 | return; | ||
1106 | } | ||
1107 | |||
1108 | if (cmd->dir == UB_DIR_WRITE) { | ||
1109 | /* | ||
1110 | * Do not continue writes in case of a failure. | ||
1111 | * Doing so would cause sectors to be mixed up, | ||
1112 | * which is worse than sectors lost. | ||
1113 | * | ||
1114 | * We must try to read the CSW, or many devices | ||
1115 | * get confused. | ||
1116 | */ | ||
1117 | len = urb->actual_length; | ||
1118 | if (urb->status != 0 || | ||
1119 | len != cmd->sgv[cmd->current_sg].length) { | ||
1120 | cmd->act_len += len; | ||
1121 | |||
1122 | cmd->error = -EIO; | ||
1123 | ub_state_stat(sc, cmd); | ||
1124 | return; | ||
1125 | } | ||
1126 | |||
1127 | } else { | ||
1128 | /* | ||
1129 | * If an error occurs on read, we record it, and | ||
1130 | * continue to fetch data in order to avoid bubble. | ||
1131 | * | ||
1132 | * As a small shortcut, we stop if we detect that | ||
1133 | * a CSW is mixed into the data. | ||
1134 | */ | ||
1135 | if (urb->status != 0) | ||
1136 | cmd->error = -EIO; | ||
1137 | |||
1138 | len = urb->actual_length; | ||
1139 | if (urb->status != 0 || | ||
1140 | len != cmd->sgv[cmd->current_sg].length) { | ||
1141 | if ((len & 0x1FF) == US_BULK_CS_WRAP_LEN) | ||
1142 | goto Bad_End; | ||
1143 | } | ||
1144 | } | ||
1145 | |||
1146 | cmd->act_len += urb->actual_length; | ||
1147 | |||
1148 | if (++cmd->current_sg < cmd->nsg) { | ||
1149 | ub_data_start(sc, cmd); | ||
1150 | return; | ||
1151 | } | ||
1152 | ub_state_stat(sc, cmd); | ||
1153 | |||
1154 | } else if (cmd->state == UB_CMDST_STAT) { | ||
1155 | if (urb->status == -EPIPE) { | ||
1156 | rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe); | ||
1157 | if (rc != 0) { | ||
1158 | printk(KERN_NOTICE "%s: " | ||
1159 | "unable to submit clear (%d)\n", | ||
1160 | sc->name, rc); | ||
1161 | ub_state_done(sc, cmd, rc); | ||
1162 | return; | ||
1163 | } | ||
1164 | |||
1165 | /* | ||
1166 | * Having a stall when getting CSW is an error, so | ||
1167 | * make sure upper levels are not oblivious to it. | ||
1168 | */ | ||
1169 | cmd->error = -EIO; /* A cheap trick... */ | ||
1170 | |||
1171 | cmd->state = UB_CMDST_CLRRS; | ||
1172 | return; | ||
1173 | } | ||
1174 | |||
1175 | /* Catch everything, including -EOVERFLOW and other nasties. */ | ||
1176 | if (urb->status != 0) | ||
1177 | goto Bad_End; | ||
1178 | |||
1179 | if (urb->actual_length == 0) { | ||
1180 | ub_state_stat_counted(sc, cmd); | ||
1181 | return; | ||
1182 | } | ||
1183 | |||
1184 | /* | ||
1185 | * Check the returned Bulk protocol status. | ||
1186 | * The status block has to be validated first. | ||
1187 | */ | ||
1188 | |||
1189 | bcs = &sc->work_bcs; | ||
1190 | |||
1191 | if (sc->signature == cpu_to_le32(0)) { | ||
1192 | /* | ||
1193 | * This is the first reply, so do not perform the check. | ||
1194 | * Instead, remember the signature the device uses | ||
1195 | * for future checks. But do not allow a nul. | ||
1196 | */ | ||
1197 | sc->signature = bcs->Signature; | ||
1198 | if (sc->signature == cpu_to_le32(0)) { | ||
1199 | ub_state_stat_counted(sc, cmd); | ||
1200 | return; | ||
1201 | } | ||
1202 | } else { | ||
1203 | if (bcs->Signature != sc->signature) { | ||
1204 | ub_state_stat_counted(sc, cmd); | ||
1205 | return; | ||
1206 | } | ||
1207 | } | ||
1208 | |||
1209 | if (bcs->Tag != cmd->tag) { | ||
1210 | /* | ||
1211 | * This usually happens when we disagree with the | ||
1212 | * device's microcode about something. For instance, | ||
1213 | * a few of them throw this after timeouts. They buffer | ||
1214 | * commands and reply to commands that we timed out earlier. | ||
1215 | * Without flushing these replies we loop forever. | ||
1216 | */ | ||
1217 | ub_state_stat_counted(sc, cmd); | ||
1218 | return; | ||
1219 | } | ||
1220 | |||
1221 | if (!sc->bad_resid) { | ||
1222 | len = le32_to_cpu(bcs->Residue); | ||
1223 | if (len != cmd->len - cmd->act_len) { | ||
1224 | /* | ||
1225 | * Only start ignoring if this cmd ended well. | ||
1226 | */ | ||
1227 | if (cmd->len == cmd->act_len) { | ||
1228 | printk(KERN_NOTICE "%s: " | ||
1229 | "bad residual %d of %d, ignoring\n", | ||
1230 | sc->name, len, cmd->len); | ||
1231 | sc->bad_resid = 1; | ||
1232 | } | ||
1233 | } | ||
1234 | } | ||
1235 | |||
1236 | switch (bcs->Status) { | ||
1237 | case US_BULK_STAT_OK: | ||
1238 | break; | ||
1239 | case US_BULK_STAT_FAIL: | ||
1240 | ub_state_sense(sc, cmd); | ||
1241 | return; | ||
1242 | case US_BULK_STAT_PHASE: | ||
1243 | goto Bad_End; | ||
1244 | default: | ||
1245 | printk(KERN_INFO "%s: unknown CSW status 0x%x\n", | ||
1246 | sc->name, bcs->Status); | ||
1247 | ub_state_done(sc, cmd, -EINVAL); | ||
1248 | return; | ||
1249 | } | ||
1250 | |||
1251 | /* Not zeroing error to preserve a babble indicator */ | ||
1252 | if (cmd->error != 0) { | ||
1253 | ub_state_sense(sc, cmd); | ||
1254 | return; | ||
1255 | } | ||
1256 | cmd->state = UB_CMDST_DONE; | ||
1257 | ub_cmdq_pop(sc); | ||
1258 | (*cmd->done)(sc, cmd); | ||
1259 | |||
1260 | } else if (cmd->state == UB_CMDST_SENSE) { | ||
1261 | ub_state_done(sc, cmd, -EIO); | ||
1262 | |||
1263 | } else { | ||
1264 | printk(KERN_WARNING "%s: wrong command state %d\n", | ||
1265 | sc->name, cmd->state); | ||
1266 | ub_state_done(sc, cmd, -EINVAL); | ||
1267 | return; | ||
1268 | } | ||
1269 | return; | ||
1270 | |||
1271 | Bad_End: /* Little Excel is dead */ | ||
1272 | ub_state_done(sc, cmd, -EIO); | ||
1273 | } | ||
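For orientation, the Signature/Tag/Residue/Status checks in the UB_CMDST_STAT branch above operate on the Bulk-Only Transport Command Status Wrapper that the driver reads into sc->work_bcs. A minimal sketch of that 13-byte layout, mirroring the usb-storage bulk_cs_wrap definition this code relies on (treat the exact header and field types as assumptions, not the driver's own declaration):

#include <linux/types.h>

/* Sketch of the CSW consumed in UB_CMDST_STAT. */
struct bulk_cs_wrap_sketch {
	__le32 Signature;	/* expected 'USBS' == 0x53425355 (little-endian) */
	__u32  Tag;		/* echoed from the CBW; compared against cmd->tag */
	__le32 Residue;		/* bytes not transferred; checked against len - act_len */
	__u8   Status;		/* US_BULK_STAT_OK, _FAIL or _PHASE */
} __attribute__((packed));	/* 13 bytes, i.e. US_BULK_CS_WRAP_LEN */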
1274 | |||
1275 | /* | ||
1276 | * Factorization helper for the command state machine: | ||
1277 | * Initiate a data segment transfer. | ||
1278 | */ | ||
1279 | static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1280 | { | ||
1281 | struct scatterlist *sg = &cmd->sgv[cmd->current_sg]; | ||
1282 | int pipe; | ||
1283 | int rc; | ||
1284 | |||
1285 | UB_INIT_COMPLETION(sc->work_done); | ||
1286 | |||
1287 | if (cmd->dir == UB_DIR_READ) | ||
1288 | pipe = sc->recv_bulk_pipe; | ||
1289 | else | ||
1290 | pipe = sc->send_bulk_pipe; | ||
1291 | sc->last_pipe = pipe; | ||
1292 | usb_fill_bulk_urb(&sc->work_urb, sc->dev, pipe, sg_virt(sg), | ||
1293 | sg->length, ub_urb_complete, sc); | ||
1294 | |||
1295 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { | ||
1296 | /* XXX Clear stalls */ | ||
1297 | ub_complete(&sc->work_done); | ||
1298 | ub_state_done(sc, cmd, rc); | ||
1299 | return; | ||
1300 | } | ||
1301 | |||
1302 | if (cmd->timeo) | ||
1303 | sc->work_timer.expires = jiffies + cmd->timeo; | ||
1304 | else | ||
1305 | sc->work_timer.expires = jiffies + UB_DATA_TIMEOUT; | ||
1306 | add_timer(&sc->work_timer); | ||
1307 | |||
1308 | cmd->state = UB_CMDST_DATA; | ||
1309 | } | ||
1310 | |||
1311 | /* | ||
1312 | * Factorization helper for the command state machine: | ||
1313 | * Finish the command. | ||
1314 | */ | ||
1315 | static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc) | ||
1316 | { | ||
1317 | |||
1318 | cmd->error = rc; | ||
1319 | cmd->state = UB_CMDST_DONE; | ||
1320 | ub_cmdq_pop(sc); | ||
1321 | (*cmd->done)(sc, cmd); | ||
1322 | } | ||
1323 | |||
1324 | /* | ||
1325 | * Factorization helper for the command state machine: | ||
1326 | * Submit a CSW read. | ||
1327 | */ | ||
1328 | static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1329 | { | ||
1330 | int rc; | ||
1331 | |||
1332 | UB_INIT_COMPLETION(sc->work_done); | ||
1333 | |||
1334 | sc->last_pipe = sc->recv_bulk_pipe; | ||
1335 | usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->recv_bulk_pipe, | ||
1336 | &sc->work_bcs, US_BULK_CS_WRAP_LEN, ub_urb_complete, sc); | ||
1337 | |||
1338 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { | ||
1339 | /* XXX Clear stalls */ | ||
1340 | ub_complete(&sc->work_done); | ||
1341 | ub_state_done(sc, cmd, rc); | ||
1342 | return -1; | ||
1343 | } | ||
1344 | |||
1345 | if (cmd->timeo) | ||
1346 | sc->work_timer.expires = jiffies + cmd->timeo; | ||
1347 | else | ||
1348 | sc->work_timer.expires = jiffies + UB_STAT_TIMEOUT; | ||
1349 | add_timer(&sc->work_timer); | ||
1350 | return 0; | ||
1351 | } | ||
1352 | |||
1353 | /* | ||
1354 | * Factorization helper for the command state machine: | ||
1355 | * Submit a CSW read and go to STAT state. | ||
1356 | */ | ||
1357 | static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1358 | { | ||
1359 | |||
1360 | if (__ub_state_stat(sc, cmd) != 0) | ||
1361 | return; | ||
1362 | |||
1363 | cmd->stat_count = 0; | ||
1364 | cmd->state = UB_CMDST_STAT; | ||
1365 | } | ||
1366 | |||
1367 | /* | ||
1368 | * Factorization helper for the command state machine: | ||
1369 | * Submit a CSW read and go to STAT state with counter (along [C] path). | ||
1370 | */ | ||
1371 | static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1372 | { | ||
1373 | |||
1374 | if (++cmd->stat_count >= 4) { | ||
1375 | ub_state_sense(sc, cmd); | ||
1376 | return; | ||
1377 | } | ||
1378 | |||
1379 | if (__ub_state_stat(sc, cmd) != 0) | ||
1380 | return; | ||
1381 | |||
1382 | cmd->state = UB_CMDST_STAT; | ||
1383 | } | ||
1384 | |||
1385 | /* | ||
1386 | * Factorization helper for the command state machine: | ||
1387 | * Submit a REQUEST SENSE and go to SENSE state. | ||
1388 | */ | ||
1389 | static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1390 | { | ||
1391 | struct ub_scsi_cmd *scmd; | ||
1392 | struct scatterlist *sg; | ||
1393 | int rc; | ||
1394 | |||
1395 | if (cmd->cdb[0] == REQUEST_SENSE) { | ||
1396 | rc = -EPIPE; | ||
1397 | goto error; | ||
1398 | } | ||
1399 | |||
1400 | scmd = &sc->top_rqs_cmd; | ||
1401 | memset(scmd, 0, sizeof(struct ub_scsi_cmd)); | ||
1402 | scmd->cdb[0] = REQUEST_SENSE; | ||
1403 | scmd->cdb[4] = UB_SENSE_SIZE; | ||
1404 | scmd->cdb_len = 6; | ||
1405 | scmd->dir = UB_DIR_READ; | ||
1406 | scmd->state = UB_CMDST_INIT; | ||
1407 | scmd->nsg = 1; | ||
1408 | sg = &scmd->sgv[0]; | ||
1409 | sg_init_table(sg, UB_MAX_REQ_SG); | ||
1410 | sg_set_page(sg, virt_to_page(sc->top_sense), UB_SENSE_SIZE, | ||
1411 | (unsigned long)sc->top_sense & (PAGE_SIZE-1)); | ||
1412 | scmd->len = UB_SENSE_SIZE; | ||
1413 | scmd->lun = cmd->lun; | ||
1414 | scmd->done = ub_top_sense_done; | ||
1415 | scmd->back = cmd; | ||
1416 | |||
1417 | scmd->tag = sc->tagcnt++; | ||
1418 | |||
1419 | cmd->state = UB_CMDST_SENSE; | ||
1420 | |||
1421 | ub_cmdq_insert(sc, scmd); | ||
1422 | return; | ||
1423 | |||
1424 | error: | ||
1425 | ub_state_done(sc, cmd, rc); | ||
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * A helper for the command's state machine: | ||
1430 | * Submit a stall clear. | ||
1431 | */ | ||
1432 | static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd, | ||
1433 | int stalled_pipe) | ||
1434 | { | ||
1435 | int endp; | ||
1436 | struct usb_ctrlrequest *cr; | ||
1437 | int rc; | ||
1438 | |||
1439 | endp = usb_pipeendpoint(stalled_pipe); | ||
1440 | if (usb_pipein (stalled_pipe)) | ||
1441 | endp |= USB_DIR_IN; | ||
1442 | |||
1443 | cr = &sc->work_cr; | ||
1444 | cr->bRequestType = USB_RECIP_ENDPOINT; | ||
1445 | cr->bRequest = USB_REQ_CLEAR_FEATURE; | ||
1446 | cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT); | ||
1447 | cr->wIndex = cpu_to_le16(endp); | ||
1448 | cr->wLength = cpu_to_le16(0); | ||
1449 | |||
1450 | UB_INIT_COMPLETION(sc->work_done); | ||
1451 | |||
1452 | usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe, | ||
1453 | (unsigned char*) cr, NULL, 0, ub_urb_complete, sc); | ||
1454 | |||
1455 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { | ||
1456 | ub_complete(&sc->work_done); | ||
1457 | return rc; | ||
1458 | } | ||
1459 | |||
1460 | sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT; | ||
1461 | add_timer(&sc->work_timer); | ||
1462 | return 0; | ||
1463 | } | ||
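As an aside, the CLEAR_FEATURE(ENDPOINT_HALT) request built by hand above has a synchronous counterpart in the USB core. It cannot be used here because this path runs in atomic (tasklet) context, but a process-context sketch under that assumption would look like this (hypothetical helper, not part of the driver):

/* Process-context equivalent of ub_submit_clear_stall(); sketch only.
 * usb_clear_halt() issues the same CLEAR_FEATURE(ENDPOINT_HALT) request
 * and also resets the host-side data toggle, but it may sleep.
 */
static int ub_clear_stall_sync(struct ub_dev *sc, int stalled_pipe)
{
	return usb_clear_halt(sc->dev, stalled_pipe);
}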
1464 | |||
1465 | /* | ||
1466 | */ | ||
1467 | static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd) | ||
1468 | { | ||
1469 | unsigned char *sense = sc->top_sense; | ||
1470 | struct ub_scsi_cmd *cmd; | ||
1471 | |||
1472 | /* | ||
1473 | * Find the command which triggered the unit attention or a check, | ||
1474 | * save the sense into it, and advance its state machine. | ||
1475 | */ | ||
1476 | if ((cmd = ub_cmdq_peek(sc)) == NULL) { | ||
1477 | printk(KERN_WARNING "%s: sense done while idle\n", sc->name); | ||
1478 | return; | ||
1479 | } | ||
1480 | if (cmd != scmd->back) { | ||
1481 | printk(KERN_WARNING "%s: " | ||
1482 | "sense done for wrong command 0x%x\n", | ||
1483 | sc->name, cmd->tag); | ||
1484 | return; | ||
1485 | } | ||
1486 | if (cmd->state != UB_CMDST_SENSE) { | ||
1487 | printk(KERN_WARNING "%s: sense done with bad cmd state %d\n", | ||
1488 | sc->name, cmd->state); | ||
1489 | return; | ||
1490 | } | ||
1491 | |||
1492 | /* | ||
1493 | * Ignoring scmd->act_len, because the buffer was pre-zeroed. | ||
1494 | */ | ||
1495 | cmd->key = sense[2] & 0x0F; | ||
1496 | cmd->asc = sense[12]; | ||
1497 | cmd->ascq = sense[13]; | ||
1498 | |||
1499 | ub_scsi_urb_compl(sc, cmd); | ||
1500 | } | ||
1501 | |||
1502 | /* | ||
1503 | * Reset management | ||
1504 | */ | ||
1505 | |||
1506 | static void ub_reset_enter(struct ub_dev *sc, int try) | ||
1507 | { | ||
1508 | |||
1509 | if (sc->reset) { | ||
1510 | /* This happens often on multi-LUN devices. */ | ||
1511 | return; | ||
1512 | } | ||
1513 | sc->reset = try + 1; | ||
1514 | |||
1515 | #if 0 /* Not needed because the disconnect waits for us. */ | ||
1516 | unsigned long flags; | ||
1517 | spin_lock_irqsave(&ub_lock, flags); | ||
1518 | sc->openc++; | ||
1519 | spin_unlock_irqrestore(&ub_lock, flags); | ||
1520 | #endif | ||
1521 | |||
1522 | #if 0 /* We let them stop themselves. */ | ||
1523 | struct ub_lun *lun; | ||
1524 | list_for_each_entry(lun, &sc->luns, link) { | ||
1525 | blk_stop_queue(lun->disk->queue); | ||
1526 | } | ||
1527 | #endif | ||
1528 | |||
1529 | schedule_work(&sc->reset_work); | ||
1530 | } | ||
1531 | |||
1532 | static void ub_reset_task(struct work_struct *work) | ||
1533 | { | ||
1534 | struct ub_dev *sc = container_of(work, struct ub_dev, reset_work); | ||
1535 | unsigned long flags; | ||
1536 | struct ub_lun *lun; | ||
1537 | int rc; | ||
1538 | |||
1539 | if (!sc->reset) { | ||
1540 | printk(KERN_WARNING "%s: Running reset unrequested\n", | ||
1541 | sc->name); | ||
1542 | return; | ||
1543 | } | ||
1544 | |||
1545 | if (atomic_read(&sc->poison)) { | ||
1546 | ; | ||
1547 | } else if ((sc->reset & 1) == 0) { | ||
1548 | ub_sync_reset(sc); | ||
1549 | msleep(700); /* usb-storage sleeps 6s (!) */ | ||
1550 | ub_probe_clear_stall(sc, sc->recv_bulk_pipe); | ||
1551 | ub_probe_clear_stall(sc, sc->send_bulk_pipe); | ||
1552 | } else if (sc->dev->actconfig->desc.bNumInterfaces != 1) { | ||
1553 | ; | ||
1554 | } else { | ||
1555 | rc = usb_lock_device_for_reset(sc->dev, sc->intf); | ||
1556 | if (rc < 0) { | ||
1557 | printk(KERN_NOTICE | ||
1558 | "%s: usb_lock_device_for_reset failed (%d)\n", | ||
1559 | sc->name, rc); | ||
1560 | } else { | ||
1561 | rc = usb_reset_device(sc->dev); | ||
1562 | if (rc < 0) { | ||
1563 | printk(KERN_NOTICE "%s: " | ||
1564 | "usb_lock_device_for_reset failed (%d)\n", | ||
1565 | sc->name, rc); | ||
1566 | } | ||
1567 | usb_unlock_device(sc->dev); | ||
1568 | } | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * In theory, no commands can be running while reset is active, | ||
1573 | * so nobody can ask for another reset, and so we do not need any | ||
1574 | * queues of resets or anything. We do need a spinlock though, | ||
1575 | * to interact with the block layer. | ||
1576 | */ | ||
1577 | spin_lock_irqsave(sc->lock, flags); | ||
1578 | sc->reset = 0; | ||
1579 | tasklet_schedule(&sc->tasklet); | ||
1580 | list_for_each_entry(lun, &sc->luns, link) { | ||
1581 | blk_start_queue(lun->disk->queue); | ||
1582 | } | ||
1583 | wake_up(&sc->reset_wait); | ||
1584 | spin_unlock_irqrestore(sc->lock, flags); | ||
1585 | } | ||
1586 | |||
1587 | /* | ||
1588 | * XXX Reset brackets are too much hassle to implement, so just stub them | ||
1589 | * in order to prevent forced unbinding (which deadlocks solid when our | ||
1590 | * ->disconnect method waits for the reset to complete and this kills keventd). | ||
1591 | * | ||
1592 | * XXX Tell Alan to move usb_unlock_device inside of usb_reset_device, | ||
1593 | * or else the post_reset is invoked, and restarts I/O on a locked device. | ||
1594 | */ | ||
1595 | static int ub_pre_reset(struct usb_interface *iface) { | ||
1596 | return 0; | ||
1597 | } | ||
1598 | |||
1599 | static int ub_post_reset(struct usb_interface *iface) { | ||
1600 | return 0; | ||
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | * This is called from a process context. | ||
1605 | */ | ||
1606 | static void ub_revalidate(struct ub_dev *sc, struct ub_lun *lun) | ||
1607 | { | ||
1608 | |||
1609 | lun->readonly = 0; /* XXX Query this from the device */ | ||
1610 | |||
1611 | lun->capacity.nsec = 0; | ||
1612 | lun->capacity.bsize = 512; | ||
1613 | lun->capacity.bshift = 0; | ||
1614 | |||
1615 | if (ub_sync_tur(sc, lun) != 0) | ||
1616 | return; /* Not ready */ | ||
1617 | lun->changed = 0; | ||
1618 | |||
1619 | if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) { | ||
1620 | /* | ||
1621 | * The retry here means something is wrong, either with the | ||
1622 | * device, with the transport, or with our code. | ||
1623 | * We keep this because sd.c has retries for capacity. | ||
1624 | */ | ||
1625 | if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) { | ||
1626 | lun->capacity.nsec = 0; | ||
1627 | lun->capacity.bsize = 512; | ||
1628 | lun->capacity.bshift = 0; | ||
1629 | } | ||
1630 | } | ||
1631 | } | ||
1632 | |||
1633 | /* | ||
1634 | * The open function. | ||
1635 | * This is mostly needed to keep refcounting, but also to support | ||
1636 | * media checks on removable media drives. | ||
1637 | */ | ||
1638 | static int ub_bd_open(struct block_device *bdev, fmode_t mode) | ||
1639 | { | ||
1640 | struct ub_lun *lun = bdev->bd_disk->private_data; | ||
1641 | struct ub_dev *sc = lun->udev; | ||
1642 | unsigned long flags; | ||
1643 | int rc; | ||
1644 | |||
1645 | spin_lock_irqsave(&ub_lock, flags); | ||
1646 | if (atomic_read(&sc->poison)) { | ||
1647 | spin_unlock_irqrestore(&ub_lock, flags); | ||
1648 | return -ENXIO; | ||
1649 | } | ||
1650 | sc->openc++; | ||
1651 | spin_unlock_irqrestore(&ub_lock, flags); | ||
1652 | |||
1653 | if (lun->removable || lun->readonly) | ||
1654 | check_disk_change(bdev); | ||
1655 | |||
1656 | /* | ||
1657 | * sd.c considers ->media_present and ->changed to be non-equivalent, | ||
1658 | * under some pretty murky conditions (a failure of READ CAPACITY). | ||
1659 | * We may need it one day. | ||
1660 | */ | ||
1661 | if (lun->removable && lun->changed && !(mode & FMODE_NDELAY)) { | ||
1662 | rc = -ENOMEDIUM; | ||
1663 | goto err_open; | ||
1664 | } | ||
1665 | |||
1666 | if (lun->readonly && (mode & FMODE_WRITE)) { | ||
1667 | rc = -EROFS; | ||
1668 | goto err_open; | ||
1669 | } | ||
1670 | |||
1671 | return 0; | ||
1672 | |||
1673 | err_open: | ||
1674 | ub_put(sc); | ||
1675 | return rc; | ||
1676 | } | ||
1677 | |||
1678 | static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode) | ||
1679 | { | ||
1680 | int ret; | ||
1681 | |||
1682 | mutex_lock(&ub_mutex); | ||
1683 | ret = ub_bd_open(bdev, mode); | ||
1684 | mutex_unlock(&ub_mutex); | ||
1685 | |||
1686 | return ret; | ||
1687 | } | ||
1688 | |||
1689 | |||
1690 | /* | ||
1691 | */ | ||
1692 | static int ub_bd_release(struct gendisk *disk, fmode_t mode) | ||
1693 | { | ||
1694 | struct ub_lun *lun = disk->private_data; | ||
1695 | struct ub_dev *sc = lun->udev; | ||
1696 | |||
1697 | mutex_lock(&ub_mutex); | ||
1698 | ub_put(sc); | ||
1699 | mutex_unlock(&ub_mutex); | ||
1700 | |||
1701 | return 0; | ||
1702 | } | ||
1703 | |||
1704 | /* | ||
1705 | * The ioctl interface. | ||
1706 | */ | ||
1707 | static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode, | ||
1708 | unsigned int cmd, unsigned long arg) | ||
1709 | { | ||
1710 | void __user *usermem = (void __user *) arg; | ||
1711 | int ret; | ||
1712 | |||
1713 | mutex_lock(&ub_mutex); | ||
1714 | ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem); | ||
1715 | mutex_unlock(&ub_mutex); | ||
1716 | |||
1717 | return ret; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * This is called by check_disk_change if we reported a media change. | ||
1722 | * The main objective here is to discover the features of the media, such as | ||
1723 | * the capacity, read-only status, etc. USB storage generally does not | ||
1724 | * need to be spun up, but if we needed it, this would be the place. | ||
1725 | * | ||
1726 | * This call can sleep. | ||
1727 | * | ||
1728 | * The return code is not used. | ||
1729 | */ | ||
1730 | static int ub_bd_revalidate(struct gendisk *disk) | ||
1731 | { | ||
1732 | struct ub_lun *lun = disk->private_data; | ||
1733 | |||
1734 | ub_revalidate(lun->udev, lun); | ||
1735 | |||
1736 | /* XXX Support sector size switching like in sr.c */ | ||
1737 | blk_queue_logical_block_size(disk->queue, lun->capacity.bsize); | ||
1738 | set_capacity(disk, lun->capacity.nsec); | ||
1739 | // set_disk_ro(sdkp->disk, lun->readonly); | ||
1740 | |||
1741 | return 0; | ||
1742 | } | ||
1743 | |||
1744 | /* | ||
1745 | * The check is called by the block layer to verify if the media | ||
1746 | * is still available. It is supposed to be harmless, lightweight and | ||
1747 | * non-intrusive in case the media was not changed. | ||
1748 | * | ||
1749 | * This call can sleep. | ||
1750 | * | ||
1751 | * The return value is an event mask (DISK_EVENT_MEDIA_CHANGE or 0). | ||
1752 | */ | ||
1753 | static unsigned int ub_bd_check_events(struct gendisk *disk, | ||
1754 | unsigned int clearing) | ||
1755 | { | ||
1756 | struct ub_lun *lun = disk->private_data; | ||
1757 | |||
1758 | if (!lun->removable) | ||
1759 | return 0; | ||
1760 | |||
1761 | /* | ||
1762 | * We always clear checks after every command, so this is not | ||
1763 | * as dangerous as it looks. If the TEST_UNIT_READY fails here, | ||
1764 | * the device is actually not ready, with operator or software | ||
1765 | * intervention required. One dangerous case is a drive which | ||
1766 | * spins itself down; when the time comes to write dirty pages, this | ||
1767 | * will fail, and the block layer will discard the data. Since we never | ||
1768 | * spin drives up, such devices simply cannot be used with ub anyway. | ||
1769 | */ | ||
1770 | if (ub_sync_tur(lun->udev, lun) != 0) { | ||
1771 | lun->changed = 1; | ||
1772 | return DISK_EVENT_MEDIA_CHANGE; | ||
1773 | } | ||
1774 | |||
1775 | return lun->changed ? DISK_EVENT_MEDIA_CHANGE : 0; | ||
1776 | } | ||
1777 | |||
1778 | static const struct block_device_operations ub_bd_fops = { | ||
1779 | .owner = THIS_MODULE, | ||
1780 | .open = ub_bd_unlocked_open, | ||
1781 | .release = ub_bd_release, | ||
1782 | .ioctl = ub_bd_ioctl, | ||
1783 | .check_events = ub_bd_check_events, | ||
1784 | .revalidate_disk = ub_bd_revalidate, | ||
1785 | }; | ||
1786 | |||
1787 | /* | ||
1788 | * Common ->done routine for commands executed synchronously. | ||
1789 | */ | ||
1790 | static void ub_probe_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) | ||
1791 | { | ||
1792 | struct completion *cop = cmd->back; | ||
1793 | complete(cop); | ||
1794 | } | ||
1795 | |||
1796 | /* | ||
1797 | * Test if the device has a check condition on it, synchronously. | ||
1798 | */ | ||
1799 | static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun) | ||
1800 | { | ||
1801 | struct ub_scsi_cmd *cmd; | ||
1802 | enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) }; | ||
1803 | unsigned long flags; | ||
1804 | struct completion compl; | ||
1805 | int rc; | ||
1806 | |||
1807 | init_completion(&compl); | ||
1808 | |||
1809 | rc = -ENOMEM; | ||
1810 | if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL) | ||
1811 | goto err_alloc; | ||
1812 | |||
1813 | cmd->cdb[0] = TEST_UNIT_READY; | ||
1814 | cmd->cdb_len = 6; | ||
1815 | cmd->dir = UB_DIR_NONE; | ||
1816 | cmd->state = UB_CMDST_INIT; | ||
1817 | cmd->lun = lun; /* This may be NULL, but that's ok */ | ||
1818 | cmd->done = ub_probe_done; | ||
1819 | cmd->back = &compl; | ||
1820 | |||
1821 | spin_lock_irqsave(sc->lock, flags); | ||
1822 | cmd->tag = sc->tagcnt++; | ||
1823 | |||
1824 | rc = ub_submit_scsi(sc, cmd); | ||
1825 | spin_unlock_irqrestore(sc->lock, flags); | ||
1826 | |||
1827 | if (rc != 0) | ||
1828 | goto err_submit; | ||
1829 | |||
1830 | wait_for_completion(&compl); | ||
1831 | |||
1832 | rc = cmd->error; | ||
1833 | |||
1834 | if (rc == -EIO && cmd->key != 0) /* Retries for benh's key */ | ||
1835 | rc = cmd->key; | ||
1836 | |||
1837 | err_submit: | ||
1838 | kfree(cmd); | ||
1839 | err_alloc: | ||
1840 | return rc; | ||
1841 | } | ||
1842 | |||
1843 | /* | ||
1844 | * Read the SCSI capacity synchronously (for probing). | ||
1845 | */ | ||
1846 | static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun, | ||
1847 | struct ub_capacity *ret) | ||
1848 | { | ||
1849 | struct ub_scsi_cmd *cmd; | ||
1850 | struct scatterlist *sg; | ||
1851 | char *p; | ||
1852 | enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) + 8 }; | ||
1853 | unsigned long flags; | ||
1854 | unsigned int bsize, shift; | ||
1855 | unsigned long nsec; | ||
1856 | struct completion compl; | ||
1857 | int rc; | ||
1858 | |||
1859 | init_completion(&compl); | ||
1860 | |||
1861 | rc = -ENOMEM; | ||
1862 | if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL) | ||
1863 | goto err_alloc; | ||
1864 | p = (char *)cmd + sizeof(struct ub_scsi_cmd); | ||
1865 | |||
1866 | cmd->cdb[0] = 0x25; | ||
1867 | cmd->cdb_len = 10; | ||
1868 | cmd->dir = UB_DIR_READ; | ||
1869 | cmd->state = UB_CMDST_INIT; | ||
1870 | cmd->nsg = 1; | ||
1871 | sg = &cmd->sgv[0]; | ||
1872 | sg_init_table(sg, UB_MAX_REQ_SG); | ||
1873 | sg_set_page(sg, virt_to_page(p), 8, (unsigned long)p & (PAGE_SIZE-1)); | ||
1874 | cmd->len = 8; | ||
1875 | cmd->lun = lun; | ||
1876 | cmd->done = ub_probe_done; | ||
1877 | cmd->back = &compl; | ||
1878 | |||
1879 | spin_lock_irqsave(sc->lock, flags); | ||
1880 | cmd->tag = sc->tagcnt++; | ||
1881 | |||
1882 | rc = ub_submit_scsi(sc, cmd); | ||
1883 | spin_unlock_irqrestore(sc->lock, flags); | ||
1884 | |||
1885 | if (rc != 0) | ||
1886 | goto err_submit; | ||
1887 | |||
1888 | wait_for_completion(&compl); | ||
1889 | |||
1890 | if (cmd->error != 0) { | ||
1891 | rc = -EIO; | ||
1892 | goto err_read; | ||
1893 | } | ||
1894 | if (cmd->act_len != 8) { | ||
1895 | rc = -EIO; | ||
1896 | goto err_read; | ||
1897 | } | ||
1898 | |||
1899 | /* sd.c special-cases sector size of 0 to mean 512. Needed? Safe? */ | ||
1900 | nsec = be32_to_cpu(*(__be32 *)p) + 1; | ||
1901 | bsize = be32_to_cpu(*(__be32 *)(p + 4)); | ||
1902 | switch (bsize) { | ||
1903 | case 512: shift = 0; break; | ||
1904 | case 1024: shift = 1; break; | ||
1905 | case 2048: shift = 2; break; | ||
1906 | case 4096: shift = 3; break; | ||
1907 | default: | ||
1908 | rc = -EDOM; | ||
1909 | goto err_inv_bsize; | ||
1910 | } | ||
1911 | |||
1912 | ret->bsize = bsize; | ||
1913 | ret->bshift = shift; | ||
1914 | ret->nsec = nsec << shift; | ||
1915 | rc = 0; | ||
1916 | |||
1917 | err_inv_bsize: | ||
1918 | err_read: | ||
1919 | err_submit: | ||
1920 | kfree(cmd); | ||
1921 | err_alloc: | ||
1922 | return rc; | ||
1923 | } | ||
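To make the unit conversion above concrete: READ CAPACITY(10) returns two big-endian 32-bit fields, the LBA of the last block and the block size in bytes, while ub stores capacity in 512-byte sectors. A small illustrative helper under those assumptions (hypothetical, not part of the driver):

/* Example: last_lba = 262143, bsize = 2048
 *   -> 262144 device blocks, shift = 2 (2048 == 512 << 2)
 *   -> 262144 << 2 = 1048576 sectors of 512 bytes (512 MiB).
 */
static u64 ub_cap_to_sectors(u32 last_lba, u32 bsize)
{
	u64 nblocks = (u64)last_lba + 1;	   /* field is the last LBA, not a count */
	unsigned int shift = ffs(bsize / 512) - 1; /* 512->0, 1024->1, 2048->2, 4096->3 */

	return nblocks << shift;
}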
1924 | |||
1925 | /* | ||
1926 | */ | ||
1927 | static void ub_probe_urb_complete(struct urb *urb) | ||
1928 | { | ||
1929 | struct completion *cop = urb->context; | ||
1930 | complete(cop); | ||
1931 | } | ||
1932 | |||
1933 | static void ub_probe_timeout(unsigned long arg) | ||
1934 | { | ||
1935 | struct completion *cop = (struct completion *) arg; | ||
1936 | complete(cop); | ||
1937 | } | ||
1938 | |||
1939 | /* | ||
1940 | * Reset with a Bulk reset. | ||
1941 | */ | ||
1942 | static int ub_sync_reset(struct ub_dev *sc) | ||
1943 | { | ||
1944 | int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber; | ||
1945 | struct usb_ctrlrequest *cr; | ||
1946 | struct completion compl; | ||
1947 | struct timer_list timer; | ||
1948 | int rc; | ||
1949 | |||
1950 | init_completion(&compl); | ||
1951 | |||
1952 | cr = &sc->work_cr; | ||
1953 | cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE; | ||
1954 | cr->bRequest = US_BULK_RESET_REQUEST; | ||
1955 | cr->wValue = cpu_to_le16(0); | ||
1956 | cr->wIndex = cpu_to_le16(ifnum); | ||
1957 | cr->wLength = cpu_to_le16(0); | ||
1958 | |||
1959 | usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe, | ||
1960 | (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl); | ||
1961 | |||
1962 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) { | ||
1963 | printk(KERN_WARNING | ||
1964 | "%s: Unable to submit a bulk reset (%d)\n", sc->name, rc); | ||
1965 | return rc; | ||
1966 | } | ||
1967 | |||
1968 | init_timer(&timer); | ||
1969 | timer.function = ub_probe_timeout; | ||
1970 | timer.data = (unsigned long) &compl; | ||
1971 | timer.expires = jiffies + UB_CTRL_TIMEOUT; | ||
1972 | add_timer(&timer); | ||
1973 | |||
1974 | wait_for_completion(&compl); | ||
1975 | |||
1976 | del_timer_sync(&timer); | ||
1977 | usb_kill_urb(&sc->work_urb); | ||
1978 | |||
1979 | return sc->work_urb.status; | ||
1980 | } | ||
1981 | |||
1982 | /* | ||
1983 | * Get the number of LUNs by way of the Bulk GetMaxLUN command. | ||
1984 | */ | ||
1985 | static int ub_sync_getmaxlun(struct ub_dev *sc) | ||
1986 | { | ||
1987 | int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber; | ||
1988 | unsigned char *p; | ||
1989 | enum { ALLOC_SIZE = 1 }; | ||
1990 | struct usb_ctrlrequest *cr; | ||
1991 | struct completion compl; | ||
1992 | struct timer_list timer; | ||
1993 | int nluns; | ||
1994 | int rc; | ||
1995 | |||
1996 | init_completion(&compl); | ||
1997 | |||
1998 | rc = -ENOMEM; | ||
1999 | if ((p = kmalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL) | ||
2000 | goto err_alloc; | ||
2001 | *p = 55; | ||
2002 | |||
2003 | cr = &sc->work_cr; | ||
2004 | cr->bRequestType = USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE; | ||
2005 | cr->bRequest = US_BULK_GET_MAX_LUN; | ||
2006 | cr->wValue = cpu_to_le16(0); | ||
2007 | cr->wIndex = cpu_to_le16(ifnum); | ||
2008 | cr->wLength = cpu_to_le16(1); | ||
2009 | |||
2010 | usb_fill_control_urb(&sc->work_urb, sc->dev, sc->recv_ctrl_pipe, | ||
2011 | (unsigned char*) cr, p, 1, ub_probe_urb_complete, &compl); | ||
2012 | |||
2013 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) | ||
2014 | goto err_submit; | ||
2015 | |||
2016 | init_timer(&timer); | ||
2017 | timer.function = ub_probe_timeout; | ||
2018 | timer.data = (unsigned long) &compl; | ||
2019 | timer.expires = jiffies + UB_CTRL_TIMEOUT; | ||
2020 | add_timer(&timer); | ||
2021 | |||
2022 | wait_for_completion(&compl); | ||
2023 | |||
2024 | del_timer_sync(&timer); | ||
2025 | usb_kill_urb(&sc->work_urb); | ||
2026 | |||
2027 | if ((rc = sc->work_urb.status) < 0) | ||
2028 | goto err_io; | ||
2029 | |||
2030 | if (sc->work_urb.actual_length != 1) { | ||
2031 | nluns = 0; | ||
2032 | } else { | ||
2033 | if ((nluns = *p) == 55) { | ||
2034 | nluns = 0; | ||
2035 | } else { | ||
2036 | /* GetMaxLUN returns the maximum LUN number */ | ||
2037 | nluns += 1; | ||
2038 | if (nluns > UB_MAX_LUNS) | ||
2039 | nluns = UB_MAX_LUNS; | ||
2040 | } | ||
2041 | } | ||
2042 | |||
2043 | kfree(p); | ||
2044 | return nluns; | ||
2045 | |||
2046 | err_io: | ||
2047 | err_submit: | ||
2048 | kfree(p); | ||
2049 | err_alloc: | ||
2050 | return rc; | ||
2051 | } | ||
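For clarity, Get Max LUN reports the highest LUN index rather than a count, which is why the code above adds one and treats a stall or short reply as a single LUN. A trivial illustration of that mapping (hypothetical helper, same UB_MAX_LUNS cap as above):

/* Illustrative only: map a Get Max LUN reply to a LUN count. */
static int ub_maxlun_to_nluns(int reply)
{
	int nluns = reply + 1;		/* a reply of 0 means exactly one LUN */

	return nluns > UB_MAX_LUNS ? UB_MAX_LUNS : nluns;
}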
2052 | |||
2053 | /* | ||
2054 | * Clear initial stalls. | ||
2055 | */ | ||
2056 | static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe) | ||
2057 | { | ||
2058 | int endp; | ||
2059 | struct usb_ctrlrequest *cr; | ||
2060 | struct completion compl; | ||
2061 | struct timer_list timer; | ||
2062 | int rc; | ||
2063 | |||
2064 | init_completion(&compl); | ||
2065 | |||
2066 | endp = usb_pipeendpoint(stalled_pipe); | ||
2067 | if (usb_pipein (stalled_pipe)) | ||
2068 | endp |= USB_DIR_IN; | ||
2069 | |||
2070 | cr = &sc->work_cr; | ||
2071 | cr->bRequestType = USB_RECIP_ENDPOINT; | ||
2072 | cr->bRequest = USB_REQ_CLEAR_FEATURE; | ||
2073 | cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT); | ||
2074 | cr->wIndex = cpu_to_le16(endp); | ||
2075 | cr->wLength = cpu_to_le16(0); | ||
2076 | |||
2077 | usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe, | ||
2078 | (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl); | ||
2079 | |||
2080 | if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) { | ||
2081 | printk(KERN_WARNING | ||
2082 | "%s: Unable to submit a probe clear (%d)\n", sc->name, rc); | ||
2083 | return rc; | ||
2084 | } | ||
2085 | |||
2086 | init_timer(&timer); | ||
2087 | timer.function = ub_probe_timeout; | ||
2088 | timer.data = (unsigned long) &compl; | ||
2089 | timer.expires = jiffies + UB_CTRL_TIMEOUT; | ||
2090 | add_timer(&timer); | ||
2091 | |||
2092 | wait_for_completion(&compl); | ||
2093 | |||
2094 | del_timer_sync(&timer); | ||
2095 | usb_kill_urb(&sc->work_urb); | ||
2096 | |||
2097 | usb_reset_endpoint(sc->dev, endp); | ||
2098 | |||
2099 | return 0; | ||
2100 | } | ||
2101 | |||
2102 | /* | ||
2103 | * Get the pipe settings. | ||
2104 | */ | ||
2105 | static int ub_get_pipes(struct ub_dev *sc, struct usb_device *dev, | ||
2106 | struct usb_interface *intf) | ||
2107 | { | ||
2108 | struct usb_host_interface *altsetting = intf->cur_altsetting; | ||
2109 | struct usb_endpoint_descriptor *ep_in = NULL; | ||
2110 | struct usb_endpoint_descriptor *ep_out = NULL; | ||
2111 | struct usb_endpoint_descriptor *ep; | ||
2112 | int i; | ||
2113 | |||
2114 | /* | ||
2115 | * Find the endpoints we need. | ||
2116 | * We are expecting a minimum of 2 endpoints - in and out (bulk). | ||
2117 | * We will ignore any others. | ||
2118 | */ | ||
2119 | for (i = 0; i < altsetting->desc.bNumEndpoints; i++) { | ||
2120 | ep = &altsetting->endpoint[i].desc; | ||
2121 | |||
2122 | /* Is it a BULK endpoint? */ | ||
2123 | if (usb_endpoint_xfer_bulk(ep)) { | ||
2124 | /* BULK in or out? */ | ||
2125 | if (usb_endpoint_dir_in(ep)) { | ||
2126 | if (ep_in == NULL) | ||
2127 | ep_in = ep; | ||
2128 | } else { | ||
2129 | if (ep_out == NULL) | ||
2130 | ep_out = ep; | ||
2131 | } | ||
2132 | } | ||
2133 | } | ||
2134 | |||
2135 | if (ep_in == NULL || ep_out == NULL) { | ||
2136 | printk(KERN_NOTICE "%s: failed endpoint check\n", sc->name); | ||
2137 | return -ENODEV; | ||
2138 | } | ||
2139 | |||
2140 | /* Calculate and store the pipe values */ | ||
2141 | sc->send_ctrl_pipe = usb_sndctrlpipe(dev, 0); | ||
2142 | sc->recv_ctrl_pipe = usb_rcvctrlpipe(dev, 0); | ||
2143 | sc->send_bulk_pipe = usb_sndbulkpipe(dev, | ||
2144 | usb_endpoint_num(ep_out)); | ||
2145 | sc->recv_bulk_pipe = usb_rcvbulkpipe(dev, | ||
2146 | usb_endpoint_num(ep_in)); | ||
2147 | |||
2148 | return 0; | ||
2149 | } | ||
2150 | |||
2151 | /* | ||
2152 | * Probing is done in process context, which allows us to cheat | ||
2153 | * and not build a state machine for the discovery. | ||
2154 | */ | ||
2155 | static int ub_probe(struct usb_interface *intf, | ||
2156 | const struct usb_device_id *dev_id) | ||
2157 | { | ||
2158 | struct ub_dev *sc; | ||
2159 | int nluns; | ||
2160 | int rc; | ||
2161 | int i; | ||
2162 | |||
2163 | if (usb_usual_check_type(dev_id, USB_US_TYPE_UB)) | ||
2164 | return -ENXIO; | ||
2165 | |||
2166 | rc = -ENOMEM; | ||
2167 | if ((sc = kzalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL) | ||
2168 | goto err_core; | ||
2169 | sc->lock = ub_next_lock(); | ||
2170 | INIT_LIST_HEAD(&sc->luns); | ||
2171 | usb_init_urb(&sc->work_urb); | ||
2172 | tasklet_init(&sc->tasklet, ub_scsi_action, (unsigned long)sc); | ||
2173 | atomic_set(&sc->poison, 0); | ||
2174 | INIT_WORK(&sc->reset_work, ub_reset_task); | ||
2175 | init_waitqueue_head(&sc->reset_wait); | ||
2176 | |||
2177 | init_timer(&sc->work_timer); | ||
2178 | sc->work_timer.data = (unsigned long) sc; | ||
2179 | sc->work_timer.function = ub_urb_timeout; | ||
2180 | |||
2181 | ub_init_completion(&sc->work_done); | ||
2182 | sc->work_done.done = 1; /* A little yuk, but oh well... */ | ||
2183 | |||
2184 | sc->dev = interface_to_usbdev(intf); | ||
2185 | sc->intf = intf; | ||
2186 | // sc->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; | ||
2187 | usb_set_intfdata(intf, sc); | ||
2188 | usb_get_dev(sc->dev); | ||
2189 | /* | ||
2190 | * Since we give the interface struct to the block level through | ||
2191 | * disk->driverfs_dev, we have to pin it. Otherwise, block_uevent | ||
2192 | * oopses on close after a disconnect (kernels 2.6.16 and up). | ||
2193 | */ | ||
2194 | usb_get_intf(sc->intf); | ||
2195 | |||
2196 | snprintf(sc->name, 12, DRV_NAME "(%d.%d)", | ||
2197 | sc->dev->bus->busnum, sc->dev->devnum); | ||
2198 | |||
2199 | /* XXX Verify that we can handle the device (from descriptors) */ | ||
2200 | |||
2201 | if (ub_get_pipes(sc, sc->dev, intf) != 0) | ||
2202 | goto err_dev_desc; | ||
2203 | |||
2204 | /* | ||
2205 | * At this point, all USB initialization is done; do the upper layer. | ||
2206 | * We really hate halfway initialized structures, so from the | ||
2207 | * invariants perspective, this ub_dev is fully constructed at | ||
2208 | * this point. | ||
2209 | */ | ||
2210 | |||
2211 | /* | ||
2212 | * This is needed to clear toggles. It is a problem only if we do | ||
2213 | * `rmmod ub && modprobe ub` without disconnects, but we like that. | ||
2214 | */ | ||
2215 | #if 0 /* iPod Mini fails if we do this (big white iPod works) */ | ||
2216 | ub_probe_clear_stall(sc, sc->recv_bulk_pipe); | ||
2217 | ub_probe_clear_stall(sc, sc->send_bulk_pipe); | ||
2218 | #endif | ||
2219 | |||
2220 | /* | ||
2221 | * The way this is used by the startup code is a little specific. | ||
2222 | * A SCSI check causes a USB stall. Our common case code sees it | ||
2223 | * and clears the check, after which the device is ready for use. | ||
2224 | * But if a check was not present, any command other than | ||
2225 | * TEST_UNIT_READY ends with a lockup (including REQUEST_SENSE). | ||
2226 | * | ||
2227 | * If we neglect to clear the SCSI check, the first real command fails | ||
2228 | * (which is the capacity readout). We clear that and retry, but why | ||
2229 | * cause spurious retries for no reason? | ||
2230 | * | ||
2231 | * Revalidation may start with its own TEST_UNIT_READY, but that one | ||
2232 | * has to succeed, so we clear checks with an additional one here. | ||
2233 | * In any case, it's not our business how revalidation is implemented. | ||
2234 | */ | ||
2235 | for (i = 0; i < 3; i++) { /* Retries for the schwag key from KS'04 */ | ||
2236 | if ((rc = ub_sync_tur(sc, NULL)) <= 0) break; | ||
2237 | if (rc != 0x6) break; | ||
2238 | msleep(10); | ||
2239 | } | ||
2240 | |||
2241 | nluns = 1; | ||
2242 | for (i = 0; i < 3; i++) { | ||
2243 | if ((rc = ub_sync_getmaxlun(sc)) < 0) | ||
2244 | break; | ||
2245 | if (rc != 0) { | ||
2246 | nluns = rc; | ||
2247 | break; | ||
2248 | } | ||
2249 | msleep(100); | ||
2250 | } | ||
2251 | |||
2252 | for (i = 0; i < nluns; i++) { | ||
2253 | ub_probe_lun(sc, i); | ||
2254 | } | ||
2255 | return 0; | ||
2256 | |||
2257 | err_dev_desc: | ||
2258 | usb_set_intfdata(intf, NULL); | ||
2259 | usb_put_intf(sc->intf); | ||
2260 | usb_put_dev(sc->dev); | ||
2261 | kfree(sc); | ||
2262 | err_core: | ||
2263 | return rc; | ||
2264 | } | ||
2265 | |||
2266 | static int ub_probe_lun(struct ub_dev *sc, int lnum) | ||
2267 | { | ||
2268 | struct ub_lun *lun; | ||
2269 | struct request_queue *q; | ||
2270 | struct gendisk *disk; | ||
2271 | int rc; | ||
2272 | |||
2273 | rc = -ENOMEM; | ||
2274 | if ((lun = kzalloc(sizeof(struct ub_lun), GFP_KERNEL)) == NULL) | ||
2275 | goto err_alloc; | ||
2276 | lun->num = lnum; | ||
2277 | |||
2278 | rc = -ENOSR; | ||
2279 | if ((lun->id = ub_id_get()) == -1) | ||
2280 | goto err_id; | ||
2281 | |||
2282 | lun->udev = sc; | ||
2283 | |||
2284 | snprintf(lun->name, 16, DRV_NAME "%c(%d.%d.%d)", | ||
2285 | lun->id + 'a', sc->dev->bus->busnum, sc->dev->devnum, lun->num); | ||
2286 | |||
2287 | lun->removable = 1; /* XXX Query this from the device */ | ||
2288 | lun->changed = 1; /* ub_revalidate clears only */ | ||
2289 | ub_revalidate(sc, lun); | ||
2290 | |||
2291 | rc = -ENOMEM; | ||
2292 | if ((disk = alloc_disk(UB_PARTS_PER_LUN)) == NULL) | ||
2293 | goto err_diskalloc; | ||
2294 | |||
2295 | sprintf(disk->disk_name, DRV_NAME "%c", lun->id + 'a'); | ||
2296 | disk->major = UB_MAJOR; | ||
2297 | disk->first_minor = lun->id * UB_PARTS_PER_LUN; | ||
2298 | disk->fops = &ub_bd_fops; | ||
2299 | disk->private_data = lun; | ||
2300 | disk->driverfs_dev = &sc->intf->dev; | ||
2301 | |||
2302 | rc = -ENOMEM; | ||
2303 | if ((q = blk_init_queue(ub_request_fn, sc->lock)) == NULL) | ||
2304 | goto err_blkqinit; | ||
2305 | |||
2306 | disk->queue = q; | ||
2307 | |||
2308 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); | ||
2309 | blk_queue_max_segments(q, UB_MAX_REQ_SG); | ||
2310 | blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ | ||
2311 | blk_queue_max_hw_sectors(q, UB_MAX_SECTORS); | ||
2312 | blk_queue_logical_block_size(q, lun->capacity.bsize); | ||
2313 | |||
2314 | lun->disk = disk; | ||
2315 | q->queuedata = lun; | ||
2316 | list_add(&lun->link, &sc->luns); | ||
2317 | |||
2318 | set_capacity(disk, lun->capacity.nsec); | ||
2319 | if (lun->removable) | ||
2320 | disk->flags |= GENHD_FL_REMOVABLE; | ||
2321 | |||
2322 | add_disk(disk); | ||
2323 | |||
2324 | return 0; | ||
2325 | |||
2326 | err_blkqinit: | ||
2327 | put_disk(disk); | ||
2328 | err_diskalloc: | ||
2329 | ub_id_put(lun->id); | ||
2330 | err_id: | ||
2331 | kfree(lun); | ||
2332 | err_alloc: | ||
2333 | return rc; | ||
2334 | } | ||
2335 | |||
2336 | static void ub_disconnect(struct usb_interface *intf) | ||
2337 | { | ||
2338 | struct ub_dev *sc = usb_get_intfdata(intf); | ||
2339 | struct ub_lun *lun; | ||
2340 | unsigned long flags; | ||
2341 | |||
2342 | /* | ||
2343 | * Prevent ub_bd_release from pulling the rug from under us. | ||
2344 | * XXX This is starting to look like a kref. | ||
2345 | * XXX Why not take this ref at probe time? | ||
2346 | */ | ||
2347 | spin_lock_irqsave(&ub_lock, flags); | ||
2348 | sc->openc++; | ||
2349 | spin_unlock_irqrestore(&ub_lock, flags); | ||
2350 | |||
2351 | /* | ||
2352 | * Fence stall clearings, operations triggered by unlinkings and so on. | ||
2353 | * We do not attempt to unlink any URBs, because we do not trust the | ||
2354 | * unlink paths in HC drivers. Also, we get -84 upon disconnect anyway. | ||
2355 | */ | ||
2356 | atomic_set(&sc->poison, 1); | ||
2357 | |||
2358 | /* | ||
2359 | * Wait for reset to end, if any. | ||
2360 | */ | ||
2361 | wait_event(sc->reset_wait, !sc->reset); | ||
2362 | |||
2363 | /* | ||
2364 | * Blow away queued commands. | ||
2365 | * | ||
2366 | * Actually, this never works, because before we get here | ||
2367 | * the HCD terminates outstanding URB(s). It causes our | ||
2368 | * SCSI command queue to advance, commands fail to submit, | ||
2369 | * and the whole queue drains. So, we just use this code to | ||
2370 | * print warnings. | ||
2371 | */ | ||
2372 | spin_lock_irqsave(sc->lock, flags); | ||
2373 | { | ||
2374 | struct ub_scsi_cmd *cmd; | ||
2375 | int cnt = 0; | ||
2376 | while ((cmd = ub_cmdq_peek(sc)) != NULL) { | ||
2377 | cmd->error = -ENOTCONN; | ||
2378 | cmd->state = UB_CMDST_DONE; | ||
2379 | ub_cmdq_pop(sc); | ||
2380 | (*cmd->done)(sc, cmd); | ||
2381 | cnt++; | ||
2382 | } | ||
2383 | if (cnt != 0) { | ||
2384 | printk(KERN_WARNING "%s: " | ||
2385 | "%d was queued after shutdown\n", sc->name, cnt); | ||
2386 | } | ||
2387 | } | ||
2388 | spin_unlock_irqrestore(sc->lock, flags); | ||
2389 | |||
2390 | /* | ||
2391 | * Unregister the upper layer. | ||
2392 | */ | ||
2393 | list_for_each_entry(lun, &sc->luns, link) { | ||
2394 | del_gendisk(lun->disk); | ||
2395 | /* | ||
2396 | * I wish I could do: | ||
2397 | * queue_flag_set(QUEUE_FLAG_DEAD, q); | ||
2398 | * As it is, we rely on our internal poisoning and let | ||
2399 | * the upper levels spin furiously, failing all the I/O. | ||
2400 | */ | ||
2401 | } | ||
2402 | |||
2403 | /* | ||
2404 | * Testing for -EINPROGRESS is always a bug, so we are bending | ||
2405 | * the rules a little. | ||
2406 | */ | ||
2407 | spin_lock_irqsave(sc->lock, flags); | ||
2408 | if (sc->work_urb.status == -EINPROGRESS) { /* janitors: ignore */ | ||
2409 | printk(KERN_WARNING "%s: " | ||
2410 | "URB is active after disconnect\n", sc->name); | ||
2411 | } | ||
2412 | spin_unlock_irqrestore(sc->lock, flags); | ||
2413 | |||
2414 | /* | ||
2415 | * There is virtually no chance that another CPU runs a timeout so long | ||
2416 | * after ub_urb_complete should have called del_timer, but only if the HCD | ||
2417 | * didn't forget to deliver a callback on unlink. | ||
2418 | */ | ||
2419 | del_timer_sync(&sc->work_timer); | ||
2420 | |||
2421 | /* | ||
2422 | * At this point there must be no commands coming from anyone | ||
2423 | * and no URBs left in transit. | ||
2424 | */ | ||
2425 | |||
2426 | ub_put(sc); | ||
2427 | } | ||
2428 | |||
2429 | static struct usb_driver ub_driver = { | ||
2430 | .name = "ub", | ||
2431 | .probe = ub_probe, | ||
2432 | .disconnect = ub_disconnect, | ||
2433 | .id_table = ub_usb_ids, | ||
2434 | .pre_reset = ub_pre_reset, | ||
2435 | .post_reset = ub_post_reset, | ||
2436 | }; | ||
2437 | |||
2438 | static int __init ub_init(void) | ||
2439 | { | ||
2440 | int rc; | ||
2441 | int i; | ||
2442 | |||
2443 | pr_info("'Low Performance USB Block' driver is deprecated. " | ||
2444 | "Please switch to usb-storage\n"); | ||
2445 | for (i = 0; i < UB_QLOCK_NUM; i++) | ||
2446 | spin_lock_init(&ub_qlockv[i]); | ||
2447 | |||
2448 | if ((rc = register_blkdev(UB_MAJOR, DRV_NAME)) != 0) | ||
2449 | goto err_regblkdev; | ||
2450 | |||
2451 | if ((rc = usb_register(&ub_driver)) != 0) | ||
2452 | goto err_register; | ||
2453 | |||
2454 | usb_usual_set_present(USB_US_TYPE_UB); | ||
2455 | return 0; | ||
2456 | |||
2457 | err_register: | ||
2458 | unregister_blkdev(UB_MAJOR, DRV_NAME); | ||
2459 | err_regblkdev: | ||
2460 | return rc; | ||
2461 | } | ||
2462 | |||
2463 | static void __exit ub_exit(void) | ||
2464 | { | ||
2465 | usb_deregister(&ub_driver); | ||
2466 | |||
2467 | unregister_blkdev(UB_MAJOR, DRV_NAME); | ||
2468 | usb_usual_clear_present(USB_US_TYPE_UB); | ||
2469 | } | ||
2470 | |||
2471 | module_init(ub_init); | ||
2472 | module_exit(ub_exit); | ||
2473 | |||
2474 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c0bbeb470754..0bdde8fba397 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c | |||
@@ -14,6 +14,9 @@ | |||
14 | 14 | ||
15 | #define PART_BITS 4 | 15 | #define PART_BITS 4 |
16 | 16 | ||
17 | static bool use_bio; | ||
18 | module_param(use_bio, bool, S_IRUGO); | ||
19 | |||
17 | static int major; | 20 | static int major; |
18 | static DEFINE_IDA(vd_index_ida); | 21 | static DEFINE_IDA(vd_index_ida); |
19 | 22 | ||
@@ -23,6 +26,7 @@ struct virtio_blk | |||
23 | { | 26 | { |
24 | struct virtio_device *vdev; | 27 | struct virtio_device *vdev; |
25 | struct virtqueue *vq; | 28 | struct virtqueue *vq; |
29 | wait_queue_head_t queue_wait; | ||
26 | 30 | ||
27 | /* The disk structure for the kernel. */ | 31 | /* The disk structure for the kernel. */ |
28 | struct gendisk *disk; | 32 | struct gendisk *disk; |
@@ -51,53 +55,244 @@ struct virtio_blk | |||
51 | struct virtblk_req | 55 | struct virtblk_req |
52 | { | 56 | { |
53 | struct request *req; | 57 | struct request *req; |
58 | struct bio *bio; | ||
54 | struct virtio_blk_outhdr out_hdr; | 59 | struct virtio_blk_outhdr out_hdr; |
55 | struct virtio_scsi_inhdr in_hdr; | 60 | struct virtio_scsi_inhdr in_hdr; |
61 | struct work_struct work; | ||
62 | struct virtio_blk *vblk; | ||
63 | int flags; | ||
56 | u8 status; | 64 | u8 status; |
65 | struct scatterlist sg[]; | ||
66 | }; | ||
67 | |||
68 | enum { | ||
69 | VBLK_IS_FLUSH = 1, | ||
70 | VBLK_REQ_FLUSH = 2, | ||
71 | VBLK_REQ_DATA = 4, | ||
72 | VBLK_REQ_FUA = 8, | ||
57 | }; | 73 | }; |
58 | 74 | ||
59 | static void blk_done(struct virtqueue *vq) | 75 | static inline int virtblk_result(struct virtblk_req *vbr) |
76 | { | ||
77 | switch (vbr->status) { | ||
78 | case VIRTIO_BLK_S_OK: | ||
79 | return 0; | ||
80 | case VIRTIO_BLK_S_UNSUPP: | ||
81 | return -ENOTTY; | ||
82 | default: | ||
83 | return -EIO; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, | ||
88 | gfp_t gfp_mask) | ||
60 | { | 89 | { |
61 | struct virtio_blk *vblk = vq->vdev->priv; | ||
62 | struct virtblk_req *vbr; | 90 | struct virtblk_req *vbr; |
63 | unsigned int len; | ||
64 | unsigned long flags; | ||
65 | 91 | ||
66 | spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); | 92 | vbr = mempool_alloc(vblk->pool, gfp_mask); |
67 | while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { | 93 | if (!vbr) |
68 | int error; | 94 | return NULL; |
69 | 95 | ||
70 | switch (vbr->status) { | 96 | vbr->vblk = vblk; |
71 | case VIRTIO_BLK_S_OK: | 97 | if (use_bio) |
72 | error = 0; | 98 | sg_init_table(vbr->sg, vblk->sg_elems); |
73 | break; | 99 | |
74 | case VIRTIO_BLK_S_UNSUPP: | 100 | return vbr; |
75 | error = -ENOTTY; | 101 | } |
76 | break; | 102 | |
77 | default: | 103 | static void virtblk_add_buf_wait(struct virtio_blk *vblk, |
78 | error = -EIO; | 104 | struct virtblk_req *vbr, |
105 | unsigned long out, | ||
106 | unsigned long in) | ||
107 | { | ||
108 | DEFINE_WAIT(wait); | ||
109 | |||
110 | for (;;) { | ||
111 | prepare_to_wait_exclusive(&vblk->queue_wait, &wait, | ||
112 | TASK_UNINTERRUPTIBLE); | ||
113 | |||
114 | spin_lock_irq(vblk->disk->queue->queue_lock); | ||
115 | if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, | ||
116 | GFP_ATOMIC) < 0) { | ||
117 | spin_unlock_irq(vblk->disk->queue->queue_lock); | ||
118 | io_schedule(); | ||
119 | } else { | ||
120 | virtqueue_kick(vblk->vq); | ||
121 | spin_unlock_irq(vblk->disk->queue->queue_lock); | ||
79 | break; | 122 | break; |
80 | } | 123 | } |
81 | 124 | ||
82 | switch (vbr->req->cmd_type) { | 125 | } |
83 | case REQ_TYPE_BLOCK_PC: | 126 | |
84 | vbr->req->resid_len = vbr->in_hdr.residual; | 127 | finish_wait(&vblk->queue_wait, &wait); |
85 | vbr->req->sense_len = vbr->in_hdr.sense_len; | 128 | } |
86 | vbr->req->errors = vbr->in_hdr.errors; | 129 | |
87 | break; | 130 | static inline void virtblk_add_req(struct virtblk_req *vbr, |
88 | case REQ_TYPE_SPECIAL: | 131 | unsigned int out, unsigned int in) |
89 | vbr->req->errors = (error != 0); | 132 | { |
90 | break; | 133 | struct virtio_blk *vblk = vbr->vblk; |
91 | default: | 134 | |
92 | break; | 135 | spin_lock_irq(vblk->disk->queue->queue_lock); |
136 | if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, | ||
137 | GFP_ATOMIC) < 0)) { | ||
138 | spin_unlock_irq(vblk->disk->queue->queue_lock); | ||
139 | virtblk_add_buf_wait(vblk, vbr, out, in); | ||
140 | return; | ||
141 | } | ||
142 | virtqueue_kick(vblk->vq); | ||
143 | spin_unlock_irq(vblk->disk->queue->queue_lock); | ||
144 | } | ||
145 | |||
146 | static int virtblk_bio_send_flush(struct virtblk_req *vbr) | ||
147 | { | ||
148 | unsigned int out = 0, in = 0; | ||
149 | |||
150 | vbr->flags |= VBLK_IS_FLUSH; | ||
151 | vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; | ||
152 | vbr->out_hdr.sector = 0; | ||
153 | vbr->out_hdr.ioprio = 0; | ||
154 | sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); | ||
155 | sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); | ||
156 | |||
157 | virtblk_add_req(vbr, out, in); | ||
158 | |||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | static int virtblk_bio_send_data(struct virtblk_req *vbr) | ||
163 | { | ||
164 | struct virtio_blk *vblk = vbr->vblk; | ||
165 | unsigned int num, out = 0, in = 0; | ||
166 | struct bio *bio = vbr->bio; | ||
167 | |||
168 | vbr->flags &= ~VBLK_IS_FLUSH; | ||
169 | vbr->out_hdr.type = 0; | ||
170 | vbr->out_hdr.sector = bio->bi_sector; | ||
171 | vbr->out_hdr.ioprio = bio_prio(bio); | ||
172 | |||
173 | sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); | ||
174 | |||
175 | num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); | ||
176 | |||
177 | sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, | ||
178 | sizeof(vbr->status)); | ||
179 | |||
180 | if (num) { | ||
181 | if (bio->bi_rw & REQ_WRITE) { | ||
182 | vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; | ||
183 | out += num; | ||
184 | } else { | ||
185 | vbr->out_hdr.type |= VIRTIO_BLK_T_IN; | ||
186 | in += num; | ||
93 | } | 187 | } |
188 | } | ||
189 | |||
190 | virtblk_add_req(vbr, out, in); | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static void virtblk_bio_send_data_work(struct work_struct *work) | ||
196 | { | ||
197 | struct virtblk_req *vbr; | ||
198 | |||
199 | vbr = container_of(work, struct virtblk_req, work); | ||
200 | |||
201 | virtblk_bio_send_data(vbr); | ||
202 | } | ||
203 | |||
204 | static void virtblk_bio_send_flush_work(struct work_struct *work) | ||
205 | { | ||
206 | struct virtblk_req *vbr; | ||
207 | |||
208 | vbr = container_of(work, struct virtblk_req, work); | ||
209 | |||
210 | virtblk_bio_send_flush(vbr); | ||
211 | } | ||
212 | |||
213 | static inline void virtblk_request_done(struct virtblk_req *vbr) | ||
214 | { | ||
215 | struct virtio_blk *vblk = vbr->vblk; | ||
216 | struct request *req = vbr->req; | ||
217 | int error = virtblk_result(vbr); | ||
218 | |||
219 | if (req->cmd_type == REQ_TYPE_BLOCK_PC) { | ||
220 | req->resid_len = vbr->in_hdr.residual; | ||
221 | req->sense_len = vbr->in_hdr.sense_len; | ||
222 | req->errors = vbr->in_hdr.errors; | ||
223 | } else if (req->cmd_type == REQ_TYPE_SPECIAL) { | ||
224 | req->errors = (error != 0); | ||
225 | } | ||
226 | |||
227 | __blk_end_request_all(req, error); | ||
228 | mempool_free(vbr, vblk->pool); | ||
229 | } | ||
230 | |||
231 | static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) | ||
232 | { | ||
233 | struct virtio_blk *vblk = vbr->vblk; | ||
234 | |||
235 | if (vbr->flags & VBLK_REQ_DATA) { | ||
236 | /* Send out the actual write data */ | ||
237 | INIT_WORK(&vbr->work, virtblk_bio_send_data_work); | ||
238 | queue_work(virtblk_wq, &vbr->work); | ||
239 | } else { | ||
240 | bio_endio(vbr->bio, virtblk_result(vbr)); | ||
241 | mempool_free(vbr, vblk->pool); | ||
242 | } | ||
243 | } | ||
244 | |||
245 | static inline void virtblk_bio_data_done(struct virtblk_req *vbr) | ||
246 | { | ||
247 | struct virtio_blk *vblk = vbr->vblk; | ||
94 | 248 | ||
95 | __blk_end_request_all(vbr->req, error); | 249 | if (unlikely(vbr->flags & VBLK_REQ_FUA)) { |
250 | /* Send out a flush before ending the bio */ | ||
251 | vbr->flags &= ~VBLK_REQ_DATA; | ||
252 | INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); | ||
253 | queue_work(virtblk_wq, &vbr->work); | ||
254 | } else { | ||
255 | bio_endio(vbr->bio, virtblk_result(vbr)); | ||
96 | mempool_free(vbr, vblk->pool); | 256 | mempool_free(vbr, vblk->pool); |
97 | } | 257 | } |
258 | } | ||
259 | |||
260 | static inline void virtblk_bio_done(struct virtblk_req *vbr) | ||
261 | { | ||
262 | if (unlikely(vbr->flags & VBLK_IS_FLUSH)) | ||
263 | virtblk_bio_flush_done(vbr); | ||
264 | else | ||
265 | virtblk_bio_data_done(vbr); | ||
266 | } | ||
267 | |||
268 | static void virtblk_done(struct virtqueue *vq) | ||
269 | { | ||
270 | struct virtio_blk *vblk = vq->vdev->priv; | ||
271 | bool bio_done = false, req_done = false; | ||
272 | struct virtblk_req *vbr; | ||
273 | unsigned long flags; | ||
274 | unsigned int len; | ||
275 | |||
276 | spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); | ||
277 | do { | ||
278 | virtqueue_disable_cb(vq); | ||
279 | while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { | ||
280 | if (vbr->bio) { | ||
281 | virtblk_bio_done(vbr); | ||
282 | bio_done = true; | ||
283 | } else { | ||
284 | virtblk_request_done(vbr); | ||
285 | req_done = true; | ||
286 | } | ||
287 | } | ||
288 | } while (!virtqueue_enable_cb(vq)); | ||
98 | /* In case queue is stopped waiting for more buffers. */ | 289 | /* In case queue is stopped waiting for more buffers. */ |
99 | blk_start_queue(vblk->disk->queue); | 290 | if (req_done) |
291 | blk_start_queue(vblk->disk->queue); | ||
100 | spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); | 292 | spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); |
293 | |||
294 | if (bio_done) | ||
295 | wake_up(&vblk->queue_wait); | ||
101 | } | 296 | } |
102 | 297 | ||
103 | static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | 298 | static bool do_req(struct request_queue *q, struct virtio_blk *vblk, |
@@ -106,13 +301,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | |||
106 | unsigned long num, out = 0, in = 0; | 301 | unsigned long num, out = 0, in = 0; |
107 | struct virtblk_req *vbr; | 302 | struct virtblk_req *vbr; |
108 | 303 | ||
109 | vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); | 304 | vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); |
110 | if (!vbr) | 305 | if (!vbr) |
111 | /* When another request finishes we'll try again. */ | 306 | /* When another request finishes we'll try again. */ |
112 | return false; | 307 | return false; |
113 | 308 | ||
114 | vbr->req = req; | 309 | vbr->req = req; |
115 | 310 | vbr->bio = NULL; | |
116 | if (req->cmd_flags & REQ_FLUSH) { | 311 | if (req->cmd_flags & REQ_FLUSH) { |
117 | vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; | 312 | vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; |
118 | vbr->out_hdr.sector = 0; | 313 | vbr->out_hdr.sector = 0; |
@@ -172,7 +367,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | |||
172 | } | 367 | } |
173 | } | 368 | } |
174 | 369 | ||
175 | if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) { | 370 | if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, |
371 | GFP_ATOMIC) < 0) { | ||
176 | mempool_free(vbr, vblk->pool); | 372 | mempool_free(vbr, vblk->pool); |
177 | return false; | 373 | return false; |
178 | } | 374 | } |
@@ -180,7 +376,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, | |||
180 | return true; | 376 | return true; |
181 | } | 377 | } |
182 | 378 | ||
183 | static void do_virtblk_request(struct request_queue *q) | 379 | static void virtblk_request(struct request_queue *q) |
184 | { | 380 | { |
185 | struct virtio_blk *vblk = q->queuedata; | 381 | struct virtio_blk *vblk = q->queuedata; |
186 | struct request *req; | 382 | struct request *req; |
@@ -203,6 +399,34 @@ static void do_virtblk_request(struct request_queue *q) | |||
203 | virtqueue_kick(vblk->vq); | 399 | virtqueue_kick(vblk->vq); |
204 | } | 400 | } |
205 | 401 | ||
402 | static void virtblk_make_request(struct request_queue *q, struct bio *bio) | ||
403 | { | ||
404 | struct virtio_blk *vblk = q->queuedata; | ||
405 | struct virtblk_req *vbr; | ||
406 | |||
407 | BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); | ||
408 | |||
409 | vbr = virtblk_alloc_req(vblk, GFP_NOIO); | ||
410 | if (!vbr) { | ||
411 | bio_endio(bio, -ENOMEM); | ||
412 | return; | ||
413 | } | ||
414 | |||
415 | vbr->bio = bio; | ||
416 | vbr->flags = 0; | ||
417 | if (bio->bi_rw & REQ_FLUSH) | ||
418 | vbr->flags |= VBLK_REQ_FLUSH; | ||
419 | if (bio->bi_rw & REQ_FUA) | ||
420 | vbr->flags |= VBLK_REQ_FUA; | ||
421 | if (bio->bi_size) | ||
422 | vbr->flags |= VBLK_REQ_DATA; | ||
423 | |||
424 | if (unlikely(vbr->flags & VBLK_REQ_FLUSH)) | ||
425 | virtblk_bio_send_flush(vbr); | ||
426 | else | ||
427 | virtblk_bio_send_data(vbr); | ||
428 | } | ||
429 | |||
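Two details of virtblk_make_request() deserve a note. The BUG_ON reserves two scatterlist slots beyond the bio's physical segments, presumably for the request header and the status byte that bracket the data of a virtio-blk request. The VBLK_REQ_* flags then record what the bio asked for; one ordering consistent with those flags (the driver chains these steps from its completion callbacks rather than issuing them back to back) is:

    /*
     * Hedged sketch of the implied ordering, not the driver's code:
     *
     *   VBLK_REQ_FLUSH -> issue VIRTIO_BLK_T_FLUSH first (preflush)
     *   VBLK_REQ_DATA  -> then queue the bio's data segments
     *   VBLK_REQ_FUA   -> then issue a second VIRTIO_BLK_T_FLUSH so the data
     *                     itself is durable before bio_endio() completes the bio
     */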
206 | /* return id (s/n) string for *disk to *id_str | 430 | /* return id (s/n) string for *disk to *id_str |
207 | */ | 431 | */ |
208 | static int virtblk_get_id(struct gendisk *disk, char *id_str) | 432 | static int virtblk_get_id(struct gendisk *disk, char *id_str) |
@@ -360,7 +584,7 @@ static int init_vq(struct virtio_blk *vblk) | |||
360 | int err = 0; | 584 | int err = 0; |
361 | 585 | ||
362 | /* We expect one virtqueue, for output. */ | 586 | /* We expect one virtqueue, for output. */ |
363 | vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests"); | 587 | vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests"); |
364 | if (IS_ERR(vblk->vq)) | 588 | if (IS_ERR(vblk->vq)) |
365 | err = PTR_ERR(vblk->vq); | 589 | err = PTR_ERR(vblk->vq); |
366 | 590 | ||
@@ -477,6 +701,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
477 | struct virtio_blk *vblk; | 701 | struct virtio_blk *vblk; |
478 | struct request_queue *q; | 702 | struct request_queue *q; |
479 | int err, index; | 703 | int err, index; |
704 | int pool_size; | ||
705 | |||
480 | u64 cap; | 706 | u64 cap; |
481 | u32 v, blk_size, sg_elems, opt_io_size; | 707 | u32 v, blk_size, sg_elems, opt_io_size; |
482 | u16 min_io_size; | 708 | u16 min_io_size; |
@@ -506,10 +732,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
506 | goto out_free_index; | 732 | goto out_free_index; |
507 | } | 733 | } |
508 | 734 | ||
735 | init_waitqueue_head(&vblk->queue_wait); | ||
509 | vblk->vdev = vdev; | 736 | vblk->vdev = vdev; |
510 | vblk->sg_elems = sg_elems; | 737 | vblk->sg_elems = sg_elems; |
511 | sg_init_table(vblk->sg, vblk->sg_elems); | 738 | sg_init_table(vblk->sg, vblk->sg_elems); |
512 | mutex_init(&vblk->config_lock); | 739 | mutex_init(&vblk->config_lock); |
740 | |||
513 | INIT_WORK(&vblk->config_work, virtblk_config_changed_work); | 741 | INIT_WORK(&vblk->config_work, virtblk_config_changed_work); |
514 | vblk->config_enable = true; | 742 | vblk->config_enable = true; |
515 | 743 | ||
@@ -517,7 +745,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
517 | if (err) | 745 | if (err) |
518 | goto out_free_vblk; | 746 | goto out_free_vblk; |
519 | 747 | ||
520 | vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); | 748 | pool_size = sizeof(struct virtblk_req); |
749 | if (use_bio) | ||
750 | pool_size += sizeof(struct scatterlist) * sg_elems; | ||
751 | vblk->pool = mempool_create_kmalloc_pool(1, pool_size); | ||
521 | if (!vblk->pool) { | 752 | if (!vblk->pool) { |
522 | err = -ENOMEM; | 753 | err = -ENOMEM; |
523 | goto out_free_vq; | 754 | goto out_free_vq; |
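The pool sizing above only works because, in bio mode, the per-request scatterlist is carved out of the same allocation as the request descriptor. A hedged sketch of the layout this implies (fields beyond those visible in this diff are illustrative):

    struct virtblk_req_sketch {
            struct request *req;            /* request-mode owner, NULL in bio mode */
            struct bio *bio;                /* bio-mode owner, NULL in request mode */
            unsigned int flags;             /* VBLK_REQ_FLUSH / _DATA / _FUA */
            u8 status;                      /* written back by the device */
            struct scatterlist sg[];        /* sg_elems entries, bio mode only */
    };

    static size_t virtblk_pool_elem_size(unsigned int sg_elems, bool use_bio)
    {
            size_t sz = sizeof(struct virtblk_req_sketch);

            if (use_bio)                    /* room for the trailing sg[] array */
                    sz += sizeof(struct scatterlist) * sg_elems;
            return sz;
    }

A single mempool_alloc() then hands back both the descriptor and, in bio mode, the scatterlist the bio is mapped into.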
@@ -530,12 +761,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
530 | goto out_mempool; | 761 | goto out_mempool; |
531 | } | 762 | } |
532 | 763 | ||
533 | q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); | 764 | q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL); |
534 | if (!q) { | 765 | if (!q) { |
535 | err = -ENOMEM; | 766 | err = -ENOMEM; |
536 | goto out_put_disk; | 767 | goto out_put_disk; |
537 | } | 768 | } |
538 | 769 | ||
770 | if (use_bio) | ||
771 | blk_queue_make_request(q, virtblk_make_request); | ||
539 | q->queuedata = vblk; | 772 | q->queuedata = vblk; |
540 | 773 | ||
541 | virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); | 774 | virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); |
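blk_queue_make_request() overrides the request_fn-based queue that blk_init_queue() set up, so one probe path serves both modes. The use_bio switch is presumably a module parameter declared near the top of the file, roughly:

    static bool use_bio;                    /* assumed: 0 = request queue, 1 = bio path */
    module_param(use_bio, bool, S_IRUGO);

so the bio-based path can be chosen at module load time without recompiling.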
@@ -620,7 +853,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) | |||
620 | if (!err && opt_io_size) | 853 | if (!err && opt_io_size) |
621 | blk_queue_io_opt(q, blk_size * opt_io_size); | 854 | blk_queue_io_opt(q, blk_size * opt_io_size); |
622 | 855 | ||
623 | |||
624 | add_disk(vblk->disk); | 856 | add_disk(vblk->disk); |
625 | err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); | 857 | err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); |
626 | if (err) | 858 | if (err) |
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index c6decb901e5e..74374fb762aa 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -39,9 +39,11 @@ | |||
39 | #include <linux/list.h> | 39 | #include <linux/list.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
41 | #include <linux/freezer.h> | 41 | #include <linux/freezer.h> |
42 | #include <linux/bitmap.h> | ||
42 | 43 | ||
43 | #include <xen/events.h> | 44 | #include <xen/events.h> |
44 | #include <xen/page.h> | 45 | #include <xen/page.h> |
46 | #include <xen/xen.h> | ||
45 | #include <asm/xen/hypervisor.h> | 47 | #include <asm/xen/hypervisor.h> |
46 | #include <asm/xen/hypercall.h> | 48 | #include <asm/xen/hypercall.h> |
47 | #include "common.h" | 49 | #include "common.h" |
@@ -78,6 +80,7 @@ struct pending_req { | |||
78 | unsigned short operation; | 80 | unsigned short operation; |
79 | int status; | 81 | int status; |
80 | struct list_head free_list; | 82 | struct list_head free_list; |
83 | DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
81 | }; | 84 | }; |
82 | 85 | ||
83 | #define BLKBACK_INVALID_HANDLE (~0) | 86 | #define BLKBACK_INVALID_HANDLE (~0) |
@@ -98,6 +101,36 @@ struct xen_blkbk { | |||
98 | static struct xen_blkbk *blkbk; | 101 | static struct xen_blkbk *blkbk; |
99 | 102 | ||
100 | /* | 103 | /* |
104 | * Maximum number of grant pages that can be mapped in blkback. | ||
105 | * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of | ||
106 | * pages that blkback will persistently map. | ||
107 | * Currently, this is: | ||
108 | * RING_SIZE = 32 (for all known ring types) | ||
109 | * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 | ||
110 | * sizeof(struct persistent_gnt) = 48 | ||
111 | * So the maximum memory used to store the grants is: | ||
112 | * 32 * 11 * 48 = 16896 bytes | ||
113 | */ | ||
114 | static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) | ||
115 | { | ||
116 | switch (protocol) { | ||
117 | case BLKIF_PROTOCOL_NATIVE: | ||
118 | return __CONST_RING_SIZE(blkif, PAGE_SIZE) * | ||
119 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
120 | case BLKIF_PROTOCOL_X86_32: | ||
121 | return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * | ||
122 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
123 | case BLKIF_PROTOCOL_X86_64: | ||
124 | return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * | ||
125 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
126 | default: | ||
127 | BUG(); | ||
128 | } | ||
129 | return 0; | ||
130 | } | ||
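For the common 4 KiB page size each __CONST_RING_SIZE() above evaluates to the 32 mentioned in the comment, so a backend instance will persistently map at most 32 * 11 = 352 grant pages per device: roughly 1.4 MB of pinned guest pages (assuming PAGE_SIZE of 4096) on top of the ~16.9 KB of tracking structures.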
131 | |||
132 | |||
133 | /* | ||
101 | * Little helpful macro to figure out the index and virtual address of the | 134 | * Little helpful macro to figure out the index and virtual address of the |
102 | * pending_pages[..]. For each 'pending_req' we have have up to | 135 | * pending_pages[..]. For each 'pending_req' we have have up to |
103 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through | 136 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through |
@@ -128,6 +161,90 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
128 | static void make_response(struct xen_blkif *blkif, u64 id, | 161 | static void make_response(struct xen_blkif *blkif, u64 id, |
129 | unsigned short op, int st); | 162 | unsigned short op, int st); |
130 | 163 | ||
164 | #define foreach_grant(pos, rbtree, node) \ | ||
165 | for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \ | ||
166 | &(pos)->node != NULL; \ | ||
167 | (pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node)) | ||
168 | |||
169 | |||
170 | static void add_persistent_gnt(struct rb_root *root, | ||
171 | struct persistent_gnt *persistent_gnt) | ||
172 | { | ||
173 | struct rb_node **new = &(root->rb_node), *parent = NULL; | ||
174 | struct persistent_gnt *this; | ||
175 | |||
176 | /* Figure out where to put new node */ | ||
177 | while (*new) { | ||
178 | this = container_of(*new, struct persistent_gnt, node); | ||
179 | |||
180 | parent = *new; | ||
181 | if (persistent_gnt->gnt < this->gnt) | ||
182 | new = &((*new)->rb_left); | ||
183 | else if (persistent_gnt->gnt > this->gnt) | ||
184 | new = &((*new)->rb_right); | ||
185 | else { | ||
186 | pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); | ||
187 | BUG(); | ||
188 | } | ||
189 | } | ||
190 | |||
191 | /* Add new node and rebalance tree. */ | ||
192 | rb_link_node(&(persistent_gnt->node), parent, new); | ||
193 | rb_insert_color(&(persistent_gnt->node), root); | ||
194 | } | ||
195 | |||
196 | static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | ||
197 | grant_ref_t gref) | ||
198 | { | ||
199 | struct persistent_gnt *data; | ||
200 | struct rb_node *node = root->rb_node; | ||
201 | |||
202 | while (node) { | ||
203 | data = container_of(node, struct persistent_gnt, node); | ||
204 | |||
205 | if (gref < data->gnt) | ||
206 | node = node->rb_left; | ||
207 | else if (gref > data->gnt) | ||
208 | node = node->rb_right; | ||
209 | else | ||
210 | return data; | ||
211 | } | ||
212 | return NULL; | ||
213 | } | ||
214 | |||
215 | static void free_persistent_gnts(struct rb_root *root, unsigned int num) | ||
216 | { | ||
217 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
218 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
219 | struct persistent_gnt *persistent_gnt; | ||
220 | int ret = 0; | ||
221 | int segs_to_unmap = 0; | ||
222 | |||
223 | foreach_grant(persistent_gnt, root, node) { | ||
224 | BUG_ON(persistent_gnt->handle == | ||
225 | BLKBACK_INVALID_HANDLE); | ||
226 | gnttab_set_unmap_op(&unmap[segs_to_unmap], | ||
227 | (unsigned long) pfn_to_kaddr(page_to_pfn( | ||
228 | persistent_gnt->page)), | ||
229 | GNTMAP_host_map, | ||
230 | persistent_gnt->handle); | ||
231 | |||
232 | pages[segs_to_unmap] = persistent_gnt->page; | ||
233 | rb_erase(&persistent_gnt->node, root); | ||
234 | kfree(persistent_gnt); | ||
235 | num--; | ||
236 | |||
237 | if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || | ||
238 | !rb_next(&persistent_gnt->node)) { | ||
239 | ret = gnttab_unmap_refs(unmap, NULL, pages, | ||
240 | segs_to_unmap); | ||
241 | BUG_ON(ret); | ||
242 | segs_to_unmap = 0; | ||
243 | } | ||
244 | } | ||
245 | BUG_ON(num != 0); | ||
246 | } | ||
247 | |||
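free_persistent_gnts() refills its on-stack unmap/pages arrays in place, flushing a gnttab_unmap_refs() batch every BLKIF_MAX_SEGMENTS_PER_REQUEST entries and once more for the final partial batch. The same bounded-batch idiom in isolation, assuming for simplicity that all unmap ops are already prepared in one long array:

    static void unmap_in_batches(struct gnttab_unmap_grant_ref *ops,
                                 struct page **pages, unsigned int total)
    {
            unsigned int done = 0, n;

            while (done < total) {
                    n = min_t(unsigned int, total - done,
                              BLKIF_MAX_SEGMENTS_PER_REQUEST);
                    /* one unmap hypercall per batch keeps stack usage bounded */
                    BUG_ON(gnttab_unmap_refs(&ops[done], NULL, &pages[done], n));
                    done += n;
            }
    }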
131 | /* | 248 | /* |
132 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. | 249 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. |
133 | */ | 250 | */ |
@@ -301,6 +418,14 @@ int xen_blkif_schedule(void *arg) | |||
301 | print_stats(blkif); | 418 | print_stats(blkif); |
302 | } | 419 | } |
303 | 420 | ||
421 | /* Free all persistent grant pages */ | ||
422 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) | ||
423 | free_persistent_gnts(&blkif->persistent_gnts, | ||
424 | blkif->persistent_gnt_c); | ||
425 | |||
426 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); | ||
427 | blkif->persistent_gnt_c = 0; | ||
428 | |||
304 | if (log_stats) | 429 | if (log_stats) |
305 | print_stats(blkif); | 430 | print_stats(blkif); |
306 | 431 | ||
@@ -327,6 +452,8 @@ static void xen_blkbk_unmap(struct pending_req *req) | |||
327 | int ret; | 452 | int ret; |
328 | 453 | ||
329 | for (i = 0; i < req->nr_pages; i++) { | 454 | for (i = 0; i < req->nr_pages; i++) { |
455 | if (!test_bit(i, req->unmap_seg)) | ||
456 | continue; | ||
330 | handle = pending_handle(req, i); | 457 | handle = pending_handle(req, i); |
331 | if (handle == BLKBACK_INVALID_HANDLE) | 458 | if (handle == BLKBACK_INVALID_HANDLE) |
332 | continue; | 459 | continue; |
@@ -343,12 +470,26 @@ static void xen_blkbk_unmap(struct pending_req *req) | |||
343 | 470 | ||
344 | static int xen_blkbk_map(struct blkif_request *req, | 471 | static int xen_blkbk_map(struct blkif_request *req, |
345 | struct pending_req *pending_req, | 472 | struct pending_req *pending_req, |
346 | struct seg_buf seg[]) | 473 | struct seg_buf seg[], |
474 | struct page *pages[]) | ||
347 | { | 475 | { |
348 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 476 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
349 | int i; | 477 | struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
478 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
479 | struct persistent_gnt *persistent_gnt = NULL; | ||
480 | struct xen_blkif *blkif = pending_req->blkif; | ||
481 | phys_addr_t addr = 0; | ||
482 | int i, j; | ||
483 | bool new_map; | ||
350 | int nseg = req->u.rw.nr_segments; | 484 | int nseg = req->u.rw.nr_segments; |
485 | int segs_to_map = 0; | ||
351 | int ret = 0; | 486 | int ret = 0; |
487 | int use_persistent_gnts; | ||
488 | |||
489 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); | ||
490 | |||
491 | BUG_ON(blkif->persistent_gnt_c > | ||
492 | max_mapped_grant_pages(pending_req->blkif->blk_protocol)); | ||
352 | 493 | ||
353 | /* | 494 | /* |
354 | * Fill out preq.nr_sects with proper amount of sectors, and setup | 495 | * Fill out preq.nr_sects with proper amount of sectors, and setup |
@@ -358,36 +499,146 @@ static int xen_blkbk_map(struct blkif_request *req, | |||
358 | for (i = 0; i < nseg; i++) { | 499 | for (i = 0; i < nseg; i++) { |
359 | uint32_t flags; | 500 | uint32_t flags; |
360 | 501 | ||
361 | flags = GNTMAP_host_map; | 502 | if (use_persistent_gnts) |
362 | if (pending_req->operation != BLKIF_OP_READ) | 503 | persistent_gnt = get_persistent_gnt( |
363 | flags |= GNTMAP_readonly; | 504 | &blkif->persistent_gnts, |
364 | gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, | 505 | req->u.rw.seg[i].gref); |
365 | req->u.rw.seg[i].gref, | 506 | |
366 | pending_req->blkif->domid); | 507 | if (persistent_gnt) { |
508 | /* | ||
509 | * We are using persistent grants and | ||
510 | * the grant is already mapped | ||
511 | */ | ||
512 | new_map = false; | ||
513 | } else if (use_persistent_gnts && | ||
514 | blkif->persistent_gnt_c < | ||
515 | max_mapped_grant_pages(blkif->blk_protocol)) { | ||
516 | /* | ||
517 | * We are using persistent grants, the grant is | ||
518 | * not mapped but we have room for it | ||
519 | */ | ||
520 | new_map = true; | ||
521 | persistent_gnt = kmalloc( | ||
522 | sizeof(struct persistent_gnt), | ||
523 | GFP_KERNEL); | ||
524 | if (!persistent_gnt) | ||
525 | return -ENOMEM; | ||
526 | persistent_gnt->page = alloc_page(GFP_KERNEL); | ||
527 | if (!persistent_gnt->page) { | ||
528 | kfree(persistent_gnt); | ||
529 | return -ENOMEM; | ||
530 | } | ||
531 | persistent_gnt->gnt = req->u.rw.seg[i].gref; | ||
532 | persistent_gnt->handle = BLKBACK_INVALID_HANDLE; | ||
533 | |||
534 | pages_to_gnt[segs_to_map] = | ||
535 | persistent_gnt->page; | ||
536 | addr = (unsigned long) pfn_to_kaddr( | ||
537 | page_to_pfn(persistent_gnt->page)); | ||
538 | |||
539 | add_persistent_gnt(&blkif->persistent_gnts, | ||
540 | persistent_gnt); | ||
541 | blkif->persistent_gnt_c++; | ||
542 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
543 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
544 | max_mapped_grant_pages(blkif->blk_protocol)); | ||
545 | } else { | ||
546 | /* | ||
547 | * We are either using persistent grants and | ||
548 | * hit the maximum limit of grants mapped, | ||
549 | * or we are not using persistent grants. | ||
550 | */ | ||
551 | if (use_persistent_gnts && | ||
552 | !blkif->vbd.overflow_max_grants) { | ||
553 | blkif->vbd.overflow_max_grants = 1; | ||
554 | pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
555 | blkif->domid, blkif->vbd.handle); | ||
556 | } | ||
557 | new_map = true; | ||
558 | pages[i] = blkbk->pending_page(pending_req, i); | ||
559 | addr = vaddr(pending_req, i); | ||
560 | pages_to_gnt[segs_to_map] = | ||
561 | blkbk->pending_page(pending_req, i); | ||
562 | } | ||
563 | |||
564 | if (persistent_gnt) { | ||
565 | pages[i] = persistent_gnt->page; | ||
566 | persistent_gnts[i] = persistent_gnt; | ||
567 | } else { | ||
568 | persistent_gnts[i] = NULL; | ||
569 | } | ||
570 | |||
571 | if (new_map) { | ||
572 | flags = GNTMAP_host_map; | ||
573 | if (!persistent_gnt && | ||
574 | (pending_req->operation != BLKIF_OP_READ)) | ||
575 | flags |= GNTMAP_readonly; | ||
576 | gnttab_set_map_op(&map[segs_to_map++], addr, | ||
577 | flags, req->u.rw.seg[i].gref, | ||
578 | blkif->domid); | ||
579 | } | ||
367 | } | 580 | } |
368 | 581 | ||
369 | ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg); | 582 | if (segs_to_map) { |
370 | BUG_ON(ret); | 583 | ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); |
584 | BUG_ON(ret); | ||
585 | } | ||
371 | 586 | ||
372 | /* | 587 | /* |
373 | * Now swizzle the MFN in our domain with the MFN from the other domain | 588 | * Now swizzle the MFN in our domain with the MFN from the other domain |
374 | * so that when we access vaddr(pending_req,i) it has the contents of | 589 | * so that when we access vaddr(pending_req,i) it has the contents of |
375 | * the page from the other domain. | 590 | * the page from the other domain. |
376 | */ | 591 | */ |
377 | for (i = 0; i < nseg; i++) { | 592 | bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); |
378 | if (unlikely(map[i].status != 0)) { | 593 | for (i = 0, j = 0; i < nseg; i++) { |
379 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | 594 | if (!persistent_gnts[i] || |
380 | map[i].handle = BLKBACK_INVALID_HANDLE; | 595 | persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { |
381 | ret |= 1; | 596 | /* This is a newly mapped grant */ |
597 | BUG_ON(j >= segs_to_map); | ||
598 | if (unlikely(map[j].status != 0)) { | ||
599 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | ||
600 | map[j].handle = BLKBACK_INVALID_HANDLE; | ||
601 | ret |= 1; | ||
602 | if (persistent_gnts[i]) { | ||
603 | rb_erase(&persistent_gnts[i]->node, | ||
604 | &blkif->persistent_gnts); | ||
605 | blkif->persistent_gnt_c--; | ||
606 | kfree(persistent_gnts[i]); | ||
607 | persistent_gnts[i] = NULL; | ||
608 | } | ||
609 | } | ||
610 | } | ||
611 | if (persistent_gnts[i]) { | ||
612 | if (persistent_gnts[i]->handle == | ||
613 | BLKBACK_INVALID_HANDLE) { | ||
614 | /* | ||
615 | * If this is a new persistent grant | ||
616 | * save the handle | ||
617 | */ | ||
618 | persistent_gnts[i]->handle = map[j].handle; | ||
619 | persistent_gnts[i]->dev_bus_addr = | ||
620 | map[j++].dev_bus_addr; | ||
621 | } | ||
622 | pending_handle(pending_req, i) = | ||
623 | persistent_gnts[i]->handle; | ||
624 | |||
625 | if (ret) | ||
626 | continue; | ||
627 | |||
628 | seg[i].buf = persistent_gnts[i]->dev_bus_addr | | ||
629 | (req->u.rw.seg[i].first_sect << 9); | ||
630 | } else { | ||
631 | pending_handle(pending_req, i) = map[j].handle; | ||
632 | bitmap_set(pending_req->unmap_seg, i, 1); | ||
633 | |||
634 | if (ret) { | ||
635 | j++; | ||
636 | continue; | ||
637 | } | ||
638 | |||
639 | seg[i].buf = map[j++].dev_bus_addr | | ||
640 | (req->u.rw.seg[i].first_sect << 9); | ||
382 | } | 641 | } |
383 | |||
384 | pending_handle(pending_req, i) = map[i].handle; | ||
385 | |||
386 | if (ret) | ||
387 | continue; | ||
388 | |||
389 | seg[i].buf = map[i].dev_bus_addr | | ||
390 | (req->u.rw.seg[i].first_sect << 9); | ||
391 | } | 642 | } |
392 | return ret; | 643 | return ret; |
393 | } | 644 | } |
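Taken together, the mapping path now handles each segment in one of three ways: reuse a grant already cached in the rb-tree (no hypercall at all), map the grant into a freshly allocated page and cache it persistently while there is still room under max_mapped_grant_pages(), or fall back to the old per-request mapping and set the matching unmap_seg bit so that only those non-persistent grants are torn down in xen_blkbk_unmap().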
@@ -590,6 +841,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
590 | int operation; | 841 | int operation; |
591 | struct blk_plug plug; | 842 | struct blk_plug plug; |
592 | bool drain = false; | 843 | bool drain = false; |
844 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
593 | 845 | ||
594 | switch (req->operation) { | 846 | switch (req->operation) { |
595 | case BLKIF_OP_READ: | 847 | case BLKIF_OP_READ: |
@@ -676,7 +928,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
676 | * the hypercall to unmap the grants - that is all done in | 928 | * the hypercall to unmap the grants - that is all done in |
677 | * xen_blkbk_unmap. | 929 | * xen_blkbk_unmap. |
678 | */ | 930 | */ |
679 | if (xen_blkbk_map(req, pending_req, seg)) | 931 | if (xen_blkbk_map(req, pending_req, seg, pages)) |
680 | goto fail_flush; | 932 | goto fail_flush; |
681 | 933 | ||
682 | /* | 934 | /* |
@@ -688,7 +940,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
688 | for (i = 0; i < nseg; i++) { | 940 | for (i = 0; i < nseg; i++) { |
689 | while ((bio == NULL) || | 941 | while ((bio == NULL) || |
690 | (bio_add_page(bio, | 942 | (bio_add_page(bio, |
691 | blkbk->pending_page(pending_req, i), | 943 | pages[i], |
692 | seg[i].nsec << 9, | 944 | seg[i].nsec << 9, |
693 | seg[i].buf & ~PAGE_MASK) == 0)) { | 945 | seg[i].buf & ~PAGE_MASK) == 0)) { |
694 | 946 | ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 9ad3b5ec1dc1..6072390c7f57 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/wait.h> | 35 | #include <linux/wait.h> |
36 | #include <linux/io.h> | 36 | #include <linux/io.h> |
37 | #include <linux/rbtree.h> | ||
37 | #include <asm/setup.h> | 38 | #include <asm/setup.h> |
38 | #include <asm/pgalloc.h> | 39 | #include <asm/pgalloc.h> |
39 | #include <asm/hypervisor.h> | 40 | #include <asm/hypervisor.h> |
@@ -158,12 +159,23 @@ struct xen_vbd { | |||
158 | struct block_device *bdev; | 159 | struct block_device *bdev; |
159 | /* Cached size parameter. */ | 160 | /* Cached size parameter. */ |
160 | sector_t size; | 161 | sector_t size; |
161 | bool flush_support; | 162 | unsigned int flush_support:1; |
162 | bool discard_secure; | 163 | unsigned int discard_secure:1; |
164 | unsigned int feature_gnt_persistent:1; | ||
165 | unsigned int overflow_max_grants:1; | ||
163 | }; | 166 | }; |
164 | 167 | ||
165 | struct backend_info; | 168 | struct backend_info; |
166 | 169 | ||
170 | |||
171 | struct persistent_gnt { | ||
172 | struct page *page; | ||
173 | grant_ref_t gnt; | ||
174 | grant_handle_t handle; | ||
175 | uint64_t dev_bus_addr; | ||
176 | struct rb_node node; | ||
177 | }; | ||
178 | |||
167 | struct xen_blkif { | 179 | struct xen_blkif { |
168 | /* Unique identifier for this interface. */ | 180 | /* Unique identifier for this interface. */ |
169 | domid_t domid; | 181 | domid_t domid; |
@@ -190,6 +202,10 @@ struct xen_blkif { | |||
190 | struct task_struct *xenblkd; | 202 | struct task_struct *xenblkd; |
191 | unsigned int waiting_reqs; | 203 | unsigned int waiting_reqs; |
192 | 204 | ||
205 | /* tree to store persistent grants */ | ||
206 | struct rb_root persistent_gnts; | ||
207 | unsigned int persistent_gnt_c; | ||
208 | |||
193 | /* statistics */ | 209 | /* statistics */ |
194 | unsigned long st_print; | 210 | unsigned long st_print; |
195 | int st_rd_req; | 211 | int st_rd_req; |
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 4f66171c6683..63980722db41 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c | |||
@@ -105,11 +105,10 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
105 | { | 105 | { |
106 | struct xen_blkif *blkif; | 106 | struct xen_blkif *blkif; |
107 | 107 | ||
108 | blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL); | 108 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); |
109 | if (!blkif) | 109 | if (!blkif) |
110 | return ERR_PTR(-ENOMEM); | 110 | return ERR_PTR(-ENOMEM); |
111 | 111 | ||
112 | memset(blkif, 0, sizeof(*blkif)); | ||
113 | blkif->domid = domid; | 112 | blkif->domid = domid; |
114 | spin_lock_init(&blkif->blk_ring_lock); | 113 | spin_lock_init(&blkif->blk_ring_lock); |
115 | atomic_set(&blkif->refcnt, 1); | 114 | atomic_set(&blkif->refcnt, 1); |
@@ -118,6 +117,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
118 | atomic_set(&blkif->drain, 0); | 117 | atomic_set(&blkif->drain, 0); |
119 | blkif->st_print = jiffies; | 118 | blkif->st_print = jiffies; |
120 | init_waitqueue_head(&blkif->waiting_to_free); | 119 | init_waitqueue_head(&blkif->waiting_to_free); |
120 | blkif->persistent_gnts.rb_node = NULL; | ||
121 | 121 | ||
122 | return blkif; | 122 | return blkif; |
123 | } | 123 | } |
@@ -196,7 +196,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
196 | } | 196 | } |
197 | } | 197 | } |
198 | 198 | ||
199 | void xen_blkif_free(struct xen_blkif *blkif) | 199 | static void xen_blkif_free(struct xen_blkif *blkif) |
200 | { | 200 | { |
201 | if (!atomic_dec_and_test(&blkif->refcnt)) | 201 | if (!atomic_dec_and_test(&blkif->refcnt)) |
202 | BUG(); | 202 | BUG(); |
@@ -257,7 +257,7 @@ static struct attribute_group xen_vbdstat_group = { | |||
257 | VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); | 257 | VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); |
258 | VBD_SHOW(mode, "%s\n", be->mode); | 258 | VBD_SHOW(mode, "%s\n", be->mode); |
259 | 259 | ||
260 | int xenvbd_sysfs_addif(struct xenbus_device *dev) | 260 | static int xenvbd_sysfs_addif(struct xenbus_device *dev) |
261 | { | 261 | { |
262 | int error; | 262 | int error; |
263 | 263 | ||
@@ -281,7 +281,7 @@ fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); | |||
281 | return error; | 281 | return error; |
282 | } | 282 | } |
283 | 283 | ||
284 | void xenvbd_sysfs_delif(struct xenbus_device *dev) | 284 | static void xenvbd_sysfs_delif(struct xenbus_device *dev) |
285 | { | 285 | { |
286 | sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); | 286 | sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); |
287 | device_remove_file(&dev->dev, &dev_attr_mode); | 287 | device_remove_file(&dev->dev, &dev_attr_mode); |
@@ -673,6 +673,13 @@ again: | |||
673 | 673 | ||
674 | xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); | 674 | xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); |
675 | 675 | ||
676 | err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1); | ||
677 | if (err) { | ||
678 | xenbus_dev_fatal(dev, err, "writing %s/feature-persistent", | ||
679 | dev->nodename); | ||
680 | goto abort; | ||
681 | } | ||
682 | |||
676 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", | 683 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", |
677 | (unsigned long long)vbd_sz(&be->blkif->vbd)); | 684 | (unsigned long long)vbd_sz(&be->blkif->vbd)); |
678 | if (err) { | 685 | if (err) { |
@@ -721,6 +728,7 @@ static int connect_ring(struct backend_info *be) | |||
721 | struct xenbus_device *dev = be->dev; | 728 | struct xenbus_device *dev = be->dev; |
722 | unsigned long ring_ref; | 729 | unsigned long ring_ref; |
723 | unsigned int evtchn; | 730 | unsigned int evtchn; |
731 | unsigned int pers_grants; | ||
724 | char protocol[64] = ""; | 732 | char protocol[64] = ""; |
725 | int err; | 733 | int err; |
726 | 734 | ||
@@ -750,8 +758,18 @@ static int connect_ring(struct backend_info *be) | |||
750 | xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); | 758 | xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); |
751 | return -1; | 759 | return -1; |
752 | } | 760 | } |
753 | pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n", | 761 | err = xenbus_gather(XBT_NIL, dev->otherend, |
754 | ring_ref, evtchn, be->blkif->blk_protocol, protocol); | 762 | "feature-persistent", "%u", |
763 | &pers_grants, NULL); | ||
764 | if (err) | ||
765 | pers_grants = 0; | ||
766 | |||
767 | be->blkif->vbd.feature_gnt_persistent = pers_grants; | ||
768 | be->blkif->vbd.overflow_max_grants = 0; | ||
769 | |||
770 | pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", | ||
771 | ring_ref, evtchn, be->blkif->blk_protocol, protocol, | ||
772 | pers_grants ? "persistent grants" : ""); | ||
755 | 773 | ||
756 | /* Map the shared frame, irq etc. */ | 774 | /* Map the shared frame, irq etc. */ |
757 | err = xen_blkif_map(be->blkif, ring_ref, evtchn); | 775 | err = xen_blkif_map(be->blkif, ring_ref, evtchn); |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2c2d2e5c1597..96e9b00db081 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/mutex.h> | 44 | #include <linux/mutex.h> |
45 | #include <linux/scatterlist.h> | 45 | #include <linux/scatterlist.h> |
46 | #include <linux/bitmap.h> | 46 | #include <linux/bitmap.h> |
47 | #include <linux/llist.h> | ||
47 | 48 | ||
48 | #include <xen/xen.h> | 49 | #include <xen/xen.h> |
49 | #include <xen/xenbus.h> | 50 | #include <xen/xenbus.h> |
@@ -64,10 +65,17 @@ enum blkif_state { | |||
64 | BLKIF_STATE_SUSPENDED, | 65 | BLKIF_STATE_SUSPENDED, |
65 | }; | 66 | }; |
66 | 67 | ||
68 | struct grant { | ||
69 | grant_ref_t gref; | ||
70 | unsigned long pfn; | ||
71 | struct llist_node node; | ||
72 | }; | ||
73 | |||
67 | struct blk_shadow { | 74 | struct blk_shadow { |
68 | struct blkif_request req; | 75 | struct blkif_request req; |
69 | struct request *request; | 76 | struct request *request; |
70 | unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 77 | unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
78 | struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
71 | }; | 79 | }; |
72 | 80 | ||
73 | static DEFINE_MUTEX(blkfront_mutex); | 81 | static DEFINE_MUTEX(blkfront_mutex); |
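The new grants_used[] array in blk_shadow records which cached grant backs each segment of an in-flight request, so that blkif_completion() can return the grants to the per-device free list instead of ending foreign access on them.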
@@ -97,6 +105,8 @@ struct blkfront_info | |||
97 | struct work_struct work; | 105 | struct work_struct work; |
98 | struct gnttab_free_callback callback; | 106 | struct gnttab_free_callback callback; |
99 | struct blk_shadow shadow[BLK_RING_SIZE]; | 107 | struct blk_shadow shadow[BLK_RING_SIZE]; |
108 | struct llist_head persistent_gnts; | ||
109 | unsigned int persistent_gnts_c; | ||
100 | unsigned long shadow_free; | 110 | unsigned long shadow_free; |
101 | unsigned int feature_flush; | 111 | unsigned int feature_flush; |
102 | unsigned int flush_op; | 112 | unsigned int flush_op; |
@@ -104,6 +114,7 @@ struct blkfront_info | |||
104 | unsigned int feature_secdiscard:1; | 114 | unsigned int feature_secdiscard:1; |
105 | unsigned int discard_granularity; | 115 | unsigned int discard_granularity; |
106 | unsigned int discard_alignment; | 116 | unsigned int discard_alignment; |
117 | unsigned int feature_persistent:1; | ||
107 | int is_ready; | 118 | int is_ready; |
108 | }; | 119 | }; |
109 | 120 | ||
@@ -287,21 +298,36 @@ static int blkif_queue_request(struct request *req) | |||
287 | unsigned long id; | 298 | unsigned long id; |
288 | unsigned int fsect, lsect; | 299 | unsigned int fsect, lsect; |
289 | int i, ref; | 300 | int i, ref; |
301 | |||
302 | /* | ||
303 | * Indicates whether the request can be queued using only already | ||
304 | * allocated persistent grants, or whether new grant references | ||
305 | * must be claimed because too few free persistent grants remain. | ||
306 | */ | ||
307 | bool new_persistent_gnts; | ||
290 | grant_ref_t gref_head; | 308 | grant_ref_t gref_head; |
309 | struct page *granted_page; | ||
310 | struct grant *gnt_list_entry = NULL; | ||
291 | struct scatterlist *sg; | 311 | struct scatterlist *sg; |
292 | 312 | ||
293 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 313 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
294 | return 1; | 314 | return 1; |
295 | 315 | ||
296 | if (gnttab_alloc_grant_references( | 316 | /* Check if we have enough grants to allocate a request */ |
297 | BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { | 317 | if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { |
298 | gnttab_request_free_callback( | 318 | new_persistent_gnts = 1; |
299 | &info->callback, | 319 | if (gnttab_alloc_grant_references( |
300 | blkif_restart_queue_callback, | 320 | BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, |
301 | info, | 321 | &gref_head) < 0) { |
302 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 322 | gnttab_request_free_callback( |
303 | return 1; | 323 | &info->callback, |
304 | } | 324 | blkif_restart_queue_callback, |
325 | info, | ||
326 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
327 | return 1; | ||
328 | } | ||
329 | } else | ||
330 | new_persistent_gnts = 0; | ||
305 | 331 | ||
306 | /* Fill out a communications ring structure. */ | 332 | /* Fill out a communications ring structure. */ |
307 | ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | 333 | ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); |
@@ -341,18 +367,73 @@ static int blkif_queue_request(struct request *req) | |||
341 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 367 | BLKIF_MAX_SEGMENTS_PER_REQUEST); |
342 | 368 | ||
343 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { | 369 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { |
344 | buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); | ||
345 | fsect = sg->offset >> 9; | 370 | fsect = sg->offset >> 9; |
346 | lsect = fsect + (sg->length >> 9) - 1; | 371 | lsect = fsect + (sg->length >> 9) - 1; |
347 | /* install a grant reference. */ | ||
348 | ref = gnttab_claim_grant_reference(&gref_head); | ||
349 | BUG_ON(ref == -ENOSPC); | ||
350 | 372 | ||
351 | gnttab_grant_foreign_access_ref( | 373 | if (info->persistent_gnts_c) { |
352 | ref, | 374 | BUG_ON(llist_empty(&info->persistent_gnts)); |
375 | gnt_list_entry = llist_entry( | ||
376 | llist_del_first(&info->persistent_gnts), | ||
377 | struct grant, node); | ||
378 | |||
379 | ref = gnt_list_entry->gref; | ||
380 | buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); | ||
381 | info->persistent_gnts_c--; | ||
382 | } else { | ||
383 | ref = gnttab_claim_grant_reference(&gref_head); | ||
384 | BUG_ON(ref == -ENOSPC); | ||
385 | |||
386 | gnt_list_entry = | ||
387 | kmalloc(sizeof(struct grant), | ||
388 | GFP_ATOMIC); | ||
389 | if (!gnt_list_entry) | ||
390 | return -ENOMEM; | ||
391 | |||
392 | granted_page = alloc_page(GFP_ATOMIC); | ||
393 | if (!granted_page) { | ||
394 | kfree(gnt_list_entry); | ||
395 | return -ENOMEM; | ||
396 | } | ||
397 | |||
398 | gnt_list_entry->pfn = | ||
399 | page_to_pfn(granted_page); | ||
400 | gnt_list_entry->gref = ref; | ||
401 | |||
402 | buffer_mfn = pfn_to_mfn(page_to_pfn( | ||
403 | granted_page)); | ||
404 | gnttab_grant_foreign_access_ref(ref, | ||
353 | info->xbdev->otherend_id, | 405 | info->xbdev->otherend_id, |
354 | buffer_mfn, | 406 | buffer_mfn, 0); |
355 | rq_data_dir(req)); | 407 | } |
408 | |||
409 | info->shadow[id].grants_used[i] = gnt_list_entry; | ||
410 | |||
411 | if (rq_data_dir(req)) { | ||
412 | char *bvec_data; | ||
413 | void *shared_data; | ||
414 | |||
415 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); | ||
416 | |||
417 | shared_data = kmap_atomic( | ||
418 | pfn_to_page(gnt_list_entry->pfn)); | ||
419 | bvec_data = kmap_atomic(sg_page(sg)); | ||
420 | |||
421 | /* | ||
422 | * this does not wipe data stored outside the | ||
423 | * range sg->offset..sg->offset+sg->length. | ||
424 | * Therefore, blkback *could* see data from | ||
425 | * previous requests. This is OK as long as | ||
426 | * persistent grants are shared with just one | ||
427 | * domain. It may need refactoring if this | ||
428 | * changes | ||
429 | */ | ||
430 | memcpy(shared_data + sg->offset, | ||
431 | bvec_data + sg->offset, | ||
432 | sg->length); | ||
433 | |||
434 | kunmap_atomic(bvec_data); | ||
435 | kunmap_atomic(shared_data); | ||
436 | } | ||
356 | 437 | ||
357 | info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); | 438 | info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); |
358 | ring_req->u.rw.seg[i] = | 439 | ring_req->u.rw.seg[i] = |
@@ -368,7 +449,8 @@ static int blkif_queue_request(struct request *req) | |||
368 | /* Keep a private copy so we can reissue requests when recovering. */ | 449 | /* Keep a private copy so we can reissue requests when recovering. */ |
369 | info->shadow[id].req = *ring_req; | 450 | info->shadow[id].req = *ring_req; |
370 | 451 | ||
371 | gnttab_free_grant_references(gref_head); | 452 | if (new_persistent_gnts) |
453 | gnttab_free_grant_references(gref_head); | ||
372 | 454 | ||
373 | return 0; | 455 | return 0; |
374 | } | 456 | } |
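The fast path above pulls a cached grant off the lock-less list; only when the cache runs dry does it claim a fresh grant reference and back it with a newly allocated page. A hedged sketch of just the cache lookup (get_cached_grant() is hypothetical, and the io_lock held by the caller presumably provides the single-consumer guarantee that llist_del_first() requires):

    static struct grant *get_cached_grant(struct blkfront_info *info)
    {
            struct llist_node *n;

            if (!info->persistent_gnts_c)
                    return NULL;                    /* cache empty: caller allocates */

            n = llist_del_first(&info->persistent_gnts);
            info->persistent_gnts_c--;
            return llist_entry(n, struct grant, node);
    }

In the driver the same logic is open-coded inside the for_each_sg() loop.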
@@ -480,12 +562,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
480 | static void xlvbd_flush(struct blkfront_info *info) | 562 | static void xlvbd_flush(struct blkfront_info *info) |
481 | { | 563 | { |
482 | blk_queue_flush(info->rq, info->feature_flush); | 564 | blk_queue_flush(info->rq, info->feature_flush); |
483 | printk(KERN_INFO "blkfront: %s: %s: %s\n", | 565 | printk(KERN_INFO "blkfront: %s: %s: %s %s\n", |
484 | info->gd->disk_name, | 566 | info->gd->disk_name, |
485 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? | 567 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? |
486 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? | 568 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? |
487 | "flush diskcache" : "barrier or flush"), | 569 | "flush diskcache" : "barrier or flush"), |
488 | info->feature_flush ? "enabled" : "disabled"); | 570 | info->feature_flush ? "enabled" : "disabled", |
571 | info->feature_persistent ? "using persistent grants" : ""); | ||
489 | } | 572 | } |
490 | 573 | ||
491 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) | 574 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
@@ -670,7 +753,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) | |||
670 | spin_unlock_irqrestore(&info->io_lock, flags); | 753 | spin_unlock_irqrestore(&info->io_lock, flags); |
671 | 754 | ||
672 | /* Flush gnttab callback work. Must be done with no locks held. */ | 755 | /* Flush gnttab callback work. Must be done with no locks held. */ |
673 | flush_work_sync(&info->work); | 756 | flush_work(&info->work); |
674 | 757 | ||
675 | del_gendisk(info->gd); | 758 | del_gendisk(info->gd); |
676 | 759 | ||
@@ -707,6 +790,9 @@ static void blkif_restart_queue(struct work_struct *work) | |||
707 | 790 | ||
708 | static void blkif_free(struct blkfront_info *info, int suspend) | 791 | static void blkif_free(struct blkfront_info *info, int suspend) |
709 | { | 792 | { |
793 | struct llist_node *all_gnts; | ||
794 | struct grant *persistent_gnt; | ||
795 | |||
710 | /* Prevent new requests being issued until we fix things up. */ | 796 | /* Prevent new requests being issued until we fix things up. */ |
711 | spin_lock_irq(&info->io_lock); | 797 | spin_lock_irq(&info->io_lock); |
712 | info->connected = suspend ? | 798 | info->connected = suspend ? |
@@ -714,12 +800,24 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
714 | /* No more blkif_request(). */ | 800 | /* No more blkif_request(). */ |
715 | if (info->rq) | 801 | if (info->rq) |
716 | blk_stop_queue(info->rq); | 802 | blk_stop_queue(info->rq); |
803 | |||
804 | /* Remove all persistent grants */ | ||
805 | if (info->persistent_gnts_c) { | ||
806 | all_gnts = llist_del_all(&info->persistent_gnts); | ||
807 | llist_for_each_entry(persistent_gnt, all_gnts, node) { | ||
808 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
809 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
810 | kfree(persistent_gnt); | ||
811 | } | ||
812 | info->persistent_gnts_c = 0; | ||
813 | } | ||
814 | |||
717 | /* No more gnttab callback work. */ | 815 | /* No more gnttab callback work. */ |
718 | gnttab_cancel_free_callback(&info->callback); | 816 | gnttab_cancel_free_callback(&info->callback); |
719 | spin_unlock_irq(&info->io_lock); | 817 | spin_unlock_irq(&info->io_lock); |
720 | 818 | ||
721 | /* Flush gnttab callback work. Must be done with no locks held. */ | 819 | /* Flush gnttab callback work. Must be done with no locks held. */ |
722 | flush_work_sync(&info->work); | 820 | flush_work(&info->work); |
723 | 821 | ||
724 | /* Free resources associated with old device channel. */ | 822 | /* Free resources associated with old device channel. */ |
725 | if (info->ring_ref != GRANT_INVALID_REF) { | 823 | if (info->ring_ref != GRANT_INVALID_REF) { |
@@ -734,13 +832,43 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
734 | 832 | ||
735 | } | 833 | } |
736 | 834 | ||
737 | static void blkif_completion(struct blk_shadow *s) | 835 | static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, |
836 | struct blkif_response *bret) | ||
738 | { | 837 | { |
739 | int i; | 838 | int i; |
740 | /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place | 839 | struct bio_vec *bvec; |
741 | * flag. */ | 840 | struct req_iterator iter; |
742 | for (i = 0; i < s->req.u.rw.nr_segments; i++) | 841 | unsigned long flags; |
743 | gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); | 842 | char *bvec_data; |
843 | void *shared_data; | ||
844 | unsigned int offset = 0; | ||
845 | |||
846 | if (bret->operation == BLKIF_OP_READ) { | ||
847 | /* | ||
848 | * Copy the data received from the backend into the bvec. | ||
849 | * Since bv_offset can be different from 0 and bv_len different | ||
850 | * from PAGE_SIZE, we have to keep track of the current offset, | ||
851 | * to be sure we are copying the data from the right shared page. | ||
852 | */ | ||
853 | rq_for_each_segment(bvec, s->request, iter) { | ||
854 | BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); | ||
855 | i = offset >> PAGE_SHIFT; | ||
856 | BUG_ON(i >= s->req.u.rw.nr_segments); | ||
857 | shared_data = kmap_atomic( | ||
858 | pfn_to_page(s->grants_used[i]->pfn)); | ||
859 | bvec_data = bvec_kmap_irq(bvec, &flags); | ||
860 | memcpy(bvec_data, shared_data + bvec->bv_offset, | ||
861 | bvec->bv_len); | ||
862 | bvec_kunmap_irq(bvec_data, &flags); | ||
863 | kunmap_atomic(shared_data); | ||
864 | offset += bvec->bv_len; | ||
865 | } | ||
866 | } | ||
867 | /* Add the persistent grant into the list of free grants */ | ||
868 | for (i = 0; i < s->req.u.rw.nr_segments; i++) { | ||
869 | llist_add(&s->grants_used[i]->node, &info->persistent_gnts); | ||
870 | info->persistent_gnts_c++; | ||
871 | } | ||
744 | } | 872 | } |
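This copy is the price persistent grants pay on the read path: the frontend now grants a fixed pool of its own pages rather than the bio's pages, so the backend completes reads into those shared pages and blkif_completion() has to memcpy the data into the real bio_vecs (mirroring the write-side memcpy in blkif_queue_request) before pushing the grants back onto the free list.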
745 | 873 | ||
746 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 874 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
@@ -783,7 +911,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) | |||
783 | req = info->shadow[id].request; | 911 | req = info->shadow[id].request; |
784 | 912 | ||
785 | if (bret->operation != BLKIF_OP_DISCARD) | 913 | if (bret->operation != BLKIF_OP_DISCARD) |
786 | blkif_completion(&info->shadow[id]); | 914 | blkif_completion(&info->shadow[id], info, bret); |
787 | 915 | ||
788 | if (add_id_to_freelist(info, id)) { | 916 | if (add_id_to_freelist(info, id)) { |
789 | WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", | 917 | WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", |
@@ -942,6 +1070,11 @@ again: | |||
942 | message = "writing protocol"; | 1070 | message = "writing protocol"; |
943 | goto abort_transaction; | 1071 | goto abort_transaction; |
944 | } | 1072 | } |
1073 | err = xenbus_printf(xbt, dev->nodename, | ||
1074 | "feature-persistent", "%u", 1); | ||
1075 | if (err) | ||
1076 | dev_warn(&dev->dev, | ||
1077 | "writing persistent grants feature to xenbus"); | ||
945 | 1078 | ||
946 | err = xenbus_transaction_end(xbt, 0); | 1079 | err = xenbus_transaction_end(xbt, 0); |
947 | if (err) { | 1080 | if (err) { |
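The advertisement is symmetrical: the frontend writes feature-persistent here and the backend writes the same key in its xenbus code above. The backend only keeps grants persistently mapped when connect_ring() sees the frontend's flag, and the frontend records the backend's flag in feature_persistent, falling back to 0 when the key is absent (an older backend).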
@@ -1029,6 +1162,8 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1029 | spin_lock_init(&info->io_lock); | 1162 | spin_lock_init(&info->io_lock); |
1030 | info->xbdev = dev; | 1163 | info->xbdev = dev; |
1031 | info->vdevice = vdevice; | 1164 | info->vdevice = vdevice; |
1165 | init_llist_head(&info->persistent_gnts); | ||
1166 | info->persistent_gnts_c = 0; | ||
1032 | info->connected = BLKIF_STATE_DISCONNECTED; | 1167 | info->connected = BLKIF_STATE_DISCONNECTED; |
1033 | INIT_WORK(&info->work, blkif_restart_queue); | 1168 | INIT_WORK(&info->work, blkif_restart_queue); |
1034 | 1169 | ||
@@ -1093,7 +1228,7 @@ static int blkif_recover(struct blkfront_info *info) | |||
1093 | req->u.rw.seg[j].gref, | 1228 | req->u.rw.seg[j].gref, |
1094 | info->xbdev->otherend_id, | 1229 | info->xbdev->otherend_id, |
1095 | pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), | 1230 | pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), |
1096 | rq_data_dir(info->shadow[req->u.rw.id].request)); | 1231 | 0); |
1097 | } | 1232 | } |
1098 | info->shadow[req->u.rw.id].req = *req; | 1233 | info->shadow[req->u.rw.id].req = *req; |
1099 | 1234 | ||
@@ -1225,7 +1360,7 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1225 | unsigned long sector_size; | 1360 | unsigned long sector_size; |
1226 | unsigned int binfo; | 1361 | unsigned int binfo; |
1227 | int err; | 1362 | int err; |
1228 | int barrier, flush, discard; | 1363 | int barrier, flush, discard, persistent; |
1229 | 1364 | ||
1230 | switch (info->connected) { | 1365 | switch (info->connected) { |
1231 | case BLKIF_STATE_CONNECTED: | 1366 | case BLKIF_STATE_CONNECTED: |
@@ -1303,6 +1438,14 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1303 | if (!err && discard) | 1438 | if (!err && discard) |
1304 | blkfront_setup_discard(info); | 1439 | blkfront_setup_discard(info); |
1305 | 1440 | ||
1441 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
1442 | "feature-persistent", "%u", &persistent, | ||
1443 | NULL); | ||
1444 | if (err) | ||
1445 | info->feature_persistent = 0; | ||
1446 | else | ||
1447 | info->feature_persistent = persistent; | ||
1448 | |||
1306 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); | 1449 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); |
1307 | if (err) { | 1450 | if (err) { |
1308 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | 1451 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |