author     Linus Torvalds <torvalds@linux-foundation.org>   2012-12-17 16:39:11 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-12-17 16:39:11 -0500
commit     9228ff90387e276ad67b10c0eb525c9d6a57d5e9
tree       e7c87b68daba7cf7ca4c342c6b52165bd78fbe16
parent     9360b53661a2c7754517b2925580055bacc8ec38
parent     d2ec180c23a5a1bfe34d8638b0342a47c00cf70f
Merge branch 'for-3.8/drivers' of git://git.kernel.dk/linux-block
Pull block driver update from Jens Axboe:
"Now that the core bits are in, here are the driver bits for 3.8. The
branch contains:
- A huge pile of drbd bits that were dumped from the 3.7 merge
window. Following that, it was made perfectly clear both that
there are going to be no more over-the-wall pulls and how the
situation on individual pulls can be improved.
- A few cleanups from Akinobu Mita for drbd and cciss.
- Queue improvement for loop from Lukas. This grew into adding a
generic interface for waiting on/checking an event with a specific
lock held, allowing this to be pulled out of md; loop and drbd now
use it as well (see the usage sketch after this message).
- A few fixes for xen back/front block driver from Roger Pau Monne.
- Partition improvements from Stephen Warren, allowing partiion UUID
to be used as an identifier."
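
The loop/md/drbd item above refers to the new wait_event_lock_irq() helper ("wait: add wait_event_lock_irq() interface" in the shortlog below), which was lifted out of md into include/linux/wait.h. The following is a minimal, hypothetical usage sketch — the demo_queue structure and functions are illustrative, not code from this series: the caller holds the spinlock, and the macro drops it while sleeping and re-takes it before every re-check of the condition.

```c
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hypothetical producer/consumer queue, used only to illustrate the API.
 * Assumes spin_lock_init(), INIT_LIST_HEAD() and init_waitqueue_head()
 * were called on the fields at setup time. */
struct demo_queue {
	spinlock_t		lock;
	struct list_head	items;
	wait_queue_head_t	wait;
};

/* Consumer: sleep until the list is non-empty.  The condition is always
 * evaluated with 'lock' held; wait_event_lock_irq() releases the lock
 * around schedule() and re-acquires it before re-checking. */
static struct list_head *demo_queue_pop(struct demo_queue *q)
{
	struct list_head *item;

	spin_lock_irq(&q->lock);
	wait_event_lock_irq(q->wait, !list_empty(&q->items), q->lock);
	item = q->items.next;
	list_del(item);
	spin_unlock_irq(&q->lock);
	return item;
}

/* Producer: add an item and wake any waiter; as with other wait_event
 * variants, an explicit wake_up() is required after changing anything
 * that can affect the condition. */
static void demo_queue_push(struct demo_queue *q, struct list_head *item)
{
	spin_lock_irq(&q->lock);
	list_add_tail(item, &q->items);
	spin_unlock_irq(&q->lock);
	wake_up(&q->wait);
}
```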
* 'for-3.8/drivers' of git://git.kernel.dk/linux-block: (609 commits)
drbd: update Kconfig to match current dependencies
drbd: Fix drbdsetup wait-connect, wait-sync etc... commands
drbd: close race between drbd_set_role and drbd_connect
drbd: respect no-md-barriers setting also when changed online via disk-options
drbd: Remove obsolete check
drbd: fixup after wait_event_lock_irq() addition to generic code
loop: Limit the number of requests in the bio list
wait: add wait_event_lock_irq() interface
xen-blkfront: free allocated page
xen-blkback: move free persistent grants code
block: partition: msdos: provide UUIDs for partitions
init: reduce PARTUUID min length to 1 from 36
block: store partition_meta_info.uuid as a string
cciss: use check_signature()
cciss: cleanup bitops usage
drbd: use copy_highpage
drbd: if the replication link breaks during handshake, keep retrying
drbd: check return of kmalloc in receive_uuids
drbd: Broadcast sync progress no more often than once per second
drbd: don't try to clear bits once the disk has failed
...
49 files changed, 12920 insertions, 8636 deletions
diff --git a/block/genhd.c b/block/genhd.c index 2a6fdf539a69..9a289d7c84bb 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -743,7 +743,6 @@ void __init printk_all_partitions(void) | |||
743 | struct hd_struct *part; | 743 | struct hd_struct *part; |
744 | char name_buf[BDEVNAME_SIZE]; | 744 | char name_buf[BDEVNAME_SIZE]; |
745 | char devt_buf[BDEVT_SIZE]; | 745 | char devt_buf[BDEVT_SIZE]; |
746 | char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5]; | ||
747 | 746 | ||
748 | /* | 747 | /* |
749 | * Don't show empty devices or things that have been | 748 | * Don't show empty devices or things that have been |
@@ -762,16 +761,11 @@ void __init printk_all_partitions(void) | |||
762 | while ((part = disk_part_iter_next(&piter))) { | 761 | while ((part = disk_part_iter_next(&piter))) { |
763 | bool is_part0 = part == &disk->part0; | 762 | bool is_part0 = part == &disk->part0; |
764 | 763 | ||
765 | uuid_buf[0] = '\0'; | ||
766 | if (part->info) | ||
767 | snprintf(uuid_buf, sizeof(uuid_buf), "%pU", | ||
768 | part->info->uuid); | ||
769 | |||
770 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | 764 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", |
771 | bdevt_str(part_devt(part), devt_buf), | 765 | bdevt_str(part_devt(part), devt_buf), |
772 | (unsigned long long)part_nr_sects_read(part) >> 1 | 766 | (unsigned long long)part_nr_sects_read(part) >> 1 |
773 | , disk_name(disk, part->partno, name_buf), | 767 | , disk_name(disk, part->partno, name_buf), |
774 | uuid_buf); | 768 | part->info ? part->info->uuid : ""); |
775 | if (is_part0) { | 769 | if (is_part0) { |
776 | if (disk->driverfs_dev != NULL && | 770 | if (disk->driverfs_dev != NULL && |
777 | disk->driverfs_dev->driver != NULL) | 771 | disk->driverfs_dev->driver != NULL) |
diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 6296b403c67a..b62fb88b8711 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c | |||
@@ -620,7 +620,6 @@ int efi_partition(struct parsed_partitions *state) | |||
620 | gpt_entry *ptes = NULL; | 620 | gpt_entry *ptes = NULL; |
621 | u32 i; | 621 | u32 i; |
622 | unsigned ssz = bdev_logical_block_size(state->bdev) / 512; | 622 | unsigned ssz = bdev_logical_block_size(state->bdev) / 512; |
623 | u8 unparsed_guid[37]; | ||
624 | 623 | ||
625 | if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { | 624 | if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { |
626 | kfree(gpt); | 625 | kfree(gpt); |
@@ -649,11 +648,7 @@ int efi_partition(struct parsed_partitions *state) | |||
649 | state->parts[i + 1].flags = ADDPART_FLAG_RAID; | 648 | state->parts[i + 1].flags = ADDPART_FLAG_RAID; |
650 | 649 | ||
651 | info = &state->parts[i + 1].info; | 650 | info = &state->parts[i + 1].info; |
652 | /* Instead of doing a manual swap to big endian, reuse the | 651 | efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); |
653 | * common ASCII hex format as the interim. | ||
654 | */ | ||
655 | efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); | ||
656 | part_pack_uuid(unparsed_guid, info->uuid); | ||
657 | 652 | ||
658 | /* Naively convert UTF16-LE to 7 bits. */ | 653 | /* Naively convert UTF16-LE to 7 bits. */ |
659 | label_max = min(sizeof(info->volname) - 1, | 654 | label_max = min(sizeof(info->volname) - 1, |
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 5f79a6677c69..8752a5d26565 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c | |||
@@ -94,6 +94,17 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) | |||
94 | return ret; | 94 | return ret; |
95 | } | 95 | } |
96 | 96 | ||
97 | static void set_info(struct parsed_partitions *state, int slot, | ||
98 | u32 disksig) | ||
99 | { | ||
100 | struct partition_meta_info *info = &state->parts[slot].info; | ||
101 | |||
102 | snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, | ||
103 | slot); | ||
104 | info->volname[0] = 0; | ||
105 | state->parts[slot].has_info = true; | ||
106 | } | ||
107 | |||
97 | /* | 108 | /* |
98 | * Create devices for each logical partition in an extended partition. | 109 | * Create devices for each logical partition in an extended partition. |
99 | * The logical partitions form a linked list, with each entry being | 110 | * The logical partitions form a linked list, with each entry being |
@@ -106,7 +117,8 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) | |||
106 | */ | 117 | */ |
107 | 118 | ||
108 | static void parse_extended(struct parsed_partitions *state, | 119 | static void parse_extended(struct parsed_partitions *state, |
109 | sector_t first_sector, sector_t first_size) | 120 | sector_t first_sector, sector_t first_size, |
121 | u32 disksig) | ||
110 | { | 122 | { |
111 | struct partition *p; | 123 | struct partition *p; |
112 | Sector sect; | 124 | Sector sect; |
@@ -166,6 +178,7 @@ static void parse_extended(struct parsed_partitions *state, | |||
166 | } | 178 | } |
167 | 179 | ||
168 | put_partition(state, state->next, next, size); | 180 | put_partition(state, state->next, next, size); |
181 | set_info(state, state->next, disksig); | ||
169 | if (SYS_IND(p) == LINUX_RAID_PARTITION) | 182 | if (SYS_IND(p) == LINUX_RAID_PARTITION) |
170 | state->parts[state->next].flags = ADDPART_FLAG_RAID; | 183 | state->parts[state->next].flags = ADDPART_FLAG_RAID; |
171 | loopct = 0; | 184 | loopct = 0; |
@@ -437,6 +450,7 @@ int msdos_partition(struct parsed_partitions *state) | |||
437 | struct partition *p; | 450 | struct partition *p; |
438 | struct fat_boot_sector *fb; | 451 | struct fat_boot_sector *fb; |
439 | int slot; | 452 | int slot; |
453 | u32 disksig; | ||
440 | 454 | ||
441 | data = read_part_sector(state, 0, &sect); | 455 | data = read_part_sector(state, 0, &sect); |
442 | if (!data) | 456 | if (!data) |
@@ -491,6 +505,8 @@ int msdos_partition(struct parsed_partitions *state) | |||
491 | #endif | 505 | #endif |
492 | p = (struct partition *) (data + 0x1be); | 506 | p = (struct partition *) (data + 0x1be); |
493 | 507 | ||
508 | disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); | ||
509 | |||
494 | /* | 510 | /* |
495 | * Look for partitions in two passes: | 511 | * Look for partitions in two passes: |
496 | * First find the primary and DOS-type extended partitions. | 512 | * First find the primary and DOS-type extended partitions. |
@@ -515,11 +531,12 @@ int msdos_partition(struct parsed_partitions *state) | |||
515 | put_partition(state, slot, start, n); | 531 | put_partition(state, slot, start, n); |
516 | 532 | ||
517 | strlcat(state->pp_buf, " <", PAGE_SIZE); | 533 | strlcat(state->pp_buf, " <", PAGE_SIZE); |
518 | parse_extended(state, start, size); | 534 | parse_extended(state, start, size, disksig); |
519 | strlcat(state->pp_buf, " >", PAGE_SIZE); | 535 | strlcat(state->pp_buf, " >", PAGE_SIZE); |
520 | continue; | 536 | continue; |
521 | } | 537 | } |
522 | put_partition(state, slot, start, size); | 538 | put_partition(state, slot, start, size); |
539 | set_info(state, slot, disksig); | ||
523 | if (SYS_IND(p) == LINUX_RAID_PARTITION) | 540 | if (SYS_IND(p) == LINUX_RAID_PARTITION) |
524 | state->parts[slot].flags = ADDPART_FLAG_RAID; | 541 | state->parts[slot].flags = ADDPART_FLAG_RAID; |
525 | if (SYS_IND(p) == DM6_PARTITION) | 542 | if (SYS_IND(p) == DM6_PARTITION) |
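
The msdos.c hunks above are where the new MBR partition UUIDs come from: set_info() combines the 32-bit NT disk signature read from offset 0x1b8 of the MBR with the 1-based slot number using the format "%08x-%02x". A small standalone illustration with a made-up signature (0x0002fd74 and the resulting string are hypothetical example values, not taken from the patch):

```c
#include <stdio.h>

int main(void)
{
	unsigned int disksig = 0x0002fd74;	/* hypothetical MBR disk signature (offset 0x1b8) */
	int slot = 1;				/* first primary partition */
	char uuid[16];

	/* same format string as set_info() in block/partitions/msdos.c */
	snprintf(uuid, sizeof(uuid), "%08x-%02x", disksig, slot);
	printf("root=PARTUUID=%s\n", uuid);	/* prints: root=PARTUUID=0002fd74-01 */
	return 0;
}
```

Together with "init: reduce PARTUUID min length to 1" and "block: store partition_meta_info.uuid as a string" from the shortlog, such a string should be usable as a root=PARTUUID= identifier on MBR disks, not only on GPT.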
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ca83f96756ad..6526157edafc 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c | |||
@@ -41,8 +41,9 @@ | |||
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
43 | #include <linux/mutex.h> | 43 | #include <linux/mutex.h> |
44 | #include <linux/bitmap.h> | ||
45 | #include <linux/io.h> | ||
44 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
45 | #include <asm/io.h> | ||
46 | 47 | ||
47 | #include <linux/dma-mapping.h> | 48 | #include <linux/dma-mapping.h> |
48 | #include <linux/blkdev.h> | 49 | #include <linux/blkdev.h> |
@@ -978,8 +979,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h) | |||
978 | i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); | 979 | i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); |
979 | if (i == h->nr_cmds) | 980 | if (i == h->nr_cmds) |
980 | return NULL; | 981 | return NULL; |
981 | } while (test_and_set_bit(i & (BITS_PER_LONG - 1), | 982 | } while (test_and_set_bit(i, h->cmd_pool_bits) != 0); |
982 | h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0); | ||
983 | c = h->cmd_pool + i; | 983 | c = h->cmd_pool + i; |
984 | memset(c, 0, sizeof(CommandList_struct)); | 984 | memset(c, 0, sizeof(CommandList_struct)); |
985 | cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); | 985 | cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); |
@@ -1046,8 +1046,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c) | |||
1046 | int i; | 1046 | int i; |
1047 | 1047 | ||
1048 | i = c - h->cmd_pool; | 1048 | i = c - h->cmd_pool; |
1049 | clear_bit(i & (BITS_PER_LONG - 1), | 1049 | clear_bit(i, h->cmd_pool_bits); |
1050 | h->cmd_pool_bits + (i / BITS_PER_LONG)); | ||
1051 | h->nr_frees++; | 1050 | h->nr_frees++; |
1052 | } | 1051 | } |
1053 | 1052 | ||
@@ -4268,10 +4267,7 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h) | |||
4268 | 4267 | ||
4269 | static inline bool CISS_signature_present(ctlr_info_t *h) | 4268 | static inline bool CISS_signature_present(ctlr_info_t *h) |
4270 | { | 4269 | { |
4271 | if ((readb(&h->cfgtable->Signature[0]) != 'C') || | 4270 | if (!check_signature(h->cfgtable->Signature, "CISS", 4)) { |
4272 | (readb(&h->cfgtable->Signature[1]) != 'I') || | ||
4273 | (readb(&h->cfgtable->Signature[2]) != 'S') || | ||
4274 | (readb(&h->cfgtable->Signature[3]) != 'S')) { | ||
4275 | dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); | 4271 | dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); |
4276 | return false; | 4272 | return false; |
4277 | } | 4273 | } |
@@ -4812,8 +4808,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) | |||
4812 | 4808 | ||
4813 | static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) | 4809 | static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) |
4814 | { | 4810 | { |
4815 | h->cmd_pool_bits = kmalloc( | 4811 | h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) * |
4816 | DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) * | ||
4817 | sizeof(unsigned long), GFP_KERNEL); | 4812 | sizeof(unsigned long), GFP_KERNEL); |
4818 | h->cmd_pool = pci_alloc_consistent(h->pdev, | 4813 | h->cmd_pool = pci_alloc_consistent(h->pdev, |
4819 | h->nr_cmds * sizeof(CommandList_struct), | 4814 | h->nr_cmds * sizeof(CommandList_struct), |
@@ -5068,9 +5063,7 @@ reinit_after_soft_reset: | |||
5068 | pci_set_drvdata(pdev, h); | 5063 | pci_set_drvdata(pdev, h); |
5069 | /* command and error info recs zeroed out before | 5064 | /* command and error info recs zeroed out before |
5070 | they are used */ | 5065 | they are used */ |
5071 | memset(h->cmd_pool_bits, 0, | 5066 | bitmap_zero(h->cmd_pool_bits, h->nr_cmds); |
5072 | DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) | ||
5073 | * sizeof(unsigned long)); | ||
5074 | 5067 | ||
5075 | h->num_luns = 0; | 5068 | h->num_luns = 0; |
5076 | h->highest_lun = -1; | 5069 | h->highest_lun = -1; |
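
The cciss cleanups above rely on the kernel bitops/bitmap helpers taking a plain bit index and doing the word/offset split internally, which is what makes the old open-coded `i & (BITS_PER_LONG - 1)` / `i / BITS_PER_LONG` arithmetic redundant. A userspace sketch of that equivalence — BITS_PER_LONG and the two helpers are re-implemented here purely for illustration; in the kernel they come from <linux/bitops.h> and <linux/bitmap.h>:

```c
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Minimal stand-ins for the kernel helpers (non-atomic, illustration only). */
static void set_bit(unsigned int nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit(unsigned int nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	unsigned long bits[BITS_TO_LONGS(200)] = { 0 };	/* like h->cmd_pool_bits for nr_cmds = 200 */
	unsigned int i = 130;				/* some command index */

	/* The old open-coded form (i & (BITS_PER_LONG - 1), addr + i / BITS_PER_LONG)
	 * and the plain bit index i address the same bit, because the helper
	 * performs the identical word/offset split itself. */
	set_bit(i % BITS_PER_LONG, bits + i / BITS_PER_LONG);
	assert(test_bit(i, bits));
	printf("bit %u is set\n", i);
	return 0;
}
```

BITS_TO_LONGS(nbits) and bitmap_zero(bits, nbits) in the diff are the matching allocation and initialization helpers for the same layout.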
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig index df0983787390..7845bd6ee414 100644 --- a/drivers/block/drbd/Kconfig +++ b/drivers/block/drbd/Kconfig | |||
@@ -2,13 +2,14 @@ | |||
2 | # DRBD device driver configuration | 2 | # DRBD device driver configuration |
3 | # | 3 | # |
4 | 4 | ||
5 | comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" | 5 | comment "DRBD disabled because PROC_FS or INET not selected" |
6 | depends on PROC_FS='n' || INET='n' || CONNECTOR='n' | 6 | depends on PROC_FS='n' || INET='n' |
7 | 7 | ||
8 | config BLK_DEV_DRBD | 8 | config BLK_DEV_DRBD |
9 | tristate "DRBD Distributed Replicated Block Device support" | 9 | tristate "DRBD Distributed Replicated Block Device support" |
10 | depends on PROC_FS && INET && CONNECTOR | 10 | depends on PROC_FS && INET |
11 | select LRU_CACHE | 11 | select LRU_CACHE |
12 | select LIBCRC32C | ||
12 | default n | 13 | default n |
13 | help | 14 | help |
14 | 15 | ||
@@ -58,7 +59,8 @@ config DRBD_FAULT_INJECTION | |||
58 | 32 data read | 59 | 32 data read |
59 | 64 read ahead | 60 | 64 read ahead |
60 | 128 kmalloc of bitmap | 61 | 128 kmalloc of bitmap |
61 | 256 allocation of EE (epoch_entries) | 62 | 256 allocation of peer_requests |
63 | 512 insert data corruption on receiving side | ||
62 | 64 | ||
63 | fault_devs: bitmask of minor numbers | 65 | fault_devs: bitmask of minor numbers |
64 | fault_rate: frequency in percent | 66 | fault_rate: frequency in percent |
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 0d3f337ff5ff..8b450338075e 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile | |||
@@ -1,5 +1,7 @@ | |||
1 | drbd-y := drbd_bitmap.o drbd_proc.o | 1 | drbd-y := drbd_bitmap.o drbd_proc.o |
2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o | 2 | drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o |
3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o | 3 | drbd-y += drbd_main.o drbd_strings.o drbd_nl.o |
4 | drbd-y += drbd_interval.o drbd_state.o | ||
5 | drbd-y += drbd_nla.o | ||
4 | 6 | ||
5 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o | 7 | obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 3fbef018ce55..92510f8ad013 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -24,21 +24,73 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/crc32c.h> | ||
27 | #include <linux/drbd.h> | 28 | #include <linux/drbd.h> |
29 | #include <linux/drbd_limits.h> | ||
30 | #include <linux/dynamic_debug.h> | ||
28 | #include "drbd_int.h" | 31 | #include "drbd_int.h" |
29 | #include "drbd_wrappers.h" | 32 | #include "drbd_wrappers.h" |
30 | 33 | ||
31 | /* We maintain a trivial checksum in our on disk activity log. | 34 | |
32 | * With that we can ensure correct operation even when the storage | 35 | enum al_transaction_types { |
33 | * device might do a partial (last) sector write while losing power. | 36 | AL_TR_UPDATE = 0, |
34 | */ | 37 | AL_TR_INITIALIZED = 0xffff |
35 | struct __packed al_transaction { | 38 | }; |
36 | u32 magic; | 39 | /* all fields on disc in big endian */ |
37 | u32 tr_number; | 40 | struct __packed al_transaction_on_disk { |
38 | struct __packed { | 41 | /* don't we all like magic */ |
39 | u32 pos; | 42 | __be32 magic; |
40 | u32 extent; } updates[1 + AL_EXTENTS_PT]; | 43 | |
41 | u32 xor_sum; | 44 | /* to identify the most recent transaction block |
45 | * in the on disk ring buffer */ | ||
46 | __be32 tr_number; | ||
47 | |||
48 | /* checksum on the full 4k block, with this field set to 0. */ | ||
49 | __be32 crc32c; | ||
50 | |||
51 | /* type of transaction, special transaction types like: | ||
52 | * purge-all, set-all-idle, set-all-active, ... to-be-defined | ||
53 | * see also enum al_transaction_types */ | ||
54 | __be16 transaction_type; | ||
55 | |||
56 | /* we currently allow only a few thousand extents, | ||
57 | * so 16bit will be enough for the slot number. */ | ||
58 | |||
59 | /* how many updates in this transaction */ | ||
60 | __be16 n_updates; | ||
61 | |||
62 | /* maximum slot number, "al-extents" in drbd.conf speak. | ||
63 | * Having this in each transaction should make reconfiguration | ||
64 | * of that parameter easier. */ | ||
65 | __be16 context_size; | ||
66 | |||
67 | /* slot number the context starts with */ | ||
68 | __be16 context_start_slot_nr; | ||
69 | |||
70 | /* Some reserved bytes. Expected usage is a 64bit counter of | ||
71 | * sectors-written since device creation, and other data generation tag | ||
72 | * supporting usage */ | ||
73 | __be32 __reserved[4]; | ||
74 | |||
75 | /* --- 36 byte used --- */ | ||
76 | |||
77 | /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes | ||
78 | * in one transaction, then use the remaining byte in the 4k block for | ||
79 | * context information. "Flexible" number of updates per transaction | ||
80 | * does not help, as we have to account for the case when all update | ||
81 | * slots are used anyways, so it would only complicate code without | ||
82 | * additional benefit. | ||
83 | */ | ||
84 | __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; | ||
85 | |||
86 | /* but the extent number is 32bit, which at an extent size of 4 MiB | ||
87 | * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ | ||
88 | __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; | ||
89 | |||
90 | /* --- 420 bytes used (36 + 64*6) --- */ | ||
91 | |||
92 | /* 4096 - 420 = 3676 = 919 * 4 */ | ||
93 | __be32 context[AL_CONTEXT_PER_TRANSACTION]; | ||
42 | }; | 94 | }; |
43 | 95 | ||
44 | struct update_odbm_work { | 96 | struct update_odbm_work { |
@@ -48,22 +100,11 @@ struct update_odbm_work { | |||
48 | 100 | ||
49 | struct update_al_work { | 101 | struct update_al_work { |
50 | struct drbd_work w; | 102 | struct drbd_work w; |
51 | struct lc_element *al_ext; | ||
52 | struct completion event; | 103 | struct completion event; |
53 | unsigned int enr; | 104 | int err; |
54 | /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ | ||
55 | unsigned int old_enr; | ||
56 | }; | ||
57 | |||
58 | struct drbd_atodb_wait { | ||
59 | atomic_t count; | ||
60 | struct completion io_done; | ||
61 | struct drbd_conf *mdev; | ||
62 | int error; | ||
63 | }; | 105 | }; |
64 | 106 | ||
65 | 107 | static int al_write_transaction(struct drbd_conf *mdev); | |
66 | int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); | ||
67 | 108 | ||
68 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
69 | { | 110 | { |
@@ -82,22 +123,24 @@ void drbd_md_put_buffer(struct drbd_conf *mdev) | |||
82 | wake_up(&mdev->misc_wait); | 123 | wake_up(&mdev->misc_wait); |
83 | } | 124 | } |
84 | 125 | ||
85 | static bool md_io_allowed(struct drbd_conf *mdev) | 126 | void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
86 | { | ||
87 | enum drbd_disk_state ds = mdev->state.disk; | ||
88 | return ds >= D_NEGOTIATING || ds == D_ATTACHING; | ||
89 | } | ||
90 | |||
91 | void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | ||
92 | unsigned int *done) | 127 | unsigned int *done) |
93 | { | 128 | { |
94 | long dt = bdev->dc.disk_timeout * HZ / 10; | 129 | long dt; |
130 | |||
131 | rcu_read_lock(); | ||
132 | dt = rcu_dereference(bdev->disk_conf)->disk_timeout; | ||
133 | rcu_read_unlock(); | ||
134 | dt = dt * HZ / 10; | ||
95 | if (dt == 0) | 135 | if (dt == 0) |
96 | dt = MAX_SCHEDULE_TIMEOUT; | 136 | dt = MAX_SCHEDULE_TIMEOUT; |
97 | 137 | ||
98 | dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); | 138 | dt = wait_event_timeout(mdev->misc_wait, |
99 | if (dt == 0) | 139 | *done || test_bit(FORCE_DETACH, &mdev->flags), dt); |
140 | if (dt == 0) { | ||
100 | dev_err(DEV, "meta-data IO operation timed out\n"); | 141 | dev_err(DEV, "meta-data IO operation timed out\n"); |
142 | drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); | ||
143 | } | ||
101 | } | 144 | } |
102 | 145 | ||
103 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | 146 | static int _drbd_md_sync_page_io(struct drbd_conf *mdev, |
@@ -106,7 +149,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
106 | int rw, int size) | 149 | int rw, int size) |
107 | { | 150 | { |
108 | struct bio *bio; | 151 | struct bio *bio; |
109 | int ok; | 152 | int err; |
110 | 153 | ||
111 | mdev->md_io.done = 0; | 154 | mdev->md_io.done = 0; |
112 | mdev->md_io.error = -ENODEV; | 155 | mdev->md_io.error = -ENODEV; |
@@ -118,8 +161,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
118 | bio = bio_alloc_drbd(GFP_NOIO); | 161 | bio = bio_alloc_drbd(GFP_NOIO); |
119 | bio->bi_bdev = bdev->md_bdev; | 162 | bio->bi_bdev = bdev->md_bdev; |
120 | bio->bi_sector = sector; | 163 | bio->bi_sector = sector; |
121 | ok = (bio_add_page(bio, page, size, 0) == size); | 164 | err = -EIO; |
122 | if (!ok) | 165 | if (bio_add_page(bio, page, size, 0) != size) |
123 | goto out; | 166 | goto out; |
124 | bio->bi_private = &mdev->md_io; | 167 | bio->bi_private = &mdev->md_io; |
125 | bio->bi_end_io = drbd_md_io_complete; | 168 | bio->bi_end_io = drbd_md_io_complete; |
@@ -127,7 +170,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
127 | 170 | ||
128 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ |
129 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
130 | ok = 0; | 173 | err = -ENODEV; |
131 | goto out; | 174 | goto out; |
132 | } | 175 | } |
133 | 176 | ||
@@ -137,86 +180,47 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
137 | bio_endio(bio, -EIO); | 180 | bio_endio(bio, -EIO); |
138 | else | 181 | else |
139 | submit_bio(rw, bio); | 182 | submit_bio(rw, bio); |
140 | wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); | 183 | wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); |
141 | ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; | 184 | if (bio_flagged(bio, BIO_UPTODATE)) |
185 | err = mdev->md_io.error; | ||
142 | 186 | ||
143 | out: | 187 | out: |
144 | bio_put(bio); | 188 | bio_put(bio); |
145 | return ok; | 189 | return err; |
146 | } | 190 | } |
147 | 191 | ||
148 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 192 | int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
149 | sector_t sector, int rw) | 193 | sector_t sector, int rw) |
150 | { | 194 | { |
151 | int logical_block_size, mask, ok; | 195 | int err; |
152 | int offset = 0; | ||
153 | struct page *iop = mdev->md_io_page; | 196 | struct page *iop = mdev->md_io_page; |
154 | 197 | ||
155 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); | 198 | D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); |
156 | 199 | ||
157 | BUG_ON(!bdev->md_bdev); | 200 | BUG_ON(!bdev->md_bdev); |
158 | 201 | ||
159 | logical_block_size = bdev_logical_block_size(bdev->md_bdev); | 202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", |
160 | if (logical_block_size == 0) | 203 | current->comm, current->pid, __func__, |
161 | logical_block_size = MD_SECTOR_SIZE; | 204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
162 | |||
163 | /* in case logical_block_size != 512 [ s390 only? ] */ | ||
164 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
165 | mask = (logical_block_size / MD_SECTOR_SIZE) - 1; | ||
166 | D_ASSERT(mask == 1 || mask == 3 || mask == 7); | ||
167 | D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); | ||
168 | offset = sector & mask; | ||
169 | sector = sector & ~mask; | ||
170 | iop = mdev->md_io_tmpp; | ||
171 | |||
172 | if (rw & WRITE) { | ||
173 | /* these are GFP_KERNEL pages, pre-allocated | ||
174 | * on device initialization */ | ||
175 | void *p = page_address(mdev->md_io_page); | ||
176 | void *hp = page_address(mdev->md_io_tmpp); | ||
177 | |||
178 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, | ||
179 | READ, logical_block_size); | ||
180 | |||
181 | if (unlikely(!ok)) { | ||
182 | dev_err(DEV, "drbd_md_sync_page_io(,%llus," | ||
183 | "READ [logical_block_size!=512]) failed!\n", | ||
184 | (unsigned long long)sector); | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); | ||
189 | } | ||
190 | } | ||
191 | 205 | ||
192 | if (sector < drbd_md_first_sector(bdev) || | 206 | if (sector < drbd_md_first_sector(bdev) || |
193 | sector > drbd_md_last_sector(bdev)) | 207 | sector + 7 > drbd_md_last_sector(bdev)) |
194 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", | 208 | dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", |
195 | current->comm, current->pid, __func__, | 209 | current->comm, current->pid, __func__, |
196 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
197 | 211 | ||
198 | ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); | 212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); |
199 | if (unlikely(!ok)) { | 213 | if (err) { |
200 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", | 214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
201 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
202 | return 0; | ||
203 | } | ||
204 | |||
205 | if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { | ||
206 | void *p = page_address(mdev->md_io_page); | ||
207 | void *hp = page_address(mdev->md_io_tmpp); | ||
208 | |||
209 | memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); | ||
210 | } | 216 | } |
211 | 217 | return err; | |
212 | return ok; | ||
213 | } | 218 | } |
214 | 219 | ||
215 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) |
216 | { | 221 | { |
217 | struct lc_element *al_ext; | 222 | struct lc_element *al_ext; |
218 | struct lc_element *tmp; | 223 | struct lc_element *tmp; |
219 | unsigned long al_flags = 0; | ||
220 | int wake; | 224 | int wake; |
221 | 225 | ||
222 | spin_lock_irq(&mdev->al_lock); | 226 | spin_lock_irq(&mdev->al_lock); |
@@ -231,76 +235,92 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | |||
231 | return NULL; | 235 | return NULL; |
232 | } | 236 | } |
233 | } | 237 | } |
234 | al_ext = lc_get(mdev->act_log, enr); | 238 | al_ext = lc_get(mdev->act_log, enr); |
235 | al_flags = mdev->act_log->flags; | ||
236 | spin_unlock_irq(&mdev->al_lock); | 239 | spin_unlock_irq(&mdev->al_lock); |
237 | |||
238 | /* | ||
239 | if (!al_ext) { | ||
240 | if (al_flags & LC_STARVING) | ||
241 | dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); | ||
242 | if (al_flags & LC_DIRTY) | ||
243 | dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); | ||
244 | } | ||
245 | */ | ||
246 | |||
247 | return al_ext; | 240 | return al_ext; |
248 | } | 241 | } |
249 | 242 | ||
250 | void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) | 243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) |
251 | { | 244 | { |
252 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 245 | /* for bios crossing activity log extent boundaries, |
253 | struct lc_element *al_ext; | 246 | * we may need to activate two extents in one go */ |
254 | struct update_al_work al_work; | 247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
249 | unsigned enr; | ||
250 | bool locked = false; | ||
255 | 251 | ||
252 | |||
253 | D_ASSERT(first <= last); | ||
256 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
257 | 255 | ||
258 | wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); | 256 | for (enr = first; enr <= last; enr++) |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | ||
258 | |||
259 | /* Serialize multiple transactions. | ||
260 | * This uses test_and_set_bit, memory barrier is implicit. | ||
261 | */ | ||
262 | wait_event(mdev->al_wait, | ||
263 | mdev->act_log->pending_changes == 0 || | ||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | ||
259 | 265 | ||
260 | if (al_ext->lc_number != enr) { | 266 | if (locked) { |
261 | /* drbd_al_write_transaction(mdev,al_ext,enr); | 267 | /* drbd_al_write_transaction(mdev,al_ext,enr); |
262 | * recurses into generic_make_request(), which | 268 | * recurses into generic_make_request(), which |
263 | * disallows recursion, bios being serialized on the | 269 | * disallows recursion, bios being serialized on the |
264 | * current->bio_tail list now. | 270 | * current->bio_tail list now. |
265 | * we have to delegate updates to the activity log | 271 | * we have to delegate updates to the activity log |
266 | * to the worker thread. */ | 272 | * to the worker thread. */ |
267 | init_completion(&al_work.event); | 273 | |
268 | al_work.al_ext = al_ext; | 274 | /* Double check: it may have been committed by someone else, |
269 | al_work.enr = enr; | 275 | * while we have been waiting for the lock. */ |
270 | al_work.old_enr = al_ext->lc_number; | 276 | if (mdev->act_log->pending_changes) { |
271 | al_work.w.cb = w_al_write_transaction; | 277 | bool write_al_updates; |
272 | drbd_queue_work_front(&mdev->data.work, &al_work.w); | 278 | |
273 | wait_for_completion(&al_work.event); | 279 | rcu_read_lock(); |
274 | 280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | |
275 | mdev->al_writ_cnt++; | 281 | rcu_read_unlock(); |
276 | 282 | ||
277 | spin_lock_irq(&mdev->al_lock); | 283 | if (write_al_updates) { |
278 | lc_changed(mdev->act_log, al_ext); | 284 | al_write_transaction(mdev); |
279 | spin_unlock_irq(&mdev->al_lock); | 285 | mdev->al_writ_cnt++; |
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | ||
289 | /* FIXME | ||
290 | if (err) | ||
291 | we need an "lc_cancel" here; | ||
292 | */ | ||
293 | lc_committed(mdev->act_log); | ||
294 | spin_unlock_irq(&mdev->al_lock); | ||
295 | } | ||
296 | lc_unlock(mdev->act_log); | ||
280 | wake_up(&mdev->al_wait); | 297 | wake_up(&mdev->al_wait); |
281 | } | 298 | } |
282 | } | 299 | } |
283 | 300 | ||
284 | void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) | 301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
285 | { | 302 | { |
286 | unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); | 303 | /* for bios crossing activity log extent boundaries, |
304 | * we may need to activate two extents in one go */ | ||
305 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
306 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
307 | unsigned enr; | ||
287 | struct lc_element *extent; | 308 | struct lc_element *extent; |
288 | unsigned long flags; | 309 | unsigned long flags; |
289 | 310 | ||
311 | D_ASSERT(first <= last); | ||
290 | spin_lock_irqsave(&mdev->al_lock, flags); | 312 | spin_lock_irqsave(&mdev->al_lock, flags); |
291 | 313 | ||
292 | extent = lc_find(mdev->act_log, enr); | 314 | for (enr = first; enr <= last; enr++) { |
293 | 315 | extent = lc_find(mdev->act_log, enr); | |
294 | if (!extent) { | 316 | if (!extent) { |
295 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 317 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); |
296 | dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); | 318 | continue; |
297 | return; | 319 | } |
320 | lc_put(mdev->act_log, extent); | ||
298 | } | 321 | } |
299 | |||
300 | if (lc_put(mdev->act_log, extent) == 0) | ||
301 | wake_up(&mdev->al_wait); | ||
302 | |||
303 | spin_unlock_irqrestore(&mdev->al_lock, flags); | 322 | spin_unlock_irqrestore(&mdev->al_lock, flags); |
323 | wake_up(&mdev->al_wait); | ||
304 | } | 324 | } |
305 | 325 | ||
306 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) | 326 | #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) |
@@ -326,296 +346,148 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
326 | return rs_enr >> | 346 | return rs_enr >> |
327 | /* bit to page */ | 347 | /* bit to page */ |
328 | ((PAGE_SHIFT + 3) - | 348 | ((PAGE_SHIFT + 3) - |
329 | /* al extent number to bit */ | 349 | /* resync extent number to bit */ |
330 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
331 | } | 351 | } |
332 | 352 | ||
333 | int | 353 | static int |
334 | w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 354 | _al_write_transaction(struct drbd_conf *mdev) |
335 | { | 355 | { |
336 | struct update_al_work *aw = container_of(w, struct update_al_work, w); | 356 | struct al_transaction_on_disk *buffer; |
337 | struct lc_element *updated = aw->al_ext; | 357 | struct lc_element *e; |
338 | const unsigned int new_enr = aw->enr; | ||
339 | const unsigned int evicted = aw->old_enr; | ||
340 | struct al_transaction *buffer; | ||
341 | sector_t sector; | 358 | sector_t sector; |
342 | int i, n, mx; | 359 | int i, mx; |
343 | unsigned int extent_nr; | 360 | unsigned extent_nr; |
344 | u32 xor_sum = 0; | 361 | unsigned crc = 0; |
362 | int err = 0; | ||
345 | 363 | ||
346 | if (!get_ldev(mdev)) { | 364 | if (!get_ldev(mdev)) { |
347 | dev_err(DEV, | 365 | dev_err(DEV, "disk is %s, cannot start al transaction\n", |
348 | "disk is %s, cannot start al transaction (-%d +%d)\n", | 366 | drbd_disk_str(mdev->state.disk)); |
349 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 367 | return -EIO; |
350 | complete(&((struct update_al_work *)w)->event); | ||
351 | return 1; | ||
352 | } | 368 | } |
353 | /* do we have to do a bitmap write, first? | ||
354 | * TODO reduce maximum latency: | ||
355 | * submit both bios, then wait for both, | ||
356 | * instead of doing two synchronous sector writes. | ||
357 | * For now, we must not write the transaction, | ||
358 | * if we cannot write out the bitmap of the evicted extent. */ | ||
359 | if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) | ||
360 | drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted)); | ||
361 | 369 | ||
362 | /* The bitmap write may have failed, causing a state change. */ | 370 | /* The bitmap write may have failed, causing a state change. */ |
363 | if (mdev->state.disk < D_INCONSISTENT) { | 371 | if (mdev->state.disk < D_INCONSISTENT) { |
364 | dev_err(DEV, | 372 | dev_err(DEV, |
365 | "disk is %s, cannot write al transaction (-%d +%d)\n", | 373 | "disk is %s, cannot write al transaction\n", |
366 | drbd_disk_str(mdev->state.disk), evicted, new_enr); | 374 | drbd_disk_str(mdev->state.disk)); |
367 | complete(&((struct update_al_work *)w)->event); | ||
368 | put_ldev(mdev); | 375 | put_ldev(mdev); |
369 | return 1; | 376 | return -EIO; |
370 | } | 377 | } |
371 | 378 | ||
372 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ | 379 | buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ |
373 | if (!buffer) { | 380 | if (!buffer) { |
374 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); | 381 | dev_err(DEV, "disk failed while waiting for md_io buffer\n"); |
375 | complete(&((struct update_al_work *)w)->event); | ||
376 | put_ldev(mdev); | 382 | put_ldev(mdev); |
377 | return 1; | 383 | return -ENODEV; |
378 | } | 384 | } |
379 | 385 | ||
380 | buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); | 386 | memset(buffer, 0, sizeof(*buffer)); |
387 | buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); | ||
381 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); | 388 | buffer->tr_number = cpu_to_be32(mdev->al_tr_number); |
382 | 389 | ||
383 | n = lc_index_of(mdev->act_log, updated); | 390 | i = 0; |
391 | |||
392 | /* Even though no one can start to change this list | ||
393 | * once we set the LC_LOCKED -- from drbd_al_begin_io(), | ||
394 | * lc_try_lock_for_transaction() --, someone may still | ||
395 | * be in the process of changing it. */ | ||
396 | spin_lock_irq(&mdev->al_lock); | ||
397 | list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { | ||
398 | if (i == AL_UPDATES_PER_TRANSACTION) { | ||
399 | i++; | ||
400 | break; | ||
401 | } | ||
402 | buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); | ||
403 | buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); | ||
404 | if (e->lc_number != LC_FREE) | ||
405 | drbd_bm_mark_for_writeout(mdev, | ||
406 | al_extent_to_bm_page(e->lc_number)); | ||
407 | i++; | ||
408 | } | ||
409 | spin_unlock_irq(&mdev->al_lock); | ||
410 | BUG_ON(i > AL_UPDATES_PER_TRANSACTION); | ||
384 | 411 | ||
385 | buffer->updates[0].pos = cpu_to_be32(n); | 412 | buffer->n_updates = cpu_to_be16(i); |
386 | buffer->updates[0].extent = cpu_to_be32(new_enr); | 413 | for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { |
414 | buffer->update_slot_nr[i] = cpu_to_be16(-1); | ||
415 | buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); | ||
416 | } | ||
387 | 417 | ||
388 | xor_sum ^= new_enr; | 418 | buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); |
419 | buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); | ||
389 | 420 | ||
390 | mx = min_t(int, AL_EXTENTS_PT, | 421 | mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, |
391 | mdev->act_log->nr_elements - mdev->al_tr_cycle); | 422 | mdev->act_log->nr_elements - mdev->al_tr_cycle); |
392 | for (i = 0; i < mx; i++) { | 423 | for (i = 0; i < mx; i++) { |
393 | unsigned idx = mdev->al_tr_cycle + i; | 424 | unsigned idx = mdev->al_tr_cycle + i; |
394 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; | 425 | extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; |
395 | buffer->updates[i+1].pos = cpu_to_be32(idx); | 426 | buffer->context[i] = cpu_to_be32(extent_nr); |
396 | buffer->updates[i+1].extent = cpu_to_be32(extent_nr); | ||
397 | xor_sum ^= extent_nr; | ||
398 | } | ||
399 | for (; i < AL_EXTENTS_PT; i++) { | ||
400 | buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); | ||
401 | buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); | ||
402 | xor_sum ^= LC_FREE; | ||
403 | } | 427 | } |
404 | mdev->al_tr_cycle += AL_EXTENTS_PT; | 428 | for (; i < AL_CONTEXT_PER_TRANSACTION; i++) |
429 | buffer->context[i] = cpu_to_be32(LC_FREE); | ||
430 | |||
431 | mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; | ||
405 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
406 | mdev->al_tr_cycle = 0; | 433 | mdev->al_tr_cycle = 0; |
407 | 434 | ||
408 | buffer->xor_sum = cpu_to_be32(xor_sum); | ||
409 | |||
410 | sector = mdev->ldev->md.md_offset | 435 | sector = mdev->ldev->md.md_offset |
411 | + mdev->ldev->md.al_offset + mdev->al_tr_pos; | 436 | + mdev->ldev->md.al_offset |
412 | 437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | |
413 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) | ||
414 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
415 | 438 | ||
416 | if (++mdev->al_tr_pos > | 439 | crc = crc32c(0, buffer, 4096); |
417 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | 440 | buffer->crc32c = cpu_to_be32(crc); |
418 | mdev->al_tr_pos = 0; | ||
419 | 441 | ||
420 | D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); | 442 | if (drbd_bm_write_hinted(mdev)) |
421 | mdev->al_tr_number++; | 443 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | ||
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | ||
446 | err = -EIO; | ||
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
448 | } else { | ||
449 | /* advance ringbuffer position and transaction counter */ | ||
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | ||
451 | mdev->al_tr_number++; | ||
452 | } | ||
422 | 453 | ||
423 | drbd_md_put_buffer(mdev); | 454 | drbd_md_put_buffer(mdev); |
424 | |||
425 | complete(&((struct update_al_work *)w)->event); | ||
426 | put_ldev(mdev); | 455 | put_ldev(mdev); |
427 | 456 | ||
428 | return 1; | 457 | return err; |
429 | } | 458 | } |
430 | 459 | ||
431 | /** | ||
432 | * drbd_al_read_tr() - Read a single transaction from the on disk activity log | ||
433 | * @mdev: DRBD device. | ||
434 | * @bdev: Block device to read form. | ||
435 | * @b: pointer to an al_transaction. | ||
436 | * @index: On disk slot of the transaction to read. | ||
437 | * | ||
438 | * Returns -1 on IO error, 0 on checksum error and 1 upon success. | ||
439 | */ | ||
440 | static int drbd_al_read_tr(struct drbd_conf *mdev, | ||
441 | struct drbd_backing_dev *bdev, | ||
442 | struct al_transaction *b, | ||
443 | int index) | ||
444 | { | ||
445 | sector_t sector; | ||
446 | int rv, i; | ||
447 | u32 xor_sum = 0; | ||
448 | |||
449 | sector = bdev->md.md_offset + bdev->md.al_offset + index; | ||
450 | |||
451 | /* Dont process error normally, | ||
452 | * as this is done before disk is attached! */ | ||
453 | if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) | ||
454 | return -1; | ||
455 | |||
456 | rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); | ||
457 | |||
458 | for (i = 0; i < AL_EXTENTS_PT + 1; i++) | ||
459 | xor_sum ^= be32_to_cpu(b->updates[i].extent); | ||
460 | rv &= (xor_sum == be32_to_cpu(b->xor_sum)); | ||
461 | 460 | ||
462 | return rv; | 461 | static int w_al_write_transaction(struct drbd_work *w, int unused) |
463 | } | ||
464 | |||
465 | /** | ||
466 | * drbd_al_read_log() - Restores the activity log from its on disk representation. | ||
467 | * @mdev: DRBD device. | ||
468 | * @bdev: Block device to read form. | ||
469 | * | ||
470 | * Returns 1 on success, returns 0 when reading the log failed due to IO errors. | ||
471 | */ | ||
472 | int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
473 | { | 462 | { |
474 | struct al_transaction *buffer; | 463 | struct update_al_work *aw = container_of(w, struct update_al_work, w); |
475 | int i; | 464 | struct drbd_conf *mdev = w->mdev; |
476 | int rv; | 465 | int err; |
477 | int mx; | ||
478 | int active_extents = 0; | ||
479 | int transactions = 0; | ||
480 | int found_valid = 0; | ||
481 | int from = 0; | ||
482 | int to = 0; | ||
483 | u32 from_tnr = 0; | ||
484 | u32 to_tnr = 0; | ||
485 | u32 cnr; | ||
486 | |||
487 | mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); | ||
488 | |||
489 | /* lock out all other meta data io for now, | ||
490 | * and make sure the page is mapped. | ||
491 | */ | ||
492 | buffer = drbd_md_get_buffer(mdev); | ||
493 | if (!buffer) | ||
494 | return 0; | ||
495 | |||
496 | /* Find the valid transaction in the log */ | ||
497 | for (i = 0; i <= mx; i++) { | ||
498 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
499 | if (rv == 0) | ||
500 | continue; | ||
501 | if (rv == -1) { | ||
502 | drbd_md_put_buffer(mdev); | ||
503 | return 0; | ||
504 | } | ||
505 | cnr = be32_to_cpu(buffer->tr_number); | ||
506 | |||
507 | if (++found_valid == 1) { | ||
508 | from = i; | ||
509 | to = i; | ||
510 | from_tnr = cnr; | ||
511 | to_tnr = cnr; | ||
512 | continue; | ||
513 | } | ||
514 | if ((int)cnr - (int)from_tnr < 0) { | ||
515 | D_ASSERT(from_tnr - cnr + i - from == mx+1); | ||
516 | from = i; | ||
517 | from_tnr = cnr; | ||
518 | } | ||
519 | if ((int)cnr - (int)to_tnr > 0) { | ||
520 | D_ASSERT(cnr - to_tnr == i - to); | ||
521 | to = i; | ||
522 | to_tnr = cnr; | ||
523 | } | ||
524 | } | ||
525 | |||
526 | if (!found_valid) { | ||
527 | dev_warn(DEV, "No usable activity log found.\n"); | ||
528 | drbd_md_put_buffer(mdev); | ||
529 | return 1; | ||
530 | } | ||
531 | |||
532 | /* Read the valid transactions. | ||
533 | * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ | ||
534 | i = from; | ||
535 | while (1) { | ||
536 | int j, pos; | ||
537 | unsigned int extent_nr; | ||
538 | unsigned int trn; | ||
539 | |||
540 | rv = drbd_al_read_tr(mdev, bdev, buffer, i); | ||
541 | ERR_IF(rv == 0) goto cancel; | ||
542 | if (rv == -1) { | ||
543 | drbd_md_put_buffer(mdev); | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | trn = be32_to_cpu(buffer->tr_number); | ||
548 | |||
549 | spin_lock_irq(&mdev->al_lock); | ||
550 | |||
551 | /* This loop runs backwards because in the cyclic | ||
552 | elements there might be an old version of the | ||
553 | updated element (in slot 0). So the element in slot 0 | ||
554 | can overwrite old versions. */ | ||
555 | for (j = AL_EXTENTS_PT; j >= 0; j--) { | ||
556 | pos = be32_to_cpu(buffer->updates[j].pos); | ||
557 | extent_nr = be32_to_cpu(buffer->updates[j].extent); | ||
558 | |||
559 | if (extent_nr == LC_FREE) | ||
560 | continue; | ||
561 | |||
562 | lc_set(mdev->act_log, extent_nr, pos); | ||
563 | active_extents++; | ||
564 | } | ||
565 | spin_unlock_irq(&mdev->al_lock); | ||
566 | |||
567 | transactions++; | ||
568 | |||
569 | cancel: | ||
570 | if (i == to) | ||
571 | break; | ||
572 | i++; | ||
573 | if (i > mx) | ||
574 | i = 0; | ||
575 | } | ||
576 | |||
577 | mdev->al_tr_number = to_tnr+1; | ||
578 | mdev->al_tr_pos = to; | ||
579 | if (++mdev->al_tr_pos > | ||
580 | div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) | ||
581 | mdev->al_tr_pos = 0; | ||
582 | |||
583 | /* ok, we are done with it */ | ||
584 | drbd_md_put_buffer(mdev); | ||
585 | 466 | ||
586 | dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", | 467 | err = _al_write_transaction(mdev); |
587 | transactions, active_extents); | 468 | aw->err = err; |
469 | complete(&aw->event); | ||
588 | 470 | ||
589 | return 1; | 471 | return err != -EIO ? err : 0; |
590 | } | 472 | } |
591 | 473 | ||
592 | /** | 474 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
593 | * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents | 475 | transaction directly. Others came through generic_make_request(), |
594 | * @mdev: DRBD device. | 476 | those need to delegate it to the worker. */ |
595 | */ | 477 | static int al_write_transaction(struct drbd_conf *mdev) |
596 | void drbd_al_apply_to_bm(struct drbd_conf *mdev) | ||
597 | { | 478 | { |
598 | unsigned int enr; | 479 | struct update_al_work al_work; |
599 | unsigned long add = 0; | ||
600 | char ppb[10]; | ||
601 | int i, tmp; | ||
602 | |||
603 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
604 | 480 | ||
605 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 481 | if (current == mdev->tconn->worker.task) |
606 | enr = lc_element_by_index(mdev->act_log, i)->lc_number; | 482 | return _al_write_transaction(mdev); |
607 | if (enr == LC_FREE) | ||
608 | continue; | ||
609 | tmp = drbd_bm_ALe_set_all(mdev, enr); | ||
610 | dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr); | ||
611 | add += tmp; | ||
612 | } | ||
613 | 483 | ||
614 | lc_unlock(mdev->act_log); | 484 | init_completion(&al_work.event); |
615 | wake_up(&mdev->al_wait); | 485 | al_work.w.cb = w_al_write_transaction; |
486 | al_work.w.mdev = mdev; | ||
487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
488 | wait_for_completion(&al_work.event); | ||
616 | 489 | ||
617 | dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", | 490 | return al_work.err; |
618 | ppsize(ppb, Bit2KB(add))); | ||
619 | } | 491 | } |
620 | 492 | ||
621 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |
@@ -645,7 +517,7 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
645 | struct lc_element *al_ext; | 517 | struct lc_element *al_ext; |
646 | int i; | 518 | int i; |
647 | 519 | ||
648 | D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); | 520 | D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); |
649 | 521 | ||
650 | for (i = 0; i < mdev->act_log->nr_elements; i++) { | 522 | for (i = 0; i < mdev->act_log->nr_elements; i++) { |
651 | al_ext = lc_element_by_index(mdev->act_log, i); | 523 | al_ext = lc_element_by_index(mdev->act_log, i); |
@@ -657,15 +529,17 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
657 | wake_up(&mdev->al_wait); | 529 | wake_up(&mdev->al_wait); |
658 | } | 530 | } |
659 | 531 | ||
660 | static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 532 | static int w_update_odbm(struct drbd_work *w, int unused) |
661 | { | 533 | { |
662 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | 534 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); |
535 | struct drbd_conf *mdev = w->mdev; | ||
536 | struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; | ||
663 | 537 | ||
664 | if (!get_ldev(mdev)) { | 538 | if (!get_ldev(mdev)) { |
665 | if (__ratelimit(&drbd_ratelimit_state)) | 539 | if (__ratelimit(&drbd_ratelimit_state)) |
666 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); | 540 | dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); |
667 | kfree(udw); | 541 | kfree(udw); |
668 | return 1; | 542 | return 0; |
669 | } | 543 | } |
670 | 544 | ||
671 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); | 545 | drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); |
@@ -683,9 +557,9 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused | |||
683 | break; | 557 | break; |
684 | } | 558 | } |
685 | } | 559 | } |
686 | drbd_bcast_sync_progress(mdev); | 560 | drbd_bcast_event(mdev, &sib); |
687 | 561 | ||
688 | return 1; | 562 | return 0; |
689 | } | 563 | } |
690 | 564 | ||
691 | 565 | ||
@@ -755,7 +629,9 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
755 | } | 629 | } |
756 | ext->rs_left = rs_left; | 630 | ext->rs_left = rs_left; |
757 | ext->rs_failed = success ? 0 : count; | 631 | ext->rs_failed = success ? 0 : count; |
758 | lc_changed(mdev->resync, &ext->lce); | 632 | /* we don't keep a persistent log of the resync lru, |
633 | * we can commit any change right away. */ | ||
634 | lc_committed(mdev->resync); | ||
759 | } | 635 | } |
760 | lc_put(mdev->resync, &ext->lce); | 636 | lc_put(mdev->resync, &ext->lce); |
761 | /* no race, we are within the al_lock! */ | 637 | /* no race, we are within the al_lock! */ |
@@ -767,7 +643,8 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, | |||
767 | if (udw) { | 643 | if (udw) { |
768 | udw->enr = ext->lce.lc_number; | 644 | udw->enr = ext->lce.lc_number; |
769 | udw->w.cb = w_update_odbm; | 645 | udw->w.cb = w_update_odbm; |
770 | drbd_queue_work_front(&mdev->data.work, &udw->w); | 646 | udw->w.mdev = mdev; |
647 | drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w); | ||
771 | } else { | 648 | } else { |
772 | dev_warn(DEV, "Could not kmalloc an udw\n"); | 649 | dev_warn(DEV, "Could not kmalloc an udw\n"); |
773 | } | 650 | } |
@@ -813,16 +690,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
813 | int wake_up = 0; | 690 | int wake_up = 0; |
814 | unsigned long flags; | 691 | unsigned long flags; |
815 | 692 | ||
816 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 693 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
817 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | 694 | dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", |
818 | (unsigned long long)sector, size); | 695 | (unsigned long long)sector, size); |
819 | return; | 696 | return; |
820 | } | 697 | } |
698 | |||
699 | if (!get_ldev(mdev)) | ||
700 | return; /* no disk, no metadata, no bitmap to clear bits in */ | ||
701 | |||
821 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 702 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
822 | esector = sector + (size >> 9) - 1; | 703 | esector = sector + (size >> 9) - 1; |
823 | 704 | ||
824 | ERR_IF(sector >= nr_sectors) return; | 705 | if (!expect(sector < nr_sectors)) |
825 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 706 | goto out; |
707 | if (!expect(esector < nr_sectors)) | ||
708 | esector = nr_sectors - 1; | ||
826 | 709 | ||
827 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 710 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
828 | 711 | ||
@@ -830,7 +713,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
830 | * round up start sector, round down end sector. we make sure we only | 713 | * round up start sector, round down end sector. we make sure we only |
831 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | 714 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ |
832 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | 715 | if (unlikely(esector < BM_SECT_PER_BIT-1)) |
833 | return; | 716 | goto out; |
834 | if (unlikely(esector == (nr_sectors-1))) | 717 | if (unlikely(esector == (nr_sectors-1))) |
835 | ebnr = lbnr; | 718 | ebnr = lbnr; |
836 | else | 719 | else |
@@ -838,14 +721,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
838 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | 721 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); |
839 | 722 | ||
840 | if (sbnr > ebnr) | 723 | if (sbnr > ebnr) |
841 | return; | 724 | goto out; |
842 | 725 | ||
843 | /* | 726 | /* |
844 | * ok, (capacity & 7) != 0 sometimes, but who cares... | 727 | * ok, (capacity & 7) != 0 sometimes, but who cares... |
845 | * we count rs_{total,left} in bits, not sectors. | 728 | * we count rs_{total,left} in bits, not sectors. |
846 | */ | 729 | */ |
847 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); | 730 | count = drbd_bm_clear_bits(mdev, sbnr, ebnr); |
848 | if (count && get_ldev(mdev)) { | 731 | if (count) { |
849 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); | 732 | drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); |
850 | spin_lock_irqsave(&mdev->al_lock, flags); | 733 | spin_lock_irqsave(&mdev->al_lock, flags); |
851 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); | 734 | drbd_try_clear_on_disk_bm(mdev, sector, count, true); |
@@ -854,8 +737,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
854 | /* just wake_up unconditional now, various lc_chaged(), | 737 | /* just wake_up unconditional now, various lc_chaged(), |
855 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | 738 | * lc_put() in drbd_try_clear_on_disk_bm(). */ |
856 | wake_up = 1; | 739 | wake_up = 1; |
857 | put_ldev(mdev); | ||
858 | } | 740 | } |
741 | out: | ||
742 | put_ldev(mdev); | ||
859 | if (wake_up) | 743 | if (wake_up) |
860 | wake_up(&mdev->al_wait); | 744 | wake_up(&mdev->al_wait); |
861 | } | 745 | } |
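The hunks above restructure __drbd_set_in_sync() to take the local-disk reference once at entry (get_ldev) and to funnel every later bail-out through a single out: label that drops it, instead of wrapping only the bit-clearing in get_ldev()/put_ldev(). A condensed sketch of that shape, simplified and not the literal DRBD code:

/* Sketch: reference early, release at one exit point. get_ldev()/put_ldev(),
 * IS_ALIGNED() and DRBD_MAX_BIO_SIZE are the helpers used in the hunk above;
 * the body is heavily condensed. */
static void set_in_sync_sketch(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE)
		return;		/* reject nonsense sizes before taking a reference */

	if (!get_ldev(mdev))
		return;		/* no disk, no metadata, no bitmap to clear bits in */

	if (sector >= drbd_get_capacity(mdev->this_bdev))
		goto out;	/* all later bail-outs funnel through out: */

	/* ... clear the affected bitmap bits, advance resync marks ... */
out:
	put_ldev(mdev);		/* single release point */
}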
@@ -871,7 +755,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
871 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | 755 | int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, |
872 | const char *file, const unsigned int line) | 756 | const char *file, const unsigned int line) |
873 | { | 757 | { |
874 | unsigned long sbnr, ebnr, lbnr, flags; | 758 | unsigned long sbnr, ebnr, flags; |
875 | sector_t esector, nr_sectors; | 759 | sector_t esector, nr_sectors; |
876 | unsigned int enr, count = 0; | 760 | unsigned int enr, count = 0; |
877 | struct lc_element *e; | 761 | struct lc_element *e; |
@@ -880,7 +764,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
880 | if (size == 0) | 764 | if (size == 0) |
881 | return 0; | 765 | return 0; |
882 | 766 | ||
883 | if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 767 | if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
884 | dev_err(DEV, "sector: %llus, size: %d\n", | 768 | dev_err(DEV, "sector: %llus, size: %d\n", |
885 | (unsigned long long)sector, size); | 769 | (unsigned long long)sector, size); |
886 | return 0; | 770 | return 0; |
@@ -892,12 +776,10 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, | |||
892 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 776 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
893 | esector = sector + (size >> 9) - 1; | 777 | esector = sector + (size >> 9) - 1; |
894 | 778 | ||
895 | ERR_IF(sector >= nr_sectors) | 779 | if (!expect(sector < nr_sectors)) |
896 | goto out; | 780 | goto out; |
897 | ERR_IF(esector >= nr_sectors) | 781 | if (!expect(esector < nr_sectors)) |
898 | esector = (nr_sectors-1); | 782 | esector = nr_sectors - 1; |
899 | |||
900 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
901 | 783 | ||
902 | /* we set it out of sync, | 784 | /* we set it out of sync, |
903 | * we do not need to round anything here */ | 785 | * we do not need to round anything here */ |
@@ -940,7 +822,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
940 | if (bm_ext->lce.lc_number != enr) { | 822 | if (bm_ext->lce.lc_number != enr) { |
941 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 823 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
942 | bm_ext->rs_failed = 0; | 824 | bm_ext->rs_failed = 0; |
943 | lc_changed(mdev->resync, &bm_ext->lce); | 825 | lc_committed(mdev->resync); |
944 | wakeup = 1; | 826 | wakeup = 1; |
945 | } | 827 | } |
946 | if (bm_ext->lce.refcnt == 1) | 828 | if (bm_ext->lce.refcnt == 1) |
@@ -956,7 +838,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
956 | if (rs_flags & LC_STARVING) | 838 | if (rs_flags & LC_STARVING) |
957 | dev_warn(DEV, "Have to wait for element" | 839 | dev_warn(DEV, "Have to wait for element" |
958 | " (resync LRU too small?)\n"); | 840 | " (resync LRU too small?)\n"); |
959 | BUG_ON(rs_flags & LC_DIRTY); | 841 | BUG_ON(rs_flags & LC_LOCKED); |
960 | } | 842 | } |
961 | 843 | ||
962 | return bm_ext; | 844 | return bm_ext; |
@@ -964,26 +846,12 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) | |||
964 | 846 | ||
965 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) | 847 | static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) |
966 | { | 848 | { |
967 | struct lc_element *al_ext; | 849 | int rv; |
968 | int rv = 0; | ||
969 | 850 | ||
970 | spin_lock_irq(&mdev->al_lock); | 851 | spin_lock_irq(&mdev->al_lock); |
971 | if (unlikely(enr == mdev->act_log->new_number)) | 852 | rv = lc_is_used(mdev->act_log, enr); |
972 | rv = 1; | ||
973 | else { | ||
974 | al_ext = lc_find(mdev->act_log, enr); | ||
975 | if (al_ext) { | ||
976 | if (al_ext->refcnt) | ||
977 | rv = 1; | ||
978 | } | ||
979 | } | ||
980 | spin_unlock_irq(&mdev->al_lock); | 853 | spin_unlock_irq(&mdev->al_lock); |
981 | 854 | ||
982 | /* | ||
983 | if (unlikely(rv)) { | ||
984 | dev_info(DEV, "Delaying sync read until app's write is done\n"); | ||
985 | } | ||
986 | */ | ||
987 | return rv; | 855 | return rv; |
988 | } | 856 | } |
989 | 857 | ||
@@ -1113,13 +981,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1113 | if (rs_flags & LC_STARVING) | 981 | if (rs_flags & LC_STARVING) |
1114 | dev_warn(DEV, "Have to wait for element" | 982 | dev_warn(DEV, "Have to wait for element" |
1115 | " (resync LRU too small?)\n"); | 983 | " (resync LRU too small?)\n"); |
1116 | BUG_ON(rs_flags & LC_DIRTY); | 984 | BUG_ON(rs_flags & LC_LOCKED); |
1117 | goto try_again; | 985 | goto try_again; |
1118 | } | 986 | } |
1119 | if (bm_ext->lce.lc_number != enr) { | 987 | if (bm_ext->lce.lc_number != enr) { |
1120 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); | 988 | bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); |
1121 | bm_ext->rs_failed = 0; | 989 | bm_ext->rs_failed = 0; |
1122 | lc_changed(mdev->resync, &bm_ext->lce); | 990 | lc_committed(mdev->resync); |
1123 | wake_up(&mdev->al_wait); | 991 | wake_up(&mdev->al_wait); |
1124 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); | 992 | D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); |
1125 | } | 993 | } |
@@ -1130,8 +998,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) | |||
1130 | } | 998 | } |
1131 | check_al: | 999 | check_al: |
1132 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { | 1000 | for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { |
1133 | if (unlikely(al_enr+i == mdev->act_log->new_number)) | ||
1134 | goto try_again; | ||
1135 | if (lc_is_used(mdev->act_log, al_enr+i)) | 1001 | if (lc_is_used(mdev->act_log, al_enr+i)) |
1136 | goto try_again; | 1002 | goto try_again; |
1137 | } | 1003 | } |
@@ -1266,7 +1132,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1266 | sector_t esector, nr_sectors; | 1132 | sector_t esector, nr_sectors; |
1267 | int wake_up = 0; | 1133 | int wake_up = 0; |
1268 | 1134 | ||
1269 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 1135 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1270 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | 1136 | dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", |
1271 | (unsigned long long)sector, size); | 1137 | (unsigned long long)sector, size); |
1272 | return; | 1138 | return; |
@@ -1274,8 +1140,10 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) | |||
1274 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | 1140 | nr_sectors = drbd_get_capacity(mdev->this_bdev); |
1275 | esector = sector + (size >> 9) - 1; | 1141 | esector = sector + (size >> 9) - 1; |
1276 | 1142 | ||
1277 | ERR_IF(sector >= nr_sectors) return; | 1143 | if (!expect(sector < nr_sectors)) |
1278 | ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); | 1144 | return; |
1145 | if (!expect(esector < nr_sectors)) | ||
1146 | esector = nr_sectors - 1; | ||
1279 | 1147 | ||
1280 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | 1148 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
1281 | 1149 | ||
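Throughout this file the open-coded `(size & 0x1ff) != 0` tests are replaced by `!IS_ALIGNED(size, 512)`. IS_ALIGNED() is the generic macro from <linux/kernel.h>, and for a power-of-two alignment the two spellings are equivalent:

/* IS_ALIGNED(x, a) expands to (((x) & ((typeof(x))(a) - 1)) == 0), so for
 * a = 512 it is the same test as ((x & 0x1ff) == 0), just stating the intent. */
if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE)
	return;		/* not a whole number of 512-byte sectors, or too large */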
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index d84566496746..8dc29502dc08 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) | |||
119 | if (!__ratelimit(&drbd_ratelimit_state)) | 119 | if (!__ratelimit(&drbd_ratelimit_state)) |
120 | return; | 120 | return; |
121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", | 121 | dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", |
122 | current == mdev->receiver.task ? "receiver" : | 122 | drbd_task_to_thread_name(mdev->tconn, current), |
123 | current == mdev->asender.task ? "asender" : | 123 | func, b->bm_why ?: "?", |
124 | current == mdev->worker.task ? "worker" : current->comm, | 124 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
125 | func, b->bm_why ?: "?", | ||
126 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
127 | b->bm_task == mdev->asender.task ? "asender" : | ||
128 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
129 | } | 125 | } |
130 | 126 | ||
131 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | 127 | void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) |
@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) | |||
142 | 138 | ||
143 | if (trylock_failed) { | 139 | if (trylock_failed) { |
144 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", | 140 | dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", |
145 | current == mdev->receiver.task ? "receiver" : | 141 | drbd_task_to_thread_name(mdev->tconn, current), |
146 | current == mdev->asender.task ? "asender" : | 142 | why, b->bm_why ?: "?", |
147 | current == mdev->worker.task ? "worker" : current->comm, | 143 | drbd_task_to_thread_name(mdev->tconn, b->bm_task)); |
148 | why, b->bm_why ?: "?", | ||
149 | b->bm_task == mdev->receiver.task ? "receiver" : | ||
150 | b->bm_task == mdev->asender.task ? "asender" : | ||
151 | b->bm_task == mdev->worker.task ? "worker" : "?"); | ||
152 | mutex_lock(&b->bm_change); | 144 | mutex_lock(&b->bm_change); |
153 | } | 145 | } |
154 | if (BM_LOCKED_MASK & b->bm_flags) | 146 | if (BM_LOCKED_MASK & b->bm_flags) |
@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev) | |||
196 | /* to mark for lazy writeout once syncer cleared all clearable bits, | 188 | /* to mark for lazy writeout once syncer cleared all clearable bits, |
197 | * we check if bits have been cleared since last IO. */ | 189 | * we check if bits have been cleared since last IO. */ |
198 | #define BM_PAGE_LAZY_WRITEOUT 28 | 190 | #define BM_PAGE_LAZY_WRITEOUT 28 |
191 | /* pages marked with this "HINT" will be considered for writeout | ||
192 | * on activity log transactions */ | ||
193 | #define BM_PAGE_HINT_WRITEOUT 27 | ||
199 | 194 | ||
200 | /* store_page_idx uses non-atomic assignment. It is only used directly after | 195 | /* store_page_idx uses non-atomic assignment. It is only used directly after |
201 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to | 196 | * allocating the page. All other bm_set_page_* and bm_clear_page_* need to |
@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr) | |||
227 | { | 222 | { |
228 | struct drbd_bitmap *b = mdev->bitmap; | 223 | struct drbd_bitmap *b = mdev->bitmap; |
229 | void *addr = &page_private(b->bm_pages[page_nr]); | 224 | void *addr = &page_private(b->bm_pages[page_nr]); |
230 | clear_bit(BM_PAGE_IO_LOCK, addr); | 225 | clear_bit_unlock(BM_PAGE_IO_LOCK, addr); |
231 | smp_mb__after_clear_bit(); | ||
232 | wake_up(&mdev->bitmap->bm_io_wait); | 226 | wake_up(&mdev->bitmap->bm_io_wait); |
233 | } | 227 | } |
234 | 228 | ||
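clear_bit_unlock() is the unlock-flavoured clear_bit() that carries its own memory-barrier semantics, so the explicit smp_mb__after_clear_bit() goes away. The change is a one-for-one substitution:

/* Before: open-coded unlock of the per-page IO lock bit */
clear_bit(BM_PAGE_IO_LOCK, addr);
smp_mb__after_clear_bit();	/* order the clear against the waiter's test */
wake_up(&mdev->bitmap->bm_io_wait);

/* After: the barrier is part of clear_bit_unlock() itself */
clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);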
@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page) | |||
246 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); | 240 | set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); |
247 | } | 241 | } |
248 | 242 | ||
243 | /** | ||
244 | * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout | ||
245 | * @mdev: DRBD device. | ||
246 | * @page_nr: the bitmap page to mark with the "hint" flag | ||
247 | * | ||
248 | * From within an activity log transaction, we mark a few pages with these | ||
249 | * hints, then call drbd_bm_write_hinted(), which will only write out changed | ||
250 | * pages which are flagged with this mark. | ||
251 | */ | ||
252 | void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr) | ||
253 | { | ||
254 | struct page *page; | ||
255 | if (page_nr >= mdev->bitmap->bm_number_of_pages) { | ||
256 | dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n", | ||
257 | page_nr, (int)mdev->bitmap->bm_number_of_pages); | ||
258 | return; | ||
259 | } | ||
260 | page = mdev->bitmap->bm_pages[page_nr]; | ||
261 | set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); | ||
262 | } | ||
263 | |||
249 | static int bm_test_page_unchanged(struct page *page) | 264 | static int bm_test_page_unchanged(struct page *page) |
250 | { | 265 | { |
251 | volatile const unsigned long *addr = &page_private(page); | 266 | volatile const unsigned long *addr = &page_private(page); |
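Together with drbd_bm_write_hinted() (added later in this file) and the BM_AIO_WRITE_HINTED filter in bm_rw(), this gives activity-log transactions a way to flush only the few bitmap pages they touched. A condensed sketch of the intended flow; page_nr_of_extent() and the surrounding bookkeeping are placeholders, not functions from the patch:

/* Inside an activity-log transaction (sketch):
 *  1. note which bitmap pages the updated extents live in,
 *  2. mark them with the BM_PAGE_HINT_WRITEOUT flag,
 *  3. flush only those pages afterwards. */
for (i = 0; i < n_updated; i++)
	drbd_bm_mark_for_writeout(mdev, page_nr_of_extent(extents[i]));

err = drbd_bm_write_hinted(mdev);
/* bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0):
 * only pages whose hint bit test-and-clears are submitted, and
 * unchanged pages are still skipped. */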
@@ -373,14 +388,16 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | |||
373 | return old_pages; | 388 | return old_pages; |
374 | 389 | ||
375 | /* Trying kmalloc first, falling back to vmalloc. | 390 | /* Trying kmalloc first, falling back to vmalloc. |
376 | * GFP_KERNEL is ok, as this is done when a lower level disk is | 391 | * GFP_NOIO, as this is called while drbd IO is "suspended", |
377 | * "attached" to the drbd. Context is receiver thread or cqueue | 392 | * and during resize or attach on diskless Primary, |
378 | * thread. As we have no disk yet, we are not in the IO path, | 393 | * we must not block on IO to ourselves. |
379 | * not even the IO path of the peer. */ | 394 | * Context is receiver thread or dmsetup. */ |
380 | bytes = sizeof(struct page *)*want; | 395 | bytes = sizeof(struct page *)*want; |
381 | new_pages = kzalloc(bytes, GFP_KERNEL); | 396 | new_pages = kzalloc(bytes, GFP_NOIO); |
382 | if (!new_pages) { | 397 | if (!new_pages) { |
383 | new_pages = vzalloc(bytes); | 398 | new_pages = __vmalloc(bytes, |
399 | GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, | ||
400 | PAGE_KERNEL); | ||
384 | if (!new_pages) | 401 | if (!new_pages) |
385 | return NULL; | 402 | return NULL; |
386 | vmalloced = 1; | 403 | vmalloced = 1; |
@@ -390,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) | |||
390 | for (i = 0; i < have; i++) | 407 | for (i = 0; i < have; i++) |
391 | new_pages[i] = old_pages[i]; | 408 | new_pages[i] = old_pages[i]; |
392 | for (; i < want; i++) { | 409 | for (; i < want; i++) { |
393 | page = alloc_page(GFP_HIGHUSER); | 410 | page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); |
394 | if (!page) { | 411 | if (!page) { |
395 | bm_free_pages(new_pages + have, i - have); | 412 | bm_free_pages(new_pages + have, i - have); |
396 | bm_vk_free(new_pages, vmalloced); | 413 | bm_vk_free(new_pages, vmalloced); |
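The page-pointer array is now allocated with GFP_NOIO because a resize, or an attach on a diskless Primary, must not recurse into DRBD's own I/O path; and since vzalloc() always implies GFP_KERNEL, the fallback is spelled out via __vmalloc(). The pattern in isolation (sketch):

/* Contiguous allocation first, vmalloc fallback second, neither allowed
 * to trigger new I/O. In this kernel __vmalloc() takes (size, gfp, prot). */
bytes = sizeof(struct page *) * want;
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages)
	new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
			      PAGE_KERNEL);
if (!new_pages)
	return NULL;	/* both attempts failed */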
@@ -439,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev) | |||
439 | 456 | ||
440 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) | 457 | sector_t drbd_bm_capacity(struct drbd_conf *mdev) |
441 | { | 458 | { |
442 | ERR_IF(!mdev->bitmap) return 0; | 459 | if (!expect(mdev->bitmap)) |
460 | return 0; | ||
443 | return mdev->bitmap->bm_dev_capacity; | 461 | return mdev->bitmap->bm_dev_capacity; |
444 | } | 462 | } |
445 | 463 | ||
@@ -447,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev) | |||
447 | */ | 465 | */ |
448 | void drbd_bm_cleanup(struct drbd_conf *mdev) | 466 | void drbd_bm_cleanup(struct drbd_conf *mdev) |
449 | { | 467 | { |
450 | ERR_IF (!mdev->bitmap) return; | 468 | if (!expect(mdev->bitmap)) |
469 | return; | ||
451 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); | 470 | bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); |
452 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); | 471 | bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); |
453 | kfree(mdev->bitmap); | 472 | kfree(mdev->bitmap); |
@@ -610,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) | |||
610 | int err = 0, growing; | 629 | int err = 0, growing; |
611 | int opages_vmalloced; | 630 | int opages_vmalloced; |
612 | 631 | ||
613 | ERR_IF(!b) return -ENOMEM; | 632 | if (!expect(b)) |
633 | return -ENOMEM; | ||
614 | 634 | ||
615 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); | 635 | drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); |
616 | 636 | ||
@@ -732,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) | |||
732 | unsigned long s; | 752 | unsigned long s; |
733 | unsigned long flags; | 753 | unsigned long flags; |
734 | 754 | ||
735 | ERR_IF(!b) return 0; | 755 | if (!expect(b)) |
736 | ERR_IF(!b->bm_pages) return 0; | 756 | return 0; |
757 | if (!expect(b->bm_pages)) | ||
758 | return 0; | ||
737 | 759 | ||
738 | spin_lock_irqsave(&b->bm_lock, flags); | 760 | spin_lock_irqsave(&b->bm_lock, flags); |
739 | s = b->bm_set; | 761 | s = b->bm_set; |
@@ -756,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) | |||
756 | size_t drbd_bm_words(struct drbd_conf *mdev) | 778 | size_t drbd_bm_words(struct drbd_conf *mdev) |
757 | { | 779 | { |
758 | struct drbd_bitmap *b = mdev->bitmap; | 780 | struct drbd_bitmap *b = mdev->bitmap; |
759 | ERR_IF(!b) return 0; | 781 | if (!expect(b)) |
760 | ERR_IF(!b->bm_pages) return 0; | 782 | return 0; |
783 | if (!expect(b->bm_pages)) | ||
784 | return 0; | ||
761 | 785 | ||
762 | return b->bm_words; | 786 | return b->bm_words; |
763 | } | 787 | } |
@@ -765,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev) | |||
765 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) | 789 | unsigned long drbd_bm_bits(struct drbd_conf *mdev) |
766 | { | 790 | { |
767 | struct drbd_bitmap *b = mdev->bitmap; | 791 | struct drbd_bitmap *b = mdev->bitmap; |
768 | ERR_IF(!b) return 0; | 792 | if (!expect(b)) |
793 | return 0; | ||
769 | 794 | ||
770 | return b->bm_bits; | 795 | return b->bm_bits; |
771 | } | 796 | } |
@@ -786,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
786 | 811 | ||
787 | end = offset + number; | 812 | end = offset + number; |
788 | 813 | ||
789 | ERR_IF(!b) return; | 814 | if (!expect(b)) |
790 | ERR_IF(!b->bm_pages) return; | 815 | return; |
816 | if (!expect(b->bm_pages)) | ||
817 | return; | ||
791 | if (number == 0) | 818 | if (number == 0) |
792 | return; | 819 | return; |
793 | WARN_ON(offset >= b->bm_words); | 820 | WARN_ON(offset >= b->bm_words); |
@@ -831,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
831 | 858 | ||
832 | end = offset + number; | 859 | end = offset + number; |
833 | 860 | ||
834 | ERR_IF(!b) return; | 861 | if (!expect(b)) |
835 | ERR_IF(!b->bm_pages) return; | 862 | return; |
863 | if (!expect(b->bm_pages)) | ||
864 | return; | ||
836 | 865 | ||
837 | spin_lock_irq(&b->bm_lock); | 866 | spin_lock_irq(&b->bm_lock); |
838 | if ((offset >= b->bm_words) || | 867 | if ((offset >= b->bm_words) || |
@@ -860,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, | |||
860 | void drbd_bm_set_all(struct drbd_conf *mdev) | 889 | void drbd_bm_set_all(struct drbd_conf *mdev) |
861 | { | 890 | { |
862 | struct drbd_bitmap *b = mdev->bitmap; | 891 | struct drbd_bitmap *b = mdev->bitmap; |
863 | ERR_IF(!b) return; | 892 | if (!expect(b)) |
864 | ERR_IF(!b->bm_pages) return; | 893 | return; |
894 | if (!expect(b->bm_pages)) | ||
895 | return; | ||
865 | 896 | ||
866 | spin_lock_irq(&b->bm_lock); | 897 | spin_lock_irq(&b->bm_lock); |
867 | bm_memset(b, 0, 0xff, b->bm_words); | 898 | bm_memset(b, 0, 0xff, b->bm_words); |
@@ -874,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev) | |||
874 | void drbd_bm_clear_all(struct drbd_conf *mdev) | 905 | void drbd_bm_clear_all(struct drbd_conf *mdev) |
875 | { | 906 | { |
876 | struct drbd_bitmap *b = mdev->bitmap; | 907 | struct drbd_bitmap *b = mdev->bitmap; |
877 | ERR_IF(!b) return; | 908 | if (!expect(b)) |
878 | ERR_IF(!b->bm_pages) return; | 909 | return; |
910 | if (!expect(b->bm_pages)) | ||
911 | return; | ||
879 | 912 | ||
880 | spin_lock_irq(&b->bm_lock); | 913 | spin_lock_irq(&b->bm_lock); |
881 | bm_memset(b, 0, 0, b->bm_words); | 914 | bm_memset(b, 0, 0, b->bm_words); |
@@ -889,7 +922,8 @@ struct bm_aio_ctx { | |||
889 | unsigned int done; | 922 | unsigned int done; |
890 | unsigned flags; | 923 | unsigned flags; |
891 | #define BM_AIO_COPY_PAGES 1 | 924 | #define BM_AIO_COPY_PAGES 1 |
892 | #define BM_WRITE_ALL_PAGES 2 | 925 | #define BM_AIO_WRITE_HINTED 2 |
926 | #define BM_WRITE_ALL_PAGES 4 | ||
893 | int error; | 927 | int error; |
894 | struct kref kref; | 928 | struct kref kref; |
895 | }; | 929 | }; |
@@ -977,17 +1011,11 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must | |||
977 | bm_set_page_unchanged(b->bm_pages[page_nr]); | 1011 | bm_set_page_unchanged(b->bm_pages[page_nr]); |
978 | 1012 | ||
979 | if (ctx->flags & BM_AIO_COPY_PAGES) { | 1013 | if (ctx->flags & BM_AIO_COPY_PAGES) { |
980 | void *src, *dest; | ||
981 | page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); | 1014 | page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); |
982 | dest = kmap_atomic(page); | 1015 | copy_highpage(page, b->bm_pages[page_nr]); |
983 | src = kmap_atomic(b->bm_pages[page_nr]); | ||
984 | memcpy(dest, src, PAGE_SIZE); | ||
985 | kunmap_atomic(src); | ||
986 | kunmap_atomic(dest); | ||
987 | bm_store_page_idx(page, page_nr); | 1016 | bm_store_page_idx(page, page_nr); |
988 | } else | 1017 | } else |
989 | page = b->bm_pages[page_nr]; | 1018 | page = b->bm_pages[page_nr]; |
990 | |||
991 | bio->bi_bdev = mdev->ldev->md_bdev; | 1019 | bio->bi_bdev = mdev->ldev->md_bdev; |
992 | bio->bi_sector = on_disk_sector; | 1020 | bio->bi_sector = on_disk_sector; |
993 | /* bio_add_page of a single page to an empty bio will always succeed, | 1021 | /* bio_add_page of a single page to an empty bio will always succeed, |
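copy_highpage(dst, src) from <linux/highmem.h> performs the same whole-page copy the removed kmap_atomic/memcpy/kunmap_atomic sequence did, including the highmem mapping, so the temporary src/dest pointers disappear. The generic (non-arch-specific) helper looks roughly like this:

/* Sketch of the generic copy_highpage() implementation: */
static inline void copy_highpage_sketch(struct page *to, struct page *from)
{
	char *vfrom = kmap_atomic(from);
	char *vto = kmap_atomic(to);
	copy_page(vto, vfrom);		/* full PAGE_SIZE copy */
	kunmap_atomic(vto);
	kunmap_atomic(vfrom);
}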
@@ -1060,6 +1088,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1060 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) | 1088 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) |
1061 | break; | 1089 | break; |
1062 | if (rw & WRITE) { | 1090 | if (rw & WRITE) { |
1091 | if ((flags & BM_AIO_WRITE_HINTED) && | ||
1092 | !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, | ||
1093 | &page_private(b->bm_pages[i]))) | ||
1094 | continue; | ||
1095 | |||
1063 | if (!(flags & BM_WRITE_ALL_PAGES) && | 1096 | if (!(flags & BM_WRITE_ALL_PAGES) && |
1064 | bm_test_page_unchanged(b->bm_pages[i])) { | 1097 | bm_test_page_unchanged(b->bm_pages[i])) { |
1065 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); | 1098 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); |
@@ -1088,13 +1121,15 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1088 | * "in_flight reached zero, all done" event. | 1121 | * "in_flight reached zero, all done" event. |
1089 | */ | 1122 | */ |
1090 | if (!atomic_dec_and_test(&ctx->in_flight)) | 1123 | if (!atomic_dec_and_test(&ctx->in_flight)) |
1091 | wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); | 1124 | wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); |
1092 | else | 1125 | else |
1093 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1126 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1094 | 1127 | ||
1095 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", | 1128 | /* summary for global bitmap IO */ |
1096 | rw == WRITE ? "WRITE" : "READ", | 1129 | if (flags == 0) |
1097 | count, jiffies - now); | 1130 | dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", |
1131 | rw == WRITE ? "WRITE" : "READ", | ||
1132 | count, jiffies - now); | ||
1098 | 1133 | ||
1099 | if (ctx->error) { | 1134 | if (ctx->error) { |
1100 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); | 1135 | dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); |
@@ -1103,7 +1138,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1103 | } | 1138 | } |
1104 | 1139 | ||
1105 | if (atomic_read(&ctx->in_flight)) | 1140 | if (atomic_read(&ctx->in_flight)) |
1106 | err = -EIO; /* Disk failed during IO... */ | 1141 | err = -EIO; /* Disk timeout/force-detach during IO... */ |
1107 | 1142 | ||
1108 | now = jiffies; | 1143 | now = jiffies; |
1109 | if (rw == WRITE) { | 1144 | if (rw == WRITE) { |
@@ -1115,8 +1150,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1115 | } | 1150 | } |
1116 | now = b->bm_set; | 1151 | now = b->bm_set; |
1117 | 1152 | ||
1118 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", | 1153 | if (flags == 0) |
1119 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | 1154 | dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", |
1155 | ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); | ||
1120 | 1156 | ||
1121 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); | 1157 | kref_put(&ctx->kref, &bm_aio_ctx_destroy); |
1122 | return err; | 1158 | return err; |
@@ -1179,9 +1215,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) | |||
1179 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); | 1215 | return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); |
1180 | } | 1216 | } |
1181 | 1217 | ||
1218 | /** | ||
1219 | * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. | ||
1220 | * @mdev: DRBD device. | ||
1221 | */ | ||
1222 | int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local) | ||
1223 | { | ||
1224 | return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); | ||
1225 | } | ||
1182 | 1226 | ||
1183 | /** | 1227 | /** |
1184 | * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap | 1228 | * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap |
1185 | * @mdev: DRBD device. | 1229 | * @mdev: DRBD device. |
1186 | * @idx: bitmap page index | 1230 | * @idx: bitmap page index |
1187 | * | 1231 | * |
@@ -1222,11 +1266,11 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc | |||
1222 | } | 1266 | } |
1223 | 1267 | ||
1224 | bm_page_io_async(ctx, idx, WRITE_SYNC); | 1268 | bm_page_io_async(ctx, idx, WRITE_SYNC); |
1225 | wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); | 1269 | wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); |
1226 | 1270 | ||
1227 | if (ctx->error) | 1271 | if (ctx->error) |
1228 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 1272 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
1229 | /* that should force detach, so the in memory bitmap will be | 1273 | /* that causes us to detach, so the in memory bitmap will be |
1230 | * gone in a moment as well. */ | 1274 | * gone in a moment as well. */ |
1231 | 1275 | ||
1232 | mdev->bm_writ_cnt++; | 1276 | mdev->bm_writ_cnt++; |
@@ -1289,8 +1333,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, | |||
1289 | struct drbd_bitmap *b = mdev->bitmap; | 1333 | struct drbd_bitmap *b = mdev->bitmap; |
1290 | unsigned long i = DRBD_END_OF_BITMAP; | 1334 | unsigned long i = DRBD_END_OF_BITMAP; |
1291 | 1335 | ||
1292 | ERR_IF(!b) return i; | 1336 | if (!expect(b)) |
1293 | ERR_IF(!b->bm_pages) return i; | 1337 | return i; |
1338 | if (!expect(b->bm_pages)) | ||
1339 | return i; | ||
1294 | 1340 | ||
1295 | spin_lock_irq(&b->bm_lock); | 1341 | spin_lock_irq(&b->bm_lock); |
1296 | if (BM_DONT_TEST & b->bm_flags) | 1342 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1391,8 +1437,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, | |||
1391 | struct drbd_bitmap *b = mdev->bitmap; | 1437 | struct drbd_bitmap *b = mdev->bitmap; |
1392 | int c = 0; | 1438 | int c = 0; |
1393 | 1439 | ||
1394 | ERR_IF(!b) return 1; | 1440 | if (!expect(b)) |
1395 | ERR_IF(!b->bm_pages) return 0; | 1441 | return 1; |
1442 | if (!expect(b->bm_pages)) | ||
1443 | return 0; | ||
1396 | 1444 | ||
1397 | spin_lock_irqsave(&b->bm_lock, flags); | 1445 | spin_lock_irqsave(&b->bm_lock, flags); |
1398 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) | 1446 | if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) |
@@ -1423,13 +1471,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, | |||
1423 | { | 1471 | { |
1424 | int i; | 1472 | int i; |
1425 | int bits; | 1473 | int bits; |
1474 | int changed = 0; | ||
1426 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); | 1475 | unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); |
1427 | for (i = first_word; i < last_word; i++) { | 1476 | for (i = first_word; i < last_word; i++) { |
1428 | bits = hweight_long(paddr[i]); | 1477 | bits = hweight_long(paddr[i]); |
1429 | paddr[i] = ~0UL; | 1478 | paddr[i] = ~0UL; |
1430 | b->bm_set += BITS_PER_LONG - bits; | 1479 | changed += BITS_PER_LONG - bits; |
1431 | } | 1480 | } |
1432 | kunmap_atomic(paddr); | 1481 | kunmap_atomic(paddr); |
1482 | if (changed) { | ||
1483 | /* We only need lazy writeout, the information is still in the | ||
1484 | * remote bitmap as well, and is reconstructed during the next | ||
1485 | * bitmap exchange, if lost locally due to a crash. */ | ||
1486 | bm_set_page_lazy_writeout(b->bm_pages[page_nr]); | ||
1487 | b->bm_set += changed; | ||
1488 | } | ||
1433 | } | 1489 | } |
1434 | 1490 | ||
1435 | /* Same thing as drbd_bm_set_bits, | 1491 | /* Same thing as drbd_bm_set_bits, |
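bm_set_full_words_within_one_page() now accumulates the delta in `changed` and only touches b->bm_set (and marks the page for lazy writeout) when something actually changed. The per-word arithmetic works because forcing a word to ~0UL turns exactly BITS_PER_LONG - hweight_long(old) zero bits into ones:

/* Worked example, assuming 64-bit longs:
 *   old word   = 0x00000000ffffffffUL  ->  hweight_long(old) = 32
 *   new word   = ~0UL                  ->  64 bits set
 *   newly set  = BITS_PER_LONG - 32    =  32
 * Summing this per word gives the number of bits that flipped 0 -> 1,
 * which is what finally gets added to b->bm_set. */
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
changed += BITS_PER_LONG - bits;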
@@ -1524,8 +1580,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) | |||
1524 | unsigned long *p_addr; | 1580 | unsigned long *p_addr; |
1525 | int i; | 1581 | int i; |
1526 | 1582 | ||
1527 | ERR_IF(!b) return 0; | 1583 | if (!expect(b)) |
1528 | ERR_IF(!b->bm_pages) return 0; | 1584 | return 0; |
1585 | if (!expect(b->bm_pages)) | ||
1586 | return 0; | ||
1529 | 1587 | ||
1530 | spin_lock_irqsave(&b->bm_lock, flags); | 1588 | spin_lock_irqsave(&b->bm_lock, flags); |
1531 | if (BM_DONT_TEST & b->bm_flags) | 1589 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1559,8 +1617,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1559 | * robust in case we screwed up elsewhere, in that case pretend there | 1617 | * robust in case we screwed up elsewhere, in that case pretend there |
1560 | * was one dirty bit in the requested area, so we won't try to do a | 1618 | * was one dirty bit in the requested area, so we won't try to do a |
1561 | * local read there (no bitmap probably implies no disk) */ | 1619 | * local read there (no bitmap probably implies no disk) */ |
1562 | ERR_IF(!b) return 1; | 1620 | if (!expect(b)) |
1563 | ERR_IF(!b->bm_pages) return 1; | 1621 | return 1; |
1622 | if (!expect(b->bm_pages)) | ||
1623 | return 1; | ||
1564 | 1624 | ||
1565 | spin_lock_irqsave(&b->bm_lock, flags); | 1625 | spin_lock_irqsave(&b->bm_lock, flags); |
1566 | if (BM_DONT_TEST & b->bm_flags) | 1626 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1573,11 +1633,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi | |||
1573 | bm_unmap(p_addr); | 1633 | bm_unmap(p_addr); |
1574 | p_addr = bm_map_pidx(b, idx); | 1634 | p_addr = bm_map_pidx(b, idx); |
1575 | } | 1635 | } |
1576 | ERR_IF (bitnr >= b->bm_bits) { | 1636 | if (expect(bitnr < b->bm_bits)) |
1577 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1578 | } else { | ||
1579 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); | 1637 | c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); |
1580 | } | 1638 | else |
1639 | dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); | ||
1581 | } | 1640 | } |
1582 | if (p_addr) | 1641 | if (p_addr) |
1583 | bm_unmap(p_addr); | 1642 | bm_unmap(p_addr); |
@@ -1607,8 +1666,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1607 | unsigned long flags; | 1666 | unsigned long flags; |
1608 | unsigned long *p_addr, *bm; | 1667 | unsigned long *p_addr, *bm; |
1609 | 1668 | ||
1610 | ERR_IF(!b) return 0; | 1669 | if (!expect(b)) |
1611 | ERR_IF(!b->bm_pages) return 0; | 1670 | return 0; |
1671 | if (!expect(b->bm_pages)) | ||
1672 | return 0; | ||
1612 | 1673 | ||
1613 | spin_lock_irqsave(&b->bm_lock, flags); | 1674 | spin_lock_irqsave(&b->bm_lock, flags); |
1614 | if (BM_DONT_TEST & b->bm_flags) | 1675 | if (BM_DONT_TEST & b->bm_flags) |
@@ -1630,47 +1691,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) | |||
1630 | spin_unlock_irqrestore(&b->bm_lock, flags); | 1691 | spin_unlock_irqrestore(&b->bm_lock, flags); |
1631 | return count; | 1692 | return count; |
1632 | } | 1693 | } |
1633 | |||
1634 | /* Set all bits covered by the AL-extent al_enr. | ||
1635 | * Returns number of bits changed. */ | ||
1636 | unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) | ||
1637 | { | ||
1638 | struct drbd_bitmap *b = mdev->bitmap; | ||
1639 | unsigned long *p_addr, *bm; | ||
1640 | unsigned long weight; | ||
1641 | unsigned long s, e; | ||
1642 | int count, i, do_now; | ||
1643 | ERR_IF(!b) return 0; | ||
1644 | ERR_IF(!b->bm_pages) return 0; | ||
1645 | |||
1646 | spin_lock_irq(&b->bm_lock); | ||
1647 | if (BM_DONT_SET & b->bm_flags) | ||
1648 | bm_print_lock_info(mdev); | ||
1649 | weight = b->bm_set; | ||
1650 | |||
1651 | s = al_enr * BM_WORDS_PER_AL_EXT; | ||
1652 | e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); | ||
1653 | /* assert that s and e are on the same page */ | ||
1654 | D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) | ||
1655 | == s >> (PAGE_SHIFT - LN2_BPL + 3)); | ||
1656 | count = 0; | ||
1657 | if (s < b->bm_words) { | ||
1658 | i = do_now = e-s; | ||
1659 | p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); | ||
1660 | bm = p_addr + MLPP(s); | ||
1661 | while (i--) { | ||
1662 | count += hweight_long(*bm); | ||
1663 | *bm = -1UL; | ||
1664 | bm++; | ||
1665 | } | ||
1666 | bm_unmap(p_addr); | ||
1667 | b->bm_set += do_now*BITS_PER_LONG - count; | ||
1668 | if (e == b->bm_words) | ||
1669 | b->bm_set -= bm_clear_surplus(b); | ||
1670 | } else { | ||
1671 | dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s); | ||
1672 | } | ||
1673 | weight = b->bm_set - weight; | ||
1674 | spin_unlock_irq(&b->bm_lock); | ||
1675 | return weight; | ||
1676 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b953cc7c9c00..6b51afa1aae1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -39,9 +39,13 @@ | |||
39 | #include <linux/major.h> | 39 | #include <linux/major.h> |
40 | #include <linux/blkdev.h> | 40 | #include <linux/blkdev.h> |
41 | #include <linux/genhd.h> | 41 | #include <linux/genhd.h> |
42 | #include <linux/idr.h> | ||
42 | #include <net/tcp.h> | 43 | #include <net/tcp.h> |
43 | #include <linux/lru_cache.h> | 44 | #include <linux/lru_cache.h> |
44 | #include <linux/prefetch.h> | 45 | #include <linux/prefetch.h> |
46 | #include <linux/drbd_genl_api.h> | ||
47 | #include <linux/drbd.h> | ||
48 | #include "drbd_state.h" | ||
45 | 49 | ||
46 | #ifdef __CHECKER__ | 50 | #ifdef __CHECKER__ |
47 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | 51 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) |
@@ -61,7 +65,6 @@ | |||
61 | extern unsigned int minor_count; | 65 | extern unsigned int minor_count; |
62 | extern bool disable_sendpage; | 66 | extern bool disable_sendpage; |
63 | extern bool allow_oos; | 67 | extern bool allow_oos; |
64 | extern unsigned int cn_idx; | ||
65 | 68 | ||
66 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 69 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
67 | extern int enable_faults; | 70 | extern int enable_faults; |
@@ -86,34 +89,44 @@ extern char usermode_helper[]; | |||
86 | */ | 89 | */ |
87 | #define DRBD_SIGKILL SIGHUP | 90 | #define DRBD_SIGKILL SIGHUP |
88 | 91 | ||
89 | /* All EEs on the free list should have ID_VACANT (== 0) | ||
90 | * freshly allocated EEs get !ID_VACANT (== 1) | ||
91 | * so if it says "cannot dereference null pointer at address 0x00000001", | ||
92 | * it is most likely one of these :( */ | ||
93 | |||
94 | #define ID_IN_SYNC (4711ULL) | 92 | #define ID_IN_SYNC (4711ULL) |
95 | #define ID_OUT_OF_SYNC (4712ULL) | 93 | #define ID_OUT_OF_SYNC (4712ULL) |
96 | |||
97 | #define ID_SYNCER (-1ULL) | 94 | #define ID_SYNCER (-1ULL) |
98 | #define ID_VACANT 0 | 95 | |
99 | #define is_syncer_block_id(id) ((id) == ID_SYNCER) | ||
100 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) | 96 | #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) |
101 | 97 | ||
102 | struct drbd_conf; | 98 | struct drbd_conf; |
99 | struct drbd_tconn; | ||
103 | 100 | ||
104 | 101 | ||
105 | /* to shorten dev_warn(DEV, "msg"); and relatives statements */ | 102 | /* to shorten dev_warn(DEV, "msg"); and relatives statements */ |
106 | #define DEV (disk_to_dev(mdev->vdisk)) | 103 | #define DEV (disk_to_dev(mdev->vdisk)) |
107 | 104 | ||
105 | #define conn_printk(LEVEL, TCONN, FMT, ARGS...) \ | ||
106 | printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS) | ||
107 | #define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS) | ||
108 | #define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS) | ||
109 | #define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS) | ||
110 | #define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS) | ||
111 | #define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS) | ||
112 | #define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS) | ||
113 | #define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS) | ||
114 | |||
108 | #define D_ASSERT(exp) if (!(exp)) \ | 115 | #define D_ASSERT(exp) if (!(exp)) \ |
109 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) | 116 | dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) |
110 | 117 | ||
111 | #define ERR_IF(exp) if (({ \ | 118 | /** |
112 | int _b = (exp) != 0; \ | 119 | * expect - Make an assertion |
113 | if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \ | 120 | * |
114 | __func__, #exp, __FILE__, __LINE__); \ | 121 | * Unlike the assert macro, this macro returns a boolean result. |
115 | _b; \ | 122 | */ |
116 | })) | 123 | #define expect(exp) ({ \ |
124 | bool _bool = (exp); \ | ||
125 | if (!_bool) \ | ||
126 | dev_err(DEV, "ASSERTION %s FAILED in %s\n", \ | ||
127 | #exp, __func__); \ | ||
128 | _bool; \ | ||
129 | }) | ||
117 | 130 | ||
118 | /* Defines to control fault insertion */ | 131 | /* Defines to control fault insertion */ |
119 | enum { | 132 | enum { |
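The new expect() macro replaces ERR_IF(): rather than a statement-like macro that hides the error branch, expect() logs the failed assertion and returns the boolean, so every caller writes its handling explicitly. The conversions seen throughout this series follow one pattern:

/* Old style: the macro owns the control flow */
ERR_IF(sector >= nr_sectors) return;
ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);

/* New style: the condition is stated positively, the handling is visible */
if (!expect(sector < nr_sectors))
	return;
if (!expect(esector < nr_sectors))
	esector = nr_sectors - 1;	/* clamp instead of bailing out */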
@@ -150,15 +163,12 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { | |||
150 | /* usual integer division */ | 163 | /* usual integer division */ |
151 | #define div_floor(A, B) ((A)/(B)) | 164 | #define div_floor(A, B) ((A)/(B)) |
152 | 165 | ||
153 | /* drbd_meta-data.c (still in drbd_main.c) */ | ||
154 | /* 4th incarnation of the disk layout. */ | ||
155 | #define DRBD_MD_MAGIC (DRBD_MAGIC+4) | ||
156 | |||
157 | extern struct drbd_conf **minor_table; | ||
158 | extern struct ratelimit_state drbd_ratelimit_state; | 166 | extern struct ratelimit_state drbd_ratelimit_state; |
167 | extern struct idr minors; /* RCU, updates: genl_lock() */ | ||
168 | extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */ | ||
159 | 169 | ||
160 | /* on the wire */ | 170 | /* on the wire */ |
161 | enum drbd_packets { | 171 | enum drbd_packet { |
162 | /* receiver (data socket) */ | 172 | /* receiver (data socket) */ |
163 | P_DATA = 0x00, | 173 | P_DATA = 0x00, |
164 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ | 174 | P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ |
@@ -186,7 +196,7 @@ enum drbd_packets { | |||
186 | P_RECV_ACK = 0x15, /* Used in protocol B */ | 196 | P_RECV_ACK = 0x15, /* Used in protocol B */ |
187 | P_WRITE_ACK = 0x16, /* Used in protocol C */ | 197 | P_WRITE_ACK = 0x16, /* Used in protocol C */ |
188 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ | 198 | P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ |
189 | P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ | 199 | P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ |
190 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ | 200 | P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ |
191 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ | 201 | P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ |
192 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ | 202 | P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ |
@@ -207,77 +217,23 @@ enum drbd_packets { | |||
207 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ | 217 | P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ |
208 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ | 218 | P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ |
209 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ | 219 | P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ |
220 | P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ | ||
221 | P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ | ||
222 | P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ | ||
223 | P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ | ||
210 | 224 | ||
211 | P_MAX_CMD = 0x2A, | ||
212 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ | 225 | P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ |
213 | P_MAX_OPT_CMD = 0x101, | 226 | P_MAX_OPT_CMD = 0x101, |
214 | 227 | ||
215 | /* special command ids for handshake */ | 228 | /* special command ids for handshake */ |
216 | 229 | ||
217 | P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ | 230 | P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ |
218 | P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ | 231 | P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ |
219 | 232 | ||
220 | P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ | 233 | P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ |
221 | }; | 234 | }; |
222 | 235 | ||
223 | static inline const char *cmdname(enum drbd_packets cmd) | 236 | extern const char *cmdname(enum drbd_packet cmd); |
224 | { | ||
225 | /* THINK may need to become several global tables | ||
226 | * when we want to support more than | ||
227 | * one PRO_VERSION */ | ||
228 | static const char *cmdnames[] = { | ||
229 | [P_DATA] = "Data", | ||
230 | [P_DATA_REPLY] = "DataReply", | ||
231 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
232 | [P_BARRIER] = "Barrier", | ||
233 | [P_BITMAP] = "ReportBitMap", | ||
234 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
235 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
236 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
237 | [P_DATA_REQUEST] = "DataRequest", | ||
238 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
239 | [P_SYNC_PARAM] = "SyncParam", | ||
240 | [P_SYNC_PARAM89] = "SyncParam89", | ||
241 | [P_PROTOCOL] = "ReportProtocol", | ||
242 | [P_UUIDS] = "ReportUUIDs", | ||
243 | [P_SIZES] = "ReportSizes", | ||
244 | [P_STATE] = "ReportState", | ||
245 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
246 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
247 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
248 | [P_PING] = "Ping", | ||
249 | [P_PING_ACK] = "PingAck", | ||
250 | [P_RECV_ACK] = "RecvAck", | ||
251 | [P_WRITE_ACK] = "WriteAck", | ||
252 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
253 | [P_DISCARD_ACK] = "DiscardAck", | ||
254 | [P_NEG_ACK] = "NegAck", | ||
255 | [P_NEG_DREPLY] = "NegDReply", | ||
256 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
257 | [P_BARRIER_ACK] = "BarrierAck", | ||
258 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
259 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
260 | [P_OV_REQUEST] = "OVRequest", | ||
261 | [P_OV_REPLY] = "OVReply", | ||
262 | [P_OV_RESULT] = "OVResult", | ||
263 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
264 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
265 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
266 | [P_DELAY_PROBE] = "DelayProbe", | ||
267 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
268 | [P_MAX_CMD] = NULL, | ||
269 | }; | ||
270 | |||
271 | if (cmd == P_HAND_SHAKE_M) | ||
272 | return "HandShakeM"; | ||
273 | if (cmd == P_HAND_SHAKE_S) | ||
274 | return "HandShakeS"; | ||
275 | if (cmd == P_HAND_SHAKE) | ||
276 | return "HandShake"; | ||
277 | if (cmd >= P_MAX_CMD) | ||
278 | return "Unknown"; | ||
279 | return cmdnames[cmd]; | ||
280 | } | ||
281 | 237 | ||
282 | /* for sending/receiving the bitmap, | 238 | /* for sending/receiving the bitmap, |
283 | * possibly in some encoding scheme */ | 239 | * possibly in some encoding scheme */ |
@@ -337,37 +293,24 @@ struct p_header80 { | |||
337 | u32 magic; | 293 | u32 magic; |
338 | u16 command; | 294 | u16 command; |
339 | u16 length; /* bytes of data after this header */ | 295 | u16 length; /* bytes of data after this header */ |
340 | u8 payload[0]; | ||
341 | } __packed; | 296 | } __packed; |
342 | 297 | ||
343 | /* Header for big packets, Used for data packets exceeding 64kB */ | 298 | /* Header for big packets, Used for data packets exceeding 64kB */ |
344 | struct p_header95 { | 299 | struct p_header95 { |
345 | u16 magic; /* use DRBD_MAGIC_BIG here */ | 300 | u16 magic; /* use DRBD_MAGIC_BIG here */ |
346 | u16 command; | 301 | u16 command; |
347 | u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ | 302 | u32 length; |
348 | u8 payload[0]; | ||
349 | } __packed; | 303 | } __packed; |
350 | 304 | ||
351 | union p_header { | 305 | struct p_header100 { |
352 | struct p_header80 h80; | 306 | u32 magic; |
353 | struct p_header95 h95; | 307 | u16 volume; |
354 | }; | 308 | u16 command; |
355 | 309 | u32 length; | |
356 | /* | 310 | u32 pad; |
357 | * short commands, packets without payload, plain p_header: | 311 | } __packed; |
358 | * P_PING | ||
359 | * P_PING_ACK | ||
360 | * P_BECOME_SYNC_TARGET | ||
361 | * P_BECOME_SYNC_SOURCE | ||
362 | * P_UNPLUG_REMOTE | ||
363 | */ | ||
364 | 312 | ||
365 | /* | 313 | extern unsigned int drbd_header_size(struct drbd_tconn *tconn); |
366 | * commands with out-of-struct payload: | ||
367 | * P_BITMAP (no additional fields) | ||
368 | * P_DATA, P_DATA_REPLY (see p_data) | ||
369 | * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) | ||
370 | */ | ||
371 | 314 | ||
372 | /* these defines must not be changed without changing the protocol version */ | 315 | /* these defines must not be changed without changing the protocol version */ |
373 | #define DP_HARDBARRIER 1 /* deprecated */ | 316 | #define DP_HARDBARRIER 1 /* deprecated */ |
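With the per-packet sub-headers removed from the payload structs above, the on-wire header becomes one of three fixed-size packed prefixes, and the size actually used is returned by drbd_header_size(tconn), declared earlier in this hunk. Field by field the sizes come out as 8, 8 and 16 bytes; the BUILD_BUG_ON lines below are illustrative checks, not part of the patch:

/*   p_header80 :  u32 + u16 + u16              =  8 bytes
 *   p_header95 :  u16 + u16 + u32              =  8 bytes
 *   p_header100:  u32 + u16 + u16 + u32 + u32  = 16 bytes  */
BUILD_BUG_ON(sizeof(struct p_header80)  != 8);
BUILD_BUG_ON(sizeof(struct p_header95)  != 8);
BUILD_BUG_ON(sizeof(struct p_header100) != 16);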
@@ -377,9 +320,10 @@ union p_header { | |||
377 | #define DP_FUA 16 /* equals REQ_FUA */ | 320 | #define DP_FUA 16 /* equals REQ_FUA */ |
378 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ | 321 | #define DP_FLUSH 32 /* equals REQ_FLUSH */ |
379 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ | 322 | #define DP_DISCARD 64 /* equals REQ_DISCARD */ |
323 | #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ | ||
324 | #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ | ||
380 | 325 | ||
381 | struct p_data { | 326 | struct p_data { |
382 | union p_header head; | ||
383 | u64 sector; /* 64 bits sector number */ | 327 | u64 sector; /* 64 bits sector number */ |
384 | u64 block_id; /* to identify the request in protocol B&C */ | 328 | u64 block_id; /* to identify the request in protocol B&C */ |
385 | u32 seq_num; | 329 | u32 seq_num; |
@@ -390,21 +334,18 @@ struct p_data { | |||
390 | * commands which share a struct: | 334 | * commands which share a struct: |
391 | * p_block_ack: | 335 | * p_block_ack: |
392 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), | 336 | * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), |
393 | * P_DISCARD_ACK (proto C, two-primaries conflict detection) | 337 | * P_SUPERSEDED (proto C, two-primaries conflict detection) |
394 | * p_block_req: | 338 | * p_block_req: |
395 | * P_DATA_REQUEST, P_RS_DATA_REQUEST | 339 | * P_DATA_REQUEST, P_RS_DATA_REQUEST |
396 | */ | 340 | */ |
397 | struct p_block_ack { | 341 | struct p_block_ack { |
398 | struct p_header80 head; | ||
399 | u64 sector; | 342 | u64 sector; |
400 | u64 block_id; | 343 | u64 block_id; |
401 | u32 blksize; | 344 | u32 blksize; |
402 | u32 seq_num; | 345 | u32 seq_num; |
403 | } __packed; | 346 | } __packed; |
404 | 347 | ||
405 | |||
406 | struct p_block_req { | 348 | struct p_block_req { |
407 | struct p_header80 head; | ||
408 | u64 sector; | 349 | u64 sector; |
409 | u64 block_id; | 350 | u64 block_id; |
410 | u32 blksize; | 351 | u32 blksize; |
@@ -413,59 +354,52 @@ struct p_block_req { | |||
413 | 354 | ||
414 | /* | 355 | /* |
415 | * commands with their own struct for additional fields: | 356 | * commands with their own struct for additional fields: |
416 | * P_HAND_SHAKE | 357 | * P_CONNECTION_FEATURES |
417 | * P_BARRIER | 358 | * P_BARRIER |
418 | * P_BARRIER_ACK | 359 | * P_BARRIER_ACK |
419 | * P_SYNC_PARAM | 360 | * P_SYNC_PARAM |
420 | * ReportParams | 361 | * ReportParams |
421 | */ | 362 | */ |
422 | 363 | ||
423 | struct p_handshake { | 364 | struct p_connection_features { |
424 | struct p_header80 head; /* 8 bytes */ | ||
425 | u32 protocol_min; | 365 | u32 protocol_min; |
426 | u32 feature_flags; | 366 | u32 feature_flags; |
427 | u32 protocol_max; | 367 | u32 protocol_max; |
428 | 368 | ||
429 | /* should be more than enough for future enhancements | 369 | /* should be more than enough for future enhancements |
430 | * for now, feature_flags and the reserverd array shall be zero. | 370 | * for now, feature_flags and the reserved array shall be zero. |
431 | */ | 371 | */ |
432 | 372 | ||
433 | u32 _pad; | 373 | u32 _pad; |
434 | u64 reserverd[7]; | 374 | u64 reserved[7]; |
435 | } __packed; | 375 | } __packed; |
436 | /* 80 bytes, FIXED for the next century */ | ||
437 | 376 | ||
438 | struct p_barrier { | 377 | struct p_barrier { |
439 | struct p_header80 head; | ||
440 | u32 barrier; /* barrier number _handle_ only */ | 378 | u32 barrier; /* barrier number _handle_ only */ |
441 | u32 pad; /* to multiple of 8 Byte */ | 379 | u32 pad; /* to multiple of 8 Byte */ |
442 | } __packed; | 380 | } __packed; |
443 | 381 | ||
444 | struct p_barrier_ack { | 382 | struct p_barrier_ack { |
445 | struct p_header80 head; | ||
446 | u32 barrier; | 383 | u32 barrier; |
447 | u32 set_size; | 384 | u32 set_size; |
448 | } __packed; | 385 | } __packed; |
449 | 386 | ||
450 | struct p_rs_param { | 387 | struct p_rs_param { |
451 | struct p_header80 head; | 388 | u32 resync_rate; |
452 | u32 rate; | ||
453 | 389 | ||
454 | /* Since protocol version 88 and higher. */ | 390 | /* Since protocol version 88 and higher. */ |
455 | char verify_alg[0]; | 391 | char verify_alg[0]; |
456 | } __packed; | 392 | } __packed; |
457 | 393 | ||
458 | struct p_rs_param_89 { | 394 | struct p_rs_param_89 { |
459 | struct p_header80 head; | 395 | u32 resync_rate; |
460 | u32 rate; | ||
461 | /* protocol version 89: */ | 396 | /* protocol version 89: */ |
462 | char verify_alg[SHARED_SECRET_MAX]; | 397 | char verify_alg[SHARED_SECRET_MAX]; |
463 | char csums_alg[SHARED_SECRET_MAX]; | 398 | char csums_alg[SHARED_SECRET_MAX]; |
464 | } __packed; | 399 | } __packed; |
465 | 400 | ||
466 | struct p_rs_param_95 { | 401 | struct p_rs_param_95 { |
467 | struct p_header80 head; | 402 | u32 resync_rate; |
468 | u32 rate; | ||
469 | char verify_alg[SHARED_SECRET_MAX]; | 403 | char verify_alg[SHARED_SECRET_MAX]; |
470 | char csums_alg[SHARED_SECRET_MAX]; | 404 | char csums_alg[SHARED_SECRET_MAX]; |
471 | u32 c_plan_ahead; | 405 | u32 c_plan_ahead; |
@@ -475,12 +409,11 @@ struct p_rs_param_95 { | |||
475 | } __packed; | 409 | } __packed; |
476 | 410 | ||
477 | enum drbd_conn_flags { | 411 | enum drbd_conn_flags { |
478 | CF_WANT_LOSE = 1, | 412 | CF_DISCARD_MY_DATA = 1, |
479 | CF_DRY_RUN = 2, | 413 | CF_DRY_RUN = 2, |
480 | }; | 414 | }; |
481 | 415 | ||
482 | struct p_protocol { | 416 | struct p_protocol { |
483 | struct p_header80 head; | ||
484 | u32 protocol; | 417 | u32 protocol; |
485 | u32 after_sb_0p; | 418 | u32 after_sb_0p; |
486 | u32 after_sb_1p; | 419 | u32 after_sb_1p; |
@@ -494,17 +427,14 @@ struct p_protocol { | |||
494 | } __packed; | 427 | } __packed; |
495 | 428 | ||
496 | struct p_uuids { | 429 | struct p_uuids { |
497 | struct p_header80 head; | ||
498 | u64 uuid[UI_EXTENDED_SIZE]; | 430 | u64 uuid[UI_EXTENDED_SIZE]; |
499 | } __packed; | 431 | } __packed; |
500 | 432 | ||
501 | struct p_rs_uuid { | 433 | struct p_rs_uuid { |
502 | struct p_header80 head; | ||
503 | u64 uuid; | 434 | u64 uuid; |
504 | } __packed; | 435 | } __packed; |
505 | 436 | ||
506 | struct p_sizes { | 437 | struct p_sizes { |
507 | struct p_header80 head; | ||
508 | u64 d_size; /* size of disk */ | 438 | u64 d_size; /* size of disk */ |
509 | u64 u_size; /* user requested size */ | 439 | u64 u_size; /* user requested size */ |
510 | u64 c_size; /* current exported size */ | 440 | u64 c_size; /* current exported size */ |
@@ -514,18 +444,15 @@ struct p_sizes { | |||
514 | } __packed; | 444 | } __packed; |
515 | 445 | ||
516 | struct p_state { | 446 | struct p_state { |
517 | struct p_header80 head; | ||
518 | u32 state; | 447 | u32 state; |
519 | } __packed; | 448 | } __packed; |
520 | 449 | ||
521 | struct p_req_state { | 450 | struct p_req_state { |
522 | struct p_header80 head; | ||
523 | u32 mask; | 451 | u32 mask; |
524 | u32 val; | 452 | u32 val; |
525 | } __packed; | 453 | } __packed; |
526 | 454 | ||
527 | struct p_req_state_reply { | 455 | struct p_req_state_reply { |
528 | struct p_header80 head; | ||
529 | u32 retcode; | 456 | u32 retcode; |
530 | } __packed; | 457 | } __packed; |
531 | 458 | ||
@@ -539,15 +466,7 @@ struct p_drbd06_param { | |||
539 | u32 bit_map_gen[5]; | 466 | u32 bit_map_gen[5]; |
540 | } __packed; | 467 | } __packed; |
541 | 468 | ||
542 | struct p_discard { | ||
543 | struct p_header80 head; | ||
544 | u64 block_id; | ||
545 | u32 seq_num; | ||
546 | u32 pad; | ||
547 | } __packed; | ||
548 | |||
549 | struct p_block_desc { | 469 | struct p_block_desc { |
550 | struct p_header80 head; | ||
551 | u64 sector; | 470 | u64 sector; |
552 | u32 blksize; | 471 | u32 blksize; |
553 | u32 pad; /* to multiple of 8 Byte */ | 472 | u32 pad; /* to multiple of 8 Byte */ |
@@ -563,7 +482,6 @@ enum drbd_bitmap_code { | |||
563 | }; | 482 | }; |
564 | 483 | ||
565 | struct p_compressed_bm { | 484 | struct p_compressed_bm { |
566 | struct p_header80 head; | ||
567 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code | 485 | /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code |
568 | * (encoding & 0x80): polarity (set/unset) of first runlength | 486 | * (encoding & 0x80): polarity (set/unset) of first runlength |
569 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits | 487 | * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits |
@@ -575,90 +493,22 @@ struct p_compressed_bm { | |||
575 | } __packed; | 493 | } __packed; |
576 | 494 | ||
577 | struct p_delay_probe93 { | 495 | struct p_delay_probe93 { |
578 | struct p_header80 head; | ||
579 | u32 seq_num; /* sequence number to match the two probe packets */ | 496 | u32 seq_num; /* sequence number to match the two probe packets */ |
580 | u32 offset; /* usecs the probe got sent after the reference time point */ | 497 | u32 offset; /* usecs the probe got sent after the reference time point */ |
581 | } __packed; | 498 | } __packed; |
582 | 499 | ||
583 | /* DCBP: Drbd Compressed Bitmap Packet ... */ | 500 | /* |
584 | static inline enum drbd_bitmap_code | 501 | * Bitmap packets need to fit within a single page on the sender and receiver, |
585 | DCBP_get_code(struct p_compressed_bm *p) | 502 | * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). |
586 | { | ||
587 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
588 | } | ||
589 | |||
590 | static inline void | ||
591 | DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) | ||
592 | { | ||
593 | BUG_ON(code & ~0xf); | ||
594 | p->encoding = (p->encoding & ~0xf) | code; | ||
595 | } | ||
596 | |||
597 | static inline int | ||
598 | DCBP_get_start(struct p_compressed_bm *p) | ||
599 | { | ||
600 | return (p->encoding & 0x80) != 0; | ||
601 | } | ||
602 | |||
603 | static inline void | ||
604 | DCBP_set_start(struct p_compressed_bm *p, int set) | ||
605 | { | ||
606 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
607 | } | ||
608 | |||
609 | static inline int | ||
610 | DCBP_get_pad_bits(struct p_compressed_bm *p) | ||
611 | { | ||
612 | return (p->encoding >> 4) & 0x7; | ||
613 | } | ||
614 | |||
615 | static inline void | ||
616 | DCBP_set_pad_bits(struct p_compressed_bm *p, int n) | ||
617 | { | ||
618 | BUG_ON(n & ~0x7); | ||
619 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
620 | } | ||
621 | |||
622 | /* one bitmap packet, including the p_header, | ||
623 | * should fit within one _architecture independent_ page. | ||
624 | * so we need to use the fixed size 4KiB page size | ||
625 | * most architectures have used for a long time. | ||
626 | */ | 503 | */ |
627 | #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) | 504 | #define DRBD_SOCKET_BUFFER_SIZE 4096 |
628 | #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) | ||
629 | #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) | ||
630 | #if (PAGE_SIZE < 4096) | ||
631 | /* drbd_send_bitmap / receive_bitmap would break horribly */ | ||
632 | #error "PAGE_SIZE too small" | ||
633 | #endif | ||
634 | |||
635 | union p_polymorph { | ||
636 | union p_header header; | ||
637 | struct p_handshake handshake; | ||
638 | struct p_data data; | ||
639 | struct p_block_ack block_ack; | ||
640 | struct p_barrier barrier; | ||
641 | struct p_barrier_ack barrier_ack; | ||
642 | struct p_rs_param_89 rs_param_89; | ||
643 | struct p_rs_param_95 rs_param_95; | ||
644 | struct p_protocol protocol; | ||
645 | struct p_sizes sizes; | ||
646 | struct p_uuids uuids; | ||
647 | struct p_state state; | ||
648 | struct p_req_state req_state; | ||
649 | struct p_req_state_reply req_state_reply; | ||
650 | struct p_block_req block_req; | ||
651 | struct p_delay_probe93 delay_probe93; | ||
652 | struct p_rs_uuid rs_uuid; | ||
653 | struct p_block_desc block_desc; | ||
654 | } __packed; | ||
655 | 505 | ||
656 | /**********************************************************************/ | 506 | /**********************************************************************/ |
657 | enum drbd_thread_state { | 507 | enum drbd_thread_state { |
658 | None, | 508 | NONE, |
659 | Running, | 509 | RUNNING, |
660 | Exiting, | 510 | EXITING, |
661 | Restarting | 511 | RESTARTING |
662 | }; | 512 | }; |
663 | 513 | ||
664 | struct drbd_thread { | 514 | struct drbd_thread { |
@@ -667,8 +517,9 @@ struct drbd_thread { | |||
667 | struct completion stop; | 517 | struct completion stop; |
668 | enum drbd_thread_state t_state; | 518 | enum drbd_thread_state t_state; |
669 | int (*function) (struct drbd_thread *); | 519 | int (*function) (struct drbd_thread *); |
670 | struct drbd_conf *mdev; | 520 | struct drbd_tconn *tconn; |
671 | int reset_cpu_mask; | 521 | int reset_cpu_mask; |
522 | char name[9]; | ||
672 | }; | 523 | }; |
673 | 524 | ||
674 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | 525 | static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) |
@@ -681,58 +532,54 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) | |||
681 | return thi->t_state; | 532 | return thi->t_state; |
682 | } | 533 | } |
683 | 534 | ||
684 | struct drbd_work; | ||
685 | typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); | ||
686 | struct drbd_work { | 535 | struct drbd_work { |
687 | struct list_head list; | 536 | struct list_head list; |
688 | drbd_work_cb cb; | 537 | int (*cb)(struct drbd_work *, int cancel); |
538 | union { | ||
539 | struct drbd_conf *mdev; | ||
540 | struct drbd_tconn *tconn; | ||
541 | }; | ||
689 | }; | 542 | }; |
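Note the changed callback signature: a work handler no longer receives a struct drbd_conf * argument and instead recovers its context from the anonymous union (or, for embedded work items, via container_of()). A hypothetical handler, shown only to illustrate the new two-argument form:

    /* w_example is a made-up name; a real handler uses either the union
     * member or container_of(), depending on how the work item is embedded */
    static int w_example(struct drbd_work *w, int cancel)
    {
            struct drbd_conf *mdev = w->mdev;       /* per-device work */
            /* struct drbd_tconn *tconn = w->tconn;    per-connection work */

            if (cancel)
                    return 0;
            /* ... act on mdev ... */
            return 0;
    }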
690 | 543 | ||
691 | struct drbd_tl_epoch; | 544 | #include "drbd_interval.h" |
545 | |||
546 | extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *); | ||
547 | |||
692 | struct drbd_request { | 548 | struct drbd_request { |
693 | struct drbd_work w; | 549 | struct drbd_work w; |
694 | struct drbd_conf *mdev; | ||
695 | 550 | ||
696 | /* if local IO is not allowed, will be NULL. | 551 | /* if local IO is not allowed, will be NULL. |
697 | * if local IO _is_ allowed, holds the locally submitted bio clone, | 552 | * if local IO _is_ allowed, holds the locally submitted bio clone, |
698 | * or, after local IO completion, the ERR_PTR(error). | 553 | * or, after local IO completion, the ERR_PTR(error). |
699 | * see drbd_endio_pri(). */ | 554 | * see drbd_request_endio(). */ |
700 | struct bio *private_bio; | 555 | struct bio *private_bio; |
701 | 556 | ||
702 | struct hlist_node collision; | 557 | struct drbd_interval i; |
703 | sector_t sector; | ||
704 | unsigned int size; | ||
705 | unsigned int epoch; /* barrier_nr */ | ||
706 | 558 | ||
707 | /* barrier_nr: used to check on "completion" whether this req was in | 559 | /* epoch: used to check on "completion" whether this req was in |
708 | * the current epoch, and we therefore have to close it, | 560 | * the current epoch, and we therefore have to close it, |
709 | * starting a new epoch... | 561 | * causing a p_barrier packet to be sent, starting a new epoch. |
562 | * | ||
563 | * This corresponds to "barrier" in struct p_barrier[_ack], | ||
564 | * and to "barrier_nr" in struct drbd_epoch (and various | ||
565 | * comments/function parameters/local variable names). | ||
710 | */ | 566 | */ |
567 | unsigned int epoch; | ||
711 | 568 | ||
712 | struct list_head tl_requests; /* ring list in the transfer log */ | 569 | struct list_head tl_requests; /* ring list in the transfer log */ |
713 | struct bio *master_bio; /* master bio pointer */ | 570 | struct bio *master_bio; /* master bio pointer */ |
714 | unsigned long rq_state; /* see comments above _req_mod() */ | ||
715 | unsigned long start_time; | 571 | unsigned long start_time; |
716 | }; | ||
717 | |||
718 | struct drbd_tl_epoch { | ||
719 | struct drbd_work w; | ||
720 | struct list_head requests; /* requests before */ | ||
721 | struct drbd_tl_epoch *next; /* pointer to the next barrier */ | ||
722 | unsigned int br_number; /* the barriers identifier. */ | ||
723 | int n_writes; /* number of requests attached before this barrier */ | ||
724 | }; | ||
725 | 572 | ||
726 | struct drbd_request; | 573 | /* once it hits 0, we may complete the master_bio */ |
574 | atomic_t completion_ref; | ||
575 | /* once it hits 0, we may destroy this drbd_request object */ | ||
576 | struct kref kref; | ||
727 | 577 | ||
728 | /* These Tl_epoch_entries may be in one of 6 lists: | 578 | unsigned rq_state; /* see comments above _req_mod() */ |
729 | active_ee .. data packet being written | 579 | }; |
730 | sync_ee .. syncer block being written | ||
731 | done_ee .. block written, need to send P_WRITE_ACK | ||
732 | read_ee .. [RS]P_DATA_REQUEST being read | ||
733 | */ | ||
734 | 580 | ||
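The request lifetime is now split across two counters, as the comments above state: completion_ref gates completion of the master_bio, while the kref gates destruction of the request object itself. A sketch of a put path under those rules; the helper names are invented, and bio_endio() is used in its two-argument form of this kernel generation:

    static void req_destroy(struct kref *kref)                /* hypothetical */
    {
            struct drbd_request *req = container_of(kref, struct drbd_request, kref);

            mempool_free(req, drbd_request_mempool);
    }

    static void req_put(struct drbd_request *req, int error)  /* hypothetical */
    {
            if (atomic_dec_and_test(&req->completion_ref))
                    bio_endio(req->master_bio, error);  /* complete towards the upper layers */
            kref_put(&req->kref, req_destroy);          /* possibly the last reference */
    }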
735 | struct drbd_epoch { | 581 | struct drbd_epoch { |
582 | struct drbd_tconn *tconn; | ||
736 | struct list_head list; | 583 | struct list_head list; |
737 | unsigned int barrier_nr; | 584 | unsigned int barrier_nr; |
738 | atomic_t epoch_size; /* increased on every request added. */ | 585 | atomic_t epoch_size; /* increased on every request added. */ |
@@ -762,17 +609,14 @@ struct digest_info { | |||
762 | void *digest; | 609 | void *digest; |
763 | }; | 610 | }; |
764 | 611 | ||
765 | struct drbd_epoch_entry { | 612 | struct drbd_peer_request { |
766 | struct drbd_work w; | 613 | struct drbd_work w; |
767 | struct hlist_node collision; | ||
768 | struct drbd_epoch *epoch; /* for writes */ | 614 | struct drbd_epoch *epoch; /* for writes */ |
769 | struct drbd_conf *mdev; | ||
770 | struct page *pages; | 615 | struct page *pages; |
771 | atomic_t pending_bios; | 616 | atomic_t pending_bios; |
772 | unsigned int size; | 617 | struct drbd_interval i; |
773 | /* see comments on ee flag bits below */ | 618 | /* see comments on ee flag bits below */ |
774 | unsigned long flags; | 619 | unsigned long flags; |
775 | sector_t sector; | ||
776 | union { | 620 | union { |
777 | u64 block_id; | 621 | u64 block_id; |
778 | struct digest_info *digest; | 622 | struct digest_info *digest; |
@@ -793,31 +637,37 @@ enum { | |||
793 | * we need to resubmit without the barrier flag. */ | 637 | * we need to resubmit without the barrier flag. */ |
794 | __EE_RESUBMITTED, | 638 | __EE_RESUBMITTED, |
795 | 639 | ||
796 | /* we may have several bios per epoch entry. | 640 | /* we may have several bios per peer request. |
797 | * if any of those fail, we set this flag atomically | 641 | * if any of those fail, we set this flag atomically |
798 | * from the endio callback */ | 642 | * from the endio callback */ |
799 | __EE_WAS_ERROR, | 643 | __EE_WAS_ERROR, |
800 | 644 | ||
801 | /* This ee has a pointer to a digest instead of a block id */ | 645 | /* This ee has a pointer to a digest instead of a block id */ |
802 | __EE_HAS_DIGEST, | 646 | __EE_HAS_DIGEST, |
647 | |||
648 | /* Conflicting local requests need to be restarted after this request */ | ||
649 | __EE_RESTART_REQUESTS, | ||
650 | |||
651 | /* The peer wants a write ACK for this (wire proto C) */ | ||
652 | __EE_SEND_WRITE_ACK, | ||
653 | |||
654 | /* Is set when net_conf had two_primaries set while creating this peer_req */ | ||
655 | __EE_IN_INTERVAL_TREE, | ||
803 | }; | 656 | }; |
804 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) | 657 | #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) |
805 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) | 658 | #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) |
806 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) | 659 | #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) |
807 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) | 660 | #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) |
808 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) | 661 | #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) |
662 | #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) | ||
663 | #define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) | ||
664 | #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) | ||
809 | 665 | ||
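The split between the __EE_* bit numbers and the EE_* masks follows the usual kernel pattern: the bit numbers feed the atomic set_bit()/test_bit() family (e.g. from the endio callback, per the comment on __EE_WAS_ERROR), while the masks serve plain tests under a lock. A usage fragment, assuming a peer_req and mdev in scope; P_WRITE_ACK is the ack packet named in the done_ee comment later in this header:

    /* atomic: several bios of one peer request may fail concurrently */
    set_bit(__EE_WAS_ERROR, &peer_req->flags);

    /* non-atomic test via the mask, e.g. when finishing a write */
    if (peer_req->flags & EE_SEND_WRITE_ACK)
            drbd_send_ack(mdev, P_WRITE_ACK, peer_req);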
810 | /* global flag bits */ | 666 | /* flag bits per mdev */ |
811 | enum { | 667 | enum { |
812 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
813 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
814 | SEND_PING, /* whether asender should send a ping asap */ | ||
815 | |||
816 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ | 668 | UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ |
817 | MD_DIRTY, /* current uuids and flags not yet on disk */ | 669 | MD_DIRTY, /* current uuids and flags not yet on disk */ |
818 | DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ | ||
819 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ | 670 | USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ |
820 | CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ | ||
821 | CL_ST_CHG_SUCCESS, | 671 | CL_ST_CHG_SUCCESS, |
822 | CL_ST_CHG_FAIL, | 672 | CL_ST_CHG_FAIL, |
823 | CRASHED_PRIMARY, /* This node was a crashed primary. | 673 | CRASHED_PRIMARY, /* This node was a crashed primary. |
@@ -831,32 +681,18 @@ enum { | |||
831 | once no more io in flight, start bitmap io */ | 681 | once no more io in flight, start bitmap io */ |
832 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | 682 | BITMAP_IO_QUEUED, /* Started bitmap IO */ |
833 | GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ | 683 | GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ |
834 | WAS_IO_ERROR, /* Local disk failed returned IO error */ | 684 | WAS_IO_ERROR, /* Local disk failed, returned IO error */ |
685 | WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ | ||
835 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ | 686 | FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ |
836 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | 687 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ |
837 | NET_CONGESTED, /* The data socket is congested */ | ||
838 | |||
839 | CONFIG_PENDING, /* serialization of (re)configuration requests. | ||
840 | * if set, also prevents the device from dying */ | ||
841 | DEVICE_DYING, /* device became unconfigured, | ||
842 | * but worker thread is still handling the cleanup. | ||
843 | * reconfiguring (nl_disk_conf, nl_net_conf) is disallowed, | ||
844 | * while this is set. */ | ||
845 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from | 688 | RESIZE_PENDING, /* Size change detected locally, waiting for the response from |
846 | * the peer, if it changed there as well. */ | 689 | * the peer, if it changed there as well. */ |
847 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
848 | GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ | ||
849 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ | 690 | NEW_CUR_UUID, /* Create new current UUID when thawing IO */ |
850 | AL_SUSPENDED, /* Activity logging is currently suspended. */ | 691 | AL_SUSPENDED, /* Activity logging is currently suspended. */ |
851 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ | 692 | AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ |
852 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | 693 | B_RS_H_DONE, /* Before resync handler done (already executed) */ |
853 | 694 | DISCARD_MY_DATA, /* discard_my_data flag per volume */ | |
854 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | 695 | READ_BALANCE_RR, |
855 | * pending, from drbd worker context. | ||
856 | * If set, bdi_write_congested() returns true, | ||
857 | * so shrink_page_list() would not recurse into, | ||
858 | * and potentially deadlock on, this drbd worker. | ||
859 | */ | ||
860 | }; | 696 | }; |
861 | 697 | ||
862 | struct drbd_bitmap; /* opaque for drbd_conf */ | 698 | struct drbd_bitmap; /* opaque for drbd_conf */ |
@@ -894,24 +730,24 @@ enum bm_flag { | |||
894 | 730 | ||
895 | struct drbd_work_queue { | 731 | struct drbd_work_queue { |
896 | struct list_head q; | 732 | struct list_head q; |
897 | struct semaphore s; /* producers up it, worker down()s it */ | ||
898 | spinlock_t q_lock; /* to protect the list. */ | 733 | spinlock_t q_lock; /* to protect the list. */ |
734 | wait_queue_head_t q_wait; | ||
899 | }; | 735 | }; |
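The semaphore is gone; producers now append under q_lock and wake q_wait, and the consumer sleeps until the list is non-empty, with the spinlock dropped while asleep. A sketch of that pairing with hypothetical function names:

    static void queue_work_item(struct drbd_work_queue *q, struct drbd_work *w)
    {
            unsigned long flags;

            spin_lock_irqsave(&q->q_lock, flags);
            list_add_tail(&w->list, &q->q);
            spin_unlock_irqrestore(&q->q_lock, flags);
            wake_up(&q->q_wait);
    }

    static struct drbd_work *dequeue_work_item(struct drbd_work_queue *q)
    {
            struct drbd_work *w;

            spin_lock_irq(&q->q_lock);
            /* releases q_lock while sleeping, returns with it re-acquired */
            wait_event_lock_irq(q->q_wait, !list_empty(&q->q), q->q_lock);
            w = list_first_entry(&q->q, struct drbd_work, list);
            list_del_init(&w->list);
            spin_unlock_irq(&q->q_lock);
            return w;
    }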
900 | 736 | ||
901 | struct drbd_socket { | 737 | struct drbd_socket { |
902 | struct drbd_work_queue work; | ||
903 | struct mutex mutex; | 738 | struct mutex mutex; |
904 | struct socket *socket; | 739 | struct socket *socket; |
905 | /* this way we get our | 740 | /* this way we get our |
906 | * send/receive buffers off the stack */ | 741 | * send/receive buffers off the stack */ |
907 | union p_polymorph sbuf; | 742 | void *sbuf; |
908 | union p_polymorph rbuf; | 743 | void *rbuf; |
909 | }; | 744 | }; |
910 | 745 | ||
911 | struct drbd_md { | 746 | struct drbd_md { |
912 | u64 md_offset; /* sector offset to 'super' block */ | 747 | u64 md_offset; /* sector offset to 'super' block */ |
913 | 748 | ||
914 | u64 la_size_sect; /* last agreed size, unit sectors */ | 749 | u64 la_size_sect; /* last agreed size, unit sectors */ |
750 | spinlock_t uuid_lock; | ||
915 | u64 uuid[UI_SIZE]; | 751 | u64 uuid[UI_SIZE]; |
916 | u64 device_uuid; | 752 | u64 device_uuid; |
917 | u32 flags; | 753 | u32 flags; |
@@ -921,24 +757,16 @@ struct drbd_md { | |||
921 | s32 bm_offset; /* signed relative sector offset to bitmap */ | 757 | s32 bm_offset; /* signed relative sector offset to bitmap */ |
922 | 758 | ||
923 | /* u32 al_nr_extents; important for restoring the AL | 759 | /* u32 al_nr_extents; important for restoring the AL |
924 | * is stored into sync_conf.al_extents, which in turn | 760 | * is stored into ldev->dc.al_extents, which in turn |
925 | * gets applied to act_log->nr_elements | 761 | * gets applied to act_log->nr_elements |
926 | */ | 762 | */ |
927 | }; | 763 | }; |
928 | 764 | ||
929 | /* for sync_conf and other types... */ | ||
930 | #define NL_PACKET(name, number, fields) struct name { fields }; | ||
931 | #define NL_INTEGER(pn,pr,member) int member; | ||
932 | #define NL_INT64(pn,pr,member) __u64 member; | ||
933 | #define NL_BIT(pn,pr,member) unsigned member:1; | ||
934 | #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; | ||
935 | #include <linux/drbd_nl.h> | ||
936 | |||
937 | struct drbd_backing_dev { | 765 | struct drbd_backing_dev { |
938 | struct block_device *backing_bdev; | 766 | struct block_device *backing_bdev; |
939 | struct block_device *md_bdev; | 767 | struct block_device *md_bdev; |
940 | struct drbd_md md; | 768 | struct drbd_md md; |
941 | struct disk_conf dc; /* The user provided config... */ | 769 | struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */ |
942 | sector_t known_size; /* last known size of that backing device */ | 770 | sector_t known_size; /* last known size of that backing device */ |
943 | }; | 771 | }; |
944 | 772 | ||
@@ -962,18 +790,116 @@ enum write_ordering_e { | |||
962 | }; | 790 | }; |
963 | 791 | ||
964 | struct fifo_buffer { | 792 | struct fifo_buffer { |
965 | int *values; | ||
966 | unsigned int head_index; | 793 | unsigned int head_index; |
967 | unsigned int size; | 794 | unsigned int size; |
795 | int total; /* sum of all values */ | ||
796 | int values[0]; | ||
797 | }; | ||
798 | extern struct fifo_buffer *fifo_alloc(int fifo_size); | ||
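With values[] now a flexible array member at the tail of the struct, a single allocation covers header and samples, and the new total field carries the sum of all values, per its comment. One plausible shape of fifo_alloc() under those assumptions (the GFP flag is an assumption, not taken from the patch):

    struct fifo_buffer *fifo_alloc(int fifo_size)
    {
            struct fifo_buffer *fb;

            fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
            if (!fb)
                    return NULL;
            fb->head_index = 0;
            fb->size = fifo_size;
            fb->total = 0;
            return fb;
    }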
799 | |||
800 | /* flag bits per tconn */ | ||
801 | enum { | ||
802 | NET_CONGESTED, /* The data socket is congested */ | ||
803 | RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ | ||
804 | SEND_PING, /* whether asender should send a ping asap */ | ||
805 | SIGNAL_ASENDER, /* whether asender wants to be interrupted */ | ||
806 | GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ | ||
807 | CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ | ||
808 | CONN_WD_ST_CHG_OKAY, | ||
809 | CONN_WD_ST_CHG_FAIL, | ||
810 | CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ | ||
811 | CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ | ||
812 | STATE_SENT, /* Do not change state/UUIDs while this is set */ | ||
813 | CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) | ||
814 | * pending, from drbd worker context. | ||
815 | * If set, bdi_write_congested() returns true, | ||
816 | * so shrink_page_list() would not recurse into, | ||
817 | * and potentially deadlock on, this drbd worker. | ||
818 | */ | ||
819 | DISCONNECT_SENT, | ||
820 | }; | ||
821 | |||
822 | struct drbd_tconn { /* is a resource from the config file */ | ||
823 | char *name; /* Resource name */ | ||
824 | struct list_head all_tconn; /* linked on global drbd_tconns */ | ||
825 | struct kref kref; | ||
826 | struct idr volumes; /* <tconn, vnr> to mdev mapping */ | ||
827 | enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ | ||
828 | unsigned susp:1; /* IO suspended by user */ | ||
829 | unsigned susp_nod:1; /* IO suspended because no data */ | ||
830 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ | ||
831 | struct mutex cstate_mutex; /* Protects graceful disconnects */ | ||
832 | |||
833 | unsigned long flags; | ||
834 | struct net_conf *net_conf; /* content protected by rcu */ | ||
835 | struct mutex conf_update; /* mutex for read-copy-update of net_conf and disk_conf */ | ||
836 | wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ | ||
837 | struct res_opts res_opts; | ||
838 | |||
839 | struct sockaddr_storage my_addr; | ||
840 | int my_addr_len; | ||
841 | struct sockaddr_storage peer_addr; | ||
842 | int peer_addr_len; | ||
843 | |||
844 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | ||
845 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
846 | int agreed_pro_version; /* actually used protocol version */ | ||
847 | unsigned long last_received; /* in jiffies, either socket */ | ||
848 | unsigned int ko_count; | ||
849 | |||
850 | spinlock_t req_lock; | ||
851 | |||
852 | struct list_head transfer_log; /* all requests not yet fully processed */ | ||
853 | |||
854 | struct crypto_hash *cram_hmac_tfm; | ||
855 | struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */ | ||
856 | struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ | ||
857 | struct crypto_hash *csums_tfm; | ||
858 | struct crypto_hash *verify_tfm; | ||
859 | void *int_dig_in; | ||
860 | void *int_dig_vv; | ||
861 | |||
862 | /* receiver side */ | ||
863 | struct drbd_epoch *current_epoch; | ||
864 | spinlock_t epoch_lock; | ||
865 | unsigned int epochs; | ||
866 | enum write_ordering_e write_ordering; | ||
867 | atomic_t current_tle_nr; /* transfer log epoch number */ | ||
868 | unsigned current_tle_writes; /* writes seen within this tl epoch */ | ||
869 | |||
870 | unsigned long last_reconnect_jif; | ||
871 | struct drbd_thread receiver; | ||
872 | struct drbd_thread worker; | ||
873 | struct drbd_thread asender; | ||
874 | cpumask_var_t cpu_mask; | ||
875 | |||
876 | /* sender side */ | ||
877 | struct drbd_work_queue sender_work; | ||
878 | |||
879 | struct { | ||
880 | /* whether this sender thread | ||
881 | * has processed a single write yet. */ | ||
882 | bool seen_any_write_yet; | ||
883 | |||
884 | /* Which barrier number to send with the next P_BARRIER */ | ||
885 | int current_epoch_nr; | ||
886 | |||
887 | /* how many write requests have been sent | ||
888 | * with req->epoch == current_epoch_nr. | ||
889 | * If none, no P_BARRIER will be sent. */ | ||
890 | unsigned current_epoch_writes; | ||
891 | } send; | ||
968 | }; | 892 | }; |
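net_conf is marked as RCU-protected, with conf_update as the update-side mutex. A reader-side sketch; the two_primaries option is the one referenced in the __EE_IN_INTERVAL_TREE comment above, everything else here is illustrative:

    struct net_conf *nc;
    bool two_primaries = false;

    rcu_read_lock();
    nc = rcu_dereference(tconn->net_conf);
    if (nc)
            two_primaries = nc->two_primaries;
    rcu_read_unlock();

    /* writers: take tconn->conf_update, publish the new struct with
     * rcu_assign_pointer(), and free the old one only after a grace period */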
969 | 893 | ||
970 | struct drbd_conf { | 894 | struct drbd_conf { |
895 | struct drbd_tconn *tconn; | ||
896 | int vnr; /* volume number within the connection */ | ||
897 | struct kref kref; | ||
898 | |||
971 | /* things that are stored as / read from meta data on disk */ | 899 | /* things that are stored as / read from meta data on disk */ |
972 | unsigned long flags; | 900 | unsigned long flags; |
973 | 901 | ||
974 | /* configured by drbdsetup */ | 902 | /* configured by drbdsetup */ |
975 | struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ | ||
976 | struct syncer_conf sync_conf; | ||
977 | struct drbd_backing_dev *ldev __protected_by(local); | 903 | struct drbd_backing_dev *ldev __protected_by(local); |
978 | 904 | ||
979 | sector_t p_size; /* partner's disk size */ | 905 | sector_t p_size; /* partner's disk size */ |
@@ -981,11 +907,7 @@ struct drbd_conf { | |||
981 | struct block_device *this_bdev; | 907 | struct block_device *this_bdev; |
982 | struct gendisk *vdisk; | 908 | struct gendisk *vdisk; |
983 | 909 | ||
984 | struct drbd_socket data; /* data/barrier/cstate/parameter packets */ | 910 | unsigned long last_reattach_jif; |
985 | struct drbd_socket meta; /* ping/ack (metadata) packets */ | ||
986 | int agreed_pro_version; /* actually used protocol version */ | ||
987 | unsigned long last_received; /* in jiffies, either socket */ | ||
988 | unsigned int ko_count; | ||
989 | struct drbd_work resync_work, | 911 | struct drbd_work resync_work, |
990 | unplug_work, | 912 | unplug_work, |
991 | go_diskless, | 913 | go_diskless, |
@@ -1005,10 +927,9 @@ struct drbd_conf { | |||
1005 | /* Used after attach while negotiating new disk state. */ | 927 | /* Used after attach while negotiating new disk state. */ |
1006 | union drbd_state new_state_tmp; | 928 | union drbd_state new_state_tmp; |
1007 | 929 | ||
1008 | union drbd_state state; | 930 | union drbd_dev_state state; |
1009 | wait_queue_head_t misc_wait; | 931 | wait_queue_head_t misc_wait; |
1010 | wait_queue_head_t state_wait; /* upon each state change. */ | 932 | wait_queue_head_t state_wait; /* upon each state change. */ |
1011 | wait_queue_head_t net_cnt_wait; | ||
1012 | unsigned int send_cnt; | 933 | unsigned int send_cnt; |
1013 | unsigned int recv_cnt; | 934 | unsigned int recv_cnt; |
1014 | unsigned int read_cnt; | 935 | unsigned int read_cnt; |
@@ -1018,17 +939,12 @@ struct drbd_conf { | |||
1018 | atomic_t ap_bio_cnt; /* Requests we need to complete */ | 939 | atomic_t ap_bio_cnt; /* Requests we need to complete */ |
1019 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ | 940 | atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ |
1020 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ | 941 | atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ |
1021 | atomic_t unacked_cnt; /* Need to send replys for */ | 942 | atomic_t unacked_cnt; /* Need to send replies for */ |
1022 | atomic_t local_cnt; /* Waiting for local completion */ | 943 | atomic_t local_cnt; /* Waiting for local completion */ |
1023 | atomic_t net_cnt; /* Users of net_conf */ | 944 | |
1024 | spinlock_t req_lock; | 945 | /* Interval tree of pending local requests */ |
1025 | struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ | 946 | struct rb_root read_requests; |
1026 | struct drbd_tl_epoch *newest_tle; | 947 | struct rb_root write_requests; |
1027 | struct drbd_tl_epoch *oldest_tle; | ||
1028 | struct list_head out_of_sequence_requests; | ||
1029 | struct list_head barrier_acked_requests; | ||
1030 | struct hlist_head *tl_hash; | ||
1031 | unsigned int tl_hash_s; | ||
1032 | 948 | ||
1033 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ | 949 | /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ |
1034 | unsigned long rs_total; | 950 | unsigned long rs_total; |
@@ -1048,9 +964,11 @@ struct drbd_conf { | |||
1048 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; | 964 | unsigned long rs_mark_time[DRBD_SYNC_MARKS]; |
1049 | /* current index into rs_mark_{left,time} */ | 965 | /* current index into rs_mark_{left,time} */ |
1050 | int rs_last_mark; | 966 | int rs_last_mark; |
967 | unsigned long rs_last_bcast; /* [unit jiffies] */ | ||
1051 | 968 | ||
1052 | /* where does the admin want us to start? (sector) */ | 969 | /* where does the admin want us to start? (sector) */ |
1053 | sector_t ov_start_sector; | 970 | sector_t ov_start_sector; |
971 | sector_t ov_stop_sector; | ||
1054 | /* where are we now? (sector) */ | 972 | /* where are we now? (sector) */ |
1055 | sector_t ov_position; | 973 | sector_t ov_position; |
1056 | /* Start sector of out of sync range (to merge printk reporting). */ | 974 | /* Start sector of out of sync range (to merge printk reporting). */ |
@@ -1058,14 +976,7 @@ struct drbd_conf { | |||
1058 | /* size of out-of-sync range in sectors. */ | 976 | /* size of out-of-sync range in sectors. */ |
1059 | sector_t ov_last_oos_size; | 977 | sector_t ov_last_oos_size; |
1060 | unsigned long ov_left; /* in bits */ | 978 | unsigned long ov_left; /* in bits */ |
1061 | struct crypto_hash *csums_tfm; | ||
1062 | struct crypto_hash *verify_tfm; | ||
1063 | 979 | ||
1064 | unsigned long last_reattach_jif; | ||
1065 | unsigned long last_reconnect_jif; | ||
1066 | struct drbd_thread receiver; | ||
1067 | struct drbd_thread worker; | ||
1068 | struct drbd_thread asender; | ||
1069 | struct drbd_bitmap *bitmap; | 980 | struct drbd_bitmap *bitmap; |
1070 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ | 981 | unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ |
1071 | 982 | ||
@@ -1078,29 +989,19 @@ struct drbd_conf { | |||
1078 | 989 | ||
1079 | int open_cnt; | 990 | int open_cnt; |
1080 | u64 *p_uuid; | 991 | u64 *p_uuid; |
1081 | struct drbd_epoch *current_epoch; | 992 | |
1082 | spinlock_t epoch_lock; | ||
1083 | unsigned int epochs; | ||
1084 | enum write_ordering_e write_ordering; | ||
1085 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ | 993 | struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ |
1086 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ | 994 | struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ |
1087 | struct list_head done_ee; /* send ack */ | 995 | struct list_head done_ee; /* need to send P_WRITE_ACK */ |
1088 | struct list_head read_ee; /* IO in progress (any read) */ | 996 | struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ |
1089 | struct list_head net_ee; /* zero-copy network send in progress */ | 997 | struct list_head net_ee; /* zero-copy network send in progress */ |
1090 | struct hlist_head *ee_hash; /* is protected by req_lock! */ | ||
1091 | unsigned int ee_hash_s; | ||
1092 | |||
1093 | /* this one is protected by ee_lock, single thread */ | ||
1094 | struct drbd_epoch_entry *last_write_w_barrier; | ||
1095 | 998 | ||
1096 | int next_barrier_nr; | 999 | int next_barrier_nr; |
1097 | struct hlist_head *app_reads_hash; /* is protected by req_lock */ | ||
1098 | struct list_head resync_reads; | 1000 | struct list_head resync_reads; |
1099 | atomic_t pp_in_use; /* allocated from page pool */ | 1001 | atomic_t pp_in_use; /* allocated from page pool */ |
1100 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ | 1002 | atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ |
1101 | wait_queue_head_t ee_wait; | 1003 | wait_queue_head_t ee_wait; |
1102 | struct page *md_io_page; /* one page buffer for md_io */ | 1004 | struct page *md_io_page; /* one page buffer for md_io */ |
1103 | struct page *md_io_tmpp; /* for logical_block_size != 512 */ | ||
1104 | struct drbd_md_io md_io; | 1005 | struct drbd_md_io md_io; |
1105 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ | 1006 | atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ |
1106 | spinlock_t al_lock; | 1007 | spinlock_t al_lock; |
@@ -1109,22 +1010,16 @@ struct drbd_conf { | |||
1109 | unsigned int al_tr_number; | 1010 | unsigned int al_tr_number; |
1110 | int al_tr_cycle; | 1011 | int al_tr_cycle; |
1111 | int al_tr_pos; /* position of the next transaction in the journal */ | 1012 | int al_tr_pos; /* position of the next transaction in the journal */ |
1112 | struct crypto_hash *cram_hmac_tfm; | ||
1113 | struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ | ||
1114 | struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ | ||
1115 | void *int_dig_out; | ||
1116 | void *int_dig_in; | ||
1117 | void *int_dig_vv; | ||
1118 | wait_queue_head_t seq_wait; | 1013 | wait_queue_head_t seq_wait; |
1119 | atomic_t packet_seq; | 1014 | atomic_t packet_seq; |
1120 | unsigned int peer_seq; | 1015 | unsigned int peer_seq; |
1121 | spinlock_t peer_seq_lock; | 1016 | spinlock_t peer_seq_lock; |
1122 | unsigned int minor; | 1017 | unsigned int minor; |
1123 | unsigned long comm_bm_set; /* communicated number of set bits. */ | 1018 | unsigned long comm_bm_set; /* communicated number of set bits. */ |
1124 | cpumask_var_t cpu_mask; | ||
1125 | struct bm_io_work bm_io_work; | 1019 | struct bm_io_work bm_io_work; |
1126 | u64 ed_uuid; /* UUID of the exposed data */ | 1020 | u64 ed_uuid; /* UUID of the exposed data */ |
1127 | struct mutex state_mutex; | 1021 | struct mutex own_state_mutex; |
1022 | struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */ | ||
1128 | char congestion_reason; /* Why we were congested... */ | 1023 | char congestion_reason; /* Why we were congested... */ |
1129 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ | 1024 | atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ |
1130 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ | 1025 | atomic_t rs_sect_ev; /* for submitted resync data rate, both */ |
@@ -1132,9 +1027,8 @@ struct drbd_conf { | |||
1132 | int rs_last_events; /* counter of read or write "events" (unit sectors) | 1027 | int rs_last_events; /* counter of read or write "events" (unit sectors) |
1133 | * on the lower level device when we last looked. */ | 1028 | * on the lower level device when we last looked. */ |
1134 | int c_sync_rate; /* current resync rate after syncer throttle magic */ | 1029 | int c_sync_rate; /* current resync rate after syncer throttle magic */ |
1135 | struct fifo_buffer rs_plan_s; /* correction values of resync planner */ | 1030 | struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conn_update) */ |
1136 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ | 1031 | int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ |
1137 | int rs_planed; /* resync sectors already planned */ | ||
1138 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ | 1032 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ |
1139 | unsigned int peer_max_bio_size; | 1033 | unsigned int peer_max_bio_size; |
1140 | unsigned int local_max_bio_size; | 1034 | unsigned int local_max_bio_size; |
@@ -1142,11 +1036,7 @@ struct drbd_conf { | |||
1142 | 1036 | ||
1143 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1037 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
1144 | { | 1038 | { |
1145 | struct drbd_conf *mdev; | 1039 | return (struct drbd_conf *)idr_find(&minors, minor); |
1146 | |||
1147 | mdev = minor < minor_count ? minor_table[minor] : NULL; | ||
1148 | |||
1149 | return mdev; | ||
1150 | } | 1040 | } |
1151 | 1041 | ||
1152 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | 1042 | static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) |
@@ -1154,29 +1044,9 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) | |||
1154 | return mdev->minor; | 1044 | return mdev->minor; |
1155 | } | 1045 | } |
1156 | 1046 | ||
1157 | /* returns 1 if it was successful, | 1047 | static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) |
1158 | * returns 0 if there was no data socket. | ||
1159 | * so wherever you are going to use the data.socket, e.g. do | ||
1160 | * if (!drbd_get_data_sock(mdev)) | ||
1161 | * return 0; | ||
1162 | * CODE(); | ||
1163 | * drbd_put_data_sock(mdev); | ||
1164 | */ | ||
1165 | static inline int drbd_get_data_sock(struct drbd_conf *mdev) | ||
1166 | { | ||
1167 | mutex_lock(&mdev->data.mutex); | ||
1168 | /* drbd_disconnect() could have called drbd_free_sock() | ||
1169 | * while we were waiting in down()... */ | ||
1170 | if (unlikely(mdev->data.socket == NULL)) { | ||
1171 | mutex_unlock(&mdev->data.mutex); | ||
1172 | return 0; | ||
1173 | } | ||
1174 | return 1; | ||
1175 | } | ||
1176 | |||
1177 | static inline void drbd_put_data_sock(struct drbd_conf *mdev) | ||
1178 | { | 1048 | { |
1179 | mutex_unlock(&mdev->data.mutex); | 1049 | return (struct drbd_conf *)idr_find(&tconn->volumes, vnr); |
1180 | } | 1050 | } |
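Device lookup is now by idr rather than by the old minor_table array, and the volumes idr maps a volume number to its mdev within a connection. A usage sketch for walking all volumes of a tconn; whether RCU or some other lock protects the walk in a given caller is not shown in this hunk, so the rcu_read_lock() here is an assumption:

    struct drbd_conf *mdev;
    int vnr;

    rcu_read_lock();
    idr_for_each_entry(&tconn->volumes, mdev, vnr) {
            /* per-volume work, e.g. drbd_send_uuids(mdev); */
    }
    rcu_read_unlock();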
1181 | 1051 | ||
1182 | /* | 1052 | /* |
@@ -1185,106 +1055,77 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev) | |||
1185 | 1055 | ||
1186 | /* drbd_main.c */ | 1056 | /* drbd_main.c */ |
1187 | 1057 | ||
1188 | enum chg_state_flags { | ||
1189 | CS_HARD = 1, | ||
1190 | CS_VERBOSE = 2, | ||
1191 | CS_WAIT_COMPLETE = 4, | ||
1192 | CS_SERIALIZE = 8, | ||
1193 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
1194 | }; | ||
1195 | |||
1196 | enum dds_flags { | 1058 | enum dds_flags { |
1197 | DDSF_FORCED = 1, | 1059 | DDSF_FORCED = 1, |
1198 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ | 1060 | DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ |
1199 | }; | 1061 | }; |
1200 | 1062 | ||
1201 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); | 1063 | extern void drbd_init_set_defaults(struct drbd_conf *mdev); |
1202 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
1203 | enum chg_state_flags f, | ||
1204 | union drbd_state mask, | ||
1205 | union drbd_state val); | ||
1206 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
1207 | union drbd_state); | ||
1208 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
1209 | union drbd_state, | ||
1210 | union drbd_state, | ||
1211 | enum chg_state_flags); | ||
1212 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
1213 | enum chg_state_flags, | ||
1214 | struct completion *done); | ||
1215 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
1216 | union drbd_state, int); | ||
1217 | extern int drbd_thread_start(struct drbd_thread *thi); | 1064 | extern int drbd_thread_start(struct drbd_thread *thi); |
1218 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); | 1065 | extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); |
1066 | extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task); | ||
1219 | #ifdef CONFIG_SMP | 1067 | #ifdef CONFIG_SMP |
1220 | extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); | 1068 | extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); |
1221 | extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); | 1069 | extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn); |
1222 | #else | 1070 | #else |
1223 | #define drbd_thread_current_set_cpu(A) ({}) | 1071 | #define drbd_thread_current_set_cpu(A) ({}) |
1224 | #define drbd_calc_cpu_mask(A) ({}) | 1072 | #define drbd_calc_cpu_mask(A) ({}) |
1225 | #endif | 1073 | #endif |
1226 | extern void drbd_free_resources(struct drbd_conf *mdev); | 1074 | extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr, |
1227 | extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | ||
1228 | unsigned int set_size); | 1075 | unsigned int set_size); |
1229 | extern void tl_clear(struct drbd_conf *mdev); | 1076 | extern void tl_clear(struct drbd_tconn *); |
1230 | extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); | 1077 | extern void drbd_free_sock(struct drbd_tconn *tconn); |
1231 | extern void drbd_free_sock(struct drbd_conf *mdev); | 1078 | extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
1232 | extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1079 | void *buf, size_t size, unsigned msg_flags); |
1233 | void *buf, size_t size, unsigned msg_flags); | 1080 | extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t, |
1234 | extern int drbd_send_protocol(struct drbd_conf *mdev); | 1081 | unsigned); |
1082 | |||
1083 | extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd); | ||
1084 | extern int drbd_send_protocol(struct drbd_tconn *tconn); | ||
1235 | extern int drbd_send_uuids(struct drbd_conf *mdev); | 1085 | extern int drbd_send_uuids(struct drbd_conf *mdev); |
1236 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); | 1086 | extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); |
1237 | extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); | 1087 | extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); |
1238 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); | 1088 | extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); |
1239 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); | 1089 | extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); |
1240 | extern int drbd_send_current_state(struct drbd_conf *mdev); | 1090 | extern int drbd_send_current_state(struct drbd_conf *mdev); |
1241 | extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 1091 | extern int drbd_send_sync_param(struct drbd_conf *mdev); |
1242 | enum drbd_packets cmd, struct p_header80 *h, | 1092 | extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, |
1243 | size_t size, unsigned msg_flags); | 1093 | u32 set_size); |
1244 | #define USE_DATA_SOCKET 1 | 1094 | extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet, |
1245 | #define USE_META_SOCKET 0 | 1095 | struct drbd_peer_request *); |
1246 | extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | 1096 | extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1247 | enum drbd_packets cmd, struct p_header80 *h, | 1097 | struct p_block_req *rp); |
1248 | size_t size); | 1098 | extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
1249 | extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, | 1099 | struct p_data *dp, int data_size); |
1250 | char *data, size_t size); | 1100 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
1251 | extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); | ||
1252 | extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, | ||
1253 | u32 set_size); | ||
1254 | extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1255 | struct drbd_epoch_entry *e); | ||
1256 | extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1257 | struct p_block_req *rp); | ||
1258 | extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1259 | struct p_data *dp, int data_size); | ||
1260 | extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | ||
1261 | sector_t sector, int blksize, u64 block_id); | 1101 | sector_t sector, int blksize, u64 block_id); |
1262 | extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req); | 1102 | extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *); |
1263 | extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1103 | extern int drbd_send_block(struct drbd_conf *, enum drbd_packet, |
1264 | struct drbd_epoch_entry *e); | 1104 | struct drbd_peer_request *); |
1265 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); | 1105 | extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); |
1266 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1106 | extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
1267 | sector_t sector, int size, u64 block_id); | 1107 | sector_t sector, int size, u64 block_id); |
1268 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1108 | extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, |
1269 | sector_t sector,int size, | 1109 | int size, void *digest, int digest_size, |
1270 | void *digest, int digest_size, | 1110 | enum drbd_packet cmd); |
1271 | enum drbd_packets cmd); | ||
1272 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); | 1111 | extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); |
1273 | 1112 | ||
1274 | extern int drbd_send_bitmap(struct drbd_conf *mdev); | 1113 | extern int drbd_send_bitmap(struct drbd_conf *mdev); |
1275 | extern int _drbd_send_bitmap(struct drbd_conf *mdev); | 1114 | extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); |
1276 | extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); | 1115 | extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode); |
1277 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); | 1116 | extern void drbd_free_bc(struct drbd_backing_dev *ldev); |
1278 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | 1117 | extern void drbd_mdev_cleanup(struct drbd_conf *mdev); |
1279 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); | 1118 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); |
1280 | 1119 | ||
1120 | extern void conn_md_sync(struct drbd_tconn *tconn); | ||
1281 | extern void drbd_md_sync(struct drbd_conf *mdev); | 1121 | extern void drbd_md_sync(struct drbd_conf *mdev); |
1282 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | 1122 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); |
1283 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1123 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
1284 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1124 | extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
1285 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | 1125 | extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); |
1286 | extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); | ||
1287 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); | 1126 | extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); |
1127 | extern void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local); | ||
1128 | extern void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | ||
1288 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); | 1129 | extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); |
1289 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); | 1130 | extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); |
1290 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); | 1131 | extern int drbd_md_test_flag(struct drbd_backing_dev *, int); |
@@ -1302,33 +1143,52 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
1302 | extern int drbd_bitmap_io(struct drbd_conf *mdev, | 1143 | extern int drbd_bitmap_io(struct drbd_conf *mdev, |
1303 | int (*io_fn)(struct drbd_conf *), | 1144 | int (*io_fn)(struct drbd_conf *), |
1304 | char *why, enum bm_flag flags); | 1145 | char *why, enum bm_flag flags); |
1146 | extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1147 | int (*io_fn)(struct drbd_conf *), | ||
1148 | char *why, enum bm_flag flags); | ||
1305 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | 1149 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); |
1306 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1150 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1307 | extern void drbd_go_diskless(struct drbd_conf *mdev); | 1151 | extern void drbd_go_diskless(struct drbd_conf *mdev); |
1308 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | 1152 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); |
1309 | 1153 | ||
1310 | |||
1311 | /* Meta data layout | 1154 | /* Meta data layout |
1312 | We reserve a 128MB Block (4k aligned) | 1155 | We reserve a 128MB Block (4k aligned) |
1313 | * either at the end of the backing device | 1156 | * either at the end of the backing device |
1314 | * or on a separate meta data device. */ | 1157 | * or on a separate meta data device. */ |
1315 | 1158 | ||
1316 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | ||
1317 | /* The following numbers are sectors */ | 1159 | /* The following numbers are sectors */ |
1318 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | 1160 | /* Allows up to about 3.8TB, so if you want more, |
1319 | #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ | 1161 | * you need to use the "flexible" meta data format. */ |
1320 | /* Allows up to about 3.8TB */ | 1162 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ |
1321 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) | 1163 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ |
1322 | 1164 | #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ | |
1323 | /* Since the smallest IO unit is usually 512 byte */ | 1165 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) |
1324 | #define MD_SECTOR_SHIFT 9 | 1166 | |
1325 | #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) | 1167 | /* we do all meta data IO in 4k blocks */ |
1326 | 1168 | #define MD_BLOCK_SHIFT 12 | |
1327 | /* activity log */ | 1169 | #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) |
1328 | #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ | 1170 | |
1329 | #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ | 1171 | /* One activity log extent represents 4M of storage */ |
1172 | #define AL_EXTENT_SHIFT 22 | ||
1330 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) | 1173 | #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) |
1331 | 1174 | ||
1175 | /* We could make these currently hardcoded constants configurable | ||
1176 | * variables at create-md time (or even re-configurable at runtime?). | ||
1177 | * Which will require some more changes to the DRBD "super block" | ||
1178 | * and attach code. | ||
1179 | * | ||
1180 | * updates per transaction: | ||
1181 | * This many changes to the active set can be logged with one transaction. | ||
1182 | * This number is arbitrary. | ||
1183 | * context per transaction: | ||
1184 | * This many context extent numbers are logged with each transaction. | ||
1185 | * This number is resulting from the transaction block size (4k), the layout | ||
1186 | * of the transaction header, and the number of updates per transaction. | ||
1187 | * See drbd_actlog.c:struct al_transaction_on_disk | ||
1188 | * */ | ||
1189 | #define AL_UPDATES_PER_TRANSACTION 64 // arbitrary | ||
1190 | #define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4 | ||
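Working out the quoted formula, under the assumption that each of the 64 update slots costs 6 bytes (a 32-bit extent number plus a 16-bit slot position) and each context entry is a 32-bit extent number:

    (4096 - 36 - 6*64) / 4 = (4096 - 36 - 384) / 4 = 3676 / 4 = 919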
1191 | |||
1332 | #if BITS_PER_LONG == 32 | 1192 | #if BITS_PER_LONG == 32 |
1333 | #define LN2_BPL 5 | 1193 | #define LN2_BPL 5 |
1334 | #define cpu_to_lel(A) cpu_to_le32(A) | 1194 | #define cpu_to_lel(A) cpu_to_le32(A) |
@@ -1364,11 +1224,14 @@ struct bm_extent { | |||
1364 | 1224 | ||
1365 | #define SLEEP_TIME (HZ/10) | 1225 | #define SLEEP_TIME (HZ/10) |
1366 | 1226 | ||
1367 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | 1227 | /* We do bitmap IO in units of 4k blocks. |
1228 | * We also still have a hardcoded 4k per bit relation. */ | ||
1229 | #define BM_BLOCK_SHIFT 12 /* 4k per bit */ | ||
1368 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) | 1230 | #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) |
1369 | /* (9+3) : 512 bytes @ 8 bits; representing 16M storage | 1231 | /* mostly arbitrarily set the represented size of one bitmap extent, |
1370 | * per sector of on disk bitmap */ | 1232 | * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap |
1371 | #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ | 1233 | * at 4k per bit resolution) */ |
1234 | #define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */ | ||
1372 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) | 1235 | #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) |
1373 | 1236 | ||
1374 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) | 1237 | #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) |
@@ -1436,17 +1299,20 @@ struct bm_extent { | |||
1436 | #endif | 1299 | #endif |
1437 | #endif | 1300 | #endif |
1438 | 1301 | ||
1439 | /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. | 1302 | /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, |
1440 | * With a value of 8 all IO in one 128K block make it to the same slot of the | 1303 | * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. |
1441 | * hash table. */ | 1304 | * Since we may live in a mixed-platform cluster, |
1442 | #define HT_SHIFT 8 | 1305 | * we limit ourselves to a platform-agnostic constant here for now. |
1443 | #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) | 1306 | * A followup commit may allow even bigger BIO sizes, |
1307 | * once we thought that through. */ | ||
1308 | #define DRBD_MAX_BIO_SIZE (1U << 20) | ||
1309 | #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1310 | #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE | ||
1311 | #endif | ||
1444 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ | 1312 | #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ |
1445 | 1313 | ||
1446 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ | 1314 | #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ |
1447 | 1315 | #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ | |
1448 | /* Number of elements in the app_reads_hash */ | ||
1449 | #define APP_R_HSIZE 15 | ||
1450 | 1316 | ||
1451 | extern int drbd_bm_init(struct drbd_conf *mdev); | 1317 | extern int drbd_bm_init(struct drbd_conf *mdev); |
1452 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); | 1318 | extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); |
@@ -1468,11 +1334,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); | |||
1468 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | 1334 | extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); |
1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); | 1335 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); |
1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | 1336 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); |
1337 | extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr); | ||
1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | 1338 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); |
1339 | extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local); | ||
1472 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); | 1340 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); |
1473 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); | 1341 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); |
1474 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | ||
1475 | unsigned long al_enr); | ||
1476 | extern size_t drbd_bm_words(struct drbd_conf *mdev); | 1342 | extern size_t drbd_bm_words(struct drbd_conf *mdev); |
1477 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); | 1343 | extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); |
1478 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); | 1344 | extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); |
@@ -1497,7 +1363,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev); | |||
1497 | /* drbd_main.c */ | 1363 | /* drbd_main.c */ |
1498 | 1364 | ||
1499 | extern struct kmem_cache *drbd_request_cache; | 1365 | extern struct kmem_cache *drbd_request_cache; |
1500 | extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 1366 | extern struct kmem_cache *drbd_ee_cache; /* peer requests */ |
1501 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 1367 | extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
1502 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 1368 | extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
1503 | extern mempool_t *drbd_request_mempool; | 1369 | extern mempool_t *drbd_request_mempool; |
@@ -1537,12 +1403,22 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); | |||
1537 | 1403 | ||
1538 | extern rwlock_t global_state_lock; | 1404 | extern rwlock_t global_state_lock; |
1539 | 1405 | ||
1540 | extern struct drbd_conf *drbd_new_device(unsigned int minor); | 1406 | extern int conn_lowest_minor(struct drbd_tconn *tconn); |
1541 | extern void drbd_free_mdev(struct drbd_conf *mdev); | 1407 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr); |
1408 | extern void drbd_minor_destroy(struct kref *kref); | ||
1409 | |||
1410 | extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts); | ||
1411 | extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts); | ||
1412 | extern void conn_destroy(struct kref *kref); | ||
1413 | struct drbd_tconn *conn_get_by_name(const char *name); | ||
1414 | extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
1415 | void *peer_addr, int peer_addr_len); | ||
1416 | extern void conn_free_crypto(struct drbd_tconn *tconn); | ||
1542 | 1417 | ||
1543 | extern int proc_details; | 1418 | extern int proc_details; |
1544 | 1419 | ||
1545 | /* drbd_req */ | 1420 | /* drbd_req */ |
1421 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); | ||
1546 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); | 1422 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); |
1547 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | 1423 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); |
1548 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); | 1424 | extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); |
@@ -1550,10 +1426,11 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); | |||
1550 | 1426 | ||
1551 | 1427 | ||
1552 | /* drbd_nl.c */ | 1428 | /* drbd_nl.c */ |
1429 | extern int drbd_msg_put_info(const char *info); | ||
1553 | extern void drbd_suspend_io(struct drbd_conf *mdev); | 1430 | extern void drbd_suspend_io(struct drbd_conf *mdev); |
1554 | extern void drbd_resume_io(struct drbd_conf *mdev); | 1431 | extern void drbd_resume_io(struct drbd_conf *mdev); |
1555 | extern char *ppsize(char *buf, unsigned long long size); | 1432 | extern char *ppsize(char *buf, unsigned long long size); |
1556 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); | 1433 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); |
1557 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1434 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; |
1558 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); | 1435 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); |
1559 | extern void resync_after_online_grow(struct drbd_conf *); | 1436 | extern void resync_after_online_grow(struct drbd_conf *); |
@@ -1561,13 +1438,14 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); | |||
1561 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, | 1438 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, |
1562 | enum drbd_role new_role, | 1439 | enum drbd_role new_role, |
1563 | int force); | 1440 | int force); |
1564 | extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); | 1441 | extern bool conn_try_outdate_peer(struct drbd_tconn *tconn); |
1565 | extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); | 1442 | extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn); |
1566 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); | 1443 | extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); |
1567 | 1444 | ||
1568 | /* drbd_worker.c */ | 1445 | /* drbd_worker.c */ |
1569 | extern int drbd_worker(struct drbd_thread *thi); | 1446 | extern int drbd_worker(struct drbd_thread *thi); |
1570 | extern int drbd_alter_sa(struct drbd_conf *mdev, int na); | 1447 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor); |
1448 | void drbd_resync_after_changed(struct drbd_conf *mdev); | ||
1571 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); | 1449 | extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); |
1572 | extern void resume_next_sg(struct drbd_conf *mdev); | 1450 | extern void resume_next_sg(struct drbd_conf *mdev); |
1573 | extern void suspend_other_sg(struct drbd_conf *mdev); | 1451 | extern void suspend_other_sg(struct drbd_conf *mdev); |
@@ -1576,13 +1454,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev); | |||
1576 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); | 1454 | extern void *drbd_md_get_buffer(struct drbd_conf *mdev); |
1577 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); | 1455 | extern void drbd_md_put_buffer(struct drbd_conf *mdev); |
1578 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, | 1456 | extern int drbd_md_sync_page_io(struct drbd_conf *mdev, |
1579 | struct drbd_backing_dev *bdev, sector_t sector, int rw); | 1457 | struct drbd_backing_dev *bdev, sector_t sector, int rw); |
1580 | extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | 1458 | extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int); |
1581 | unsigned int *done); | 1459 | extern void wait_until_done_or_force_detached(struct drbd_conf *mdev, |
1582 | extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); | 1460 | struct drbd_backing_dev *bdev, unsigned int *done); |
1583 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); | 1461 | extern void drbd_rs_controller_reset(struct drbd_conf *mdev); |
1584 | 1462 | ||
1585 | static inline void ov_oos_print(struct drbd_conf *mdev) | 1463 | static inline void ov_out_of_sync_print(struct drbd_conf *mdev) |
1586 | { | 1464 | { |
1587 | if (mdev->ov_last_oos_size) { | 1465 | if (mdev->ov_last_oos_size) { |
1588 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", | 1466 | dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", |
@@ -1594,97 +1472,102 @@ static inline void ov_oos_print(struct drbd_conf *mdev) | |||
1594 | 1472 | ||
1595 | 1473 | ||
1596 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); | 1474 | extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); |
1597 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); | 1475 | extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, |
1476 | struct drbd_peer_request *, void *); | ||
1598 | /* worker callbacks */ | 1477 | /* worker callbacks */ |
1599 | extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); | 1478 | extern int w_e_end_data_req(struct drbd_work *, int); |
1600 | extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); | 1479 | extern int w_e_end_rsdata_req(struct drbd_work *, int); |
1601 | extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); | 1480 | extern int w_e_end_csum_rs_req(struct drbd_work *, int); |
1602 | extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); | 1481 | extern int w_e_end_ov_reply(struct drbd_work *, int); |
1603 | extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); | 1482 | extern int w_e_end_ov_req(struct drbd_work *, int); |
1604 | extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); | 1483 | extern int w_ov_finished(struct drbd_work *, int); |
1605 | extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); | 1484 | extern int w_resync_timer(struct drbd_work *, int); |
1606 | extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); | 1485 | extern int w_send_write_hint(struct drbd_work *, int); |
1607 | extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int); | 1486 | extern int w_make_resync_request(struct drbd_work *, int); |
1608 | extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); | 1487 | extern int w_send_dblock(struct drbd_work *, int); |
1609 | extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); | 1488 | extern int w_send_read_req(struct drbd_work *, int); |
1610 | extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); | 1489 | extern int w_prev_work_done(struct drbd_work *, int); |
1611 | extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); | 1490 | extern int w_e_reissue(struct drbd_work *, int); |
1612 | extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); | 1491 | extern int w_restart_disk_io(struct drbd_work *, int); |
1613 | extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); | 1492 | extern int w_send_out_of_sync(struct drbd_work *, int); |
1614 | extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); | 1493 | extern int w_start_resync(struct drbd_work *, int); |
1615 | extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); | ||
1616 | extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int); | ||
1617 | extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int); | ||
1618 | 1494 | ||
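The worker callback prototypes above drop the explicit struct drbd_conf * argument: a callback now receives only its struct drbd_work * plus the cancel flag and has to recover its own context from the work item. A minimal sketch of that idiom using the usual container_of pattern; the wrapper struct and callback below are purely illustrative, not drbd's actual types:

```c
#include <linux/kernel.h>	/* container_of */
#include <linux/list.h>

/* Illustrative only: a work item embedded in a larger context object. */
struct demo_work {
	struct list_head list;
	int (*cb)(struct demo_work *w, int cancel);
};

struct demo_request {
	struct demo_work w;
	int sector_count;	/* some per-request state */
};

static int demo_done(struct demo_work *w, int cancel)
{
	/* Recover the embedding request from the bare work pointer. */
	struct demo_request *req = container_of(w, struct demo_request, w);

	if (cancel)
		return 0;
	return req->sector_count;
}
```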
1619 | extern void resync_timer_fn(unsigned long data); | 1495 | extern void resync_timer_fn(unsigned long data); |
1620 | extern void start_resync_timer_fn(unsigned long data); | 1496 | extern void start_resync_timer_fn(unsigned long data); |
1621 | 1497 | ||
1622 | /* drbd_receiver.c */ | 1498 | /* drbd_receiver.c */ |
1623 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); | 1499 | extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); |
1624 | extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1500 | extern int drbd_submit_peer_request(struct drbd_conf *, |
1625 | const unsigned rw, const int fault_type); | 1501 | struct drbd_peer_request *, const unsigned, |
1626 | extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); | 1502 | const int); |
1627 | extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 1503 | extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *); |
1628 | u64 id, | 1504 | extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64, |
1629 | sector_t sector, | 1505 | sector_t, unsigned int, |
1630 | unsigned int data_size, | 1506 | gfp_t) __must_hold(local); |
1631 | gfp_t gfp_mask) __must_hold(local); | 1507 | extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *, |
1632 | extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1508 | int); |
1633 | int is_net); | 1509 | #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) |
1634 | #define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) | 1510 | #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) |
1635 | #define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) | 1511 | extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool); |
1636 | extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1637 | struct list_head *head); | ||
1638 | extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, | ||
1639 | struct list_head *head); | ||
1640 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); | 1512 | extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); |
1641 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); | 1513 | extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); |
1642 | extern void drbd_flush_workqueue(struct drbd_conf *mdev); | 1514 | extern void conn_flush_workqueue(struct drbd_tconn *tconn); |
1643 | extern void drbd_free_tl_hash(struct drbd_conf *mdev); | 1515 | extern int drbd_connected(struct drbd_conf *mdev); |
1516 | static inline void drbd_flush_workqueue(struct drbd_conf *mdev) | ||
1517 | { | ||
1518 | conn_flush_workqueue(mdev->tconn); | ||
1519 | } | ||
1644 | 1520 | ||
1645 | /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to | 1521 | /* Yes, there is kernel_setsockopt, but only since 2.6.18. |
1646 | * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ | 1522 | * So we have our own copy of it here. */ |
1647 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, | 1523 | static inline int drbd_setsockopt(struct socket *sock, int level, int optname, |
1648 | char __user *optval, int optlen) | 1524 | char *optval, int optlen) |
1649 | { | 1525 | { |
1526 | mm_segment_t oldfs = get_fs(); | ||
1527 | char __user *uoptval; | ||
1650 | int err; | 1528 | int err; |
1529 | |||
1530 | uoptval = (char __user __force *)optval; | ||
1531 | |||
1532 | set_fs(KERNEL_DS); | ||
1651 | if (level == SOL_SOCKET) | 1533 | if (level == SOL_SOCKET) |
1652 | err = sock_setsockopt(sock, level, optname, optval, optlen); | 1534 | err = sock_setsockopt(sock, level, optname, uoptval, optlen); |
1653 | else | 1535 | else |
1654 | err = sock->ops->setsockopt(sock, level, optname, optval, | 1536 | err = sock->ops->setsockopt(sock, level, optname, uoptval, |
1655 | optlen); | 1537 | optlen); |
1538 | set_fs(oldfs); | ||
1656 | return err; | 1539 | return err; |
1657 | } | 1540 | } |
1658 | 1541 | ||
1659 | static inline void drbd_tcp_cork(struct socket *sock) | 1542 | static inline void drbd_tcp_cork(struct socket *sock) |
1660 | { | 1543 | { |
1661 | int __user val = 1; | 1544 | int val = 1; |
1662 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1545 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1663 | (char __user *)&val, sizeof(val)); | 1546 | (char*)&val, sizeof(val)); |
1664 | } | 1547 | } |
1665 | 1548 | ||
1666 | static inline void drbd_tcp_uncork(struct socket *sock) | 1549 | static inline void drbd_tcp_uncork(struct socket *sock) |
1667 | { | 1550 | { |
1668 | int __user val = 0; | 1551 | int val = 0; |
1669 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, | 1552 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, |
1670 | (char __user *)&val, sizeof(val)); | 1553 | (char*)&val, sizeof(val)); |
1671 | } | 1554 | } |
1672 | 1555 | ||
1673 | static inline void drbd_tcp_nodelay(struct socket *sock) | 1556 | static inline void drbd_tcp_nodelay(struct socket *sock) |
1674 | { | 1557 | { |
1675 | int __user val = 1; | 1558 | int val = 1; |
1676 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, | 1559 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, |
1677 | (char __user *)&val, sizeof(val)); | 1560 | (char*)&val, sizeof(val)); |
1678 | } | 1561 | } |
1679 | 1562 | ||
1680 | static inline void drbd_tcp_quickack(struct socket *sock) | 1563 | static inline void drbd_tcp_quickack(struct socket *sock) |
1681 | { | 1564 | { |
1682 | int __user val = 2; | 1565 | int val = 2; |
1683 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, | 1566 | (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, |
1684 | (char __user *)&val, sizeof(val)); | 1567 | (char*)&val, sizeof(val)); |
1685 | } | 1568 | } |
1686 | 1569 | ||
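The cork/uncork helpers above simply toggle TCP_CORK through the private drbd_setsockopt(). A hedged sketch of how they are meant to be used, batching two sends into as few TCP segments as possible; send_two_buffers() is illustrative and only assumes the standard in-kernel kernel_sendmsg() helper:

```c
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

static int send_two_buffers(struct socket *sock,
			    void *hdr, size_t hdr_len,
			    void *payload, size_t payload_len)
{
	struct kvec iov[2] = {
		{ .iov_base = hdr,     .iov_len = hdr_len },
		{ .iov_base = payload, .iov_len = payload_len },
	};
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	int err;

	drbd_tcp_cork(sock);	/* hold back partially filled segments */
	err = kernel_sendmsg(sock, &msg, iov, 2, hdr_len + payload_len);
	drbd_tcp_uncork(sock);	/* flush whatever is still queued */
	return err < 0 ? err : 0;
}
```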
1687 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | 1570 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo); |
1688 | 1571 | ||
1689 | /* drbd_proc.c */ | 1572 | /* drbd_proc.c */ |
1690 | extern struct proc_dir_entry *drbd_proc; | 1573 | extern struct proc_dir_entry *drbd_proc; |
@@ -1693,8 +1576,8 @@ extern const char *drbd_conn_str(enum drbd_conns s); | |||
1693 | extern const char *drbd_role_str(enum drbd_role s); | 1576 | extern const char *drbd_role_str(enum drbd_role s); |
1694 | 1577 | ||
1695 | /* drbd_actlog.c */ | 1578 | /* drbd_actlog.c */ |
1696 | extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); | 1579 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1697 | extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); | 1580 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1698 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | 1581 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); |
1699 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1582 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
1700 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1583 | extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
@@ -1702,7 +1585,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev); | |||
1702 | extern int drbd_rs_del_all(struct drbd_conf *mdev); | 1585 | extern int drbd_rs_del_all(struct drbd_conf *mdev); |
1703 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, | 1586 | extern void drbd_rs_failed_io(struct drbd_conf *mdev, |
1704 | sector_t sector, int size); | 1587 | sector_t sector, int size); |
1705 | extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); | ||
1706 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); | 1588 | extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); |
1707 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, | 1589 | extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, |
1708 | int size, const char *file, const unsigned int line); | 1590 | int size, const char *file, const unsigned int line); |
@@ -1712,73 +1594,24 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | |||
1712 | int size, const char *file, const unsigned int line); | 1594 | int size, const char *file, const unsigned int line); |
1713 | #define drbd_set_out_of_sync(mdev, sector, size) \ | 1595 | #define drbd_set_out_of_sync(mdev, sector, size) \ |
1714 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | 1596 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) |
1715 | extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); | ||
1716 | extern void drbd_al_shrink(struct drbd_conf *mdev); | 1597 | extern void drbd_al_shrink(struct drbd_conf *mdev); |
1717 | 1598 | ||
1718 | |||
1719 | /* drbd_nl.c */ | 1599 | /* drbd_nl.c */ |
1720 | 1600 | /* state info broadcast */ | |
1721 | void drbd_nl_cleanup(void); | 1601 | struct sib_info { |
1722 | int __init drbd_nl_init(void); | 1602 | enum drbd_state_info_bcast_reason sib_reason; |
1723 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); | 1603 | union { |
1724 | void drbd_bcast_sync_progress(struct drbd_conf *mdev); | 1604 | struct { |
1725 | void drbd_bcast_ee(struct drbd_conf *mdev, | 1605 | char *helper_name; |
1726 | const char *reason, const int dgs, | 1606 | unsigned helper_exit_code; |
1727 | const char* seen_hash, const char* calc_hash, | 1607 | }; |
1728 | const struct drbd_epoch_entry* e); | 1608 | struct { |
1729 | 1609 | union drbd_state os; | |
1730 | 1610 | union drbd_state ns; | |
1731 | /** | 1611 | }; |
1732 | * DOC: DRBD State macros | 1612 | }; |
1733 | * | 1613 | }; |
1734 | * These macros are used to express state changes in easily readable form. | 1614 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib); |
1735 | * | ||
1736 | * The NS macros expand to a mask and a value, that can be bit ored onto the | ||
1737 | * current state as soon as the spinlock (req_lock) was taken. | ||
1738 | * | ||
1739 | * The _NS macros are used for state functions that get called with the | ||
1740 | * spinlock. These macros expand directly to the new state value. | ||
1741 | * | ||
1742 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
1743 | * to express state changes that affect more than one aspect of the state. | ||
1744 | * | ||
1745 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
1746 | * Means that the network connection was established and that the peer | ||
1747 | * is in secondary role. | ||
1748 | */ | ||
1749 | #define role_MASK R_MASK | ||
1750 | #define peer_MASK R_MASK | ||
1751 | #define disk_MASK D_MASK | ||
1752 | #define pdsk_MASK D_MASK | ||
1753 | #define conn_MASK C_MASK | ||
1754 | #define susp_MASK 1 | ||
1755 | #define user_isp_MASK 1 | ||
1756 | #define aftr_isp_MASK 1 | ||
1757 | #define susp_nod_MASK 1 | ||
1758 | #define susp_fen_MASK 1 | ||
1759 | |||
1760 | #define NS(T, S) \ | ||
1761 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
1762 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
1763 | #define NS2(T1, S1, T2, S2) \ | ||
1764 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1765 | mask.T2 = T2##_MASK; mask; }), \ | ||
1766 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1767 | val.T2 = (S2); val; }) | ||
1768 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
1769 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
1770 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
1771 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
1772 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
1773 | |||
1774 | #define _NS(D, T, S) \ | ||
1775 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) | ||
1776 | #define _NS2(D, T1, S1, T2, S2) \ | ||
1777 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1778 | __ns.T2 = (S2); __ns; }) | ||
1779 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
1780 | D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ | ||
1781 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
1782 | 1615 | ||
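The single drbd_bcast_event() entry point replaces the per-event broadcast helpers removed on the left. A sketch of how a state transition would be reported through it; the SIB_STATE_CHANGE reason value is an assumption, since the enum itself is not part of this hunk:

```c
/* Hedged sketch: report an old-state/new-state pair via the new
 * sib_info interface. SIB_STATE_CHANGE is an assumed enum value. */
static void demo_report_state_change(struct drbd_conf *mdev,
				     union drbd_state os,
				     union drbd_state ns)
{
	struct sib_info sib = {
		.sib_reason = SIB_STATE_CHANGE,
		.os = os,
		.ns = ns,
	};

	drbd_bcast_event(mdev, &sib);
}
```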
1783 | /* | 1616 | /* |
1784 | * inline helper functions | 1617 | * inline helper functions |
@@ -1795,9 +1628,10 @@ static inline struct page *page_chain_next(struct page *page) | |||
1795 | #define page_chain_for_each_safe(page, n) \ | 1628 | #define page_chain_for_each_safe(page, n) \ |
1796 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) | 1629 | for (; page && ({ n = page_chain_next(page); 1; }); page = n) |
1797 | 1630 | ||
1798 | static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | 1631 | |
1632 | static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) | ||
1799 | { | 1633 | { |
1800 | struct page *page = e->pages; | 1634 | struct page *page = peer_req->pages; |
1801 | page_chain_for_each(page) { | 1635 | page_chain_for_each(page) { |
1802 | if (page_count(page) > 1) | 1636 | if (page_count(page) > 1) |
1803 | return 1; | 1637 | return 1; |
@@ -1805,18 +1639,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) | |||
1805 | return 0; | 1639 | return 0; |
1806 | } | 1640 | } |
1807 | 1641 | ||
1808 | static inline void drbd_state_lock(struct drbd_conf *mdev) | ||
1809 | { | ||
1810 | wait_event(mdev->misc_wait, | ||
1811 | !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | ||
1812 | } | ||
1813 | |||
1814 | static inline void drbd_state_unlock(struct drbd_conf *mdev) | ||
1815 | { | ||
1816 | clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); | ||
1817 | wake_up(&mdev->misc_wait); | ||
1818 | } | ||
1819 | |||
1820 | static inline enum drbd_state_rv | 1642 | static inline enum drbd_state_rv |
1821 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | 1643 | _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, |
1822 | enum chg_state_flags flags, struct completion *done) | 1644 | enum chg_state_flags flags, struct completion *done) |
@@ -1830,48 +1652,71 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | |||
1830 | return rv; | 1652 | return rv; |
1831 | } | 1653 | } |
1832 | 1654 | ||
1833 | /** | 1655 | static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) |
1834 | * drbd_request_state() - Request a state change | ||
1835 | * @mdev: DRBD device. | ||
1836 | * @mask: mask of state bits to change. | ||
1837 | * @val: value of new state bits. | ||
1838 | * | ||
1839 | * This is the most graceful way of requesting a state change. It is | ||
1840 | * quite verbose in case the state change is not possible, and all those | ||
1841 | * state changes are globally serialized. | ||
1842 | */ | ||
1843 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
1844 | union drbd_state mask, | ||
1845 | union drbd_state val) | ||
1846 | { | 1656 | { |
1847 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | 1657 | union drbd_state rv; |
1658 | |||
1659 | rv.i = mdev->state.i; | ||
1660 | rv.susp = mdev->tconn->susp; | ||
1661 | rv.susp_nod = mdev->tconn->susp_nod; | ||
1662 | rv.susp_fen = mdev->tconn->susp_fen; | ||
1663 | |||
1664 | return rv; | ||
1848 | } | 1665 | } |
1849 | 1666 | ||
1850 | enum drbd_force_detach_flags { | 1667 | enum drbd_force_detach_flags { |
1851 | DRBD_IO_ERROR, | 1668 | DRBD_READ_ERROR, |
1669 | DRBD_WRITE_ERROR, | ||
1852 | DRBD_META_IO_ERROR, | 1670 | DRBD_META_IO_ERROR, |
1853 | DRBD_FORCE_DETACH, | 1671 | DRBD_FORCE_DETACH, |
1854 | }; | 1672 | }; |
1855 | 1673 | ||
1856 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) | 1674 | #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) |
1857 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, | 1675 | static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, |
1858 | enum drbd_force_detach_flags forcedetach, | 1676 | enum drbd_force_detach_flags df, |
1859 | const char *where) | 1677 | const char *where) |
1860 | { | 1678 | { |
1861 | switch (mdev->ldev->dc.on_io_error) { | 1679 | enum drbd_io_error_p ep; |
1862 | case EP_PASS_ON: | 1680 | |
1863 | if (forcedetach == DRBD_IO_ERROR) { | 1681 | rcu_read_lock(); |
1682 | ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1683 | rcu_read_unlock(); | ||
1684 | switch (ep) { | ||
1685 | case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ | ||
1686 | if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { | ||
1864 | if (__ratelimit(&drbd_ratelimit_state)) | 1687 | if (__ratelimit(&drbd_ratelimit_state)) |
1865 | dev_err(DEV, "Local IO failed in %s.\n", where); | 1688 | dev_err(DEV, "Local IO failed in %s.\n", where); |
1866 | if (mdev->state.disk > D_INCONSISTENT) | 1689 | if (mdev->state.disk > D_INCONSISTENT) |
1867 | _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); | 1690 | _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); |
1868 | break; | 1691 | break; |
1869 | } | 1692 | } |
1870 | /* NOTE fall through to detach case if forcedetach set */ | 1693 | /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ |
1871 | case EP_DETACH: | 1694 | case EP_DETACH: |
1872 | case EP_CALL_HELPER: | 1695 | case EP_CALL_HELPER: |
1696 | /* Remember whether we saw a READ or WRITE error. | ||
1697 | * | ||
1698 | * Recovery of the affected area for WRITE failure is covered | ||
1699 | * by the activity log. | ||
1700 | * READ errors may fall outside that area though. Certain READ | ||
1701 | * errors can be "healed" by writing good data to the affected | ||
1702 | * blocks, which triggers block re-allocation in lower layers. | ||
1703 | * | ||
1704 | * If we can not write the bitmap after a READ error, | ||
1705 | * we may need to trigger a full sync (see w_go_diskless()). | ||
1706 | * | ||
1707 | * Force-detach is not really an IO error, but rather a | ||
1708 | * desperate measure to try to deal with a completely | ||
1709 | * unresponsive lower level IO stack. | ||
1710 | * Still it should be treated as a WRITE error. | ||
1711 | * | ||
1712 | * Meta IO error is always WRITE error: | ||
1713 | * we read meta data only once during attach, | ||
1714 | * which will fail in case of errors. | ||
1715 | */ | ||
1873 | set_bit(WAS_IO_ERROR, &mdev->flags); | 1716 | set_bit(WAS_IO_ERROR, &mdev->flags); |
1874 | if (forcedetach == DRBD_FORCE_DETACH) | 1717 | if (df == DRBD_READ_ERROR) |
1718 | set_bit(WAS_READ_ERROR, &mdev->flags); | ||
1719 | if (df == DRBD_FORCE_DETACH) | ||
1875 | set_bit(FORCE_DETACH, &mdev->flags); | 1720 | set_bit(FORCE_DETACH, &mdev->flags); |
1876 | if (mdev->state.disk > D_FAILED) { | 1721 | if (mdev->state.disk > D_FAILED) { |
1877 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | 1722 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); |
@@ -1896,9 +1741,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1896 | { | 1741 | { |
1897 | if (error) { | 1742 | if (error) { |
1898 | unsigned long flags; | 1743 | unsigned long flags; |
1899 | spin_lock_irqsave(&mdev->req_lock, flags); | 1744 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
1900 | __drbd_chk_io_error_(mdev, forcedetach, where); | 1745 | __drbd_chk_io_error_(mdev, forcedetach, where); |
1901 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1746 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
1902 | } | 1747 | } |
1903 | } | 1748 | } |
1904 | 1749 | ||
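With the policy switch above, callers no longer decide between "pass on" and "detach" themselves; they only classify the failure. A minimal sketch of feeding a failed local read into that machinery, assuming the parameter order visible in drbd_chk_io_error_() (device, error, flag, caller):

```c
/* Hedged sketch: classify and report a failed local read completion. */
static void demo_read_endio(struct drbd_conf *mdev, int error)
{
	if (error)
		drbd_chk_io_error_(mdev, error, DRBD_READ_ERROR, __func__);
	/* EP_PASS_ON leaves the disk attached (degrading it to
	 * D_INCONSISTENT); EP_DETACH / EP_CALL_HELPER push it to
	 * D_FAILED via __drbd_chk_io_error_(). */
}
```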
@@ -1910,9 +1755,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1910 | * BTW, for internal meta data, this happens to be the maximum capacity | 1755 | * BTW, for internal meta data, this happens to be the maximum capacity |
1911 | * we could agree upon with our peer node. | 1756 | * we could agree upon with our peer node. |
1912 | */ | 1757 | */ |
1913 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | 1758 | static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) |
1914 | { | 1759 | { |
1915 | switch (bdev->dc.meta_dev_idx) { | 1760 | switch (meta_dev_idx) { |
1916 | case DRBD_MD_INDEX_INTERNAL: | 1761 | case DRBD_MD_INDEX_INTERNAL: |
1917 | case DRBD_MD_INDEX_FLEX_INT: | 1762 | case DRBD_MD_INDEX_FLEX_INT: |
1918 | return bdev->md.md_offset + bdev->md.bm_offset; | 1763 | return bdev->md.md_offset + bdev->md.bm_offset; |
@@ -1922,13 +1767,30 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | |||
1922 | } | 1767 | } |
1923 | } | 1768 | } |
1924 | 1769 | ||
1770 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1771 | { | ||
1772 | int meta_dev_idx; | ||
1773 | |||
1774 | rcu_read_lock(); | ||
1775 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1776 | rcu_read_unlock(); | ||
1777 | |||
1778 | return _drbd_md_first_sector(meta_dev_idx, bdev); | ||
1779 | } | ||
1780 | |||
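drbd_md_first_sector() and the helpers below it now read meta_dev_idx from an RCU-protected disk_conf instead of a plain bdev->dc field. For completeness, a sketch of what the matching update side has to look like; the helper is hypothetical and omits the configuration mutex the real code would hold while swapping the pointer:

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hedged sketch: publish a new disk_conf and free the old one only
 * after every rcu_read_lock() reader is guaranteed to be done. */
static void demo_swap_disk_conf(struct drbd_backing_dev *bdev,
				struct disk_conf *new_conf)
{
	struct disk_conf *old_conf;

	old_conf = rcu_dereference_protected(bdev->disk_conf, true);
	rcu_assign_pointer(bdev->disk_conf, new_conf);
	synchronize_rcu();
	kfree(old_conf);
}
```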
1925 | /** | 1781 | /** |
1926 | * drbd_md_last_sector() - Return the last sector number of the meta data area | 1782 | * drbd_md_last_sector() - Return the last sector number of the meta data area |
1927 | * @bdev: Meta data block device. | 1783 | * @bdev: Meta data block device. |
1928 | */ | 1784 | */ |
1929 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | 1785 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) |
1930 | { | 1786 | { |
1931 | switch (bdev->dc.meta_dev_idx) { | 1787 | int meta_dev_idx; |
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1791 | rcu_read_unlock(); | ||
1792 | |||
1793 | switch (meta_dev_idx) { | ||
1932 | case DRBD_MD_INDEX_INTERNAL: | 1794 | case DRBD_MD_INDEX_INTERNAL: |
1933 | case DRBD_MD_INDEX_FLEX_INT: | 1795 | case DRBD_MD_INDEX_FLEX_INT: |
1934 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | 1796 | return bdev->md.md_offset + MD_AL_OFFSET - 1; |
@@ -1956,12 +1818,18 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) | |||
1956 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | 1818 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) |
1957 | { | 1819 | { |
1958 | sector_t s; | 1820 | sector_t s; |
1959 | switch (bdev->dc.meta_dev_idx) { | 1821 | int meta_dev_idx; |
1822 | |||
1823 | rcu_read_lock(); | ||
1824 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1825 | rcu_read_unlock(); | ||
1826 | |||
1827 | switch (meta_dev_idx) { | ||
1960 | case DRBD_MD_INDEX_INTERNAL: | 1828 | case DRBD_MD_INDEX_INTERNAL: |
1961 | case DRBD_MD_INDEX_FLEX_INT: | 1829 | case DRBD_MD_INDEX_FLEX_INT: |
1962 | s = drbd_get_capacity(bdev->backing_bdev) | 1830 | s = drbd_get_capacity(bdev->backing_bdev) |
1963 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | 1831 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, |
1964 | drbd_md_first_sector(bdev)) | 1832 | _drbd_md_first_sector(meta_dev_idx, bdev)) |
1965 | : 0; | 1833 | : 0; |
1966 | break; | 1834 | break; |
1967 | case DRBD_MD_INDEX_FLEX_EXT: | 1835 | case DRBD_MD_INDEX_FLEX_EXT: |
@@ -1987,9 +1855,15 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | |||
1987 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | 1855 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, |
1988 | struct drbd_backing_dev *bdev) | 1856 | struct drbd_backing_dev *bdev) |
1989 | { | 1857 | { |
1990 | switch (bdev->dc.meta_dev_idx) { | 1858 | int meta_dev_idx; |
1859 | |||
1860 | rcu_read_lock(); | ||
1861 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1862 | rcu_read_unlock(); | ||
1863 | |||
1864 | switch (meta_dev_idx) { | ||
1991 | default: /* external, some index */ | 1865 | default: /* external, some index */ |
1992 | return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; | 1866 | return MD_RESERVED_SECT * meta_dev_idx; |
1993 | case DRBD_MD_INDEX_INTERNAL: | 1867 | case DRBD_MD_INDEX_INTERNAL: |
1994 | /* with drbd08, internal meta data is always "flexible" */ | 1868 | /* with drbd08, internal meta data is always "flexible" */ |
1995 | case DRBD_MD_INDEX_FLEX_INT: | 1869 | case DRBD_MD_INDEX_FLEX_INT: |
@@ -2015,9 +1889,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) | |||
2015 | unsigned long flags; | 1889 | unsigned long flags; |
2016 | spin_lock_irqsave(&q->q_lock, flags); | 1890 | spin_lock_irqsave(&q->q_lock, flags); |
2017 | list_add(&w->list, &q->q); | 1891 | list_add(&w->list, &q->q); |
2018 | up(&q->s); /* within the spinlock, | ||
2019 | see comment near end of drbd_worker() */ | ||
2020 | spin_unlock_irqrestore(&q->q_lock, flags); | 1892 | spin_unlock_irqrestore(&q->q_lock, flags); |
1893 | wake_up(&q->q_wait); | ||
2021 | } | 1894 | } |
2022 | 1895 | ||
2023 | static inline void | 1896 | static inline void |
@@ -2026,41 +1899,35 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) | |||
2026 | unsigned long flags; | 1899 | unsigned long flags; |
2027 | spin_lock_irqsave(&q->q_lock, flags); | 1900 | spin_lock_irqsave(&q->q_lock, flags); |
2028 | list_add_tail(&w->list, &q->q); | 1901 | list_add_tail(&w->list, &q->q); |
2029 | up(&q->s); /* within the spinlock, | ||
2030 | see comment near end of drbd_worker() */ | ||
2031 | spin_unlock_irqrestore(&q->q_lock, flags); | 1902 | spin_unlock_irqrestore(&q->q_lock, flags); |
1903 | wake_up(&q->q_wait); | ||
2032 | } | 1904 | } |
2033 | 1905 | ||
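The semaphore up() in the enqueue paths above is gone; producers now just wake q->q_wait after adding to the list. A sketch of the matching consumer side, assuming the generic wait_event_lock_irq() helper (called with q_lock held, it drops the lock while sleeping and re-takes it before re-checking the condition); dequeue_work() itself is illustrative, not the driver's worker loop:

```c
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static struct drbd_work *dequeue_work(struct drbd_work_queue *q)
{
	struct drbd_work *w;

	spin_lock_irq(&q->q_lock);
	wait_event_lock_irq(q->q_wait, !list_empty(&q->q), q->q_lock);
	w = list_first_entry(&q->q, struct drbd_work, list);
	list_del_init(&w->list);
	spin_unlock_irq(&q->q_lock);

	return w;
}
```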
2034 | static inline void wake_asender(struct drbd_conf *mdev) | 1906 | static inline void wake_asender(struct drbd_tconn *tconn) |
2035 | { | ||
2036 | if (test_bit(SIGNAL_ASENDER, &mdev->flags)) | ||
2037 | force_sig(DRBD_SIG, mdev->asender.task); | ||
2038 | } | ||
2039 | |||
2040 | static inline void request_ping(struct drbd_conf *mdev) | ||
2041 | { | 1907 | { |
2042 | set_bit(SEND_PING, &mdev->flags); | 1908 | if (test_bit(SIGNAL_ASENDER, &tconn->flags)) |
2043 | wake_asender(mdev); | 1909 | force_sig(DRBD_SIG, tconn->asender.task); |
2044 | } | 1910 | } |
2045 | 1911 | ||
2046 | static inline int drbd_send_short_cmd(struct drbd_conf *mdev, | 1912 | static inline void request_ping(struct drbd_tconn *tconn) |
2047 | enum drbd_packets cmd) | ||
2048 | { | 1913 | { |
2049 | struct p_header80 h; | 1914 | set_bit(SEND_PING, &tconn->flags); |
2050 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); | 1915 | wake_asender(tconn); |
2051 | } | 1916 | } |
2052 | 1917 | ||
2053 | static inline int drbd_send_ping(struct drbd_conf *mdev) | 1918 | extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *); |
2054 | { | 1919 | extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *); |
2055 | struct p_header80 h; | 1920 | extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *, |
2056 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); | 1921 | enum drbd_packet, unsigned int, void *, |
2057 | } | 1922 | unsigned int); |
1923 | extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *, | ||
1924 | enum drbd_packet, unsigned int, void *, | ||
1925 | unsigned int); | ||
2058 | 1926 | ||
2059 | static inline int drbd_send_ping_ack(struct drbd_conf *mdev) | 1927 | extern int drbd_send_ping(struct drbd_tconn *tconn); |
2060 | { | 1928 | extern int drbd_send_ping_ack(struct drbd_tconn *tconn); |
2061 | struct p_header80 h; | 1929 | extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); |
2062 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); | 1930 | extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state); |
2063 | } | ||
2064 | 1931 | ||
2065 | static inline void drbd_thread_stop(struct drbd_thread *thi) | 1932 | static inline void drbd_thread_stop(struct drbd_thread *thi) |
2066 | { | 1933 | { |
@@ -2082,21 +1949,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) | |||
2082 | * or implicit barrier packets as necessary. | 1949 | * or implicit barrier packets as necessary. |
2083 | * increased: | 1950 | * increased: |
2084 | * w_send_barrier | 1951 | * w_send_barrier |
2085 | * _req_mod(req, queue_for_net_write or queue_for_net_read); | 1952 | * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); |
2086 | * it is much easier and equally valid to count what we queue for the | 1953 | * it is much easier and equally valid to count what we queue for the |
2087 | * worker, even before it actually was queued or send. | 1954 | * worker, even before it actually was queued or send. |
2088 | * (drbd_make_request_common; recovery path on read io-error) | 1955 | * (drbd_make_request_common; recovery path on read io-error) |
2089 | * decreased: | 1956 | * decreased: |
2090 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) | 1957 | * got_BarrierAck (respective tl_clear, tl_clear_barrier) |
2091 | * _req_mod(req, data_received) | 1958 | * _req_mod(req, DATA_RECEIVED) |
2092 | * [from receive_DataReply] | 1959 | * [from receive_DataReply] |
2093 | * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) | 1960 | * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) |
2094 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] | 1961 | * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] |
2095 | * for some reason it is NOT decreased in got_NegAck, | 1962 | * for some reason it is NOT decreased in got_NegAck, |
2096 | * but in the resulting cleanup code from report_params. | 1963 | * but in the resulting cleanup code from report_params. |
2097 | * we should try to remember the reason for that... | 1964 | * we should try to remember the reason for that... |
2098 | * _req_mod(req, send_failed or send_canceled) | 1965 | * _req_mod(req, SEND_FAILED or SEND_CANCELED) |
2099 | * _req_mod(req, connection_lost_while_pending) | 1966 | * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) |
2100 | * [from tl_clear_barrier] | 1967 | * [from tl_clear_barrier] |
2101 | */ | 1968 | */ |
2102 | static inline void inc_ap_pending(struct drbd_conf *mdev) | 1969 | static inline void inc_ap_pending(struct drbd_conf *mdev) |
@@ -2104,17 +1971,19 @@ static inline void inc_ap_pending(struct drbd_conf *mdev) | |||
2104 | atomic_inc(&mdev->ap_pending_cnt); | 1971 | atomic_inc(&mdev->ap_pending_cnt); |
2105 | } | 1972 | } |
2106 | 1973 | ||
2107 | #define ERR_IF_CNT_IS_NEGATIVE(which) \ | 1974 | #define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ |
2108 | if (atomic_read(&mdev->which) < 0) \ | 1975 | if (atomic_read(&mdev->which) < 0) \ |
2109 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ | 1976 | dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ |
2110 | __func__ , __LINE__ , \ | 1977 | func, line, \ |
2111 | atomic_read(&mdev->which)) | 1978 | atomic_read(&mdev->which)) |
2112 | 1979 | ||
2113 | #define dec_ap_pending(mdev) do { \ | 1980 | #define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__) |
2114 | typecheck(struct drbd_conf *, mdev); \ | 1981 | static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line) |
2115 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ | 1982 | { |
2116 | wake_up(&mdev->misc_wait); \ | 1983 | if (atomic_dec_and_test(&mdev->ap_pending_cnt)) |
2117 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) | 1984 | wake_up(&mdev->misc_wait); |
1985 | ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); | ||
1986 | } | ||
2118 | 1987 | ||
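dec_ap_pending() and the counters below it keep a thin macro front end whose only job is to capture the caller's function name and line before handing off to a type-checked inline. The same pattern in isolation, with a hypothetical refcount object (not a drbd structure):

```c
#include <linux/atomic.h>
#include <linux/printk.h>
#include <linux/wait.h>

/* Illustrative only. */
struct demo_counter {
	atomic_t cnt;
	wait_queue_head_t wait;
};

#define demo_put(c) _demo_put(c, __func__, __LINE__)
static inline void _demo_put(struct demo_counter *c, const char *func, int line)
{
	if (atomic_dec_and_test(&c->cnt))
		wake_up(&c->wait);
	if (atomic_read(&c->cnt) < 0)
		pr_err("%s:%d: demo_counter went negative (%d)\n",
		       func, line, atomic_read(&c->cnt));
}
```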
2119 | /* counts how many resync-related answers we still expect from the peer | 1988 | /* counts how many resync-related answers we still expect from the peer |
2120 | * increase decrease | 1989 | * increase decrease |
@@ -2127,10 +1996,12 @@ static inline void inc_rs_pending(struct drbd_conf *mdev) | |||
2127 | atomic_inc(&mdev->rs_pending_cnt); | 1996 | atomic_inc(&mdev->rs_pending_cnt); |
2128 | } | 1997 | } |
2129 | 1998 | ||
2130 | #define dec_rs_pending(mdev) do { \ | 1999 | #define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__) |
2131 | typecheck(struct drbd_conf *, mdev); \ | 2000 | static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line) |
2132 | atomic_dec(&mdev->rs_pending_cnt); \ | 2001 | { |
2133 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) | 2002 | atomic_dec(&mdev->rs_pending_cnt); |
2003 | ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); | ||
2004 | } | ||
2134 | 2005 | ||
2135 | /* counts how many answers we still need to send to the peer. | 2006 | /* counts how many answers we still need to send to the peer. |
2136 | * increased on | 2007 | * increased on |
@@ -2146,38 +2017,18 @@ static inline void inc_unacked(struct drbd_conf *mdev) | |||
2146 | atomic_inc(&mdev->unacked_cnt); | 2017 | atomic_inc(&mdev->unacked_cnt); |
2147 | } | 2018 | } |
2148 | 2019 | ||
2149 | #define dec_unacked(mdev) do { \ | 2020 | #define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__) |
2150 | typecheck(struct drbd_conf *, mdev); \ | 2021 | static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line) |
2151 | atomic_dec(&mdev->unacked_cnt); \ | ||
2152 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2153 | |||
2154 | #define sub_unacked(mdev, n) do { \ | ||
2155 | typecheck(struct drbd_conf *, mdev); \ | ||
2156 | atomic_sub(n, &mdev->unacked_cnt); \ | ||
2157 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) | ||
2158 | |||
2159 | |||
2160 | static inline void put_net_conf(struct drbd_conf *mdev) | ||
2161 | { | 2022 | { |
2162 | if (atomic_dec_and_test(&mdev->net_cnt)) | 2023 | atomic_dec(&mdev->unacked_cnt); |
2163 | wake_up(&mdev->net_cnt_wait); | 2024 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); |
2164 | } | 2025 | } |
2165 | 2026 | ||
2166 | /** | 2027 | #define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__) |
2167 | * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there | 2028 | static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line) |
2168 | * @mdev: DRBD device. | ||
2169 | * | ||
2170 | * You have to call put_net_conf() when finished working with mdev->net_conf. | ||
2171 | */ | ||
2172 | static inline int get_net_conf(struct drbd_conf *mdev) | ||
2173 | { | 2029 | { |
2174 | int have_net_conf; | 2030 | atomic_sub(n, &mdev->unacked_cnt); |
2175 | 2031 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); | |
2176 | atomic_inc(&mdev->net_cnt); | ||
2177 | have_net_conf = mdev->state.conn >= C_UNCONNECTED; | ||
2178 | if (!have_net_conf) | ||
2179 | put_net_conf(mdev); | ||
2180 | return have_net_conf; | ||
2181 | } | 2032 | } |
2182 | 2033 | ||
2183 | /** | 2034 | /** |
@@ -2281,17 +2132,20 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, | |||
2281 | * maybe re-implement using semaphores? */ | 2132 | * maybe re-implement using semaphores? */ |
2282 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) | 2133 | static inline int drbd_get_max_buffers(struct drbd_conf *mdev) |
2283 | { | 2134 | { |
2284 | int mxb = 1000000; /* arbitrary limit on open requests */ | 2135 | struct net_conf *nc; |
2285 | if (get_net_conf(mdev)) { | 2136 | int mxb; |
2286 | mxb = mdev->net_conf->max_buffers; | 2137 | |
2287 | put_net_conf(mdev); | 2138 | rcu_read_lock(); |
2288 | } | 2139 | nc = rcu_dereference(mdev->tconn->net_conf); |
2140 | mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ | ||
2141 | rcu_read_unlock(); | ||
2142 | |||
2289 | return mxb; | 2143 | return mxb; |
2290 | } | 2144 | } |
2291 | 2145 | ||
2292 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) | 2146 | static inline int drbd_state_is_stable(struct drbd_conf *mdev) |
2293 | { | 2147 | { |
2294 | union drbd_state s = mdev->state; | 2148 | union drbd_dev_state s = mdev->state; |
2295 | 2149 | ||
2296 | /* DO NOT add a default clause, we want the compiler to warn us | 2150 | /* DO NOT add a default clause, we want the compiler to warn us |
2297 | * for any newly introduced state we may have forgotten to add here */ | 2151 | * for any newly introduced state we may have forgotten to add here */ |
@@ -2325,7 +2179,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2325 | 2179 | ||
2326 | /* Allow IO in BM exchange states with new protocols */ | 2180 | /* Allow IO in BM exchange states with new protocols */ |
2327 | case C_WF_BITMAP_S: | 2181 | case C_WF_BITMAP_S: |
2328 | if (mdev->agreed_pro_version < 96) | 2182 | if (mdev->tconn->agreed_pro_version < 96) |
2329 | return 0; | 2183 | return 0; |
2330 | break; | 2184 | break; |
2331 | 2185 | ||
@@ -2347,7 +2201,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2347 | /* disk state is stable as well. */ | 2201 | /* disk state is stable as well. */ |
2348 | break; | 2202 | break; |
2349 | 2203 | ||
2350 | /* no new io accepted during tansitional states */ | 2204 | /* no new io accepted during transitional states */ |
2351 | case D_ATTACHING: | 2205 | case D_ATTACHING: |
2352 | case D_NEGOTIATING: | 2206 | case D_NEGOTIATING: |
2353 | case D_UNKNOWN: | 2207 | case D_UNKNOWN: |
@@ -2359,16 +2213,18 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) | |||
2359 | return 1; | 2213 | return 1; |
2360 | } | 2214 | } |
2361 | 2215 | ||
2362 | static inline int is_susp(union drbd_state s) | 2216 | static inline int drbd_suspended(struct drbd_conf *mdev) |
2363 | { | 2217 | { |
2364 | return s.susp || s.susp_nod || s.susp_fen; | 2218 | struct drbd_tconn *tconn = mdev->tconn; |
2219 | |||
2220 | return tconn->susp || tconn->susp_fen || tconn->susp_nod; | ||
2365 | } | 2221 | } |
2366 | 2222 | ||
2367 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | 2223 | static inline bool may_inc_ap_bio(struct drbd_conf *mdev) |
2368 | { | 2224 | { |
2369 | int mxb = drbd_get_max_buffers(mdev); | 2225 | int mxb = drbd_get_max_buffers(mdev); |
2370 | 2226 | ||
2371 | if (is_susp(mdev->state)) | 2227 | if (drbd_suspended(mdev)) |
2372 | return false; | 2228 | return false; |
2373 | if (test_bit(SUSPEND_IO, &mdev->flags)) | 2229 | if (test_bit(SUSPEND_IO, &mdev->flags)) |
2374 | return false; | 2230 | return false; |
@@ -2390,30 +2246,30 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev) | |||
2390 | return true; | 2246 | return true; |
2391 | } | 2247 | } |
2392 | 2248 | ||
2393 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count) | 2249 | static inline bool inc_ap_bio_cond(struct drbd_conf *mdev) |
2394 | { | 2250 | { |
2395 | bool rv = false; | 2251 | bool rv = false; |
2396 | 2252 | ||
2397 | spin_lock_irq(&mdev->req_lock); | 2253 | spin_lock_irq(&mdev->tconn->req_lock); |
2398 | rv = may_inc_ap_bio(mdev); | 2254 | rv = may_inc_ap_bio(mdev); |
2399 | if (rv) | 2255 | if (rv) |
2400 | atomic_add(count, &mdev->ap_bio_cnt); | 2256 | atomic_inc(&mdev->ap_bio_cnt); |
2401 | spin_unlock_irq(&mdev->req_lock); | 2257 | spin_unlock_irq(&mdev->tconn->req_lock); |
2402 | 2258 | ||
2403 | return rv; | 2259 | return rv; |
2404 | } | 2260 | } |
2405 | 2261 | ||
2406 | static inline void inc_ap_bio(struct drbd_conf *mdev, int count) | 2262 | static inline void inc_ap_bio(struct drbd_conf *mdev) |
2407 | { | 2263 | { |
2408 | /* we wait here | 2264 | /* we wait here |
2409 | * as long as the device is suspended | 2265 | * as long as the device is suspended |
2410 | * until the bitmap is no longer on the fly during connection | 2266 | * until the bitmap is no longer on the fly during connection |
2411 | * handshake as long as we would exeed the max_buffer limit. | 2267 | * handshake as long as we would exceed the max_buffer limit. |
2412 | * | 2268 | * |
2413 | * to avoid races with the reconnect code, | 2269 | * to avoid races with the reconnect code, |
2414 | * we need to atomic_inc within the spinlock. */ | 2270 | * we need to atomic_inc within the spinlock. */ |
2415 | 2271 | ||
2416 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count)); | 2272 | wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev)); |
2417 | } | 2273 | } |
2418 | 2274 | ||
2419 | static inline void dec_ap_bio(struct drbd_conf *mdev) | 2275 | static inline void dec_ap_bio(struct drbd_conf *mdev) |
@@ -2425,7 +2281,7 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2425 | 2281 | ||
2426 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { | 2282 | if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { |
2427 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | 2283 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
2428 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 2284 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
2429 | } | 2285 | } |
2430 | 2286 | ||
2431 | /* this currently does wake_up for every dec_ap_bio! | 2287 | /* this currently does wake_up for every dec_ap_bio! |
@@ -2435,6 +2291,12 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) | |||
2435 | wake_up(&mdev->misc_wait); | 2291 | wake_up(&mdev->misc_wait); |
2436 | } | 2292 | } |
2437 | 2293 | ||
2294 | static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev) | ||
2295 | { | ||
2296 | return mdev->tconn->agreed_pro_version >= 97 && | ||
2297 | mdev->tconn->agreed_pro_version != 100; | ||
2298 | } | ||
2299 | |||
2438 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | 2300 | static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) |
2439 | { | 2301 | { |
2440 | int changed = mdev->ed_uuid != val; | 2302 | int changed = mdev->ed_uuid != val; |
@@ -2442,40 +2304,6 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) | |||
2442 | return changed; | 2304 | return changed; |
2443 | } | 2305 | } |
2444 | 2306 | ||
2445 | static inline int seq_cmp(u32 a, u32 b) | ||
2446 | { | ||
2447 | /* we assume wrap around at 32bit. | ||
2448 | * for wrap around at 24bit (old atomic_t), | ||
2449 | * we'd have to | ||
2450 | * a <<= 8; b <<= 8; | ||
2451 | */ | ||
2452 | return (s32)(a) - (s32)(b); | ||
2453 | } | ||
2454 | #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) | ||
2455 | #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) | ||
2456 | #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) | ||
2457 | #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) | ||
2458 | /* CAUTION: please no side effects in arguments! */ | ||
2459 | #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) | ||
2460 | |||
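The removed seq_cmp() relies on signed 32-bit arithmetic so comparisons stay correct across sequence-number wrap-around. A small stand-alone illustration of why the cast works (plain user-space C, not driver code, assuming the two's-complement targets the kernel runs on):

```c
#include <stdint.h>
#include <stdio.h>

/* Same wrap-safe idea as the seq_cmp() removed above. */
static int demo_seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)a - (int32_t)b;
}

int main(void)
{
	/* 1 has just wrapped past 0xffffffff, yet compares as newer:
	 * (int32_t)1 - (int32_t)0xffffffff = 1 - (-1) = 2 > 0. */
	printf("%d\n", demo_seq_cmp(1u, 0xffffffffu));
	return 0;
}
```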
2461 | static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) | ||
2462 | { | ||
2463 | unsigned int m; | ||
2464 | spin_lock(&mdev->peer_seq_lock); | ||
2465 | m = seq_max(mdev->peer_seq, new_seq); | ||
2466 | mdev->peer_seq = m; | ||
2467 | spin_unlock(&mdev->peer_seq_lock); | ||
2468 | if (m == new_seq) | ||
2469 | wake_up(&mdev->seq_wait); | ||
2470 | } | ||
2471 | |||
2472 | static inline void drbd_update_congested(struct drbd_conf *mdev) | ||
2473 | { | ||
2474 | struct sock *sk = mdev->data.socket->sk; | ||
2475 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
2476 | set_bit(NET_CONGESTED, &mdev->flags); | ||
2477 | } | ||
2478 | |||
2479 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) | 2307 | static inline int drbd_queue_order_type(struct drbd_conf *mdev) |
2480 | { | 2308 | { |
2481 | /* sorry, we currently have no working implementation | 2309 | /* sorry, we currently have no working implementation |
@@ -2490,10 +2318,15 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) | |||
2490 | { | 2318 | { |
2491 | int r; | 2319 | int r; |
2492 | 2320 | ||
2321 | if (mdev->ldev == NULL) { | ||
2322 | dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n"); | ||
2323 | return; | ||
2324 | } | ||
2325 | |||
2493 | if (test_bit(MD_NO_FUA, &mdev->flags)) | 2326 | if (test_bit(MD_NO_FUA, &mdev->flags)) |
2494 | return; | 2327 | return; |
2495 | 2328 | ||
2496 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); | 2329 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL); |
2497 | if (r) { | 2330 | if (r) { |
2498 | set_bit(MD_NO_FUA, &mdev->flags); | 2331 | set_bit(MD_NO_FUA, &mdev->flags); |
2499 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | 2332 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); |
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000000..89c497c630b4 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.c | |||
@@ -0,0 +1,207 @@ | |||
1 | #include <asm/bug.h> | ||
2 | #include <linux/rbtree_augmented.h> | ||
3 | #include "drbd_interval.h" | ||
4 | |||
5 | /** | ||
6 | * interval_end - return end of @node | ||
7 | */ | ||
8 | static inline | ||
9 | sector_t interval_end(struct rb_node *node) | ||
10 | { | ||
11 | struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); | ||
12 | return this->end; | ||
13 | } | ||
14 | |||
15 | /** | ||
16 | * compute_subtree_last - compute end of @node | ||
17 | * | ||
18 | * The end of an interval is the highest (start + (size >> 9)) value of this | ||
19 | * node and of its children. Called for @node and its parents whenever the end | ||
20 | * may have changed. | ||
21 | */ | ||
22 | static inline sector_t | ||
23 | compute_subtree_last(struct drbd_interval *node) | ||
24 | { | ||
25 | sector_t max = node->sector + (node->size >> 9); | ||
26 | |||
27 | if (node->rb.rb_left) { | ||
28 | sector_t left = interval_end(node->rb.rb_left); | ||
29 | if (left > max) | ||
30 | max = left; | ||
31 | } | ||
32 | if (node->rb.rb_right) { | ||
33 | sector_t right = interval_end(node->rb.rb_right); | ||
34 | if (right > max) | ||
35 | max = right; | ||
36 | } | ||
37 | return max; | ||
38 | } | ||
39 | |||
40 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) | ||
41 | { | ||
42 | while (rb != stop) { | ||
43 | struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); | ||
44 | sector_t subtree_last = compute_subtree_last(node); | ||
45 | if (node->end == subtree_last) | ||
46 | break; | ||
47 | node->end = subtree_last; | ||
48 | rb = rb_parent(&node->rb); | ||
49 | } | ||
50 | } | ||
51 | |||
52 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) | ||
53 | { | ||
54 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
55 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
56 | |||
57 | new->end = old->end; | ||
58 | } | ||
59 | |||
60 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) | ||
61 | { | ||
62 | struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); | ||
63 | struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); | ||
64 | |||
65 | new->end = old->end; | ||
66 | old->end = compute_subtree_last(old); | ||
67 | } | ||
68 | |||
69 | static const struct rb_augment_callbacks augment_callbacks = { | ||
70 | augment_propagate, | ||
71 | augment_copy, | ||
72 | augment_rotate, | ||
73 | }; | ||
74 | |||
75 | /** | ||
76 | * drbd_insert_interval - insert a new interval into a tree | ||
77 | */ | ||
78 | bool | ||
79 | drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) | ||
80 | { | ||
81 | struct rb_node **new = &root->rb_node, *parent = NULL; | ||
82 | |||
83 | BUG_ON(!IS_ALIGNED(this->size, 512)); | ||
84 | |||
85 | while (*new) { | ||
86 | struct drbd_interval *here = | ||
87 | rb_entry(*new, struct drbd_interval, rb); | ||
88 | |||
89 | parent = *new; | ||
90 | if (this->sector < here->sector) | ||
91 | new = &(*new)->rb_left; | ||
92 | else if (this->sector > here->sector) | ||
93 | new = &(*new)->rb_right; | ||
94 | else if (this < here) | ||
95 | new = &(*new)->rb_left; | ||
96 | else if (this > here) | ||
97 | new = &(*new)->rb_right; | ||
98 | else | ||
99 | return false; | ||
100 | } | ||
101 | |||
102 | rb_link_node(&this->rb, parent, new); | ||
103 | rb_insert_augmented(&this->rb, root, &augment_callbacks); | ||
104 | return true; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * drbd_contains_interval - check if a tree contains a given interval | ||
109 | * @sector: start sector of @interval | ||
110 | * @interval: may not be a valid pointer | ||
111 | * | ||
112 | * Returns whether the tree contains the node @interval with start sector @sector. | ||
113 | * Does not dereference @interval until @interval is known to be a valid object | ||
114 | * in @root. Returns %false if @interval is in the tree but with a different | ||
115 | * sector number. | ||
116 | */ | ||
117 | bool | ||
118 | drbd_contains_interval(struct rb_root *root, sector_t sector, | ||
119 | struct drbd_interval *interval) | ||
120 | { | ||
121 | struct rb_node *node = root->rb_node; | ||
122 | |||
123 | while (node) { | ||
124 | struct drbd_interval *here = | ||
125 | rb_entry(node, struct drbd_interval, rb); | ||
126 | |||
127 | if (sector < here->sector) | ||
128 | node = node->rb_left; | ||
129 | else if (sector > here->sector) | ||
130 | node = node->rb_right; | ||
131 | else if (interval < here) | ||
132 | node = node->rb_left; | ||
133 | else if (interval > here) | ||
134 | node = node->rb_right; | ||
135 | else | ||
136 | return true; | ||
137 | } | ||
138 | return false; | ||
139 | } | ||
140 | |||
141 | /** | ||
142 | * drbd_remove_interval - remove an interval from a tree | ||
143 | */ | ||
144 | void | ||
145 | drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) | ||
146 | { | ||
147 | rb_erase_augmented(&this->rb, root, &augment_callbacks); | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) | ||
152 | * @sector: start sector | ||
153 | * @size: size, aligned to 512 bytes | ||
154 | * | ||
155 | * Returns an interval overlapping with [sector, sector + size), or NULL if | ||
156 | * there is none. When there is more than one overlapping interval in the | ||
157 | * tree, the interval with the lowest start sector is returned, and all other | ||
158 | * overlapping intervals will be on the right side of the tree, reachable with | ||
159 | * rb_next(). | ||
160 | */ | ||
161 | struct drbd_interval * | ||
162 | drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) | ||
163 | { | ||
164 | struct rb_node *node = root->rb_node; | ||
165 | struct drbd_interval *overlap = NULL; | ||
166 | sector_t end = sector + (size >> 9); | ||
167 | |||
168 | BUG_ON(!IS_ALIGNED(size, 512)); | ||
169 | |||
170 | while (node) { | ||
171 | struct drbd_interval *here = | ||
172 | rb_entry(node, struct drbd_interval, rb); | ||
173 | |||
174 | if (node->rb_left && | ||
175 | sector < interval_end(node->rb_left)) { | ||
176 | /* Overlap if any must be on left side */ | ||
177 | node = node->rb_left; | ||
178 | } else if (here->sector < end && | ||
179 | sector < here->sector + (here->size >> 9)) { | ||
180 | overlap = here; | ||
181 | break; | ||
182 | } else if (sector >= here->sector) { | ||
183 | /* Overlap if any must be on right side */ | ||
184 | node = node->rb_right; | ||
185 | } else | ||
186 | break; | ||
187 | } | ||
188 | return overlap; | ||
189 | } | ||
190 | |||
191 | struct drbd_interval * | ||
192 | drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) | ||
193 | { | ||
194 | sector_t end = sector + (size >> 9); | ||
195 | struct rb_node *node; | ||
196 | |||
197 | for (;;) { | ||
198 | node = rb_next(&i->rb); | ||
199 | if (!node) | ||
200 | return NULL; | ||
201 | i = rb_entry(node, struct drbd_interval, rb); | ||
202 | if (i->sector >= end) | ||
203 | return NULL; | ||
204 | if (sector < i->sector + (i->size >> 9)) | ||
205 | return i; | ||
206 | } | ||
207 | } | ||
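For orientation: drbd_interval.c above builds an augmented rbtree keyed by start sector, where every node caches in ->end the largest interval end (sector + (size >> 9)) found anywhere in its subtree. That cached value is what lets drbd_find_overlap() skip a whole left subtree whenever the search start is not below it. The user-space sketch below only restates that invariant with invented types and names; it is illustrative and not part of the patch.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Simplified stand-in for struct drbd_interval; the field names are made up. */
struct ival {
	sector_t sector;      /* start sector */
	unsigned int size;    /* size in bytes */
	sector_t end;         /* cached max interval end of this subtree */
	struct ival *left, *right;
};

/* Same computation as compute_subtree_last(): the node's own end, or the
 * larger cached end of one of its children. */
static sector_t subtree_last(const struct ival *n)
{
	sector_t max = n->sector + (n->size >> 9);

	if (n->left && n->left->end > max)
		max = n->left->end;
	if (n->right && n->right->end > max)
		max = n->right->end;
	return max;
}

int main(void)
{
	/* [8,16) and [40,48) as children of [24,28); ends are cached bottom-up. */
	struct ival l = { 8, 8 << 9, 16, NULL, NULL };
	struct ival r = { 40, 8 << 9, 48, NULL, NULL };
	struct ival root = { 24, 4 << 9, 0, &l, &r };

	root.end = subtree_last(&root);
	printf("root.end = %llu\n", root.end);   /* 48 */

	/* A search starting at sector >= 16 never has to descend into l's
	 * subtree: l.end == 16 guarantees nothing there can still overlap. */
	return 0;
}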
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000000..f38fcb00c10d --- /dev/null +++ b/drivers/block/drbd/drbd_interval.h | |||
@@ -0,0 +1,40 @@ | |||
1 | #ifndef __DRBD_INTERVAL_H | ||
2 | #define __DRBD_INTERVAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <linux/rbtree.h> | ||
6 | |||
7 | struct drbd_interval { | ||
8 | struct rb_node rb; | ||
9 | sector_t sector; /* start sector of the interval */ | ||
10 | unsigned int size; /* size in bytes */ | ||
11 | sector_t end; /* highest interval end in subtree */ | ||
12 | int local:1 /* local or remote request? */; | ||
13 | int waiting:1; | ||
14 | }; | ||
15 | |||
16 | static inline void drbd_clear_interval(struct drbd_interval *i) | ||
17 | { | ||
18 | RB_CLEAR_NODE(&i->rb); | ||
19 | } | ||
20 | |||
21 | static inline bool drbd_interval_empty(struct drbd_interval *i) | ||
22 | { | ||
23 | return RB_EMPTY_NODE(&i->rb); | ||
24 | } | ||
25 | |||
26 | extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); | ||
27 | extern bool drbd_contains_interval(struct rb_root *, sector_t, | ||
28 | struct drbd_interval *); | ||
29 | extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); | ||
30 | extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, | ||
31 | unsigned int); | ||
32 | extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, | ||
33 | unsigned int); | ||
34 | |||
35 | #define drbd_for_each_overlap(i, root, sector, size) \ | ||
36 | for (i = drbd_find_overlap(root, sector, size); \ | ||
37 | i; \ | ||
38 | i = drbd_next_overlap(i, sector, size)) | ||
39 | |||
40 | #endif /* __DRBD_INTERVAL_H */ | ||
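A hedged usage sketch of the header above (not part of the patch): drbd_for_each_overlap() simply chains drbd_find_overlap() and drbd_next_overlap(), so a caller visits every interval overlapping [sector, sector + size) in ascending start order. The helper name below is hypothetical, and it assumes the caller already holds whatever lock protects the tree (in DRBD the req_lock).

#include "drbd_interval.h"

/* Hypothetical helper, for illustration only: count the intervals in @root
 * that overlap [sector, sector + size).  The tree must not change while we
 * walk it, so the caller would hold the protecting lock across the loop. */
static unsigned int count_overlaps(struct rb_root *root,
				   sector_t sector, unsigned int size)
{
	struct drbd_interval *i;
	unsigned int n = 0;

	drbd_for_each_overlap(i, root, sector, size)
		n++;
	return n;
}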
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f55683ad4ffa..8c13eeb83c53 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -56,14 +56,6 @@ | |||
56 | 56 | ||
57 | #include "drbd_vli.h" | 57 | #include "drbd_vli.h" |
58 | 58 | ||
59 | struct after_state_chg_work { | ||
60 | struct drbd_work w; | ||
61 | union drbd_state os; | ||
62 | union drbd_state ns; | ||
63 | enum chg_state_flags flags; | ||
64 | struct completion *done; | ||
65 | }; | ||
66 | |||
67 | static DEFINE_MUTEX(drbd_main_mutex); | 59 | static DEFINE_MUTEX(drbd_main_mutex); |
68 | int drbdd_init(struct drbd_thread *); | 60 | int drbdd_init(struct drbd_thread *); |
69 | int drbd_worker(struct drbd_thread *); | 61 | int drbd_worker(struct drbd_thread *); |
@@ -72,21 +64,17 @@ int drbd_asender(struct drbd_thread *); | |||
72 | int drbd_init(void); | 64 | int drbd_init(void); |
73 | static int drbd_open(struct block_device *bdev, fmode_t mode); | 65 | static int drbd_open(struct block_device *bdev, fmode_t mode); |
74 | static int drbd_release(struct gendisk *gd, fmode_t mode); | 66 | static int drbd_release(struct gendisk *gd, fmode_t mode); |
75 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 67 | static int w_md_sync(struct drbd_work *w, int unused); |
76 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
77 | union drbd_state ns, enum chg_state_flags flags); | ||
78 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | ||
79 | static void md_sync_timer_fn(unsigned long data); | 68 | static void md_sync_timer_fn(unsigned long data); |
80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 69 | static int w_bitmap_io(struct drbd_work *w, int unused); |
81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 70 | static int w_go_diskless(struct drbd_work *w, int unused); |
82 | static void _tl_clear(struct drbd_conf *mdev); | ||
83 | 71 | ||
84 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | 72 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " |
85 | "Lars Ellenberg <lars@linbit.com>"); | 73 | "Lars Ellenberg <lars@linbit.com>"); |
86 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); | 74 | MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); |
87 | MODULE_VERSION(REL_VERSION); | 75 | MODULE_VERSION(REL_VERSION); |
88 | MODULE_LICENSE("GPL"); | 76 | MODULE_LICENSE("GPL"); |
89 | MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (" | 77 | MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" |
90 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); | 78 | __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); |
91 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); | 79 | MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); |
92 | 80 | ||
@@ -98,7 +86,6 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!"); | |||
98 | module_param(minor_count, uint, 0444); | 86 | module_param(minor_count, uint, 0444); |
99 | module_param(disable_sendpage, bool, 0644); | 87 | module_param(disable_sendpage, bool, 0644); |
100 | module_param(allow_oos, bool, 0); | 88 | module_param(allow_oos, bool, 0); |
101 | module_param(cn_idx, uint, 0444); | ||
102 | module_param(proc_details, int, 0644); | 89 | module_param(proc_details, int, 0644); |
103 | 90 | ||
104 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 91 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
@@ -120,7 +107,6 @@ module_param(fault_devs, int, 0644); | |||
120 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; | 107 | unsigned int minor_count = DRBD_MINOR_COUNT_DEF; |
121 | bool disable_sendpage; | 108 | bool disable_sendpage; |
122 | bool allow_oos; | 109 | bool allow_oos; |
123 | unsigned int cn_idx = CN_IDX_DRBD; | ||
124 | int proc_details; /* Detail level in proc drbd*/ | 110 | int proc_details; /* Detail level in proc drbd*/ |
125 | 111 | ||
126 | /* Module parameter for setting the user mode helper program | 112 | /* Module parameter for setting the user mode helper program |
@@ -132,10 +118,11 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 | |||
132 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks | 118 | /* in 2.6.x, our device mapping and config info contains our virtual gendisks |
133 | * as member "struct gendisk *vdisk;" | 119 | * as member "struct gendisk *vdisk;" |
134 | */ | 120 | */ |
135 | struct drbd_conf **minor_table; | 121 | struct idr minors; |
122 | struct list_head drbd_tconns; /* list of struct drbd_tconn */ | ||
136 | 123 | ||
137 | struct kmem_cache *drbd_request_cache; | 124 | struct kmem_cache *drbd_request_cache; |
138 | struct kmem_cache *drbd_ee_cache; /* epoch entries */ | 125 | struct kmem_cache *drbd_ee_cache; /* peer requests */ |
139 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ | 126 | struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ |
140 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ | 127 | struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ |
141 | mempool_t *drbd_request_mempool; | 128 | mempool_t *drbd_request_mempool; |
@@ -164,10 +151,15 @@ static const struct block_device_operations drbd_ops = { | |||
164 | 151 | ||
165 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) | 152 | struct bio *bio_alloc_drbd(gfp_t gfp_mask) |
166 | { | 153 | { |
154 | struct bio *bio; | ||
155 | |||
167 | if (!drbd_md_io_bio_set) | 156 | if (!drbd_md_io_bio_set) |
168 | return bio_alloc(gfp_mask, 1); | 157 | return bio_alloc(gfp_mask, 1); |
169 | 158 | ||
170 | return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); | 159 | bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); |
160 | if (!bio) | ||
161 | return NULL; | ||
162 | return bio; | ||
171 | } | 163 | } |
172 | 164 | ||
173 | #ifdef __CHECKER__ | 165 | #ifdef __CHECKER__ |
@@ -190,158 +182,87 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) | |||
190 | #endif | 182 | #endif |
191 | 183 | ||
192 | /** | 184 | /** |
193 | * DOC: The transfer log | 185 | * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch |
194 | * | 186 | * @tconn: DRBD connection. |
195 | * The transfer log is a single linked list of &struct drbd_tl_epoch objects. | ||
196 | * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail | ||
197 | * of the list. There is always at least one &struct drbd_tl_epoch object. | ||
198 | * | ||
199 | * Each &struct drbd_tl_epoch has a circular double linked list of requests | ||
200 | * attached. | ||
201 | */ | ||
202 | static int tl_init(struct drbd_conf *mdev) | ||
203 | { | ||
204 | struct drbd_tl_epoch *b; | ||
205 | |||
206 | /* during device minor initialization, we may well use GFP_KERNEL */ | ||
207 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); | ||
208 | if (!b) | ||
209 | return 0; | ||
210 | INIT_LIST_HEAD(&b->requests); | ||
211 | INIT_LIST_HEAD(&b->w.list); | ||
212 | b->next = NULL; | ||
213 | b->br_number = 4711; | ||
214 | b->n_writes = 0; | ||
215 | b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
216 | |||
217 | mdev->oldest_tle = b; | ||
218 | mdev->newest_tle = b; | ||
219 | INIT_LIST_HEAD(&mdev->out_of_sequence_requests); | ||
220 | INIT_LIST_HEAD(&mdev->barrier_acked_requests); | ||
221 | |||
222 | mdev->tl_hash = NULL; | ||
223 | mdev->tl_hash_s = 0; | ||
224 | |||
225 | return 1; | ||
226 | } | ||
227 | |||
228 | static void tl_cleanup(struct drbd_conf *mdev) | ||
229 | { | ||
230 | D_ASSERT(mdev->oldest_tle == mdev->newest_tle); | ||
231 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
232 | kfree(mdev->oldest_tle); | ||
233 | mdev->oldest_tle = NULL; | ||
234 | kfree(mdev->unused_spare_tle); | ||
235 | mdev->unused_spare_tle = NULL; | ||
236 | kfree(mdev->tl_hash); | ||
237 | mdev->tl_hash = NULL; | ||
238 | mdev->tl_hash_s = 0; | ||
239 | } | ||
240 | |||
241 | /** | ||
242 | * _tl_add_barrier() - Adds a barrier to the transfer log | ||
243 | * @mdev: DRBD device. | ||
244 | * @new: Barrier to be added before the current head of the TL. | ||
245 | * | ||
246 | * The caller must hold the req_lock. | ||
247 | */ | ||
248 | void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) | ||
249 | { | ||
250 | struct drbd_tl_epoch *newest_before; | ||
251 | |||
252 | INIT_LIST_HEAD(&new->requests); | ||
253 | INIT_LIST_HEAD(&new->w.list); | ||
254 | new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ | ||
255 | new->next = NULL; | ||
256 | new->n_writes = 0; | ||
257 | |||
258 | newest_before = mdev->newest_tle; | ||
259 | new->br_number = newest_before->br_number+1; | ||
260 | if (mdev->newest_tle != new) { | ||
261 | mdev->newest_tle->next = new; | ||
262 | mdev->newest_tle = new; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | /** | ||
267 | * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL | ||
268 | * @mdev: DRBD device. | ||
269 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. | 187 | * @barrier_nr: Expected identifier of the DRBD write barrier packet. |
270 | * @set_size: Expected number of requests before that barrier. | 188 | * @set_size: Expected number of requests before that barrier. |
271 | * | 189 | * |
272 | * In case the passed barrier_nr or set_size does not match the oldest | 190 | * In case the passed barrier_nr or set_size does not match the oldest |
273 | * &struct drbd_tl_epoch objects this function will cause a termination | 191 | * epoch of not yet barrier-acked requests, this function will cause a |
274 | * of the connection. | 192 | * termination of the connection. |
275 | */ | 193 | */ |
276 | void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, | 194 | void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, |
277 | unsigned int set_size) | 195 | unsigned int set_size) |
278 | { | 196 | { |
279 | struct drbd_tl_epoch *b, *nob; /* next old barrier */ | ||
280 | struct list_head *le, *tle; | ||
281 | struct drbd_request *r; | 197 | struct drbd_request *r; |
282 | 198 | struct drbd_request *req = NULL; | |
283 | spin_lock_irq(&mdev->req_lock); | 199 | int expect_epoch = 0; |
284 | 200 | int expect_size = 0; | |
285 | b = mdev->oldest_tle; | 201 | |
202 | spin_lock_irq(&tconn->req_lock); | ||
203 | |||
204 | /* find oldest not yet barrier-acked write request, | ||
205 | * count writes in its epoch. */ | ||
206 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
207 | const unsigned s = r->rq_state; | ||
208 | if (!req) { | ||
209 | if (!(s & RQ_WRITE)) | ||
210 | continue; | ||
211 | if (!(s & RQ_NET_MASK)) | ||
212 | continue; | ||
213 | if (s & RQ_NET_DONE) | ||
214 | continue; | ||
215 | req = r; | ||
216 | expect_epoch = req->epoch; | ||
217 | expect_size++; | ||
218 | } else { | ||
219 | if (r->epoch != expect_epoch) | ||
220 | break; | ||
221 | if (!(s & RQ_WRITE)) | ||
222 | continue; | ||
223 | /* if (s & RQ_DONE): not expected */ | ||
224 | /* if (!(s & RQ_NET_MASK)): not expected */ | ||
225 | expect_size++; | ||
226 | } | ||
227 | } | ||
286 | 228 | ||
287 | /* first some paranoia code */ | 229 | /* first some paranoia code */ |
288 | if (b == NULL) { | 230 | if (req == NULL) { |
289 | dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", | 231 | conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", |
290 | barrier_nr); | 232 | barrier_nr); |
291 | goto bail; | 233 | goto bail; |
292 | } | 234 | } |
293 | if (b->br_number != barrier_nr) { | 235 | if (expect_epoch != barrier_nr) { |
294 | dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", | 236 | conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", |
295 | barrier_nr, b->br_number); | 237 | barrier_nr, expect_epoch); |
296 | goto bail; | 238 | goto bail; |
297 | } | 239 | } |
298 | if (b->n_writes != set_size) { | 240 | |
299 | dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", | 241 | if (expect_size != set_size) { |
300 | barrier_nr, set_size, b->n_writes); | 242 | conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", |
243 | barrier_nr, set_size, expect_size); | ||
301 | goto bail; | 244 | goto bail; |
302 | } | 245 | } |
303 | 246 | ||
304 | /* Clean up list of requests processed during current epoch */ | 247 | /* Clean up list of requests processed during current epoch. */ |
305 | list_for_each_safe(le, tle, &b->requests) { | 248 | /* this extra list walk restart is paranoia, |
306 | r = list_entry(le, struct drbd_request, tl_requests); | 249 | * to catch requests being barrier-acked "unexpectedly". |
307 | _req_mod(r, barrier_acked); | 250 | * It usually should find the same req again, or some READ preceding it. */ |
308 | } | 251 | list_for_each_entry(req, &tconn->transfer_log, tl_requests) |
309 | /* There could be requests on the list waiting for completion | 252 | if (req->epoch == expect_epoch) |
310 | of the write to the local disk. To avoid corruptions of | 253 | break; |
311 | slab's data structures we have to remove the lists head. | 254 | list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) { |
312 | 255 | if (req->epoch != expect_epoch) | |
313 | Also there could have been a barrier ack out of sequence, overtaking | 256 | break; |
314 | the write acks - which would be a bug and violating write ordering. | 257 | _req_mod(req, BARRIER_ACKED); |
315 | To not deadlock in case we lose connection while such requests are | ||
316 | still pending, we need some way to find them for the | ||
317 | _req_mod(connection_lost_while_pending). | ||
318 | |||
319 | These have been list_move'd to the out_of_sequence_requests list in | ||
320 | _req_mod(, barrier_acked) above. | ||
321 | */ | ||
322 | list_splice_init(&b->requests, &mdev->barrier_acked_requests); | ||
323 | |||
324 | nob = b->next; | ||
325 | if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
326 | _tl_add_barrier(mdev, b); | ||
327 | if (nob) | ||
328 | mdev->oldest_tle = nob; | ||
329 | /* if nob == NULL b was the only barrier, and becomes the new | ||
330 | barrier. Therefore mdev->oldest_tle points already to b */ | ||
331 | } else { | ||
332 | D_ASSERT(nob != NULL); | ||
333 | mdev->oldest_tle = nob; | ||
334 | kfree(b); | ||
335 | } | 258 | } |
336 | 259 | spin_unlock_irq(&tconn->req_lock); | |
337 | spin_unlock_irq(&mdev->req_lock); | ||
338 | dec_ap_pending(mdev); | ||
339 | 260 | ||
340 | return; | 261 | return; |
341 | 262 | ||
342 | bail: | 263 | bail: |
343 | spin_unlock_irq(&mdev->req_lock); | 264 | spin_unlock_irq(&tconn->req_lock); |
344 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 265 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
345 | } | 266 | } |
346 | 267 | ||
347 | 268 | ||
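To make the barrier-ack bookkeeping above easier to follow, here is a small stand-alone model of the check the new tl_release() performs against the single, connection-wide transfer log: find the oldest write that went to the peer and is not yet done, count the writes of its epoch, and require that the (barrier_nr, set_size) pair from the barrier ack match. The flag values and request layout below are simplified for illustration and are not the kernel's.

#include <stdio.h>

#define RQ_WRITE     0x1
#define RQ_NET_MASK  0x2   /* request was sent to the peer */
#define RQ_NET_DONE  0x4   /* peer already finished it */

struct req { unsigned state; unsigned epoch; };

/* Return 0 if the ack (barrier_nr, set_size) matches the oldest epoch of
 * not-yet-acked writes, -1 otherwise (the kernel then goes to "bail" and
 * forces C_PROTOCOL_ERROR). */
static int check_barrier_ack(const struct req *tl, int n,
			     unsigned barrier_nr, unsigned set_size)
{
	int i, found = 0;
	unsigned expect_epoch = 0, expect_size = 0;

	for (i = 0; i < n; i++) {
		unsigned s = tl[i].state;

		if (!found) {
			if (!(s & RQ_WRITE) || !(s & RQ_NET_MASK) || (s & RQ_NET_DONE))
				continue;
			found = 1;
			expect_epoch = tl[i].epoch;
			expect_size = 1;
		} else {
			if (tl[i].epoch != expect_epoch)
				break;
			if (s & RQ_WRITE)
				expect_size++;
		}
	}

	if (!found || expect_epoch != barrier_nr || expect_size != set_size)
		return -1;
	return 0;
}

int main(void)
{
	struct req tl[] = {
		{ RQ_WRITE | RQ_NET_MASK | RQ_NET_DONE, 6 },  /* already done */
		{ RQ_WRITE | RQ_NET_MASK, 7 },
		{ RQ_WRITE | RQ_NET_MASK, 7 },
		{ RQ_WRITE | RQ_NET_MASK, 8 },
	};

	printf("%d\n", check_barrier_ack(tl, 4, 7, 2));  /*  0: ack matches   */
	printf("%d\n", check_barrier_ack(tl, 4, 7, 3));  /* -1: size mismatch */
	return 0;
}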
@@ -350,85 +271,24 @@ bail: | |||
350 | * @mdev: DRBD device. | 271 | * @mdev: DRBD device. |
351 | * @what: The action/event to perform with all request objects | 272 | * @what: The action/event to perform with all request objects |
352 | * | 273 | * |
353 | * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, | 274 | * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, |
354 | * restart_frozen_disk_io. | 275 | * RESTART_FROZEN_DISK_IO. |
355 | */ | 276 | */ |
356 | static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | 277 | /* must hold resource->req_lock */ |
357 | { | 278 | void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) |
358 | struct drbd_tl_epoch *b, *tmp, **pn; | 279 | { |
359 | struct list_head *le, *tle, carry_reads; | 280 | struct drbd_request *req, *r; |
360 | struct drbd_request *req; | ||
361 | int rv, n_writes, n_reads; | ||
362 | |||
363 | b = mdev->oldest_tle; | ||
364 | pn = &mdev->oldest_tle; | ||
365 | while (b) { | ||
366 | n_writes = 0; | ||
367 | n_reads = 0; | ||
368 | INIT_LIST_HEAD(&carry_reads); | ||
369 | list_for_each_safe(le, tle, &b->requests) { | ||
370 | req = list_entry(le, struct drbd_request, tl_requests); | ||
371 | rv = _req_mod(req, what); | ||
372 | |||
373 | n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; | ||
374 | n_reads += (rv & MR_READ) >> MR_READ_SHIFT; | ||
375 | } | ||
376 | tmp = b->next; | ||
377 | |||
378 | if (n_writes) { | ||
379 | if (what == resend) { | ||
380 | b->n_writes = n_writes; | ||
381 | if (b->w.cb == NULL) { | ||
382 | b->w.cb = w_send_barrier; | ||
383 | inc_ap_pending(mdev); | ||
384 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
385 | } | ||
386 | |||
387 | drbd_queue_work(&mdev->data.work, &b->w); | ||
388 | } | ||
389 | pn = &b->next; | ||
390 | } else { | ||
391 | if (n_reads) | ||
392 | list_add(&carry_reads, &b->requests); | ||
393 | /* there could still be requests on that ring list, | ||
394 | * in case local io is still pending */ | ||
395 | list_del(&b->requests); | ||
396 | |||
397 | /* dec_ap_pending corresponding to queue_barrier. | ||
398 | * the newest barrier may not have been queued yet, | ||
399 | * in which case w.cb is still NULL. */ | ||
400 | if (b->w.cb != NULL) | ||
401 | dec_ap_pending(mdev); | ||
402 | |||
403 | if (b == mdev->newest_tle) { | ||
404 | /* recycle, but reinit! */ | ||
405 | D_ASSERT(tmp == NULL); | ||
406 | INIT_LIST_HEAD(&b->requests); | ||
407 | list_splice(&carry_reads, &b->requests); | ||
408 | INIT_LIST_HEAD(&b->w.list); | ||
409 | b->w.cb = NULL; | ||
410 | b->br_number = net_random(); | ||
411 | b->n_writes = 0; | ||
412 | |||
413 | *pn = b; | ||
414 | break; | ||
415 | } | ||
416 | *pn = tmp; | ||
417 | kfree(b); | ||
418 | } | ||
419 | b = tmp; | ||
420 | list_splice(&carry_reads, &b->requests); | ||
421 | } | ||
422 | |||
423 | /* Actions operating on the disk state, also want to work on | ||
424 | requests that got barrier acked. */ | ||
425 | 281 | ||
426 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | 282 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) |
427 | req = list_entry(le, struct drbd_request, tl_requests); | ||
428 | _req_mod(req, what); | 283 | _req_mod(req, what); |
429 | } | ||
430 | } | 284 | } |
431 | 285 | ||
286 | void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) | ||
287 | { | ||
288 | spin_lock_irq(&tconn->req_lock); | ||
289 | _tl_restart(tconn, what); | ||
290 | spin_unlock_irq(&tconn->req_lock); | ||
291 | } | ||
432 | 292 | ||
433 | /** | 293 | /** |
434 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL | 294 | * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL |
@@ -438,43 +298,9 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
439 | * by the requests on the transfer gets marked as out of sync. Called from the | 298 | * by the requests on the transfer gets marked as out of sync. Called from the
439 | * receiver thread and the worker thread. | 299 | * receiver thread and the worker thread. |
440 | */ | 300 | */ |
441 | void tl_clear(struct drbd_conf *mdev) | 301 | void tl_clear(struct drbd_tconn *tconn) |
442 | { | 302 | { |
443 | spin_lock_irq(&mdev->req_lock); | 303 | tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); |
444 | _tl_clear(mdev); | ||
445 | spin_unlock_irq(&mdev->req_lock); | ||
446 | } | ||
447 | |||
448 | static void _tl_clear(struct drbd_conf *mdev) | ||
449 | { | ||
450 | struct list_head *le, *tle; | ||
451 | struct drbd_request *r; | ||
452 | |||
453 | _tl_restart(mdev, connection_lost_while_pending); | ||
454 | |||
455 | /* we expect this list to be empty. */ | ||
456 | D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); | ||
457 | |||
458 | /* but just in case, clean it up anyways! */ | ||
459 | list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { | ||
460 | r = list_entry(le, struct drbd_request, tl_requests); | ||
461 | /* It would be nice to complete outside of spinlock. | ||
462 | * But this is easier for now. */ | ||
463 | _req_mod(r, connection_lost_while_pending); | ||
464 | } | ||
465 | |||
466 | /* ensure bit indicating barrier is required is clear */ | ||
467 | clear_bit(CREATE_BARRIER, &mdev->flags); | ||
468 | |||
469 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | ||
470 | |||
471 | } | ||
472 | |||
473 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | ||
474 | { | ||
475 | spin_lock_irq(&mdev->req_lock); | ||
476 | _tl_restart(mdev, what); | ||
477 | spin_unlock_irq(&mdev->req_lock); | ||
478 | } | 304 | } |
479 | 305 | ||
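Design note, hedged: with the per-connection transfer log above, the old chain of drbd_tl_epoch containers is gone; each request simply carries its ->epoch number, so clearing, resending or aborting all come down to one walk over tconn->transfer_log that applies a single event per request, and tl_clear() collapses into tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING). The minimal model below restates that pattern with invented names; it is not the DRBD code.

#include <stdio.h>

enum req_event { EV_NONE, CONNECTION_LOST_WHILE_PENDING, RESEND };

struct req {
	int id;
	unsigned epoch;           /* replaces the old per-epoch container */
	enum req_event last_event;
};

/* Stand-in for _req_mod(): in the kernel this drives the request state
 * machine; here it only records the event. */
static void req_mod(struct req *r, enum req_event what)
{
	r->last_event = what;
}

/* Model of _tl_restart(): one walk over the transfer log, oldest first. */
static void tl_restart_model(struct req *tl, int n, enum req_event what)
{
	int i;

	for (i = 0; i < n; i++)
		req_mod(&tl[i], what);
}

int main(void)
{
	struct req tl[] = { { 1, 7, EV_NONE }, { 2, 8, EV_NONE } };

	/* tl_clear() is now nothing more than this call under the req_lock. */
	tl_restart_model(tl, 2, CONNECTION_LOST_WHILE_PENDING);
	printf("req %d last event %d\n", tl[0].id, tl[0].last_event);
	return 0;
}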
480 | /** | 306 | /** |
@@ -483,1377 +309,131 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
483 | */ | 309 | */ |
484 | void tl_abort_disk_io(struct drbd_conf *mdev) | 310 | void tl_abort_disk_io(struct drbd_conf *mdev) |
485 | { | 311 | { |
486 | struct drbd_tl_epoch *b; | 312 | struct drbd_tconn *tconn = mdev->tconn; |
487 | struct list_head *le, *tle; | 313 | struct drbd_request *req, *r; |
488 | struct drbd_request *req; | ||
489 | |||
490 | spin_lock_irq(&mdev->req_lock); | ||
491 | b = mdev->oldest_tle; | ||
492 | while (b) { | ||
493 | list_for_each_safe(le, tle, &b->requests) { | ||
494 | req = list_entry(le, struct drbd_request, tl_requests); | ||
495 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | ||
496 | continue; | ||
497 | _req_mod(req, abort_disk_io); | ||
498 | } | ||
499 | b = b->next; | ||
500 | } | ||
501 | 314 | ||
502 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | 315 | spin_lock_irq(&tconn->req_lock); |
503 | req = list_entry(le, struct drbd_request, tl_requests); | 316 | list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { |
504 | if (!(req->rq_state & RQ_LOCAL_PENDING)) | 317 | if (!(req->rq_state & RQ_LOCAL_PENDING)) |
505 | continue; | 318 | continue; |
506 | _req_mod(req, abort_disk_io); | 319 | if (req->w.mdev != mdev) |
507 | } | 320 | continue; |
508 | 321 | _req_mod(req, ABORT_DISK_IO); | |
509 | spin_unlock_irq(&mdev->req_lock); | ||
510 | } | ||
511 | |||
512 | /** | ||
513 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
514 | * @mdev: DRBD device. | ||
515 | * @os: old (current) state. | ||
516 | * @ns: new (wanted) state. | ||
517 | */ | ||
518 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
519 | union drbd_state os, union drbd_state ns) | ||
520 | { | ||
521 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
522 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
523 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
524 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
525 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
526 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
527 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); | ||
528 | } | ||
529 | |||
530 | enum drbd_state_rv | ||
531 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
532 | union drbd_state mask, union drbd_state val) | ||
533 | { | ||
534 | unsigned long flags; | ||
535 | union drbd_state os, ns; | ||
536 | enum drbd_state_rv rv; | ||
537 | |||
538 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
539 | os = mdev->state; | ||
540 | ns.i = (os.i & ~mask.i) | val.i; | ||
541 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
542 | ns = mdev->state; | ||
543 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
544 | |||
545 | return rv; | ||
546 | } | ||
547 | |||
548 | /** | ||
549 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
550 | * @mdev: DRBD device. | ||
551 | * @mask: mask of state bits to change. | ||
552 | * @val: value of new state bits. | ||
553 | */ | ||
554 | void drbd_force_state(struct drbd_conf *mdev, | ||
555 | union drbd_state mask, union drbd_state val) | ||
556 | { | ||
557 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
558 | } | ||
559 | |||
560 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
561 | static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, | ||
562 | union drbd_state, | ||
563 | union drbd_state); | ||
564 | enum sanitize_state_warnings { | ||
565 | NO_WARNING, | ||
566 | ABORTED_ONLINE_VERIFY, | ||
567 | ABORTED_RESYNC, | ||
568 | CONNECTION_LOST_NEGOTIATING, | ||
569 | IMPLICITLY_UPGRADED_DISK, | ||
570 | IMPLICITLY_UPGRADED_PDSK, | ||
571 | }; | ||
572 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
573 | union drbd_state ns, enum sanitize_state_warnings *warn); | ||
574 | int drbd_send_state_req(struct drbd_conf *, | ||
575 | union drbd_state, union drbd_state); | ||
576 | |||
577 | static enum drbd_state_rv | ||
578 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
579 | union drbd_state val) | ||
580 | { | ||
581 | union drbd_state os, ns; | ||
582 | unsigned long flags; | ||
583 | enum drbd_state_rv rv; | ||
584 | |||
585 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
586 | return SS_CW_SUCCESS; | ||
587 | |||
588 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
589 | return SS_CW_FAILED_BY_PEER; | ||
590 | |||
591 | rv = 0; | ||
592 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
593 | os = mdev->state; | ||
594 | ns.i = (os.i & ~mask.i) | val.i; | ||
595 | ns = sanitize_state(mdev, os, ns, NULL); | ||
596 | |||
597 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
598 | rv = SS_CW_NO_NEED; | ||
599 | if (!rv) { | ||
600 | rv = is_valid_state(mdev, ns); | ||
601 | if (rv == SS_SUCCESS) { | ||
602 | rv = is_valid_state_transition(mdev, ns, os); | ||
603 | if (rv == SS_SUCCESS) | ||
604 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
605 | } | ||
606 | } | ||
607 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
608 | |||
609 | return rv; | ||
610 | } | ||
611 | |||
612 | /** | ||
613 | * drbd_req_state() - Perform an eventually cluster wide state change | ||
614 | * @mdev: DRBD device. | ||
615 | * @mask: mask of state bits to change. | ||
616 | * @val: value of new state bits. | ||
617 | * @f: flags | ||
618 | * | ||
619 | * Should not be called directly, use drbd_request_state() or | ||
620 | * _drbd_request_state(). | ||
621 | */ | ||
622 | static enum drbd_state_rv | ||
623 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
624 | union drbd_state val, enum chg_state_flags f) | ||
625 | { | ||
626 | struct completion done; | ||
627 | unsigned long flags; | ||
628 | union drbd_state os, ns; | ||
629 | enum drbd_state_rv rv; | ||
630 | |||
631 | init_completion(&done); | ||
632 | |||
633 | if (f & CS_SERIALIZE) | ||
634 | mutex_lock(&mdev->state_mutex); | ||
635 | |||
636 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
637 | os = mdev->state; | ||
638 | ns.i = (os.i & ~mask.i) | val.i; | ||
639 | ns = sanitize_state(mdev, os, ns, NULL); | ||
640 | |||
641 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
642 | rv = is_valid_state(mdev, ns); | ||
643 | if (rv == SS_SUCCESS) | ||
644 | rv = is_valid_state_transition(mdev, ns, os); | ||
645 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
646 | |||
647 | if (rv < SS_SUCCESS) { | ||
648 | if (f & CS_VERBOSE) | ||
649 | print_st_err(mdev, os, ns, rv); | ||
650 | goto abort; | ||
651 | } | ||
652 | |||
653 | drbd_state_lock(mdev); | ||
654 | if (!drbd_send_state_req(mdev, mask, val)) { | ||
655 | drbd_state_unlock(mdev); | ||
656 | rv = SS_CW_FAILED_BY_PEER; | ||
657 | if (f & CS_VERBOSE) | ||
658 | print_st_err(mdev, os, ns, rv); | ||
659 | goto abort; | ||
660 | } | ||
661 | |||
662 | wait_event(mdev->state_wait, | ||
663 | (rv = _req_st_cond(mdev, mask, val))); | ||
664 | |||
665 | if (rv < SS_SUCCESS) { | ||
666 | drbd_state_unlock(mdev); | ||
667 | if (f & CS_VERBOSE) | ||
668 | print_st_err(mdev, os, ns, rv); | ||
669 | goto abort; | ||
670 | } | ||
671 | spin_lock_irqsave(&mdev->req_lock, flags); | ||
672 | os = mdev->state; | ||
673 | ns.i = (os.i & ~mask.i) | val.i; | ||
674 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
675 | drbd_state_unlock(mdev); | ||
676 | } else { | ||
677 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
678 | } | ||
679 | |||
680 | spin_unlock_irqrestore(&mdev->req_lock, flags); | ||
681 | |||
682 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
683 | D_ASSERT(current != mdev->worker.task); | ||
684 | wait_for_completion(&done); | ||
685 | } | ||
686 | |||
687 | abort: | ||
688 | if (f & CS_SERIALIZE) | ||
689 | mutex_unlock(&mdev->state_mutex); | ||
690 | |||
691 | return rv; | ||
692 | } | ||
693 | |||
694 | /** | ||
695 | * _drbd_request_state() - Request a state change (with flags) | ||
696 | * @mdev: DRBD device. | ||
697 | * @mask: mask of state bits to change. | ||
698 | * @val: value of new state bits. | ||
699 | * @f: flags | ||
700 | * | ||
701 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
702 | * flag, or when logging of failed state change requests is not desired. | ||
703 | */ | ||
704 | enum drbd_state_rv | ||
705 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
706 | union drbd_state val, enum chg_state_flags f) | ||
707 | { | ||
708 | enum drbd_state_rv rv; | ||
709 | |||
710 | wait_event(mdev->state_wait, | ||
711 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
712 | |||
713 | return rv; | ||
714 | } | ||
715 | |||
716 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
717 | { | ||
718 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", | ||
719 | name, | ||
720 | drbd_conn_str(ns.conn), | ||
721 | drbd_role_str(ns.role), | ||
722 | drbd_role_str(ns.peer), | ||
723 | drbd_disk_str(ns.disk), | ||
724 | drbd_disk_str(ns.pdsk), | ||
725 | is_susp(ns) ? 's' : 'r', | ||
726 | ns.aftr_isp ? 'a' : '-', | ||
727 | ns.peer_isp ? 'p' : '-', | ||
728 | ns.user_isp ? 'u' : '-' | ||
729 | ); | ||
730 | } | ||
731 | |||
732 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
733 | union drbd_state ns, enum drbd_state_rv err) | ||
734 | { | ||
735 | if (err == SS_IN_TRANSIENT_STATE) | ||
736 | return; | ||
737 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
738 | print_st(mdev, " state", os); | ||
739 | print_st(mdev, "wanted", ns); | ||
740 | } | ||
741 | |||
742 | |||
743 | /** | ||
744 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
745 | * @mdev: DRBD device. | ||
746 | * @ns: State to consider. | ||
747 | */ | ||
748 | static enum drbd_state_rv | ||
749 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
750 | { | ||
751 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
752 | |||
753 | enum drbd_fencing_p fp; | ||
754 | enum drbd_state_rv rv = SS_SUCCESS; | ||
755 | |||
756 | fp = FP_DONT_CARE; | ||
757 | if (get_ldev(mdev)) { | ||
758 | fp = mdev->ldev->dc.fencing; | ||
759 | put_ldev(mdev); | ||
760 | } | ||
761 | |||
762 | if (get_net_conf(mdev)) { | ||
763 | if (!mdev->net_conf->two_primaries && | ||
764 | ns.role == R_PRIMARY && ns.peer == R_PRIMARY) | ||
765 | rv = SS_TWO_PRIMARIES; | ||
766 | put_net_conf(mdev); | ||
767 | } | ||
768 | |||
769 | if (rv <= 0) | ||
770 | /* already found a reason to abort */; | ||
771 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
772 | rv = SS_DEVICE_IN_USE; | ||
773 | |||
774 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
775 | rv = SS_NO_UP_TO_DATE_DISK; | ||
776 | |||
777 | else if (fp >= FP_RESOURCE && | ||
778 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
779 | rv = SS_PRIMARY_NOP; | ||
780 | |||
781 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
782 | rv = SS_NO_UP_TO_DATE_DISK; | ||
783 | |||
784 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
785 | rv = SS_NO_LOCAL_DISK; | ||
786 | |||
787 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
788 | rv = SS_NO_REMOTE_DISK; | ||
789 | |||
790 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
791 | rv = SS_NO_UP_TO_DATE_DISK; | ||
792 | |||
793 | else if ((ns.conn == C_CONNECTED || | ||
794 | ns.conn == C_WF_BITMAP_S || | ||
795 | ns.conn == C_SYNC_SOURCE || | ||
796 | ns.conn == C_PAUSED_SYNC_S) && | ||
797 | ns.disk == D_OUTDATED) | ||
798 | rv = SS_CONNECTED_OUTDATES; | ||
799 | |||
800 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
801 | (mdev->sync_conf.verify_alg[0] == 0)) | ||
802 | rv = SS_NO_VERIFY_ALG; | ||
803 | |||
804 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
805 | mdev->agreed_pro_version < 88) | ||
806 | rv = SS_NOT_SUPPORTED; | ||
807 | |||
808 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
809 | rv = SS_CONNECTED_OUTDATES; | ||
810 | |||
811 | return rv; | ||
812 | } | ||
813 | |||
814 | /** | ||
815 | * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible | ||
816 | * @mdev: DRBD device. | ||
817 | * @ns: new state. | ||
818 | * @os: old state. | ||
819 | */ | ||
820 | static enum drbd_state_rv | ||
821 | is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, | ||
822 | union drbd_state os) | ||
823 | { | ||
824 | enum drbd_state_rv rv = SS_SUCCESS; | ||
825 | |||
826 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
827 | os.conn > C_CONNECTED) | ||
828 | rv = SS_RESYNC_RUNNING; | ||
829 | |||
830 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
831 | rv = SS_ALREADY_STANDALONE; | ||
832 | |||
833 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
834 | rv = SS_IS_DISKLESS; | ||
835 | |||
836 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
837 | rv = SS_NO_NET_CONFIG; | ||
838 | |||
839 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
840 | rv = SS_LOWER_THAN_OUTDATED; | ||
841 | |||
842 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
843 | rv = SS_IN_TRANSIENT_STATE; | ||
844 | |||
845 | if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
846 | rv = SS_IN_TRANSIENT_STATE; | ||
847 | |||
848 | /* While establishing a connection only allow cstate to change. | ||
849 | Delay/refuse role changes, detach attach etc... */ | ||
850 | if (test_bit(STATE_SENT, &mdev->flags) && | ||
851 | !(os.conn == C_WF_REPORT_PARAMS || | ||
852 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
853 | rv = SS_IN_TRANSIENT_STATE; | ||
854 | |||
855 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
856 | rv = SS_NEED_CONNECTION; | ||
857 | |||
858 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
859 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
860 | rv = SS_RESYNC_RUNNING; | ||
861 | |||
862 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
863 | os.conn < C_CONNECTED) | ||
864 | rv = SS_NEED_CONNECTION; | ||
865 | |||
866 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
867 | && os.conn < C_WF_REPORT_PARAMS) | ||
868 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
869 | |||
870 | return rv; | ||
871 | } | ||
872 | |||
873 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
874 | { | ||
875 | static const char *msg_table[] = { | ||
876 | [NO_WARNING] = "", | ||
877 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
878 | [ABORTED_RESYNC] = "Resync aborted.", | ||
879 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
880 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
881 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
882 | }; | ||
883 | |||
884 | if (warn != NO_WARNING) | ||
885 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
886 | } | ||
887 | |||
888 | /** | ||
889 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
890 | * @mdev: DRBD device. | ||
891 | * @os: old state. | ||
892 | * @ns: new state. | ||
893 | * @warn_sync_abort: | ||
894 | * | ||
895 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
896 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
897 | */ | ||
898 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, | ||
899 | union drbd_state ns, enum sanitize_state_warnings *warn) | ||
900 | { | ||
901 | enum drbd_fencing_p fp; | ||
902 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
903 | |||
904 | if (warn) | ||
905 | *warn = NO_WARNING; | ||
906 | |||
907 | fp = FP_DONT_CARE; | ||
908 | if (get_ldev(mdev)) { | ||
909 | fp = mdev->ldev->dc.fencing; | ||
910 | put_ldev(mdev); | ||
911 | } | ||
912 | |||
913 | /* Disallow Network errors to configure a device's network part */ | ||
914 | if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && | ||
915 | os.conn <= C_DISCONNECTING) | ||
916 | ns.conn = os.conn; | ||
917 | |||
918 | /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. | ||
919 | * If you try to go into some Sync* state, that shall fail (elsewhere). */ | ||
920 | if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && | ||
921 | ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) | ||
922 | ns.conn = os.conn; | ||
923 | |||
924 | /* we cannot fail (again) if we already detached */ | ||
925 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
926 | ns.disk = D_DISKLESS; | ||
927 | |||
928 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
929 | if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) | ||
930 | ns.conn = os.conn; | ||
931 | |||
932 | if (ns.conn < C_CONNECTED) { | ||
933 | ns.peer_isp = 0; | ||
934 | ns.peer = R_UNKNOWN; | ||
935 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
936 | ns.pdsk = D_UNKNOWN; | ||
937 | } | ||
938 | |||
939 | /* Clear the aftr_isp when becoming unconfigured */ | ||
940 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
941 | ns.aftr_isp = 0; | ||
942 | |||
943 | /* Abort resync if a disk fails/detaches */ | ||
944 | if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && | ||
945 | (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
946 | if (warn) | ||
947 | *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? | ||
948 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
949 | ns.conn = C_CONNECTED; | ||
950 | } | ||
951 | |||
952 | /* Connection breaks down before we finished "Negotiating" */ | ||
953 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
954 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
955 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
956 | ns.disk = mdev->new_state_tmp.disk; | ||
957 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
958 | } else { | ||
959 | if (warn) | ||
960 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
961 | ns.disk = D_DISKLESS; | ||
962 | ns.pdsk = D_UNKNOWN; | ||
963 | } | ||
964 | put_ldev(mdev); | ||
965 | } | ||
966 | |||
967 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
968 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
969 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
970 | ns.disk = D_UP_TO_DATE; | ||
971 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
972 | ns.pdsk = D_UP_TO_DATE; | ||
973 | } | ||
974 | |||
975 | /* Implications of the connection state on the disk states */ | ||
976 | disk_min = D_DISKLESS; | ||
977 | disk_max = D_UP_TO_DATE; | ||
978 | pdsk_min = D_INCONSISTENT; | ||
979 | pdsk_max = D_UNKNOWN; | ||
980 | switch ((enum drbd_conns)ns.conn) { | ||
981 | case C_WF_BITMAP_T: | ||
982 | case C_PAUSED_SYNC_T: | ||
983 | case C_STARTING_SYNC_T: | ||
984 | case C_WF_SYNC_UUID: | ||
985 | case C_BEHIND: | ||
986 | disk_min = D_INCONSISTENT; | ||
987 | disk_max = D_OUTDATED; | ||
988 | pdsk_min = D_UP_TO_DATE; | ||
989 | pdsk_max = D_UP_TO_DATE; | ||
990 | break; | ||
991 | case C_VERIFY_S: | ||
992 | case C_VERIFY_T: | ||
993 | disk_min = D_UP_TO_DATE; | ||
994 | disk_max = D_UP_TO_DATE; | ||
995 | pdsk_min = D_UP_TO_DATE; | ||
996 | pdsk_max = D_UP_TO_DATE; | ||
997 | break; | ||
998 | case C_CONNECTED: | ||
999 | disk_min = D_DISKLESS; | ||
1000 | disk_max = D_UP_TO_DATE; | ||
1001 | pdsk_min = D_DISKLESS; | ||
1002 | pdsk_max = D_UP_TO_DATE; | ||
1003 | break; | ||
1004 | case C_WF_BITMAP_S: | ||
1005 | case C_PAUSED_SYNC_S: | ||
1006 | case C_STARTING_SYNC_S: | ||
1007 | case C_AHEAD: | ||
1008 | disk_min = D_UP_TO_DATE; | ||
1009 | disk_max = D_UP_TO_DATE; | ||
1010 | pdsk_min = D_INCONSISTENT; | ||
1011 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ | ||
1012 | break; | ||
1013 | case C_SYNC_TARGET: | ||
1014 | disk_min = D_INCONSISTENT; | ||
1015 | disk_max = D_INCONSISTENT; | ||
1016 | pdsk_min = D_UP_TO_DATE; | ||
1017 | pdsk_max = D_UP_TO_DATE; | ||
1018 | break; | ||
1019 | case C_SYNC_SOURCE: | ||
1020 | disk_min = D_UP_TO_DATE; | ||
1021 | disk_max = D_UP_TO_DATE; | ||
1022 | pdsk_min = D_INCONSISTENT; | ||
1023 | pdsk_max = D_INCONSISTENT; | ||
1024 | break; | ||
1025 | case C_STANDALONE: | ||
1026 | case C_DISCONNECTING: | ||
1027 | case C_UNCONNECTED: | ||
1028 | case C_TIMEOUT: | ||
1029 | case C_BROKEN_PIPE: | ||
1030 | case C_NETWORK_FAILURE: | ||
1031 | case C_PROTOCOL_ERROR: | ||
1032 | case C_TEAR_DOWN: | ||
1033 | case C_WF_CONNECTION: | ||
1034 | case C_WF_REPORT_PARAMS: | ||
1035 | case C_MASK: | ||
1036 | break; | ||
1037 | } | ||
1038 | if (ns.disk > disk_max) | ||
1039 | ns.disk = disk_max; | ||
1040 | |||
1041 | if (ns.disk < disk_min) { | ||
1042 | if (warn) | ||
1043 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
1044 | ns.disk = disk_min; | ||
1045 | } | ||
1046 | if (ns.pdsk > pdsk_max) | ||
1047 | ns.pdsk = pdsk_max; | ||
1048 | |||
1049 | if (ns.pdsk < pdsk_min) { | ||
1050 | if (warn) | ||
1051 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
1052 | ns.pdsk = pdsk_min; | ||
1053 | } | ||
1054 | |||
1055 | if (fp == FP_STONITH && | ||
1056 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && | ||
1057 | !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) | ||
1058 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
1059 | |||
1060 | if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && | ||
1061 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && | ||
1062 | !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) | ||
1063 | ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ | ||
1064 | |||
1065 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
1066 | if (ns.conn == C_SYNC_SOURCE) | ||
1067 | ns.conn = C_PAUSED_SYNC_S; | ||
1068 | if (ns.conn == C_SYNC_TARGET) | ||
1069 | ns.conn = C_PAUSED_SYNC_T; | ||
1070 | } else { | ||
1071 | if (ns.conn == C_PAUSED_SYNC_S) | ||
1072 | ns.conn = C_SYNC_SOURCE; | ||
1073 | if (ns.conn == C_PAUSED_SYNC_T) | ||
1074 | ns.conn = C_SYNC_TARGET; | ||
1075 | } | ||
1076 | |||
1077 | return ns; | ||
1078 | } | ||
1079 | |||
1080 | /* helper for __drbd_set_state */ | ||
1081 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
1082 | { | ||
1083 | if (mdev->agreed_pro_version < 90) | ||
1084 | mdev->ov_start_sector = 0; | ||
1085 | mdev->rs_total = drbd_bm_bits(mdev); | ||
1086 | mdev->ov_position = 0; | ||
1087 | if (cs == C_VERIFY_T) { | ||
1088 | /* starting online verify from an arbitrary position | ||
1089 | * does not fit well into the existing protocol. | ||
1090 | * on C_VERIFY_T, we initialize ov_left and friends | ||
1091 | * implicitly in receive_DataRequest once the | ||
1092 | * first P_OV_REQUEST is received */ | ||
1093 | mdev->ov_start_sector = ~(sector_t)0; | ||
1094 | } else { | ||
1095 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
1096 | if (bit >= mdev->rs_total) { | ||
1097 | mdev->ov_start_sector = | ||
1098 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
1099 | mdev->rs_total = 1; | ||
1100 | } else | ||
1101 | mdev->rs_total -= bit; | ||
1102 | mdev->ov_position = mdev->ov_start_sector; | ||
1103 | } | ||
1104 | mdev->ov_left = mdev->rs_total; | ||
1105 | } | ||
1106 | |||
1107 | static void drbd_resume_al(struct drbd_conf *mdev) | ||
1108 | { | ||
1109 | if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) | ||
1110 | dev_info(DEV, "Resumed AL updates\n"); | ||
1111 | } | ||
1112 | |||
1113 | /** | ||
1114 | * __drbd_set_state() - Set a new DRBD state | ||
1115 | * @mdev: DRBD device. | ||
1116 | * @ns: new state. | ||
1117 | * @flags: Flags | ||
1118 | * @done: Optional completion, that will get completed after the after_state_ch() finished | ||
1119 | * | ||
1120 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
1121 | */ | ||
1122 | enum drbd_state_rv | ||
1123 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
1124 | enum chg_state_flags flags, struct completion *done) | ||
1125 | { | ||
1126 | union drbd_state os; | ||
1127 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1128 | enum sanitize_state_warnings ssw; | ||
1129 | struct after_state_chg_work *ascw; | ||
1130 | |||
1131 | os = mdev->state; | ||
1132 | |||
1133 | ns = sanitize_state(mdev, os, ns, &ssw); | ||
1134 | |||
1135 | if (ns.i == os.i) | ||
1136 | return SS_NOTHING_TO_DO; | ||
1137 | |||
1138 | if (!(flags & CS_HARD)) { | ||
1139 | /* pre-state-change checks ; only look at ns */ | ||
1140 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
1141 | |||
1142 | rv = is_valid_state(mdev, ns); | ||
1143 | if (rv < SS_SUCCESS) { | ||
1144 | /* If the old state was illegal as well, then let | ||
1145 | this happen...*/ | ||
1146 | |||
1147 | if (is_valid_state(mdev, os) == rv) | ||
1148 | rv = is_valid_state_transition(mdev, ns, os); | ||
1149 | } else | ||
1150 | rv = is_valid_state_transition(mdev, ns, os); | ||
1151 | } | ||
1152 | |||
1153 | if (rv < SS_SUCCESS) { | ||
1154 | if (flags & CS_VERBOSE) | ||
1155 | print_st_err(mdev, os, ns, rv); | ||
1156 | return rv; | ||
1157 | } | ||
1158 | |||
1159 | print_sanitize_warnings(mdev, ssw); | ||
1160 | |||
1161 | { | ||
1162 | char *pbp, pb[300]; | ||
1163 | pbp = pb; | ||
1164 | *pbp = 0; | ||
1165 | if (ns.role != os.role) | ||
1166 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
1167 | drbd_role_str(os.role), | ||
1168 | drbd_role_str(ns.role)); | ||
1169 | if (ns.peer != os.peer) | ||
1170 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
1171 | drbd_role_str(os.peer), | ||
1172 | drbd_role_str(ns.peer)); | ||
1173 | if (ns.conn != os.conn) | ||
1174 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
1175 | drbd_conn_str(os.conn), | ||
1176 | drbd_conn_str(ns.conn)); | ||
1177 | if (ns.disk != os.disk) | ||
1178 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
1179 | drbd_disk_str(os.disk), | ||
1180 | drbd_disk_str(ns.disk)); | ||
1181 | if (ns.pdsk != os.pdsk) | ||
1182 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
1183 | drbd_disk_str(os.pdsk), | ||
1184 | drbd_disk_str(ns.pdsk)); | ||
1185 | if (is_susp(ns) != is_susp(os)) | ||
1186 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
1187 | is_susp(os), | ||
1188 | is_susp(ns)); | ||
1189 | if (ns.aftr_isp != os.aftr_isp) | ||
1190 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
1191 | os.aftr_isp, | ||
1192 | ns.aftr_isp); | ||
1193 | if (ns.peer_isp != os.peer_isp) | ||
1194 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
1195 | os.peer_isp, | ||
1196 | ns.peer_isp); | ||
1197 | if (ns.user_isp != os.user_isp) | ||
1198 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
1199 | os.user_isp, | ||
1200 | ns.user_isp); | ||
1201 | dev_info(DEV, "%s\n", pb); | ||
1202 | } | ||
1203 | |||
1204 | /* solve the race between becoming unconfigured, | ||
1205 | * worker doing the cleanup, and | ||
1206 | * admin reconfiguring us: | ||
1207 | * on (re)configure, first set CONFIG_PENDING, | ||
1208 | * then wait for a potentially exiting worker, | ||
1209 | * start the worker, and schedule one no_op. | ||
1210 | * then proceed with configuration. | ||
1211 | */ | ||
1212 | if (ns.disk == D_DISKLESS && | ||
1213 | ns.conn == C_STANDALONE && | ||
1214 | ns.role == R_SECONDARY && | ||
1215 | !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) | ||
1216 | set_bit(DEVICE_DYING, &mdev->flags); | ||
1217 | |||
1218 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
1219 | * on the ldev here, to be sure the transition -> D_DISKLESS resp. | ||
1220 | * drbd_ldev_destroy() won't happen before our corresponding | ||
1221 | * after_state_ch works run, where we put_ldev again. */ | ||
1222 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
1223 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
1224 | atomic_inc(&mdev->local_cnt); | ||
1225 | |||
1226 | mdev->state = ns; | ||
1227 | |||
1228 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
1229 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
1230 | |||
1231 | wake_up(&mdev->misc_wait); | ||
1232 | wake_up(&mdev->state_wait); | ||
1233 | |||
1234 | /* aborted verify run. log the last position */ | ||
1235 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1236 | ns.conn < C_CONNECTED) { | ||
1237 | mdev->ov_start_sector = | ||
1238 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1239 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1240 | (unsigned long long)mdev->ov_start_sector); | ||
1241 | } | ||
1242 | |||
1243 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1244 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1245 | dev_info(DEV, "Syncer continues.\n"); | ||
1246 | mdev->rs_paused += (long)jiffies | ||
1247 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1248 | if (ns.conn == C_SYNC_TARGET) | ||
1249 | mod_timer(&mdev->resync_timer, jiffies); | ||
1250 | } | ||
1251 | |||
1252 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1253 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1254 | dev_info(DEV, "Resync suspended\n"); | ||
1255 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1256 | } | ||
1257 | |||
1258 | if (os.conn == C_CONNECTED && | ||
1259 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1260 | unsigned long now = jiffies; | ||
1261 | int i; | ||
1262 | |||
1263 | set_ov_position(mdev, ns.conn); | ||
1264 | mdev->rs_start = now; | ||
1265 | mdev->rs_last_events = 0; | ||
1266 | mdev->rs_last_sect_ev = 0; | ||
1267 | mdev->ov_last_oos_size = 0; | ||
1268 | mdev->ov_last_oos_start = 0; | ||
1269 | |||
1270 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1271 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1272 | mdev->rs_mark_time[i] = now; | ||
1273 | } | ||
1274 | |||
1275 | drbd_rs_controller_reset(mdev); | ||
1276 | |||
1277 | if (ns.conn == C_VERIFY_S) { | ||
1278 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1279 | (unsigned long long)mdev->ov_position); | ||
1280 | mod_timer(&mdev->resync_timer, jiffies); | ||
1281 | } | ||
1282 | } | ||
1283 | |||
1284 | if (get_ldev(mdev)) { | ||
1285 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1286 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1287 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1288 | |||
1289 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1290 | mdf |= MDF_CRASHED_PRIMARY; | ||
1291 | if (mdev->state.role == R_PRIMARY || | ||
1292 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1293 | mdf |= MDF_PRIMARY_IND; | ||
1294 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1295 | mdf |= MDF_CONNECTED_IND; | ||
1296 | if (mdev->state.disk > D_INCONSISTENT) | ||
1297 | mdf |= MDF_CONSISTENT; | ||
1298 | if (mdev->state.disk > D_OUTDATED) | ||
1299 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1300 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1301 | mdf |= MDF_PEER_OUT_DATED; | ||
1302 | if (mdf != mdev->ldev->md.flags) { | ||
1303 | mdev->ldev->md.flags = mdf; | ||
1304 | drbd_md_mark_dirty(mdev); | ||
1305 | } | ||
1306 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1307 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1308 | put_ldev(mdev); | ||
1309 | } | ||
1310 | |||
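The get_ldev() block above recomputes the persistent metadata flag word from scratch on every state change: clear the derived bits, re-set them from the new state, and only mark the metadata dirty if the word actually changed. A simplified stand-alone sketch of that pattern; the MDF_* values and state fields below are illustrative, not the real definitions:

#include <stdint.h>
#include <stdio.h>

#define MDF_CONSISTENT      (1u << 0)   /* values are illustrative */
#define MDF_PRIMARY_IND     (1u << 1)
#define MDF_CONNECTED_IND   (1u << 2)

enum role { R_SECONDARY, R_PRIMARY };

struct demo_state { enum role role; int disk_consistent; int connected; };

static uint32_t recompute_md_flags(uint32_t old, struct demo_state s)
{
        /* clear every derived bit, then re-set from the current state */
        uint32_t mdf = old & ~(MDF_CONSISTENT | MDF_PRIMARY_IND | MDF_CONNECTED_IND);

        if (s.role == R_PRIMARY)
                mdf |= MDF_PRIMARY_IND;
        if (s.connected)
                mdf |= MDF_CONNECTED_IND;
        if (s.disk_consistent)
                mdf |= MDF_CONSISTENT;
        return mdf;   /* caller writes metadata only if the word changed */
}

int main(void)
{
        struct demo_state s = { R_PRIMARY, 1, 0 };

        printf("mdf = 0x%x\n", recompute_md_flags(0, s));
        return 0;
}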
1311 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */ | ||
1312 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1313 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1314 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1315 | |||
1316 | /* Receiver should clean up itself */ | ||
1317 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1318 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1319 | |||
1320 | /* Now that the receiver has finished cleaning up itself, it should die */ | ||
1321 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1322 | drbd_thread_stop_nowait(&mdev->receiver); | ||
1323 | |||
1324 | /* Upon network failure, we need to restart the receiver. */ | ||
1325 | if (os.conn > C_WF_CONNECTION && | ||
1326 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1327 | drbd_thread_restart_nowait(&mdev->receiver); | ||
1328 | |||
1329 | /* Resume AL writing if we get a connection */ | ||
1330 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1331 | drbd_resume_al(mdev); | ||
1332 | |||
1333 | /* remember last connect and attach times so request_timer_fn() won't | ||
1334 | * kill newly established sessions while we are still trying to thaw | ||
1335 | * previously frozen IO */ | ||
1336 | if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) | ||
1337 | mdev->last_reconnect_jif = jiffies; | ||
1338 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1339 | ns.disk > D_NEGOTIATING) | ||
1340 | mdev->last_reattach_jif = jiffies; | ||
1341 | |||
1342 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1343 | if (ascw) { | ||
1344 | ascw->os = os; | ||
1345 | ascw->ns = ns; | ||
1346 | ascw->flags = flags; | ||
1347 | ascw->w.cb = w_after_state_ch; | ||
1348 | ascw->done = done; | ||
1349 | drbd_queue_work(&mdev->data.work, &ascw->w); | ||
1350 | } else { | ||
1351 | dev_warn(DEV, "Could not kmalloc an ascw\n"); | ||
1352 | } | ||
1353 | |||
1354 | return rv; | ||
1355 | } | ||
1356 | |||
1357 | static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) | ||
1358 | { | ||
1359 | struct after_state_chg_work *ascw = | ||
1360 | container_of(w, struct after_state_chg_work, w); | ||
1361 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1362 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1363 | D_ASSERT(ascw->done != NULL); | ||
1364 | complete(ascw->done); | ||
1365 | } | ||
1366 | kfree(ascw); | ||
1367 | |||
1368 | return 1; | ||
1369 | } | ||
1370 | |||
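w_after_state_ch() recovers its context with container_of(): the work item handed to the worker is embedded in a larger allocation that also carries the old/new state and the optional completion. A self-contained illustration of that embedding, with invented struct and field names:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work { int (*cb)(struct work *w); };

struct after_state_work {
        struct work w;
        int os, ns;          /* stands in for the old/new drbd_state */
};

static int after_state_cb(struct work *w)
{
        struct after_state_work *ascw =
                container_of(w, struct after_state_work, w);

        printf("state changed %d -> %d\n", ascw->os, ascw->ns);
        free(ascw);
        return 1;
}

int main(void)
{
        struct after_state_work *ascw = malloc(sizeof(*ascw));

        if (!ascw)
                return 1;
        ascw->w.cb = after_state_cb;
        ascw->os = 0;
        ascw->ns = 7;
        /* a real worker thread would dequeue and run this later */
        return ascw->w.cb(&ascw->w) ? 0 : 1;
}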
1371 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1372 | { | ||
1373 | if (rv) { | ||
1374 | dev_err(DEV, "Writing the bitmap failed, not starting resync.\n"); | ||
1375 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1376 | return; | ||
1377 | } | ||
1378 | |||
1379 | switch (mdev->state.conn) { | ||
1380 | case C_STARTING_SYNC_T: | ||
1381 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1382 | break; | ||
1383 | case C_STARTING_SYNC_S: | ||
1384 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1385 | break; | ||
1386 | } | ||
1387 | } | ||
1388 | |||
1389 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1390 | int (*io_fn)(struct drbd_conf *), | ||
1391 | char *why, enum bm_flag flags) | ||
1392 | { | ||
1393 | int rv; | ||
1394 | |||
1395 | D_ASSERT(current == mdev->worker.task); | ||
1396 | |||
1397 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1398 | set_bit(SUSPEND_IO, &mdev->flags); | ||
1399 | |||
1400 | drbd_bm_lock(mdev, why, flags); | ||
1401 | rv = io_fn(mdev); | ||
1402 | drbd_bm_unlock(mdev); | ||
1403 | |||
1404 | drbd_resume_io(mdev); | ||
1405 | |||
1406 | return rv; | ||
1407 | } | ||
1408 | |||
1409 | /** | ||
1410 | * after_state_ch() - Perform after state change actions that may sleep | ||
1411 | * @mdev: DRBD device. | ||
1412 | * @os: old state. | ||
1413 | * @ns: new state. | ||
1414 | * @flags: state change flags | ||
1415 | */ | ||
1416 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1417 | union drbd_state ns, enum chg_state_flags flags) | ||
1418 | { | ||
1419 | enum drbd_fencing_p fp; | ||
1420 | enum drbd_req_event what = nothing; | ||
1421 | union drbd_state nsm = (union drbd_state){ .i = -1 }; | ||
1422 | |||
1423 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1424 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1425 | if (mdev->p_uuid) | ||
1426 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1427 | } | ||
1428 | |||
1429 | fp = FP_DONT_CARE; | ||
1430 | if (get_ldev(mdev)) { | ||
1431 | fp = mdev->ldev->dc.fencing; | ||
1432 | put_ldev(mdev); | ||
1433 | } | ||
1434 | |||
1435 | /* Inform userspace about the change... */ | ||
1436 | drbd_bcast_state(mdev, ns); | ||
1437 | |||
1438 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1439 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1440 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1441 | |||
1442 | /* Here we have the actions that are performed after a | ||
1443 | state change. This function might sleep */ | ||
1444 | |||
1445 | if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) | ||
1446 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1447 | |||
1448 | nsm.i = -1; | ||
1449 | if (ns.susp_nod) { | ||
1450 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1451 | what = resend; | ||
1452 | |||
1453 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1454 | ns.disk > D_NEGOTIATING) | ||
1455 | what = restart_frozen_disk_io; | ||
1456 | |||
1457 | if (what != nothing) | ||
1458 | nsm.susp_nod = 0; | ||
1459 | } | ||
1460 | |||
1461 | if (ns.susp_fen) { | ||
1462 | /* case1: The outdate peer handler is successful: */ | ||
1463 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { | ||
1464 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
1465 | drbd_uuid_new_current(mdev); | ||
1466 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1467 | } | ||
1468 | spin_lock_irq(&mdev->req_lock); | ||
1469 | _tl_clear(mdev); | ||
1470 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); | ||
1471 | spin_unlock_irq(&mdev->req_lock); | ||
1472 | } | ||
1473 | /* case2: The connection was established again: */ | ||
1474 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { | ||
1475 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1476 | what = resend; | ||
1477 | nsm.susp_fen = 0; | ||
1478 | } | ||
1479 | } | ||
1480 | |||
1481 | if (what != nothing) { | ||
1482 | spin_lock_irq(&mdev->req_lock); | ||
1483 | _tl_restart(mdev, what); | ||
1484 | nsm.i &= mdev->state.i; | ||
1485 | _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); | ||
1486 | spin_unlock_irq(&mdev->req_lock); | ||
1487 | } | ||
1488 | |||
1489 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1490 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1491 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1492 | * which is unexpected. */ | ||
1493 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1494 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1495 | mdev->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1496 | drbd_gen_and_send_sync_uuid(mdev); | ||
1497 | put_ldev(mdev); | ||
1498 | } | ||
1499 | |||
1500 | /* Do not change the order of the if above and the two below... */ | ||
1501 | if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ | ||
1502 | /* we probably will start a resync soon. | ||
1503 | * make sure those things are properly reset. */ | ||
1504 | mdev->rs_total = 0; | ||
1505 | mdev->rs_failed = 0; | ||
1506 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1507 | drbd_rs_cancel_all(mdev); | ||
1508 | |||
1509 | drbd_send_uuids(mdev); | ||
1510 | drbd_send_state(mdev, ns); | ||
1511 | } | ||
1512 | /* No point in queuing send_bitmap if we don't have a connection | ||
1513 | * anymore, so check also the _current_ state, not only the new state | ||
1514 | * at the time this work was queued. */ | ||
1515 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1516 | mdev->state.conn == C_WF_BITMAP_S) | ||
1517 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1518 | "send_bitmap (WFBitMapS)", | ||
1519 | BM_LOCKED_TEST_ALLOWED); | ||
1520 | |||
1521 | /* Lost contact with the peer's copy of the data */ | ||
1522 | if ((os.pdsk >= D_INCONSISTENT && | ||
1523 | os.pdsk != D_UNKNOWN && | ||
1524 | os.pdsk != D_OUTDATED) | ||
1525 | && (ns.pdsk < D_INCONSISTENT || | ||
1526 | ns.pdsk == D_UNKNOWN || | ||
1527 | ns.pdsk == D_OUTDATED)) { | ||
1528 | if (get_ldev(mdev)) { | ||
1529 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1530 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1531 | if (is_susp(mdev->state)) { | ||
1532 | set_bit(NEW_CUR_UUID, &mdev->flags); | ||
1533 | } else { | ||
1534 | drbd_uuid_new_current(mdev); | ||
1535 | drbd_send_uuids(mdev); | ||
1536 | } | ||
1537 | } | ||
1538 | put_ldev(mdev); | ||
1539 | } | ||
1540 | } | ||
1541 | |||
1542 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1543 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1544 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1545 | drbd_uuid_new_current(mdev); | ||
1546 | drbd_send_uuids(mdev); | ||
1547 | } | ||
1548 | /* D_DISKLESS Peer becomes secondary */ | ||
1549 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1550 | /* We may still be Primary ourselves. | ||
1551 | * No harm done if the bitmap still changes, | ||
1552 | * redirtied pages will follow later. */ | ||
1553 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1554 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1555 | put_ldev(mdev); | ||
1556 | } | ||
1557 | |||
1558 | /* Write out all changed bits on demote. | ||
1559 | * Though, no need to do that just yet | ||
1560 | * if there is a resync going on still */ | ||
1561 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1562 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1563 | /* No changes to the bitmap expected this time, so assert that, | ||
1564 | * even though no harm was done if it did change. */ | ||
1565 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1566 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1567 | put_ldev(mdev); | ||
1568 | } | ||
1569 | |||
1570 | /* Last part of the attaching process ... */ | ||
1571 | if (ns.conn >= C_CONNECTED && | ||
1572 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1573 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1574 | drbd_send_uuids(mdev); | ||
1575 | drbd_send_state(mdev, ns); | ||
1576 | } | ||
1577 | |||
1578 | /* We want to pause/continue resync, tell peer. */ | ||
1579 | if (ns.conn >= C_CONNECTED && | ||
1580 | ((os.aftr_isp != ns.aftr_isp) || | ||
1581 | (os.user_isp != ns.user_isp))) | ||
1582 | drbd_send_state(mdev, ns); | ||
1583 | |||
1584 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1585 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1586 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1587 | suspend_other_sg(mdev); | ||
1588 | |||
1589 | /* Make sure the peer gets informed about any state | ||
1590 | changes (ISP bits) that happened while we were in WFReportParams. */ | ||
1591 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1592 | drbd_send_state(mdev, ns); | ||
1593 | |||
1594 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1595 | drbd_send_state(mdev, ns); | ||
1596 | |||
1597 | /* We are in the process of starting a full sync... */ | ||
1598 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1599 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1600 | /* no other bitmap changes expected during this phase */ | ||
1601 | drbd_queue_bitmap_io(mdev, | ||
1602 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1603 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1604 | |||
1605 | /* We are invalidating ourselves... */ | ||
1606 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1607 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1608 | /* other bitmap operation expected during this phase */ | ||
1609 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1610 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1611 | |||
1612 | /* first half of local IO error, failure to attach, | ||
1613 | * or administrative detach */ | ||
1614 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1615 | enum drbd_io_error_p eh = EP_PASS_ON; | ||
1616 | int was_io_error = 0; | ||
1617 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1618 | * our cleanup here with the transition to D_DISKLESS. | ||
1619 | * But it is still not safe to dereference ldev here, since | ||
1620 | * we might come from a failed Attach before ldev was set. */ | ||
1621 | if (mdev->ldev) { | ||
1622 | eh = mdev->ldev->dc.on_io_error; | ||
1623 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1624 | |||
1625 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1626 | drbd_khelper(mdev, "local-io-error"); | ||
1627 | |||
1628 | /* Immediately allow completion of all application IO, | ||
1629 | * that waits for completion from the local disk, | ||
1630 | * if this was a force-detach due to disk_timeout | ||
1631 | * or administrator request (drbdsetup detach --force). | ||
1632 | * Do NOT abort otherwise. | ||
1633 | * Aborting local requests may cause serious problems, | ||
1634 | * if requests are completed to upper layers already, | ||
1635 | * and then later the already submitted local bio completes. | ||
1636 | * This can cause DMA into former bio pages that meanwhile | ||
1637 | * have been re-used for other things. | ||
1638 | * So aborting local requests may cause crashes, | ||
1639 | * or even worse, silent data corruption. | ||
1640 | */ | ||
1641 | if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) | ||
1642 | tl_abort_disk_io(mdev); | ||
1643 | |||
1644 | /* current state still has to be D_FAILED, | ||
1645 | * there is only one way out: to D_DISKLESS, | ||
1646 | * and that may only happen after our put_ldev below. */ | ||
1647 | if (mdev->state.disk != D_FAILED) | ||
1648 | dev_err(DEV, | ||
1649 | "ASSERT FAILED: disk is %s during detach\n", | ||
1650 | drbd_disk_str(mdev->state.disk)); | ||
1651 | |||
1652 | if (ns.conn >= C_CONNECTED) | ||
1653 | drbd_send_state(mdev, ns); | ||
1654 | |||
1655 | drbd_rs_cancel_all(mdev); | ||
1656 | |||
1657 | /* In case we want to get something to stable storage still, | ||
1658 | * this may be the last chance. | ||
1659 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1660 | drbd_md_sync(mdev); | ||
1661 | } | ||
1662 | put_ldev(mdev); | ||
1663 | } | ||
1664 | |||
1665 | /* second half of local IO error, failure to attach, | ||
1666 | * or administrative detach, | ||
1667 | * after local_cnt references have reached zero again */ | ||
1668 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1669 | /* We must still be diskless, | ||
1670 | * re-attach has to be serialized with this! */ | ||
1671 | if (mdev->state.disk != D_DISKLESS) | ||
1672 | dev_err(DEV, | ||
1673 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1674 | drbd_disk_str(mdev->state.disk)); | ||
1675 | |||
1676 | if (ns.conn >= C_CONNECTED) | ||
1677 | drbd_send_state(mdev, ns); | ||
1678 | |||
1679 | /* corresponding get_ldev in __drbd_set_state | ||
1680 | * this may finally trigger drbd_ldev_destroy. */ | ||
1681 | put_ldev(mdev); | ||
1682 | } | ||
1683 | |||
1684 | /* Notify peer that I had a local IO error, and did not detach. */ | ||
1685 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1686 | drbd_send_state(mdev, ns); | ||
1687 | |||
1688 | /* Disks got bigger while they were detached */ | ||
1689 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1690 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1691 | if (ns.conn == C_CONNECTED) | ||
1692 | resync_after_online_grow(mdev); | ||
1693 | } | ||
1694 | |||
1695 | /* A resync finished or aborted, wake paused devices... */ | ||
1696 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1697 | (os.peer_isp && !ns.peer_isp) || | ||
1698 | (os.user_isp && !ns.user_isp)) | ||
1699 | resume_next_sg(mdev); | ||
1700 | |||
1701 | /* sync target done with resync. Explicitly notify peer, even though | ||
1702 | * it should (at least for non-empty resyncs) already know this itself. */ | ||
1703 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1704 | drbd_send_state(mdev, ns); | ||
1705 | |||
1707 | /* Wake up role changes that were delayed while the connection was being established */ | ||
1707 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { | ||
1708 | clear_bit(STATE_SENT, &mdev->flags); | ||
1709 | wake_up(&mdev->state_wait); | ||
1710 | } | ||
1711 | |||
1712 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1713 | * if the resync finished cleanly, or aborted because of peer disk | ||
1714 | * failure, or because of connection loss. | ||
1715 | * For resync aborted because of local disk failure, we cannot do | ||
1716 | * any bitmap writeout anymore. | ||
1717 | * No harm done if some bits change during this phase. | ||
1718 | */ | ||
1719 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1720 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1721 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1722 | put_ldev(mdev); | ||
1723 | } | ||
1724 | |||
1725 | /* free tl_hash if we got thawed and are C_STANDALONE */ | ||
1726 | if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) | ||
1727 | drbd_free_tl_hash(mdev); | ||
1728 | |||
1729 | /* Upon network connection, we need to start the receiver */ | ||
1730 | if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) | ||
1731 | drbd_thread_start(&mdev->receiver); | ||
1732 | |||
1733 | /* Terminate worker thread if we are unconfigured - it will be | ||
1734 | restarted as needed... */ | ||
1735 | if (ns.disk == D_DISKLESS && | ||
1736 | ns.conn == C_STANDALONE && | ||
1737 | ns.role == R_SECONDARY) { | ||
1738 | if (os.aftr_isp != ns.aftr_isp) | ||
1739 | resume_next_sg(mdev); | ||
1740 | /* set in __drbd_set_state, unless CONFIG_PENDING was set */ | ||
1741 | if (test_bit(DEVICE_DYING, &mdev->flags)) | ||
1742 | drbd_thread_stop_nowait(&mdev->worker); | ||
1743 | } | 322 | } |
1744 | 323 | spin_unlock_irq(&tconn->req_lock); | |
1745 | drbd_md_sync(mdev); | ||
1746 | } | 324 | } |
1747 | 325 | ||
1748 | |||
1749 | static int drbd_thread_setup(void *arg) | 326 | static int drbd_thread_setup(void *arg) |
1750 | { | 327 | { |
1751 | struct drbd_thread *thi = (struct drbd_thread *) arg; | 328 | struct drbd_thread *thi = (struct drbd_thread *) arg; |
1752 | struct drbd_conf *mdev = thi->mdev; | 329 | struct drbd_tconn *tconn = thi->tconn; |
1753 | unsigned long flags; | 330 | unsigned long flags; |
1754 | int retval; | 331 | int retval; |
1755 | 332 | ||
333 | snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", | ||
334 | thi->name[0], thi->tconn->name); | ||
335 | |||
1756 | restart: | 336 | restart: |
1757 | retval = thi->function(thi); | 337 | retval = thi->function(thi); |
1758 | 338 | ||
1759 | spin_lock_irqsave(&thi->t_lock, flags); | 339 | spin_lock_irqsave(&thi->t_lock, flags); |
1760 | 340 | ||
1761 | /* if the receiver has been "Exiting", the last thing it did | 341 | /* if the receiver has been "EXITING", the last thing it did |
1762 | * was set the conn state to "StandAlone", | 342 | * was set the conn state to "StandAlone", |
1763 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, | 343 | * if now a re-connect request comes in, conn state goes C_UNCONNECTED, |
1764 | * and receiver thread will be "started". | 344 | * and receiver thread will be "started". |
1765 | * drbd_thread_start needs to set "Restarting" in that case. | 345 | * drbd_thread_start needs to set "RESTARTING" in that case. |
1766 | * t_state check and assignment needs to be within the same spinlock, | 346 | * t_state check and assignment needs to be within the same spinlock, |
1767 | * so either thread_start sees Exiting, and can remap to Restarting, | 347 | * so either thread_start sees EXITING, and can remap to RESTARTING, |
1768 | * or thread_start see None, and can proceed as normal. | 348 | * or thread_start see NONE, and can proceed as normal. |
1769 | */ | 349 | */ |
1770 | 350 | ||
1771 | if (thi->t_state == Restarting) { | 351 | if (thi->t_state == RESTARTING) { |
1772 | dev_info(DEV, "Restarting %s\n", current->comm); | 352 | conn_info(tconn, "Restarting %s thread\n", thi->name); |
1773 | thi->t_state = Running; | 353 | thi->t_state = RUNNING; |
1774 | spin_unlock_irqrestore(&thi->t_lock, flags); | 354 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1775 | goto restart; | 355 | goto restart; |
1776 | } | 356 | } |
1777 | 357 | ||
1778 | thi->task = NULL; | 358 | thi->task = NULL; |
1779 | thi->t_state = None; | 359 | thi->t_state = NONE; |
1780 | smp_mb(); | 360 | smp_mb(); |
1781 | complete(&thi->stop); | 361 | complete_all(&thi->stop); |
1782 | spin_unlock_irqrestore(&thi->t_lock, flags); | 362 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1783 | 363 | ||
1784 | dev_info(DEV, "Terminating %s\n", current->comm); | 364 | conn_info(tconn, "Terminating %s\n", current->comm); |
1785 | 365 | ||
1786 | /* Release mod reference taken when thread was started */ | 366 | /* Release mod reference taken when thread was started */ |
367 | |||
368 | kref_put(&tconn->kref, &conn_destroy); | ||
1787 | module_put(THIS_MODULE); | 369 | module_put(THIS_MODULE); |
1788 | return retval; | 370 | return retval; |
1789 | } | 371 | } |
1790 | 372 | ||
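The exit path of drbd_thread_setup() checks t_state under the thread's lock so a concurrent stop/start resolves cleanly: if RESTARTING was requested while the thread function was returning, it loops instead of terminating. A pthread-based sketch of just that hand-off (everything here is simplified; the kernel code also handles completions, signals and module references):

#include <pthread.h>
#include <stdio.h>

enum thread_state { NONE, RUNNING, EXITING, RESTARTING };

struct demo_thread {
        pthread_mutex_t lock;
        enum thread_state t_state;
        int (*function)(struct demo_thread *);
};

static void *demo_thread_setup(void *arg)
{
        struct demo_thread *thi = arg;

restart:
        thi->function(thi);

        pthread_mutex_lock(&thi->lock);
        if (thi->t_state == RESTARTING) {
                /* a restart was requested while we were returning */
                thi->t_state = RUNNING;
                pthread_mutex_unlock(&thi->lock);
                goto restart;
        }
        thi->t_state = NONE;
        pthread_mutex_unlock(&thi->lock);
        return NULL;
}

static int demo_worker(struct demo_thread *thi)
{
        (void)thi;
        printf("worker ran once\n");
        return 0;
}

int main(void)
{
        struct demo_thread thi = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .t_state = RUNNING,
                .function = demo_worker,
        };
        pthread_t t;

        pthread_create(&t, NULL, demo_thread_setup, &thi);
        pthread_join(t, NULL);
        return 0;
}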
1791 | static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, | 373 | static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi, |
1792 | int (*func) (struct drbd_thread *)) | 374 | int (*func) (struct drbd_thread *), char *name) |
1793 | { | 375 | { |
1794 | spin_lock_init(&thi->t_lock); | 376 | spin_lock_init(&thi->t_lock); |
1795 | thi->task = NULL; | 377 | thi->task = NULL; |
1796 | thi->t_state = None; | 378 | thi->t_state = NONE; |
1797 | thi->function = func; | 379 | thi->function = func; |
1798 | thi->mdev = mdev; | 380 | thi->tconn = tconn; |
381 | strncpy(thi->name, name, ARRAY_SIZE(thi->name)); | ||
1799 | } | 382 | } |
1800 | 383 | ||
1801 | int drbd_thread_start(struct drbd_thread *thi) | 384 | int drbd_thread_start(struct drbd_thread *thi) |
1802 | { | 385 | { |
1803 | struct drbd_conf *mdev = thi->mdev; | 386 | struct drbd_tconn *tconn = thi->tconn; |
1804 | struct task_struct *nt; | 387 | struct task_struct *nt; |
1805 | unsigned long flags; | 388 | unsigned long flags; |
1806 | 389 | ||
1807 | const char *me = | ||
1808 | thi == &mdev->receiver ? "receiver" : | ||
1809 | thi == &mdev->asender ? "asender" : | ||
1810 | thi == &mdev->worker ? "worker" : "NONSENSE"; | ||
1811 | |||
1812 | /* is used from state engine doing drbd_thread_stop_nowait, | 390 | /* is used from state engine doing drbd_thread_stop_nowait, |
1813 | * while holding the req lock irqsave */ | 391 | * while holding the req lock irqsave */ |
1814 | spin_lock_irqsave(&thi->t_lock, flags); | 392 | spin_lock_irqsave(&thi->t_lock, flags); |
1815 | 393 | ||
1816 | switch (thi->t_state) { | 394 | switch (thi->t_state) { |
1817 | case None: | 395 | case NONE: |
1818 | dev_info(DEV, "Starting %s thread (from %s [%d])\n", | 396 | conn_info(tconn, "Starting %s thread (from %s [%d])\n", |
1819 | me, current->comm, current->pid); | 397 | thi->name, current->comm, current->pid); |
1820 | 398 | ||
1821 | /* Get ref on module for thread - this is released when thread exits */ | 399 | /* Get ref on module for thread - this is released when thread exits */ |
1822 | if (!try_module_get(THIS_MODULE)) { | 400 | if (!try_module_get(THIS_MODULE)) { |
1823 | dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); | 401 | conn_err(tconn, "Failed to get module reference in drbd_thread_start\n"); |
1824 | spin_unlock_irqrestore(&thi->t_lock, flags); | 402 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1825 | return false; | 403 | return false; |
1826 | } | 404 | } |
1827 | 405 | ||
406 | kref_get(&thi->tconn->kref); | ||
407 | |||
1828 | init_completion(&thi->stop); | 408 | init_completion(&thi->stop); |
1829 | D_ASSERT(thi->task == NULL); | ||
1830 | thi->reset_cpu_mask = 1; | 409 | thi->reset_cpu_mask = 1; |
1831 | thi->t_state = Running; | 410 | thi->t_state = RUNNING; |
1832 | spin_unlock_irqrestore(&thi->t_lock, flags); | 411 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1833 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ | 412 | flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ |
1834 | 413 | ||
1835 | nt = kthread_create(drbd_thread_setup, (void *) thi, | 414 | nt = kthread_create(drbd_thread_setup, (void *) thi, |
1836 | "drbd%d_%s", mdev_to_minor(mdev), me); | 415 | "drbd_%c_%s", thi->name[0], thi->tconn->name); |
1837 | 416 | ||
1838 | if (IS_ERR(nt)) { | 417 | if (IS_ERR(nt)) { |
1839 | dev_err(DEV, "Couldn't start thread\n"); | 418 | conn_err(tconn, "Couldn't start thread\n"); |
1840 | 419 | ||
420 | kref_put(&tconn->kref, &conn_destroy); | ||
1841 | module_put(THIS_MODULE); | 421 | module_put(THIS_MODULE); |
1842 | return false; | 422 | return false; |
1843 | } | 423 | } |
1844 | spin_lock_irqsave(&thi->t_lock, flags); | 424 | spin_lock_irqsave(&thi->t_lock, flags); |
1845 | thi->task = nt; | 425 | thi->task = nt; |
1846 | thi->t_state = Running; | 426 | thi->t_state = RUNNING; |
1847 | spin_unlock_irqrestore(&thi->t_lock, flags); | 427 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1848 | wake_up_process(nt); | 428 | wake_up_process(nt); |
1849 | break; | 429 | break; |
1850 | case Exiting: | 430 | case EXITING: |
1851 | thi->t_state = Restarting; | 431 | thi->t_state = RESTARTING; |
1852 | dev_info(DEV, "Restarting %s thread (from %s [%d])\n", | 432 | conn_info(tconn, "Restarting %s thread (from %s [%d])\n", |
1853 | me, current->comm, current->pid); | 433 | thi->name, current->comm, current->pid); |
1854 | /* fall through */ | 434 | /* fall through */ |
1855 | case Running: | 435 | case RUNNING: |
1856 | case Restarting: | 436 | case RESTARTING: |
1857 | default: | 437 | default: |
1858 | spin_unlock_irqrestore(&thi->t_lock, flags); | 438 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1859 | break; | 439 | break; |
@@ -1867,12 +447,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1867 | { | 447 | { |
1868 | unsigned long flags; | 448 | unsigned long flags; |
1869 | 449 | ||
1870 | enum drbd_thread_state ns = restart ? Restarting : Exiting; | 450 | enum drbd_thread_state ns = restart ? RESTARTING : EXITING; |
1871 | 451 | ||
1872 | /* may be called from state engine, holding the req lock irqsave */ | 452 | /* may be called from state engine, holding the req lock irqsave */ |
1873 | spin_lock_irqsave(&thi->t_lock, flags); | 453 | spin_lock_irqsave(&thi->t_lock, flags); |
1874 | 454 | ||
1875 | if (thi->t_state == None) { | 455 | if (thi->t_state == NONE) { |
1876 | spin_unlock_irqrestore(&thi->t_lock, flags); | 456 | spin_unlock_irqrestore(&thi->t_lock, flags); |
1877 | if (restart) | 457 | if (restart) |
1878 | drbd_thread_start(thi); | 458 | drbd_thread_start(thi); |
@@ -1890,7 +470,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1890 | init_completion(&thi->stop); | 470 | init_completion(&thi->stop); |
1891 | if (thi->task != current) | 471 | if (thi->task != current) |
1892 | force_sig(DRBD_SIGKILL, thi->task); | 472 | force_sig(DRBD_SIGKILL, thi->task); |
1893 | |||
1894 | } | 473 | } |
1895 | 474 | ||
1896 | spin_unlock_irqrestore(&thi->t_lock, flags); | 475 | spin_unlock_irqrestore(&thi->t_lock, flags); |
@@ -1899,6 +478,35 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1899 | wait_for_completion(&thi->stop); | 478 | wait_for_completion(&thi->stop); |
1900 | } | 479 | } |
1901 | 480 | ||
481 | static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task) | ||
482 | { | ||
483 | struct drbd_thread *thi = | ||
484 | task == tconn->receiver.task ? &tconn->receiver : | ||
485 | task == tconn->asender.task ? &tconn->asender : | ||
486 | task == tconn->worker.task ? &tconn->worker : NULL; | ||
487 | |||
488 | return thi; | ||
489 | } | ||
490 | |||
491 | char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task) | ||
492 | { | ||
493 | struct drbd_thread *thi = drbd_task_to_thread(tconn, task); | ||
494 | return thi ? thi->name : task->comm; | ||
495 | } | ||
496 | |||
497 | int conn_lowest_minor(struct drbd_tconn *tconn) | ||
498 | { | ||
499 | struct drbd_conf *mdev; | ||
500 | int vnr = 0, m; | ||
501 | |||
502 | rcu_read_lock(); | ||
503 | mdev = idr_get_next(&tconn->volumes, &vnr); | ||
504 | m = mdev ? mdev_to_minor(mdev) : -1; | ||
505 | rcu_read_unlock(); | ||
506 | |||
507 | return m; | ||
508 | } | ||
509 | |||
1902 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
1903 | /** | 511 | /** |
1904 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs | 512 | * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs |
@@ -1907,238 +515,345 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) | |||
1907 | * Forces all threads of a device onto the same CPU. This is beneficial for | 515 | * Forces all threads of a device onto the same CPU. This is beneficial for |
1908 | * DRBD's performance. May be overwritten by user's configuration. | 516 | * DRBD's performance. May be overwritten by user's configuration. |
1909 | */ | 517 | */ |
1910 | void drbd_calc_cpu_mask(struct drbd_conf *mdev) | 518 | void drbd_calc_cpu_mask(struct drbd_tconn *tconn) |
1911 | { | 519 | { |
1912 | int ord, cpu; | 520 | int ord, cpu; |
1913 | 521 | ||
1914 | /* user override. */ | 522 | /* user override. */ |
1915 | if (cpumask_weight(mdev->cpu_mask)) | 523 | if (cpumask_weight(tconn->cpu_mask)) |
1916 | return; | 524 | return; |
1917 | 525 | ||
1918 | ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); | 526 | ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask); |
1919 | for_each_online_cpu(cpu) { | 527 | for_each_online_cpu(cpu) { |
1920 | if (ord-- == 0) { | 528 | if (ord-- == 0) { |
1921 | cpumask_set_cpu(cpu, mdev->cpu_mask); | 529 | cpumask_set_cpu(cpu, tconn->cpu_mask); |
1922 | return; | 530 | return; |
1923 | } | 531 | } |
1924 | } | 532 | } |
1925 | /* should not be reached */ | 533 | /* should not be reached */ |
1926 | cpumask_setall(mdev->cpu_mask); | 534 | cpumask_setall(tconn->cpu_mask); |
1927 | } | 535 | } |
1928 | 536 | ||
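drbd_calc_cpu_mask() above spreads devices (now connections) over CPUs by taking the lowest minor modulo the number of online CPUs and pinning all of the connection's threads to that one CPU. Reduced to plain arithmetic, without the cpumask API, the selection looks like this:

#include <stdio.h>

/* ord-th online CPU, like conn_lowest_minor() % cpumask_weight(...) above */
static int pick_cpu(int lowest_minor, int n_online_cpus)
{
        return lowest_minor % n_online_cpus;
}

int main(void)
{
        int cpus = 4;

        for (int minor = 0; minor < 6; minor++)
                printf("minor %d -> cpu %d\n", minor, pick_cpu(minor, cpus));
        return 0;
}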
1929 | /** | 537 | /** |
1930 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread | 538 | * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread |
1931 | * @mdev: DRBD device. | 539 | * @mdev: DRBD device. |
540 | * @thi: drbd_thread object | ||
1932 | * | 541 | * |
1933 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die | 542 | * call in the "main loop" of _all_ threads, no need for any mutex, current won't die |
1934 | * prematurely. | 543 | * prematurely. |
1935 | */ | 544 | */ |
1936 | void drbd_thread_current_set_cpu(struct drbd_conf *mdev) | 545 | void drbd_thread_current_set_cpu(struct drbd_thread *thi) |
1937 | { | 546 | { |
1938 | struct task_struct *p = current; | 547 | struct task_struct *p = current; |
1939 | struct drbd_thread *thi = | 548 | |
1940 | p == mdev->asender.task ? &mdev->asender : | ||
1941 | p == mdev->receiver.task ? &mdev->receiver : | ||
1942 | p == mdev->worker.task ? &mdev->worker : | ||
1943 | NULL; | ||
1944 | ERR_IF(thi == NULL) | ||
1945 | return; | ||
1946 | if (!thi->reset_cpu_mask) | 549 | if (!thi->reset_cpu_mask) |
1947 | return; | 550 | return; |
1948 | thi->reset_cpu_mask = 0; | 551 | thi->reset_cpu_mask = 0; |
1949 | set_cpus_allowed_ptr(p, mdev->cpu_mask); | 552 | set_cpus_allowed_ptr(p, thi->tconn->cpu_mask); |
1950 | } | 553 | } |
1951 | #endif | 554 | #endif |
1952 | 555 | ||
1953 | /* the appropriate socket mutex must be held already */ | 556 | /** |
1954 | int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, | 557 | * drbd_header_size - size of a packet header |
1955 | enum drbd_packets cmd, struct p_header80 *h, | 558 | * |
1956 | size_t size, unsigned msg_flags) | 559 | * The header size is a multiple of 8, so any payload following the header is |
560 | * word aligned on 64-bit architectures. (The bitmap send and receive code | ||
561 | * relies on this.) | ||
562 | */ | ||
563 | unsigned int drbd_header_size(struct drbd_tconn *tconn) | ||
1957 | { | 564 | { |
1958 | int sent, ok; | 565 | if (tconn->agreed_pro_version >= 100) { |
566 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); | ||
567 | return sizeof(struct p_header100); | ||
568 | } else { | ||
569 | BUILD_BUG_ON(sizeof(struct p_header80) != | ||
570 | sizeof(struct p_header95)); | ||
571 | BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); | ||
572 | return sizeof(struct p_header80); | ||
573 | } | ||
574 | } | ||
1959 | 575 | ||
1960 | ERR_IF(!h) return false; | 576 | static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) |
1961 | ERR_IF(!size) return false; | 577 | { |
578 | h->magic = cpu_to_be32(DRBD_MAGIC); | ||
579 | h->command = cpu_to_be16(cmd); | ||
580 | h->length = cpu_to_be16(size); | ||
581 | return sizeof(struct p_header80); | ||
582 | } | ||
1962 | 583 | ||
1963 | h->magic = BE_DRBD_MAGIC; | 584 | static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) |
585 | { | ||
586 | h->magic = cpu_to_be16(DRBD_MAGIC_BIG); | ||
1964 | h->command = cpu_to_be16(cmd); | 587 | h->command = cpu_to_be16(cmd); |
1965 | h->length = cpu_to_be16(size-sizeof(struct p_header80)); | 588 | h->length = cpu_to_be32(size); |
589 | return sizeof(struct p_header95); | ||
590 | } | ||
1966 | 591 | ||
1967 | sent = drbd_send(mdev, sock, h, size, msg_flags); | 592 | static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, |
593 | int size, int vnr) | ||
594 | { | ||
595 | h->magic = cpu_to_be32(DRBD_MAGIC_100); | ||
596 | h->volume = cpu_to_be16(vnr); | ||
597 | h->command = cpu_to_be16(cmd); | ||
598 | h->length = cpu_to_be32(size); | ||
599 | h->pad = 0; | ||
600 | return sizeof(struct p_header100); | ||
601 | } | ||
1968 | 602 | ||
1969 | ok = (sent == size); | 603 | static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr, |
1970 | if (!ok && !signal_pending(current)) | 604 | void *buffer, enum drbd_packet cmd, int size) |
1971 | dev_warn(DEV, "short sent %s size=%d sent=%d\n", | 605 | { |
1972 | cmdname(cmd), (int)size, sent); | 606 | if (tconn->agreed_pro_version >= 100) |
1973 | return ok; | 607 | return prepare_header100(buffer, cmd, size, vnr); |
608 | else if (tconn->agreed_pro_version >= 95 && | ||
609 | size > DRBD_MAX_SIZE_H80_PACKET) | ||
610 | return prepare_header95(buffer, cmd, size); | ||
611 | else | ||
612 | return prepare_header80(buffer, cmd, size); | ||
1974 | } | 613 | } |
1975 | 614 | ||
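prepare_header() above picks one of three wire-header layouts from the agreed protocol version; all of them are multiples of 8 bytes so the payload stays 64-bit aligned. A user-space sketch of two of those layouts and the dispatch; the magic numbers are placeholders, the struct names are invented, and the h95 variant is omitted for brevity:

#include <arpa/inet.h>   /* htons/htonl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_MAGIC_80   0x11111111u   /* placeholder magics, not DRBD's */
#define DEMO_MAGIC_100  0x22222222u

struct __attribute__((packed)) hdr80 {
        uint32_t magic;
        uint16_t command;
        uint16_t length;
};

struct __attribute__((packed)) hdr100 {
        uint32_t magic;
        uint16_t volume;
        uint16_t command;
        uint32_t length;
        uint32_t pad;
};

/* echo of the BUILD_BUG_ON alignment checks above */
_Static_assert(sizeof(struct hdr80) % 8 == 0, "header not 8-byte aligned");
_Static_assert(sizeof(struct hdr100) % 8 == 0, "header not 8-byte aligned");

static unsigned int demo_prepare_header(void *buf, int pro_version, int vnr,
                                        uint16_t cmd, uint32_t size)
{
        if (pro_version >= 100) {
                struct hdr100 h = {
                        .magic   = htonl(DEMO_MAGIC_100),
                        .volume  = htons((uint16_t)vnr),
                        .command = htons(cmd),
                        .length  = htonl(size),
                        .pad     = 0,
                };
                memcpy(buf, &h, sizeof(h));
                return sizeof(h);
        } else {
                struct hdr80 h = {
                        .magic   = htonl(DEMO_MAGIC_80),
                        .command = htons(cmd),
                        .length  = htons((uint16_t)size),
                };
                memcpy(buf, &h, sizeof(h));
                return sizeof(h);
        }
}

int main(void)
{
        unsigned char buf[sizeof(struct hdr100)];

        printf("v100 header: %u bytes\n", demo_prepare_header(buf, 100, 0, 2, 32));
        printf("v80  header: %u bytes\n", demo_prepare_header(buf, 95, 0, 2, 32));
        return 0;
}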
1976 | /* don't pass the socket. we may only look at it | 615 | static void *__conn_prepare_command(struct drbd_tconn *tconn, |
1977 | * when we hold the appropriate socket mutex. | 616 | struct drbd_socket *sock) |
1978 | */ | ||
1979 | int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, | ||
1980 | enum drbd_packets cmd, struct p_header80 *h, size_t size) | ||
1981 | { | 617 | { |
1982 | int ok = 0; | 618 | if (!sock->socket) |
1983 | struct socket *sock; | 619 | return NULL; |
620 | return sock->sbuf + drbd_header_size(tconn); | ||
621 | } | ||
1984 | 622 | ||
1985 | if (use_data_socket) { | 623 | void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock) |
1986 | mutex_lock(&mdev->data.mutex); | 624 | { |
1987 | sock = mdev->data.socket; | 625 | void *p; |
1988 | } else { | ||
1989 | mutex_lock(&mdev->meta.mutex); | ||
1990 | sock = mdev->meta.socket; | ||
1991 | } | ||
1992 | 626 | ||
1993 | /* drbd_disconnect() could have called drbd_free_sock() | 627 | mutex_lock(&sock->mutex); |
1994 | * while we were waiting in down()... */ | 628 | p = __conn_prepare_command(tconn, sock); |
1995 | if (likely(sock != NULL)) | 629 | if (!p) |
1996 | ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); | 630 | mutex_unlock(&sock->mutex); |
1997 | 631 | ||
1998 | if (use_data_socket) | 632 | return p; |
1999 | mutex_unlock(&mdev->data.mutex); | ||
2000 | else | ||
2001 | mutex_unlock(&mdev->meta.mutex); | ||
2002 | return ok; | ||
2003 | } | 633 | } |
2004 | 634 | ||
2005 | int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, | 635 | void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock) |
2006 | size_t size) | ||
2007 | { | 636 | { |
2008 | struct p_header80 h; | 637 | return conn_prepare_command(mdev->tconn, sock); |
2009 | int ok; | 638 | } |
2010 | 639 | ||
2011 | h.magic = BE_DRBD_MAGIC; | 640 | static int __send_command(struct drbd_tconn *tconn, int vnr, |
2012 | h.command = cpu_to_be16(cmd); | 641 | struct drbd_socket *sock, enum drbd_packet cmd, |
2013 | h.length = cpu_to_be16(size); | 642 | unsigned int header_size, void *data, |
643 | unsigned int size) | ||
644 | { | ||
645 | int msg_flags; | ||
646 | int err; | ||
2014 | 647 | ||
2015 | if (!drbd_get_data_sock(mdev)) | 648 | /* |
2016 | return 0; | 649 | * Called with @data == NULL and the size of the data blocks in @size |
650 | * for commands that send data blocks. For those commands, omit the | ||
651 | * MSG_MORE flag: this will increase the likelihood that data blocks | ||
652 | * which are page aligned on the sender will end up page aligned on the | ||
653 | * receiver. | ||
654 | */ | ||
655 | msg_flags = data ? MSG_MORE : 0; | ||
656 | |||
657 | header_size += prepare_header(tconn, vnr, sock->sbuf, cmd, | ||
658 | header_size + size); | ||
659 | err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size, | ||
660 | msg_flags); | ||
661 | if (data && !err) | ||
662 | err = drbd_send_all(tconn, sock->socket, data, size, 0); | ||
663 | return err; | ||
664 | } | ||
2017 | 665 | ||
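__send_command() sends the header with MSG_MORE only when a separate payload follows, as the comment above explains, so that page-aligned payloads stay page aligned on the receiving side. A minimal POSIX-sockets sketch of that header-then-payload send, exercised over a socketpair for the demo; function names are mine and error handling is limited to EINTR and short sends:

#include <sys/types.h>
#include <sys/socket.h>
#include <stddef.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int send_all(int fd, const void *buf, size_t len, int flags)
{
        const char *p = buf;

        while (len) {
                ssize_t n = send(fd, p, len, flags);

                if (n < 0) {
                        if (errno == EINTR)
                                continue;
                        return -errno;
                }
                p += n;
                len -= (size_t)n;
        }
        return 0;
}

static int send_packet(int fd, const void *hdr, size_t hdr_len,
                       const void *data, size_t data_len)
{
        /* MSG_MORE on the header only when a payload will follow */
        int err = send_all(fd, hdr, hdr_len, data ? MSG_MORE : 0);

        if (!err && data)
                err = send_all(fd, data, data_len, 0);
        return err;
}

int main(void)
{
        int sv[2];
        char hdr[8] = "HDR", payload[16] = "payload", rx[32];

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
                return 1;
        if (send_packet(sv[0], hdr, sizeof(hdr), payload, sizeof(payload)))
                return 1;
        printf("received %zd bytes\n", read(sv[1], rx, sizeof(rx)));
        close(sv[0]);
        close(sv[1]);
        return 0;
}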
2018 | ok = (sizeof(h) == | 666 | static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
2019 | drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); | 667 | enum drbd_packet cmd, unsigned int header_size, |
2020 | ok = ok && (size == | 668 | void *data, unsigned int size) |
2021 | drbd_send(mdev, mdev->data.socket, data, size, 0)); | 669 | { |
670 | return __send_command(tconn, 0, sock, cmd, header_size, data, size); | ||
671 | } | ||
2022 | 672 | ||
2023 | drbd_put_data_sock(mdev); | 673 | int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, |
674 | enum drbd_packet cmd, unsigned int header_size, | ||
675 | void *data, unsigned int size) | ||
676 | { | ||
677 | int err; | ||
2024 | 678 | ||
2025 | return ok; | 679 | err = __conn_send_command(tconn, sock, cmd, header_size, data, size); |
680 | mutex_unlock(&sock->mutex); | ||
681 | return err; | ||
2026 | } | 682 | } |
2027 | 683 | ||
2028 | int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) | 684 | int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock, |
685 | enum drbd_packet cmd, unsigned int header_size, | ||
686 | void *data, unsigned int size) | ||
2029 | { | 687 | { |
688 | int err; | ||
689 | |||
690 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size, | ||
691 | data, size); | ||
692 | mutex_unlock(&sock->mutex); | ||
693 | return err; | ||
694 | } | ||
695 | |||
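The new *_prepare_command()/*_send_command() pairs above pass lock ownership between calls: prepare takes the per-socket mutex and returns a pointer into the pre-allocated send buffer, send transmits it and drops the mutex. A tiny pthread model of that contract; the types and the "send" below are stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct demo_socket {
        pthread_mutex_t mutex;
        char sbuf[128];      /* pre-allocated send buffer */
};

static void *demo_prepare_command(struct demo_socket *sock)
{
        pthread_mutex_lock(&sock->mutex);
        return sock->sbuf;   /* caller now owns both the buffer and the lock */
}

static int demo_send_command(struct demo_socket *sock, size_t len)
{
        printf("sending %zu bytes: %s\n", len, sock->sbuf);  /* stand-in for the socket write */
        pthread_mutex_unlock(&sock->mutex);
        return 0;
}

int main(void)
{
        struct demo_socket sock = { .mutex = PTHREAD_MUTEX_INITIALIZER };
        char *p = demo_prepare_command(&sock);

        strcpy(p, "P_PING");
        return demo_send_command(&sock, strlen(p) + 1);
}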
696 | int drbd_send_ping(struct drbd_tconn *tconn) | ||
697 | { | ||
698 | struct drbd_socket *sock; | ||
699 | |||
700 | sock = &tconn->meta; | ||
701 | if (!conn_prepare_command(tconn, sock)) | ||
702 | return -EIO; | ||
703 | return conn_send_command(tconn, sock, P_PING, 0, NULL, 0); | ||
704 | } | ||
705 | |||
706 | int drbd_send_ping_ack(struct drbd_tconn *tconn) | ||
707 | { | ||
708 | struct drbd_socket *sock; | ||
709 | |||
710 | sock = &tconn->meta; | ||
711 | if (!conn_prepare_command(tconn, sock)) | ||
712 | return -EIO; | ||
713 | return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0); | ||
714 | } | ||
715 | |||
716 | int drbd_send_sync_param(struct drbd_conf *mdev) | ||
717 | { | ||
718 | struct drbd_socket *sock; | ||
2030 | struct p_rs_param_95 *p; | 719 | struct p_rs_param_95 *p; |
2031 | struct socket *sock; | 720 | int size; |
2032 | int size, rv; | 721 | const int apv = mdev->tconn->agreed_pro_version; |
2033 | const int apv = mdev->agreed_pro_version; | 722 | enum drbd_packet cmd; |
723 | struct net_conf *nc; | ||
724 | struct disk_conf *dc; | ||
725 | |||
726 | sock = &mdev->tconn->data; | ||
727 | p = drbd_prepare_command(mdev, sock); | ||
728 | if (!p) | ||
729 | return -EIO; | ||
730 | |||
731 | rcu_read_lock(); | ||
732 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2034 | 733 | ||
2035 | size = apv <= 87 ? sizeof(struct p_rs_param) | 734 | size = apv <= 87 ? sizeof(struct p_rs_param) |
2036 | : apv == 88 ? sizeof(struct p_rs_param) | 735 | : apv == 88 ? sizeof(struct p_rs_param) |
2037 | + strlen(mdev->sync_conf.verify_alg) + 1 | 736 | + strlen(nc->verify_alg) + 1 |
2038 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 737 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2039 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 738 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2040 | 739 | ||
2041 | /* used from admin command context and receiver/worker context. | 740 | cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; |
2042 | * to avoid kmalloc, grab the socket right here, | ||
2043 | * then use the pre-allocated sbuf there */ | ||
2044 | mutex_lock(&mdev->data.mutex); | ||
2045 | sock = mdev->data.socket; | ||
2046 | |||
2047 | if (likely(sock != NULL)) { | ||
2048 | enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; | ||
2049 | |||
2050 | p = &mdev->data.sbuf.rs_param_95; | ||
2051 | 741 | ||
2052 | /* initialize verify_alg and csums_alg */ | 742 | /* initialize verify_alg and csums_alg */ |
2053 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | 743 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); |
2054 | 744 | ||
2055 | p->rate = cpu_to_be32(sc->rate); | 745 | if (get_ldev(mdev)) { |
2056 | p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); | 746 | dc = rcu_dereference(mdev->ldev->disk_conf); |
2057 | p->c_delay_target = cpu_to_be32(sc->c_delay_target); | 747 | p->resync_rate = cpu_to_be32(dc->resync_rate); |
2058 | p->c_fill_target = cpu_to_be32(sc->c_fill_target); | 748 | p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); |
2059 | p->c_max_rate = cpu_to_be32(sc->c_max_rate); | 749 | p->c_delay_target = cpu_to_be32(dc->c_delay_target); |
2060 | 750 | p->c_fill_target = cpu_to_be32(dc->c_fill_target); | |
2061 | if (apv >= 88) | 751 | p->c_max_rate = cpu_to_be32(dc->c_max_rate); |
2062 | strcpy(p->verify_alg, mdev->sync_conf.verify_alg); | 752 | put_ldev(mdev); |
2063 | if (apv >= 89) | 753 | } else { |
2064 | strcpy(p->csums_alg, mdev->sync_conf.csums_alg); | 754 | p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); |
2065 | 755 | p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); | |
2066 | rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); | 756 | p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); |
2067 | } else | 757 | p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); |
2068 | rv = 0; /* not ok */ | 758 | p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); |
759 | } | ||
2069 | 760 | ||
2070 | mutex_unlock(&mdev->data.mutex); | 761 | if (apv >= 88) |
762 | strcpy(p->verify_alg, nc->verify_alg); | ||
763 | if (apv >= 89) | ||
764 | strcpy(p->csums_alg, nc->csums_alg); | ||
765 | rcu_read_unlock(); | ||
2071 | 766 | ||
2072 | return rv; | 767 | return drbd_send_command(mdev, sock, cmd, size, NULL, 0); |
2073 | } | 768 | } |
2074 | 769 | ||
2075 | int drbd_send_protocol(struct drbd_conf *mdev) | 770 | int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd) |
2076 | { | 771 | { |
772 | struct drbd_socket *sock; | ||
2077 | struct p_protocol *p; | 773 | struct p_protocol *p; |
2078 | int size, cf, rv; | 774 | struct net_conf *nc; |
775 | int size, cf; | ||
2079 | 776 | ||
2080 | size = sizeof(struct p_protocol); | 777 | sock = &tconn->data; |
778 | p = __conn_prepare_command(tconn, sock); | ||
779 | if (!p) | ||
780 | return -EIO; | ||
2081 | 781 | ||
2082 | if (mdev->agreed_pro_version >= 87) | 782 | rcu_read_lock(); |
2083 | size += strlen(mdev->net_conf->integrity_alg) + 1; | 783 | nc = rcu_dereference(tconn->net_conf); |
2084 | 784 | ||
2085 | /* we must not recurse into our own queue, | 785 | if (nc->tentative && tconn->agreed_pro_version < 92) { |
2086 | * as that is blocked during handshake */ | 786 | rcu_read_unlock(); |
2087 | p = kmalloc(size, GFP_NOIO); | 787 | mutex_unlock(&sock->mutex); |
2088 | if (p == NULL) | 788 | conn_err(tconn, "--dry-run is not supported by peer"); |
2089 | return 0; | 789 | return -EOPNOTSUPP; |
790 | } | ||
2090 | 791 | ||
2091 | p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); | 792 | size = sizeof(*p); |
2092 | p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); | 793 | if (tconn->agreed_pro_version >= 87) |
2093 | p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); | 794 | size += strlen(nc->integrity_alg) + 1; |
2094 | p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); | ||
2095 | p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); | ||
2096 | 795 | ||
796 | p->protocol = cpu_to_be32(nc->wire_protocol); | ||
797 | p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); | ||
798 | p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); | ||
799 | p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); | ||
800 | p->two_primaries = cpu_to_be32(nc->two_primaries); | ||
2097 | cf = 0; | 801 | cf = 0; |
2098 | if (mdev->net_conf->want_lose) | 802 | if (nc->discard_my_data) |
2099 | cf |= CF_WANT_LOSE; | 803 | cf |= CF_DISCARD_MY_DATA; |
2100 | if (mdev->net_conf->dry_run) { | 804 | if (nc->tentative) |
2101 | if (mdev->agreed_pro_version >= 92) | 805 | cf |= CF_DRY_RUN; |
2102 | cf |= CF_DRY_RUN; | ||
2103 | else { | ||
2104 | dev_err(DEV, "--dry-run is not supported by peer"); | ||
2105 | kfree(p); | ||
2106 | return -1; | ||
2107 | } | ||
2108 | } | ||
2109 | p->conn_flags = cpu_to_be32(cf); | 806 | p->conn_flags = cpu_to_be32(cf); |
2110 | 807 | ||
2111 | if (mdev->agreed_pro_version >= 87) | 808 | if (tconn->agreed_pro_version >= 87) |
2112 | strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); | 809 | strcpy(p->integrity_alg, nc->integrity_alg); |
810 | rcu_read_unlock(); | ||
2113 | 811 | ||
2114 | rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, | 812 | return __conn_send_command(tconn, sock, cmd, size, NULL, 0); |
2115 | (struct p_header80 *)p, size); | 813 | } |
2116 | kfree(p); | 814 | |
2117 | return rv; | 815 | int drbd_send_protocol(struct drbd_tconn *tconn) |
816 | { | ||
817 | int err; | ||
818 | |||
819 | mutex_lock(&tconn->data.mutex); | ||
820 | err = __drbd_send_protocol(tconn, P_PROTOCOL); | ||
821 | mutex_unlock(&tconn->data.mutex); | ||
822 | |||
823 | return err; | ||
2118 | } | 824 | } |
2119 | 825 | ||
2120 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) | 826 | int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) |
2121 | { | 827 | { |
2122 | struct p_uuids p; | 828 | struct drbd_socket *sock; |
829 | struct p_uuids *p; | ||
2123 | int i; | 830 | int i; |
2124 | 831 | ||
2125 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) | 832 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) |
2126 | return 1; | 833 | return 0; |
2127 | 834 | ||
835 | sock = &mdev->tconn->data; | ||
836 | p = drbd_prepare_command(mdev, sock); | ||
837 | if (!p) { | ||
838 | put_ldev(mdev); | ||
839 | return -EIO; | ||
840 | } | ||
841 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
2128 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 842 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
2129 | p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; | 843 | p->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
844 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
2130 | 845 | ||
2131 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); | 846 | mdev->comm_bm_set = drbd_bm_total_weight(mdev); |
2132 | p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); | 847 | p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); |
2133 | uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; | 848 | rcu_read_lock(); |
849 | uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0; | ||
850 | rcu_read_unlock(); | ||
2134 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; | 851 | uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; |
2135 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; | 852 | uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; |
2136 | p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); | 853 | p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); |
2137 | 854 | ||
2138 | put_ldev(mdev); | 855 | put_ldev(mdev); |
2139 | 856 | return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0); | |
2140 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, | ||
2141 | (struct p_header80 *)&p, sizeof(p)); | ||
2142 | } | 857 | } |
2143 | 858 | ||
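The uuid_flags word built above packs three independent facts into low bits (the literal 1, 2 and 4 in the code). One way to decode such a word; the macro names are mine, not DRBD's, only the bit values mirror what the code ORs in:

#include <stdint.h>
#include <stdio.h>

#define UUID_FLAG_DISCARD_MY_DATA  1u   /* names are mine; values match the */
#define UUID_FLAG_CRASHED_PRIMARY  2u   /* literals used when building the  */
#define UUID_FLAG_INCONSISTENT     4u   /* packet above                     */

static void print_uuid_flags(uint64_t f)
{
        printf("discard_my_data=%d crashed_primary=%d inconsistent=%d\n",
               !!(f & UUID_FLAG_DISCARD_MY_DATA),
               !!(f & UUID_FLAG_CRASHED_PRIMARY),
               !!(f & UUID_FLAG_INCONSISTENT));
}

int main(void)
{
        print_uuid_flags(UUID_FLAG_CRASHED_PRIMARY | UUID_FLAG_INCONSISTENT);
        return 0;
}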
2144 | int drbd_send_uuids(struct drbd_conf *mdev) | 859 | int drbd_send_uuids(struct drbd_conf *mdev) |
@@ -2169,9 +884,10 @@ void drbd_print_uuids(struct drbd_conf *mdev, const char *text) | |||
2169 | } | 884 | } |
2170 | } | 885 | } |
2171 | 886 | ||
2172 | int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | 887 | void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) |
2173 | { | 888 | { |
2174 | struct p_rs_uuid p; | 889 | struct drbd_socket *sock; |
890 | struct p_rs_uuid *p; | ||
2175 | u64 uuid; | 891 | u64 uuid; |
2176 | 892 | ||
2177 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); | 893 | D_ASSERT(mdev->state.disk == D_UP_TO_DATE); |
@@ -2184,24 +900,29 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) | |||
2184 | drbd_uuid_set(mdev, UI_BITMAP, uuid); | 900 | drbd_uuid_set(mdev, UI_BITMAP, uuid); |
2185 | drbd_print_uuids(mdev, "updated sync UUID"); | 901 | drbd_print_uuids(mdev, "updated sync UUID"); |
2186 | drbd_md_sync(mdev); | 902 | drbd_md_sync(mdev); |
2187 | p.uuid = cpu_to_be64(uuid); | ||
2188 | 903 | ||
2189 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, | 904 | sock = &mdev->tconn->data; |
2190 | (struct p_header80 *)&p, sizeof(p)); | 905 | p = drbd_prepare_command(mdev, sock); |
906 | if (p) { | ||
907 | p->uuid = cpu_to_be64(uuid); | ||
908 | drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); | ||
909 | } | ||
2191 | } | 910 | } |
2192 | 911 | ||
2193 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) | 912 | int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) |
2194 | { | 913 | { |
2195 | struct p_sizes p; | 914 | struct drbd_socket *sock; |
915 | struct p_sizes *p; | ||
2196 | sector_t d_size, u_size; | 916 | sector_t d_size, u_size; |
2197 | int q_order_type; | 917 | int q_order_type; |
2198 | unsigned int max_bio_size; | 918 | unsigned int max_bio_size; |
2199 | int ok; | ||
2200 | 919 | ||
2201 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { | 920 | if (get_ldev_if_state(mdev, D_NEGOTIATING)) { |
2202 | D_ASSERT(mdev->ldev->backing_bdev); | 921 | D_ASSERT(mdev->ldev->backing_bdev); |
2203 | d_size = drbd_get_max_capacity(mdev->ldev); | 922 | d_size = drbd_get_max_capacity(mdev->ldev); |
2204 | u_size = mdev->ldev->dc.disk_size; | 923 | rcu_read_lock(); |
924 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
925 | rcu_read_unlock(); | ||
2205 | q_order_type = drbd_queue_order_type(mdev); | 926 | q_order_type = drbd_queue_order_type(mdev); |
2206 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; | 927 | max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; |
2207 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); | 928 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); |
@@ -2213,20 +934,23 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2213 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ | 934 | max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ |
2214 | } | 935 | } |
2215 | 936 | ||
2216 | /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ | 937 | sock = &mdev->tconn->data; |
2217 | if (mdev->agreed_pro_version <= 94) | 938 | p = drbd_prepare_command(mdev, sock); |
2218 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 939 | if (!p) |
940 | return -EIO; | ||
2219 | 941 | ||
2220 | p.d_size = cpu_to_be64(d_size); | 942 | if (mdev->tconn->agreed_pro_version <= 94) |
2221 | p.u_size = cpu_to_be64(u_size); | 943 | max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
2222 | p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); | 944 | else if (mdev->tconn->agreed_pro_version < 100) |
2223 | p.max_bio_size = cpu_to_be32(max_bio_size); | 945 | max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); |
2224 | p.queue_order_type = cpu_to_be16(q_order_type); | ||
2225 | p.dds_flags = cpu_to_be16(flags); | ||
2226 | 946 | ||
2227 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, | 947 | p->d_size = cpu_to_be64(d_size); |
2228 | (struct p_header80 *)&p, sizeof(p)); | 948 | p->u_size = cpu_to_be64(u_size); |
2229 | return ok; | 949 | p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); |
950 | p->max_bio_size = cpu_to_be32(max_bio_size); | ||
951 | p->queue_order_type = cpu_to_be16(q_order_type); | ||
952 | p->dds_flags = cpu_to_be16(flags); | ||
953 | return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); | ||
2230 | } | 954 | } |
2231 | 955 | ||
2232 | /** | 956 | /** |
@@ -2235,34 +959,21 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl | |||
2235 | */ | 959 | */ |
2236 | int drbd_send_current_state(struct drbd_conf *mdev) | 960 | int drbd_send_current_state(struct drbd_conf *mdev) |
2237 | { | 961 | { |
2238 | struct socket *sock; | 962 | struct drbd_socket *sock; |
2239 | struct p_state p; | 963 | struct p_state *p; |
2240 | int ok = 0; | ||
2241 | |||
2242 | /* Grab state lock so we wont send state if we're in the middle | ||
2243 | * of a cluster wide state change on another thread */ | ||
2244 | drbd_state_lock(mdev); | ||
2245 | |||
2246 | mutex_lock(&mdev->data.mutex); | ||
2247 | |||
2248 | p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | ||
2249 | sock = mdev->data.socket; | ||
2250 | 964 | ||
2251 | if (likely(sock != NULL)) { | 965 | sock = &mdev->tconn->data; |
2252 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | 966 | p = drbd_prepare_command(mdev, sock); |
2253 | (struct p_header80 *)&p, sizeof(p), 0); | 967 | if (!p) |
2254 | } | 968 | return -EIO; |
2255 | 969 | p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ | |
2256 | mutex_unlock(&mdev->data.mutex); | 970 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); |
2257 | |||
2258 | drbd_state_unlock(mdev); | ||
2259 | return ok; | ||
2260 | } | 971 | } |
2261 | 972 | ||
2262 | /** | 973 | /** |
2263 | * drbd_send_state() - After a state change, sends the new state to the peer | 974 | * drbd_send_state() - After a state change, sends the new state to the peer |
2264 | * @mdev: DRBD device. | 975 | * @mdev: DRBD device. |
2265 | * @state: the state to send, not necessarily the current state. | 976 | * @state: the state to send, not necessarily the current state. |
2266 | * | 977 | * |
2267 | * Each state change queues an "after_state_ch" work, which will eventually | 978 | * Each state change queues an "after_state_ch" work, which will eventually |
2268 | * send the resulting new state to the peer. If more state changes happen | 979 | * send the resulting new state to the peer. If more state changes happen |
@@ -2271,50 +982,95 @@ int drbd_send_current_state(struct drbd_conf *mdev) | |||
2271 | */ | 982 | */ |
2272 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) | 983 | int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) |
2273 | { | 984 | { |
2274 | struct socket *sock; | 985 | struct drbd_socket *sock; |
2275 | struct p_state p; | 986 | struct p_state *p; |
2276 | int ok = 0; | ||
2277 | 987 | ||
2278 | mutex_lock(&mdev->data.mutex); | 988 | sock = &mdev->tconn->data; |
989 | p = drbd_prepare_command(mdev, sock); | ||
990 | if (!p) | ||
991 | return -EIO; | ||
992 | p->state = cpu_to_be32(state.i); /* Within the send mutex */ | ||
993 | return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); | ||
994 | } | ||
2279 | 995 | ||
2280 | p.state = cpu_to_be32(state.i); | 996 | int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) |
2281 | sock = mdev->data.socket; | 997 | { |
998 | struct drbd_socket *sock; | ||
999 | struct p_req_state *p; | ||
2282 | 1000 | ||
2283 | if (likely(sock != NULL)) { | 1001 | sock = &mdev->tconn->data; |
2284 | ok = _drbd_send_cmd(mdev, sock, P_STATE, | 1002 | p = drbd_prepare_command(mdev, sock); |
2285 | (struct p_header80 *)&p, sizeof(p), 0); | 1003 | if (!p) |
2286 | } | 1004 | return -EIO; |
1005 | p->mask = cpu_to_be32(mask.i); | ||
1006 | p->val = cpu_to_be32(val.i); | ||
1007 | return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); | ||
1008 | } | ||
2287 | 1009 | ||
2288 | mutex_unlock(&mdev->data.mutex); | 1010 | int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) |
1011 | { | ||
1012 | enum drbd_packet cmd; | ||
1013 | struct drbd_socket *sock; | ||
1014 | struct p_req_state *p; | ||
2289 | 1015 | ||
2290 | return ok; | 1016 | cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; |
1017 | sock = &tconn->data; | ||
1018 | p = conn_prepare_command(tconn, sock); | ||
1019 | if (!p) | ||
1020 | return -EIO; | ||
1021 | p->mask = cpu_to_be32(mask.i); | ||
1022 | p->val = cpu_to_be32(val.i); | ||
1023 | return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
2291 | } | 1024 | } |
2292 | 1025 | ||
2293 | int drbd_send_state_req(struct drbd_conf *mdev, | 1026 | void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) |
2294 | union drbd_state mask, union drbd_state val) | ||
2295 | { | 1027 | { |
2296 | struct p_req_state p; | 1028 | struct drbd_socket *sock; |
1029 | struct p_req_state_reply *p; | ||
2297 | 1030 | ||
2298 | p.mask = cpu_to_be32(mask.i); | 1031 | sock = &mdev->tconn->meta; |
2299 | p.val = cpu_to_be32(val.i); | 1032 | p = drbd_prepare_command(mdev, sock); |
1033 | if (p) { | ||
1034 | p->retcode = cpu_to_be32(retcode); | ||
1035 | drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); | ||
1036 | } | ||
1037 | } | ||
1038 | |||
1039 | void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode) | ||
1040 | { | ||
1041 | struct drbd_socket *sock; | ||
1042 | struct p_req_state_reply *p; | ||
1043 | enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; | ||
2300 | 1044 | ||
2301 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, | 1045 | sock = &tconn->meta; |
2302 | (struct p_header80 *)&p, sizeof(p)); | 1046 | p = conn_prepare_command(tconn, sock); |
1047 | if (p) { | ||
1048 | p->retcode = cpu_to_be32(retcode); | ||
1049 | conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); | ||
1050 | } | ||
2303 | } | 1051 | } |
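conn_send_state_req() and conn_send_sr_reply() both pick the packet type by protocol version; reading the ternary, the connection-wide packets only exist from protocol 100 on, so an older peer gets the per-device packet over the same path (that interpretation is an inference from this hunk, not spelled out in it):

	cmd = tconn->agreed_pro_version < 100
		? P_STATE_CHG_REQ	/* pre-100 peers: per-device state change packet */
		: P_CONN_ST_CHG_REQ;	/* 100 and later: connection-wide state change */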
2304 | 1052 | ||
2305 | int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) | 1053 | static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) |
2306 | { | 1054 | { |
2307 | struct p_req_state_reply p; | 1055 | BUG_ON(code & ~0xf); |
1056 | p->encoding = (p->encoding & ~0xf) | code; | ||
1057 | } | ||
2308 | 1058 | ||
2309 | p.retcode = cpu_to_be32(retcode); | 1059 | static void dcbp_set_start(struct p_compressed_bm *p, int set) |
1060 | { | ||
1061 | p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); | ||
1062 | } | ||
2310 | 1063 | ||
2311 | return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, | 1064 | static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) |
2312 | (struct p_header80 *)&p, sizeof(p)); | 1065 | { |
1066 | BUG_ON(n & ~0x7); | ||
1067 | p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); | ||
2313 | } | 1068 | } |
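Read together, the three dcbp_* setters that replace the old DCBP_* macros document the layout of the single encoding byte in struct p_compressed_bm. The decoder below is only an illustration of that layout (the receive-side helpers are not part of this hunk). Note that dcbp_set_pad_bits() also clears the low nibble, which fits the call order visible below: pad bits are set at the end of fill_bitmap_rle_bits(), the code afterwards in send_bitmap_rle_or_plain().

	/* bit  7     : the first run starts with a set bit
	 * bits 6..4  : pad bits in the last byte of the code string
	 * bits 3..0  : bitmap encoding, e.g. RLE_VLI_Bits            */
	static unsigned int dcbp_get_code(const struct p_compressed_bm *p)
	{
		return p->encoding & 0x0f;
	}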
2314 | 1069 | ||
2315 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, | 1070 | int fill_bitmap_rle_bits(struct drbd_conf *mdev, |
2316 | struct p_compressed_bm *p, | 1071 | struct p_compressed_bm *p, |
2317 | struct bm_xfer_ctx *c) | 1072 | unsigned int size, |
1073 | struct bm_xfer_ctx *c) | ||
2318 | { | 1074 | { |
2319 | struct bitstream bs; | 1075 | struct bitstream bs; |
2320 | unsigned long plain_bits; | 1076 | unsigned long plain_bits; |
@@ -2322,19 +1078,21 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2322 | unsigned long rl; | 1078 | unsigned long rl; |
2323 | unsigned len; | 1079 | unsigned len; |
2324 | unsigned toggle; | 1080 | unsigned toggle; |
2325 | int bits; | 1081 | int bits, use_rle; |
2326 | 1082 | ||
2327 | /* may we use this feature? */ | 1083 | /* may we use this feature? */ |
2328 | if ((mdev->sync_conf.use_rle == 0) || | 1084 | rcu_read_lock(); |
2329 | (mdev->agreed_pro_version < 90)) | 1085 | use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle; |
2330 | return 0; | 1086 | rcu_read_unlock(); |
1087 | if (!use_rle || mdev->tconn->agreed_pro_version < 90) | ||
1088 | return 0; | ||
2331 | 1089 | ||
2332 | if (c->bit_offset >= c->bm_bits) | 1090 | if (c->bit_offset >= c->bm_bits) |
2333 | return 0; /* nothing to do. */ | 1091 | return 0; /* nothing to do. */ |
2334 | 1092 | ||
2335 | /* use at most this many bytes */ | 1093 | /* use at most this many bytes */ |
2336 | bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); | 1094 | bitstream_init(&bs, p->code, size, 0); |
2337 | memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); | 1095 | memset(p->code, 0, size); |
2338 | /* plain bits covered in this code string */ | 1096 | /* plain bits covered in this code string */ |
2339 | plain_bits = 0; | 1097 | plain_bits = 0; |
2340 | 1098 | ||
@@ -2356,12 +1114,12 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2356 | if (rl == 0) { | 1114 | if (rl == 0) { |
2357 | /* the first checked bit was set, | 1115 | /* the first checked bit was set, |
2358 | * store start value, */ | 1116 | * store start value, */ |
2359 | DCBP_set_start(p, 1); | 1117 | dcbp_set_start(p, 1); |
2360 | /* but skip encoding of zero run length */ | 1118 | /* but skip encoding of zero run length */ |
2361 | toggle = !toggle; | 1119 | toggle = !toggle; |
2362 | continue; | 1120 | continue; |
2363 | } | 1121 | } |
2364 | DCBP_set_start(p, 0); | 1122 | dcbp_set_start(p, 0); |
2365 | } | 1123 | } |
2366 | 1124 | ||
2367 | /* paranoia: catch zero runlength. | 1125 | /* paranoia: catch zero runlength. |
@@ -2401,7 +1159,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2401 | bm_xfer_ctx_bit_to_word_offset(c); | 1159 | bm_xfer_ctx_bit_to_word_offset(c); |
2402 | 1160 | ||
2403 | /* store pad_bits */ | 1161 | /* store pad_bits */ |
2404 | DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); | 1162 | dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); |
2405 | 1163 | ||
2406 | return len; | 1164 | return len; |
2407 | } | 1165 | } |
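The feature gate at the top of fill_bitmap_rle_bits() shows the new RCU-protected configuration access: use_rle now lives in net_conf on the connection rather than in the per-device sync_conf, and the protocol version check moved to tconn as well. The same idiom recurs for disk_conf in drbd_send_sizes() above and for ko_count in drbd_send() below. Compactly:

	int use_rle;

	rcu_read_lock();
	use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;	/* snapshot the setting */
	rcu_read_unlock();
	if (!use_rle || mdev->tconn->agreed_pro_version < 90)
		return 0;	/* RLE off or peer too old: fall back to plain bitmap packets */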
@@ -2413,48 +1171,52 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, | |||
2413 | * code upon failure. | 1171 | * code upon failure. |
2414 | */ | 1172 | */ |
2415 | static int | 1173 | static int |
2416 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, | 1174 | send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) |
2417 | struct p_header80 *h, struct bm_xfer_ctx *c) | ||
2418 | { | 1175 | { |
2419 | struct p_compressed_bm *p = (void*)h; | 1176 | struct drbd_socket *sock = &mdev->tconn->data; |
2420 | unsigned long num_words; | 1177 | unsigned int header_size = drbd_header_size(mdev->tconn); |
2421 | int len; | 1178 | struct p_compressed_bm *p = sock->sbuf + header_size; |
2422 | int ok; | 1179 | int len, err; |
2423 | |||
2424 | len = fill_bitmap_rle_bits(mdev, p, c); | ||
2425 | 1180 | ||
1181 | len = fill_bitmap_rle_bits(mdev, p, | ||
1182 | DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); | ||
2426 | if (len < 0) | 1183 | if (len < 0) |
2427 | return -EIO; | 1184 | return -EIO; |
2428 | 1185 | ||
2429 | if (len) { | 1186 | if (len) { |
2430 | DCBP_set_code(p, RLE_VLI_Bits); | 1187 | dcbp_set_code(p, RLE_VLI_Bits); |
2431 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, | 1188 | err = __send_command(mdev->tconn, mdev->vnr, sock, |
2432 | sizeof(*p) + len, 0); | 1189 | P_COMPRESSED_BITMAP, sizeof(*p) + len, |
2433 | 1190 | NULL, 0); | |
2434 | c->packets[0]++; | 1191 | c->packets[0]++; |
2435 | c->bytes[0] += sizeof(*p) + len; | 1192 | c->bytes[0] += header_size + sizeof(*p) + len; |
2436 | 1193 | ||
2437 | if (c->bit_offset >= c->bm_bits) | 1194 | if (c->bit_offset >= c->bm_bits) |
2438 | len = 0; /* DONE */ | 1195 | len = 0; /* DONE */ |
2439 | } else { | 1196 | } else { |
2440 | /* was not compressible. | 1197 | /* was not compressible. |
2441 | * send a buffer full of plain text bits instead. */ | 1198 | * send a buffer full of plain text bits instead. */ |
2442 | num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 1199 | unsigned int data_size; |
2443 | len = num_words * sizeof(long); | 1200 | unsigned long num_words; |
1201 | unsigned long *p = sock->sbuf + header_size; | ||
1202 | |||
1203 | data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; | ||
1204 | num_words = min_t(size_t, data_size / sizeof(*p), | ||
1205 | c->bm_words - c->word_offset); | ||
1206 | len = num_words * sizeof(*p); | ||
2444 | if (len) | 1207 | if (len) |
2445 | drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); | 1208 | drbd_bm_get_lel(mdev, c->word_offset, num_words, p); |
2446 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, | 1209 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0); |
2447 | h, sizeof(struct p_header80) + len, 0); | ||
2448 | c->word_offset += num_words; | 1210 | c->word_offset += num_words; |
2449 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 1211 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
2450 | 1212 | ||
2451 | c->packets[1]++; | 1213 | c->packets[1]++; |
2452 | c->bytes[1] += sizeof(struct p_header80) + len; | 1214 | c->bytes[1] += header_size + len; |
2453 | 1215 | ||
2454 | if (c->bit_offset > c->bm_bits) | 1216 | if (c->bit_offset > c->bm_bits) |
2455 | c->bit_offset = c->bm_bits; | 1217 | c->bit_offset = c->bm_bits; |
2456 | } | 1218 | } |
2457 | if (ok) { | 1219 | if (!err) { |
2458 | if (len == 0) { | 1220 | if (len == 0) { |
2459 | INFO_bm_xfer_stats(mdev, "send", c); | 1221 | INFO_bm_xfer_stats(mdev, "send", c); |
2460 | return 0; | 1222 | return 0; |
@@ -2465,21 +1227,13 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, | |||
2465 | } | 1227 | } |
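Only the tail of send_bitmap_rle_or_plain()'s kernel-doc is visible here ("code upon failure"); the contract the caller relies on can be read off _drbd_send_bitmap() below and appears to be: 0 once the whole bitmap has gone out, a positive value when another call is needed, a negative errno on failure.

	/* usage, as in _drbd_send_bitmap() below */
	do {
		err = send_bitmap_rle_or_plain(mdev, &c);
	} while (err > 0);		/* keep going while chunks remain */
	return err == 0;		/* true only on a clean, complete transfer */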
2466 | 1228 | ||
2467 | /* See the comment at receive_bitmap() */ | 1229 | /* See the comment at receive_bitmap() */ |
2468 | int _drbd_send_bitmap(struct drbd_conf *mdev) | 1230 | static int _drbd_send_bitmap(struct drbd_conf *mdev) |
2469 | { | 1231 | { |
2470 | struct bm_xfer_ctx c; | 1232 | struct bm_xfer_ctx c; |
2471 | struct p_header80 *p; | ||
2472 | int err; | 1233 | int err; |
2473 | 1234 | ||
2474 | ERR_IF(!mdev->bitmap) return false; | 1235 | if (!expect(mdev->bitmap)) |
2475 | |||
2476 | /* maybe we should use some per thread scratch page, | ||
2477 | * and allocate that during initial device creation? */ | ||
2478 | p = (struct p_header80 *) __get_free_page(GFP_NOIO); | ||
2479 | if (!p) { | ||
2480 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
2481 | return false; | 1236 | return false; |
2482 | } | ||
2483 | 1237 | ||
2484 | if (get_ldev(mdev)) { | 1238 | if (get_ldev(mdev)) { |
2485 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1239 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { |
@@ -2504,37 +1258,39 @@ int _drbd_send_bitmap(struct drbd_conf *mdev) | |||
2504 | }; | 1258 | }; |
2505 | 1259 | ||
2506 | do { | 1260 | do { |
2507 | err = send_bitmap_rle_or_plain(mdev, p, &c); | 1261 | err = send_bitmap_rle_or_plain(mdev, &c); |
2508 | } while (err > 0); | 1262 | } while (err > 0); |
2509 | 1263 | ||
2510 | free_page((unsigned long) p); | ||
2511 | return err == 0; | 1264 | return err == 0; |
2512 | } | 1265 | } |
2513 | 1266 | ||
2514 | int drbd_send_bitmap(struct drbd_conf *mdev) | 1267 | int drbd_send_bitmap(struct drbd_conf *mdev) |
2515 | { | 1268 | { |
2516 | int err; | 1269 | struct drbd_socket *sock = &mdev->tconn->data; |
1270 | int err = -1; | ||
2517 | 1271 | ||
2518 | if (!drbd_get_data_sock(mdev)) | 1272 | mutex_lock(&sock->mutex); |
2519 | return -1; | 1273 | if (sock->socket) |
2520 | err = !_drbd_send_bitmap(mdev); | 1274 | err = !_drbd_send_bitmap(mdev); |
2521 | drbd_put_data_sock(mdev); | 1275 | mutex_unlock(&sock->mutex); |
2522 | return err; | 1276 | return err; |
2523 | } | 1277 | } |
2524 | 1278 | ||
2525 | int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | 1279 | void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size) |
2526 | { | 1280 | { |
2527 | int ok; | 1281 | struct drbd_socket *sock; |
2528 | struct p_barrier_ack p; | 1282 | struct p_barrier_ack *p; |
2529 | 1283 | ||
2530 | p.barrier = barrier_nr; | 1284 | if (tconn->cstate < C_WF_REPORT_PARAMS) |
2531 | p.set_size = cpu_to_be32(set_size); | 1285 | return; |
2532 | 1286 | ||
2533 | if (mdev->state.conn < C_CONNECTED) | 1287 | sock = &tconn->meta; |
2534 | return false; | 1288 | p = conn_prepare_command(tconn, sock); |
2535 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, | 1289 | if (!p) |
2536 | (struct p_header80 *)&p, sizeof(p)); | 1290 | return; |
2537 | return ok; | 1291 | p->barrier = barrier_nr; |
1292 | p->set_size = cpu_to_be32(set_size); | ||
1293 | conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); | ||
2538 | } | 1294 | } |
2539 | 1295 | ||
2540 | /** | 1296 | /** |
@@ -2545,62 +1301,62 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) | |||
2545 | * @blksize: size in bytes, needs to be in big endian byte order | 1301 | * @blksize: size in bytes, needs to be in big endian byte order |
2546 | * @block_id: Id, big endian byte order | 1302 | * @block_id: Id, big endian byte order |
2547 | */ | 1303 | */ |
2548 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, | 1304 | static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2549 | u64 sector, | 1305 | u64 sector, u32 blksize, u64 block_id) |
2550 | u32 blksize, | ||
2551 | u64 block_id) | ||
2552 | { | 1306 | { |
2553 | int ok; | 1307 | struct drbd_socket *sock; |
2554 | struct p_block_ack p; | 1308 | struct p_block_ack *p; |
2555 | 1309 | ||
2556 | p.sector = sector; | 1310 | if (mdev->state.conn < C_CONNECTED) |
2557 | p.block_id = block_id; | 1311 | return -EIO; |
2558 | p.blksize = blksize; | ||
2559 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2560 | 1312 | ||
2561 | if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) | 1313 | sock = &mdev->tconn->meta; |
2562 | return false; | 1314 | p = drbd_prepare_command(mdev, sock); |
2563 | ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, | 1315 | if (!p) |
2564 | (struct p_header80 *)&p, sizeof(p)); | 1316 | return -EIO; |
2565 | return ok; | 1317 | p->sector = sector; |
1318 | p->block_id = block_id; | ||
1319 | p->blksize = blksize; | ||
1320 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
1321 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2566 | } | 1322 | } |
2567 | 1323 | ||
2568 | /* dp->sector and dp->block_id already/still in network byte order, | 1324 | /* dp->sector and dp->block_id already/still in network byte order, |
2569 | * data_size is payload size according to dp->head, | 1325 | * data_size is payload size according to dp->head, |
2570 | * and may need to be corrected for digest size. */ | 1326 | * and may need to be corrected for digest size. */ |
2571 | int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1327 | void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2572 | struct p_data *dp, int data_size) | 1328 | struct p_data *dp, int data_size) |
2573 | { | 1329 | { |
2574 | data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1330 | if (mdev->tconn->peer_integrity_tfm) |
2575 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1331 | data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
2576 | return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), | 1332 | _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), |
2577 | dp->block_id); | 1333 | dp->block_id); |
2578 | } | 1334 | } |
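The comment above drbd_send_ack_dp() is the only explanation of the data_size adjustment: the length in the peer's data packet header covers the block payload plus the integrity digest, so the digest length has to be subtracted before it is echoed back in the ack. A purely illustrative example (the 4096/16 figures are made up, not taken from this diff):

	/* peer sent a 4096-byte write plus a 16-byte integrity digest:
	 * dp->head reports 4112, the ack must report 4096 */
	if (mdev->tconn->peer_integrity_tfm)
		data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);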
2579 | 1335 | ||
2580 | int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, | 1336 | void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, |
2581 | struct p_block_req *rp) | 1337 | struct p_block_req *rp) |
2582 | { | 1338 | { |
2583 | return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); | 1339 | _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); |
2584 | } | 1340 | } |
2585 | 1341 | ||
2586 | /** | 1342 | /** |
2587 | * drbd_send_ack() - Sends an ack packet | 1343 | * drbd_send_ack() - Sends an ack packet |
2588 | * @mdev: DRBD device. | 1344 | * @mdev: DRBD device |
2589 | * @cmd: Packet command code. | 1345 | * @cmd: packet command code |
2590 | * @e: Epoch entry. | 1346 | * @peer_req: peer request |
2591 | */ | 1347 | */ |
2592 | int drbd_send_ack(struct drbd_conf *mdev, | 1348 | int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, |
2593 | enum drbd_packets cmd, struct drbd_epoch_entry *e) | 1349 | struct drbd_peer_request *peer_req) |
2594 | { | 1350 | { |
2595 | return _drbd_send_ack(mdev, cmd, | 1351 | return _drbd_send_ack(mdev, cmd, |
2596 | cpu_to_be64(e->sector), | 1352 | cpu_to_be64(peer_req->i.sector), |
2597 | cpu_to_be32(e->size), | 1353 | cpu_to_be32(peer_req->i.size), |
2598 | e->block_id); | 1354 | peer_req->block_id); |
2599 | } | 1355 | } |
2600 | 1356 | ||
2601 | /* This function misuses the block_id field to signal if the blocks | 1357 | /* This function misuses the block_id field to signal if the blocks |
2602 | * are in sync or not. */ | 1358 | * are in sync or not. */ |
2603 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | 1359 | int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, |
2604 | sector_t sector, int blksize, u64 block_id) | 1360 | sector_t sector, int blksize, u64 block_id) |
2605 | { | 1361 | { |
2606 | return _drbd_send_ack(mdev, cmd, | 1362 | return _drbd_send_ack(mdev, cmd, |
@@ -2612,85 +1368,87 @@ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, | |||
2612 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, | 1368 | int drbd_send_drequest(struct drbd_conf *mdev, int cmd, |
2613 | sector_t sector, int size, u64 block_id) | 1369 | sector_t sector, int size, u64 block_id) |
2614 | { | 1370 | { |
2615 | int ok; | 1371 | struct drbd_socket *sock; |
2616 | struct p_block_req p; | 1372 | struct p_block_req *p; |
2617 | |||
2618 | p.sector = cpu_to_be64(sector); | ||
2619 | p.block_id = block_id; | ||
2620 | p.blksize = cpu_to_be32(size); | ||
2621 | 1373 | ||
2622 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, | 1374 | sock = &mdev->tconn->data; |
2623 | (struct p_header80 *)&p, sizeof(p)); | 1375 | p = drbd_prepare_command(mdev, sock); |
2624 | return ok; | 1376 | if (!p) |
1377 | return -EIO; | ||
1378 | p->sector = cpu_to_be64(sector); | ||
1379 | p->block_id = block_id; | ||
1380 | p->blksize = cpu_to_be32(size); | ||
1381 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); | ||
2625 | } | 1382 | } |
2626 | 1383 | ||
2627 | int drbd_send_drequest_csum(struct drbd_conf *mdev, | 1384 | int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size, |
2628 | sector_t sector, int size, | 1385 | void *digest, int digest_size, enum drbd_packet cmd) |
2629 | void *digest, int digest_size, | ||
2630 | enum drbd_packets cmd) | ||
2631 | { | 1386 | { |
2632 | int ok; | 1387 | struct drbd_socket *sock; |
2633 | struct p_block_req p; | 1388 | struct p_block_req *p; |
2634 | |||
2635 | p.sector = cpu_to_be64(sector); | ||
2636 | p.block_id = BE_DRBD_MAGIC + 0xbeef; | ||
2637 | p.blksize = cpu_to_be32(size); | ||
2638 | |||
2639 | p.head.magic = BE_DRBD_MAGIC; | ||
2640 | p.head.command = cpu_to_be16(cmd); | ||
2641 | p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); | ||
2642 | 1389 | ||
2643 | mutex_lock(&mdev->data.mutex); | 1390 | /* FIXME: Put the digest into the preallocated socket buffer. */ |
2644 | 1391 | ||
2645 | ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); | 1392 | sock = &mdev->tconn->data; |
2646 | ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); | 1393 | p = drbd_prepare_command(mdev, sock); |
2647 | 1394 | if (!p) | |
2648 | mutex_unlock(&mdev->data.mutex); | 1395 | return -EIO; |
2649 | 1396 | p->sector = cpu_to_be64(sector); | |
2650 | return ok; | 1397 | p->block_id = ID_SYNCER /* unused */; |
1398 | p->blksize = cpu_to_be32(size); | ||
1399 | return drbd_send_command(mdev, sock, cmd, sizeof(*p), | ||
1400 | digest, digest_size); | ||
2651 | } | 1401 | } |
2652 | 1402 | ||
2653 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) | 1403 | int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) |
2654 | { | 1404 | { |
2655 | int ok; | 1405 | struct drbd_socket *sock; |
2656 | struct p_block_req p; | 1406 | struct p_block_req *p; |
2657 | 1407 | ||
2658 | p.sector = cpu_to_be64(sector); | 1408 | sock = &mdev->tconn->data; |
2659 | p.block_id = BE_DRBD_MAGIC + 0xbabe; | 1409 | p = drbd_prepare_command(mdev, sock); |
2660 | p.blksize = cpu_to_be32(size); | 1410 | if (!p) |
2661 | 1411 | return -EIO; | |
2662 | ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, | 1412 | p->sector = cpu_to_be64(sector); |
2663 | (struct p_header80 *)&p, sizeof(p)); | 1413 | p->block_id = ID_SYNCER /* unused */; |
2664 | return ok; | 1414 | p->blksize = cpu_to_be32(size); |
1415 | return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); | ||
2665 | } | 1416 | } |
2666 | 1417 | ||
2667 | /* called on sndtimeo | 1418 | /* called on sndtimeo |
2668 | * returns false if we should retry, | 1419 | * returns false if we should retry, |
2669 | * true if we think connection is dead | 1420 | * true if we think connection is dead |
2670 | */ | 1421 | */ |
2671 | static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) | 1422 | static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock) |
2672 | { | 1423 | { |
2673 | int drop_it; | 1424 | int drop_it; |
2674 | /* long elapsed = (long)(jiffies - mdev->last_received); */ | 1425 | /* long elapsed = (long)(jiffies - mdev->last_received); */ |
2675 | 1426 | ||
2676 | drop_it = mdev->meta.socket == sock | 1427 | drop_it = tconn->meta.socket == sock |
2677 | || !mdev->asender.task | 1428 | || !tconn->asender.task |
2678 | || get_t_state(&mdev->asender) != Running | 1429 | || get_t_state(&tconn->asender) != RUNNING |
2679 | || mdev->state.conn < C_CONNECTED; | 1430 | || tconn->cstate < C_WF_REPORT_PARAMS; |
2680 | 1431 | ||
2681 | if (drop_it) | 1432 | if (drop_it) |
2682 | return true; | 1433 | return true; |
2683 | 1434 | ||
2684 | drop_it = !--mdev->ko_count; | 1435 | drop_it = !--tconn->ko_count; |
2685 | if (!drop_it) { | 1436 | if (!drop_it) { |
2686 | dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", | 1437 | conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n", |
2687 | current->comm, current->pid, mdev->ko_count); | 1438 | current->comm, current->pid, tconn->ko_count); |
2688 | request_ping(mdev); | 1439 | request_ping(tconn); |
2689 | } | 1440 | } |
2690 | 1441 | ||
2691 | return drop_it; /* && (mdev->state == R_PRIMARY) */; | 1442 | return drop_it; /* && (mdev->state == R_PRIMARY) */; |
2692 | } | 1443 | } |
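we_should_drop_the_connection() keeps its old shape but now judges the connection (tconn) rather than a single device; ko_count and the asender thread moved along with it. Its callers are the -EAGAIN paths of the low-level senders, roughly as in drbd_send() and _drbd_send_page() further down:

	if (rv == -EAGAIN) {
		if (we_should_drop_the_connection(tconn, sock))
			break;		/* give up, the connection looks dead */
		continue;		/* plain send timeout, try again */
	}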
2693 | 1444 | ||
1445 | static void drbd_update_congested(struct drbd_tconn *tconn) | ||
1446 | { | ||
1447 | struct sock *sk = tconn->data.socket->sk; | ||
1448 | if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) | ||
1449 | set_bit(NET_CONGESTED, &tconn->flags); | ||
1450 | } | ||
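The drbd_update_congested() helper applies a simple heuristic: the connection is flagged as congested once more than four fifths of the socket send buffer is queued. For illustration only (the buffer size is a made-up figure):

	/* hypothetical 128 KiB send buffer */
	threshold = 131072 * 4 / 5;			/* 104857 bytes, ~102 KiB */
	congested = sk->sk_wmem_queued > threshold;	/* sets NET_CONGESTED in tconn->flags */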
1451 | |||
2694 | /* The idea of sendpage seems to be to put some kind of reference | 1452 | /* The idea of sendpage seems to be to put some kind of reference |
2695 | * to the page into the skb, and to hand it over to the NIC. In | 1453 | * to the page into the skb, and to hand it over to the NIC. In |
2696 | * this process get_page() gets called. | 1454 | * this process get_page() gets called. |
@@ -2713,21 +1471,28 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket * | |||
2713 | * with page_count == 0 or PageSlab. | 1471 | * with page_count == 0 or PageSlab. |
2714 | */ | 1472 | */ |
2715 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, | 1473 | static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, |
2716 | int offset, size_t size, unsigned msg_flags) | 1474 | int offset, size_t size, unsigned msg_flags) |
2717 | { | 1475 | { |
2718 | int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); | 1476 | struct socket *socket; |
1477 | void *addr; | ||
1478 | int err; | ||
1479 | |||
1480 | socket = mdev->tconn->data.socket; | ||
1481 | addr = kmap(page) + offset; | ||
1482 | err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags); | ||
2719 | kunmap(page); | 1483 | kunmap(page); |
2720 | if (sent == size) | 1484 | if (!err) |
2721 | mdev->send_cnt += size>>9; | 1485 | mdev->send_cnt += size >> 9; |
2722 | return sent == size; | 1486 | return err; |
2723 | } | 1487 | } |
2724 | 1488 | ||
2725 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | 1489 | static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, |
2726 | int offset, size_t size, unsigned msg_flags) | 1490 | int offset, size_t size, unsigned msg_flags) |
2727 | { | 1491 | { |
1492 | struct socket *socket = mdev->tconn->data.socket; | ||
2728 | mm_segment_t oldfs = get_fs(); | 1493 | mm_segment_t oldfs = get_fs(); |
2729 | int sent, ok; | ||
2730 | int len = size; | 1494 | int len = size; |
1495 | int err = -EIO; | ||
2731 | 1496 | ||
2732 | /* e.g. XFS meta- & log-data is in slab pages, which have a | 1497 | /* e.g. XFS meta- & log-data is in slab pages, which have a |
2733 | * page_count of 0 and/or have PageSlab() set. | 1498 | * page_count of 0 and/or have PageSlab() set. |
@@ -2739,34 +1504,35 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, | |||
2739 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); | 1504 | return _drbd_no_send_page(mdev, page, offset, size, msg_flags); |
2740 | 1505 | ||
2741 | msg_flags |= MSG_NOSIGNAL; | 1506 | msg_flags |= MSG_NOSIGNAL; |
2742 | drbd_update_congested(mdev); | 1507 | drbd_update_congested(mdev->tconn); |
2743 | set_fs(KERNEL_DS); | 1508 | set_fs(KERNEL_DS); |
2744 | do { | 1509 | do { |
2745 | sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, | 1510 | int sent; |
2746 | offset, len, | 1511 | |
2747 | msg_flags); | 1512 | sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); |
2748 | if (sent == -EAGAIN) { | ||
2749 | if (we_should_drop_the_connection(mdev, | ||
2750 | mdev->data.socket)) | ||
2751 | break; | ||
2752 | else | ||
2753 | continue; | ||
2754 | } | ||
2755 | if (sent <= 0) { | 1513 | if (sent <= 0) { |
1514 | if (sent == -EAGAIN) { | ||
1515 | if (we_should_drop_the_connection(mdev->tconn, socket)) | ||
1516 | break; | ||
1517 | continue; | ||
1518 | } | ||
2756 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", | 1519 | dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", |
2757 | __func__, (int)size, len, sent); | 1520 | __func__, (int)size, len, sent); |
1521 | if (sent < 0) | ||
1522 | err = sent; | ||
2758 | break; | 1523 | break; |
2759 | } | 1524 | } |
2760 | len -= sent; | 1525 | len -= sent; |
2761 | offset += sent; | 1526 | offset += sent; |
2762 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); | 1527 | } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); |
2763 | set_fs(oldfs); | 1528 | set_fs(oldfs); |
2764 | clear_bit(NET_CONGESTED, &mdev->flags); | 1529 | clear_bit(NET_CONGESTED, &mdev->tconn->flags); |
2765 | 1530 | ||
2766 | ok = (len == 0); | 1531 | if (len == 0) { |
2767 | if (likely(ok)) | 1532 | err = 0; |
2768 | mdev->send_cnt += size>>9; | 1533 | mdev->send_cnt += size >> 9; |
2769 | return ok; | 1534 | } |
1535 | return err; | ||
2770 | } | 1536 | } |
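The long comment above these two helpers explains the sendpage pitfall, and the split follows from it: _drbd_send_page() uses the socket's zero-copy sendpage path, while _drbd_no_send_page() kmaps the page and goes through the ordinary send path for pages whose reference count must not be touched. A sketch of the guard described in the comment (the exact condition sits in the elided top of _drbd_send_page()):

	/* XFS meta- and log data may live in slab pages: page_count == 0 or
	 * PageSlab() set; sendpage would get_page()/put_page() them */
	if (page_count(page) < 1 || PageSlab(page))
		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);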
2771 | 1537 | ||
2772 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | 1538 | static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2775,12 +1541,15 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2775 | int i; | 1541 | int i; |
2776 | /* hint all but last page with MSG_MORE */ | 1542 | /* hint all but last page with MSG_MORE */ |
2777 | bio_for_each_segment(bvec, bio, i) { | 1543 | bio_for_each_segment(bvec, bio, i) { |
2778 | if (!_drbd_no_send_page(mdev, bvec->bv_page, | 1544 | int err; |
2779 | bvec->bv_offset, bvec->bv_len, | 1545 | |
2780 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1546 | err = _drbd_no_send_page(mdev, bvec->bv_page, |
2781 | return 0; | 1547 | bvec->bv_offset, bvec->bv_len, |
1548 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1549 | if (err) | ||
1550 | return err; | ||
2782 | } | 1551 | } |
2783 | return 1; | 1552 | return 0; |
2784 | } | 1553 | } |
2785 | 1554 | ||
2786 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | 1555 | static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) |
@@ -2789,32 +1558,40 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) | |||
2789 | int i; | 1558 | int i; |
2790 | /* hint all but last page with MSG_MORE */ | 1559 | /* hint all but last page with MSG_MORE */ |
2791 | bio_for_each_segment(bvec, bio, i) { | 1560 | bio_for_each_segment(bvec, bio, i) { |
2792 | if (!_drbd_send_page(mdev, bvec->bv_page, | 1561 | int err; |
2793 | bvec->bv_offset, bvec->bv_len, | 1562 | |
2794 | i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) | 1563 | err = _drbd_send_page(mdev, bvec->bv_page, |
2795 | return 0; | 1564 | bvec->bv_offset, bvec->bv_len, |
1565 | i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); | ||
1566 | if (err) | ||
1567 | return err; | ||
2796 | } | 1568 | } |
2797 | return 1; | 1569 | return 0; |
2798 | } | 1570 | } |
2799 | 1571 | ||
2800 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 1572 | static int _drbd_send_zc_ee(struct drbd_conf *mdev, |
1573 | struct drbd_peer_request *peer_req) | ||
2801 | { | 1574 | { |
2802 | struct page *page = e->pages; | 1575 | struct page *page = peer_req->pages; |
2803 | unsigned len = e->size; | 1576 | unsigned len = peer_req->i.size; |
1577 | int err; | ||
1578 | |||
2804 | /* hint all but last page with MSG_MORE */ | 1579 | /* hint all but last page with MSG_MORE */ |
2805 | page_chain_for_each(page) { | 1580 | page_chain_for_each(page) { |
2806 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | 1581 | unsigned l = min_t(unsigned, len, PAGE_SIZE); |
2807 | if (!_drbd_send_page(mdev, page, 0, l, | 1582 | |
2808 | page_chain_next(page) ? MSG_MORE : 0)) | 1583 | err = _drbd_send_page(mdev, page, 0, l, |
2809 | return 0; | 1584 | page_chain_next(page) ? MSG_MORE : 0); |
1585 | if (err) | ||
1586 | return err; | ||
2810 | len -= l; | 1587 | len -= l; |
2811 | } | 1588 | } |
2812 | return 1; | 1589 | return 0; |
2813 | } | 1590 | } |
2814 | 1591 | ||
2815 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | 1592 | static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) |
2816 | { | 1593 | { |
2817 | if (mdev->agreed_pro_version >= 95) | 1594 | if (mdev->tconn->agreed_pro_version >= 95) |
2818 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | | 1595 | return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | |
2819 | (bi_rw & REQ_FUA ? DP_FUA : 0) | | 1596 | (bi_rw & REQ_FUA ? DP_FUA : 0) | |
2820 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | | 1597 | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | |
@@ -2828,50 +1605,36 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) | |||
2828 | */ | 1605 | */ |
2829 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | 1606 | int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) |
2830 | { | 1607 | { |
2831 | int ok = 1; | 1608 | struct drbd_socket *sock; |
2832 | struct p_data p; | 1609 | struct p_data *p; |
2833 | unsigned int dp_flags = 0; | 1610 | unsigned int dp_flags = 0; |
2834 | void *dgb; | ||
2835 | int dgs; | 1611 | int dgs; |
1612 | int err; | ||
2836 | 1613 | ||
2837 | if (!drbd_get_data_sock(mdev)) | 1614 | sock = &mdev->tconn->data; |
2838 | return 0; | 1615 | p = drbd_prepare_command(mdev, sock); |
2839 | 1616 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; | |
2840 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | ||
2841 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | ||
2842 | |||
2843 | if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { | ||
2844 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2845 | p.head.h80.command = cpu_to_be16(P_DATA); | ||
2846 | p.head.h80.length = | ||
2847 | cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2848 | } else { | ||
2849 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2850 | p.head.h95.command = cpu_to_be16(P_DATA); | ||
2851 | p.head.h95.length = | ||
2852 | cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); | ||
2853 | } | ||
2854 | |||
2855 | p.sector = cpu_to_be64(req->sector); | ||
2856 | p.block_id = (unsigned long)req; | ||
2857 | p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); | ||
2858 | 1617 | ||
1618 | if (!p) | ||
1619 | return -EIO; | ||
1620 | p->sector = cpu_to_be64(req->i.sector); | ||
1621 | p->block_id = (unsigned long)req; | ||
1622 | p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); | ||
2859 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); | 1623 | dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); |
2860 | |||
2861 | if (mdev->state.conn >= C_SYNC_SOURCE && | 1624 | if (mdev->state.conn >= C_SYNC_SOURCE && |
2862 | mdev->state.conn <= C_PAUSED_SYNC_T) | 1625 | mdev->state.conn <= C_PAUSED_SYNC_T) |
2863 | dp_flags |= DP_MAY_SET_IN_SYNC; | 1626 | dp_flags |= DP_MAY_SET_IN_SYNC; |
2864 | 1627 | if (mdev->tconn->agreed_pro_version >= 100) { | |
2865 | p.dp_flags = cpu_to_be32(dp_flags); | 1628 | if (req->rq_state & RQ_EXP_RECEIVE_ACK) |
2866 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 1629 | dp_flags |= DP_SEND_RECEIVE_ACK; |
2867 | ok = (sizeof(p) == | 1630 | if (req->rq_state & RQ_EXP_WRITE_ACK) |
2868 | drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); | 1631 | dp_flags |= DP_SEND_WRITE_ACK; |
2869 | if (ok && dgs) { | 1632 | } |
2870 | dgb = mdev->int_dig_out; | 1633 | p->dp_flags = cpu_to_be32(dp_flags); |
2871 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); | 1634 | if (dgs) |
2872 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | 1635 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1); |
2873 | } | 1636 | err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); |
2874 | if (ok) { | 1637 | if (!err) { |
2875 | /* For protocol A, we have to memcpy the payload into | 1638 | /* For protocol A, we have to memcpy the payload into |
2876 | * socket buffers, as we may complete right away | 1639 | * socket buffers, as we may complete right away |
2877 | * as soon as we handed it over to tcp, at which point the data | 1640 | * as soon as we handed it over to tcp, at which point the data |
@@ -2883,92 +1646,76 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) | |||
2883 | * out ok after sending on this side, but does not fit on the | 1646 | * out ok after sending on this side, but does not fit on the |
2884 | * receiving side, we sure have detected corruption elsewhere. | 1647 | * receiving side, we sure have detected corruption elsewhere. |
2885 | */ | 1648 | */ |
2886 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs) | 1649 | if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) |
2887 | ok = _drbd_send_bio(mdev, req->master_bio); | 1650 | err = _drbd_send_bio(mdev, req->master_bio); |
2888 | else | 1651 | else |
2889 | ok = _drbd_send_zc_bio(mdev, req->master_bio); | 1652 | err = _drbd_send_zc_bio(mdev, req->master_bio); |
2890 | 1653 | ||
2891 | /* double check digest, sometimes buffers have been modified in flight. */ | 1654 | /* double check digest, sometimes buffers have been modified in flight. */ |
2892 | if (dgs > 0 && dgs <= 64) { | 1655 | if (dgs > 0 && dgs <= 64) { |
2893 | /* 64 byte, 512 bit, is the largest digest size | 1656 | /* 64 byte, 512 bit, is the largest digest size |
2894 | * currently supported in kernel crypto. */ | 1657 | * currently supported in kernel crypto. */ |
2895 | unsigned char digest[64]; | 1658 | unsigned char digest[64]; |
2896 | drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); | 1659 | drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest); |
2897 | if (memcmp(mdev->int_dig_out, digest, dgs)) { | 1660 | if (memcmp(p + 1, digest, dgs)) { |
2898 | dev_warn(DEV, | 1661 | dev_warn(DEV, |
2899 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", | 1662 | "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", |
2900 | (unsigned long long)req->sector, req->size); | 1663 | (unsigned long long)req->i.sector, req->i.size); |
2901 | } | 1664 | } |
2902 | } /* else if (dgs > 64) { | 1665 | } /* else if (dgs > 64) { |
2903 | ... Be noisy about digest too large ... | 1666 | ... Be noisy about digest too large ... |
2904 | } */ | 1667 | } */ |
2905 | } | 1668 | } |
1669 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ | ||
2906 | 1670 | ||
2907 | drbd_put_data_sock(mdev); | 1671 | return err; |
2908 | |||
2909 | return ok; | ||
2910 | } | 1672 | } |
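The block comment inside drbd_send_dblock() carries the key reasoning: when the request will not wait for an explicit ack from the peer, or when a digest was computed over the bio, the payload is copied into the socket buffers; only otherwise is the zero-copy path safe. The mapping of the RQ_EXP_* flags onto the wire protocols below is an inference, not spelled out in this hunk:

	/* protocol A: no ack expected (neither flag set)  -> memcpy the payload
	 * protocol B: RQ_EXP_RECEIVE_ACK                  -> zero-copy is safe
	 * protocol C: RQ_EXP_WRITE_ACK                    -> zero-copy is safe
	 * with a digest, always memcpy so the sent bytes match what was hashed */
	if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
		err = _drbd_send_bio(mdev, req->master_bio);
	else
		err = _drbd_send_zc_bio(mdev, req->master_bio);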
2911 | 1673 | ||
2912 | /* answer packet, used to send data back for read requests: | 1674 | /* answer packet, used to send data back for read requests: |
2913 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) | 1675 | * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) |
2914 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) | 1676 | * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) |
2915 | */ | 1677 | */ |
2916 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, | 1678 | int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd, |
2917 | struct drbd_epoch_entry *e) | 1679 | struct drbd_peer_request *peer_req) |
2918 | { | 1680 | { |
2919 | int ok; | 1681 | struct drbd_socket *sock; |
2920 | struct p_data p; | 1682 | struct p_data *p; |
2921 | void *dgb; | 1683 | int err; |
2922 | int dgs; | 1684 | int dgs; |
2923 | 1685 | ||
2924 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? | 1686 | sock = &mdev->tconn->data; |
2925 | crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; | 1687 | p = drbd_prepare_command(mdev, sock); |
2926 | |||
2927 | if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { | ||
2928 | p.head.h80.magic = BE_DRBD_MAGIC; | ||
2929 | p.head.h80.command = cpu_to_be16(cmd); | ||
2930 | p.head.h80.length = | ||
2931 | cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2932 | } else { | ||
2933 | p.head.h95.magic = BE_DRBD_MAGIC_BIG; | ||
2934 | p.head.h95.command = cpu_to_be16(cmd); | ||
2935 | p.head.h95.length = | ||
2936 | cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); | ||
2937 | } | ||
2938 | |||
2939 | p.sector = cpu_to_be64(e->sector); | ||
2940 | p.block_id = e->block_id; | ||
2941 | /* p.seq_num = 0; No sequence numbers here.. */ | ||
2942 | |||
2943 | /* Only called by our kernel thread. | ||
2944 | * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL | ||
2945 | * in response to admin command or module unload. | ||
2946 | */ | ||
2947 | if (!drbd_get_data_sock(mdev)) | ||
2948 | return 0; | ||
2949 | 1688 | ||
2950 | ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); | 1689 | dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; |
2951 | if (ok && dgs) { | ||
2952 | dgb = mdev->int_dig_out; | ||
2953 | drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); | ||
2954 | ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); | ||
2955 | } | ||
2956 | if (ok) | ||
2957 | ok = _drbd_send_zc_ee(mdev, e); | ||
2958 | 1690 | ||
2959 | drbd_put_data_sock(mdev); | 1691 | if (!p) |
1692 | return -EIO; | ||
1693 | p->sector = cpu_to_be64(peer_req->i.sector); | ||
1694 | p->block_id = peer_req->block_id; | ||
1695 | p->seq_num = 0; /* unused */ | ||
1696 | p->dp_flags = 0; | ||
1697 | if (dgs) | ||
1698 | drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1); | ||
1699 | err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); | ||
1700 | if (!err) | ||
1701 | err = _drbd_send_zc_ee(mdev, peer_req); | ||
1702 | mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ | ||
2960 | 1703 | ||
2961 | return ok; | 1704 | return err; |
2962 | } | 1705 | } |
2963 | 1706 | ||
2964 | int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | 1707 | int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) |
2965 | { | 1708 | { |
2966 | struct p_block_desc p; | 1709 | struct drbd_socket *sock; |
2967 | 1710 | struct p_block_desc *p; | |
2968 | p.sector = cpu_to_be64(req->sector); | ||
2969 | p.blksize = cpu_to_be32(req->size); | ||
2970 | 1711 | ||
2971 | return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p)); | 1712 | sock = &mdev->tconn->data; |
1713 | p = drbd_prepare_command(mdev, sock); | ||
1714 | if (!p) | ||
1715 | return -EIO; | ||
1716 | p->sector = cpu_to_be64(req->i.sector); | ||
1717 | p->blksize = cpu_to_be32(req->i.size); | ||
1718 | return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); | ||
2972 | } | 1719 | } |
2973 | 1720 | ||
2974 | /* | 1721 | /* |
@@ -2987,7 +1734,7 @@ int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) | |||
2987 | /* | 1734 | /* |
2988 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! | 1735 | * you must have down()ed the appropriate [m]sock_mutex elsewhere! |
2989 | */ | 1736 | */ |
2990 | int drbd_send(struct drbd_conf *mdev, struct socket *sock, | 1737 | int drbd_send(struct drbd_tconn *tconn, struct socket *sock, |
2991 | void *buf, size_t size, unsigned msg_flags) | 1738 | void *buf, size_t size, unsigned msg_flags) |
2992 | { | 1739 | { |
2993 | struct kvec iov; | 1740 | struct kvec iov; |
@@ -2995,7 +1742,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
2995 | int rv, sent = 0; | 1742 | int rv, sent = 0; |
2996 | 1743 | ||
2997 | if (!sock) | 1744 | if (!sock) |
2998 | return -1000; | 1745 | return -EBADR; |
2999 | 1746 | ||
3000 | /* THINK if (signal_pending) return ... ? */ | 1747 | /* THINK if (signal_pending) return ... ? */ |
3001 | 1748 | ||
@@ -3008,9 +1755,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3008 | msg.msg_controllen = 0; | 1755 | msg.msg_controllen = 0; |
3009 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; | 1756 | msg.msg_flags = msg_flags | MSG_NOSIGNAL; |
3010 | 1757 | ||
3011 | if (sock == mdev->data.socket) { | 1758 | if (sock == tconn->data.socket) { |
3012 | mdev->ko_count = mdev->net_conf->ko_count; | 1759 | rcu_read_lock(); |
3013 | drbd_update_congested(mdev); | 1760 | tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count; |
1761 | rcu_read_unlock(); | ||
1762 | drbd_update_congested(tconn); | ||
3014 | } | 1763 | } |
3015 | do { | 1764 | do { |
3016 | /* STRANGE | 1765 | /* STRANGE |
@@ -3024,12 +1773,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3024 | */ | 1773 | */ |
3025 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); | 1774 | rv = kernel_sendmsg(sock, &msg, &iov, 1, size); |
3026 | if (rv == -EAGAIN) { | 1775 | if (rv == -EAGAIN) { |
3027 | if (we_should_drop_the_connection(mdev, sock)) | 1776 | if (we_should_drop_the_connection(tconn, sock)) |
3028 | break; | 1777 | break; |
3029 | else | 1778 | else |
3030 | continue; | 1779 | continue; |
3031 | } | 1780 | } |
3032 | D_ASSERT(rv != 0); | ||
3033 | if (rv == -EINTR) { | 1781 | if (rv == -EINTR) { |
3034 | flush_signals(current); | 1782 | flush_signals(current); |
3035 | rv = 0; | 1783 | rv = 0; |
@@ -3041,22 +1789,40 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock, | |||
3041 | iov.iov_len -= rv; | 1789 | iov.iov_len -= rv; |
3042 | } while (sent < size); | 1790 | } while (sent < size); |
3043 | 1791 | ||
3044 | if (sock == mdev->data.socket) | 1792 | if (sock == tconn->data.socket) |
3045 | clear_bit(NET_CONGESTED, &mdev->flags); | 1793 | clear_bit(NET_CONGESTED, &tconn->flags); |
3046 | 1794 | ||
3047 | if (rv <= 0) { | 1795 | if (rv <= 0) { |
3048 | if (rv != -EAGAIN) { | 1796 | if (rv != -EAGAIN) { |
3049 | dev_err(DEV, "%s_sendmsg returned %d\n", | 1797 | conn_err(tconn, "%s_sendmsg returned %d\n", |
3050 | sock == mdev->meta.socket ? "msock" : "sock", | 1798 | sock == tconn->meta.socket ? "msock" : "sock", |
3051 | rv); | 1799 | rv); |
3052 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 1800 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
3053 | } else | 1801 | } else |
3054 | drbd_force_state(mdev, NS(conn, C_TIMEOUT)); | 1802 | conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD); |
3055 | } | 1803 | } |
3056 | 1804 | ||
3057 | return sent; | 1805 | return sent; |
3058 | } | 1806 | } |
3059 | 1807 | ||
1808 | /** | ||
1809 | * drbd_send_all - Send an entire buffer | ||
1810 | * | ||
1811 | * Returns 0 upon success and a negative error value otherwise. | ||
1812 | */ | ||
1813 | int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer, | ||
1814 | size_t size, unsigned msg_flags) | ||
1815 | { | ||
1816 | int err; | ||
1817 | |||
1818 | err = drbd_send(tconn, sock, buffer, size, msg_flags); | ||
1819 | if (err < 0) | ||
1820 | return err; | ||
1821 | if (err != size) | ||
1822 | return -EIO; | ||
1823 | return 0; | ||
1824 | } | ||
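drbd_send_all() narrows drbd_send()'s "bytes sent or negative error" return down to 0 on complete success and a negative error otherwise, which is what the converted senders above actually check. Typical call site, as in _drbd_no_send_page() (buf and size stand for any caller-owned buffer, hypothetical names):

	err = drbd_send_all(tconn, sock, buf, size, 0);
	if (err)
		return err;	/* a short send already comes back as -EIO */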
1825 | |||
3060 | static int drbd_open(struct block_device *bdev, fmode_t mode) | 1826 | static int drbd_open(struct block_device *bdev, fmode_t mode) |
3061 | { | 1827 | { |
3062 | struct drbd_conf *mdev = bdev->bd_disk->private_data; | 1828 | struct drbd_conf *mdev = bdev->bd_disk->private_data; |
@@ -3064,7 +1830,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3064 | int rv = 0; | 1830 | int rv = 0; |
3065 | 1831 | ||
3066 | mutex_lock(&drbd_main_mutex); | 1832 | mutex_lock(&drbd_main_mutex); |
3067 | spin_lock_irqsave(&mdev->req_lock, flags); | 1833 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
3068 | /* to have a stable mdev->state.role | 1834 | /* to have a stable mdev->state.role |
3069 | * and no race with updating open_cnt */ | 1835 | * and no race with updating open_cnt */ |
3070 | 1836 | ||
@@ -3077,7 +1843,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) | |||
3077 | 1843 | ||
3078 | if (!rv) | 1844 | if (!rv) |
3079 | mdev->open_cnt++; | 1845 | mdev->open_cnt++; |
3080 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 1846 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
3081 | mutex_unlock(&drbd_main_mutex); | 1847 | mutex_unlock(&drbd_main_mutex); |
3082 | 1848 | ||
3083 | return rv; | 1849 | return rv; |
@@ -3094,35 +1860,14 @@ static int drbd_release(struct gendisk *gd, fmode_t mode) | |||
3094 | 1860 | ||
3095 | static void drbd_set_defaults(struct drbd_conf *mdev) | 1861 | static void drbd_set_defaults(struct drbd_conf *mdev) |
3096 | { | 1862 | { |
3097 | /* This way we get a compile error when sync_conf grows, | 1863 | /* Beware! The actual layout differs |
3098 | and we forgot to initialize it here */ | 1864 | * between big endian and little endian */ |
3099 | mdev->sync_conf = (struct syncer_conf) { | 1865 | mdev->state = (union drbd_dev_state) { |
3100 | /* .rate = */ DRBD_RATE_DEF, | ||
3101 | /* .after = */ DRBD_AFTER_DEF, | ||
3102 | /* .al_extents = */ DRBD_AL_EXTENTS_DEF, | ||
3103 | /* .verify_alg = */ {}, 0, | ||
3104 | /* .cpu_mask = */ {}, 0, | ||
3105 | /* .csums_alg = */ {}, 0, | ||
3106 | /* .use_rle = */ 0, | ||
3107 | /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, | ||
3108 | /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, | ||
3109 | /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, | ||
3110 | /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, | ||
3111 | /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, | ||
3112 | /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF | ||
3113 | }; | ||
3114 | |||
3115 | /* Have to use that way, because the layout differs between | ||
3116 | big endian and little endian */ | ||
3117 | mdev->state = (union drbd_state) { | ||
3118 | { .role = R_SECONDARY, | 1866 | { .role = R_SECONDARY, |
3119 | .peer = R_UNKNOWN, | 1867 | .peer = R_UNKNOWN, |
3120 | .conn = C_STANDALONE, | 1868 | .conn = C_STANDALONE, |
3121 | .disk = D_DISKLESS, | 1869 | .disk = D_DISKLESS, |
3122 | .pdsk = D_UNKNOWN, | 1870 | .pdsk = D_UNKNOWN, |
3123 | .susp = 0, | ||
3124 | .susp_nod = 0, | ||
3125 | .susp_fen = 0 | ||
3126 | } }; | 1871 | } }; |
3127 | } | 1872 | } |
3128 | 1873 | ||
@@ -3138,28 +1883,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3138 | atomic_set(&mdev->rs_pending_cnt, 0); | 1883 | atomic_set(&mdev->rs_pending_cnt, 0); |
3139 | atomic_set(&mdev->unacked_cnt, 0); | 1884 | atomic_set(&mdev->unacked_cnt, 0); |
3140 | atomic_set(&mdev->local_cnt, 0); | 1885 | atomic_set(&mdev->local_cnt, 0); |
3141 | atomic_set(&mdev->net_cnt, 0); | ||
3142 | atomic_set(&mdev->packet_seq, 0); | ||
3143 | atomic_set(&mdev->pp_in_use, 0); | ||
3144 | atomic_set(&mdev->pp_in_use_by_net, 0); | 1886 | atomic_set(&mdev->pp_in_use_by_net, 0); |
3145 | atomic_set(&mdev->rs_sect_in, 0); | 1887 | atomic_set(&mdev->rs_sect_in, 0); |
3146 | atomic_set(&mdev->rs_sect_ev, 0); | 1888 | atomic_set(&mdev->rs_sect_ev, 0); |
3147 | atomic_set(&mdev->ap_in_flight, 0); | 1889 | atomic_set(&mdev->ap_in_flight, 0); |
3148 | atomic_set(&mdev->md_io_in_use, 0); | 1890 | atomic_set(&mdev->md_io_in_use, 0); |
3149 | 1891 | ||
3150 | mutex_init(&mdev->data.mutex); | 1892 | mutex_init(&mdev->own_state_mutex); |
3151 | mutex_init(&mdev->meta.mutex); | 1893 | mdev->state_mutex = &mdev->own_state_mutex; |
3152 | sema_init(&mdev->data.work.s, 0); | ||
3153 | sema_init(&mdev->meta.work.s, 0); | ||
3154 | mutex_init(&mdev->state_mutex); | ||
3155 | |||
3156 | spin_lock_init(&mdev->data.work.q_lock); | ||
3157 | spin_lock_init(&mdev->meta.work.q_lock); | ||
3158 | 1894 | ||
3159 | spin_lock_init(&mdev->al_lock); | 1895 | spin_lock_init(&mdev->al_lock); |
3160 | spin_lock_init(&mdev->req_lock); | ||
3161 | spin_lock_init(&mdev->peer_seq_lock); | 1896 | spin_lock_init(&mdev->peer_seq_lock); |
3162 | spin_lock_init(&mdev->epoch_lock); | ||
3163 | 1897 | ||
3164 | INIT_LIST_HEAD(&mdev->active_ee); | 1898 | INIT_LIST_HEAD(&mdev->active_ee); |
3165 | INIT_LIST_HEAD(&mdev->sync_ee); | 1899 | INIT_LIST_HEAD(&mdev->sync_ee); |
@@ -3167,8 +1901,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3167 | INIT_LIST_HEAD(&mdev->read_ee); | 1901 | INIT_LIST_HEAD(&mdev->read_ee); |
3168 | INIT_LIST_HEAD(&mdev->net_ee); | 1902 | INIT_LIST_HEAD(&mdev->net_ee); |
3169 | INIT_LIST_HEAD(&mdev->resync_reads); | 1903 | INIT_LIST_HEAD(&mdev->resync_reads); |
3170 | INIT_LIST_HEAD(&mdev->data.work.q); | ||
3171 | INIT_LIST_HEAD(&mdev->meta.work.q); | ||
3172 | INIT_LIST_HEAD(&mdev->resync_work.list); | 1904 | INIT_LIST_HEAD(&mdev->resync_work.list); |
3173 | INIT_LIST_HEAD(&mdev->unplug_work.list); | 1905 | INIT_LIST_HEAD(&mdev->unplug_work.list); |
3174 | INIT_LIST_HEAD(&mdev->go_diskless.list); | 1906 | INIT_LIST_HEAD(&mdev->go_diskless.list); |
@@ -3182,6 +1914,14 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3182 | mdev->md_sync_work.cb = w_md_sync; | 1914 | mdev->md_sync_work.cb = w_md_sync; |
3183 | mdev->bm_io_work.w.cb = w_bitmap_io; | 1915 | mdev->bm_io_work.w.cb = w_bitmap_io; |
3184 | mdev->start_resync_work.cb = w_start_resync; | 1916 | mdev->start_resync_work.cb = w_start_resync; |
1917 | |||
1918 | mdev->resync_work.mdev = mdev; | ||
1919 | mdev->unplug_work.mdev = mdev; | ||
1920 | mdev->go_diskless.mdev = mdev; | ||
1921 | mdev->md_sync_work.mdev = mdev; | ||
1922 | mdev->bm_io_work.w.mdev = mdev; | ||
1923 | mdev->start_resync_work.mdev = mdev; | ||
1924 | |||
3185 | init_timer(&mdev->resync_timer); | 1925 | init_timer(&mdev->resync_timer); |
3186 | init_timer(&mdev->md_sync_timer); | 1926 | init_timer(&mdev->md_sync_timer); |
3187 | init_timer(&mdev->start_resync_timer); | 1927 | init_timer(&mdev->start_resync_timer); |
@@ -3197,17 +1937,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3197 | 1937 | ||
3198 | init_waitqueue_head(&mdev->misc_wait); | 1938 | init_waitqueue_head(&mdev->misc_wait); |
3199 | init_waitqueue_head(&mdev->state_wait); | 1939 | init_waitqueue_head(&mdev->state_wait); |
3200 | init_waitqueue_head(&mdev->net_cnt_wait); | ||
3201 | init_waitqueue_head(&mdev->ee_wait); | 1940 | init_waitqueue_head(&mdev->ee_wait); |
3202 | init_waitqueue_head(&mdev->al_wait); | 1941 | init_waitqueue_head(&mdev->al_wait); |
3203 | init_waitqueue_head(&mdev->seq_wait); | 1942 | init_waitqueue_head(&mdev->seq_wait); |
3204 | 1943 | ||
3205 | drbd_thread_init(mdev, &mdev->receiver, drbdd_init); | ||
3206 | drbd_thread_init(mdev, &mdev->worker, drbd_worker); | ||
3207 | drbd_thread_init(mdev, &mdev->asender, drbd_asender); | ||
3208 | |||
3209 | mdev->agreed_pro_version = PRO_VERSION_MAX; | ||
3210 | mdev->write_ordering = WO_bdev_flush; | ||
3211 | mdev->resync_wenr = LC_FREE; | 1944 | mdev->resync_wenr = LC_FREE; |
3212 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1945 | mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
3213 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; | 1946 | mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; |
@@ -3216,13 +1949,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) | |||
3216 | void drbd_mdev_cleanup(struct drbd_conf *mdev) | 1949 | void drbd_mdev_cleanup(struct drbd_conf *mdev) |
3217 | { | 1950 | { |
3218 | int i; | 1951 | int i; |
3219 | if (mdev->receiver.t_state != None) | 1952 | if (mdev->tconn->receiver.t_state != NONE) |
3220 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", | 1953 | dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", |
3221 | mdev->receiver.t_state); | 1954 | mdev->tconn->receiver.t_state); |
3222 | 1955 | ||
3223 | /* no need to lock it, I'm the only thread alive */ | ||
3224 | if (atomic_read(&mdev->current_epoch->epoch_size) != 0) | ||
3225 | dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); | ||
3226 | mdev->al_writ_cnt = | 1956 | mdev->al_writ_cnt = |
3227 | mdev->bm_writ_cnt = | 1957 | mdev->bm_writ_cnt = |
3228 | mdev->read_cnt = | 1958 | mdev->read_cnt = |
@@ -3239,7 +1969,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3239 | mdev->rs_mark_left[i] = 0; | 1969 | mdev->rs_mark_left[i] = 0; |
3240 | mdev->rs_mark_time[i] = 0; | 1970 | mdev->rs_mark_time[i] = 0; |
3241 | } | 1971 | } |
3242 | D_ASSERT(mdev->net_conf == NULL); | 1972 | D_ASSERT(mdev->tconn->net_conf == NULL); |
3243 | 1973 | ||
3244 | drbd_set_my_capacity(mdev, 0); | 1974 | drbd_set_my_capacity(mdev, 0); |
3245 | if (mdev->bitmap) { | 1975 | if (mdev->bitmap) { |
@@ -3248,21 +1978,18 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) | |||
3248 | drbd_bm_cleanup(mdev); | 1978 | drbd_bm_cleanup(mdev); |
3249 | } | 1979 | } |
3250 | 1980 | ||
3251 | drbd_free_resources(mdev); | 1981 | drbd_free_bc(mdev->ldev); |
1982 | mdev->ldev = NULL; | ||
1983 | |||
3252 | clear_bit(AL_SUSPENDED, &mdev->flags); | 1984 | clear_bit(AL_SUSPENDED, &mdev->flags); |
3253 | 1985 | ||
3254 | /* | ||
3255 | * currently we drbd_init_ee only on module load, so | ||
3256 | * we may do drbd_release_ee only on module unload! | ||
3257 | */ | ||
3258 | D_ASSERT(list_empty(&mdev->active_ee)); | 1986 | D_ASSERT(list_empty(&mdev->active_ee)); |
3259 | D_ASSERT(list_empty(&mdev->sync_ee)); | 1987 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3260 | D_ASSERT(list_empty(&mdev->done_ee)); | 1988 | D_ASSERT(list_empty(&mdev->done_ee)); |
3261 | D_ASSERT(list_empty(&mdev->read_ee)); | 1989 | D_ASSERT(list_empty(&mdev->read_ee)); |
3262 | D_ASSERT(list_empty(&mdev->net_ee)); | 1990 | D_ASSERT(list_empty(&mdev->net_ee)); |
3263 | D_ASSERT(list_empty(&mdev->resync_reads)); | 1991 | D_ASSERT(list_empty(&mdev->resync_reads)); |
3264 | D_ASSERT(list_empty(&mdev->data.work.q)); | 1992 | D_ASSERT(list_empty(&mdev->tconn->sender_work.q)); |
3265 | D_ASSERT(list_empty(&mdev->meta.work.q)); | ||
3266 | D_ASSERT(list_empty(&mdev->resync_work.list)); | 1993 | D_ASSERT(list_empty(&mdev->resync_work.list)); |
3267 | D_ASSERT(list_empty(&mdev->unplug_work.list)); | 1994 | D_ASSERT(list_empty(&mdev->unplug_work.list)); |
3268 | D_ASSERT(list_empty(&mdev->go_diskless.list)); | 1995 | D_ASSERT(list_empty(&mdev->go_diskless.list)); |
@@ -3336,7 +2063,7 @@ static int drbd_create_mempools(void) | |||
3336 | goto Enomem; | 2063 | goto Enomem; |
3337 | 2064 | ||
3338 | drbd_ee_cache = kmem_cache_create( | 2065 | drbd_ee_cache = kmem_cache_create( |
3339 | "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); | 2066 | "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); |
3340 | if (drbd_ee_cache == NULL) | 2067 | if (drbd_ee_cache == NULL) |
3341 | goto Enomem; | 2068 | goto Enomem; |
3342 | 2069 | ||
@@ -3351,11 +2078,9 @@ static int drbd_create_mempools(void) | |||
3351 | goto Enomem; | 2078 | goto Enomem; |
3352 | 2079 | ||
3353 | /* mempools */ | 2080 | /* mempools */ |
3354 | #ifdef COMPAT_HAVE_BIOSET_CREATE | ||
3355 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); | 2081 | drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); |
3356 | if (drbd_md_io_bio_set == NULL) | 2082 | if (drbd_md_io_bio_set == NULL) |
3357 | goto Enomem; | 2083 | goto Enomem; |
3358 | #endif | ||
3359 | 2084 | ||
3360 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); | 2085 | drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); |
3361 | if (drbd_md_io_page_pool == NULL) | 2086 | if (drbd_md_io_page_pool == NULL) |
@@ -3404,73 +2129,53 @@ static struct notifier_block drbd_notifier = { | |||
3404 | .notifier_call = drbd_notify_sys, | 2129 | .notifier_call = drbd_notify_sys, |
3405 | }; | 2130 | }; |
3406 | 2131 | ||
3407 | static void drbd_release_ee_lists(struct drbd_conf *mdev) | 2132 | static void drbd_release_all_peer_reqs(struct drbd_conf *mdev) |
3408 | { | 2133 | { |
3409 | int rr; | 2134 | int rr; |
3410 | 2135 | ||
3411 | rr = drbd_release_ee(mdev, &mdev->active_ee); | 2136 | rr = drbd_free_peer_reqs(mdev, &mdev->active_ee); |
3412 | if (rr) | 2137 | if (rr) |
3413 | dev_err(DEV, "%d EEs in active list found!\n", rr); | 2138 | dev_err(DEV, "%d EEs in active list found!\n", rr); |
3414 | 2139 | ||
3415 | rr = drbd_release_ee(mdev, &mdev->sync_ee); | 2140 | rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee); |
3416 | if (rr) | 2141 | if (rr) |
3417 | dev_err(DEV, "%d EEs in sync list found!\n", rr); | 2142 | dev_err(DEV, "%d EEs in sync list found!\n", rr); |
3418 | 2143 | ||
3419 | rr = drbd_release_ee(mdev, &mdev->read_ee); | 2144 | rr = drbd_free_peer_reqs(mdev, &mdev->read_ee); |
3420 | if (rr) | 2145 | if (rr) |
3421 | dev_err(DEV, "%d EEs in read list found!\n", rr); | 2146 | dev_err(DEV, "%d EEs in read list found!\n", rr); |
3422 | 2147 | ||
3423 | rr = drbd_release_ee(mdev, &mdev->done_ee); | 2148 | rr = drbd_free_peer_reqs(mdev, &mdev->done_ee); |
3424 | if (rr) | 2149 | if (rr) |
3425 | dev_err(DEV, "%d EEs in done list found!\n", rr); | 2150 | dev_err(DEV, "%d EEs in done list found!\n", rr); |
3426 | 2151 | ||
3427 | rr = drbd_release_ee(mdev, &mdev->net_ee); | 2152 | rr = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3428 | if (rr) | 2153 | if (rr) |
3429 | dev_err(DEV, "%d EEs in net list found!\n", rr); | 2154 | dev_err(DEV, "%d EEs in net list found!\n", rr); |
3430 | } | 2155 | } |
3431 | 2156 | ||
3432 | /* caution. no locking. | 2157 | /* caution. no locking. */ |
3433 | * currently only used from module cleanup code. */ | 2158 | void drbd_minor_destroy(struct kref *kref) |
3434 | static void drbd_delete_device(unsigned int minor) | ||
3435 | { | 2159 | { |
3436 | struct drbd_conf *mdev = minor_to_mdev(minor); | 2160 | struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref); |
3437 | 2161 | struct drbd_tconn *tconn = mdev->tconn; | |
3438 | if (!mdev) | ||
3439 | return; | ||
3440 | 2162 | ||
3441 | del_timer_sync(&mdev->request_timer); | 2163 | del_timer_sync(&mdev->request_timer); |
3442 | 2164 | ||
3443 | /* paranoia asserts */ | 2165 | /* paranoia asserts */ |
3444 | if (mdev->open_cnt != 0) | 2166 | D_ASSERT(mdev->open_cnt == 0); |
3445 | dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, | ||
3446 | __FILE__ , __LINE__); | ||
3447 | |||
3448 | ERR_IF (!list_empty(&mdev->data.work.q)) { | ||
3449 | struct list_head *lp; | ||
3450 | list_for_each(lp, &mdev->data.work.q) { | ||
3451 | dev_err(DEV, "lp = %p\n", lp); | ||
3452 | } | ||
3453 | }; | ||
3454 | /* end paranoia asserts */ | 2167 | /* end paranoia asserts */ |
3455 | 2168 | ||
3456 | del_gendisk(mdev->vdisk); | ||
3457 | |||
3458 | /* cleanup stuff that may have been allocated during | 2169 | /* cleanup stuff that may have been allocated during |
3459 | * device (re-)configuration or state changes */ | 2170 | * device (re-)configuration or state changes */ |
3460 | 2171 | ||
3461 | if (mdev->this_bdev) | 2172 | if (mdev->this_bdev) |
3462 | bdput(mdev->this_bdev); | 2173 | bdput(mdev->this_bdev); |
3463 | 2174 | ||
3464 | drbd_free_resources(mdev); | 2175 | drbd_free_bc(mdev->ldev); |
2176 | mdev->ldev = NULL; | ||
3465 | 2177 | ||
3466 | drbd_release_ee_lists(mdev); | 2178 | drbd_release_all_peer_reqs(mdev); |
3467 | |||
3468 | /* should be freed on disconnect? */ | ||
3469 | kfree(mdev->ee_hash); | ||
3470 | /* | ||
3471 | mdev->ee_hash_s = 0; | ||
3472 | mdev->ee_hash = NULL; | ||
3473 | */ | ||
3474 | 2179 | ||
3475 | lc_destroy(mdev->act_log); | 2180 | lc_destroy(mdev->act_log); |
3476 | lc_destroy(mdev->resync); | 2181 | lc_destroy(mdev->resync); |
@@ -3478,19 +2183,101 @@ static void drbd_delete_device(unsigned int minor) | |||
3478 | kfree(mdev->p_uuid); | 2183 | kfree(mdev->p_uuid); |
3479 | /* mdev->p_uuid = NULL; */ | 2184 | /* mdev->p_uuid = NULL; */ |
3480 | 2185 | ||
3481 | kfree(mdev->int_dig_out); | 2186 | if (mdev->bitmap) /* should no longer be there. */ |
3482 | kfree(mdev->int_dig_in); | 2187 | drbd_bm_cleanup(mdev); |
3483 | kfree(mdev->int_dig_vv); | 2188 | __free_page(mdev->md_io_page); |
2189 | put_disk(mdev->vdisk); | ||
2190 | blk_cleanup_queue(mdev->rq_queue); | ||
2191 | kfree(mdev->rs_plan_s); | ||
2192 | kfree(mdev); | ||
3484 | 2193 | ||
3485 | /* cleanup the rest that has been | 2194 | kref_put(&tconn->kref, &conn_destroy); |
3486 | * allocated from drbd_new_device | ||
3487 | * and actually free the mdev itself */ | ||
3488 | drbd_free_mdev(mdev); | ||
3489 | } | 2195 | } |
3490 | 2196 | ||
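The hunk above replaces the old explicit drbd_delete_device() teardown with reference counting: the final kref_put() on a minor runs drbd_minor_destroy(), which frees the device and then drops the reference it held on its tconn. A minimal sketch of that ownership chain, using simplified stand-in structs rather than the real drbd types:

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/slab.h>

    struct conn {                       /* stand-in for drbd_tconn */
            struct kref kref;           /* kref_init() at creation */
    };

    struct minor {                      /* stand-in for drbd_conf */
            struct kref kref;
            struct conn *conn;          /* holds one reference on conn */
    };

    static void conn_release(struct kref *kref)
    {
            kfree(container_of(kref, struct conn, kref));
    }

    static void minor_release(struct kref *kref)
    {
            struct minor *m = container_of(kref, struct minor, kref);
            struct conn *c = m->conn;

            kfree(m);                            /* free the minor first ...    */
            kref_put(&c->kref, conn_release);    /* ... then drop its conn ref  */
    }

    /* every holder simply does: kref_put(&m->kref, minor_release); */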
2197 | /* One global retry thread, if we need to push back some bio and have it | ||
2198 | * reinserted through our make request function. | ||
2199 | */ | ||
2200 | static struct retry_worker { | ||
2201 | struct workqueue_struct *wq; | ||
2202 | struct work_struct worker; | ||
2203 | |||
2204 | spinlock_t lock; | ||
2205 | struct list_head writes; | ||
2206 | } retry; | ||
2207 | |||
2208 | static void do_retry(struct work_struct *ws) | ||
2209 | { | ||
2210 | struct retry_worker *retry = container_of(ws, struct retry_worker, worker); | ||
2211 | LIST_HEAD(writes); | ||
2212 | struct drbd_request *req, *tmp; | ||
2213 | |||
2214 | spin_lock_irq(&retry->lock); | ||
2215 | list_splice_init(&retry->writes, &writes); | ||
2216 | spin_unlock_irq(&retry->lock); | ||
2217 | |||
2218 | list_for_each_entry_safe(req, tmp, &writes, tl_requests) { | ||
2219 | struct drbd_conf *mdev = req->w.mdev; | ||
2220 | struct bio *bio = req->master_bio; | ||
2221 | unsigned long start_time = req->start_time; | ||
2222 | bool expected; | ||
2223 | |||
2224 | expected = | ||
2225 | expect(atomic_read(&req->completion_ref) == 0) && | ||
2226 | expect(req->rq_state & RQ_POSTPONED) && | ||
2227 | expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || | ||
2228 | (req->rq_state & RQ_LOCAL_ABORTED) != 0); | ||
2229 | |||
2230 | if (!expected) | ||
2231 | dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n", | ||
2232 | req, atomic_read(&req->completion_ref), | ||
2233 | req->rq_state); | ||
2234 | |||
2235 | /* We still need to put one kref associated with the | ||
2236 | * "completion_ref" going zero in the code path that queued it | ||
2237 | * here. The request object may still be referenced by a | ||
2238 | * frozen local req->private_bio, in case we force-detached. | ||
2239 | */ | ||
2240 | kref_put(&req->kref, drbd_req_destroy); | ||
2241 | |||
2242 | /* A single suspended or otherwise blocking device may stall | ||
2243 | * all others as well. Fortunately, this code path is to | ||
2244 | * recover from a situation that "should not happen": | ||
2245 | * concurrent writes in multi-primary setup. | ||
2246 | * In a "normal" lifecycle, this workqueue is supposed to be | ||
2247 | * destroyed without ever doing anything. | ||
2248 | * If it turns out to be an issue anyways, we can do per | ||
2249 | * resource (replication group) or per device (minor) retry | ||
2250 | * workqueues instead. | ||
2251 | */ | ||
2252 | |||
2253 | /* We are not just doing generic_make_request(), | ||
2254 | * as we want to keep the start_time information. */ | ||
2255 | inc_ap_bio(mdev); | ||
2256 | __drbd_make_request(mdev, bio, start_time); | ||
2257 | } | ||
2258 | } | ||
2259 | |||
2260 | void drbd_restart_request(struct drbd_request *req) | ||
2261 | { | ||
2262 | unsigned long flags; | ||
2263 | spin_lock_irqsave(&retry.lock, flags); | ||
2264 | list_move_tail(&req->tl_requests, &retry.writes); | ||
2265 | spin_unlock_irqrestore(&retry.lock, flags); | ||
2266 | |||
2267 | /* Drop the extra reference that would otherwise | ||
2268 | * have been dropped by complete_master_bio. | ||
2269 | * do_retry() needs to grab a new one. */ | ||
2270 | dec_ap_bio(req->w.mdev); | ||
2271 | |||
2272 | queue_work(retry.wq, &retry.worker); | ||
2273 | } | ||
2274 | |||
2275 | |||
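The retry machinery introduced above is one global single-threaded workqueue plus a spinlock-protected list: drbd_restart_request() splices a request onto retry.writes and kicks the worker, and do_retry() drains the whole batch with the lock dropped before reissuing. The hand-off, stripped to its essentials with a hypothetical item type:

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/workqueue.h>

    struct retry_item {                         /* hypothetical payload */
            struct list_head list;
    };

    static struct {
            struct workqueue_struct *wq;
            struct work_struct worker;
            spinlock_t lock;
            struct list_head items;
    } retry_q;

    /* set up once at init:
     *   retry_q.wq = create_singlethread_workqueue("retry");
     *   INIT_WORK(&retry_q.worker, retry_fn);
     *   spin_lock_init(&retry_q.lock);
     *   INIT_LIST_HEAD(&retry_q.items);
     */

    static void retry_fn(struct work_struct *ws)
    {
            LIST_HEAD(batch);
            struct retry_item *it, *tmp;

            spin_lock_irq(&retry_q.lock);
            list_splice_init(&retry_q.items, &batch);   /* grab the whole batch */
            spin_unlock_irq(&retry_q.lock);

            list_for_each_entry_safe(it, tmp, &batch, list) {
                    list_del(&it->list);
                    /* reissue the request here, e.g. resubmit its bio */
            }
    }

    static void retry_push(struct retry_item *it)
    {
            unsigned long flags;

            spin_lock_irqsave(&retry_q.lock, flags);
            list_add_tail(&it->list, &retry_q.items);
            spin_unlock_irqrestore(&retry_q.lock, flags);
            queue_work(retry_q.wq, &retry_q.worker);
    }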
3491 | static void drbd_cleanup(void) | 2276 | static void drbd_cleanup(void) |
3492 | { | 2277 | { |
3493 | unsigned int i; | 2278 | unsigned int i; |
2279 | struct drbd_conf *mdev; | ||
2280 | struct drbd_tconn *tconn, *tmp; | ||
3494 | 2281 | ||
3495 | unregister_reboot_notifier(&drbd_notifier); | 2282 | unregister_reboot_notifier(&drbd_notifier); |
3496 | 2283 | ||
@@ -3505,19 +2292,31 @@ static void drbd_cleanup(void) | |||
3505 | if (drbd_proc) | 2292 | if (drbd_proc) |
3506 | remove_proc_entry("drbd", NULL); | 2293 | remove_proc_entry("drbd", NULL); |
3507 | 2294 | ||
3508 | drbd_nl_cleanup(); | 2295 | if (retry.wq) |
2296 | destroy_workqueue(retry.wq); | ||
2297 | |||
2298 | drbd_genl_unregister(); | ||
3509 | 2299 | ||
3510 | if (minor_table) { | 2300 | idr_for_each_entry(&minors, mdev, i) { |
3511 | i = minor_count; | 2301 | idr_remove(&minors, mdev_to_minor(mdev)); |
3512 | while (i--) | 2302 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
3513 | drbd_delete_device(i); | 2303 | del_gendisk(mdev->vdisk); |
3514 | drbd_destroy_mempools(); | 2304 | /* synchronize_rcu(); No other threads running at this point */ |
2305 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
3515 | } | 2306 | } |
3516 | 2307 | ||
3517 | kfree(minor_table); | 2308 | /* not _rcu since, no other updater anymore. Genl already unregistered */ |
2309 | list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2310 | list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */ | ||
2311 | /* synchronize_rcu(); */ | ||
2312 | kref_put(&tconn->kref, &conn_destroy); | ||
2313 | } | ||
3518 | 2314 | ||
2315 | drbd_destroy_mempools(); | ||
3519 | unregister_blkdev(DRBD_MAJOR, "drbd"); | 2316 | unregister_blkdev(DRBD_MAJOR, "drbd"); |
3520 | 2317 | ||
2318 | idr_destroy(&minors); | ||
2319 | |||
3521 | printk(KERN_INFO "drbd: module cleanup done.\n"); | 2320 | printk(KERN_INFO "drbd: module cleanup done.\n"); |
3522 | } | 2321 | } |
3523 | 2322 | ||
@@ -3542,7 +2341,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3542 | goto out; | 2341 | goto out; |
3543 | } | 2342 | } |
3544 | 2343 | ||
3545 | if (test_bit(CALLBACK_PENDING, &mdev->flags)) { | 2344 | if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) { |
3546 | r |= (1 << BDI_async_congested); | 2345 | r |= (1 << BDI_async_congested); |
3547 | /* Without good local data, we would need to read from remote, | 2346 | /* Without good local data, we would need to read from remote, |
3548 | * and that would need the worker thread as well, which is | 2347 | * and that would need the worker thread as well, which is |
@@ -3566,7 +2365,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) | |||
3566 | reason = 'b'; | 2365 | reason = 'b'; |
3567 | } | 2366 | } |
3568 | 2367 | ||
3569 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { | 2368 | if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) { |
3570 | r |= (1 << BDI_async_congested); | 2369 | r |= (1 << BDI_async_congested); |
3571 | reason = reason == 'b' ? 'a' : 'n'; | 2370 | reason = reason == 'b' ? 'a' : 'n'; |
3572 | } | 2371 | } |
@@ -3576,20 +2375,243 @@ out: | |||
3576 | return r; | 2375 | return r; |
3577 | } | 2376 | } |
3578 | 2377 | ||
3579 | struct drbd_conf *drbd_new_device(unsigned int minor) | 2378 | static void drbd_init_workqueue(struct drbd_work_queue* wq) |
2379 | { | ||
2380 | spin_lock_init(&wq->q_lock); | ||
2381 | INIT_LIST_HEAD(&wq->q); | ||
2382 | init_waitqueue_head(&wq->q_wait); | ||
2383 | } | ||
2384 | |||
2385 | struct drbd_tconn *conn_get_by_name(const char *name) | ||
2386 | { | ||
2387 | struct drbd_tconn *tconn; | ||
2388 | |||
2389 | if (!name || !name[0]) | ||
2390 | return NULL; | ||
2391 | |||
2392 | rcu_read_lock(); | ||
2393 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2394 | if (!strcmp(tconn->name, name)) { | ||
2395 | kref_get(&tconn->kref); | ||
2396 | goto found; | ||
2397 | } | ||
2398 | } | ||
2399 | tconn = NULL; | ||
2400 | found: | ||
2401 | rcu_read_unlock(); | ||
2402 | return tconn; | ||
2403 | } | ||
2404 | |||
2405 | struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, | ||
2406 | void *peer_addr, int peer_addr_len) | ||
2407 | { | ||
2408 | struct drbd_tconn *tconn; | ||
2409 | |||
2410 | rcu_read_lock(); | ||
2411 | list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { | ||
2412 | if (tconn->my_addr_len == my_addr_len && | ||
2413 | tconn->peer_addr_len == peer_addr_len && | ||
2414 | !memcmp(&tconn->my_addr, my_addr, my_addr_len) && | ||
2415 | !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) { | ||
2416 | kref_get(&tconn->kref); | ||
2417 | goto found; | ||
2418 | } | ||
2419 | } | ||
2420 | tconn = NULL; | ||
2421 | found: | ||
2422 | rcu_read_unlock(); | ||
2423 | return tconn; | ||
2424 | } | ||
2425 | |||
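conn_get_by_name() and conn_get_by_addrs() above share one lookup idiom: walk the RCU-protected drbd_tconns list and take a reference while still inside the read-side critical section, so the object cannot be freed between lookup and use. Roughly, with a hypothetical element type:

    #include <linux/kref.h>
    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/string.h>

    struct named_obj {                  /* hypothetical element type */
            struct list_head node;      /* added with list_add_tail_rcu() */
            struct kref kref;
            const char *name;
    };

    static LIST_HEAD(objects);

    static struct named_obj *obj_get_by_name(const char *name)
    {
            struct named_obj *obj;

            rcu_read_lock();
            list_for_each_entry_rcu(obj, &objects, node) {
                    if (!strcmp(obj->name, name)) {
                            kref_get(&obj->kref);   /* pin before leaving RCU */
                            rcu_read_unlock();
                            return obj;
                    }
            }
            rcu_read_unlock();
            return NULL;
    }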
2426 | static int drbd_alloc_socket(struct drbd_socket *socket) | ||
2427 | { | ||
2428 | socket->rbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2429 | if (!socket->rbuf) | ||
2430 | return -ENOMEM; | ||
2431 | socket->sbuf = (void *) __get_free_page(GFP_KERNEL); | ||
2432 | if (!socket->sbuf) | ||
2433 | return -ENOMEM; | ||
2434 | return 0; | ||
2435 | } | ||
2436 | |||
2437 | static void drbd_free_socket(struct drbd_socket *socket) | ||
2438 | { | ||
2439 | free_page((unsigned long) socket->sbuf); | ||
2440 | free_page((unsigned long) socket->rbuf); | ||
2441 | } | ||
2442 | |||
2443 | void conn_free_crypto(struct drbd_tconn *tconn) | ||
2444 | { | ||
2445 | drbd_free_sock(tconn); | ||
2446 | |||
2447 | crypto_free_hash(tconn->csums_tfm); | ||
2448 | crypto_free_hash(tconn->verify_tfm); | ||
2449 | crypto_free_hash(tconn->cram_hmac_tfm); | ||
2450 | crypto_free_hash(tconn->integrity_tfm); | ||
2451 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
2452 | kfree(tconn->int_dig_in); | ||
2453 | kfree(tconn->int_dig_vv); | ||
2454 | |||
2455 | tconn->csums_tfm = NULL; | ||
2456 | tconn->verify_tfm = NULL; | ||
2457 | tconn->cram_hmac_tfm = NULL; | ||
2458 | tconn->integrity_tfm = NULL; | ||
2459 | tconn->peer_integrity_tfm = NULL; | ||
2460 | tconn->int_dig_in = NULL; | ||
2461 | tconn->int_dig_vv = NULL; | ||
2462 | } | ||
2463 | |||
2464 | int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts) | ||
2465 | { | ||
2466 | cpumask_var_t new_cpu_mask; | ||
2467 | int err; | ||
2468 | |||
2469 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) | ||
2470 | return -ENOMEM; | ||
2471 | /* | ||
2472 | retcode = ERR_NOMEM; | ||
2473 | drbd_msg_put_info("unable to allocate cpumask"); | ||
2474 | */ | ||
2475 | |||
2476 | /* silently ignore cpu mask on UP kernel */ | ||
2477 | if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { | ||
2478 | /* FIXME: Get rid of constant 32 here */ | ||
2479 | err = bitmap_parse(res_opts->cpu_mask, 32, | ||
2480 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
2481 | if (err) { | ||
2482 | conn_warn(tconn, "bitmap_parse() failed with %d\n", err); | ||
2483 | /* retcode = ERR_CPU_MASK_PARSE; */ | ||
2484 | goto fail; | ||
2485 | } | ||
2486 | } | ||
2487 | tconn->res_opts = *res_opts; | ||
2488 | if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) { | ||
2489 | cpumask_copy(tconn->cpu_mask, new_cpu_mask); | ||
2490 | drbd_calc_cpu_mask(tconn); | ||
2491 | tconn->receiver.reset_cpu_mask = 1; | ||
2492 | tconn->asender.reset_cpu_mask = 1; | ||
2493 | tconn->worker.reset_cpu_mask = 1; | ||
2494 | } | ||
2495 | err = 0; | ||
2496 | |||
2497 | fail: | ||
2498 | free_cpumask_var(new_cpu_mask); | ||
2499 | return err; | ||
2500 | |||
2501 | } | ||
2502 | |||
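set_resource_options() parses the textual CPU mask into a temporary cpumask_var_t and only commits it on success, so a parse error leaves the previous affinity untouched. A condensed version of that parse-then-commit step (same pre-3.9 bitmap_parse() signature, helper name made up):

    #include <linux/bitmap.h>
    #include <linux/cpumask.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>

    /* parse a hex mask string into 'out' without disturbing it on failure;
     * 32 mirrors the fixed-size field used above (see the FIXME there) */
    static int parse_cpu_mask(const char *buf, struct cpumask *out)
    {
            cpumask_var_t tmp;
            int err;

            if (!zalloc_cpumask_var(&tmp, GFP_KERNEL))
                    return -ENOMEM;

            err = bitmap_parse(buf, 32, cpumask_bits(tmp), nr_cpu_ids);
            if (!err)
                    cpumask_copy(out, tmp);     /* commit only on success */
            free_cpumask_var(tmp);
            return err;
    }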
2503 | /* caller must be under genl_lock() */ | ||
2504 | struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts) | ||
2505 | { | ||
2506 | struct drbd_tconn *tconn; | ||
2507 | |||
2508 | tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL); | ||
2509 | if (!tconn) | ||
2510 | return NULL; | ||
2511 | |||
2512 | tconn->name = kstrdup(name, GFP_KERNEL); | ||
2513 | if (!tconn->name) | ||
2514 | goto fail; | ||
2515 | |||
2516 | if (drbd_alloc_socket(&tconn->data)) | ||
2517 | goto fail; | ||
2518 | if (drbd_alloc_socket(&tconn->meta)) | ||
2519 | goto fail; | ||
2520 | |||
2521 | if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL)) | ||
2522 | goto fail; | ||
2523 | |||
2524 | if (set_resource_options(tconn, res_opts)) | ||
2525 | goto fail; | ||
2526 | |||
2527 | tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | ||
2528 | if (!tconn->current_epoch) | ||
2529 | goto fail; | ||
2530 | |||
2531 | INIT_LIST_HEAD(&tconn->transfer_log); | ||
2532 | |||
2533 | INIT_LIST_HEAD(&tconn->current_epoch->list); | ||
2534 | tconn->epochs = 1; | ||
2535 | spin_lock_init(&tconn->epoch_lock); | ||
2536 | tconn->write_ordering = WO_bdev_flush; | ||
2537 | |||
2538 | tconn->send.seen_any_write_yet = false; | ||
2539 | tconn->send.current_epoch_nr = 0; | ||
2540 | tconn->send.current_epoch_writes = 0; | ||
2541 | |||
2542 | tconn->cstate = C_STANDALONE; | ||
2543 | mutex_init(&tconn->cstate_mutex); | ||
2544 | spin_lock_init(&tconn->req_lock); | ||
2545 | mutex_init(&tconn->conf_update); | ||
2546 | init_waitqueue_head(&tconn->ping_wait); | ||
2547 | idr_init(&tconn->volumes); | ||
2548 | |||
2549 | drbd_init_workqueue(&tconn->sender_work); | ||
2550 | mutex_init(&tconn->data.mutex); | ||
2551 | mutex_init(&tconn->meta.mutex); | ||
2552 | |||
2553 | drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver"); | ||
2554 | drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker"); | ||
2555 | drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender"); | ||
2556 | |||
2557 | kref_init(&tconn->kref); | ||
2558 | list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns); | ||
2559 | |||
2560 | return tconn; | ||
2561 | |||
2562 | fail: | ||
2563 | kfree(tconn->current_epoch); | ||
2564 | free_cpumask_var(tconn->cpu_mask); | ||
2565 | drbd_free_socket(&tconn->meta); | ||
2566 | drbd_free_socket(&tconn->data); | ||
2567 | kfree(tconn->name); | ||
2568 | kfree(tconn); | ||
2569 | |||
2570 | return NULL; | ||
2571 | } | ||
2572 | |||
2573 | void conn_destroy(struct kref *kref) | ||
2574 | { | ||
2575 | struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref); | ||
2576 | |||
2577 | if (atomic_read(&tconn->current_epoch->epoch_size) != 0) | ||
2578 | conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size)); | ||
2579 | kfree(tconn->current_epoch); | ||
2580 | |||
2581 | idr_destroy(&tconn->volumes); | ||
2582 | |||
2583 | free_cpumask_var(tconn->cpu_mask); | ||
2584 | drbd_free_socket(&tconn->meta); | ||
2585 | drbd_free_socket(&tconn->data); | ||
2586 | kfree(tconn->name); | ||
2587 | kfree(tconn->int_dig_in); | ||
2588 | kfree(tconn->int_dig_vv); | ||
2589 | kfree(tconn); | ||
2590 | } | ||
2591 | |||
2592 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) | ||
3580 | { | 2593 | { |
3581 | struct drbd_conf *mdev; | 2594 | struct drbd_conf *mdev; |
3582 | struct gendisk *disk; | 2595 | struct gendisk *disk; |
3583 | struct request_queue *q; | 2596 | struct request_queue *q; |
2597 | int vnr_got = vnr; | ||
2598 | int minor_got = minor; | ||
2599 | enum drbd_ret_code err = ERR_NOMEM; | ||
2600 | |||
2601 | mdev = minor_to_mdev(minor); | ||
2602 | if (mdev) | ||
2603 | return ERR_MINOR_EXISTS; | ||
3584 | 2604 | ||
3585 | /* GFP_KERNEL, we are outside of all write-out paths */ | 2605 | /* GFP_KERNEL, we are outside of all write-out paths */ |
3586 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); | 2606 | mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); |
3587 | if (!mdev) | 2607 | if (!mdev) |
3588 | return NULL; | 2608 | return ERR_NOMEM; |
3589 | if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) | 2609 | |
3590 | goto out_no_cpumask; | 2610 | kref_get(&tconn->kref); |
2611 | mdev->tconn = tconn; | ||
3591 | 2612 | ||
3592 | mdev->minor = minor; | 2613 | mdev->minor = minor; |
2614 | mdev->vnr = vnr; | ||
3593 | 2615 | ||
3594 | drbd_init_set_defaults(mdev); | 2616 | drbd_init_set_defaults(mdev); |
3595 | 2617 | ||
@@ -3627,7 +2649,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3627 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); | 2649 | blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); |
3628 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); | 2650 | blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); |
3629 | blk_queue_merge_bvec(q, drbd_merge_bvec); | 2651 | blk_queue_merge_bvec(q, drbd_merge_bvec); |
3630 | q->queue_lock = &mdev->req_lock; | 2652 | q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */ |
3631 | 2653 | ||
3632 | mdev->md_io_page = alloc_page(GFP_KERNEL); | 2654 | mdev->md_io_page = alloc_page(GFP_KERNEL); |
3633 | if (!mdev->md_io_page) | 2655 | if (!mdev->md_io_page) |
@@ -3635,30 +2657,44 @@ struct drbd_conf *drbd_new_device(unsigned int minor) | |||
3635 | 2657 | ||
3636 | if (drbd_bm_init(mdev)) | 2658 | if (drbd_bm_init(mdev)) |
3637 | goto out_no_bitmap; | 2659 | goto out_no_bitmap; |
3638 | /* no need to lock access, we are still initializing this minor device. */ | 2660 | mdev->read_requests = RB_ROOT; |
3639 | if (!tl_init(mdev)) | 2661 | mdev->write_requests = RB_ROOT; |
3640 | goto out_no_tl; | 2662 | |
3641 | 2663 | if (!idr_pre_get(&minors, GFP_KERNEL)) | |
3642 | mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); | 2664 | goto out_no_minor_idr; |
3643 | if (!mdev->app_reads_hash) | 2665 | if (idr_get_new_above(&minors, mdev, minor, &minor_got)) |
3644 | goto out_no_app_reads; | 2666 | goto out_no_minor_idr; |
3645 | 2667 | if (minor_got != minor) { | |
3646 | mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); | 2668 | err = ERR_MINOR_EXISTS; |
3647 | if (!mdev->current_epoch) | 2669 | drbd_msg_put_info("requested minor exists already"); |
3648 | goto out_no_epoch; | 2670 | goto out_idr_remove_minor; |
3649 | 2671 | } | |
3650 | INIT_LIST_HEAD(&mdev->current_epoch->list); | 2672 | |
3651 | mdev->epochs = 1; | 2673 | if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) |
3652 | 2674 | goto out_idr_remove_minor; | |
3653 | return mdev; | 2675 | if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) |
3654 | 2676 | goto out_idr_remove_minor; | |
3655 | /* out_whatever_else: | 2677 | if (vnr_got != vnr) { |
3656 | kfree(mdev->current_epoch); */ | 2678 | err = ERR_INVALID_REQUEST; |
3657 | out_no_epoch: | 2679 | drbd_msg_put_info("requested volume exists already"); |
3658 | kfree(mdev->app_reads_hash); | 2680 | goto out_idr_remove_vol; |
3659 | out_no_app_reads: | 2681 | } |
3660 | tl_cleanup(mdev); | 2682 | add_disk(disk); |
3661 | out_no_tl: | 2683 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ |
2684 | |||
2685 | /* inherit the connection state */ | ||
2686 | mdev->state.conn = tconn->cstate; | ||
2687 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | ||
2688 | drbd_connected(mdev); | ||
2689 | |||
2690 | return NO_ERROR; | ||
2691 | |||
2692 | out_idr_remove_vol: | ||
2693 | idr_remove(&tconn->volumes, vnr_got); | ||
2694 | out_idr_remove_minor: | ||
2695 | idr_remove(&minors, minor_got); | ||
2696 | synchronize_rcu(); | ||
2697 | out_no_minor_idr: | ||
3662 | drbd_bm_cleanup(mdev); | 2698 | drbd_bm_cleanup(mdev); |
3663 | out_no_bitmap: | 2699 | out_no_bitmap: |
3664 | __free_page(mdev->md_io_page); | 2700 | __free_page(mdev->md_io_page); |
@@ -3667,55 +2703,25 @@ out_no_io_page: | |||
3667 | out_no_disk: | 2703 | out_no_disk: |
3668 | blk_cleanup_queue(q); | 2704 | blk_cleanup_queue(q); |
3669 | out_no_q: | 2705 | out_no_q: |
3670 | free_cpumask_var(mdev->cpu_mask); | ||
3671 | out_no_cpumask: | ||
3672 | kfree(mdev); | ||
3673 | return NULL; | ||
3674 | } | ||
3675 | |||
3676 | /* counterpart of drbd_new_device. | ||
3677 | * last part of drbd_delete_device. */ | ||
3678 | void drbd_free_mdev(struct drbd_conf *mdev) | ||
3679 | { | ||
3680 | kfree(mdev->current_epoch); | ||
3681 | kfree(mdev->app_reads_hash); | ||
3682 | tl_cleanup(mdev); | ||
3683 | if (mdev->bitmap) /* should no longer be there. */ | ||
3684 | drbd_bm_cleanup(mdev); | ||
3685 | __free_page(mdev->md_io_page); | ||
3686 | put_disk(mdev->vdisk); | ||
3687 | blk_cleanup_queue(mdev->rq_queue); | ||
3688 | free_cpumask_var(mdev->cpu_mask); | ||
3689 | drbd_free_tl_hash(mdev); | ||
3690 | kfree(mdev); | 2706 | kfree(mdev); |
2707 | kref_put(&tconn->kref, &conn_destroy); | ||
2708 | return err; | ||
3691 | } | 2709 | } |
3692 | 2710 | ||
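conn_new_minor() claims its minor and volume numbers through the pre-3.9 two-step IDR API: idr_pre_get() preloads memory, idr_get_new_above() inserts at or above the requested id, and the caller must verify it actually got the id it asked for. A compact sketch of that insert-exact pattern (hypothetical helper, not part of the patch):

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/idr.h>

    static DEFINE_IDR(my_idr);

    /* insert obj at exactly 'want'; pre-3.9 idr API */
    static int idr_insert_exact(void *obj, int want)
    {
            int got;

            if (!idr_pre_get(&my_idr, GFP_KERNEL))
                    return -ENOMEM;
            if (idr_get_new_above(&my_idr, obj, want, &got))
                    return -ENOMEM;
            if (got != want) {                  /* id already taken */
                    idr_remove(&my_idr, got);
                    return -EEXIST;
            }
            return 0;
    }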
3693 | |||
3694 | int __init drbd_init(void) | 2711 | int __init drbd_init(void) |
3695 | { | 2712 | { |
3696 | int err; | 2713 | int err; |
3697 | 2714 | ||
3698 | if (sizeof(struct p_handshake) != 80) { | ||
3699 | printk(KERN_ERR | ||
3700 | "drbd: never change the size or layout " | ||
3701 | "of the HandShake packet.\n"); | ||
3702 | return -EINVAL; | ||
3703 | } | ||
3704 | |||
3705 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { | 2715 | if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { |
3706 | printk(KERN_ERR | 2716 | printk(KERN_ERR |
3707 | "drbd: invalid minor_count (%d)\n", minor_count); | 2717 | "drbd: invalid minor_count (%d)\n", minor_count); |
3708 | #ifdef MODULE | 2718 | #ifdef MODULE |
3709 | return -EINVAL; | 2719 | return -EINVAL; |
3710 | #else | 2720 | #else |
3711 | minor_count = 8; | 2721 | minor_count = DRBD_MINOR_COUNT_DEF; |
3712 | #endif | 2722 | #endif |
3713 | } | 2723 | } |
3714 | 2724 | ||
3715 | err = drbd_nl_init(); | ||
3716 | if (err) | ||
3717 | return err; | ||
3718 | |||
3719 | err = register_blkdev(DRBD_MAJOR, "drbd"); | 2725 | err = register_blkdev(DRBD_MAJOR, "drbd"); |
3720 | if (err) { | 2726 | if (err) { |
3721 | printk(KERN_ERR | 2727 | printk(KERN_ERR |
@@ -3724,6 +2730,13 @@ int __init drbd_init(void) | |||
3724 | return err; | 2730 | return err; |
3725 | } | 2731 | } |
3726 | 2732 | ||
2733 | err = drbd_genl_register(); | ||
2734 | if (err) { | ||
2735 | printk(KERN_ERR "drbd: unable to register generic netlink family\n"); | ||
2736 | goto fail; | ||
2737 | } | ||
2738 | |||
2739 | |||
3727 | register_reboot_notifier(&drbd_notifier); | 2740 | register_reboot_notifier(&drbd_notifier); |
3728 | 2741 | ||
3729 | /* | 2742 | /* |
@@ -3734,22 +2747,29 @@ int __init drbd_init(void) | |||
3734 | init_waitqueue_head(&drbd_pp_wait); | 2747 | init_waitqueue_head(&drbd_pp_wait); |
3735 | 2748 | ||
3736 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2749 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
3737 | minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, | 2750 | idr_init(&minors); |
3738 | GFP_KERNEL); | ||
3739 | if (!minor_table) | ||
3740 | goto Enomem; | ||
3741 | 2751 | ||
3742 | err = drbd_create_mempools(); | 2752 | err = drbd_create_mempools(); |
3743 | if (err) | 2753 | if (err) |
3744 | goto Enomem; | 2754 | goto fail; |
3745 | 2755 | ||
3746 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); | 2756 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
3747 | if (!drbd_proc) { | 2757 | if (!drbd_proc) { |
3748 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 2758 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
3749 | goto Enomem; | 2759 | goto fail; |
3750 | } | 2760 | } |
3751 | 2761 | ||
3752 | rwlock_init(&global_state_lock); | 2762 | rwlock_init(&global_state_lock); |
2763 | INIT_LIST_HEAD(&drbd_tconns); | ||
2764 | |||
2765 | retry.wq = create_singlethread_workqueue("drbd-reissue"); | ||
2766 | if (!retry.wq) { | ||
2767 | printk(KERN_ERR "drbd: unable to create retry workqueue\n"); | ||
2768 | goto fail; | ||
2769 | } | ||
2770 | INIT_WORK(&retry.worker, do_retry); | ||
2771 | spin_lock_init(&retry.lock); | ||
2772 | INIT_LIST_HEAD(&retry.writes); | ||
3753 | 2773 | ||
3754 | printk(KERN_INFO "drbd: initialized. " | 2774 | printk(KERN_INFO "drbd: initialized. " |
3755 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", | 2775 | "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", |
@@ -3757,11 +2777,10 @@ int __init drbd_init(void) | |||
3757 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); | 2777 | printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); |
3758 | printk(KERN_INFO "drbd: registered as block device major %d\n", | 2778 | printk(KERN_INFO "drbd: registered as block device major %d\n", |
3759 | DRBD_MAJOR); | 2779 | DRBD_MAJOR); |
3760 | printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); | ||
3761 | 2780 | ||
3762 | return 0; /* Success! */ | 2781 | return 0; /* Success! */ |
3763 | 2782 | ||
3764 | Enomem: | 2783 | fail: |
3765 | drbd_cleanup(); | 2784 | drbd_cleanup(); |
3766 | if (err == -ENOMEM) | 2785 | if (err == -ENOMEM) |
3767 | /* currently always the case */ | 2786 | /* currently always the case */ |
@@ -3782,47 +2801,42 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) | |||
3782 | kfree(ldev); | 2801 | kfree(ldev); |
3783 | } | 2802 | } |
3784 | 2803 | ||
3785 | void drbd_free_sock(struct drbd_conf *mdev) | 2804 | void drbd_free_sock(struct drbd_tconn *tconn) |
3786 | { | 2805 | { |
3787 | if (mdev->data.socket) { | 2806 | if (tconn->data.socket) { |
3788 | mutex_lock(&mdev->data.mutex); | 2807 | mutex_lock(&tconn->data.mutex); |
3789 | kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); | 2808 | kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR); |
3790 | sock_release(mdev->data.socket); | 2809 | sock_release(tconn->data.socket); |
3791 | mdev->data.socket = NULL; | 2810 | tconn->data.socket = NULL; |
3792 | mutex_unlock(&mdev->data.mutex); | 2811 | mutex_unlock(&tconn->data.mutex); |
3793 | } | 2812 | } |
3794 | if (mdev->meta.socket) { | 2813 | if (tconn->meta.socket) { |
3795 | mutex_lock(&mdev->meta.mutex); | 2814 | mutex_lock(&tconn->meta.mutex); |
3796 | kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); | 2815 | kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR); |
3797 | sock_release(mdev->meta.socket); | 2816 | sock_release(tconn->meta.socket); |
3798 | mdev->meta.socket = NULL; | 2817 | tconn->meta.socket = NULL; |
3799 | mutex_unlock(&mdev->meta.mutex); | 2818 | mutex_unlock(&tconn->meta.mutex); |
3800 | } | 2819 | } |
3801 | } | 2820 | } |
3802 | 2821 | ||
2822 | /* meta data management */ | ||
3803 | 2823 | ||
3804 | void drbd_free_resources(struct drbd_conf *mdev) | 2824 | void conn_md_sync(struct drbd_tconn *tconn) |
3805 | { | 2825 | { |
3806 | crypto_free_hash(mdev->csums_tfm); | 2826 | struct drbd_conf *mdev; |
3807 | mdev->csums_tfm = NULL; | 2827 | int vnr; |
3808 | crypto_free_hash(mdev->verify_tfm); | ||
3809 | mdev->verify_tfm = NULL; | ||
3810 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3811 | mdev->cram_hmac_tfm = NULL; | ||
3812 | crypto_free_hash(mdev->integrity_w_tfm); | ||
3813 | mdev->integrity_w_tfm = NULL; | ||
3814 | crypto_free_hash(mdev->integrity_r_tfm); | ||
3815 | mdev->integrity_r_tfm = NULL; | ||
3816 | |||
3817 | drbd_free_sock(mdev); | ||
3818 | 2828 | ||
3819 | __no_warn(local, | 2829 | rcu_read_lock(); |
3820 | drbd_free_bc(mdev->ldev); | 2830 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
3821 | mdev->ldev = NULL;); | 2831 | kref_get(&mdev->kref); |
2832 | rcu_read_unlock(); | ||
2833 | drbd_md_sync(mdev); | ||
2834 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
2835 | rcu_read_lock(); | ||
2836 | } | ||
2837 | rcu_read_unlock(); | ||
3822 | } | 2838 | } |
3823 | 2839 | ||
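conn_md_sync() above shows the read side of the new refcount scheme: iterate the volume idr under rcu_read_lock(), pin the current device with kref_get(), drop the RCU lock for the sleeping metadata write, then put the reference and re-enter the read section. The same walk in generic form, with a hypothetical element type and callback:

    #include <linux/idr.h>
    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct elem {                       /* hypothetical idr payload */
            struct kref kref;
    };

    static void elem_release(struct kref *kref)
    {
            kfree(container_of(kref, struct elem, kref));
    }

    static void for_each_elem_sleeping(struct idr *idr, void (*fn)(struct elem *))
    {
            struct elem *e;
            int id;

            rcu_read_lock();
            idr_for_each_entry(idr, e, id) {
                    kref_get(&e->kref);         /* keep e alive outside RCU */
                    rcu_read_unlock();
                    fn(e);                      /* may sleep */
                    kref_put(&e->kref, elem_release);
                    rcu_read_lock();
            }
            rcu_read_unlock();
    }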
3824 | /* meta data management */ | ||
3825 | |||
3826 | struct meta_data_on_disk { | 2840 | struct meta_data_on_disk { |
3827 | u64 la_size; /* last agreed size. */ | 2841 | u64 la_size; /* last agreed size. */ |
3828 | u64 uuid[UI_SIZE]; /* UUIDs. */ | 2842 | u64 uuid[UI_SIZE]; /* UUIDs. */ |
@@ -3833,7 +2847,7 @@ struct meta_data_on_disk { | |||
3833 | u32 md_size_sect; | 2847 | u32 md_size_sect; |
3834 | u32 al_offset; /* offset to this block */ | 2848 | u32 al_offset; /* offset to this block */ |
3835 | u32 al_nr_extents; /* important for restoring the AL */ | 2849 | u32 al_nr_extents; /* important for restoring the AL */ |
3836 | /* `-- act_log->nr_elements <-- sync_conf.al_extents */ | 2850 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ |
3837 | u32 bm_offset; /* offset to the bitmap, from here */ | 2851 | u32 bm_offset; /* offset to the bitmap, from here */ |
3838 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | 2852 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ |
3839 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ | 2853 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ |
@@ -3871,7 +2885,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3871 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2885 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3872 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | 2886 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
3873 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | 2887 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); |
3874 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); | 2888 | buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); |
3875 | 2889 | ||
3876 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); | 2890 | buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); |
3877 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); | 2891 | buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); |
@@ -3885,7 +2899,7 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
3885 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | 2899 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); |
3886 | sector = mdev->ldev->md.md_offset; | 2900 | sector = mdev->ldev->md.md_offset; |
3887 | 2901 | ||
3888 | if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 2902 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
3889 | /* this was a try anyways ... */ | 2903 | /* this was a try anyways ... */ |
3890 | dev_err(DEV, "meta data update failed!\n"); | 2904 | dev_err(DEV, "meta data update failed!\n"); |
3891 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 2905 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
@@ -3906,11 +2920,12 @@ out: | |||
3906 | * @bdev: Device from which the meta data should be read in. | 2920 | * @bdev: Device from which the meta data should be read in. |
3907 | * | 2921 | * |
3908 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case | 2922 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case |
3909 | * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. | 2923 | * something goes wrong. |
3910 | */ | 2924 | */ |
3911 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | 2925 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) |
3912 | { | 2926 | { |
3913 | struct meta_data_on_disk *buffer; | 2927 | struct meta_data_on_disk *buffer; |
2928 | u32 magic, flags; | ||
3914 | int i, rv = NO_ERROR; | 2929 | int i, rv = NO_ERROR; |
3915 | 2930 | ||
3916 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 2931 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
@@ -3920,7 +2935,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3920 | if (!buffer) | 2935 | if (!buffer) |
3921 | goto out; | 2936 | goto out; |
3922 | 2937 | ||
3923 | if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | 2938 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { |
3924 | /* NOTE: can't do normal error processing here as this is | 2939 | /* NOTE: can't do normal error processing here as this is |
3925 | called BEFORE disk is attached */ | 2940 | called BEFORE disk is attached */ |
3926 | dev_err(DEV, "Error while reading metadata.\n"); | 2941 | dev_err(DEV, "Error while reading metadata.\n"); |
@@ -3928,8 +2943,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3928 | goto err; | 2943 | goto err; |
3929 | } | 2944 | } |
3930 | 2945 | ||
3931 | if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { | 2946 | magic = be32_to_cpu(buffer->magic); |
3932 | dev_err(DEV, "Error while reading metadata, magic not found.\n"); | 2947 | flags = be32_to_cpu(buffer->flags); |
2948 | if (magic == DRBD_MD_MAGIC_84_UNCLEAN || | ||
2949 | (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { | ||
2950 | /* btw: that's Activity Log clean, not "all" clean. */ | ||
2951 | dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); | ||
2952 | rv = ERR_MD_UNCLEAN; | ||
2953 | goto err; | ||
2954 | } | ||
2955 | if (magic != DRBD_MD_MAGIC_08) { | ||
2956 | if (magic == DRBD_MD_MAGIC_07) | ||
2957 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); | ||
2958 | else | ||
2959 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); | ||
3933 | rv = ERR_MD_INVALID; | 2960 | rv = ERR_MD_INVALID; |
3934 | goto err; | 2961 | goto err; |
3935 | } | 2962 | } |
@@ -3963,20 +2990,16 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
3963 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2990 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
3964 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | 2991 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); |
3965 | bdev->md.flags = be32_to_cpu(buffer->flags); | 2992 | bdev->md.flags = be32_to_cpu(buffer->flags); |
3966 | mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); | ||
3967 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | 2993 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); |
3968 | 2994 | ||
3969 | spin_lock_irq(&mdev->req_lock); | 2995 | spin_lock_irq(&mdev->tconn->req_lock); |
3970 | if (mdev->state.conn < C_CONNECTED) { | 2996 | if (mdev->state.conn < C_CONNECTED) { |
3971 | unsigned int peer; | 2997 | unsigned int peer; |
3972 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); | 2998 | peer = be32_to_cpu(buffer->la_peer_max_bio_size); |
3973 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); | 2999 | peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); |
3974 | mdev->peer_max_bio_size = peer; | 3000 | mdev->peer_max_bio_size = peer; |
3975 | } | 3001 | } |
3976 | spin_unlock_irq(&mdev->req_lock); | 3002 | spin_unlock_irq(&mdev->tconn->req_lock); |
3977 | |||
3978 | if (mdev->sync_conf.al_extents < 7) | ||
3979 | mdev->sync_conf.al_extents = 127; | ||
3980 | 3003 | ||
3981 | err: | 3004 | err: |
3982 | drbd_md_put_buffer(mdev); | 3005 | drbd_md_put_buffer(mdev); |
@@ -4011,7 +3034,7 @@ void drbd_md_mark_dirty(struct drbd_conf *mdev) | |||
4011 | } | 3034 | } |
4012 | #endif | 3035 | #endif |
4013 | 3036 | ||
4014 | static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | 3037 | void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) |
4015 | { | 3038 | { |
4016 | int i; | 3039 | int i; |
4017 | 3040 | ||
@@ -4019,7 +3042,7 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) | |||
4019 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; | 3042 | mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; |
4020 | } | 3043 | } |
4021 | 3044 | ||
4022 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | 3045 | void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) |
4023 | { | 3046 | { |
4024 | if (idx == UI_CURRENT) { | 3047 | if (idx == UI_CURRENT) { |
4025 | if (mdev->state.role == R_PRIMARY) | 3048 | if (mdev->state.role == R_PRIMARY) |
@@ -4034,14 +3057,24 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | |||
4034 | drbd_md_mark_dirty(mdev); | 3057 | drbd_md_mark_dirty(mdev); |
4035 | } | 3058 | } |
4036 | 3059 | ||
3060 | void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | ||
3061 | { | ||
3062 | unsigned long flags; | ||
3063 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
3064 | __drbd_uuid_set(mdev, idx, val); | ||
3065 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
3066 | } | ||
4037 | 3067 | ||
4038 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | 3068 | void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) |
4039 | { | 3069 | { |
3070 | unsigned long flags; | ||
3071 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
4040 | if (mdev->ldev->md.uuid[idx]) { | 3072 | if (mdev->ldev->md.uuid[idx]) { |
4041 | drbd_uuid_move_history(mdev); | 3073 | drbd_uuid_move_history(mdev); |
4042 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; | 3074 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; |
4043 | } | 3075 | } |
4044 | _drbd_uuid_set(mdev, idx, val); | 3076 | __drbd_uuid_set(mdev, idx, val); |
3077 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
4045 | } | 3078 | } |
4046 | 3079 | ||
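The UUID helpers are now split along the common kernel naming convention: __drbd_uuid_set() assumes md.uuid_lock is already held, while _drbd_uuid_set() and drbd_uuid_set() take it themselves with spin_lock_irqsave(). The shape of that split, reduced to a hypothetical value:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    static DEFINE_SPINLOCK(val_lock);
    static u64 val;

    /* caller holds val_lock */
    static void __set_val(u64 v)
    {
            val = v;
    }

    static void set_val(u64 v)
    {
            unsigned long flags;

            spin_lock_irqsave(&val_lock, flags);
            __set_val(v);
            spin_unlock_irqrestore(&val_lock, flags);
    }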
4047 | /** | 3080 | /** |
@@ -4054,15 +3087,20 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) | |||
4054 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | 3087 | void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) |
4055 | { | 3088 | { |
4056 | u64 val; | 3089 | u64 val; |
4057 | unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; | 3090 | unsigned long long bm_uuid; |
3091 | |||
3092 | get_random_bytes(&val, sizeof(u64)); | ||
3093 | |||
3094 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
3095 | bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; | ||
4058 | 3096 | ||
4059 | if (bm_uuid) | 3097 | if (bm_uuid) |
4060 | dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); | 3098 | dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); |
4061 | 3099 | ||
4062 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; | 3100 | mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; |
3101 | __drbd_uuid_set(mdev, UI_CURRENT, val); | ||
3102 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
4063 | 3103 | ||
4064 | get_random_bytes(&val, sizeof(u64)); | ||
4065 | _drbd_uuid_set(mdev, UI_CURRENT, val); | ||
4066 | drbd_print_uuids(mdev, "new current UUID"); | 3104 | drbd_print_uuids(mdev, "new current UUID"); |
4067 | /* get it to stable storage _now_ */ | 3105 | /* get it to stable storage _now_ */ |
4068 | drbd_md_sync(mdev); | 3106 | drbd_md_sync(mdev); |
@@ -4070,9 +3108,11 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) | |||
4070 | 3108 | ||
4071 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | 3109 | void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) |
4072 | { | 3110 | { |
3111 | unsigned long flags; | ||
4073 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) | 3112 | if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) |
4074 | return; | 3113 | return; |
4075 | 3114 | ||
3115 | spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); | ||
4076 | if (val == 0) { | 3116 | if (val == 0) { |
4077 | drbd_uuid_move_history(mdev); | 3117 | drbd_uuid_move_history(mdev); |
4078 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | 3118 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; |
@@ -4084,6 +3124,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) | |||
4084 | 3124 | ||
4085 | mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); | 3125 | mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); |
4086 | } | 3126 | } |
3127 | spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); | ||
3128 | |||
4087 | drbd_md_mark_dirty(mdev); | 3129 | drbd_md_mark_dirty(mdev); |
4088 | } | 3130 | } |
4089 | 3131 | ||
@@ -4135,9 +3177,10 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) | |||
4135 | return rv; | 3177 | return rv; |
4136 | } | 3178 | } |
4137 | 3179 | ||
4138 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3180 | static int w_bitmap_io(struct drbd_work *w, int unused) |
4139 | { | 3181 | { |
4140 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); | 3182 | struct bm_io_work *work = container_of(w, struct bm_io_work, w); |
3183 | struct drbd_conf *mdev = w->mdev; | ||
4141 | int rv = -EIO; | 3184 | int rv = -EIO; |
4142 | 3185 | ||
4143 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); | 3186 | D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); |
@@ -4149,8 +3192,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
4149 | put_ldev(mdev); | 3192 | put_ldev(mdev); |
4150 | } | 3193 | } |
4151 | 3194 | ||
4152 | clear_bit(BITMAP_IO, &mdev->flags); | 3195 | clear_bit_unlock(BITMAP_IO, &mdev->flags); |
4153 | smp_mb__after_clear_bit(); | ||
4154 | wake_up(&mdev->misc_wait); | 3196 | wake_up(&mdev->misc_wait); |
4155 | 3197 | ||
4156 | if (work->done) | 3198 | if (work->done) |
@@ -4160,7 +3202,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) | |||
4160 | work->why = NULL; | 3202 | work->why = NULL; |
4161 | work->flags = 0; | 3203 | work->flags = 0; |
4162 | 3204 | ||
4163 | return 1; | 3205 | return 0; |
4164 | } | 3206 | } |
4165 | 3207 | ||
4166 | void drbd_ldev_destroy(struct drbd_conf *mdev) | 3208 | void drbd_ldev_destroy(struct drbd_conf *mdev) |
@@ -4173,29 +3215,51 @@ void drbd_ldev_destroy(struct drbd_conf *mdev) | |||
4173 | drbd_free_bc(mdev->ldev); | 3215 | drbd_free_bc(mdev->ldev); |
4174 | mdev->ldev = NULL;); | 3216 | mdev->ldev = NULL;); |
4175 | 3217 | ||
4176 | if (mdev->md_io_tmpp) { | ||
4177 | __free_page(mdev->md_io_tmpp); | ||
4178 | mdev->md_io_tmpp = NULL; | ||
4179 | } | ||
4180 | clear_bit(GO_DISKLESS, &mdev->flags); | 3218 | clear_bit(GO_DISKLESS, &mdev->flags); |
4181 | } | 3219 | } |
4182 | 3220 | ||
4183 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3221 | static int w_go_diskless(struct drbd_work *w, int unused) |
4184 | { | 3222 | { |
3223 | struct drbd_conf *mdev = w->mdev; | ||
3224 | |||
4185 | D_ASSERT(mdev->state.disk == D_FAILED); | 3225 | D_ASSERT(mdev->state.disk == D_FAILED); |
4186 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will | 3226 | /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will |
4187 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch | 3227 | * inc/dec it frequently. Once we are D_DISKLESS, no one will touch |
4188 | * the protected members anymore, though, so once put_ldev reaches zero | 3228 | * the protected members anymore, though, so once put_ldev reaches zero |
4189 | * again, it will be safe to free them. */ | 3229 | * again, it will be safe to free them. */ |
3230 | |||
3231 | /* Try to write changed bitmap pages, read errors may have just | ||
3232 | * set some bits outside the area covered by the activity log. | ||
3233 | * | ||
3234 | * If we have an IO error during the bitmap writeout, | ||
3235 | * we will want a full sync next time, just in case. | ||
3236 | * (Do we want a specific meta data flag for this?) | ||
3237 | * | ||
3238 | * If that does not make it to stable storage either, | ||
3239 | * we cannot do anything about that anymore. | ||
3240 | * | ||
3241 | * We still need to check if both bitmap and ldev are present, we may | ||
3242 | * end up here after a failed attach, before ldev was even assigned. | ||
3243 | */ | ||
3244 | if (mdev->bitmap && mdev->ldev) { | ||
3245 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, | ||
3246 | "detach", BM_LOCKED_MASK)) { | ||
3247 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { | ||
3248 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | ||
3249 | drbd_md_sync(mdev); | ||
3250 | } | ||
3251 | } | ||
3252 | } | ||
3253 | |||
4190 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); | 3254 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
4191 | return 1; | 3255 | return 0; |
4192 | } | 3256 | } |
4193 | 3257 | ||
4194 | void drbd_go_diskless(struct drbd_conf *mdev) | 3258 | void drbd_go_diskless(struct drbd_conf *mdev) |
4195 | { | 3259 | { |
4196 | D_ASSERT(mdev->state.disk == D_FAILED); | 3260 | D_ASSERT(mdev->state.disk == D_FAILED); |
4197 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) | 3261 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
4198 | drbd_queue_work(&mdev->data.work, &mdev->go_diskless); | 3262 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); |
4199 | } | 3263 | } |
4200 | 3264 | ||
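drbd_go_diskless() queues its work at most once by guarding the queueing with test_and_set_bit(); the bit is cleared again in drbd_ldev_destroy(), so further calls while the work is pending are no-ops. The same guard in isolation:

    #include <linux/bitops.h>
    #include <linux/workqueue.h>

    #define WORK_PENDING_BIT 0

    static unsigned long flags_word;

    static void my_work_fn(struct work_struct *w)
    {
            /* ... do the work ..., then allow re-arming: */
            clear_bit(WORK_PENDING_BIT, &flags_word);
    }

    static DECLARE_WORK(my_work, my_work_fn);

    static void kick_once(void)
    {
            /* only the caller that flips 0 -> 1 actually queues */
            if (!test_and_set_bit(WORK_PENDING_BIT, &flags_word))
                    schedule_work(&my_work);
    }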
4201 | /** | 3265 | /** |
@@ -4215,7 +3279,7 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4215 | void (*done)(struct drbd_conf *, int), | 3279 | void (*done)(struct drbd_conf *, int), |
4216 | char *why, enum bm_flag flags) | 3280 | char *why, enum bm_flag flags) |
4217 | { | 3281 | { |
4218 | D_ASSERT(current == mdev->worker.task); | 3282 | D_ASSERT(current == mdev->tconn->worker.task); |
4219 | 3283 | ||
4220 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); | 3284 | D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); |
4221 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); | 3285 | D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); |
@@ -4229,13 +3293,13 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, | |||
4229 | mdev->bm_io_work.why = why; | 3293 | mdev->bm_io_work.why = why; |
4230 | mdev->bm_io_work.flags = flags; | 3294 | mdev->bm_io_work.flags = flags; |
4231 | 3295 | ||
4232 | spin_lock_irq(&mdev->req_lock); | 3296 | spin_lock_irq(&mdev->tconn->req_lock); |
4233 | set_bit(BITMAP_IO, &mdev->flags); | 3297 | set_bit(BITMAP_IO, &mdev->flags); |
4234 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { | 3298 | if (atomic_read(&mdev->ap_bio_cnt) == 0) { |
4235 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) | 3299 | if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) |
4236 | drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); | 3300 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); |
4237 | } | 3301 | } |
4238 | spin_unlock_irq(&mdev->req_lock); | 3302 | spin_unlock_irq(&mdev->tconn->req_lock); |
4239 | } | 3303 | } |
4240 | 3304 | ||
4241 | /** | 3305 | /** |
@@ -4252,7 +3316,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), | |||
4252 | { | 3316 | { |
4253 | int rv; | 3317 | int rv; |
4254 | 3318 | ||
4255 | D_ASSERT(current != mdev->worker.task); | 3319 | D_ASSERT(current != mdev->tconn->worker.task); |
4256 | 3320 | ||
4257 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) | 3321 | if ((flags & BM_LOCKED_SET_ALLOWED) == 0) |
4258 | drbd_suspend_io(mdev); | 3322 | drbd_suspend_io(mdev); |
@@ -4291,18 +3355,127 @@ static void md_sync_timer_fn(unsigned long data) | |||
4291 | { | 3355 | { |
4292 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 3356 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
4293 | 3357 | ||
4294 | drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); | 3358 | /* must not double-queue! */ |
3359 | if (list_empty(&mdev->md_sync_work.list)) | ||
3360 | drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work); | ||
4295 | } | 3361 | } |
4296 | 3362 | ||
4297 | static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 3363 | static int w_md_sync(struct drbd_work *w, int unused) |
4298 | { | 3364 | { |
3365 | struct drbd_conf *mdev = w->mdev; | ||
3366 | |||
4299 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); | 3367 | dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); |
4300 | #ifdef DEBUG | 3368 | #ifdef DEBUG |
4301 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", | 3369 | dev_warn(DEV, "last md_mark_dirty: %s:%u\n", |
4302 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); | 3370 | mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); |
4303 | #endif | 3371 | #endif |
4304 | drbd_md_sync(mdev); | 3372 | drbd_md_sync(mdev); |
4305 | return 1; | 3373 | return 0; |
3374 | } | ||
3375 | |||
3376 | const char *cmdname(enum drbd_packet cmd) | ||
3377 | { | ||
3378 | /* THINK may need to become several global tables | ||
3379 | * when we want to support more than | ||
3380 | * one PRO_VERSION */ | ||
3381 | static const char *cmdnames[] = { | ||
3382 | [P_DATA] = "Data", | ||
3383 | [P_DATA_REPLY] = "DataReply", | ||
3384 | [P_RS_DATA_REPLY] = "RSDataReply", | ||
3385 | [P_BARRIER] = "Barrier", | ||
3386 | [P_BITMAP] = "ReportBitMap", | ||
3387 | [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", | ||
3388 | [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", | ||
3389 | [P_UNPLUG_REMOTE] = "UnplugRemote", | ||
3390 | [P_DATA_REQUEST] = "DataRequest", | ||
3391 | [P_RS_DATA_REQUEST] = "RSDataRequest", | ||
3392 | [P_SYNC_PARAM] = "SyncParam", | ||
3393 | [P_SYNC_PARAM89] = "SyncParam89", | ||
3394 | [P_PROTOCOL] = "ReportProtocol", | ||
3395 | [P_UUIDS] = "ReportUUIDs", | ||
3396 | [P_SIZES] = "ReportSizes", | ||
3397 | [P_STATE] = "ReportState", | ||
3398 | [P_SYNC_UUID] = "ReportSyncUUID", | ||
3399 | [P_AUTH_CHALLENGE] = "AuthChallenge", | ||
3400 | [P_AUTH_RESPONSE] = "AuthResponse", | ||
3401 | [P_PING] = "Ping", | ||
3402 | [P_PING_ACK] = "PingAck", | ||
3403 | [P_RECV_ACK] = "RecvAck", | ||
3404 | [P_WRITE_ACK] = "WriteAck", | ||
3405 | [P_RS_WRITE_ACK] = "RSWriteAck", | ||
3406 | [P_SUPERSEDED] = "Superseded", | ||
3407 | [P_NEG_ACK] = "NegAck", | ||
3408 | [P_NEG_DREPLY] = "NegDReply", | ||
3409 | [P_NEG_RS_DREPLY] = "NegRSDReply", | ||
3410 | [P_BARRIER_ACK] = "BarrierAck", | ||
3411 | [P_STATE_CHG_REQ] = "StateChgRequest", | ||
3412 | [P_STATE_CHG_REPLY] = "StateChgReply", | ||
3413 | [P_OV_REQUEST] = "OVRequest", | ||
3414 | [P_OV_REPLY] = "OVReply", | ||
3415 | [P_OV_RESULT] = "OVResult", | ||
3416 | [P_CSUM_RS_REQUEST] = "CsumRSRequest", | ||
3417 | [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", | ||
3418 | [P_COMPRESSED_BITMAP] = "CBitmap", | ||
3419 | [P_DELAY_PROBE] = "DelayProbe", | ||
3420 | [P_OUT_OF_SYNC] = "OutOfSync", | ||
3421 | [P_RETRY_WRITE] = "RetryWrite", | ||
3422 | [P_RS_CANCEL] = "RSCancel", | ||
3423 | [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", | ||
3424 | [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", | ||
3425 | [P_RETRY_WRITE] = "retry_write", | ||
3426 | [P_PROTOCOL_UPDATE] = "protocol_update", | ||
3427 | |||
3428 | /* enum drbd_packet, but not commands - obsoleted flags: | ||
3429 | * P_MAY_IGNORE | ||
3430 | * P_MAX_OPT_CMD | ||
3431 | */ | ||
3432 | }; | ||
3433 | |||
3434 | /* too big for the array: 0xfffX */ | ||
3435 | if (cmd == P_INITIAL_META) | ||
3436 | return "InitialMeta"; | ||
3437 | if (cmd == P_INITIAL_DATA) | ||
3438 | return "InitialData"; | ||
3439 | if (cmd == P_CONNECTION_FEATURES) | ||
3440 | return "ConnectionFeatures"; | ||
3441 | if (cmd >= ARRAY_SIZE(cmdnames)) | ||
3442 | return "Unknown"; | ||
3443 | return cmdnames[cmd]; | ||
3444 | } | ||
3445 | |||
3446 | /** | ||
3447 | * drbd_wait_misc - wait for a request to make progress | ||
3448 | * @mdev: device associated with the request | ||
3449 | * @i: the struct drbd_interval embedded in struct drbd_request or | ||
3450 | * struct drbd_peer_request | ||
3451 | */ | ||
3452 | int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i) | ||
3453 | { | ||
3454 | struct net_conf *nc; | ||
3455 | DEFINE_WAIT(wait); | ||
3456 | long timeout; | ||
3457 | |||
3458 | rcu_read_lock(); | ||
3459 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
3460 | if (!nc) { | ||
3461 | rcu_read_unlock(); | ||
3462 | return -ETIMEDOUT; | ||
3463 | } | ||
3464 | timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; | ||
3465 | rcu_read_unlock(); | ||
3466 | |||
3467 | /* Indicate to wake up mdev->misc_wait on progress. */ | ||
3468 | i->waiting = true; | ||
3469 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); | ||
3470 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
3471 | timeout = schedule_timeout(timeout); | ||
3472 | finish_wait(&mdev->misc_wait, &wait); | ||
3473 | spin_lock_irq(&mdev->tconn->req_lock); | ||
3474 | if (!timeout || mdev->state.conn < C_CONNECTED) | ||
3475 | return -ETIMEDOUT; | ||
3476 | if (signal_pending(current)) | ||
3477 | return -ERESTARTSYS; | ||
3478 | return 0; | ||
4306 | } | 3479 | } |
4307 | 3480 | ||
4308 | #ifdef CONFIG_DRBD_FAULT_INJECTION | 3481 | #ifdef CONFIG_DRBD_FAULT_INJECTION |
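drbd_wait_misc() above is called with tconn->req_lock held: it marks the interval as waiting, drops the spinlock, sleeps with a timeout derived from the connection's ko-count and timeout settings, re-takes the lock, and then decides between 0, -ETIMEDOUT and -ERESTARTSYS. Below is a minimal userspace sketch of the same drop-the-lock-while-sleeping shape, using a POSIX condition variable in place of the kernel wait queue; wait_misc(), req_lock, misc_wait and progress_made are illustrative names, not DRBD symbols.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t req_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  misc_wait = PTHREAD_COND_INITIALIZER;
static bool progress_made;

/* Called with req_lock held, like drbd_wait_misc(): the lock is released
 * only for the duration of the sleep and re-acquired before returning. */
static int wait_misc(long timeout_ms)
{
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec  += timeout_ms / 1000;
        deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
        if (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_sec++;
                deadline.tv_nsec -= 1000000000L;
        }

        while (!progress_made) {
                /* Atomically releases req_lock while blocked and re-takes it
                 * on wakeup -- the unlock/schedule_timeout()/lock dance. */
                int err = pthread_cond_timedwait(&misc_wait, &req_lock, &deadline);
                if (err == ETIMEDOUT)
                        return -ETIMEDOUT;
        }
        return 0;
}

int main(void)
{
        int rv;

        pthread_mutex_lock(&req_lock);
        rv = wait_misc(100);    /* nobody signals progress: expect -ETIMEDOUT */
        pthread_mutex_unlock(&req_lock);
        printf("wait_misc() = %d\n", rv);
        return 0;
}

Build with -pthread; the point is only the lock handover around the timed sleep, not the surrounding DRBD state checks.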
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index edb490aad8b4..2af26fc95280 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -29,159 +29,317 @@ | |||
29 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/connector.h> | ||
33 | #include <linux/blkpg.h> | 32 | #include <linux/blkpg.h> |
34 | #include <linux/cpumask.h> | 33 | #include <linux/cpumask.h> |
35 | #include "drbd_int.h" | 34 | #include "drbd_int.h" |
36 | #include "drbd_req.h" | 35 | #include "drbd_req.h" |
37 | #include "drbd_wrappers.h" | 36 | #include "drbd_wrappers.h" |
38 | #include <asm/unaligned.h> | 37 | #include <asm/unaligned.h> |
39 | #include <linux/drbd_tag_magic.h> | ||
40 | #include <linux/drbd_limits.h> | 38 | #include <linux/drbd_limits.h> |
41 | #include <linux/compiler.h> | ||
42 | #include <linux/kthread.h> | 39 | #include <linux/kthread.h> |
43 | 40 | ||
44 | static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); | 41 | #include <net/genetlink.h> |
45 | static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); | 42 | |
46 | static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); | 43 | /* .doit */ |
47 | 44 | // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); | |
48 | /* see get_sb_bdev and bd_claim */ | 45 | // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); |
46 | |||
47 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); | ||
48 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); | ||
49 | |||
50 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); | ||
51 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); | ||
52 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); | ||
53 | |||
54 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); | ||
55 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); | ||
56 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); | ||
57 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); | ||
58 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); | ||
59 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); | ||
60 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); | ||
61 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); | ||
62 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); | ||
63 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); | ||
64 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); | ||
65 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); | ||
66 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); | ||
67 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); | ||
68 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); | ||
69 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); | ||
70 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); | ||
71 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); | ||
72 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); | ||
73 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); | ||
74 | /* .dumpit */ | ||
75 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); | ||
76 | |||
77 | #include <linux/drbd_genl_api.h> | ||
78 | #include "drbd_nla.h" | ||
79 | #include <linux/genl_magic_func.h> | ||
80 | |||
81 | /* used blkdev_get_by_path, to claim our meta data device(s) */ | ||
49 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; | 82 | static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; |
50 | 83 | ||
51 | /* Generate the tag_list to struct functions */ | 84 | /* Configuration is strictly serialized, because generic netlink message |
52 | #define NL_PACKET(name, number, fields) \ | 85 | * processing is strictly serialized by the genl_lock(). |
53 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 86 | * Which means we can use one static global drbd_config_context struct. |
54 | unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ | 87 | */ |
55 | static int name ## _from_tags(struct drbd_conf *mdev, \ | 88 | static struct drbd_config_context { |
56 | unsigned short *tags, struct name *arg) \ | 89 | /* assigned from drbd_genlmsghdr */ |
57 | { \ | 90 | unsigned int minor; |
58 | int tag; \ | 91 | /* assigned from request attributes, if present */ |
59 | int dlen; \ | 92 | unsigned int volume; |
60 | \ | 93 | #define VOLUME_UNSPECIFIED (-1U) |
61 | while ((tag = get_unaligned(tags++)) != TT_END) { \ | 94 | /* pointer into the request skb, |
62 | dlen = get_unaligned(tags++); \ | 95 | * limited lifetime! */ |
63 | switch (tag_number(tag)) { \ | 96 | char *resource_name; |
64 | fields \ | 97 | struct nlattr *my_addr; |
65 | default: \ | 98 | struct nlattr *peer_addr; |
66 | if (tag & T_MANDATORY) { \ | 99 | |
67 | dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ | 100 | /* reply buffer */ |
68 | return 0; \ | 101 | struct sk_buff *reply_skb; |
69 | } \ | 102 | /* pointer into reply buffer */ |
70 | } \ | 103 | struct drbd_genlmsghdr *reply_dh; |
71 | tags = (unsigned short *)((char *)tags + dlen); \ | 104 | /* resolved from attributes, if possible */ |
72 | } \ | 105 | struct drbd_conf *mdev; |
73 | return 1; \ | 106 | struct drbd_tconn *tconn; |
74 | } | 107 | } adm_ctx; |
75 | #define NL_INTEGER(pn, pr, member) \ | 108 | |
76 | case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ | 109 | static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) |
77 | arg->member = get_unaligned((int *)(tags)); \ | 110 | { |
78 | break; | 111 | genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); |
79 | #define NL_INT64(pn, pr, member) \ | 112 | if (genlmsg_reply(skb, info)) |
80 | case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ | 113 | printk(KERN_ERR "drbd: error sending genl reply\n"); |
81 | arg->member = get_unaligned((u64 *)(tags)); \ | 114 | } |
115 | |||
116 | /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only | ||
117 | * reason it could fail was no space in skb, and there are 4k available. */ | ||
118 | int drbd_msg_put_info(const char *info) | ||
119 | { | ||
120 | struct sk_buff *skb = adm_ctx.reply_skb; | ||
121 | struct nlattr *nla; | ||
122 | int err = -EMSGSIZE; | ||
123 | |||
124 | if (!info || !info[0]) | ||
125 | return 0; | ||
126 | |||
127 | nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); | ||
128 | if (!nla) | ||
129 | return err; | ||
130 | |||
131 | err = nla_put_string(skb, T_info_text, info); | ||
132 | if (err) { | ||
133 | nla_nest_cancel(skb, nla); | ||
134 | return err; | ||
135 | } else | ||
136 | nla_nest_end(skb, nla); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | /* This would be a good candidate for a "pre_doit" hook, | ||
141 | * and per-family private info->pointers. | ||
142 | * But we need to stay compatible with older kernels. | ||
143 | * If it returns successfully, adm_ctx members are valid. | ||
144 | */ | ||
145 | #define DRBD_ADM_NEED_MINOR 1 | ||
146 | #define DRBD_ADM_NEED_RESOURCE 2 | ||
147 | #define DRBD_ADM_NEED_CONNECTION 4 | ||
148 | static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, | ||
149 | unsigned flags) | ||
150 | { | ||
151 | struct drbd_genlmsghdr *d_in = info->userhdr; | ||
152 | const u8 cmd = info->genlhdr->cmd; | ||
153 | int err; | ||
154 | |||
155 | memset(&adm_ctx, 0, sizeof(adm_ctx)); | ||
156 | |||
157 | /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ | ||
158 | if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) | ||
159 | return -EPERM; | ||
160 | |||
161 | adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); | ||
162 | if (!adm_ctx.reply_skb) { | ||
163 | err = -ENOMEM; | ||
164 | goto fail; | ||
165 | } | ||
166 | |||
167 | adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, | ||
168 | info, &drbd_genl_family, 0, cmd); | ||
169 | /* put of a few bytes into a fresh skb of >= 4k will always succeed. | ||
170 | * but anyways */ | ||
171 | if (!adm_ctx.reply_dh) { | ||
172 | err = -ENOMEM; | ||
173 | goto fail; | ||
174 | } | ||
175 | |||
176 | adm_ctx.reply_dh->minor = d_in->minor; | ||
177 | adm_ctx.reply_dh->ret_code = NO_ERROR; | ||
178 | |||
179 | adm_ctx.volume = VOLUME_UNSPECIFIED; | ||
180 | if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { | ||
181 | struct nlattr *nla; | ||
182 | /* parse and validate only */ | ||
183 | err = drbd_cfg_context_from_attrs(NULL, info); | ||
184 | if (err) | ||
185 | goto fail; | ||
186 | |||
187 | /* It was present, and valid, | ||
188 | * copy it over to the reply skb. */ | ||
189 | err = nla_put_nohdr(adm_ctx.reply_skb, | ||
190 | info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, | ||
191 | info->attrs[DRBD_NLA_CFG_CONTEXT]); | ||
192 | if (err) | ||
193 | goto fail; | ||
194 | |||
195 | /* and assign stuff to the global adm_ctx */ | ||
196 | nla = nested_attr_tb[__nla_type(T_ctx_volume)]; | ||
197 | if (nla) | ||
198 | adm_ctx.volume = nla_get_u32(nla); | ||
199 | nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; | ||
200 | if (nla) | ||
201 | adm_ctx.resource_name = nla_data(nla); | ||
202 | adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; | ||
203 | adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; | ||
204 | if ((adm_ctx.my_addr && | ||
205 | nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || | ||
206 | (adm_ctx.peer_addr && | ||
207 | nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { | ||
208 | err = -EINVAL; | ||
209 | goto fail; | ||
210 | } | ||
211 | } | ||
212 | |||
213 | adm_ctx.minor = d_in->minor; | ||
214 | adm_ctx.mdev = minor_to_mdev(d_in->minor); | ||
215 | adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); | ||
216 | |||
217 | if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { | ||
218 | drbd_msg_put_info("unknown minor"); | ||
219 | return ERR_MINOR_INVALID; | ||
220 | } | ||
221 | if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { | ||
222 | drbd_msg_put_info("unknown resource"); | ||
223 | return ERR_INVALID_REQUEST; | ||
224 | } | ||
225 | |||
226 | if (flags & DRBD_ADM_NEED_CONNECTION) { | ||
227 | if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { | ||
228 | drbd_msg_put_info("no resource name expected"); | ||
229 | return ERR_INVALID_REQUEST; | ||
230 | } | ||
231 | if (adm_ctx.mdev) { | ||
232 | drbd_msg_put_info("no minor number expected"); | ||
233 | return ERR_INVALID_REQUEST; | ||
234 | } | ||
235 | if (adm_ctx.my_addr && adm_ctx.peer_addr) | ||
236 | adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), | ||
237 | nla_len(adm_ctx.my_addr), | ||
238 | nla_data(adm_ctx.peer_addr), | ||
239 | nla_len(adm_ctx.peer_addr)); | ||
240 | if (!adm_ctx.tconn) { | ||
241 | drbd_msg_put_info("unknown connection"); | ||
242 | return ERR_INVALID_REQUEST; | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /* some more paranoia, if the request was over-determined */ | ||
247 | if (adm_ctx.mdev && adm_ctx.tconn && | ||
248 | adm_ctx.mdev->tconn != adm_ctx.tconn) { | ||
249 | pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", | ||
250 | adm_ctx.minor, adm_ctx.resource_name, | ||
251 | adm_ctx.mdev->tconn->name); | ||
252 | drbd_msg_put_info("minor exists in different resource"); | ||
253 | return ERR_INVALID_REQUEST; | ||
254 | } | ||
255 | if (adm_ctx.mdev && | ||
256 | adm_ctx.volume != VOLUME_UNSPECIFIED && | ||
257 | adm_ctx.volume != adm_ctx.mdev->vnr) { | ||
258 | pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", | ||
259 | adm_ctx.minor, adm_ctx.volume, | ||
260 | adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); | ||
261 | drbd_msg_put_info("minor exists as different volume"); | ||
262 | return ERR_INVALID_REQUEST; | ||
263 | } | ||
264 | |||
265 | return NO_ERROR; | ||
266 | |||
267 | fail: | ||
268 | nlmsg_free(adm_ctx.reply_skb); | ||
269 | adm_ctx.reply_skb = NULL; | ||
270 | return err; | ||
271 | } | ||
272 | |||
273 | static int drbd_adm_finish(struct genl_info *info, int retcode) | ||
274 | { | ||
275 | if (adm_ctx.tconn) { | ||
276 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
277 | adm_ctx.tconn = NULL; | ||
278 | } | ||
279 | |||
280 | if (!adm_ctx.reply_skb) | ||
281 | return -ENOMEM; | ||
282 | |||
283 | adm_ctx.reply_dh->ret_code = retcode; | ||
284 | drbd_adm_send_reply(adm_ctx.reply_skb, info); | ||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) | ||
289 | { | ||
290 | char *afs; | ||
291 | |||
292 | /* FIXME: A future version will not allow this case. */ | ||
293 | if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) | ||
294 | return; | ||
295 | |||
296 | switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { | ||
297 | case AF_INET6: | ||
298 | afs = "ipv6"; | ||
299 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", | ||
300 | &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); | ||
82 | break; | 301 | break; |
83 | #define NL_BIT(pn, pr, member) \ | 302 | case AF_INET: |
84 | case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ | 303 | afs = "ipv4"; |
85 | arg->member = *(char *)(tags) ? 1 : 0; \ | 304 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
305 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); | ||
86 | break; | 306 | break; |
87 | #define NL_STRING(pn, pr, member, len) \ | 307 | default: |
88 | case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ | 308 | afs = "ssocks"; |
89 | if (dlen > len) { \ | 309 | snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", |
90 | dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ | 310 | &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); |
91 | #member, dlen, (unsigned int)len); \ | 311 | } |
92 | return 0; \ | 312 | snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); |
93 | } \ | 313 | } |
94 | arg->member ## _len = dlen; \ | ||
95 | memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ | ||
96 | break; | ||
97 | #include <linux/drbd_nl.h> | ||
98 | |||
99 | /* Generate the struct to tag_list functions */ | ||
100 | #define NL_PACKET(name, number, fields) \ | ||
101 | static unsigned short* \ | ||
102 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
103 | struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ | ||
104 | static unsigned short* \ | ||
105 | name ## _to_tags(struct drbd_conf *mdev, \ | ||
106 | struct name *arg, unsigned short *tags) \ | ||
107 | { \ | ||
108 | fields \ | ||
109 | return tags; \ | ||
110 | } | ||
111 | |||
112 | #define NL_INTEGER(pn, pr, member) \ | ||
113 | put_unaligned(pn | pr | TT_INTEGER, tags++); \ | ||
114 | put_unaligned(sizeof(int), tags++); \ | ||
115 | put_unaligned(arg->member, (int *)tags); \ | ||
116 | tags = (unsigned short *)((char *)tags+sizeof(int)); | ||
117 | #define NL_INT64(pn, pr, member) \ | ||
118 | put_unaligned(pn | pr | TT_INT64, tags++); \ | ||
119 | put_unaligned(sizeof(u64), tags++); \ | ||
120 | put_unaligned(arg->member, (u64 *)tags); \ | ||
121 | tags = (unsigned short *)((char *)tags+sizeof(u64)); | ||
122 | #define NL_BIT(pn, pr, member) \ | ||
123 | put_unaligned(pn | pr | TT_BIT, tags++); \ | ||
124 | put_unaligned(sizeof(char), tags++); \ | ||
125 | *(char *)tags = arg->member; \ | ||
126 | tags = (unsigned short *)((char *)tags+sizeof(char)); | ||
127 | #define NL_STRING(pn, pr, member, len) \ | ||
128 | put_unaligned(pn | pr | TT_STRING, tags++); \ | ||
129 | put_unaligned(arg->member ## _len, tags++); \ | ||
130 | memcpy(tags, arg->member, arg->member ## _len); \ | ||
131 | tags = (unsigned short *)((char *)tags + arg->member ## _len); | ||
132 | #include <linux/drbd_nl.h> | ||
133 | |||
134 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); | ||
135 | void drbd_nl_send_reply(struct cn_msg *, int); | ||
136 | 314 | ||
137 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) | 315 | int drbd_khelper(struct drbd_conf *mdev, char *cmd) |
138 | { | 316 | { |
139 | char *envp[] = { "HOME=/", | 317 | char *envp[] = { "HOME=/", |
140 | "TERM=linux", | 318 | "TERM=linux", |
141 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | 319 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", |
142 | NULL, /* Will be set to address family */ | 320 | (char[20]) { }, /* address family */ |
143 | NULL, /* Will be set to address */ | 321 | (char[60]) { }, /* address */ |
144 | NULL }; | 322 | NULL }; |
145 | 323 | char mb[12]; | |
146 | char mb[12], af[20], ad[60], *afs; | ||
147 | char *argv[] = {usermode_helper, cmd, mb, NULL }; | 324 | char *argv[] = {usermode_helper, cmd, mb, NULL }; |
325 | struct drbd_tconn *tconn = mdev->tconn; | ||
326 | struct sib_info sib; | ||
148 | int ret; | 327 | int ret; |
149 | 328 | ||
150 | if (current == mdev->worker.task) | 329 | if (current == tconn->worker.task) |
151 | set_bit(CALLBACK_PENDING, &mdev->flags); | 330 | set_bit(CALLBACK_PENDING, &tconn->flags); |
152 | 331 | ||
153 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); | 332 | snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); |
154 | 333 | setup_khelper_env(tconn, envp); | |
155 | if (get_net_conf(mdev)) { | ||
156 | switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { | ||
157 | case AF_INET6: | ||
158 | afs = "ipv6"; | ||
159 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", | ||
160 | &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); | ||
161 | break; | ||
162 | case AF_INET: | ||
163 | afs = "ipv4"; | ||
164 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
165 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
166 | break; | ||
167 | default: | ||
168 | afs = "ssocks"; | ||
169 | snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", | ||
170 | &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); | ||
171 | } | ||
172 | snprintf(af, 20, "DRBD_PEER_AF=%s", afs); | ||
173 | envp[3]=af; | ||
174 | envp[4]=ad; | ||
175 | put_net_conf(mdev); | ||
176 | } | ||
177 | 334 | ||
178 | /* The helper may take some time. | 335 | /* The helper may take some time. |
179 | * write out any unsynced meta data changes now */ | 336 | * write out any unsynced meta data changes now */ |
180 | drbd_md_sync(mdev); | 337 | drbd_md_sync(mdev); |
181 | 338 | ||
182 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); | 339 | dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); |
183 | 340 | sib.sib_reason = SIB_HELPER_PRE; | |
184 | drbd_bcast_ev_helper(mdev, cmd); | 341 | sib.helper_name = cmd; |
342 | drbd_bcast_event(mdev, &sib); | ||
185 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | 343 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); |
186 | if (ret) | 344 | if (ret) |
187 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 345 | dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
@@ -191,9 +349,46 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
191 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", | 349 | dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", |
192 | usermode_helper, cmd, mb, | 350 | usermode_helper, cmd, mb, |
193 | (ret >> 8) & 0xff, ret); | 351 | (ret >> 8) & 0xff, ret); |
352 | sib.sib_reason = SIB_HELPER_POST; | ||
353 | sib.helper_exit_code = ret; | ||
354 | drbd_bcast_event(mdev, &sib); | ||
355 | |||
356 | if (current == tconn->worker.task) | ||
357 | clear_bit(CALLBACK_PENDING, &tconn->flags); | ||
358 | |||
359 | if (ret < 0) /* Ignore any ERRNOs we got. */ | ||
360 | ret = 0; | ||
361 | |||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | int conn_khelper(struct drbd_tconn *tconn, char *cmd) | ||
366 | { | ||
367 | char *envp[] = { "HOME=/", | ||
368 | "TERM=linux", | ||
369 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
370 | (char[20]) { }, /* address family */ | ||
371 | (char[60]) { }, /* address */ | ||
372 | NULL }; | ||
373 | char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; | ||
374 | int ret; | ||
375 | |||
376 | setup_khelper_env(tconn, envp); | ||
377 | conn_md_sync(tconn); | ||
194 | 378 | ||
195 | if (current == mdev->worker.task) | 379 | conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); |
196 | clear_bit(CALLBACK_PENDING, &mdev->flags); | 380 | /* TODO: conn_bcast_event() ?? */ |
381 | |||
382 | ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); | ||
383 | if (ret) | ||
384 | conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
385 | usermode_helper, cmd, tconn->name, | ||
386 | (ret >> 8) & 0xff, ret); | ||
387 | else | ||
388 | conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", | ||
389 | usermode_helper, cmd, tconn->name, | ||
390 | (ret >> 8) & 0xff, ret); | ||
391 | /* TODO: conn_bcast_event() ?? */ | ||
197 | 392 | ||
198 | if (ret < 0) /* Ignore any ERRNOs we got. */ | 393 | if (ret < 0) /* Ignore any ERRNOs we got. */ |
199 | ret = 0; | 394 | ret = 0; |
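Note how the rewritten drbd_khelper() and the new conn_khelper() drop the old af[20]/ad[60] stack buffers that were patched into envp[] after the fact: the environment array now embeds writable scratch space directly in its initializer via C99 compound literals, and setup_khelper_env() snprintf()s the DRBD_PEER_AF / DRBD_PEER_ADDRESS strings into slots 3 and 4. A self-contained userspace sketch of that trick, with made-up address values:

#include <stdio.h>

int main(void)
{
        /* Each compound literal is an unnamed, writable char array that
         * lives for the enclosing block, so no separate buffers are needed. */
        char *envp[] = { "HOME=/",
                         "TERM=linux",
                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
                         (char[20]) { 0 },      /* address family */
                         (char[60]) { 0 },      /* address */
                         NULL };
        char **e;

        snprintf(envp[3], 20, "DRBD_PEER_AF=%s", "ipv4");
        snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%s", "192.0.2.1");

        for (e = envp; *e; e++)
                puts(*e);
        return 0;
}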
@@ -201,116 +396,129 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) | |||
201 | return ret; | 396 | return ret; |
202 | } | 397 | } |
203 | 398 | ||
204 | enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) | 399 | static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) |
205 | { | 400 | { |
401 | enum drbd_fencing_p fp = FP_NOT_AVAIL; | ||
402 | struct drbd_conf *mdev; | ||
403 | int vnr; | ||
404 | |||
405 | rcu_read_lock(); | ||
406 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
407 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | ||
408 | fp = max_t(enum drbd_fencing_p, fp, | ||
409 | rcu_dereference(mdev->ldev->disk_conf)->fencing); | ||
410 | put_ldev(mdev); | ||
411 | } | ||
412 | } | ||
413 | rcu_read_unlock(); | ||
414 | |||
415 | return fp; | ||
416 | } | ||
417 | |||
418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) | ||
419 | { | ||
420 | union drbd_state mask = { }; | ||
421 | union drbd_state val = { }; | ||
422 | enum drbd_fencing_p fp; | ||
206 | char *ex_to_string; | 423 | char *ex_to_string; |
207 | int r; | 424 | int r; |
208 | enum drbd_disk_state nps; | ||
209 | enum drbd_fencing_p fp; | ||
210 | 425 | ||
211 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 426 | if (tconn->cstate >= C_WF_REPORT_PARAMS) { |
427 | conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n"); | ||
428 | return false; | ||
429 | } | ||
212 | 430 | ||
213 | if (get_ldev_if_state(mdev, D_CONSISTENT)) { | 431 | fp = highest_fencing_policy(tconn); |
214 | fp = mdev->ldev->dc.fencing; | 432 | switch (fp) { |
215 | put_ldev(mdev); | 433 | case FP_NOT_AVAIL: |
216 | } else { | 434 | conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n"); |
217 | dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); | ||
218 | nps = mdev->state.pdsk; | ||
219 | goto out; | 435 | goto out; |
436 | case FP_DONT_CARE: | ||
437 | return true; | ||
438 | default: ; | ||
220 | } | 439 | } |
221 | 440 | ||
222 | r = drbd_khelper(mdev, "fence-peer"); | 441 | r = conn_khelper(tconn, "fence-peer"); |
223 | 442 | ||
224 | switch ((r>>8) & 0xff) { | 443 | switch ((r>>8) & 0xff) { |
225 | case 3: /* peer is inconsistent */ | 444 | case 3: /* peer is inconsistent */ |
226 | ex_to_string = "peer is inconsistent or worse"; | 445 | ex_to_string = "peer is inconsistent or worse"; |
227 | nps = D_INCONSISTENT; | 446 | mask.pdsk = D_MASK; |
447 | val.pdsk = D_INCONSISTENT; | ||
228 | break; | 448 | break; |
229 | case 4: /* peer got outdated, or was already outdated */ | 449 | case 4: /* peer got outdated, or was already outdated */ |
230 | ex_to_string = "peer was fenced"; | 450 | ex_to_string = "peer was fenced"; |
231 | nps = D_OUTDATED; | 451 | mask.pdsk = D_MASK; |
452 | val.pdsk = D_OUTDATED; | ||
232 | break; | 453 | break; |
233 | case 5: /* peer was down */ | 454 | case 5: /* peer was down */ |
234 | if (mdev->state.disk == D_UP_TO_DATE) { | 455 | if (conn_highest_disk(tconn) == D_UP_TO_DATE) { |
235 | /* we will(have) create(d) a new UUID anyways... */ | 456 | /* we will(have) create(d) a new UUID anyways... */ |
236 | ex_to_string = "peer is unreachable, assumed to be dead"; | 457 | ex_to_string = "peer is unreachable, assumed to be dead"; |
237 | nps = D_OUTDATED; | 458 | mask.pdsk = D_MASK; |
459 | val.pdsk = D_OUTDATED; | ||
238 | } else { | 460 | } else { |
239 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; | 461 | ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; |
240 | nps = mdev->state.pdsk; | ||
241 | } | 462 | } |
242 | break; | 463 | break; |
243 | case 6: /* Peer is primary, voluntarily outdate myself. | 464 | case 6: /* Peer is primary, voluntarily outdate myself. |
244 | * This is useful when an unconnected R_SECONDARY is asked to | 465 | * This is useful when an unconnected R_SECONDARY is asked to |
245 | * become R_PRIMARY, but finds the other peer being active. */ | 466 | * become R_PRIMARY, but finds the other peer being active. */ |
246 | ex_to_string = "peer is active"; | 467 | ex_to_string = "peer is active"; |
247 | dev_warn(DEV, "Peer is primary, outdating myself.\n"); | 468 | conn_warn(tconn, "Peer is primary, outdating myself.\n"); |
248 | nps = D_UNKNOWN; | 469 | mask.disk = D_MASK; |
249 | _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); | 470 | val.disk = D_OUTDATED; |
250 | break; | 471 | break; |
251 | case 7: | 472 | case 7: |
252 | if (fp != FP_STONITH) | 473 | if (fp != FP_STONITH) |
253 | dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); | 474 | conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n"); |
254 | ex_to_string = "peer was stonithed"; | 475 | ex_to_string = "peer was stonithed"; |
255 | nps = D_OUTDATED; | 476 | mask.pdsk = D_MASK; |
477 | val.pdsk = D_OUTDATED; | ||
256 | break; | 478 | break; |
257 | default: | 479 | default: |
258 | /* The script is broken ... */ | 480 | /* The script is broken ... */ |
259 | nps = D_UNKNOWN; | 481 | conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); |
260 | dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); | 482 | return false; /* Eventually leave IO frozen */ |
261 | return nps; | ||
262 | } | 483 | } |
263 | 484 | ||
264 | dev_info(DEV, "fence-peer helper returned %d (%s)\n", | 485 | conn_info(tconn, "fence-peer helper returned %d (%s)\n", |
265 | (r>>8) & 0xff, ex_to_string); | 486 | (r>>8) & 0xff, ex_to_string); |
266 | 487 | ||
267 | out: | 488 | out: |
268 | if (mdev->state.susp_fen && nps >= D_UNKNOWN) { | ||
269 | /* The handler was not successful... unfreeze here, the | ||
270 | state engine can not unfreeze... */ | ||
271 | _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); | ||
272 | } | ||
273 | 489 | ||
274 | return nps; | 490 | /* Not using |
491 | conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
492 | here, because we might were able to re-establish the connection in the | ||
493 | meantime. */ | ||
494 | spin_lock_irq(&tconn->req_lock); | ||
495 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) | ||
496 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
497 | spin_unlock_irq(&tconn->req_lock); | ||
498 | |||
499 | return conn_highest_pdsk(tconn) <= D_OUTDATED; | ||
275 | } | 500 | } |
276 | 501 | ||
277 | static int _try_outdate_peer_async(void *data) | 502 | static int _try_outdate_peer_async(void *data) |
278 | { | 503 | { |
279 | struct drbd_conf *mdev = (struct drbd_conf *)data; | 504 | struct drbd_tconn *tconn = (struct drbd_tconn *)data; |
280 | enum drbd_disk_state nps; | ||
281 | union drbd_state ns; | ||
282 | 505 | ||
283 | nps = drbd_try_outdate_peer(mdev); | 506 | conn_try_outdate_peer(tconn); |
284 | |||
285 | /* Not using | ||
286 | drbd_request_state(mdev, NS(pdsk, nps)); | ||
287 | here, because we might were able to re-establish the connection | ||
288 | in the meantime. This can only partially be solved in the state's | ||
289 | engine is_valid_state() and is_valid_state_transition() | ||
290 | functions. | ||
291 | |||
292 | nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. | ||
293 | pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, | ||
294 | therefore we have to have the pre state change check here. | ||
295 | */ | ||
296 | spin_lock_irq(&mdev->req_lock); | ||
297 | ns = mdev->state; | ||
298 | if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { | ||
299 | ns.pdsk = nps; | ||
300 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
301 | } | ||
302 | spin_unlock_irq(&mdev->req_lock); | ||
303 | 507 | ||
508 | kref_put(&tconn->kref, &conn_destroy); | ||
304 | return 0; | 509 | return 0; |
305 | } | 510 | } |
306 | 511 | ||
307 | void drbd_try_outdate_peer_async(struct drbd_conf *mdev) | 512 | void conn_try_outdate_peer_async(struct drbd_tconn *tconn) |
308 | { | 513 | { |
309 | struct task_struct *opa; | 514 | struct task_struct *opa; |
310 | 515 | ||
311 | opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); | 516 | kref_get(&tconn->kref); |
312 | if (IS_ERR(opa)) | 517 | opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); |
313 | dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); | 518 | if (IS_ERR(opa)) { |
519 | conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); | ||
520 | kref_put(&tconn->kref, &conn_destroy); | ||
521 | } | ||
314 | } | 522 | } |
315 | 523 | ||
316 | enum drbd_state_rv | 524 | enum drbd_state_rv |
@@ -318,15 +526,15 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
318 | { | 526 | { |
319 | const int max_tries = 4; | 527 | const int max_tries = 4; |
320 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; | 528 | enum drbd_state_rv rv = SS_UNKNOWN_ERROR; |
529 | struct net_conf *nc; | ||
321 | int try = 0; | 530 | int try = 0; |
322 | int forced = 0; | 531 | int forced = 0; |
323 | union drbd_state mask, val; | 532 | union drbd_state mask, val; |
324 | enum drbd_disk_state nps; | ||
325 | 533 | ||
326 | if (new_role == R_PRIMARY) | 534 | if (new_role == R_PRIMARY) |
327 | request_ping(mdev); /* Detect a dead peer ASAP */ | 535 | request_ping(mdev->tconn); /* Detect a dead peer ASAP */ |
328 | 536 | ||
329 | mutex_lock(&mdev->state_mutex); | 537 | mutex_lock(mdev->state_mutex); |
330 | 538 | ||
331 | mask.i = 0; mask.role = R_MASK; | 539 | mask.i = 0; mask.role = R_MASK; |
332 | val.i = 0; val.role = new_role; | 540 | val.i = 0; val.role = new_role; |
@@ -354,38 +562,34 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
354 | if (rv == SS_NO_UP_TO_DATE_DISK && | 562 | if (rv == SS_NO_UP_TO_DATE_DISK && |
355 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { | 563 | mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { |
356 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); | 564 | D_ASSERT(mdev->state.pdsk == D_UNKNOWN); |
357 | nps = drbd_try_outdate_peer(mdev); | ||
358 | 565 | ||
359 | if (nps == D_OUTDATED || nps == D_INCONSISTENT) { | 566 | if (conn_try_outdate_peer(mdev->tconn)) { |
360 | val.disk = D_UP_TO_DATE; | 567 | val.disk = D_UP_TO_DATE; |
361 | mask.disk = D_MASK; | 568 | mask.disk = D_MASK; |
362 | } | 569 | } |
363 | |||
364 | val.pdsk = nps; | ||
365 | mask.pdsk = D_MASK; | ||
366 | |||
367 | continue; | 570 | continue; |
368 | } | 571 | } |
369 | 572 | ||
370 | if (rv == SS_NOTHING_TO_DO) | 573 | if (rv == SS_NOTHING_TO_DO) |
371 | goto fail; | 574 | goto out; |
372 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { | 575 | if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { |
373 | nps = drbd_try_outdate_peer(mdev); | 576 | if (!conn_try_outdate_peer(mdev->tconn) && force) { |
374 | |||
375 | if (force && nps > D_OUTDATED) { | ||
376 | dev_warn(DEV, "Forced into split brain situation!\n"); | 577 | dev_warn(DEV, "Forced into split brain situation!\n"); |
377 | nps = D_OUTDATED; | 578 | mask.pdsk = D_MASK; |
378 | } | 579 | val.pdsk = D_OUTDATED; |
379 | |||
380 | mask.pdsk = D_MASK; | ||
381 | val.pdsk = nps; | ||
382 | 580 | ||
581 | } | ||
383 | continue; | 582 | continue; |
384 | } | 583 | } |
385 | if (rv == SS_TWO_PRIMARIES) { | 584 | if (rv == SS_TWO_PRIMARIES) { |
386 | /* Maybe the peer is detected as dead very soon... | 585 | /* Maybe the peer is detected as dead very soon... |
387 | retry at most once more in this case. */ | 586 | retry at most once more in this case. */ |
388 | schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10); | 587 | int timeo; |
588 | rcu_read_lock(); | ||
589 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
590 | timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; | ||
591 | rcu_read_unlock(); | ||
592 | schedule_timeout_interruptible(timeo); | ||
389 | if (try < max_tries) | 593 | if (try < max_tries) |
390 | try = max_tries - 1; | 594 | try = max_tries - 1; |
391 | continue; | 595 | continue; |
@@ -394,13 +598,13 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
394 | rv = _drbd_request_state(mdev, mask, val, | 598 | rv = _drbd_request_state(mdev, mask, val, |
395 | CS_VERBOSE + CS_WAIT_COMPLETE); | 599 | CS_VERBOSE + CS_WAIT_COMPLETE); |
396 | if (rv < SS_SUCCESS) | 600 | if (rv < SS_SUCCESS) |
397 | goto fail; | 601 | goto out; |
398 | } | 602 | } |
399 | break; | 603 | break; |
400 | } | 604 | } |
401 | 605 | ||
402 | if (rv < SS_SUCCESS) | 606 | if (rv < SS_SUCCESS) |
403 | goto fail; | 607 | goto out; |
404 | 608 | ||
405 | if (forced) | 609 | if (forced) |
406 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); | 610 | dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); |
@@ -408,6 +612,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
408 | /* Wait until nothing is on the fly :) */ | 612 | /* Wait until nothing is on the fly :) */ |
409 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); | 613 | wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); |
410 | 614 | ||
615 | /* FIXME also wait for all pending P_BARRIER_ACK? */ | ||
616 | |||
411 | if (new_role == R_SECONDARY) { | 617 | if (new_role == R_SECONDARY) { |
412 | set_disk_ro(mdev->vdisk, true); | 618 | set_disk_ro(mdev->vdisk, true); |
413 | if (get_ldev(mdev)) { | 619 | if (get_ldev(mdev)) { |
@@ -415,10 +621,12 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
415 | put_ldev(mdev); | 621 | put_ldev(mdev); |
416 | } | 622 | } |
417 | } else { | 623 | } else { |
418 | if (get_net_conf(mdev)) { | 624 | mutex_lock(&mdev->tconn->conf_update); |
419 | mdev->net_conf->want_lose = 0; | 625 | nc = mdev->tconn->net_conf; |
420 | put_net_conf(mdev); | 626 | if (nc) |
421 | } | 627 | nc->discard_my_data = 0; /* without copy; single bit op is atomic */ |
628 | mutex_unlock(&mdev->tconn->conf_update); | ||
629 | |||
422 | set_disk_ro(mdev->vdisk, false); | 630 | set_disk_ro(mdev->vdisk, false); |
423 | if (get_ldev(mdev)) { | 631 | if (get_ldev(mdev)) { |
424 | if (((mdev->state.conn < C_CONNECTED || | 632 | if (((mdev->state.conn < C_CONNECTED || |
@@ -444,67 +652,47 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) | |||
444 | drbd_md_sync(mdev); | 652 | drbd_md_sync(mdev); |
445 | 653 | ||
446 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 654 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
447 | fail: | 655 | out: |
448 | mutex_unlock(&mdev->state_mutex); | 656 | mutex_unlock(mdev->state_mutex); |
449 | return rv; | 657 | return rv; |
450 | } | 658 | } |
451 | 659 | ||
452 | static struct drbd_conf *ensure_mdev(int minor, int create) | 660 | static const char *from_attrs_err_to_txt(int err) |
453 | { | 661 | { |
454 | struct drbd_conf *mdev; | 662 | return err == -ENOMSG ? "required attribute missing" : |
455 | 663 | err == -EOPNOTSUPP ? "unknown mandatory attribute" : | |
456 | if (minor >= minor_count) | 664 | err == -EEXIST ? "can not change invariant setting" : |
457 | return NULL; | 665 | "invalid attribute value"; |
458 | |||
459 | mdev = minor_to_mdev(minor); | ||
460 | |||
461 | if (!mdev && create) { | ||
462 | struct gendisk *disk = NULL; | ||
463 | mdev = drbd_new_device(minor); | ||
464 | |||
465 | spin_lock_irq(&drbd_pp_lock); | ||
466 | if (minor_table[minor] == NULL) { | ||
467 | minor_table[minor] = mdev; | ||
468 | disk = mdev->vdisk; | ||
469 | mdev = NULL; | ||
470 | } /* else: we lost the race */ | ||
471 | spin_unlock_irq(&drbd_pp_lock); | ||
472 | |||
473 | if (disk) /* we won the race above */ | ||
474 | /* in case we ever add a drbd_delete_device(), | ||
475 | * don't forget the del_gendisk! */ | ||
476 | add_disk(disk); | ||
477 | else /* we lost the race above */ | ||
478 | drbd_free_mdev(mdev); | ||
479 | |||
480 | mdev = minor_to_mdev(minor); | ||
481 | } | ||
482 | |||
483 | return mdev; | ||
484 | } | 666 | } |
485 | 667 | ||
486 | static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 668 | int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) |
487 | struct drbd_nl_cfg_reply *reply) | ||
488 | { | 669 | { |
489 | struct primary primary_args; | 670 | struct set_role_parms parms; |
490 | 671 | int err; | |
491 | memset(&primary_args, 0, sizeof(struct primary)); | 672 | enum drbd_ret_code retcode; |
492 | if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { | ||
493 | reply->ret_code = ERR_MANDATORY_TAG; | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | reply->ret_code = | ||
498 | drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); | ||
499 | 673 | ||
500 | return 0; | 674 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
501 | } | 675 | if (!adm_ctx.reply_skb) |
676 | return retcode; | ||
677 | if (retcode != NO_ERROR) | ||
678 | goto out; | ||
502 | 679 | ||
503 | static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 680 | memset(&parms, 0, sizeof(parms)); |
504 | struct drbd_nl_cfg_reply *reply) | 681 | if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { |
505 | { | 682 | err = set_role_parms_from_attrs(&parms, info); |
506 | reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); | 683 | if (err) { |
684 | retcode = ERR_MANDATORY_TAG; | ||
685 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
686 | goto out; | ||
687 | } | ||
688 | } | ||
507 | 689 | ||
690 | if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) | ||
691 | retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate); | ||
692 | else | ||
693 | retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0); | ||
694 | out: | ||
695 | drbd_adm_finish(info, retcode); | ||
508 | return 0; | 696 | return 0; |
509 | } | 697 | } |
510 | 698 | ||
@@ -514,7 +702,12 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
514 | struct drbd_backing_dev *bdev) | 702 | struct drbd_backing_dev *bdev) |
515 | { | 703 | { |
516 | sector_t md_size_sect = 0; | 704 | sector_t md_size_sect = 0; |
517 | switch (bdev->dc.meta_dev_idx) { | 705 | int meta_dev_idx; |
706 | |||
707 | rcu_read_lock(); | ||
708 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
709 | |||
710 | switch (meta_dev_idx) { | ||
518 | default: | 711 | default: |
519 | /* v07 style fixed size indexed meta data */ | 712 | /* v07 style fixed size indexed meta data */ |
520 | bdev->md.md_size_sect = MD_RESERVED_SECT; | 713 | bdev->md.md_size_sect = MD_RESERVED_SECT; |
@@ -533,7 +726,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
533 | case DRBD_MD_INDEX_FLEX_INT: | 726 | case DRBD_MD_INDEX_FLEX_INT: |
534 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | 727 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); |
535 | /* al size is still fixed */ | 728 | /* al size is still fixed */ |
536 | bdev->md.al_offset = -MD_AL_MAX_SIZE; | 729 | bdev->md.al_offset = -MD_AL_SECTORS; |
537 | /* we need (slightly less than) ~ this much bitmap sectors: */ | 730 | /* we need (slightly less than) ~ this much bitmap sectors: */ |
538 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | 731 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); |
539 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | 732 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); |
@@ -549,6 +742,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
549 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | 742 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; |
550 | break; | 743 | break; |
551 | } | 744 | } |
745 | rcu_read_unlock(); | ||
552 | } | 746 | } |
553 | 747 | ||
554 | /* input size is expected to be in KB */ | 748 | /* input size is expected to be in KB */ |
@@ -581,10 +775,16 @@ char *ppsize(char *buf, unsigned long long size) | |||
581 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: | 775 | * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: |
582 | * peer may not initiate a resize. | 776 | * peer may not initiate a resize. |
583 | */ | 777 | */ |
778 | /* Note these are not to be confused with | ||
779 | * drbd_adm_suspend_io/drbd_adm_resume_io, | ||
780 | * which are (sub) state changes triggered by admin (drbdsetup), | ||
781 | * and can be long lived. | ||
782 | * This changes an mdev->flag, is triggered by drbd internals, | ||
783 | * and should be short-lived. */ | ||
584 | void drbd_suspend_io(struct drbd_conf *mdev) | 784 | void drbd_suspend_io(struct drbd_conf *mdev) |
585 | { | 785 | { |
586 | set_bit(SUSPEND_IO, &mdev->flags); | 786 | set_bit(SUSPEND_IO, &mdev->flags); |
587 | if (is_susp(mdev->state)) | 787 | if (drbd_suspended(mdev)) |
588 | return; | 788 | return; |
589 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); | 789 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); |
590 | } | 790 | } |
@@ -605,7 +805,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
605 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 805 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
606 | { | 806 | { |
607 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 807 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
608 | sector_t la_size; | 808 | sector_t la_size, u_size; |
609 | sector_t size; | 809 | sector_t size; |
610 | char ppb[10]; | 810 | char ppb[10]; |
611 | 811 | ||
@@ -633,7 +833,10 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
633 | /* TODO: should only be some assert here, not (re)init... */ | 833 | /* TODO: should only be some assert here, not (re)init... */ |
634 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 834 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
635 | 835 | ||
636 | size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); | 836 | rcu_read_lock(); |
837 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
838 | rcu_read_unlock(); | ||
839 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); | ||
637 | 840 | ||
638 | if (drbd_get_capacity(mdev->this_bdev) != size || | 841 | if (drbd_get_capacity(mdev->this_bdev) != size || |
639 | drbd_bm_capacity(mdev) != size) { | 842 | drbd_bm_capacity(mdev) != size) { |
@@ -696,12 +899,12 @@ out: | |||
696 | } | 899 | } |
697 | 900 | ||
698 | sector_t | 901 | sector_t |
699 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) | 902 | drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, |
903 | sector_t u_size, int assume_peer_has_space) | ||
700 | { | 904 | { |
701 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | 905 | sector_t p_size = mdev->p_size; /* partner's disk size. */ |
702 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | 906 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ |
703 | sector_t m_size; /* my size */ | 907 | sector_t m_size; /* my size */ |
704 | sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ | ||
705 | sector_t size = 0; | 908 | sector_t size = 0; |
706 | 909 | ||
707 | m_size = drbd_get_max_capacity(bdev); | 910 | m_size = drbd_get_max_capacity(bdev); |
@@ -750,24 +953,21 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass | |||
750 | * failed, and 0 on success. You should call drbd_md_sync() after you called | 953 | * failed, and 0 on success. You should call drbd_md_sync() after you called |
751 | * this function. | 954 | * this function. |
752 | */ | 955 | */ |
753 | static int drbd_check_al_size(struct drbd_conf *mdev) | 956 | static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc) |
754 | { | 957 | { |
755 | struct lru_cache *n, *t; | 958 | struct lru_cache *n, *t; |
756 | struct lc_element *e; | 959 | struct lc_element *e; |
757 | unsigned int in_use; | 960 | unsigned int in_use; |
758 | int i; | 961 | int i; |
759 | 962 | ||
760 | ERR_IF(mdev->sync_conf.al_extents < 7) | ||
761 | mdev->sync_conf.al_extents = 127; | ||
762 | |||
763 | if (mdev->act_log && | 963 | if (mdev->act_log && |
764 | mdev->act_log->nr_elements == mdev->sync_conf.al_extents) | 964 | mdev->act_log->nr_elements == dc->al_extents) |
765 | return 0; | 965 | return 0; |
766 | 966 | ||
767 | in_use = 0; | 967 | in_use = 0; |
768 | t = mdev->act_log; | 968 | t = mdev->act_log; |
769 | n = lc_create("act_log", drbd_al_ext_cache, | 969 | n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, |
770 | mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); | 970 | dc->al_extents, sizeof(struct lc_element), 0); |
771 | 971 | ||
772 | if (n == NULL) { | 972 | if (n == NULL) { |
773 | dev_err(DEV, "Cannot allocate act_log lru!\n"); | 973 | dev_err(DEV, "Cannot allocate act_log lru!\n"); |
@@ -808,7 +1008,9 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ | |||
808 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; | 1008 | struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; |
809 | 1009 | ||
810 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); | 1010 | max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); |
811 | max_segments = mdev->ldev->dc.max_bio_bvecs; | 1011 | rcu_read_lock(); |
1012 | max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs; | ||
1013 | rcu_read_unlock(); | ||
812 | put_ldev(mdev); | 1014 | put_ldev(mdev); |
813 | } | 1015 | } |
814 | 1016 | ||
@@ -852,12 +1054,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
852 | Because new from 8.3.8 onwards the peer can use multiple | 1054 | Because new from 8.3.8 onwards the peer can use multiple |
853 | BIOs for a single peer_request */ | 1055 | BIOs for a single peer_request */ |
854 | if (mdev->state.conn >= C_CONNECTED) { | 1056 | if (mdev->state.conn >= C_CONNECTED) { |
855 | if (mdev->agreed_pro_version < 94) { | 1057 | if (mdev->tconn->agreed_pro_version < 94) |
856 | peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 1058 | peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
857 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ | 1059 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ |
858 | } else if (mdev->agreed_pro_version == 94) | 1060 | else if (mdev->tconn->agreed_pro_version == 94) |
859 | peer = DRBD_MAX_SIZE_H80_PACKET; | 1061 | peer = DRBD_MAX_SIZE_H80_PACKET; |
860 | else /* drbd 8.3.8 onwards */ | 1062 | else if (mdev->tconn->agreed_pro_version < 100) |
1063 | peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ | ||
1064 | else | ||
861 | peer = DRBD_MAX_BIO_SIZE; | 1065 | peer = DRBD_MAX_BIO_SIZE; |
862 | } | 1066 | } |
863 | 1067 | ||
@@ -872,36 +1076,27 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
872 | drbd_setup_queue_param(mdev, new); | 1076 | drbd_setup_queue_param(mdev, new); |
873 | } | 1077 | } |
874 | 1078 | ||
875 | /* serialize deconfig (worker exiting, doing cleanup) | 1079 | /* Starts the worker thread */ |
876 | * and reconfig (drbdsetup disk, drbdsetup net) | 1080 | static void conn_reconfig_start(struct drbd_tconn *tconn) |
877 | * | ||
878 | * Wait for a potentially exiting worker, then restart it, | ||
879 | * or start a new one. Flush any pending work, there may still be an | ||
880 | * after_state_change queued. | ||
881 | */ | ||
882 | static void drbd_reconfig_start(struct drbd_conf *mdev) | ||
883 | { | 1081 | { |
884 | wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); | 1082 | drbd_thread_start(&tconn->worker); |
885 | wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); | 1083 | conn_flush_workqueue(tconn); |
886 | drbd_thread_start(&mdev->worker); | ||
887 | drbd_flush_workqueue(mdev); | ||
888 | } | 1084 | } |
889 | 1085 | ||
890 | /* if still unconfigured, stops worker again. | 1086 | /* if still unconfigured, stops worker again. */ |
891 | * if configured now, clears CONFIG_PENDING. | 1087 | static void conn_reconfig_done(struct drbd_tconn *tconn) |
892 | * wakes potential waiters */ | ||
893 | static void drbd_reconfig_done(struct drbd_conf *mdev) | ||
894 | { | 1088 | { |
895 | spin_lock_irq(&mdev->req_lock); | 1089 | bool stop_threads; |
896 | if (mdev->state.disk == D_DISKLESS && | 1090 | spin_lock_irq(&tconn->req_lock); |
897 | mdev->state.conn == C_STANDALONE && | 1091 | stop_threads = conn_all_vols_unconf(tconn) && |
898 | mdev->state.role == R_SECONDARY) { | 1092 | tconn->cstate == C_STANDALONE; |
899 | set_bit(DEVICE_DYING, &mdev->flags); | 1093 | spin_unlock_irq(&tconn->req_lock); |
900 | drbd_thread_stop_nowait(&mdev->worker); | 1094 | if (stop_threads) { |
901 | } else | 1095 | /* asender is implicitly stopped by receiver |
902 | clear_bit(CONFIG_PENDING, &mdev->flags); | 1096 | * in conn_disconnect() */ |
903 | spin_unlock_irq(&mdev->req_lock); | 1097 | drbd_thread_stop(&tconn->receiver); |
904 | wake_up(&mdev->state_wait); | 1098 | drbd_thread_stop(&tconn->worker); |
1099 | } | ||
905 | } | 1100 | } |
906 | 1101 | ||
907 | /* Make sure IO is suspended before calling this function(). */ | 1102 | /* Make sure IO is suspended before calling this function(). */ |
@@ -909,42 +1104,187 @@ static void drbd_suspend_al(struct drbd_conf *mdev) | |||
909 | { | 1104 | { |
910 | int s = 0; | 1105 | int s = 0; |
911 | 1106 | ||
912 | if (lc_try_lock(mdev->act_log)) { | 1107 | if (!lc_try_lock(mdev->act_log)) { |
913 | drbd_al_shrink(mdev); | ||
914 | lc_unlock(mdev->act_log); | ||
915 | } else { | ||
916 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); | 1108 | dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); |
917 | return; | 1109 | return; |
918 | } | 1110 | } |
919 | 1111 | ||
920 | spin_lock_irq(&mdev->req_lock); | 1112 | drbd_al_shrink(mdev); |
1113 | spin_lock_irq(&mdev->tconn->req_lock); | ||
921 | if (mdev->state.conn < C_CONNECTED) | 1114 | if (mdev->state.conn < C_CONNECTED) |
922 | s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); | 1115 | s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); |
923 | 1116 | spin_unlock_irq(&mdev->tconn->req_lock); | |
924 | spin_unlock_irq(&mdev->req_lock); | 1117 | lc_unlock(mdev->act_log); |
925 | 1118 | ||
926 | if (s) | 1119 | if (s) |
927 | dev_info(DEV, "Suspended AL updates\n"); | 1120 | dev_info(DEV, "Suspended AL updates\n"); |
928 | } | 1121 | } |
929 | 1122 | ||
930 | /* does always return 0; | 1123 | |
931 | * interesting return code is in reply->ret_code */ | 1124 | static bool should_set_defaults(struct genl_info *info) |
932 | static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1125 | { |
933 | struct drbd_nl_cfg_reply *reply) | 1126 | unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; |
1127 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); | ||
1128 | } | ||
1129 | |||
1130 | static void enforce_disk_conf_limits(struct disk_conf *dc) | ||
1131 | { | ||
1132 | if (dc->al_extents < DRBD_AL_EXTENTS_MIN) | ||
1133 | dc->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1134 | if (dc->al_extents > DRBD_AL_EXTENTS_MAX) | ||
1135 | dc->al_extents = DRBD_AL_EXTENTS_MAX; | ||
1136 | |||
1137 | if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | ||
1138 | dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1139 | } | ||
1140 | |||
1141 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | ||
934 | { | 1142 | { |
935 | enum drbd_ret_code retcode; | 1143 | enum drbd_ret_code retcode; |
1144 | struct drbd_conf *mdev; | ||
1145 | struct disk_conf *new_disk_conf, *old_disk_conf; | ||
1146 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
1147 | int err, fifo_size; | ||
1148 | |||
1149 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
1150 | if (!adm_ctx.reply_skb) | ||
1151 | return retcode; | ||
1152 | if (retcode != NO_ERROR) | ||
1153 | goto out; | ||
1154 | |||
1155 | mdev = adm_ctx.mdev; | ||
1156 | |||
1157 | /* we also need a disk | ||
1158 | * to change the options on */ | ||
1159 | if (!get_ldev(mdev)) { | ||
1160 | retcode = ERR_NO_DISK; | ||
1161 | goto out; | ||
1162 | } | ||
1163 | |||
1164 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
1165 | if (!new_disk_conf) { | ||
1166 | retcode = ERR_NOMEM; | ||
1167 | goto fail; | ||
1168 | } | ||
1169 | |||
1170 | mutex_lock(&mdev->tconn->conf_update); | ||
1171 | old_disk_conf = mdev->ldev->disk_conf; | ||
1172 | *new_disk_conf = *old_disk_conf; | ||
1173 | if (should_set_defaults(info)) | ||
1174 | set_disk_conf_defaults(new_disk_conf); | ||
1175 | |||
1176 | err = disk_conf_from_attrs_for_change(new_disk_conf, info); | ||
1177 | if (err && err != -ENOMSG) { | ||
1178 | retcode = ERR_MANDATORY_TAG; | ||
1179 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1180 | } | ||
1181 | |||
1182 | if (!expect(new_disk_conf->resync_rate >= 1)) | ||
1183 | new_disk_conf->resync_rate = 1; | ||
1184 | |||
1185 | enforce_disk_conf_limits(new_disk_conf); | ||
1186 | |||
1187 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | ||
1188 | if (fifo_size != mdev->rs_plan_s->size) { | ||
1189 | new_plan = fifo_alloc(fifo_size); | ||
1190 | if (!new_plan) { | ||
1191 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | ||
1192 | retcode = ERR_NOMEM; | ||
1193 | goto fail_unlock; | ||
1194 | } | ||
1195 | } | ||
1196 | |||
1197 | drbd_suspend_io(mdev); | ||
1198 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | ||
1199 | drbd_al_shrink(mdev); | ||
1200 | err = drbd_check_al_size(mdev, new_disk_conf); | ||
1201 | lc_unlock(mdev->act_log); | ||
1202 | wake_up(&mdev->al_wait); | ||
1203 | drbd_resume_io(mdev); | ||
1204 | |||
1205 | if (err) { | ||
1206 | retcode = ERR_NOMEM; | ||
1207 | goto fail_unlock; | ||
1208 | } | ||
1209 | |||
1210 | write_lock_irq(&global_state_lock); | ||
1211 | retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); | ||
1212 | if (retcode == NO_ERROR) { | ||
1213 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
1214 | drbd_resync_after_changed(mdev); | ||
1215 | } | ||
1216 | write_unlock_irq(&global_state_lock); | ||
1217 | |||
1218 | if (retcode != NO_ERROR) | ||
1219 | goto fail_unlock; | ||
1220 | |||
1221 | if (new_plan) { | ||
1222 | old_plan = mdev->rs_plan_s; | ||
1223 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
1224 | } | ||
1225 | |||
1226 | mutex_unlock(&mdev->tconn->conf_update); | ||
1227 | |||
1228 | if (new_disk_conf->al_updates) | ||
1229 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1230 | else | ||
1231 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1232 | |||
1233 | if (new_disk_conf->md_flushes) | ||
1234 | clear_bit(MD_NO_FUA, &mdev->flags); | ||
1235 | else | ||
1236 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1237 | |||
1238 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); | ||
1239 | |||
1240 | drbd_md_sync(mdev); | ||
1241 | |||
1242 | if (mdev->state.conn >= C_CONNECTED) | ||
1243 | drbd_send_sync_param(mdev); | ||
1244 | |||
1245 | synchronize_rcu(); | ||
1246 | kfree(old_disk_conf); | ||
1247 | kfree(old_plan); | ||
1248 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1249 | goto success; | ||
1250 | |||
1251 | fail_unlock: | ||
1252 | mutex_unlock(&mdev->tconn->conf_update); | ||
1253 | fail: | ||
1254 | kfree(new_disk_conf); | ||
1255 | kfree(new_plan); | ||
1256 | success: | ||
1257 | put_ldev(mdev); | ||
1258 | out: | ||
1259 | drbd_adm_finish(info, retcode); | ||
1260 | return 0; | ||
1261 | } | ||
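
drbd_adm_disk_opts() above is a template for the RCU-based option handling used throughout this series: copy the current disk_conf, apply the netlink changes to the copy, publish it with rcu_assign_pointer() while holding conf_update, and free the old object only after synchronize_rcu(). A condensed, hedged sketch of that life cycle; struct my_conf and struct my_obj are stand-ins, not drbd types:

    #include <linux/mutex.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_conf { int resync_rate; };

    struct my_obj {
        struct mutex conf_update;       /* serializes writers */
        struct my_conf __rcu *conf;     /* readers use rcu_dereference() */
    };

    static int my_change_conf(struct my_obj *obj, int new_rate)
    {
        struct my_conf *new_conf, *old_conf;

        new_conf = kmalloc(sizeof(*new_conf), GFP_KERNEL);
        if (!new_conf)
            return -ENOMEM;

        mutex_lock(&obj->conf_update);
        old_conf = rcu_dereference_protected(obj->conf,
                        lockdep_is_held(&obj->conf_update));
        *new_conf = *old_conf;                   /* start from current values */
        new_conf->resync_rate = new_rate;        /* apply the change */
        rcu_assign_pointer(obj->conf, new_conf); /* publish */
        mutex_unlock(&obj->conf_update);

        synchronize_rcu();                       /* wait out existing readers */
        kfree(old_conf);                         /* now safe to reclaim */
        return 0;
    }

Readers pair this with rcu_read_lock()/rcu_dereference()/rcu_read_unlock(), which is what the fencing and al_updates lookups in the attach path below do against mdev->ldev->disk_conf.
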
1262 | |||
1263 | int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | ||
1264 | { | ||
1265 | struct drbd_conf *mdev; | ||
1266 | int err; | ||
1267 | enum drbd_ret_code retcode; | ||
936 | enum determine_dev_size dd; | 1268 | enum determine_dev_size dd; |
937 | sector_t max_possible_sectors; | 1269 | sector_t max_possible_sectors; |
938 | sector_t min_md_device_sectors; | 1270 | sector_t min_md_device_sectors; |
939 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ | 1271 | struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ |
1272 | struct disk_conf *new_disk_conf = NULL; | ||
940 | struct block_device *bdev; | 1273 | struct block_device *bdev; |
941 | struct lru_cache *resync_lru = NULL; | 1274 | struct lru_cache *resync_lru = NULL; |
1275 | struct fifo_buffer *new_plan = NULL; | ||
942 | union drbd_state ns, os; | 1276 | union drbd_state ns, os; |
943 | enum drbd_state_rv rv; | 1277 | enum drbd_state_rv rv; |
944 | int cp_discovered = 0; | 1278 | struct net_conf *nc; |
945 | int logical_block_size; | ||
946 | 1279 | ||
947 | drbd_reconfig_start(mdev); | 1280 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1281 | if (!adm_ctx.reply_skb) | ||
1282 | return retcode; | ||
1283 | if (retcode != NO_ERROR) | ||
1284 | goto finish; | ||
1285 | |||
1286 | mdev = adm_ctx.mdev; | ||
1287 | conn_reconfig_start(mdev->tconn); | ||
948 | 1288 | ||
949 | /* if you want to reconfigure, please tear down first */ | 1289 | /* if you want to reconfigure, please tear down first */ |
950 | if (mdev->state.disk > D_DISKLESS) { | 1290 | if (mdev->state.disk > D_DISKLESS) { |
@@ -959,47 +1299,65 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
959 | 1299 | ||
960 | /* make sure there is no leftover from previous force-detach attempts */ | 1300 | /* make sure there is no leftover from previous force-detach attempts */ |
961 | clear_bit(FORCE_DETACH, &mdev->flags); | 1301 | clear_bit(FORCE_DETACH, &mdev->flags); |
1302 | clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1303 | clear_bit(WAS_READ_ERROR, &mdev->flags); | ||
962 | 1304 | ||
963 | /* and no leftover from previously aborted resync or verify, either */ | 1305 | /* and no leftover from previously aborted resync or verify, either */ |
964 | mdev->rs_total = 0; | 1306 | mdev->rs_total = 0; |
965 | mdev->rs_failed = 0; | 1307 | mdev->rs_failed = 0; |
966 | atomic_set(&mdev->rs_pending_cnt, 0); | 1308 | atomic_set(&mdev->rs_pending_cnt, 0); |
967 | 1309 | ||
968 | /* allocation not in the IO path, cqueue thread context */ | 1310 | /* allocation not in the IO path, drbdsetup context */ |
969 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); | 1311 | nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); |
970 | if (!nbc) { | 1312 | if (!nbc) { |
971 | retcode = ERR_NOMEM; | 1313 | retcode = ERR_NOMEM; |
972 | goto fail; | 1314 | goto fail; |
973 | } | 1315 | } |
1316 | spin_lock_init(&nbc->md.uuid_lock); | ||
974 | 1317 | ||
975 | nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; | 1318 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); |
976 | nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; | 1319 | if (!new_disk_conf) { |
977 | nbc->dc.fencing = DRBD_FENCING_DEF; | 1320 | retcode = ERR_NOMEM; |
978 | nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; | 1321 | goto fail; |
1322 | } | ||
1323 | nbc->disk_conf = new_disk_conf; | ||
979 | 1324 | ||
980 | if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { | 1325 | set_disk_conf_defaults(new_disk_conf); |
1326 | err = disk_conf_from_attrs(new_disk_conf, info); | ||
1327 | if (err) { | ||
981 | retcode = ERR_MANDATORY_TAG; | 1328 | retcode = ERR_MANDATORY_TAG; |
1329 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
982 | goto fail; | 1330 | goto fail; |
983 | } | 1331 | } |
984 | 1332 | ||
985 | if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | 1333 | enforce_disk_conf_limits(new_disk_conf); |
1334 | |||
1335 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); | ||
1336 | if (!new_plan) { | ||
1337 | retcode = ERR_NOMEM; | ||
1338 | goto fail; | ||
1339 | } | ||
1340 | |||
1341 | if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { | ||
986 | retcode = ERR_MD_IDX_INVALID; | 1342 | retcode = ERR_MD_IDX_INVALID; |
987 | goto fail; | 1343 | goto fail; |
988 | } | 1344 | } |
989 | 1345 | ||
990 | if (get_net_conf(mdev)) { | 1346 | rcu_read_lock(); |
991 | int prot = mdev->net_conf->wire_protocol; | 1347 | nc = rcu_dereference(mdev->tconn->net_conf); |
992 | put_net_conf(mdev); | 1348 | if (nc) { |
993 | if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { | 1349 | if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { |
1350 | rcu_read_unlock(); | ||
994 | retcode = ERR_STONITH_AND_PROT_A; | 1351 | retcode = ERR_STONITH_AND_PROT_A; |
995 | goto fail; | 1352 | goto fail; |
996 | } | 1353 | } |
997 | } | 1354 | } |
1355 | rcu_read_unlock(); | ||
998 | 1356 | ||
999 | bdev = blkdev_get_by_path(nbc->dc.backing_dev, | 1357 | bdev = blkdev_get_by_path(new_disk_conf->backing_dev, |
1000 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); | 1358 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); |
1001 | if (IS_ERR(bdev)) { | 1359 | if (IS_ERR(bdev)) { |
1002 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, | 1360 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, |
1003 | PTR_ERR(bdev)); | 1361 | PTR_ERR(bdev)); |
1004 | retcode = ERR_OPEN_DISK; | 1362 | retcode = ERR_OPEN_DISK; |
1005 | goto fail; | 1363 | goto fail; |
@@ -1014,12 +1372,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1014 | * should check it for you already; but if you don't, or | 1372 | * should check it for you already; but if you don't, or |
1015 | * someone fooled it, we need to double check here) | 1373 | * someone fooled it, we need to double check here) |
1016 | */ | 1374 | */ |
1017 | bdev = blkdev_get_by_path(nbc->dc.meta_dev, | 1375 | bdev = blkdev_get_by_path(new_disk_conf->meta_dev, |
1018 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, | 1376 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1019 | (nbc->dc.meta_dev_idx < 0) ? | 1377 | (new_disk_conf->meta_dev_idx < 0) ? |
1020 | (void *)mdev : (void *)drbd_m_holder); | 1378 | (void *)mdev : (void *)drbd_m_holder); |
1021 | if (IS_ERR(bdev)) { | 1379 | if (IS_ERR(bdev)) { |
1022 | dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, | 1380 | dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, |
1023 | PTR_ERR(bdev)); | 1381 | PTR_ERR(bdev)); |
1024 | retcode = ERR_OPEN_MD_DISK; | 1382 | retcode = ERR_OPEN_MD_DISK; |
1025 | goto fail; | 1383 | goto fail; |
@@ -1027,14 +1385,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1027 | nbc->md_bdev = bdev; | 1385 | nbc->md_bdev = bdev; |
1028 | 1386 | ||
1029 | if ((nbc->backing_bdev == nbc->md_bdev) != | 1387 | if ((nbc->backing_bdev == nbc->md_bdev) != |
1030 | (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | 1388 | (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || |
1031 | nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { | 1389 | new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { |
1032 | retcode = ERR_MD_IDX_INVALID; | 1390 | retcode = ERR_MD_IDX_INVALID; |
1033 | goto fail; | 1391 | goto fail; |
1034 | } | 1392 | } |
1035 | 1393 | ||
1036 | resync_lru = lc_create("resync", drbd_bm_ext_cache, | 1394 | resync_lru = lc_create("resync", drbd_bm_ext_cache, |
1037 | 61, sizeof(struct bm_extent), | 1395 | 1, 61, sizeof(struct bm_extent), |
1038 | offsetof(struct bm_extent, lce)); | 1396 | offsetof(struct bm_extent, lce)); |
1039 | if (!resync_lru) { | 1397 | if (!resync_lru) { |
1040 | retcode = ERR_NOMEM; | 1398 | retcode = ERR_NOMEM; |
@@ -1044,21 +1402,21 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1044 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | 1402 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ |
1045 | drbd_md_set_sector_offsets(mdev, nbc); | 1403 | drbd_md_set_sector_offsets(mdev, nbc); |
1046 | 1404 | ||
1047 | if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { | 1405 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { |
1048 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | 1406 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", |
1049 | (unsigned long long) drbd_get_max_capacity(nbc), | 1407 | (unsigned long long) drbd_get_max_capacity(nbc), |
1050 | (unsigned long long) nbc->dc.disk_size); | 1408 | (unsigned long long) new_disk_conf->disk_size); |
1051 | retcode = ERR_DISK_TOO_SMALL; | 1409 | retcode = ERR_DISK_TOO_SMALL; |
1052 | goto fail; | 1410 | goto fail; |
1053 | } | 1411 | } |
1054 | 1412 | ||
1055 | if (nbc->dc.meta_dev_idx < 0) { | 1413 | if (new_disk_conf->meta_dev_idx < 0) { |
1056 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; | 1414 | max_possible_sectors = DRBD_MAX_SECTORS_FLEX; |
1057 | /* at least one MB, otherwise it does not make sense */ | 1415 | /* at least one MB, otherwise it does not make sense */ |
1058 | min_md_device_sectors = (2<<10); | 1416 | min_md_device_sectors = (2<<10); |
1059 | } else { | 1417 | } else { |
1060 | max_possible_sectors = DRBD_MAX_SECTORS; | 1418 | max_possible_sectors = DRBD_MAX_SECTORS; |
1061 | min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); | 1419 | min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); |
1062 | } | 1420 | } |
1063 | 1421 | ||
1064 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | 1422 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { |
@@ -1083,14 +1441,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1083 | dev_warn(DEV, "==> truncating very big lower level device " | 1441 | dev_warn(DEV, "==> truncating very big lower level device " |
1084 | "to currently maximum possible %llu sectors <==\n", | 1442 | "to currently maximum possible %llu sectors <==\n", |
1085 | (unsigned long long) max_possible_sectors); | 1443 | (unsigned long long) max_possible_sectors); |
1086 | if (nbc->dc.meta_dev_idx >= 0) | 1444 | if (new_disk_conf->meta_dev_idx >= 0) |
1087 | dev_warn(DEV, "==>> using internal or flexible " | 1445 | dev_warn(DEV, "==>> using internal or flexible " |
1088 | "meta data may help <<==\n"); | 1446 | "meta data may help <<==\n"); |
1089 | } | 1447 | } |
1090 | 1448 | ||
1091 | drbd_suspend_io(mdev); | 1449 | drbd_suspend_io(mdev); |
1092 | /* also wait for the last barrier ack. */ | 1450 | /* also wait for the last barrier ack. */ |
1093 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); | 1451 | /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 |
1452 | * We need a way to either ignore barrier acks for barriers sent before a device | ||
1453 | * was attached, or a way to wait for all pending barrier acks to come in. | ||
1454 | * As barriers are counted per resource, | ||
1455 | * we'd need to suspend io on all devices of a resource. | ||
1456 | */ | ||
1457 | wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev)); | ||
1094 | /* and for any other previously queued work */ | 1458 | /* and for any other previously queued work */ |
1095 | drbd_flush_workqueue(mdev); | 1459 | drbd_flush_workqueue(mdev); |
1096 | 1460 | ||
@@ -1105,25 +1469,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1105 | 1469 | ||
1106 | drbd_md_set_sector_offsets(mdev, nbc); | 1470 | drbd_md_set_sector_offsets(mdev, nbc); |
1107 | 1471 | ||
1108 | /* allocate a second IO page if logical_block_size != 512 */ | ||
1109 | logical_block_size = bdev_logical_block_size(nbc->md_bdev); | ||
1110 | if (logical_block_size == 0) | ||
1111 | logical_block_size = MD_SECTOR_SIZE; | ||
1112 | |||
1113 | if (logical_block_size != MD_SECTOR_SIZE) { | ||
1114 | if (!mdev->md_io_tmpp) { | ||
1115 | struct page *page = alloc_page(GFP_NOIO); | ||
1116 | if (!page) | ||
1117 | goto force_diskless_dec; | ||
1118 | |||
1119 | dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", | ||
1120 | logical_block_size, MD_SECTOR_SIZE); | ||
1121 | dev_warn(DEV, "Workaround engaged (has performance impact).\n"); | ||
1122 | |||
1123 | mdev->md_io_tmpp = page; | ||
1124 | } | ||
1125 | } | ||
1126 | |||
1127 | if (!mdev->bitmap) { | 1472 | if (!mdev->bitmap) { |
1128 | if (drbd_bm_init(mdev)) { | 1473 | if (drbd_bm_init(mdev)) { |
1129 | retcode = ERR_NOMEM; | 1474 | retcode = ERR_NOMEM; |
@@ -1145,30 +1490,25 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1145 | } | 1490 | } |
1146 | 1491 | ||
1147 | /* Since we are diskless, fix the activity log first... */ | 1492 | /* Since we are diskless, fix the activity log first... */ |
1148 | if (drbd_check_al_size(mdev)) { | 1493 | if (drbd_check_al_size(mdev, new_disk_conf)) { |
1149 | retcode = ERR_NOMEM; | 1494 | retcode = ERR_NOMEM; |
1150 | goto force_diskless_dec; | 1495 | goto force_diskless_dec; |
1151 | } | 1496 | } |
1152 | 1497 | ||
1153 | /* Prevent shrinking of consistent devices ! */ | 1498 | /* Prevent shrinking of consistent devices ! */ |
1154 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && | 1499 | if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && |
1155 | drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { | 1500 | drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { |
1156 | dev_warn(DEV, "refusing to truncate a consistent device\n"); | 1501 | dev_warn(DEV, "refusing to truncate a consistent device\n"); |
1157 | retcode = ERR_DISK_TOO_SMALL; | 1502 | retcode = ERR_DISK_TOO_SMALL; |
1158 | goto force_diskless_dec; | 1503 | goto force_diskless_dec; |
1159 | } | 1504 | } |
1160 | 1505 | ||
1161 | if (!drbd_al_read_log(mdev, nbc)) { | ||
1162 | retcode = ERR_IO_MD_DISK; | ||
1163 | goto force_diskless_dec; | ||
1164 | } | ||
1165 | |||
1166 | /* Reset the "barriers don't work" bits here, then force meta data to | 1506 | /* Reset the "barriers don't work" bits here, then force meta data to |
1167 | * be written, to ensure we determine if barriers are supported. */ | 1507 | * be written, to ensure we determine if barriers are supported. */ |
1168 | if (nbc->dc.no_md_flush) | 1508 | if (new_disk_conf->md_flushes) |
1169 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1170 | else | ||
1171 | clear_bit(MD_NO_FUA, &mdev->flags); | 1509 | clear_bit(MD_NO_FUA, &mdev->flags); |
1510 | else | ||
1511 | set_bit(MD_NO_FUA, &mdev->flags); | ||
1172 | 1512 | ||
1173 | /* Point of no return reached. | 1513 | /* Point of no return reached. |
1174 | * Devices and memory are no longer released by error cleanup below. | 1514 | * Devices and memory are no longer released by error cleanup below. |
@@ -1177,11 +1517,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1177 | D_ASSERT(mdev->ldev == NULL); | 1517 | D_ASSERT(mdev->ldev == NULL); |
1178 | mdev->ldev = nbc; | 1518 | mdev->ldev = nbc; |
1179 | mdev->resync = resync_lru; | 1519 | mdev->resync = resync_lru; |
1520 | mdev->rs_plan_s = new_plan; | ||
1180 | nbc = NULL; | 1521 | nbc = NULL; |
1181 | resync_lru = NULL; | 1522 | resync_lru = NULL; |
1523 | new_disk_conf = NULL; | ||
1524 | new_plan = NULL; | ||
1182 | 1525 | ||
1183 | mdev->write_ordering = WO_bdev_flush; | 1526 | drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); |
1184 | drbd_bump_write_ordering(mdev, WO_bdev_flush); | ||
1185 | 1527 | ||
1186 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) | 1528 | if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) |
1187 | set_bit(CRASHED_PRIMARY, &mdev->flags); | 1529 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
@@ -1189,10 +1531,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1189 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | 1531 | clear_bit(CRASHED_PRIMARY, &mdev->flags); |
1190 | 1532 | ||
1191 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && | 1533 | if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && |
1192 | !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { | 1534 | !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) |
1193 | set_bit(CRASHED_PRIMARY, &mdev->flags); | 1535 | set_bit(CRASHED_PRIMARY, &mdev->flags); |
1194 | cp_discovered = 1; | ||
1195 | } | ||
1196 | 1536 | ||
1197 | mdev->send_cnt = 0; | 1537 | mdev->send_cnt = 0; |
1198 | mdev->recv_cnt = 0; | 1538 | mdev->recv_cnt = 0; |
@@ -1228,7 +1568,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1228 | } else if (dd == grew) | 1568 | } else if (dd == grew) |
1229 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | 1569 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
1230 | 1570 | ||
1231 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { | 1571 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || |
1572 | (test_bit(CRASHED_PRIMARY, &mdev->flags) && | ||
1573 | drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) { | ||
1232 | dev_info(DEV, "Assuming that all blocks are out of sync " | 1574 | dev_info(DEV, "Assuming that all blocks are out of sync " |
1233 | "(aka FullSync)\n"); | 1575 | "(aka FullSync)\n"); |
1234 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, | 1576 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, |
@@ -1238,16 +1580,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1238 | } | 1580 | } |
1239 | } else { | 1581 | } else { |
1240 | if (drbd_bitmap_io(mdev, &drbd_bm_read, | 1582 | if (drbd_bitmap_io(mdev, &drbd_bm_read, |
1241 | "read from attaching", BM_LOCKED_MASK) < 0) { | 1583 | "read from attaching", BM_LOCKED_MASK)) { |
1242 | retcode = ERR_IO_MD_DISK; | ||
1243 | goto force_diskless_dec; | ||
1244 | } | ||
1245 | } | ||
1246 | |||
1247 | if (cp_discovered) { | ||
1248 | drbd_al_apply_to_bm(mdev); | ||
1249 | if (drbd_bitmap_io(mdev, &drbd_bm_write, | ||
1250 | "crashed primary apply AL", BM_LOCKED_MASK)) { | ||
1251 | retcode = ERR_IO_MD_DISK; | 1584 | retcode = ERR_IO_MD_DISK; |
1252 | goto force_diskless_dec; | 1585 | goto force_diskless_dec; |
1253 | } | 1586 | } |
@@ -1256,9 +1589,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1256 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) | 1589 | if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) |
1257 | drbd_suspend_al(mdev); /* IO is still suspended here... */ | 1590 | drbd_suspend_al(mdev); /* IO is still suspended here... */ |
1258 | 1591 | ||
1259 | spin_lock_irq(&mdev->req_lock); | 1592 | spin_lock_irq(&mdev->tconn->req_lock); |
1260 | os = mdev->state; | 1593 | os = drbd_read_state(mdev); |
1261 | ns.i = os.i; | 1594 | ns = os; |
1262 | /* If MDF_CONSISTENT is not set go into inconsistent state, | 1595 | /* If MDF_CONSISTENT is not set go into inconsistent state, |
1263 | otherwise investigate MDF_WasUpToDate... | 1596 | otherwise investigate MDF_WasUpToDate... |
1264 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, | 1597 | If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, |
@@ -1276,8 +1609,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1276 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) | 1609 | if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) |
1277 | ns.pdsk = D_OUTDATED; | 1610 | ns.pdsk = D_OUTDATED; |
1278 | 1611 | ||
1279 | if ( ns.disk == D_CONSISTENT && | 1612 | rcu_read_lock(); |
1280 | (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) | 1613 | if (ns.disk == D_CONSISTENT && |
1614 | (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE)) | ||
1281 | ns.disk = D_UP_TO_DATE; | 1615 | ns.disk = D_UP_TO_DATE; |
1282 | 1616 | ||
1283 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, | 1617 | /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, |
@@ -1285,6 +1619,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1285 | this point, because drbd_request_state() modifies these | 1619 | this point, because drbd_request_state() modifies these |
1286 | flags. */ | 1620 | flags. */ |
1287 | 1621 | ||
1622 | if (rcu_dereference(mdev->ldev->disk_conf)->al_updates) | ||
1623 | mdev->ldev->md.flags &= ~MDF_AL_DISABLED; | ||
1624 | else | ||
1625 | mdev->ldev->md.flags |= MDF_AL_DISABLED; | ||
1626 | |||
1627 | rcu_read_unlock(); | ||
1628 | |||
1288 | /* In case we are C_CONNECTED postpone any decision on the new disk | 1629 | /* In case we are C_CONNECTED postpone any decision on the new disk |
1289 | state after the negotiation phase. */ | 1630 | state after the negotiation phase. */ |
1290 | if (mdev->state.conn == C_CONNECTED) { | 1631 | if (mdev->state.conn == C_CONNECTED) { |
@@ -1300,12 +1641,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1300 | } | 1641 | } |
1301 | 1642 | ||
1302 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1643 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1303 | ns = mdev->state; | 1644 | spin_unlock_irq(&mdev->tconn->req_lock); |
1304 | spin_unlock_irq(&mdev->req_lock); | ||
1305 | 1645 | ||
1306 | if (rv < SS_SUCCESS) | 1646 | if (rv < SS_SUCCESS) |
1307 | goto force_diskless_dec; | 1647 | goto force_diskless_dec; |
1308 | 1648 | ||
1649 | mod_timer(&mdev->request_timer, jiffies + HZ); | ||
1650 | |||
1309 | if (mdev->state.role == R_PRIMARY) | 1651 | if (mdev->state.role == R_PRIMARY) |
1310 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; | 1652 | mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; |
1311 | else | 1653 | else |
@@ -1316,16 +1658,17 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1316 | 1658 | ||
1317 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 1659 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); |
1318 | put_ldev(mdev); | 1660 | put_ldev(mdev); |
1319 | reply->ret_code = retcode; | 1661 | conn_reconfig_done(mdev->tconn); |
1320 | drbd_reconfig_done(mdev); | 1662 | drbd_adm_finish(info, retcode); |
1321 | return 0; | 1663 | return 0; |
1322 | 1664 | ||
1323 | force_diskless_dec: | 1665 | force_diskless_dec: |
1324 | put_ldev(mdev); | 1666 | put_ldev(mdev); |
1325 | force_diskless: | 1667 | force_diskless: |
1326 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1668 | drbd_force_state(mdev, NS(disk, D_DISKLESS)); |
1327 | drbd_md_sync(mdev); | 1669 | drbd_md_sync(mdev); |
1328 | fail: | 1670 | fail: |
1671 | conn_reconfig_done(mdev->tconn); | ||
1329 | if (nbc) { | 1672 | if (nbc) { |
1330 | if (nbc->backing_bdev) | 1673 | if (nbc->backing_bdev) |
1331 | blkdev_put(nbc->backing_bdev, | 1674 | blkdev_put(nbc->backing_bdev, |
@@ -1335,34 +1678,24 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp | |||
1335 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1678 | FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1336 | kfree(nbc); | 1679 | kfree(nbc); |
1337 | } | 1680 | } |
1681 | kfree(new_disk_conf); | ||
1338 | lc_destroy(resync_lru); | 1682 | lc_destroy(resync_lru); |
1683 | kfree(new_plan); | ||
1339 | 1684 | ||
1340 | reply->ret_code = retcode; | 1685 | finish: |
1341 | drbd_reconfig_done(mdev); | 1686 | drbd_adm_finish(info, retcode); |
1342 | return 0; | 1687 | return 0; |
1343 | } | 1688 | } |
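
A detail worth noting in the attach path above is the hand-off at the "point of no return": once nbc, resync_lru, new_disk_conf and new_plan have been assigned to the device, the local pointers are set to NULL so that the shared fail: path frees only what is still locally owned. A minimal hedged illustration of the idiom; my_attach, my_dev, my_backing and my_validate are hypothetical:

    #include <linux/slab.h>

    /* Sketch of the "hand off, then NULL the local pointer" idiom. */
    static int my_attach(struct my_dev *dev)
    {
        struct my_backing *nbc;
        int retcode = 0;

        nbc = kzalloc(sizeof(*nbc), GFP_KERNEL);
        if (!nbc) {
            retcode = -ENOMEM;
            goto fail;
        }

        if (my_validate(nbc)) {     /* any failure before the hand-off ... */
            retcode = -EINVAL;
            goto fail;              /* ... still owns nbc and frees it below */
        }

        dev->backing = nbc;         /* point of no return: dev owns it now */
        nbc = NULL;                 /* error paths below must not free it */

        return 0;
    fail:
        kfree(nbc);                 /* kfree(NULL) is a no-op */
        return retcode;
    }
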
1344 | 1689 | ||
1345 | /* Detaching the disk is a process in multiple stages. First we need to lock | 1690 | static int adm_detach(struct drbd_conf *mdev, int force) |
1346 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. | ||
1347 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1348 | * internal references as well. | ||
1349 | * Only then we have finally detached. */ | ||
1350 | static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | ||
1351 | struct drbd_nl_cfg_reply *reply) | ||
1352 | { | 1691 | { |
1353 | enum drbd_ret_code retcode; | 1692 | enum drbd_state_rv retcode; |
1354 | int ret; | 1693 | int ret; |
1355 | struct detach dt = {}; | ||
1356 | 1694 | ||
1357 | if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { | 1695 | if (force) { |
1358 | reply->ret_code = ERR_MANDATORY_TAG; | ||
1359 | goto out; | ||
1360 | } | ||
1361 | |||
1362 | if (dt.detach_force) { | ||
1363 | set_bit(FORCE_DETACH, &mdev->flags); | 1696 | set_bit(FORCE_DETACH, &mdev->flags); |
1364 | drbd_force_state(mdev, NS(disk, D_FAILED)); | 1697 | drbd_force_state(mdev, NS(disk, D_FAILED)); |
1365 | reply->ret_code = SS_SUCCESS; | 1698 | retcode = SS_SUCCESS; |
1366 | goto out; | 1699 | goto out; |
1367 | } | 1700 | } |
1368 | 1701 | ||
@@ -1374,326 +1707,529 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1374 | ret = wait_event_interruptible(mdev->misc_wait, | 1707 | ret = wait_event_interruptible(mdev->misc_wait, |
1375 | mdev->state.disk != D_FAILED); | 1708 | mdev->state.disk != D_FAILED); |
1376 | drbd_resume_io(mdev); | 1709 | drbd_resume_io(mdev); |
1377 | |||
1378 | if ((int)retcode == (int)SS_IS_DISKLESS) | 1710 | if ((int)retcode == (int)SS_IS_DISKLESS) |
1379 | retcode = SS_NOTHING_TO_DO; | 1711 | retcode = SS_NOTHING_TO_DO; |
1380 | if (ret) | 1712 | if (ret) |
1381 | retcode = ERR_INTR; | 1713 | retcode = ERR_INTR; |
1382 | reply->ret_code = retcode; | ||
1383 | out: | 1714 | out: |
1384 | return 0; | 1715 | return retcode; |
1385 | } | 1716 | } |
1386 | 1717 | ||
1387 | static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 1718 | /* Detaching the disk is a process in multiple stages. First we need to lock |
1388 | struct drbd_nl_cfg_reply *reply) | 1719 | * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. |
1720 | * Then we transition to D_DISKLESS, and wait for put_ldev() to return all | ||
1721 | * internal references as well. | ||
1722 | * Only then we have finally detached. */ | ||
1723 | int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) | ||
1389 | { | 1724 | { |
1390 | int i, ns; | ||
1391 | enum drbd_ret_code retcode; | 1725 | enum drbd_ret_code retcode; |
1392 | struct net_conf *new_conf = NULL; | 1726 | struct detach_parms parms = { }; |
1393 | struct crypto_hash *tfm = NULL; | 1727 | int err; |
1394 | struct crypto_hash *integrity_w_tfm = NULL; | ||
1395 | struct crypto_hash *integrity_r_tfm = NULL; | ||
1396 | struct hlist_head *new_tl_hash = NULL; | ||
1397 | struct hlist_head *new_ee_hash = NULL; | ||
1398 | struct drbd_conf *odev; | ||
1399 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1400 | void *int_dig_out = NULL; | ||
1401 | void *int_dig_in = NULL; | ||
1402 | void *int_dig_vv = NULL; | ||
1403 | struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; | ||
1404 | 1728 | ||
1405 | drbd_reconfig_start(mdev); | 1729 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1730 | if (!adm_ctx.reply_skb) | ||
1731 | return retcode; | ||
1732 | if (retcode != NO_ERROR) | ||
1733 | goto out; | ||
1406 | 1734 | ||
1407 | if (mdev->state.conn > C_STANDALONE) { | 1735 | if (info->attrs[DRBD_NLA_DETACH_PARMS]) { |
1408 | retcode = ERR_NET_CONFIGURED; | 1736 | err = detach_parms_from_attrs(&parms, info); |
1409 | goto fail; | 1737 | if (err) { |
1738 | retcode = ERR_MANDATORY_TAG; | ||
1739 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1740 | goto out; | ||
1741 | } | ||
1410 | } | 1742 | } |
1411 | 1743 | ||
1412 | /* allocation not in the IO path, cqueue thread context */ | 1744 | retcode = adm_detach(adm_ctx.mdev, parms.force_detach); |
1745 | out: | ||
1746 | drbd_adm_finish(info, retcode); | ||
1747 | return 0; | ||
1748 | } | ||
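
adm_detach() above (its middle is outside this hunk) kicks off the transition to D_FAILED and then sleeps on misc_wait until the disk state has moved on, mapping an interrupting signal to ERR_INTR for the caller. A hedged sketch of that request-then-wait shape; my_dev, my_request_detach() and MY_D_FAILED are stand-ins, not drbd's API:

    /* Sketch only: shows the wait_event_interruptible() usage pattern. */
    static int my_detach(struct my_dev *dev)
    {
        int rv = my_request_detach(dev);    /* starts the async state change */
        int ret;

        /* Sleep until the transient FAILED state has been left again;
         * wait_event_interruptible() returns -ERESTARTSYS on a signal. */
        ret = wait_event_interruptible(dev->misc_wait,
                                       dev->disk_state != MY_D_FAILED);
        if (ret)
            rv = -EINTR;                    /* report the interruption */
        return rv;
    }
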
1749 | |||
1750 | static bool conn_resync_running(struct drbd_tconn *tconn) | ||
1751 | { | ||
1752 | struct drbd_conf *mdev; | ||
1753 | bool rv = false; | ||
1754 | int vnr; | ||
1755 | |||
1756 | rcu_read_lock(); | ||
1757 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1758 | if (mdev->state.conn == C_SYNC_SOURCE || | ||
1759 | mdev->state.conn == C_SYNC_TARGET || | ||
1760 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1761 | mdev->state.conn == C_PAUSED_SYNC_T) { | ||
1762 | rv = true; | ||
1763 | break; | ||
1764 | } | ||
1765 | } | ||
1766 | rcu_read_unlock(); | ||
1767 | |||
1768 | return rv; | ||
1769 | } | ||
1770 | |||
1771 | static bool conn_ov_running(struct drbd_tconn *tconn) | ||
1772 | { | ||
1773 | struct drbd_conf *mdev; | ||
1774 | bool rv = false; | ||
1775 | int vnr; | ||
1776 | |||
1777 | rcu_read_lock(); | ||
1778 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1779 | if (mdev->state.conn == C_VERIFY_S || | ||
1780 | mdev->state.conn == C_VERIFY_T) { | ||
1781 | rv = true; | ||
1782 | break; | ||
1783 | } | ||
1784 | } | ||
1785 | rcu_read_unlock(); | ||
1786 | |||
1787 | return rv; | ||
1788 | } | ||
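
conn_resync_running() and conn_ov_running() above walk every volume of a connection with idr_for_each_entry() inside an RCU read-side section; that is sufficient here because they only read the per-volume state snapshot. The same skeleton, generalized over a hypothetical predicate callback:

    /* Hedged sketch: reuses the tconn->volumes idr shown above; the predicate
     * is hypothetical.  rcu_read_lock() keeps the volume objects alive during
     * the walk, but it is not an exclusion lock against state changes. */
    static bool conn_any_volume(struct drbd_tconn *tconn,
                                bool (*pred)(struct drbd_conf *mdev))
    {
        struct drbd_conf *mdev;
        bool rv = false;
        int vnr;

        rcu_read_lock();
        idr_for_each_entry(&tconn->volumes, mdev, vnr) {
            if (pred(mdev)) {
                rv = true;
                break;
            }
        }
        rcu_read_unlock();

        return rv;
    }

The two helpers above are then this walk specialised to the resync and online-verify connection states.
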
1789 | |||
1790 | static enum drbd_ret_code | ||
1791 | _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) | ||
1792 | { | ||
1793 | struct drbd_conf *mdev; | ||
1794 | int i; | ||
1795 | |||
1796 | if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { | ||
1797 | if (new_conf->wire_protocol != old_conf->wire_protocol) | ||
1798 | return ERR_NEED_APV_100; | ||
1799 | |||
1800 | if (new_conf->two_primaries != old_conf->two_primaries) | ||
1801 | return ERR_NEED_APV_100; | ||
1802 | |||
1803 | if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) | ||
1804 | return ERR_NEED_APV_100; | ||
1805 | } | ||
1806 | |||
1807 | if (!new_conf->two_primaries && | ||
1808 | conn_highest_role(tconn) == R_PRIMARY && | ||
1809 | conn_highest_peer(tconn) == R_PRIMARY) | ||
1810 | return ERR_NEED_ALLOW_TWO_PRI; | ||
1811 | |||
1812 | if (new_conf->two_primaries && | ||
1813 | (new_conf->wire_protocol != DRBD_PROT_C)) | ||
1814 | return ERR_NOT_PROTO_C; | ||
1815 | |||
1816 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1817 | if (get_ldev(mdev)) { | ||
1818 | enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
1819 | put_ldev(mdev); | ||
1820 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) | ||
1821 | return ERR_STONITH_AND_PROT_A; | ||
1822 | } | ||
1823 | if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) | ||
1824 | return ERR_DISCARD_IMPOSSIBLE; | ||
1825 | } | ||
1826 | |||
1827 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) | ||
1828 | return ERR_CONG_NOT_PROTO_A; | ||
1829 | |||
1830 | return NO_ERROR; | ||
1831 | } | ||
1832 | |||
1833 | static enum drbd_ret_code | ||
1834 | check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) | ||
1835 | { | ||
1836 | static enum drbd_ret_code rv; | ||
1837 | struct drbd_conf *mdev; | ||
1838 | int i; | ||
1839 | |||
1840 | rcu_read_lock(); | ||
1841 | rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); | ||
1842 | rcu_read_unlock(); | ||
1843 | |||
1844 | /* tconn->volumes protected by genl_lock() here */ | ||
1845 | idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
1846 | if (!mdev->bitmap) { | ||
1847 | if(drbd_bm_init(mdev)) | ||
1848 | return ERR_NOMEM; | ||
1849 | } | ||
1850 | } | ||
1851 | |||
1852 | return rv; | ||
1853 | } | ||
1854 | |||
1855 | struct crypto { | ||
1856 | struct crypto_hash *verify_tfm; | ||
1857 | struct crypto_hash *csums_tfm; | ||
1858 | struct crypto_hash *cram_hmac_tfm; | ||
1859 | struct crypto_hash *integrity_tfm; | ||
1860 | }; | ||
1861 | |||
1862 | static int | ||
1863 | alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) | ||
1864 | { | ||
1865 | if (!tfm_name[0]) | ||
1866 | return NO_ERROR; | ||
1867 | |||
1868 | *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); | ||
1869 | if (IS_ERR(*tfm)) { | ||
1870 | *tfm = NULL; | ||
1871 | return err_alg; | ||
1872 | } | ||
1873 | |||
1874 | return NO_ERROR; | ||
1875 | } | ||
1876 | |||
1877 | static enum drbd_ret_code | ||
1878 | alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) | ||
1879 | { | ||
1880 | char hmac_name[CRYPTO_MAX_ALG_NAME]; | ||
1881 | enum drbd_ret_code rv; | ||
1882 | |||
1883 | rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, | ||
1884 | ERR_CSUMS_ALG); | ||
1885 | if (rv != NO_ERROR) | ||
1886 | return rv; | ||
1887 | rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, | ||
1888 | ERR_VERIFY_ALG); | ||
1889 | if (rv != NO_ERROR) | ||
1890 | return rv; | ||
1891 | rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, | ||
1892 | ERR_INTEGRITY_ALG); | ||
1893 | if (rv != NO_ERROR) | ||
1894 | return rv; | ||
1895 | if (new_conf->cram_hmac_alg[0] != 0) { | ||
1896 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | ||
1897 | new_conf->cram_hmac_alg); | ||
1898 | |||
1899 | rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, | ||
1900 | ERR_AUTH_ALG); | ||
1901 | } | ||
1902 | |||
1903 | return rv; | ||
1904 | } | ||
1905 | |||
1906 | static void free_crypto(struct crypto *crypto) | ||
1907 | { | ||
1908 | crypto_free_hash(crypto->cram_hmac_tfm); | ||
1909 | crypto_free_hash(crypto->integrity_tfm); | ||
1910 | crypto_free_hash(crypto->csums_tfm); | ||
1911 | crypto_free_hash(crypto->verify_tfm); | ||
1912 | } | ||
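
alloc_hash() above treats an empty algorithm name as "nothing to allocate" and maps a crypto_alloc_hash() failure onto the caller-supplied drbd error code, so alloc_crypto() can chain the four allocations and the error paths can hand a partially filled struct crypto back to free_crypto(). A hedged sketch of how the pair composes, loosely modelled on the net_opts handler below; my_setup_crypto() itself is hypothetical:

    /* Sketch only: allocate, hand over what the connection keeps, free the rest. */
    static enum drbd_ret_code my_setup_crypto(struct drbd_tconn *tconn,
                                              struct net_conf *new_conf)
    {
        struct crypto crypto = { };             /* all tfm pointers start NULL */
        enum drbd_ret_code rv;

        rv = alloc_crypto(&crypto, new_conf);   /* only named algs get allocated */
        if (rv != NO_ERROR)
            goto out;

        /* Hand one tfm over to the connection and forget it locally,
         * so free_crypto() below does not free what tconn now owns. */
        crypto_free_hash(tconn->cram_hmac_tfm);
        tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
        crypto.cram_hmac_tfm = NULL;
    out:
        free_crypto(&crypto);                   /* frees whatever is still local */
        return rv;
    }
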
1913 | |||
1914 | int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) | ||
1915 | { | ||
1916 | enum drbd_ret_code retcode; | ||
1917 | struct drbd_tconn *tconn; | ||
1918 | struct net_conf *old_conf, *new_conf = NULL; | ||
1919 | int err; | ||
1920 | int ovr; /* online verify running */ | ||
1921 | int rsr; /* re-sync running */ | ||
1922 | struct crypto crypto = { }; | ||
1923 | |||
1924 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); | ||
1925 | if (!adm_ctx.reply_skb) | ||
1926 | return retcode; | ||
1927 | if (retcode != NO_ERROR) | ||
1928 | goto out; | ||
1929 | |||
1930 | tconn = adm_ctx.tconn; | ||
1931 | |||
1413 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); | 1932 | new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
1414 | if (!new_conf) { | 1933 | if (!new_conf) { |
1415 | retcode = ERR_NOMEM; | 1934 | retcode = ERR_NOMEM; |
1935 | goto out; | ||
1936 | } | ||
1937 | |||
1938 | conn_reconfig_start(tconn); | ||
1939 | |||
1940 | mutex_lock(&tconn->data.mutex); | ||
1941 | mutex_lock(&tconn->conf_update); | ||
1942 | old_conf = tconn->net_conf; | ||
1943 | |||
1944 | if (!old_conf) { | ||
1945 | drbd_msg_put_info("net conf missing, try connect"); | ||
1946 | retcode = ERR_INVALID_REQUEST; | ||
1416 | goto fail; | 1947 | goto fail; |
1417 | } | 1948 | } |
1418 | 1949 | ||
1419 | new_conf->timeout = DRBD_TIMEOUT_DEF; | 1950 | *new_conf = *old_conf; |
1420 | new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; | 1951 | if (should_set_defaults(info)) |
1421 | new_conf->ping_int = DRBD_PING_INT_DEF; | 1952 | set_net_conf_defaults(new_conf); |
1422 | new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; | 1953 | |
1423 | new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; | 1954 | err = net_conf_from_attrs_for_change(new_conf, info); |
1424 | new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; | 1955 | if (err && err != -ENOMSG) { |
1425 | new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; | ||
1426 | new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; | ||
1427 | new_conf->ko_count = DRBD_KO_COUNT_DEF; | ||
1428 | new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; | ||
1429 | new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; | ||
1430 | new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; | ||
1431 | new_conf->want_lose = 0; | ||
1432 | new_conf->two_primaries = 0; | ||
1433 | new_conf->wire_protocol = DRBD_PROT_C; | ||
1434 | new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; | ||
1435 | new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; | ||
1436 | new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; | ||
1437 | new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; | ||
1438 | |||
1439 | if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { | ||
1440 | retcode = ERR_MANDATORY_TAG; | 1956 | retcode = ERR_MANDATORY_TAG; |
1957 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1441 | goto fail; | 1958 | goto fail; |
1442 | } | 1959 | } |
1443 | 1960 | ||
1444 | if (new_conf->two_primaries | 1961 | retcode = check_net_options(tconn, new_conf); |
1445 | && (new_conf->wire_protocol != DRBD_PROT_C)) { | 1962 | if (retcode != NO_ERROR) |
1446 | retcode = ERR_NOT_PROTO_C; | ||
1447 | goto fail; | 1963 | goto fail; |
1448 | } | ||
1449 | 1964 | ||
1450 | if (get_ldev(mdev)) { | 1965 | /* re-sync running */ |
1451 | enum drbd_fencing_p fp = mdev->ldev->dc.fencing; | 1966 | rsr = conn_resync_running(tconn); |
1452 | put_ldev(mdev); | 1967 | if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { |
1453 | if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { | 1968 | retcode = ERR_CSUMS_RESYNC_RUNNING; |
1454 | retcode = ERR_STONITH_AND_PROT_A; | 1969 | goto fail; |
1455 | goto fail; | ||
1456 | } | ||
1457 | } | 1970 | } |
1458 | 1971 | ||
1459 | if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { | 1972 | /* online verify running */ |
1460 | retcode = ERR_CONG_NOT_PROTO_A; | 1973 | ovr = conn_ov_running(tconn); |
1974 | if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) { | ||
1975 | retcode = ERR_VERIFY_RUNNING; | ||
1461 | goto fail; | 1976 | goto fail; |
1462 | } | 1977 | } |
1463 | 1978 | ||
1464 | if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { | 1979 | retcode = alloc_crypto(&crypto, new_conf); |
1465 | retcode = ERR_DISCARD; | 1980 | if (retcode != NO_ERROR) |
1466 | goto fail; | 1981 | goto fail; |
1467 | } | ||
1468 | 1982 | ||
1469 | retcode = NO_ERROR; | 1983 | rcu_assign_pointer(tconn->net_conf, new_conf); |
1470 | 1984 | ||
1471 | new_my_addr = (struct sockaddr *)&new_conf->my_addr; | 1985 | if (!rsr) { |
1472 | new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; | 1986 | crypto_free_hash(tconn->csums_tfm); |
1473 | for (i = 0; i < minor_count; i++) { | 1987 | tconn->csums_tfm = crypto.csums_tfm; |
1474 | odev = minor_to_mdev(i); | 1988 | crypto.csums_tfm = NULL; |
1475 | if (!odev || odev == mdev) | 1989 | } |
1476 | continue; | 1990 | if (!ovr) { |
1477 | if (get_net_conf(odev)) { | 1991 | crypto_free_hash(tconn->verify_tfm); |
1478 | taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; | 1992 | tconn->verify_tfm = crypto.verify_tfm; |
1479 | if (new_conf->my_addr_len == odev->net_conf->my_addr_len && | 1993 | crypto.verify_tfm = NULL; |
1480 | !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) | ||
1481 | retcode = ERR_LOCAL_ADDR; | ||
1482 | |||
1483 | taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; | ||
1484 | if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && | ||
1485 | !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) | ||
1486 | retcode = ERR_PEER_ADDR; | ||
1487 | |||
1488 | put_net_conf(odev); | ||
1489 | if (retcode != NO_ERROR) | ||
1490 | goto fail; | ||
1491 | } | ||
1492 | } | 1994 | } |
1493 | 1995 | ||
1494 | if (new_conf->cram_hmac_alg[0] != 0) { | 1996 | crypto_free_hash(tconn->integrity_tfm); |
1495 | snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", | 1997 | tconn->integrity_tfm = crypto.integrity_tfm; |
1496 | new_conf->cram_hmac_alg); | 1998 | if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) |
1497 | tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); | 1999 | /* Do this without trying to take tconn->data.mutex again. */ |
1498 | if (IS_ERR(tfm)) { | 2000 | __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); |
1499 | tfm = NULL; | ||
1500 | retcode = ERR_AUTH_ALG; | ||
1501 | goto fail; | ||
1502 | } | ||
1503 | 2001 | ||
1504 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | 2002 | crypto_free_hash(tconn->cram_hmac_tfm); |
1505 | retcode = ERR_AUTH_ALG_ND; | 2003 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
1506 | goto fail; | ||
1507 | } | ||
1508 | } | ||
1509 | 2004 | ||
1510 | if (new_conf->integrity_alg[0]) { | 2005 | mutex_unlock(&tconn->conf_update); |
1511 | integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2006 | mutex_unlock(&tconn->data.mutex); |
1512 | if (IS_ERR(integrity_w_tfm)) { | 2007 | synchronize_rcu(); |
1513 | integrity_w_tfm = NULL; | 2008 | kfree(old_conf); |
1514 | retcode=ERR_INTEGRITY_ALG; | ||
1515 | goto fail; | ||
1516 | } | ||
1517 | 2009 | ||
1518 | if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { | 2010 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1519 | retcode=ERR_INTEGRITY_ALG_ND; | 2011 | drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); |
1520 | goto fail; | ||
1521 | } | ||
1522 | 2012 | ||
1523 | integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); | 2013 | goto done; |
1524 | if (IS_ERR(integrity_r_tfm)) { | 2014 | |
1525 | integrity_r_tfm = NULL; | 2015 | fail: |
1526 | retcode=ERR_INTEGRITY_ALG; | 2016 | mutex_unlock(&tconn->conf_update); |
1527 | goto fail; | 2017 | mutex_unlock(&tconn->data.mutex); |
1528 | } | 2018 | free_crypto(&crypto); |
2019 | kfree(new_conf); | ||
2020 | done: | ||
2021 | conn_reconfig_done(tconn); | ||
2022 | out: | ||
2023 | drbd_adm_finish(info, retcode); | ||
2024 | return 0; | ||
2025 | } | ||
2026 | |||
2027 | int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) | ||
2028 | { | ||
2029 | struct drbd_conf *mdev; | ||
2030 | struct net_conf *old_conf, *new_conf = NULL; | ||
2031 | struct crypto crypto = { }; | ||
2032 | struct drbd_tconn *tconn; | ||
2033 | enum drbd_ret_code retcode; | ||
2034 | int i; | ||
2035 | int err; | ||
2036 | |||
2037 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); | ||
2038 | |||
2039 | if (!adm_ctx.reply_skb) | ||
2040 | return retcode; | ||
2041 | if (retcode != NO_ERROR) | ||
2042 | goto out; | ||
2043 | if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { | ||
2044 | drbd_msg_put_info("connection endpoint(s) missing"); | ||
2045 | retcode = ERR_INVALID_REQUEST; | ||
2046 | goto out; | ||
1529 | } | 2047 | } |
1530 | 2048 | ||
1531 | ns = new_conf->max_epoch_size/8; | 2049 | /* No need for _rcu here. All reconfiguration is |
1532 | if (mdev->tl_hash_s != ns) { | 2050 | * strictly serialized on genl_lock(). We are protected against |
1533 | new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2051 | * concurrent reconfiguration/addition/deletion */ |
1534 | if (!new_tl_hash) { | 2052 | list_for_each_entry(tconn, &drbd_tconns, all_tconn) { |
1535 | retcode = ERR_NOMEM; | 2053 | if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && |
1536 | goto fail; | 2054 | !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { |
2055 | retcode = ERR_LOCAL_ADDR; | ||
2056 | goto out; | ||
1537 | } | 2057 | } |
1538 | } | ||
1539 | 2058 | ||
1540 | ns = new_conf->max_buffers/8; | 2059 | if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && |
1541 | if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { | 2060 | !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { |
1542 | new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); | 2061 | retcode = ERR_PEER_ADDR; |
1543 | if (!new_ee_hash) { | 2062 | goto out; |
1544 | retcode = ERR_NOMEM; | ||
1545 | goto fail; | ||
1546 | } | 2063 | } |
1547 | } | 2064 | } |
1548 | 2065 | ||
1549 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; | 2066 | tconn = adm_ctx.tconn; |
2067 | conn_reconfig_start(tconn); | ||
1550 | 2068 | ||
1551 | if (integrity_w_tfm) { | 2069 | if (tconn->cstate > C_STANDALONE) { |
1552 | i = crypto_hash_digestsize(integrity_w_tfm); | 2070 | retcode = ERR_NET_CONFIGURED; |
1553 | int_dig_out = kmalloc(i, GFP_KERNEL); | 2071 | goto fail; |
1554 | if (!int_dig_out) { | ||
1555 | retcode = ERR_NOMEM; | ||
1556 | goto fail; | ||
1557 | } | ||
1558 | int_dig_in = kmalloc(i, GFP_KERNEL); | ||
1559 | if (!int_dig_in) { | ||
1560 | retcode = ERR_NOMEM; | ||
1561 | goto fail; | ||
1562 | } | ||
1563 | int_dig_vv = kmalloc(i, GFP_KERNEL); | ||
1564 | if (!int_dig_vv) { | ||
1565 | retcode = ERR_NOMEM; | ||
1566 | goto fail; | ||
1567 | } | ||
1568 | } | 2072 | } |
1569 | 2073 | ||
1570 | if (!mdev->bitmap) { | 2074 | /* allocation not in the IO path, drbdsetup / netlink process context */ |
1571 | if(drbd_bm_init(mdev)) { | 2075 | new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); |
1572 | retcode = ERR_NOMEM; | 2076 | if (!new_conf) { |
1573 | goto fail; | 2077 | retcode = ERR_NOMEM; |
1574 | } | 2078 | goto fail; |
1575 | } | 2079 | } |
1576 | 2080 | ||
1577 | drbd_flush_workqueue(mdev); | 2081 | set_net_conf_defaults(new_conf); |
1578 | spin_lock_irq(&mdev->req_lock); | 2082 | |
1579 | if (mdev->net_conf != NULL) { | 2083 | err = net_conf_from_attrs(new_conf, info); |
1580 | retcode = ERR_NET_CONFIGURED; | 2084 | if (err && err != -ENOMSG) { |
1581 | spin_unlock_irq(&mdev->req_lock); | 2085 | retcode = ERR_MANDATORY_TAG; |
2086 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
1582 | goto fail; | 2087 | goto fail; |
1583 | } | 2088 | } |
1584 | mdev->net_conf = new_conf; | ||
1585 | 2089 | ||
1586 | mdev->send_cnt = 0; | 2090 | retcode = check_net_options(tconn, new_conf); |
1587 | mdev->recv_cnt = 0; | 2091 | if (retcode != NO_ERROR) |
2092 | goto fail; | ||
1588 | 2093 | ||
1589 | if (new_tl_hash) { | 2094 | retcode = alloc_crypto(&crypto, new_conf); |
1590 | kfree(mdev->tl_hash); | 2095 | if (retcode != NO_ERROR) |
1591 | mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; | 2096 | goto fail; |
1592 | mdev->tl_hash = new_tl_hash; | 2097 | |
1593 | } | 2098 | ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; |
2099 | |||
2100 | conn_flush_workqueue(tconn); | ||
1594 | 2101 | ||
1595 | if (new_ee_hash) { | 2102 | mutex_lock(&tconn->conf_update); |
1596 | kfree(mdev->ee_hash); | 2103 | old_conf = tconn->net_conf; |
1597 | mdev->ee_hash_s = mdev->net_conf->max_buffers/8; | 2104 | if (old_conf) { |
1598 | mdev->ee_hash = new_ee_hash; | 2105 | retcode = ERR_NET_CONFIGURED; |
2106 | mutex_unlock(&tconn->conf_update); | ||
2107 | goto fail; | ||
1599 | } | 2108 | } |
2109 | rcu_assign_pointer(tconn->net_conf, new_conf); | ||
1600 | 2110 | ||
1601 | crypto_free_hash(mdev->cram_hmac_tfm); | 2111 | conn_free_crypto(tconn); |
1602 | mdev->cram_hmac_tfm = tfm; | 2112 | tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; |
2113 | tconn->integrity_tfm = crypto.integrity_tfm; | ||
2114 | tconn->csums_tfm = crypto.csums_tfm; | ||
2115 | tconn->verify_tfm = crypto.verify_tfm; | ||
1603 | 2116 | ||
1604 | crypto_free_hash(mdev->integrity_w_tfm); | 2117 | tconn->my_addr_len = nla_len(adm_ctx.my_addr); |
1605 | mdev->integrity_w_tfm = integrity_w_tfm; | 2118 | memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); |
2119 | tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); | ||
2120 | memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); | ||
1606 | 2121 | ||
1607 | crypto_free_hash(mdev->integrity_r_tfm); | 2122 | mutex_unlock(&tconn->conf_update); |
1608 | mdev->integrity_r_tfm = integrity_r_tfm; | ||
1609 | 2123 | ||
1610 | kfree(mdev->int_dig_out); | 2124 | rcu_read_lock(); |
1611 | kfree(mdev->int_dig_in); | 2125 | idr_for_each_entry(&tconn->volumes, mdev, i) { |
1612 | kfree(mdev->int_dig_vv); | 2126 | mdev->send_cnt = 0; |
1613 | mdev->int_dig_out=int_dig_out; | 2127 | mdev->recv_cnt = 0; |
1614 | mdev->int_dig_in=int_dig_in; | 2128 | } |
1615 | mdev->int_dig_vv=int_dig_vv; | 2129 | rcu_read_unlock(); |
1616 | retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); | ||
1617 | spin_unlock_irq(&mdev->req_lock); | ||
1618 | 2130 | ||
1619 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | 2131 | retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); |
1620 | reply->ret_code = retcode; | 2132 | |
1621 | drbd_reconfig_done(mdev); | 2133 | conn_reconfig_done(tconn); |
2134 | drbd_adm_finish(info, retcode); | ||
1622 | return 0; | 2135 | return 0; |
1623 | 2136 | ||
1624 | fail: | 2137 | fail: |
1625 | kfree(int_dig_out); | 2138 | free_crypto(&crypto); |
1626 | kfree(int_dig_in); | ||
1627 | kfree(int_dig_vv); | ||
1628 | crypto_free_hash(tfm); | ||
1629 | crypto_free_hash(integrity_w_tfm); | ||
1630 | crypto_free_hash(integrity_r_tfm); | ||
1631 | kfree(new_tl_hash); | ||
1632 | kfree(new_ee_hash); | ||
1633 | kfree(new_conf); | 2139 | kfree(new_conf); |
1634 | 2140 | ||
1635 | reply->ret_code = retcode; | 2141 | conn_reconfig_done(tconn); |
1636 | drbd_reconfig_done(mdev); | 2142 | out: |
2143 | drbd_adm_finish(info, retcode); | ||
1637 | return 0; | 2144 | return 0; |
1638 | } | 2145 | } |
1639 | 2146 | ||
1640 | static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2147 | static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) |
1641 | struct drbd_nl_cfg_reply *reply) | ||
1642 | { | 2148 | { |
1643 | int retcode; | 2149 | enum drbd_state_rv rv; |
1644 | struct disconnect dc; | ||
1645 | |||
1646 | memset(&dc, 0, sizeof(struct disconnect)); | ||
1647 | if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) { | ||
1648 | retcode = ERR_MANDATORY_TAG; | ||
1649 | goto fail; | ||
1650 | } | ||
1651 | |||
1652 | if (dc.force) { | ||
1653 | spin_lock_irq(&mdev->req_lock); | ||
1654 | if (mdev->state.conn >= C_WF_CONNECTION) | ||
1655 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL); | ||
1656 | spin_unlock_irq(&mdev->req_lock); | ||
1657 | goto done; | ||
1658 | } | ||
1659 | 2150 | ||
1660 | retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); | 2151 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
2152 | force ? CS_HARD : 0); | ||
1661 | 2153 | ||
1662 | if (retcode == SS_NOTHING_TO_DO) | 2154 | switch (rv) { |
1663 | goto done; | 2155 | case SS_NOTHING_TO_DO: |
1664 | else if (retcode == SS_ALREADY_STANDALONE) | 2156 | break; |
1665 | goto done; | 2157 | case SS_ALREADY_STANDALONE: |
1666 | else if (retcode == SS_PRIMARY_NOP) { | 2158 | return SS_SUCCESS; |
1667 | /* Our statche checking code wants to see the peer outdated. */ | 2159 | case SS_PRIMARY_NOP: |
1668 | retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2160 | /* Our state checking code wants to see the peer outdated. */ |
1669 | pdsk, D_OUTDATED)); | 2161 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1670 | } else if (retcode == SS_CW_FAILED_BY_PEER) { | 2162 | pdsk, D_OUTDATED), CS_VERBOSE); |
2163 | break; | ||
2164 | case SS_CW_FAILED_BY_PEER: | ||
1671 | /* The peer probably wants to see us outdated. */ | 2165 | /* The peer probably wants to see us outdated. */ |
1672 | retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, | 2166 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, |
1673 | disk, D_OUTDATED), | 2167 | disk, D_OUTDATED), 0); |
1674 | CS_ORDERED); | 2168 | if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { |
1675 | if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { | 2169 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), |
1676 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 2170 | CS_HARD); |
1677 | retcode = SS_SUCCESS; | ||
1678 | } | 2171 | } |
2172 | break; | ||
2173 | default:; | ||
2174 | /* no special handling necessary */ | ||
2175 | } | ||
2176 | |||
2177 | if (rv >= SS_SUCCESS) { | ||
2178 | enum drbd_state_rv rv2; | ||
2179 | /* No one else can reconfigure the network while I am here. | ||
2180 | * The state handling only uses drbd_thread_stop_nowait(), | ||
2181 | * we want to really wait here until the receiver is no more. | ||
2182 | */ | ||
2183 | drbd_thread_stop(&adm_ctx.tconn->receiver); | ||
2184 | |||
2185 | /* Race breaker. This additional state change request may be | ||
2186 | * necessary, if this was a forced disconnect during a receiver | ||
2187 | * restart. We may have "killed" the receiver thread just | ||
2188 | * after drbdd_init() returned. Typically, we should be | ||
2189 | * C_STANDALONE already, now, and this becomes a no-op. | ||
2190 | */ | ||
2191 | rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), | ||
2192 | CS_VERBOSE | CS_HARD); | ||
2193 | if (rv2 < SS_SUCCESS) | ||
2194 | conn_err(tconn, | ||
2195 | "unexpected rv2=%d in conn_try_disconnect()\n", | ||
2196 | rv2); | ||
1679 | } | 2197 | } |
2198 | return rv; | ||
2199 | } | ||
1680 | 2200 | ||
1681 | if (retcode < SS_SUCCESS) | 2201 | int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) |
1682 | goto fail; | 2202 | { |
2203 | struct disconnect_parms parms; | ||
2204 | struct drbd_tconn *tconn; | ||
2205 | enum drbd_state_rv rv; | ||
2206 | enum drbd_ret_code retcode; | ||
2207 | int err; | ||
1683 | 2208 | ||
1684 | if (wait_event_interruptible(mdev->state_wait, | 2209 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); |
1685 | mdev->state.conn != C_DISCONNECTING)) { | 2210 | if (!adm_ctx.reply_skb) |
1686 | /* Do not test for mdev->state.conn == C_STANDALONE, since | 2211 | return retcode; |
1687 | someone else might connect us in the mean time! */ | 2212 | if (retcode != NO_ERROR) |
1688 | retcode = ERR_INTR; | ||
1689 | goto fail; | 2213 | goto fail; |
2214 | |||
2215 | tconn = adm_ctx.tconn; | ||
2216 | memset(&parms, 0, sizeof(parms)); | ||
2217 | if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { | ||
2218 | err = disconnect_parms_from_attrs(&parms, info); | ||
2219 | if (err) { | ||
2220 | retcode = ERR_MANDATORY_TAG; | ||
2221 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2222 | goto fail; | ||
2223 | } | ||
1690 | } | 2224 | } |
1691 | 2225 | ||
1692 | done: | 2226 | rv = conn_try_disconnect(tconn, parms.force_disconnect); |
1693 | retcode = NO_ERROR; | 2227 | if (rv < SS_SUCCESS) |
2228 | retcode = rv; /* FIXME: Type mismatch. */ | ||
2229 | else | ||
2230 | retcode = NO_ERROR; | ||
1694 | fail: | 2231 | fail: |
1695 | drbd_md_sync(mdev); | 2232 | drbd_adm_finish(info, retcode); |
1696 | reply->ret_code = retcode; | ||
1697 | return 0; | 2233 | return 0; |
1698 | } | 2234 | } |
1699 | 2235 | ||
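Every handler converted below follows the same generic-netlink template that drbd_adm_disconnect() establishes here: drbd_adm_prepare() resolves the addressed minor/resource and allocates the reply skb, optional parameters arrive as a nested attribute set parsed by a *_from_attrs() helper, and drbd_adm_finish() always sends the reply carrying the result code. A stripped-down sketch of that shape; struct my_parms, DRBD_NLA_MY_PARMS, my_parms_from_attrs() and do_the_work() are hypothetical placeholders, the other symbols are the ones used above:

        /* Sketch of the recurring drbd_adm_*() handler shape; my_parms,
         * DRBD_NLA_MY_PARMS, my_parms_from_attrs() and do_the_work() are
         * hypothetical placeholders. */
        int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
        {
                struct my_parms parms = { };
                enum drbd_ret_code retcode;
                int err;

                retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
                if (!adm_ctx.reply_skb)         /* not even a reply skb: bail out */
                        return retcode;
                if (retcode != NO_ERROR)        /* context lookup or permission failed */
                        goto out;

                if (info->attrs[DRBD_NLA_MY_PARMS]) {
                        err = my_parms_from_attrs(&parms, info);
                        if (err) {
                                retcode = ERR_MANDATORY_TAG;
                                drbd_msg_put_info(from_attrs_err_to_txt(err));
                                goto out;
                        }
                }

                retcode = do_the_work(adm_ctx.mdev, &parms);
        out:
                drbd_adm_finish(info, retcode); /* reply is sent even on error */
                return 0;
        }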
@@ -1705,7 +2241,7 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1705 | if (mdev->state.role != mdev->state.peer) | 2241 | if (mdev->state.role != mdev->state.peer) |
1706 | iass = (mdev->state.role == R_PRIMARY); | 2242 | iass = (mdev->state.role == R_PRIMARY); |
1707 | else | 2243 | else |
1708 | iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); | 2244 | iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
1709 | 2245 | ||
1710 | if (iass) | 2246 | if (iass) |
1711 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 2247 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
@@ -1713,20 +2249,34 @@ void resync_after_online_grow(struct drbd_conf *mdev) | |||
1713 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); | 2249 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); |
1714 | } | 2250 | } |
1715 | 2251 | ||
1716 | static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2252 | int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) |
1717 | struct drbd_nl_cfg_reply *reply) | ||
1718 | { | 2253 | { |
1719 | struct resize rs; | 2254 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; |
1720 | int retcode = NO_ERROR; | 2255 | struct resize_parms rs; |
2256 | struct drbd_conf *mdev; | ||
2257 | enum drbd_ret_code retcode; | ||
1721 | enum determine_dev_size dd; | 2258 | enum determine_dev_size dd; |
1722 | enum dds_flags ddsf; | 2259 | enum dds_flags ddsf; |
2260 | sector_t u_size; | ||
2261 | int err; | ||
1723 | 2262 | ||
1724 | memset(&rs, 0, sizeof(struct resize)); | 2263 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
1725 | if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { | 2264 | if (!adm_ctx.reply_skb) |
1726 | retcode = ERR_MANDATORY_TAG; | 2265 | return retcode; |
2266 | if (retcode != NO_ERROR) | ||
1727 | goto fail; | 2267 | goto fail; |
2268 | |||
2269 | memset(&rs, 0, sizeof(struct resize_parms)); | ||
2270 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { | ||
2271 | err = resize_parms_from_attrs(&rs, info); | ||
2272 | if (err) { | ||
2273 | retcode = ERR_MANDATORY_TAG; | ||
2274 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2275 | goto fail; | ||
2276 | } | ||
1728 | } | 2277 | } |
1729 | 2278 | ||
2279 | mdev = adm_ctx.mdev; | ||
1730 | if (mdev->state.conn > C_CONNECTED) { | 2280 | if (mdev->state.conn > C_CONNECTED) { |
1731 | retcode = ERR_RESIZE_RESYNC; | 2281 | retcode = ERR_RESIZE_RESYNC; |
1732 | goto fail; | 2282 | goto fail; |
@@ -1743,15 +2293,36 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1743 | goto fail; | 2293 | goto fail; |
1744 | } | 2294 | } |
1745 | 2295 | ||
1746 | if (rs.no_resync && mdev->agreed_pro_version < 93) { | 2296 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { |
1747 | retcode = ERR_NEED_APV_93; | 2297 | retcode = ERR_NEED_APV_93; |
1748 | goto fail_ldev; | 2298 | goto fail_ldev; |
1749 | } | 2299 | } |
1750 | 2300 | ||
2301 | rcu_read_lock(); | ||
2302 | u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
2303 | rcu_read_unlock(); | ||
2304 | if (u_size != (sector_t)rs.resize_size) { | ||
2305 | new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
2306 | if (!new_disk_conf) { | ||
2307 | retcode = ERR_NOMEM; | ||
2308 | goto fail_ldev; | ||
2309 | } | ||
2310 | } | ||
2311 | |||
1751 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | 2312 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) |
1752 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 2313 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
1753 | 2314 | ||
1754 | mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; | 2315 | if (new_disk_conf) { |
2316 | mutex_lock(&mdev->tconn->conf_update); | ||
2317 | old_disk_conf = mdev->ldev->disk_conf; | ||
2318 | *new_disk_conf = *old_disk_conf; | ||
2319 | new_disk_conf->disk_size = (sector_t)rs.resize_size; | ||
2320 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
2321 | mutex_unlock(&mdev->tconn->conf_update); | ||
2322 | synchronize_rcu(); | ||
2323 | kfree(old_disk_conf); | ||
2324 | } | ||
2325 | |||
1755 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); | 2326 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
1756 | dd = drbd_determine_dev_size(mdev, ddsf); | 2327 | dd = drbd_determine_dev_size(mdev, ddsf); |
1757 | drbd_md_sync(mdev); | 2328 | drbd_md_sync(mdev); |
@@ -1770,7 +2341,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1770 | } | 2341 | } |
1771 | 2342 | ||
1772 | fail: | 2343 | fail: |
1773 | reply->ret_code = retcode; | 2344 | drbd_adm_finish(info, retcode); |
1774 | return 0; | 2345 | return 0; |
1775 | 2346 | ||
1776 | fail_ldev: | 2347 | fail_ldev: |
@@ -1778,204 +2349,55 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | |||
1778 | goto fail; | 2349 | goto fail; |
1779 | } | 2350 | } |
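The resize path above is the first user of the read-copy-update convention this series moves all configuration to: readers only take rcu_read_lock() and rcu_dereference() the disk_conf pointer, while an updater allocates a copy, publishes it with rcu_assign_pointer() under the conf_update mutex, and frees the old struct only after synchronize_rcu(). The publish/retire sequence in isolation, assuming a hypothetical struct cfg guarded the same way (kernel context assumed):

        /* RCU publish/retire of a config object; struct cfg, cfg_ptr and
         * cfg_mutex are hypothetical stand-ins for disk_conf / conf_update. */
        struct cfg { u64 disk_size; };
        static struct cfg __rcu *cfg_ptr;       /* assumed initialized at setup time */
        static DEFINE_MUTEX(cfg_mutex);

        static u64 read_size(void)              /* reader: RCU only, no mutex */
        {
                u64 size;

                rcu_read_lock();
                size = rcu_dereference(cfg_ptr)->disk_size;
                rcu_read_unlock();
                return size;
        }

        static int update_size(u64 new_size)    /* updater: copy, publish, retire */
        {
                struct cfg *new_cfg, *old_cfg;

                new_cfg = kmalloc(sizeof(*new_cfg), GFP_KERNEL);
                if (!new_cfg)
                        return -ENOMEM;

                mutex_lock(&cfg_mutex);         /* serialize concurrent updaters */
                old_cfg = rcu_dereference_protected(cfg_ptr,
                                                    lockdep_is_held(&cfg_mutex));
                *new_cfg = *old_cfg;
                new_cfg->disk_size = new_size;
                rcu_assign_pointer(cfg_ptr, new_cfg);
                mutex_unlock(&cfg_mutex);

                synchronize_rcu();              /* wait out pre-existing readers */
                kfree(old_cfg);                 /* nobody can still see it now */
                return 0;
        }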
1780 | 2351 | ||
1781 | static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2352 | int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) |
1782 | struct drbd_nl_cfg_reply *reply) | ||
1783 | { | 2353 | { |
1784 | int retcode = NO_ERROR; | 2354 | enum drbd_ret_code retcode; |
2355 | struct drbd_tconn *tconn; | ||
2356 | struct res_opts res_opts; | ||
1785 | int err; | 2357 | int err; |
1786 | int ovr; /* online verify running */ | ||
1787 | int rsr; /* re-sync running */ | ||
1788 | struct crypto_hash *verify_tfm = NULL; | ||
1789 | struct crypto_hash *csums_tfm = NULL; | ||
1790 | struct syncer_conf sc; | ||
1791 | cpumask_var_t new_cpu_mask; | ||
1792 | int *rs_plan_s = NULL; | ||
1793 | int fifo_size; | ||
1794 | |||
1795 | if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { | ||
1796 | retcode = ERR_NOMEM; | ||
1797 | goto fail; | ||
1798 | } | ||
1799 | 2358 | ||
1800 | if (nlp->flags & DRBD_NL_SET_DEFAULTS) { | 2359 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
1801 | memset(&sc, 0, sizeof(struct syncer_conf)); | 2360 | if (!adm_ctx.reply_skb) |
1802 | sc.rate = DRBD_RATE_DEF; | 2361 | return retcode; |
1803 | sc.after = DRBD_AFTER_DEF; | ||
1804 | sc.al_extents = DRBD_AL_EXTENTS_DEF; | ||
1805 | sc.on_no_data = DRBD_ON_NO_DATA_DEF; | ||
1806 | sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; | ||
1807 | sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; | ||
1808 | sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; | ||
1809 | sc.c_max_rate = DRBD_C_MAX_RATE_DEF; | ||
1810 | sc.c_min_rate = DRBD_C_MIN_RATE_DEF; | ||
1811 | } else | ||
1812 | memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); | ||
1813 | |||
1814 | if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { | ||
1815 | retcode = ERR_MANDATORY_TAG; | ||
1816 | goto fail; | ||
1817 | } | ||
1818 | |||
1819 | /* re-sync running */ | ||
1820 | rsr = ( mdev->state.conn == C_SYNC_SOURCE || | ||
1821 | mdev->state.conn == C_SYNC_TARGET || | ||
1822 | mdev->state.conn == C_PAUSED_SYNC_S || | ||
1823 | mdev->state.conn == C_PAUSED_SYNC_T ); | ||
1824 | |||
1825 | if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { | ||
1826 | retcode = ERR_CSUMS_RESYNC_RUNNING; | ||
1827 | goto fail; | ||
1828 | } | ||
1829 | |||
1830 | if (!rsr && sc.csums_alg[0]) { | ||
1831 | csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); | ||
1832 | if (IS_ERR(csums_tfm)) { | ||
1833 | csums_tfm = NULL; | ||
1834 | retcode = ERR_CSUMS_ALG; | ||
1835 | goto fail; | ||
1836 | } | ||
1837 | |||
1838 | if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { | ||
1839 | retcode = ERR_CSUMS_ALG_ND; | ||
1840 | goto fail; | ||
1841 | } | ||
1842 | } | ||
1843 | |||
1844 | /* online verify running */ | ||
1845 | ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); | ||
1846 | |||
1847 | if (ovr) { | ||
1848 | if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { | ||
1849 | retcode = ERR_VERIFY_RUNNING; | ||
1850 | goto fail; | ||
1851 | } | ||
1852 | } | ||
1853 | |||
1854 | if (!ovr && sc.verify_alg[0]) { | ||
1855 | verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); | ||
1856 | if (IS_ERR(verify_tfm)) { | ||
1857 | verify_tfm = NULL; | ||
1858 | retcode = ERR_VERIFY_ALG; | ||
1859 | goto fail; | ||
1860 | } | ||
1861 | |||
1862 | if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { | ||
1863 | retcode = ERR_VERIFY_ALG_ND; | ||
1864 | goto fail; | ||
1865 | } | ||
1866 | } | ||
1867 | |||
1868 | /* silently ignore cpu mask on UP kernel */ | ||
1869 | if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { | ||
1870 | err = bitmap_parse(sc.cpu_mask, 32, | ||
1871 | cpumask_bits(new_cpu_mask), nr_cpu_ids); | ||
1872 | if (err) { | ||
1873 | dev_warn(DEV, "bitmap_parse() failed with %d\n", err); | ||
1874 | retcode = ERR_CPU_MASK_PARSE; | ||
1875 | goto fail; | ||
1876 | } | ||
1877 | } | ||
1878 | |||
1879 | ERR_IF (sc.rate < 1) sc.rate = 1; | ||
1880 | ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ | ||
1881 | #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) | ||
1882 | if (sc.al_extents > AL_MAX) { | ||
1883 | dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); | ||
1884 | sc.al_extents = AL_MAX; | ||
1885 | } | ||
1886 | #undef AL_MAX | ||
1887 | |||
1888 | /* to avoid spurious errors when configuring minors before configuring | ||
1889 | * the minors they depend on: if necessary, first create the minor we | ||
1890 | * depend on */ | ||
1891 | if (sc.after >= 0) | ||
1892 | ensure_mdev(sc.after, 1); | ||
1893 | |||
1894 | /* most sanity checks done, try to assign the new sync-after | ||
1895 | * dependency. need to hold the global lock in there, | ||
1896 | * to avoid a race in the dependency loop check. */ | ||
1897 | retcode = drbd_alter_sa(mdev, sc.after); | ||
1898 | if (retcode != NO_ERROR) | 2362 | if (retcode != NO_ERROR) |
1899 | goto fail; | 2363 | goto fail; |
2364 | tconn = adm_ctx.tconn; | ||
1900 | 2365 | ||
1901 | fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 2366 | res_opts = tconn->res_opts; |
1902 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 2367 | if (should_set_defaults(info)) |
1903 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); | 2368 | set_res_opts_defaults(&res_opts); |
1904 | if (!rs_plan_s) { | ||
1905 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | ||
1906 | retcode = ERR_NOMEM; | ||
1907 | goto fail; | ||
1908 | } | ||
1909 | } | ||
1910 | 2369 | ||
1911 | /* ok, assign the rest of it as well. | 2370 | err = res_opts_from_attrs(&res_opts, info); |
1912 | * lock against receive_SyncParam() */ | 2371 | if (err && err != -ENOMSG) { |
1913 | spin_lock(&mdev->peer_seq_lock); | 2372 | retcode = ERR_MANDATORY_TAG; |
1914 | mdev->sync_conf = sc; | 2373 | drbd_msg_put_info(from_attrs_err_to_txt(err)); |
1915 | 2374 | goto fail; | |
1916 | if (!rsr) { | ||
1917 | crypto_free_hash(mdev->csums_tfm); | ||
1918 | mdev->csums_tfm = csums_tfm; | ||
1919 | csums_tfm = NULL; | ||
1920 | } | ||
1921 | |||
1922 | if (!ovr) { | ||
1923 | crypto_free_hash(mdev->verify_tfm); | ||
1924 | mdev->verify_tfm = verify_tfm; | ||
1925 | verify_tfm = NULL; | ||
1926 | } | ||
1927 | |||
1928 | if (fifo_size != mdev->rs_plan_s.size) { | ||
1929 | kfree(mdev->rs_plan_s.values); | ||
1930 | mdev->rs_plan_s.values = rs_plan_s; | ||
1931 | mdev->rs_plan_s.size = fifo_size; | ||
1932 | mdev->rs_planed = 0; | ||
1933 | rs_plan_s = NULL; | ||
1934 | } | 2375 | } |
1935 | 2376 | ||
1936 | spin_unlock(&mdev->peer_seq_lock); | 2377 | err = set_resource_options(tconn, &res_opts); |
1937 | 2378 | if (err) { | |
1938 | if (get_ldev(mdev)) { | 2379 | retcode = ERR_INVALID_REQUEST; |
1939 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 2380 | if (err == -ENOMEM) |
1940 | drbd_al_shrink(mdev); | ||
1941 | err = drbd_check_al_size(mdev); | ||
1942 | lc_unlock(mdev->act_log); | ||
1943 | wake_up(&mdev->al_wait); | ||
1944 | |||
1945 | put_ldev(mdev); | ||
1946 | drbd_md_sync(mdev); | ||
1947 | |||
1948 | if (err) { | ||
1949 | retcode = ERR_NOMEM; | 2381 | retcode = ERR_NOMEM; |
1950 | goto fail; | ||
1951 | } | ||
1952 | } | 2382 | } |
1953 | 2383 | ||
1954 | if (mdev->state.conn >= C_CONNECTED) | ||
1955 | drbd_send_sync_param(mdev, &sc); | ||
1956 | |||
1957 | if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { | ||
1958 | cpumask_copy(mdev->cpu_mask, new_cpu_mask); | ||
1959 | drbd_calc_cpu_mask(mdev); | ||
1960 | mdev->receiver.reset_cpu_mask = 1; | ||
1961 | mdev->asender.reset_cpu_mask = 1; | ||
1962 | mdev->worker.reset_cpu_mask = 1; | ||
1963 | } | ||
1964 | |||
1965 | kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); | ||
1966 | fail: | 2384 | fail: |
1967 | kfree(rs_plan_s); | 2385 | drbd_adm_finish(info, retcode); |
1968 | free_cpumask_var(new_cpu_mask); | ||
1969 | crypto_free_hash(csums_tfm); | ||
1970 | crypto_free_hash(verify_tfm); | ||
1971 | reply->ret_code = retcode; | ||
1972 | return 0; | 2386 | return 0; |
1973 | } | 2387 | } |
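drbd_adm_resource_opts() also shows the option-update convention shared by the new *_opts handlers: start from the options that are currently active, reset to the compiled-in defaults only when the request asks for it, then let the *_from_attrs() helper overwrite just the fields the request actually carried; -ENOMSG (no attributes at all) is deliberately not an error. Reduced to a sketch, with struct opts and the helpers around it as hypothetical stand-ins for res_opts and friends:

        /* Defaults-then-overlay option handling (hypothetical miniature). */
        struct opts o = tconn->current_opts;    /* start from what is active */

        if (should_set_defaults(info))          /* "--set-defaults" requested */
                set_opts_defaults(&o);

        err = opts_from_attrs(&o, info);        /* overlay only the fields sent */
        if (err && err != -ENOMSG) {            /* -ENOMSG: empty request, fine */
                retcode = ERR_MANDATORY_TAG;
                goto fail;
        }
        err = apply_opts(tconn, &o);            /* validate and swap in */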
1974 | 2388 | ||
1975 | static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2389 | int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) |
1976 | struct drbd_nl_cfg_reply *reply) | ||
1977 | { | 2390 | { |
1978 | int retcode; | 2391 | struct drbd_conf *mdev; |
2392 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2393 | |||
2394 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2395 | if (!adm_ctx.reply_skb) | ||
2396 | return retcode; | ||
2397 | if (retcode != NO_ERROR) | ||
2398 | goto out; | ||
2399 | |||
2400 | mdev = adm_ctx.mdev; | ||
1979 | 2401 | ||
1980 | /* If there is still bitmap IO pending, probably because of a previous | 2402 | /* If there is still bitmap IO pending, probably because of a previous |
1981 | * resync just being finished, wait for it before requesting a new resync. | 2403 | * resync just being finished, wait for it before requesting a new resync. |
@@ -1990,10 +2412,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
1990 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2412 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); |
1991 | 2413 | ||
1992 | while (retcode == SS_NEED_CONNECTION) { | 2414 | while (retcode == SS_NEED_CONNECTION) { |
1993 | spin_lock_irq(&mdev->req_lock); | 2415 | spin_lock_irq(&mdev->tconn->req_lock); |
1994 | if (mdev->state.conn < C_CONNECTED) | 2416 | if (mdev->state.conn < C_CONNECTED) |
1995 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | 2417 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); |
1996 | spin_unlock_irq(&mdev->req_lock); | 2418 | spin_unlock_irq(&mdev->tconn->req_lock); |
1997 | 2419 | ||
1998 | if (retcode != SS_NEED_CONNECTION) | 2420 | if (retcode != SS_NEED_CONNECTION) |
1999 | break; | 2421 | break; |
@@ -2002,7 +2424,25 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2002 | } | 2424 | } |
2003 | drbd_resume_io(mdev); | 2425 | drbd_resume_io(mdev); |
2004 | 2426 | ||
2005 | reply->ret_code = retcode; | 2427 | out: |
2428 | drbd_adm_finish(info, retcode); | ||
2429 | return 0; | ||
2430 | } | ||
2431 | |||
2432 | static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, | ||
2433 | union drbd_state mask, union drbd_state val) | ||
2434 | { | ||
2435 | enum drbd_ret_code retcode; | ||
2436 | |||
2437 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2438 | if (!adm_ctx.reply_skb) | ||
2439 | return retcode; | ||
2440 | if (retcode != NO_ERROR) | ||
2441 | goto out; | ||
2442 | |||
2443 | retcode = drbd_request_state(adm_ctx.mdev, mask, val); | ||
2444 | out: | ||
2445 | drbd_adm_finish(info, retcode); | ||
2006 | return 0; | 2446 | return 0; |
2007 | } | 2447 | } |
2008 | 2448 | ||
@@ -2015,10 +2455,18 @@ static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) | |||
2015 | return rv; | 2455 | return rv; |
2016 | } | 2456 | } |
2017 | 2457 | ||
2018 | static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2458 | int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) |
2019 | struct drbd_nl_cfg_reply *reply) | ||
2020 | { | 2459 | { |
2021 | int retcode; | 2460 | int retcode; /* drbd_ret_code, drbd_state_rv */ |
2461 | struct drbd_conf *mdev; | ||
2462 | |||
2463 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2464 | if (!adm_ctx.reply_skb) | ||
2465 | return retcode; | ||
2466 | if (retcode != NO_ERROR) | ||
2467 | goto out; | ||
2468 | |||
2469 | mdev = adm_ctx.mdev; | ||
2022 | 2470 | ||
2023 | /* If there is still bitmap IO pending, probably because of a previous | 2471 | /* If there is still bitmap IO pending, probably because of a previous |
2024 | * resync just being finished, wait for it before requesting a new resync. | 2472 | * resync just being finished, wait for it before requesting a new resync. |
@@ -2028,16 +2476,15 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re | |||
2028 | drbd_flush_workqueue(mdev); | 2476 | drbd_flush_workqueue(mdev); |
2029 | 2477 | ||
2030 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); | 2478 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); |
2031 | |||
2032 | if (retcode < SS_SUCCESS) { | 2479 | if (retcode < SS_SUCCESS) { |
2033 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { | 2480 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { |
2034 | /* The peer will get a resync upon connect anyways. Just make that | 2481 | /* The peer will get a resync upon connect anyways. |
2035 | into a full resync. */ | 2482 | * Just make that into a full resync. */ |
2036 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); | 2483 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); |
2037 | if (retcode >= SS_SUCCESS) { | 2484 | if (retcode >= SS_SUCCESS) { |
2038 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, | 2485 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, |
2039 | "set_n_write from invalidate_peer", | 2486 | "set_n_write from invalidate_peer", |
2040 | BM_LOCKED_SET_ALLOWED)) | 2487 | BM_LOCKED_SET_ALLOWED)) |
2041 | retcode = ERR_IO_MD_DISK; | 2488 | retcode = ERR_IO_MD_DISK; |
2042 | } | 2489 | } |
2043 | } else | 2490 | } else |
@@ -2045,30 +2492,41 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re | |||
2045 | } | 2492 | } |
2046 | drbd_resume_io(mdev); | 2493 | drbd_resume_io(mdev); |
2047 | 2494 | ||
2048 | reply->ret_code = retcode; | 2495 | out: |
2496 | drbd_adm_finish(info, retcode); | ||
2049 | return 0; | 2497 | return 0; |
2050 | } | 2498 | } |
2051 | 2499 | ||
2052 | static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2500 | int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) |
2053 | struct drbd_nl_cfg_reply *reply) | ||
2054 | { | 2501 | { |
2055 | int retcode = NO_ERROR; | 2502 | enum drbd_ret_code retcode; |
2056 | 2503 | ||
2057 | if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) | 2504 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2058 | retcode = ERR_PAUSE_IS_SET; | 2505 | if (!adm_ctx.reply_skb) |
2506 | return retcode; | ||
2507 | if (retcode != NO_ERROR) | ||
2508 | goto out; | ||
2059 | 2509 | ||
2060 | reply->ret_code = retcode; | 2510 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) |
2511 | retcode = ERR_PAUSE_IS_SET; | ||
2512 | out: | ||
2513 | drbd_adm_finish(info, retcode); | ||
2061 | return 0; | 2514 | return 0; |
2062 | } | 2515 | } |
2063 | 2516 | ||
2064 | static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2517 | int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) |
2065 | struct drbd_nl_cfg_reply *reply) | ||
2066 | { | 2518 | { |
2067 | int retcode = NO_ERROR; | 2519 | union drbd_dev_state s; |
2068 | union drbd_state s; | 2520 | enum drbd_ret_code retcode; |
2069 | 2521 | ||
2070 | if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { | 2522 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2071 | s = mdev->state; | 2523 | if (!adm_ctx.reply_skb) |
2524 | return retcode; | ||
2525 | if (retcode != NO_ERROR) | ||
2526 | goto out; | ||
2527 | |||
2528 | if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { | ||
2529 | s = adm_ctx.mdev->state; | ||
2072 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { | 2530 | if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { |
2073 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : | 2531 | retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : |
2074 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; | 2532 | s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; |
@@ -2077,172 +2535,482 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n | |||
2077 | } | 2535 | } |
2078 | } | 2536 | } |
2079 | 2537 | ||
2080 | reply->ret_code = retcode; | 2538 | out: |
2539 | drbd_adm_finish(info, retcode); | ||
2081 | return 0; | 2540 | return 0; |
2082 | } | 2541 | } |
2083 | 2542 | ||
2084 | static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2543 | int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) |
2085 | struct drbd_nl_cfg_reply *reply) | ||
2086 | { | 2544 | { |
2087 | reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); | 2545 | return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); |
2088 | |||
2089 | return 0; | ||
2090 | } | 2546 | } |
2091 | 2547 | ||
2092 | static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2548 | int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) |
2093 | struct drbd_nl_cfg_reply *reply) | ||
2094 | { | 2549 | { |
2550 | struct drbd_conf *mdev; | ||
2551 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ | ||
2552 | |||
2553 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); | ||
2554 | if (!adm_ctx.reply_skb) | ||
2555 | return retcode; | ||
2556 | if (retcode != NO_ERROR) | ||
2557 | goto out; | ||
2558 | |||
2559 | mdev = adm_ctx.mdev; | ||
2095 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | 2560 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { |
2096 | drbd_uuid_new_current(mdev); | 2561 | drbd_uuid_new_current(mdev); |
2097 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 2562 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
2098 | } | 2563 | } |
2099 | drbd_suspend_io(mdev); | 2564 | drbd_suspend_io(mdev); |
2100 | reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); | 2565 | retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); |
2101 | if (reply->ret_code == SS_SUCCESS) { | 2566 | if (retcode == SS_SUCCESS) { |
2102 | if (mdev->state.conn < C_CONNECTED) | 2567 | if (mdev->state.conn < C_CONNECTED) |
2103 | tl_clear(mdev); | 2568 | tl_clear(mdev->tconn); |
2104 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) | 2569 | if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) |
2105 | tl_restart(mdev, fail_frozen_disk_io); | 2570 | tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); |
2106 | } | 2571 | } |
2107 | drbd_resume_io(mdev); | 2572 | drbd_resume_io(mdev); |
2108 | 2573 | ||
2574 | out: | ||
2575 | drbd_adm_finish(info, retcode); | ||
2109 | return 0; | 2576 | return 0; |
2110 | } | 2577 | } |
2111 | 2578 | ||
2112 | static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2579 | int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) |
2113 | struct drbd_nl_cfg_reply *reply) | ||
2114 | { | 2580 | { |
2115 | reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); | 2581 | return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); |
2116 | return 0; | ||
2117 | } | 2582 | } |
2118 | 2583 | ||
2119 | static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2584 | int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) |
2120 | struct drbd_nl_cfg_reply *reply) | ||
2121 | { | 2585 | { |
2122 | unsigned short *tl; | 2586 | struct nlattr *nla; |
2587 | nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); | ||
2588 | if (!nla) | ||
2589 | goto nla_put_failure; | ||
2590 | if (vnr != VOLUME_UNSPECIFIED && | ||
2591 | nla_put_u32(skb, T_ctx_volume, vnr)) | ||
2592 | goto nla_put_failure; | ||
2593 | if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) | ||
2594 | goto nla_put_failure; | ||
2595 | if (tconn->my_addr_len && | ||
2596 | nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) | ||
2597 | goto nla_put_failure; | ||
2598 | if (tconn->peer_addr_len && | ||
2599 | nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) | ||
2600 | goto nla_put_failure; | ||
2601 | nla_nest_end(skb, nla); | ||
2602 | return 0; | ||
2123 | 2603 | ||
2124 | tl = reply->tag_list; | 2604 | nla_put_failure: |
2605 | if (nla) | ||
2606 | nla_nest_cancel(skb, nla); | ||
2607 | return -EMSGSIZE; | ||
2608 | } | ||
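nla_put_drbd_cfg_context() is the standard netlink nesting pattern: open a nest, emit the member attributes, close the nest, and on any failure cancel the whole nest so the message stays well-formed while -EMSGSIZE propagates to the caller. The same pattern in generic form; struct thing and the EXAMPLE_* attribute types are hypothetical, the nla_* calls are the real netlink API:

        /* Generic nested-attribute emission. */
        static int put_thing(struct sk_buff *skb, const struct thing *t)
        {
                struct nlattr *nest;

                nest = nla_nest_start(skb, EXAMPLE_NEST);
                if (!nest)
                        goto nla_put_failure;
                if (nla_put_string(skb, EXAMPLE_NAME, t->name) ||
                    nla_put_u32(skb, EXAMPLE_ID, t->id))
                        goto nla_put_failure;
                nla_nest_end(skb, nest);                /* fixes up the nest length */
                return 0;

        nla_put_failure:
                if (nest)
                        nla_nest_cancel(skb, nest);     /* trim the partial nest */
                return -EMSGSIZE;
        }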
2125 | 2609 | ||
2126 | if (get_ldev(mdev)) { | 2610 | int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, |
2127 | tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); | 2611 | const struct sib_info *sib) |
2128 | put_ldev(mdev); | 2612 | { |
2129 | } | 2613 | struct state_info *si = NULL; /* for sizeof(si->member); */ |
2614 | struct net_conf *nc; | ||
2615 | struct nlattr *nla; | ||
2616 | int got_ldev; | ||
2617 | int err = 0; | ||
2618 | int exclude_sensitive; | ||
2619 | |||
2620 | /* If sib != NULL, this is drbd_bcast_event, which anyone can listen | ||
2621 | * to. So we better exclude_sensitive information. | ||
2622 | * | ||
2623 | * If sib == NULL, this is drbd_adm_get_status, executed synchronously | ||
2624 | * in the context of the requesting user process. Exclude sensitive | ||
2625 | * information, unless current has superuser. | ||
2626 | * | ||
2627 | * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and | ||
2628 | * relies on the current implementation of netlink_dump(), which | ||
2629 | * executes the dump callback successively from netlink_recvmsg(), | ||
2630 | * always in the context of the receiving process */ | ||
2631 | exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); | ||
2632 | |||
2633 | got_ldev = get_ldev(mdev); | ||
2634 | |||
2635 | /* We need to add connection name and volume number information still. | ||
2636 | * Minor number is in drbd_genlmsghdr. */ | ||
2637 | if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) | ||
2638 | goto nla_put_failure; | ||
2639 | |||
2640 | if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) | ||
2641 | goto nla_put_failure; | ||
2642 | |||
2643 | rcu_read_lock(); | ||
2644 | if (got_ldev) | ||
2645 | if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) | ||
2646 | goto nla_put_failure; | ||
2647 | |||
2648 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2649 | if (nc) | ||
2650 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | ||
2651 | rcu_read_unlock(); | ||
2652 | if (err) | ||
2653 | goto nla_put_failure; | ||
2654 | |||
2655 | nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); | ||
2656 | if (!nla) | ||
2657 | goto nla_put_failure; | ||
2658 | if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || | ||
2659 | nla_put_u32(skb, T_current_state, mdev->state.i) || | ||
2660 | nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || | ||
2661 | nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || | ||
2662 | nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || | ||
2663 | nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || | ||
2664 | nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || | ||
2665 | nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || | ||
2666 | nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || | ||
2667 | nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || | ||
2668 | nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || | ||
2669 | nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || | ||
2670 | nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) | ||
2671 | goto nla_put_failure; | ||
2672 | |||
2673 | if (got_ldev) { | ||
2674 | int err; | ||
2130 | 2675 | ||
2131 | if (get_net_conf(mdev)) { | 2676 | spin_lock_irq(&mdev->ldev->md.uuid_lock); |
2132 | tl = net_conf_to_tags(mdev, mdev->net_conf, tl); | 2677 | err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); |
2133 | put_net_conf(mdev); | 2678 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); |
2679 | |||
2680 | if (err) | ||
2681 | goto nla_put_failure; | ||
2682 | |||
2683 | if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || | ||
2684 | nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || | ||
2685 | nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) | ||
2686 | goto nla_put_failure; | ||
2687 | if (C_SYNC_SOURCE <= mdev->state.conn && | ||
2688 | C_PAUSED_SYNC_T >= mdev->state.conn) { | ||
2689 | if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || | ||
2690 | nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) | ||
2691 | goto nla_put_failure; | ||
2692 | } | ||
2134 | } | 2693 | } |
2135 | tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); | ||
2136 | 2694 | ||
2137 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2695 | if (sib) { |
2696 | switch(sib->sib_reason) { | ||
2697 | case SIB_SYNC_PROGRESS: | ||
2698 | case SIB_GET_STATUS_REPLY: | ||
2699 | break; | ||
2700 | case SIB_STATE_CHANGE: | ||
2701 | if (nla_put_u32(skb, T_prev_state, sib->os.i) || | ||
2702 | nla_put_u32(skb, T_new_state, sib->ns.i)) | ||
2703 | goto nla_put_failure; | ||
2704 | break; | ||
2705 | case SIB_HELPER_POST: | ||
2706 | if (nla_put_u32(skb, T_helper_exit_code, | ||
2707 | sib->helper_exit_code)) | ||
2708 | goto nla_put_failure; | ||
2709 | /* fall through */ | ||
2710 | case SIB_HELPER_PRE: | ||
2711 | if (nla_put_string(skb, T_helper, sib->helper_name)) | ||
2712 | goto nla_put_failure; | ||
2713 | break; | ||
2714 | } | ||
2715 | } | ||
2716 | nla_nest_end(skb, nla); | ||
2138 | 2717 | ||
2139 | return (int)((char *)tl - (char *)reply->tag_list); | 2718 | if (0) |
2719 | nla_put_failure: | ||
2720 | err = -EMSGSIZE; | ||
2721 | if (got_ldev) | ||
2722 | put_ldev(mdev); | ||
2723 | return err; | ||
2140 | } | 2724 | } |
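The tail of nla_put_status_info() uses the kernel's "if (0) label:" idiom: the -EMSGSIZE assignment is reachable only via the goto, while the success path falls straight through to the shared put_ldev() cleanup. A standalone illustration of that control flow (do_work() is hypothetical; compiles as plain C):

        #include <stdio.h>

        /* The error assignment runs only when reached via goto; on success the
         * if (0) guard skips it and both paths share the cleanup below. */
        static int do_work(int fail)
        {
                int err = 0;

                if (fail)
                        goto bail;      /* jump into the guarded statement */

                if (0)
        bail:           err = -1;       /* skipped on the fall-through path */

                printf("cleanup, err=%d\n", err);       /* shared cleanup */
                return err;
        }

        int main(void)
        {
                do_work(0);     /* prints cleanup, err=0  */
                do_work(1);     /* prints cleanup, err=-1 */
                return 0;
        }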
2141 | 2725 | ||
2142 | static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2726 | int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) |
2143 | struct drbd_nl_cfg_reply *reply) | ||
2144 | { | 2727 | { |
2145 | unsigned short *tl = reply->tag_list; | 2728 | enum drbd_ret_code retcode; |
2146 | union drbd_state s = mdev->state; | 2729 | int err; |
2147 | unsigned long rs_left; | ||
2148 | unsigned int res; | ||
2149 | 2730 | ||
2150 | tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); | 2731 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2732 | if (!adm_ctx.reply_skb) | ||
2733 | return retcode; | ||
2734 | if (retcode != NO_ERROR) | ||
2735 | goto out; | ||
2151 | 2736 | ||
2152 | /* no local ref, no bitmap, no syncer progress. */ | 2737 | err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); |
2153 | if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { | 2738 | if (err) { |
2154 | if (get_ldev(mdev)) { | 2739 | nlmsg_free(adm_ctx.reply_skb); |
2155 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 2740 | return err; |
2156 | tl = tl_add_int(tl, T_sync_progress, &res); | ||
2157 | put_ldev(mdev); | ||
2158 | } | ||
2159 | } | 2741 | } |
2160 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2742 | out: |
2161 | 2743 | drbd_adm_finish(info, retcode); | |
2162 | return (int)((char *)tl - (char *)reply->tag_list); | 2744 | return 0; |
2163 | } | 2745 | } |
2164 | 2746 | ||
2165 | static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2747 | int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) |
2166 | struct drbd_nl_cfg_reply *reply) | ||
2167 | { | 2748 | { |
2168 | unsigned short *tl; | 2749 | struct drbd_conf *mdev; |
2169 | 2750 | struct drbd_genlmsghdr *dh; | |
2170 | tl = reply->tag_list; | 2751 | struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; |
2752 | struct drbd_tconn *tconn = NULL; | ||
2753 | struct drbd_tconn *tmp; | ||
2754 | unsigned volume = cb->args[1]; | ||
2755 | |||
2756 | /* Open coded, deferred, iteration: | ||
2757 | * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { | ||
2758 | * idr_for_each_entry(&tconn->volumes, mdev, i) { | ||
2759 | * ... | ||
2760 | * } | ||
2761 | * } | ||
2762 | * where tconn is cb->args[0]; | ||
2763 | * and i is cb->args[1]; | ||
2764 | * | ||
2765 | * cb->args[2] indicates if we shall loop over all resources, | ||
2766 | * or just dump all volumes of a single resource. | ||
2767 | * | ||
2768 | * This may miss entries inserted after this dump started, | ||
2769 | * or entries deleted before they are reached. | ||
2770 | * | ||
2771 | * We need to make sure the mdev won't disappear while | ||
2772 | * we are looking at it, and revalidate our iterators | ||
2773 | * on each iteration. | ||
2774 | */ | ||
2171 | 2775 | ||
2172 | if (get_ldev(mdev)) { | 2776 | /* synchronize with conn_create()/conn_destroy() */ |
2173 | tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); | 2777 | rcu_read_lock(); |
2174 | tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); | 2778 | /* revalidate iterator position */ |
2175 | put_ldev(mdev); | 2779 | list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { |
2780 | if (pos == NULL) { | ||
2781 | /* first iteration */ | ||
2782 | pos = tmp; | ||
2783 | tconn = pos; | ||
2784 | break; | ||
2785 | } | ||
2786 | if (tmp == pos) { | ||
2787 | tconn = pos; | ||
2788 | break; | ||
2789 | } | ||
2176 | } | 2790 | } |
2177 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2791 | if (tconn) { |
2792 | next_tconn: | ||
2793 | mdev = idr_get_next(&tconn->volumes, &volume); | ||
2794 | if (!mdev) { | ||
2795 | /* No more volumes to dump on this tconn. | ||
2796 | * Advance tconn iterator. */ | ||
2797 | pos = list_entry_rcu(tconn->all_tconn.next, | ||
2798 | struct drbd_tconn, all_tconn); | ||
2799 | /* Did we dump any volume on this tconn yet? */ | ||
2800 | if (volume != 0) { | ||
2801 | /* If we reached the end of the list, | ||
2802 | * or only a single resource dump was requested, | ||
2803 | * we are done. */ | ||
2804 | if (&pos->all_tconn == &drbd_tconns || cb->args[2]) | ||
2805 | goto out; | ||
2806 | volume = 0; | ||
2807 | tconn = pos; | ||
2808 | goto next_tconn; | ||
2809 | } | ||
2810 | } | ||
2811 | |||
2812 | dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, | ||
2813 | cb->nlh->nlmsg_seq, &drbd_genl_family, | ||
2814 | NLM_F_MULTI, DRBD_ADM_GET_STATUS); | ||
2815 | if (!dh) | ||
2816 | goto out; | ||
2817 | |||
2818 | if (!mdev) { | ||
2819 | /* This is a tconn without a single volume. | ||
2820 | * Surprisingly enough, it may have a network | ||
2821 | * configuration. */ | ||
2822 | struct net_conf *nc; | ||
2823 | dh->minor = -1U; | ||
2824 | dh->ret_code = NO_ERROR; | ||
2825 | if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) | ||
2826 | goto cancel; | ||
2827 | nc = rcu_dereference(tconn->net_conf); | ||
2828 | if (nc && net_conf_to_skb(skb, nc, 1) != 0) | ||
2829 | goto cancel; | ||
2830 | goto done; | ||
2831 | } | ||
2832 | |||
2833 | D_ASSERT(mdev->vnr == volume); | ||
2834 | D_ASSERT(mdev->tconn == tconn); | ||
2835 | |||
2836 | dh->minor = mdev_to_minor(mdev); | ||
2837 | dh->ret_code = NO_ERROR; | ||
2178 | 2838 | ||
2179 | return (int)((char *)tl - (char *)reply->tag_list); | 2839 | if (nla_put_status_info(skb, mdev, NULL)) { |
2840 | cancel: | ||
2841 | genlmsg_cancel(skb, dh); | ||
2842 | goto out; | ||
2843 | } | ||
2844 | done: | ||
2845 | genlmsg_end(skb, dh); | ||
2846 | } | ||
2847 | |||
2848 | out: | ||
2849 | rcu_read_unlock(); | ||
2850 | /* where to start the next iteration */ | ||
2851 | cb->args[0] = (long)pos; | ||
2852 | cb->args[1] = (pos == tconn) ? volume + 1 : 0; | ||
2853 | |||
2854 | /* No more tconns/volumes/minors found results in an empty skb. | ||
2855 | * Which will terminate the dump. */ | ||
2856 | return skb->len; | ||
2180 | } | 2857 | } |
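get_one_status() is a netlink dump callback: its iteration cursor lives in cb->args[] between invocations, it revalidates that cursor under rcu_read_lock() because resources may appear or vanish between calls, and it ends the dump by returning an skb that contains no message. Stripped of the two-level tconn/volume iteration, the shape is as follows; struct item and the revalidate()/first_item()/next_item()/put_item() helpers are hypothetical:

        /* Netlink dump callback keeping its cursor in cb->args[]. */
        static int dump_items(struct sk_buff *skb, struct netlink_callback *cb)
        {
                struct item *pos = (struct item *)cb->args[0];

                rcu_read_lock();
                if (pos)
                        pos = revalidate(pos);  /* no reference is held across calls,
                                                 * so re-find it in the live list */
                else
                        pos = first_item();     /* first invocation of this dump */

                while (pos) {
                        if (put_item(skb, pos) < 0)     /* skb full: stop for now */
                                break;
                        pos = next_item(pos);
                }
                rcu_read_unlock();

                cb->args[0] = (long)pos;        /* resume point for the next call */
                return skb->len;                /* empty skb terminates the dump */
        }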
2181 | 2858 | ||
2182 | /** | 2859 | /* |
2183 | * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use | 2860 | * Request status of all resources, or of all volumes within a single resource. |
2184 | * @mdev: DRBD device. | 2861 | * |
2185 | * @nlp: Netlink/connector packet from drbdsetup | 2862 | * This is a dump, as the answer may not fit in a single reply skb otherwise. |
2186 | * @reply: Reply packet for drbdsetup | 2863 | * Which means we cannot use the family->attrbuf or other such members, because |
2864 | * dump is NOT protected by the genl_lock(). During dump, we only have access | ||
2865 | * to the incoming skb, and need to opencode "parsing" of the nlattr payload. | ||
2866 | * | ||
2867 | * Once things are setup properly, we call into get_one_status(). | ||
2187 | */ | 2868 | */ |
2188 | static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2869 | int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) |
2189 | struct drbd_nl_cfg_reply *reply) | ||
2190 | { | 2870 | { |
2191 | unsigned short *tl; | 2871 | const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; |
2192 | char rv; | 2872 | struct nlattr *nla; |
2873 | const char *resource_name; | ||
2874 | struct drbd_tconn *tconn; | ||
2875 | int maxtype; | ||
2876 | |||
2877 | /* Is this a followup call? */ | ||
2878 | if (cb->args[0]) { | ||
2879 | /* ... of a single resource dump, | ||
2880 | * and the resource iterator has been advanced already? */ | ||
2881 | if (cb->args[2] && cb->args[2] != cb->args[0]) | ||
2882 | return 0; /* DONE. */ | ||
2883 | goto dump; | ||
2884 | } | ||
2885 | |||
2886 | /* First call (from netlink_dump_start). We need to figure out | ||
2887 | * which resource(s) the user wants us to dump. */ | ||
2888 | nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), | ||
2889 | nlmsg_attrlen(cb->nlh, hdrlen), | ||
2890 | DRBD_NLA_CFG_CONTEXT); | ||
2891 | |||
2892 | /* No explicit context given. Dump all. */ | ||
2893 | if (!nla) | ||
2894 | goto dump; | ||
2895 | maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; | ||
2896 | nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); | ||
2897 | if (IS_ERR(nla)) | ||
2898 | return PTR_ERR(nla); | ||
2899 | /* context given, but no name present? */ | ||
2900 | if (!nla) | ||
2901 | return -EINVAL; | ||
2902 | resource_name = nla_data(nla); | ||
2903 | tconn = conn_get_by_name(resource_name); | ||
2904 | |||
2905 | if (!tconn) | ||
2906 | return -ENODEV; | ||
2907 | |||
2908 | kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ | ||
2909 | |||
2910 | /* prime iterators, and set "filter" mode mark: | ||
2911 | * only dump this tconn. */ | ||
2912 | cb->args[0] = (long)tconn; | ||
2913 | /* cb->args[1] = 0; passed in this way. */ | ||
2914 | cb->args[2] = (long)tconn; | ||
2915 | |||
2916 | dump: | ||
2917 | return get_one_status(skb, cb); | ||
2918 | } | ||
2193 | 2919 | ||
2194 | tl = reply->tag_list; | 2920 | int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) |
2921 | { | ||
2922 | enum drbd_ret_code retcode; | ||
2923 | struct timeout_parms tp; | ||
2924 | int err; | ||
2195 | 2925 | ||
2196 | rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : | 2926 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2197 | test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; | 2927 | if (!adm_ctx.reply_skb) |
2928 | return retcode; | ||
2929 | if (retcode != NO_ERROR) | ||
2930 | goto out; | ||
2198 | 2931 | ||
2199 | tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); | 2932 | tp.timeout_type = |
2200 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 2933 | adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : |
2934 | test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED : | ||
2935 | UT_DEFAULT; | ||
2201 | 2936 | ||
2202 | return (int)((char *)tl - (char *)reply->tag_list); | 2937 | err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); |
2938 | if (err) { | ||
2939 | nlmsg_free(adm_ctx.reply_skb); | ||
2940 | return err; | ||
2941 | } | ||
2942 | out: | ||
2943 | drbd_adm_finish(info, retcode); | ||
2944 | return 0; | ||
2203 | } | 2945 | } |
2204 | 2946 | ||
2205 | static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2947 | int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) |
2206 | struct drbd_nl_cfg_reply *reply) | ||
2207 | { | 2948 | { |
2208 | /* default to resume from last known position, if possible */ | 2949 | struct drbd_conf *mdev; |
2209 | struct start_ov args = | 2950 | enum drbd_ret_code retcode; |
2210 | { .start_sector = mdev->ov_start_sector }; | 2951 | struct start_ov_parms parms; |
2211 | 2952 | ||
2212 | if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { | 2953 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2213 | reply->ret_code = ERR_MANDATORY_TAG; | 2954 | if (!adm_ctx.reply_skb) |
2214 | return 0; | 2955 | return retcode; |
2956 | if (retcode != NO_ERROR) | ||
2957 | goto out; | ||
2958 | |||
2959 | mdev = adm_ctx.mdev; | ||
2960 | |||
2961 | /* resume from last known position, if possible */ | ||
2962 | parms.ov_start_sector = mdev->ov_start_sector; | ||
2963 | parms.ov_stop_sector = ULLONG_MAX; | ||
2964 | if (info->attrs[DRBD_NLA_START_OV_PARMS]) { | ||
2965 | int err = start_ov_parms_from_attrs(&parms, info); | ||
2966 | if (err) { | ||
2967 | retcode = ERR_MANDATORY_TAG; | ||
2968 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
2969 | goto out; | ||
2970 | } | ||
2215 | } | 2971 | } |
2972 | /* w_make_ov_request expects position to be aligned */ | ||
2973 | mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); | ||
2974 | mdev->ov_stop_sector = parms.ov_stop_sector; | ||
2216 | 2975 | ||
2217 | /* If there is still bitmap IO pending, e.g. previous resync or verify | 2976 | /* If there is still bitmap IO pending, e.g. previous resync or verify |
2218 | * just being finished, wait for it before requesting a new resync. */ | 2977 | * just being finished, wait for it before requesting a new resync. */ |
2219 | drbd_suspend_io(mdev); | 2978 | drbd_suspend_io(mdev); |
2220 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2979 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2221 | 2980 | retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | |
2222 | /* w_make_ov_request expects position to be aligned */ | ||
2223 | mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; | ||
2224 | reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); | ||
2225 | drbd_resume_io(mdev); | 2981 | drbd_resume_io(mdev); |
2982 | out: | ||
2983 | drbd_adm_finish(info, retcode); | ||
2226 | return 0; | 2984 | return 0; |
2227 | } | 2985 | } |
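Note the alignment fix folded into drbd_adm_start_ov(): the old code masked the start sector with ~BM_SECT_PER_BIT, which only clears the single bit set in that power-of-two constant, while the new code uses ~(BM_SECT_PER_BIT - 1), which rounds down to a multiple of BM_SECT_PER_BIT as w_make_ov_request expects. The difference in isolation, with a stand-in granularity of 8:

        #include <stdio.h>

        /* For a power-of-two granularity G, x & ~(G - 1) rounds x down to a
         * multiple of G; x & ~G merely clears one bit.  G stands in for
         * BM_SECT_PER_BIT. */
        #define G 8ULL

        int main(void)
        {
                unsigned long long sector = 13;

                printf("%llu & ~G       = %llu\n", sector, sector & ~G);        /* 5 */
                printf("%llu & ~(G - 1) = %llu\n", sector, sector & ~(G - 1));  /* 8 */
                return 0;
        }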
2228 | 2986 | ||
2229 | 2987 | ||
2230 | static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, | 2988 | int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) |
2231 | struct drbd_nl_cfg_reply *reply) | ||
2232 | { | 2989 | { |
2233 | int retcode = NO_ERROR; | 2990 | struct drbd_conf *mdev; |
2991 | enum drbd_ret_code retcode; | ||
2234 | int skip_initial_sync = 0; | 2992 | int skip_initial_sync = 0; |
2235 | int err; | 2993 | int err; |
2994 | struct new_c_uuid_parms args; | ||
2236 | 2995 | ||
2237 | struct new_c_uuid args; | 2996 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2997 | if (!adm_ctx.reply_skb) | ||
2998 | return retcode; | ||
2999 | if (retcode != NO_ERROR) | ||
3000 | goto out_nolock; | ||
2238 | 3001 | ||
2239 | memset(&args, 0, sizeof(struct new_c_uuid)); | 3002 | mdev = adm_ctx.mdev; |
2240 | if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { | 3003 | memset(&args, 0, sizeof(args)); |
2241 | reply->ret_code = ERR_MANDATORY_TAG; | 3004 | if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { |
2242 | return 0; | 3005 | err = new_c_uuid_parms_from_attrs(&args, info); |
3006 | if (err) { | ||
3007 | retcode = ERR_MANDATORY_TAG; | ||
3008 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3009 | goto out_nolock; | ||
3010 | } | ||
2243 | } | 3011 | } |
2244 | 3012 | ||
2245 | mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ | 3013 | mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */ |
2246 | 3014 | ||
2247 | if (!get_ldev(mdev)) { | 3015 | if (!get_ldev(mdev)) { |
2248 | retcode = ERR_NO_DISK; | 3016 | retcode = ERR_NO_DISK; |
@@ -2250,7 +3018,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2250 | } | 3018 | } |
2251 | 3019 | ||
2252 | /* this is "skip initial sync", assume to be clean */ | 3020 | /* this is "skip initial sync", assume to be clean */ |
2253 | if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && | 3021 | if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && |
2254 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { | 3022 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { |
2255 | dev_info(DEV, "Preparing to skip initial sync\n"); | 3023 | dev_info(DEV, "Preparing to skip initial sync\n"); |
2256 | skip_initial_sync = 1; | 3024 | skip_initial_sync = 1; |
@@ -2273,10 +3041,10 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2273 | drbd_send_uuids_skip_initial_sync(mdev); | 3041 | drbd_send_uuids_skip_initial_sync(mdev); |
2274 | _drbd_uuid_set(mdev, UI_BITMAP, 0); | 3042 | _drbd_uuid_set(mdev, UI_BITMAP, 0); |
2275 | drbd_print_uuids(mdev, "cleared bitmap UUID"); | 3043 | drbd_print_uuids(mdev, "cleared bitmap UUID"); |
2276 | spin_lock_irq(&mdev->req_lock); | 3044 | spin_lock_irq(&mdev->tconn->req_lock); |
2277 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), | 3045 | _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), |
2278 | CS_VERBOSE, NULL); | 3046 | CS_VERBOSE, NULL); |
2279 | spin_unlock_irq(&mdev->req_lock); | 3047 | spin_unlock_irq(&mdev->tconn->req_lock); |
2280 | } | 3048 | } |
2281 | } | 3049 | } |
2282 | 3050 | ||
@@ -2284,416 +3052,284 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl | |||
2284 | out_dec: | 3052 | out_dec: |
2285 | put_ldev(mdev); | 3053 | put_ldev(mdev); |
2286 | out: | 3054 | out: |
2287 | mutex_unlock(&mdev->state_mutex); | 3055 | mutex_unlock(mdev->state_mutex); |
2288 | 3056 | out_nolock: | |
2289 | reply->ret_code = retcode; | 3057 | drbd_adm_finish(info, retcode); |
2290 | return 0; | 3058 | return 0; |
2291 | } | 3059 | } |
2292 | 3060 | ||
2293 | struct cn_handler_struct { | 3061 | static enum drbd_ret_code |
2294 | int (*function)(struct drbd_conf *, | 3062 | drbd_check_resource_name(const char *name) |
2295 | struct drbd_nl_cfg_req *, | ||
2296 | struct drbd_nl_cfg_reply *); | ||
2297 | int reply_body_size; | ||
2298 | }; | ||
2299 | |||
2300 | static struct cn_handler_struct cnd_table[] = { | ||
2301 | [ P_primary ] = { &drbd_nl_primary, 0 }, | ||
2302 | [ P_secondary ] = { &drbd_nl_secondary, 0 }, | ||
2303 | [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, | ||
2304 | [ P_detach ] = { &drbd_nl_detach, 0 }, | ||
2305 | [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, | ||
2306 | [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, | ||
2307 | [ P_resize ] = { &drbd_nl_resize, 0 }, | ||
2308 | [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, | ||
2309 | [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, | ||
2310 | [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, | ||
2311 | [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, | ||
2312 | [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, | ||
2313 | [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, | ||
2314 | [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, | ||
2315 | [ P_outdate ] = { &drbd_nl_outdate, 0 }, | ||
2316 | [ P_get_config ] = { &drbd_nl_get_config, | ||
2317 | sizeof(struct syncer_conf_tag_len_struct) + | ||
2318 | sizeof(struct disk_conf_tag_len_struct) + | ||
2319 | sizeof(struct net_conf_tag_len_struct) }, | ||
2320 | [ P_get_state ] = { &drbd_nl_get_state, | ||
2321 | sizeof(struct get_state_tag_len_struct) + | ||
2322 | sizeof(struct sync_progress_tag_len_struct) }, | ||
2323 | [ P_get_uuids ] = { &drbd_nl_get_uuids, | ||
2324 | sizeof(struct get_uuids_tag_len_struct) }, | ||
2325 | [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, | ||
2326 | sizeof(struct get_timeout_flag_tag_len_struct)}, | ||
2327 | [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, | ||
2328 | [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, | ||
2329 | }; | ||
2330 | |||
2331 | static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) | ||
2332 | { | 3063 | { |
2333 | struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; | 3064 | if (!name || !name[0]) { |
2334 | struct cn_handler_struct *cm; | 3065 | drbd_msg_put_info("resource name missing"); |
2335 | struct cn_msg *cn_reply; | 3066 | return ERR_MANDATORY_TAG; |
2336 | struct drbd_nl_cfg_reply *reply; | ||
2337 | struct drbd_conf *mdev; | ||
2338 | int retcode, rr; | ||
2339 | int reply_size = sizeof(struct cn_msg) | ||
2340 | + sizeof(struct drbd_nl_cfg_reply) | ||
2341 | + sizeof(short int); | ||
2342 | |||
2343 | if (!try_module_get(THIS_MODULE)) { | ||
2344 | printk(KERN_ERR "drbd: try_module_get() failed!\n"); | ||
2345 | return; | ||
2346 | } | 3067 | } |
2347 | 3068 | /* if we want to use these in sysfs/configfs/debugfs some day, | |
2348 | if (!capable(CAP_SYS_ADMIN)) { | 3069 | * we must not allow slashes */ |
2349 | retcode = ERR_PERM; | 3070 | if (strchr(name, '/')) { |
2350 | goto fail; | 3071 | drbd_msg_put_info("invalid resource name"); |
2351 | } | 3072 | return ERR_INVALID_REQUEST; |
2352 | |||
2353 | mdev = ensure_mdev(nlp->drbd_minor, | ||
2354 | (nlp->flags & DRBD_NL_CREATE_DEVICE)); | ||
2355 | if (!mdev) { | ||
2356 | retcode = ERR_MINOR_INVALID; | ||
2357 | goto fail; | ||
2358 | } | 3073 | } |
3074 | return NO_ERROR; | ||
3075 | } | ||
2359 | 3076 | ||
2360 | if (nlp->packet_type >= P_nl_after_last_packet || | 3077 | int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) |
2361 | nlp->packet_type == P_return_code_only) { | 3078 | { |
2362 | retcode = ERR_PACKET_NR; | 3079 | enum drbd_ret_code retcode; |
2363 | goto fail; | 3080 | struct res_opts res_opts; |
2364 | } | 3081 | int err; |
2365 | 3082 | ||
2366 | cm = cnd_table + nlp->packet_type; | 3083 | retcode = drbd_adm_prepare(skb, info, 0); |
3084 | if (!adm_ctx.reply_skb) | ||
3085 | return retcode; | ||
3086 | if (retcode != NO_ERROR) | ||
3087 | goto out; | ||
2367 | 3088 | ||
2368 | /* This may happen if packet number is 0: */ | 3089 | set_res_opts_defaults(&res_opts); |
2369 | if (cm->function == NULL) { | 3090 | err = res_opts_from_attrs(&res_opts, info); |
2370 | retcode = ERR_PACKET_NR; | 3091 | if (err && err != -ENOMSG) { |
2371 | goto fail; | 3092 | retcode = ERR_MANDATORY_TAG; |
3093 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | ||
3094 | goto out; | ||
2372 | } | 3095 | } |
2373 | 3096 | ||
2374 | reply_size += cm->reply_body_size; | 3097 | retcode = drbd_check_resource_name(adm_ctx.resource_name); |
3098 | if (retcode != NO_ERROR) | ||
3099 | goto out; | ||
2375 | 3100 | ||
2376 | /* allocation not in the IO path, cqueue thread context */ | 3101 | if (adm_ctx.tconn) { |
2377 | cn_reply = kzalloc(reply_size, GFP_KERNEL); | 3102 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { |
2378 | if (!cn_reply) { | 3103 | retcode = ERR_INVALID_REQUEST; |
2379 | retcode = ERR_NOMEM; | 3104 | drbd_msg_put_info("resource exists"); |
2380 | goto fail; | 3105 | } |
3106 | /* else: still NO_ERROR */ | ||
3107 | goto out; | ||
2381 | } | 3108 | } |
2382 | reply = (struct drbd_nl_cfg_reply *) cn_reply->data; | ||
2383 | |||
2384 | reply->packet_type = | ||
2385 | cm->reply_body_size ? nlp->packet_type : P_return_code_only; | ||
2386 | reply->minor = nlp->drbd_minor; | ||
2387 | reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ | ||
2388 | /* reply->tag_list; might be modified by cm->function. */ | ||
2389 | |||
2390 | rr = cm->function(mdev, nlp, reply); | ||
2391 | |||
2392 | cn_reply->id = req->id; | ||
2393 | cn_reply->seq = req->seq; | ||
2394 | cn_reply->ack = req->ack + 1; | ||
2395 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; | ||
2396 | cn_reply->flags = 0; | ||
2397 | |||
2398 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); | ||
2399 | if (rr && rr != -ESRCH) | ||
2400 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | ||
2401 | 3109 | ||
2402 | kfree(cn_reply); | 3110 | if (!conn_create(adm_ctx.resource_name, &res_opts)) |
2403 | module_put(THIS_MODULE); | 3111 | retcode = ERR_NOMEM; |
2404 | return; | 3112 | out: |
2405 | fail: | 3113 | drbd_adm_finish(info, retcode); |
2406 | drbd_nl_send_reply(req, retcode); | 3114 | return 0; |
2407 | module_put(THIS_MODULE); | ||
2408 | } | 3115 | } |
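drbd_adm_new_resource() (and drbd_adm_add_minor() below) treats an already-existing object as an error only when the request carries NLM_F_EXCL, mirroring open(2)'s O_CREAT|O_EXCL semantics for netlink "new" commands; without the flag, re-creating something that exists is a silent no-op. The check, as a fragment with a hypothetical lookup helper:

        /* Create-with-exclusivity check; lookup_existing() is hypothetical. */
        if (lookup_existing(name)) {
                if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
                        retcode = ERR_INVALID_REQUEST;  /* caller insisted on "new" */
                /* else: keep NO_ERROR, creating an existing object is a no-op */
                goto out;
        }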
2409 | 3116 | ||
2410 | static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ | 3117 | int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) |
2411 | |||
2412 | static unsigned short * | ||
2413 | __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, | ||
2414 | unsigned short len, int nul_terminated) | ||
2415 | { | 3118 | { |
2416 | unsigned short l = tag_descriptions[tag_number(tag)].max_len; | 3119 | struct drbd_genlmsghdr *dh = info->userhdr; |
2417 | len = (len < l) ? len : l; | 3120 | enum drbd_ret_code retcode; |
2418 | put_unaligned(tag, tl++); | ||
2419 | put_unaligned(len, tl++); | ||
2420 | memcpy(tl, data, len); | ||
2421 | tl = (unsigned short*)((char*)tl + len); | ||
2422 | if (nul_terminated) | ||
2423 | *((char*)tl - 1) = 0; | ||
2424 | return tl; | ||
2425 | } | ||
2426 | 3121 | ||
2427 | static unsigned short * | 3122 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2428 | tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) | 3123 | if (!adm_ctx.reply_skb) |
2429 | { | 3124 | return retcode; |
2430 | return __tl_add_blob(tl, tag, data, len, 0); | 3125 | if (retcode != NO_ERROR) |
2431 | } | 3126 | goto out; |
2432 | 3127 | ||
2433 | static unsigned short * | 3128 | if (dh->minor > MINORMASK) { |
2434 | tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) | 3129 | drbd_msg_put_info("requested minor out of range"); |
2435 | { | 3130 | retcode = ERR_INVALID_REQUEST; |
2436 | return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); | 3131 | goto out; |
2437 | } | 3132 | } |
3133 | if (adm_ctx.volume > DRBD_VOLUME_MAX) { | ||
3134 | drbd_msg_put_info("requested volume id out of range"); | ||
3135 | retcode = ERR_INVALID_REQUEST; | ||
3136 | goto out; | ||
3137 | } | ||
2438 | 3138 | ||
2439 | static unsigned short * | 3139 | /* drbd_adm_prepare made sure already |
2440 | tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) | 3140 | * that mdev->tconn and mdev->vnr match the request. */ |
2441 | { | 3141 | if (adm_ctx.mdev) { |
2442 | put_unaligned(tag, tl++); | 3142 | if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) |
2443 | switch(tag_type(tag)) { | 3143 | retcode = ERR_MINOR_EXISTS; |
2444 | case TT_INTEGER: | 3144 | /* else: still NO_ERROR */ |
2445 | put_unaligned(sizeof(int), tl++); | 3145 | goto out; |
2446 | put_unaligned(*(int *)val, (int *)tl); | ||
2447 | tl = (unsigned short*)((char*)tl+sizeof(int)); | ||
2448 | break; | ||
2449 | case TT_INT64: | ||
2450 | put_unaligned(sizeof(u64), tl++); | ||
2451 | put_unaligned(*(u64 *)val, (u64 *)tl); | ||
2452 | tl = (unsigned short*)((char*)tl+sizeof(u64)); | ||
2453 | break; | ||
2454 | default: | ||
2455 | /* someone did something stupid. */ | ||
2456 | ; | ||
2457 | } | 3146 | } |
2458 | return tl; | 3147 | |
3148 | retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); | ||
3149 | out: | ||
3150 | drbd_adm_finish(info, retcode); | ||
3151 | return 0; | ||
2459 | } | 3152 | } |
2460 | 3153 | ||
2461 | void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) | 3154 | static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) |
2462 | { | 3155 | { |
2463 | char buffer[sizeof(struct cn_msg)+ | 3156 | if (mdev->state.disk == D_DISKLESS && |
2464 | sizeof(struct drbd_nl_cfg_reply)+ | 3157 | /* no need to be mdev->state.conn == C_STANDALONE && |
2465 | sizeof(struct get_state_tag_len_struct)+ | 3158 | * we may want to delete a minor from a live replication group. |
2466 | sizeof(short int)]; | 3159 | */ |
2467 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | 3160 | mdev->state.role == R_SECONDARY) { |
2468 | struct drbd_nl_cfg_reply *reply = | 3161 | _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), |
2469 | (struct drbd_nl_cfg_reply *)cn_reply->data; | 3162 | CS_VERBOSE + CS_WAIT_COMPLETE); |
2470 | unsigned short *tl = reply->tag_list; | 3163 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
2471 | 3164 | idr_remove(&minors, mdev_to_minor(mdev)); | |
2472 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | 3165 | del_gendisk(mdev->vdisk); |
2473 | 3166 | synchronize_rcu(); | |
2474 | tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); | 3167 | kref_put(&mdev->kref, &drbd_minor_destroy); |
2475 | 3168 | return NO_ERROR; | |
2476 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3169 | } else |
2477 | 3170 | return ERR_MINOR_CONFIGURED; | |
2478 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2479 | cn_reply->id.val = CN_VAL_DRBD; | ||
2480 | |||
2481 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2482 | cn_reply->ack = 0; /* not used here. */ | ||
2483 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2484 | (int)((char *)tl - (char *)reply->tag_list); | ||
2485 | cn_reply->flags = 0; | ||
2486 | |||
2487 | reply->packet_type = P_get_state; | ||
2488 | reply->minor = mdev_to_minor(mdev); | ||
2489 | reply->ret_code = NO_ERROR; | ||
2490 | |||
2491 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | ||
2492 | } | 3171 | } |
2493 | 3172 | ||
2494 | void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) | 3173 | int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) |
2495 | { | 3174 | { |
2496 | char buffer[sizeof(struct cn_msg)+ | 3175 | enum drbd_ret_code retcode; |
2497 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2498 | sizeof(struct call_helper_tag_len_struct)+ | ||
2499 | sizeof(short int)]; | ||
2500 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2501 | struct drbd_nl_cfg_reply *reply = | ||
2502 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2503 | unsigned short *tl = reply->tag_list; | ||
2504 | |||
2505 | /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ | ||
2506 | |||
2507 | tl = tl_add_str(tl, T_helper, helper_name); | ||
2508 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2509 | |||
2510 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2511 | cn_reply->id.val = CN_VAL_DRBD; | ||
2512 | |||
2513 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | ||
2514 | cn_reply->ack = 0; /* not used here. */ | ||
2515 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2516 | (int)((char *)tl - (char *)reply->tag_list); | ||
2517 | cn_reply->flags = 0; | ||
2518 | 3176 | ||
2519 | reply->packet_type = P_call_helper; | 3177 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); |
2520 | reply->minor = mdev_to_minor(mdev); | 3178 | if (!adm_ctx.reply_skb) |
2521 | reply->ret_code = NO_ERROR; | 3179 | return retcode; |
3180 | if (retcode != NO_ERROR) | ||
3181 | goto out; | ||
2522 | 3182 | ||
2523 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3183 | retcode = adm_delete_minor(adm_ctx.mdev); |
3184 | out: | ||
3185 | drbd_adm_finish(info, retcode); | ||
3186 | return 0; | ||
2524 | } | 3187 | } |
2525 | 3188 | ||
2526 | void drbd_bcast_ee(struct drbd_conf *mdev, | 3189 | int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) |
2527 | const char *reason, const int dgs, | ||
2528 | const char* seen_hash, const char* calc_hash, | ||
2529 | const struct drbd_epoch_entry* e) | ||
2530 | { | 3190 | { |
2531 | struct cn_msg *cn_reply; | 3191 | int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ |
2532 | struct drbd_nl_cfg_reply *reply; | 3192 | struct drbd_conf *mdev; |
2533 | unsigned short *tl; | 3193 | unsigned i; |
2534 | struct page *page; | ||
2535 | unsigned len; | ||
2536 | 3194 | ||
2537 | if (!e) | 3195 | retcode = drbd_adm_prepare(skb, info, 0); |
2538 | return; | 3196 | if (!adm_ctx.reply_skb) |
2539 | if (!reason || !reason[0]) | 3197 | return retcode; |
2540 | return; | 3198 | if (retcode != NO_ERROR) |
3199 | goto out; | ||
2541 | 3200 | ||
2542 | /* apparently we have to memcpy twice, first to prepare the data for the | 3201 | if (!adm_ctx.tconn) { |
2543 | * struct cn_msg, then within cn_netlink_send from the cn_msg to the | 3202 | retcode = ERR_RES_NOT_KNOWN; |
2544 | * netlink skb. */ | 3203 | goto out; |
2545 | /* receiver thread context, which is not in the writeout path (of this node), | ||
2546 | * but may be in the writeout path of the _other_ node. | ||
2547 | * GFP_NOIO to avoid potential "distributed deadlock". */ | ||
2548 | cn_reply = kzalloc( | ||
2549 | sizeof(struct cn_msg)+ | ||
2550 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2551 | sizeof(struct dump_ee_tag_len_struct)+ | ||
2552 | sizeof(short int), | ||
2553 | GFP_NOIO); | ||
2554 | |||
2555 | if (!cn_reply) { | ||
2556 | dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", | ||
2557 | (unsigned long long)e->sector, e->size); | ||
2558 | return; | ||
2559 | } | 3204 | } |
2560 | 3205 | ||
2561 | reply = (struct drbd_nl_cfg_reply*)cn_reply->data; | 3206 | /* demote */ |
2562 | tl = reply->tag_list; | 3207 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2563 | 3208 | retcode = drbd_set_role(mdev, R_SECONDARY, 0); | |
2564 | tl = tl_add_str(tl, T_dump_ee_reason, reason); | 3209 | if (retcode < SS_SUCCESS) { |
2565 | tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); | 3210 | drbd_msg_put_info("failed to demote"); |
2566 | tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); | 3211 | goto out; |
2567 | tl = tl_add_int(tl, T_ee_sector, &e->sector); | 3212 | } |
2568 | tl = tl_add_int(tl, T_ee_block_id, &e->block_id); | ||
2569 | |||
2570 | /* dump the first 32k */ | ||
2571 | len = min_t(unsigned, e->size, 32 << 10); | ||
2572 | put_unaligned(T_ee_data, tl++); | ||
2573 | put_unaligned(len, tl++); | ||
2574 | |||
2575 | page = e->pages; | ||
2576 | page_chain_for_each(page) { | ||
2577 | void *d = kmap_atomic(page); | ||
2578 | unsigned l = min_t(unsigned, len, PAGE_SIZE); | ||
2579 | memcpy(tl, d, l); | ||
2580 | kunmap_atomic(d); | ||
2581 | tl = (unsigned short*)((char*)tl + l); | ||
2582 | len -= l; | ||
2583 | if (len == 0) | ||
2584 | break; | ||
2585 | } | 3213 | } |
2586 | put_unaligned(TT_END, tl++); /* Close the tag list */ | ||
2587 | |||
2588 | cn_reply->id.idx = CN_IDX_DRBD; | ||
2589 | cn_reply->id.val = CN_VAL_DRBD; | ||
2590 | |||
2591 | cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); | ||
2592 | cn_reply->ack = 0; // not used here. | ||
2593 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | ||
2594 | (int)((char*)tl - (char*)reply->tag_list); | ||
2595 | cn_reply->flags = 0; | ||
2596 | |||
2597 | reply->packet_type = P_dump_ee; | ||
2598 | reply->minor = mdev_to_minor(mdev); | ||
2599 | reply->ret_code = NO_ERROR; | ||
2600 | 3214 | ||
2601 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3215 | retcode = conn_try_disconnect(adm_ctx.tconn, 0); |
2602 | kfree(cn_reply); | 3216 | if (retcode < SS_SUCCESS) { |
2603 | } | 3217 | drbd_msg_put_info("failed to disconnect"); |
2604 | 3218 | goto out; | |
2605 | void drbd_bcast_sync_progress(struct drbd_conf *mdev) | 3219 | } |
2606 | { | ||
2607 | char buffer[sizeof(struct cn_msg)+ | ||
2608 | sizeof(struct drbd_nl_cfg_reply)+ | ||
2609 | sizeof(struct sync_progress_tag_len_struct)+ | ||
2610 | sizeof(short int)]; | ||
2611 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | ||
2612 | struct drbd_nl_cfg_reply *reply = | ||
2613 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2614 | unsigned short *tl = reply->tag_list; | ||
2615 | unsigned long rs_left; | ||
2616 | unsigned int res; | ||
2617 | 3220 | ||
2618 | /* no local ref, no bitmap, no syncer progress, no broadcast. */ | 3221 | /* detach */ |
2619 | if (!get_ldev(mdev)) | 3222 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2620 | return; | 3223 | retcode = adm_detach(mdev, 0); |
2621 | drbd_get_syncer_progress(mdev, &rs_left, &res); | 3224 | if (retcode < SS_SUCCESS || retcode > NO_ERROR) { |
2622 | put_ldev(mdev); | 3225 | drbd_msg_put_info("failed to detach"); |
3226 | goto out; | ||
3227 | } | ||
3228 | } | ||
2623 | 3229 | ||
2624 | tl = tl_add_int(tl, T_sync_progress, &res); | 3230 | /* If we reach this, all volumes (of this tconn) are Secondary, |
2625 | put_unaligned(TT_END, tl++); /* Close the tag list */ | 3231 | * Disconnected, Diskless, aka Unconfigured. Make sure all threads have |
3232 | * actually stopped, state handling only does drbd_thread_stop_nowait(). */ | ||
3233 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
2626 | 3234 | ||
2627 | cn_reply->id.idx = CN_IDX_DRBD; | 3235 | /* Now, nothing can fail anymore */ |
2628 | cn_reply->id.val = CN_VAL_DRBD; | ||
2629 | 3236 | ||
2630 | cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); | 3237 | /* delete volumes */ |
2631 | cn_reply->ack = 0; /* not used here. */ | 3238 | idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { |
2632 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + | 3239 | retcode = adm_delete_minor(mdev); |
2633 | (int)((char *)tl - (char *)reply->tag_list); | 3240 | if (retcode != NO_ERROR) { |
2634 | cn_reply->flags = 0; | 3241 | /* "can not happen" */ |
3242 | drbd_msg_put_info("failed to delete volume"); | ||
3243 | goto out; | ||
3244 | } | ||
3245 | } | ||
2635 | 3246 | ||
2636 | reply->packet_type = P_sync_progress; | 3247 | /* delete connection */ |
2637 | reply->minor = mdev_to_minor(mdev); | 3248 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2638 | reply->ret_code = NO_ERROR; | 3249 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
3250 | synchronize_rcu(); | ||
3251 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
2639 | 3252 | ||
2640 | cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3253 | retcode = NO_ERROR; |
3254 | } else { | ||
3255 | /* "can not happen" */ | ||
3256 | retcode = ERR_RES_IN_USE; | ||
3257 | drbd_msg_put_info("failed to delete connection"); | ||
3258 | } | ||
3259 | goto out; | ||
3260 | out: | ||
3261 | drbd_adm_finish(info, retcode); | ||
3262 | return 0; | ||
2641 | } | 3263 | } |
2642 | 3264 | ||
2643 | int __init drbd_nl_init(void) | 3265 | int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) |
2644 | { | 3266 | { |
2645 | static struct cb_id cn_id_drbd; | 3267 | enum drbd_ret_code retcode; |
2646 | int err, try=10; | ||
2647 | 3268 | ||
2648 | cn_id_drbd.val = CN_VAL_DRBD; | 3269 | retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); |
2649 | do { | 3270 | if (!adm_ctx.reply_skb) |
2650 | cn_id_drbd.idx = cn_idx; | 3271 | return retcode; |
2651 | err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); | 3272 | if (retcode != NO_ERROR) |
2652 | if (!err) | 3273 | goto out; |
2653 | break; | ||
2654 | cn_idx = (cn_idx + CN_IDX_STEP); | ||
2655 | } while (try--); | ||
2656 | 3274 | ||
2657 | if (err) { | 3275 | if (conn_lowest_minor(adm_ctx.tconn) < 0) { |
2658 | printk(KERN_ERR "drbd: cn_drbd failed to register\n"); | 3276 | list_del_rcu(&adm_ctx.tconn->all_tconn); |
2659 | return err; | 3277 | synchronize_rcu(); |
3278 | kref_put(&adm_ctx.tconn->kref, &conn_destroy); | ||
3279 | |||
3280 | retcode = NO_ERROR; | ||
3281 | } else { | ||
3282 | retcode = ERR_RES_IN_USE; | ||
2660 | } | 3283 | } |
2661 | 3284 | ||
3285 | if (retcode == NO_ERROR) | ||
3286 | drbd_thread_stop(&adm_ctx.tconn->worker); | ||
3287 | out: | ||
3288 | drbd_adm_finish(info, retcode); | ||
2662 | return 0; | 3289 | return 0; |
2663 | } | 3290 | } |
2664 | 3291 | ||
2665 | void drbd_nl_cleanup(void) | 3292 | void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) |
2666 | { | 3293 | { |
2667 | static struct cb_id cn_id_drbd; | 3294 | static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ |
2668 | 3295 | struct sk_buff *msg; | |
2669 | cn_id_drbd.idx = cn_idx; | 3296 | struct drbd_genlmsghdr *d_out; |
2670 | cn_id_drbd.val = CN_VAL_DRBD; | 3297 | unsigned seq; |
2671 | 3298 | int err = -ENOMEM; | |
2672 | cn_del_callback(&cn_id_drbd); | 3299 | |
2673 | } | 3300 | if (sib->sib_reason == SIB_SYNC_PROGRESS) { |
3301 | if (time_after(jiffies, mdev->rs_last_bcast + HZ)) | ||
3302 | mdev->rs_last_bcast = jiffies; | ||
3303 | else | ||
3304 | return; | ||
3305 | } | ||
2674 | 3306 | ||
2675 | void drbd_nl_send_reply(struct cn_msg *req, int ret_code) | 3307 | seq = atomic_inc_return(&drbd_genl_seq); |
2676 | { | 3308 | msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); |
2677 | char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; | 3309 | if (!msg) |
2678 | struct cn_msg *cn_reply = (struct cn_msg *) buffer; | 3310 | goto failed; |
2679 | struct drbd_nl_cfg_reply *reply = | ||
2680 | (struct drbd_nl_cfg_reply *)cn_reply->data; | ||
2681 | int rr; | ||
2682 | 3311 | ||
2683 | memset(buffer, 0, sizeof(buffer)); | 3312 | err = -EMSGSIZE; |
2684 | cn_reply->id = req->id; | 3313 | d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); |
3314 | if (!d_out) /* cannot happen, but anyways. */ | ||
3315 | goto nla_put_failure; | ||
3316 | d_out->minor = mdev_to_minor(mdev); | ||
3317 | d_out->ret_code = NO_ERROR; | ||
2685 | 3318 | ||
2686 | cn_reply->seq = req->seq; | 3319 | if (nla_put_status_info(msg, mdev, sib)) |
2687 | cn_reply->ack = req->ack + 1; | 3320 | goto nla_put_failure; |
2688 | cn_reply->len = sizeof(struct drbd_nl_cfg_reply); | 3321 | genlmsg_end(msg, d_out); |
2689 | cn_reply->flags = 0; | 3322 | err = drbd_genl_multicast_events(msg, 0); |
3323 | /* msg has been consumed or freed in netlink_broadcast() */ | ||
3324 | if (err && err != -ESRCH) | ||
3325 | goto failed; | ||
2690 | 3326 | ||
2691 | reply->packet_type = P_return_code_only; | 3327 | return; |
2692 | reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; | ||
2693 | reply->ret_code = ret_code; | ||
2694 | 3328 | ||
2695 | rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); | 3329 | nla_put_failure: |
2696 | if (rr && rr != -ESRCH) | 3330 | nlmsg_free(msg); |
2697 | printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); | 3331 | failed: |
3332 | dev_err(DEV, "Error %d while broadcasting event. " | ||
3333 | "Event seq:%u sib_reason:%u\n", | ||
3334 | err, seq, sib->sib_reason); | ||
2698 | } | 3335 | } |
2699 | |||
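Editorial note: the rewritten drbd_bcast_event() above throttles SIB_SYNC_PROGRESS broadcasts to at most one per second by comparing jiffies against mdev->rs_last_bcast. A minimal sketch of that throttle, using a hypothetical last_bcast variable in place of the DRBD field:

/* Editorial sketch, not part of the patch: once-per-second throttle,
 * with "last_bcast" as a hypothetical stand-in for mdev->rs_last_bcast. */
#include <linux/jiffies.h>
#include <linux/types.h>

static unsigned long last_bcast;

static bool may_broadcast_now(void)
{
        /* time_after() copes with jiffies wrap-around */
        if (time_after(jiffies, last_bcast + HZ)) {
                last_bcast = jiffies;
                return true;
        }
        return false;
}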
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000000..fa672b6df8d6 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.c | |||
@@ -0,0 +1,55 @@ | |||
1 | #include "drbd_wrappers.h" | ||
2 | #include <linux/kernel.h> | ||
3 | #include <net/netlink.h> | ||
4 | #include <linux/drbd_genl_api.h> | ||
5 | #include "drbd_nla.h" | ||
6 | |||
7 | static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) | ||
8 | { | ||
9 | struct nlattr *head = nla_data(nla); | ||
10 | int len = nla_len(nla); | ||
11 | int rem; | ||
12 | |||
13 | /* | ||
14 | * validate_nla (called from nla_parse_nested) ignores attributes | ||
15 | * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. | ||
16 | * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY | ||
17 | * flag set also, check and remove that flag before calling | ||
18 | * nla_parse_nested. | ||
19 | */ | ||
20 | |||
21 | nla_for_each_attr(nla, head, len, rem) { | ||
22 | if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { | ||
23 | nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; | ||
24 | if (nla_type(nla) > maxtype) | ||
25 | return -EOPNOTSUPP; | ||
26 | } | ||
27 | } | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
32 | const struct nla_policy *policy) | ||
33 | { | ||
34 | int err; | ||
35 | |||
36 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
37 | if (!err) | ||
38 | err = nla_parse_nested(tb, maxtype, nla, policy); | ||
39 | |||
40 | return err; | ||
41 | } | ||
42 | |||
43 | struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) | ||
44 | { | ||
45 | int err; | ||
46 | /* | ||
47 | * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and | ||
48 | * we don't know about that attribute, reject all the nested | ||
49 | * attributes. | ||
50 | */ | ||
51 | err = drbd_nla_check_mandatory(maxtype, nla); | ||
52 | if (err) | ||
53 | return ERR_PTR(err); | ||
54 | return nla_find_nested(nla, attrtype); | ||
55 | } | ||
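Editorial note: for reference, a hypothetical caller of the new drbd_nla_parse_nested() helper could look like the sketch below; EXAMPLE_ATTR_MAX and example_policy are illustrative names only, not taken from the DRBD sources:

/* Editorial sketch of a hypothetical caller; EXAMPLE_ATTR_MAX and
 * example_policy are made-up names, not DRBD definitions. */
static int example_parse(struct nlattr *nested_attr)
{
        struct nlattr *tb[EXAMPLE_ATTR_MAX + 1];
        int err;

        /* drbd_nla_parse_nested() strips DRBD_GENLA_F_MANDATORY before
         * calling nla_parse_nested(), and rejects attributes that carry
         * the flag but are unknown (type > maxtype). */
        err = drbd_nla_parse_nested(tb, EXAMPLE_ATTR_MAX,
                                    nested_attr, example_policy);
        if (err)
                return err;

        /* tb[type] now points at each recognized attribute, or is NULL */
        return 0;
}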
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000000..679c2d5b4535 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.h | |||
@@ -0,0 +1,8 @@ | |||
1 | #ifndef __DRBD_NLA_H | ||
2 | #define __DRBD_NLA_H | ||
3 | |||
4 | extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, | ||
5 | const struct nla_policy *policy); | ||
6 | extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); | ||
7 | |||
8 | #endif /* __DRBD_NLA_H */ | ||
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 5496104f90b9..56672a61eb94 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -167,18 +167,24 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) | |||
167 | * we convert to sectors in the display below. */ | 167 | * we convert to sectors in the display below. */ |
168 | unsigned long bm_bits = drbd_bm_bits(mdev); | 168 | unsigned long bm_bits = drbd_bm_bits(mdev); |
169 | unsigned long bit_pos; | 169 | unsigned long bit_pos; |
170 | unsigned long long stop_sector = 0; | ||
170 | if (mdev->state.conn == C_VERIFY_S || | 171 | if (mdev->state.conn == C_VERIFY_S || |
171 | mdev->state.conn == C_VERIFY_T) | 172 | mdev->state.conn == C_VERIFY_T) { |
172 | bit_pos = bm_bits - mdev->ov_left; | 173 | bit_pos = bm_bits - mdev->ov_left; |
173 | else | 174 | if (verify_can_do_stop_sector(mdev)) |
175 | stop_sector = mdev->ov_stop_sector; | ||
176 | } else | ||
174 | bit_pos = mdev->bm_resync_fo; | 177 | bit_pos = mdev->bm_resync_fo; |
175 | /* Total sectors may be slightly off for oddly | 178 | /* Total sectors may be slightly off for oddly |
176 | * sized devices. So what. */ | 179 | * sized devices. So what. */ |
177 | seq_printf(seq, | 180 | seq_printf(seq, |
178 | "\t%3d%% sector pos: %llu/%llu\n", | 181 | "\t%3d%% sector pos: %llu/%llu", |
179 | (int)(bit_pos / (bm_bits/100+1)), | 182 | (int)(bit_pos / (bm_bits/100+1)), |
180 | (unsigned long long)bit_pos * BM_SECT_PER_BIT, | 183 | (unsigned long long)bit_pos * BM_SECT_PER_BIT, |
181 | (unsigned long long)bm_bits * BM_SECT_PER_BIT); | 184 | (unsigned long long)bm_bits * BM_SECT_PER_BIT); |
185 | if (stop_sector != 0 && stop_sector != ULLONG_MAX) | ||
186 | seq_printf(seq, " stop sector: %llu", stop_sector); | ||
187 | seq_printf(seq, "\n"); | ||
182 | } | 188 | } |
183 | } | 189 | } |
184 | 190 | ||
@@ -194,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) | |||
194 | 200 | ||
195 | static int drbd_seq_show(struct seq_file *seq, void *v) | 201 | static int drbd_seq_show(struct seq_file *seq, void *v) |
196 | { | 202 | { |
197 | int i, hole = 0; | 203 | int i, prev_i = -1; |
198 | const char *sn; | 204 | const char *sn; |
199 | struct drbd_conf *mdev; | 205 | struct drbd_conf *mdev; |
206 | struct net_conf *nc; | ||
207 | char wp; | ||
200 | 208 | ||
201 | static char write_ordering_chars[] = { | 209 | static char write_ordering_chars[] = { |
202 | [WO_none] = 'n', | 210 | [WO_none] = 'n', |
@@ -227,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
227 | oos .. known out-of-sync kB | 235 | oos .. known out-of-sync kB |
228 | */ | 236 | */ |
229 | 237 | ||
230 | for (i = 0; i < minor_count; i++) { | 238 | rcu_read_lock(); |
231 | mdev = minor_to_mdev(i); | 239 | idr_for_each_entry(&minors, mdev, i) { |
232 | if (!mdev) { | 240 | if (prev_i != i - 1) |
233 | hole = 1; | ||
234 | continue; | ||
235 | } | ||
236 | if (hole) { | ||
237 | hole = 0; | ||
238 | seq_printf(seq, "\n"); | 241 | seq_printf(seq, "\n"); |
239 | } | 242 | prev_i = i; |
240 | 243 | ||
241 | sn = drbd_conn_str(mdev->state.conn); | 244 | sn = drbd_conn_str(mdev->state.conn); |
242 | 245 | ||
@@ -248,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
248 | /* reset mdev->congestion_reason */ | 251 | /* reset mdev->congestion_reason */ |
249 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); | 252 | bdi_rw_congested(&mdev->rq_queue->backing_dev_info); |
250 | 253 | ||
254 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
255 | wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; | ||
251 | seq_printf(seq, | 256 | seq_printf(seq, |
252 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" | 257 | "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" |
253 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " | 258 | " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " |
@@ -257,9 +262,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
257 | drbd_role_str(mdev->state.peer), | 262 | drbd_role_str(mdev->state.peer), |
258 | drbd_disk_str(mdev->state.disk), | 263 | drbd_disk_str(mdev->state.disk), |
259 | drbd_disk_str(mdev->state.pdsk), | 264 | drbd_disk_str(mdev->state.pdsk), |
260 | (mdev->net_conf == NULL ? ' ' : | 265 | wp, |
261 | (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), | 266 | drbd_suspended(mdev) ? 's' : 'r', |
262 | is_susp(mdev->state) ? 's' : 'r', | ||
263 | mdev->state.aftr_isp ? 'a' : '-', | 267 | mdev->state.aftr_isp ? 'a' : '-', |
264 | mdev->state.peer_isp ? 'p' : '-', | 268 | mdev->state.peer_isp ? 'p' : '-', |
265 | mdev->state.user_isp ? 'u' : '-', | 269 | mdev->state.user_isp ? 'u' : '-', |
@@ -276,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
276 | atomic_read(&mdev->rs_pending_cnt), | 280 | atomic_read(&mdev->rs_pending_cnt), |
277 | atomic_read(&mdev->unacked_cnt), | 281 | atomic_read(&mdev->unacked_cnt), |
278 | atomic_read(&mdev->ap_bio_cnt), | 282 | atomic_read(&mdev->ap_bio_cnt), |
279 | mdev->epochs, | 283 | mdev->tconn->epochs, |
280 | write_ordering_chars[mdev->write_ordering] | 284 | write_ordering_chars[mdev->tconn->write_ordering] |
281 | ); | 285 | ); |
282 | seq_printf(seq, " oos:%llu\n", | 286 | seq_printf(seq, " oos:%llu\n", |
283 | Bit2KB((unsigned long long) | 287 | Bit2KB((unsigned long long) |
@@ -302,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
302 | } | 306 | } |
303 | } | 307 | } |
304 | } | 308 | } |
309 | rcu_read_unlock(); | ||
305 | 310 | ||
306 | return 0; | 311 | return 0; |
307 | } | 312 | } |
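Editorial note: the /proc code above now walks the minors idr under rcu_read_lock() instead of probing every minor number up to minor_count. A minimal sketch of that idr-under-RCU iteration, with a hypothetical idr and element type:

/* Editorial sketch of the iteration pattern; "struct item" and
 * "my_idr" are hypothetical, not DRBD structures. */
#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct item {
        int value;
};

static void walk_items(struct idr *my_idr)
{
        struct item *it;
        int id;

        rcu_read_lock();
        idr_for_each_entry(my_idr, it, id)
                pr_info("item %d: value %d\n", id, it->value);
        rcu_read_unlock();
}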
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c74ca2df7431..a9eccfc6079b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -48,17 +48,25 @@ | |||
48 | 48 | ||
49 | #include "drbd_vli.h" | 49 | #include "drbd_vli.h" |
50 | 50 | ||
51 | struct packet_info { | ||
52 | enum drbd_packet cmd; | ||
53 | unsigned int size; | ||
54 | unsigned int vnr; | ||
55 | void *data; | ||
56 | }; | ||
57 | |||
51 | enum finish_epoch { | 58 | enum finish_epoch { |
52 | FE_STILL_LIVE, | 59 | FE_STILL_LIVE, |
53 | FE_DESTROYED, | 60 | FE_DESTROYED, |
54 | FE_RECYCLED, | 61 | FE_RECYCLED, |
55 | }; | 62 | }; |
56 | 63 | ||
57 | static int drbd_do_handshake(struct drbd_conf *mdev); | 64 | static int drbd_do_features(struct drbd_tconn *tconn); |
58 | static int drbd_do_auth(struct drbd_conf *mdev); | 65 | static int drbd_do_auth(struct drbd_tconn *tconn); |
66 | static int drbd_disconnected(struct drbd_conf *mdev); | ||
59 | 67 | ||
60 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); | 68 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event); |
61 | static int e_end_block(struct drbd_conf *, struct drbd_work *, int); | 69 | static int e_end_block(struct drbd_work *, int); |
62 | 70 | ||
63 | 71 | ||
64 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 72 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
@@ -142,11 +150,12 @@ static void page_chain_add(struct page **head, | |||
142 | *head = chain_first; | 150 | *head = chain_first; |
143 | } | 151 | } |
144 | 152 | ||
145 | static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) | 153 | static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, |
154 | unsigned int number) | ||
146 | { | 155 | { |
147 | struct page *page = NULL; | 156 | struct page *page = NULL; |
148 | struct page *tmp = NULL; | 157 | struct page *tmp = NULL; |
149 | int i = 0; | 158 | unsigned int i = 0; |
150 | 159 | ||
151 | /* Yes, testing drbd_pp_vacant outside the lock is racy. | 160 | /* Yes, testing drbd_pp_vacant outside the lock is racy. |
152 | * So what. It saves a spin_lock. */ | 161 | * So what. It saves a spin_lock. */ |
@@ -175,7 +184,7 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
175 | return page; | 184 | return page; |
176 | 185 | ||
177 | /* Not enough pages immediately available this time. | 186 | /* Not enough pages immediately available this time. |
178 | * No need to jump around here, drbd_pp_alloc will retry this | 187 | * No need to jump around here, drbd_alloc_pages will retry this |
179 | * function "soon". */ | 188 | * function "soon". */ |
180 | if (page) { | 189 | if (page) { |
181 | tmp = page_chain_tail(page, NULL); | 190 | tmp = page_chain_tail(page, NULL); |
@@ -187,9 +196,10 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int | |||
187 | return NULL; | 196 | return NULL; |
188 | } | 197 | } |
189 | 198 | ||
190 | static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) | 199 | static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, |
200 | struct list_head *to_be_freed) | ||
191 | { | 201 | { |
192 | struct drbd_epoch_entry *e; | 202 | struct drbd_peer_request *peer_req; |
193 | struct list_head *le, *tle; | 203 | struct list_head *le, *tle; |
194 | 204 | ||
195 | /* The EEs are always appended to the end of the list. Since | 205 | /* The EEs are always appended to the end of the list. Since |
@@ -198,8 +208,8 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
198 | stop to examine the list... */ | 208 | stop to examine the list... */ |
199 | 209 | ||
200 | list_for_each_safe(le, tle, &mdev->net_ee) { | 210 | list_for_each_safe(le, tle, &mdev->net_ee) { |
201 | e = list_entry(le, struct drbd_epoch_entry, w.list); | 211 | peer_req = list_entry(le, struct drbd_peer_request, w.list); |
202 | if (drbd_ee_has_active_page(e)) | 212 | if (drbd_peer_req_has_active_page(peer_req)) |
203 | break; | 213 | break; |
204 | list_move(le, to_be_freed); | 214 | list_move(le, to_be_freed); |
205 | } | 215 | } |
@@ -208,18 +218,18 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed | |||
208 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | 218 | static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) |
209 | { | 219 | { |
210 | LIST_HEAD(reclaimed); | 220 | LIST_HEAD(reclaimed); |
211 | struct drbd_epoch_entry *e, *t; | 221 | struct drbd_peer_request *peer_req, *t; |
212 | 222 | ||
213 | spin_lock_irq(&mdev->req_lock); | 223 | spin_lock_irq(&mdev->tconn->req_lock); |
214 | reclaim_net_ee(mdev, &reclaimed); | 224 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
215 | spin_unlock_irq(&mdev->req_lock); | 225 | spin_unlock_irq(&mdev->tconn->req_lock); |
216 | 226 | ||
217 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 227 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
218 | drbd_free_net_ee(mdev, e); | 228 | drbd_free_net_peer_req(mdev, peer_req); |
219 | } | 229 | } |
220 | 230 | ||
221 | /** | 231 | /** |
222 | * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) | 232 | * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) |
223 | * @mdev: DRBD device. | 233 | * @mdev: DRBD device. |
224 | * @number: number of pages requested | 234 | * @number: number of pages requested |
225 | * @retry: whether to retry, if not enough pages are available right now | 235 | * @retry: whether to retry, if not enough pages are available right now |
@@ -230,23 +240,31 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) | |||
230 | * | 240 | * |
231 | * Returns a page chain linked via page->private. | 241 | * Returns a page chain linked via page->private. |
232 | */ | 242 | */ |
233 | static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) | 243 | struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, |
244 | bool retry) | ||
234 | { | 245 | { |
235 | struct page *page = NULL; | 246 | struct page *page = NULL; |
247 | struct net_conf *nc; | ||
236 | DEFINE_WAIT(wait); | 248 | DEFINE_WAIT(wait); |
249 | int mxb; | ||
237 | 250 | ||
238 | /* Yes, we may run up to @number over max_buffers. If we | 251 | /* Yes, we may run up to @number over max_buffers. If we |
239 | * follow it strictly, the admin will get it wrong anyways. */ | 252 | * follow it strictly, the admin will get it wrong anyways. */ |
240 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) | 253 | rcu_read_lock(); |
241 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 254 | nc = rcu_dereference(mdev->tconn->net_conf); |
255 | mxb = nc ? nc->max_buffers : 1000000; | ||
256 | rcu_read_unlock(); | ||
257 | |||
258 | if (atomic_read(&mdev->pp_in_use) < mxb) | ||
259 | page = __drbd_alloc_pages(mdev, number); | ||
242 | 260 | ||
243 | while (page == NULL) { | 261 | while (page == NULL) { |
244 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); | 262 | prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); |
245 | 263 | ||
246 | drbd_kick_lo_and_reclaim_net(mdev); | 264 | drbd_kick_lo_and_reclaim_net(mdev); |
247 | 265 | ||
248 | if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { | 266 | if (atomic_read(&mdev->pp_in_use) < mxb) { |
249 | page = drbd_pp_first_pages_or_try_alloc(mdev, number); | 267 | page = __drbd_alloc_pages(mdev, number); |
250 | if (page) | 268 | if (page) |
251 | break; | 269 | break; |
252 | } | 270 | } |
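Editorial note: the hunk above reads max_buffers through rcu_read_lock()/rcu_dereference() and copies the value out before any sleeping allocation, which is the access pattern this series uses for net_conf throughout. A minimal, stand-alone sketch of that read path, assuming a hypothetical my_conf/conf_ptr pair:

/* Editorial sketch of reading an RCU-protected config object;
 * "struct my_conf" and "conf_ptr" are hypothetical. */
#include <linux/rcupdate.h>

struct my_conf {
        int max_buffers;
};

static struct my_conf __rcu *conf_ptr;

static int read_max_buffers(void)
{
        struct my_conf *c;
        int mxb;

        rcu_read_lock();
        c = rcu_dereference(conf_ptr);
        mxb = c ? c->max_buffers : 1000000;     /* same fallback as above */
        rcu_read_unlock();

        return mxb;     /* safe after unlock: it is a plain copy */
}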
@@ -255,7 +273,7 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
255 | break; | 273 | break; |
256 | 274 | ||
257 | if (signal_pending(current)) { | 275 | if (signal_pending(current)) { |
258 | dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); | 276 | dev_warn(DEV, "drbd_alloc_pages interrupted!\n"); |
259 | break; | 277 | break; |
260 | } | 278 | } |
261 | 279 | ||
@@ -268,11 +286,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool | |||
268 | return page; | 286 | return page; |
269 | } | 287 | } |
270 | 288 | ||
271 | /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. | 289 | /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. |
272 | * Is also used from inside an other spin_lock_irq(&mdev->req_lock); | 290 | * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock); |
273 | * Either links the page chain back to the global pool, | 291 | * Either links the page chain back to the global pool, |
274 | * or returns all pages to the system. */ | 292 | * or returns all pages to the system. */ |
275 | static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | 293 | static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net) |
276 | { | 294 | { |
277 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; | 295 | atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; |
278 | int i; | 296 | int i; |
@@ -280,7 +298,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) | |||
280 | if (page == NULL) | 298 | if (page == NULL) |
281 | return; | 299 | return; |
282 | 300 | ||
283 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) | 301 | if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) |
284 | i = page_chain_free(page); | 302 | i = page_chain_free(page); |
285 | else { | 303 | else { |
286 | struct page *tmp; | 304 | struct page *tmp; |
@@ -302,127 +320,130 @@ You need to hold the req_lock: | |||
302 | _drbd_wait_ee_list_empty() | 320 | _drbd_wait_ee_list_empty() |
303 | 321 | ||
304 | You must not have the req_lock: | 322 | You must not have the req_lock: |
305 | drbd_free_ee() | 323 | drbd_free_peer_req() |
306 | drbd_alloc_ee() | 324 | drbd_alloc_peer_req() |
307 | drbd_init_ee() | 325 | drbd_free_peer_reqs() |
308 | drbd_release_ee() | ||
309 | drbd_ee_fix_bhs() | 326 | drbd_ee_fix_bhs() |
310 | drbd_process_done_ee() | 327 | drbd_finish_peer_reqs() |
311 | drbd_clear_done_ee() | 328 | drbd_clear_done_ee() |
312 | drbd_wait_ee_list_empty() | 329 | drbd_wait_ee_list_empty() |
313 | */ | 330 | */ |
314 | 331 | ||
315 | struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, | 332 | struct drbd_peer_request * |
316 | u64 id, | 333 | drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector, |
317 | sector_t sector, | 334 | unsigned int data_size, gfp_t gfp_mask) __must_hold(local) |
318 | unsigned int data_size, | ||
319 | gfp_t gfp_mask) __must_hold(local) | ||
320 | { | 335 | { |
321 | struct drbd_epoch_entry *e; | 336 | struct drbd_peer_request *peer_req; |
322 | struct page *page = NULL; | 337 | struct page *page = NULL; |
323 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; | 338 | unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; |
324 | 339 | ||
325 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) | 340 | if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) |
326 | return NULL; | 341 | return NULL; |
327 | 342 | ||
328 | e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); | 343 | peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); |
329 | if (!e) { | 344 | if (!peer_req) { |
330 | if (!(gfp_mask & __GFP_NOWARN)) | 345 | if (!(gfp_mask & __GFP_NOWARN)) |
331 | dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); | 346 | dev_err(DEV, "%s: allocation failed\n", __func__); |
332 | return NULL; | 347 | return NULL; |
333 | } | 348 | } |
334 | 349 | ||
335 | if (data_size) { | 350 | if (data_size) { |
336 | page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); | 351 | page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); |
337 | if (!page) | 352 | if (!page) |
338 | goto fail; | 353 | goto fail; |
339 | } | 354 | } |
340 | 355 | ||
341 | INIT_HLIST_NODE(&e->collision); | 356 | drbd_clear_interval(&peer_req->i); |
342 | e->epoch = NULL; | 357 | peer_req->i.size = data_size; |
343 | e->mdev = mdev; | 358 | peer_req->i.sector = sector; |
344 | e->pages = page; | 359 | peer_req->i.local = false; |
345 | atomic_set(&e->pending_bios, 0); | 360 | peer_req->i.waiting = false; |
346 | e->size = data_size; | 361 | |
347 | e->flags = 0; | 362 | peer_req->epoch = NULL; |
348 | e->sector = sector; | 363 | peer_req->w.mdev = mdev; |
349 | e->block_id = id; | 364 | peer_req->pages = page; |
365 | atomic_set(&peer_req->pending_bios, 0); | ||
366 | peer_req->flags = 0; | ||
367 | /* | ||
368 | * The block_id is opaque to the receiver. It is not endianness | ||
369 | * converted, and sent back to the sender unchanged. | ||
370 | */ | ||
371 | peer_req->block_id = id; | ||
350 | 372 | ||
351 | return e; | 373 | return peer_req; |
352 | 374 | ||
353 | fail: | 375 | fail: |
354 | mempool_free(e, drbd_ee_mempool); | 376 | mempool_free(peer_req, drbd_ee_mempool); |
355 | return NULL; | 377 | return NULL; |
356 | } | 378 | } |
357 | 379 | ||
358 | void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) | 380 | void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req, |
381 | int is_net) | ||
359 | { | 382 | { |
360 | if (e->flags & EE_HAS_DIGEST) | 383 | if (peer_req->flags & EE_HAS_DIGEST) |
361 | kfree(e->digest); | 384 | kfree(peer_req->digest); |
362 | drbd_pp_free(mdev, e->pages, is_net); | 385 | drbd_free_pages(mdev, peer_req->pages, is_net); |
363 | D_ASSERT(atomic_read(&e->pending_bios) == 0); | 386 | D_ASSERT(atomic_read(&peer_req->pending_bios) == 0); |
364 | D_ASSERT(hlist_unhashed(&e->collision)); | 387 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
365 | mempool_free(e, drbd_ee_mempool); | 388 | mempool_free(peer_req, drbd_ee_mempool); |
366 | } | 389 | } |
367 | 390 | ||
368 | int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) | 391 | int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list) |
369 | { | 392 | { |
370 | LIST_HEAD(work_list); | 393 | LIST_HEAD(work_list); |
371 | struct drbd_epoch_entry *e, *t; | 394 | struct drbd_peer_request *peer_req, *t; |
372 | int count = 0; | 395 | int count = 0; |
373 | int is_net = list == &mdev->net_ee; | 396 | int is_net = list == &mdev->net_ee; |
374 | 397 | ||
375 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
376 | list_splice_init(list, &work_list); | 399 | list_splice_init(list, &work_list); |
377 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
378 | 401 | ||
379 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 402 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
380 | drbd_free_some_ee(mdev, e, is_net); | 403 | __drbd_free_peer_req(mdev, peer_req, is_net); |
381 | count++; | 404 | count++; |
382 | } | 405 | } |
383 | return count; | 406 | return count; |
384 | } | 407 | } |
385 | 408 | ||
386 | |||
387 | /* | 409 | /* |
388 | * This function is called from _asender only_ | 410 | * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. |
389 | * but see also comments in _req_mod(,barrier_acked) | ||
390 | * and receive_Barrier. | ||
391 | * | ||
392 | * Move entries from net_ee to done_ee, if ready. | ||
393 | * Grab done_ee, call all callbacks, free the entries. | ||
394 | * The callbacks typically send out ACKs. | ||
395 | */ | 411 | */ |
396 | static int drbd_process_done_ee(struct drbd_conf *mdev) | 412 | static int drbd_finish_peer_reqs(struct drbd_conf *mdev) |
397 | { | 413 | { |
398 | LIST_HEAD(work_list); | 414 | LIST_HEAD(work_list); |
399 | LIST_HEAD(reclaimed); | 415 | LIST_HEAD(reclaimed); |
400 | struct drbd_epoch_entry *e, *t; | 416 | struct drbd_peer_request *peer_req, *t; |
401 | int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); | 417 | int err = 0; |
402 | 418 | ||
403 | spin_lock_irq(&mdev->req_lock); | 419 | spin_lock_irq(&mdev->tconn->req_lock); |
404 | reclaim_net_ee(mdev, &reclaimed); | 420 | reclaim_finished_net_peer_reqs(mdev, &reclaimed); |
405 | list_splice_init(&mdev->done_ee, &work_list); | 421 | list_splice_init(&mdev->done_ee, &work_list); |
406 | spin_unlock_irq(&mdev->req_lock); | 422 | spin_unlock_irq(&mdev->tconn->req_lock); |
407 | 423 | ||
408 | list_for_each_entry_safe(e, t, &reclaimed, w.list) | 424 | list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) |
409 | drbd_free_net_ee(mdev, e); | 425 | drbd_free_net_peer_req(mdev, peer_req); |
410 | 426 | ||
411 | /* possible callbacks here: | 427 | /* possible callbacks here: |
412 | * e_end_block, and e_end_resync_block, e_send_discard_ack. | 428 | * e_end_block, and e_end_resync_block, e_send_superseded. |
413 | * all ignore the last argument. | 429 | * all ignore the last argument. |
414 | */ | 430 | */ |
415 | list_for_each_entry_safe(e, t, &work_list, w.list) { | 431 | list_for_each_entry_safe(peer_req, t, &work_list, w.list) { |
432 | int err2; | ||
433 | |||
416 | /* list_del not necessary, next/prev members not touched */ | 434 | /* list_del not necessary, next/prev members not touched */ |
417 | ok = e->w.cb(mdev, &e->w, !ok) && ok; | 435 | err2 = peer_req->w.cb(&peer_req->w, !!err); |
418 | drbd_free_ee(mdev, e); | 436 | if (!err) |
437 | err = err2; | ||
438 | drbd_free_peer_req(mdev, peer_req); | ||
419 | } | 439 | } |
420 | wake_up(&mdev->ee_wait); | 440 | wake_up(&mdev->ee_wait); |
421 | 441 | ||
422 | return ok; | 442 | return err; |
423 | } | 443 | } |
424 | 444 | ||
425 | void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 445 | static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
446 | struct list_head *head) | ||
426 | { | 447 | { |
427 | DEFINE_WAIT(wait); | 448 | DEFINE_WAIT(wait); |
428 | 449 | ||
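Editorial note: the drbd_finish_peer_reqs() rework above keeps the existing idiom: take done_ee entries off the shared list under req_lock, splice them onto a private list, then invoke the (possibly sleeping) callbacks without the lock held. A generic sketch of that splice-then-process pattern, with hypothetical names:

/* Editorial sketch of the splice-then-process idiom; "struct work_item",
 * "shared_list" and "shared_lock" are hypothetical, not DRBD symbols. */
#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {
        struct list_head list;
        int (*cb)(struct work_item *);
};

static LIST_HEAD(shared_list);
static DEFINE_SPINLOCK(shared_lock);

static int process_done_items(void)
{
        LIST_HEAD(work_list);
        struct work_item *w, *t;
        int err = 0;

        spin_lock_irq(&shared_lock);
        list_splice_init(&shared_list, &work_list);
        spin_unlock_irq(&shared_lock);

        list_for_each_entry_safe(w, t, &work_list, list) {
                int err2 = w->cb(w);    /* may sleep; lock is not held */
                if (!err)
                        err = err2;
        }
        return err;
}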
@@ -430,55 +451,22 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | |||
430 | * and calling prepare_to_wait in the fast path */ | 451 | * and calling prepare_to_wait in the fast path */ |
431 | while (!list_empty(head)) { | 452 | while (!list_empty(head)) { |
432 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); | 453 | prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); |
433 | spin_unlock_irq(&mdev->req_lock); | 454 | spin_unlock_irq(&mdev->tconn->req_lock); |
434 | io_schedule(); | 455 | io_schedule(); |
435 | finish_wait(&mdev->ee_wait, &wait); | 456 | finish_wait(&mdev->ee_wait, &wait); |
436 | spin_lock_irq(&mdev->req_lock); | 457 | spin_lock_irq(&mdev->tconn->req_lock); |
437 | } | 458 | } |
438 | } | 459 | } |
439 | 460 | ||
440 | void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) | 461 | static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, |
462 | struct list_head *head) | ||
441 | { | 463 | { |
442 | spin_lock_irq(&mdev->req_lock); | 464 | spin_lock_irq(&mdev->tconn->req_lock); |
443 | _drbd_wait_ee_list_empty(mdev, head); | 465 | _drbd_wait_ee_list_empty(mdev, head); |
444 | spin_unlock_irq(&mdev->req_lock); | 466 | spin_unlock_irq(&mdev->tconn->req_lock); |
445 | } | ||
446 | |||
447 | /* see also kernel_accept, which is only present since 2.6.18. | ||
448 | * also we want to log which part of it failed, exactly */ | ||
449 | static int drbd_accept(struct drbd_conf *mdev, const char **what, | ||
450 | struct socket *sock, struct socket **newsock) | ||
451 | { | ||
452 | struct sock *sk = sock->sk; | ||
453 | int err = 0; | ||
454 | |||
455 | *what = "listen"; | ||
456 | err = sock->ops->listen(sock, 5); | ||
457 | if (err < 0) | ||
458 | goto out; | ||
459 | |||
460 | *what = "sock_create_lite"; | ||
461 | err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, | ||
462 | newsock); | ||
463 | if (err < 0) | ||
464 | goto out; | ||
465 | |||
466 | *what = "accept"; | ||
467 | err = sock->ops->accept(sock, *newsock, 0); | ||
468 | if (err < 0) { | ||
469 | sock_release(*newsock); | ||
470 | *newsock = NULL; | ||
471 | goto out; | ||
472 | } | ||
473 | (*newsock)->ops = sock->ops; | ||
474 | __module_get((*newsock)->ops->owner); | ||
475 | |||
476 | out: | ||
477 | return err; | ||
478 | } | 467 | } |
479 | 468 | ||
480 | static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | 469 | static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) |
481 | void *buf, size_t size, int flags) | ||
482 | { | 470 | { |
483 | mm_segment_t oldfs; | 471 | mm_segment_t oldfs; |
484 | struct kvec iov = { | 472 | struct kvec iov = { |
@@ -500,59 +488,62 @@ static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, | |||
500 | return rv; | 488 | return rv; |
501 | } | 489 | } |
502 | 490 | ||
503 | static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) | 491 | static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) |
504 | { | 492 | { |
505 | mm_segment_t oldfs; | ||
506 | struct kvec iov = { | ||
507 | .iov_base = buf, | ||
508 | .iov_len = size, | ||
509 | }; | ||
510 | struct msghdr msg = { | ||
511 | .msg_iovlen = 1, | ||
512 | .msg_iov = (struct iovec *)&iov, | ||
513 | .msg_flags = MSG_WAITALL | MSG_NOSIGNAL | ||
514 | }; | ||
515 | int rv; | 493 | int rv; |
516 | 494 | ||
517 | oldfs = get_fs(); | 495 | rv = drbd_recv_short(tconn->data.socket, buf, size, 0); |
518 | set_fs(KERNEL_DS); | ||
519 | 496 | ||
520 | for (;;) { | 497 | if (rv < 0) { |
521 | rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); | 498 | if (rv == -ECONNRESET) |
522 | if (rv == size) | 499 | conn_info(tconn, "sock was reset by peer\n"); |
523 | break; | 500 | else if (rv != -ERESTARTSYS) |
501 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); | ||
502 | } else if (rv == 0) { | ||
503 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { | ||
504 | long t; | ||
505 | rcu_read_lock(); | ||
506 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; | ||
507 | rcu_read_unlock(); | ||
524 | 508 | ||
525 | /* Note: | 509 | t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t); |
526 | * ECONNRESET other side closed the connection | ||
527 | * ERESTARTSYS (on sock) we got a signal | ||
528 | */ | ||
529 | 510 | ||
530 | if (rv < 0) { | 511 | if (t) |
531 | if (rv == -ECONNRESET) | 512 | goto out; |
532 | dev_info(DEV, "sock was reset by peer\n"); | ||
533 | else if (rv != -ERESTARTSYS) | ||
534 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | ||
535 | break; | ||
536 | } else if (rv == 0) { | ||
537 | dev_info(DEV, "sock was shut down by peer\n"); | ||
538 | break; | ||
539 | } else { | ||
540 | /* signal came in, or peer/link went down, | ||
541 | * after we read a partial message | ||
542 | */ | ||
543 | /* D_ASSERT(signal_pending(current)); */ | ||
544 | break; | ||
545 | } | 513 | } |
546 | }; | 514 | conn_info(tconn, "sock was shut down by peer\n"); |
547 | 515 | } | |
548 | set_fs(oldfs); | ||
549 | 516 | ||
550 | if (rv != size) | 517 | if (rv != size) |
551 | drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); | 518 | conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); |
552 | 519 | ||
520 | out: | ||
553 | return rv; | 521 | return rv; |
554 | } | 522 | } |
555 | 523 | ||
524 | static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size) | ||
525 | { | ||
526 | int err; | ||
527 | |||
528 | err = drbd_recv(tconn, buf, size); | ||
529 | if (err != size) { | ||
530 | if (err >= 0) | ||
531 | err = -EIO; | ||
532 | } else | ||
533 | err = 0; | ||
534 | return err; | ||
535 | } | ||
536 | |||
537 | static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size) | ||
538 | { | ||
539 | int err; | ||
540 | |||
541 | err = drbd_recv_all(tconn, buf, size); | ||
542 | if (err && !signal_pending(current)) | ||
543 | conn_warn(tconn, "short read (expected size %d)\n", (int)size); | ||
544 | return err; | ||
545 | } | ||
546 | |||
556 | /* quoting tcp(7): | 547 | /* quoting tcp(7): |
557 | * On individual connections, the socket buffer size must be set prior to the | 548 | * On individual connections, the socket buffer size must be set prior to the |
558 | * listen(2) or connect(2) calls in order to have it take effect. | 549 | * listen(2) or connect(2) calls in order to have it take effect. |
@@ -572,29 +563,50 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd, | |||
572 | } | 563 | } |
573 | } | 564 | } |
574 | 565 | ||
575 | static struct socket *drbd_try_connect(struct drbd_conf *mdev) | 566 | static struct socket *drbd_try_connect(struct drbd_tconn *tconn) |
576 | { | 567 | { |
577 | const char *what; | 568 | const char *what; |
578 | struct socket *sock; | 569 | struct socket *sock; |
579 | struct sockaddr_in6 src_in6; | 570 | struct sockaddr_in6 src_in6; |
580 | int err; | 571 | struct sockaddr_in6 peer_in6; |
572 | struct net_conf *nc; | ||
573 | int err, peer_addr_len, my_addr_len; | ||
574 | int sndbuf_size, rcvbuf_size, connect_int; | ||
581 | int disconnect_on_error = 1; | 575 | int disconnect_on_error = 1; |
582 | 576 | ||
583 | if (!get_net_conf(mdev)) | 577 | rcu_read_lock(); |
578 | nc = rcu_dereference(tconn->net_conf); | ||
579 | if (!nc) { | ||
580 | rcu_read_unlock(); | ||
584 | return NULL; | 581 | return NULL; |
582 | } | ||
583 | sndbuf_size = nc->sndbuf_size; | ||
584 | rcvbuf_size = nc->rcvbuf_size; | ||
585 | connect_int = nc->connect_int; | ||
586 | rcu_read_unlock(); | ||
587 | |||
588 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6)); | ||
589 | memcpy(&src_in6, &tconn->my_addr, my_addr_len); | ||
590 | |||
591 | if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6) | ||
592 | src_in6.sin6_port = 0; | ||
593 | else | ||
594 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
595 | |||
596 | peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6)); | ||
597 | memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len); | ||
585 | 598 | ||
586 | what = "sock_create_kern"; | 599 | what = "sock_create_kern"; |
587 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 600 | err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, |
588 | SOCK_STREAM, IPPROTO_TCP, &sock); | 601 | SOCK_STREAM, IPPROTO_TCP, &sock); |
589 | if (err < 0) { | 602 | if (err < 0) { |
590 | sock = NULL; | 603 | sock = NULL; |
591 | goto out; | 604 | goto out; |
592 | } | 605 | } |
593 | 606 | ||
594 | sock->sk->sk_rcvtimeo = | 607 | sock->sk->sk_rcvtimeo = |
595 | sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; | 608 | sock->sk->sk_sndtimeo = connect_int * HZ; |
596 | drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, | 609 | drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); |
597 | mdev->net_conf->rcvbuf_size); | ||
598 | 610 | ||
599 | /* explicitly bind to the configured IP as source IP | 611 | /* explicitly bind to the configured IP as source IP |
600 | * for the outgoing connections. | 612 | * for the outgoing connections. |
@@ -603,17 +615,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
603 | * Make sure to use 0 as port number, so linux selects | 615 | * Make sure to use 0 as port number, so linux selects |
604 | * a free one dynamically. | 616 | * a free one dynamically. |
605 | */ | 617 | */ |
606 | memcpy(&src_in6, mdev->net_conf->my_addr, | ||
607 | min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); | ||
608 | if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) | ||
609 | src_in6.sin6_port = 0; | ||
610 | else | ||
611 | ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ | ||
612 | |||
613 | what = "bind before connect"; | 618 | what = "bind before connect"; |
614 | err = sock->ops->bind(sock, | 619 | err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); |
615 | (struct sockaddr *) &src_in6, | ||
616 | mdev->net_conf->my_addr_len); | ||
617 | if (err < 0) | 620 | if (err < 0) |
618 | goto out; | 621 | goto out; |
619 | 622 | ||
@@ -621,9 +624,7 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) | |||
621 | * stay C_WF_CONNECTION, don't go Disconnecting! */ | 624 | * stay C_WF_CONNECTION, don't go Disconnecting! */ |
622 | disconnect_on_error = 0; | 625 | disconnect_on_error = 0; |
623 | what = "connect"; | 626 | what = "connect"; |
624 | err = sock->ops->connect(sock, | 627 | err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); |
625 | (struct sockaddr *)mdev->net_conf->peer_addr, | ||
626 | mdev->net_conf->peer_addr_len, 0); | ||
627 | 628 | ||
628 | out: | 629 | out: |
629 | if (err < 0) { | 630 | if (err < 0) { |
@@ -641,91 +642,174 @@ out: | |||
641 | disconnect_on_error = 0; | 642 | disconnect_on_error = 0; |
642 | break; | 643 | break; |
643 | default: | 644 | default: |
644 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 645 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
645 | } | 646 | } |
646 | if (disconnect_on_error) | 647 | if (disconnect_on_error) |
647 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 648 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
648 | } | 649 | } |
649 | put_net_conf(mdev); | 650 | |
650 | return sock; | 651 | return sock; |
651 | } | 652 | } |
652 | 653 | ||
653 | static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) | 654 | struct accept_wait_data { |
655 | struct drbd_tconn *tconn; | ||
656 | struct socket *s_listen; | ||
657 | struct completion door_bell; | ||
658 | void (*original_sk_state_change)(struct sock *sk); | ||
659 | |||
660 | }; | ||
661 | |||
662 | static void drbd_incoming_connection(struct sock *sk) | ||
654 | { | 663 | { |
655 | int timeo, err; | 664 | struct accept_wait_data *ad = sk->sk_user_data; |
656 | struct socket *s_estab = NULL, *s_listen; | 665 | void (*state_change)(struct sock *sk); |
666 | |||
667 | state_change = ad->original_sk_state_change; | ||
668 | if (sk->sk_state == TCP_ESTABLISHED) | ||
669 | complete(&ad->door_bell); | ||
670 | state_change(sk); | ||
671 | } | ||
672 | |||
673 | static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad) | ||
674 | { | ||
675 | int err, sndbuf_size, rcvbuf_size, my_addr_len; | ||
676 | struct sockaddr_in6 my_addr; | ||
677 | struct socket *s_listen; | ||
678 | struct net_conf *nc; | ||
657 | const char *what; | 679 | const char *what; |
658 | 680 | ||
659 | if (!get_net_conf(mdev)) | 681 | rcu_read_lock(); |
660 | return NULL; | 682 | nc = rcu_dereference(tconn->net_conf); |
683 | if (!nc) { | ||
684 | rcu_read_unlock(); | ||
685 | return -EIO; | ||
686 | } | ||
687 | sndbuf_size = nc->sndbuf_size; | ||
688 | rcvbuf_size = nc->rcvbuf_size; | ||
689 | rcu_read_unlock(); | ||
690 | |||
691 | my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6)); | ||
692 | memcpy(&my_addr, &tconn->my_addr, my_addr_len); | ||
661 | 693 | ||
662 | what = "sock_create_kern"; | 694 | what = "sock_create_kern"; |
663 | err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, | 695 | err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, |
664 | SOCK_STREAM, IPPROTO_TCP, &s_listen); | 696 | SOCK_STREAM, IPPROTO_TCP, &s_listen); |
665 | if (err) { | 697 | if (err) { |
666 | s_listen = NULL; | 698 | s_listen = NULL; |
667 | goto out; | 699 | goto out; |
668 | } | 700 | } |
669 | 701 | ||
670 | timeo = mdev->net_conf->try_connect_int * HZ; | 702 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
671 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | 703 | drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); |
672 | |||
673 | s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | ||
674 | s_listen->sk->sk_rcvtimeo = timeo; | ||
675 | s_listen->sk->sk_sndtimeo = timeo; | ||
676 | drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, | ||
677 | mdev->net_conf->rcvbuf_size); | ||
678 | 704 | ||
679 | what = "bind before listen"; | 705 | what = "bind before listen"; |
680 | err = s_listen->ops->bind(s_listen, | 706 | err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); |
681 | (struct sockaddr *) mdev->net_conf->my_addr, | ||
682 | mdev->net_conf->my_addr_len); | ||
683 | if (err < 0) | 707 | if (err < 0) |
684 | goto out; | 708 | goto out; |
685 | 709 | ||
686 | err = drbd_accept(mdev, &what, s_listen, &s_estab); | 710 | ad->s_listen = s_listen; |
711 | write_lock_bh(&s_listen->sk->sk_callback_lock); | ||
712 | ad->original_sk_state_change = s_listen->sk->sk_state_change; | ||
713 | s_listen->sk->sk_state_change = drbd_incoming_connection; | ||
714 | s_listen->sk->sk_user_data = ad; | ||
715 | write_unlock_bh(&s_listen->sk->sk_callback_lock); | ||
716 | |||
717 | what = "listen"; | ||
718 | err = s_listen->ops->listen(s_listen, 5); | ||
719 | if (err < 0) | ||
720 | goto out; | ||
687 | 721 | ||
722 | return 0; | ||
688 | out: | 723 | out: |
689 | if (s_listen) | 724 | if (s_listen) |
690 | sock_release(s_listen); | 725 | sock_release(s_listen); |
691 | if (err < 0) { | 726 | if (err < 0) { |
692 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | 727 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { |
693 | dev_err(DEV, "%s failed, err = %d\n", what, err); | 728 | conn_err(tconn, "%s failed, err = %d\n", what, err); |
694 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 729 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
695 | } | 730 | } |
696 | } | 731 | } |
697 | put_net_conf(mdev); | ||
698 | 732 | ||
699 | return s_estab; | 733 | return -EIO; |
700 | } | 734 | } |
701 | 735 | ||
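Throughout this hunk the direct net_conf dereferences are replaced by short RCU read-side sections that copy the needed fields into locals before anything can sleep, as prepare_listen_socket() does above with sndbuf_size/rcvbuf_size. A minimal sketch of that pattern follows; the struct names are taken from the patch, while the helper name snapshot_buffer_sizes is illustrative and not driver code.

/* Illustrative sketch of the RCU config snapshot used above. */
static int snapshot_buffer_sizes(struct drbd_tconn *tconn,
				 int *sndbuf_size, int *rcvbuf_size)
{
	struct net_conf *nc;

	rcu_read_lock();
	nc = rcu_dereference(tconn->net_conf);	/* NULL if not configured */
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	*sndbuf_size = nc->sndbuf_size;		/* copy while under the lock */
	*rcvbuf_size = nc->rcvbuf_size;
	rcu_read_unlock();

	return 0;	/* the copies stay valid after the unlock */
}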
702 | static int drbd_send_fp(struct drbd_conf *mdev, | 736 | static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) |
703 | struct socket *sock, enum drbd_packets cmd) | ||
704 | { | 737 | { |
705 | struct p_header80 *h = &mdev->data.sbuf.header.h80; | 738 | write_lock_bh(&sk->sk_callback_lock); |
706 | 739 | sk->sk_state_change = ad->original_sk_state_change; | |
707 | return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); | 740 | sk->sk_user_data = NULL; |
741 | write_unlock_bh(&sk->sk_callback_lock); | ||
708 | } | 742 | } |
709 | 743 | ||
710 | static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) | 744 | static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad) |
711 | { | 745 | { |
712 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 746 | int timeo, connect_int, err = 0; |
713 | int rr; | 747 | struct socket *s_estab = NULL; |
748 | struct net_conf *nc; | ||
749 | |||
750 | rcu_read_lock(); | ||
751 | nc = rcu_dereference(tconn->net_conf); | ||
752 | if (!nc) { | ||
753 | rcu_read_unlock(); | ||
754 | return NULL; | ||
755 | } | ||
756 | connect_int = nc->connect_int; | ||
757 | rcu_read_unlock(); | ||
758 | |||
759 | timeo = connect_int * HZ; | ||
760 | timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ | ||
761 | |||
762 | err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); | ||
763 | if (err <= 0) | ||
764 | return NULL; | ||
765 | |||
766 | err = kernel_accept(ad->s_listen, &s_estab, 0); | ||
767 | if (err < 0) { | ||
768 | if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { | ||
769 | conn_err(tconn, "accept failed, err = %d\n", err); | ||
770 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
771 | } | ||
772 | } | ||
773 | |||
774 | if (s_estab) | ||
775 | unregister_state_change(s_estab->sk, ad); | ||
776 | |||
777 | return s_estab; | ||
778 | } | ||
714 | 779 | ||
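The accept path no longer relies on a receive timeout on the listen socket: prepare_listen_socket() hooks sk_state_change, drbd_incoming_connection() completes the door_bell once a peer connection reaches TCP_ESTABLISHED, and drbd_wait_for_connect() sleeps on that completion for the jittered connect_int before calling kernel_accept(). A condensed sketch of that flow; struct accept_wait_data and the two kernel calls are as used in the patch, the wrapper itself is illustrative.

/* Illustrative: how the door_bell completion gates kernel_accept(). */
static struct socket *sketch_wait_for_connect(struct accept_wait_data *ad,
					      long timeo)
{
	struct socket *s_estab = NULL;

	/* drbd_incoming_connection() calls complete(&ad->door_bell)
	 * from the listen socket's sk_state_change callback. */
	if (wait_for_completion_interruptible_timeout(&ad->door_bell, timeo) <= 0)
		return NULL;		/* timeout, signal, or error */

	if (kernel_accept(ad->s_listen, &s_estab, 0) < 0)
		return NULL;		/* caller logs and may go C_DISCONNECTING */

	return s_estab;
}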
715 | rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); | 780 | static int decode_header(struct drbd_tconn *, void *, struct packet_info *); |
716 | 781 | ||
717 | if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) | 782 | static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock, |
718 | return be16_to_cpu(h->command); | 783 | enum drbd_packet cmd) |
784 | { | ||
785 | if (!conn_prepare_command(tconn, sock)) | ||
786 | return -EIO; | ||
787 | return conn_send_command(tconn, sock, cmd, 0, NULL, 0); | ||
788 | } | ||
719 | 789 | ||
720 | return 0xffff; | 790 | static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock) |
791 | { | ||
792 | unsigned int header_size = drbd_header_size(tconn); | ||
793 | struct packet_info pi; | ||
794 | int err; | ||
795 | |||
796 | err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0); | ||
797 | if (err != header_size) { | ||
798 | if (err >= 0) | ||
799 | err = -EIO; | ||
800 | return err; | ||
801 | } | ||
802 | err = decode_header(tconn, tconn->data.rbuf, &pi); | ||
803 | if (err) | ||
804 | return err; | ||
805 | return pi.cmd; | ||
721 | } | 806 | } |
722 | 807 | ||
723 | /** | 808 | /** |
724 | * drbd_socket_okay() - Free the socket if its connection is not okay | 809 | * drbd_socket_okay() - Free the socket if its connection is not okay |
725 | * @mdev: DRBD device. | ||
726 | * @sock: pointer to the pointer to the socket. | 810 | * @sock: pointer to the pointer to the socket. |
727 | */ | 811 | */ |
728 | static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | 812 | static int drbd_socket_okay(struct socket **sock) |
729 | { | 813 | { |
730 | int rr; | 814 | int rr; |
731 | char tb[4]; | 815 | char tb[4]; |
@@ -733,7 +817,7 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
733 | if (!*sock) | 817 | if (!*sock) |
734 | return false; | 818 | return false; |
735 | 819 | ||
736 | rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); | 820 | rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); |
737 | 821 | ||
738 | if (rr > 0 || rr == -EAGAIN) { | 822 | if (rr > 0 || rr == -EAGAIN) { |
739 | return true; | 823 | return true; |
@@ -743,6 +827,31 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
743 | return false; | 827 | return false; |
744 | } | 828 | } |
745 | } | 829 | } |
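drbd_socket_okay() keeps its liveness probe, a non-blocking and non-consuming peek on the socket: data or -EAGAIN means the connection is alive, 0 or a hard error means it is gone; it merely loses the mdev argument now that drbd_recv_short() works on a bare socket. The same probe written as plain userspace C, purely for illustration.

/* Illustrative userspace analogue of drbd_socket_okay()'s probe. */
#include <errno.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>

static bool socket_okay(int fd)
{
	char tb[4];
	ssize_t rr = recv(fd, tb, sizeof(tb), MSG_DONTWAIT | MSG_PEEK);

	if (rr > 0 || (rr < 0 && errno == EAGAIN))
		return true;	/* data pending, or simply nothing to read yet */
	return false;		/* 0: orderly shutdown; otherwise a hard error */
}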
830 | /* Gets called if a connection is established, or if a new minor gets created | ||
831 | in a connection */ | ||
832 | int drbd_connected(struct drbd_conf *mdev) | ||
833 | { | ||
834 | int err; | ||
835 | |||
836 | atomic_set(&mdev->packet_seq, 0); | ||
837 | mdev->peer_seq = 0; | ||
838 | |||
839 | mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ? | ||
840 | &mdev->tconn->cstate_mutex : | ||
841 | &mdev->own_state_mutex; | ||
842 | |||
843 | err = drbd_send_sync_param(mdev); | ||
844 | if (!err) | ||
845 | err = drbd_send_sizes(mdev, 0, 0); | ||
846 | if (!err) | ||
847 | err = drbd_send_uuids(mdev); | ||
848 | if (!err) | ||
849 | err = drbd_send_current_state(mdev); | ||
850 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
851 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
852 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
853 | return err; | ||
854 | } | ||
746 | 855 | ||
747 | /* | 856 | /* |
748 | * return values: | 857 | * return values: |
@@ -752,232 +861,315 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) | |||
752 | * no point in trying again, please go standalone. | 861 | * no point in trying again, please go standalone. |
753 | * -2 We do not have a network config... | 862 | * -2 We do not have a network config... |
754 | */ | 863 | */ |
755 | static int drbd_connect(struct drbd_conf *mdev) | 864 | static int conn_connect(struct drbd_tconn *tconn) |
756 | { | 865 | { |
757 | struct socket *s, *sock, *msock; | 866 | struct drbd_socket sock, msock; |
758 | int try, h, ok; | 867 | struct drbd_conf *mdev; |
868 | struct net_conf *nc; | ||
869 | int vnr, timeout, h, ok; | ||
870 | bool discard_my_data; | ||
759 | enum drbd_state_rv rv; | 871 | enum drbd_state_rv rv; |
872 | struct accept_wait_data ad = { | ||
873 | .tconn = tconn, | ||
874 | .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), | ||
875 | }; | ||
760 | 876 | ||
761 | D_ASSERT(!mdev->data.socket); | 877 | clear_bit(DISCONNECT_SENT, &tconn->flags); |
762 | 878 | if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) | |
763 | if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) | ||
764 | return -2; | 879 | return -2; |
765 | 880 | ||
766 | clear_bit(DISCARD_CONCURRENT, &mdev->flags); | 881 | mutex_init(&sock.mutex); |
882 | sock.sbuf = tconn->data.sbuf; | ||
883 | sock.rbuf = tconn->data.rbuf; | ||
884 | sock.socket = NULL; | ||
885 | mutex_init(&msock.mutex); | ||
886 | msock.sbuf = tconn->meta.sbuf; | ||
887 | msock.rbuf = tconn->meta.rbuf; | ||
888 | msock.socket = NULL; | ||
889 | |||
890 | /* Assume that the peer only understands protocol 80 until we know better. */ | ||
891 | tconn->agreed_pro_version = 80; | ||
767 | 892 | ||
768 | sock = NULL; | 893 | if (prepare_listen_socket(tconn, &ad)) |
769 | msock = NULL; | 894 | return 0; |
770 | 895 | ||
771 | do { | 896 | do { |
772 | for (try = 0;;) { | 897 | struct socket *s; |
773 | /* 3 tries, this should take less than a second! */ | ||
774 | s = drbd_try_connect(mdev); | ||
775 | if (s || ++try >= 3) | ||
776 | break; | ||
777 | /* give the other side time to call bind() & listen() */ | ||
778 | schedule_timeout_interruptible(HZ / 10); | ||
779 | } | ||
780 | 898 | ||
899 | s = drbd_try_connect(tconn); | ||
781 | if (s) { | 900 | if (s) { |
782 | if (!sock) { | 901 | if (!sock.socket) { |
783 | drbd_send_fp(mdev, s, P_HAND_SHAKE_S); | 902 | sock.socket = s; |
784 | sock = s; | 903 | send_first_packet(tconn, &sock, P_INITIAL_DATA); |
785 | s = NULL; | 904 | } else if (!msock.socket) { |
786 | } else if (!msock) { | 905 | clear_bit(RESOLVE_CONFLICTS, &tconn->flags); |
787 | drbd_send_fp(mdev, s, P_HAND_SHAKE_M); | 906 | msock.socket = s; |
788 | msock = s; | 907 | send_first_packet(tconn, &msock, P_INITIAL_META); |
789 | s = NULL; | ||
790 | } else { | 908 | } else { |
791 | dev_err(DEV, "Logic error in drbd_connect()\n"); | 909 | conn_err(tconn, "Logic error in conn_connect()\n"); |
792 | goto out_release_sockets; | 910 | goto out_release_sockets; |
793 | } | 911 | } |
794 | } | 912 | } |
795 | 913 | ||
796 | if (sock && msock) { | 914 | if (sock.socket && msock.socket) { |
797 | schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); | 915 | rcu_read_lock(); |
798 | ok = drbd_socket_okay(mdev, &sock); | 916 | nc = rcu_dereference(tconn->net_conf); |
799 | ok = drbd_socket_okay(mdev, &msock) && ok; | 917 | timeout = nc->ping_timeo * HZ / 10; |
918 | rcu_read_unlock(); | ||
919 | schedule_timeout_interruptible(timeout); | ||
920 | ok = drbd_socket_okay(&sock.socket); | ||
921 | ok = drbd_socket_okay(&msock.socket) && ok; | ||
800 | if (ok) | 922 | if (ok) |
801 | break; | 923 | break; |
802 | } | 924 | } |
803 | 925 | ||
804 | retry: | 926 | retry: |
805 | s = drbd_wait_for_connect(mdev); | 927 | s = drbd_wait_for_connect(tconn, &ad); |
806 | if (s) { | 928 | if (s) { |
807 | try = drbd_recv_fp(mdev, s); | 929 | int fp = receive_first_packet(tconn, s); |
808 | drbd_socket_okay(mdev, &sock); | 930 | drbd_socket_okay(&sock.socket); |
809 | drbd_socket_okay(mdev, &msock); | 931 | drbd_socket_okay(&msock.socket); |
810 | switch (try) { | 932 | switch (fp) { |
811 | case P_HAND_SHAKE_S: | 933 | case P_INITIAL_DATA: |
812 | if (sock) { | 934 | if (sock.socket) { |
813 | dev_warn(DEV, "initial packet S crossed\n"); | 935 | conn_warn(tconn, "initial packet S crossed\n"); |
814 | sock_release(sock); | 936 | sock_release(sock.socket); |
937 | sock.socket = s; | ||
938 | goto randomize; | ||
815 | } | 939 | } |
816 | sock = s; | 940 | sock.socket = s; |
817 | break; | 941 | break; |
818 | case P_HAND_SHAKE_M: | 942 | case P_INITIAL_META: |
819 | if (msock) { | 943 | set_bit(RESOLVE_CONFLICTS, &tconn->flags); |
820 | dev_warn(DEV, "initial packet M crossed\n"); | 944 | if (msock.socket) { |
821 | sock_release(msock); | 945 | conn_warn(tconn, "initial packet M crossed\n"); |
946 | sock_release(msock.socket); | ||
947 | msock.socket = s; | ||
948 | goto randomize; | ||
822 | } | 949 | } |
823 | msock = s; | 950 | msock.socket = s; |
824 | set_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
825 | break; | 951 | break; |
826 | default: | 952 | default: |
827 | dev_warn(DEV, "Error receiving initial packet\n"); | 953 | conn_warn(tconn, "Error receiving initial packet\n"); |
828 | sock_release(s); | 954 | sock_release(s); |
955 | randomize: | ||
829 | if (random32() & 1) | 956 | if (random32() & 1) |
830 | goto retry; | 957 | goto retry; |
831 | } | 958 | } |
832 | } | 959 | } |
833 | 960 | ||
834 | if (mdev->state.conn <= C_DISCONNECTING) | 961 | if (tconn->cstate <= C_DISCONNECTING) |
835 | goto out_release_sockets; | 962 | goto out_release_sockets; |
836 | if (signal_pending(current)) { | 963 | if (signal_pending(current)) { |
837 | flush_signals(current); | 964 | flush_signals(current); |
838 | smp_rmb(); | 965 | smp_rmb(); |
839 | if (get_t_state(&mdev->receiver) == Exiting) | 966 | if (get_t_state(&tconn->receiver) == EXITING) |
840 | goto out_release_sockets; | 967 | goto out_release_sockets; |
841 | } | 968 | } |
842 | 969 | ||
843 | if (sock && msock) { | 970 | ok = drbd_socket_okay(&sock.socket); |
844 | ok = drbd_socket_okay(mdev, &sock); | 971 | ok = drbd_socket_okay(&msock.socket) && ok; |
845 | ok = drbd_socket_okay(mdev, &msock) && ok; | 972 | } while (!ok); |
846 | if (ok) | 973 | |
847 | break; | 974 | if (ad.s_listen) |
848 | } | 975 | sock_release(ad.s_listen); |
849 | } while (1); | ||
850 | 976 | ||
851 | msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 977 | sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
852 | sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ | 978 | msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ |
853 | 979 | ||
854 | sock->sk->sk_allocation = GFP_NOIO; | 980 | sock.socket->sk->sk_allocation = GFP_NOIO; |
855 | msock->sk->sk_allocation = GFP_NOIO; | 981 | msock.socket->sk->sk_allocation = GFP_NOIO; |
856 | 982 | ||
857 | sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; | 983 | sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; |
858 | msock->sk->sk_priority = TC_PRIO_INTERACTIVE; | 984 | msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; |
859 | 985 | ||
860 | /* NOT YET ... | 986 | /* NOT YET ... |
861 | * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 987 | * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; |
862 | * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 988 | * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
863 | * first set it to the P_HAND_SHAKE timeout, | 989 | * first set it to the P_CONNECTION_FEATURES timeout, |
864 | * which we set to 4x the configured ping_timeout. */ | 990 | * which we set to 4x the configured ping_timeout. */ |
865 | sock->sk->sk_sndtimeo = | 991 | rcu_read_lock(); |
866 | sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; | 992 | nc = rcu_dereference(tconn->net_conf); |
867 | 993 | ||
868 | msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 994 | sock.socket->sk->sk_sndtimeo = |
869 | msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | 995 | sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; |
996 | |||
997 | msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; | ||
998 | timeout = nc->timeout * HZ / 10; | ||
999 | discard_my_data = nc->discard_my_data; | ||
1000 | rcu_read_unlock(); | ||
1001 | |||
1002 | msock.socket->sk->sk_sndtimeo = timeout; | ||
870 | 1003 | ||
871 | /* we don't want delays. | 1004 | /* we don't want delays. |
872 | * we use TCP_CORK where appropriate, though */ | 1005 | * we use TCP_CORK where appropriate, though */ |
873 | drbd_tcp_nodelay(sock); | 1006 | drbd_tcp_nodelay(sock.socket); |
874 | drbd_tcp_nodelay(msock); | 1007 | drbd_tcp_nodelay(msock.socket); |
875 | |||
876 | mdev->data.socket = sock; | ||
877 | mdev->meta.socket = msock; | ||
878 | mdev->last_received = jiffies; | ||
879 | 1008 | ||
880 | D_ASSERT(mdev->asender.task == NULL); | 1009 | tconn->data.socket = sock.socket; |
1010 | tconn->meta.socket = msock.socket; | ||
1011 | tconn->last_received = jiffies; | ||
881 | 1012 | ||
882 | h = drbd_do_handshake(mdev); | 1013 | h = drbd_do_features(tconn); |
883 | if (h <= 0) | 1014 | if (h <= 0) |
884 | return h; | 1015 | return h; |
885 | 1016 | ||
886 | if (mdev->cram_hmac_tfm) { | 1017 | if (tconn->cram_hmac_tfm) { |
887 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ | 1018 | /* drbd_request_state(mdev, NS(conn, WFAuth)); */ |
888 | switch (drbd_do_auth(mdev)) { | 1019 | switch (drbd_do_auth(tconn)) { |
889 | case -1: | 1020 | case -1: |
890 | dev_err(DEV, "Authentication of peer failed\n"); | 1021 | conn_err(tconn, "Authentication of peer failed\n"); |
891 | return -1; | 1022 | return -1; |
892 | case 0: | 1023 | case 0: |
893 | dev_err(DEV, "Authentication of peer failed, trying again.\n"); | 1024 | conn_err(tconn, "Authentication of peer failed, trying again.\n"); |
894 | return 0; | 1025 | return 0; |
895 | } | 1026 | } |
896 | } | 1027 | } |
897 | 1028 | ||
898 | sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; | 1029 | tconn->data.socket->sk->sk_sndtimeo = timeout; |
899 | sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; | 1030 | tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
900 | 1031 | ||
901 | atomic_set(&mdev->packet_seq, 0); | 1032 | if (drbd_send_protocol(tconn) == -EOPNOTSUPP) |
902 | mdev->peer_seq = 0; | ||
903 | |||
904 | if (drbd_send_protocol(mdev) == -1) | ||
905 | return -1; | 1033 | return -1; |
906 | set_bit(STATE_SENT, &mdev->flags); | ||
907 | drbd_send_sync_param(mdev, &mdev->sync_conf); | ||
908 | drbd_send_sizes(mdev, 0, 0); | ||
909 | drbd_send_uuids(mdev); | ||
910 | drbd_send_current_state(mdev); | ||
911 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | ||
912 | clear_bit(RESIZE_PENDING, &mdev->flags); | ||
913 | 1034 | ||
914 | spin_lock_irq(&mdev->req_lock); | 1035 | set_bit(STATE_SENT, &tconn->flags); |
915 | rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); | 1036 | |
916 | if (mdev->state.conn != C_WF_REPORT_PARAMS) | 1037 | rcu_read_lock(); |
917 | clear_bit(STATE_SENT, &mdev->flags); | 1038 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
918 | spin_unlock_irq(&mdev->req_lock); | 1039 | kref_get(&mdev->kref); |
1040 | /* Prevent a race between resync-handshake and | ||
1041 | * being promoted to Primary. | ||
1042 | * | ||
1043 | * Grab and release the state mutex, so we know that any current | ||
1044 | * drbd_set_role() is finished, and any incoming drbd_set_role | ||
1045 | * will see the STATE_SENT flag, and wait for it to be cleared. | ||
1046 | */ | ||
1047 | mutex_lock(mdev->state_mutex); | ||
1048 | mutex_unlock(mdev->state_mutex); | ||
1049 | |||
1050 | rcu_read_unlock(); | ||
1051 | |||
1052 | if (discard_my_data) | ||
1053 | set_bit(DISCARD_MY_DATA, &mdev->flags); | ||
1054 | else | ||
1055 | clear_bit(DISCARD_MY_DATA, &mdev->flags); | ||
1056 | |||
1057 | drbd_connected(mdev); | ||
1058 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1059 | rcu_read_lock(); | ||
1060 | } | ||
1061 | rcu_read_unlock(); | ||
919 | 1062 | ||
920 | if (rv < SS_SUCCESS) | 1063 | rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); |
1064 | if (rv < SS_SUCCESS || tconn->cstate != C_WF_REPORT_PARAMS) { | ||
1065 | clear_bit(STATE_SENT, &tconn->flags); | ||
921 | return 0; | 1066 | return 0; |
1067 | } | ||
922 | 1068 | ||
923 | drbd_thread_start(&mdev->asender); | 1069 | drbd_thread_start(&tconn->asender); |
924 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | ||
925 | 1070 | ||
926 | return 1; | 1071 | mutex_lock(&tconn->conf_update); |
1072 | /* The discard_my_data flag is a single-shot modifier to the next | ||
1073 | * connection attempt, the handshake of which is now well underway. | ||
1074 | * No need for rcu style copying of the whole struct | ||
1075 | * just to clear a single value. */ | ||
1076 | tconn->net_conf->discard_my_data = 0; | ||
1077 | mutex_unlock(&tconn->conf_update); | ||
1078 | |||
1079 | return h; | ||
927 | 1080 | ||
928 | out_release_sockets: | 1081 | out_release_sockets: |
929 | if (sock) | 1082 | if (ad.s_listen) |
930 | sock_release(sock); | 1083 | sock_release(ad.s_listen); |
931 | if (msock) | 1084 | if (sock.socket) |
932 | sock_release(msock); | 1085 | sock_release(sock.socket); |
1086 | if (msock.socket) | ||
1087 | sock_release(msock.socket); | ||
933 | return -1; | 1088 | return -1; |
934 | } | 1089 | } |
935 | 1090 | ||
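conn_connect() still establishes one data and one meta socket per peer, now per connection object rather than per minor: each outgoing connect announces itself with P_INITIAL_DATA or P_INITIAL_META, anything arriving on the listen side is classified by receive_first_packet(), crossed initial packets replace the slot already held, and a coin flip (random32() & 1) decides whether to retry so the two nodes do not collide in lockstep forever. A much-simplified userspace illustration of the slot filling; the coin-flip retry and the socket-health rechecks are omitted, and all names here are illustrative.

/* Illustrative only: classify incoming connections into the two slots
 * conn_connect() has to fill.  Plain file descriptors stand in for
 * struct socket; the real code additionally retries on a coin flip. */
#include <stdbool.h>
#include <unistd.h>

enum first_packet { INITIAL_DATA, INITIAL_META, GARBAGE };

struct two_sockets {
	int data_fd;	/* -1 while unfilled */
	int meta_fd;
};

/* returns true once both the data and the meta slot are filled */
static bool place_incoming(struct two_sockets *ts, int fd, enum first_packet fp)
{
	switch (fp) {
	case INITIAL_DATA:
		if (ts->data_fd != -1)
			close(ts->data_fd);	/* initial packets crossed */
		ts->data_fd = fd;
		break;
	case INITIAL_META:
		if (ts->meta_fd != -1)
			close(ts->meta_fd);
		ts->meta_fd = fd;
		break;
	default:
		close(fd);			/* unknown first packet */
		break;
	}
	return ts->data_fd != -1 && ts->meta_fd != -1;
}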
936 | static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) | 1091 | static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi) |
937 | { | 1092 | { |
938 | union p_header *h = &mdev->data.rbuf.header; | 1093 | unsigned int header_size = drbd_header_size(tconn); |
939 | int r; | 1094 | |
940 | 1095 | if (header_size == sizeof(struct p_header100) && | |
941 | r = drbd_recv(mdev, h, sizeof(*h)); | 1096 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { |
942 | if (unlikely(r != sizeof(*h))) { | 1097 | struct p_header100 *h = header; |
943 | if (!signal_pending(current)) | 1098 | if (h->pad != 0) { |
944 | dev_warn(DEV, "short read expecting header on sock: r=%d\n", r); | 1099 | conn_err(tconn, "Header padding is not zero\n"); |
945 | return false; | 1100 | return -EINVAL; |
946 | } | 1101 | } |
947 | 1102 | pi->vnr = be16_to_cpu(h->volume); | |
948 | if (likely(h->h80.magic == BE_DRBD_MAGIC)) { | 1103 | pi->cmd = be16_to_cpu(h->command); |
949 | *cmd = be16_to_cpu(h->h80.command); | 1104 | pi->size = be32_to_cpu(h->length); |
950 | *packet_size = be16_to_cpu(h->h80.length); | 1105 | } else if (header_size == sizeof(struct p_header95) && |
951 | } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { | 1106 | *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { |
952 | *cmd = be16_to_cpu(h->h95.command); | 1107 | struct p_header95 *h = header; |
953 | *packet_size = be32_to_cpu(h->h95.length); | 1108 | pi->cmd = be16_to_cpu(h->command); |
1109 | pi->size = be32_to_cpu(h->length); | ||
1110 | pi->vnr = 0; | ||
1111 | } else if (header_size == sizeof(struct p_header80) && | ||
1112 | *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { | ||
1113 | struct p_header80 *h = header; | ||
1114 | pi->cmd = be16_to_cpu(h->command); | ||
1115 | pi->size = be16_to_cpu(h->length); | ||
1116 | pi->vnr = 0; | ||
954 | } else { | 1117 | } else { |
955 | dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n", | 1118 | conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n", |
956 | be32_to_cpu(h->h80.magic), | 1119 | be32_to_cpu(*(__be32 *)header), |
957 | be16_to_cpu(h->h80.command), | 1120 | tconn->agreed_pro_version); |
958 | be16_to_cpu(h->h80.length)); | 1121 | return -EINVAL; |
959 | return false; | ||
960 | } | 1122 | } |
961 | mdev->last_received = jiffies; | 1123 | pi->data = header + header_size; |
1124 | return 0; | ||
1125 | } | ||
962 | 1126 | ||
963 | return true; | 1127 | static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi) |
1128 | { | ||
1129 | void *buffer = tconn->data.rbuf; | ||
1130 | int err; | ||
1131 | |||
1132 | err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn)); | ||
1133 | if (err) | ||
1134 | return err; | ||
1135 | |||
1136 | err = decode_header(tconn, buffer, pi); | ||
1137 | tconn->last_received = jiffies; | ||
1138 | |||
1139 | return err; | ||
964 | } | 1140 | } |
965 | 1141 | ||
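decode_header() turns the old two-way h80/h95 magic check into a three-way dispatch: the protocol-100 header additionally carries a volume number (and a pad that must be zero), the "big" h95 header widens the length field, and h80 is the original layout. A standalone decoder sketch follows; the field offsets are condensed from the structs used in the patch, and the DRBD_MAGIC* values are quoted from the drbd headers of that era as an assumption, so verify them against your tree. The driver also cross-checks the negotiated header size, which is omitted here.

/* Illustrative decoder; *not* the driver code.  Magic values and the
 * exact field layout should be verified against the drbd headers. */
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define DRBD_MAGIC	0x83740267u	/* h80 */
#define DRBD_MAGIC_BIG	0x835au		/* h95 */
#define DRBD_MAGIC_100	0x8620ec20u	/* protocol 100 */

struct packet_info { uint16_t vnr; uint16_t cmd; uint32_t size; };

static uint16_t get_be16(const uint8_t *p) { uint16_t v; memcpy(&v, p, 2); return ntohs(v); }
static uint32_t get_be32(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return ntohl(v); }

/* hdr points at the raw header bytes as received from the peer */
static int sketch_decode_header(const uint8_t *hdr, struct packet_info *pi)
{
	if (get_be32(hdr) == DRBD_MAGIC_100) {
		/* be32 magic, be16 volume, be16 command, be32 length, pad */
		pi->vnr  = get_be16(hdr + 4);
		pi->cmd  = get_be16(hdr + 6);
		pi->size = get_be32(hdr + 8);
	} else if (get_be16(hdr) == DRBD_MAGIC_BIG) {
		/* be16 magic, be16 command, be32 length */
		pi->vnr  = 0;
		pi->cmd  = get_be16(hdr + 2);
		pi->size = get_be32(hdr + 4);
	} else if (get_be32(hdr) == DRBD_MAGIC) {
		/* be32 magic, be16 command, be16 length */
		pi->vnr  = 0;
		pi->cmd  = get_be16(hdr + 4);
		pi->size = get_be16(hdr + 6);
	} else {
		return -1;	/* unknown magic */
	}
	return 0;
}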
966 | static void drbd_flush(struct drbd_conf *mdev) | 1142 | static void drbd_flush(struct drbd_tconn *tconn) |
967 | { | 1143 | { |
968 | int rv; | 1144 | int rv; |
1145 | struct drbd_conf *mdev; | ||
1146 | int vnr; | ||
969 | 1147 | ||
970 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | 1148 | if (tconn->write_ordering >= WO_bdev_flush) { |
971 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, | 1149 | rcu_read_lock(); |
972 | NULL); | 1150 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
973 | if (rv) { | 1151 | if (!get_ldev(mdev)) |
974 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | 1152 | continue; |
975 | /* would rather check on EOPNOTSUPP, but that is not reliable. | 1153 | kref_get(&mdev->kref); |
976 | * don't try again for ANY return value != 0 | 1154 | rcu_read_unlock(); |
977 | * if (rv == -EOPNOTSUPP) */ | 1155 | |
978 | drbd_bump_write_ordering(mdev, WO_drain_io); | 1156 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, |
1157 | GFP_NOIO, NULL); | ||
1158 | if (rv) { | ||
1159 | dev_info(DEV, "local disk flush failed with status %d\n", rv); | ||
1160 | /* would rather check on EOPNOTSUPP, but that is not reliable. | ||
1161 | * don't try again for ANY return value != 0 | ||
1162 | * if (rv == -EOPNOTSUPP) */ | ||
1163 | drbd_bump_write_ordering(tconn, WO_drain_io); | ||
1164 | } | ||
1165 | put_ldev(mdev); | ||
1166 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1167 | |||
1168 | rcu_read_lock(); | ||
1169 | if (rv) | ||
1170 | break; | ||
979 | } | 1171 | } |
980 | put_ldev(mdev); | 1172 | rcu_read_unlock(); |
981 | } | 1173 | } |
982 | } | 1174 | } |
983 | 1175 | ||
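Because drbd_flush() now acts on the whole connection, it uses the patch's recurring per-volume walk: idr_for_each_entry() under rcu_read_lock(), kref_get() to pin the device, drop the RCU lock around the sleeping blkdev_issue_flush(), then kref_put() and re-lock before continuing; conn_connect() and conn_wait_active_ee_empty() do the same dance. The idiom in isolation, as a kernel-style sketch in which do_blocking_work() is a placeholder and the rest mirrors the patch.

/* Kernel-style sketch of the per-volume walk used above; not driver code. */
static void do_blocking_work(struct drbd_conf *mdev)
{
	/* placeholder for whatever may sleep: flush, wait, handshake */
}

static void for_each_volume_blocking(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		kref_get(&mdev->kref);		/* pin the device */
		rcu_read_unlock();		/* about to sleep */

		do_blocking_work(mdev);

		kref_put(&mdev->kref, &drbd_minor_destroy);
		rcu_read_lock();		/* resume the iteration */
	}
	rcu_read_unlock();
}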
@@ -987,7 +1179,7 @@ static void drbd_flush(struct drbd_conf *mdev) | |||
987 | * @epoch: Epoch object. | 1179 | * @epoch: Epoch object. |
988 | * @ev: Epoch event. | 1180 | * @ev: Epoch event. |
989 | */ | 1181 | */ |
990 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | 1182 | static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, |
991 | struct drbd_epoch *epoch, | 1183 | struct drbd_epoch *epoch, |
992 | enum epoch_event ev) | 1184 | enum epoch_event ev) |
993 | { | 1185 | { |
@@ -995,7 +1187,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
995 | struct drbd_epoch *next_epoch; | 1187 | struct drbd_epoch *next_epoch; |
996 | enum finish_epoch rv = FE_STILL_LIVE; | 1188 | enum finish_epoch rv = FE_STILL_LIVE; |
997 | 1189 | ||
998 | spin_lock(&mdev->epoch_lock); | 1190 | spin_lock(&tconn->epoch_lock); |
999 | do { | 1191 | do { |
1000 | next_epoch = NULL; | 1192 | next_epoch = NULL; |
1001 | 1193 | ||
@@ -1017,18 +1209,22 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1017 | atomic_read(&epoch->active) == 0 && | 1209 | atomic_read(&epoch->active) == 0 && |
1018 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { | 1210 | (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { |
1019 | if (!(ev & EV_CLEANUP)) { | 1211 | if (!(ev & EV_CLEANUP)) { |
1020 | spin_unlock(&mdev->epoch_lock); | 1212 | spin_unlock(&tconn->epoch_lock); |
1021 | drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); | 1213 | drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size); |
1022 | spin_lock(&mdev->epoch_lock); | 1214 | spin_lock(&tconn->epoch_lock); |
1023 | } | 1215 | } |
1216 | #if 0 | ||
1217 | /* FIXME: dec unacked on connection, once we have | ||
1218 | * something to count pending connection packets in. */ | ||
1024 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) | 1219 | if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) |
1025 | dec_unacked(mdev); | 1220 | dec_unacked(epoch->tconn); |
1221 | #endif | ||
1026 | 1222 | ||
1027 | if (mdev->current_epoch != epoch) { | 1223 | if (tconn->current_epoch != epoch) { |
1028 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); | 1224 | next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); |
1029 | list_del(&epoch->list); | 1225 | list_del(&epoch->list); |
1030 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); | 1226 | ev = EV_BECAME_LAST | (ev & EV_CLEANUP); |
1031 | mdev->epochs--; | 1227 | tconn->epochs--; |
1032 | kfree(epoch); | 1228 | kfree(epoch); |
1033 | 1229 | ||
1034 | if (rv == FE_STILL_LIVE) | 1230 | if (rv == FE_STILL_LIVE) |
@@ -1039,7 +1235,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1039 | /* atomic_set(&epoch->active, 0); is already zero */ | 1235 | /* atomic_set(&epoch->active, 0); is already zero */ |
1040 | if (rv == FE_STILL_LIVE) | 1236 | if (rv == FE_STILL_LIVE) |
1041 | rv = FE_RECYCLED; | 1237 | rv = FE_RECYCLED; |
1042 | wake_up(&mdev->ee_wait); | ||
1043 | } | 1238 | } |
1044 | } | 1239 | } |
1045 | 1240 | ||
@@ -1049,40 +1244,52 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, | |||
1049 | epoch = next_epoch; | 1244 | epoch = next_epoch; |
1050 | } while (1); | 1245 | } while (1); |
1051 | 1246 | ||
1052 | spin_unlock(&mdev->epoch_lock); | 1247 | spin_unlock(&tconn->epoch_lock); |
1053 | 1248 | ||
1054 | return rv; | 1249 | return rv; |
1055 | } | 1250 | } |
1056 | 1251 | ||
1057 | /** | 1252 | /** |
1058 | * drbd_bump_write_ordering() - Fall back to an other write ordering method | 1253 | * drbd_bump_write_ordering() - Fall back to an other write ordering method |
1059 | * @mdev: DRBD device. | 1254 | * @tconn: DRBD connection. |
1060 | * @wo: Write ordering method to try. | 1255 | * @wo: Write ordering method to try. |
1061 | */ | 1256 | */ |
1062 | void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) | 1257 | void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo) |
1063 | { | 1258 | { |
1259 | struct disk_conf *dc; | ||
1260 | struct drbd_conf *mdev; | ||
1064 | enum write_ordering_e pwo; | 1261 | enum write_ordering_e pwo; |
1262 | int vnr; | ||
1065 | static char *write_ordering_str[] = { | 1263 | static char *write_ordering_str[] = { |
1066 | [WO_none] = "none", | 1264 | [WO_none] = "none", |
1067 | [WO_drain_io] = "drain", | 1265 | [WO_drain_io] = "drain", |
1068 | [WO_bdev_flush] = "flush", | 1266 | [WO_bdev_flush] = "flush", |
1069 | }; | 1267 | }; |
1070 | 1268 | ||
1071 | pwo = mdev->write_ordering; | 1269 | pwo = tconn->write_ordering; |
1072 | wo = min(pwo, wo); | 1270 | wo = min(pwo, wo); |
1073 | if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) | 1271 | rcu_read_lock(); |
1074 | wo = WO_drain_io; | 1272 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
1075 | if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) | 1273 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
1076 | wo = WO_none; | 1274 | continue; |
1077 | mdev->write_ordering = wo; | 1275 | dc = rcu_dereference(mdev->ldev->disk_conf); |
1078 | if (pwo != mdev->write_ordering || wo == WO_bdev_flush) | 1276 | |
1079 | dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); | 1277 | if (wo == WO_bdev_flush && !dc->disk_flushes) |
1278 | wo = WO_drain_io; | ||
1279 | if (wo == WO_drain_io && !dc->disk_drain) | ||
1280 | wo = WO_none; | ||
1281 | put_ldev(mdev); | ||
1282 | } | ||
1283 | rcu_read_unlock(); | ||
1284 | tconn->write_ordering = wo; | ||
1285 | if (pwo != tconn->write_ordering || wo == WO_bdev_flush) | ||
1286 | conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]); | ||
1080 | } | 1287 | } |
1081 | 1288 | ||
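drbd_bump_write_ordering() keeps its only-ever-downgrade rule, wo = min(pwo, wo) over the ordering flush > drain > none, but the per-disk switches now come from each volume's RCU-protected disk_conf (disk_flushes, disk_drain) rather than ldev->dc.no_disk_*. The ladder itself as a small standalone illustration; the names and the main() example are illustrative, not driver code.

/* Illustrative rendering of the write-ordering downgrade ladder.
 * The enum order mirrors the driver's WO_none < WO_drain_io < WO_bdev_flush. */
#include <stdbool.h>
#include <stdio.h>

enum write_ordering { WO_NONE, WO_DRAIN_IO, WO_BDEV_FLUSH };

static enum write_ordering
bump_write_ordering(enum write_ordering current_wo, enum write_ordering requested,
		    bool disk_flushes, bool disk_drain)
{
	enum write_ordering wo =
		requested < current_wo ? requested : current_wo; /* never upgrade */

	if (wo == WO_BDEV_FLUSH && !disk_flushes)
		wo = WO_DRAIN_IO;	/* flushes disabled: fall back to draining */
	if (wo == WO_DRAIN_IO && !disk_drain)
		wo = WO_NONE;		/* draining disabled too: no ordering */
	return wo;
}

int main(void)
{
	/* prints 1 (WO_DRAIN_IO): flush requested, but flushes are off here */
	printf("%d\n", bump_write_ordering(WO_BDEV_FLUSH, WO_BDEV_FLUSH, false, true));
	return 0;
}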
1082 | /** | 1289 | /** |
1083 | * drbd_submit_ee() | 1290 | * drbd_submit_peer_request() |
1084 | * @mdev: DRBD device. | 1291 | * @mdev: DRBD device. |
1085 | * @e: epoch entry | 1292 | * @peer_req: peer request |
1086 | * @rw: flag field, see bio->bi_rw | 1293 | * @rw: flag field, see bio->bi_rw |
1087 | * | 1294 | * |
1088 | * May spread the pages to multiple bios, | 1295 | * May spread the pages to multiple bios, |
@@ -1096,14 +1303,15 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1096 | * on certain Xen deployments. | 1303 | * on certain Xen deployments. |
1097 | */ | 1304 | */ |
1098 | /* TODO allocate from our own bio_set. */ | 1305 | /* TODO allocate from our own bio_set. */ |
1099 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1306 | int drbd_submit_peer_request(struct drbd_conf *mdev, |
1100 | const unsigned rw, const int fault_type) | 1307 | struct drbd_peer_request *peer_req, |
1308 | const unsigned rw, const int fault_type) | ||
1101 | { | 1309 | { |
1102 | struct bio *bios = NULL; | 1310 | struct bio *bios = NULL; |
1103 | struct bio *bio; | 1311 | struct bio *bio; |
1104 | struct page *page = e->pages; | 1312 | struct page *page = peer_req->pages; |
1105 | sector_t sector = e->sector; | 1313 | sector_t sector = peer_req->i.sector; |
1106 | unsigned ds = e->size; | 1314 | unsigned ds = peer_req->i.size; |
1107 | unsigned n_bios = 0; | 1315 | unsigned n_bios = 0; |
1108 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; | 1316 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; |
1109 | int err = -ENOMEM; | 1317 | int err = -ENOMEM; |
@@ -1122,12 +1330,12 @@ next_bio: | |||
1122 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); | 1330 | dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); |
1123 | goto fail; | 1331 | goto fail; |
1124 | } | 1332 | } |
1125 | /* > e->sector, unless this is the first bio */ | 1333 | /* > peer_req->i.sector, unless this is the first bio */ |
1126 | bio->bi_sector = sector; | 1334 | bio->bi_sector = sector; |
1127 | bio->bi_bdev = mdev->ldev->backing_bdev; | 1335 | bio->bi_bdev = mdev->ldev->backing_bdev; |
1128 | bio->bi_rw = rw; | 1336 | bio->bi_rw = rw; |
1129 | bio->bi_private = e; | 1337 | bio->bi_private = peer_req; |
1130 | bio->bi_end_io = drbd_endio_sec; | 1338 | bio->bi_end_io = drbd_peer_request_endio; |
1131 | 1339 | ||
1132 | bio->bi_next = bios; | 1340 | bio->bi_next = bios; |
1133 | bios = bio; | 1341 | bios = bio; |
@@ -1156,7 +1364,7 @@ next_bio: | |||
1156 | D_ASSERT(page == NULL); | 1364 | D_ASSERT(page == NULL); |
1157 | D_ASSERT(ds == 0); | 1365 | D_ASSERT(ds == 0); |
1158 | 1366 | ||
1159 | atomic_set(&e->pending_bios, n_bios); | 1367 | atomic_set(&peer_req->pending_bios, n_bios); |
1160 | do { | 1368 | do { |
1161 | bio = bios; | 1369 | bio = bios; |
1162 | bios = bios->bi_next; | 1370 | bios = bios->bi_next; |
@@ -1175,26 +1383,57 @@ fail: | |||
1175 | return err; | 1383 | return err; |
1176 | } | 1384 | } |
1177 | 1385 | ||
1178 | static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1386 | static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev, |
1387 | struct drbd_peer_request *peer_req) | ||
1388 | { | ||
1389 | struct drbd_interval *i = &peer_req->i; | ||
1390 | |||
1391 | drbd_remove_interval(&mdev->write_requests, i); | ||
1392 | drbd_clear_interval(i); | ||
1393 | |||
1394 | /* Wake up any processes waiting for this peer request to complete. */ | ||
1395 | if (i->waiting) | ||
1396 | wake_up(&mdev->misc_wait); | ||
1397 | } | ||
1398 | |||
1399 | void conn_wait_active_ee_empty(struct drbd_tconn *tconn) | ||
1400 | { | ||
1401 | struct drbd_conf *mdev; | ||
1402 | int vnr; | ||
1403 | |||
1404 | rcu_read_lock(); | ||
1405 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1406 | kref_get(&mdev->kref); | ||
1407 | rcu_read_unlock(); | ||
1408 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | ||
1409 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1410 | rcu_read_lock(); | ||
1411 | } | ||
1412 | rcu_read_unlock(); | ||
1413 | } | ||
1414 | |||
1415 | static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1179 | { | 1416 | { |
1180 | int rv; | 1417 | int rv; |
1181 | struct p_barrier *p = &mdev->data.rbuf.barrier; | 1418 | struct p_barrier *p = pi->data; |
1182 | struct drbd_epoch *epoch; | 1419 | struct drbd_epoch *epoch; |
1183 | 1420 | ||
1184 | inc_unacked(mdev); | 1421 | /* FIXME these are unacked on connection, |
1185 | 1422 | * not a specific (peer)device. | |
1186 | mdev->current_epoch->barrier_nr = p->barrier; | 1423 | */ |
1187 | rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); | 1424 | tconn->current_epoch->barrier_nr = p->barrier; |
1425 | tconn->current_epoch->tconn = tconn; | ||
1426 | rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR); | ||
1188 | 1427 | ||
1189 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from | 1428 | /* P_BARRIER_ACK may imply that the corresponding extent is dropped from |
1190 | * the activity log, which means it would not be resynced in case the | 1429 | * the activity log, which means it would not be resynced in case the |
1191 | * R_PRIMARY crashes now. | 1430 | * R_PRIMARY crashes now. |
1192 | * Therefore we must send the barrier_ack after the barrier request was | 1431 | * Therefore we must send the barrier_ack after the barrier request was |
1193 | * completed. */ | 1432 | * completed. */ |
1194 | switch (mdev->write_ordering) { | 1433 | switch (tconn->write_ordering) { |
1195 | case WO_none: | 1434 | case WO_none: |
1196 | if (rv == FE_RECYCLED) | 1435 | if (rv == FE_RECYCLED) |
1197 | return true; | 1436 | return 0; |
1198 | 1437 | ||
1199 | /* receiver context, in the writeout path of the other node. | 1438 | /* receiver context, in the writeout path of the other node. |
1200 | * avoid potential distributed deadlock */ | 1439 | * avoid potential distributed deadlock */ |
@@ -1202,81 +1441,75 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign | |||
1202 | if (epoch) | 1441 | if (epoch) |
1203 | break; | 1442 | break; |
1204 | else | 1443 | else |
1205 | dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); | 1444 | conn_warn(tconn, "Allocation of an epoch failed, slowing down\n"); |
1206 | /* Fall through */ | 1445 | /* Fall through */ |
1207 | 1446 | ||
1208 | case WO_bdev_flush: | 1447 | case WO_bdev_flush: |
1209 | case WO_drain_io: | 1448 | case WO_drain_io: |
1210 | drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 1449 | conn_wait_active_ee_empty(tconn); |
1211 | drbd_flush(mdev); | 1450 | drbd_flush(tconn); |
1212 | 1451 | ||
1213 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1452 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1214 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); | 1453 | epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); |
1215 | if (epoch) | 1454 | if (epoch) |
1216 | break; | 1455 | break; |
1217 | } | 1456 | } |
1218 | 1457 | ||
1219 | epoch = mdev->current_epoch; | 1458 | return 0; |
1220 | wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0); | ||
1221 | |||
1222 | D_ASSERT(atomic_read(&epoch->active) == 0); | ||
1223 | D_ASSERT(epoch->flags == 0); | ||
1224 | |||
1225 | return true; | ||
1226 | default: | 1459 | default: |
1227 | dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); | 1460 | conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering); |
1228 | return false; | 1461 | return -EIO; |
1229 | } | 1462 | } |
1230 | 1463 | ||
1231 | epoch->flags = 0; | 1464 | epoch->flags = 0; |
1232 | atomic_set(&epoch->epoch_size, 0); | 1465 | atomic_set(&epoch->epoch_size, 0); |
1233 | atomic_set(&epoch->active, 0); | 1466 | atomic_set(&epoch->active, 0); |
1234 | 1467 | ||
1235 | spin_lock(&mdev->epoch_lock); | 1468 | spin_lock(&tconn->epoch_lock); |
1236 | if (atomic_read(&mdev->current_epoch->epoch_size)) { | 1469 | if (atomic_read(&tconn->current_epoch->epoch_size)) { |
1237 | list_add(&epoch->list, &mdev->current_epoch->list); | 1470 | list_add(&epoch->list, &tconn->current_epoch->list); |
1238 | mdev->current_epoch = epoch; | 1471 | tconn->current_epoch = epoch; |
1239 | mdev->epochs++; | 1472 | tconn->epochs++; |
1240 | } else { | 1473 | } else { |
1241 | /* The current_epoch got recycled while we allocated this one... */ | 1474 | /* The current_epoch got recycled while we allocated this one... */ |
1242 | kfree(epoch); | 1475 | kfree(epoch); |
1243 | } | 1476 | } |
1244 | spin_unlock(&mdev->epoch_lock); | 1477 | spin_unlock(&tconn->epoch_lock); |
1245 | 1478 | ||
1246 | return true; | 1479 | return 0; |
1247 | } | 1480 | } |
1248 | 1481 | ||
1249 | /* used from receive_RSDataReply (recv_resync_read) | 1482 | /* used from receive_RSDataReply (recv_resync_read) |
1250 | * and from receive_Data */ | 1483 | * and from receive_Data */ |
1251 | static struct drbd_epoch_entry * | 1484 | static struct drbd_peer_request * |
1252 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) | 1485 | read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, |
1486 | int data_size) __must_hold(local) | ||
1253 | { | 1487 | { |
1254 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 1488 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
1255 | struct drbd_epoch_entry *e; | 1489 | struct drbd_peer_request *peer_req; |
1256 | struct page *page; | 1490 | struct page *page; |
1257 | int dgs, ds, rr; | 1491 | int dgs, ds, err; |
1258 | void *dig_in = mdev->int_dig_in; | 1492 | void *dig_in = mdev->tconn->int_dig_in; |
1259 | void *dig_vv = mdev->int_dig_vv; | 1493 | void *dig_vv = mdev->tconn->int_dig_vv; |
1260 | unsigned long *data; | 1494 | unsigned long *data; |
1261 | 1495 | ||
1262 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | 1496 | dgs = 0; |
1263 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | 1497 | if (mdev->tconn->peer_integrity_tfm) { |
1264 | 1498 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); | |
1265 | if (dgs) { | 1499 | /* |
1266 | rr = drbd_recv(mdev, dig_in, dgs); | 1500 | * FIXME: Receive the incoming digest into the receive buffer |
1267 | if (rr != dgs) { | 1501 | * here, together with its struct p_data? |
1268 | if (!signal_pending(current)) | 1502 | */ |
1269 | dev_warn(DEV, | 1503 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1270 | "short read receiving data digest: read %d expected %d\n", | 1504 | if (err) |
1271 | rr, dgs); | ||
1272 | return NULL; | 1505 | return NULL; |
1273 | } | 1506 | data_size -= dgs; |
1274 | } | 1507 | } |
1275 | 1508 | ||
1276 | data_size -= dgs; | 1509 | if (!expect(IS_ALIGNED(data_size, 512))) |
1277 | 1510 | return NULL; | |
1278 | ERR_IF(data_size & 0x1ff) return NULL; | 1511 | if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) |
1279 | ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; | 1512 | return NULL; |
1280 | 1513 | ||
1281 | /* even though we trust out peer, | 1514 | /* even though we trust out peer, |
1282 | * we sometimes have to double check. */ | 1515 | * we sometimes have to double check. */ |
@@ -1291,47 +1524,42 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1291 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 1524 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
1292 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 1525 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
1293 | * which in turn might block on the other node at this very place. */ | 1526 | * which in turn might block on the other node at this very place. */ |
1294 | e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); | 1527 | peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO); |
1295 | if (!e) | 1528 | if (!peer_req) |
1296 | return NULL; | 1529 | return NULL; |
1297 | 1530 | ||
1298 | if (!data_size) | 1531 | if (!data_size) |
1299 | return e; | 1532 | return peer_req; |
1300 | 1533 | ||
1301 | ds = data_size; | 1534 | ds = data_size; |
1302 | page = e->pages; | 1535 | page = peer_req->pages; |
1303 | page_chain_for_each(page) { | 1536 | page_chain_for_each(page) { |
1304 | unsigned len = min_t(int, ds, PAGE_SIZE); | 1537 | unsigned len = min_t(int, ds, PAGE_SIZE); |
1305 | data = kmap(page); | 1538 | data = kmap(page); |
1306 | rr = drbd_recv(mdev, data, len); | 1539 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1307 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { | 1540 | if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { |
1308 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); | 1541 | dev_err(DEV, "Fault injection: Corrupting data on receive\n"); |
1309 | data[0] = data[0] ^ (unsigned long)-1; | 1542 | data[0] = data[0] ^ (unsigned long)-1; |
1310 | } | 1543 | } |
1311 | kunmap(page); | 1544 | kunmap(page); |
1312 | if (rr != len) { | 1545 | if (err) { |
1313 | drbd_free_ee(mdev, e); | 1546 | drbd_free_peer_req(mdev, peer_req); |
1314 | if (!signal_pending(current)) | ||
1315 | dev_warn(DEV, "short read receiving data: read %d expected %d\n", | ||
1316 | rr, len); | ||
1317 | return NULL; | 1547 | return NULL; |
1318 | } | 1548 | } |
1319 | ds -= rr; | 1549 | ds -= len; |
1320 | } | 1550 | } |
1321 | 1551 | ||
1322 | if (dgs) { | 1552 | if (dgs) { |
1323 | drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); | 1553 | drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv); |
1324 | if (memcmp(dig_in, dig_vv, dgs)) { | 1554 | if (memcmp(dig_in, dig_vv, dgs)) { |
1325 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", | 1555 | dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", |
1326 | (unsigned long long)sector, data_size); | 1556 | (unsigned long long)sector, data_size); |
1327 | drbd_bcast_ee(mdev, "digest failed", | 1557 | drbd_free_peer_req(mdev, peer_req); |
1328 | dgs, dig_in, dig_vv, e); | ||
1329 | drbd_free_ee(mdev, e); | ||
1330 | return NULL; | 1558 | return NULL; |
1331 | } | 1559 | } |
1332 | } | 1560 | } |
1333 | mdev->recv_cnt += data_size>>9; | 1561 | mdev->recv_cnt += data_size>>9; |
1334 | return e; | 1562 | return peer_req; |
1335 | } | 1563 | } |
1336 | 1564 | ||
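The hand-rolled short-read bookkeeping ("short read ... read %d expected %d") disappears from this hunk: read_in_block(), drbd_drain_block() and recv_dless_read() now call drbd_recv_all_warn(), which returns 0 only once the full requested length has arrived and a negative error otherwise. A userspace analogue of that contract, for illustration; the kernel helper additionally logs and treats pending signals differently.

/* Illustrative "receive exactly len bytes or fail" loop. */
#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

static int recv_all(int fd, void *buf, size_t len)
{
	char *p = buf;

	while (len) {
		ssize_t rr = recv(fd, p, len, 0);

		if (rr == 0)
			return -ECONNRESET;	/* peer closed mid-packet */
		if (rr < 0) {
			if (errno == EINTR)
				continue;	/* retry after a signal */
			return -errno;
		}
		p += rr;
		len -= rr;
	}
	return 0;				/* got everything */
}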
1337 | /* drbd_drain_block() just takes a data block | 1565 | /* drbd_drain_block() just takes a data block |
@@ -1340,30 +1568,26 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ | |||
1340 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) | 1568 | static int drbd_drain_block(struct drbd_conf *mdev, int data_size) |
1341 | { | 1569 | { |
1342 | struct page *page; | 1570 | struct page *page; |
1343 | int rr, rv = 1; | 1571 | int err = 0; |
1344 | void *data; | 1572 | void *data; |
1345 | 1573 | ||
1346 | if (!data_size) | 1574 | if (!data_size) |
1347 | return true; | 1575 | return 0; |
1348 | 1576 | ||
1349 | page = drbd_pp_alloc(mdev, 1, 1); | 1577 | page = drbd_alloc_pages(mdev, 1, 1); |
1350 | 1578 | ||
1351 | data = kmap(page); | 1579 | data = kmap(page); |
1352 | while (data_size) { | 1580 | while (data_size) { |
1353 | rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); | 1581 | unsigned int len = min_t(int, data_size, PAGE_SIZE); |
1354 | if (rr != min_t(int, data_size, PAGE_SIZE)) { | 1582 | |
1355 | rv = 0; | 1583 | err = drbd_recv_all_warn(mdev->tconn, data, len); |
1356 | if (!signal_pending(current)) | 1584 | if (err) |
1357 | dev_warn(DEV, | ||
1358 | "short read receiving data: read %d expected %d\n", | ||
1359 | rr, min_t(int, data_size, PAGE_SIZE)); | ||
1360 | break; | 1585 | break; |
1361 | } | 1586 | data_size -= len; |
1362 | data_size -= rr; | ||
1363 | } | 1587 | } |
1364 | kunmap(page); | 1588 | kunmap(page); |
1365 | drbd_pp_free(mdev, page, 0); | 1589 | drbd_free_pages(mdev, page, 0); |
1366 | return rv; | 1590 | return err; |
1367 | } | 1591 | } |
1368 | 1592 | ||
1369 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | 1593 | static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, |
@@ -1371,26 +1595,19 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1371 | { | 1595 | { |
1372 | struct bio_vec *bvec; | 1596 | struct bio_vec *bvec; |
1373 | struct bio *bio; | 1597 | struct bio *bio; |
1374 | int dgs, rr, i, expect; | 1598 | int dgs, err, i, expect; |
1375 | void *dig_in = mdev->int_dig_in; | 1599 | void *dig_in = mdev->tconn->int_dig_in; |
1376 | void *dig_vv = mdev->int_dig_vv; | 1600 | void *dig_vv = mdev->tconn->int_dig_vv; |
1377 | |||
1378 | dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? | ||
1379 | crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; | ||
1380 | 1601 | ||
1381 | if (dgs) { | 1602 | dgs = 0; |
1382 | rr = drbd_recv(mdev, dig_in, dgs); | 1603 | if (mdev->tconn->peer_integrity_tfm) { |
1383 | if (rr != dgs) { | 1604 | dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); |
1384 | if (!signal_pending(current)) | 1605 | err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); |
1385 | dev_warn(DEV, | 1606 | if (err) |
1386 | "short read receiving data reply digest: read %d expected %d\n", | 1607 | return err; |
1387 | rr, dgs); | 1608 | data_size -= dgs; |
1388 | return 0; | ||
1389 | } | ||
1390 | } | 1609 | } |
1391 | 1610 | ||
1392 | data_size -= dgs; | ||
1393 | |||
1394 | /* optimistically update recv_cnt. if receiving fails below, | 1611 | /* optimistically update recv_cnt. if receiving fails below, |
1395 | * we disconnect anyways, and counters will be reset. */ | 1612 | * we disconnect anyways, and counters will be reset. */ |
1396 | mdev->recv_cnt += data_size>>9; | 1613 | mdev->recv_cnt += data_size>>9; |
@@ -1399,63 +1616,61 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, | |||
1399 | D_ASSERT(sector == bio->bi_sector); | 1616 | D_ASSERT(sector == bio->bi_sector); |
1400 | 1617 | ||
1401 | bio_for_each_segment(bvec, bio, i) { | 1618 | bio_for_each_segment(bvec, bio, i) { |
1619 | void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; | ||
1402 | expect = min_t(int, data_size, bvec->bv_len); | 1620 | expect = min_t(int, data_size, bvec->bv_len); |
1403 | rr = drbd_recv(mdev, | 1621 | err = drbd_recv_all_warn(mdev->tconn, mapped, expect); |
1404 | kmap(bvec->bv_page)+bvec->bv_offset, | ||
1405 | expect); | ||
1406 | kunmap(bvec->bv_page); | 1622 | kunmap(bvec->bv_page); |
1407 | if (rr != expect) { | 1623 | if (err) |
1408 | if (!signal_pending(current)) | 1624 | return err; |
1409 | dev_warn(DEV, "short read receiving data reply: " | 1625 | data_size -= expect; |
1410 | "read %d expected %d\n", | ||
1411 | rr, expect); | ||
1412 | return 0; | ||
1413 | } | ||
1414 | data_size -= rr; | ||
1415 | } | 1626 | } |
1416 | 1627 | ||
1417 | if (dgs) { | 1628 | if (dgs) { |
1418 | drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); | 1629 | drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv); |
1419 | if (memcmp(dig_in, dig_vv, dgs)) { | 1630 | if (memcmp(dig_in, dig_vv, dgs)) { |
1420 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); | 1631 | dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); |
1421 | return 0; | 1632 | return -EINVAL; |
1422 | } | 1633 | } |
1423 | } | 1634 | } |
1424 | 1635 | ||
1425 | D_ASSERT(data_size == 0); | 1636 | D_ASSERT(data_size == 0); |
1426 | return 1; | 1637 | return 0; |
1427 | } | 1638 | } |
1428 | 1639 | ||
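With a peer data-integrity transform configured, both read_in_block() and recv_dless_read() follow the same order: receive the digest, shrink data_size by its length, receive the payload, recompute the digest over what actually landed in memory, and memcmp() the two, treating a mismatch as a protocol error. A stripped-down userspace illustration of that receive-then-verify order; xor_checksum() is a toy stand-in for the configured crypto hash and the layout of "wire" is an assumption of this sketch.

/* Illustrative "digest first, payload second, verify after the copy". */
#include <stdint.h>
#include <string.h>

static uint32_t xor_checksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	while (len--)
		sum = (sum << 1 | sum >> 31) ^ *buf++;
	return sum;
}

/* 'wire' holds a 4-byte digest followed by payload_len payload bytes */
static int receive_verified(const uint8_t *wire, size_t payload_len,
			    uint8_t *dst)
{
	uint32_t dig_in, dig_vv;

	memcpy(&dig_in, wire, sizeof(dig_in));		 /* digest travels first */
	memcpy(dst, wire + sizeof(dig_in), payload_len); /* then the data */

	dig_vv = xor_checksum(dst, payload_len);	 /* recompute over the copy */
	return memcmp(&dig_in, &dig_vv, sizeof(dig_in)) ? -1 : 0;
}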
1429 | /* e_end_resync_block() is called via | 1640 | /* |
1430 | * drbd_process_done_ee() by asender only */ | 1641 | * e_end_resync_block() is called in asender context via |
1431 | static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1642 | * drbd_finish_peer_reqs(). |
1643 | */ | ||
1644 | static int e_end_resync_block(struct drbd_work *w, int unused) | ||
1432 | { | 1645 | { |
1433 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1646 | struct drbd_peer_request *peer_req = |
1434 | sector_t sector = e->sector; | 1647 | container_of(w, struct drbd_peer_request, w); |
1435 | int ok; | 1648 | struct drbd_conf *mdev = w->mdev; |
1649 | sector_t sector = peer_req->i.sector; | ||
1650 | int err; | ||
1436 | 1651 | ||
1437 | D_ASSERT(hlist_unhashed(&e->collision)); | 1652 | D_ASSERT(drbd_interval_empty(&peer_req->i)); |
1438 | 1653 | ||
1439 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1654 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1440 | drbd_set_in_sync(mdev, sector, e->size); | 1655 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1441 | ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); | 1656 | err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req); |
1442 | } else { | 1657 | } else { |
1443 | /* Record failure to sync */ | 1658 | /* Record failure to sync */ |
1444 | drbd_rs_failed_io(mdev, sector, e->size); | 1659 | drbd_rs_failed_io(mdev, sector, peer_req->i.size); |
1445 | 1660 | ||
1446 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1661 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1447 | } | 1662 | } |
1448 | dec_unacked(mdev); | 1663 | dec_unacked(mdev); |
1449 | 1664 | ||
1450 | return ok; | 1665 | return err; |
1451 | } | 1666 | } |
1452 | 1667 | ||
1453 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) | 1668 | static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) |
1454 | { | 1669 | { |
1455 | struct drbd_epoch_entry *e; | 1670 | struct drbd_peer_request *peer_req; |
1456 | 1671 | ||
1457 | e = read_in_block(mdev, ID_SYNCER, sector, data_size); | 1672 | peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size); |
1458 | if (!e) | 1673 | if (!peer_req) |
1459 | goto fail; | 1674 | goto fail; |
1460 | 1675 | ||
1461 | dec_rs_pending(mdev); | 1676 | dec_rs_pending(mdev); |
@@ -1464,64 +1679,88 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si | |||
1464 | /* corresponding dec_unacked() in e_end_resync_block() | 1679 | /* corresponding dec_unacked() in e_end_resync_block() |
1465 | * respective _drbd_clear_done_ee */ | 1680 | * respective _drbd_clear_done_ee */ |
1466 | 1681 | ||
1467 | e->w.cb = e_end_resync_block; | 1682 | peer_req->w.cb = e_end_resync_block; |
1468 | 1683 | ||
1469 | spin_lock_irq(&mdev->req_lock); | 1684 | spin_lock_irq(&mdev->tconn->req_lock); |
1470 | list_add(&e->w.list, &mdev->sync_ee); | 1685 | list_add(&peer_req->w.list, &mdev->sync_ee); |
1471 | spin_unlock_irq(&mdev->req_lock); | 1686 | spin_unlock_irq(&mdev->tconn->req_lock); |
1472 | 1687 | ||
1473 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); | 1688 | atomic_add(data_size >> 9, &mdev->rs_sect_ev); |
1474 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) | 1689 | if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) |
1475 | return true; | 1690 | return 0; |
1476 | 1691 | ||
1477 | /* don't care for the reason here */ | 1692 | /* don't care for the reason here */ |
1478 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 1693 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1479 | spin_lock_irq(&mdev->req_lock); | 1694 | spin_lock_irq(&mdev->tconn->req_lock); |
1480 | list_del(&e->w.list); | 1695 | list_del(&peer_req->w.list); |
1481 | spin_unlock_irq(&mdev->req_lock); | 1696 | spin_unlock_irq(&mdev->tconn->req_lock); |
1482 | 1697 | ||
1483 | drbd_free_ee(mdev, e); | 1698 | drbd_free_peer_req(mdev, peer_req); |
1484 | fail: | 1699 | fail: |
1485 | put_ldev(mdev); | 1700 | put_ldev(mdev); |
1486 | return false; | 1701 | return -EIO; |
1487 | } | 1702 | } |
1488 | 1703 | ||
1489 | static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1704 | static struct drbd_request * |
1705 | find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id, | ||
1706 | sector_t sector, bool missing_ok, const char *func) | ||
1490 | { | 1707 | { |
1491 | struct drbd_request *req; | 1708 | struct drbd_request *req; |
1709 | |||
1710 | /* Request object according to our peer */ | ||
1711 | req = (struct drbd_request *)(unsigned long)id; | ||
1712 | if (drbd_contains_interval(root, sector, &req->i) && req->i.local) | ||
1713 | return req; | ||
1714 | if (!missing_ok) { | ||
1715 | dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func, | ||
1716 | (unsigned long)id, (unsigned long long)sector); | ||
1717 | } | ||
1718 | return NULL; | ||
1719 | } | ||
1720 | |||
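find_request() resolves a peer-supplied block_id back into a local request: the 64-bit id on the wire is the request pointer itself, and before it is trusted the code checks that the tree of outstanding requests really contains that request at the given sector (drbd_contains_interval), so a stale or corrupt id is rejected instead of dereferenced. A userspace illustration of that validate-the-cookie-first idea, using a flat table where the driver uses its interval tree; all names here are illustrative.

/* Illustrative only: treat an on-the-wire cookie as a pointer only after
 * confirming it matches a tracked outstanding request at that sector. */
#include <stddef.h>
#include <stdint.h>

struct request { uint64_t sector; /* ... */ };

struct outstanding {
	struct request *req[16];
	size_t n;
};

static struct request *lookup_request(struct outstanding *o,
				      uint64_t cookie, uint64_t sector)
{
	struct request *candidate = (struct request *)(uintptr_t)cookie;

	for (size_t i = 0; i < o->n; i++)
		if (o->req[i] == candidate && o->req[i]->sector == sector)
			return candidate;	/* known and at the right place */
	return NULL;				/* stale or corrupt id: reject */
}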
1721 | static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi) | ||
1722 | { | ||
1723 | struct drbd_conf *mdev; | ||
1724 | struct drbd_request *req; | ||
1492 | sector_t sector; | 1725 | sector_t sector; |
1493 | int ok; | 1726 | int err; |
1494 | struct p_data *p = &mdev->data.rbuf.data; | 1727 | struct p_data *p = pi->data; |
1728 | |||
1729 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1730 | if (!mdev) | ||
1731 | return -EIO; | ||
1495 | 1732 | ||
1496 | sector = be64_to_cpu(p->sector); | 1733 | sector = be64_to_cpu(p->sector); |
1497 | 1734 | ||
1498 | spin_lock_irq(&mdev->req_lock); | 1735 | spin_lock_irq(&mdev->tconn->req_lock); |
1499 | req = _ar_id_to_req(mdev, p->block_id, sector); | 1736 | req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__); |
1500 | spin_unlock_irq(&mdev->req_lock); | 1737 | spin_unlock_irq(&mdev->tconn->req_lock); |
1501 | if (unlikely(!req)) { | 1738 | if (unlikely(!req)) |
1502 | dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); | 1739 | return -EIO; |
1503 | return false; | ||
1504 | } | ||
1505 | 1740 | ||
1506 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid | 1741 | /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid |
1507 | * special casing it there for the various failure cases. | 1742 | * special casing it there for the various failure cases. |
1508 | * still no race with drbd_fail_pending_reads */ | 1743 | * still no race with drbd_fail_pending_reads */ |
1509 | ok = recv_dless_read(mdev, req, sector, data_size); | 1744 | err = recv_dless_read(mdev, req, sector, pi->size); |
1510 | 1745 | if (!err) | |
1511 | if (ok) | 1746 | req_mod(req, DATA_RECEIVED); |
1512 | req_mod(req, data_received); | ||
1513 | /* else: nothing. handled from drbd_disconnect... | 1747 | /* else: nothing. handled from drbd_disconnect... |
1514 | * I don't think we may complete this just yet | 1748 | * I don't think we may complete this just yet |
1515 | * in case we are "on-disconnect: freeze" */ | 1749 | * in case we are "on-disconnect: freeze" */ |
1516 | 1750 | ||
1517 | return ok; | 1751 | return err; |
1518 | } | 1752 | } |
1519 | 1753 | ||
1520 | static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1754 | static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) |
1521 | { | 1755 | { |
1756 | struct drbd_conf *mdev; | ||
1522 | sector_t sector; | 1757 | sector_t sector; |
1523 | int ok; | 1758 | int err; |
1524 | struct p_data *p = &mdev->data.rbuf.data; | 1759 | struct p_data *p = pi->data; |
1760 | |||
1761 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
1762 | if (!mdev) | ||
1763 | return -EIO; | ||
1525 | 1764 | ||
1526 | sector = be64_to_cpu(p->sector); | 1765 | sector = be64_to_cpu(p->sector); |
1527 | D_ASSERT(p->block_id == ID_SYNCER); | 1766 | D_ASSERT(p->block_id == ID_SYNCER); |
@@ -1529,42 +1768,63 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
1529 | if (get_ldev(mdev)) { | 1768 | if (get_ldev(mdev)) { |
1530 | /* data is submitted to disk within recv_resync_read. | 1769 | /* data is submitted to disk within recv_resync_read. |
1531 | * corresponding put_ldev done below on error, | 1770 | * corresponding put_ldev done below on error, |
1532 | * or in drbd_endio_write_sec. */ | 1771 | * or in drbd_peer_request_endio. */ |
1533 | ok = recv_resync_read(mdev, sector, data_size); | 1772 | err = recv_resync_read(mdev, sector, pi->size); |
1534 | } else { | 1773 | } else { |
1535 | if (__ratelimit(&drbd_ratelimit_state)) | 1774 | if (__ratelimit(&drbd_ratelimit_state)) |
1536 | dev_err(DEV, "Can not write resync data to local disk.\n"); | 1775 | dev_err(DEV, "Can not write resync data to local disk.\n"); |
1537 | 1776 | ||
1538 | ok = drbd_drain_block(mdev, data_size); | 1777 | err = drbd_drain_block(mdev, pi->size); |
1539 | 1778 | ||
1540 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 1779 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); |
1541 | } | 1780 | } |
1542 | 1781 | ||
1543 | atomic_add(data_size >> 9, &mdev->rs_sect_in); | 1782 | atomic_add(pi->size >> 9, &mdev->rs_sect_in); |
1544 | 1783 | ||
1545 | return ok; | 1784 | return err; |
1546 | } | 1785 | } |
1547 | 1786 | ||
1548 | /* e_end_block() is called via drbd_process_done_ee(). | 1787 | static void restart_conflicting_writes(struct drbd_conf *mdev, |
1549 | * this means this function only runs in the asender thread | 1788 | sector_t sector, int size) |
1550 | */ | ||
1551 | static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1552 | { | 1789 | { |
1553 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1790 | struct drbd_interval *i; |
1554 | sector_t sector = e->sector; | 1791 | struct drbd_request *req; |
1555 | int ok = 1, pcmd; | ||
1556 | 1792 | ||
1557 | if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { | 1793 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { |
1558 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1794 | if (!i->local) |
1795 | continue; | ||
1796 | req = container_of(i, struct drbd_request, i); | ||
1797 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
1798 | !(req->rq_state & RQ_POSTPONED)) | ||
1799 | continue; | ||
1800 | /* as it is RQ_POSTPONED, this will cause it to | ||
1801 | * be queued on the retry workqueue. */ | ||
1802 | __req_mod(req, CONFLICT_RESOLVED, NULL); | ||
1803 | } | ||
1804 | } | ||
1805 | |||
1806 | /* | ||
1807 | * e_end_block() is called in asender context via drbd_finish_peer_reqs(). | ||
1808 | */ | ||
1809 | static int e_end_block(struct drbd_work *w, int cancel) | ||
1810 | { | ||
1811 | struct drbd_peer_request *peer_req = | ||
1812 | container_of(w, struct drbd_peer_request, w); | ||
1813 | struct drbd_conf *mdev = w->mdev; | ||
1814 | sector_t sector = peer_req->i.sector; | ||
1815 | int err = 0, pcmd; | ||
1816 | |||
1817 | if (peer_req->flags & EE_SEND_WRITE_ACK) { | ||
1818 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { | ||
1559 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && | 1819 | pcmd = (mdev->state.conn >= C_SYNC_SOURCE && |
1560 | mdev->state.conn <= C_PAUSED_SYNC_T && | 1820 | mdev->state.conn <= C_PAUSED_SYNC_T && |
1561 | e->flags & EE_MAY_SET_IN_SYNC) ? | 1821 | peer_req->flags & EE_MAY_SET_IN_SYNC) ? |
1562 | P_RS_WRITE_ACK : P_WRITE_ACK; | 1822 | P_RS_WRITE_ACK : P_WRITE_ACK; |
1563 | ok &= drbd_send_ack(mdev, pcmd, e); | 1823 | err = drbd_send_ack(mdev, pcmd, peer_req); |
1564 | if (pcmd == P_RS_WRITE_ACK) | 1824 | if (pcmd == P_RS_WRITE_ACK) |
1565 | drbd_set_in_sync(mdev, sector, e->size); | 1825 | drbd_set_in_sync(mdev, sector, peer_req->i.size); |
1566 | } else { | 1826 | } else { |
1567 | ok = drbd_send_ack(mdev, P_NEG_ACK, e); | 1827 | err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); |
1568 | /* we expect it to be marked out of sync anyways... | 1828 | /* we expect it to be marked out of sync anyways... |
1569 | * maybe assert this? */ | 1829 | * maybe assert this? */ |
1570 | } | 1830 | } |
@@ -1572,52 +1832,115 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1572 | } | 1832 | } |
1573 | /* we delete from the conflict detection hash _after_ we sent out the | 1833 | /* we delete from the conflict detection hash _after_ we sent out the |
1574 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ | 1834 | * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ |
1575 | if (mdev->net_conf->two_primaries) { | 1835 | if (peer_req->flags & EE_IN_INTERVAL_TREE) { |
1576 | spin_lock_irq(&mdev->req_lock); | 1836 | spin_lock_irq(&mdev->tconn->req_lock); |
1577 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1837 | D_ASSERT(!drbd_interval_empty(&peer_req->i)); |
1578 | hlist_del_init(&e->collision); | 1838 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1579 | spin_unlock_irq(&mdev->req_lock); | 1839 | if (peer_req->flags & EE_RESTART_REQUESTS) |
1580 | } else { | 1840 | restart_conflicting_writes(mdev, sector, peer_req->i.size); |
1581 | D_ASSERT(hlist_unhashed(&e->collision)); | 1841 | spin_unlock_irq(&mdev->tconn->req_lock); |
1582 | } | 1842 | } else |
1843 | D_ASSERT(drbd_interval_empty(&peer_req->i)); | ||
1583 | 1844 | ||
1584 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); | 1845 | drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); |
1585 | 1846 | ||
1586 | return ok; | 1847 | return err; |
1587 | } | 1848 | } |
1588 | 1849 | ||
1589 | static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) | 1850 | static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) |
1590 | { | 1851 | { |
1591 | struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; | 1852 | struct drbd_conf *mdev = w->mdev; |
1592 | int ok = 1; | 1853 | struct drbd_peer_request *peer_req = |
1854 | container_of(w, struct drbd_peer_request, w); | ||
1855 | int err; | ||
1593 | 1856 | ||
1594 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 1857 | err = drbd_send_ack(mdev, ack, peer_req); |
1595 | ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); | 1858 | dec_unacked(mdev); |
1596 | 1859 | ||
1597 | spin_lock_irq(&mdev->req_lock); | 1860 | return err; |
1598 | D_ASSERT(!hlist_unhashed(&e->collision)); | 1861 | } |
1599 | hlist_del_init(&e->collision); | ||
1600 | spin_unlock_irq(&mdev->req_lock); | ||
1601 | 1862 | ||
1602 | dec_unacked(mdev); | 1863 | static int e_send_superseded(struct drbd_work *w, int unused) |
1864 | { | ||
1865 | return e_send_ack(w, P_SUPERSEDED); | ||
1866 | } | ||
1867 | |||
1868 | static int e_send_retry_write(struct drbd_work *w, int unused) | ||
1869 | { | ||
1870 | struct drbd_tconn *tconn = w->mdev->tconn; | ||
1871 | |||
1872 | return e_send_ack(w, tconn->agreed_pro_version >= 100 ? | ||
1873 | P_RETRY_WRITE : P_SUPERSEDED); | ||
1874 | } | ||
1875 | |||
1876 | static bool seq_greater(u32 a, u32 b) | ||
1877 | { | ||
1878 | /* | ||
1879 | * We assume 32-bit wrap-around here. | ||
1880 | * For 24-bit wrap-around, we would have to shift: | ||
1881 | * a <<= 8; b <<= 8; | ||
1882 | */ | ||
1883 | return (s32)a - (s32)b > 0; | ||
1884 | } | ||
1885 | |||
1886 | static u32 seq_max(u32 a, u32 b) | ||
1887 | { | ||
1888 | return seq_greater(a, b) ? a : b; | ||
1889 | } | ||
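The comment above spells out the wrap-around assumption behind seq_greater(). As a sanity check, here is a small standalone program (my own sketch, not drbd code) exercising the same signed-difference trick; it uses (int32_t)(a - b), which gives the same answer as the kernel's (s32)a - (s32)b under its build flags but without relying on signed-overflow wrapping:

#include <stdio.h>
#include <stdint.h>

/* Wrap-aware comparison: a is "newer" than b iff the signed
 * difference is positive.  Valid as long as the two values are
 * less than 2^31 apart, which a monotonically advancing sequence
 * counter guarantees in practice. */
static int seq_greater(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

static uint32_t seq_max(uint32_t a, uint32_t b)
{
	return seq_greater(a, b) ? a : b;
}

int main(void)
{
	/* 5 is newer than 0xfffffffd even though it is numerically
	 * smaller, because the counter wrapped in between. */
	printf("%d\n", seq_greater(5u, 0xfffffffdu));	/* 1 */
	printf("%d\n", seq_greater(0xfffffffdu, 5u));	/* 0 */
	printf("%u\n", seq_max(5u, 0xfffffffdu));	/* 5 */
	return 0;
}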
1890 | |||
1891 | static bool need_peer_seq(struct drbd_conf *mdev) | ||
1892 | { | ||
1893 | struct drbd_tconn *tconn = mdev->tconn; | ||
1894 | int tp; | ||
1603 | 1895 | ||
1604 | return ok; | 1896 | /* |
1897 | * We only need to keep track of the last packet_seq number of our peer | ||
1898 | * if we are in dual-primary mode and we have the resolve-conflicts flag set; see | ||
1899 | * handle_write_conflicts(). | ||
1900 | */ | ||
1901 | |||
1902 | rcu_read_lock(); | ||
1903 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; | ||
1904 | rcu_read_unlock(); | ||
1905 | |||
1906 | return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
1605 | } | 1907 | } |
1606 | 1908 | ||
1607 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) | 1909 | static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) |
1608 | { | 1910 | { |
1911 | unsigned int newest_peer_seq; | ||
1609 | 1912 | ||
1610 | struct drbd_epoch_entry *rs_e; | 1913 | if (need_peer_seq(mdev)) { |
1914 | spin_lock(&mdev->peer_seq_lock); | ||
1915 | newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); | ||
1916 | mdev->peer_seq = newest_peer_seq; | ||
1917 | spin_unlock(&mdev->peer_seq_lock); | ||
1918 | /* wake up only if we actually changed mdev->peer_seq */ | ||
1919 | if (peer_seq == newest_peer_seq) | ||
1920 | wake_up(&mdev->seq_wait); | ||
1921 | } | ||
1922 | } | ||
1923 | |||
1924 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
1925 | { | ||
1926 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
1927 | } | ||
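overlaps() takes sector offsets plus byte lengths and shifts the lengths by 9 to convert bytes into 512-byte sectors before the usual half-open interval test. A self-contained rendition of the same predicate with a few asserts (names and test values are mine):

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Two half-open ranges [s, s + len>>9) overlap unless one ends at or
 * before the start of the other.  Lengths are in bytes, sectors are
 * 512 bytes, hence the >> 9. */
static int ranges_overlap(sector_t s1, int l1, sector_t s2, int l2)
{
	return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

int main(void)
{
	assert(ranges_overlap(0, 4096, 7, 512));	/* [0,8) vs [7,8)  -> yes */
	assert(!ranges_overlap(0, 4096, 8, 512));	/* [0,8) vs [8,9)  -> no  */
	assert(ranges_overlap(16, 512, 0, 16384));	/* [16,17) inside [0,32)  */
	return 0;
}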
1928 | |||
1929 | /* maybe change sync_ee into interval trees as well? */ | ||
1930 | static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) | ||
1931 | { | ||
1932 | struct drbd_peer_request *rs_req; | ||
1611 | bool rv = 0; | 1933 | bool rv = 0; |
1612 | 1934 | ||
1613 | spin_lock_irq(&mdev->req_lock); | 1935 | spin_lock_irq(&mdev->tconn->req_lock); |
1614 | list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { | 1936 | list_for_each_entry(rs_req, &mdev->sync_ee, w.list) { |
1615 | if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { | 1937 | if (overlaps(peer_req->i.sector, peer_req->i.size, |
1938 | rs_req->i.sector, rs_req->i.size)) { | ||
1616 | rv = 1; | 1939 | rv = 1; |
1617 | break; | 1940 | break; |
1618 | } | 1941 | } |
1619 | } | 1942 | } |
1620 | spin_unlock_irq(&mdev->req_lock); | 1943 | spin_unlock_irq(&mdev->tconn->req_lock); |
1621 | 1944 | ||
1622 | return rv; | 1945 | return rv; |
1623 | } | 1946 | } |
@@ -1643,35 +1966,41 @@ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_e | |||
1643 | * | 1966 | * |
1644 | * returns 0 if we may process the packet, | 1967 | * returns 0 if we may process the packet, |
1645 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ | 1968 | * -ERESTARTSYS if we were interrupted (by disconnect signal). */ |
1646 | static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) | 1969 | static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq) |
1647 | { | 1970 | { |
1648 | DEFINE_WAIT(wait); | 1971 | DEFINE_WAIT(wait); |
1649 | unsigned int p_seq; | ||
1650 | long timeout; | 1972 | long timeout; |
1651 | int ret = 0; | 1973 | int ret; |
1974 | |||
1975 | if (!need_peer_seq(mdev)) | ||
1976 | return 0; | ||
1977 | |||
1652 | spin_lock(&mdev->peer_seq_lock); | 1978 | spin_lock(&mdev->peer_seq_lock); |
1653 | for (;;) { | 1979 | for (;;) { |
1654 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | 1980 | if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { |
1655 | if (seq_le(packet_seq, mdev->peer_seq+1)) | 1981 | mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); |
1982 | ret = 0; | ||
1656 | break; | 1983 | break; |
1984 | } | ||
1657 | if (signal_pending(current)) { | 1985 | if (signal_pending(current)) { |
1658 | ret = -ERESTARTSYS; | 1986 | ret = -ERESTARTSYS; |
1659 | break; | 1987 | break; |
1660 | } | 1988 | } |
1661 | p_seq = mdev->peer_seq; | 1989 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); |
1662 | spin_unlock(&mdev->peer_seq_lock); | 1990 | spin_unlock(&mdev->peer_seq_lock); |
1663 | timeout = schedule_timeout(30*HZ); | 1991 | rcu_read_lock(); |
1992 | timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10; | ||
1993 | rcu_read_unlock(); | ||
1994 | timeout = schedule_timeout(timeout); | ||
1664 | spin_lock(&mdev->peer_seq_lock); | 1995 | spin_lock(&mdev->peer_seq_lock); |
1665 | if (timeout == 0 && p_seq == mdev->peer_seq) { | 1996 | if (!timeout) { |
1666 | ret = -ETIMEDOUT; | 1997 | ret = -ETIMEDOUT; |
1667 | dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); | 1998 | dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n"); |
1668 | break; | 1999 | break; |
1669 | } | 2000 | } |
1670 | } | 2001 | } |
1671 | finish_wait(&mdev->seq_wait, &wait); | ||
1672 | if (mdev->peer_seq+1 == packet_seq) | ||
1673 | mdev->peer_seq++; | ||
1674 | spin_unlock(&mdev->peer_seq_lock); | 2002 | spin_unlock(&mdev->peer_seq_lock); |
2003 | finish_wait(&mdev->seq_wait, &wait); | ||
1675 | return ret; | 2004 | return ret; |
1676 | } | 2005 | } |
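wait_for_and_update_peer_seq() is the receiver-side half of the peer-sequence protocol: sleep until peer_seq - 1 is no longer ahead of the newest sequence number we have seen, then fold peer_seq in. Outside the kernel the same loop maps naturally onto a condition variable with an absolute timeout; the sketch below is a simplified userspace analogue (pthreads, invented names, signal handling omitted; build with -pthread), not the drbd implementation:

#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <time.h>

struct peer_seq_state {
	pthread_mutex_t lock;
	pthread_cond_t  wake;
	uint32_t        peer_seq;	/* highest sequence seen so far */
};

static int seq_greater(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

/* Wait until all packets numbered below 'peer_seq' have been accounted
 * for, i.e. until peer_seq - 1 is no longer ahead of what we have seen.
 * Returns 0 on success, -ETIMEDOUT if the peer stayed silent too long. */
static int wait_for_peer_seq(struct peer_seq_state *s, uint32_t peer_seq,
			     unsigned timeout_ms)
{
	struct timespec deadline;
	int ret = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec  += timeout_ms / 1000;
	deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&s->lock);
	while (seq_greater(peer_seq - 1, s->peer_seq)) {
		int err = pthread_cond_timedwait(&s->wake, &s->lock, &deadline);
		if (err == ETIMEDOUT) {
			ret = -ETIMEDOUT;
			break;
		}
	}
	if (ret == 0 && seq_greater(peer_seq, s->peer_seq))
		s->peer_seq = peer_seq;		/* the seq_max() update */
	pthread_mutex_unlock(&s->lock);
	return ret;
}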
1677 | 2006 | ||
@@ -1686,233 +2015,277 @@ static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf) | |||
1686 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); | 2015 | (dpf & DP_DISCARD ? REQ_DISCARD : 0); |
1687 | } | 2016 | } |
1688 | 2017 | ||
2018 | static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector, | ||
2019 | unsigned int size) | ||
2020 | { | ||
2021 | struct drbd_interval *i; | ||
2022 | |||
2023 | repeat: | ||
2024 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2025 | struct drbd_request *req; | ||
2026 | struct bio_and_error m; | ||
2027 | |||
2028 | if (!i->local) | ||
2029 | continue; | ||
2030 | req = container_of(i, struct drbd_request, i); | ||
2031 | if (!(req->rq_state & RQ_POSTPONED)) | ||
2032 | continue; | ||
2033 | req->rq_state &= ~RQ_POSTPONED; | ||
2034 | __req_mod(req, NEG_ACKED, &m); | ||
2035 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
2036 | if (m.bio) | ||
2037 | complete_master_bio(mdev, &m); | ||
2038 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2039 | goto repeat; | ||
2040 | } | ||
2041 | } | ||
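fail_postponed_requests() shows a classic locking pattern: completing the request may sleep, so the spinlock is dropped around the completion and the whole walk restarts from the top afterwards, since the structure may have changed while the lock was released. A userspace caricature of that drop-lock-and-restart loop (pthread mutex and a singly linked list, invented names; build with -pthread):

#include <pthread.h>
#include <stdlib.h>

struct node {
	int		postponed;
	struct node	*next;
};

struct list {
	pthread_mutex_t	lock;
	struct node	*head;
};

static void complete_outside_lock(struct node *n)
{
	/* stand-in for completing a bio: must not run under the lock */
	free(n);
}

static void fail_postponed(struct list *l)
{
	struct node *n, **pp;

	pthread_mutex_lock(&l->lock);
repeat:
	for (pp = &l->head; (n = *pp); pp = &n->next) {
		if (!n->postponed)
			continue;
		*pp = n->next;			/* unlink while locked */
		pthread_mutex_unlock(&l->lock);
		complete_outside_lock(n);	/* may sleep */
		pthread_mutex_lock(&l->lock);
		goto repeat;			/* list may have changed */
	}
	pthread_mutex_unlock(&l->lock);
}

int main(void)
{
	struct list l = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct node *a = calloc(1, sizeof(*a));
	struct node *b = calloc(1, sizeof(*b));

	a->postponed = 1; a->next = b;
	b->postponed = 0; b->next = NULL;
	l.head = a;

	fail_postponed(&l);	/* frees a, keeps b on the list */
	free(b);
	return 0;
}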
2042 | |||
2043 | static int handle_write_conflicts(struct drbd_conf *mdev, | ||
2044 | struct drbd_peer_request *peer_req) | ||
2045 | { | ||
2046 | struct drbd_tconn *tconn = mdev->tconn; | ||
2047 | bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
2048 | sector_t sector = peer_req->i.sector; | ||
2049 | const unsigned int size = peer_req->i.size; | ||
2050 | struct drbd_interval *i; | ||
2051 | bool equal; | ||
2052 | int err; | ||
2053 | |||
2054 | /* | ||
2055 | * Inserting the peer request into the write_requests tree will prevent | ||
2056 | * new conflicting local requests from being added. | ||
2057 | */ | ||
2058 | drbd_insert_interval(&mdev->write_requests, &peer_req->i); | ||
2059 | |||
2060 | repeat: | ||
2061 | drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { | ||
2062 | if (i == &peer_req->i) | ||
2063 | continue; | ||
2064 | |||
2065 | if (!i->local) { | ||
2066 | /* | ||
2067 | * Our peer has sent a conflicting remote request; this | ||
2068 | * should not happen in a two-node setup. Wait for the | ||
2069 | * earlier peer request to complete. | ||
2070 | */ | ||
2071 | err = drbd_wait_misc(mdev, i); | ||
2072 | if (err) | ||
2073 | goto out; | ||
2074 | goto repeat; | ||
2075 | } | ||
2076 | |||
2077 | equal = i->sector == sector && i->size == size; | ||
2078 | if (resolve_conflicts) { | ||
2079 | /* | ||
2080 | * If the peer request is fully contained within the | ||
2081 | * overlapping request, it can be considered overwritten | ||
2082 | * and thus superseded; otherwise, it will be retried | ||
2083 | * once all overlapping requests have completed. | ||
2084 | */ | ||
2085 | bool superseded = i->sector <= sector && i->sector + | ||
2086 | (i->size >> 9) >= sector + (size >> 9); | ||
2087 | |||
2088 | if (!equal) | ||
2089 | dev_alert(DEV, "Concurrent writes detected: " | ||
2090 | "local=%llus +%u, remote=%llus +%u, " | ||
2091 | "assuming %s came first\n", | ||
2092 | (unsigned long long)i->sector, i->size, | ||
2093 | (unsigned long long)sector, size, | ||
2094 | superseded ? "local" : "remote"); | ||
2095 | |||
2096 | inc_unacked(mdev); | ||
2097 | peer_req->w.cb = superseded ? e_send_superseded : | ||
2098 | e_send_retry_write; | ||
2099 | list_add_tail(&peer_req->w.list, &mdev->done_ee); | ||
2100 | wake_asender(mdev->tconn); | ||
2101 | |||
2102 | err = -ENOENT; | ||
2103 | goto out; | ||
2104 | } else { | ||
2105 | struct drbd_request *req = | ||
2106 | container_of(i, struct drbd_request, i); | ||
2107 | |||
2108 | if (!equal) | ||
2109 | dev_alert(DEV, "Concurrent writes detected: " | ||
2110 | "local=%llus +%u, remote=%llus +%u\n", | ||
2111 | (unsigned long long)i->sector, i->size, | ||
2112 | (unsigned long long)sector, size); | ||
2113 | |||
2114 | if (req->rq_state & RQ_LOCAL_PENDING || | ||
2115 | !(req->rq_state & RQ_POSTPONED)) { | ||
2116 | /* | ||
2117 | * Wait for the node with the discard flag to | ||
2118 | * decide if this request has been superseded | ||
2119 | * or needs to be retried. | ||
2120 | * Requests that have been superseded will | ||
2121 | * disappear from the write_requests tree. | ||
2122 | * | ||
2123 | * In addition, wait for the conflicting | ||
2124 | * request to finish locally before submitting | ||
2125 | * the conflicting peer request. | ||
2126 | */ | ||
2127 | err = drbd_wait_misc(mdev, &req->i); | ||
2128 | if (err) { | ||
2129 | _conn_request_state(mdev->tconn, | ||
2130 | NS(conn, C_TIMEOUT), | ||
2131 | CS_HARD); | ||
2132 | fail_postponed_requests(mdev, sector, size); | ||
2133 | goto out; | ||
2134 | } | ||
2135 | goto repeat; | ||
2136 | } | ||
2137 | /* | ||
2138 | * Remember to restart the conflicting requests after | ||
2139 | * the new peer request has completed. | ||
2140 | */ | ||
2141 | peer_req->flags |= EE_RESTART_REQUESTS; | ||
2142 | } | ||
2143 | } | ||
2144 | err = 0; | ||
2145 | |||
2146 | out: | ||
2147 | if (err) | ||
2148 | drbd_remove_epoch_entry_interval(mdev, peer_req); | ||
2149 | return err; | ||
2150 | } | ||
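The superseded test in handle_write_conflicts() is plain interval containment: the incoming peer write may only be dropped if an already queued local write covers it completely; any partial overlap means it has to be retried once the conflict clears. A minimal sketch of that containment check (illustrative names, byte lengths, 512-byte sectors as above):

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* The peer's write [sector, sector + size>>9) is superseded iff the
 * local write [i_sector, i_sector + i_size>>9) fully covers it. */
static int peer_write_superseded(sector_t i_sector, unsigned i_size,
				 sector_t sector, unsigned size)
{
	return i_sector <= sector &&
	       i_sector + (i_size >> 9) >= sector + (size >> 9);
}

int main(void)
{
	/* local sectors 0..32 vs peer 8..16: fully contained -> superseded */
	assert(peer_write_superseded(0, 16384, 8, 4096));
	/* local 0..8 vs peer 4..12: partial overlap only -> retry later */
	assert(!peer_write_superseded(0, 4096, 4, 4096));
	return 0;
}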
2151 | |||
1689 | /* mirrored write */ | 2152 | /* mirrored write */ |
1690 | static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 2153 | static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) |
1691 | { | 2154 | { |
2155 | struct drbd_conf *mdev; | ||
1692 | sector_t sector; | 2156 | sector_t sector; |
1693 | struct drbd_epoch_entry *e; | 2157 | struct drbd_peer_request *peer_req; |
1694 | struct p_data *p = &mdev->data.rbuf.data; | 2158 | struct p_data *p = pi->data; |
2159 | u32 peer_seq = be32_to_cpu(p->seq_num); | ||
1695 | int rw = WRITE; | 2160 | int rw = WRITE; |
1696 | u32 dp_flags; | 2161 | u32 dp_flags; |
2162 | int err, tp; | ||
1697 | 2163 | ||
1698 | if (!get_ldev(mdev)) { | 2164 | mdev = vnr_to_mdev(tconn, pi->vnr); |
1699 | spin_lock(&mdev->peer_seq_lock); | 2165 | if (!mdev) |
1700 | if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) | 2166 | return -EIO; |
1701 | mdev->peer_seq++; | ||
1702 | spin_unlock(&mdev->peer_seq_lock); | ||
1703 | 2167 | ||
1704 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); | 2168 | if (!get_ldev(mdev)) { |
1705 | atomic_inc(&mdev->current_epoch->epoch_size); | 2169 | int err2; |
1706 | return drbd_drain_block(mdev, data_size); | 2170 | |
2171 | err = wait_for_and_update_peer_seq(mdev, peer_seq); | ||
2172 | drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); | ||
2173 | atomic_inc(&tconn->current_epoch->epoch_size); | ||
2174 | err2 = drbd_drain_block(mdev, pi->size); | ||
2175 | if (!err) | ||
2176 | err = err2; | ||
2177 | return err; | ||
1707 | } | 2178 | } |
1708 | 2179 | ||
1709 | /* get_ldev(mdev) successful. | 2180 | /* |
1710 | * Corresponding put_ldev done either below (on various errors), | 2181 | * Corresponding put_ldev done either below (on various errors), or in |
1711 | * or in drbd_endio_write_sec, if we successfully submit the data at | 2182 | * drbd_peer_request_endio, if we successfully submit the data at the |
1712 | * the end of this function. */ | 2183 | * end of this function. |
2184 | */ | ||
1713 | 2185 | ||
1714 | sector = be64_to_cpu(p->sector); | 2186 | sector = be64_to_cpu(p->sector); |
1715 | e = read_in_block(mdev, p->block_id, sector, data_size); | 2187 | peer_req = read_in_block(mdev, p->block_id, sector, pi->size); |
1716 | if (!e) { | 2188 | if (!peer_req) { |
1717 | put_ldev(mdev); | 2189 | put_ldev(mdev); |
1718 | return false; | 2190 | return -EIO; |
1719 | } | 2191 | } |
1720 | 2192 | ||
1721 | e->w.cb = e_end_block; | 2193 | peer_req->w.cb = e_end_block; |
1722 | 2194 | ||
1723 | dp_flags = be32_to_cpu(p->dp_flags); | 2195 | dp_flags = be32_to_cpu(p->dp_flags); |
1724 | rw |= wire_flags_to_bio(mdev, dp_flags); | 2196 | rw |= wire_flags_to_bio(mdev, dp_flags); |
1725 | if (e->pages == NULL) { | 2197 | if (peer_req->pages == NULL) { |
1726 | D_ASSERT(e->size == 0); | 2198 | D_ASSERT(peer_req->i.size == 0); |
1727 | D_ASSERT(dp_flags & DP_FLUSH); | 2199 | D_ASSERT(dp_flags & DP_FLUSH); |
1728 | } | 2200 | } |
1729 | 2201 | ||
1730 | if (dp_flags & DP_MAY_SET_IN_SYNC) | 2202 | if (dp_flags & DP_MAY_SET_IN_SYNC) |
1731 | e->flags |= EE_MAY_SET_IN_SYNC; | 2203 | peer_req->flags |= EE_MAY_SET_IN_SYNC; |
1732 | 2204 | ||
1733 | spin_lock(&mdev->epoch_lock); | 2205 | spin_lock(&tconn->epoch_lock); |
1734 | e->epoch = mdev->current_epoch; | 2206 | peer_req->epoch = tconn->current_epoch; |
1735 | atomic_inc(&e->epoch->epoch_size); | 2207 | atomic_inc(&peer_req->epoch->epoch_size); |
1736 | atomic_inc(&e->epoch->active); | 2208 | atomic_inc(&peer_req->epoch->active); |
1737 | spin_unlock(&mdev->epoch_lock); | 2209 | spin_unlock(&tconn->epoch_lock); |
1738 | 2210 | ||
1739 | /* I'm the receiver, I do hold a net_cnt reference. */ | 2211 | rcu_read_lock(); |
1740 | if (!mdev->net_conf->two_primaries) { | 2212 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; |
1741 | spin_lock_irq(&mdev->req_lock); | 2213 | rcu_read_unlock(); |
1742 | } else { | 2214 | if (tp) { |
1743 | /* don't get the req_lock yet, | 2215 | peer_req->flags |= EE_IN_INTERVAL_TREE; |
1744 | * we may sleep in drbd_wait_peer_seq */ | 2216 | err = wait_for_and_update_peer_seq(mdev, peer_seq); |
1745 | const int size = e->size; | 2217 | if (err) |
1746 | const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); | ||
1747 | DEFINE_WAIT(wait); | ||
1748 | struct drbd_request *i; | ||
1749 | struct hlist_node *n; | ||
1750 | struct hlist_head *slot; | ||
1751 | int first; | ||
1752 | |||
1753 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | ||
1754 | BUG_ON(mdev->ee_hash == NULL); | ||
1755 | BUG_ON(mdev->tl_hash == NULL); | ||
1756 | |||
1757 | /* conflict detection and handling: | ||
1758 | * 1. wait on the sequence number, | ||
1759 | * in case this data packet overtook ACK packets. | ||
1760 | * 2. check our hash tables for conflicting requests. | ||
1761 | * we only need to walk the tl_hash, since an ee can not | ||
1762 | * have a conflict with an other ee: on the submitting | ||
1763 | * node, the corresponding req had already been conflicting, | ||
1764 | * and a conflicting req is never sent. | ||
1765 | * | ||
1766 | * Note: for two_primaries, we are protocol C, | ||
1767 | * so there cannot be any request that is DONE | ||
1768 | * but still on the transfer log. | ||
1769 | * | ||
1770 | * unconditionally add to the ee_hash. | ||
1771 | * | ||
1772 | * if no conflicting request is found: | ||
1773 | * submit. | ||
1774 | * | ||
1775 | * if any conflicting request is found | ||
1776 | * that has not yet been acked, | ||
1777 | * AND I have the "discard concurrent writes" flag: | ||
1778 | * queue (via done_ee) the P_DISCARD_ACK; OUT. | ||
1779 | * | ||
1780 | * if any conflicting request is found: | ||
1781 | * block the receiver, waiting on misc_wait | ||
1782 | * until no more conflicting requests are there, | ||
1783 | * or we get interrupted (disconnect). | ||
1784 | * | ||
1785 | * we do not just write after local io completion of those | ||
1786 | * requests, but only after req is done completely, i.e. | ||
1787 | * we wait for the P_DISCARD_ACK to arrive! | ||
1788 | * | ||
1789 | * then proceed normally, i.e. submit. | ||
1790 | */ | ||
1791 | if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) | ||
1792 | goto out_interrupted; | 2218 | goto out_interrupted; |
1793 | 2219 | spin_lock_irq(&mdev->tconn->req_lock); | |
1794 | spin_lock_irq(&mdev->req_lock); | 2220 | err = handle_write_conflicts(mdev, peer_req); |
1795 | 2221 | if (err) { | |
1796 | hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); | 2222 | spin_unlock_irq(&mdev->tconn->req_lock); |
1797 | 2223 | if (err == -ENOENT) { | |
1798 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | ||
1799 | slot = tl_hash_slot(mdev, sector); | ||
1800 | first = 1; | ||
1801 | for (;;) { | ||
1802 | int have_unacked = 0; | ||
1803 | int have_conflict = 0; | ||
1804 | prepare_to_wait(&mdev->misc_wait, &wait, | ||
1805 | TASK_INTERRUPTIBLE); | ||
1806 | hlist_for_each_entry(i, n, slot, collision) { | ||
1807 | if (OVERLAPS) { | ||
1808 | /* only ALERT on first iteration, | ||
1809 | * we may be woken up early... */ | ||
1810 | if (first) | ||
1811 | dev_alert(DEV, "%s[%u] Concurrent local write detected!" | ||
1812 | " new: %llus +%u; pending: %llus +%u\n", | ||
1813 | current->comm, current->pid, | ||
1814 | (unsigned long long)sector, size, | ||
1815 | (unsigned long long)i->sector, i->size); | ||
1816 | if (i->rq_state & RQ_NET_PENDING) | ||
1817 | ++have_unacked; | ||
1818 | ++have_conflict; | ||
1819 | } | ||
1820 | } | ||
1821 | #undef OVERLAPS | ||
1822 | if (!have_conflict) | ||
1823 | break; | ||
1824 | |||
1825 | /* Discard Ack only for the _first_ iteration */ | ||
1826 | if (first && discard && have_unacked) { | ||
1827 | dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", | ||
1828 | (unsigned long long)sector); | ||
1829 | inc_unacked(mdev); | ||
1830 | e->w.cb = e_send_discard_ack; | ||
1831 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
1832 | |||
1833 | spin_unlock_irq(&mdev->req_lock); | ||
1834 | |||
1835 | /* we could probably send that P_DISCARD_ACK ourselves, | ||
1836 | * but I don't like the receiver using the msock */ | ||
1837 | |||
1838 | put_ldev(mdev); | 2224 | put_ldev(mdev); |
1839 | wake_asender(mdev); | 2225 | return 0; |
1840 | finish_wait(&mdev->misc_wait, &wait); | ||
1841 | return true; | ||
1842 | } | 2226 | } |
2227 | goto out_interrupted; | ||
2228 | } | ||
2229 | } else | ||
2230 | spin_lock_irq(&mdev->tconn->req_lock); | ||
2231 | list_add(&peer_req->w.list, &mdev->active_ee); | ||
2232 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
1843 | 2233 | ||
1844 | if (signal_pending(current)) { | 2234 | if (mdev->state.conn == C_SYNC_TARGET) |
1845 | hlist_del_init(&e->collision); | 2235 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req)); |
1846 | |||
1847 | spin_unlock_irq(&mdev->req_lock); | ||
1848 | |||
1849 | finish_wait(&mdev->misc_wait, &wait); | ||
1850 | goto out_interrupted; | ||
1851 | } | ||
1852 | 2236 | ||
1853 | spin_unlock_irq(&mdev->req_lock); | 2237 | if (mdev->tconn->agreed_pro_version < 100) { |
1854 | if (first) { | 2238 | rcu_read_lock(); |
1855 | first = 0; | 2239 | switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) { |
1856 | dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " | 2240 | case DRBD_PROT_C: |
1857 | "sec=%llus\n", (unsigned long long)sector); | 2241 | dp_flags |= DP_SEND_WRITE_ACK; |
1858 | } else if (discard) { | 2242 | break; |
1859 | /* we had none on the first iteration. | 2243 | case DRBD_PROT_B: |
1860 | * there must be none now. */ | 2244 | dp_flags |= DP_SEND_RECEIVE_ACK; |
1861 | D_ASSERT(have_unacked == 0); | 2245 | break; |
1862 | } | ||
1863 | schedule(); | ||
1864 | spin_lock_irq(&mdev->req_lock); | ||
1865 | } | 2246 | } |
1866 | finish_wait(&mdev->misc_wait, &wait); | 2247 | rcu_read_unlock(); |
1867 | } | 2248 | } |
1868 | 2249 | ||
1869 | list_add(&e->w.list, &mdev->active_ee); | 2250 | if (dp_flags & DP_SEND_WRITE_ACK) { |
1870 | spin_unlock_irq(&mdev->req_lock); | 2251 | peer_req->flags |= EE_SEND_WRITE_ACK; |
1871 | |||
1872 | if (mdev->state.conn == C_SYNC_TARGET) | ||
1873 | wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); | ||
1874 | |||
1875 | switch (mdev->net_conf->wire_protocol) { | ||
1876 | case DRBD_PROT_C: | ||
1877 | inc_unacked(mdev); | 2252 | inc_unacked(mdev); |
1878 | /* corresponding dec_unacked() in e_end_block() | 2253 | /* corresponding dec_unacked() in e_end_block() |
1879 | * respective _drbd_clear_done_ee */ | 2254 | * respective _drbd_clear_done_ee */ |
1880 | break; | 2255 | } |
1881 | case DRBD_PROT_B: | 2256 | |
2257 | if (dp_flags & DP_SEND_RECEIVE_ACK) { | ||
1882 | /* I really don't like it that the receiver thread | 2258 | /* I really don't like it that the receiver thread |
1883 | * sends on the msock, but anyways */ | 2259 | * sends on the msock, but anyways */ |
1884 | drbd_send_ack(mdev, P_RECV_ACK, e); | 2260 | drbd_send_ack(mdev, P_RECV_ACK, peer_req); |
1885 | break; | ||
1886 | case DRBD_PROT_A: | ||
1887 | /* nothing to do */ | ||
1888 | break; | ||
1889 | } | 2261 | } |
1890 | 2262 | ||
1891 | if (mdev->state.pdsk < D_INCONSISTENT) { | 2263 | if (mdev->state.pdsk < D_INCONSISTENT) { |
1892 | /* In case we have the only disk of the cluster, */ | 2264 | /* In case we have the only disk of the cluster, */ |
1893 | drbd_set_out_of_sync(mdev, e->sector, e->size); | 2265 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1894 | e->flags |= EE_CALL_AL_COMPLETE_IO; | 2266 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; |
1895 | e->flags &= ~EE_MAY_SET_IN_SYNC; | 2267 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; |
1896 | drbd_al_begin_io(mdev, e->sector); | 2268 | drbd_al_begin_io(mdev, &peer_req->i); |
1897 | } | 2269 | } |
1898 | 2270 | ||
1899 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) | 2271 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); |
1900 | return true; | 2272 | if (!err) |
2273 | return 0; | ||
1901 | 2274 | ||
1902 | /* don't care for the reason here */ | 2275 | /* don't care for the reason here */ |
1903 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2276 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1904 | spin_lock_irq(&mdev->req_lock); | 2277 | spin_lock_irq(&mdev->tconn->req_lock); |
1905 | list_del(&e->w.list); | 2278 | list_del(&peer_req->w.list); |
1906 | hlist_del_init(&e->collision); | 2279 | drbd_remove_epoch_entry_interval(mdev, peer_req); |
1907 | spin_unlock_irq(&mdev->req_lock); | 2280 | spin_unlock_irq(&mdev->tconn->req_lock); |
1908 | if (e->flags & EE_CALL_AL_COMPLETE_IO) | 2281 | if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) |
1909 | drbd_al_complete_io(mdev, e->sector); | 2282 | drbd_al_complete_io(mdev, &peer_req->i); |
1910 | 2283 | ||
1911 | out_interrupted: | 2284 | out_interrupted: |
1912 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP); | 2285 | drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); |
1913 | put_ldev(mdev); | 2286 | put_ldev(mdev); |
1914 | drbd_free_ee(mdev, e); | 2287 | drbd_free_peer_req(mdev, peer_req); |
1915 | return false; | 2288 | return err; |
1916 | } | 2289 | } |
1917 | 2290 | ||
1918 | /* We may throttle resync, if the lower device seems to be busy, | 2291 | /* We may throttle resync, if the lower device seems to be busy, |
@@ -1933,9 +2306,14 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1933 | struct lc_element *tmp; | 2306 | struct lc_element *tmp; |
1934 | int curr_events; | 2307 | int curr_events; |
1935 | int throttle = 0; | 2308 | int throttle = 0; |
2309 | unsigned int c_min_rate; | ||
2310 | |||
2311 | rcu_read_lock(); | ||
2312 | c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate; | ||
2313 | rcu_read_unlock(); | ||
1936 | 2314 | ||
1937 | /* feature disabled? */ | 2315 | /* feature disabled? */ |
1938 | if (mdev->sync_conf.c_min_rate == 0) | 2316 | if (c_min_rate == 0) |
1939 | return 0; | 2317 | return 0; |
1940 | 2318 | ||
1941 | spin_lock_irq(&mdev->al_lock); | 2319 | spin_lock_irq(&mdev->al_lock); |
@@ -1975,40 +2353,46 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) | |||
1975 | db = mdev->rs_mark_left[i] - rs_left; | 2353 | db = mdev->rs_mark_left[i] - rs_left; |
1976 | dbdt = Bit2KB(db/dt); | 2354 | dbdt = Bit2KB(db/dt); |
1977 | 2355 | ||
1978 | if (dbdt > mdev->sync_conf.c_min_rate) | 2356 | if (dbdt > c_min_rate) |
1979 | throttle = 1; | 2357 | throttle = 1; |
1980 | } | 2358 | } |
1981 | return throttle; | 2359 | return throttle; |
1982 | } | 2360 | } |
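Stripped of the RCU plumbing, drbd_rs_should_slow_down() asks one question: did the resync move more than c-min-rate KiB per second since the last mark? The sketch below redoes that arithmetic standalone, assuming drbd's 4 KiB-per-bitmap-bit granularity (the helper names here are mine):

#include <stdio.h>

/* One bitmap bit covers a 4 KiB block, so bits -> KiB is a shift by 2. */
#define BIT2KB(bits)	((bits) << 2)

/* Return 1 if the measured resync rate already exceeds c_min_rate
 * (KiB/s) and application I/O should be given priority. */
static int should_throttle_resync(unsigned long bits_synced,
				  unsigned long dt_seconds,
				  unsigned int c_min_rate)
{
	unsigned long dbdt;

	if (c_min_rate == 0)	/* feature disabled */
		return 0;
	if (dt_seconds == 0)
		dt_seconds = 1;

	dbdt = BIT2KB(bits_synced / dt_seconds);
	return dbdt > c_min_rate;
}

int main(void)
{
	/* 120000 bits in 10 s = 12000 bits/s = 48000 KiB/s: throttle */
	printf("%d\n", should_throttle_resync(120000, 10, 4000));	/* 1 */
	/* 1000 bits in 10 s = 400 KiB/s: well below c-min-rate */
	printf("%d\n", should_throttle_resync(1000, 10, 4000));	/* 0 */
	return 0;
}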
1983 | 2361 | ||
1984 | 2362 | ||
1985 | static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) | 2363 | static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) |
1986 | { | 2364 | { |
2365 | struct drbd_conf *mdev; | ||
1987 | sector_t sector; | 2366 | sector_t sector; |
1988 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 2367 | sector_t capacity; |
1989 | struct drbd_epoch_entry *e; | 2368 | struct drbd_peer_request *peer_req; |
1990 | struct digest_info *di = NULL; | 2369 | struct digest_info *di = NULL; |
1991 | int size, verb; | 2370 | int size, verb; |
1992 | unsigned int fault_type; | 2371 | unsigned int fault_type; |
1993 | struct p_block_req *p = &mdev->data.rbuf.block_req; | 2372 | struct p_block_req *p = pi->data; |
2373 | |||
2374 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
2375 | if (!mdev) | ||
2376 | return -EIO; | ||
2377 | capacity = drbd_get_capacity(mdev->this_bdev); | ||
1994 | 2378 | ||
1995 | sector = be64_to_cpu(p->sector); | 2379 | sector = be64_to_cpu(p->sector); |
1996 | size = be32_to_cpu(p->blksize); | 2380 | size = be32_to_cpu(p->blksize); |
1997 | 2381 | ||
1998 | if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { | 2382 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { |
1999 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2383 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
2000 | (unsigned long long)sector, size); | 2384 | (unsigned long long)sector, size); |
2001 | return false; | 2385 | return -EINVAL; |
2002 | } | 2386 | } |
2003 | if (sector + (size>>9) > capacity) { | 2387 | if (sector + (size>>9) > capacity) { |
2004 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, | 2388 | dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, |
2005 | (unsigned long long)sector, size); | 2389 | (unsigned long long)sector, size); |
2006 | return false; | 2390 | return -EINVAL; |
2007 | } | 2391 | } |
2008 | 2392 | ||
2009 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { | 2393 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { |
2010 | verb = 1; | 2394 | verb = 1; |
2011 | switch (cmd) { | 2395 | switch (pi->cmd) { |
2012 | case P_DATA_REQUEST: | 2396 | case P_DATA_REQUEST: |
2013 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); | 2397 | drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); |
2014 | break; | 2398 | break; |
@@ -2023,35 +2407,34 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2023 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); | 2407 | drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); |
2024 | break; | 2408 | break; |
2025 | default: | 2409 | default: |
2026 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2410 | BUG(); |
2027 | cmdname(cmd)); | ||
2028 | } | 2411 | } |
2029 | if (verb && __ratelimit(&drbd_ratelimit_state)) | 2412 | if (verb && __ratelimit(&drbd_ratelimit_state)) |
2030 | dev_err(DEV, "Can not satisfy peer's read request, " | 2413 | dev_err(DEV, "Can not satisfy peer's read request, " |
2031 | "no local data.\n"); | 2414 | "no local data.\n"); |
2032 | 2415 | ||
2033 | /* drain possible payload */ | 2416 | /* drain possible payload */ |
2034 | return drbd_drain_block(mdev, digest_size); | 2417 | return drbd_drain_block(mdev, pi->size); |
2035 | } | 2418 | } |
2036 | 2419 | ||
2037 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD | 2420 | /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD |
2038 | * "criss-cross" setup, that might cause write-out on some other DRBD, | 2421 | * "criss-cross" setup, that might cause write-out on some other DRBD, |
2039 | * which in turn might block on the other node at this very place. */ | 2422 | * which in turn might block on the other node at this very place. */ |
2040 | e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); | 2423 | peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO); |
2041 | if (!e) { | 2424 | if (!peer_req) { |
2042 | put_ldev(mdev); | 2425 | put_ldev(mdev); |
2043 | return false; | 2426 | return -ENOMEM; |
2044 | } | 2427 | } |
2045 | 2428 | ||
2046 | switch (cmd) { | 2429 | switch (pi->cmd) { |
2047 | case P_DATA_REQUEST: | 2430 | case P_DATA_REQUEST: |
2048 | e->w.cb = w_e_end_data_req; | 2431 | peer_req->w.cb = w_e_end_data_req; |
2049 | fault_type = DRBD_FAULT_DT_RD; | 2432 | fault_type = DRBD_FAULT_DT_RD; |
2050 | /* application IO, don't drbd_rs_begin_io */ | 2433 | /* application IO, don't drbd_rs_begin_io */ |
2051 | goto submit; | 2434 | goto submit; |
2052 | 2435 | ||
2053 | case P_RS_DATA_REQUEST: | 2436 | case P_RS_DATA_REQUEST: |
2054 | e->w.cb = w_e_end_rsdata_req; | 2437 | peer_req->w.cb = w_e_end_rsdata_req; |
2055 | fault_type = DRBD_FAULT_RS_RD; | 2438 | fault_type = DRBD_FAULT_RS_RD; |
2056 | /* used in the sector offset progress display */ | 2439 | /* used in the sector offset progress display */ |
2057 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2440 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -2060,28 +2443,28 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2060 | case P_OV_REPLY: | 2443 | case P_OV_REPLY: |
2061 | case P_CSUM_RS_REQUEST: | 2444 | case P_CSUM_RS_REQUEST: |
2062 | fault_type = DRBD_FAULT_RS_RD; | 2445 | fault_type = DRBD_FAULT_RS_RD; |
2063 | di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); | 2446 | di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); |
2064 | if (!di) | 2447 | if (!di) |
2065 | goto out_free_e; | 2448 | goto out_free_e; |
2066 | 2449 | ||
2067 | di->digest_size = digest_size; | 2450 | di->digest_size = pi->size; |
2068 | di->digest = (((char *)di)+sizeof(struct digest_info)); | 2451 | di->digest = (((char *)di)+sizeof(struct digest_info)); |
2069 | 2452 | ||
2070 | e->digest = di; | 2453 | peer_req->digest = di; |
2071 | e->flags |= EE_HAS_DIGEST; | 2454 | peer_req->flags |= EE_HAS_DIGEST; |
2072 | 2455 | ||
2073 | if (drbd_recv(mdev, di->digest, digest_size) != digest_size) | 2456 | if (drbd_recv_all(mdev->tconn, di->digest, pi->size)) |
2074 | goto out_free_e; | 2457 | goto out_free_e; |
2075 | 2458 | ||
2076 | if (cmd == P_CSUM_RS_REQUEST) { | 2459 | if (pi->cmd == P_CSUM_RS_REQUEST) { |
2077 | D_ASSERT(mdev->agreed_pro_version >= 89); | 2460 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); |
2078 | e->w.cb = w_e_end_csum_rs_req; | 2461 | peer_req->w.cb = w_e_end_csum_rs_req; |
2079 | /* used in the sector offset progress display */ | 2462 | /* used in the sector offset progress display */ |
2080 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 2463 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
2081 | } else if (cmd == P_OV_REPLY) { | 2464 | } else if (pi->cmd == P_OV_REPLY) { |
2082 | /* track progress, we may need to throttle */ | 2465 | /* track progress, we may need to throttle */ |
2083 | atomic_add(size >> 9, &mdev->rs_sect_in); | 2466 | atomic_add(size >> 9, &mdev->rs_sect_in); |
2084 | e->w.cb = w_e_end_ov_reply; | 2467 | peer_req->w.cb = w_e_end_ov_reply; |
2085 | dec_rs_pending(mdev); | 2468 | dec_rs_pending(mdev); |
2086 | /* drbd_rs_begin_io done when we sent this request, | 2469 | /* drbd_rs_begin_io done when we sent this request, |
2087 | * but accounting still needs to be done. */ | 2470 | * but accounting still needs to be done. */ |
@@ -2091,7 +2474,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2091 | 2474 | ||
2092 | case P_OV_REQUEST: | 2475 | case P_OV_REQUEST: |
2093 | if (mdev->ov_start_sector == ~(sector_t)0 && | 2476 | if (mdev->ov_start_sector == ~(sector_t)0 && |
2094 | mdev->agreed_pro_version >= 90) { | 2477 | mdev->tconn->agreed_pro_version >= 90) { |
2095 | unsigned long now = jiffies; | 2478 | unsigned long now = jiffies; |
2096 | int i; | 2479 | int i; |
2097 | mdev->ov_start_sector = sector; | 2480 | mdev->ov_start_sector = sector; |
@@ -2105,15 +2488,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
2105 | dev_info(DEV, "Online Verify start sector: %llu\n", | 2488 | dev_info(DEV, "Online Verify start sector: %llu\n", |
2106 | (unsigned long long)sector); | 2489 | (unsigned long long)sector); |
2107 | } | 2490 | } |
2108 | e->w.cb = w_e_end_ov_req; | 2491 | peer_req->w.cb = w_e_end_ov_req; |
2109 | fault_type = DRBD_FAULT_RS_RD; | 2492 | fault_type = DRBD_FAULT_RS_RD; |
2110 | break; | 2493 | break; |
2111 | 2494 | ||
2112 | default: | 2495 | default: |
2113 | dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", | 2496 | BUG(); |
2114 | cmdname(cmd)); | ||
2115 | fault_type = DRBD_FAULT_MAX; | ||
2116 | goto out_free_e; | ||
2117 | } | 2497 | } |
2118 | 2498 | ||
2119 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous | 2499 | /* Throttle, drbd_rs_begin_io and submit should become asynchronous |
@@ -2148,30 +2528,31 @@ submit_for_resync: | |||
2148 | 2528 | ||
2149 | submit: | 2529 | submit: |
2150 | inc_unacked(mdev); | 2530 | inc_unacked(mdev); |
2151 | spin_lock_irq(&mdev->req_lock); | 2531 | spin_lock_irq(&mdev->tconn->req_lock); |
2152 | list_add_tail(&e->w.list, &mdev->read_ee); | 2532 | list_add_tail(&peer_req->w.list, &mdev->read_ee); |
2153 | spin_unlock_irq(&mdev->req_lock); | 2533 | spin_unlock_irq(&mdev->tconn->req_lock); |
2154 | 2534 | ||
2155 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) | 2535 | if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0) |
2156 | return true; | 2536 | return 0; |
2157 | 2537 | ||
2158 | /* don't care for the reason here */ | 2538 | /* don't care for the reason here */ |
2159 | dev_err(DEV, "submit failed, triggering re-connect\n"); | 2539 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
2160 | spin_lock_irq(&mdev->req_lock); | 2540 | spin_lock_irq(&mdev->tconn->req_lock); |
2161 | list_del(&e->w.list); | 2541 | list_del(&peer_req->w.list); |
2162 | spin_unlock_irq(&mdev->req_lock); | 2542 | spin_unlock_irq(&mdev->tconn->req_lock); |
2163 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ | 2543 | /* no drbd_rs_complete_io(), we are dropping the connection anyways */ |
2164 | 2544 | ||
2165 | out_free_e: | 2545 | out_free_e: |
2166 | put_ldev(mdev); | 2546 | put_ldev(mdev); |
2167 | drbd_free_ee(mdev, e); | 2547 | drbd_free_peer_req(mdev, peer_req); |
2168 | return false; | 2548 | return -EIO; |
2169 | } | 2549 | } |
2170 | 2550 | ||
2171 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | 2551 | static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) |
2172 | { | 2552 | { |
2173 | int self, peer, rv = -100; | 2553 | int self, peer, rv = -100; |
2174 | unsigned long ch_self, ch_peer; | 2554 | unsigned long ch_self, ch_peer; |
2555 | enum drbd_after_sb_p after_sb_0p; | ||
2175 | 2556 | ||
2176 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; | 2557 | self = mdev->ldev->md.uuid[UI_BITMAP] & 1; |
2177 | peer = mdev->p_uuid[UI_BITMAP] & 1; | 2558 | peer = mdev->p_uuid[UI_BITMAP] & 1; |
@@ -2179,10 +2560,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2179 | ch_peer = mdev->p_uuid[UI_SIZE]; | 2560 | ch_peer = mdev->p_uuid[UI_SIZE]; |
2180 | ch_self = mdev->comm_bm_set; | 2561 | ch_self = mdev->comm_bm_set; |
2181 | 2562 | ||
2182 | switch (mdev->net_conf->after_sb_0p) { | 2563 | rcu_read_lock(); |
2564 | after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p; | ||
2565 | rcu_read_unlock(); | ||
2566 | switch (after_sb_0p) { | ||
2183 | case ASB_CONSENSUS: | 2567 | case ASB_CONSENSUS: |
2184 | case ASB_DISCARD_SECONDARY: | 2568 | case ASB_DISCARD_SECONDARY: |
2185 | case ASB_CALL_HELPER: | 2569 | case ASB_CALL_HELPER: |
2570 | case ASB_VIOLENTLY: | ||
2186 | dev_err(DEV, "Configuration error.\n"); | 2571 | dev_err(DEV, "Configuration error.\n"); |
2187 | break; | 2572 | break; |
2188 | case ASB_DISCONNECT: | 2573 | case ASB_DISCONNECT: |
@@ -2211,14 +2596,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2211 | "Using discard-least-changes instead\n"); | 2596 | "Using discard-least-changes instead\n"); |
2212 | case ASB_DISCARD_ZERO_CHG: | 2597 | case ASB_DISCARD_ZERO_CHG: |
2213 | if (ch_peer == 0 && ch_self == 0) { | 2598 | if (ch_peer == 0 && ch_self == 0) { |
2214 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | 2599 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2215 | ? -1 : 1; | 2600 | ? -1 : 1; |
2216 | break; | 2601 | break; |
2217 | } else { | 2602 | } else { |
2218 | if (ch_peer == 0) { rv = 1; break; } | 2603 | if (ch_peer == 0) { rv = 1; break; } |
2219 | if (ch_self == 0) { rv = -1; break; } | 2604 | if (ch_self == 0) { rv = -1; break; } |
2220 | } | 2605 | } |
2221 | if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) | 2606 | if (after_sb_0p == ASB_DISCARD_ZERO_CHG) |
2222 | break; | 2607 | break; |
2223 | case ASB_DISCARD_LEAST_CHG: | 2608 | case ASB_DISCARD_LEAST_CHG: |
2224 | if (ch_self < ch_peer) | 2609 | if (ch_self < ch_peer) |
@@ -2227,7 +2612,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2227 | rv = 1; | 2612 | rv = 1; |
2228 | else /* ( ch_self == ch_peer ) */ | 2613 | else /* ( ch_self == ch_peer ) */ |
2229 | /* Well, then use something else. */ | 2614 | /* Well, then use something else. */ |
2230 | rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) | 2615 | rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) |
2231 | ? -1 : 1; | 2616 | ? -1 : 1; |
2232 | break; | 2617 | break; |
2233 | case ASB_DISCARD_LOCAL: | 2618 | case ASB_DISCARD_LOCAL: |
@@ -2243,13 +2628,18 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) | |||
2243 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | 2628 | static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) |
2244 | { | 2629 | { |
2245 | int hg, rv = -100; | 2630 | int hg, rv = -100; |
2631 | enum drbd_after_sb_p after_sb_1p; | ||
2246 | 2632 | ||
2247 | switch (mdev->net_conf->after_sb_1p) { | 2633 | rcu_read_lock(); |
2634 | after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p; | ||
2635 | rcu_read_unlock(); | ||
2636 | switch (after_sb_1p) { | ||
2248 | case ASB_DISCARD_YOUNGER_PRI: | 2637 | case ASB_DISCARD_YOUNGER_PRI: |
2249 | case ASB_DISCARD_OLDER_PRI: | 2638 | case ASB_DISCARD_OLDER_PRI: |
2250 | case ASB_DISCARD_LEAST_CHG: | 2639 | case ASB_DISCARD_LEAST_CHG: |
2251 | case ASB_DISCARD_LOCAL: | 2640 | case ASB_DISCARD_LOCAL: |
2252 | case ASB_DISCARD_REMOTE: | 2641 | case ASB_DISCARD_REMOTE: |
2642 | case ASB_DISCARD_ZERO_CHG: | ||
2253 | dev_err(DEV, "Configuration error.\n"); | 2643 | dev_err(DEV, "Configuration error.\n"); |
2254 | break; | 2644 | break; |
2255 | case ASB_DISCONNECT: | 2645 | case ASB_DISCONNECT: |
@@ -2292,8 +2682,12 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | |||
2292 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | 2682 | static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) |
2293 | { | 2683 | { |
2294 | int hg, rv = -100; | 2684 | int hg, rv = -100; |
2685 | enum drbd_after_sb_p after_sb_2p; | ||
2295 | 2686 | ||
2296 | switch (mdev->net_conf->after_sb_2p) { | 2687 | rcu_read_lock(); |
2688 | after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p; | ||
2689 | rcu_read_unlock(); | ||
2690 | switch (after_sb_2p) { | ||
2297 | case ASB_DISCARD_YOUNGER_PRI: | 2691 | case ASB_DISCARD_YOUNGER_PRI: |
2298 | case ASB_DISCARD_OLDER_PRI: | 2692 | case ASB_DISCARD_OLDER_PRI: |
2299 | case ASB_DISCARD_LEAST_CHG: | 2693 | case ASB_DISCARD_LEAST_CHG: |
@@ -2301,6 +2695,7 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) | |||
2301 | case ASB_DISCARD_REMOTE: | 2695 | case ASB_DISCARD_REMOTE: |
2302 | case ASB_CONSENSUS: | 2696 | case ASB_CONSENSUS: |
2303 | case ASB_DISCARD_SECONDARY: | 2697 | case ASB_DISCARD_SECONDARY: |
2698 | case ASB_DISCARD_ZERO_CHG: | ||
2304 | dev_err(DEV, "Configuration error.\n"); | 2699 | dev_err(DEV, "Configuration error.\n"); |
2305 | break; | 2700 | break; |
2306 | case ASB_VIOLENTLY: | 2701 | case ASB_VIOLENTLY: |
@@ -2386,13 +2781,15 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2386 | 2781 | ||
2387 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { | 2782 | if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { |
2388 | 2783 | ||
2389 | if (mdev->agreed_pro_version < 91) | 2784 | if (mdev->tconn->agreed_pro_version < 91) |
2390 | return -1091; | 2785 | return -1091; |
2391 | 2786 | ||
2392 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && | 2787 | if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && |
2393 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { | 2788 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { |
2394 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); | 2789 | dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); |
2395 | drbd_uuid_set_bm(mdev, 0UL); | 2790 | drbd_uuid_move_history(mdev); |
2791 | mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; | ||
2792 | mdev->ldev->md.uuid[UI_BITMAP] = 0; | ||
2396 | 2793 | ||
2397 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | 2794 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, |
2398 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); | 2795 | mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); |
@@ -2407,7 +2804,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2407 | 2804 | ||
2408 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { | 2805 | if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { |
2409 | 2806 | ||
2410 | if (mdev->agreed_pro_version < 91) | 2807 | if (mdev->tconn->agreed_pro_version < 91) |
2411 | return -1091; | 2808 | return -1091; |
2412 | 2809 | ||
2413 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && | 2810 | if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && |
@@ -2440,7 +2837,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2440 | case 1: /* self_pri && !peer_pri */ return 1; | 2837 | case 1: /* self_pri && !peer_pri */ return 1; |
2441 | case 2: /* !self_pri && peer_pri */ return -1; | 2838 | case 2: /* !self_pri && peer_pri */ return -1; |
2442 | case 3: /* self_pri && peer_pri */ | 2839 | case 3: /* self_pri && peer_pri */ |
2443 | dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); | 2840 | dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); |
2444 | return dc ? -1 : 1; | 2841 | return dc ? -1 : 1; |
2445 | } | 2842 | } |
2446 | } | 2843 | } |
@@ -2453,14 +2850,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2453 | *rule_nr = 51; | 2850 | *rule_nr = 51; |
2454 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); | 2851 | peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); |
2455 | if (self == peer) { | 2852 | if (self == peer) { |
2456 | if (mdev->agreed_pro_version < 96 ? | 2853 | if (mdev->tconn->agreed_pro_version < 96 ? |
2457 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == | 2854 | (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == |
2458 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : | 2855 | (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : |
2459 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { | 2856 | peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { |
2460 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2857 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2461 | resync as sync source modifications of the peer's UUIDs. */ | 2858 | resync as sync source modifications of the peer's UUIDs. */ |
2462 | 2859 | ||
2463 | if (mdev->agreed_pro_version < 91) | 2860 | if (mdev->tconn->agreed_pro_version < 91) |
2464 | return -1091; | 2861 | return -1091; |
2465 | 2862 | ||
2466 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; | 2863 | mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; |
@@ -2490,18 +2887,18 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2490 | *rule_nr = 71; | 2887 | *rule_nr = 71; |
2491 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); | 2888 | self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); |
2492 | if (self == peer) { | 2889 | if (self == peer) { |
2493 | if (mdev->agreed_pro_version < 96 ? | 2890 | if (mdev->tconn->agreed_pro_version < 96 ? |
2494 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == | 2891 | (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == |
2495 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : | 2892 | (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : |
2496 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { | 2893 | self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { |
2497 | /* The last P_SYNC_UUID did not get through. Undo the last start of | 2894 | /* The last P_SYNC_UUID did not get through. Undo the last start of |
2498 | resync as sync source modifications of our UUIDs. */ | 2895 | resync as sync source modifications of our UUIDs. */ |
2499 | 2896 | ||
2500 | if (mdev->agreed_pro_version < 91) | 2897 | if (mdev->tconn->agreed_pro_version < 91) |
2501 | return -1091; | 2898 | return -1091; |
2502 | 2899 | ||
2503 | _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); | 2900 | __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); |
2504 | _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); | 2901 | __drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); |
2505 | 2902 | ||
2506 | dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); | 2903 | dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); |
2507 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, | 2904 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, |
@@ -2545,20 +2942,24 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l | |||
2545 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, | 2942 | static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, |
2546 | enum drbd_disk_state peer_disk) __must_hold(local) | 2943 | enum drbd_disk_state peer_disk) __must_hold(local) |
2547 | { | 2944 | { |
2548 | int hg, rule_nr; | ||
2549 | enum drbd_conns rv = C_MASK; | 2945 | enum drbd_conns rv = C_MASK; |
2550 | enum drbd_disk_state mydisk; | 2946 | enum drbd_disk_state mydisk; |
2947 | struct net_conf *nc; | ||
2948 | int hg, rule_nr, rr_conflict, tentative; | ||
2551 | 2949 | ||
2552 | mydisk = mdev->state.disk; | 2950 | mydisk = mdev->state.disk; |
2553 | if (mydisk == D_NEGOTIATING) | 2951 | if (mydisk == D_NEGOTIATING) |
2554 | mydisk = mdev->new_state_tmp.disk; | 2952 | mydisk = mdev->new_state_tmp.disk; |
2555 | 2953 | ||
2556 | dev_info(DEV, "drbd_sync_handshake:\n"); | 2954 | dev_info(DEV, "drbd_sync_handshake:\n"); |
2955 | |||
2956 | spin_lock_irq(&mdev->ldev->md.uuid_lock); | ||
2557 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); | 2957 | drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); |
2558 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, | 2958 | drbd_uuid_dump(mdev, "peer", mdev->p_uuid, |
2559 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); | 2959 | mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); |
2560 | 2960 | ||
2561 | hg = drbd_uuid_compare(mdev, &rule_nr); | 2961 | hg = drbd_uuid_compare(mdev, &rule_nr); |
2962 | spin_unlock_irq(&mdev->ldev->md.uuid_lock); | ||
2562 | 2963 | ||
2563 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); | 2964 | dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); |
2564 | 2965 | ||
@@ -2584,7 +2985,10 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2584 | if (abs(hg) == 100) | 2985 | if (abs(hg) == 100) |
2585 | drbd_khelper(mdev, "initial-split-brain"); | 2986 | drbd_khelper(mdev, "initial-split-brain"); |
2586 | 2987 | ||
2587 | if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { | 2988 | rcu_read_lock(); |
2989 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2990 | |||
2991 | if (hg == 100 || (hg == -100 && nc->always_asbp)) { | ||
2588 | int pcount = (mdev->state.role == R_PRIMARY) | 2992 | int pcount = (mdev->state.role == R_PRIMARY) |
2589 | + (peer_role == R_PRIMARY); | 2993 | + (peer_role == R_PRIMARY); |
2590 | int forced = (hg == -100); | 2994 | int forced = (hg == -100); |
@@ -2613,9 +3017,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2613 | } | 3017 | } |
2614 | 3018 | ||
2615 | if (hg == -100) { | 3019 | if (hg == -100) { |
2616 | if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) | 3020 | if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1)) |
2617 | hg = -1; | 3021 | hg = -1; |
2618 | if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) | 3022 | if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1)) |
2619 | hg = 1; | 3023 | hg = 1; |
2620 | 3024 | ||
2621 | if (abs(hg) < 100) | 3025 | if (abs(hg) < 100) |
@@ -2623,6 +3027,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2623 | "Sync from %s node\n", | 3027 | "Sync from %s node\n", |
2624 | (hg < 0) ? "peer" : "this"); | 3028 | (hg < 0) ? "peer" : "this"); |
2625 | } | 3029 | } |
3030 | rr_conflict = nc->rr_conflict; | ||
3031 | tentative = nc->tentative; | ||
3032 | rcu_read_unlock(); | ||
2626 | 3033 | ||
2627 | if (hg == -100) { | 3034 | if (hg == -100) { |
2628 | /* FIXME this log message is not correct if we end up here | 3035 | /* FIXME this log message is not correct if we end up here |
@@ -2641,7 +3048,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2641 | 3048 | ||
2642 | if (hg < 0 && /* by intention we do not use mydisk here. */ | 3049 | if (hg < 0 && /* by intention we do not use mydisk here. */ |
2643 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { | 3050 | mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { |
2644 | switch (mdev->net_conf->rr_conflict) { | 3051 | switch (rr_conflict) { |
2645 | case ASB_CALL_HELPER: | 3052 | case ASB_CALL_HELPER: |
2646 | drbd_khelper(mdev, "pri-lost"); | 3053 | drbd_khelper(mdev, "pri-lost"); |
2647 | /* fall through */ | 3054 | /* fall through */ |
@@ -2654,7 +3061,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2654 | } | 3061 | } |
2655 | } | 3062 | } |
2656 | 3063 | ||
2657 | if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) { | 3064 | if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) { |
2658 | if (hg == 0) | 3065 | if (hg == 0) |
2659 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); | 3066 | dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); |
2660 | else | 3067 | else |
@@ -2686,33 +3093,29 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol | |||
2686 | return rv; | 3093 | return rv; |
2687 | } | 3094 | } |
2688 | 3095 | ||
2689 | /* returns 1 if invalid */ | 3096 | static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) |
2690 | static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) | ||
2691 | { | 3097 | { |
2692 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ | 3098 | /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ |
2693 | if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || | 3099 | if (peer == ASB_DISCARD_REMOTE) |
2694 | (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) | 3100 | return ASB_DISCARD_LOCAL; |
2695 | return 0; | ||
2696 | 3101 | ||
2697 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ | 3102 | /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ |
2698 | if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || | 3103 | if (peer == ASB_DISCARD_LOCAL) |
2699 | self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) | 3104 | return ASB_DISCARD_REMOTE; |
2700 | return 1; | ||
2701 | 3105 | ||
2702 | /* everything else is valid if they are equal on both sides. */ | 3106 | /* everything else is valid if they are equal on both sides. */ |
2703 | if (peer == self) | 3107 | return peer; |
2704 | return 0; | ||
2705 | |||
2706 | /* everything else is invalid. */ |||
2707 | return 1; | ||
2708 | } | 3108 | } |
2709 | 3109 | ||
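The old cmp_after_sb() flagged every asymmetric combination as invalid; the new convert_after_sb() instead translates the peer's after-split-brain setting into the local point of view, so the caller (further down in receive_protocol()) only needs an equality test against its own net_conf. A minimal sketch of why the discard-local/discard-remote pairing still passes under that conversion; only the final if-block is taken from this diff, the rest is illustration:

    /* Sketch only: demonstrates the symmetry of the converted check.
     * If the peer is configured ASB_DISCARD_REMOTE ("discard my data")
     * and we are configured ASB_DISCARD_LOCAL, both sides agree:
     *
     *   convert_after_sb(ASB_DISCARD_REMOTE) == ASB_DISCARD_LOCAL  -> match here
     *   convert_after_sb(ASB_DISCARD_LOCAL)  == ASB_DISCARD_REMOTE -> match on the peer
     *
     * Any other mismatch still trips the "incompatible settings" path: */
    if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
            conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
            goto disconnect_rcu_unlock;
    }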
2710 | static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3110 | static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) |
2711 | { | 3111 | { |
2712 | struct p_protocol *p = &mdev->data.rbuf.protocol; | 3112 | struct p_protocol *p = pi->data; |
2713 | int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; | 3113 | enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; |
2714 | int p_want_lose, p_two_primaries, cf; | 3114 | int p_proto, p_discard_my_data, p_two_primaries, cf; |
2715 | char p_integrity_alg[SHARED_SECRET_MAX] = ""; | 3115 | struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; |
3116 | char integrity_alg[SHARED_SECRET_MAX] = ""; | ||
3117 | struct crypto_hash *peer_integrity_tfm = NULL; | ||
3118 | void *int_dig_in = NULL, *int_dig_vv = NULL; | ||
2716 | 3119 | ||
2717 | p_proto = be32_to_cpu(p->protocol); | 3120 | p_proto = be32_to_cpu(p->protocol); |
2718 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); | 3121 | p_after_sb_0p = be32_to_cpu(p->after_sb_0p); |
@@ -2720,63 +3123,138 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig | |||
2720 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); | 3123 | p_after_sb_2p = be32_to_cpu(p->after_sb_2p); |
2721 | p_two_primaries = be32_to_cpu(p->two_primaries); | 3124 | p_two_primaries = be32_to_cpu(p->two_primaries); |
2722 | cf = be32_to_cpu(p->conn_flags); | 3125 | cf = be32_to_cpu(p->conn_flags); |
2723 | p_want_lose = cf & CF_WANT_LOSE; | 3126 | p_discard_my_data = cf & CF_DISCARD_MY_DATA; |
2724 | |||
2725 | clear_bit(CONN_DRY_RUN, &mdev->flags); | ||
2726 | 3127 | ||
2727 | if (cf & CF_DRY_RUN) | 3128 | if (tconn->agreed_pro_version >= 87) { |
2728 | set_bit(CONN_DRY_RUN, &mdev->flags); | 3129 | int err; |
2729 | 3130 | ||
2730 | if (p_proto != mdev->net_conf->wire_protocol) { | 3131 | if (pi->size > sizeof(integrity_alg)) |
2731 | dev_err(DEV, "incompatible communication protocols\n"); | 3132 | return -EIO; |
2732 | goto disconnect; | 3133 | err = drbd_recv_all(tconn, integrity_alg, pi->size); |
3134 | if (err) | ||
3135 | return err; | ||
3136 | integrity_alg[SHARED_SECRET_MAX - 1] = 0; | ||
2733 | } | 3137 | } |
2734 | 3138 | ||
2735 | if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { | 3139 | if (pi->cmd != P_PROTOCOL_UPDATE) { |
2736 | dev_err(DEV, "incompatible after-sb-0pri settings\n"); | 3140 | clear_bit(CONN_DRY_RUN, &tconn->flags); |
2737 | goto disconnect; | ||
2738 | } | ||
2739 | 3141 | ||
2740 | if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { | 3142 | if (cf & CF_DRY_RUN) |
2741 | dev_err(DEV, "incompatible after-sb-1pri settings\n"); | 3143 | set_bit(CONN_DRY_RUN, &tconn->flags); |
2742 | goto disconnect; | ||
2743 | } | ||
2744 | 3144 | ||
2745 | if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { | 3145 | rcu_read_lock(); |
2746 | dev_err(DEV, "incompatible after-sb-2pri settings\n"); | 3146 | nc = rcu_dereference(tconn->net_conf); |
2747 | goto disconnect; | ||
2748 | } | ||
2749 | 3147 | ||
2750 | if (p_want_lose && mdev->net_conf->want_lose) { | 3148 | if (p_proto != nc->wire_protocol) { |
2751 | dev_err(DEV, "both sides have the 'want_lose' flag set\n"); | 3149 | conn_err(tconn, "incompatible %s settings\n", "protocol"); |
2752 | goto disconnect; | 3150 | goto disconnect_rcu_unlock; |
2753 | } | 3151 | } |
2754 | 3152 | ||
2755 | if (p_two_primaries != mdev->net_conf->two_primaries) { | 3153 | if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { |
2756 | dev_err(DEV, "incompatible setting of the two-primaries options\n"); | 3154 | conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri"); |
2757 | goto disconnect; | 3155 | goto disconnect_rcu_unlock; |
3156 | } | ||
3157 | |||
3158 | if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { | ||
3159 | conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri"); | ||
3160 | goto disconnect_rcu_unlock; | ||
3161 | } | ||
3162 | |||
3163 | if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { | ||
3164 | conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri"); | ||
3165 | goto disconnect_rcu_unlock; | ||
3166 | } | ||
3167 | |||
3168 | if (p_discard_my_data && nc->discard_my_data) { | ||
3169 | conn_err(tconn, "incompatible %s settings\n", "discard-my-data"); | ||
3170 | goto disconnect_rcu_unlock; | ||
3171 | } | ||
3172 | |||
3173 | if (p_two_primaries != nc->two_primaries) { | ||
3174 | conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries"); | ||
3175 | goto disconnect_rcu_unlock; | ||
3176 | } | ||
3177 | |||
3178 | if (strcmp(integrity_alg, nc->integrity_alg)) { | ||
3179 | conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg"); | ||
3180 | goto disconnect_rcu_unlock; | ||
3181 | } | ||
3182 | |||
3183 | rcu_read_unlock(); | ||
2758 | } | 3184 | } |
2759 | 3185 | ||
2760 | if (mdev->agreed_pro_version >= 87) { | 3186 | if (integrity_alg[0]) { |
2761 | unsigned char *my_alg = mdev->net_conf->integrity_alg; | 3187 | int hash_size; |
2762 | 3188 | ||
2763 | if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) | 3189 | /* |
2764 | return false; | 3190 | * We can only change the peer data integrity algorithm |
3191 | * here. Changing our own data integrity algorithm | ||
3192 | * requires that we send a P_PROTOCOL_UPDATE packet at | ||
3193 | * the same time; otherwise, the peer has no way to | ||
3194 | * tell between which packets the algorithm should | ||
3195 | * change. | ||
3196 | */ | ||
2765 | 3197 | ||
2766 | p_integrity_alg[SHARED_SECRET_MAX-1] = 0; | 3198 | peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); |
2767 | if (strcmp(p_integrity_alg, my_alg)) { | 3199 | if (!peer_integrity_tfm) { |
2768 | dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); | 3200 | conn_err(tconn, "peer data-integrity-alg %s not supported\n", |
3201 | integrity_alg); | ||
2769 | goto disconnect; | 3202 | goto disconnect; |
2770 | } | 3203 | } |
2771 | dev_info(DEV, "data-integrity-alg: %s\n", | 3204 | |
2772 | my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); | 3205 | hash_size = crypto_hash_digestsize(peer_integrity_tfm); |
3206 | int_dig_in = kmalloc(hash_size, GFP_KERNEL); | ||
3207 | int_dig_vv = kmalloc(hash_size, GFP_KERNEL); | ||
3208 | if (!(int_dig_in && int_dig_vv)) { | ||
3209 | conn_err(tconn, "Allocation of buffers for data integrity checking failed\n"); | ||
3210 | goto disconnect; | ||
3211 | } | ||
3212 | } | ||
3213 | |||
3214 | new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); | ||
3215 | if (!new_net_conf) { | ||
3216 | conn_err(tconn, "Allocation of new net_conf failed\n"); | ||
3217 | goto disconnect; | ||
2773 | } | 3218 | } |
2774 | 3219 | ||
2775 | return true; | 3220 | mutex_lock(&tconn->data.mutex); |
3221 | mutex_lock(&tconn->conf_update); | ||
3222 | old_net_conf = tconn->net_conf; | ||
3223 | *new_net_conf = *old_net_conf; | ||
3224 | |||
3225 | new_net_conf->wire_protocol = p_proto; | ||
3226 | new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); | ||
3227 | new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); | ||
3228 | new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); | ||
3229 | new_net_conf->two_primaries = p_two_primaries; | ||
2776 | 3230 | ||
3231 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
3232 | mutex_unlock(&tconn->conf_update); | ||
3233 | mutex_unlock(&tconn->data.mutex); | ||
3234 | |||
3235 | crypto_free_hash(tconn->peer_integrity_tfm); | ||
3236 | kfree(tconn->int_dig_in); | ||
3237 | kfree(tconn->int_dig_vv); | ||
3238 | tconn->peer_integrity_tfm = peer_integrity_tfm; | ||
3239 | tconn->int_dig_in = int_dig_in; | ||
3240 | tconn->int_dig_vv = int_dig_vv; | ||
3241 | |||
3242 | if (strcmp(old_net_conf->integrity_alg, integrity_alg)) | ||
3243 | conn_info(tconn, "peer data-integrity-alg: %s\n", | ||
3244 | integrity_alg[0] ? integrity_alg : "(none)"); | ||
3245 | |||
3246 | synchronize_rcu(); | ||
3247 | kfree(old_net_conf); | ||
3248 | return 0; | ||
3249 | |||
3250 | disconnect_rcu_unlock: | ||
3251 | rcu_read_unlock(); | ||
2777 | disconnect: | 3252 | disconnect: |
2778 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3253 | crypto_free_hash(peer_integrity_tfm); |
2779 | return false; | 3254 | kfree(int_dig_in); |
3255 | kfree(int_dig_vv); | ||
3256 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
3257 | return -EIO; | ||
2780 | } | 3258 | } |
2781 | 3259 | ||
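receive_protocol() above (and receive_SyncParam() below) replaces the per-field assignments of the old code with the usual RCU publish pattern for tconn->net_conf: readers dereference the pointer under rcu_read_lock(), the updater copies the old struct, modifies the copy under conf_update, publishes it with rcu_assign_pointer(), and frees the old copy only after synchronize_rcu(). A condensed sketch of that pattern, with error handling and most fields stripped out (use() is a placeholder, not drbd code):

    /* Reader side: never blocks, sees either the old or the new config. */
    rcu_read_lock();
    nc = rcu_dereference(tconn->net_conf);
    use(nc->wire_protocol);
    rcu_read_unlock();

    /* Updater side: copy, modify, publish, wait, free. */
    new_net_conf = kmalloc(sizeof(*new_net_conf), GFP_KERNEL);
    if (!new_net_conf)
            return -ENOMEM;
    mutex_lock(&tconn->conf_update);
    old_net_conf = tconn->net_conf;
    *new_net_conf = *old_net_conf;
    new_net_conf->wire_protocol = p_proto;
    rcu_assign_pointer(tconn->net_conf, new_net_conf);
    mutex_unlock(&tconn->conf_update);
    synchronize_rcu();          /* all readers of old_net_conf are done */
    kfree(old_net_conf);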
2782 | /* helper function | 3260 | /* helper function |
@@ -2798,24 +3276,64 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, | |||
2798 | alg, name, PTR_ERR(tfm)); | 3276 | alg, name, PTR_ERR(tfm)); |
2799 | return tfm; | 3277 | return tfm; |
2800 | } | 3278 | } |
2801 | if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { | ||
2802 | crypto_free_hash(tfm); | ||
2803 | dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); | ||
2804 | return ERR_PTR(-EINVAL); | ||
2805 | } | ||
2806 | return tfm; | 3279 | return tfm; |
2807 | } | 3280 | } |
2808 | 3281 | ||
2809 | static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) | 3282 | static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi) |
3283 | { | ||
3284 | void *buffer = tconn->data.rbuf; | ||
3285 | int size = pi->size; | ||
3286 | |||
3287 | while (size) { | ||
3288 | int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); | ||
3289 | s = drbd_recv(tconn, buffer, s); | ||
3290 | if (s <= 0) { | ||
3291 | if (s < 0) | ||
3292 | return s; | ||
3293 | break; | ||
3294 | } | ||
3295 | size -= s; | ||
3296 | } | ||
3297 | if (size) | ||
3298 | return -EIO; | ||
3299 | return 0; | ||
3300 | } | ||
3301 | |||
3302 | /* | ||
3303 | * config_unknown_volume - device configuration command for unknown volume | ||
3304 | * | ||
3305 | * When a device is added to an existing connection, the node on which the | ||
3306 | * device is added first will send configuration commands to its peer but the | ||
3307 | * peer will not know about the device yet. It will warn and ignore these | ||
3308 | * commands. Once the device is added on the second node, the second node will | ||
3309 | * send the same device configuration commands, but in the other direction. | ||
3310 | * | ||
3311 | * (We can also end up here if drbd is misconfigured.) | ||
3312 | */ | ||
3313 | static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi) | ||
2810 | { | 3314 | { |
2811 | int ok = true; | 3315 | conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n", |
2812 | struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; | 3316 | cmdname(pi->cmd), pi->vnr); |
3317 | return ignore_remaining_packet(tconn, pi); | ||
3318 | } | ||
3319 | |||
3320 | static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3321 | { | ||
3322 | struct drbd_conf *mdev; | ||
3323 | struct p_rs_param_95 *p; | ||
2813 | unsigned int header_size, data_size, exp_max_sz; | 3324 | unsigned int header_size, data_size, exp_max_sz; |
2814 | struct crypto_hash *verify_tfm = NULL; | 3325 | struct crypto_hash *verify_tfm = NULL; |
2815 | struct crypto_hash *csums_tfm = NULL; | 3326 | struct crypto_hash *csums_tfm = NULL; |
2816 | const int apv = mdev->agreed_pro_version; | 3327 | struct net_conf *old_net_conf, *new_net_conf = NULL; |
2817 | int *rs_plan_s = NULL; | 3328 | struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; |
3329 | const int apv = tconn->agreed_pro_version; | ||
3330 | struct fifo_buffer *old_plan = NULL, *new_plan = NULL; | ||
2818 | int fifo_size = 0; | 3331 | int fifo_size = 0; |
3332 | int err; | ||
3333 | |||
3334 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3335 | if (!mdev) | ||
3336 | return config_unknown_volume(tconn, pi); | ||
2819 | 3337 | ||
2820 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) | 3338 | exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) |
2821 | : apv == 88 ? sizeof(struct p_rs_param) | 3339 | : apv == 88 ? sizeof(struct p_rs_param) |
@@ -2823,32 +3341,49 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2823 | : apv <= 94 ? sizeof(struct p_rs_param_89) | 3341 | : apv <= 94 ? sizeof(struct p_rs_param_89) |
2824 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); | 3342 | : /* apv >= 95 */ sizeof(struct p_rs_param_95); |
2825 | 3343 | ||
2826 | if (packet_size > exp_max_sz) { | 3344 | if (pi->size > exp_max_sz) { |
2827 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", | 3345 | dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", |
2828 | packet_size, exp_max_sz); | 3346 | pi->size, exp_max_sz); |
2829 | return false; | 3347 | return -EIO; |
2830 | } | 3348 | } |
2831 | 3349 | ||
2832 | if (apv <= 88) { | 3350 | if (apv <= 88) { |
2833 | header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); | 3351 | header_size = sizeof(struct p_rs_param); |
2834 | data_size = packet_size - header_size; | 3352 | data_size = pi->size - header_size; |
2835 | } else if (apv <= 94) { | 3353 | } else if (apv <= 94) { |
2836 | header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); | 3354 | header_size = sizeof(struct p_rs_param_89); |
2837 | data_size = packet_size - header_size; | 3355 | data_size = pi->size - header_size; |
2838 | D_ASSERT(data_size == 0); | 3356 | D_ASSERT(data_size == 0); |
2839 | } else { | 3357 | } else { |
2840 | header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); | 3358 | header_size = sizeof(struct p_rs_param_95); |
2841 | data_size = packet_size - header_size; | 3359 | data_size = pi->size - header_size; |
2842 | D_ASSERT(data_size == 0); | 3360 | D_ASSERT(data_size == 0); |
2843 | } | 3361 | } |
2844 | 3362 | ||
2845 | /* initialize verify_alg and csums_alg */ | 3363 | /* initialize verify_alg and csums_alg */ |
3364 | p = pi->data; | ||
2846 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); | 3365 | memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); |
2847 | 3366 | ||
2848 | if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) | 3367 | err = drbd_recv_all(mdev->tconn, p, header_size); |
2849 | return false; | 3368 | if (err) |
3369 | return err; | ||
2850 | 3370 | ||
2851 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3371 | mutex_lock(&mdev->tconn->conf_update); |
3372 | old_net_conf = mdev->tconn->net_conf; | ||
3373 | if (get_ldev(mdev)) { | ||
3374 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3375 | if (!new_disk_conf) { | ||
3376 | put_ldev(mdev); | ||
3377 | mutex_unlock(&mdev->tconn->conf_update); | ||
3378 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3379 | return -ENOMEM; | ||
3380 | } | ||
3381 | |||
3382 | old_disk_conf = mdev->ldev->disk_conf; | ||
3383 | *new_disk_conf = *old_disk_conf; | ||
3384 | |||
3385 | new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); | ||
3386 | } | ||
2852 | 3387 | ||
2853 | if (apv >= 88) { | 3388 | if (apv >= 88) { |
2854 | if (apv == 88) { | 3389 | if (apv == 88) { |
@@ -2856,12 +3391,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2856 | dev_err(DEV, "verify-alg of wrong size, " | 3391 | dev_err(DEV, "verify-alg of wrong size, " |
2857 | "peer wants %u, accepting only up to %u byte\n", | 3392 | "peer wants %u, accepting only up to %u byte\n", |
2858 | data_size, SHARED_SECRET_MAX); | 3393 | data_size, SHARED_SECRET_MAX); |
2859 | return false; | 3394 | err = -EIO; |
3395 | goto reconnect; | ||
2860 | } | 3396 | } |
2861 | 3397 | ||
2862 | if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) | 3398 | err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size); |
2863 | return false; | 3399 | if (err) |
2864 | 3400 | goto reconnect; | |
2865 | /* we expect NUL terminated string */ | 3401 | /* we expect NUL terminated string */ |
2866 | /* but just in case someone tries to be evil */ | 3402 | /* but just in case someone tries to be evil */ |
2867 | D_ASSERT(p->verify_alg[data_size-1] == 0); | 3403 | D_ASSERT(p->verify_alg[data_size-1] == 0); |
@@ -2876,10 +3412,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2876 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; | 3412 | p->csums_alg[SHARED_SECRET_MAX-1] = 0; |
2877 | } | 3413 | } |
2878 | 3414 | ||
2879 | if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { | 3415 | if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { |
2880 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3416 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2881 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", | 3417 | dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", |
2882 | mdev->sync_conf.verify_alg, p->verify_alg); | 3418 | old_net_conf->verify_alg, p->verify_alg); |
2883 | goto disconnect; | 3419 | goto disconnect; |
2884 | } | 3420 | } |
2885 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3421 | verify_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2890,10 +3426,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2890 | } | 3426 | } |
2891 | } | 3427 | } |
2892 | 3428 | ||
2893 | if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { | 3429 | if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { |
2894 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | 3430 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { |
2895 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", | 3431 | dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", |
2896 | mdev->sync_conf.csums_alg, p->csums_alg); | 3432 | old_net_conf->csums_alg, p->csums_alg); |
2897 | goto disconnect; | 3433 | goto disconnect; |
2898 | } | 3434 | } |
2899 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, | 3435 | csums_tfm = drbd_crypto_alloc_digest_safe(mdev, |
@@ -2904,57 +3440,91 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
2904 | } | 3440 | } |
2905 | } | 3441 | } |
2906 | 3442 | ||
2907 | if (apv > 94) { | 3443 | if (apv > 94 && new_disk_conf) { |
2908 | mdev->sync_conf.rate = be32_to_cpu(p->rate); | 3444 | new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); |
2909 | mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); | 3445 | new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); |
2910 | mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); | 3446 | new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); |
2911 | mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); | 3447 | new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); |
2912 | mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); | 3448 | |
2913 | 3449 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | |
2914 | fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 3450 | if (fifo_size != mdev->rs_plan_s->size) { |
2915 | if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { | 3451 | new_plan = fifo_alloc(fifo_size); |
2916 | rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); | 3452 | if (!new_plan) { |
2917 | if (!rs_plan_s) { | ||
2918 | dev_err(DEV, "kmalloc of fifo_buffer failed"); | 3453 | dev_err(DEV, "kmalloc of fifo_buffer failed"); |
3454 | put_ldev(mdev); | ||
2919 | goto disconnect; | 3455 | goto disconnect; |
2920 | } | 3456 | } |
2921 | } | 3457 | } |
2922 | } | 3458 | } |
2923 | 3459 | ||
2924 | spin_lock(&mdev->peer_seq_lock); | 3460 | if (verify_tfm || csums_tfm) { |
2925 | /* lock against drbd_nl_syncer_conf() */ | 3461 | new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); |
2926 | if (verify_tfm) { | 3462 | if (!new_net_conf) { |
2927 | strcpy(mdev->sync_conf.verify_alg, p->verify_alg); | 3463 | dev_err(DEV, "Allocation of new net_conf failed\n"); |
2928 | mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; | 3464 | goto disconnect; |
2929 | crypto_free_hash(mdev->verify_tfm); | 3465 | } |
2930 | mdev->verify_tfm = verify_tfm; | 3466 | |
2931 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); | 3467 | *new_net_conf = *old_net_conf; |
2932 | } | 3468 | |
2933 | if (csums_tfm) { | 3469 | if (verify_tfm) { |
2934 | strcpy(mdev->sync_conf.csums_alg, p->csums_alg); | 3470 | strcpy(new_net_conf->verify_alg, p->verify_alg); |
2935 | mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; | 3471 | new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; |
2936 | crypto_free_hash(mdev->csums_tfm); | 3472 | crypto_free_hash(mdev->tconn->verify_tfm); |
2937 | mdev->csums_tfm = csums_tfm; | 3473 | mdev->tconn->verify_tfm = verify_tfm; |
2938 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | 3474 | dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); |
2939 | } | 3475 | } |
2940 | if (fifo_size != mdev->rs_plan_s.size) { | 3476 | if (csums_tfm) { |
2941 | kfree(mdev->rs_plan_s.values); | 3477 | strcpy(new_net_conf->csums_alg, p->csums_alg); |
2942 | mdev->rs_plan_s.values = rs_plan_s; | 3478 | new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; |
2943 | mdev->rs_plan_s.size = fifo_size; | 3479 | crypto_free_hash(mdev->tconn->csums_tfm); |
2944 | mdev->rs_planed = 0; | 3480 | mdev->tconn->csums_tfm = csums_tfm; |
3481 | dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); | ||
3482 | } | ||
3483 | rcu_assign_pointer(tconn->net_conf, new_net_conf); | ||
2945 | } | 3484 | } |
2946 | spin_unlock(&mdev->peer_seq_lock); | ||
2947 | } | 3485 | } |
2948 | 3486 | ||
2949 | return ok; | 3487 | if (new_disk_conf) { |
3488 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3489 | put_ldev(mdev); | ||
3490 | } | ||
3491 | |||
3492 | if (new_plan) { | ||
3493 | old_plan = mdev->rs_plan_s; | ||
3494 | rcu_assign_pointer(mdev->rs_plan_s, new_plan); | ||
3495 | } | ||
3496 | |||
3497 | mutex_unlock(&mdev->tconn->conf_update); | ||
3498 | synchronize_rcu(); | ||
3499 | if (new_net_conf) | ||
3500 | kfree(old_net_conf); | ||
3501 | kfree(old_disk_conf); | ||
3502 | kfree(old_plan); | ||
3503 | |||
3504 | return 0; | ||
3505 | |||
3506 | reconnect: | ||
3507 | if (new_disk_conf) { | ||
3508 | put_ldev(mdev); | ||
3509 | kfree(new_disk_conf); | ||
3510 | } | ||
3511 | mutex_unlock(&mdev->tconn->conf_update); | ||
3512 | return -EIO; | ||
3513 | |||
2950 | disconnect: | 3514 | disconnect: |
3515 | kfree(new_plan); | ||
3516 | if (new_disk_conf) { | ||
3517 | put_ldev(mdev); | ||
3518 | kfree(new_disk_conf); | ||
3519 | } | ||
3520 | mutex_unlock(&mdev->tconn->conf_update); | ||
2951 | /* just for completeness: actually not needed, | 3521 | /* just for completeness: actually not needed, |
2952 | * as this is not reached if csums_tfm was ok. */ | 3522 | * as this is not reached if csums_tfm was ok. */ |
2953 | crypto_free_hash(csums_tfm); | 3523 | crypto_free_hash(csums_tfm); |
2954 | /* but free the verify_tfm again, if csums_tfm did not work out */ | 3524 | /* but free the verify_tfm again, if csums_tfm did not work out */ |
2955 | crypto_free_hash(verify_tfm); | 3525 | crypto_free_hash(verify_tfm); |
2956 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3526 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
2957 | return false; | 3527 | return -EIO; |
2958 | } | 3528 | } |
2959 | 3529 | ||
2960 | /* warn if the arguments differ by more than 12.5% */ | 3530 | /* warn if the arguments differ by more than 12.5% */ |
@@ -2970,59 +3540,77 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, | |||
2970 | (unsigned long long)a, (unsigned long long)b); | 3540 | (unsigned long long)a, (unsigned long long)b); |
2971 | } | 3541 | } |
2972 | 3542 | ||
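The "differ by more than 12.5%" test itself lies outside this hunk, which only shows the tail of warn_if_differ_considerably(). Conceptually it amounts to comparing the absolute difference against one eighth of each operand; a sketch under that assumption, not the exact drbd body:

    /* Assumed shape of the check; 1/8 == 12.5%. */
    static bool differ_considerably(unsigned long long a, unsigned long long b)
    {
            unsigned long long d;

            if (a == 0 || b == 0)
                    return false;
            d = (a > b) ? a - b : b - a;
            return d > a / 8 || d > b / 8;
    }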
2973 | static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3543 | static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) |
2974 | { | 3544 | { |
2975 | struct p_sizes *p = &mdev->data.rbuf.sizes; | 3545 | struct drbd_conf *mdev; |
3546 | struct p_sizes *p = pi->data; | ||
2976 | enum determine_dev_size dd = unchanged; | 3547 | enum determine_dev_size dd = unchanged; |
2977 | sector_t p_size, p_usize, my_usize; | 3548 | sector_t p_size, p_usize, my_usize; |
2978 | int ldsc = 0; /* local disk size changed */ | 3549 | int ldsc = 0; /* local disk size changed */ |
2979 | enum dds_flags ddsf; | 3550 | enum dds_flags ddsf; |
2980 | 3551 | ||
3552 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3553 | if (!mdev) | ||
3554 | return config_unknown_volume(tconn, pi); | ||
3555 | |||
2981 | p_size = be64_to_cpu(p->d_size); | 3556 | p_size = be64_to_cpu(p->d_size); |
2982 | p_usize = be64_to_cpu(p->u_size); | 3557 | p_usize = be64_to_cpu(p->u_size); |
2983 | 3558 | ||
2984 | if (p_size == 0 && mdev->state.disk == D_DISKLESS) { | ||
2985 | dev_err(DEV, "some backing storage is needed\n"); | ||
2986 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
2987 | return false; | ||
2988 | } | ||
2989 | |||
2990 | /* just store the peer's disk size for now. | 3559 | /* just store the peer's disk size for now. |
2991 | * we still need to figure out whether we accept that. */ | 3560 | * we still need to figure out whether we accept that. */ |
2992 | mdev->p_size = p_size; | 3561 | mdev->p_size = p_size; |
2993 | 3562 | ||
2994 | if (get_ldev(mdev)) { | 3563 | if (get_ldev(mdev)) { |
3564 | rcu_read_lock(); | ||
3565 | my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size; | ||
3566 | rcu_read_unlock(); | ||
3567 | |||
2995 | warn_if_differ_considerably(mdev, "lower level device sizes", | 3568 | warn_if_differ_considerably(mdev, "lower level device sizes", |
2996 | p_size, drbd_get_max_capacity(mdev->ldev)); | 3569 | p_size, drbd_get_max_capacity(mdev->ldev)); |
2997 | warn_if_differ_considerably(mdev, "user requested size", | 3570 | warn_if_differ_considerably(mdev, "user requested size", |
2998 | p_usize, mdev->ldev->dc.disk_size); | 3571 | p_usize, my_usize); |
2999 | 3572 | ||
3000 | /* if this is the first connect, or an otherwise expected | 3573 | /* if this is the first connect, or an otherwise expected |
3001 | * param exchange, choose the minimum */ | 3574 | * param exchange, choose the minimum */ |
3002 | if (mdev->state.conn == C_WF_REPORT_PARAMS) | 3575 | if (mdev->state.conn == C_WF_REPORT_PARAMS) |
3003 | p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, | 3576 | p_usize = min_not_zero(my_usize, p_usize); |
3004 | p_usize); | ||
3005 | |||
3006 | my_usize = mdev->ldev->dc.disk_size; | ||
3007 | |||
3008 | if (mdev->ldev->dc.disk_size != p_usize) { | ||
3009 | mdev->ldev->dc.disk_size = p_usize; | ||
3010 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3011 | (unsigned long)mdev->ldev->dc.disk_size); | ||
3012 | } | ||
3013 | 3577 | ||
3014 | /* Never shrink a device with usable data during connect. | 3578 | /* Never shrink a device with usable data during connect. |
3015 | But allow online shrinking if we are connected. */ | 3579 | But allow online shrinking if we are connected. */ |
3016 | if (drbd_new_dev_size(mdev, mdev->ldev, 0) < | 3580 | if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) < |
3017 | drbd_get_capacity(mdev->this_bdev) && | 3581 | drbd_get_capacity(mdev->this_bdev) && |
3018 | mdev->state.disk >= D_OUTDATED && | 3582 | mdev->state.disk >= D_OUTDATED && |
3019 | mdev->state.conn < C_CONNECTED) { | 3583 | mdev->state.conn < C_CONNECTED) { |
3020 | dev_err(DEV, "The peer's disk size is too small!\n"); | 3584 | dev_err(DEV, "The peer's disk size is too small!\n"); |
3021 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3585 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3022 | mdev->ldev->dc.disk_size = my_usize; | ||
3023 | put_ldev(mdev); | 3586 | put_ldev(mdev); |
3024 | return false; | 3587 | return -EIO; |
3588 | } | ||
3589 | |||
3590 | if (my_usize != p_usize) { | ||
3591 | struct disk_conf *old_disk_conf, *new_disk_conf = NULL; | ||
3592 | |||
3593 | new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); | ||
3594 | if (!new_disk_conf) { | ||
3595 | dev_err(DEV, "Allocation of new disk_conf failed\n"); | ||
3596 | put_ldev(mdev); | ||
3597 | return -ENOMEM; | ||
3598 | } | ||
3599 | |||
3600 | mutex_lock(&mdev->tconn->conf_update); | ||
3601 | old_disk_conf = mdev->ldev->disk_conf; | ||
3602 | *new_disk_conf = *old_disk_conf; | ||
3603 | new_disk_conf->disk_size = p_usize; | ||
3604 | |||
3605 | rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); | ||
3606 | mutex_unlock(&mdev->tconn->conf_update); | ||
3607 | synchronize_rcu(); | ||
3608 | kfree(old_disk_conf); | ||
3609 | |||
3610 | dev_info(DEV, "Peer sets u_size to %lu sectors\n", | ||
3611 | (unsigned long)my_usize); | ||
3025 | } | 3612 | } |
3613 | |||
3026 | put_ldev(mdev); | 3614 | put_ldev(mdev); |
3027 | } | 3615 | } |
3028 | 3616 | ||
@@ -3031,7 +3619,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3031 | dd = drbd_determine_dev_size(mdev, ddsf); | 3619 | dd = drbd_determine_dev_size(mdev, ddsf); |
3032 | put_ldev(mdev); | 3620 | put_ldev(mdev); |
3033 | if (dd == dev_size_error) | 3621 | if (dd == dev_size_error) |
3034 | return false; | 3622 | return -EIO; |
3035 | drbd_md_sync(mdev); | 3623 | drbd_md_sync(mdev); |
3036 | } else { | 3624 | } else { |
3037 | /* I am diskless, need to accept the peer's size. */ | 3625 | /* I am diskless, need to accept the peer's size. */ |
@@ -3070,16 +3658,25 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3070 | } | 3658 | } |
3071 | } | 3659 | } |
3072 | 3660 | ||
3073 | return true; | 3661 | return 0; |
3074 | } | 3662 | } |
3075 | 3663 | ||
3076 | static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3664 | static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) |
3077 | { | 3665 | { |
3078 | struct p_uuids *p = &mdev->data.rbuf.uuids; | 3666 | struct drbd_conf *mdev; |
3667 | struct p_uuids *p = pi->data; | ||
3079 | u64 *p_uuid; | 3668 | u64 *p_uuid; |
3080 | int i, updated_uuids = 0; | 3669 | int i, updated_uuids = 0; |
3081 | 3670 | ||
3671 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3672 | if (!mdev) | ||
3673 | return config_unknown_volume(tconn, pi); | ||
3674 | |||
3082 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); | 3675 | p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); |
3676 | if (!p_uuid) { | ||
3677 | dev_err(DEV, "kmalloc of p_uuid failed\n"); | ||
3678 | return false; | ||
3679 | } | ||
3083 | 3680 | ||
3084 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) | 3681 | for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) |
3085 | p_uuid[i] = be64_to_cpu(p->uuid[i]); | 3682 | p_uuid[i] = be64_to_cpu(p->uuid[i]); |
@@ -3093,14 +3690,14 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3093 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { | 3690 | (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { |
3094 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", | 3691 | dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", |
3095 | (unsigned long long)mdev->ed_uuid); | 3692 | (unsigned long long)mdev->ed_uuid); |
3096 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3693 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3097 | return false; | 3694 | return -EIO; |
3098 | } | 3695 | } |
3099 | 3696 | ||
3100 | if (get_ldev(mdev)) { | 3697 | if (get_ldev(mdev)) { |
3101 | int skip_initial_sync = | 3698 | int skip_initial_sync = |
3102 | mdev->state.conn == C_CONNECTED && | 3699 | mdev->state.conn == C_CONNECTED && |
3103 | mdev->agreed_pro_version >= 90 && | 3700 | mdev->tconn->agreed_pro_version >= 90 && |
3104 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && | 3701 | mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && |
3105 | (p_uuid[UI_FLAGS] & 8); | 3702 | (p_uuid[UI_FLAGS] & 8); |
3106 | if (skip_initial_sync) { | 3703 | if (skip_initial_sync) { |
@@ -3127,14 +3724,15 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3127 | ongoing cluster wide state change is finished. That is important if | 3724 | ongoing cluster wide state change is finished. That is important if |
3128 | we are primary and are detaching from our disk. We need to see the | 3725 | we are primary and are detaching from our disk. We need to see the |
3129 | new disk state... */ | 3726 | new disk state... */ |
3130 | wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); | 3727 | mutex_lock(mdev->state_mutex); |
3728 | mutex_unlock(mdev->state_mutex); | ||
3131 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) | 3729 | if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) |
3132 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); | 3730 | updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); |
3133 | 3731 | ||
3134 | if (updated_uuids) | 3732 | if (updated_uuids) |
3135 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); | 3733 | drbd_print_uuids(mdev, "receiver updated UUIDs to"); |
3136 | 3734 | ||
3137 | return true; | 3735 | return 0; |
3138 | } | 3736 | } |
3139 | 3737 | ||
3140 | /** | 3738 | /** |
@@ -3146,6 +3744,7 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3146 | union drbd_state ms; | 3744 | union drbd_state ms; |
3147 | 3745 | ||
3148 | static enum drbd_conns c_tab[] = { | 3746 | static enum drbd_conns c_tab[] = { |
3747 | [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, | ||
3149 | [C_CONNECTED] = C_CONNECTED, | 3748 | [C_CONNECTED] = C_CONNECTED, |
3150 | 3749 | ||
3151 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, | 3750 | [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, |
@@ -3167,40 +3766,74 @@ static union drbd_state convert_state(union drbd_state ps) | |||
3167 | return ms; | 3766 | return ms; |
3168 | } | 3767 | } |
3169 | 3768 | ||
3170 | static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3769 | static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3171 | { | 3770 | { |
3172 | struct p_req_state *p = &mdev->data.rbuf.req_state; | 3771 | struct drbd_conf *mdev; |
3772 | struct p_req_state *p = pi->data; | ||
3173 | union drbd_state mask, val; | 3773 | union drbd_state mask, val; |
3174 | enum drbd_state_rv rv; | 3774 | enum drbd_state_rv rv; |
3175 | 3775 | ||
3776 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3777 | if (!mdev) | ||
3778 | return -EIO; | ||
3779 | |||
3176 | mask.i = be32_to_cpu(p->mask); | 3780 | mask.i = be32_to_cpu(p->mask); |
3177 | val.i = be32_to_cpu(p->val); | 3781 | val.i = be32_to_cpu(p->val); |
3178 | 3782 | ||
3179 | if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && | 3783 | if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) && |
3180 | test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { | 3784 | mutex_is_locked(mdev->state_mutex)) { |
3181 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); | 3785 | drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); |
3182 | return true; | 3786 | return 0; |
3183 | } | 3787 | } |
3184 | 3788 | ||
3185 | mask = convert_state(mask); | 3789 | mask = convert_state(mask); |
3186 | val = convert_state(val); | 3790 | val = convert_state(val); |
3187 | 3791 | ||
3188 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); | 3792 | rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); |
3189 | |||
3190 | drbd_send_sr_reply(mdev, rv); | 3793 | drbd_send_sr_reply(mdev, rv); |
3794 | |||
3191 | drbd_md_sync(mdev); | 3795 | drbd_md_sync(mdev); |
3192 | 3796 | ||
3193 | return true; | 3797 | return 0; |
3194 | } | 3798 | } |
3195 | 3799 | ||
3196 | static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 3800 | static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi) |
3197 | { | 3801 | { |
3198 | struct p_state *p = &mdev->data.rbuf.state; | 3802 | struct p_req_state *p = pi->data; |
3803 | union drbd_state mask, val; | ||
3804 | enum drbd_state_rv rv; | ||
3805 | |||
3806 | mask.i = be32_to_cpu(p->mask); | ||
3807 | val.i = be32_to_cpu(p->val); | ||
3808 | |||
3809 | if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) && | ||
3810 | mutex_is_locked(&tconn->cstate_mutex)) { | ||
3811 | conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG); | ||
3812 | return 0; | ||
3813 | } | ||
3814 | |||
3815 | mask = convert_state(mask); | ||
3816 | val = convert_state(val); | ||
3817 | |||
3818 | rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); | ||
3819 | conn_send_sr_reply(tconn, rv); | ||
3820 | |||
3821 | return 0; | ||
3822 | } | ||
3823 | |||
3824 | static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) | ||
3825 | { | ||
3826 | struct drbd_conf *mdev; | ||
3827 | struct p_state *p = pi->data; | ||
3199 | union drbd_state os, ns, peer_state; | 3828 | union drbd_state os, ns, peer_state; |
3200 | enum drbd_disk_state real_peer_disk; | 3829 | enum drbd_disk_state real_peer_disk; |
3201 | enum chg_state_flags cs_flags; | 3830 | enum chg_state_flags cs_flags; |
3202 | int rv; | 3831 | int rv; |
3203 | 3832 | ||
3833 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
3834 | if (!mdev) | ||
3835 | return config_unknown_volume(tconn, pi); | ||
3836 | |||
3204 | peer_state.i = be32_to_cpu(p->state); | 3837 | peer_state.i = be32_to_cpu(p->state); |
3205 | 3838 | ||
3206 | real_peer_disk = peer_state.disk; | 3839 | real_peer_disk = peer_state.disk; |
@@ -3209,16 +3842,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3209 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); | 3842 | dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); |
3210 | } | 3843 | } |
3211 | 3844 | ||
3212 | spin_lock_irq(&mdev->req_lock); | 3845 | spin_lock_irq(&mdev->tconn->req_lock); |
3213 | retry: | 3846 | retry: |
3214 | os = ns = mdev->state; | 3847 | os = ns = drbd_read_state(mdev); |
3215 | spin_unlock_irq(&mdev->req_lock); | 3848 | spin_unlock_irq(&mdev->tconn->req_lock); |
3216 | 3849 | ||
3217 | /* If some other part of the code (asender thread, timeout) | 3850 | /* If some other part of the code (asender thread, timeout) |
3218 | * already decided to close the connection again, | 3851 | * already decided to close the connection again, |
3219 | * we must not "re-establish" it here. */ | 3852 | * we must not "re-establish" it here. */ |
3220 | if (os.conn <= C_TEAR_DOWN) | 3853 | if (os.conn <= C_TEAR_DOWN) |
3221 | return false; | 3854 | return -ECONNRESET; |
3222 | 3855 | ||
3223 | /* If this is the "end of sync" confirmation, usually the peer disk | 3856 | /* If this is the "end of sync" confirmation, usually the peer disk |
3224 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits | 3857 | * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits |
@@ -3246,10 +3879,18 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3246 | peer_state.conn == C_CONNECTED) { | 3879 | peer_state.conn == C_CONNECTED) { |
3247 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) | 3880 | if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) |
3248 | drbd_resync_finished(mdev); | 3881 | drbd_resync_finished(mdev); |
3249 | return true; | 3882 | return 0; |
3250 | } | 3883 | } |
3251 | } | 3884 | } |
3252 | 3885 | ||
3886 | /* explicit verify finished notification, stop sector reached. */ | ||
3887 | if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && | ||
3888 | peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { | ||
3889 | ov_out_of_sync_print(mdev); | ||
3890 | drbd_resync_finished(mdev); | ||
3891 | return 0; | ||
3892 | } | ||
3893 | |||
3253 | /* peer says his disk is inconsistent, while we think it is uptodate, | 3894 | /* peer says his disk is inconsistent, while we think it is uptodate, |
3254 | * and this happens while the peer still thinks we have a sync going on, | 3895 | * and this happens while the peer still thinks we have a sync going on, |
3255 | * but we think we are already done with the sync. | 3896 | * but we think we are already done with the sync. |
@@ -3298,17 +3939,17 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3298 | peer_state.disk = D_DISKLESS; | 3939 | peer_state.disk = D_DISKLESS; |
3299 | real_peer_disk = D_DISKLESS; | 3940 | real_peer_disk = D_DISKLESS; |
3300 | } else { | 3941 | } else { |
3301 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) | 3942 | if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags)) |
3302 | return false; | 3943 | return -EIO; |
3303 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); | 3944 | D_ASSERT(os.conn == C_WF_REPORT_PARAMS); |
3304 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3945 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3305 | return false; | 3946 | return -EIO; |
3306 | } | 3947 | } |
3307 | } | 3948 | } |
3308 | } | 3949 | } |
3309 | 3950 | ||
3310 | spin_lock_irq(&mdev->req_lock); | 3951 | spin_lock_irq(&mdev->tconn->req_lock); |
3311 | if (mdev->state.i != os.i) | 3952 | if (os.i != drbd_read_state(mdev).i) |
3312 | goto retry; | 3953 | goto retry; |
3313 | clear_bit(CONSIDER_RESYNC, &mdev->flags); | 3954 | clear_bit(CONSIDER_RESYNC, &mdev->flags); |
3314 | ns.peer = peer_state.role; | 3955 | ns.peer = peer_state.role; |
@@ -3317,25 +3958,25 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3317 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) | 3958 | if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) |
3318 | ns.disk = mdev->new_state_tmp.disk; | 3959 | ns.disk = mdev->new_state_tmp.disk; |
3319 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); | 3960 | cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); |
3320 | if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && | 3961 | if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && |
3321 | test_bit(NEW_CUR_UUID, &mdev->flags)) { | 3962 | test_bit(NEW_CUR_UUID, &mdev->flags)) { |
3322 | /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this | 3963 | /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this |
3323 | for temporal network outages! */ | 3964 | for temporal network outages! */ |
3324 | spin_unlock_irq(&mdev->req_lock); | 3965 | spin_unlock_irq(&mdev->tconn->req_lock); |
3325 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); | 3966 | dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); |
3326 | tl_clear(mdev); | 3967 | tl_clear(mdev->tconn); |
3327 | drbd_uuid_new_current(mdev); | 3968 | drbd_uuid_new_current(mdev); |
3328 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 3969 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
3329 | drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); | 3970 | conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); |
3330 | return false; | 3971 | return -EIO; |
3331 | } | 3972 | } |
3332 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); | 3973 | rv = _drbd_set_state(mdev, ns, cs_flags, NULL); |
3333 | ns = mdev->state; | 3974 | ns = drbd_read_state(mdev); |
3334 | spin_unlock_irq(&mdev->req_lock); | 3975 | spin_unlock_irq(&mdev->tconn->req_lock); |
3335 | 3976 | ||
3336 | if (rv < SS_SUCCESS) { | 3977 | if (rv < SS_SUCCESS) { |
3337 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 3978 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
3338 | return false; | 3979 | return -EIO; |
3339 | } | 3980 | } |
3340 | 3981 | ||
3341 | if (os.conn > C_WF_REPORT_PARAMS) { | 3982 | if (os.conn > C_WF_REPORT_PARAMS) { |
@@ -3349,16 +3990,21 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
3349 | } | 3990 | } |
3350 | } | 3991 | } |
3351 | 3992 | ||
3352 | mdev->net_conf->want_lose = 0; | 3993 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
3353 | 3994 | ||
3354 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | 3995 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ |
3355 | 3996 | ||
3356 | return true; | 3997 | return 0; |
3357 | } | 3998 | } |
3358 | 3999 | ||
3359 | static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4000 | static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) |
3360 | { | 4001 | { |
3361 | struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; | 4002 | struct drbd_conf *mdev; |
4003 | struct p_rs_uuid *p = pi->data; | ||
4004 | |||
4005 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4006 | if (!mdev) | ||
4007 | return -EIO; | ||
3362 | 4008 | ||
3363 | wait_event(mdev->misc_wait, | 4009 | wait_event(mdev->misc_wait, |
3364 | mdev->state.conn == C_WF_SYNC_UUID || | 4010 | mdev->state.conn == C_WF_SYNC_UUID || |
@@ -3381,7 +4027,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3381 | } else | 4027 | } else |
3382 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); | 4028 | dev_err(DEV, "Ignoring SyncUUID packet!\n"); |
3383 | 4029 | ||
3384 | return true; | 4030 | return 0; |
3385 | } | 4031 | } |
3386 | 4032 | ||
3387 | /** | 4033 | /** |
@@ -3391,27 +4037,27 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi | |||
3391 | * code upon failure. | 4037 | * code upon failure. |
3392 | */ | 4038 | */ |
3393 | static int | 4039 | static int |
3394 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | 4040 | receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, |
3395 | unsigned long *buffer, struct bm_xfer_ctx *c) | 4041 | unsigned long *p, struct bm_xfer_ctx *c) |
3396 | { | 4042 | { |
3397 | unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); | 4043 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - |
3398 | unsigned want = num_words * sizeof(long); | 4044 | drbd_header_size(mdev->tconn); |
4045 | unsigned int num_words = min_t(size_t, data_size / sizeof(*p), | ||
4046 | c->bm_words - c->word_offset); | ||
4047 | unsigned int want = num_words * sizeof(*p); | ||
3399 | int err; | 4048 | int err; |
3400 | 4049 | ||
3401 | if (want != data_size) { | 4050 | if (want != size) { |
3402 | dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); | 4051 | dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size); |
3403 | return -EIO; | 4052 | return -EIO; |
3404 | } | 4053 | } |
3405 | if (want == 0) | 4054 | if (want == 0) |
3406 | return 0; | 4055 | return 0; |
3407 | err = drbd_recv(mdev, buffer, want); | 4056 | err = drbd_recv_all(mdev->tconn, p, want); |
3408 | if (err != want) { | 4057 | if (err) |
3409 | if (err >= 0) | ||
3410 | err = -EIO; | ||
3411 | return err; | 4058 | return err; |
3412 | } | ||
3413 | 4059 | ||
3414 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); | 4060 | drbd_bm_merge_lel(mdev, c->word_offset, num_words, p); |
3415 | 4061 | ||
3416 | c->word_offset += num_words; | 4062 | c->word_offset += num_words; |
3417 | c->bit_offset = c->word_offset * BITS_PER_LONG; | 4063 | c->bit_offset = c->word_offset * BITS_PER_LONG; |
@@ -3421,6 +4067,21 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3421 | return 1; | 4067 | return 1; |
3422 | } | 4068 | } |
3423 | 4069 | ||
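With the variable-size headers, receive_bitmap_plain() now derives how many bitmap words fit into one packet from the socket buffer size instead of the old fixed BM_PACKET_WORDS. Worked numbers for illustration only, assuming a 4096-byte socket buffer, an 8-byte header and 64-bit longs (remaining_words stands in for c->bm_words - c->word_offset):

    /* Illustrative constants, not taken from this hunk. */
    unsigned int data_size  = 4096 - 8;                   /* 4088 payload bytes   */
    unsigned int per_packet = data_size / sizeof(long);   /* 511 words per packet */
    unsigned int num_words  = min_t(size_t, per_packet, remaining_words);
    unsigned int want       = num_words * sizeof(long);   /* bytes to receive     */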
4070 | static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) | ||
4071 | { | ||
4072 | return (enum drbd_bitmap_code)(p->encoding & 0x0f); | ||
4073 | } | ||
4074 | |||
4075 | static int dcbp_get_start(struct p_compressed_bm *p) | ||
4076 | { | ||
4077 | return (p->encoding & 0x80) != 0; | ||
4078 | } | ||
4079 | |||
4080 | static int dcbp_get_pad_bits(struct p_compressed_bm *p) | ||
4081 | { | ||
4082 | return (p->encoding >> 4) & 0x7; | ||
4083 | } | ||
4084 | |||
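The three dcbp_get_*() helpers above unpack a single encoding byte of the compressed-bitmap packet: bits 0-3 carry the bitmap code, bits 4-6 the pad-bit count, and bit 7 the start/toggle flag consumed by recv_bm_rle_bits(). A worked example with an arbitrary value (not taken from a trace):

    u8 encoding = 0xb2;                     /* 1011 0010b, example only        */

    int code  = encoding & 0x0f;            /* 0x2 -> bitmap encoding code     */
    int pad   = (encoding >> 4) & 0x7;      /* 0x3 -> 3 pad bits               */
    int start = (encoding & 0x80) != 0;     /* 1   -> toggle starts "on"       */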
3424 | /** | 4085 | /** |
3425 | * recv_bm_rle_bits | 4086 | * recv_bm_rle_bits |
3426 | * | 4087 | * |
@@ -3430,7 +4091,8 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, | |||
3430 | static int | 4091 | static int |
3431 | recv_bm_rle_bits(struct drbd_conf *mdev, | 4092 | recv_bm_rle_bits(struct drbd_conf *mdev, |
3432 | struct p_compressed_bm *p, | 4093 | struct p_compressed_bm *p, |
3433 | struct bm_xfer_ctx *c) | 4094 | struct bm_xfer_ctx *c, |
4095 | unsigned int len) | ||
3434 | { | 4096 | { |
3435 | struct bitstream bs; | 4097 | struct bitstream bs; |
3436 | u64 look_ahead; | 4098 | u64 look_ahead; |
@@ -3438,12 +4100,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3438 | u64 tmp; | 4100 | u64 tmp; |
3439 | unsigned long s = c->bit_offset; | 4101 | unsigned long s = c->bit_offset; |
3440 | unsigned long e; | 4102 | unsigned long e; |
3441 | int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); | 4103 | int toggle = dcbp_get_start(p); |
3442 | int toggle = DCBP_get_start(p); | ||
3443 | int have; | 4104 | int have; |
3444 | int bits; | 4105 | int bits; |
3445 | 4106 | ||
3446 | bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); | 4107 | bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); |
3447 | 4108 | ||
3448 | bits = bitstream_get_bits(&bs, &look_ahead, 64); | 4109 | bits = bitstream_get_bits(&bs, &look_ahead, 64); |
3449 | if (bits < 0) | 4110 | if (bits < 0) |
@@ -3495,17 +4156,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
3495 | static int | 4156 | static int |
3496 | decode_bitmap_c(struct drbd_conf *mdev, | 4157 | decode_bitmap_c(struct drbd_conf *mdev, |
3497 | struct p_compressed_bm *p, | 4158 | struct p_compressed_bm *p, |
3498 | struct bm_xfer_ctx *c) | 4159 | struct bm_xfer_ctx *c, |
4160 | unsigned int len) | ||
3499 | { | 4161 | { |
3500 | if (DCBP_get_code(p) == RLE_VLI_Bits) | 4162 | if (dcbp_get_code(p) == RLE_VLI_Bits) |
3501 | return recv_bm_rle_bits(mdev, p, c); | 4163 | return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p)); |
3502 | 4164 | ||
3503 | /* other variants had been implemented for evaluation, | 4165 | /* other variants had been implemented for evaluation, |
3504 | * but have been dropped as this one turned out to be "best" | 4166 | * but have been dropped as this one turned out to be "best" |
3505 | * during all our tests. */ | 4167 | * during all our tests. */ |
3506 | 4168 | ||
3507 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); | 4169 | dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); |
3508 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | 4170 | conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3509 | return -EIO; | 4171 | return -EIO; |
3510 | } | 4172 | } |
3511 | 4173 | ||
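decode_bitmap_c() hands RLE_VLI_Bits payloads to recv_bm_rle_bits(), which expands alternating run lengths into bitmap bits, starting with the toggle returned by dcbp_get_start(). The sketch below shows only that toggle-run expansion; the variable-length integer bitstream decoding is left out and the run lengths are made up:

    #include <stdio.h>

    #define BM_BITS 64

    static unsigned long bitmap[(BM_BITS + 63) / 64];

    static void bm_set_bits(unsigned long s, unsigned long e) /* inclusive */
    {
        for (unsigned long i = s; i <= e; i++)
            bitmap[i / 64] |= 1UL << (i % 64);
    }

    int main(void)
    {
        unsigned long runs[] = { 5, 3, 7, 10 };   /* made-up run lengths */
        int toggle = 0;                           /* dcbp_get_start() equivalent */
        unsigned long s = 0;                      /* c->bit_offset */

        for (size_t i = 0; i < sizeof(runs) / sizeof(runs[0]); i++) {
            unsigned long e = s + runs[i] - 1;
            if (toggle)                  /* only "set" runs touch the bitmap */
                bm_set_bits(s, e);
            s = e + 1;                   /* advance the bit offset */
            toggle = !toggle;
        }
        printf("decoded up to bit %lu, word0 = %#lx\n", s, bitmap[0]);
        return 0;
    }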
@@ -3513,11 +4175,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3513 | const char *direction, struct bm_xfer_ctx *c) | 4175 | const char *direction, struct bm_xfer_ctx *c) |
3514 | { | 4176 | { |
3515 | /* what would it take to transfer it "plaintext" */ | 4177 | /* what would it take to transfer it "plaintext" */ |
3516 | unsigned plain = sizeof(struct p_header80) * | 4178 | unsigned int header_size = drbd_header_size(mdev->tconn); |
3517 | ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) | 4179 | unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; |
3518 | + c->bm_words * sizeof(long); | 4180 | unsigned int plain = |
3519 | unsigned total = c->bytes[0] + c->bytes[1]; | 4181 | header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + |
3520 | unsigned r; | 4182 | c->bm_words * sizeof(unsigned long); |
4183 | unsigned int total = c->bytes[0] + c->bytes[1]; | ||
4184 | unsigned int r; | ||
3521 | 4185 | ||
3522 | /* total can not be zero. but just in case: */ | 4186 | /* total can not be zero. but just in case: */ |
3523 | if (total == 0) | 4187 | if (total == 0) |
@@ -3551,67 +4215,63 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, | |||
3551 | in order to be agnostic to the 32 vs 64 bits issue. | 4215 | in order to be agnostic to the 32 vs 64 bits issue. |
3552 | 4216 | ||
3553 | returns 0 on failure, 1 if we successfully received it. */ | 4217 | returns 0 on failure, 1 if we successfully received it. */ |
3554 | static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4218 | static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) |
3555 | { | 4219 | { |
4220 | struct drbd_conf *mdev; | ||
3556 | struct bm_xfer_ctx c; | 4221 | struct bm_xfer_ctx c; |
3557 | void *buffer; | ||
3558 | int err; | 4222 | int err; |
3559 | int ok = false; | 4223 | |
3560 | struct p_header80 *h = &mdev->data.rbuf.header.h80; | 4224 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4225 | if (!mdev) | ||
4226 | return -EIO; | ||
3561 | 4227 | ||
3562 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); | 4228 | drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); |
3563 | /* you are supposed to send additional out-of-sync information | 4229 | /* you are supposed to send additional out-of-sync information |
3564 | * if you actually set bits during this phase */ | 4230 | * if you actually set bits during this phase */ |
3565 | 4231 | ||
3566 | /* maybe we should use some per thread scratch page, | ||
3567 | * and allocate that during initial device creation? */ | ||
3568 | buffer = (unsigned long *) __get_free_page(GFP_NOIO); | ||
3569 | if (!buffer) { | ||
3570 | dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); | ||
3571 | goto out; | ||
3572 | } | ||
3573 | |||
3574 | c = (struct bm_xfer_ctx) { | 4232 | c = (struct bm_xfer_ctx) { |
3575 | .bm_bits = drbd_bm_bits(mdev), | 4233 | .bm_bits = drbd_bm_bits(mdev), |
3576 | .bm_words = drbd_bm_words(mdev), | 4234 | .bm_words = drbd_bm_words(mdev), |
3577 | }; | 4235 | }; |
3578 | 4236 | ||
3579 | for(;;) { | 4237 | for(;;) { |
3580 | if (cmd == P_BITMAP) { | 4238 | if (pi->cmd == P_BITMAP) |
3581 | err = receive_bitmap_plain(mdev, data_size, buffer, &c); | 4239 | err = receive_bitmap_plain(mdev, pi->size, pi->data, &c); |
3582 | } else if (cmd == P_COMPRESSED_BITMAP) { | 4240 | else if (pi->cmd == P_COMPRESSED_BITMAP) { |
3583 | /* MAYBE: sanity check that we speak proto >= 90, | 4241 | /* MAYBE: sanity check that we speak proto >= 90, |
3584 | * and the feature is enabled! */ | 4242 | * and the feature is enabled! */ |
3585 | struct p_compressed_bm *p; | 4243 | struct p_compressed_bm *p = pi->data; |
3586 | 4244 | ||
3587 | if (data_size > BM_PACKET_PAYLOAD_BYTES) { | 4245 | if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) { |
3588 | dev_err(DEV, "ReportCBitmap packet too large\n"); | 4246 | dev_err(DEV, "ReportCBitmap packet too large\n"); |
4247 | err = -EIO; | ||
3589 | goto out; | 4248 | goto out; |
3590 | } | 4249 | } |
3591 | /* use the page buff */ | 4250 | if (pi->size <= sizeof(*p)) { |
3592 | p = buffer; | 4251 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size); |
3593 | memcpy(p, h, sizeof(*h)); | 4252 | err = -EIO; |
3594 | if (drbd_recv(mdev, p->head.payload, data_size) != data_size) | ||
3595 | goto out; | ||
3596 | if (data_size <= (sizeof(*p) - sizeof(p->head))) { | ||
3597 | dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); | ||
3598 | goto out; | 4253 | goto out; |
3599 | } | 4254 | } |
3600 | err = decode_bitmap_c(mdev, p, &c); | 4255 | err = drbd_recv_all(mdev->tconn, p, pi->size); |
4256 | if (err) | ||
4257 | goto out; | ||
4258 | err = decode_bitmap_c(mdev, p, &c, pi->size); | ||
3601 | } else { | 4259 | } else { |
3602 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); | 4260 | dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); |
4261 | err = -EIO; | ||
3603 | goto out; | 4262 | goto out; |
3604 | } | 4263 | } |
3605 | 4264 | ||
3606 | c.packets[cmd == P_BITMAP]++; | 4265 | c.packets[pi->cmd == P_BITMAP]++; |
3607 | c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; | 4266 | c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size; |
3608 | 4267 | ||
3609 | if (err <= 0) { | 4268 | if (err <= 0) { |
3610 | if (err < 0) | 4269 | if (err < 0) |
3611 | goto out; | 4270 | goto out; |
3612 | break; | 4271 | break; |
3613 | } | 4272 | } |
3614 | if (!drbd_recv_header(mdev, &cmd, &data_size)) | 4273 | err = drbd_recv_header(mdev->tconn, pi); |
4274 | if (err) | ||
3615 | goto out; | 4275 | goto out; |
3616 | } | 4276 | } |
3617 | 4277 | ||
@@ -3620,8 +4280,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3620 | if (mdev->state.conn == C_WF_BITMAP_T) { | 4280 | if (mdev->state.conn == C_WF_BITMAP_T) { |
3621 | enum drbd_state_rv rv; | 4281 | enum drbd_state_rv rv; |
3622 | 4282 | ||
3623 | ok = !drbd_send_bitmap(mdev); | 4283 | err = drbd_send_bitmap(mdev); |
3624 | if (!ok) | 4284 | if (err) |
3625 | goto out; | 4285 | goto out; |
3626 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ | 4286 | /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ |
3627 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | 4287 | rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); |
@@ -3632,47 +4292,40 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne | |||
3632 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", | 4292 | dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", |
3633 | drbd_conn_str(mdev->state.conn)); | 4293 | drbd_conn_str(mdev->state.conn)); |
3634 | } | 4294 | } |
4295 | err = 0; | ||
3635 | 4296 | ||
3636 | ok = true; | ||
3637 | out: | 4297 | out: |
3638 | drbd_bm_unlock(mdev); | 4298 | drbd_bm_unlock(mdev); |
3639 | if (ok && mdev->state.conn == C_WF_BITMAP_S) | 4299 | if (!err && mdev->state.conn == C_WF_BITMAP_S) |
3640 | drbd_start_resync(mdev, C_SYNC_SOURCE); | 4300 | drbd_start_resync(mdev, C_SYNC_SOURCE); |
3641 | free_page((unsigned long) buffer); | 4301 | return err; |
3642 | return ok; | ||
3643 | } | 4302 | } |
3644 | 4303 | ||
3645 | static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4304 | static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi) |
3646 | { | 4305 | { |
3647 | /* TODO zero copy sink :) */ | 4306 | conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n", |
3648 | static char sink[128]; | 4307 | pi->cmd, pi->size); |
3649 | int size, want, r; | ||
3650 | 4308 | ||
3651 | dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", | 4309 | return ignore_remaining_packet(tconn, pi); |
3652 | cmd, data_size); | ||
3653 | |||
3654 | size = data_size; | ||
3655 | while (size > 0) { | ||
3656 | want = min_t(int, size, sizeof(sink)); | ||
3657 | r = drbd_recv(mdev, sink, want); | ||
3658 | ERR_IF(r <= 0) break; | ||
3659 | size -= r; | ||
3660 | } | ||
3661 | return size == 0; | ||
3662 | } | 4310 | } |
3663 | 4311 | ||
3664 | static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4312 | static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi) |
3665 | { | 4313 | { |
3666 | /* Make sure we've acked all the TCP data associated | 4314 | /* Make sure we've acked all the TCP data associated |
3667 | * with the data requests being unplugged */ | 4315 | * with the data requests being unplugged */ |
3668 | drbd_tcp_quickack(mdev->data.socket); | 4316 | drbd_tcp_quickack(tconn->data.socket); |
3669 | 4317 | ||
3670 | return true; | 4318 | return 0; |
3671 | } | 4319 | } |
3672 | 4320 | ||
3673 | static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 4321 | static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi) |
3674 | { | 4322 | { |
3675 | struct p_block_desc *p = &mdev->data.rbuf.block_desc; | 4323 | struct drbd_conf *mdev; |
4324 | struct p_block_desc *p = pi->data; | ||
4325 | |||
4326 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4327 | if (!mdev) | ||
4328 | return -EIO; | ||
3676 | 4329 | ||
3677 | switch (mdev->state.conn) { | 4330 | switch (mdev->state.conn) { |
3678 | case C_WF_SYNC_UUID: | 4331 | case C_WF_SYNC_UUID: |
@@ -3686,15 +4339,13 @@ static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, un | |||
3686 | 4339 | ||
3687 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); | 4340 | drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); |
3688 | 4341 | ||
3689 | return true; | 4342 | return 0; |
3690 | } | 4343 | } |
3691 | 4344 | ||
3692 | typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); | ||
3693 | |||
3694 | struct data_cmd { | 4345 | struct data_cmd { |
3695 | int expect_payload; | 4346 | int expect_payload; |
3696 | size_t pkt_size; | 4347 | size_t pkt_size; |
3697 | drbd_cmd_handler_f function; | 4348 | int (*fn)(struct drbd_tconn *, struct packet_info *); |
3698 | }; | 4349 | }; |
3699 | 4350 | ||
3700 | static struct data_cmd drbd_cmd_handler[] = { | 4351 | static struct data_cmd drbd_cmd_handler[] = { |
@@ -3702,13 +4353,13 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3702 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, | 4353 | [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, |
3703 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , | 4354 | [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , |
3704 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , | 4355 | [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , |
3705 | [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4356 | [P_BITMAP] = { 1, 0, receive_bitmap } , |
3706 | [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , | 4357 | [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , |
3707 | [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, | 4358 | [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, |
3708 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4359 | [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3709 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, | 4360 | [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, |
3710 | [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4361 | [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, |
3711 | [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, | 4362 | [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, |
3712 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, | 4363 | [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3713 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, | 4364 | [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, |
3714 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, | 4365 | [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, |
@@ -3720,124 +4371,75 @@ static struct data_cmd drbd_cmd_handler[] = { | |||
3720 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, | 4371 | [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, |
3721 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, | 4372 | [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, |
3722 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, | 4373 | [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, |
3723 | /* anything missing from this table is in | 4374 | [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, |
3724 | * the asender_tbl, see get_asender_cmd */ | 4375 | [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, |
3725 | [P_MAX_CMD] = { 0, 0, NULL }, | ||
3726 | }; | 4376 | }; |
3727 | 4377 | ||
3728 | /* All handler functions that expect a sub-header get that sub-header in | 4378 | static void drbdd(struct drbd_tconn *tconn) |
3729 | mdev->data.rbuf.header.head.payload. | ||
3730 | |||
3731 | Usually in mdev->data.rbuf.header.head the callback can find the usual | ||
3732 | p_header, but they may not rely on that. Since there is also p_header95 ! | ||
3733 | */ | ||
3734 | |||
3735 | static void drbdd(struct drbd_conf *mdev) | ||
3736 | { | 4379 | { |
3737 | union p_header *header = &mdev->data.rbuf.header; | 4380 | struct packet_info pi; |
3738 | unsigned int packet_size; | ||
3739 | enum drbd_packets cmd; | ||
3740 | size_t shs; /* sub header size */ | 4381 | size_t shs; /* sub header size */ |
3741 | int rv; | 4382 | int err; |
4383 | |||
4384 | while (get_t_state(&tconn->receiver) == RUNNING) { | ||
4385 | struct data_cmd *cmd; | ||
3742 | 4386 | ||
3743 | while (get_t_state(&mdev->receiver) == Running) { | 4387 | drbd_thread_current_set_cpu(&tconn->receiver); |
3744 | drbd_thread_current_set_cpu(mdev); | 4388 | if (drbd_recv_header(tconn, &pi)) |
3745 | if (!drbd_recv_header(mdev, &cmd, &packet_size)) | ||
3746 | goto err_out; | 4389 | goto err_out; |
3747 | 4390 | ||
3748 | if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { | 4391 | cmd = &drbd_cmd_handler[pi.cmd]; |
3749 | dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); | 4392 | if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { |
4393 | conn_err(tconn, "Unexpected data packet %s (0x%04x)", | ||
4394 | cmdname(pi.cmd), pi.cmd); | ||
3750 | goto err_out; | 4395 | goto err_out; |
3751 | } | 4396 | } |
3752 | 4397 | ||
3753 | shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); | 4398 | shs = cmd->pkt_size; |
3754 | if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { | 4399 | if (pi.size > shs && !cmd->expect_payload) { |
3755 | dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); | 4400 | conn_err(tconn, "No payload expected %s l:%d\n", |
4401 | cmdname(pi.cmd), pi.size); | ||
3756 | goto err_out; | 4402 | goto err_out; |
3757 | } | 4403 | } |
3758 | 4404 | ||
3759 | if (shs) { | 4405 | if (shs) { |
3760 | rv = drbd_recv(mdev, &header->h80.payload, shs); | 4406 | err = drbd_recv_all_warn(tconn, pi.data, shs); |
3761 | if (unlikely(rv != shs)) { | 4407 | if (err) |
3762 | if (!signal_pending(current)) | ||
3763 | dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv); | ||
3764 | goto err_out; | 4408 | goto err_out; |
3765 | } | 4409 | pi.size -= shs; |
3766 | } | 4410 | } |
3767 | 4411 | ||
3768 | rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); | 4412 | err = cmd->fn(tconn, &pi); |
3769 | 4413 | if (err) { | |
3770 | if (unlikely(!rv)) { | 4414 | conn_err(tconn, "error receiving %s, e: %d l: %d!\n", |
3771 | dev_err(DEV, "error receiving %s, l: %d!\n", | 4415 | cmdname(pi.cmd), err, pi.size); |
3772 | cmdname(cmd), packet_size); | ||
3773 | goto err_out; | 4416 | goto err_out; |
3774 | } | 4417 | } |
3775 | } | 4418 | } |
4419 | return; | ||
3776 | 4420 | ||
3777 | if (0) { | 4421 | err_out: |
3778 | err_out: | 4422 | conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); |
3779 | drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); | ||
3780 | } | ||
3781 | /* If we leave here, we probably want to update at least the | ||
3782 | * "Connected" indicator on stable storage. Do so explicitly here. */ | ||
3783 | drbd_md_sync(mdev); | ||
3784 | } | 4423 | } |
3785 | 4424 | ||
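The drbd_cmd_handler[] table plus the reworked drbdd() loop form a bounds-checked dispatch: look up the packet command, reject out-of-range commands and unexpected payloads, read the fixed sub-header, then call the handler. A condensed model with fake packets; the command names and handlers here are illustrative, not DRBD's:

    #include <stdio.h>
    #include <stddef.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    enum cmd { P_PING, P_DATA, P_BITMAP, P_MAX };

    struct packet_info { enum cmd cmd; unsigned int size; };

    struct data_cmd {
        int expect_payload;
        size_t pkt_size;
        int (*fn)(struct packet_info *);
    };

    static int handle_ping(struct packet_info *pi)   { (void)pi; puts("ping"); return 0; }
    static int handle_data(struct packet_info *pi)   { printf("data, %u payload bytes\n", pi->size); return 0; }
    static int handle_bitmap(struct packet_info *pi) { printf("bitmap, %u bytes\n", pi->size); return 0; }

    static const struct data_cmd handlers[] = {
        [P_PING]   = { 0, 0,                handle_ping },
        [P_DATA]   = { 1, sizeof(unsigned), handle_data },
        [P_BITMAP] = { 1, 0,                handle_bitmap },
    };

    static int dispatch(struct packet_info *pi)
    {
        const struct data_cmd *cmd;

        if ((size_t)pi->cmd >= ARRAY_SIZE(handlers) || !handlers[pi->cmd].fn)
            return -1;                  /* unexpected packet */
        cmd = &handlers[pi->cmd];
        if (pi->size > cmd->pkt_size && !cmd->expect_payload)
            return -1;                  /* no payload expected */
        /* the real loop receives cmd->pkt_size sub-header bytes here and
         * subtracts them from pi->size before calling the handler */
        if (pi->size >= cmd->pkt_size)
            pi->size -= cmd->pkt_size;
        return cmd->fn(pi);
    }

    int main(void)
    {
        struct packet_info pkts[] = {
            { P_PING, 0 }, { P_DATA, 16 }, { P_BITMAP, 4096 },
        };
        for (size_t i = 0; i < ARRAY_SIZE(pkts); i++)
            if (dispatch(&pkts[i]))
                puts("protocol error");
        return 0;
    }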
3786 | void drbd_flush_workqueue(struct drbd_conf *mdev) | 4425 | void conn_flush_workqueue(struct drbd_tconn *tconn) |
3787 | { | 4426 | { |
3788 | struct drbd_wq_barrier barr; | 4427 | struct drbd_wq_barrier barr; |
3789 | 4428 | ||
3790 | barr.w.cb = w_prev_work_done; | 4429 | barr.w.cb = w_prev_work_done; |
4430 | barr.w.tconn = tconn; | ||
3791 | init_completion(&barr.done); | 4431 | init_completion(&barr.done); |
3792 | drbd_queue_work(&mdev->data.work, &barr.w); | 4432 | drbd_queue_work(&tconn->sender_work, &barr.w); |
3793 | wait_for_completion(&barr.done); | 4433 | wait_for_completion(&barr.done); |
3794 | } | 4434 | } |
3795 | 4435 | ||
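conn_flush_workqueue() drains the sender work queue by queueing a barrier work item whose callback completes a completion, then waiting on it. The pthread-based sketch below mimics that pattern in userspace; the tiny work queue and all names in it are stand-ins, not the kernel workqueue API:

    #include <pthread.h>
    #include <stdio.h>

    struct work { void (*cb)(struct work *); struct work *next; };
    struct completion { pthread_mutex_t lock; pthread_cond_t cond; int done; };
    struct barrier_work { struct work w; struct completion done; };

    static struct work *queue_head, *queue_tail;
    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;

    static void queue_work(struct work *w)
    {
        pthread_mutex_lock(&queue_lock);
        w->next = NULL;
        if (queue_tail)
            queue_tail->next = w;
        else
            queue_head = w;
        queue_tail = w;
        pthread_cond_signal(&queue_cond);
        pthread_mutex_unlock(&queue_lock);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        for (;;) {
            struct work *w;
            pthread_mutex_lock(&queue_lock);
            while (!queue_head)
                pthread_cond_wait(&queue_cond, &queue_lock);
            w = queue_head;
            queue_head = w->next;
            if (!queue_head)
                queue_tail = NULL;
            pthread_mutex_unlock(&queue_lock);
            w->cb(w);                       /* run the work item */
        }
        return NULL;
    }

    static void barrier_cb(struct work *w)  /* w_prev_work_done() stand-in */
    {
        struct barrier_work *b = (struct barrier_work *)w;
        pthread_mutex_lock(&b->done.lock);
        b->done.done = 1;
        pthread_cond_signal(&b->done.cond);
        pthread_mutex_unlock(&b->done.lock);
    }

    static void flush_workqueue(void)
    {
        struct barrier_work b;
        b.w.cb = barrier_cb;
        pthread_mutex_init(&b.done.lock, NULL);
        pthread_cond_init(&b.done.cond, NULL);
        b.done.done = 0;
        queue_work(&b.w);                   /* everything queued earlier runs first */
        pthread_mutex_lock(&b.done.lock);
        while (!b.done.done)                /* wait_for_completion() equivalent */
            pthread_cond_wait(&b.done.cond, &b.done.lock);
        pthread_mutex_unlock(&b.done.lock);
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        flush_workqueue();
        printf("work queue drained\n");
        return 0;
    }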
3796 | void drbd_free_tl_hash(struct drbd_conf *mdev) | 4436 | static void conn_disconnect(struct drbd_tconn *tconn) |
3797 | { | ||
3798 | struct hlist_head *h; | ||
3799 | |||
3800 | spin_lock_irq(&mdev->req_lock); | ||
3801 | |||
3802 | if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { | ||
3803 | spin_unlock_irq(&mdev->req_lock); | ||
3804 | return; | ||
3805 | } | ||
3806 | /* paranoia code */ | ||
3807 | for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) | ||
3808 | if (h->first) | ||
3809 | dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", | ||
3810 | (int)(h - mdev->ee_hash), h->first); | ||
3811 | kfree(mdev->ee_hash); | ||
3812 | mdev->ee_hash = NULL; | ||
3813 | mdev->ee_hash_s = 0; | ||
3814 | |||
3815 | /* We may not have had the chance to wait for all locally pending | ||
3816 | * application requests. The hlist_add_fake() prevents access after | ||
3817 | * free on master bio completion. */ | ||
3818 | for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { | ||
3819 | struct drbd_request *req; | ||
3820 | struct hlist_node *pos, *n; | ||
3821 | hlist_for_each_entry_safe(req, pos, n, h, collision) { | ||
3822 | hlist_del_init(&req->collision); | ||
3823 | hlist_add_fake(&req->collision); | ||
3824 | } | ||
3825 | } | ||
3826 | |||
3827 | kfree(mdev->tl_hash); | ||
3828 | mdev->tl_hash = NULL; | ||
3829 | mdev->tl_hash_s = 0; | ||
3830 | spin_unlock_irq(&mdev->req_lock); | ||
3831 | } | ||
3832 | |||
3833 | static void drbd_disconnect(struct drbd_conf *mdev) | ||
3834 | { | 4437 | { |
3835 | enum drbd_fencing_p fp; | 4438 | struct drbd_conf *mdev; |
3836 | union drbd_state os, ns; | 4439 | enum drbd_conns oc; |
3837 | int rv = SS_UNKNOWN_ERROR; | 4440 | int vnr; |
3838 | unsigned int i; | ||
3839 | 4441 | ||
3840 | if (mdev->state.conn == C_STANDALONE) | 4442 | if (tconn->cstate == C_STANDALONE) |
3841 | return; | 4443 | return; |
3842 | 4444 | ||
3843 | /* We are about to start the cleanup after connection loss. | 4445 | /* We are about to start the cleanup after connection loss. |
@@ -3845,18 +4447,54 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3845 | * Usually we should be in some network failure state already, | 4447 | * Usually we should be in some network failure state already, |
3846 | * but just in case we are not, we fix it up here. | 4448 | * but just in case we are not, we fix it up here. |
3847 | */ | 4449 | */ |
3848 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 4450 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
3849 | 4451 | ||
3850 | /* asender does not clean up anything. it must not interfere, either */ | 4452 | /* asender does not clean up anything. it must not interfere, either */ |
3851 | drbd_thread_stop(&mdev->asender); | 4453 | drbd_thread_stop(&tconn->asender); |
3852 | drbd_free_sock(mdev); | 4454 | drbd_free_sock(tconn); |
4455 | |||
4456 | rcu_read_lock(); | ||
4457 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
4458 | kref_get(&mdev->kref); | ||
4459 | rcu_read_unlock(); | ||
4460 | drbd_disconnected(mdev); | ||
4461 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
4462 | rcu_read_lock(); | ||
4463 | } | ||
4464 | rcu_read_unlock(); | ||
4465 | |||
4466 | if (!list_empty(&tconn->current_epoch->list)) | ||
4467 | conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n"); | ||
4468 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | ||
4469 | atomic_set(&tconn->current_epoch->epoch_size, 0); | ||
4470 | tconn->send.seen_any_write_yet = false; | ||
4471 | |||
4472 | conn_info(tconn, "Connection closed\n"); | ||
4473 | |||
4474 | if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN) | ||
4475 | conn_try_outdate_peer_async(tconn); | ||
4476 | |||
4477 | spin_lock_irq(&tconn->req_lock); | ||
4478 | oc = tconn->cstate; | ||
4479 | if (oc >= C_UNCONNECTED) | ||
4480 | _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); | ||
4481 | |||
4482 | spin_unlock_irq(&tconn->req_lock); | ||
4483 | |||
4484 | if (oc == C_DISCONNECTING) | ||
4485 | conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); | ||
4486 | } | ||
4487 | |||
4488 | static int drbd_disconnected(struct drbd_conf *mdev) | ||
4489 | { | ||
4490 | unsigned int i; | ||
3853 | 4491 | ||
3854 | /* wait for current activity to cease. */ | 4492 | /* wait for current activity to cease. */ |
3855 | spin_lock_irq(&mdev->req_lock); | 4493 | spin_lock_irq(&mdev->tconn->req_lock); |
3856 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); | 4494 | _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); |
3857 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); | 4495 | _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); |
3858 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); | 4496 | _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); |
3859 | spin_unlock_irq(&mdev->req_lock); | 4497 | spin_unlock_irq(&mdev->tconn->req_lock); |
3860 | 4498 | ||
3861 | /* We do not have data structures that would allow us to | 4499 | /* We do not have data structures that would allow us to |
3862 | * get the rs_pending_cnt down to 0 again. | 4500 | * get the rs_pending_cnt down to 0 again. |
@@ -3874,7 +4512,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3874 | atomic_set(&mdev->rs_pending_cnt, 0); | 4512 | atomic_set(&mdev->rs_pending_cnt, 0); |
3875 | wake_up(&mdev->misc_wait); | 4513 | wake_up(&mdev->misc_wait); |
3876 | 4514 | ||
3877 | /* make sure syncer is stopped and w_resume_next_sg queued */ | ||
3878 | del_timer_sync(&mdev->resync_timer); | 4515 | del_timer_sync(&mdev->resync_timer); |
3879 | resync_timer_fn((unsigned long)mdev); | 4516 | resync_timer_fn((unsigned long)mdev); |
3880 | 4517 | ||
@@ -3883,50 +4520,25 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3883 | * to be "canceled" */ | 4520 | * to be "canceled" */ |
3884 | drbd_flush_workqueue(mdev); | 4521 | drbd_flush_workqueue(mdev); |
3885 | 4522 | ||
3886 | /* This also does reclaim_net_ee(). If we do this too early, we might | 4523 | drbd_finish_peer_reqs(mdev); |
3887 | * miss some resync ee and pages.*/ | 4524 | |
3888 | drbd_process_done_ee(mdev); | 4525 | /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() |
4526 | might have issued a work again. The one before drbd_finish_peer_reqs() is | ||
4527 | necessary to reclain net_ee in drbd_finish_peer_reqs(). */ | ||
4528 | drbd_flush_workqueue(mdev); | ||
4529 | |||
4530 | /* need to do it again, drbd_finish_peer_reqs() may have populated it | ||
4531 | * again via drbd_try_clear_on_disk_bm(). */ | ||
4532 | drbd_rs_cancel_all(mdev); | ||
3889 | 4533 | ||
3890 | kfree(mdev->p_uuid); | 4534 | kfree(mdev->p_uuid); |
3891 | mdev->p_uuid = NULL; | 4535 | mdev->p_uuid = NULL; |
3892 | 4536 | ||
3893 | if (!is_susp(mdev->state)) | 4537 | if (!drbd_suspended(mdev)) |
3894 | tl_clear(mdev); | 4538 | tl_clear(mdev->tconn); |
3895 | |||
3896 | dev_info(DEV, "Connection closed\n"); | ||
3897 | 4539 | ||
3898 | drbd_md_sync(mdev); | 4540 | drbd_md_sync(mdev); |
3899 | 4541 | ||
3900 | fp = FP_DONT_CARE; | ||
3901 | if (get_ldev(mdev)) { | ||
3902 | fp = mdev->ldev->dc.fencing; | ||
3903 | put_ldev(mdev); | ||
3904 | } | ||
3905 | |||
3906 | if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) | ||
3907 | drbd_try_outdate_peer_async(mdev); | ||
3908 | |||
3909 | spin_lock_irq(&mdev->req_lock); | ||
3910 | os = mdev->state; | ||
3911 | if (os.conn >= C_UNCONNECTED) { | ||
3912 | /* Do not restart in case we are C_DISCONNECTING */ | ||
3913 | ns = os; | ||
3914 | ns.conn = C_UNCONNECTED; | ||
3915 | rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | ||
3916 | } | ||
3917 | spin_unlock_irq(&mdev->req_lock); | ||
3918 | |||
3919 | if (os.conn == C_DISCONNECTING) { | ||
3920 | wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); | ||
3921 | |||
3922 | crypto_free_hash(mdev->cram_hmac_tfm); | ||
3923 | mdev->cram_hmac_tfm = NULL; | ||
3924 | |||
3925 | kfree(mdev->net_conf); | ||
3926 | mdev->net_conf = NULL; | ||
3927 | drbd_request_state(mdev, NS(conn, C_STANDALONE)); | ||
3928 | } | ||
3929 | |||
3930 | /* serialize with bitmap writeout triggered by the state change, | 4542 | /* serialize with bitmap writeout triggered by the state change, |
3931 | * if any. */ | 4543 | * if any. */ |
3932 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 4544 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
@@ -3938,7 +4550,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3938 | * Actually we don't care for exactly when the network stack does its | 4550 | * Actually we don't care for exactly when the network stack does its |
3939 | * put_page(), but release our reference on these pages right here. | 4551 | * put_page(), but release our reference on these pages right here. |
3940 | */ | 4552 | */ |
3941 | i = drbd_release_ee(mdev, &mdev->net_ee); | 4553 | i = drbd_free_peer_reqs(mdev, &mdev->net_ee); |
3942 | if (i) | 4554 | if (i) |
3943 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); | 4555 | dev_info(DEV, "net_ee not empty, killed %u entries\n", i); |
3944 | i = atomic_read(&mdev->pp_in_use_by_net); | 4556 | i = atomic_read(&mdev->pp_in_use_by_net); |
@@ -3953,9 +4565,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3953 | D_ASSERT(list_empty(&mdev->sync_ee)); | 4565 | D_ASSERT(list_empty(&mdev->sync_ee)); |
3954 | D_ASSERT(list_empty(&mdev->done_ee)); | 4566 | D_ASSERT(list_empty(&mdev->done_ee)); |
3955 | 4567 | ||
3956 | /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ | 4568 | return 0; |
3957 | atomic_set(&mdev->current_epoch->epoch_size, 0); | ||
3958 | D_ASSERT(list_empty(&mdev->current_epoch->list)); | ||
3959 | } | 4569 | } |
3960 | 4570 | ||
3961 | /* | 4571 | /* |
@@ -3967,29 +4577,19 @@ static void drbd_disconnect(struct drbd_conf *mdev) | |||
3967 | * | 4577 | * |
3968 | * for now, they are expected to be zero, but ignored. | 4578 | * for now, they are expected to be zero, but ignored. |
3969 | */ | 4579 | */ |
3970 | static int drbd_send_handshake(struct drbd_conf *mdev) | 4580 | static int drbd_send_features(struct drbd_tconn *tconn) |
3971 | { | 4581 | { |
3972 | /* ASSERT current == mdev->receiver ... */ | 4582 | struct drbd_socket *sock; |
3973 | struct p_handshake *p = &mdev->data.sbuf.handshake; | 4583 | struct p_connection_features *p; |
3974 | int ok; | ||
3975 | |||
3976 | if (mutex_lock_interruptible(&mdev->data.mutex)) { | ||
3977 | dev_err(DEV, "interrupted during initial handshake\n"); | ||
3978 | return 0; /* interrupted. not ok. */ | ||
3979 | } | ||
3980 | |||
3981 | if (mdev->data.socket == NULL) { | ||
3982 | mutex_unlock(&mdev->data.mutex); | ||
3983 | return 0; | ||
3984 | } | ||
3985 | 4584 | ||
4585 | sock = &tconn->data; | ||
4586 | p = conn_prepare_command(tconn, sock); | ||
4587 | if (!p) | ||
4588 | return -EIO; | ||
3986 | memset(p, 0, sizeof(*p)); | 4589 | memset(p, 0, sizeof(*p)); |
3987 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); | 4590 | p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); |
3988 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); | 4591 | p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); |
3989 | ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, | 4592 | return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); |
3990 | (struct p_header80 *)p, sizeof(*p), 0 ); | ||
3991 | mutex_unlock(&mdev->data.mutex); | ||
3992 | return ok; | ||
3993 | } | 4593 | } |
3994 | 4594 | ||
3995 | /* | 4595 | /* |
@@ -3999,42 +4599,38 @@ static int drbd_send_handshake(struct drbd_conf *mdev) | |||
3999 | * -1 peer talks different language, | 4599 | * -1 peer talks different language, |
4000 | * no point in trying again, please go standalone. | 4600 | * no point in trying again, please go standalone. |
4001 | */ | 4601 | */ |
4002 | static int drbd_do_handshake(struct drbd_conf *mdev) | 4602 | static int drbd_do_features(struct drbd_tconn *tconn) |
4003 | { | 4603 | { |
4004 | /* ASSERT current == mdev->receiver ... */ | 4604 | /* ASSERT current == tconn->receiver ... */ |
4005 | struct p_handshake *p = &mdev->data.rbuf.handshake; | 4605 | struct p_connection_features *p; |
4006 | const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); | 4606 | const int expect = sizeof(struct p_connection_features); |
4007 | unsigned int length; | 4607 | struct packet_info pi; |
4008 | enum drbd_packets cmd; | 4608 | int err; |
4009 | int rv; | ||
4010 | 4609 | ||
4011 | rv = drbd_send_handshake(mdev); | 4610 | err = drbd_send_features(tconn); |
4012 | if (!rv) | 4611 | if (err) |
4013 | return 0; | 4612 | return 0; |
4014 | 4613 | ||
4015 | rv = drbd_recv_header(mdev, &cmd, &length); | 4614 | err = drbd_recv_header(tconn, &pi); |
4016 | if (!rv) | 4615 | if (err) |
4017 | return 0; | 4616 | return 0; |
4018 | 4617 | ||
4019 | if (cmd != P_HAND_SHAKE) { | 4618 | if (pi.cmd != P_CONNECTION_FEATURES) { |
4020 | dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", | 4619 | conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", |
4021 | cmdname(cmd), cmd); | 4620 | cmdname(pi.cmd), pi.cmd); |
4022 | return -1; | 4621 | return -1; |
4023 | } | 4622 | } |
4024 | 4623 | ||
4025 | if (length != expect) { | 4624 | if (pi.size != expect) { |
4026 | dev_err(DEV, "expected HandShake length: %u, received: %u\n", | 4625 | conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n", |
4027 | expect, length); | 4626 | expect, pi.size); |
4028 | return -1; | 4627 | return -1; |
4029 | } | 4628 | } |
4030 | 4629 | ||
4031 | rv = drbd_recv(mdev, &p->head.payload, expect); | 4630 | p = pi.data; |
4032 | 4631 | err = drbd_recv_all_warn(tconn, p, expect); | |
4033 | if (rv != expect) { | 4632 | if (err) |
4034 | if (!signal_pending(current)) | ||
4035 | dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv); | ||
4036 | return 0; | 4633 | return 0; |
4037 | } | ||
4038 | 4634 | ||
4039 | p->protocol_min = be32_to_cpu(p->protocol_min); | 4635 | p->protocol_min = be32_to_cpu(p->protocol_min); |
4040 | p->protocol_max = be32_to_cpu(p->protocol_max); | 4636 | p->protocol_max = be32_to_cpu(p->protocol_max); |
@@ -4045,15 +4641,15 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4045 | PRO_VERSION_MIN > p->protocol_max) | 4641 | PRO_VERSION_MIN > p->protocol_max) |
4046 | goto incompat; | 4642 | goto incompat; |
4047 | 4643 | ||
4048 | mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); | 4644 | tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); |
4049 | 4645 | ||
4050 | dev_info(DEV, "Handshake successful: " | 4646 | conn_info(tconn, "Handshake successful: " |
4051 | "Agreed network protocol version %d\n", mdev->agreed_pro_version); | 4647 | "Agreed network protocol version %d\n", tconn->agreed_pro_version); |
4052 | 4648 | ||
4053 | return 1; | 4649 | return 1; |
4054 | 4650 | ||
4055 | incompat: | 4651 | incompat: |
4056 | dev_err(DEV, "incompatible DRBD dialects: " | 4652 | conn_err(tconn, "incompatible DRBD dialects: " |
4057 | "I support %d-%d, peer supports %d-%d\n", | 4653 | "I support %d-%d, peer supports %d-%d\n", |
4058 | PRO_VERSION_MIN, PRO_VERSION_MAX, | 4654 | PRO_VERSION_MIN, PRO_VERSION_MAX, |
4059 | p->protocol_min, p->protocol_max); | 4655 | p->protocol_min, p->protocol_max); |
@@ -4061,7 +4657,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev) | |||
4061 | } | 4657 | } |
4062 | 4658 | ||
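drbd_do_features() accepts the peer if the advertised [protocol_min, protocol_max] range overlaps ours and then agrees on the smaller of the two maxima. A small sketch of that negotiation; the version numbers below are invented for the example:

    #include <stdio.h>

    #define PRO_VERSION_MIN 86    /* assumed values for illustration */
    #define PRO_VERSION_MAX 101

    static int negotiate(int peer_min, int peer_max, int *agreed)
    {
        if (PRO_VERSION_MAX < peer_min || PRO_VERSION_MIN > peer_max)
            return -1;                              /* incompatible dialects */
        *agreed = PRO_VERSION_MAX < peer_max ? PRO_VERSION_MAX : peer_max;
        return 0;
    }

    int main(void)
    {
        int agreed;

        if (negotiate(86, 96, &agreed) == 0)
            printf("agreed network protocol version %d\n", agreed);  /* -> 96 */
        if (negotiate(110, 120, &agreed) != 0)
            printf("peer talks a different language\n");
        return 0;
    }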
4063 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | 4659 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) |
4064 | static int drbd_do_auth(struct drbd_conf *mdev) | 4660 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4065 | { | 4661 | { |
4066 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); | 4662 | dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); |
4067 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | 4663 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); |
@@ -4076,121 +4672,139 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4076 | -1 - auth failed, don't try again. | 4672 | -1 - auth failed, don't try again. |
4077 | */ | 4673 | */ |
4078 | 4674 | ||
4079 | static int drbd_do_auth(struct drbd_conf *mdev) | 4675 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4080 | { | 4676 | { |
4677 | struct drbd_socket *sock; | ||
4081 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ | 4678 | char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ |
4082 | struct scatterlist sg; | 4679 | struct scatterlist sg; |
4083 | char *response = NULL; | 4680 | char *response = NULL; |
4084 | char *right_response = NULL; | 4681 | char *right_response = NULL; |
4085 | char *peers_ch = NULL; | 4682 | char *peers_ch = NULL; |
4086 | unsigned int key_len = strlen(mdev->net_conf->shared_secret); | 4683 | unsigned int key_len; |
4684 | char secret[SHARED_SECRET_MAX]; /* 64 byte */ | ||
4087 | unsigned int resp_size; | 4685 | unsigned int resp_size; |
4088 | struct hash_desc desc; | 4686 | struct hash_desc desc; |
4089 | enum drbd_packets cmd; | 4687 | struct packet_info pi; |
4090 | unsigned int length; | 4688 | struct net_conf *nc; |
4091 | int rv; | 4689 | int err, rv; |
4690 | |||
4691 | /* FIXME: Put the challenge/response into the preallocated socket buffer. */ | ||
4092 | 4692 | ||
4093 | desc.tfm = mdev->cram_hmac_tfm; | 4693 | rcu_read_lock(); |
4694 | nc = rcu_dereference(tconn->net_conf); | ||
4695 | key_len = strlen(nc->shared_secret); | ||
4696 | memcpy(secret, nc->shared_secret, key_len); | ||
4697 | rcu_read_unlock(); | ||
4698 | |||
4699 | desc.tfm = tconn->cram_hmac_tfm; | ||
4094 | desc.flags = 0; | 4700 | desc.flags = 0; |
4095 | 4701 | ||
4096 | rv = crypto_hash_setkey(mdev->cram_hmac_tfm, | 4702 | rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len); |
4097 | (u8 *)mdev->net_conf->shared_secret, key_len); | ||
4098 | if (rv) { | 4703 | if (rv) { |
4099 | dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); | 4704 | conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv); |
4100 | rv = -1; | 4705 | rv = -1; |
4101 | goto fail; | 4706 | goto fail; |
4102 | } | 4707 | } |
4103 | 4708 | ||
4104 | get_random_bytes(my_challenge, CHALLENGE_LEN); | 4709 | get_random_bytes(my_challenge, CHALLENGE_LEN); |
4105 | 4710 | ||
4106 | rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); | 4711 | sock = &tconn->data; |
4712 | if (!conn_prepare_command(tconn, sock)) { | ||
4713 | rv = 0; | ||
4714 | goto fail; | ||
4715 | } | ||
4716 | rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0, | ||
4717 | my_challenge, CHALLENGE_LEN); | ||
4107 | if (!rv) | 4718 | if (!rv) |
4108 | goto fail; | 4719 | goto fail; |
4109 | 4720 | ||
4110 | rv = drbd_recv_header(mdev, &cmd, &length); | 4721 | err = drbd_recv_header(tconn, &pi); |
4111 | if (!rv) | 4722 | if (err) { |
4723 | rv = 0; | ||
4112 | goto fail; | 4724 | goto fail; |
4725 | } | ||
4113 | 4726 | ||
4114 | if (cmd != P_AUTH_CHALLENGE) { | 4727 | if (pi.cmd != P_AUTH_CHALLENGE) { |
4115 | dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", | 4728 | conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n", |
4116 | cmdname(cmd), cmd); | 4729 | cmdname(pi.cmd), pi.cmd); |
4117 | rv = 0; | 4730 | rv = 0; |
4118 | goto fail; | 4731 | goto fail; |
4119 | } | 4732 | } |
4120 | 4733 | ||
4121 | if (length > CHALLENGE_LEN * 2) { | 4734 | if (pi.size > CHALLENGE_LEN * 2) { |
4122 | dev_err(DEV, "expected AuthChallenge payload too big.\n"); | 4735 | conn_err(tconn, "expected AuthChallenge payload too big.\n"); |
4123 | rv = -1; | 4736 | rv = -1; |
4124 | goto fail; | 4737 | goto fail; |
4125 | } | 4738 | } |
4126 | 4739 | ||
4127 | peers_ch = kmalloc(length, GFP_NOIO); | 4740 | peers_ch = kmalloc(pi.size, GFP_NOIO); |
4128 | if (peers_ch == NULL) { | 4741 | if (peers_ch == NULL) { |
4129 | dev_err(DEV, "kmalloc of peers_ch failed\n"); | 4742 | conn_err(tconn, "kmalloc of peers_ch failed\n"); |
4130 | rv = -1; | 4743 | rv = -1; |
4131 | goto fail; | 4744 | goto fail; |
4132 | } | 4745 | } |
4133 | 4746 | ||
4134 | rv = drbd_recv(mdev, peers_ch, length); | 4747 | err = drbd_recv_all_warn(tconn, peers_ch, pi.size); |
4135 | 4748 | if (err) { | |
4136 | if (rv != length) { | ||
4137 | if (!signal_pending(current)) | ||
4138 | dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv); | ||
4139 | rv = 0; | 4749 | rv = 0; |
4140 | goto fail; | 4750 | goto fail; |
4141 | } | 4751 | } |
4142 | 4752 | ||
4143 | resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); | 4753 | resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm); |
4144 | response = kmalloc(resp_size, GFP_NOIO); | 4754 | response = kmalloc(resp_size, GFP_NOIO); |
4145 | if (response == NULL) { | 4755 | if (response == NULL) { |
4146 | dev_err(DEV, "kmalloc of response failed\n"); | 4756 | conn_err(tconn, "kmalloc of response failed\n"); |
4147 | rv = -1; | 4757 | rv = -1; |
4148 | goto fail; | 4758 | goto fail; |
4149 | } | 4759 | } |
4150 | 4760 | ||
4151 | sg_init_table(&sg, 1); | 4761 | sg_init_table(&sg, 1); |
4152 | sg_set_buf(&sg, peers_ch, length); | 4762 | sg_set_buf(&sg, peers_ch, pi.size); |
4153 | 4763 | ||
4154 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); | 4764 | rv = crypto_hash_digest(&desc, &sg, sg.length, response); |
4155 | if (rv) { | 4765 | if (rv) { |
4156 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4766 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4157 | rv = -1; | 4767 | rv = -1; |
4158 | goto fail; | 4768 | goto fail; |
4159 | } | 4769 | } |
4160 | 4770 | ||
4161 | rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); | 4771 | if (!conn_prepare_command(tconn, sock)) { |
4162 | if (!rv) | 4772 | rv = 0; |
4163 | goto fail; | 4773 | goto fail; |
4164 | 4774 | } | |
4165 | rv = drbd_recv_header(mdev, &cmd, &length); | 4775 | rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0, |
4776 | response, resp_size); | ||
4166 | if (!rv) | 4777 | if (!rv) |
4167 | goto fail; | 4778 | goto fail; |
4168 | 4779 | ||
4169 | if (cmd != P_AUTH_RESPONSE) { | 4780 | err = drbd_recv_header(tconn, &pi); |
4170 | dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", | 4781 | if (err) { |
4171 | cmdname(cmd), cmd); | ||
4172 | rv = 0; | 4782 | rv = 0; |
4173 | goto fail; | 4783 | goto fail; |
4174 | } | 4784 | } |
4175 | 4785 | ||
4176 | if (length != resp_size) { | 4786 | if (pi.cmd != P_AUTH_RESPONSE) { |
4177 | dev_err(DEV, "expected AuthResponse payload of wrong size\n"); | 4787 | conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n", |
4788 | cmdname(pi.cmd), pi.cmd); | ||
4178 | rv = 0; | 4789 | rv = 0; |
4179 | goto fail; | 4790 | goto fail; |
4180 | } | 4791 | } |
4181 | 4792 | ||
4182 | rv = drbd_recv(mdev, response , resp_size); | 4793 | if (pi.size != resp_size) { |
4794 | conn_err(tconn, "expected AuthResponse payload of wrong size\n"); | ||
4795 | rv = 0; | ||
4796 | goto fail; | ||
4797 | } | ||
4183 | 4798 | ||
4184 | if (rv != resp_size) { | 4799 | err = drbd_recv_all_warn(tconn, response , resp_size); |
4185 | if (!signal_pending(current)) | 4800 | if (err) { |
4186 | dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv); | ||
4187 | rv = 0; | 4801 | rv = 0; |
4188 | goto fail; | 4802 | goto fail; |
4189 | } | 4803 | } |
4190 | 4804 | ||
4191 | right_response = kmalloc(resp_size, GFP_NOIO); | 4805 | right_response = kmalloc(resp_size, GFP_NOIO); |
4192 | if (right_response == NULL) { | 4806 | if (right_response == NULL) { |
4193 | dev_err(DEV, "kmalloc of right_response failed\n"); | 4807 | conn_err(tconn, "kmalloc of right_response failed\n"); |
4194 | rv = -1; | 4808 | rv = -1; |
4195 | goto fail; | 4809 | goto fail; |
4196 | } | 4810 | } |
@@ -4199,7 +4813,7 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4199 | 4813 | ||
4200 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); | 4814 | rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); |
4201 | if (rv) { | 4815 | if (rv) { |
4202 | dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); | 4816 | conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); |
4203 | rv = -1; | 4817 | rv = -1; |
4204 | goto fail; | 4818 | goto fail; |
4205 | } | 4819 | } |
@@ -4207,8 +4821,8 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4207 | rv = !memcmp(response, right_response, resp_size); | 4821 | rv = !memcmp(response, right_response, resp_size); |
4208 | 4822 | ||
4209 | if (rv) | 4823 | if (rv) |
4210 | dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", | 4824 | conn_info(tconn, "Peer authenticated using %d bytes HMAC\n", |
4211 | resp_size, mdev->net_conf->cram_hmac_alg); | 4825 | resp_size); |
4212 | else | 4826 | else |
4213 | rv = -1; | 4827 | rv = -1; |
4214 | 4828 | ||
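The reworked drbd_do_auth() keeps the same challenge/response shape: send our challenge, answer the peer's challenge with an HMAC over it keyed by the shared secret, and verify the peer's answer against the HMAC we compute over our own challenge. The sketch below shows only that flow in a single process; the "HMAC" is a toy XOR fold rather than a real MAC, and the buffer sizes are assumptions:

    #include <stdio.h>
    #include <string.h>

    #define CHALLENGE_LEN 64
    #define RESP_LEN 16

    static void toy_hmac(const char *secret, const unsigned char *msg,
                         size_t len, unsigned char *out)
    {
        size_t klen = strlen(secret), i;

        memset(out, 0, RESP_LEN);
        for (i = 0; i < len; i++)
            out[i % RESP_LEN] ^= msg[i] ^ (unsigned char)secret[i % klen];
    }

    int main(void)
    {
        const char *secret = "shared-secret";          /* same on both peers */
        unsigned char my_ch[CHALLENGE_LEN] = "local-challenge";
        unsigned char peer_ch[CHALLENGE_LEN] = "remote-challenge";
        unsigned char response[RESP_LEN], right_response[RESP_LEN];

        /* the peer answers my challenge; compute what that answer must be */
        toy_hmac(secret, my_ch, sizeof(my_ch), right_response);
        toy_hmac(secret, my_ch, sizeof(my_ch), response);   /* "received" answer */

        if (memcmp(response, right_response, RESP_LEN) == 0)
            printf("peer authenticated using %d bytes HMAC\n", RESP_LEN);
        else
            printf("authentication failed\n");

        /* and we would answer the peer's challenge the same way */
        toy_hmac(secret, peer_ch, sizeof(peer_ch), response);
        return 0;
    }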
@@ -4223,82 +4837,106 @@ static int drbd_do_auth(struct drbd_conf *mdev) | |||
4223 | 4837 | ||
4224 | int drbdd_init(struct drbd_thread *thi) | 4838 | int drbdd_init(struct drbd_thread *thi) |
4225 | { | 4839 | { |
4226 | struct drbd_conf *mdev = thi->mdev; | 4840 | struct drbd_tconn *tconn = thi->tconn; |
4227 | unsigned int minor = mdev_to_minor(mdev); | ||
4228 | int h; | 4841 | int h; |
4229 | 4842 | ||
4230 | sprintf(current->comm, "drbd%d_receiver", minor); | 4843 | conn_info(tconn, "receiver (re)started\n"); |
4231 | |||
4232 | dev_info(DEV, "receiver (re)started\n"); | ||
4233 | 4844 | ||
4234 | do { | 4845 | do { |
4235 | h = drbd_connect(mdev); | 4846 | h = conn_connect(tconn); |
4236 | if (h == 0) { | 4847 | if (h == 0) { |
4237 | drbd_disconnect(mdev); | 4848 | conn_disconnect(tconn); |
4238 | schedule_timeout_interruptible(HZ); | 4849 | schedule_timeout_interruptible(HZ); |
4239 | } | 4850 | } |
4240 | if (h == -1) { | 4851 | if (h == -1) { |
4241 | dev_warn(DEV, "Discarding network configuration.\n"); | 4852 | conn_warn(tconn, "Discarding network configuration.\n"); |
4242 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 4853 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4243 | } | 4854 | } |
4244 | } while (h == 0); | 4855 | } while (h == 0); |
4245 | 4856 | ||
4246 | if (h > 0) { | 4857 | if (h > 0) |
4247 | if (get_net_conf(mdev)) { | 4858 | drbdd(tconn); |
4248 | drbdd(mdev); | ||
4249 | put_net_conf(mdev); | ||
4250 | } | ||
4251 | } | ||
4252 | 4859 | ||
4253 | drbd_disconnect(mdev); | 4860 | conn_disconnect(tconn); |
4254 | 4861 | ||
4255 | dev_info(DEV, "receiver terminated\n"); | 4862 | conn_info(tconn, "receiver terminated\n"); |
4256 | return 0; | 4863 | return 0; |
4257 | } | 4864 | } |
4258 | 4865 | ||
4259 | /* ********* acknowledge sender ******** */ | 4866 | /* ********* acknowledge sender ******** */ |
4260 | 4867 | ||
4261 | static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) | 4868 | static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4262 | { | 4869 | { |
4263 | struct p_req_state_reply *p = (struct p_req_state_reply *)h; | 4870 | struct p_req_state_reply *p = pi->data; |
4871 | int retcode = be32_to_cpu(p->retcode); | ||
4872 | |||
4873 | if (retcode >= SS_SUCCESS) { | ||
4874 | set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags); | ||
4875 | } else { | ||
4876 | set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags); | ||
4877 | conn_err(tconn, "Requested state change failed by peer: %s (%d)\n", | ||
4878 | drbd_set_st_err_str(retcode), retcode); | ||
4879 | } | ||
4880 | wake_up(&tconn->ping_wait); | ||
4881 | |||
4882 | return 0; | ||
4883 | } | ||
4264 | 4884 | ||
4885 | static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) | ||
4886 | { | ||
4887 | struct drbd_conf *mdev; | ||
4888 | struct p_req_state_reply *p = pi->data; | ||
4265 | int retcode = be32_to_cpu(p->retcode); | 4889 | int retcode = be32_to_cpu(p->retcode); |
4266 | 4890 | ||
4891 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4892 | if (!mdev) | ||
4893 | return -EIO; | ||
4894 | |||
4895 | if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) { | ||
4896 | D_ASSERT(tconn->agreed_pro_version < 100); | ||
4897 | return got_conn_RqSReply(tconn, pi); | ||
4898 | } | ||
4899 | |||
4267 | if (retcode >= SS_SUCCESS) { | 4900 | if (retcode >= SS_SUCCESS) { |
4268 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); | 4901 | set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); |
4269 | } else { | 4902 | } else { |
4270 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); | 4903 | set_bit(CL_ST_CHG_FAIL, &mdev->flags); |
4271 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", | 4904 | dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", |
4272 | drbd_set_st_err_str(retcode), retcode); | 4905 | drbd_set_st_err_str(retcode), retcode); |
4273 | } | 4906 | } |
4274 | wake_up(&mdev->state_wait); | 4907 | wake_up(&mdev->state_wait); |
4275 | 4908 | ||
4276 | return true; | 4909 | return 0; |
4277 | } | 4910 | } |
4278 | 4911 | ||
4279 | static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) | 4912 | static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi) |
4280 | { | 4913 | { |
4281 | return drbd_send_ping_ack(mdev); | 4914 | return drbd_send_ping_ack(tconn); |
4282 | 4915 | ||
4283 | } | 4916 | } |
4284 | 4917 | ||
4285 | static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) | 4918 | static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4286 | { | 4919 | { |
4287 | /* restore idle timeout */ | 4920 | /* restore idle timeout */ |
4288 | mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; | 4921 | tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ; |
4289 | if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) | 4922 | if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags)) |
4290 | wake_up(&mdev->misc_wait); | 4923 | wake_up(&tconn->ping_wait); |
4291 | 4924 | ||
4292 | return true; | 4925 | return 0; |
4293 | } | 4926 | } |
4294 | 4927 | ||
4295 | static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | 4928 | static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) |
4296 | { | 4929 | { |
4297 | struct p_block_ack *p = (struct p_block_ack *)h; | 4930 | struct drbd_conf *mdev; |
4931 | struct p_block_ack *p = pi->data; | ||
4298 | sector_t sector = be64_to_cpu(p->sector); | 4932 | sector_t sector = be64_to_cpu(p->sector); |
4299 | int blksize = be32_to_cpu(p->blksize); | 4933 | int blksize = be32_to_cpu(p->blksize); |
4300 | 4934 | ||
4301 | D_ASSERT(mdev->agreed_pro_version >= 89); | 4935 | mdev = vnr_to_mdev(tconn, pi->vnr); |
4936 | if (!mdev) | ||
4937 | return -EIO; | ||
4938 | |||
4939 | D_ASSERT(mdev->tconn->agreed_pro_version >= 89); | ||
4302 | 4940 | ||
4303 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4941 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4304 | 4942 | ||
@@ -4312,162 +4950,139 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) | |||
4312 | dec_rs_pending(mdev); | 4950 | dec_rs_pending(mdev); |
4313 | atomic_add(blksize >> 9, &mdev->rs_sect_in); | 4951 | atomic_add(blksize >> 9, &mdev->rs_sect_in); |
4314 | 4952 | ||
4315 | return true; | 4953 | return 0; |
4316 | } | ||
4317 | |||
4318 | /* when we receive the ACK for a write request, | ||
4319 | * verify that we actually know about it */ | ||
4320 | static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, | ||
4321 | u64 id, sector_t sector) | ||
4322 | { | ||
4323 | struct hlist_head *slot = tl_hash_slot(mdev, sector); | ||
4324 | struct hlist_node *n; | ||
4325 | struct drbd_request *req; | ||
4326 | |||
4327 | hlist_for_each_entry(req, n, slot, collision) { | ||
4328 | if ((unsigned long)req == (unsigned long)id) { | ||
4329 | if (req->sector != sector) { | ||
4330 | dev_err(DEV, "_ack_id_to_req: found req %p but it has " | ||
4331 | "wrong sector (%llus versus %llus)\n", req, | ||
4332 | (unsigned long long)req->sector, | ||
4333 | (unsigned long long)sector); | ||
4334 | break; | ||
4335 | } | ||
4336 | return req; | ||
4337 | } | ||
4338 | } | ||
4339 | return NULL; | ||
4340 | } | 4954 | } |
4341 | 4955 | ||
4342 | typedef struct drbd_request *(req_validator_fn) | 4956 | static int |
4343 | (struct drbd_conf *mdev, u64 id, sector_t sector); | 4957 | validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector, |
4344 | 4958 | struct rb_root *root, const char *func, | |
4345 | static int validate_req_change_req_state(struct drbd_conf *mdev, | 4959 | enum drbd_req_event what, bool missing_ok) |
4346 | u64 id, sector_t sector, req_validator_fn validator, | ||
4347 | const char *func, enum drbd_req_event what) | ||
4348 | { | 4960 | { |
4349 | struct drbd_request *req; | 4961 | struct drbd_request *req; |
4350 | struct bio_and_error m; | 4962 | struct bio_and_error m; |
4351 | 4963 | ||
4352 | spin_lock_irq(&mdev->req_lock); | 4964 | spin_lock_irq(&mdev->tconn->req_lock); |
4353 | req = validator(mdev, id, sector); | 4965 | req = find_request(mdev, root, id, sector, missing_ok, func); |
4354 | if (unlikely(!req)) { | 4966 | if (unlikely(!req)) { |
4355 | spin_unlock_irq(&mdev->req_lock); | 4967 | spin_unlock_irq(&mdev->tconn->req_lock); |
4356 | 4968 | return -EIO; | |
4357 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func, | ||
4358 | (void *)(unsigned long)id, (unsigned long long)sector); | ||
4359 | return false; | ||
4360 | } | 4969 | } |
4361 | __req_mod(req, what, &m); | 4970 | __req_mod(req, what, &m); |
4362 | spin_unlock_irq(&mdev->req_lock); | 4971 | spin_unlock_irq(&mdev->tconn->req_lock); |
4363 | 4972 | ||
4364 | if (m.bio) | 4973 | if (m.bio) |
4365 | complete_master_bio(mdev, &m); | 4974 | complete_master_bio(mdev, &m); |
4366 | return true; | 4975 | return 0; |
4367 | } | 4976 | } |
4368 | 4977 | ||
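validate_req_change_req_state() now looks the request up by block id and sector in an rb-tree while holding the connection's req_lock, applies the __req_mod() transition, and completes the master bio only after dropping the lock; missing_ok merely silences the lookup error for the NegAck case. A simplified model with a flat array and a pthread mutex standing in for the tree and spinlock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct request { uint64_t id; uint64_t sector; int state; bool has_bio; };

    static struct request requests[] = {
        { 0x1000, 8,  0, true  },
        { 0x2000, 16, 0, false },
    };
    static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct request *find_request(uint64_t id, uint64_t sector, bool missing_ok)
    {
        for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
            if (requests[i].id == id && requests[i].sector == sector)
                return &requests[i];
        if (!missing_ok)
            fprintf(stderr, "failed to find req %#llx, sector %llu\n",
                    (unsigned long long)id, (unsigned long long)sector);
        return NULL;
    }

    static int ack_request(uint64_t id, uint64_t sector, int what, bool missing_ok)
    {
        struct request *req;
        bool complete_bio;

        pthread_mutex_lock(&req_lock);
        req = find_request(id, sector, missing_ok);
        if (!req) {
            pthread_mutex_unlock(&req_lock);
            return -1;                        /* -EIO in the real code */
        }
        req->state = what;                    /* stands in for __req_mod() */
        complete_bio = req->has_bio;
        pthread_mutex_unlock(&req_lock);

        if (complete_bio)                     /* done after dropping the lock */
            printf("completing master bio for req %#llx\n",
                   (unsigned long long)id);
        return 0;
    }

    int main(void)
    {
        ack_request(0x1000, 8, 1, false);     /* found: state change + bio completion */
        ack_request(0x3000, 8, 1, true);      /* missing but tolerated (NegAck case) */
        return 0;
    }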
4369 | static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) | 4978 | static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4370 | { | 4979 | { |
4371 | struct p_block_ack *p = (struct p_block_ack *)h; | 4980 | struct drbd_conf *mdev; |
4981 | struct p_block_ack *p = pi->data; | ||
4372 | sector_t sector = be64_to_cpu(p->sector); | 4982 | sector_t sector = be64_to_cpu(p->sector); |
4373 | int blksize = be32_to_cpu(p->blksize); | 4983 | int blksize = be32_to_cpu(p->blksize); |
4374 | enum drbd_req_event what; | 4984 | enum drbd_req_event what; |
4375 | 4985 | ||
4986 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
4987 | if (!mdev) | ||
4988 | return -EIO; | ||
4989 | |||
4376 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 4990 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4377 | 4991 | ||
4378 | if (is_syncer_block_id(p->block_id)) { | 4992 | if (p->block_id == ID_SYNCER) { |
4379 | drbd_set_in_sync(mdev, sector, blksize); | 4993 | drbd_set_in_sync(mdev, sector, blksize); |
4380 | dec_rs_pending(mdev); | 4994 | dec_rs_pending(mdev); |
4381 | return true; | 4995 | return 0; |
4382 | } | 4996 | } |
4383 | switch (be16_to_cpu(h->command)) { | 4997 | switch (pi->cmd) { |
4384 | case P_RS_WRITE_ACK: | 4998 | case P_RS_WRITE_ACK: |
4385 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 4999 | what = WRITE_ACKED_BY_PEER_AND_SIS; |
4386 | what = write_acked_by_peer_and_sis; | ||
4387 | break; | 5000 | break; |
4388 | case P_WRITE_ACK: | 5001 | case P_WRITE_ACK: |
4389 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 5002 | what = WRITE_ACKED_BY_PEER; |
4390 | what = write_acked_by_peer; | ||
4391 | break; | 5003 | break; |
4392 | case P_RECV_ACK: | 5004 | case P_RECV_ACK: |
4393 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); | 5005 | what = RECV_ACKED_BY_PEER; |
4394 | what = recv_acked_by_peer; | ||
4395 | break; | 5006 | break; |
4396 | case P_DISCARD_ACK: | 5007 | case P_SUPERSEDED: |
4397 | D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); | 5008 | what = CONFLICT_RESOLVED; |
4398 | what = conflict_discarded_by_peer; | 5009 | break; |
5010 | case P_RETRY_WRITE: | ||
5011 | what = POSTPONE_WRITE; | ||
4399 | break; | 5012 | break; |
4400 | default: | 5013 | default: |
4401 | D_ASSERT(0); | 5014 | BUG(); |
4402 | return false; | ||
4403 | } | 5015 | } |
4404 | 5016 | ||
4405 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5017 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4406 | _ack_id_to_req, __func__ , what); | 5018 | &mdev->write_requests, __func__, |
5019 | what, false); | ||
4407 | } | 5020 | } |
4408 | 5021 | ||
4409 | static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) | 5022 | static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4410 | { | 5023 | { |
4411 | struct p_block_ack *p = (struct p_block_ack *)h; | 5024 | struct drbd_conf *mdev; |
5025 | struct p_block_ack *p = pi->data; | ||
4412 | sector_t sector = be64_to_cpu(p->sector); | 5026 | sector_t sector = be64_to_cpu(p->sector); |
4413 | int size = be32_to_cpu(p->blksize); | 5027 | int size = be32_to_cpu(p->blksize); |
4414 | struct drbd_request *req; | 5028 | int err; |
4415 | struct bio_and_error m; | 5029 | |
5030 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5031 | if (!mdev) | ||
5032 | return -EIO; | ||
4416 | 5033 | ||
4417 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5034 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4418 | 5035 | ||
4419 | if (is_syncer_block_id(p->block_id)) { | 5036 | if (p->block_id == ID_SYNCER) { |
4420 | dec_rs_pending(mdev); | 5037 | dec_rs_pending(mdev); |
4421 | drbd_rs_failed_io(mdev, sector, size); | 5038 | drbd_rs_failed_io(mdev, sector, size); |
4422 | return true; | 5039 | return 0; |
4423 | } | 5040 | } |
4424 | 5041 | ||
4425 | spin_lock_irq(&mdev->req_lock); | 5042 | err = validate_req_change_req_state(mdev, p->block_id, sector, |
4426 | req = _ack_id_to_req(mdev, p->block_id, sector); | 5043 | &mdev->write_requests, __func__, |
4427 | if (!req) { | 5044 | NEG_ACKED, true); |
4428 | spin_unlock_irq(&mdev->req_lock); | 5045 | if (err) { |
4429 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A || | 5046 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. |
4430 | mdev->net_conf->wire_protocol == DRBD_PROT_B) { | 5047 | The master bio might already be completed, therefore the |
4431 | /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. | 5048 | request is no longer in the collision hash. */ |
4432 | The master bio might already be completed, therefore the | 5049 | /* In Protocol B we might already have got a P_RECV_ACK |
4433 | request is no longer in the collision hash. | 5050 | but then get a P_NEG_ACK afterwards. */ |
4434 | => Do not try to validate block_id as request. */ | 5051 | drbd_set_out_of_sync(mdev, sector, size); |
4435 | /* In Protocol B we might already have got a P_RECV_ACK | ||
4436 | but then get a P_NEG_ACK after wards. */ | ||
4437 | drbd_set_out_of_sync(mdev, sector, size); | ||
4438 | return true; | ||
4439 | } else { | ||
4440 | dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__, | ||
4441 | (void *)(unsigned long)p->block_id, (unsigned long long)sector); | ||
4442 | return false; | ||
4443 | } | ||
4444 | } | 5052 | } |
4445 | __req_mod(req, neg_acked, &m); | 5053 | return 0; |
4446 | spin_unlock_irq(&mdev->req_lock); | ||
4447 | |||
4448 | if (m.bio) | ||
4449 | complete_master_bio(mdev, &m); | ||
4450 | return true; | ||
4451 | } | 5054 | } |
4452 | 5055 | ||
4453 | static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5056 | static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4454 | { | 5057 | { |
4455 | struct p_block_ack *p = (struct p_block_ack *)h; | 5058 | struct drbd_conf *mdev; |
5059 | struct p_block_ack *p = pi->data; | ||
4456 | sector_t sector = be64_to_cpu(p->sector); | 5060 | sector_t sector = be64_to_cpu(p->sector); |
4457 | 5061 | ||
5062 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5063 | if (!mdev) | ||
5064 | return -EIO; | ||
5065 | |||
4458 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5066 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4459 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", | 5067 | |
5068 | dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n", | ||
4460 | (unsigned long long)sector, be32_to_cpu(p->blksize)); | 5069 | (unsigned long long)sector, be32_to_cpu(p->blksize)); |
4461 | 5070 | ||
4462 | return validate_req_change_req_state(mdev, p->block_id, sector, | 5071 | return validate_req_change_req_state(mdev, p->block_id, sector, |
4463 | _ar_id_to_req, __func__ , neg_acked); | 5072 | &mdev->read_requests, __func__, |
5073 | NEG_ACKED, false); | ||
4464 | } | 5074 | } |
4465 | 5075 | ||
4466 | static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | 5076 | static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) |
4467 | { | 5077 | { |
5078 | struct drbd_conf *mdev; | ||
4468 | sector_t sector; | 5079 | sector_t sector; |
4469 | int size; | 5080 | int size; |
4470 | struct p_block_ack *p = (struct p_block_ack *)h; | 5081 | struct p_block_ack *p = pi->data; |
5082 | |||
5083 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5084 | if (!mdev) | ||
5085 | return -EIO; | ||
4471 | 5086 | ||
4472 | sector = be64_to_cpu(p->sector); | 5087 | sector = be64_to_cpu(p->sector); |
4473 | size = be32_to_cpu(p->blksize); | 5088 | size = be32_to_cpu(p->blksize); |
@@ -4478,57 +5093,66 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) | |||
4478 | 5093 | ||
4479 | if (get_ldev_if_state(mdev, D_FAILED)) { | 5094 | if (get_ldev_if_state(mdev, D_FAILED)) { |
4480 | drbd_rs_complete_io(mdev, sector); | 5095 | drbd_rs_complete_io(mdev, sector); |
4481 | switch (be16_to_cpu(h->command)) { | 5096 | switch (pi->cmd) { |
4482 | case P_NEG_RS_DREPLY: | 5097 | case P_NEG_RS_DREPLY: |
4483 | drbd_rs_failed_io(mdev, sector, size); | 5098 | drbd_rs_failed_io(mdev, sector, size); |
4484 | case P_RS_CANCEL: | 5099 | case P_RS_CANCEL: |
4485 | break; | 5100 | break; |
4486 | default: | 5101 | default: |
4487 | D_ASSERT(0); | 5102 | BUG(); |
4488 | put_ldev(mdev); | ||
4489 | return false; | ||
4490 | } | 5103 | } |
4491 | put_ldev(mdev); | 5104 | put_ldev(mdev); |
4492 | } | 5105 | } |
4493 | 5106 | ||
4494 | return true; | 5107 | return 0; |
4495 | } | 5108 | } |
4496 | 5109 | ||
4497 | static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) | 5110 | static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi) |
4498 | { | 5111 | { |
4499 | struct p_barrier_ack *p = (struct p_barrier_ack *)h; | 5112 | struct p_barrier_ack *p = pi->data; |
4500 | 5113 | struct drbd_conf *mdev; | |
4501 | tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); | 5114 | int vnr; |
4502 | 5115 | ||
4503 | if (mdev->state.conn == C_AHEAD && | 5116 | tl_release(tconn, p->barrier, be32_to_cpu(p->set_size)); |
4504 | atomic_read(&mdev->ap_in_flight) == 0 && | 5117 | |
4505 | !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { | 5118 | rcu_read_lock(); |
4506 | mdev->start_resync_timer.expires = jiffies + HZ; | 5119 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
4507 | add_timer(&mdev->start_resync_timer); | 5120 | if (mdev->state.conn == C_AHEAD && |
5121 | atomic_read(&mdev->ap_in_flight) == 0 && | ||
5122 | !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { | ||
5123 | mdev->start_resync_timer.expires = jiffies + HZ; | ||
5124 | add_timer(&mdev->start_resync_timer); | ||
5125 | } | ||
4508 | } | 5126 | } |
5127 | rcu_read_unlock(); | ||
4509 | 5128 | ||
4510 | return true; | 5129 | return 0; |
4511 | } | 5130 | } |
4512 | 5131 | ||
4513 | static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | 5132 | static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi) |
4514 | { | 5133 | { |
4515 | struct p_block_ack *p = (struct p_block_ack *)h; | 5134 | struct drbd_conf *mdev; |
5135 | struct p_block_ack *p = pi->data; | ||
4516 | struct drbd_work *w; | 5136 | struct drbd_work *w; |
4517 | sector_t sector; | 5137 | sector_t sector; |
4518 | int size; | 5138 | int size; |
4519 | 5139 | ||
5140 | mdev = vnr_to_mdev(tconn, pi->vnr); | ||
5141 | if (!mdev) | ||
5142 | return -EIO; | ||
5143 | |||
4520 | sector = be64_to_cpu(p->sector); | 5144 | sector = be64_to_cpu(p->sector); |
4521 | size = be32_to_cpu(p->blksize); | 5145 | size = be32_to_cpu(p->blksize); |
4522 | 5146 | ||
4523 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); | 5147 | update_peer_seq(mdev, be32_to_cpu(p->seq_num)); |
4524 | 5148 | ||
4525 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) | 5149 | if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) |
4526 | drbd_ov_oos_found(mdev, sector, size); | 5150 | drbd_ov_out_of_sync_found(mdev, sector, size); |
4527 | else | 5151 | else |
4528 | ov_oos_print(mdev); | 5152 | ov_out_of_sync_print(mdev); |
4529 | 5153 | ||
4530 | if (!get_ldev(mdev)) | 5154 | if (!get_ldev(mdev)) |
4531 | return true; | 5155 | return 0; |
4532 | 5156 | ||
4533 | drbd_rs_complete_io(mdev, sector); | 5157 | drbd_rs_complete_io(mdev, sector); |
4534 | dec_rs_pending(mdev); | 5158 | dec_rs_pending(mdev); |
@@ -4543,114 +5167,137 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) | |||
4543 | w = kmalloc(sizeof(*w), GFP_NOIO); | 5167 | w = kmalloc(sizeof(*w), GFP_NOIO); |
4544 | if (w) { | 5168 | if (w) { |
4545 | w->cb = w_ov_finished; | 5169 | w->cb = w_ov_finished; |
4546 | drbd_queue_work_front(&mdev->data.work, w); | 5170 | w->mdev = mdev; |
5171 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
4547 | } else { | 5172 | } else { |
4548 | dev_err(DEV, "kmalloc(w) failed."); | 5173 | dev_err(DEV, "kmalloc(w) failed."); |
4549 | ov_oos_print(mdev); | 5174 | ov_out_of_sync_print(mdev); |
4550 | drbd_resync_finished(mdev); | 5175 | drbd_resync_finished(mdev); |
4551 | } | 5176 | } |
4552 | } | 5177 | } |
4553 | put_ldev(mdev); | 5178 | put_ldev(mdev); |
4554 | return true; | 5179 | return 0; |
5180 | } | ||
5181 | |||
5182 | static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi) | ||
5183 | { | ||
5184 | return 0; | ||
4555 | } | 5185 | } |
4556 | 5186 | ||
4557 | static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) | 5187 | static int tconn_finish_peer_reqs(struct drbd_tconn *tconn) |
4558 | { | 5188 | { |
4559 | return true; | 5189 | struct drbd_conf *mdev; |
5190 | int vnr, not_empty = 0; | ||
5191 | |||
5192 | do { | ||
5193 | clear_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5194 | flush_signals(current); | ||
5195 | |||
5196 | rcu_read_lock(); | ||
5197 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5198 | kref_get(&mdev->kref); | ||
5199 | rcu_read_unlock(); | ||
5200 | if (drbd_finish_peer_reqs(mdev)) { | ||
5201 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5202 | return 1; | ||
5203 | } | ||
5204 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
5205 | rcu_read_lock(); | ||
5206 | } | ||
5207 | set_bit(SIGNAL_ASENDER, &tconn->flags); | ||
5208 | |||
5209 | spin_lock_irq(&tconn->req_lock); | ||
5210 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
5211 | not_empty = !list_empty(&mdev->done_ee); | ||
5212 | if (not_empty) | ||
5213 | break; | ||
5214 | } | ||
5215 | spin_unlock_irq(&tconn->req_lock); | ||
5216 | rcu_read_unlock(); | ||
5217 | } while (not_empty); | ||
5218 | |||
5219 | return 0; | ||
4560 | } | 5220 | } |
4561 | 5221 | ||
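tconn_finish_peer_reqs() above illustrates the locking pattern the multi-volume rework leans on: while walking the connection's volumes under rcu_read_lock(), take a kref on the current element, drop the read lock, do the work that may block, put the reference, and re-enter the read section. Below is a rough userspace analogue of that "pin, unlock, work, relock" pattern using a mutex and a plain counter instead of RCU and kref; all names are invented for the illustration and nothing here is the DRBD API.

#include <pthread.h>
#include <stdio.h>

struct volume {
	int refcount;		/* a stand-in for the kref; freeing at zero is omitted */
	int id;
	struct volume *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void volume_get(struct volume *v) { v->refcount++; }	/* called with list_lock held */
static void volume_put(struct volume *v) { v->refcount--; }	/* called with list_lock held */

static void blocking_work(struct volume *v)
{
	/* anything that may sleep must not run while the list lock is held */
	printf("working on volume %d\n", v->id);
}

static void walk_volumes(struct volume *head)
{
	struct volume *v;

	pthread_mutex_lock(&list_lock);
	for (v = head; v; v = v->next) {
		volume_get(v);				/* pin the element ...               */
		pthread_mutex_unlock(&list_lock);	/* ... so the lock can be dropped    */
		blocking_work(v);
		pthread_mutex_lock(&list_lock);		/* re-take it before touching ->next */
		volume_put(v);
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct volume b = { 1, 2, NULL };
	struct volume a = { 1, 1, &b };

	walk_volumes(&a);
	return 0;
}

The point is only that the blocking work (drbd_finish_peer_reqs() in the driver) must never run inside the read-side critical section, and the reference keeps the element valid across the unlocked window.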
4562 | struct asender_cmd { | 5222 | struct asender_cmd { |
4563 | size_t pkt_size; | 5223 | size_t pkt_size; |
4564 | int (*process)(struct drbd_conf *mdev, struct p_header80 *h); | 5224 | int (*fn)(struct drbd_tconn *tconn, struct packet_info *); |
4565 | }; | 5225 | }; |
4566 | 5226 | ||
4567 | static struct asender_cmd *get_asender_cmd(int cmd) | 5227 | static struct asender_cmd asender_tbl[] = { |
4568 | { | 5228 | [P_PING] = { 0, got_Ping }, |
4569 | static struct asender_cmd asender_tbl[] = { | 5229 | [P_PING_ACK] = { 0, got_PingAck }, |
4570 | /* anything missing from this table is in | ||
4571 | * the drbd_cmd_handler (drbd_default_handler) table, | ||
4572 | * see the beginning of drbdd() */ | ||
4573 | [P_PING] = { sizeof(struct p_header80), got_Ping }, | ||
4574 | [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck }, | ||
4575 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5230 | [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4576 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5231 | [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4577 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5232 | [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, |
4578 | [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, | 5233 | [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, |
4579 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, | 5234 | [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, |
4580 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, | 5235 | [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, |
4581 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5236 | [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4582 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, | 5237 | [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, |
4583 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, | 5238 | [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, |
4584 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, | 5239 | [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, |
4585 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, | 5240 | [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, |
4586 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, | 5241 | [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, |
4587 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply}, | 5242 | [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, |
4588 | [P_MAX_CMD] = { 0, NULL }, | 5243 | [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, |
4589 | }; | 5244 | [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, |
4590 | if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) | 5245 | }; |
4591 | return NULL; | ||
4592 | return &asender_tbl[cmd]; | ||
4593 | } | ||
4594 | 5246 | ||
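For readers skimming the new asender_tbl[]: a meta-socket command is now handled by indexing this table with the decoded command number, bounds-checking it, and rejecting entries without a handler, instead of going through the removed get_asender_cmd() helper. A minimal standalone sketch of that table-driven dispatch, with made-up command names, sizes and handlers rather than the DRBD ones, looks like this:

#include <stdio.h>
#include <stddef.h>

struct pkt_info {
	unsigned int cmd;	/* decoded command number */
	size_t size;		/* payload size that arrived with it */
};

struct cmd_entry {
	size_t pkt_size;			/* payload size this command must have */
	int (*fn)(const struct pkt_info *pi);	/* handler; NULL means "not expected here" */
};

static int got_ping(const struct pkt_info *pi)
{
	(void)pi;
	printf("ping\n");
	return 0;
}

static int got_block_ack(const struct pkt_info *pi)
{
	printf("block ack, %zu payload bytes\n", pi->size);
	return 0;
}

enum { CMD_PING = 0, CMD_BLOCK_ACK = 1 };

static const struct cmd_entry cmd_tbl[] = {
	[CMD_PING]      = { 0,  got_ping },
	[CMD_BLOCK_ACK] = { 24, got_block_ack },
};

/* Same shape as the asender dispatch: bounds-check the command, refuse a
 * missing handler, verify the payload size, then call the handler. */
static int dispatch(const struct pkt_info *pi)
{
	const struct cmd_entry *c;

	if (pi->cmd >= sizeof(cmd_tbl) / sizeof(cmd_tbl[0]) || !cmd_tbl[pi->cmd].fn)
		return -1;				/* unexpected packet */
	c = &cmd_tbl[pi->cmd];
	if (pi->size != c->pkt_size)
		return -1;				/* wrong packet size */
	return c->fn(pi);
}

int main(void)
{
	struct pkt_info ping = { CMD_PING, 0 };
	struct pkt_info bogus = { 42, 0 };

	printf("ping -> %d, unknown -> %d\n", dispatch(&ping), dispatch(&bogus));
	return 0;
}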
4595 | int drbd_asender(struct drbd_thread *thi) | 5247 | int drbd_asender(struct drbd_thread *thi) |
4596 | { | 5248 | { |
4597 | struct drbd_conf *mdev = thi->mdev; | 5249 | struct drbd_tconn *tconn = thi->tconn; |
4598 | struct p_header80 *h = &mdev->meta.rbuf.header.h80; | ||
4599 | struct asender_cmd *cmd = NULL; | 5250 | struct asender_cmd *cmd = NULL; |
4600 | 5251 | struct packet_info pi; | |
4601 | int rv, len; | 5252 | int rv; |
4602 | void *buf = h; | 5253 | void *buf = tconn->meta.rbuf; |
4603 | int received = 0; | 5254 | int received = 0; |
4604 | int expect = sizeof(struct p_header80); | 5255 | unsigned int header_size = drbd_header_size(tconn); |
4605 | int empty; | 5256 | int expect = header_size; |
4606 | int ping_timeout_active = 0; | 5257 | bool ping_timeout_active = false; |
4607 | 5258 | struct net_conf *nc; | |
4608 | sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); | 5259 | int ping_timeo, tcp_cork, ping_int; |
4609 | 5260 | ||
4610 | current->policy = SCHED_RR; /* Make this a realtime task! */ | 5261 | current->policy = SCHED_RR; /* Make this a realtime task! */ |
4611 | current->rt_priority = 2; /* more important than all other tasks */ | 5262 | current->rt_priority = 2; /* more important than all other tasks */ |
4612 | 5263 | ||
4613 | while (get_t_state(thi) == Running) { | 5264 | while (get_t_state(thi) == RUNNING) { |
4614 | drbd_thread_current_set_cpu(mdev); | 5265 | drbd_thread_current_set_cpu(thi); |
4615 | if (test_and_clear_bit(SEND_PING, &mdev->flags)) { | ||
4616 | ERR_IF(!drbd_send_ping(mdev)) goto reconnect; | ||
4617 | mdev->meta.socket->sk->sk_rcvtimeo = | ||
4618 | mdev->net_conf->ping_timeo*HZ/10; | ||
4619 | ping_timeout_active = 1; | ||
4620 | } | ||
4621 | 5266 | ||
4622 | /* conditionally cork; | 5267 | rcu_read_lock(); |
4623 | * it may hurt latency if we cork without much to send */ | 5268 | nc = rcu_dereference(tconn->net_conf); |
4624 | if (!mdev->net_conf->no_cork && | 5269 | ping_timeo = nc->ping_timeo; |
4625 | 3 < atomic_read(&mdev->unacked_cnt)) | 5270 | tcp_cork = nc->tcp_cork; |
4626 | drbd_tcp_cork(mdev->meta.socket); | 5271 | ping_int = nc->ping_int; |
4627 | while (1) { | 5272 | rcu_read_unlock(); |
4628 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | 5273 | |
4629 | flush_signals(current); | 5274 | if (test_and_clear_bit(SEND_PING, &tconn->flags)) { |
4630 | if (!drbd_process_done_ee(mdev)) | 5275 | if (drbd_send_ping(tconn)) { |
5276 | conn_err(tconn, "drbd_send_ping has failed\n"); | ||
4631 | goto reconnect; | 5277 | goto reconnect; |
4632 | /* to avoid race with newly queued ACKs */ | 5278 | } |
4633 | set_bit(SIGNAL_ASENDER, &mdev->flags); | 5279 | tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; |
4634 | spin_lock_irq(&mdev->req_lock); | 5280 | ping_timeout_active = true; |
4635 | empty = list_empty(&mdev->done_ee); | 5281 | } |
4636 | spin_unlock_irq(&mdev->req_lock); | 5282 | |
4637 | /* new ack may have been queued right here, | 5283 | /* TODO: conditionally cork; it may hurt latency if we cork without |
4638 | * but then there is also a signal pending, | 5284 | much to send */ |
4639 | * and we start over... */ | 5285 | if (tcp_cork) |
4640 | if (empty) | 5286 | drbd_tcp_cork(tconn->meta.socket); |
4641 | break; | 5287 | if (tconn_finish_peer_reqs(tconn)) { |
5288 | conn_err(tconn, "tconn_finish_peer_reqs() failed\n"); | ||
5289 | goto reconnect; | ||
4642 | } | 5290 | } |
4643 | /* but unconditionally uncork unless disabled */ | 5291 | /* but unconditionally uncork unless disabled */ |
4644 | if (!mdev->net_conf->no_cork) | 5292 | if (tcp_cork) |
4645 | drbd_tcp_uncork(mdev->meta.socket); | 5293 | drbd_tcp_uncork(tconn->meta.socket); |
4646 | 5294 | ||
4647 | /* short circuit, recv_msg would return EINTR anyways. */ | 5295 | /* short circuit, recv_msg would return EINTR anyways. */ |
4648 | if (signal_pending(current)) | 5296 | if (signal_pending(current)) |
4649 | continue; | 5297 | continue; |
4650 | 5298 | ||
4651 | rv = drbd_recv_short(mdev, mdev->meta.socket, | 5299 | rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0); |
4652 | buf, expect-received, 0); | 5300 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4653 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | ||
4654 | 5301 | ||
4655 | flush_signals(current); | 5302 | flush_signals(current); |
4656 | 5303 | ||
@@ -4668,80 +5315,91 @@ int drbd_asender(struct drbd_thread *thi) | |||
4668 | received += rv; | 5315 | received += rv; |
4669 | buf += rv; | 5316 | buf += rv; |
4670 | } else if (rv == 0) { | 5317 | } else if (rv == 0) { |
4671 | dev_err(DEV, "meta connection shut down by peer.\n"); | 5318 | if (test_bit(DISCONNECT_SENT, &tconn->flags)) { |
5319 | long t; | ||
5320 | rcu_read_lock(); | ||
5321 | t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; | ||
5322 | rcu_read_unlock(); | ||
5323 | |||
5324 | t = wait_event_timeout(tconn->ping_wait, | ||
5325 | tconn->cstate < C_WF_REPORT_PARAMS, | ||
5326 | t); | ||
5327 | if (t) | ||
5328 | break; | ||
5329 | } | ||
5330 | conn_err(tconn, "meta connection shut down by peer.\n"); | ||
4672 | goto reconnect; | 5331 | goto reconnect; |
4673 | } else if (rv == -EAGAIN) { | 5332 | } else if (rv == -EAGAIN) { |
4674 | /* If the data socket received something meanwhile, | 5333 | /* If the data socket received something meanwhile, |
4675 | * that is good enough: peer is still alive. */ | 5334 | * that is good enough: peer is still alive. */ |
4676 | if (time_after(mdev->last_received, | 5335 | if (time_after(tconn->last_received, |
4677 | jiffies - mdev->meta.socket->sk->sk_rcvtimeo)) | 5336 | jiffies - tconn->meta.socket->sk->sk_rcvtimeo)) |
4678 | continue; | 5337 | continue; |
4679 | if (ping_timeout_active) { | 5338 | if (ping_timeout_active) { |
4680 | dev_err(DEV, "PingAck did not arrive in time.\n"); | 5339 | conn_err(tconn, "PingAck did not arrive in time.\n"); |
4681 | goto reconnect; | 5340 | goto reconnect; |
4682 | } | 5341 | } |
4683 | set_bit(SEND_PING, &mdev->flags); | 5342 | set_bit(SEND_PING, &tconn->flags); |
4684 | continue; | 5343 | continue; |
4685 | } else if (rv == -EINTR) { | 5344 | } else if (rv == -EINTR) { |
4686 | continue; | 5345 | continue; |
4687 | } else { | 5346 | } else { |
4688 | dev_err(DEV, "sock_recvmsg returned %d\n", rv); | 5347 | conn_err(tconn, "sock_recvmsg returned %d\n", rv); |
4689 | goto reconnect; | 5348 | goto reconnect; |
4690 | } | 5349 | } |
4691 | 5350 | ||
4692 | if (received == expect && cmd == NULL) { | 5351 | if (received == expect && cmd == NULL) { |
4693 | if (unlikely(h->magic != BE_DRBD_MAGIC)) { | 5352 | if (decode_header(tconn, tconn->meta.rbuf, &pi)) |
4694 | dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", | ||
4695 | be32_to_cpu(h->magic), | ||
4696 | be16_to_cpu(h->command), | ||
4697 | be16_to_cpu(h->length)); | ||
4698 | goto reconnect; | 5353 | goto reconnect; |
4699 | } | 5354 | cmd = &asender_tbl[pi.cmd]; |
4700 | cmd = get_asender_cmd(be16_to_cpu(h->command)); | 5355 | if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { |
4701 | len = be16_to_cpu(h->length); | 5356 | conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n", |
4702 | if (unlikely(cmd == NULL)) { | 5357 | cmdname(pi.cmd), pi.cmd); |
4703 | dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n", | ||
4704 | be32_to_cpu(h->magic), | ||
4705 | be16_to_cpu(h->command), | ||
4706 | be16_to_cpu(h->length)); | ||
4707 | goto disconnect; | 5358 | goto disconnect; |
4708 | } | 5359 | } |
4709 | expect = cmd->pkt_size; | 5360 | expect = header_size + cmd->pkt_size; |
4710 | ERR_IF(len != expect-sizeof(struct p_header80)) | 5361 | if (pi.size != expect - header_size) { |
5362 | conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n", | ||
5363 | pi.cmd, pi.size); | ||
4711 | goto reconnect; | 5364 | goto reconnect; |
5365 | } | ||
4712 | } | 5366 | } |
4713 | if (received == expect) { | 5367 | if (received == expect) { |
4714 | mdev->last_received = jiffies; | 5368 | bool err; |
4715 | D_ASSERT(cmd != NULL); | 5369 | |
4716 | if (!cmd->process(mdev, h)) | 5370 | err = cmd->fn(tconn, &pi); |
5371 | if (err) { | ||
5372 | conn_err(tconn, "%pf failed\n", cmd->fn); | ||
4717 | goto reconnect; | 5373 | goto reconnect; |
5374 | } | ||
5375 | |||
5376 | tconn->last_received = jiffies; | ||
4718 | 5377 | ||
4719 | /* the idle_timeout (ping-int) | 5378 | if (cmd == &asender_tbl[P_PING_ACK]) { |
4720 | * has been restored in got_PingAck() */ | 5379 | /* restore idle timeout */ |
4721 | if (cmd == get_asender_cmd(P_PING_ACK)) | 5380 | tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; |
4722 | ping_timeout_active = 0; | 5381 | ping_timeout_active = false; |
5382 | } | ||
4723 | 5383 | ||
4724 | buf = h; | 5384 | buf = tconn->meta.rbuf; |
4725 | received = 0; | 5385 | received = 0; |
4726 | expect = sizeof(struct p_header80); | 5386 | expect = header_size; |
4727 | cmd = NULL; | 5387 | cmd = NULL; |
4728 | } | 5388 | } |
4729 | } | 5389 | } |
4730 | 5390 | ||
4731 | if (0) { | 5391 | if (0) { |
4732 | reconnect: | 5392 | reconnect: |
4733 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | 5393 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
4734 | drbd_md_sync(mdev); | 5394 | conn_md_sync(tconn); |
4735 | } | 5395 | } |
4736 | if (0) { | 5396 | if (0) { |
4737 | disconnect: | 5397 | disconnect: |
4738 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 5398 | conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
4739 | drbd_md_sync(mdev); | ||
4740 | } | 5399 | } |
4741 | clear_bit(SIGNAL_ASENDER, &mdev->flags); | 5400 | clear_bit(SIGNAL_ASENDER, &tconn->flags); |
4742 | 5401 | ||
4743 | D_ASSERT(mdev->state.conn < C_CONNECTED); | 5402 | conn_info(tconn, "asender terminated\n"); |
4744 | dev_info(DEV, "asender terminated\n"); | ||
4745 | 5403 | ||
4746 | return 0; | 5404 | return 0; |
4747 | } | 5405 | } |
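The reworked drbd_asender() above is a two-phase receive loop: it first expects drbd_header_size() bytes, decodes them into a packet_info, then raises "expect" by the command's payload size, and only dispatches once the full packet has arrived, after which "expect" falls back to the header size. The sketch below models the same two-phase framing over an in-memory byte stream instead of a socket; the 4-byte header layout and helper names are invented for the example and are not the DRBD wire format.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Toy frame: 2-byte command, 2-byte payload length, then the payload. */
struct hdr {
	uint16_t cmd;
	uint16_t len;
};

/* Phase 1: expect sizeof(struct hdr) bytes.  Phase 2: expect hdr.len more.
 * After a full frame, fall back to expecting a header again. */
static void receive_stream(const uint8_t *buf, size_t total)
{
	size_t pos = 0;

	while (pos + sizeof(struct hdr) <= total) {
		struct hdr h;

		memcpy(&h, buf + pos, sizeof(h));	/* "received == expect" for the header */
		pos += sizeof(h);

		if (pos + h.len > total) {		/* short read: a socket loop would keep reading */
			printf("truncated payload for cmd %u\n", (unsigned)h.cmd);
			return;
		}
		printf("cmd %u with %u payload bytes\n", (unsigned)h.cmd, (unsigned)h.len);
		pos += h.len;				/* frame done, next iteration expects a header */
	}
}

int main(void)
{
	uint8_t stream[64];
	uint8_t payload[4] = { 0xde, 0xad, 0xbe, 0xef };
	struct hdr h1 = { 7, sizeof(payload) };		/* one frame with payload */
	struct hdr h2 = { 1, 0 };			/* one empty frame */
	size_t n = 0;

	memcpy(stream + n, &h1, sizeof(h1)); n += sizeof(h1);
	memcpy(stream + n, payload, sizeof(payload)); n += sizeof(payload);
	memcpy(stream + n, &h2, sizeof(h2)); n += sizeof(h2);

	receive_stream(stream, n);
	return 0;
}

On a real socket the inner read is retried until "received == expect", exactly as the loop above does with drbd_recv_short().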
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 01b2ac641c7b..f58a4a4b4dfb 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include "drbd_req.h" | 31 | #include "drbd_req.h" |
32 | 32 | ||
33 | 33 | ||
34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); | ||
35 | |||
34 | /* Update disk stats at start of I/O request */ | 36 | /* Update disk stats at start of I/O request */ |
35 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | 37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) |
36 | { | 38 | { |
@@ -40,6 +42,8 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req | |||
40 | part_round_stats(cpu, &mdev->vdisk->part0); | 42 | part_round_stats(cpu, &mdev->vdisk->part0); |
41 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | 43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); |
42 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | 44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); |
45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like | ||
46 | the compiler warning about cpu only assigned but never used... */ | ||
43 | part_inc_in_flight(&mdev->vdisk->part0, rw); | 47 | part_inc_in_flight(&mdev->vdisk->part0, rw); |
44 | part_stat_unlock(); | 48 | part_stat_unlock(); |
45 | } | 49 | } |
@@ -57,9 +61,51 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) | |||
57 | part_stat_unlock(); | 61 | part_stat_unlock(); |
58 | } | 62 | } |
59 | 63 | ||
60 | static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) | 64 | static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, |
65 | struct bio *bio_src) | ||
66 | { | ||
67 | struct drbd_request *req; | ||
68 | |||
69 | req = mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
70 | if (!req) | ||
71 | return NULL; | ||
72 | |||
73 | drbd_req_make_private_bio(req, bio_src); | ||
74 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
75 | req->w.mdev = mdev; | ||
76 | req->master_bio = bio_src; | ||
77 | req->epoch = 0; | ||
78 | |||
79 | drbd_clear_interval(&req->i); | ||
80 | req->i.sector = bio_src->bi_sector; | ||
81 | req->i.size = bio_src->bi_size; | ||
82 | req->i.local = true; | ||
83 | req->i.waiting = false; | ||
84 | |||
85 | INIT_LIST_HEAD(&req->tl_requests); | ||
86 | INIT_LIST_HEAD(&req->w.list); | ||
87 | |||
88 | /* one reference to be put by __drbd_make_request */ | ||
89 | atomic_set(&req->completion_ref, 1); | ||
90 | /* one kref as long as completion_ref > 0 */ | ||
91 | kref_init(&req->kref); | ||
92 | return req; | ||
93 | } | ||
94 | |||
95 | void drbd_req_destroy(struct kref *kref) | ||
61 | { | 96 | { |
62 | const unsigned long s = req->rq_state; | 97 | struct drbd_request *req = container_of(kref, struct drbd_request, kref); |
98 | struct drbd_conf *mdev = req->w.mdev; | ||
99 | const unsigned s = req->rq_state; | ||
100 | |||
101 | if ((req->master_bio && !(s & RQ_POSTPONED)) || | ||
102 | atomic_read(&req->completion_ref) || | ||
103 | (s & RQ_LOCAL_PENDING) || | ||
104 | ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { | ||
105 | dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", | ||
106 | s, atomic_read(&req->completion_ref)); | ||
107 | return; | ||
108 | } | ||
63 | 109 | ||
64 | /* remove it from the transfer log. | 110 | /* remove it from the transfer log. |
65 | * well, only if it had been there in the first | 111 | * well, only if it had been there in the first |
@@ -67,24 +113,33 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
67 | * and never sent), it should still be "empty" as | 113 | * and never sent), it should still be "empty" as |
68 | * initialized in drbd_req_new(), so we can list_del() it | 114 | * initialized in drbd_req_new(), so we can list_del() it |
69 | * here unconditionally */ | 115 | * here unconditionally */ |
70 | list_del(&req->tl_requests); | 116 | list_del_init(&req->tl_requests); |
71 | 117 | ||
72 | /* if it was a write, we may have to set the corresponding | 118 | /* if it was a write, we may have to set the corresponding |
73 | * bit(s) out-of-sync first. If it had a local part, we need to | 119 | * bit(s) out-of-sync first. If it had a local part, we need to |
74 | * release the reference to the activity log. */ | 120 | * release the reference to the activity log. */ |
75 | if (rw == WRITE) { | 121 | if (s & RQ_WRITE) { |
76 | /* Set out-of-sync unless both OK flags are set | 122 | /* Set out-of-sync unless both OK flags are set |
77 | * (local only or remote failed). | 123 | * (local only or remote failed). |
78 | * Other places where we set out-of-sync: | 124 | * Other places where we set out-of-sync: |
79 | * READ with local io-error */ | 125 | * READ with local io-error */ |
80 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
81 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
82 | 126 | ||
83 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | 127 | /* There is a special case: |
84 | drbd_set_in_sync(mdev, req->sector, req->size); | 128 | * we may notice late that IO was suspended, |
129 | * and postpone, or schedule for retry, a write, | ||
130 | * before it even was submitted or sent. | ||
131 | * In that case we do not want to touch the bitmap at all. | ||
132 | */ | ||
133 | if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { | ||
134 | if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) | ||
135 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); | ||
136 | |||
137 | if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) | ||
138 | drbd_set_in_sync(mdev, req->i.sector, req->i.size); | ||
139 | } | ||
85 | 140 | ||
86 | /* one might be tempted to move the drbd_al_complete_io | 141 | /* one might be tempted to move the drbd_al_complete_io |
87 | * to the local io completion callback drbd_endio_pri. | 142 | * to the local io completion callback drbd_request_endio. |
88 | * but, if this was a mirror write, we may only | 143 | * but, if this was a mirror write, we may only |
89 | * drbd_al_complete_io after this is RQ_NET_DONE, | 144 | * drbd_al_complete_io after this is RQ_NET_DONE, |
90 | * otherwise the extent could be dropped from the al | 145 | * otherwise the extent could be dropped from the al |
@@ -93,109 +148,35 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const | |||
93 | * but after the extent has been dropped from the al, | 148 | * but after the extent has been dropped from the al, |
94 | * we would forget to resync the corresponding extent. | 149 | * we would forget to resync the corresponding extent. |
95 | */ | 150 | */ |
96 | if (s & RQ_LOCAL_MASK) { | 151 | if (s & RQ_IN_ACT_LOG) { |
97 | if (get_ldev_if_state(mdev, D_FAILED)) { | 152 | if (get_ldev_if_state(mdev, D_FAILED)) { |
98 | if (s & RQ_IN_ACT_LOG) | 153 | drbd_al_complete_io(mdev, &req->i); |
99 | drbd_al_complete_io(mdev, req->sector); | ||
100 | put_ldev(mdev); | 154 | put_ldev(mdev); |
101 | } else if (__ratelimit(&drbd_ratelimit_state)) { | 155 | } else if (__ratelimit(&drbd_ratelimit_state)) { |
102 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " | 156 | dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), " |
103 | "but my Disk seems to have failed :(\n", | 157 | "but my Disk seems to have failed :(\n", |
104 | (unsigned long long) req->sector); | 158 | (unsigned long long) req->i.sector, req->i.size); |
105 | } | 159 | } |
106 | } | 160 | } |
107 | } | 161 | } |
108 | 162 | ||
109 | drbd_req_free(req); | 163 | mempool_free(req, drbd_request_mempool); |
110 | } | 164 | } |
111 | 165 | ||
112 | static void queue_barrier(struct drbd_conf *mdev) | 166 | static void wake_all_senders(struct drbd_tconn *tconn) { |
113 | { | 167 | wake_up(&tconn->sender_work.q_wait); |
114 | struct drbd_tl_epoch *b; | ||
115 | |||
116 | /* We are within the req_lock. Once we queued the barrier for sending, | ||
117 | * we set the CREATE_BARRIER bit. It is cleared as soon as a new | ||
118 | * barrier/epoch object is added. This is the only place this bit is | ||
119 | * set. It indicates that the barrier for this epoch is already queued, | ||
120 | * and no new epoch has been created yet. */ | ||
121 | if (test_bit(CREATE_BARRIER, &mdev->flags)) | ||
122 | return; | ||
123 | |||
124 | b = mdev->newest_tle; | ||
125 | b->w.cb = w_send_barrier; | ||
126 | /* inc_ap_pending done here, so we won't | ||
127 | * get imbalanced on connection loss. | ||
128 | * dec_ap_pending will be done in got_BarrierAck | ||
129 | * or (on connection loss) in tl_clear. */ | ||
130 | inc_ap_pending(mdev); | ||
131 | drbd_queue_work(&mdev->data.work, &b->w); | ||
132 | set_bit(CREATE_BARRIER, &mdev->flags); | ||
133 | } | 168 | } |
134 | 169 | ||
135 | static void _about_to_complete_local_write(struct drbd_conf *mdev, | 170 | /* must hold resource->req_lock */ |
136 | struct drbd_request *req) | 171 | static void start_new_tl_epoch(struct drbd_tconn *tconn) |
137 | { | 172 | { |
138 | const unsigned long s = req->rq_state; | 173 | /* no point closing an epoch, if it is empty, anyways. */ |
139 | struct drbd_request *i; | 174 | if (tconn->current_tle_writes == 0) |
140 | struct drbd_epoch_entry *e; | 175 | return; |
141 | struct hlist_node *n; | ||
142 | struct hlist_head *slot; | ||
143 | |||
144 | /* Before we can signal completion to the upper layers, | ||
145 | * we may need to close the current epoch. | ||
146 | * We can skip this, if this request has not even been sent, because we | ||
147 | * did not have a fully established connection yet/anymore, during | ||
148 | * bitmap exchange, or while we are C_AHEAD due to congestion policy. | ||
149 | */ | ||
150 | if (mdev->state.conn >= C_CONNECTED && | ||
151 | (s & RQ_NET_SENT) != 0 && | ||
152 | req->epoch == mdev->newest_tle->br_number) | ||
153 | queue_barrier(mdev); | ||
154 | |||
155 | /* we need to do the conflict detection stuff, | ||
156 | * if we have the ee_hash (two_primaries) and | ||
157 | * this has been on the network */ | ||
158 | if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { | ||
159 | const sector_t sector = req->sector; | ||
160 | const int size = req->size; | ||
161 | |||
162 | /* ASSERT: | ||
163 | * there must be no conflicting requests, since | ||
164 | * they must have been failed on the spot */ | ||
165 | #define OVERLAPS overlaps(sector, size, i->sector, i->size) | ||
166 | slot = tl_hash_slot(mdev, sector); | ||
167 | hlist_for_each_entry(i, n, slot, collision) { | ||
168 | if (OVERLAPS) { | ||
169 | dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " | ||
170 | "other: %p %llus +%u\n", | ||
171 | req, (unsigned long long)sector, size, | ||
172 | i, (unsigned long long)i->sector, i->size); | ||
173 | } | ||
174 | } | ||
175 | 176 | ||
176 | /* maybe "wake" those conflicting epoch entries | 177 | tconn->current_tle_writes = 0; |
177 | * that wait for this request to finish. | 178 | atomic_inc(&tconn->current_tle_nr); |
178 | * | 179 | wake_all_senders(tconn); |
179 | * currently, there can be only _one_ such ee | ||
180 | * (well, or some more, which would be pending | ||
181 | * P_DISCARD_ACK not yet sent by the asender...), | ||
182 | * since we block the receiver thread upon the | ||
183 | * first conflict detection, which will wait on | ||
184 | * misc_wait. maybe we want to assert that? | ||
185 | * | ||
186 | * anyways, if we found one, | ||
187 | * we just have to do a wake_up. */ | ||
188 | #undef OVERLAPS | ||
189 | #define OVERLAPS overlaps(sector, size, e->sector, e->size) | ||
190 | slot = ee_hash_slot(mdev, req->sector); | ||
191 | hlist_for_each_entry(e, n, slot, collision) { | ||
192 | if (OVERLAPS) { | ||
193 | wake_up(&mdev->misc_wait); | ||
194 | break; | ||
195 | } | ||
196 | } | ||
197 | } | ||
198 | #undef OVERLAPS | ||
199 | } | 180 | } |
200 | 181 | ||
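start_new_tl_epoch() above replaces the old queue_barrier()/CREATE_BARRIER bookkeeping: if the open transfer-log epoch contains no writes it does nothing, otherwise it resets the per-epoch write counter, bumps the atomic epoch number and wakes the senders. A toy version of that bookkeeping, with C11 atomics standing in for the kernel primitives and invented names (and without the req_lock the real function is called under), might look like:

#include <stdatomic.h>
#include <stdio.h>

struct connection {
	int current_epoch_writes;	/* writes assigned to the currently open epoch */
	atomic_int current_epoch_nr;	/* epoch ("barrier") number the senders look at */
};

/* Close the open epoch: pointless if it is empty, otherwise start a new one.
 * The real function also wakes the sender so it can emit a barrier packet. */
static void start_new_epoch(struct connection *conn)
{
	if (conn->current_epoch_writes == 0)
		return;
	conn->current_epoch_writes = 0;
	atomic_fetch_add(&conn->current_epoch_nr, 1);
	printf("epoch closed, now at %d\n", atomic_load(&conn->current_epoch_nr));
}

int main(void)
{
	struct connection conn = { 0, 0 };

	start_new_epoch(&conn);			/* empty epoch: nothing to close */
	conn.current_epoch_writes = 3;		/* three writes entered the epoch */
	start_new_epoch(&conn);			/* closes it and bumps the number */
	return 0;
}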
201 | void complete_master_bio(struct drbd_conf *mdev, | 182 | void complete_master_bio(struct drbd_conf *mdev, |
@@ -205,17 +186,33 @@ void complete_master_bio(struct drbd_conf *mdev, | |||
205 | dec_ap_bio(mdev); | 186 | dec_ap_bio(mdev); |
206 | } | 187 | } |
207 | 188 | ||
189 | |||
190 | static void drbd_remove_request_interval(struct rb_root *root, | ||
191 | struct drbd_request *req) | ||
192 | { | ||
193 | struct drbd_conf *mdev = req->w.mdev; | ||
194 | struct drbd_interval *i = &req->i; | ||
195 | |||
196 | drbd_remove_interval(root, i); | ||
197 | |||
198 | /* Wake up any processes waiting for this request to complete. */ | ||
199 | if (i->waiting) | ||
200 | wake_up(&mdev->misc_wait); | ||
201 | } | ||
202 | |||
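The conflict tracking in this file is now interval based: each request carries a drbd_interval (sector plus size) that is inserted into the per-device read_requests or write_requests tree and removed again by drbd_remove_request_interval() above, which also wakes any waiter. The tree helpers themselves (drbd_insert_interval() and friends) are not reproduced here; the arithmetic they are built around is the half-open overlap test, sketched below with sector counts chosen purely for illustration:

#include <stdio.h>

/* A request covers the half-open sector range [sector, sector + nr_sectors). */
struct interval {
	unsigned long long sector;
	unsigned int nr_sectors;
};

/* Two half-open ranges overlap iff each one starts before the other ends. */
static int overlaps(const struct interval *a, const struct interval *b)
{
	return a->sector < b->sector + b->nr_sectors &&
	       b->sector < a->sector + a->nr_sectors;
}

int main(void)
{
	struct interval req   = { 1000, 8 };	/* sectors 1000..1007 */
	struct interval peer1 = { 1004, 8 };	/* overlaps the request */
	struct interval peer2 = { 1008, 8 };	/* adjacent, no overlap */

	printf("peer1 overlaps: %d, peer2 overlaps: %d\n",
	       overlaps(&req, &peer1), overlaps(&req, &peer2));
	return 0;
}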
208 | /* Helper for __req_mod(). | 203 | /* Helper for __req_mod(). |
209 | * Set m->bio to the master bio, if it is fit to be completed, | 204 | * Set m->bio to the master bio, if it is fit to be completed, |
210 | * or leave it alone (it is initialized to NULL in __req_mod), | 205 | * or leave it alone (it is initialized to NULL in __req_mod), |
211 | * if it has already been completed, or cannot be completed yet. | 206 | * if it has already been completed, or cannot be completed yet. |
212 | * If m->bio is set, the error status to be returned is placed in m->error. | 207 | * If m->bio is set, the error status to be returned is placed in m->error. |
213 | */ | 208 | */ |
214 | void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | 209 | static |
210 | void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) | ||
215 | { | 211 | { |
216 | const unsigned long s = req->rq_state; | 212 | const unsigned s = req->rq_state; |
217 | struct drbd_conf *mdev = req->mdev; | 213 | struct drbd_conf *mdev = req->w.mdev; |
218 | int rw = req->rq_state & RQ_WRITE ? WRITE : READ; | 214 | int rw; |
215 | int error, ok; | ||
219 | 216 | ||
220 | /* we must not complete the master bio, while it is | 217 | /* we must not complete the master bio, while it is |
221 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) | 218 | * still being processed by _drbd_send_zc_bio (drbd_send_dblock) |
@@ -226,165 +223,220 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) | |||
226 | * the receiver, | 223 | * the receiver, |
227 | * the bio_endio completion callbacks. | 224 | * the bio_endio completion callbacks. |
228 | */ | 225 | */ |
229 | if (s & RQ_NET_QUEUED) | 226 | if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || |
230 | return; | 227 | (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || |
231 | if (s & RQ_NET_PENDING) | 228 | (s & RQ_COMPLETION_SUSP)) { |
229 | dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); | ||
232 | return; | 230 | return; |
233 | if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) | 231 | } |
232 | |||
233 | if (!req->master_bio) { | ||
234 | dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); | ||
234 | return; | 235 | return; |
236 | } | ||
235 | 237 | ||
236 | if (req->master_bio) { | 238 | rw = bio_rw(req->master_bio); |
237 | /* this is data_received (remote read) | ||
238 | * or protocol C P_WRITE_ACK | ||
239 | * or protocol B P_RECV_ACK | ||
240 | * or protocol A "handed_over_to_network" (SendAck) | ||
241 | * or canceled or failed, | ||
242 | * or killed from the transfer log due to connection loss. | ||
243 | */ | ||
244 | 239 | ||
245 | /* | 240 | /* |
246 | * figure out whether to report success or failure. | 241 | * figure out whether to report success or failure. |
247 | * | 242 | * |
248 | * report success when at least one of the operations succeeded. | 243 | * report success when at least one of the operations succeeded. |
249 | * or, to put the other way, | 244 | * or, to put the other way, |
250 | * only report failure, when both operations failed. | 245 | * only report failure, when both operations failed. |
251 | * | 246 | * |
252 | * what to do about the failures is handled elsewhere. | 247 | * what to do about the failures is handled elsewhere. |
253 | * what we need to do here is just: complete the master_bio. | 248 | * what we need to do here is just: complete the master_bio. |
254 | * | 249 | * |
255 | * local completion error, if any, has been stored as ERR_PTR | 250 | * local completion error, if any, has been stored as ERR_PTR |
256 | * in private_bio within drbd_endio_pri. | 251 | * in private_bio within drbd_request_endio. |
257 | */ | 252 | */ |
258 | int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); | 253 | ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); |
259 | int error = PTR_ERR(req->private_bio); | 254 | error = PTR_ERR(req->private_bio); |
260 | 255 | ||
261 | /* remove the request from the conflict detection | 256 | /* remove the request from the conflict detection |
262 | * respective block_id verification hash */ | 257 | * respective block_id verification hash */ |
263 | if (!hlist_unhashed(&req->collision)) | 258 | if (!drbd_interval_empty(&req->i)) { |
264 | hlist_del(&req->collision); | 259 | struct rb_root *root; |
265 | else | ||
266 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
267 | 260 | ||
268 | /* for writes we need to do some extra housekeeping */ | ||
269 | if (rw == WRITE) | 261 | if (rw == WRITE) |
270 | _about_to_complete_local_write(mdev, req); | 262 | root = &mdev->write_requests; |
263 | else | ||
264 | root = &mdev->read_requests; | ||
265 | drbd_remove_request_interval(root, req); | ||
266 | } else if (!(s & RQ_POSTPONED)) | ||
267 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
271 | 268 | ||
272 | /* Update disk stats */ | 269 | /* Before we can signal completion to the upper layers, |
273 | _drbd_end_io_acct(mdev, req); | 270 | * we may need to close the current transfer log epoch. |
271 | * We are within the request lock, so we can simply compare | ||
272 | * the request epoch number with the current transfer log | ||
273 | * epoch number. If they match, increase the current_tle_nr, | ||
274 | * and reset the transfer log epoch write_cnt. | ||
275 | */ | ||
276 | if (rw == WRITE && | ||
277 | req->epoch == atomic_read(&mdev->tconn->current_tle_nr)) | ||
278 | start_new_tl_epoch(mdev->tconn); | ||
279 | |||
280 | /* Update disk stats */ | ||
281 | _drbd_end_io_acct(mdev, req); | ||
282 | |||
283 | /* If READ failed, | ||
284 | * have it be pushed back to the retry work queue, | ||
285 | * so it will re-enter __drbd_make_request(), | ||
286 | * and be re-assigned to a suitable local or remote path, | ||
287 | * or failed if we do not have access to good data anymore. | ||
288 | * | ||
289 | * Unless it was failed early by __drbd_make_request(), | ||
290 | * because no path was available, in which case | ||
291 | * it was not even added to the transfer_log. | ||
292 | * | ||
293 | * READA may fail, and will not be retried. | ||
294 | * | ||
295 | * WRITE should have used all available paths already. | ||
296 | */ | ||
297 | if (!ok && rw == READ && !list_empty(&req->tl_requests)) | ||
298 | req->rq_state |= RQ_POSTPONED; | ||
274 | 299 | ||
300 | if (!(req->rq_state & RQ_POSTPONED)) { | ||
275 | m->error = ok ? 0 : (error ?: -EIO); | 301 | m->error = ok ? 0 : (error ?: -EIO); |
276 | m->bio = req->master_bio; | 302 | m->bio = req->master_bio; |
277 | req->master_bio = NULL; | 303 | req->master_bio = NULL; |
278 | } | 304 | } |
305 | } | ||
279 | 306 | ||
280 | if (s & RQ_LOCAL_PENDING) | 307 | static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) |
281 | return; | 308 | { |
309 | struct drbd_conf *mdev = req->w.mdev; | ||
310 | D_ASSERT(m || (req->rq_state & RQ_POSTPONED)); | ||
311 | |||
312 | if (!atomic_sub_and_test(put, &req->completion_ref)) | ||
313 | return 0; | ||
282 | 314 | ||
283 | if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { | 315 | drbd_req_complete(req, m); |
284 | /* this is disconnected (local only) operation, | 316 | |
285 | * or protocol C P_WRITE_ACK, | 317 | if (req->rq_state & RQ_POSTPONED) { |
286 | * or protocol A or B P_BARRIER_ACK, | 318 | /* don't destroy the req object just yet, |
287 | * or killed from the transfer log due to connection loss. */ | 319 | * but queue it for retry */ |
288 | _req_is_done(mdev, req, rw); | 320 | drbd_restart_request(req); |
321 | return 0; | ||
289 | } | 322 | } |
290 | /* else: network part and not DONE yet. that is | 323 | |
291 | * protocol A or B, barrier ack still pending... */ | 324 | return 1; |
292 | } | 325 | } |
293 | 326 | ||
294 | static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) | 327 | /* I'd like this to be the only place that manipulates |
328 | * req->completion_ref and req->kref. */ | ||
329 | static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, | ||
330 | int clear, int set) | ||
295 | { | 331 | { |
296 | struct drbd_conf *mdev = req->mdev; | 332 | struct drbd_conf *mdev = req->w.mdev; |
333 | unsigned s = req->rq_state; | ||
334 | int c_put = 0; | ||
335 | int k_put = 0; | ||
297 | 336 | ||
298 | if (!is_susp(mdev->state)) | 337 | if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP)) |
299 | _req_may_be_done(req, m); | 338 | set |= RQ_COMPLETION_SUSP; |
300 | } | ||
301 | 339 | ||
302 | /* | 340 | /* apply */ |
303 | * checks whether there was an overlapping request | ||
304 | * or ee already registered. | ||
305 | * | ||
306 | * if so, return 1, in which case this request is completed on the spot, | ||
307 | * without ever being submitted or send. | ||
308 | * | ||
309 | * return 0 if it is ok to submit this request. | ||
310 | * | ||
311 | * NOTE: | ||
312 | * paranoia: assume something above us is broken, and issues different write | ||
313 | * requests for the same block simultaneously... | ||
314 | * | ||
315 | * To ensure these won't be reordered differently on both nodes, resulting in | ||
316 | * diverging data sets, we discard the later one(s). Not that this is supposed | ||
317 | * to happen, but this is the rationale why we also have to check for | ||
318 | * conflicting requests with local origin, and why we have to do so regardless | ||
319 | * of whether we allowed multiple primaries. | ||
320 | * | ||
321 | * BTW, in case we only have one primary, the ee_hash is empty anyways, and the | ||
322 | * second hlist_for_each_entry becomes a noop. This is even simpler than to | ||
323 | * grab a reference on the net_conf, and check for the two_primaries flag... | ||
324 | */ | ||
325 | static int _req_conflicts(struct drbd_request *req) | ||
326 | { | ||
327 | struct drbd_conf *mdev = req->mdev; | ||
328 | const sector_t sector = req->sector; | ||
329 | const int size = req->size; | ||
330 | struct drbd_request *i; | ||
331 | struct drbd_epoch_entry *e; | ||
332 | struct hlist_node *n; | ||
333 | struct hlist_head *slot; | ||
334 | 341 | ||
335 | D_ASSERT(hlist_unhashed(&req->collision)); | 342 | req->rq_state &= ~clear; |
343 | req->rq_state |= set; | ||
336 | 344 | ||
337 | if (!get_net_conf(mdev)) | 345 | /* no change? */ |
338 | return 0; | 346 | if (req->rq_state == s) |
347 | return; | ||
339 | 348 | ||
340 | /* BUG_ON */ | 349 | /* intent: get references */ |
341 | ERR_IF (mdev->tl_hash_s == 0) | 350 | |
342 | goto out_no_conflict; | 351 | if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) |
343 | BUG_ON(mdev->tl_hash == NULL); | 352 | atomic_inc(&req->completion_ref); |
344 | 353 | ||
345 | #define OVERLAPS overlaps(i->sector, i->size, sector, size) | 354 | if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { |
346 | slot = tl_hash_slot(mdev, sector); | 355 | inc_ap_pending(mdev); |
347 | hlist_for_each_entry(i, n, slot, collision) { | 356 | atomic_inc(&req->completion_ref); |
348 | if (OVERLAPS) { | ||
349 | dev_alert(DEV, "%s[%u] Concurrent local write detected! " | ||
350 | "[DISCARD L] new: %llus +%u; " | ||
351 | "pending: %llus +%u\n", | ||
352 | current->comm, current->pid, | ||
353 | (unsigned long long)sector, size, | ||
354 | (unsigned long long)i->sector, i->size); | ||
355 | goto out_conflict; | ||
356 | } | ||
357 | } | 357 | } |
358 | 358 | ||
359 | if (mdev->ee_hash_s) { | 359 | if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) |
360 | /* now, check for overlapping requests with remote origin */ | 360 | atomic_inc(&req->completion_ref); |
361 | BUG_ON(mdev->ee_hash == NULL); | 361 | |
362 | #undef OVERLAPS | 362 | if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) |
363 | #define OVERLAPS overlaps(e->sector, e->size, sector, size) | 363 | kref_get(&req->kref); /* wait for the DONE */ |
364 | slot = ee_hash_slot(mdev, sector); | 364 | |
365 | hlist_for_each_entry(e, n, slot, collision) { | 365 | if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) |
366 | if (OVERLAPS) { | 366 | atomic_add(req->i.size >> 9, &mdev->ap_in_flight); |
367 | dev_alert(DEV, "%s[%u] Concurrent remote write detected!" | 367 | |
368 | " [DISCARD L] new: %llus +%u; " | 368 | if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) |
369 | "pending: %llus +%u\n", | 369 | atomic_inc(&req->completion_ref); |
370 | current->comm, current->pid, | 370 | |
371 | (unsigned long long)sector, size, | 371 | /* progress: put references */ |
372 | (unsigned long long)e->sector, e->size); | 372 | |
373 | goto out_conflict; | 373 | if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) |
374 | } | 374 | ++c_put; |
375 | } | 375 | |
376 | if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { | ||
377 | D_ASSERT(req->rq_state & RQ_LOCAL_PENDING); | ||
378 | /* local completion may still come in later, | ||
379 | * we need to keep the req object around. */ | ||
380 | kref_get(&req->kref); | ||
381 | ++c_put; | ||
382 | } | ||
383 | |||
384 | if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { | ||
385 | if (req->rq_state & RQ_LOCAL_ABORTED) | ||
386 | ++k_put; | ||
387 | else | ||
388 | ++c_put; | ||
376 | } | 389 | } |
377 | #undef OVERLAPS | ||
378 | 390 | ||
379 | out_no_conflict: | 391 | if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { |
380 | /* this is like it should be, and what we expected. | 392 | dec_ap_pending(mdev); |
381 | * our users do behave after all... */ | 393 | ++c_put; |
382 | put_net_conf(mdev); | 394 | } |
383 | return 0; | ||
384 | 395 | ||
385 | out_conflict: | 396 | if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) |
386 | put_net_conf(mdev); | 397 | ++c_put; |
387 | return 1; | 398 | |
399 | if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { | ||
400 | if (req->rq_state & RQ_NET_SENT) | ||
401 | atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); | ||
402 | ++k_put; | ||
403 | } | ||
404 | |||
405 | /* potentially complete and destroy */ | ||
406 | |||
407 | if (k_put || c_put) { | ||
408 | /* Completion does its own kref_put. If we are going to | ||
409 | * kref_sub below, we need req to be still around then. */ | ||
410 | int at_least = k_put + !!c_put; | ||
411 | int refcount = atomic_read(&req->kref.refcount); | ||
412 | if (refcount < at_least) | ||
413 | dev_err(DEV, | ||
414 | "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", | ||
415 | s, req->rq_state, refcount, at_least); | ||
416 | } | ||
417 | |||
418 | /* If we made progress, retry conflicting peer requests, if any. */ | ||
419 | if (req->i.waiting) | ||
420 | wake_up(&mdev->misc_wait); | ||
421 | |||
422 | if (c_put) | ||
423 | k_put += drbd_req_put_completion_ref(req, m, c_put); | ||
424 | if (k_put) | ||
425 | kref_sub(&req->kref, k_put, drbd_req_destroy); | ||
426 | } | ||
427 | |||
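mod_rq_state() above is meant to be the one place that touches req->completion_ref (how many parts of the request are still outstanding) and req->kref (how many holders still point at the object): newly set "pending" bits take references, cleared ones drop them, and the request is completed and eventually destroyed when the counters reach zero. The standalone sketch below models that two-counter lifetime with plain integers and free() instead of kref and the request mempool; it is an illustration of the idea, not the driver's API.

#include <stdio.h>
#include <stdlib.h>

struct request {
	int completion_ref;	/* outstanding parts: local I/O, network, ... */
	int kref;		/* holders that still point at the object */
	int completed;
};

static void req_destroy(struct request *req)
{
	printf("request destroyed\n");
	free(req);
}

static void req_put_kref(struct request *req)
{
	if (--req->kref == 0)
		req_destroy(req);
}

/* Dropping the last completion reference completes the request (this is
 * where the upper layer would be signalled) and then drops the kref that
 * was held on behalf of the completion reference. */
static void req_put_completion_ref(struct request *req)
{
	if (--req->completion_ref != 0)
		return;
	req->completed = 1;
	printf("request completed\n");
	req_put_kref(req);
}

int main(void)
{
	struct request *req = malloc(sizeof(*req));

	if (!req)
		return 1;
	req->completion_ref = 1;	/* as in drbd_req_new(): one ref for the submitter */
	req->kref = 1;			/* one kref backing the completion ref */
	req->completed = 0;

	req->completion_ref++;		/* say, local I/O was submitted */
	req->kref++;			/* say, something needs the object after completion */

	req_put_completion_ref(req);	/* local I/O finished */
	req_put_completion_ref(req);	/* submitter's ref dropped -> completed */
	req_put_kref(req);		/* last holder gone -> destroyed */
	return 0;
}

Keeping completion separate from destruction is what lets the upper layer be signalled as soon as the last outstanding part finishes, while the object itself stays around for whatever still holds a reference, such as a barrier ack that arrives later.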
428 | static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req) | ||
429 | { | ||
430 | char b[BDEVNAME_SIZE]; | ||
431 | |||
432 | if (!__ratelimit(&drbd_ratelimit_state)) | ||
433 | return; | ||
434 | |||
435 | dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n", | ||
436 | (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", | ||
437 | (unsigned long long)req->i.sector, | ||
438 | req->i.size >> 9, | ||
439 | bdevname(mdev->ldev->backing_bdev, b)); | ||
388 | } | 440 | } |
389 | 441 | ||
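drbd_report_io_error() above only prints when __ratelimit(&drbd_ratelimit_state) allows it, so a dying disk cannot flood the log with one line per failed request. The real helper is interval/burst based; the fragment below is a deliberately simplified interval-only limiter that shows the same idea in userspace, with invented names:

#include <stdio.h>
#include <time.h>

struct ratelimit {
	time_t interval;	/* seconds that must pass between allowed messages */
	time_t last;		/* when the last message was let through (0 = never) */
};

/* Return 1 if the caller may emit a message now, 0 if it should stay quiet. */
static int ratelimit_ok(struct ratelimit *rl)
{
	time_t now = time(NULL);

	if (rl->last != 0 && now - rl->last < rl->interval)
		return 0;
	rl->last = now;
	return 1;
}

int main(void)
{
	struct ratelimit rl = { 5, 0 };
	int i;

	for (i = 0; i < 3; i++) {
		if (ratelimit_ok(&rl))
			printf("local READ IO error, attempt %d\n", i);	/* only the first prints */
	}
	return 0;
}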
390 | /* obviously this could be coded as many single functions | 442 | /* obviously this could be coded as many single functions |
@@ -402,9 +454,12 @@ out_conflict: | |||
402 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 454 | int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
403 | struct bio_and_error *m) | 455 | struct bio_and_error *m) |
404 | { | 456 | { |
405 | struct drbd_conf *mdev = req->mdev; | 457 | struct drbd_conf *mdev = req->w.mdev; |
406 | int rv = 0; | 458 | struct net_conf *nc; |
407 | m->bio = NULL; | 459 | int p, rv = 0; |
460 | |||
461 | if (m) | ||
462 | m->bio = NULL; | ||
408 | 463 | ||
409 | switch (what) { | 464 | switch (what) { |
410 | default: | 465 | default: |
@@ -413,116 +468,91 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
413 | 468 | ||
414 | /* does not happen... | 469 | /* does not happen... |
415 | * initialization done in drbd_req_new | 470 | * initialization done in drbd_req_new |
416 | case created: | 471 | case CREATED: |
417 | break; | 472 | break; |
418 | */ | 473 | */ |
419 | 474 | ||
420 | case to_be_send: /* via network */ | 475 | case TO_BE_SENT: /* via network */ |
421 | /* reached via drbd_make_request_common | 476 | /* reached via __drbd_make_request |
422 | * and from w_read_retry_remote */ | 477 | * and from w_read_retry_remote */ |
423 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | 478 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); |
424 | req->rq_state |= RQ_NET_PENDING; | 479 | rcu_read_lock(); |
425 | inc_ap_pending(mdev); | 480 | nc = rcu_dereference(mdev->tconn->net_conf); |
481 | p = nc->wire_protocol; | ||
482 | rcu_read_unlock(); | ||
483 | req->rq_state |= | ||
484 | p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : | ||
485 | p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; | ||
486 | mod_rq_state(req, m, 0, RQ_NET_PENDING); | ||
426 | break; | 487 | break; |
427 | 488 | ||
428 | case to_be_submitted: /* locally */ | 489 | case TO_BE_SUBMITTED: /* locally */ |
429 | /* reached via drbd_make_request_common */ | 490 | /* reached via __drbd_make_request */ |
430 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); | 491 | D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); |
431 | req->rq_state |= RQ_LOCAL_PENDING; | 492 | mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); |
432 | break; | 493 | break; |
433 | 494 | ||
434 | case completed_ok: | 495 | case COMPLETED_OK: |
435 | if (req->rq_state & RQ_WRITE) | 496 | if (req->rq_state & RQ_WRITE) |
436 | mdev->writ_cnt += req->size>>9; | 497 | mdev->writ_cnt += req->i.size >> 9; |
437 | else | 498 | else |
438 | mdev->read_cnt += req->size>>9; | 499 | mdev->read_cnt += req->i.size >> 9; |
439 | 500 | ||
440 | req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); | 501 | mod_rq_state(req, m, RQ_LOCAL_PENDING, |
441 | req->rq_state &= ~RQ_LOCAL_PENDING; | 502 | RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); |
442 | |||
443 | _req_may_be_done_not_susp(req, m); | ||
444 | break; | 503 | break; |
445 | 504 | ||
446 | case abort_disk_io: | 505 | case ABORT_DISK_IO: |
447 | req->rq_state |= RQ_LOCAL_ABORTED; | 506 | mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); |
448 | if (req->rq_state & RQ_WRITE) | ||
449 | _req_may_be_done_not_susp(req, m); | ||
450 | else | ||
451 | goto goto_queue_for_net_read; | ||
452 | break; | 507 | break; |
453 | 508 | ||
454 | case write_completed_with_error: | 509 | case WRITE_COMPLETED_WITH_ERROR: |
455 | req->rq_state |= RQ_LOCAL_COMPLETED; | 510 | drbd_report_io_error(mdev, req); |
456 | req->rq_state &= ~RQ_LOCAL_PENDING; | 511 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
457 | 512 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | |
458 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | ||
459 | _req_may_be_done_not_susp(req, m); | ||
460 | break; | 513 | break; |
461 | 514 | ||
462 | case read_ahead_completed_with_error: | 515 | case READ_COMPLETED_WITH_ERROR: |
463 | /* it is legal to fail READA */ | 516 | drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); |
464 | req->rq_state |= RQ_LOCAL_COMPLETED; | 517 | drbd_report_io_error(mdev, req); |
465 | req->rq_state &= ~RQ_LOCAL_PENDING; | 518 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
466 | _req_may_be_done_not_susp(req, m); | 519 | /* fall through. */ |
520 | case READ_AHEAD_COMPLETED_WITH_ERROR: | ||
521 | /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ | ||
522 | mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | ||
467 | break; | 523 | break; |
468 | 524 | ||
469 | case read_completed_with_error: | 525 | case QUEUE_FOR_NET_READ: |
470 | drbd_set_out_of_sync(mdev, req->sector, req->size); | ||
471 | |||
472 | req->rq_state |= RQ_LOCAL_COMPLETED; | ||
473 | req->rq_state &= ~RQ_LOCAL_PENDING; | ||
474 | |||
475 | if (req->rq_state & RQ_LOCAL_ABORTED) { | ||
476 | _req_may_be_done(req, m); | ||
477 | break; | ||
478 | } | ||
479 | |||
480 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | ||
481 | |||
482 | goto_queue_for_net_read: | ||
483 | |||
484 | D_ASSERT(!(req->rq_state & RQ_NET_MASK)); | ||
485 | |||
486 | /* no point in retrying if there is no good remote data, | ||
487 | * or we have no connection. */ | ||
488 | if (mdev->state.pdsk != D_UP_TO_DATE) { | ||
489 | _req_may_be_done_not_susp(req, m); | ||
490 | break; | ||
491 | } | ||
492 | |||
493 | /* _req_mod(req,to_be_send); oops, recursion... */ | ||
494 | req->rq_state |= RQ_NET_PENDING; | ||
495 | inc_ap_pending(mdev); | ||
496 | /* fall through: _req_mod(req,queue_for_net_read); */ | ||
497 | |||
498 | case queue_for_net_read: | ||
499 | /* READ or READA, and | 526 | /* READ or READA, and |
500 | * no local disk, | 527 | * no local disk, |
501 | * or target area marked as invalid, | 528 | * or target area marked as invalid, |
502 | * or just got an io-error. */ | 529 | * or just got an io-error. */ |
503 | /* from drbd_make_request_common | 530 | /* from __drbd_make_request |
504 | * or from bio_endio during read io-error recovery */ | 531 | * or from bio_endio during read io-error recovery */ |
505 | 532 | ||
506 | /* so we can verify the handle in the answer packet | 533 | /* So we can verify the handle in the answer packet. |
507 | * corresponding hlist_del is in _req_may_be_done() */ | 534 | * Corresponding drbd_remove_request_interval is in |
508 | hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); | 535 | * drbd_req_complete() */ |
536 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
537 | drbd_insert_interval(&mdev->read_requests, &req->i); | ||
509 | 538 | ||
510 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 539 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
511 | 540 | ||
512 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 541 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
513 | req->rq_state |= RQ_NET_QUEUED; | 542 | D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0); |
514 | req->w.cb = (req->rq_state & RQ_LOCAL_MASK) | 543 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
515 | ? w_read_retry_remote | 544 | req->w.cb = w_send_read_req; |
516 | : w_send_read_req; | 545 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
517 | drbd_queue_work(&mdev->data.work, &req->w); | ||
518 | break; | 546 | break; |
519 | 547 | ||
520 | case queue_for_net_write: | 548 | case QUEUE_FOR_NET_WRITE: |
521 | /* assert something? */ | 549 | /* assert something? */ |
522 | /* from drbd_make_request_common only */ | 550 | /* from __drbd_make_request only */ |
523 | 551 | ||
524 | hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); | 552 | /* Corresponding drbd_remove_request_interval is in |
525 | /* corresponding hlist_del is in _req_may_be_done() */ | 553 | * drbd_req_complete() */ |
554 | D_ASSERT(drbd_interval_empty(&req->i)); | ||
555 | drbd_insert_interval(&mdev->write_requests, &req->i); | ||
526 | 556 | ||
527 | /* NOTE | 557 | /* NOTE |
528 | * In case the req ended up on the transfer log before being | 558 | * In case the req ended up on the transfer log before being |
@@ -533,7 +563,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
533 | * | 563 | * |
534 | * _req_add_to_epoch(req); this has to be after the | 564 | * _req_add_to_epoch(req); this has to be after the |
535 | * _maybe_start_new_epoch(req); which happened in | 565 | * _maybe_start_new_epoch(req); which happened in |
536 | * drbd_make_request_common, because we now may set the bit | 566 | * __drbd_make_request, because we now may set the bit |
537 | * again ourselves to close the current epoch. | 567 | * again ourselves to close the current epoch. |
538 | * | 568 | * |
539 | * Add req to the (now) current epoch (barrier). */ | 569 | * Add req to the (now) current epoch (barrier). */ |
@@ -543,202 +573,187 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
543 | * hurting performance. */ | 573 | * hurting performance. */ |
544 | set_bit(UNPLUG_REMOTE, &mdev->flags); | 574 | set_bit(UNPLUG_REMOTE, &mdev->flags); |
545 | 575 | ||
546 | /* see drbd_make_request_common, | ||
547 | * just after it grabs the req_lock */ | ||
548 | D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); | ||
549 | |||
550 | req->epoch = mdev->newest_tle->br_number; | ||
551 | |||
552 | /* increment size of current epoch */ | ||
553 | mdev->newest_tle->n_writes++; | ||
554 | |||
555 | /* queue work item to send data */ | 576 | /* queue work item to send data */ |
556 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 577 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
557 | req->rq_state |= RQ_NET_QUEUED; | 578 | mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); |
558 | req->w.cb = w_send_dblock; | 579 | req->w.cb = w_send_dblock; |
559 | drbd_queue_work(&mdev->data.work, &req->w); | 580 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
560 | 581 | ||
561 | /* close the epoch, in case it outgrew the limit */ | 582 | /* close the epoch, in case it outgrew the limit */ |
562 | if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) | 583 | rcu_read_lock(); |
563 | queue_barrier(mdev); | 584 | nc = rcu_dereference(mdev->tconn->net_conf); |
585 | p = nc->max_epoch_size; | ||
586 | rcu_read_unlock(); | ||
587 | if (mdev->tconn->current_tle_writes >= p) | ||
588 | start_new_tl_epoch(mdev->tconn); | ||
564 | 589 | ||
565 | break; | 590 | break; |
566 | 591 | ||
567 | case queue_for_send_oos: | 592 | case QUEUE_FOR_SEND_OOS: |
568 | req->rq_state |= RQ_NET_QUEUED; | 593 | mod_rq_state(req, m, 0, RQ_NET_QUEUED); |
569 | req->w.cb = w_send_oos; | 594 | req->w.cb = w_send_out_of_sync; |
570 | drbd_queue_work(&mdev->data.work, &req->w); | 595 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
571 | break; | 596 | break; |
572 | 597 | ||
573 | case read_retry_remote_canceled: | 598 | case READ_RETRY_REMOTE_CANCELED: |
574 | case send_canceled: | 599 | case SEND_CANCELED: |
575 | case send_failed: | 600 | case SEND_FAILED: |
576 | /* real cleanup will be done from tl_clear. just update flags | 601 | /* real cleanup will be done from tl_clear. just update flags |
577 | * so it is no longer marked as on the worker queue */ | 602 | * so it is no longer marked as on the worker queue */ |
578 | req->rq_state &= ~RQ_NET_QUEUED; | 603 | mod_rq_state(req, m, RQ_NET_QUEUED, 0); |
579 | /* if we did it right, tl_clear should be scheduled only after | ||
580 | * this, so this should not be necessary! */ | ||
581 | _req_may_be_done_not_susp(req, m); | ||
582 | break; | 604 | break; |
583 | 605 | ||
584 | case handed_over_to_network: | 606 | case HANDED_OVER_TO_NETWORK: |
585 | /* assert something? */ | 607 | /* assert something? */ |
586 | if (bio_data_dir(req->master_bio) == WRITE) | ||
587 | atomic_add(req->size>>9, &mdev->ap_in_flight); | ||
588 | |||
589 | if (bio_data_dir(req->master_bio) == WRITE && | 608 | if (bio_data_dir(req->master_bio) == WRITE && |
590 | mdev->net_conf->wire_protocol == DRBD_PROT_A) { | 609 | !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { |
591 | /* this is what is dangerous about protocol A: | 610 | /* this is what is dangerous about protocol A: |
592 | * pretend it was successfully written on the peer. */ | 611 | * pretend it was successfully written on the peer. */ |
593 | if (req->rq_state & RQ_NET_PENDING) { | 612 | if (req->rq_state & RQ_NET_PENDING) |
594 | dec_ap_pending(mdev); | 613 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
595 | req->rq_state &= ~RQ_NET_PENDING; | 614 | /* else: neg-ack was faster... */ |
596 | req->rq_state |= RQ_NET_OK; | ||
597 | } /* else: neg-ack was faster... */ | ||
598 | /* it is still not yet RQ_NET_DONE until the | 615 | /* it is still not yet RQ_NET_DONE until the |
599 | * corresponding epoch barrier got acked as well, | 616 | * corresponding epoch barrier got acked as well, |
600 | * so we know what to dirty on connection loss */ | 617 | * so we know what to dirty on connection loss */ |
601 | } | 618 | } |
602 | req->rq_state &= ~RQ_NET_QUEUED; | 619 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); |
603 | req->rq_state |= RQ_NET_SENT; | ||
604 | _req_may_be_done_not_susp(req, m); | ||
605 | break; | 620 | break; |
606 | 621 | ||
607 | case oos_handed_to_network: | 622 | case OOS_HANDED_TO_NETWORK: |
608 | /* Was not set PENDING, no longer QUEUED, so is now DONE | 623 | /* Was not set PENDING, no longer QUEUED, so is now DONE |
609 | * as far as this connection is concerned. */ | 624 | * as far as this connection is concerned. */ |
610 | req->rq_state &= ~RQ_NET_QUEUED; | 625 | mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); |
611 | req->rq_state |= RQ_NET_DONE; | ||
612 | _req_may_be_done_not_susp(req, m); | ||
613 | break; | 626 | break; |
614 | 627 | ||
615 | case connection_lost_while_pending: | 628 | case CONNECTION_LOST_WHILE_PENDING: |
616 | /* transfer log cleanup after connection loss */ | 629 | /* transfer log cleanup after connection loss */ |
617 | /* assert something? */ | 630 | mod_rq_state(req, m, |
618 | if (req->rq_state & RQ_NET_PENDING) | 631 | RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, |
619 | dec_ap_pending(mdev); | 632 | RQ_NET_DONE); |
620 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | ||
621 | req->rq_state |= RQ_NET_DONE; | ||
622 | if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE) | ||
623 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
624 | |||
625 | /* if it is still queued, we may not complete it here. | ||
626 | * it will be canceled soon. */ | ||
627 | if (!(req->rq_state & RQ_NET_QUEUED)) | ||
628 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
629 | break; | 633 | break; |
630 | 634 | ||
631 | case conflict_discarded_by_peer: | 635 | case CONFLICT_RESOLVED: |
632 | /* for discarded conflicting writes of multiple primaries, | 636 | /* for superseded conflicting writes of multiple primaries, |
633 | * there is no need to keep anything in the tl, potential | 637 | * there is no need to keep anything in the tl, potential |
634 | * node crashes are covered by the activity log. */ | 638 | * node crashes are covered by the activity log. |
635 | if (what == conflict_discarded_by_peer) | 639 | * |
636 | dev_alert(DEV, "Got DiscardAck packet %llus +%u!" | 640 | * If this request had been marked as RQ_POSTPONED before, |
637 | " DRBD is not a random data generator!\n", | 641 | * it will actually not be completed, but "restarted", |
638 | (unsigned long long)req->sector, req->size); | 642 | * resubmitted from the retry worker context. */ |
639 | req->rq_state |= RQ_NET_DONE; | 643 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
640 | /* fall through */ | 644 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
641 | case write_acked_by_peer_and_sis: | 645 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); |
642 | case write_acked_by_peer: | 646 | break; |
643 | if (what == write_acked_by_peer_and_sis) | 647 | |
644 | req->rq_state |= RQ_NET_SIS; | 648 | case WRITE_ACKED_BY_PEER_AND_SIS: |
649 | req->rq_state |= RQ_NET_SIS; | ||
650 | case WRITE_ACKED_BY_PEER: | ||
651 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); | ||
645 | /* protocol C; successfully written on peer. | 652 | /* protocol C; successfully written on peer. |
646 | * Nothing more to do here. | 653 | * Nothing more to do here. |
647 | * We want to keep the tl in place for all protocols, to cater | 654 | * We want to keep the tl in place for all protocols, to cater |
648 | * for volatile write-back caches on lower level devices. */ | 655 | * for volatile write-back caches on lower level devices. */ |
649 | 656 | ||
650 | case recv_acked_by_peer: | 657 | goto ack_common; |
658 | case RECV_ACKED_BY_PEER: | ||
659 | D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK); | ||
651 | /* protocol B; pretends to be successfully written on peer. | 660 | /* protocol B; pretends to be successfully written on peer. |
652 | * see also notes above in handed_over_to_network about | 661 | * see also notes above in HANDED_OVER_TO_NETWORK about |
653 | * protocol != C */ | 662 | * protocol != C */ |
654 | req->rq_state |= RQ_NET_OK; | 663 | ack_common: |
655 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 664 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
656 | dec_ap_pending(mdev); | 665 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); |
657 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | ||
658 | req->rq_state &= ~RQ_NET_PENDING; | ||
659 | _req_may_be_done_not_susp(req, m); | ||
660 | break; | 666 | break; |
661 | 667 | ||
662 | case neg_acked: | 668 | case POSTPONE_WRITE: |
663 | /* assert something? */ | 669 | D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); |
664 | if (req->rq_state & RQ_NET_PENDING) { | 670 | /* If this node has already detected the write conflict, the |
665 | dec_ap_pending(mdev); | 671 | * worker will be waiting on misc_wait. Wake it up once this |
666 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 672 | * request has completed locally. |
667 | } | 673 | */ |
668 | req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); | 674 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
675 | req->rq_state |= RQ_POSTPONED; | ||
676 | if (req->i.waiting) | ||
677 | wake_up(&mdev->misc_wait); | ||
678 | /* Do not clear RQ_NET_PENDING. This request will make further | ||
679 | * progress via restart_conflicting_writes() or | ||
680 | * fail_postponed_requests(). Hopefully. */ | ||
681 | break; | ||
669 | 682 | ||
670 | req->rq_state |= RQ_NET_DONE; | 683 | case NEG_ACKED: |
671 | _req_may_be_done_not_susp(req, m); | 684 | mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); |
672 | /* else: done by handed_over_to_network */ | ||
673 | break; | 685 | break; |
674 | 686 | ||
675 | case fail_frozen_disk_io: | 687 | case FAIL_FROZEN_DISK_IO: |
676 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 688 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
677 | break; | 689 | break; |
678 | 690 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); | |
679 | _req_may_be_done(req, m); /* Allowed while state.susp */ | ||
680 | break; | 691 | break; |
681 | 692 | ||
682 | case restart_frozen_disk_io: | 693 | case RESTART_FROZEN_DISK_IO: |
683 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) | 694 | if (!(req->rq_state & RQ_LOCAL_COMPLETED)) |
684 | break; | 695 | break; |
685 | 696 | ||
686 | req->rq_state &= ~RQ_LOCAL_COMPLETED; | 697 | mod_rq_state(req, m, |
698 | RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, | ||
699 | RQ_LOCAL_PENDING); | ||
687 | 700 | ||
688 | rv = MR_READ; | 701 | rv = MR_READ; |
689 | if (bio_data_dir(req->master_bio) == WRITE) | 702 | if (bio_data_dir(req->master_bio) == WRITE) |
690 | rv = MR_WRITE; | 703 | rv = MR_WRITE; |
691 | 704 | ||
692 | get_ldev(mdev); | 705 | get_ldev(mdev); /* always succeeds in this call path */ |
693 | req->w.cb = w_restart_disk_io; | 706 | req->w.cb = w_restart_disk_io; |
694 | drbd_queue_work(&mdev->data.work, &req->w); | 707 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
695 | break; | 708 | break; |
696 | 709 | ||
697 | case resend: | 710 | case RESEND: |
698 | /* Simply complete (local only) READs. */ | 711 | /* Simply complete (local only) READs. */ |
699 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { | 712 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { |
700 | _req_may_be_done(req, m); | 713 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); |
701 | break; | 714 | break; |
702 | } | 715 | } |
703 | 716 | ||
704 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK | 717 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK |
705 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. | 718 | before the connection loss (B&C only); only P_BARRIER_ACK |
706 | Trowing them out of the TL here by pretending we got a BARRIER_ACK | 719 | (or the local completion?) was missing when we suspended. |
707 | We ensure that the peer was not rebooted */ | 720 | Throwing them out of the TL here by pretending we got a BARRIER_ACK. |
721 | During connection handshake, we ensure that the peer was not rebooted. */ | ||
708 | if (!(req->rq_state & RQ_NET_OK)) { | 722 | if (!(req->rq_state & RQ_NET_OK)) { |
723 | /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync? | ||
724 | * in that case we must not set RQ_NET_PENDING. */ | ||
725 | |||
726 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); | ||
709 | if (req->w.cb) { | 727 | if (req->w.cb) { |
710 | drbd_queue_work(&mdev->data.work, &req->w); | 728 | drbd_queue_work(&mdev->tconn->sender_work, &req->w); |
711 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; | 729 | rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; |
712 | } | 730 | } /* else: FIXME can this happen? */ |
713 | break; | 731 | break; |
714 | } | 732 | } |
715 | /* else, fall through to barrier_acked */ | 733 | /* else, fall through to BARRIER_ACKED */ |
716 | 734 | ||
717 | case barrier_acked: | 735 | case BARRIER_ACKED: |
736 | /* barrier ack for READ requests does not make sense */ | ||
718 | if (!(req->rq_state & RQ_WRITE)) | 737 | if (!(req->rq_state & RQ_WRITE)) |
719 | break; | 738 | break; |
720 | 739 | ||
721 | if (req->rq_state & RQ_NET_PENDING) { | 740 | if (req->rq_state & RQ_NET_PENDING) { |
722 | /* barrier came in before all requests have been acked. | 741 | /* barrier came in before all requests were acked. |
723 | * this is bad, because if the connection is lost now, | 742 | * this is bad, because if the connection is lost now, |
724 | * we won't be able to clean them up... */ | 743 | * we won't be able to clean them up... */ |
725 | dev_err(DEV, "FIXME (barrier_acked but pending)\n"); | 744 | dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n"); |
726 | list_move(&req->tl_requests, &mdev->out_of_sequence_requests); | ||
727 | } | 745 | } |
728 | if ((req->rq_state & RQ_NET_MASK) != 0) { | 746 | /* Allowed to complete requests, even while suspended. |
729 | req->rq_state |= RQ_NET_DONE; | 747 | * As this is called for all requests within a matching epoch, |
730 | if (mdev->net_conf->wire_protocol == DRBD_PROT_A) | 748 | * we need to filter, and only set RQ_NET_DONE for those that |
731 | atomic_sub(req->size>>9, &mdev->ap_in_flight); | 749 | * have actually been on the wire. */ |
732 | } | 750 | mod_rq_state(req, m, RQ_COMPLETION_SUSP, |
733 | _req_may_be_done(req, m); /* Allowed while state.susp */ | 751 | (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); |
734 | break; | 752 | break; |
735 | 753 | ||
736 | case data_received: | 754 | case DATA_RECEIVED: |
737 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 755 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
738 | dec_ap_pending(mdev); | 756 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); |
739 | req->rq_state &= ~RQ_NET_PENDING; | ||
740 | req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); | ||
741 | _req_may_be_done_not_susp(req, m); | ||
742 | break; | 757 | break; |
743 | }; | 758 | }; |
744 | 759 | ||
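
The hunk above repeatedly collapses pairs of rq_state bit updates plus an explicit _req_may_be_done*() call into a single mod_rq_state(req, m, clear, set) transition. The stand-alone sketch below only illustrates that pattern; the toy_* names and flag values are invented for the example and are not the DRBD implementation.

    #include <stdio.h>

    /* hypothetical request-state flags, mirroring the RQ_* style */
    enum {
        RQ_LOCAL_PENDING   = 1 << 0,
        RQ_LOCAL_COMPLETED = 1 << 1,
        RQ_LOCAL_OK        = 1 << 2,
        RQ_NET_PENDING     = 1 << 3,
        RQ_NET_OK          = 1 << 4,
    };

    struct toy_request {
        unsigned rq_state;
    };

    /* one helper owns the transition: clear some bits, set others,
     * then decide in a single place whether the request may complete */
    static void toy_mod_rq_state(struct toy_request *req, unsigned clear, unsigned set)
    {
        req->rq_state &= ~clear;
        req->rq_state |= set;

        if (!(req->rq_state & (RQ_LOCAL_PENDING | RQ_NET_PENDING)))
            printf("request may complete, state=0x%x\n", req->rq_state);
    }

    int main(void)
    {
        struct toy_request req = { .rq_state = RQ_LOCAL_PENDING | RQ_NET_PENDING };

        /* local completion: one call instead of two bit updates plus a completion check */
        toy_mod_rq_state(&req, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED | RQ_LOCAL_OK);
        /* network ack arrives later: now nothing is pending and the check fires */
        toy_mod_rq_state(&req, RQ_NET_PENDING, RQ_NET_OK);
        return 0;
    }

Keeping the completion check inside the one helper is what lets the patch drop the scattered _req_may_be_done_not_susp() calls seen on the left-hand side of this hunk.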
@@ -752,75 +767,265 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
752 | * since size may be bigger than BM_BLOCK_SIZE, | 767 | * since size may be bigger than BM_BLOCK_SIZE, |
753 | * we may need to check several bits. | 768 | * we may need to check several bits. |
754 | */ | 769 | */ |
755 | static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) | 770 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) |
756 | { | 771 | { |
757 | unsigned long sbnr, ebnr; | 772 | unsigned long sbnr, ebnr; |
758 | sector_t esector, nr_sectors; | 773 | sector_t esector, nr_sectors; |
759 | 774 | ||
760 | if (mdev->state.disk == D_UP_TO_DATE) | 775 | if (mdev->state.disk == D_UP_TO_DATE) |
761 | return 1; | 776 | return true; |
762 | if (mdev->state.disk >= D_OUTDATED) | 777 | if (mdev->state.disk != D_INCONSISTENT) |
763 | return 0; | 778 | return false; |
764 | if (mdev->state.disk < D_INCONSISTENT) | ||
765 | return 0; | ||
766 | /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ | ||
767 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | ||
768 | esector = sector + (size >> 9) - 1; | 779 | esector = sector + (size >> 9) - 1; |
769 | 780 | nr_sectors = drbd_get_capacity(mdev->this_bdev); | |
770 | D_ASSERT(sector < nr_sectors); | 781 | D_ASSERT(sector < nr_sectors); |
771 | D_ASSERT(esector < nr_sectors); | 782 | D_ASSERT(esector < nr_sectors); |
772 | 783 | ||
773 | sbnr = BM_SECT_TO_BIT(sector); | 784 | sbnr = BM_SECT_TO_BIT(sector); |
774 | ebnr = BM_SECT_TO_BIT(esector); | 785 | ebnr = BM_SECT_TO_BIT(esector); |
775 | 786 | ||
776 | return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); | 787 | return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0; |
788 | } | ||
789 | |||
790 | static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector, | ||
791 | enum drbd_read_balancing rbm) | ||
792 | { | ||
793 | struct backing_dev_info *bdi; | ||
794 | int stripe_shift; | ||
795 | |||
796 | switch (rbm) { | ||
797 | case RB_CONGESTED_REMOTE: | ||
798 | bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info; | ||
799 | return bdi_read_congested(bdi); | ||
800 | case RB_LEAST_PENDING: | ||
801 | return atomic_read(&mdev->local_cnt) > | ||
802 | atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt); | ||
803 | case RB_32K_STRIPING: /* stripe_shift = 15 */ | ||
804 | case RB_64K_STRIPING: | ||
805 | case RB_128K_STRIPING: | ||
806 | case RB_256K_STRIPING: | ||
807 | case RB_512K_STRIPING: | ||
808 | case RB_1M_STRIPING: /* stripe_shift = 20 */ | ||
809 | stripe_shift = (rbm - RB_32K_STRIPING + 15); | ||
810 | return (sector >> (stripe_shift - 9)) & 1; | ||
811 | case RB_ROUND_ROBIN: | ||
812 | return test_and_change_bit(READ_BALANCE_RR, &mdev->flags); | ||
813 | case RB_PREFER_REMOTE: | ||
814 | return true; | ||
815 | case RB_PREFER_LOCAL: | ||
816 | default: | ||
817 | return false; | ||
818 | } | ||
819 | } | ||
820 | |||
821 | /* | ||
822 | * complete_conflicting_writes - wait for any conflicting write requests | ||
823 | * | ||
824 | * The write_requests tree contains all active write requests which we | ||
825 | * currently know about. Wait for any requests to complete which conflict with | ||
826 | * the new one. | ||
827 | * | ||
828 | * Only way out: remove the conflicting intervals from the tree. | ||
829 | */ | ||
830 | static void complete_conflicting_writes(struct drbd_request *req) | ||
831 | { | ||
832 | DEFINE_WAIT(wait); | ||
833 | struct drbd_conf *mdev = req->w.mdev; | ||
834 | struct drbd_interval *i; | ||
835 | sector_t sector = req->i.sector; | ||
836 | int size = req->i.size; | ||
837 | |||
838 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
839 | if (!i) | ||
840 | return; | ||
841 | |||
842 | for (;;) { | ||
843 | prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
844 | i = drbd_find_overlap(&mdev->write_requests, sector, size); | ||
845 | if (!i) | ||
846 | break; | ||
847 | /* Indicate to wake up device->misc_wait on progress. */ | ||
848 | i->waiting = true; | ||
849 | spin_unlock_irq(&mdev->tconn->req_lock); | ||
850 | schedule(); | ||
851 | spin_lock_irq(&mdev->tconn->req_lock); | ||
852 | } | ||
853 | finish_wait(&mdev->misc_wait, &wait); | ||
777 | } | 854 | } |
778 | 855 | ||
856 | /* called within req_lock and rcu_read_lock() */ | ||
779 | static void maybe_pull_ahead(struct drbd_conf *mdev) | 857 | static void maybe_pull_ahead(struct drbd_conf *mdev) |
780 | { | 858 | { |
781 | int congested = 0; | 859 | struct drbd_tconn *tconn = mdev->tconn; |
860 | struct net_conf *nc; | ||
861 | bool congested = false; | ||
862 | enum drbd_on_congestion on_congestion; | ||
863 | |||
864 | nc = rcu_dereference(tconn->net_conf); | ||
865 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; | ||
866 | if (on_congestion == OC_BLOCK || | ||
867 | tconn->agreed_pro_version < 96) | ||
868 | return; | ||
782 | 869 | ||
783 | /* If I don't even have good local storage, we can not reasonably try | 870 | /* If I don't even have good local storage, we can not reasonably try |
784 | * to pull ahead of the peer. We also need the local reference to make | 871 | * to pull ahead of the peer. We also need the local reference to make |
785 | * sure mdev->act_log is there. | 872 | * sure mdev->act_log is there. |
786 | * Note: caller has to make sure that net_conf is there. | ||
787 | */ | 873 | */ |
788 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) | 874 | if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) |
789 | return; | 875 | return; |
790 | 876 | ||
791 | if (mdev->net_conf->cong_fill && | 877 | if (nc->cong_fill && |
792 | atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { | 878 | atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) { |
793 | dev_info(DEV, "Congestion-fill threshold reached\n"); | 879 | dev_info(DEV, "Congestion-fill threshold reached\n"); |
794 | congested = 1; | 880 | congested = true; |
795 | } | 881 | } |
796 | 882 | ||
797 | if (mdev->act_log->used >= mdev->net_conf->cong_extents) { | 883 | if (mdev->act_log->used >= nc->cong_extents) { |
798 | dev_info(DEV, "Congestion-extents threshold reached\n"); | 884 | dev_info(DEV, "Congestion-extents threshold reached\n"); |
799 | congested = 1; | 885 | congested = true; |
800 | } | 886 | } |
801 | 887 | ||
802 | if (congested) { | 888 | if (congested) { |
803 | queue_barrier(mdev); /* last barrier, after mirrored writes */ | 889 | /* start a new epoch for non-mirrored writes */ |
890 | start_new_tl_epoch(mdev->tconn); | ||
804 | 891 | ||
805 | if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) | 892 | if (on_congestion == OC_PULL_AHEAD) |
806 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); | 893 | _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); |
807 | else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ | 894 | else /*nc->on_congestion == OC_DISCONNECT */ |
808 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); | 895 | _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); |
809 | } | 896 | } |
810 | put_ldev(mdev); | 897 | put_ldev(mdev); |
811 | } | 898 | } |
812 | 899 | ||
813 | static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | 900 | /* If this returns false, and req->private_bio is still set, |
901 | * this should be submitted locally. | ||
902 | * | ||
903 | * If it returns false, but req->private_bio is not set, | ||
904 | * we do not have access to good data :( | ||
905 | * | ||
906 | * Otherwise, this destroys req->private_bio, if any, | ||
907 | * and returns true. | ||
908 | */ | ||
909 | static bool do_remote_read(struct drbd_request *req) | ||
910 | { | ||
911 | struct drbd_conf *mdev = req->w.mdev; | ||
912 | enum drbd_read_balancing rbm; | ||
913 | |||
914 | if (req->private_bio) { | ||
915 | if (!drbd_may_do_local_read(mdev, | ||
916 | req->i.sector, req->i.size)) { | ||
917 | bio_put(req->private_bio); | ||
918 | req->private_bio = NULL; | ||
919 | put_ldev(mdev); | ||
920 | } | ||
921 | } | ||
922 | |||
923 | if (mdev->state.pdsk != D_UP_TO_DATE) | ||
924 | return false; | ||
925 | |||
926 | if (req->private_bio == NULL) | ||
927 | return true; | ||
928 | |||
929 | /* TODO: improve read balancing decisions, take into account drbd | ||
930 | * protocol, pending requests etc. */ | ||
931 | |||
932 | rcu_read_lock(); | ||
933 | rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing; | ||
934 | rcu_read_unlock(); | ||
935 | |||
936 | if (rbm == RB_PREFER_LOCAL && req->private_bio) | ||
937 | return false; /* submit locally */ | ||
938 | |||
939 | if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) { | ||
940 | if (req->private_bio) { | ||
941 | bio_put(req->private_bio); | ||
942 | req->private_bio = NULL; | ||
943 | put_ldev(mdev); | ||
944 | } | ||
945 | return true; | ||
946 | } | ||
947 | |||
948 | return false; | ||
949 | } | ||
950 | |||
951 | /* returns number of connections (== 1, for drbd 8.4) | ||
952 | * expected to actually write this data, | ||
953 | * which does NOT include those that we are L_AHEAD for. */ | ||
954 | static int drbd_process_write_request(struct drbd_request *req) | ||
955 | { | ||
956 | struct drbd_conf *mdev = req->w.mdev; | ||
957 | int remote, send_oos; | ||
958 | |||
959 | rcu_read_lock(); | ||
960 | remote = drbd_should_do_remote(mdev->state); | ||
961 | if (remote) { | ||
962 | maybe_pull_ahead(mdev); | ||
963 | remote = drbd_should_do_remote(mdev->state); | ||
964 | } | ||
965 | send_oos = drbd_should_send_out_of_sync(mdev->state); | ||
966 | rcu_read_unlock(); | ||
967 | |||
968 | /* Need to replicate writes. Unless it is an empty flush, | ||
969 | * which is better mapped to a DRBD P_BARRIER packet, | ||
970 | * also for drbd wire protocol compatibility reasons. | ||
971 | * If this was a flush, just start a new epoch. | ||
972 | * Unless the current epoch was empty anyways, or we are not currently | ||
973 | * replicating, in which case there is no point. */ | ||
974 | if (unlikely(req->i.size == 0)) { | ||
975 | /* The only size==0 bios we expect are empty flushes. */ | ||
976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); | ||
977 | if (remote) | ||
978 | start_new_tl_epoch(mdev->tconn); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | if (!remote && !send_oos) | ||
983 | return 0; | ||
984 | |||
985 | D_ASSERT(!(remote && send_oos)); | ||
986 | |||
987 | if (remote) { | ||
988 | _req_mod(req, TO_BE_SENT); | ||
989 | _req_mod(req, QUEUE_FOR_NET_WRITE); | ||
990 | } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size)) | ||
991 | _req_mod(req, QUEUE_FOR_SEND_OOS); | ||
992 | |||
993 | return remote; | ||
994 | } | ||
995 | |||
996 | static void | ||
997 | drbd_submit_req_private_bio(struct drbd_request *req) | ||
998 | { | ||
999 | struct drbd_conf *mdev = req->w.mdev; | ||
1000 | struct bio *bio = req->private_bio; | ||
1001 | const int rw = bio_rw(bio); | ||
1002 | |||
1003 | bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1004 | |||
1005 | /* State may have changed since we grabbed our reference on the | ||
1006 | * ->ldev member. Double check, and short-circuit to endio. | ||
1007 | * In case the last activity log transaction failed to get on | ||
1008 | * stable storage, and this is a WRITE, we may not even submit | ||
1009 | * this bio. */ | ||
1010 | if (get_ldev(mdev)) { | ||
1011 | if (drbd_insert_fault(mdev, | ||
1012 | rw == WRITE ? DRBD_FAULT_DT_WR | ||
1013 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1014 | : DRBD_FAULT_DT_RA)) | ||
1015 | bio_endio(bio, -EIO); | ||
1016 | else | ||
1017 | generic_make_request(bio); | ||
1018 | put_ldev(mdev); | ||
1019 | } else | ||
1020 | bio_endio(bio, -EIO); | ||
1021 | } | ||
1022 | |||
1023 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
814 | { | 1024 | { |
815 | const int rw = bio_rw(bio); | 1025 | const int rw = bio_rw(bio); |
816 | const int size = bio->bi_size; | 1026 | struct bio_and_error m = { NULL, }; |
817 | const sector_t sector = bio->bi_sector; | ||
818 | struct drbd_tl_epoch *b = NULL; | ||
819 | struct drbd_request *req; | 1027 | struct drbd_request *req; |
820 | int local, remote, send_oos = 0; | 1028 | bool no_remote = false; |
821 | int err = -EIO; | ||
822 | int ret = 0; | ||
823 | union drbd_state s; | ||
824 | 1029 | ||
825 | /* allocate outside of all locks; */ | 1030 | /* allocate outside of all locks; */ |
826 | req = drbd_req_new(mdev, bio); | 1031 | req = drbd_req_new(mdev, bio); |
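
remote_due_to_read_balancing(), added in the hunk above, maps the RB_*_STRIPING policies to a stripe shift (15 for 32K up to 20 for 1M) and sends a read to the peer whenever the sector falls into an odd-numbered stripe. A minimal user-space sketch of that calculation follows; the toy_/TOY_ names stand in for the kernel types and are assumptions of this example, not DRBD code.

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;  /* 512-byte sectors, as in the kernel */

    /* hypothetical subset of the read-balancing policies */
    enum toy_read_balancing {
        TOY_RB_32K_STRIPING,   /* stripe_shift = 15 -> 32 KiB stripes */
        TOY_RB_64K_STRIPING,
        TOY_RB_128K_STRIPING,
        TOY_RB_256K_STRIPING,
        TOY_RB_512K_STRIPING,
        TOY_RB_1M_STRIPING,    /* stripe_shift = 20 -> 1 MiB stripes */
    };

    /* even stripes read locally, odd stripes read from the peer */
    static int toy_read_goes_remote(sector_t sector, enum toy_read_balancing rbm)
    {
        int stripe_shift = rbm - TOY_RB_32K_STRIPING + 15; /* stripe size in bytes, as a shift */
        return (sector >> (stripe_shift - 9)) & 1;         /* -9: convert sectors to stripes */
    }

    int main(void)
    {
        sector_t s;
        for (s = 0; s < 512; s += 64)  /* step one 32 KiB stripe at a time */
            printf("sector %4llu -> %s\n", (unsigned long long)s,
                   toy_read_goes_remote(s, TOY_RB_32K_STRIPING) ? "remote" : "local");
        return 0;
    }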
@@ -830,55 +1035,14 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
830 | * if user cannot handle io errors, that's not our business. */ | 1035 | * if user cannot handle io errors, that's not our business. */ |
831 | dev_err(DEV, "could not kmalloc() req\n"); | 1036 | dev_err(DEV, "could not kmalloc() req\n"); |
832 | bio_endio(bio, -ENOMEM); | 1037 | bio_endio(bio, -ENOMEM); |
833 | return 0; | 1038 | return; |
834 | } | 1039 | } |
835 | req->start_time = start_time; | 1040 | req->start_time = start_time; |
836 | 1041 | ||
837 | local = get_ldev(mdev); | 1042 | if (!get_ldev(mdev)) { |
838 | if (!local) { | 1043 | bio_put(req->private_bio); |
839 | bio_put(req->private_bio); /* or we get a bio leak */ | ||
840 | req->private_bio = NULL; | 1044 | req->private_bio = NULL; |
841 | } | 1045 | } |
842 | if (rw == WRITE) { | ||
843 | /* Need to replicate writes. Unless it is an empty flush, | ||
844 | * which is better mapped to a DRBD P_BARRIER packet, | ||
845 | * also for drbd wire protocol compatibility reasons. */ | ||
846 | if (unlikely(size == 0)) { | ||
847 | /* The only size==0 bios we expect are empty flushes. */ | ||
848 | D_ASSERT(bio->bi_rw & REQ_FLUSH); | ||
849 | remote = 0; | ||
850 | } else | ||
851 | remote = 1; | ||
852 | } else { | ||
853 | /* READ || READA */ | ||
854 | if (local) { | ||
855 | if (!drbd_may_do_local_read(mdev, sector, size)) { | ||
856 | /* we could kick the syncer to | ||
857 | * sync this extent asap, wait for | ||
858 | * it, then continue locally. | ||
859 | * Or just issue the request remotely. | ||
860 | */ | ||
861 | local = 0; | ||
862 | bio_put(req->private_bio); | ||
863 | req->private_bio = NULL; | ||
864 | put_ldev(mdev); | ||
865 | } | ||
866 | } | ||
867 | remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; | ||
868 | } | ||
869 | |||
870 | /* If we have a disk, but a READA request is mapped to remote, | ||
871 | * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. | ||
872 | * Just fail that READA request right here. | ||
873 | * | ||
874 | * THINK: maybe fail all READA when not local? | ||
875 | * or make this configurable... | ||
876 | * if network is slow, READA won't do any good. | ||
877 | */ | ||
878 | if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { | ||
879 | err = -EWOULDBLOCK; | ||
880 | goto fail_and_free_req; | ||
881 | } | ||
882 | 1046 | ||
883 | /* For WRITES going to the local disk, grab a reference on the target | 1047 | /* For WRITES going to the local disk, grab a reference on the target |
884 | * extent. This waits for any resync activity in the corresponding | 1048 | * extent. This waits for any resync activity in the corresponding |
@@ -887,348 +1051,131 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
887 | * of transactional on-disk meta data updates. | 1051 | * of transactional on-disk meta data updates. |
888 | * Empty flushes don't need to go into the activity log, they can only | 1052 | * Empty flushes don't need to go into the activity log, they can only |
889 | * flush data for pending writes which are already in there. */ | 1053 | * flush data for pending writes which are already in there. */ |
890 | if (rw == WRITE && local && size | 1054 | if (rw == WRITE && req->private_bio && req->i.size |
891 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 1055 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { |
892 | req->rq_state |= RQ_IN_ACT_LOG; | 1056 | req->rq_state |= RQ_IN_ACT_LOG; |
893 | drbd_al_begin_io(mdev, sector); | 1057 | drbd_al_begin_io(mdev, &req->i); |
894 | } | ||
895 | |||
896 | s = mdev->state; | ||
897 | remote = remote && drbd_should_do_remote(s); | ||
898 | send_oos = rw == WRITE && drbd_should_send_oos(s); | ||
899 | D_ASSERT(!(remote && send_oos)); | ||
900 | |||
901 | if (!(local || remote) && !is_susp(mdev->state)) { | ||
902 | if (__ratelimit(&drbd_ratelimit_state)) | ||
903 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
904 | goto fail_free_complete; | ||
905 | } | 1058 | } |
906 | 1059 | ||
907 | /* For WRITE request, we have to make sure that we have an | 1060 | spin_lock_irq(&mdev->tconn->req_lock); |
908 | * unused_spare_tle, in case we need to start a new epoch. | 1061 | if (rw == WRITE) { |
909 | * I try to be smart and avoid to pre-allocate always "just in case", | 1062 | /* This may temporarily give up the req_lock, |
910 | * but there is a race between testing the bit and pointer outside the | 1063 | * but will re-acquire it before it returns here. |
911 | * spinlock, and grabbing the spinlock. | 1064 | * Needs to be before the check on drbd_suspended() */ |
912 | * if we lost that race, we retry. */ | 1065 | complete_conflicting_writes(req); |
913 | if (rw == WRITE && (remote || send_oos) && | ||
914 | mdev->unused_spare_tle == NULL && | ||
915 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
916 | allocate_barrier: | ||
917 | b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); | ||
918 | if (!b) { | ||
919 | dev_err(DEV, "Failed to alloc barrier.\n"); | ||
920 | err = -ENOMEM; | ||
921 | goto fail_free_complete; | ||
922 | } | ||
923 | } | 1066 | } |
924 | 1067 | ||
925 | /* GOOD, everything prepared, grab the spin_lock */ | 1068 | /* no more giving up req_lock from now on! */ |
926 | spin_lock_irq(&mdev->req_lock); | ||
927 | |||
928 | if (is_susp(mdev->state)) { | ||
929 | /* If we got suspended, use the retry mechanism of | ||
930 | drbd_make_request() to restart processing of this | ||
931 | bio. In the next call to drbd_make_request | ||
932 | we sleep in inc_ap_bio() */ | ||
933 | ret = 1; | ||
934 | spin_unlock_irq(&mdev->req_lock); | ||
935 | goto fail_free_complete; | ||
936 | } | ||
937 | 1069 | ||
938 | if (remote || send_oos) { | 1070 | if (drbd_suspended(mdev)) { |
939 | remote = drbd_should_do_remote(mdev->state); | 1071 | /* push back and retry: */ |
940 | send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); | 1072 | req->rq_state |= RQ_POSTPONED; |
941 | D_ASSERT(!(remote && send_oos)); | 1073 | if (req->private_bio) { |
942 | 1074 | bio_put(req->private_bio); | |
943 | if (!(remote || send_oos)) | 1075 | req->private_bio = NULL; |
944 | dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); | 1076 | put_ldev(mdev); |
945 | if (!(local || remote)) { | ||
946 | dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); | ||
947 | spin_unlock_irq(&mdev->req_lock); | ||
948 | goto fail_free_complete; | ||
949 | } | 1077 | } |
1078 | goto out; | ||
950 | } | 1079 | } |
951 | 1080 | ||
952 | if (b && mdev->unused_spare_tle == NULL) { | ||
953 | mdev->unused_spare_tle = b; | ||
954 | b = NULL; | ||
955 | } | ||
956 | if (rw == WRITE && (remote || send_oos) && | ||
957 | mdev->unused_spare_tle == NULL && | ||
958 | test_bit(CREATE_BARRIER, &mdev->flags)) { | ||
959 | /* someone closed the current epoch | ||
960 | * while we were grabbing the spinlock */ | ||
961 | spin_unlock_irq(&mdev->req_lock); | ||
962 | goto allocate_barrier; | ||
963 | } | ||
964 | |||
965 | |||
966 | /* Update disk stats */ | 1081 | /* Update disk stats */ |
967 | _drbd_start_io_acct(mdev, req, bio); | 1082 | _drbd_start_io_acct(mdev, req, bio); |
968 | 1083 | ||
969 | /* _maybe_start_new_epoch(mdev); | 1084 | /* We fail READ/READA early, if we can not serve it. |
970 | * If we need to generate a write barrier packet, we have to add the | 1085 | * We must do this before req is registered on any lists. |
971 | * new epoch (barrier) object, and queue the barrier packet for sending, | 1086 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ |
972 | * and queue the req's data after it _within the same lock_, otherwise | 1087 | if (rw != WRITE) { |
973 | * we have race conditions were the reorder domains could be mixed up. | 1088 | if (!do_remote_read(req) && !req->private_bio) |
974 | * | 1089 | goto nodata; |
975 | * Even read requests may start a new epoch and queue the corresponding | ||
976 | * barrier packet. To get the write ordering right, we only have to | ||
977 | * make sure that, if this is a write request and it triggered a | ||
978 | * barrier packet, this request is queued within the same spinlock. */ | ||
979 | if ((remote || send_oos) && mdev->unused_spare_tle && | ||
980 | test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { | ||
981 | _tl_add_barrier(mdev, mdev->unused_spare_tle); | ||
982 | mdev->unused_spare_tle = NULL; | ||
983 | } else { | ||
984 | D_ASSERT(!(remote && rw == WRITE && | ||
985 | test_bit(CREATE_BARRIER, &mdev->flags))); | ||
986 | } | 1090 | } |
987 | 1091 | ||
988 | /* NOTE | 1092 | /* which transfer log epoch does this belong to? */ |
989 | * Actually, 'local' may be wrong here already, since we may have failed | 1093 | req->epoch = atomic_read(&mdev->tconn->current_tle_nr); |
990 | * to write to the meta data, and may become wrong anytime because of | ||
991 | * local io-error for some other request, which would lead to us | ||
992 | * "detaching" the local disk. | ||
993 | * | ||
994 | * 'remote' may become wrong any time because the network could fail. | ||
995 | * | ||
996 | * This is a harmless race condition, though, since it is handled | ||
997 | * correctly at the appropriate places; so it just defers the failure | ||
998 | * of the respective operation. | ||
999 | */ | ||
1000 | |||
1001 | /* mark them early for readability. | ||
1002 | * this just sets some state flags. */ | ||
1003 | if (remote) | ||
1004 | _req_mod(req, to_be_send); | ||
1005 | if (local) | ||
1006 | _req_mod(req, to_be_submitted); | ||
1007 | |||
1008 | /* check this request on the collision detection hash tables. | ||
1009 | * if we have a conflict, just complete it here. | ||
1010 | * THINK do we want to check reads, too? (I don't think so...) */ | ||
1011 | if (rw == WRITE && _req_conflicts(req)) | ||
1012 | goto fail_conflicting; | ||
1013 | 1094 | ||
1014 | /* no point in adding empty flushes to the transfer log, | 1095 | /* no point in adding empty flushes to the transfer log, |
1015 | * they are mapped to drbd barriers already. */ | 1096 | * they are mapped to drbd barriers already. */ |
1016 | if (likely(size!=0)) | 1097 | if (likely(req->i.size!=0)) { |
1017 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | 1098 | if (rw == WRITE) |
1099 | mdev->tconn->current_tle_writes++; | ||
1018 | 1100 | ||
1019 | /* NOTE remote first: to get the concurrent write detection right, | 1101 | list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log); |
1020 | * we must register the request before start of local IO. */ | ||
1021 | if (remote) { | ||
1022 | /* either WRITE and C_CONNECTED, | ||
1023 | * or READ, and no local disk, | ||
1024 | * or READ, but not in sync. | ||
1025 | */ | ||
1026 | _req_mod(req, (rw == WRITE) | ||
1027 | ? queue_for_net_write | ||
1028 | : queue_for_net_read); | ||
1029 | } | 1102 | } |
1030 | if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) | ||
1031 | _req_mod(req, queue_for_send_oos); | ||
1032 | 1103 | ||
1033 | if (remote && | 1104 | if (rw == WRITE) { |
1034 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) | 1105 | if (!drbd_process_write_request(req)) |
1035 | maybe_pull_ahead(mdev); | 1106 | no_remote = true; |
1036 | 1107 | } else { | |
1037 | /* If this was a flush, queue a drbd barrier/start a new epoch. | 1108 | /* We either have a private_bio, or we can read from remote. |
1038 | * Unless the current epoch was empty anyways, or we are not currently | 1109 | * Otherwise we had done the goto nodata above. */ |
1039 | * replicating, in which case there is no point. */ | 1110 | if (req->private_bio == NULL) { |
1040 | if (unlikely(bio->bi_rw & REQ_FLUSH) | 1111 | _req_mod(req, TO_BE_SENT); |
1041 | && mdev->newest_tle->n_writes | 1112 | _req_mod(req, QUEUE_FOR_NET_READ); |
1042 | && drbd_should_do_remote(mdev->state)) | ||
1043 | queue_barrier(mdev); | ||
1044 | |||
1045 | spin_unlock_irq(&mdev->req_lock); | ||
1046 | kfree(b); /* if someone else has beaten us to it... */ | ||
1047 | |||
1048 | if (local) { | ||
1049 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | ||
1050 | |||
1051 | /* State may have changed since we grabbed our reference on the | ||
1052 | * mdev->ldev member. Double check, and short-circuit to endio. | ||
1053 | * In case the last activity log transaction failed to get on | ||
1054 | * stable storage, and this is a WRITE, we may not even submit | ||
1055 | * this bio. */ | ||
1056 | if (get_ldev(mdev)) { | ||
1057 | if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR | ||
1058 | : rw == READ ? DRBD_FAULT_DT_RD | ||
1059 | : DRBD_FAULT_DT_RA)) | ||
1060 | bio_endio(req->private_bio, -EIO); | ||
1061 | else | ||
1062 | generic_make_request(req->private_bio); | ||
1063 | put_ldev(mdev); | ||
1064 | } else | 1113 | } else |
1065 | bio_endio(req->private_bio, -EIO); | 1114 | no_remote = true; |
1066 | } | 1115 | } |
1067 | 1116 | ||
1068 | return 0; | 1117 | if (req->private_bio) { |
1069 | 1118 | /* needs to be marked within the same spinlock */ | |
1070 | fail_conflicting: | 1119 | _req_mod(req, TO_BE_SUBMITTED); |
1071 | /* this is a conflicting request. | 1120 | /* but we need to give up the spinlock to submit */ |
1072 | * even though it may have been only _partially_ | 1121 | spin_unlock_irq(&mdev->tconn->req_lock); |
1073 | * overlapping with one of the currently pending requests, | 1122 | drbd_submit_req_private_bio(req); |
1074 | * without even submitting or sending it, we will | 1123 | spin_lock_irq(&mdev->tconn->req_lock); |
1075 | * pretend that it was successfully served right now. | 1124 | } else if (no_remote) { |
1076 | */ | 1125 | nodata: |
1077 | _drbd_end_io_acct(mdev, req); | 1126 | if (__ratelimit(&drbd_ratelimit_state)) |
1078 | spin_unlock_irq(&mdev->req_lock); | 1127 | dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", |
1079 | if (remote) | 1128 | (unsigned long long)req->i.sector, req->i.size >> 9); |
1080 | dec_ap_pending(mdev); | 1129 | /* A write may have been queued for send_oos, however. |
1081 | /* THINK: do we want to fail it (-EIO), or pretend success? | 1130 | * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ |
1082 | * this pretends success. */ | ||
1083 | err = 0; | ||
1084 | |||
1085 | fail_free_complete: | ||
1086 | if (req->rq_state & RQ_IN_ACT_LOG) | ||
1087 | drbd_al_complete_io(mdev, sector); | ||
1088 | fail_and_free_req: | ||
1089 | if (local) { | ||
1090 | bio_put(req->private_bio); | ||
1091 | req->private_bio = NULL; | ||
1092 | put_ldev(mdev); | ||
1093 | } | 1131 | } |
1094 | if (!ret) | ||
1095 | bio_endio(bio, err); | ||
1096 | |||
1097 | drbd_req_free(req); | ||
1098 | dec_ap_bio(mdev); | ||
1099 | kfree(b); | ||
1100 | |||
1101 | return ret; | ||
1102 | } | ||
1103 | 1132 | ||
1104 | /* helper function for drbd_make_request | 1133 | out: |
1105 | * if we can determine just by the mdev (state) that this request will fail, | 1134 | if (drbd_req_put_completion_ref(req, &m, 1)) |
1106 | * return 1 | 1135 | kref_put(&req->kref, drbd_req_destroy); |
1107 | * otherwise return 0 | 1136 | spin_unlock_irq(&mdev->tconn->req_lock); |
1108 | */ | ||
1109 | static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) | ||
1110 | { | ||
1111 | if (mdev->state.role != R_PRIMARY && | ||
1112 | (!allow_oos || is_write)) { | ||
1113 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1114 | dev_err(DEV, "Process %s[%u] tried to %s; " | ||
1115 | "since we are not in Primary state, " | ||
1116 | "we cannot allow this\n", | ||
1117 | current->comm, current->pid, | ||
1118 | is_write ? "WRITE" : "READ"); | ||
1119 | } | ||
1120 | return 1; | ||
1121 | } | ||
1122 | 1137 | ||
1123 | return 0; | 1138 | if (m.bio) |
1139 | complete_master_bio(mdev, &m); | ||
1140 | return; | ||
1124 | } | 1141 | } |
1125 | 1142 | ||
1126 | void drbd_make_request(struct request_queue *q, struct bio *bio) | 1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) |
1127 | { | 1144 | { |
1128 | unsigned int s_enr, e_enr; | ||
1129 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1145 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1130 | unsigned long start_time; | 1146 | unsigned long start_time; |
1131 | 1147 | ||
1132 | if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { | ||
1133 | bio_endio(bio, -EPERM); | ||
1134 | return; | ||
1135 | } | ||
1136 | |||
1137 | start_time = jiffies; | 1148 | start_time = jiffies; |
1138 | 1149 | ||
1139 | /* | 1150 | /* |
1140 | * what we "blindly" assume: | 1151 | * what we "blindly" assume: |
1141 | */ | 1152 | */ |
1142 | D_ASSERT((bio->bi_size & 0x1ff) == 0); | 1153 | D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); |
1143 | |||
1144 | /* to make some things easier, force alignment of requests within the | ||
1145 | * granularity of our hash tables */ | ||
1146 | s_enr = bio->bi_sector >> HT_SHIFT; | ||
1147 | e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; | ||
1148 | |||
1149 | if (likely(s_enr == e_enr)) { | ||
1150 | do { | ||
1151 | inc_ap_bio(mdev, 1); | ||
1152 | } while (drbd_make_request_common(mdev, bio, start_time)); | ||
1153 | return; | ||
1154 | } | ||
1155 | |||
1156 | /* can this bio be split generically? | ||
1157 | * Maybe add our own split-arbitrary-bios function. */ | ||
1158 | if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) { | ||
1159 | /* rather error out here than BUG in bio_split */ | ||
1160 | dev_err(DEV, "bio would need to, but cannot, be split: " | ||
1161 | "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", | ||
1162 | bio->bi_vcnt, bio->bi_idx, bio->bi_size, | ||
1163 | (unsigned long long)bio->bi_sector); | ||
1164 | bio_endio(bio, -EINVAL); | ||
1165 | } else { | ||
1166 | /* This bio crosses some boundary, so we have to split it. */ | ||
1167 | struct bio_pair *bp; | ||
1168 | /* works for the "do not cross hash slot boundaries" case | ||
1169 | * e.g. sector 262269, size 4096 | ||
1170 | * s_enr = 262269 >> 6 = 4097 | ||
1171 | * e_enr = (262269+8-1) >> 6 = 4098 | ||
1172 | * HT_SHIFT = 6 | ||
1173 | * sps = 64, mask = 63 | ||
1174 | * first_sectors = 64 - (262269 & 63) = 3 | ||
1175 | */ | ||
1176 | const sector_t sect = bio->bi_sector; | ||
1177 | const int sps = 1 << HT_SHIFT; /* sectors per slot */ | ||
1178 | const int mask = sps - 1; | ||
1179 | const sector_t first_sectors = sps - (sect & mask); | ||
1180 | bp = bio_split(bio, first_sectors); | ||
1181 | 1154 | ||
1182 | /* we need to get a "reference count" (ap_bio_cnt) | 1155 | inc_ap_bio(mdev); |
1183 | * to avoid races with the disconnect/reconnect/suspend code. | 1156 | __drbd_make_request(mdev, bio, start_time); |
1184 | * In case we need to split the bio here, we need to get three references | ||
1185 | * atomically, otherwise we might deadlock when trying to submit the | ||
1186 | * second one! */ | ||
1187 | inc_ap_bio(mdev, 3); | ||
1188 | |||
1189 | D_ASSERT(e_enr == s_enr + 1); | ||
1190 | |||
1191 | while (drbd_make_request_common(mdev, &bp->bio1, start_time)) | ||
1192 | inc_ap_bio(mdev, 1); | ||
1193 | |||
1194 | while (drbd_make_request_common(mdev, &bp->bio2, start_time)) | ||
1195 | inc_ap_bio(mdev, 1); | ||
1196 | |||
1197 | dec_ap_bio(mdev); | ||
1198 | |||
1199 | bio_pair_release(bp); | ||
1200 | } | ||
1201 | } | 1157 | } |
1202 | 1158 | ||
1203 | /* This is called by bio_add_page(). With this function we reduce | 1159 | /* This is called by bio_add_page(). |
1204 | * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs | 1160 | * |
1205 | * units (was AL_EXTENTs). | 1161 | * q->max_hw_sectors and other global limits are already enforced there. |
1206 | * | 1162 | * |
1207 | * we do the calculation within the lower 32bit of the byte offsets, | 1163 | * We need to call down to our lower level device, |
1208 | * since we don't care for actual offset, but only check whether it | 1164 | * in case it has special restrictions. |
1209 | * would cross "activity log extent" boundaries. | 1165 | * |
1166 | * We also may need to enforce configured max-bio-bvecs limits. | ||
1210 | * | 1167 | * |
1211 | * As long as the BIO is empty we have to allow at least one bvec, | 1168 | * As long as the BIO is empty we have to allow at least one bvec, |
1212 | * regardless of size and offset. so the resulting bio may still | 1169 | * regardless of size and offset, so no need to ask lower levels. |
1213 | * cross extent boundaries. those are dealt with (bio_split) in | ||
1214 | * drbd_make_request. | ||
1215 | */ | 1170 | */ |
1216 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) | 1171 | int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) |
1217 | { | 1172 | { |
1218 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; | 1173 | struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; |
1219 | unsigned int bio_offset = | ||
1220 | (unsigned int)bvm->bi_sector << 9; /* 32 bit */ | ||
1221 | unsigned int bio_size = bvm->bi_size; | 1174 | unsigned int bio_size = bvm->bi_size; |
1222 | int limit, backing_limit; | 1175 | int limit = DRBD_MAX_BIO_SIZE; |
1223 | 1176 | int backing_limit; | |
1224 | limit = DRBD_MAX_BIO_SIZE | 1177 | |
1225 | - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size); | 1178 | if (bio_size && get_ldev(mdev)) { |
1226 | if (limit < 0) | ||
1227 | limit = 0; | ||
1228 | if (bio_size == 0) { | ||
1229 | if (limit <= bvec->bv_len) | ||
1230 | limit = bvec->bv_len; | ||
1231 | } else if (limit && get_ldev(mdev)) { | ||
1232 | struct request_queue * const b = | 1179 | struct request_queue * const b = |
1233 | mdev->ldev->backing_bdev->bd_disk->queue; | 1180 | mdev->ldev->backing_bdev->bd_disk->queue; |
1234 | if (b->merge_bvec_fn) { | 1181 | if (b->merge_bvec_fn) { |
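
The rewritten __drbd_make_request() in the hunk above ends by dropping a completion reference and, only when that was the last one, a kref on the request object (drbd_req_put_completion_ref() / kref_put() with drbd_req_destroy). A rough stand-alone sketch of that two-counter idea, with invented toy_* names and no locking; it models the concept only, not DRBD's actual accounting.

    #include <stdio.h>

    /* toy request with two counters, loosely modeled on the completion_ref / kref pair */
    struct toy_req {
        int completion_ref; /* paths that still have to finish their part of the I/O */
        int kref;           /* holders of a pointer to the request object */
    };

    static void toy_destroy(struct toy_req *r)
    {
        printf("request %p destroyed\n", (void *)r);
    }

    /* returns 1 when the last completion reference is dropped,
     * i.e. the master bio may be completed now */
    static int toy_put_completion_ref(struct toy_req *r, int put)
    {
        r->completion_ref -= put;
        return r->completion_ref == 0;
    }

    static void toy_kref_put(struct toy_req *r)
    {
        if (--r->kref == 0)
            toy_destroy(r);
    }

    int main(void)
    {
        struct toy_req r = { .completion_ref = 2, .kref = 2 };

        /* the local-disk path finishes first: one completion ref gone,
         * but the network path still holds one, so nothing completes yet */
        if (toy_put_completion_ref(&r, 1))
            toy_kref_put(&r);

        /* the network ack arrives: last completion ref gone,
         * complete the master bio and drop that path's object reference */
        if (toy_put_completion_ref(&r, 1)) {
            printf("complete master bio\n");
            toy_kref_put(&r);
        }

        /* a remaining holder (think: the transfer log) drops its reference later;
         * the object is destroyed once the last kref is gone */
        toy_kref_put(&r);
        return 0;
    }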
@@ -1240,24 +1187,38 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1240 | return limit; | 1187 | return limit; |
1241 | } | 1188 | } |
1242 | 1189 | ||
1190 | struct drbd_request *find_oldest_request(struct drbd_tconn *tconn) | ||
1191 | { | ||
1192 | /* Walk the transfer log, | ||
1193 | * and find the oldest not yet completed request */ | ||
1194 | struct drbd_request *r; | ||
1195 | list_for_each_entry(r, &tconn->transfer_log, tl_requests) { | ||
1196 | if (atomic_read(&r->completion_ref)) | ||
1197 | return r; | ||
1198 | } | ||
1199 | return NULL; | ||
1200 | } | ||
1201 | |||
1243 | void request_timer_fn(unsigned long data) | 1202 | void request_timer_fn(unsigned long data) |
1244 | { | 1203 | { |
1245 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 1204 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
1205 | struct drbd_tconn *tconn = mdev->tconn; | ||
1246 | struct drbd_request *req; /* oldest request */ | 1206 | struct drbd_request *req; /* oldest request */ |
1247 | struct list_head *le; | 1207 | struct net_conf *nc; |
1248 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ | 1208 | unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ |
1249 | unsigned long now; | 1209 | unsigned long now; |
1250 | 1210 | ||
1251 | if (get_net_conf(mdev)) { | 1211 | rcu_read_lock(); |
1252 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) | 1212 | nc = rcu_dereference(tconn->net_conf); |
1253 | ent = mdev->net_conf->timeout*HZ/10 | 1213 | if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS) |
1254 | * mdev->net_conf->ko_count; | 1214 | ent = nc->timeout * HZ/10 * nc->ko_count; |
1255 | put_net_conf(mdev); | 1215 | |
1256 | } | ||
1257 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ | 1216 | if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ |
1258 | dt = mdev->ldev->dc.disk_timeout * HZ / 10; | 1217 | dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10; |
1259 | put_ldev(mdev); | 1218 | put_ldev(mdev); |
1260 | } | 1219 | } |
1220 | rcu_read_unlock(); | ||
1221 | |||
1261 | et = min_not_zero(dt, ent); | 1222 | et = min_not_zero(dt, ent); |
1262 | 1223 | ||
1263 | if (!et) | 1224 | if (!et) |
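
request_timer_fn() above derives its effective timer period as min_not_zero() of the network timeout (ko_count * timeout, configured in tenths of a second) and the local disk timeout, ignoring whichever is configured as zero. A small sketch of that computation; the HZ value and the sample settings are assumptions made for the example.

    #include <stdio.h>

    #define HZ 250  /* assumed jiffies rate for this sketch */

    /* like the kernel's min_not_zero(): the smaller of two values,
     * ignoring any that are zero ("disabled") */
    static unsigned long min_not_zero_ul(unsigned long a, unsigned long b)
    {
        if (a == 0)
            return b;
        if (b == 0)
            return a;
        return a < b ? a : b;
    }

    int main(void)
    {
        /* configuration values in tenths of a second, as in net_conf/disk_conf */
        unsigned long net_timeout  = 60;  /* 6.0 s */
        unsigned long ko_count     = 7;
        unsigned long disk_timeout = 0;   /* disk timeout disabled */

        unsigned long ent = net_timeout * HZ / 10 * ko_count; /* network: ko_count * timeout */
        unsigned long dt  = disk_timeout * HZ / 10;           /* local disk timeout */
        unsigned long et  = min_not_zero_ul(dt, ent);         /* effective timer period */

        if (!et)
            printf("both timeouts disabled, timer not re-armed\n");
        else
            printf("effective timeout: %lu jiffies (%.1f s)\n", et, (double)et / HZ);
        return 0;
    }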
@@ -1265,17 +1226,14 @@ void request_timer_fn(unsigned long data) | |||
1265 | 1226 | ||
1266 | now = jiffies; | 1227 | now = jiffies; |
1267 | 1228 | ||
1268 | spin_lock_irq(&mdev->req_lock); | 1229 | spin_lock_irq(&tconn->req_lock); |
1269 | le = &mdev->oldest_tle->requests; | 1230 | req = find_oldest_request(tconn); |
1270 | if (list_empty(le)) { | 1231 | if (!req) { |
1271 | spin_unlock_irq(&mdev->req_lock); | 1232 | spin_unlock_irq(&tconn->req_lock); |
1272 | mod_timer(&mdev->request_timer, now + et); | 1233 | mod_timer(&mdev->request_timer, now + et); |
1273 | return; | 1234 | return; |
1274 | } | 1235 | } |
1275 | 1236 | ||
1276 | le = le->prev; | ||
1277 | req = list_entry(le, struct drbd_request, tl_requests); | ||
1278 | |||
1279 | /* The request is considered timed out, if | 1237 | /* The request is considered timed out, if |
1280 | * - we have some effective timeout from the configuration, | 1238 | * - we have some effective timeout from the configuration, |
1281 | * with above state restrictions applied, | 1239 | * with above state restrictions applied, |
@@ -1294,17 +1252,17 @@ void request_timer_fn(unsigned long data) | |||
1294 | */ | 1252 | */ |
1295 | if (ent && req->rq_state & RQ_NET_PENDING && | 1253 | if (ent && req->rq_state & RQ_NET_PENDING && |
1296 | time_after(now, req->start_time + ent) && | 1254 | time_after(now, req->start_time + ent) && |
1297 | !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { | 1255 | !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) { |
1298 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); | 1256 | dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); |
1299 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); | 1257 | _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); |
1300 | } | 1258 | } |
1301 | if (dt && req->rq_state & RQ_LOCAL_PENDING && | 1259 | if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev && |
1302 | time_after(now, req->start_time + dt) && | 1260 | time_after(now, req->start_time + dt) && |
1303 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { | 1261 | !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { |
1304 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); | 1262 | dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); |
1305 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); | 1263 | __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); |
1306 | } | 1264 | } |
1307 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; | 1265 | nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; |
1308 | spin_unlock_irq(&mdev->req_lock); | 1266 | spin_unlock_irq(&tconn->req_lock); |
1309 | mod_timer(&mdev->request_timer, nt); | 1267 | mod_timer(&mdev->request_timer, nt); |
1310 | } | 1268 | } |
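
The handler above arms the timer from two independently configured limits: the network timeout scaled by ko-count (both taken from net_conf, in deciseconds, hence the "* HZ/10") and the local disk-timeout from disk_conf, keeping whichever is configured (non-zero) and smaller. A minimal userspace sketch of that combination; the min_not_zero() helper is re-implemented here, and HZ and the configuration values are illustrative assumptions only:

    #include <stdio.h>

    /* Sketch of how request_timer_fn() derives the effective timeout "et"
     * from "ent" (network timeout * ko-count) and "dt" (disk-timeout). */
    static unsigned long min_not_zero(unsigned long a, unsigned long b)
    {
            if (a == 0)
                    return b;
            if (b == 0)
                    return a;
            return a < b ? a : b;
    }

    int main(void)
    {
            const unsigned long HZ = 250;          /* example tick rate */
            unsigned long ent = 60 * HZ / 10 * 7;  /* timeout = 6.0 s, ko-count = 7 */
            unsigned long dt  = 300 * HZ / 10;     /* disk-timeout = 30.0 s */
            unsigned long et  = min_not_zero(dt, ent);

            printf("effective timeout: %lu jiffies (%.1f s)\n", et, (double)et / HZ);
            return 0;
    }

With these example values the disk-timeout (30 s) is smaller, so the request timer would fire no later than 30 s after the oldest request was started.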
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 3d2111919486..016de6b8bb57 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -77,40 +77,41 @@ | |||
77 | */ | 77 | */ |
78 | 78 | ||
79 | enum drbd_req_event { | 79 | enum drbd_req_event { |
80 | created, | 80 | CREATED, |
81 | to_be_send, | 81 | TO_BE_SENT, |
82 | to_be_submitted, | 82 | TO_BE_SUBMITTED, |
83 | 83 | ||
84 | /* XXX yes, now I am inconsistent... | 84 | /* XXX yes, now I am inconsistent... |
85 | * these are not "events" but "actions" | 85 | * these are not "events" but "actions" |
86 | * oh, well... */ | 86 | * oh, well... */ |
87 | queue_for_net_write, | 87 | QUEUE_FOR_NET_WRITE, |
88 | queue_for_net_read, | 88 | QUEUE_FOR_NET_READ, |
89 | queue_for_send_oos, | 89 | QUEUE_FOR_SEND_OOS, |
90 | 90 | ||
91 | send_canceled, | 91 | SEND_CANCELED, |
92 | send_failed, | 92 | SEND_FAILED, |
93 | handed_over_to_network, | 93 | HANDED_OVER_TO_NETWORK, |
94 | oos_handed_to_network, | 94 | OOS_HANDED_TO_NETWORK, |
95 | connection_lost_while_pending, | 95 | CONNECTION_LOST_WHILE_PENDING, |
96 | read_retry_remote_canceled, | 96 | READ_RETRY_REMOTE_CANCELED, |
97 | recv_acked_by_peer, | 97 | RECV_ACKED_BY_PEER, |
98 | write_acked_by_peer, | 98 | WRITE_ACKED_BY_PEER, |
99 | write_acked_by_peer_and_sis, /* and set_in_sync */ | 99 | WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ |
100 | conflict_discarded_by_peer, | 100 | CONFLICT_RESOLVED, |
101 | neg_acked, | 101 | POSTPONE_WRITE, |
102 | barrier_acked, /* in protocol A and B */ | 102 | NEG_ACKED, |
103 | data_received, /* (remote read) */ | 103 | BARRIER_ACKED, /* in protocol A and B */ |
104 | 104 | DATA_RECEIVED, /* (remote read) */ | |
105 | read_completed_with_error, | 105 | |
106 | read_ahead_completed_with_error, | 106 | READ_COMPLETED_WITH_ERROR, |
107 | write_completed_with_error, | 107 | READ_AHEAD_COMPLETED_WITH_ERROR, |
108 | abort_disk_io, | 108 | WRITE_COMPLETED_WITH_ERROR, |
109 | completed_ok, | 109 | ABORT_DISK_IO, |
110 | resend, | 110 | COMPLETED_OK, |
111 | fail_frozen_disk_io, | 111 | RESEND, |
112 | restart_frozen_disk_io, | 112 | FAIL_FROZEN_DISK_IO, |
113 | nothing, /* for tracing only */ | 113 | RESTART_FROZEN_DISK_IO, |
114 | NOTHING, | ||
114 | }; | 115 | }; |
115 | 116 | ||
116 | /* encoding of request states for now. we don't actually need that many bits. | 117 | /* encoding of request states for now. we don't actually need that many bits. |
@@ -142,8 +143,8 @@ enum drbd_req_state_bits { | |||
142 | * recv_ack (B) or implicit "ack" (A), | 143 | * recv_ack (B) or implicit "ack" (A), |
143 | * still waiting for the barrier ack. | 144 | * still waiting for the barrier ack. |
144 | * master_bio may already be completed and invalidated. | 145 | * master_bio may already be completed and invalidated. |
145 | * 11100: write_acked (C), | 146 | * 11100: write acked (C), |
146 | * data_received (for remote read, any protocol) | 147 | * data received (for remote read, any protocol) |
147 | * or finally the barrier ack has arrived (B,A)... | 148 | * or finally the barrier ack has arrived (B,A)... |
148 | * request can be freed | 149 | * request can be freed |
149 | * 01100: neg-acked (write, protocol C) | 150 | * 01100: neg-acked (write, protocol C) |
@@ -198,6 +199,22 @@ enum drbd_req_state_bits { | |||
198 | 199 | ||
199 | /* Should call drbd_al_complete_io() for this request... */ | 200 | /* Should call drbd_al_complete_io() for this request... */ |
200 | __RQ_IN_ACT_LOG, | 201 | __RQ_IN_ACT_LOG, |
202 | |||
203 | /* The peer has sent a retry ACK */ | ||
204 | __RQ_POSTPONED, | ||
205 | |||
206 | /* would have been completed, | ||
207 | * but was not, because of drbd_suspended() */ | ||
208 | __RQ_COMPLETION_SUSP, | ||
209 | |||
210 | /* We expect a receive ACK (wire proto B) */ | ||
211 | __RQ_EXP_RECEIVE_ACK, | ||
212 | |||
213 | /* We expect a write ACK (wire proto C) */ | ||
214 | __RQ_EXP_WRITE_ACK, | ||
215 | |||
216 | /* waiting for a barrier ack, did an extra kref_get */ | ||
217 | __RQ_EXP_BARR_ACK, | ||
201 | }; | 218 | }; |
202 | 219 | ||
203 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) | 220 | #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) |
@@ -219,56 +236,16 @@ enum drbd_req_state_bits { | |||
219 | 236 | ||
220 | #define RQ_WRITE (1UL << __RQ_WRITE) | 237 | #define RQ_WRITE (1UL << __RQ_WRITE) |
221 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) | 238 | #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) |
239 | #define RQ_POSTPONED (1UL << __RQ_POSTPONED) | ||
240 | #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) | ||
241 | #define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) | ||
242 | #define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) | ||
243 | #define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) | ||
222 | 244 | ||
223 | /* For waking up the frozen transfer log, mod_req() has to return whether the request | 245 | /* For waking up the frozen transfer log, mod_req() has to return whether the request |
224 | should be counted in the epoch object */ | 246 | should be counted in the epoch object */ |
225 | #define MR_WRITE_SHIFT 0 | 247 | #define MR_WRITE 1 |
226 | #define MR_WRITE (1 << MR_WRITE_SHIFT) | 248 | #define MR_READ 2 |
227 | #define MR_READ_SHIFT 1 | ||
228 | #define MR_READ (1 << MR_READ_SHIFT) | ||
229 | |||
230 | /* epoch entries */ | ||
231 | static inline | ||
232 | struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
233 | { | ||
234 | BUG_ON(mdev->ee_hash_s == 0); | ||
235 | return mdev->ee_hash + | ||
236 | ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); | ||
237 | } | ||
238 | |||
239 | /* transfer log (drbd_request objects) */ | ||
240 | static inline | ||
241 | struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
242 | { | ||
243 | BUG_ON(mdev->tl_hash_s == 0); | ||
244 | return mdev->tl_hash + | ||
245 | ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); | ||
246 | } | ||
247 | |||
248 | /* application reads (drbd_request objects) */ | ||
249 | static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) | ||
250 | { | ||
251 | return mdev->app_reads_hash | ||
252 | + ((unsigned int)(sector) % APP_R_HSIZE); | ||
253 | } | ||
254 | |||
255 | /* when we receive the answer for a read request, | ||
256 | * verify that we actually know about it */ | ||
257 | static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, | ||
258 | u64 id, sector_t sector) | ||
259 | { | ||
260 | struct hlist_head *slot = ar_hash_slot(mdev, sector); | ||
261 | struct hlist_node *n; | ||
262 | struct drbd_request *req; | ||
263 | |||
264 | hlist_for_each_entry(req, n, slot, collision) { | ||
265 | if ((unsigned long)req == (unsigned long)id) { | ||
266 | D_ASSERT(req->sector == sector); | ||
267 | return req; | ||
268 | } | ||
269 | } | ||
270 | return NULL; | ||
271 | } | ||
272 | 249 | ||
273 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) | 250 | static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) |
274 | { | 251 | { |
@@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi | |||
278 | req->private_bio = bio; | 255 | req->private_bio = bio; |
279 | 256 | ||
280 | bio->bi_private = req; | 257 | bio->bi_private = req; |
281 | bio->bi_end_io = drbd_endio_pri; | 258 | bio->bi_end_io = drbd_request_endio; |
282 | bio->bi_next = NULL; | 259 | bio->bi_next = NULL; |
283 | } | 260 | } |
284 | 261 | ||
285 | static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, | ||
286 | struct bio *bio_src) | ||
287 | { | ||
288 | struct drbd_request *req = | ||
289 | mempool_alloc(drbd_request_mempool, GFP_NOIO); | ||
290 | if (likely(req)) { | ||
291 | drbd_req_make_private_bio(req, bio_src); | ||
292 | |||
293 | req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; | ||
294 | req->mdev = mdev; | ||
295 | req->master_bio = bio_src; | ||
296 | req->epoch = 0; | ||
297 | req->sector = bio_src->bi_sector; | ||
298 | req->size = bio_src->bi_size; | ||
299 | INIT_HLIST_NODE(&req->collision); | ||
300 | INIT_LIST_HEAD(&req->tl_requests); | ||
301 | INIT_LIST_HEAD(&req->w.list); | ||
302 | } | ||
303 | return req; | ||
304 | } | ||
305 | |||
306 | static inline void drbd_req_free(struct drbd_request *req) | ||
307 | { | ||
308 | mempool_free(req, drbd_request_mempool); | ||
309 | } | ||
310 | |||
311 | static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) | ||
312 | { | ||
313 | return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); | ||
314 | } | ||
315 | |||
316 | /* Short lived temporary struct on the stack. | 262 | /* Short lived temporary struct on the stack. |
317 | * We could squirrel the error to be returned into | 263 | * We could squirrel the error to be returned into |
318 | * bio->bi_size, or similar. But that would be too ugly. */ | 264 | * bio->bi_size, or similar. But that would be too ugly. */ |
@@ -321,6 +267,7 @@ struct bio_and_error { | |||
321 | int error; | 267 | int error; |
322 | }; | 268 | }; |
323 | 269 | ||
270 | extern void drbd_req_destroy(struct kref *kref); | ||
324 | extern void _req_may_be_done(struct drbd_request *req, | 271 | extern void _req_may_be_done(struct drbd_request *req, |
325 | struct bio_and_error *m); | 272 | struct bio_and_error *m); |
326 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | 273 | extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, |
@@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
328 | extern void complete_master_bio(struct drbd_conf *mdev, | 275 | extern void complete_master_bio(struct drbd_conf *mdev, |
329 | struct bio_and_error *m); | 276 | struct bio_and_error *m); |
330 | extern void request_timer_fn(unsigned long data); | 277 | extern void request_timer_fn(unsigned long data); |
331 | extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); | 278 | extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); |
279 | extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); | ||
280 | |||
281 | /* this is in drbd_main.c */ | ||
282 | extern void drbd_restart_request(struct drbd_request *req); | ||
332 | 283 | ||
333 | /* use this if you don't want to deal with calling complete_master_bio() | 284 | /* use this if you don't want to deal with calling complete_master_bio() |
334 | * outside the spinlock, e.g. when walking some list on cleanup. */ | 285 | * outside the spinlock, e.g. when walking some list on cleanup. */ |
335 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) | 286 | static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) |
336 | { | 287 | { |
337 | struct drbd_conf *mdev = req->mdev; | 288 | struct drbd_conf *mdev = req->w.mdev; |
338 | struct bio_and_error m; | 289 | struct bio_and_error m; |
339 | int rv; | 290 | int rv; |
340 | 291 | ||
@@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req, | |||
354 | enum drbd_req_event what) | 305 | enum drbd_req_event what) |
355 | { | 306 | { |
356 | unsigned long flags; | 307 | unsigned long flags; |
357 | struct drbd_conf *mdev = req->mdev; | 308 | struct drbd_conf *mdev = req->w.mdev; |
358 | struct bio_and_error m; | 309 | struct bio_and_error m; |
359 | int rv; | 310 | int rv; |
360 | 311 | ||
361 | spin_lock_irqsave(&mdev->req_lock, flags); | 312 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
362 | rv = __req_mod(req, what, &m); | 313 | rv = __req_mod(req, what, &m); |
363 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 314 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
364 | 315 | ||
365 | if (m.bio) | 316 | if (m.bio) |
366 | complete_master_bio(mdev, &m); | 317 | complete_master_bio(mdev, &m); |
@@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req, | |||
368 | return rv; | 319 | return rv; |
369 | } | 320 | } |
370 | 321 | ||
371 | static inline bool drbd_should_do_remote(union drbd_state s) | 322 | static inline bool drbd_should_do_remote(union drbd_dev_state s) |
372 | { | 323 | { |
373 | return s.pdsk == D_UP_TO_DATE || | 324 | return s.pdsk == D_UP_TO_DATE || |
374 | (s.pdsk >= D_INCONSISTENT && | 325 | (s.pdsk >= D_INCONSISTENT && |
@@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s) | |||
378 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* | 329 | That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* |
379 | states. */ | 330 | states. */ |
380 | } | 331 | } |
381 | static inline bool drbd_should_send_oos(union drbd_state s) | 332 | static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) |
382 | { | 333 | { |
383 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; | 334 | return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; |
384 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary | 335 | /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000000..53bf6182bac4 --- /dev/null +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -0,0 +1,1856 @@ | |||
1 | /* | ||
2 | drbd_state.c | ||
3 | |||
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | ||
5 | |||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | ||
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | ||
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | ||
9 | |||
10 | Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev | ||
11 | from Logicworks, Inc. for making SDP replication support possible. | ||
12 | |||
13 | drbd is free software; you can redistribute it and/or modify | ||
14 | it under the terms of the GNU General Public License as published by | ||
15 | the Free Software Foundation; either version 2, or (at your option) | ||
16 | any later version. | ||
17 | |||
18 | drbd is distributed in the hope that it will be useful, | ||
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
21 | GNU General Public License for more details. | ||
22 | |||
23 | You should have received a copy of the GNU General Public License | ||
24 | along with drbd; see the file COPYING. If not, write to | ||
25 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | */ | ||
27 | |||
28 | #include <linux/drbd_limits.h> | ||
29 | #include "drbd_int.h" | ||
30 | #include "drbd_req.h" | ||
31 | |||
32 | /* in drbd_main.c */ | ||
33 | extern void tl_abort_disk_io(struct drbd_conf *mdev); | ||
34 | |||
35 | struct after_state_chg_work { | ||
36 | struct drbd_work w; | ||
37 | union drbd_state os; | ||
38 | union drbd_state ns; | ||
39 | enum chg_state_flags flags; | ||
40 | struct completion *done; | ||
41 | }; | ||
42 | |||
43 | enum sanitize_state_warnings { | ||
44 | NO_WARNING, | ||
45 | ABORTED_ONLINE_VERIFY, | ||
46 | ABORTED_RESYNC, | ||
47 | CONNECTION_LOST_NEGOTIATING, | ||
48 | IMPLICITLY_UPGRADED_DISK, | ||
49 | IMPLICITLY_UPGRADED_PDSK, | ||
50 | }; | ||
51 | |||
52 | static int w_after_state_ch(struct drbd_work *w, int unused); | ||
53 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
54 | union drbd_state ns, enum chg_state_flags flags); | ||
55 | static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); | ||
56 | static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *); | ||
57 | static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); | ||
58 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
59 | enum sanitize_state_warnings *warn); | ||
60 | |||
61 | static inline bool is_susp(union drbd_state s) | ||
62 | { | ||
63 | return s.susp || s.susp_nod || s.susp_fen; | ||
64 | } | ||
65 | |||
66 | bool conn_all_vols_unconf(struct drbd_tconn *tconn) | ||
67 | { | ||
68 | struct drbd_conf *mdev; | ||
69 | bool rv = true; | ||
70 | int vnr; | ||
71 | |||
72 | rcu_read_lock(); | ||
73 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
74 | if (mdev->state.disk != D_DISKLESS || | ||
75 | mdev->state.conn != C_STANDALONE || | ||
76 | mdev->state.role != R_SECONDARY) { | ||
77 | rv = false; | ||
78 | break; | ||
79 | } | ||
80 | } | ||
81 | rcu_read_unlock(); | ||
82 | |||
83 | return rv; | ||
84 | } | ||
85 | |||
86 | /* Unfortunately the states were not correctly ordered when | ||
87 | they were defined; therefore we cannot use max_t() here. */ | ||
88 | static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) | ||
89 | { | ||
90 | if (role1 == R_PRIMARY || role2 == R_PRIMARY) | ||
91 | return R_PRIMARY; | ||
92 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
93 | return R_SECONDARY; | ||
94 | return R_UNKNOWN; | ||
95 | } | ||
96 | static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) | ||
97 | { | ||
98 | if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) | ||
99 | return R_UNKNOWN; | ||
100 | if (role1 == R_SECONDARY || role2 == R_SECONDARY) | ||
101 | return R_SECONDARY; | ||
102 | return R_PRIMARY; | ||
103 | } | ||
104 | |||
105 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn) | ||
106 | { | ||
107 | enum drbd_role role = R_UNKNOWN; | ||
108 | struct drbd_conf *mdev; | ||
109 | int vnr; | ||
110 | |||
111 | rcu_read_lock(); | ||
112 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
113 | role = max_role(role, mdev->state.role); | ||
114 | rcu_read_unlock(); | ||
115 | |||
116 | return role; | ||
117 | } | ||
118 | |||
119 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn) | ||
120 | { | ||
121 | enum drbd_role peer = R_UNKNOWN; | ||
122 | struct drbd_conf *mdev; | ||
123 | int vnr; | ||
124 | |||
125 | rcu_read_lock(); | ||
126 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
127 | peer = max_role(peer, mdev->state.peer); | ||
128 | rcu_read_unlock(); | ||
129 | |||
130 | return peer; | ||
131 | } | ||
132 | |||
133 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn) | ||
134 | { | ||
135 | enum drbd_disk_state ds = D_DISKLESS; | ||
136 | struct drbd_conf *mdev; | ||
137 | int vnr; | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
141 | ds = max_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
142 | rcu_read_unlock(); | ||
143 | |||
144 | return ds; | ||
145 | } | ||
146 | |||
147 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn) | ||
148 | { | ||
149 | enum drbd_disk_state ds = D_MASK; | ||
150 | struct drbd_conf *mdev; | ||
151 | int vnr; | ||
152 | |||
153 | rcu_read_lock(); | ||
154 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
155 | ds = min_t(enum drbd_disk_state, ds, mdev->state.disk); | ||
156 | rcu_read_unlock(); | ||
157 | |||
158 | return ds; | ||
159 | } | ||
160 | |||
161 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn) | ||
162 | { | ||
163 | enum drbd_disk_state ds = D_DISKLESS; | ||
164 | struct drbd_conf *mdev; | ||
165 | int vnr; | ||
166 | |||
167 | rcu_read_lock(); | ||
168 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
169 | ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk); | ||
170 | rcu_read_unlock(); | ||
171 | |||
172 | return ds; | ||
173 | } | ||
174 | |||
175 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn) | ||
176 | { | ||
177 | enum drbd_conns conn = C_MASK; | ||
178 | struct drbd_conf *mdev; | ||
179 | int vnr; | ||
180 | |||
181 | rcu_read_lock(); | ||
182 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
183 | conn = min_t(enum drbd_conns, conn, mdev->state.conn); | ||
184 | rcu_read_unlock(); | ||
185 | |||
186 | return conn; | ||
187 | } | ||
188 | |||
189 | static bool no_peer_wf_report_params(struct drbd_tconn *tconn) | ||
190 | { | ||
191 | struct drbd_conf *mdev; | ||
192 | int vnr; | ||
193 | bool rv = true; | ||
194 | |||
195 | rcu_read_lock(); | ||
196 | idr_for_each_entry(&tconn->volumes, mdev, vnr) | ||
197 | if (mdev->state.conn == C_WF_REPORT_PARAMS) { | ||
198 | rv = false; | ||
199 | break; | ||
200 | } | ||
201 | rcu_read_unlock(); | ||
202 | |||
203 | return rv; | ||
204 | } | ||
205 | |||
206 | |||
207 | /** | ||
208 | * cl_wide_st_chg() - true if the state change is a cluster wide one | ||
209 | * @mdev: DRBD device. | ||
210 | * @os: old (current) state. | ||
211 | * @ns: new (wanted) state. | ||
212 | */ | ||
213 | static int cl_wide_st_chg(struct drbd_conf *mdev, | ||
214 | union drbd_state os, union drbd_state ns) | ||
215 | { | ||
216 | return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && | ||
217 | ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || | ||
218 | (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
219 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || | ||
220 | (os.disk != D_FAILED && ns.disk == D_FAILED))) || | ||
221 | (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || | ||
222 | (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || | ||
223 | (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); | ||
224 | } | ||
225 | |||
226 | static union drbd_state | ||
227 | apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) | ||
228 | { | ||
229 | union drbd_state ns; | ||
230 | ns.i = (os.i & ~mask.i) | val.i; | ||
231 | return ns; | ||
232 | } | ||
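
apply_mask_val() is the heart of the mask/val idiom used throughout drbd_state.c: mask marks the state fields a caller wants to change, val carries their new values, and (os.i & ~mask.i) | val.i leaves every other field untouched. A self-contained sketch with a simplified stand-in for union drbd_state (the field widths below are assumptions, not the real layout):

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified stand-in for union drbd_state: a few bitfields overlaid
     * on one 32-bit word "i", so whole-state operations stay cheap. */
    union state {
            struct {
                    uint32_t role : 2;
                    uint32_t conn : 5;
                    uint32_t disk : 4;
            };
            uint32_t i;
    };

    static union state apply_mask_val(union state os, union state mask, union state val)
    {
            union state ns;

            ns.i = (os.i & ~mask.i) | val.i;  /* keep unmasked fields, overwrite masked ones */
            return ns;
    }

    int main(void)
    {
            union state os = { .role = 1, .conn = 10, .disk = 8 };
            union state mask = { .i = 0 }, val = { .i = 0 }, ns;

            mask.conn = 0x1f;  /* request a change of the connection field only */
            val.conn = 2;
            ns = apply_mask_val(os, mask, val);
            printf("conn %u -> %u; role stays %u, disk stays %u\n",
                   (unsigned)os.conn, (unsigned)ns.conn,
                   (unsigned)ns.role, (unsigned)ns.disk);
            return 0;
    }

This is what lets helpers such as NS(), used further down in this file, describe a partial state change as a (mask, val) pair without touching unrelated fields.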
233 | |||
234 | enum drbd_state_rv | ||
235 | drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, | ||
236 | union drbd_state mask, union drbd_state val) | ||
237 | { | ||
238 | unsigned long flags; | ||
239 | union drbd_state ns; | ||
240 | enum drbd_state_rv rv; | ||
241 | |||
242 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
243 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
244 | rv = _drbd_set_state(mdev, ns, f, NULL); | ||
245 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
246 | |||
247 | return rv; | ||
248 | } | ||
249 | |||
250 | /** | ||
251 | * drbd_force_state() - Impose a change which happens outside our control on our state | ||
252 | * @mdev: DRBD device. | ||
253 | * @mask: mask of state bits to change. | ||
254 | * @val: value of new state bits. | ||
255 | */ | ||
256 | void drbd_force_state(struct drbd_conf *mdev, | ||
257 | union drbd_state mask, union drbd_state val) | ||
258 | { | ||
259 | drbd_change_state(mdev, CS_HARD, mask, val); | ||
260 | } | ||
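
A hedged kernel-context fragment (not compilable on its own) of how this wrapper is meant to be used: the caller reports a fact that already happened, so the CS_HARD flag passed through drbd_change_state() deliberately bypasses the soft validity checks. The NS() helper, used later in this file, expands to the matching mask/val pair.

    /* Hedged fragment: the network timed out, impose the resulting state,
     * much like request_timer_fn() does via _drbd_set_state() with CS_HARD. */
    drbd_force_state(mdev, NS(conn, C_TIMEOUT));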
261 | |||
262 | static enum drbd_state_rv | ||
263 | _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, | ||
264 | union drbd_state val) | ||
265 | { | ||
266 | union drbd_state os, ns; | ||
267 | unsigned long flags; | ||
268 | enum drbd_state_rv rv; | ||
269 | |||
270 | if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) | ||
271 | return SS_CW_SUCCESS; | ||
272 | |||
273 | if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) | ||
274 | return SS_CW_FAILED_BY_PEER; | ||
275 | |||
276 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
277 | os = drbd_read_state(mdev); | ||
278 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
279 | rv = is_valid_transition(os, ns); | ||
280 | if (rv >= SS_SUCCESS) | ||
281 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
282 | |||
283 | if (!cl_wide_st_chg(mdev, os, ns)) | ||
284 | rv = SS_CW_NO_NEED; | ||
285 | if (rv == SS_UNKNOWN_ERROR) { | ||
286 | rv = is_valid_state(mdev, ns); | ||
287 | if (rv >= SS_SUCCESS) { | ||
288 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
289 | if (rv >= SS_SUCCESS) | ||
290 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
291 | } | ||
292 | } | ||
293 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
294 | |||
295 | return rv; | ||
296 | } | ||
297 | |||
298 | /** | ||
299 | * drbd_req_state() - Perform a possibly cluster-wide state change | ||
300 | * @mdev: DRBD device. | ||
301 | * @mask: mask of state bits to change. | ||
302 | * @val: value of new state bits. | ||
303 | * @f: flags | ||
304 | * | ||
305 | * Should not be called directly, use drbd_request_state() or | ||
306 | * _drbd_request_state(). | ||
307 | */ | ||
308 | static enum drbd_state_rv | ||
309 | drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, | ||
310 | union drbd_state val, enum chg_state_flags f) | ||
311 | { | ||
312 | struct completion done; | ||
313 | unsigned long flags; | ||
314 | union drbd_state os, ns; | ||
315 | enum drbd_state_rv rv; | ||
316 | |||
317 | init_completion(&done); | ||
318 | |||
319 | if (f & CS_SERIALIZE) | ||
320 | mutex_lock(mdev->state_mutex); | ||
321 | |||
322 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
323 | os = drbd_read_state(mdev); | ||
324 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
325 | rv = is_valid_transition(os, ns); | ||
326 | if (rv < SS_SUCCESS) { | ||
327 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
328 | goto abort; | ||
329 | } | ||
330 | |||
331 | if (cl_wide_st_chg(mdev, os, ns)) { | ||
332 | rv = is_valid_state(mdev, ns); | ||
333 | if (rv == SS_SUCCESS) | ||
334 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
335 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
336 | |||
337 | if (rv < SS_SUCCESS) { | ||
338 | if (f & CS_VERBOSE) | ||
339 | print_st_err(mdev, os, ns, rv); | ||
340 | goto abort; | ||
341 | } | ||
342 | |||
343 | if (drbd_send_state_req(mdev, mask, val)) { | ||
344 | rv = SS_CW_FAILED_BY_PEER; | ||
345 | if (f & CS_VERBOSE) | ||
346 | print_st_err(mdev, os, ns, rv); | ||
347 | goto abort; | ||
348 | } | ||
349 | |||
350 | wait_event(mdev->state_wait, | ||
351 | (rv = _req_st_cond(mdev, mask, val))); | ||
352 | |||
353 | if (rv < SS_SUCCESS) { | ||
354 | if (f & CS_VERBOSE) | ||
355 | print_st_err(mdev, os, ns, rv); | ||
356 | goto abort; | ||
357 | } | ||
358 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); | ||
359 | ns = apply_mask_val(drbd_read_state(mdev), mask, val); | ||
360 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
361 | } else { | ||
362 | rv = _drbd_set_state(mdev, ns, f, &done); | ||
363 | } | ||
364 | |||
365 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); | ||
366 | |||
367 | if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { | ||
368 | D_ASSERT(current != mdev->tconn->worker.task); | ||
369 | wait_for_completion(&done); | ||
370 | } | ||
371 | |||
372 | abort: | ||
373 | if (f & CS_SERIALIZE) | ||
374 | mutex_unlock(mdev->state_mutex); | ||
375 | |||
376 | return rv; | ||
377 | } | ||
378 | |||
379 | /** | ||
380 | * _drbd_request_state() - Request a state change (with flags) | ||
381 | * @mdev: DRBD device. | ||
382 | * @mask: mask of state bits to change. | ||
383 | * @val: value of new state bits. | ||
384 | * @f: flags | ||
385 | * | ||
386 | * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE | ||
387 | * flag, or when logging of failed state change requests is not desired. | ||
388 | */ | ||
389 | enum drbd_state_rv | ||
390 | _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, | ||
391 | union drbd_state val, enum chg_state_flags f) | ||
392 | { | ||
393 | enum drbd_state_rv rv; | ||
394 | |||
395 | wait_event(mdev->state_wait, | ||
396 | (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); | ||
397 | |||
398 | return rv; | ||
399 | } | ||
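
For orientation, a hedged kernel-context fragment of a typical caller; it relies on the drbd_request_state() wrapper named in the comment above (the variant that also logs failed requests) and on the NS() mask/val helper used later in this file, e.g. in abw_start_sync():

    /* Hedged fragment: ask for promotion to primary.  Depending on the
     * current state this becomes a cluster-wide request (see cl_wide_st_chg())
     * and may be declined, e.g. with SS_TWO_PRIMARIES. */
    enum drbd_state_rv rv = drbd_request_state(mdev, NS(role, R_PRIMARY));

    if (rv < SS_SUCCESS)
            dev_warn(DEV, "failed to become primary: %s\n",
                     drbd_set_st_err_str(rv));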
400 | |||
401 | static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) | ||
402 | { | ||
403 | dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", | ||
404 | name, | ||
405 | drbd_conn_str(ns.conn), | ||
406 | drbd_role_str(ns.role), | ||
407 | drbd_role_str(ns.peer), | ||
408 | drbd_disk_str(ns.disk), | ||
409 | drbd_disk_str(ns.pdsk), | ||
410 | is_susp(ns) ? 's' : 'r', | ||
411 | ns.aftr_isp ? 'a' : '-', | ||
412 | ns.peer_isp ? 'p' : '-', | ||
413 | ns.user_isp ? 'u' : '-', | ||
414 | ns.susp_fen ? 'F' : '-', | ||
415 | ns.susp_nod ? 'N' : '-' | ||
416 | ); | ||
417 | } | ||
418 | |||
419 | void print_st_err(struct drbd_conf *mdev, union drbd_state os, | ||
420 | union drbd_state ns, enum drbd_state_rv err) | ||
421 | { | ||
422 | if (err == SS_IN_TRANSIENT_STATE) | ||
423 | return; | ||
424 | dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); | ||
425 | print_st(mdev, " state", os); | ||
426 | print_st(mdev, "wanted", ns); | ||
427 | } | ||
428 | |||
429 | static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, | ||
430 | enum chg_state_flags flags) | ||
431 | { | ||
432 | char *pbp; | ||
433 | pbp = pb; | ||
434 | *pbp = 0; | ||
435 | |||
436 | if (ns.role != os.role && flags & CS_DC_ROLE) | ||
437 | pbp += sprintf(pbp, "role( %s -> %s ) ", | ||
438 | drbd_role_str(os.role), | ||
439 | drbd_role_str(ns.role)); | ||
440 | if (ns.peer != os.peer && flags & CS_DC_PEER) | ||
441 | pbp += sprintf(pbp, "peer( %s -> %s ) ", | ||
442 | drbd_role_str(os.peer), | ||
443 | drbd_role_str(ns.peer)); | ||
444 | if (ns.conn != os.conn && flags & CS_DC_CONN) | ||
445 | pbp += sprintf(pbp, "conn( %s -> %s ) ", | ||
446 | drbd_conn_str(os.conn), | ||
447 | drbd_conn_str(ns.conn)); | ||
448 | if (ns.disk != os.disk && flags & CS_DC_DISK) | ||
449 | pbp += sprintf(pbp, "disk( %s -> %s ) ", | ||
450 | drbd_disk_str(os.disk), | ||
451 | drbd_disk_str(ns.disk)); | ||
452 | if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) | ||
453 | pbp += sprintf(pbp, "pdsk( %s -> %s ) ", | ||
454 | drbd_disk_str(os.pdsk), | ||
455 | drbd_disk_str(ns.pdsk)); | ||
456 | |||
457 | return pbp - pb; | ||
458 | } | ||
459 | |||
460 | static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, | ||
461 | enum chg_state_flags flags) | ||
462 | { | ||
463 | char pb[300]; | ||
464 | char *pbp = pb; | ||
465 | |||
466 | pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); | ||
467 | |||
468 | if (ns.aftr_isp != os.aftr_isp) | ||
469 | pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", | ||
470 | os.aftr_isp, | ||
471 | ns.aftr_isp); | ||
472 | if (ns.peer_isp != os.peer_isp) | ||
473 | pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", | ||
474 | os.peer_isp, | ||
475 | ns.peer_isp); | ||
476 | if (ns.user_isp != os.user_isp) | ||
477 | pbp += sprintf(pbp, "user_isp( %d -> %d ) ", | ||
478 | os.user_isp, | ||
479 | ns.user_isp); | ||
480 | |||
481 | if (pbp != pb) | ||
482 | dev_info(DEV, "%s\n", pb); | ||
483 | } | ||
484 | |||
485 | static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns, | ||
486 | enum chg_state_flags flags) | ||
487 | { | ||
488 | char pb[300]; | ||
489 | char *pbp = pb; | ||
490 | |||
491 | pbp += print_state_change(pbp, os, ns, flags); | ||
492 | |||
493 | if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) | ||
494 | pbp += sprintf(pbp, "susp( %d -> %d ) ", | ||
495 | is_susp(os), | ||
496 | is_susp(ns)); | ||
497 | |||
498 | if (pbp != pb) | ||
499 | conn_info(tconn, "%s\n", pb); | ||
500 | } | ||
501 | |||
502 | |||
503 | /** | ||
504 | * is_valid_state() - Returns an SS_ error code if ns is not valid | ||
505 | * @mdev: DRBD device. | ||
506 | * @ns: State to consider. | ||
507 | */ | ||
508 | static enum drbd_state_rv | ||
509 | is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | ||
510 | { | ||
511 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
512 | |||
513 | enum drbd_fencing_p fp; | ||
514 | enum drbd_state_rv rv = SS_SUCCESS; | ||
515 | struct net_conf *nc; | ||
516 | |||
517 | rcu_read_lock(); | ||
518 | fp = FP_DONT_CARE; | ||
519 | if (get_ldev(mdev)) { | ||
520 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
521 | put_ldev(mdev); | ||
522 | } | ||
523 | |||
524 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
525 | if (nc) { | ||
526 | if (!nc->two_primaries && ns.role == R_PRIMARY) { | ||
527 | if (ns.peer == R_PRIMARY) | ||
528 | rv = SS_TWO_PRIMARIES; | ||
529 | else if (conn_highest_peer(mdev->tconn) == R_PRIMARY) | ||
530 | rv = SS_O_VOL_PEER_PRI; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | if (rv <= 0) | ||
535 | /* already found a reason to abort */; | ||
536 | else if (ns.role == R_SECONDARY && mdev->open_cnt) | ||
537 | rv = SS_DEVICE_IN_USE; | ||
538 | |||
539 | else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) | ||
540 | rv = SS_NO_UP_TO_DATE_DISK; | ||
541 | |||
542 | else if (fp >= FP_RESOURCE && | ||
543 | ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) | ||
544 | rv = SS_PRIMARY_NOP; | ||
545 | |||
546 | else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) | ||
547 | rv = SS_NO_UP_TO_DATE_DISK; | ||
548 | |||
549 | else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) | ||
550 | rv = SS_NO_LOCAL_DISK; | ||
551 | |||
552 | else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) | ||
553 | rv = SS_NO_REMOTE_DISK; | ||
554 | |||
555 | else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
556 | rv = SS_NO_UP_TO_DATE_DISK; | ||
557 | |||
558 | else if ((ns.conn == C_CONNECTED || | ||
559 | ns.conn == C_WF_BITMAP_S || | ||
560 | ns.conn == C_SYNC_SOURCE || | ||
561 | ns.conn == C_PAUSED_SYNC_S) && | ||
562 | ns.disk == D_OUTDATED) | ||
563 | rv = SS_CONNECTED_OUTDATES; | ||
564 | |||
565 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
566 | (nc->verify_alg[0] == 0)) | ||
567 | rv = SS_NO_VERIFY_ALG; | ||
568 | |||
569 | else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
570 | mdev->tconn->agreed_pro_version < 88) | ||
571 | rv = SS_NOT_SUPPORTED; | ||
572 | |||
573 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | ||
574 | rv = SS_CONNECTED_OUTDATES; | ||
575 | |||
576 | rcu_read_unlock(); | ||
577 | |||
578 | return rv; | ||
579 | } | ||
580 | |||
581 | /** | ||
582 | * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible | ||
583 | * This function limits state transitions that may be declined by DRBD, i.e. | ||
584 | * user requests (aka soft transitions). | ||
585 | * @mdev: DRBD device. | ||
586 | * @ns: new state. | ||
587 | * @os: old state. | ||
588 | */ | ||
589 | static enum drbd_state_rv | ||
590 | is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn) | ||
591 | { | ||
592 | enum drbd_state_rv rv = SS_SUCCESS; | ||
593 | |||
594 | if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && | ||
595 | os.conn > C_CONNECTED) | ||
596 | rv = SS_RESYNC_RUNNING; | ||
597 | |||
598 | if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) | ||
599 | rv = SS_ALREADY_STANDALONE; | ||
600 | |||
601 | if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) | ||
602 | rv = SS_IS_DISKLESS; | ||
603 | |||
604 | if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) | ||
605 | rv = SS_NO_NET_CONFIG; | ||
606 | |||
607 | if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) | ||
608 | rv = SS_LOWER_THAN_OUTDATED; | ||
609 | |||
610 | if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) | ||
611 | rv = SS_IN_TRANSIENT_STATE; | ||
612 | |||
613 | /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) | ||
614 | rv = SS_IN_TRANSIENT_STATE; */ | ||
615 | |||
616 | /* While establishing a connection only allow cstate to change. | ||
617 | Delay/refuse role changes, detach attach etc... */ | ||
618 | if (test_bit(STATE_SENT, &tconn->flags) && | ||
619 | !(os.conn == C_WF_REPORT_PARAMS || | ||
620 | (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) | ||
621 | rv = SS_IN_TRANSIENT_STATE; | ||
622 | |||
623 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) | ||
624 | rv = SS_NEED_CONNECTION; | ||
625 | |||
626 | if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && | ||
627 | ns.conn != os.conn && os.conn > C_CONNECTED) | ||
628 | rv = SS_RESYNC_RUNNING; | ||
629 | |||
630 | if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
631 | os.conn < C_CONNECTED) | ||
632 | rv = SS_NEED_CONNECTION; | ||
633 | |||
634 | if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) | ||
635 | && os.conn < C_WF_REPORT_PARAMS) | ||
636 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | ||
637 | |||
638 | return rv; | ||
639 | } | ||
640 | |||
641 | static enum drbd_state_rv | ||
642 | is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) | ||
643 | { | ||
644 | /* no change -> nothing to do, at least for the connection part */ | ||
645 | if (oc == nc) | ||
646 | return SS_NOTHING_TO_DO; | ||
647 | |||
648 | /* disconnect of an unconfigured connection does not make sense */ | ||
649 | if (oc == C_STANDALONE && nc == C_DISCONNECTING) | ||
650 | return SS_ALREADY_STANDALONE; | ||
651 | |||
652 | /* from C_STANDALONE, we start with C_UNCONNECTED */ | ||
653 | if (oc == C_STANDALONE && nc != C_UNCONNECTED) | ||
654 | return SS_NEED_CONNECTION; | ||
655 | |||
656 | /* When establishing a connection we need to go through WF_REPORT_PARAMS! | ||
657 | Necessary to do the right thing upon invalidate-remote on a disconnected resource */ | ||
658 | if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) | ||
659 | return SS_NEED_CONNECTION; | ||
660 | |||
661 | /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ | ||
662 | if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) | ||
663 | return SS_IN_TRANSIENT_STATE; | ||
664 | |||
665 | /* After C_DISCONNECTING only C_STANDALONE may follow */ | ||
666 | if (oc == C_DISCONNECTING && nc != C_STANDALONE) | ||
667 | return SS_IN_TRANSIENT_STATE; | ||
668 | |||
669 | return SS_SUCCESS; | ||
670 | } | ||
671 | |||
672 | |||
673 | /** | ||
674 | * is_valid_transition() - Returns an SS_ error code if the state transition is not possible | ||
675 | * This limits hard state transitions. Hard state transitions are facts that are | ||
676 | * imposed on DRBD by the environment. E.g. disk broke or network broke down. | ||
677 | * But those hard state transitions are still not allowed to do everything. | ||
678 | * @ns: new state. | ||
679 | * @os: old state. | ||
680 | */ | ||
681 | static enum drbd_state_rv | ||
682 | is_valid_transition(union drbd_state os, union drbd_state ns) | ||
683 | { | ||
684 | enum drbd_state_rv rv; | ||
685 | |||
686 | rv = is_valid_conn_transition(os.conn, ns.conn); | ||
687 | |||
688 | /* we cannot fail (again) if we already detached */ | ||
689 | if (ns.disk == D_FAILED && os.disk == D_DISKLESS) | ||
690 | rv = SS_IS_DISKLESS; | ||
691 | |||
692 | return rv; | ||
693 | } | ||
694 | |||
695 | static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) | ||
696 | { | ||
697 | static const char *msg_table[] = { | ||
698 | [NO_WARNING] = "", | ||
699 | [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", | ||
700 | [ABORTED_RESYNC] = "Resync aborted.", | ||
701 | [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", | ||
702 | [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", | ||
703 | [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", | ||
704 | }; | ||
705 | |||
706 | if (warn != NO_WARNING) | ||
707 | dev_warn(DEV, "%s\n", msg_table[warn]); | ||
708 | } | ||
709 | |||
710 | /** | ||
711 | * sanitize_state() - Resolves implicitly necessary additional changes to a state transition | ||
712 | * @mdev: DRBD device. | ||
713 | * @os: old state. | ||
714 | * @ns: new state. | ||
715 | * @warn: optional out parameter, set to an enum sanitize_state_warnings code | ||
716 | * | ||
717 | * When we lose the connection, we have to set the state of the peer's disk (pdsk) | ||
718 | * to D_UNKNOWN. This rule and many more along those lines are in this function. | ||
719 | */ | ||
720 | static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, | ||
721 | enum sanitize_state_warnings *warn) | ||
722 | { | ||
723 | enum drbd_fencing_p fp; | ||
724 | enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; | ||
725 | |||
726 | if (warn) | ||
727 | *warn = NO_WARNING; | ||
728 | |||
729 | fp = FP_DONT_CARE; | ||
730 | if (get_ldev(mdev)) { | ||
731 | rcu_read_lock(); | ||
732 | fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; | ||
733 | rcu_read_unlock(); | ||
734 | put_ldev(mdev); | ||
735 | } | ||
736 | |||
737 | /* Implications from connection to peer and peer_isp */ | ||
738 | if (ns.conn < C_CONNECTED) { | ||
739 | ns.peer_isp = 0; | ||
740 | ns.peer = R_UNKNOWN; | ||
741 | if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) | ||
742 | ns.pdsk = D_UNKNOWN; | ||
743 | } | ||
744 | |||
745 | /* Clear the aftr_isp when becoming unconfigured */ | ||
746 | if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) | ||
747 | ns.aftr_isp = 0; | ||
748 | |||
749 | /* An implication of the disk states onto the connection state */ | ||
750 | /* Abort resync if a disk fails/detaches */ | ||
751 | if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { | ||
752 | if (warn) | ||
753 | *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? | ||
754 | ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; | ||
755 | ns.conn = C_CONNECTED; | ||
756 | } | ||
757 | |||
758 | /* Connection breaks down before we finished "Negotiating" */ | ||
759 | if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && | ||
760 | get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
761 | if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { | ||
762 | ns.disk = mdev->new_state_tmp.disk; | ||
763 | ns.pdsk = mdev->new_state_tmp.pdsk; | ||
764 | } else { | ||
765 | if (warn) | ||
766 | *warn = CONNECTION_LOST_NEGOTIATING; | ||
767 | ns.disk = D_DISKLESS; | ||
768 | ns.pdsk = D_UNKNOWN; | ||
769 | } | ||
770 | put_ldev(mdev); | ||
771 | } | ||
772 | |||
773 | /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ | ||
774 | if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { | ||
775 | if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) | ||
776 | ns.disk = D_UP_TO_DATE; | ||
777 | if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) | ||
778 | ns.pdsk = D_UP_TO_DATE; | ||
779 | } | ||
780 | |||
781 | /* Implications of the connection state on the disk states */ | ||
782 | disk_min = D_DISKLESS; | ||
783 | disk_max = D_UP_TO_DATE; | ||
784 | pdsk_min = D_INCONSISTENT; | ||
785 | pdsk_max = D_UNKNOWN; | ||
786 | switch ((enum drbd_conns)ns.conn) { | ||
787 | case C_WF_BITMAP_T: | ||
788 | case C_PAUSED_SYNC_T: | ||
789 | case C_STARTING_SYNC_T: | ||
790 | case C_WF_SYNC_UUID: | ||
791 | case C_BEHIND: | ||
792 | disk_min = D_INCONSISTENT; | ||
793 | disk_max = D_OUTDATED; | ||
794 | pdsk_min = D_UP_TO_DATE; | ||
795 | pdsk_max = D_UP_TO_DATE; | ||
796 | break; | ||
797 | case C_VERIFY_S: | ||
798 | case C_VERIFY_T: | ||
799 | disk_min = D_UP_TO_DATE; | ||
800 | disk_max = D_UP_TO_DATE; | ||
801 | pdsk_min = D_UP_TO_DATE; | ||
802 | pdsk_max = D_UP_TO_DATE; | ||
803 | break; | ||
804 | case C_CONNECTED: | ||
805 | disk_min = D_DISKLESS; | ||
806 | disk_max = D_UP_TO_DATE; | ||
807 | pdsk_min = D_DISKLESS; | ||
808 | pdsk_max = D_UP_TO_DATE; | ||
809 | break; | ||
810 | case C_WF_BITMAP_S: | ||
811 | case C_PAUSED_SYNC_S: | ||
812 | case C_STARTING_SYNC_S: | ||
813 | case C_AHEAD: | ||
814 | disk_min = D_UP_TO_DATE; | ||
815 | disk_max = D_UP_TO_DATE; | ||
816 | pdsk_min = D_INCONSISTENT; | ||
817 | pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ | ||
818 | break; | ||
819 | case C_SYNC_TARGET: | ||
820 | disk_min = D_INCONSISTENT; | ||
821 | disk_max = D_INCONSISTENT; | ||
822 | pdsk_min = D_UP_TO_DATE; | ||
823 | pdsk_max = D_UP_TO_DATE; | ||
824 | break; | ||
825 | case C_SYNC_SOURCE: | ||
826 | disk_min = D_UP_TO_DATE; | ||
827 | disk_max = D_UP_TO_DATE; | ||
828 | pdsk_min = D_INCONSISTENT; | ||
829 | pdsk_max = D_INCONSISTENT; | ||
830 | break; | ||
831 | case C_STANDALONE: | ||
832 | case C_DISCONNECTING: | ||
833 | case C_UNCONNECTED: | ||
834 | case C_TIMEOUT: | ||
835 | case C_BROKEN_PIPE: | ||
836 | case C_NETWORK_FAILURE: | ||
837 | case C_PROTOCOL_ERROR: | ||
838 | case C_TEAR_DOWN: | ||
839 | case C_WF_CONNECTION: | ||
840 | case C_WF_REPORT_PARAMS: | ||
841 | case C_MASK: | ||
842 | break; | ||
843 | } | ||
844 | if (ns.disk > disk_max) | ||
845 | ns.disk = disk_max; | ||
846 | |||
847 | if (ns.disk < disk_min) { | ||
848 | if (warn) | ||
849 | *warn = IMPLICITLY_UPGRADED_DISK; | ||
850 | ns.disk = disk_min; | ||
851 | } | ||
852 | if (ns.pdsk > pdsk_max) | ||
853 | ns.pdsk = pdsk_max; | ||
854 | |||
855 | if (ns.pdsk < pdsk_min) { | ||
856 | if (warn) | ||
857 | *warn = IMPLICITLY_UPGRADED_PDSK; | ||
858 | ns.pdsk = pdsk_min; | ||
859 | } | ||
860 | |||
861 | if (fp == FP_STONITH && | ||
862 | (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) | ||
863 | ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ | ||
864 | |||
865 | if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO && | ||
866 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
867 | ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ | ||
868 | |||
869 | if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { | ||
870 | if (ns.conn == C_SYNC_SOURCE) | ||
871 | ns.conn = C_PAUSED_SYNC_S; | ||
872 | if (ns.conn == C_SYNC_TARGET) | ||
873 | ns.conn = C_PAUSED_SYNC_T; | ||
874 | } else { | ||
875 | if (ns.conn == C_PAUSED_SYNC_S) | ||
876 | ns.conn = C_SYNC_SOURCE; | ||
877 | if (ns.conn == C_PAUSED_SYNC_T) | ||
878 | ns.conn = C_SYNC_TARGET; | ||
879 | } | ||
880 | |||
881 | return ns; | ||
882 | } | ||
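
The last part of sanitize_state() above reduces to a clamp: each connection state implies a [min, max] window for the local disk and the peer's disk, and values outside the window are pulled in (with a warning when that means an implicit upgrade). A self-contained userspace sketch of just that clamping step; the enum ordering is copied from DRBD's disk-state ordering, and the example values are assumptions:

    #include <stdio.h>

    /* Abbreviated copy of DRBD's disk-state ordering (low = worse). */
    enum disk_state {
            D_DISKLESS, D_ATTACHING, D_FAILED, D_NEGOTIATING,
            D_INCONSISTENT, D_OUTDATED, D_UNKNOWN, D_CONSISTENT, D_UP_TO_DATE
    };

    /* Sketch of the window enforcement at the end of sanitize_state(). */
    static enum disk_state clamp_disk(enum disk_state d,
                                      enum disk_state min, enum disk_state max,
                                      int *implicitly_upgraded)
    {
            if (d > max)
                    return max;
            if (d < min) {
                    *implicitly_upgraded = 1;  /* the driver warns in this case */
                    return min;
            }
            return d;
    }

    int main(void)
    {
            int warn = 0;
            /* C_SYNC_TARGET implies disk in [D_INCONSISTENT, D_INCONSISTENT]:
             * a locally D_UP_TO_DATE value is clamped down without a warning;
             * a D_DISKLESS value would be clamped up and flagged. */
            enum disk_state d = clamp_disk(D_UP_TO_DATE,
                                           D_INCONSISTENT, D_INCONSISTENT, &warn);

            printf("disk -> %d, implicitly upgraded: %d\n", d, warn);
            return 0;
    }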
883 | |||
884 | void drbd_resume_al(struct drbd_conf *mdev) | ||
885 | { | ||
886 | if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) | ||
887 | dev_info(DEV, "Resumed AL updates\n"); | ||
888 | } | ||
889 | |||
890 | /* helper for __drbd_set_state */ | ||
891 | static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) | ||
892 | { | ||
893 | if (mdev->tconn->agreed_pro_version < 90) | ||
894 | mdev->ov_start_sector = 0; | ||
895 | mdev->rs_total = drbd_bm_bits(mdev); | ||
896 | mdev->ov_position = 0; | ||
897 | if (cs == C_VERIFY_T) { | ||
898 | /* starting online verify from an arbitrary position | ||
899 | * does not fit well into the existing protocol. | ||
900 | * on C_VERIFY_T, we initialize ov_left and friends | ||
901 | * implicitly in receive_DataRequest once the | ||
902 | * first P_OV_REQUEST is received */ | ||
903 | mdev->ov_start_sector = ~(sector_t)0; | ||
904 | } else { | ||
905 | unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); | ||
906 | if (bit >= mdev->rs_total) { | ||
907 | mdev->ov_start_sector = | ||
908 | BM_BIT_TO_SECT(mdev->rs_total - 1); | ||
909 | mdev->rs_total = 1; | ||
910 | } else | ||
911 | mdev->rs_total -= bit; | ||
912 | mdev->ov_position = mdev->ov_start_sector; | ||
913 | } | ||
914 | mdev->ov_left = mdev->rs_total; | ||
915 | } | ||
916 | |||
917 | /** | ||
918 | * __drbd_set_state() - Set a new DRBD state | ||
919 | * @mdev: DRBD device. | ||
920 | * @ns: new state. | ||
921 | * @flags: Flags | ||
922 | * @done: Optional completion that will be completed once after_state_ch() has finished | ||
923 | * | ||
924 | * Caller needs to hold req_lock, and global_state_lock. Do not call directly. | ||
925 | */ | ||
926 | enum drbd_state_rv | ||
927 | __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | ||
928 | enum chg_state_flags flags, struct completion *done) | ||
929 | { | ||
930 | union drbd_state os; | ||
931 | enum drbd_state_rv rv = SS_SUCCESS; | ||
932 | enum sanitize_state_warnings ssw; | ||
933 | struct after_state_chg_work *ascw; | ||
934 | |||
935 | os = drbd_read_state(mdev); | ||
936 | |||
937 | ns = sanitize_state(mdev, ns, &ssw); | ||
938 | if (ns.i == os.i) | ||
939 | return SS_NOTHING_TO_DO; | ||
940 | |||
941 | rv = is_valid_transition(os, ns); | ||
942 | if (rv < SS_SUCCESS) | ||
943 | return rv; | ||
944 | |||
945 | if (!(flags & CS_HARD)) { | ||
946 | /* pre-state-change checks ; only look at ns */ | ||
947 | /* See drbd_state_sw_errors in drbd_strings.c */ | ||
948 | |||
949 | rv = is_valid_state(mdev, ns); | ||
950 | if (rv < SS_SUCCESS) { | ||
951 | /* If the old state was illegal as well, then let | ||
952 | this happen...*/ | ||
953 | |||
954 | if (is_valid_state(mdev, os) == rv) | ||
955 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
956 | } else | ||
957 | rv = is_valid_soft_transition(os, ns, mdev->tconn); | ||
958 | } | ||
959 | |||
960 | if (rv < SS_SUCCESS) { | ||
961 | if (flags & CS_VERBOSE) | ||
962 | print_st_err(mdev, os, ns, rv); | ||
963 | return rv; | ||
964 | } | ||
965 | |||
966 | print_sanitize_warnings(mdev, ssw); | ||
967 | |||
968 | drbd_pr_state_change(mdev, os, ns, flags); | ||
969 | |||
970 | /* Display changes to the susp* flags that were caused by the call to | ||
971 | sanitize_state(). Only display it here if we were not called from | ||
972 | _conn_request_state() */ | ||
973 | if (!(flags & CS_DC_SUSP)) | ||
974 | conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); | ||
975 | |||
976 | /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference | ||
977 | * on the ldev here, to be sure the transition -> D_DISKLESS resp. | ||
978 | * drbd_ldev_destroy() won't happen before our corresponding | ||
979 | * after_state_ch works run, where we put_ldev again. */ | ||
980 | if ((os.disk != D_FAILED && ns.disk == D_FAILED) || | ||
981 | (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) | ||
982 | atomic_inc(&mdev->local_cnt); | ||
983 | |||
984 | mdev->state.i = ns.i; | ||
985 | mdev->tconn->susp = ns.susp; | ||
986 | mdev->tconn->susp_nod = ns.susp_nod; | ||
987 | mdev->tconn->susp_fen = ns.susp_fen; | ||
988 | |||
989 | if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) | ||
990 | drbd_print_uuids(mdev, "attached to UUIDs"); | ||
991 | |||
992 | /* Wake up role changes, that were delayed because of connection establishing */ | ||
993 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && | ||
994 | no_peer_wf_report_params(mdev->tconn)) | ||
995 | clear_bit(STATE_SENT, &mdev->tconn->flags); | ||
996 | |||
997 | wake_up(&mdev->misc_wait); | ||
998 | wake_up(&mdev->state_wait); | ||
999 | wake_up(&mdev->tconn->ping_wait); | ||
1000 | |||
1001 | /* Aborted verify run, or we reached the stop sector. | ||
1002 | * Log the last position, unless end-of-device. */ | ||
1003 | if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && | ||
1004 | ns.conn <= C_CONNECTED) { | ||
1005 | mdev->ov_start_sector = | ||
1006 | BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); | ||
1007 | if (mdev->ov_left) | ||
1008 | dev_info(DEV, "Online Verify reached sector %llu\n", | ||
1009 | (unsigned long long)mdev->ov_start_sector); | ||
1010 | } | ||
1011 | |||
1012 | if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && | ||
1013 | (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { | ||
1014 | dev_info(DEV, "Syncer continues.\n"); | ||
1015 | mdev->rs_paused += (long)jiffies | ||
1016 | -(long)mdev->rs_mark_time[mdev->rs_last_mark]; | ||
1017 | if (ns.conn == C_SYNC_TARGET) | ||
1018 | mod_timer(&mdev->resync_timer, jiffies); | ||
1019 | } | ||
1020 | |||
1021 | if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && | ||
1022 | (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { | ||
1023 | dev_info(DEV, "Resync suspended\n"); | ||
1024 | mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; | ||
1025 | } | ||
1026 | |||
1027 | if (os.conn == C_CONNECTED && | ||
1028 | (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { | ||
1029 | unsigned long now = jiffies; | ||
1030 | int i; | ||
1031 | |||
1032 | set_ov_position(mdev, ns.conn); | ||
1033 | mdev->rs_start = now; | ||
1034 | mdev->rs_last_events = 0; | ||
1035 | mdev->rs_last_sect_ev = 0; | ||
1036 | mdev->ov_last_oos_size = 0; | ||
1037 | mdev->ov_last_oos_start = 0; | ||
1038 | |||
1039 | for (i = 0; i < DRBD_SYNC_MARKS; i++) { | ||
1040 | mdev->rs_mark_left[i] = mdev->ov_left; | ||
1041 | mdev->rs_mark_time[i] = now; | ||
1042 | } | ||
1043 | |||
1044 | drbd_rs_controller_reset(mdev); | ||
1045 | |||
1046 | if (ns.conn == C_VERIFY_S) { | ||
1047 | dev_info(DEV, "Starting Online Verify from sector %llu\n", | ||
1048 | (unsigned long long)mdev->ov_position); | ||
1049 | mod_timer(&mdev->resync_timer, jiffies); | ||
1050 | } | ||
1051 | } | ||
1052 | |||
1053 | if (get_ldev(mdev)) { | ||
1054 | u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| | ||
1055 | MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| | ||
1056 | MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); | ||
1057 | |||
1058 | mdf &= ~MDF_AL_CLEAN; | ||
1059 | if (test_bit(CRASHED_PRIMARY, &mdev->flags)) | ||
1060 | mdf |= MDF_CRASHED_PRIMARY; | ||
1061 | if (mdev->state.role == R_PRIMARY || | ||
1062 | (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) | ||
1063 | mdf |= MDF_PRIMARY_IND; | ||
1064 | if (mdev->state.conn > C_WF_REPORT_PARAMS) | ||
1065 | mdf |= MDF_CONNECTED_IND; | ||
1066 | if (mdev->state.disk > D_INCONSISTENT) | ||
1067 | mdf |= MDF_CONSISTENT; | ||
1068 | if (mdev->state.disk > D_OUTDATED) | ||
1069 | mdf |= MDF_WAS_UP_TO_DATE; | ||
1070 | if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) | ||
1071 | mdf |= MDF_PEER_OUT_DATED; | ||
1072 | if (mdf != mdev->ldev->md.flags) { | ||
1073 | mdev->ldev->md.flags = mdf; | ||
1074 | drbd_md_mark_dirty(mdev); | ||
1075 | } | ||
1076 | if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) | ||
1077 | drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); | ||
1078 | put_ldev(mdev); | ||
1079 | } | ||
1080 | |||
1081 | /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ | ||
1082 | if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && | ||
1083 | os.peer == R_SECONDARY && ns.peer == R_PRIMARY) | ||
1084 | set_bit(CONSIDER_RESYNC, &mdev->flags); | ||
1085 | |||
1086 | /* Receiver should clean up itself */ | ||
1087 | if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) | ||
1088 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1089 | |||
1090 | /* Now the receiver finished cleaning up itself, it should die */ | ||
1091 | if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) | ||
1092 | drbd_thread_stop_nowait(&mdev->tconn->receiver); | ||
1093 | |||
1094 | /* Upon network failure, we need to restart the receiver. */ | ||
1095 | if (os.conn > C_WF_CONNECTION && | ||
1096 | ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) | ||
1097 | drbd_thread_restart_nowait(&mdev->tconn->receiver); | ||
1098 | |||
1099 | /* Resume AL writing if we get a connection */ | ||
1100 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | ||
1101 | drbd_resume_al(mdev); | ||
1102 | |||
1103 | /* remember last attach time so request_timer_fn() won't | ||
1104 | * kill newly established sessions while we are still trying to thaw | ||
1105 | * previously frozen IO */ | ||
1106 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1107 | ns.disk > D_NEGOTIATING) | ||
1108 | mdev->last_reattach_jif = jiffies; | ||
1109 | |||
1110 | ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); | ||
1111 | if (ascw) { | ||
1112 | ascw->os = os; | ||
1113 | ascw->ns = ns; | ||
1114 | ascw->flags = flags; | ||
1115 | ascw->w.cb = w_after_state_ch; | ||
1116 | ascw->w.mdev = mdev; | ||
1117 | ascw->done = done; | ||
1118 | drbd_queue_work(&mdev->tconn->sender_work, &ascw->w); | ||
1119 | } else { | ||
1120 | dev_err(DEV, "Could not kmalloc an ascw\n"); | ||
1121 | } | ||
1122 | |||
1123 | return rv; | ||
1124 | } | ||
1125 | |||
1126 | static int w_after_state_ch(struct drbd_work *w, int unused) | ||
1127 | { | ||
1128 | struct after_state_chg_work *ascw = | ||
1129 | container_of(w, struct after_state_chg_work, w); | ||
1130 | struct drbd_conf *mdev = w->mdev; | ||
1131 | |||
1132 | after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); | ||
1133 | if (ascw->flags & CS_WAIT_COMPLETE) { | ||
1134 | D_ASSERT(ascw->done != NULL); | ||
1135 | complete(ascw->done); | ||
1136 | } | ||
1137 | kfree(ascw); | ||
1138 | |||
1139 | return 0; | ||
1140 | } | ||
1141 | |||
1142 | static void abw_start_sync(struct drbd_conf *mdev, int rv) | ||
1143 | { | ||
1144 | if (rv) { | ||
1145 | dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); | ||
1146 | _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); | ||
1147 | return; | ||
1148 | } | ||
1149 | |||
1150 | switch (mdev->state.conn) { | ||
1151 | case C_STARTING_SYNC_T: | ||
1152 | _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); | ||
1153 | break; | ||
1154 | case C_STARTING_SYNC_S: | ||
1155 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1156 | break; | ||
1157 | } | ||
1158 | } | ||
1159 | |||
1160 | int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | ||
1161 | int (*io_fn)(struct drbd_conf *), | ||
1162 | char *why, enum bm_flag flags) | ||
1163 | { | ||
1164 | int rv; | ||
1165 | |||
1166 | D_ASSERT(current == mdev->tconn->worker.task); | ||
1167 | |||
1168 | /* open coded non-blocking drbd_suspend_io(mdev); */ | ||
1169 | set_bit(SUSPEND_IO, &mdev->flags); | ||
1170 | |||
1171 | drbd_bm_lock(mdev, why, flags); | ||
1172 | rv = io_fn(mdev); | ||
1173 | drbd_bm_unlock(mdev); | ||
1174 | |||
1175 | drbd_resume_io(mdev); | ||
1176 | |||
1177 | return rv; | ||
1178 | } | ||
1179 | |||
1180 | /** | ||
1181 | * after_state_ch() - Perform after state change actions that may sleep | ||
1182 | * @mdev: DRBD device. | ||
1183 | * @os: old state. | ||
1184 | * @ns: new state. | ||
1185 | * @flags: Flags | ||
1186 | */ | ||
1187 | static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | ||
1188 | union drbd_state ns, enum chg_state_flags flags) | ||
1189 | { | ||
1190 | struct sib_info sib; | ||
1191 | |||
1192 | sib.sib_reason = SIB_STATE_CHANGE; | ||
1193 | sib.os = os; | ||
1194 | sib.ns = ns; | ||
1195 | |||
1196 | if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { | ||
1197 | clear_bit(CRASHED_PRIMARY, &mdev->flags); | ||
1198 | if (mdev->p_uuid) | ||
1199 | mdev->p_uuid[UI_FLAGS] &= ~((u64)2); | ||
1200 | } | ||
1201 | |||
1202 | /* Inform userspace about the change... */ | ||
1203 | drbd_bcast_event(mdev, &sib); | ||
1204 | |||
1205 | if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && | ||
1206 | (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) | ||
1207 | drbd_khelper(mdev, "pri-on-incon-degr"); | ||
1208 | |||
1209 | /* Here we have the actions that are performed after a | ||
1210 | state change. This function might sleep */ | ||
1211 | |||
1212 | if (ns.susp_nod) { | ||
1213 | struct drbd_tconn *tconn = mdev->tconn; | ||
1214 | enum drbd_req_event what = NOTHING; | ||
1215 | |||
1216 | spin_lock_irq(&tconn->req_lock); | ||
1217 | if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED) | ||
1218 | what = RESEND; | ||
1219 | |||
1220 | if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && | ||
1221 | conn_lowest_disk(tconn) > D_NEGOTIATING) | ||
1222 | what = RESTART_FROZEN_DISK_IO; | ||
1223 | |||
1224 | if (tconn->susp_nod && what != NOTHING) { | ||
1225 | _tl_restart(tconn, what); | ||
1226 | _conn_request_state(tconn, | ||
1227 | (union drbd_state) { { .susp_nod = 1 } }, | ||
1228 | (union drbd_state) { { .susp_nod = 0 } }, | ||
1229 | CS_VERBOSE); | ||
1230 | } | ||
1231 | spin_unlock_irq(&tconn->req_lock); | ||
1232 | } | ||
1233 | |||
1234 | if (ns.susp_fen) { | ||
1235 | struct drbd_tconn *tconn = mdev->tconn; | ||
1236 | |||
1237 | spin_lock_irq(&tconn->req_lock); | ||
1238 | if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) { | ||
1239 | /* case2: The connection was established again: */ | ||
1240 | struct drbd_conf *odev; | ||
1241 | int vnr; | ||
1242 | |||
1243 | rcu_read_lock(); | ||
1244 | idr_for_each_entry(&tconn->volumes, odev, vnr) | ||
1245 | clear_bit(NEW_CUR_UUID, &odev->flags); | ||
1246 | rcu_read_unlock(); | ||
1247 | _tl_restart(tconn, RESEND); | ||
1248 | _conn_request_state(tconn, | ||
1249 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1250 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1251 | CS_VERBOSE); | ||
1252 | } | ||
1253 | spin_unlock_irq(&tconn->req_lock); | ||
1254 | } | ||
1255 | |||
1256 | /* Became sync source. With protocol >= 96, we still need to send out | ||
1257 | * the sync uuid now. Need to do that before any drbd_send_state, or | ||
1258 | * the other side may go "paused sync" before receiving the sync uuids, | ||
1259 | * which is unexpected. */ | ||
1260 | if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && | ||
1261 | (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && | ||
1262 | mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) { | ||
1263 | drbd_gen_and_send_sync_uuid(mdev); | ||
1264 | put_ldev(mdev); | ||
1265 | } | ||
1266 | |||
1267 | /* Do not change the order of the if above and the two below... */ | ||
1268 | if (os.pdsk == D_DISKLESS && | ||
1269 | ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ | ||
1270 | /* we probably will start a resync soon. | ||
1271 | * make sure those things are properly reset. */ | ||
1272 | mdev->rs_total = 0; | ||
1273 | mdev->rs_failed = 0; | ||
1274 | atomic_set(&mdev->rs_pending_cnt, 0); | ||
1275 | drbd_rs_cancel_all(mdev); | ||
1276 | |||
1277 | drbd_send_uuids(mdev); | ||
1278 | drbd_send_state(mdev, ns); | ||
1279 | } | ||
1280 | /* No point in queuing send_bitmap if we don't have a connection | ||
1281 | * anymore, so check also the _current_ state, not only the new state | ||
1282 | * at the time this work was queued. */ | ||
1283 | if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && | ||
1284 | mdev->state.conn == C_WF_BITMAP_S) | ||
1285 | drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, | ||
1286 | "send_bitmap (WFBitMapS)", | ||
1287 | BM_LOCKED_TEST_ALLOWED); | ||
1288 | |||
1289 | /* Lost contact to peer's copy of the data */ | ||
1290 | if ((os.pdsk >= D_INCONSISTENT && | ||
1291 | os.pdsk != D_UNKNOWN && | ||
1292 | os.pdsk != D_OUTDATED) | ||
1293 | && (ns.pdsk < D_INCONSISTENT || | ||
1294 | ns.pdsk == D_UNKNOWN || | ||
1295 | ns.pdsk == D_OUTDATED)) { | ||
1296 | if (get_ldev(mdev)) { | ||
1297 | if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && | ||
1298 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1299 | if (drbd_suspended(mdev)) { | ||
1300 | set_bit(NEW_CUR_UUID, &mdev->flags); | ||
1301 | } else { | ||
1302 | drbd_uuid_new_current(mdev); | ||
1303 | drbd_send_uuids(mdev); | ||
1304 | } | ||
1305 | } | ||
1306 | put_ldev(mdev); | ||
1307 | } | ||
1308 | } | ||
1309 | |||
1310 | if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { | ||
1311 | if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && | ||
1312 | mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { | ||
1313 | drbd_uuid_new_current(mdev); | ||
1314 | drbd_send_uuids(mdev); | ||
1315 | } | ||
1316 | /* D_DISKLESS Peer becomes secondary */ | ||
1317 | if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) | ||
1318 | /* We may still be Primary ourselves. | ||
1319 | * No harm done if the bitmap still changes, | ||
1320 | * redirtied pages will follow later. */ | ||
1321 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1322 | "demote diskless peer", BM_LOCKED_SET_ALLOWED); | ||
1323 | put_ldev(mdev); | ||
1324 | } | ||
1325 | |||
1326 | /* Write out all changed bits on demote. | ||
1327 | * Though, no need to do that just yet | ||
1328 | * if there is a resync going on still */ | ||
1329 | if (os.role == R_PRIMARY && ns.role == R_SECONDARY && | ||
1330 | mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1331 | /* No changes to the bitmap expected this time, so assert that, | ||
1332 | * even though no harm was done if it did change. */ | ||
1333 | drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, | ||
1334 | "demote", BM_LOCKED_TEST_ALLOWED); | ||
1335 | put_ldev(mdev); | ||
1336 | } | ||
1337 | |||
1338 | /* Last part of the attaching process ... */ | ||
1339 | if (ns.conn >= C_CONNECTED && | ||
1340 | os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { | ||
1341 | drbd_send_sizes(mdev, 0, 0); /* to start sync... */ | ||
1342 | drbd_send_uuids(mdev); | ||
1343 | drbd_send_state(mdev, ns); | ||
1344 | } | ||
1345 | |||
1346 | /* We want to pause/continue resync, tell peer. */ | ||
1347 | if (ns.conn >= C_CONNECTED && | ||
1348 | ((os.aftr_isp != ns.aftr_isp) || | ||
1349 | (os.user_isp != ns.user_isp))) | ||
1350 | drbd_send_state(mdev, ns); | ||
1351 | |||
1352 | /* In case one of the isp bits got set, suspend other devices. */ | ||
1353 | if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && | ||
1354 | (ns.aftr_isp || ns.peer_isp || ns.user_isp)) | ||
1355 | suspend_other_sg(mdev); | ||
1356 | |||
1357 | /* Make sure the peer gets informed about any state | ||
1358 | changes (ISP bits) that happened while we were in WFReportParams. */ | ||
1359 | if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) | ||
1360 | drbd_send_state(mdev, ns); | ||
1361 | |||
1362 | if (os.conn != C_AHEAD && ns.conn == C_AHEAD) | ||
1363 | drbd_send_state(mdev, ns); | ||
1364 | |||
1365 | /* We are in the process of starting a full sync... */ | ||
1366 | if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || | ||
1367 | (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) | ||
1368 | /* no other bitmap changes expected during this phase */ | ||
1369 | drbd_queue_bitmap_io(mdev, | ||
1370 | &drbd_bmio_set_n_write, &abw_start_sync, | ||
1371 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | ||
1372 | |||
1373 | /* We are invalidating ourselves... */ | ||
1374 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1375 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1376 | /* other bitmap operation expected during this phase */ | ||
1377 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1378 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1379 | |||
1380 | /* first half of local IO error, failure to attach, | ||
1381 | * or administrative detach */ | ||
1382 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | ||
1383 | enum drbd_io_error_p eh = EP_PASS_ON; | ||
1384 | int was_io_error = 0; | ||
1385 | /* corresponding get_ldev was in __drbd_set_state, to serialize | ||
1386 | * our cleanup here with the transition to D_DISKLESS. | ||
1387 | * But it is still not safe to dereference ldev here, since | ||
1388 | * we might come from a failed Attach before ldev was set. */ | ||
1389 | if (mdev->ldev) { | ||
1390 | rcu_read_lock(); | ||
1391 | eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; | ||
1392 | rcu_read_unlock(); | ||
1393 | |||
1394 | was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); | ||
1395 | |||
1396 | if (was_io_error && eh == EP_CALL_HELPER) | ||
1397 | drbd_khelper(mdev, "local-io-error"); | ||
1398 | |||
1399 | /* Immediately allow completion of all application IO, | ||
1400 | * that waits for completion from the local disk, | ||
1401 | * if this was a force-detach due to disk_timeout | ||
1402 | * or administrator request (drbdsetup detach --force). | ||
1403 | * Do NOT abort otherwise. | ||
1404 | * Aborting local requests may cause serious problems, | ||
1405 | * if requests are completed to upper layers already, | ||
1406 | * and then later the already submitted local bio completes. | ||
1407 | * This can cause DMA into former bio pages that meanwhile | ||
1408 | * have been re-used for other things. | ||
1409 | * So aborting local requests may cause crashes, | ||
1410 | * or even worse, silent data corruption. | ||
1411 | */ | ||
1412 | if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) | ||
1413 | tl_abort_disk_io(mdev); | ||
1414 | |||
1415 | /* current state still has to be D_FAILED, | ||
1416 | * there is only one way out: to D_DISKLESS, | ||
1417 | * and that may only happen after our put_ldev below. */ | ||
1418 | if (mdev->state.disk != D_FAILED) | ||
1419 | dev_err(DEV, | ||
1420 | "ASSERT FAILED: disk is %s during detach\n", | ||
1421 | drbd_disk_str(mdev->state.disk)); | ||
1422 | |||
1423 | if (ns.conn >= C_CONNECTED) | ||
1424 | drbd_send_state(mdev, ns); | ||
1425 | |||
1426 | drbd_rs_cancel_all(mdev); | ||
1427 | |||
1428 | /* In case we want to get something to stable storage still, | ||
1429 | * this may be the last chance. | ||
1430 | * Following put_ldev may transition to D_DISKLESS. */ | ||
1431 | drbd_md_sync(mdev); | ||
1432 | } | ||
1433 | put_ldev(mdev); | ||
1434 | } | ||
1435 | |||
1436 | /* second half of local IO error, failure to attach, | ||
1437 | * or administrative detach, | ||
1438 | * after local_cnt references have reached zero again */ | ||
1439 | if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { | ||
1440 | /* We must still be diskless, | ||
1441 | * re-attach has to be serialized with this! */ | ||
1442 | if (mdev->state.disk != D_DISKLESS) | ||
1443 | dev_err(DEV, | ||
1444 | "ASSERT FAILED: disk is %s while going diskless\n", | ||
1445 | drbd_disk_str(mdev->state.disk)); | ||
1446 | |||
1447 | if (ns.conn >= C_CONNECTED) | ||
1448 | drbd_send_state(mdev, ns); | ||
1449 | /* corresponding get_ldev in __drbd_set_state | ||
1450 | * this may finally trigger drbd_ldev_destroy. */ | ||
1451 | put_ldev(mdev); | ||
1452 | } | ||
1453 | |||
1454 | /* Notify peer that I had a local IO error, and did not detach. */ | ||
1455 | if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) | ||
1456 | drbd_send_state(mdev, ns); | ||
1457 | |||
1458 | /* Disks got bigger while they were detached */ | ||
1459 | if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && | ||
1460 | test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { | ||
1461 | if (ns.conn == C_CONNECTED) | ||
1462 | resync_after_online_grow(mdev); | ||
1463 | } | ||
1464 | |||
1465 | /* A resync finished or aborted, wake paused devices... */ | ||
1466 | if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || | ||
1467 | (os.peer_isp && !ns.peer_isp) || | ||
1468 | (os.user_isp && !ns.user_isp)) | ||
1469 | resume_next_sg(mdev); | ||
1470 | |||
1471 | /* sync target done with resync. Explicitly notify peer, even though | ||
1472 | * it should (at least for non-empty resyncs) already know itself. */ | ||
1473 | if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) | ||
1474 | drbd_send_state(mdev, ns); | ||
1475 | |||
1476 | /* Verify finished, or reached stop sector. Peer did not know about | ||
1477 | * the stop sector, and we may even have changed the stop sector during | ||
1478 | * verify to interrupt/stop early. Send the new state. */ | ||
1479 | if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED | ||
1480 | && verify_can_do_stop_sector(mdev)) | ||
1481 | drbd_send_state(mdev, ns); | ||
1482 | |||
1483 | /* This triggers bitmap writeout of potentially still unwritten pages | ||
1484 | * if the resync finished cleanly, or aborted because of peer disk | ||
1485 | * failure, or because of connection loss. | ||
1486 | * For resync aborted because of local disk failure, we cannot do | ||
1487 | * any bitmap writeout anymore. | ||
1488 | * No harm done if some bits change during this phase. | ||
1489 | */ | ||
1490 | if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { | ||
1491 | drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, | ||
1492 | "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); | ||
1493 | put_ldev(mdev); | ||
1494 | } | ||
1495 | |||
1496 | if (ns.disk == D_DISKLESS && | ||
1497 | ns.conn == C_STANDALONE && | ||
1498 | ns.role == R_SECONDARY) { | ||
1499 | if (os.aftr_isp != ns.aftr_isp) | ||
1500 | resume_next_sg(mdev); | ||
1501 | } | ||
1502 | |||
1503 | drbd_md_sync(mdev); | ||
1504 | } | ||
1505 | |||
1506 | struct after_conn_state_chg_work { | ||
1507 | struct drbd_work w; | ||
1508 | enum drbd_conns oc; | ||
1509 | union drbd_state ns_min; | ||
1510 | union drbd_state ns_max; /* new, max state, over all mdevs */ | ||
1511 | enum chg_state_flags flags; | ||
1512 | }; | ||
1513 | |||
1514 | static int w_after_conn_state_ch(struct drbd_work *w, int unused) | ||
1515 | { | ||
1516 | struct after_conn_state_chg_work *acscw = | ||
1517 | container_of(w, struct after_conn_state_chg_work, w); | ||
1518 | struct drbd_tconn *tconn = w->tconn; | ||
1519 | enum drbd_conns oc = acscw->oc; | ||
1520 | union drbd_state ns_max = acscw->ns_max; | ||
1521 | struct drbd_conf *mdev; | ||
1522 | int vnr; | ||
1523 | |||
1524 | kfree(acscw); | ||
1525 | |||
1526 | /* Upon network configuration, we need to start the receiver */ | ||
1527 | if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) | ||
1528 | drbd_thread_start(&tconn->receiver); | ||
1529 | |||
1530 | if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { | ||
1531 | struct net_conf *old_conf; | ||
1532 | |||
1533 | mutex_lock(&tconn->conf_update); | ||
1534 | old_conf = tconn->net_conf; | ||
1535 | tconn->my_addr_len = 0; | ||
1536 | tconn->peer_addr_len = 0; | ||
1537 | rcu_assign_pointer(tconn->net_conf, NULL); | ||
1538 | conn_free_crypto(tconn); | ||
1539 | mutex_unlock(&tconn->conf_update); | ||
1540 | |||
1541 | synchronize_rcu(); | ||
1542 | kfree(old_conf); | ||
1543 | } | ||
1544 | |||
1545 | if (ns_max.susp_fen) { | ||
1546 | /* case1: The outdate peer handler is successful: */ | ||
1547 | if (ns_max.pdsk <= D_OUTDATED) { | ||
1548 | rcu_read_lock(); | ||
1549 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1550 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | ||
1551 | drbd_uuid_new_current(mdev); | ||
1552 | clear_bit(NEW_CUR_UUID, &mdev->flags); | ||
1553 | } | ||
1554 | } | ||
1555 | rcu_read_unlock(); | ||
1556 | spin_lock_irq(&tconn->req_lock); | ||
1557 | _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); | ||
1558 | _conn_request_state(tconn, | ||
1559 | (union drbd_state) { { .susp_fen = 1 } }, | ||
1560 | (union drbd_state) { { .susp_fen = 0 } }, | ||
1561 | CS_VERBOSE); | ||
1562 | spin_unlock_irq(&tconn->req_lock); | ||
1563 | } | ||
1564 | } | ||
1565 | kref_put(&tconn->kref, &conn_destroy); | ||
1566 | |||
1567 | conn_md_sync(tconn); | ||
1568 | |||
1569 | return 0; | ||
1570 | } | ||
1571 | |||
1572 | void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf) | ||
1573 | { | ||
1574 | enum chg_state_flags flags = ~0; | ||
1575 | struct drbd_conf *mdev; | ||
1576 | int vnr, first_vol = 1; | ||
1577 | union drbd_dev_state os, cs = { | ||
1578 | { .role = R_SECONDARY, | ||
1579 | .peer = R_UNKNOWN, | ||
1580 | .conn = tconn->cstate, | ||
1581 | .disk = D_DISKLESS, | ||
1582 | .pdsk = D_UNKNOWN, | ||
1583 | } }; | ||
1584 | |||
1585 | rcu_read_lock(); | ||
1586 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1587 | os = mdev->state; | ||
1588 | |||
1589 | if (first_vol) { | ||
1590 | cs = os; | ||
1591 | first_vol = 0; | ||
1592 | continue; | ||
1593 | } | ||
1594 | |||
1595 | if (cs.role != os.role) | ||
1596 | flags &= ~CS_DC_ROLE; | ||
1597 | |||
1598 | if (cs.peer != os.peer) | ||
1599 | flags &= ~CS_DC_PEER; | ||
1600 | |||
1601 | if (cs.conn != os.conn) | ||
1602 | flags &= ~CS_DC_CONN; | ||
1603 | |||
1604 | if (cs.disk != os.disk) | ||
1605 | flags &= ~CS_DC_DISK; | ||
1606 | |||
1607 | if (cs.pdsk != os.pdsk) | ||
1608 | flags &= ~CS_DC_PDSK; | ||
1609 | } | ||
1610 | rcu_read_unlock(); | ||
1611 | |||
1612 | *pf |= CS_DC_MASK; | ||
1613 | *pf &= flags; | ||
1614 | (*pcs).i = cs.i; | ||
1615 | } | ||
1616 | |||
1617 | static enum drbd_state_rv | ||
1618 | conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1619 | enum chg_state_flags flags) | ||
1620 | { | ||
1621 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1622 | union drbd_state ns, os; | ||
1623 | struct drbd_conf *mdev; | ||
1624 | int vnr; | ||
1625 | |||
1626 | rcu_read_lock(); | ||
1627 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1628 | os = drbd_read_state(mdev); | ||
1629 | ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); | ||
1630 | |||
1631 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1632 | ns.disk = os.disk; | ||
1633 | |||
1634 | if (ns.i == os.i) | ||
1635 | continue; | ||
1636 | |||
1637 | rv = is_valid_transition(os, ns); | ||
1638 | if (rv < SS_SUCCESS) | ||
1639 | break; | ||
1640 | |||
1641 | if (!(flags & CS_HARD)) { | ||
1642 | rv = is_valid_state(mdev, ns); | ||
1643 | if (rv < SS_SUCCESS) { | ||
1644 | if (is_valid_state(mdev, os) == rv) | ||
1645 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1646 | } else | ||
1647 | rv = is_valid_soft_transition(os, ns, tconn); | ||
1648 | } | ||
1649 | if (rv < SS_SUCCESS) | ||
1650 | break; | ||
1651 | } | ||
1652 | rcu_read_unlock(); | ||
1653 | |||
1654 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) | ||
1655 | print_st_err(mdev, os, ns, rv); | ||
1656 | |||
1657 | return rv; | ||
1658 | } | ||
1659 | |||
1660 | void | ||
1661 | conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1662 | union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) | ||
1663 | { | ||
1664 | union drbd_state ns, os, ns_max = { }; | ||
1665 | union drbd_state ns_min = { | ||
1666 | { .role = R_MASK, | ||
1667 | .peer = R_MASK, | ||
1668 | .conn = val.conn, | ||
1669 | .disk = D_MASK, | ||
1670 | .pdsk = D_MASK | ||
1671 | } }; | ||
1672 | struct drbd_conf *mdev; | ||
1673 | enum drbd_state_rv rv; | ||
1674 | int vnr, number_of_volumes = 0; | ||
1675 | |||
1676 | if (mask.conn == C_MASK) { | ||
1677 | /* remember last connect time so request_timer_fn() won't | ||
1678 | * kill newly established sessions while we are still trying to thaw | ||
1679 | * previously frozen IO */ | ||
1680 | if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) | ||
1681 | tconn->last_reconnect_jif = jiffies; | ||
1682 | |||
1683 | tconn->cstate = val.conn; | ||
1684 | } | ||
1685 | |||
1686 | rcu_read_lock(); | ||
1687 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1688 | number_of_volumes++; | ||
1689 | os = drbd_read_state(mdev); | ||
1690 | ns = apply_mask_val(os, mask, val); | ||
1691 | ns = sanitize_state(mdev, ns, NULL); | ||
1692 | |||
1693 | if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) | ||
1694 | ns.disk = os.disk; | ||
1695 | |||
1696 | rv = __drbd_set_state(mdev, ns, flags, NULL); | ||
1697 | if (rv < SS_SUCCESS) | ||
1698 | BUG(); | ||
1699 | |||
1700 | ns.i = mdev->state.i; | ||
1701 | ns_max.role = max_role(ns.role, ns_max.role); | ||
1702 | ns_max.peer = max_role(ns.peer, ns_max.peer); | ||
1703 | ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); | ||
1704 | ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); | ||
1705 | ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); | ||
1706 | |||
1707 | ns_min.role = min_role(ns.role, ns_min.role); | ||
1708 | ns_min.peer = min_role(ns.peer, ns_min.peer); | ||
1709 | ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); | ||
1710 | ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); | ||
1711 | ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); | ||
1712 | } | ||
1713 | rcu_read_unlock(); | ||
1714 | |||
1715 | if (number_of_volumes == 0) { | ||
1716 | ns_min = ns_max = (union drbd_state) { { | ||
1717 | .role = R_SECONDARY, | ||
1718 | .peer = R_UNKNOWN, | ||
1719 | .conn = val.conn, | ||
1720 | .disk = D_DISKLESS, | ||
1721 | .pdsk = D_UNKNOWN | ||
1722 | } }; | ||
1723 | } | ||
1724 | |||
1725 | ns_min.susp = ns_max.susp = tconn->susp; | ||
1726 | ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod; | ||
1727 | ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen; | ||
1728 | |||
1729 | *pns_min = ns_min; | ||
1730 | *pns_max = ns_max; | ||
1731 | } | ||
1732 | |||
1733 | static enum drbd_state_rv | ||
1734 | _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) | ||
1735 | { | ||
1736 | enum drbd_state_rv rv; | ||
1737 | |||
1738 | if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags)) | ||
1739 | return SS_CW_SUCCESS; | ||
1740 | |||
1741 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) | ||
1742 | return SS_CW_FAILED_BY_PEER; | ||
1743 | |||
1744 | rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; | ||
1745 | |||
1746 | if (rv == SS_UNKNOWN_ERROR) | ||
1747 | rv = conn_is_valid_transition(tconn, mask, val, 0); | ||
1748 | |||
1749 | if (rv == SS_SUCCESS) | ||
1750 | rv = SS_UNKNOWN_ERROR; /* continue waiting, otherwise fail. */ | ||
1751 | |||
1752 | return rv; | ||
1753 | } | ||
1754 | |||
1755 | enum drbd_state_rv | ||
1756 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1757 | enum chg_state_flags flags) | ||
1758 | { | ||
1759 | enum drbd_state_rv rv = SS_SUCCESS; | ||
1760 | struct after_conn_state_chg_work *acscw; | ||
1761 | enum drbd_conns oc = tconn->cstate; | ||
1762 | union drbd_state ns_max, ns_min, os; | ||
1763 | bool have_mutex = false; | ||
1764 | |||
1765 | if (mask.conn) { | ||
1766 | rv = is_valid_conn_transition(oc, val.conn); | ||
1767 | if (rv < SS_SUCCESS) | ||
1768 | goto abort; | ||
1769 | } | ||
1770 | |||
1771 | rv = conn_is_valid_transition(tconn, mask, val, flags); | ||
1772 | if (rv < SS_SUCCESS) | ||
1773 | goto abort; | ||
1774 | |||
1775 | if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && | ||
1776 | !(flags & (CS_LOCAL_ONLY | CS_HARD))) { | ||
1777 | |||
1778 | /* This will be a cluster-wide state change. | ||
1779 | * Need to give up the spinlock, grab the mutex, | ||
1780 | * then send the state change request, ... */ | ||
1781 | spin_unlock_irq(&tconn->req_lock); | ||
1782 | mutex_lock(&tconn->cstate_mutex); | ||
1783 | have_mutex = true; | ||
1784 | |||
1785 | set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1786 | if (conn_send_state_req(tconn, mask, val)) { | ||
1787 | /* sending failed. */ | ||
1788 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1789 | rv = SS_CW_FAILED_BY_PEER; | ||
1790 | /* need to re-acquire the spin lock, though */ | ||
1791 | goto abort_unlocked; | ||
1792 | } | ||
1793 | |||
1794 | if (val.conn == C_DISCONNECTING) | ||
1795 | set_bit(DISCONNECT_SENT, &tconn->flags); | ||
1796 | |||
1797 | /* ... and re-acquire the spinlock. | ||
1798 | * If _conn_rq_cond() returned >= SS_SUCCESS, we must call | ||
1799 | * conn_set_state() within the same spinlock. */ | ||
1800 | spin_lock_irq(&tconn->req_lock); | ||
1801 | wait_event_lock_irq(tconn->ping_wait, | ||
1802 | (rv = _conn_rq_cond(tconn, mask, val)), | ||
1803 | tconn->req_lock); | ||
1804 | clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); | ||
1805 | if (rv < SS_SUCCESS) | ||
1806 | goto abort; | ||
1807 | } | ||
1808 | |||
1809 | conn_old_common_state(tconn, &os, &flags); | ||
1810 | flags |= CS_DC_SUSP; | ||
1811 | conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags); | ||
1812 | conn_pr_state_change(tconn, os, ns_max, flags); | ||
1813 | |||
1814 | acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); | ||
1815 | if (acscw) { | ||
1816 | acscw->oc = os.conn; | ||
1817 | acscw->ns_min = ns_min; | ||
1818 | acscw->ns_max = ns_max; | ||
1819 | acscw->flags = flags; | ||
1820 | acscw->w.cb = w_after_conn_state_ch; | ||
1821 | kref_get(&tconn->kref); | ||
1822 | acscw->w.tconn = tconn; | ||
1823 | drbd_queue_work(&tconn->sender_work, &acscw->w); | ||
1824 | } else { | ||
1825 | conn_err(tconn, "Could not kmalloc an acscw\n"); | ||
1826 | } | ||
1827 | |||
1828 | abort: | ||
1829 | if (have_mutex) { | ||
1830 | /* mutex_unlock() "... must not be used in interrupt context.", | ||
1831 | * so give up the spinlock, then re-acquire it */ | ||
1832 | spin_unlock_irq(&tconn->req_lock); | ||
1833 | abort_unlocked: | ||
1834 | mutex_unlock(&tconn->cstate_mutex); | ||
1835 | spin_lock_irq(&tconn->req_lock); | ||
1836 | } | ||
1837 | if (rv < SS_SUCCESS && flags & CS_VERBOSE) { | ||
1838 | conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv)); | ||
1839 | conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i); | ||
1840 | conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); | ||
1841 | } | ||
1842 | return rv; | ||
1843 | } | ||
1844 | |||
1845 | enum drbd_state_rv | ||
1846 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
1847 | enum chg_state_flags flags) | ||
1848 | { | ||
1849 | enum drbd_state_rv rv; | ||
1850 | |||
1851 | spin_lock_irq(&tconn->req_lock); | ||
1852 | rv = _conn_request_state(tconn, mask, val, flags); | ||
1853 | spin_unlock_irq(&tconn->req_lock); | ||
1854 | |||
1855 | return rv; | ||
1856 | } | ||
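conn_request_state() above is simply the locked wrapper around _conn_request_state(): it takes tconn->req_lock, requests the change, and drops the lock again. As a hedged illustration of how a caller drives it (the example function and its exact flag combination are assumptions for this sketch, not part of this commit):

	/* Illustrative sketch only, not part of the commit: tear down the
	 * connection. CS_VERBOSE makes a refused transition show up in the
	 * kernel log; CS_ORDERED serializes the change with others. */
	static void example_disconnect(struct drbd_tconn *tconn)
	{
		enum drbd_state_rv rv;

		rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
					CS_VERBOSE | CS_ORDERED);
		if (rv < SS_SUCCESS)
			conn_err(tconn, "disconnect failed: %s\n",
				 drbd_set_st_err_str(rv));
	}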
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000000..a3c361bbc4b6 --- /dev/null +++ b/drivers/block/drbd/drbd_state.h | |||
@@ -0,0 +1,161 @@ | |||
1 | #ifndef DRBD_STATE_H | ||
2 | #define DRBD_STATE_H | ||
3 | |||
4 | struct drbd_conf; | ||
5 | struct drbd_tconn; | ||
6 | |||
7 | /** | ||
8 | * DOC: DRBD State macros | ||
9 | * | ||
10 | * These macros are used to express state changes in easily readable form. | ||
11 | * | ||
12 | * The NS macros expand to a mask and a value that can be bit-ORed onto the | ||
13 | * current state as soon as the spinlock (req_lock) has been taken. | ||
14 | * | ||
15 | * The _NS macros are used for state functions that get called with the | ||
16 | * spinlock. These macros expand directly to the new state value. | ||
17 | * | ||
18 | * Besides the basic forms NS() and _NS() additional _?NS[23] are defined | ||
19 | * to express state changes that affect more than one aspect of the state. | ||
20 | * | ||
21 | * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) | ||
22 | * Means that the network connection was established and that the peer | ||
23 | * is in secondary role. | ||
24 | */ | ||
25 | #define role_MASK R_MASK | ||
26 | #define peer_MASK R_MASK | ||
27 | #define disk_MASK D_MASK | ||
28 | #define pdsk_MASK D_MASK | ||
29 | #define conn_MASK C_MASK | ||
30 | #define susp_MASK 1 | ||
31 | #define user_isp_MASK 1 | ||
32 | #define aftr_isp_MASK 1 | ||
33 | #define susp_nod_MASK 1 | ||
34 | #define susp_fen_MASK 1 | ||
35 | |||
36 | #define NS(T, S) \ | ||
37 | ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ | ||
38 | ({ union drbd_state val; val.i = 0; val.T = (S); val; }) | ||
39 | #define NS2(T1, S1, T2, S2) \ | ||
40 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
41 | mask.T2 = T2##_MASK; mask; }), \ | ||
42 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
43 | val.T2 = (S2); val; }) | ||
44 | #define NS3(T1, S1, T2, S2, T3, S3) \ | ||
45 | ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ | ||
46 | mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ | ||
47 | ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ | ||
48 | val.T2 = (S2); val.T3 = (S3); val; }) | ||
49 | |||
50 | #define _NS(D, T, S) \ | ||
51 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) | ||
52 | #define _NS2(D, T1, S1, T2, S2) \ | ||
53 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
54 | __ns.T2 = (S2); __ns; }) | ||
55 | #define _NS3(D, T1, S1, T2, S2, T3, S3) \ | ||
56 | D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ | ||
57 | __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) | ||
58 | |||
59 | enum chg_state_flags { | ||
60 | CS_HARD = 1 << 0, | ||
61 | CS_VERBOSE = 1 << 1, | ||
62 | CS_WAIT_COMPLETE = 1 << 2, | ||
63 | CS_SERIALIZE = 1 << 3, | ||
64 | CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, | ||
65 | CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ | ||
66 | CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ | ||
67 | CS_DC_PEER = 1 << 6, | ||
68 | CS_DC_CONN = 1 << 7, | ||
69 | CS_DC_DISK = 1 << 8, | ||
70 | CS_DC_PDSK = 1 << 9, | ||
71 | CS_DC_SUSP = 1 << 10, | ||
72 | CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, | ||
73 | CS_IGN_OUTD_FAIL = 1 << 11, | ||
74 | }; | ||
75 | |||
76 | /* drbd_dev_state and drbd_state are different types. This is to stress the | ||
77 | small difference. There is no suspended flag (.susp), and no suspended | ||
78 | while fence handler runs flag (susp_fen). */ | ||
79 | union drbd_dev_state { | ||
80 | struct { | ||
81 | #if defined(__LITTLE_ENDIAN_BITFIELD) | ||
82 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
83 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
84 | unsigned conn:5 ; /* 17/32 cstates */ | ||
85 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
86 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
87 | unsigned _unused:1 ; | ||
88 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
89 | unsigned peer_isp:1 ; | ||
90 | unsigned user_isp:1 ; | ||
91 | unsigned _pad:11; /* 0 unused */ | ||
92 | #elif defined(__BIG_ENDIAN_BITFIELD) | ||
93 | unsigned _pad:11; | ||
94 | unsigned user_isp:1 ; | ||
95 | unsigned peer_isp:1 ; | ||
96 | unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ | ||
97 | unsigned _unused:1 ; | ||
98 | unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
99 | unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ | ||
100 | unsigned conn:5 ; /* 17/32 cstates */ | ||
101 | unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ | ||
102 | unsigned role:2 ; /* 3/4 primary/secondary/unknown */ | ||
103 | #else | ||
104 | # error "this endianness is not supported" | ||
105 | #endif | ||
106 | }; | ||
107 | unsigned int i; | ||
108 | }; | ||
109 | |||
110 | extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, | ||
111 | enum chg_state_flags f, | ||
112 | union drbd_state mask, | ||
113 | union drbd_state val); | ||
114 | extern void drbd_force_state(struct drbd_conf *, union drbd_state, | ||
115 | union drbd_state); | ||
116 | extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, | ||
117 | union drbd_state, | ||
118 | union drbd_state, | ||
119 | enum chg_state_flags); | ||
120 | extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, | ||
121 | enum chg_state_flags, | ||
122 | struct completion *done); | ||
123 | extern void print_st_err(struct drbd_conf *, union drbd_state, | ||
124 | union drbd_state, int); | ||
125 | |||
126 | enum drbd_state_rv | ||
127 | _conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
128 | enum chg_state_flags flags); | ||
129 | |||
130 | enum drbd_state_rv | ||
131 | conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, | ||
132 | enum chg_state_flags flags); | ||
133 | |||
134 | extern void drbd_resume_al(struct drbd_conf *mdev); | ||
135 | extern bool conn_all_vols_unconf(struct drbd_tconn *tconn); | ||
136 | |||
137 | /** | ||
138 | * drbd_request_state() - Request a state change | ||
139 | * @mdev: DRBD device. | ||
140 | * @mask: mask of state bits to change. | ||
141 | * @val: value of new state bits. | ||
142 | * | ||
143 | * This is the most graceful way of requesting a state change. It is | ||
144 | * quite verbose in case the state change is not possible, and all those | ||
145 | * state changes are globally serialized. | ||
146 | */ | ||
147 | static inline int drbd_request_state(struct drbd_conf *mdev, | ||
148 | union drbd_state mask, | ||
149 | union drbd_state val) | ||
150 | { | ||
151 | return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); | ||
152 | } | ||
153 | |||
154 | enum drbd_role conn_highest_role(struct drbd_tconn *tconn); | ||
155 | enum drbd_role conn_highest_peer(struct drbd_tconn *tconn); | ||
156 | enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn); | ||
157 | enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn); | ||
158 | enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn); | ||
159 | enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn); | ||
160 | |||
161 | #endif | ||
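drbd_request_state() together with the NS()/NS2() macros from this header is the usual way a caller asks for a state change. A brief hedged usage sketch (the example function is an assumption for illustration, not part of the header):

	/* Sketch: each NS macro expands to a (mask, value) pair, so a single
	 * call can change one or several state fields in one request. */
	static void example_state_requests(struct drbd_conf *mdev)
	{
		/* change only the local role */
		drbd_request_state(mdev, NS(role, R_SECONDARY));

		/* change disk and peer-disk state in one request */
		drbd_request_state(mdev, NS2(disk, D_OUTDATED, pdsk, D_UP_TO_DATE));
	}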
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index c44a2a602772..9a664bd27404 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { | |||
89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | 89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", |
90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | 90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", |
91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | 91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", |
92 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", | ||
92 | }; | 93 | }; |
93 | 94 | ||
94 | const char *drbd_conn_str(enum drbd_conns s) | 95 | const char *drbd_conn_str(enum drbd_conns s) |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 6bce2cc179d4..424dc7bdf9b7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -38,16 +38,13 @@ | |||
38 | #include "drbd_int.h" | 38 | #include "drbd_int.h" |
39 | #include "drbd_req.h" | 39 | #include "drbd_req.h" |
40 | 40 | ||
41 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); | 41 | static int w_make_ov_request(struct drbd_work *w, int cancel); |
42 | static int w_make_resync_request(struct drbd_conf *mdev, | ||
43 | struct drbd_work *w, int cancel); | ||
44 | |||
45 | 42 | ||
46 | 43 | ||
47 | /* endio handlers: | 44 | /* endio handlers: |
48 | * drbd_md_io_complete (defined here) | 45 | * drbd_md_io_complete (defined here) |
49 | * drbd_endio_pri (defined here) | 46 | * drbd_request_endio (defined here) |
50 | * drbd_endio_sec (defined here) | 47 | * drbd_peer_request_endio (defined here) |
51 | * bm_async_io_complete (defined in drbd_bitmap.c) | 48 | * bm_async_io_complete (defined in drbd_bitmap.c) |
52 | * | 49 | * |
53 | * For all these callbacks, note the following: | 50 | * For all these callbacks, note the following: |
@@ -60,7 +57,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
60 | 57 | ||
61 | /* About the global_state_lock | 58 | /* About the global_state_lock |
62 | Each state transition on a device holds a read lock. In case we have | 59 | Each state transition on a device holds a read lock. In case we have |
63 | to evaluate the sync after dependencies, we grab a write lock, because | 60 | to evaluate the resync after dependencies, we grab a write lock, because |
64 | we need stable states on all devices for that. */ | 61 | we need stable states on all devices for that. */ |
65 | rwlock_t global_state_lock; | 62 | rwlock_t global_state_lock; |
66 | 63 | ||
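The comment above spells out the locking rule for global_state_lock: shared for a single device's state transition, exclusive when the resync-after dependencies of all devices must be evaluated against stable states. A hedged sketch of that discipline, with hypothetical helper names (the real call sites live elsewhere in the state machine):

	/* Sketch only: not the actual call sites, just the intended pattern. */
	static void example_single_state_transition(void)
	{
		read_lock_irq(&global_state_lock);
		/* ... apply a state change on one device ... */
		read_unlock_irq(&global_state_lock);
	}

	static void example_check_resync_after(void)
	{
		write_lock_irq(&global_state_lock);
		/* ... need stable states on all devices here ... */
		write_unlock_irq(&global_state_lock);
	}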
@@ -98,97 +95,93 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
98 | /* reads on behalf of the partner, | 95 | /* reads on behalf of the partner, |
99 | * "submitted" by the receiver | 96 | * "submitted" by the receiver |
100 | */ | 97 | */ |
101 | void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) | 98 | void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
102 | { | 99 | { |
103 | unsigned long flags = 0; | 100 | unsigned long flags = 0; |
104 | struct drbd_conf *mdev = e->mdev; | 101 | struct drbd_conf *mdev = peer_req->w.mdev; |
105 | |||
106 | D_ASSERT(e->block_id != ID_VACANT); | ||
107 | 102 | ||
108 | spin_lock_irqsave(&mdev->req_lock, flags); | 103 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
109 | mdev->read_cnt += e->size >> 9; | 104 | mdev->read_cnt += peer_req->i.size >> 9; |
110 | list_del(&e->w.list); | 105 | list_del(&peer_req->w.list); |
111 | if (list_empty(&mdev->read_ee)) | 106 | if (list_empty(&mdev->read_ee)) |
112 | wake_up(&mdev->ee_wait); | 107 | wake_up(&mdev->ee_wait); |
113 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 108 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
114 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | 109 | __drbd_chk_io_error(mdev, DRBD_READ_ERROR); |
115 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 110 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
116 | 111 | ||
117 | drbd_queue_work(&mdev->data.work, &e->w); | 112 | drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w); |
118 | put_ldev(mdev); | 113 | put_ldev(mdev); |
119 | } | 114 | } |
120 | 115 | ||
121 | /* writes on behalf of the partner, or resync writes, | 116 | /* writes on behalf of the partner, or resync writes, |
122 | * "submitted" by the receiver, final stage. */ | 117 | * "submitted" by the receiver, final stage. */ |
123 | static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) | 118 | static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) |
124 | { | 119 | { |
125 | unsigned long flags = 0; | 120 | unsigned long flags = 0; |
126 | struct drbd_conf *mdev = e->mdev; | 121 | struct drbd_conf *mdev = peer_req->w.mdev; |
127 | sector_t e_sector; | 122 | struct drbd_interval i; |
128 | int do_wake; | 123 | int do_wake; |
129 | int is_syncer_req; | 124 | u64 block_id; |
130 | int do_al_complete_io; | 125 | int do_al_complete_io; |
131 | 126 | ||
132 | D_ASSERT(e->block_id != ID_VACANT); | 127 | /* after we moved peer_req to done_ee, |
133 | |||
134 | /* after we moved e to done_ee, | ||
135 | * we may no longer access it, | 128 | * we may no longer access it, |
136 | * it may be freed/reused already! | 129 | * it may be freed/reused already! |
137 | * (as soon as we release the req_lock) */ | 130 | * (as soon as we release the req_lock) */ |
138 | e_sector = e->sector; | 131 | i = peer_req->i; |
139 | do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; | 132 | do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; |
140 | is_syncer_req = is_syncer_block_id(e->block_id); | 133 | block_id = peer_req->block_id; |
141 | 134 | ||
142 | spin_lock_irqsave(&mdev->req_lock, flags); | 135 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
143 | mdev->writ_cnt += e->size >> 9; | 136 | mdev->writ_cnt += peer_req->i.size >> 9; |
144 | list_del(&e->w.list); /* has been on active_ee or sync_ee */ | 137 | list_move_tail(&peer_req->w.list, &mdev->done_ee); |
145 | list_add_tail(&e->w.list, &mdev->done_ee); | ||
146 | 138 | ||
147 | /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, | 139 | /* |
148 | * neither did we wake possibly waiting conflicting requests. | 140 | * Do not remove from the write_requests tree here: we did not send the |
149 | * done from "drbd_process_done_ee" within the appropriate w.cb | 141 | * Ack yet and did not wake possibly waiting conflicting requests. |
150 | * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ | 142 | * Removed from the tree from "drbd_process_done_ee" within the |
143 | * appropriate w.cb (e_end_block/e_end_resync_block) or from | ||
144 | * _drbd_clear_done_ee. | ||
145 | */ | ||
151 | 146 | ||
152 | do_wake = is_syncer_req | 147 | do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); |
153 | ? list_empty(&mdev->sync_ee) | ||
154 | : list_empty(&mdev->active_ee); | ||
155 | 148 | ||
156 | if (test_bit(__EE_WAS_ERROR, &e->flags)) | 149 | if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) |
157 | __drbd_chk_io_error(mdev, DRBD_IO_ERROR); | 150 | __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); |
158 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 151 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
159 | 152 | ||
160 | if (is_syncer_req) | 153 | if (block_id == ID_SYNCER) |
161 | drbd_rs_complete_io(mdev, e_sector); | 154 | drbd_rs_complete_io(mdev, i.sector); |
162 | 155 | ||
163 | if (do_wake) | 156 | if (do_wake) |
164 | wake_up(&mdev->ee_wait); | 157 | wake_up(&mdev->ee_wait); |
165 | 158 | ||
166 | if (do_al_complete_io) | 159 | if (do_al_complete_io) |
167 | drbd_al_complete_io(mdev, e_sector); | 160 | drbd_al_complete_io(mdev, &i); |
168 | 161 | ||
169 | wake_asender(mdev); | 162 | wake_asender(mdev->tconn); |
170 | put_ldev(mdev); | 163 | put_ldev(mdev); |
171 | } | 164 | } |
172 | 165 | ||
173 | /* writes on behalf of the partner, or resync writes, | 166 | /* writes on behalf of the partner, or resync writes, |
174 | * "submitted" by the receiver. | 167 | * "submitted" by the receiver. |
175 | */ | 168 | */ |
176 | void drbd_endio_sec(struct bio *bio, int error) | 169 | void drbd_peer_request_endio(struct bio *bio, int error) |
177 | { | 170 | { |
178 | struct drbd_epoch_entry *e = bio->bi_private; | 171 | struct drbd_peer_request *peer_req = bio->bi_private; |
179 | struct drbd_conf *mdev = e->mdev; | 172 | struct drbd_conf *mdev = peer_req->w.mdev; |
180 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 173 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
181 | int is_write = bio_data_dir(bio) == WRITE; | 174 | int is_write = bio_data_dir(bio) == WRITE; |
182 | 175 | ||
183 | if (error && __ratelimit(&drbd_ratelimit_state)) | 176 | if (error && __ratelimit(&drbd_ratelimit_state)) |
184 | dev_warn(DEV, "%s: error=%d s=%llus\n", | 177 | dev_warn(DEV, "%s: error=%d s=%llus\n", |
185 | is_write ? "write" : "read", error, | 178 | is_write ? "write" : "read", error, |
186 | (unsigned long long)e->sector); | 179 | (unsigned long long)peer_req->i.sector); |
187 | if (!error && !uptodate) { | 180 | if (!error && !uptodate) { |
188 | if (__ratelimit(&drbd_ratelimit_state)) | 181 | if (__ratelimit(&drbd_ratelimit_state)) |
189 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", | 182 | dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", |
190 | is_write ? "write" : "read", | 183 | is_write ? "write" : "read", |
191 | (unsigned long long)e->sector); | 184 | (unsigned long long)peer_req->i.sector); |
192 | /* strange behavior of some lower level drivers... | 185 | /* strange behavior of some lower level drivers... |
193 | * fail the request by clearing the uptodate flag, | 186 | * fail the request by clearing the uptodate flag, |
194 | * but do not return any error?! */ | 187 | * but do not return any error?! */ |
@@ -196,24 +189,24 @@ void drbd_endio_sec(struct bio *bio, int error) | |||
196 | } | 189 | } |
197 | 190 | ||
198 | if (error) | 191 | if (error) |
199 | set_bit(__EE_WAS_ERROR, &e->flags); | 192 | set_bit(__EE_WAS_ERROR, &peer_req->flags); |
200 | 193 | ||
201 | bio_put(bio); /* no need for the bio anymore */ | 194 | bio_put(bio); /* no need for the bio anymore */ |
202 | if (atomic_dec_and_test(&e->pending_bios)) { | 195 | if (atomic_dec_and_test(&peer_req->pending_bios)) { |
203 | if (is_write) | 196 | if (is_write) |
204 | drbd_endio_write_sec_final(e); | 197 | drbd_endio_write_sec_final(peer_req); |
205 | else | 198 | else |
206 | drbd_endio_read_sec_final(e); | 199 | drbd_endio_read_sec_final(peer_req); |
207 | } | 200 | } |
208 | } | 201 | } |
209 | 202 | ||
210 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request | 203 | /* read, readA or write requests on R_PRIMARY coming from drbd_make_request |
211 | */ | 204 | */ |
212 | void drbd_endio_pri(struct bio *bio, int error) | 205 | void drbd_request_endio(struct bio *bio, int error) |
213 | { | 206 | { |
214 | unsigned long flags; | 207 | unsigned long flags; |
215 | struct drbd_request *req = bio->bi_private; | 208 | struct drbd_request *req = bio->bi_private; |
216 | struct drbd_conf *mdev = req->mdev; | 209 | struct drbd_conf *mdev = req->w.mdev; |
217 | struct bio_and_error m; | 210 | struct bio_and_error m; |
218 | enum drbd_req_event what; | 211 | enum drbd_req_event what; |
219 | int uptodate = bio_flagged(bio, BIO_UPTODATE); | 212 | int uptodate = bio_flagged(bio, BIO_UPTODATE); |
@@ -227,53 +220,72 @@ void drbd_endio_pri(struct bio *bio, int error) | |||
227 | error = -EIO; | 220 | error = -EIO; |
228 | } | 221 | } |
229 | 222 | ||
223 | |||
224 | /* If this request was aborted locally before, | ||
225 | * but now was completed "successfully", | ||
226 | * chances are that this caused arbitrary data corruption. | ||
227 | * | ||
228 | * "aborting" requests, or force-detaching the disk, is intended for | ||
230 | * completely blocked/hung local backing devices which no longer | ||
230 | * complete requests at all, not even do error completions. In this | ||
231 | * situation, usually a hard-reset and failover is the only way out. | ||
232 | * | ||
233 | * By "aborting", basically faking a local error-completion, | ||
235 | * we allow for a more graceful switchover by cleanly migrating services. | ||
235 | * Still the affected node has to be rebooted "soon". | ||
236 | * | ||
237 | * By completing these requests, we allow the upper layers to re-use | ||
238 | * the associated data pages. | ||
239 | * | ||
240 | * If later the local backing device "recovers", and now DMAs some data | ||
241 | * from disk into the original request pages, in the best case it will | ||
242 | * just put random data into unused pages; but typically it will corrupt | ||
243 | * meanwhile completely unrelated data, causing all sorts of damage. | ||
244 | * | ||
245 | * Which means delayed successful completion, | ||
246 | * especially for READ requests, | ||
247 | * is a reason to panic(). | ||
248 | * | ||
249 | * We assume that a delayed *error* completion is OK, | ||
250 | * though we still will complain noisily about it. | ||
251 | */ | ||
252 | if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { | ||
253 | if (__ratelimit(&drbd_ratelimit_state)) | ||
254 | dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); | ||
255 | |||
256 | if (!error) | ||
257 | panic("possible random memory corruption caused by delayed completion of aborted local request\n"); | ||
258 | } | ||
259 | |||
230 | /* to avoid recursion in __req_mod */ | 260 | /* to avoid recursion in __req_mod */ |
231 | if (unlikely(error)) { | 261 | if (unlikely(error)) { |
232 | what = (bio_data_dir(bio) == WRITE) | 262 | what = (bio_data_dir(bio) == WRITE) |
233 | ? write_completed_with_error | 263 | ? WRITE_COMPLETED_WITH_ERROR |
234 | : (bio_rw(bio) == READ) | 264 | : (bio_rw(bio) == READ) |
235 | ? read_completed_with_error | 265 | ? READ_COMPLETED_WITH_ERROR |
236 | : read_ahead_completed_with_error; | 266 | : READ_AHEAD_COMPLETED_WITH_ERROR; |
237 | } else | 267 | } else |
238 | what = completed_ok; | 268 | what = COMPLETED_OK; |
239 | 269 | ||
240 | bio_put(req->private_bio); | 270 | bio_put(req->private_bio); |
241 | req->private_bio = ERR_PTR(error); | 271 | req->private_bio = ERR_PTR(error); |
242 | 272 | ||
243 | /* not req_mod(), we need irqsave here! */ | 273 | /* not req_mod(), we need irqsave here! */ |
244 | spin_lock_irqsave(&mdev->req_lock, flags); | 274 | spin_lock_irqsave(&mdev->tconn->req_lock, flags); |
245 | __req_mod(req, what, &m); | 275 | __req_mod(req, what, &m); |
246 | spin_unlock_irqrestore(&mdev->req_lock, flags); | 276 | spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); |
247 | put_ldev(mdev); | 277 | put_ldev(mdev); |
248 | 278 | ||
249 | if (m.bio) | 279 | if (m.bio) |
250 | complete_master_bio(mdev, &m); | 280 | complete_master_bio(mdev, &m); |
251 | } | 281 | } |
252 | 282 | ||
253 | int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 283 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, |
254 | { | 284 | struct drbd_peer_request *peer_req, void *digest) |
255 | struct drbd_request *req = container_of(w, struct drbd_request, w); | ||
256 | |||
257 | /* We should not detach for read io-error, | ||
258 | * but try to WRITE the P_DATA_REPLY to the failed location, | ||
259 | * to give the disk the chance to relocate that block */ | ||
260 | |||
261 | spin_lock_irq(&mdev->req_lock); | ||
262 | if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { | ||
263 | _req_mod(req, read_retry_remote_canceled); | ||
264 | spin_unlock_irq(&mdev->req_lock); | ||
265 | return 1; | ||
266 | } | ||
267 | spin_unlock_irq(&mdev->req_lock); | ||
268 | |||
269 | return w_send_read_req(mdev, w, 0); | ||
270 | } | ||
271 | |||
272 | void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) | ||
273 | { | 285 | { |
274 | struct hash_desc desc; | 286 | struct hash_desc desc; |
275 | struct scatterlist sg; | 287 | struct scatterlist sg; |
276 | struct page *page = e->pages; | 288 | struct page *page = peer_req->pages; |
277 | struct page *tmp; | 289 | struct page *tmp; |
278 | unsigned len; | 290 | unsigned len; |
279 | 291 | ||
@@ -290,7 +302,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e | |||
290 | page = tmp; | 302 | page = tmp; |
291 | } | 303 | } |
292 | /* and now the last, possibly only partially used page */ | 304 | /* and now the last, possibly only partially used page */ |
293 | len = e->size & (PAGE_SIZE - 1); | 305 | len = peer_req->i.size & (PAGE_SIZE - 1); |
294 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); | 306 | sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); |
295 | crypto_hash_update(&desc, &sg, sg.length); | 307 | crypto_hash_update(&desc, &sg, sg.length); |
296 | crypto_hash_final(&desc, digest); | 308 | crypto_hash_final(&desc, digest); |
@@ -316,59 +328,58 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * | |||
316 | crypto_hash_final(&desc, digest); | 328 | crypto_hash_final(&desc, digest); |
317 | } | 329 | } |
318 | 330 | ||
319 | /* TODO merge common code with w_e_end_ov_req */ | 331 | /* MAYBE merge common code with w_e_end_ov_req */ |
320 | int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 332 | static int w_e_send_csum(struct drbd_work *w, int cancel) |
321 | { | 333 | { |
322 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 334 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
335 | struct drbd_conf *mdev = w->mdev; | ||
323 | int digest_size; | 336 | int digest_size; |
324 | void *digest; | 337 | void *digest; |
325 | int ok = 1; | 338 | int err = 0; |
326 | |||
327 | D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); | ||
328 | 339 | ||
329 | if (unlikely(cancel)) | 340 | if (unlikely(cancel)) |
330 | goto out; | 341 | goto out; |
331 | 342 | ||
332 | if (likely((e->flags & EE_WAS_ERROR) != 0)) | 343 | if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) |
333 | goto out; | 344 | goto out; |
334 | 345 | ||
335 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 346 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
336 | digest = kmalloc(digest_size, GFP_NOIO); | 347 | digest = kmalloc(digest_size, GFP_NOIO); |
337 | if (digest) { | 348 | if (digest) { |
338 | sector_t sector = e->sector; | 349 | sector_t sector = peer_req->i.sector; |
339 | unsigned int size = e->size; | 350 | unsigned int size = peer_req->i.size; |
340 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 351 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
341 | /* Free e and pages before send. | 352 | /* Free peer_req and pages before send. |
342 | * In case we block on congestion, we could otherwise run into | 353 | * In case we block on congestion, we could otherwise run into |
343 | * some distributed deadlock, if the other side blocks on | 354 | * some distributed deadlock, if the other side blocks on |
344 | * congestion as well, because our receiver blocks in | 355 | * congestion as well, because our receiver blocks in |
345 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 356 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
346 | drbd_free_ee(mdev, e); | 357 | drbd_free_peer_req(mdev, peer_req); |
347 | e = NULL; | 358 | peer_req = NULL; |
348 | inc_rs_pending(mdev); | 359 | inc_rs_pending(mdev); |
349 | ok = drbd_send_drequest_csum(mdev, sector, size, | 360 | err = drbd_send_drequest_csum(mdev, sector, size, |
350 | digest, digest_size, | 361 | digest, digest_size, |
351 | P_CSUM_RS_REQUEST); | 362 | P_CSUM_RS_REQUEST); |
352 | kfree(digest); | 363 | kfree(digest); |
353 | } else { | 364 | } else { |
354 | dev_err(DEV, "kmalloc() of digest failed.\n"); | 365 | dev_err(DEV, "kmalloc() of digest failed.\n"); |
355 | ok = 0; | 366 | err = -ENOMEM; |
356 | } | 367 | } |
357 | 368 | ||
358 | out: | 369 | out: |
359 | if (e) | 370 | if (peer_req) |
360 | drbd_free_ee(mdev, e); | 371 | drbd_free_peer_req(mdev, peer_req); |
361 | 372 | ||
362 | if (unlikely(!ok)) | 373 | if (unlikely(err)) |
363 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); | 374 | dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); |
364 | return ok; | 375 | return err; |
365 | } | 376 | } |
366 | 377 | ||
367 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) | 378 | #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) |
368 | 379 | ||
369 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | 380 | static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) |
370 | { | 381 | { |
371 | struct drbd_epoch_entry *e; | 382 | struct drbd_peer_request *peer_req; |
372 | 383 | ||
373 | if (!get_ldev(mdev)) | 384 | if (!get_ldev(mdev)) |
374 | return -EIO; | 385 | return -EIO; |
@@ -378,45 +389,47 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | |||
378 | 389 | ||
379 | /* GFP_TRY, because if there is no memory available right now, this may | 390 | /* GFP_TRY, because if there is no memory available right now, this may |
380 | * be rescheduled for later. It is "only" background resync, after all. */ | 391 | * be rescheduled for later. It is "only" background resync, after all. */ |
381 | e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); | 392 | peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector, |
382 | if (!e) | 393 | size, GFP_TRY); |
394 | if (!peer_req) | ||
383 | goto defer; | 395 | goto defer; |
384 | 396 | ||
385 | e->w.cb = w_e_send_csum; | 397 | peer_req->w.cb = w_e_send_csum; |
386 | spin_lock_irq(&mdev->req_lock); | 398 | spin_lock_irq(&mdev->tconn->req_lock); |
387 | list_add(&e->w.list, &mdev->read_ee); | 399 | list_add(&peer_req->w.list, &mdev->read_ee); |
388 | spin_unlock_irq(&mdev->req_lock); | 400 | spin_unlock_irq(&mdev->tconn->req_lock); |
389 | 401 | ||
390 | atomic_add(size >> 9, &mdev->rs_sect_ev); | 402 | atomic_add(size >> 9, &mdev->rs_sect_ev); |
391 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) | 403 | if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0) |
392 | return 0; | 404 | return 0; |
393 | 405 | ||
394 | /* If it failed because of ENOMEM, retry should help. If it failed | 406 | /* If it failed because of ENOMEM, retry should help. If it failed |
395 | * because bio_add_page failed (probably broken lower level driver), | 407 | * because bio_add_page failed (probably broken lower level driver), |
396 | * retry may or may not help. | 408 | * retry may or may not help. |
397 | * If it does not, you may need to force disconnect. */ | 409 | * If it does not, you may need to force disconnect. */ |
398 | spin_lock_irq(&mdev->req_lock); | 410 | spin_lock_irq(&mdev->tconn->req_lock); |
399 | list_del(&e->w.list); | 411 | list_del(&peer_req->w.list); |
400 | spin_unlock_irq(&mdev->req_lock); | 412 | spin_unlock_irq(&mdev->tconn->req_lock); |
401 | 413 | ||
402 | drbd_free_ee(mdev, e); | 414 | drbd_free_peer_req(mdev, peer_req); |
403 | defer: | 415 | defer: |
404 | put_ldev(mdev); | 416 | put_ldev(mdev); |
405 | return -EAGAIN; | 417 | return -EAGAIN; |
406 | } | 418 | } |
407 | 419 | ||
408 | int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 420 | int w_resync_timer(struct drbd_work *w, int cancel) |
409 | { | 421 | { |
422 | struct drbd_conf *mdev = w->mdev; | ||
410 | switch (mdev->state.conn) { | 423 | switch (mdev->state.conn) { |
411 | case C_VERIFY_S: | 424 | case C_VERIFY_S: |
412 | w_make_ov_request(mdev, w, cancel); | 425 | w_make_ov_request(w, cancel); |
413 | break; | 426 | break; |
414 | case C_SYNC_TARGET: | 427 | case C_SYNC_TARGET: |
415 | w_make_resync_request(mdev, w, cancel); | 428 | w_make_resync_request(w, cancel); |
416 | break; | 429 | break; |
417 | } | 430 | } |
418 | 431 | ||
419 | return 1; | 432 | return 0; |
420 | } | 433 | } |
421 | 434 | ||
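w_resync_timer above also shows the new callback signature used throughout this patch: the explicit struct drbd_conf argument is dropped, the device is reached through a back pointer kept in the work item, and objects that embed a work item (such as drbd_peer_request) are recovered with container_of(). A small user-space sketch of that pattern; "device", "work", "peer_request" and send_csum() are simplified stand-ins, not the real DRBD types:

/* User-space sketch of the embedded-work pattern; all names here are
 * simplified stand-ins for the real DRBD structures. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct device;                              /* stands in for drbd_conf        */

struct work {
    int (*cb)(struct work *w, int cancel);  /* new-style: no device argument  */
    struct device *mdev;                    /* back pointer to the device     */
};

struct peer_request {                       /* stands in for drbd_peer_request */
    unsigned long sector;
    struct work w;                          /* embedded work item             */
};

static int send_csum(struct work *w, int cancel)
{
    struct peer_request *req = container_of(w, struct peer_request, w);
    struct device *mdev = w->mdev;          /* device recovered from the work */

    (void)mdev;
    if (cancel)
        return 0;
    printf("sending csum for sector %lu\n", req->sector);
    return 0;                               /* 0 = success, -errno = failure  */
}

int main(void)
{
    struct peer_request req = {
        .sector = 8,
        .w = { .cb = send_csum, .mdev = NULL },
    };

    return req.w.cb(&req.w, 0);
}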
422 | void resync_timer_fn(unsigned long data) | 435 | void resync_timer_fn(unsigned long data) |
@@ -424,7 +437,7 @@ void resync_timer_fn(unsigned long data) | |||
424 | struct drbd_conf *mdev = (struct drbd_conf *) data; | 437 | struct drbd_conf *mdev = (struct drbd_conf *) data; |
425 | 438 | ||
426 | if (list_empty(&mdev->resync_work.list)) | 439 | if (list_empty(&mdev->resync_work.list)) |
427 | drbd_queue_work(&mdev->data.work, &mdev->resync_work); | 440 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work); |
428 | } | 441 | } |
429 | 442 | ||
430 | static void fifo_set(struct fifo_buffer *fb, int value) | 443 | static void fifo_set(struct fifo_buffer *fb, int value) |
@@ -456,8 +469,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value) | |||
456 | fb->values[i] += value; | 469 | fb->values[i] += value; |
457 | } | 470 | } |
458 | 471 | ||
472 | struct fifo_buffer *fifo_alloc(int fifo_size) | ||
473 | { | ||
474 | struct fifo_buffer *fb; | ||
475 | |||
476 | fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); | ||
477 | if (!fb) | ||
478 | return NULL; | ||
479 | |||
480 | fb->head_index = 0; | ||
481 | fb->size = fifo_size; | ||
482 | fb->total = 0; | ||
483 | |||
484 | return fb; | ||
485 | } | ||
486 | |||
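fifo_alloc() above obtains the header and the value array with a single kzalloc(); that only works because the values array sits at the end of struct fifo_buffer as a flexible array member. A user-space sketch of the same layout follows; the field set (head_index, size, total, values[]) is inferred from what this diff touches, so treat it as an approximation of the real struct:

/* Illustrative flexible-array FIFO, modelled on fifo_alloc()/fifo_push()
 * above; the layout is an approximation, not the real struct fifo_buffer. */
#include <stdio.h>
#include <stdlib.h>

struct fifo_buffer {
    unsigned int head_index;
    unsigned int size;
    int total;
    int values[];             /* flexible array, allocated with the header */
};

static struct fifo_buffer *fifo_alloc(int fifo_size)
{
    struct fifo_buffer *fb;

    fb = calloc(1, sizeof(*fb) + sizeof(int) * fifo_size);
    if (!fb)
        return NULL;
    fb->size = fifo_size;     /* head_index and total are already zero */
    return fb;
}

/* store a new value at the head, return the value that was there */
static int fifo_push(struct fifo_buffer *fb, int value)
{
    int ov = fb->values[fb->head_index];

    fb->values[fb->head_index] = value;
    fb->head_index = (fb->head_index + 1) % fb->size;
    return ov;
}

int main(void)
{
    struct fifo_buffer *plan = fifo_alloc(3);
    unsigned int i;

    if (!plan)
        return 1;
    for (i = 0; i < plan->size; i++)   /* like fifo_add_val() above */
        plan->values[i] += 10;
    printf("popped correction: %d\n", fifo_push(plan, 0));
    free(plan);
    return 0;
}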
459 | static int drbd_rs_controller(struct drbd_conf *mdev) | 487 | static int drbd_rs_controller(struct drbd_conf *mdev) |
460 | { | 488 | { |
489 | struct disk_conf *dc; | ||
461 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ | 490 | unsigned int sect_in; /* Number of sectors that came in since the last turn */ |
462 | unsigned int want; /* The number of sectors we want in the proxy */ | 491 | unsigned int want; /* The number of sectors we want in the proxy */ |
463 | int req_sect; /* Number of sectors to request in this turn */ | 492 | int req_sect; /* Number of sectors to request in this turn */ |
@@ -466,38 +495,39 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
466 | int steps; /* Number of time steps to plan ahead */ | 495 | int steps; /* Number of time steps to plan ahead */ |
467 | int curr_corr; | 496 | int curr_corr; |
468 | int max_sect; | 497 | int max_sect; |
498 | struct fifo_buffer *plan; | ||
469 | 499 | ||
470 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ | 500 | sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ |
471 | mdev->rs_in_flight -= sect_in; | 501 | mdev->rs_in_flight -= sect_in; |
472 | 502 | ||
473 | spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ | 503 | dc = rcu_dereference(mdev->ldev->disk_conf); |
504 | plan = rcu_dereference(mdev->rs_plan_s); | ||
474 | 505 | ||
475 | steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ | 506 | steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ |
476 | 507 | ||
477 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ | 508 | if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ |
478 | want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; | 509 | want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; |
479 | } else { /* normal path */ | 510 | } else { /* normal path */ |
480 | want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target : | 511 | want = dc->c_fill_target ? dc->c_fill_target : |
481 | sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); | 512 | sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); |
482 | } | 513 | } |
483 | 514 | ||
484 | correction = want - mdev->rs_in_flight - mdev->rs_planed; | 515 | correction = want - mdev->rs_in_flight - plan->total; |
485 | 516 | ||
486 | /* Plan ahead */ | 517 | /* Plan ahead */ |
487 | cps = correction / steps; | 518 | cps = correction / steps; |
488 | fifo_add_val(&mdev->rs_plan_s, cps); | 519 | fifo_add_val(plan, cps); |
489 | mdev->rs_planed += cps * steps; | 520 | plan->total += cps * steps; |
490 | 521 | ||
491 | /* What we do in this step */ | 522 | /* What we do in this step */ |
492 | curr_corr = fifo_push(&mdev->rs_plan_s, 0); | 523 | curr_corr = fifo_push(plan, 0); |
493 | spin_unlock(&mdev->peer_seq_lock); | 524 | plan->total -= curr_corr; |
494 | mdev->rs_planed -= curr_corr; | ||
495 | 525 | ||
496 | req_sect = sect_in + curr_corr; | 526 | req_sect = sect_in + curr_corr; |
497 | if (req_sect < 0) | 527 | if (req_sect < 0) |
498 | req_sect = 0; | 528 | req_sect = 0; |
499 | 529 | ||
500 | max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; | 530 | max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; |
501 | if (req_sect > max_sect) | 531 | if (req_sect > max_sect) |
502 | req_sect = max_sect; | 532 | req_sect = max_sect; |
503 | 533 | ||
@@ -513,22 +543,25 @@ static int drbd_rs_controller(struct drbd_conf *mdev) | |||
513 | static int drbd_rs_number_requests(struct drbd_conf *mdev) | 543 | static int drbd_rs_number_requests(struct drbd_conf *mdev) |
514 | { | 544 | { |
515 | int number; | 545 | int number; |
516 | if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ | 546 | |
547 | rcu_read_lock(); | ||
548 | if (rcu_dereference(mdev->rs_plan_s)->size) { | ||
517 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); | 549 | number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); |
518 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; | 550 | mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; |
519 | } else { | 551 | } else { |
520 | mdev->c_sync_rate = mdev->sync_conf.rate; | 552 | mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate; |
521 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); | 553 | number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); |
522 | } | 554 | } |
555 | rcu_read_unlock(); | ||
523 | 556 | ||
524 | /* ignore the amount of pending requests, the resync controller should | 557 | /* ignore the amount of pending requests, the resync controller should |
525 | * throttle down to incoming reply rate soon enough anyways. */ | 558 | * throttle down to incoming reply rate soon enough anyways. */ |
526 | return number; | 559 | return number; |
527 | } | 560 | } |
528 | 561 | ||
529 | static int w_make_resync_request(struct drbd_conf *mdev, | 562 | int w_make_resync_request(struct drbd_work *w, int cancel) |
530 | struct drbd_work *w, int cancel) | ||
531 | { | 563 | { |
564 | struct drbd_conf *mdev = w->mdev; | ||
532 | unsigned long bit; | 565 | unsigned long bit; |
533 | sector_t sector; | 566 | sector_t sector; |
534 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 567 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
@@ -538,12 +571,12 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
538 | int i = 0; | 571 | int i = 0; |
539 | 572 | ||
540 | if (unlikely(cancel)) | 573 | if (unlikely(cancel)) |
541 | return 1; | 574 | return 0; |
542 | 575 | ||
543 | if (mdev->rs_total == 0) { | 576 | if (mdev->rs_total == 0) { |
544 | /* empty resync? */ | 577 | /* empty resync? */ |
545 | drbd_resync_finished(mdev); | 578 | drbd_resync_finished(mdev); |
546 | return 1; | 579 | return 0; |
547 | } | 580 | } |
548 | 581 | ||
549 | if (!get_ldev(mdev)) { | 582 | if (!get_ldev(mdev)) { |
@@ -552,7 +585,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
552 | to continue resync with a broken disk makes no sense at | 585 | to continue resync with a broken disk makes no sense at |
553 | all */ | 586 | all */ |
554 | dev_err(DEV, "Disk broke down during resync!\n"); | 587 | dev_err(DEV, "Disk broke down during resync!\n"); |
555 | return 1; | 588 | return 0; |
556 | } | 589 | } |
557 | 590 | ||
558 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; | 591 | max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; |
@@ -562,15 +595,15 @@ static int w_make_resync_request(struct drbd_conf *mdev, | |||
562 | 595 | ||
563 | for (i = 0; i < number; i++) { | 596 | for (i = 0; i < number; i++) { |
564 | /* Stop generating RS requests, when half of the send buffer is filled */ | 597 | /* Stop generating RS requests, when half of the send buffer is filled */ |
565 | mutex_lock(&mdev->data.mutex); | 598 | mutex_lock(&mdev->tconn->data.mutex); |
566 | if (mdev->data.socket) { | 599 | if (mdev->tconn->data.socket) { |
567 | queued = mdev->data.socket->sk->sk_wmem_queued; | 600 | queued = mdev->tconn->data.socket->sk->sk_wmem_queued; |
568 | sndbuf = mdev->data.socket->sk->sk_sndbuf; | 601 | sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf; |
569 | } else { | 602 | } else { |
570 | queued = 1; | 603 | queued = 1; |
571 | sndbuf = 0; | 604 | sndbuf = 0; |
572 | } | 605 | } |
573 | mutex_unlock(&mdev->data.mutex); | 606 | mutex_unlock(&mdev->tconn->data.mutex); |
574 | if (queued > sndbuf / 2) | 607 | if (queued > sndbuf / 2) |
575 | goto requeue; | 608 | goto requeue; |
576 | 609 | ||
@@ -581,7 +614,7 @@ next_sector: | |||
581 | if (bit == DRBD_END_OF_BITMAP) { | 614 | if (bit == DRBD_END_OF_BITMAP) { |
582 | mdev->bm_resync_fo = drbd_bm_bits(mdev); | 615 | mdev->bm_resync_fo = drbd_bm_bits(mdev); |
583 | put_ldev(mdev); | 616 | put_ldev(mdev); |
584 | return 1; | 617 | return 0; |
585 | } | 618 | } |
586 | 619 | ||
587 | sector = BM_BIT_TO_SECT(bit); | 620 | sector = BM_BIT_TO_SECT(bit); |
@@ -640,11 +673,11 @@ next_sector: | |||
640 | /* adjust very last sectors, in case we are oddly sized */ | 673 | /* adjust very last sectors, in case we are oddly sized */ |
641 | if (sector + (size>>9) > capacity) | 674 | if (sector + (size>>9) > capacity) |
642 | size = (capacity-sector)<<9; | 675 | size = (capacity-sector)<<9; |
643 | if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { | 676 | if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) { |
644 | switch (read_for_csum(mdev, sector, size)) { | 677 | switch (read_for_csum(mdev, sector, size)) { |
645 | case -EIO: /* Disk failure */ | 678 | case -EIO: /* Disk failure */ |
646 | put_ldev(mdev); | 679 | put_ldev(mdev); |
647 | return 0; | 680 | return -EIO; |
648 | case -EAGAIN: /* allocation failed, or ldev busy */ | 681 | case -EAGAIN: /* allocation failed, or ldev busy */ |
649 | drbd_rs_complete_io(mdev, sector); | 682 | drbd_rs_complete_io(mdev, sector); |
650 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); | 683 | mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); |
@@ -657,13 +690,16 @@ next_sector: | |||
657 | BUG(); | 690 | BUG(); |
658 | } | 691 | } |
659 | } else { | 692 | } else { |
693 | int err; | ||
694 | |||
660 | inc_rs_pending(mdev); | 695 | inc_rs_pending(mdev); |
661 | if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, | 696 | err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST, |
662 | sector, size, ID_SYNCER)) { | 697 | sector, size, ID_SYNCER); |
698 | if (err) { | ||
663 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); | 699 | dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); |
664 | dec_rs_pending(mdev); | 700 | dec_rs_pending(mdev); |
665 | put_ldev(mdev); | 701 | put_ldev(mdev); |
666 | return 0; | 702 | return err; |
667 | } | 703 | } |
668 | } | 704 | } |
669 | } | 705 | } |
@@ -676,21 +712,23 @@ next_sector: | |||
676 | * until then resync "work" is "inactive" ... | 712 | * until then resync "work" is "inactive" ... |
677 | */ | 713 | */ |
678 | put_ldev(mdev); | 714 | put_ldev(mdev); |
679 | return 1; | 715 | return 0; |
680 | } | 716 | } |
681 | 717 | ||
682 | requeue: | 718 | requeue: |
683 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); | 719 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); |
684 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | 720 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); |
685 | put_ldev(mdev); | 721 | put_ldev(mdev); |
686 | return 1; | 722 | return 0; |
687 | } | 723 | } |
688 | 724 | ||
689 | static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 725 | static int w_make_ov_request(struct drbd_work *w, int cancel) |
690 | { | 726 | { |
727 | struct drbd_conf *mdev = w->mdev; | ||
691 | int number, i, size; | 728 | int number, i, size; |
692 | sector_t sector; | 729 | sector_t sector; |
693 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); | 730 | const sector_t capacity = drbd_get_capacity(mdev->this_bdev); |
731 | bool stop_sector_reached = false; | ||
694 | 732 | ||
695 | if (unlikely(cancel)) | 733 | if (unlikely(cancel)) |
696 | return 1; | 734 | return 1; |
@@ -699,9 +737,17 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
699 | 737 | ||
700 | sector = mdev->ov_position; | 738 | sector = mdev->ov_position; |
701 | for (i = 0; i < number; i++) { | 739 | for (i = 0; i < number; i++) { |
702 | if (sector >= capacity) { | 740 | if (sector >= capacity) |
703 | return 1; | 741 | return 1; |
704 | } | 742 | |
743 | /* We check for "finished" only in the reply path: | ||
744 | * w_e_end_ov_reply(). | ||
745 | * We need to send at least one request out. */ | ||
746 | stop_sector_reached = i > 0 | ||
747 | && verify_can_do_stop_sector(mdev) | ||
748 | && sector >= mdev->ov_stop_sector; | ||
749 | if (stop_sector_reached) | ||
750 | break; | ||
705 | 751 | ||
706 | size = BM_BLOCK_SIZE; | 752 | size = BM_BLOCK_SIZE; |
707 | 753 | ||
@@ -715,7 +761,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
715 | size = (capacity-sector)<<9; | 761 | size = (capacity-sector)<<9; |
716 | 762 | ||
717 | inc_rs_pending(mdev); | 763 | inc_rs_pending(mdev); |
718 | if (!drbd_send_ov_request(mdev, sector, size)) { | 764 | if (drbd_send_ov_request(mdev, sector, size)) { |
719 | dec_rs_pending(mdev); | 765 | dec_rs_pending(mdev); |
720 | return 0; | 766 | return 0; |
721 | } | 767 | } |
@@ -725,56 +771,39 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca | |||
725 | 771 | ||
726 | requeue: | 772 | requeue: |
727 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); | 773 | mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); |
728 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); | 774 | if (i == 0 || !stop_sector_reached) |
729 | return 1; | 775 | mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); |
730 | } | ||
731 | |||
732 | |||
733 | void start_resync_timer_fn(unsigned long data) | ||
734 | { | ||
735 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
736 | |||
737 | drbd_queue_work(&mdev->data.work, &mdev->start_resync_work); | ||
738 | } | ||
739 | |||
740 | int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
741 | { | ||
742 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
743 | dev_warn(DEV, "w_start_resync later...\n"); | ||
744 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
745 | add_timer(&mdev->start_resync_timer); | ||
746 | return 1; | ||
747 | } | ||
748 | |||
749 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
750 | clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); | ||
751 | return 1; | 776 | return 1; |
752 | } | 777 | } |
753 | 778 | ||
754 | int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 779 | int w_ov_finished(struct drbd_work *w, int cancel) |
755 | { | 780 | { |
781 | struct drbd_conf *mdev = w->mdev; | ||
756 | kfree(w); | 782 | kfree(w); |
757 | ov_oos_print(mdev); | 783 | ov_out_of_sync_print(mdev); |
758 | drbd_resync_finished(mdev); | 784 | drbd_resync_finished(mdev); |
759 | 785 | ||
760 | return 1; | 786 | return 0; |
761 | } | 787 | } |
762 | 788 | ||
763 | static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 789 | static int w_resync_finished(struct drbd_work *w, int cancel) |
764 | { | 790 | { |
791 | struct drbd_conf *mdev = w->mdev; | ||
765 | kfree(w); | 792 | kfree(w); |
766 | 793 | ||
767 | drbd_resync_finished(mdev); | 794 | drbd_resync_finished(mdev); |
768 | 795 | ||
769 | return 1; | 796 | return 0; |
770 | } | 797 | } |
771 | 798 | ||
772 | static void ping_peer(struct drbd_conf *mdev) | 799 | static void ping_peer(struct drbd_conf *mdev) |
773 | { | 800 | { |
774 | clear_bit(GOT_PING_ACK, &mdev->flags); | 801 | struct drbd_tconn *tconn = mdev->tconn; |
775 | request_ping(mdev); | 802 | |
776 | wait_event(mdev->misc_wait, | 803 | clear_bit(GOT_PING_ACK, &tconn->flags); |
777 | test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); | 804 | request_ping(tconn); |
805 | wait_event(tconn->ping_wait, | ||
806 | test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED); | ||
778 | } | 807 | } |
779 | 808 | ||
780 | int drbd_resync_finished(struct drbd_conf *mdev) | 809 | int drbd_resync_finished(struct drbd_conf *mdev) |
@@ -799,7 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
799 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); | 828 | w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); |
800 | if (w) { | 829 | if (w) { |
801 | w->cb = w_resync_finished; | 830 | w->cb = w_resync_finished; |
802 | drbd_queue_work(&mdev->data.work, w); | 831 | w->mdev = mdev; |
832 | drbd_queue_work(&mdev->tconn->sender_work, w); | ||
803 | return 1; | 833 | return 1; |
804 | } | 834 | } |
805 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); | 835 | dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); |
@@ -808,7 +838,12 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
808 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; | 838 | dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; |
809 | if (dt <= 0) | 839 | if (dt <= 0) |
810 | dt = 1; | 840 | dt = 1; |
841 | |||
811 | db = mdev->rs_total; | 842 | db = mdev->rs_total; |
843 | /* adjust for verify start and stop sectors, respective reached position */ | ||
844 | if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) | ||
845 | db -= mdev->ov_left; | ||
846 | |||
812 | dbdt = Bit2KB(db/dt); | 847 | dbdt = Bit2KB(db/dt); |
813 | mdev->rs_paused /= HZ; | 848 | mdev->rs_paused /= HZ; |
814 | 849 | ||
@@ -817,8 +852,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
817 | 852 | ||
818 | ping_peer(mdev); | 853 | ping_peer(mdev); |
819 | 854 | ||
820 | spin_lock_irq(&mdev->req_lock); | 855 | spin_lock_irq(&mdev->tconn->req_lock); |
821 | os = mdev->state; | 856 | os = drbd_read_state(mdev); |
822 | 857 | ||
823 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); | 858 | verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); |
824 | 859 | ||
@@ -831,7 +866,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
831 | ns.conn = C_CONNECTED; | 866 | ns.conn = C_CONNECTED; |
832 | 867 | ||
833 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", | 868 | dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", |
834 | verify_done ? "Online verify " : "Resync", | 869 | verify_done ? "Online verify" : "Resync", |
835 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); | 870 | dt + mdev->rs_paused, mdev->rs_paused, dbdt); |
836 | 871 | ||
837 | n_oos = drbd_bm_total_weight(mdev); | 872 | n_oos = drbd_bm_total_weight(mdev); |
@@ -848,7 +883,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
848 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) | 883 | if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) |
849 | khelper_cmd = "after-resync-target"; | 884 | khelper_cmd = "after-resync-target"; |
850 | 885 | ||
851 | if (mdev->csums_tfm && mdev->rs_total) { | 886 | if (mdev->tconn->csums_tfm && mdev->rs_total) { |
852 | const unsigned long s = mdev->rs_same_csum; | 887 | const unsigned long s = mdev->rs_same_csum; |
853 | const unsigned long t = mdev->rs_total; | 888 | const unsigned long t = mdev->rs_total; |
854 | const int ratio = | 889 | const int ratio = |
@@ -906,13 +941,15 @@ int drbd_resync_finished(struct drbd_conf *mdev) | |||
906 | 941 | ||
907 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 942 | _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
908 | out_unlock: | 943 | out_unlock: |
909 | spin_unlock_irq(&mdev->req_lock); | 944 | spin_unlock_irq(&mdev->tconn->req_lock); |
910 | put_ldev(mdev); | 945 | put_ldev(mdev); |
911 | out: | 946 | out: |
912 | mdev->rs_total = 0; | 947 | mdev->rs_total = 0; |
913 | mdev->rs_failed = 0; | 948 | mdev->rs_failed = 0; |
914 | mdev->rs_paused = 0; | 949 | mdev->rs_paused = 0; |
915 | if (verify_done) | 950 | |
951 | /* reset start sector, if we reached end of device */ | ||
952 | if (verify_done && mdev->ov_left == 0) | ||
916 | mdev->ov_start_sector = 0; | 953 | mdev->ov_start_sector = 0; |
917 | 954 | ||
918 | drbd_md_sync(mdev); | 955 | drbd_md_sync(mdev); |
@@ -924,19 +961,19 @@ out: | |||
924 | } | 961 | } |
925 | 962 | ||
926 | /* helper */ | 963 | /* helper */ |
927 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) | 964 | static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) |
928 | { | 965 | { |
929 | if (drbd_ee_has_active_page(e)) { | 966 | if (drbd_peer_req_has_active_page(peer_req)) { |
930 | /* This might happen if sendpage() has not finished */ | 967 | /* This might happen if sendpage() has not finished */ |
931 | int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; | 968 | int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; |
932 | atomic_add(i, &mdev->pp_in_use_by_net); | 969 | atomic_add(i, &mdev->pp_in_use_by_net); |
933 | atomic_sub(i, &mdev->pp_in_use); | 970 | atomic_sub(i, &mdev->pp_in_use); |
934 | spin_lock_irq(&mdev->req_lock); | 971 | spin_lock_irq(&mdev->tconn->req_lock); |
935 | list_add_tail(&e->w.list, &mdev->net_ee); | 972 | list_add_tail(&peer_req->w.list, &mdev->net_ee); |
936 | spin_unlock_irq(&mdev->req_lock); | 973 | spin_unlock_irq(&mdev->tconn->req_lock); |
937 | wake_up(&drbd_pp_wait); | 974 | wake_up(&drbd_pp_wait); |
938 | } else | 975 | } else |
939 | drbd_free_ee(mdev, e); | 976 | drbd_free_peer_req(mdev, peer_req); |
940 | } | 977 | } |
941 | 978 | ||
942 | /** | 979 | /** |
@@ -945,174 +982,177 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent | |||
945 | * @w: work object. | 982 | * @w: work object. |
946 | * @cancel: The connection will be closed anyways | 983 | * @cancel: The connection will be closed anyways |
947 | */ | 984 | */ |
948 | int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 985 | int w_e_end_data_req(struct drbd_work *w, int cancel) |
949 | { | 986 | { |
950 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 987 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
951 | int ok; | 988 | struct drbd_conf *mdev = w->mdev; |
989 | int err; | ||
952 | 990 | ||
953 | if (unlikely(cancel)) { | 991 | if (unlikely(cancel)) { |
954 | drbd_free_ee(mdev, e); | 992 | drbd_free_peer_req(mdev, peer_req); |
955 | dec_unacked(mdev); | 993 | dec_unacked(mdev); |
956 | return 1; | 994 | return 0; |
957 | } | 995 | } |
958 | 996 | ||
959 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 997 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
960 | ok = drbd_send_block(mdev, P_DATA_REPLY, e); | 998 | err = drbd_send_block(mdev, P_DATA_REPLY, peer_req); |
961 | } else { | 999 | } else { |
962 | if (__ratelimit(&drbd_ratelimit_state)) | 1000 | if (__ratelimit(&drbd_ratelimit_state)) |
963 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", | 1001 | dev_err(DEV, "Sending NegDReply. sector=%llus.\n", |
964 | (unsigned long long)e->sector); | 1002 | (unsigned long long)peer_req->i.sector); |
965 | 1003 | ||
966 | ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); | 1004 | err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req); |
967 | } | 1005 | } |
968 | 1006 | ||
969 | dec_unacked(mdev); | 1007 | dec_unacked(mdev); |
970 | 1008 | ||
971 | move_to_net_ee_or_free(mdev, e); | 1009 | move_to_net_ee_or_free(mdev, peer_req); |
972 | 1010 | ||
973 | if (unlikely(!ok)) | 1011 | if (unlikely(err)) |
974 | dev_err(DEV, "drbd_send_block() failed\n"); | 1012 | dev_err(DEV, "drbd_send_block() failed\n"); |
975 | return ok; | 1013 | return err; |
976 | } | 1014 | } |
977 | 1015 | ||
978 | /** | 1016 | /** |
979 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS | 1017 | * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST |
980 | * @mdev: DRBD device. | 1018 | * @mdev: DRBD device. |
981 | * @w: work object. | 1019 | * @w: work object. |
982 | * @cancel: The connection will be closed anyways | 1020 | * @cancel: The connection will be closed anyways |
983 | */ | 1021 | */ |
984 | int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1022 | int w_e_end_rsdata_req(struct drbd_work *w, int cancel) |
985 | { | 1023 | { |
986 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1024 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
987 | int ok; | 1025 | struct drbd_conf *mdev = w->mdev; |
1026 | int err; | ||
988 | 1027 | ||
989 | if (unlikely(cancel)) { | 1028 | if (unlikely(cancel)) { |
990 | drbd_free_ee(mdev, e); | 1029 | drbd_free_peer_req(mdev, peer_req); |
991 | dec_unacked(mdev); | 1030 | dec_unacked(mdev); |
992 | return 1; | 1031 | return 0; |
993 | } | 1032 | } |
994 | 1033 | ||
995 | if (get_ldev_if_state(mdev, D_FAILED)) { | 1034 | if (get_ldev_if_state(mdev, D_FAILED)) { |
996 | drbd_rs_complete_io(mdev, e->sector); | 1035 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
997 | put_ldev(mdev); | 1036 | put_ldev(mdev); |
998 | } | 1037 | } |
999 | 1038 | ||
1000 | if (mdev->state.conn == C_AHEAD) { | 1039 | if (mdev->state.conn == C_AHEAD) { |
1001 | ok = drbd_send_ack(mdev, P_RS_CANCEL, e); | 1040 | err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req); |
1002 | } else if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1041 | } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1003 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { | 1042 | if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { |
1004 | inc_rs_pending(mdev); | 1043 | inc_rs_pending(mdev); |
1005 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1044 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1006 | } else { | 1045 | } else { |
1007 | if (__ratelimit(&drbd_ratelimit_state)) | 1046 | if (__ratelimit(&drbd_ratelimit_state)) |
1008 | dev_err(DEV, "Not sending RSDataReply, " | 1047 | dev_err(DEV, "Not sending RSDataReply, " |
1009 | "partner DISKLESS!\n"); | 1048 | "partner DISKLESS!\n"); |
1010 | ok = 1; | 1049 | err = 0; |
1011 | } | 1050 | } |
1012 | } else { | 1051 | } else { |
1013 | if (__ratelimit(&drbd_ratelimit_state)) | 1052 | if (__ratelimit(&drbd_ratelimit_state)) |
1014 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", | 1053 | dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", |
1015 | (unsigned long long)e->sector); | 1054 | (unsigned long long)peer_req->i.sector); |
1016 | 1055 | ||
1017 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1056 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1018 | 1057 | ||
1019 | /* update resync data with failure */ | 1058 | /* update resync data with failure */ |
1020 | drbd_rs_failed_io(mdev, e->sector, e->size); | 1059 | drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size); |
1021 | } | 1060 | } |
1022 | 1061 | ||
1023 | dec_unacked(mdev); | 1062 | dec_unacked(mdev); |
1024 | 1063 | ||
1025 | move_to_net_ee_or_free(mdev, e); | 1064 | move_to_net_ee_or_free(mdev, peer_req); |
1026 | 1065 | ||
1027 | if (unlikely(!ok)) | 1066 | if (unlikely(err)) |
1028 | dev_err(DEV, "drbd_send_block() failed\n"); | 1067 | dev_err(DEV, "drbd_send_block() failed\n"); |
1029 | return ok; | 1068 | return err; |
1030 | } | 1069 | } |
1031 | 1070 | ||
1032 | int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1071 | int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) |
1033 | { | 1072 | { |
1034 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1073 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1074 | struct drbd_conf *mdev = w->mdev; | ||
1035 | struct digest_info *di; | 1075 | struct digest_info *di; |
1036 | int digest_size; | 1076 | int digest_size; |
1037 | void *digest = NULL; | 1077 | void *digest = NULL; |
1038 | int ok, eq = 0; | 1078 | int err, eq = 0; |
1039 | 1079 | ||
1040 | if (unlikely(cancel)) { | 1080 | if (unlikely(cancel)) { |
1041 | drbd_free_ee(mdev, e); | 1081 | drbd_free_peer_req(mdev, peer_req); |
1042 | dec_unacked(mdev); | 1082 | dec_unacked(mdev); |
1043 | return 1; | 1083 | return 0; |
1044 | } | 1084 | } |
1045 | 1085 | ||
1046 | if (get_ldev(mdev)) { | 1086 | if (get_ldev(mdev)) { |
1047 | drbd_rs_complete_io(mdev, e->sector); | 1087 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1048 | put_ldev(mdev); | 1088 | put_ldev(mdev); |
1049 | } | 1089 | } |
1050 | 1090 | ||
1051 | di = e->digest; | 1091 | di = peer_req->digest; |
1052 | 1092 | ||
1053 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1093 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1054 | /* quick hack to try to avoid a race against reconfiguration. | 1094 | /* quick hack to try to avoid a race against reconfiguration. |
1055 | * a real fix would be much more involved, | 1095 | * a real fix would be much more involved, |
1056 | * introducing more locking mechanisms */ | 1096 | * introducing more locking mechanisms */ |
1057 | if (mdev->csums_tfm) { | 1097 | if (mdev->tconn->csums_tfm) { |
1058 | digest_size = crypto_hash_digestsize(mdev->csums_tfm); | 1098 | digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); |
1059 | D_ASSERT(digest_size == di->digest_size); | 1099 | D_ASSERT(digest_size == di->digest_size); |
1060 | digest = kmalloc(digest_size, GFP_NOIO); | 1100 | digest = kmalloc(digest_size, GFP_NOIO); |
1061 | } | 1101 | } |
1062 | if (digest) { | 1102 | if (digest) { |
1063 | drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); | 1103 | drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); |
1064 | eq = !memcmp(digest, di->digest, digest_size); | 1104 | eq = !memcmp(digest, di->digest, digest_size); |
1065 | kfree(digest); | 1105 | kfree(digest); |
1066 | } | 1106 | } |
1067 | 1107 | ||
1068 | if (eq) { | 1108 | if (eq) { |
1069 | drbd_set_in_sync(mdev, e->sector, e->size); | 1109 | drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size); |
1070 | /* rs_same_csums unit is BM_BLOCK_SIZE */ | 1110 | /* rs_same_csums unit is BM_BLOCK_SIZE */ |
1071 | mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; | 1111 | mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; |
1072 | ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); | 1112 | err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req); |
1073 | } else { | 1113 | } else { |
1074 | inc_rs_pending(mdev); | 1114 | inc_rs_pending(mdev); |
1075 | e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ | 1115 | peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ |
1076 | e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ | 1116 | peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ |
1077 | kfree(di); | 1117 | kfree(di); |
1078 | ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); | 1118 | err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); |
1079 | } | 1119 | } |
1080 | } else { | 1120 | } else { |
1081 | ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); | 1121 | err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); |
1082 | if (__ratelimit(&drbd_ratelimit_state)) | 1122 | if (__ratelimit(&drbd_ratelimit_state)) |
1083 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); | 1123 | dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); |
1084 | } | 1124 | } |
1085 | 1125 | ||
1086 | dec_unacked(mdev); | 1126 | dec_unacked(mdev); |
1087 | move_to_net_ee_or_free(mdev, e); | 1127 | move_to_net_ee_or_free(mdev, peer_req); |
1088 | 1128 | ||
1089 | if (unlikely(!ok)) | 1129 | if (unlikely(err)) |
1090 | dev_err(DEV, "drbd_send_block/ack() failed\n"); | 1130 | dev_err(DEV, "drbd_send_block/ack() failed\n"); |
1091 | return ok; | 1131 | return err; |
1092 | } | 1132 | } |
1093 | 1133 | ||
1094 | /* TODO merge common code with w_e_send_csum */ | 1134 | int w_e_end_ov_req(struct drbd_work *w, int cancel) |
1095 | int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | ||
1096 | { | 1135 | { |
1097 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1136 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1098 | sector_t sector = e->sector; | 1137 | struct drbd_conf *mdev = w->mdev; |
1099 | unsigned int size = e->size; | 1138 | sector_t sector = peer_req->i.sector; |
1139 | unsigned int size = peer_req->i.size; | ||
1100 | int digest_size; | 1140 | int digest_size; |
1101 | void *digest; | 1141 | void *digest; |
1102 | int ok = 1; | 1142 | int err = 0; |
1103 | 1143 | ||
1104 | if (unlikely(cancel)) | 1144 | if (unlikely(cancel)) |
1105 | goto out; | 1145 | goto out; |
1106 | 1146 | ||
1107 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1147 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1108 | digest = kmalloc(digest_size, GFP_NOIO); | 1148 | digest = kmalloc(digest_size, GFP_NOIO); |
1109 | if (!digest) { | 1149 | if (!digest) { |
1110 | ok = 0; /* terminate the connection in case the allocation failed */ | 1150 | err = 1; /* terminate the connection in case the allocation failed */ |
1111 | goto out; | 1151 | goto out; |
1112 | } | 1152 | } |
1113 | 1153 | ||
1114 | if (likely(!(e->flags & EE_WAS_ERROR))) | 1154 | if (likely(!(peer_req->flags & EE_WAS_ERROR))) |
1115 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1155 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1116 | else | 1156 | else |
1117 | memset(digest, 0, digest_size); | 1157 | memset(digest, 0, digest_size); |
1118 | 1158 | ||
@@ -1120,25 +1160,23 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1120 | * In case we block on congestion, we could otherwise run into | 1160 | * In case we block on congestion, we could otherwise run into |
1121 | * some distributed deadlock, if the other side blocks on | 1161 | * some distributed deadlock, if the other side blocks on |
1122 | * congestion as well, because our receiver blocks in | 1162 | * congestion as well, because our receiver blocks in |
1123 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1163 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1124 | drbd_free_ee(mdev, e); | 1164 | drbd_free_peer_req(mdev, peer_req); |
1125 | e = NULL; | 1165 | peer_req = NULL; |
1126 | inc_rs_pending(mdev); | 1166 | inc_rs_pending(mdev); |
1127 | ok = drbd_send_drequest_csum(mdev, sector, size, | 1167 | err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY); |
1128 | digest, digest_size, | 1168 | if (err) |
1129 | P_OV_REPLY); | ||
1130 | if (!ok) | ||
1131 | dec_rs_pending(mdev); | 1169 | dec_rs_pending(mdev); |
1132 | kfree(digest); | 1170 | kfree(digest); |
1133 | 1171 | ||
1134 | out: | 1172 | out: |
1135 | if (e) | 1173 | if (peer_req) |
1136 | drbd_free_ee(mdev, e); | 1174 | drbd_free_peer_req(mdev, peer_req); |
1137 | dec_unacked(mdev); | 1175 | dec_unacked(mdev); |
1138 | return ok; | 1176 | return err; |
1139 | } | 1177 | } |
1140 | 1178 | ||
1141 | void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | 1179 | void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size) |
1142 | { | 1180 | { |
1143 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { | 1181 | if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { |
1144 | mdev->ov_last_oos_size += size>>9; | 1182 | mdev->ov_last_oos_size += size>>9; |
@@ -1149,36 +1187,38 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) | |||
1149 | drbd_set_out_of_sync(mdev, sector, size); | 1187 | drbd_set_out_of_sync(mdev, sector, size); |
1150 | } | 1188 | } |
1151 | 1189 | ||
1152 | int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1190 | int w_e_end_ov_reply(struct drbd_work *w, int cancel) |
1153 | { | 1191 | { |
1154 | struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); | 1192 | struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); |
1193 | struct drbd_conf *mdev = w->mdev; | ||
1155 | struct digest_info *di; | 1194 | struct digest_info *di; |
1156 | void *digest; | 1195 | void *digest; |
1157 | sector_t sector = e->sector; | 1196 | sector_t sector = peer_req->i.sector; |
1158 | unsigned int size = e->size; | 1197 | unsigned int size = peer_req->i.size; |
1159 | int digest_size; | 1198 | int digest_size; |
1160 | int ok, eq = 0; | 1199 | int err, eq = 0; |
1200 | bool stop_sector_reached = false; | ||
1161 | 1201 | ||
1162 | if (unlikely(cancel)) { | 1202 | if (unlikely(cancel)) { |
1163 | drbd_free_ee(mdev, e); | 1203 | drbd_free_peer_req(mdev, peer_req); |
1164 | dec_unacked(mdev); | 1204 | dec_unacked(mdev); |
1165 | return 1; | 1205 | return 0; |
1166 | } | 1206 | } |
1167 | 1207 | ||
1168 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all | 1208 | /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all |
1169 | * the resync lru has been cleaned up already */ | 1209 | * the resync lru has been cleaned up already */ |
1170 | if (get_ldev(mdev)) { | 1210 | if (get_ldev(mdev)) { |
1171 | drbd_rs_complete_io(mdev, e->sector); | 1211 | drbd_rs_complete_io(mdev, peer_req->i.sector); |
1172 | put_ldev(mdev); | 1212 | put_ldev(mdev); |
1173 | } | 1213 | } |
1174 | 1214 | ||
1175 | di = e->digest; | 1215 | di = peer_req->digest; |
1176 | 1216 | ||
1177 | if (likely((e->flags & EE_WAS_ERROR) == 0)) { | 1217 | if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { |
1178 | digest_size = crypto_hash_digestsize(mdev->verify_tfm); | 1218 | digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); |
1179 | digest = kmalloc(digest_size, GFP_NOIO); | 1219 | digest = kmalloc(digest_size, GFP_NOIO); |
1180 | if (digest) { | 1220 | if (digest) { |
1181 | drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); | 1221 | drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); |
1182 | 1222 | ||
1183 | D_ASSERT(digest_size == di->digest_size); | 1223 | D_ASSERT(digest_size == di->digest_size); |
1184 | eq = !memcmp(digest, di->digest, digest_size); | 1224 | eq = !memcmp(digest, di->digest, digest_size); |
@@ -1186,19 +1226,19 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1186 | } | 1226 | } |
1187 | } | 1227 | } |
1188 | 1228 | ||
1189 | /* Free e and pages before send. | 1229 | /* Free peer_req and pages before send. |
1190 | * In case we block on congestion, we could otherwise run into | 1230 | * In case we block on congestion, we could otherwise run into |
1191 | * some distributed deadlock, if the other side blocks on | 1231 | * some distributed deadlock, if the other side blocks on |
1192 | * congestion as well, because our receiver blocks in | 1232 | * congestion as well, because our receiver blocks in |
1193 | * drbd_pp_alloc due to pp_in_use > max_buffers. */ | 1233 | * drbd_alloc_pages due to pp_in_use > max_buffers. */ |
1194 | drbd_free_ee(mdev, e); | 1234 | drbd_free_peer_req(mdev, peer_req); |
1195 | if (!eq) | 1235 | if (!eq) |
1196 | drbd_ov_oos_found(mdev, sector, size); | 1236 | drbd_ov_out_of_sync_found(mdev, sector, size); |
1197 | else | 1237 | else |
1198 | ov_oos_print(mdev); | 1238 | ov_out_of_sync_print(mdev); |
1199 | 1239 | ||
1200 | ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, | 1240 | err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, |
1201 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); | 1241 | eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); |
1202 | 1242 | ||
1203 | dec_unacked(mdev); | 1243 | dec_unacked(mdev); |
1204 | 1244 | ||
@@ -1208,73 +1248,102 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1208 | if ((mdev->ov_left & 0x200) == 0x200) | 1248 | if ((mdev->ov_left & 0x200) == 0x200) |
1209 | drbd_advance_rs_marks(mdev, mdev->ov_left); | 1249 | drbd_advance_rs_marks(mdev, mdev->ov_left); |
1210 | 1250 | ||
1211 | if (mdev->ov_left == 0) { | 1251 | stop_sector_reached = verify_can_do_stop_sector(mdev) && |
1212 | ov_oos_print(mdev); | 1252 | (sector + (size>>9)) >= mdev->ov_stop_sector; |
1253 | |||
1254 | if (mdev->ov_left == 0 || stop_sector_reached) { | ||
1255 | ov_out_of_sync_print(mdev); | ||
1213 | drbd_resync_finished(mdev); | 1256 | drbd_resync_finished(mdev); |
1214 | } | 1257 | } |
1215 | 1258 | ||
1216 | return ok; | 1259 | return err; |
1217 | } | 1260 | } |
1218 | 1261 | ||
1219 | int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1262 | int w_prev_work_done(struct drbd_work *w, int cancel) |
1220 | { | 1263 | { |
1221 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); | 1264 | struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); |
1265 | |||
1222 | complete(&b->done); | 1266 | complete(&b->done); |
1223 | return 1; | 1267 | return 0; |
1224 | } | 1268 | } |
1225 | 1269 | ||
1226 | int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1270 | /* FIXME |
1271 | * We need to track the number of pending barrier acks, | ||
1272 | * and to be able to wait for them. | ||
1273 | * See also comment in drbd_adm_attach before drbd_suspend_io. | ||
1274 | */ | ||
1275 | int drbd_send_barrier(struct drbd_tconn *tconn) | ||
1227 | { | 1276 | { |
1228 | struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); | 1277 | struct p_barrier *p; |
1229 | struct p_barrier *p = &mdev->data.sbuf.barrier; | 1278 | struct drbd_socket *sock; |
1230 | int ok = 1; | ||
1231 | |||
1232 | /* really avoid racing with tl_clear. w.cb may have been referenced | ||
1233 | * just before it was reassigned and re-queued, so double check that. | ||
1234 | * actually, this race was harmless, since we only try to send the | ||
1235 | * barrier packet here, and otherwise do nothing with the object. | ||
1236 | * but compare with the head of w_clear_epoch */ | ||
1237 | spin_lock_irq(&mdev->req_lock); | ||
1238 | if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) | ||
1239 | cancel = 1; | ||
1240 | spin_unlock_irq(&mdev->req_lock); | ||
1241 | if (cancel) | ||
1242 | return 1; | ||
1243 | 1279 | ||
1244 | if (!drbd_get_data_sock(mdev)) | 1280 | sock = &tconn->data; |
1245 | return 0; | 1281 | p = conn_prepare_command(tconn, sock); |
1246 | p->barrier = b->br_number; | 1282 | if (!p) |
1247 | /* inc_ap_pending was done where this was queued. | 1283 | return -EIO; |
1248 | * dec_ap_pending will be done in got_BarrierAck | 1284 | p->barrier = tconn->send.current_epoch_nr; |
1249 | * or (on connection loss) in w_clear_epoch. */ | 1285 | p->pad = 0; |
1250 | ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, | 1286 | tconn->send.current_epoch_writes = 0; |
1251 | (struct p_header80 *)p, sizeof(*p), 0); | 1287 | |
1252 | drbd_put_data_sock(mdev); | 1288 | return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0); |
1253 | |||
1254 | return ok; | ||
1255 | } | 1289 | } |
1256 | 1290 | ||
1257 | int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1291 | int w_send_write_hint(struct drbd_work *w, int cancel) |
1258 | { | 1292 | { |
1293 | struct drbd_conf *mdev = w->mdev; | ||
1294 | struct drbd_socket *sock; | ||
1295 | |||
1259 | if (cancel) | 1296 | if (cancel) |
1260 | return 1; | 1297 | return 0; |
1261 | return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); | 1298 | sock = &mdev->tconn->data; |
1299 | if (!drbd_prepare_command(mdev, sock)) | ||
1300 | return -EIO; | ||
1301 | return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0); | ||
1262 | } | 1302 | } |
1263 | 1303 | ||
1264 | int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1304 | static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch) |
1305 | { | ||
1306 | if (!tconn->send.seen_any_write_yet) { | ||
1307 | tconn->send.seen_any_write_yet = true; | ||
1308 | tconn->send.current_epoch_nr = epoch; | ||
1309 | tconn->send.current_epoch_writes = 0; | ||
1310 | } | ||
1311 | } | ||
1312 | |||
1313 | static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch) | ||
1314 | { | ||
1315 | /* re-init if first write on this connection */ | ||
1316 | if (!tconn->send.seen_any_write_yet) | ||
1317 | return; | ||
1318 | if (tconn->send.current_epoch_nr != epoch) { | ||
1319 | if (tconn->send.current_epoch_writes) | ||
1320 | drbd_send_barrier(tconn); | ||
1321 | tconn->send.current_epoch_nr = epoch; | ||
1322 | } | ||
1323 | } | ||
1324 | |||
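re_init_if_first_write() and maybe_send_barrier() above keep per-connection epoch bookkeeping on the sender side: a P_BARRIER is emitted only when the epoch number changes and the epoch being closed actually saw writes, exactly as the w_send_dblock()/w_send_read_req() hunks below use them. A user-space sketch of that bookkeeping; struct conn_send and send_write() are simplified stand-ins:

/* User-space sketch of the sender-side epoch bookkeeping introduced above;
 * struct conn_send and send_write() are simplified stand-ins. */
#include <stdbool.h>
#include <stdio.h>

struct conn_send {
    bool seen_any_write_yet;
    unsigned int current_epoch_nr;
    unsigned int current_epoch_writes;
};

static void send_barrier(struct conn_send *s)
{
    printf("P_BARRIER closing epoch %u\n", s->current_epoch_nr);
    s->current_epoch_writes = 0;
}

static void maybe_send_barrier(struct conn_send *s, unsigned int epoch)
{
    if (!s->seen_any_write_yet)
        return;
    if (s->current_epoch_nr != epoch) {
        if (s->current_epoch_writes)
            send_barrier(s);            /* close the previous epoch first */
        s->current_epoch_nr = epoch;
    }
}

static void send_write(struct conn_send *s, unsigned int epoch)
{
    if (!s->seen_any_write_yet) {       /* re_init_if_first_write()       */
        s->seen_any_write_yet = true;
        s->current_epoch_nr = epoch;
        s->current_epoch_writes = 0;
    }
    maybe_send_barrier(s, epoch);
    s->current_epoch_writes++;
    printf("P_DATA in epoch %u\n", epoch);
}

int main(void)
{
    struct conn_send s = { 0 };

    send_write(&s, 1);
    send_write(&s, 1);
    send_write(&s, 2);   /* epoch changed: barrier for epoch 1, then data */
    return 0;
}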
1325 | int w_send_out_of_sync(struct drbd_work *w, int cancel) | ||
1265 | { | 1326 | { |
1266 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1327 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1267 | int ok; | 1328 | struct drbd_conf *mdev = w->mdev; |
1329 | struct drbd_tconn *tconn = mdev->tconn; | ||
1330 | int err; | ||
1268 | 1331 | ||
1269 | if (unlikely(cancel)) { | 1332 | if (unlikely(cancel)) { |
1270 | req_mod(req, send_canceled); | 1333 | req_mod(req, SEND_CANCELED); |
1271 | return 1; | 1334 | return 0; |
1272 | } | 1335 | } |
1273 | 1336 | ||
1274 | ok = drbd_send_oos(mdev, req); | 1337 | /* this time, no tconn->send.current_epoch_writes++; |
1275 | req_mod(req, oos_handed_to_network); | 1338 | * If it was sent, it was the closing barrier for the last |
1339 | * replicated epoch, before we went into AHEAD mode. | ||
1340 | * No more barriers will be sent, until we leave AHEAD mode again. */ | ||
1341 | maybe_send_barrier(tconn, req->epoch); | ||
1342 | |||
1343 | err = drbd_send_out_of_sync(mdev, req); | ||
1344 | req_mod(req, OOS_HANDED_TO_NETWORK); | ||
1276 | 1345 | ||
1277 | return ok; | 1346 | return err; |
1278 | } | 1347 | } |
1279 | 1348 | ||
1280 | /** | 1349 | /** |
@@ -1283,20 +1352,26 @@ int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1283 | * @w: work object. | 1352 | * @w: work object. |
1284 | * @cancel: The connection will be closed anyways | 1353 | * @cancel: The connection will be closed anyways |
1285 | */ | 1354 | */ |
1286 | int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1355 | int w_send_dblock(struct drbd_work *w, int cancel) |
1287 | { | 1356 | { |
1288 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1357 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1289 | int ok; | 1358 | struct drbd_conf *mdev = w->mdev; |
1359 | struct drbd_tconn *tconn = mdev->tconn; | ||
1360 | int err; | ||
1290 | 1361 | ||
1291 | if (unlikely(cancel)) { | 1362 | if (unlikely(cancel)) { |
1292 | req_mod(req, send_canceled); | 1363 | req_mod(req, SEND_CANCELED); |
1293 | return 1; | 1364 | return 0; |
1294 | } | 1365 | } |
1295 | 1366 | ||
1296 | ok = drbd_send_dblock(mdev, req); | 1367 | re_init_if_first_write(tconn, req->epoch); |
1297 | req_mod(req, ok ? handed_over_to_network : send_failed); | 1368 | maybe_send_barrier(tconn, req->epoch); |
1369 | tconn->send.current_epoch_writes++; | ||
1370 | |||
1371 | err = drbd_send_dblock(mdev, req); | ||
1372 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); | ||
1298 | 1373 | ||
1299 | return ok; | 1374 | return err; |
1300 | } | 1375 | } |
1301 | 1376 | ||
1302 | /** | 1377 | /** |
@@ -1305,57 +1380,61 @@ int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | |||
1305 | * @w: work object. | 1380 | * @w: work object. |
1306 | * @cancel: The connection will be closed anyways | 1381 | * @cancel: The connection will be closed anyways |
1307 | */ | 1382 | */ |
1308 | int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1383 | int w_send_read_req(struct drbd_work *w, int cancel) |
1309 | { | 1384 | { |
1310 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1385 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1311 | int ok; | 1386 | struct drbd_conf *mdev = w->mdev; |
1387 | struct drbd_tconn *tconn = mdev->tconn; | ||
1388 | int err; | ||
1312 | 1389 | ||
1313 | if (unlikely(cancel)) { | 1390 | if (unlikely(cancel)) { |
1314 | req_mod(req, send_canceled); | 1391 | req_mod(req, SEND_CANCELED); |
1315 | return 1; | 1392 | return 0; |
1316 | } | 1393 | } |
1317 | 1394 | ||
1318 | ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, | 1395 | /* Even read requests may close a write epoch, |
1319 | (unsigned long)req); | 1396 | * if there was any yet. */ |
1397 | maybe_send_barrier(tconn, req->epoch); | ||
1320 | 1398 | ||
1321 | if (!ok) { | 1399 | err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size, |
1322 | /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); | 1400 | (unsigned long)req); |
1323 | * so this is probably redundant */ | 1401 | |
1324 | if (mdev->state.conn >= C_CONNECTED) | 1402 | req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); |
1325 | drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); | ||
1326 | } | ||
1327 | req_mod(req, ok ? handed_over_to_network : send_failed); | ||
1328 | 1403 | ||
1329 | return ok; | 1404 | return err; |
1330 | } | 1405 | } |
1331 | 1406 | ||
1332 | int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) | 1407 | int w_restart_disk_io(struct drbd_work *w, int cancel) |
1333 | { | 1408 | { |
1334 | struct drbd_request *req = container_of(w, struct drbd_request, w); | 1409 | struct drbd_request *req = container_of(w, struct drbd_request, w); |
1410 | struct drbd_conf *mdev = w->mdev; | ||
1335 | 1411 | ||
1336 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) | 1412 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) |
1337 | drbd_al_begin_io(mdev, req->sector); | 1413 | drbd_al_begin_io(mdev, &req->i); |
1338 | /* Calling drbd_al_begin_io() out of the worker might deadlocks | ||
1339 | theoretically. Practically it can not deadlock, since this is | ||
1340 | only used when unfreezing IOs. All the extents of the requests | ||
1341 | that made it into the TL are already active */ | ||
1342 | 1414 | ||
1343 | drbd_req_make_private_bio(req, req->master_bio); | 1415 | drbd_req_make_private_bio(req, req->master_bio); |
1344 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | 1416 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; |
1345 | generic_make_request(req->private_bio); | 1417 | generic_make_request(req->private_bio); |
1346 | 1418 | ||
1347 | return 1; | 1419 | return 0; |
1348 | } | 1420 | } |
1349 | 1421 | ||
1350 | static int _drbd_may_sync_now(struct drbd_conf *mdev) | 1422 | static int _drbd_may_sync_now(struct drbd_conf *mdev) |
1351 | { | 1423 | { |
1352 | struct drbd_conf *odev = mdev; | 1424 | struct drbd_conf *odev = mdev; |
1425 | int resync_after; | ||
1353 | 1426 | ||
1354 | while (1) { | 1427 | while (1) { |
1355 | if (odev->sync_conf.after == -1) | 1428 | if (!odev->ldev) |
1429 | return 1; | ||
1430 | rcu_read_lock(); | ||
1431 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1432 | rcu_read_unlock(); | ||
1433 | if (resync_after == -1) | ||
1434 | return 1; | ||
1435 | odev = minor_to_mdev(resync_after); | ||
1436 | if (!expect(odev)) | ||
1356 | return 1; | 1437 | return 1; |
1357 | odev = minor_to_mdev(odev->sync_conf.after); | ||
1358 | ERR_IF(!odev) return 1; | ||
1359 | if ((odev->state.conn >= C_SYNC_SOURCE && | 1438 | if ((odev->state.conn >= C_SYNC_SOURCE && |
1360 | odev->state.conn <= C_PAUSED_SYNC_T) || | 1439 | odev->state.conn <= C_PAUSED_SYNC_T) || |
1361 | odev->state.aftr_isp || odev->state.peer_isp || | 1440 | odev->state.aftr_isp || odev->state.peer_isp || |
@@ -1375,16 +1454,15 @@ static int _drbd_pause_after(struct drbd_conf *mdev) | |||
1375 | struct drbd_conf *odev; | 1454 | struct drbd_conf *odev; |
1376 | int i, rv = 0; | 1455 | int i, rv = 0; |
1377 | 1456 | ||
1378 | for (i = 0; i < minor_count; i++) { | 1457 | rcu_read_lock(); |
1379 | odev = minor_to_mdev(i); | 1458 | idr_for_each_entry(&minors, odev, i) { |
1380 | if (!odev) | ||
1381 | continue; | ||
1382 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1459 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1383 | continue; | 1460 | continue; |
1384 | if (!_drbd_may_sync_now(odev)) | 1461 | if (!_drbd_may_sync_now(odev)) |
1385 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) | 1462 | rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) |
1386 | != SS_NOTHING_TO_DO); | 1463 | != SS_NOTHING_TO_DO); |
1387 | } | 1464 | } |
1465 | rcu_read_unlock(); | ||
1388 | 1466 | ||
1389 | return rv; | 1467 | return rv; |
1390 | } | 1468 | } |
@@ -1400,10 +1478,8 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1400 | struct drbd_conf *odev; | 1478 | struct drbd_conf *odev; |
1401 | int i, rv = 0; | 1479 | int i, rv = 0; |
1402 | 1480 | ||
1403 | for (i = 0; i < minor_count; i++) { | 1481 | rcu_read_lock(); |
1404 | odev = minor_to_mdev(i); | 1482 | idr_for_each_entry(&minors, odev, i) { |
1405 | if (!odev) | ||
1406 | continue; | ||
1407 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) | 1483 | if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) |
1408 | continue; | 1484 | continue; |
1409 | if (odev->state.aftr_isp) { | 1485 | if (odev->state.aftr_isp) { |
@@ -1413,6 +1489,7 @@ static int _drbd_resume_next(struct drbd_conf *mdev) | |||
1413 | != SS_NOTHING_TO_DO) ; | 1489 | != SS_NOTHING_TO_DO) ; |
1414 | } | 1490 | } |
1415 | } | 1491 | } |
1492 | rcu_read_unlock(); | ||
1416 | return rv; | 1493 | return rv; |
1417 | } | 1494 | } |
1418 | 1495 | ||
@@ -1430,57 +1507,86 @@ void suspend_other_sg(struct drbd_conf *mdev) | |||
1430 | write_unlock_irq(&global_state_lock); | 1507 | write_unlock_irq(&global_state_lock); |
1431 | } | 1508 | } |
1432 | 1509 | ||
1433 | static int sync_after_error(struct drbd_conf *mdev, int o_minor) | 1510 | /* caller must hold global_state_lock */ |
1511 | enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | ||
1434 | { | 1512 | { |
1435 | struct drbd_conf *odev; | 1513 | struct drbd_conf *odev; |
1514 | int resync_after; | ||
1436 | 1515 | ||
1437 | if (o_minor == -1) | 1516 | if (o_minor == -1) |
1438 | return NO_ERROR; | 1517 | return NO_ERROR; |
1439 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | 1518 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) |
1440 | return ERR_SYNC_AFTER; | 1519 | return ERR_RESYNC_AFTER; |
1441 | 1520 | ||
1442 | /* check for loops */ | 1521 | /* check for loops */ |
1443 | odev = minor_to_mdev(o_minor); | 1522 | odev = minor_to_mdev(o_minor); |
1444 | while (1) { | 1523 | while (1) { |
1445 | if (odev == mdev) | 1524 | if (odev == mdev) |
1446 | return ERR_SYNC_AFTER_CYCLE; | 1525 | return ERR_RESYNC_AFTER_CYCLE; |
1447 | 1526 | ||
1527 | rcu_read_lock(); | ||
1528 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | ||
1529 | rcu_read_unlock(); | ||
1448 | /* dependency chain ends here, no cycles. */ | 1530 | /* dependency chain ends here, no cycles. */ |
1449 | if (odev->sync_conf.after == -1) | 1531 | if (resync_after == -1) |
1450 | return NO_ERROR; | 1532 | return NO_ERROR; |
1451 | 1533 | ||
1452 | /* follow the dependency chain */ | 1534 | /* follow the dependency chain */ |
1453 | odev = minor_to_mdev(odev->sync_conf.after); | 1535 | odev = minor_to_mdev(resync_after); |
1454 | } | 1536 | } |
1455 | } | 1537 | } |
1456 | 1538 | ||
1457 | int drbd_alter_sa(struct drbd_conf *mdev, int na) | 1539 | /* caller must hold global_state_lock */ |
1540 | void drbd_resync_after_changed(struct drbd_conf *mdev) | ||
1458 | { | 1541 | { |
1459 | int changes; | 1542 | int changes; |
1460 | int retcode; | ||
1461 | 1543 | ||
1462 | write_lock_irq(&global_state_lock); | 1544 | do { |
1463 | retcode = sync_after_error(mdev, na); | 1545 | changes = _drbd_pause_after(mdev); |
1464 | if (retcode == NO_ERROR) { | 1546 | changes |= _drbd_resume_next(mdev); |
1465 | mdev->sync_conf.after = na; | 1547 | } while (changes); |
1466 | do { | ||
1467 | changes = _drbd_pause_after(mdev); | ||
1468 | changes |= _drbd_resume_next(mdev); | ||
1469 | } while (changes); | ||
1470 | } | ||
1471 | write_unlock_irq(&global_state_lock); | ||
1472 | return retcode; | ||
1473 | } | 1548 | } |
1474 | 1549 | ||
1475 | void drbd_rs_controller_reset(struct drbd_conf *mdev) | 1550 | void drbd_rs_controller_reset(struct drbd_conf *mdev) |
1476 | { | 1551 | { |
1552 | struct fifo_buffer *plan; | ||
1553 | |||
1477 | atomic_set(&mdev->rs_sect_in, 0); | 1554 | atomic_set(&mdev->rs_sect_in, 0); |
1478 | atomic_set(&mdev->rs_sect_ev, 0); | 1555 | atomic_set(&mdev->rs_sect_ev, 0); |
1479 | mdev->rs_in_flight = 0; | 1556 | mdev->rs_in_flight = 0; |
1480 | mdev->rs_planed = 0; | 1557 | |
1481 | spin_lock(&mdev->peer_seq_lock); | 1558 | /* Updating the RCU protected object in place is necessary since |
1482 | fifo_set(&mdev->rs_plan_s, 0); | 1559 | this function gets called from atomic context. |
1483 | spin_unlock(&mdev->peer_seq_lock); | 1560 | It is valid since all other updates also lead to a completely |
1561 | empty fifo */ | ||
1562 | rcu_read_lock(); | ||
1563 | plan = rcu_dereference(mdev->rs_plan_s); | ||
1564 | plan->total = 0; | ||
1565 | fifo_set(plan, 0); | ||
1566 | rcu_read_unlock(); | ||
1567 | } | ||
1568 | |||
1569 | void start_resync_timer_fn(unsigned long data) | ||
1570 | { | ||
1571 | struct drbd_conf *mdev = (struct drbd_conf *) data; | ||
1572 | |||
1573 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work); | ||
1574 | } | ||
1575 | |||
1576 | int w_start_resync(struct drbd_work *w, int cancel) | ||
1577 | { | ||
1578 | struct drbd_conf *mdev = w->mdev; | ||
1579 | |||
1580 | if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { | ||
1581 | dev_warn(DEV, "w_start_resync later...\n"); | ||
1582 | mdev->start_resync_timer.expires = jiffies + HZ/10; | ||
1583 | add_timer(&mdev->start_resync_timer); | ||
1584 | return 0; | ||
1585 | } | ||
1586 | |||
1587 | drbd_start_resync(mdev, C_SYNC_SOURCE); | ||
1588 | clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); | ||
1589 | return 0; | ||
1484 | } | 1590 | } |
1485 | 1591 | ||
1486 | /** | 1592 | /** |
@@ -1501,43 +1607,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1501 | return; | 1607 | return; |
1502 | } | 1608 | } |
1503 | 1609 | ||
1504 | if (side == C_SYNC_TARGET) { | 1610 | if (!test_bit(B_RS_H_DONE, &mdev->flags)) { |
1505 | /* Since application IO was locked out during C_WF_BITMAP_T and | 1611 | if (side == C_SYNC_TARGET) { |
1506 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET | 1612 | /* Since application IO was locked out during C_WF_BITMAP_T and |
1507 | we check that we might make the data inconsistent. */ | 1613 | C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET |
1508 | r = drbd_khelper(mdev, "before-resync-target"); | 1614 | we check that we might make the data inconsistent. */ |
1509 | r = (r >> 8) & 0xff; | 1615 | r = drbd_khelper(mdev, "before-resync-target"); |
1510 | if (r > 0) { | 1616 | r = (r >> 8) & 0xff; |
1511 | dev_info(DEV, "before-resync-target handler returned %d, " | 1617 | if (r > 0) { |
1512 | "dropping connection.\n", r); | 1618 | dev_info(DEV, "before-resync-target handler returned %d, " |
1513 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | ||
1514 | return; | ||
1515 | } | ||
1516 | } else /* C_SYNC_SOURCE */ { | ||
1517 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1518 | r = (r >> 8) & 0xff; | ||
1519 | if (r > 0) { | ||
1520 | if (r == 3) { | ||
1521 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1522 | "ignoring. Old userland tools?", r); | ||
1523 | } else { | ||
1524 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1525 | "dropping connection.\n", r); | 1619 | "dropping connection.\n", r); |
1526 | drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); | 1620 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); |
1527 | return; | 1621 | return; |
1528 | } | 1622 | } |
1623 | } else /* C_SYNC_SOURCE */ { | ||
1624 | r = drbd_khelper(mdev, "before-resync-source"); | ||
1625 | r = (r >> 8) & 0xff; | ||
1626 | if (r > 0) { | ||
1627 | if (r == 3) { | ||
1628 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1629 | "ignoring. Old userland tools?", r); | ||
1630 | } else { | ||
1631 | dev_info(DEV, "before-resync-source handler returned %d, " | ||
1632 | "dropping connection.\n", r); | ||
1633 | conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); | ||
1634 | return; | ||
1635 | } | ||
1636 | } | ||
1529 | } | 1637 | } |
1530 | } | 1638 | } |
1531 | 1639 | ||
1532 | drbd_state_lock(mdev); | 1640 | if (current == mdev->tconn->worker.task) { |
1641 | /* The worker should not sleep waiting for state_mutex, | ||
1642 | that can take long */ | ||
1643 | if (!mutex_trylock(mdev->state_mutex)) { | ||
1644 | set_bit(B_RS_H_DONE, &mdev->flags); | ||
1645 | mdev->start_resync_timer.expires = jiffies + HZ/5; | ||
1646 | add_timer(&mdev->start_resync_timer); | ||
1647 | return; | ||
1648 | } | ||
1649 | } else { | ||
1650 | mutex_lock(mdev->state_mutex); | ||
1651 | } | ||
1652 | clear_bit(B_RS_H_DONE, &mdev->flags); | ||
1653 | |||
1533 | write_lock_irq(&global_state_lock); | 1654 | write_lock_irq(&global_state_lock); |
1534 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | 1655 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { |
1535 | write_unlock_irq(&global_state_lock); | 1656 | write_unlock_irq(&global_state_lock); |
1536 | drbd_state_unlock(mdev); | 1657 | mutex_unlock(mdev->state_mutex); |
1537 | return; | 1658 | return; |
1538 | } | 1659 | } |
1539 | 1660 | ||
1540 | ns.i = mdev->state.i; | 1661 | ns = drbd_read_state(mdev); |
1541 | 1662 | ||
1542 | ns.aftr_isp = !_drbd_may_sync_now(mdev); | 1663 | ns.aftr_isp = !_drbd_may_sync_now(mdev); |
1543 | 1664 | ||
@@ -1549,7 +1670,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1549 | ns.pdsk = D_INCONSISTENT; | 1670 | ns.pdsk = D_INCONSISTENT; |
1550 | 1671 | ||
1551 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); | 1672 | r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); |
1552 | ns = mdev->state; | 1673 | ns = drbd_read_state(mdev); |
1553 | 1674 | ||
1554 | if (ns.conn < C_CONNECTED) | 1675 | if (ns.conn < C_CONNECTED) |
1555 | r = SS_UNKNOWN_ERROR; | 1676 | r = SS_UNKNOWN_ERROR; |
@@ -1575,6 +1696,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1575 | write_unlock_irq(&global_state_lock); | 1696 | write_unlock_irq(&global_state_lock); |
1576 | 1697 | ||
1577 | if (r == SS_SUCCESS) { | 1698 | if (r == SS_SUCCESS) { |
1699 | /* reset rs_last_bcast when a resync or verify is started, | ||
1700 | * to deal with potential jiffies wrap. */ | ||
1701 | mdev->rs_last_bcast = jiffies - HZ; | ||
1702 | |||
1578 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", | 1703 | dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", |
1579 | drbd_conn_str(ns.conn), | 1704 | drbd_conn_str(ns.conn), |
1580 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), | 1705 | (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), |
@@ -1589,10 +1714,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1589 | * drbd_resync_finished from here in that case. | 1714 | * drbd_resync_finished from here in that case. |
1590 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, | 1715 | * We drbd_gen_and_send_sync_uuid here for protocol < 96, |
1591 | * and from after_state_ch otherwise. */ | 1716 | * and from after_state_ch otherwise. */ |
1592 | if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96) | 1717 | if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96) |
1593 | drbd_gen_and_send_sync_uuid(mdev); | 1718 | drbd_gen_and_send_sync_uuid(mdev); |
1594 | 1719 | ||
1595 | if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { | 1720 | if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) { |
1596 | /* This still has a race (about when exactly the peers | 1721 | /* This still has a race (about when exactly the peers |
1597 | * detect connection loss) that can lead to a full sync | 1722 | * detect connection loss) that can lead to a full sync |
1598 | * on next handshake. In 8.3.9 we fixed this with explicit | 1723 | * on next handshake. In 8.3.9 we fixed this with explicit |
@@ -1603,10 +1728,16 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1603 | * detect connection loss, then waiting for a ping | 1728 | * detect connection loss, then waiting for a ping |
1604 | * response (implicit in drbd_resync_finished) reduces | 1729 | * response (implicit in drbd_resync_finished) reduces |
1605 | * the race considerably, but does not solve it. */ | 1730 | * the race considerably, but does not solve it. */ |
1606 | if (side == C_SYNC_SOURCE) | 1731 | if (side == C_SYNC_SOURCE) { |
1607 | schedule_timeout_interruptible( | 1732 | struct net_conf *nc; |
1608 | mdev->net_conf->ping_int * HZ + | 1733 | int timeo; |
1609 | mdev->net_conf->ping_timeo*HZ/9); | 1734 | |
1735 | rcu_read_lock(); | ||
1736 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
1737 | timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; | ||
1738 | rcu_read_unlock(); | ||
1739 | schedule_timeout_interruptible(timeo); | ||
1740 | } | ||
1610 | drbd_resync_finished(mdev); | 1741 | drbd_resync_finished(mdev); |
1611 | } | 1742 | } |
1612 | 1743 | ||
@@ -1621,114 +1752,180 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1621 | drbd_md_sync(mdev); | 1752 | drbd_md_sync(mdev); |
1622 | } | 1753 | } |
1623 | put_ldev(mdev); | 1754 | put_ldev(mdev); |
1624 | drbd_state_unlock(mdev); | 1755 | mutex_unlock(mdev->state_mutex); |
1625 | } | 1756 | } |
1626 | 1757 | ||
1627 | int drbd_worker(struct drbd_thread *thi) | 1758 | /* If the resource already closed the current epoch, but we did not |
1759 | * (because we have not yet seen new requests), we should send the | ||
1760 | * corresponding barrier now. Must be checked within the same spinlock | ||
1761 | * that is used to check for new requests. */ | ||
1762 | bool need_to_send_barrier(struct drbd_tconn *connection) | ||
1628 | { | 1763 | { |
1629 | struct drbd_conf *mdev = thi->mdev; | 1764 | if (!connection->send.seen_any_write_yet) |
1630 | struct drbd_work *w = NULL; | 1765 | return false; |
1631 | LIST_HEAD(work_list); | 1766 | |
1632 | int intr = 0, i; | 1767 | /* Skip barriers that do not contain any writes. |
1768 | * This may happen during AHEAD mode. */ | ||
1769 | if (!connection->send.current_epoch_writes) | ||
1770 | return false; | ||
1771 | |||
1772 | /* ->req_lock is held when requests are queued on | ||
1773 | * connection->sender_work, and put into ->transfer_log. | ||
1774 | * It is also held when ->current_tle_nr is increased. | ||
1775 | * So either there are already new requests queued, | ||
1776 | * and corresponding barriers will be send there. | ||
1777 | * Or nothing new is queued yet, so the difference will be 1. | ||
1778 | */ | ||
1779 | if (atomic_read(&connection->current_tle_nr) != | ||
1780 | connection->send.current_epoch_nr + 1) | ||
1781 | return false; | ||
1782 | |||
1783 | return true; | ||
1784 | } | ||
1785 | |||
1786 | bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) | ||
1787 | { | ||
1788 | spin_lock_irq(&queue->q_lock); | ||
1789 | list_splice_init(&queue->q, work_list); | ||
1790 | spin_unlock_irq(&queue->q_lock); | ||
1791 | return !list_empty(work_list); | ||
1792 | } | ||
1633 | 1793 | ||
1634 | sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); | 1794 | bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list) |
1795 | { | ||
1796 | spin_lock_irq(&queue->q_lock); | ||
1797 | if (!list_empty(&queue->q)) | ||
1798 | list_move(queue->q.next, work_list); | ||
1799 | spin_unlock_irq(&queue->q_lock); | ||
1800 | return !list_empty(work_list); | ||
1801 | } | ||
1635 | 1802 | ||
1636 | while (get_t_state(thi) == Running) { | 1803 | void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list) |
1637 | drbd_thread_current_set_cpu(mdev); | 1804 | { |
1805 | DEFINE_WAIT(wait); | ||
1806 | struct net_conf *nc; | ||
1807 | int uncork, cork; | ||
1638 | 1808 | ||
1639 | if (down_trylock(&mdev->data.work.s)) { | 1809 | dequeue_work_item(&connection->sender_work, work_list); |
1640 | mutex_lock(&mdev->data.mutex); | 1810 | if (!list_empty(work_list)) |
1641 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1811 | return; |
1642 | drbd_tcp_uncork(mdev->data.socket); | ||
1643 | mutex_unlock(&mdev->data.mutex); | ||
1644 | 1812 | ||
1645 | intr = down_interruptible(&mdev->data.work.s); | 1813 | /* Still nothing to do? |
1814 | * Maybe we still need to close the current epoch, | ||
1815 | * even if no new requests are queued yet. | ||
1816 | * | ||
1817 | * Also, poke TCP, just in case. | ||
1818 | * Then wait for new work (or signal). */ | ||
1819 | rcu_read_lock(); | ||
1820 | nc = rcu_dereference(connection->net_conf); | ||
1821 | uncork = nc ? nc->tcp_cork : 0; | ||
1822 | rcu_read_unlock(); | ||
1823 | if (uncork) { | ||
1824 | mutex_lock(&connection->data.mutex); | ||
1825 | if (connection->data.socket) | ||
1826 | drbd_tcp_uncork(connection->data.socket); | ||
1827 | mutex_unlock(&connection->data.mutex); | ||
1828 | } | ||
1646 | 1829 | ||
1647 | mutex_lock(&mdev->data.mutex); | 1830 | for (;;) { |
1648 | if (mdev->data.socket && !mdev->net_conf->no_cork) | 1831 | int send_barrier; |
1649 | drbd_tcp_cork(mdev->data.socket); | 1832 | prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); |
1650 | mutex_unlock(&mdev->data.mutex); | 1833 | spin_lock_irq(&connection->req_lock); |
1834 | spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1835 | /* dequeue single item only, | ||
1836 | * we still use drbd_queue_work_front() in some places */ | ||
1837 | if (!list_empty(&connection->sender_work.q)) | ||
1838 | list_move(connection->sender_work.q.next, work_list); | ||
1839 | spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ | ||
1840 | if (!list_empty(work_list) || signal_pending(current)) { | ||
1841 | spin_unlock_irq(&connection->req_lock); | ||
1842 | break; | ||
1651 | } | 1843 | } |
1844 | send_barrier = need_to_send_barrier(connection); | ||
1845 | spin_unlock_irq(&connection->req_lock); | ||
1846 | if (send_barrier) { | ||
1847 | drbd_send_barrier(connection); | ||
1848 | connection->send.current_epoch_nr++; | ||
1849 | } | ||
1850 | schedule(); | ||
1851 | /* may be woken up for other things besides new work, too, | ||
1852 | * e.g. if the current epoch got closed, | ||
1853 | * in which case we send the barrier above. */ | ||
1854 | } | ||
1855 | finish_wait(&connection->sender_work.q_wait, &wait); | ||
1856 | |||
1857 | /* someone may have changed the config while we have been waiting above. */ | ||
1858 | rcu_read_lock(); | ||
1859 | nc = rcu_dereference(connection->net_conf); | ||
1860 | cork = nc ? nc->tcp_cork : 0; | ||
1861 | rcu_read_unlock(); | ||
1862 | mutex_lock(&connection->data.mutex); | ||
1863 | if (connection->data.socket) { | ||
1864 | if (cork) | ||
1865 | drbd_tcp_cork(connection->data.socket); | ||
1866 | else if (!uncork) | ||
1867 | drbd_tcp_uncork(connection->data.socket); | ||
1868 | } | ||
1869 | mutex_unlock(&connection->data.mutex); | ||
1870 | } | ||
1652 | 1871 | ||
1653 | if (intr) { | 1872 | int drbd_worker(struct drbd_thread *thi) |
1654 | D_ASSERT(intr == -EINTR); | 1873 | { |
1874 | struct drbd_tconn *tconn = thi->tconn; | ||
1875 | struct drbd_work *w = NULL; | ||
1876 | struct drbd_conf *mdev; | ||
1877 | LIST_HEAD(work_list); | ||
1878 | int vnr; | ||
1879 | |||
1880 | while (get_t_state(thi) == RUNNING) { | ||
1881 | drbd_thread_current_set_cpu(thi); | ||
1882 | |||
1883 | /* as long as we use drbd_queue_work_front(), | ||
1884 | * we may only dequeue single work items here, not batches. */ | ||
1885 | if (list_empty(&work_list)) | ||
1886 | wait_for_work(tconn, &work_list); | ||
1887 | |||
1888 | if (signal_pending(current)) { | ||
1655 | flush_signals(current); | 1889 | flush_signals(current); |
1656 | ERR_IF (get_t_state(thi) == Running) | 1890 | if (get_t_state(thi) == RUNNING) { |
1891 | conn_warn(tconn, "Worker got an unexpected signal\n"); | ||
1657 | continue; | 1892 | continue; |
1893 | } | ||
1658 | break; | 1894 | break; |
1659 | } | 1895 | } |
1660 | 1896 | ||
1661 | if (get_t_state(thi) != Running) | 1897 | if (get_t_state(thi) != RUNNING) |
1662 | break; | 1898 | break; |
1663 | /* With this break, we have done a down() but not consumed | 1899 | |
1664 | the entry from the list. The cleanup code takes care of | 1900 | while (!list_empty(&work_list)) { |
1665 | this... */ | 1901 | w = list_first_entry(&work_list, struct drbd_work, list); |
1666 | 1902 | list_del_init(&w->list); | |
1667 | w = NULL; | 1903 | if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0) |
1668 | spin_lock_irq(&mdev->data.work.q_lock); | 1904 | continue; |
1669 | ERR_IF(list_empty(&mdev->data.work.q)) { | 1905 | if (tconn->cstate >= C_WF_REPORT_PARAMS) |
1670 | /* something terribly wrong in our logic. | 1906 | conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); |
1671 | * we were able to down() the semaphore, | ||
1672 | * but the list is empty... doh. | ||
1673 | * | ||
1674 | * what is the best thing to do now? | ||
1675 | * try again from scratch, restarting the receiver, | ||
1676 | * asender, whatnot? could break even more ugly, | ||
1677 | * e.g. when we are primary, but no good local data. | ||
1678 | * | ||
1679 | * I'll try to get away just starting over this loop. | ||
1680 | */ | ||
1681 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1682 | continue; | ||
1683 | } | ||
1684 | w = list_entry(mdev->data.work.q.next, struct drbd_work, list); | ||
1685 | list_del_init(&w->list); | ||
1686 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1687 | |||
1688 | if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { | ||
1689 | /* dev_warn(DEV, "worker: a callback failed! \n"); */ | ||
1690 | if (mdev->state.conn >= C_CONNECTED) | ||
1691 | drbd_force_state(mdev, | ||
1692 | NS(conn, C_NETWORK_FAILURE)); | ||
1693 | } | 1907 | } |
1694 | } | 1908 | } |
1695 | D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); | ||
1696 | D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); | ||
1697 | |||
1698 | spin_lock_irq(&mdev->data.work.q_lock); | ||
1699 | i = 0; | ||
1700 | while (!list_empty(&mdev->data.work.q)) { | ||
1701 | list_splice_init(&mdev->data.work.q, &work_list); | ||
1702 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1703 | 1909 | ||
1910 | do { | ||
1704 | while (!list_empty(&work_list)) { | 1911 | while (!list_empty(&work_list)) { |
1705 | w = list_entry(work_list.next, struct drbd_work, list); | 1912 | w = list_first_entry(&work_list, struct drbd_work, list); |
1706 | list_del_init(&w->list); | 1913 | list_del_init(&w->list); |
1707 | w->cb(mdev, w, 1); | 1914 | w->cb(w, 1); |
1708 | i++; /* dead debugging code */ | ||
1709 | } | 1915 | } |
1710 | 1916 | dequeue_work_batch(&tconn->sender_work, &work_list); | |
1711 | spin_lock_irq(&mdev->data.work.q_lock); | 1917 | } while (!list_empty(&work_list)); |
1918 | |||
1919 | rcu_read_lock(); | ||
1920 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | ||
1921 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1922 | kref_get(&mdev->kref); | ||
1923 | rcu_read_unlock(); | ||
1924 | drbd_mdev_cleanup(mdev); | ||
1925 | kref_put(&mdev->kref, &drbd_minor_destroy); | ||
1926 | rcu_read_lock(); | ||
1712 | } | 1927 | } |
1713 | sema_init(&mdev->data.work.s, 0); | 1928 | rcu_read_unlock(); |
1714 | /* DANGEROUS race: if someone did queue his work within the spinlock, | ||
1715 | * but up() ed outside the spinlock, we could get an up() on the | ||
1716 | * semaphore without corresponding list entry. | ||
1717 | * So don't do that. | ||
1718 | */ | ||
1719 | spin_unlock_irq(&mdev->data.work.q_lock); | ||
1720 | |||
1721 | D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); | ||
1722 | /* _drbd_set_state only uses stop_nowait. | ||
1723 | * wait here for the Exiting receiver. */ | ||
1724 | drbd_thread_stop(&mdev->receiver); | ||
1725 | drbd_mdev_cleanup(mdev); | ||
1726 | |||
1727 | dev_info(DEV, "worker terminated\n"); | ||
1728 | |||
1729 | clear_bit(DEVICE_DYING, &mdev->flags); | ||
1730 | clear_bit(CONFIG_PENDING, &mdev->flags); | ||
1731 | wake_up(&mdev->state_wait); | ||
1732 | 1929 | ||
1733 | return 0; | 1930 | return 0; |
1734 | } | 1931 | } |
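The drbd_resync_after_valid() hunk above walks the resync-after dependency chain and rejects any setting that would close a cycle; the new code also reads resync_after from the RCU-protected disk_conf instead of the old sync_conf field. Below is a minimal user-space sketch of the same walk; the array of minors, the bounds and the return codes are illustrative stand-ins rather than DRBD's API, and the RCU protection around the per-device config is elided.

#include <stdio.h>

#define MINOR_COUNT 8
#define NOT_AFTER   (-1)

/* resync_after[i] holds the minor that device i must resync after,
 * or NOT_AFTER if it has no dependency. */
static int resync_after[MINOR_COUNT] = {
    NOT_AFTER, 0, 1, NOT_AFTER, 3, NOT_AFTER, NOT_AFTER, NOT_AFTER
};

/* Returns 0 if "minor resyncs after o_minor" is acceptable,
 * -1 if o_minor is out of range, -2 if it would create a cycle.
 * Mirrors drbd_resync_after_valid(): follow the chain starting at
 * o_minor and fail if it ever reaches "minor" again. */
static int resync_after_valid(int minor, int o_minor)
{
    int odev;

    if (o_minor == NOT_AFTER)
        return 0;
    if (o_minor < 0 || o_minor >= MINOR_COUNT)
        return -1;              /* like ERR_RESYNC_AFTER */

    for (odev = o_minor; ; odev = resync_after[odev]) {
        if (odev == minor)
            return -2;          /* like ERR_RESYNC_AFTER_CYCLE */
        if (resync_after[odev] == NOT_AFTER)
            return 0;           /* dependency chain ends, no cycle */
    }
}

int main(void)
{
    /* In the table above, 2 already syncs after 1, 1 after 0, 4 after 3. */
    printf("2 after 4: %d\n", resync_after_valid(2, 4)); /* 0: fine */
    printf("0 after 2: %d\n", resync_after_valid(0, 2)); /* -2: 0->2->1->0 */
    return 0;
}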
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index 151f1a37478f..328f18e4b4ee 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/ctype.h> | 4 | #include <linux/ctype.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include "drbd_int.h" | ||
6 | 7 | ||
7 | /* see get_sb_bdev and bd_claim */ | 8 | /* see get_sb_bdev and bd_claim */ |
8 | extern char *drbd_sec_holder; | 9 | extern char *drbd_sec_holder; |
@@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, | |||
20 | 21 | ||
21 | /* bi_end_io handlers */ | 22 | /* bi_end_io handlers */ |
22 | extern void drbd_md_io_complete(struct bio *bio, int error); | 23 | extern void drbd_md_io_complete(struct bio *bio, int error); |
23 | extern void drbd_endio_sec(struct bio *bio, int error); | 24 | extern void drbd_peer_request_endio(struct bio *bio, int error); |
24 | extern void drbd_endio_pri(struct bio *bio, int error); | 25 | extern void drbd_request_endio(struct bio *bio, int error); |
25 | 26 | ||
26 | /* | 27 | /* |
27 | * used to submit our private bio | 28 | * used to submit our private bio |
@@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev, | |||
45 | generic_make_request(bio); | 46 | generic_make_request(bio); |
46 | } | 47 | } |
47 | 48 | ||
48 | static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) | ||
49 | { | ||
50 | return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) | ||
51 | == CRYPTO_ALG_TYPE_HASH; | ||
52 | } | ||
53 | |||
54 | #ifndef __CHECKER__ | 49 | #ifndef __CHECKER__ |
55 | # undef __cond_lock | 50 | # undef __cond_lock |
56 | # define __cond_lock(x,c) (c) | 51 | # define __cond_lock(x,c) (c) |
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 54046e51160a..ae1251270624 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c | |||
@@ -463,6 +463,7 @@ out: | |||
463 | */ | 463 | */ |
464 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) | 464 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) |
465 | { | 465 | { |
466 | lo->lo_bio_count++; | ||
466 | bio_list_add(&lo->lo_bio_list, bio); | 467 | bio_list_add(&lo->lo_bio_list, bio); |
467 | } | 468 | } |
468 | 469 | ||
@@ -471,6 +472,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio) | |||
471 | */ | 472 | */ |
472 | static struct bio *loop_get_bio(struct loop_device *lo) | 473 | static struct bio *loop_get_bio(struct loop_device *lo) |
473 | { | 474 | { |
475 | lo->lo_bio_count--; | ||
474 | return bio_list_pop(&lo->lo_bio_list); | 476 | return bio_list_pop(&lo->lo_bio_list); |
475 | } | 477 | } |
476 | 478 | ||
@@ -489,6 +491,10 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) | |||
489 | goto out; | 491 | goto out; |
490 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) | 492 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) |
491 | goto out; | 493 | goto out; |
494 | if (lo->lo_bio_count >= q->nr_congestion_on) | ||
495 | wait_event_lock_irq(lo->lo_req_wait, | ||
496 | lo->lo_bio_count < q->nr_congestion_off, | ||
497 | lo->lo_lock); | ||
492 | loop_add_bio(lo, old_bio); | 498 | loop_add_bio(lo, old_bio); |
493 | wake_up(&lo->lo_event); | 499 | wake_up(&lo->lo_event); |
494 | spin_unlock_irq(&lo->lo_lock); | 500 | spin_unlock_irq(&lo->lo_lock); |
@@ -546,6 +552,8 @@ static int loop_thread(void *data) | |||
546 | continue; | 552 | continue; |
547 | spin_lock_irq(&lo->lo_lock); | 553 | spin_lock_irq(&lo->lo_lock); |
548 | bio = loop_get_bio(lo); | 554 | bio = loop_get_bio(lo); |
555 | if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) | ||
556 | wake_up(&lo->lo_req_wait); | ||
549 | spin_unlock_irq(&lo->lo_lock); | 557 | spin_unlock_irq(&lo->lo_lock); |
550 | 558 | ||
551 | BUG_ON(!bio); | 559 | BUG_ON(!bio); |
@@ -873,6 +881,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, | |||
873 | lo->transfer = transfer_none; | 881 | lo->transfer = transfer_none; |
874 | lo->ioctl = NULL; | 882 | lo->ioctl = NULL; |
875 | lo->lo_sizelimit = 0; | 883 | lo->lo_sizelimit = 0; |
884 | lo->lo_bio_count = 0; | ||
876 | lo->old_gfp_mask = mapping_gfp_mask(mapping); | 885 | lo->old_gfp_mask = mapping_gfp_mask(mapping); |
877 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); | 886 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); |
878 | 887 | ||
@@ -1673,6 +1682,7 @@ static int loop_add(struct loop_device **l, int i) | |||
1673 | lo->lo_number = i; | 1682 | lo->lo_number = i; |
1674 | lo->lo_thread = NULL; | 1683 | lo->lo_thread = NULL; |
1675 | init_waitqueue_head(&lo->lo_event); | 1684 | init_waitqueue_head(&lo->lo_event); |
1685 | init_waitqueue_head(&lo->lo_req_wait); | ||
1676 | spin_lock_init(&lo->lo_lock); | 1686 | spin_lock_init(&lo->lo_lock); |
1677 | disk->major = LOOP_MAJOR; | 1687 | disk->major = LOOP_MAJOR; |
1678 | disk->first_minor = i << part_shift; | 1688 | disk->first_minor = i << part_shift; |
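The loop changes above throttle submitters once lo_bio_count reaches the queue's congestion-on threshold: loop_make_request() sleeps in the new wait_event_lock_irq() until loop_thread() has drained the backlog below the congestion-off threshold and woken lo_req_wait. A rough user-space sketch of that two-watermark hysteresis with a pthread mutex/condvar follows; the watermark values and helper names are made up for illustration, not taken from the driver.

#include <pthread.h>
#include <stdio.h>

#define CONGESTION_ON  32   /* start throttling producers here */
#define CONGESTION_OFF 16   /* let producers resume below this */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  req_wait = PTHREAD_COND_INITIALIZER;
static int bio_count;

/* Producer side: analogue of loop_make_request() + loop_add_bio().
 * Once the backlog hits the high watermark, wait until it has
 * dropped below the low watermark, just like the new
 * wait_event_lock_irq(lo_req_wait, lo_bio_count < nr_congestion_off, ...). */
static void submit_bio_throttled(void)
{
    pthread_mutex_lock(&lock);
    if (bio_count >= CONGESTION_ON) {
        while (bio_count >= CONGESTION_OFF)
            pthread_cond_wait(&req_wait, &lock);
    }
    bio_count++;                               /* like loop_add_bio() */
    pthread_mutex_unlock(&lock);
}

/* Consumer side: analogue of loop_thread() popping one bio and waking
 * throttled producers once the backlog is below the low watermark. */
static int consume_bio(void)
{
    int remaining;

    pthread_mutex_lock(&lock);
    remaining = bio_count > 0 ? --bio_count : 0;
    if (bio_count < CONGESTION_OFF)
        pthread_cond_broadcast(&req_wait);     /* like wake_up(&lo->lo_req_wait) */
    pthread_mutex_unlock(&lock);
    return remaining;
}

int main(void)
{
    int i, queued, left;

    for (i = 0; i < CONGESTION_ON - 1; i++)
        submit_bio_throttled();
    queued = bio_count;
    left = consume_bio();
    printf("queued %d bios, draining one leaves %d\n", queued, left);
    return 0;
}

The two thresholds give hysteresis: a producer that was put to sleep does not wake up the moment a single bio is drained, which avoids thrashing around one threshold.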
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 280a13846e6c..74374fb762aa 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/list.h> | 39 | #include <linux/list.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
41 | #include <linux/freezer.h> | 41 | #include <linux/freezer.h> |
42 | #include <linux/bitmap.h> | ||
42 | 43 | ||
43 | #include <xen/events.h> | 44 | #include <xen/events.h> |
44 | #include <xen/page.h> | 45 | #include <xen/page.h> |
@@ -79,6 +80,7 @@ struct pending_req { | |||
79 | unsigned short operation; | 80 | unsigned short operation; |
80 | int status; | 81 | int status; |
81 | struct list_head free_list; | 82 | struct list_head free_list; |
83 | DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
82 | }; | 84 | }; |
83 | 85 | ||
84 | #define BLKBACK_INVALID_HANDLE (~0) | 86 | #define BLKBACK_INVALID_HANDLE (~0) |
@@ -99,6 +101,36 @@ struct xen_blkbk { | |||
99 | static struct xen_blkbk *blkbk; | 101 | static struct xen_blkbk *blkbk; |
100 | 102 | ||
101 | /* | 103 | /* |
104 | * Maximum number of grant pages that can be mapped in blkback. | ||
105 | * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of | ||
106 | * pages that blkback will persistently map. | ||
107 | * Currently, this is: | ||
108 | * RING_SIZE = 32 (for all known ring types) | ||
109 | * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 | ||
110 | * sizeof(struct persistent_gnt) = 48 | ||
111 | * So the maximum memory used to store the grants is: | ||
112 | * 32 * 11 * 48 = 16896 bytes | ||
113 | */ | ||
114 | static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) | ||
115 | { | ||
116 | switch (protocol) { | ||
117 | case BLKIF_PROTOCOL_NATIVE: | ||
118 | return __CONST_RING_SIZE(blkif, PAGE_SIZE) * | ||
119 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
120 | case BLKIF_PROTOCOL_X86_32: | ||
121 | return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * | ||
122 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
123 | case BLKIF_PROTOCOL_X86_64: | ||
124 | return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * | ||
125 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
126 | default: | ||
127 | BUG(); | ||
128 | } | ||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | |||
133 | /* | ||
102 | * Little helpful macro to figure out the index and virtual address of the | 134 | * Little helpful macro to figure out the index and virtual address of the |
103 | * pending_pages[..]. For each 'pending_req' we have have up to | 135 | * pending_pages[..]. For each 'pending_req' we have have up to |
104 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through | 136 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through |
@@ -129,6 +161,90 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
129 | static void make_response(struct xen_blkif *blkif, u64 id, | 161 | static void make_response(struct xen_blkif *blkif, u64 id, |
130 | unsigned short op, int st); | 162 | unsigned short op, int st); |
131 | 163 | ||
164 | #define foreach_grant(pos, rbtree, node) \ | ||
165 | for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \ | ||
166 | &(pos)->node != NULL; \ | ||
167 | (pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node)) | ||
168 | |||
169 | |||
170 | static void add_persistent_gnt(struct rb_root *root, | ||
171 | struct persistent_gnt *persistent_gnt) | ||
172 | { | ||
173 | struct rb_node **new = &(root->rb_node), *parent = NULL; | ||
174 | struct persistent_gnt *this; | ||
175 | |||
176 | /* Figure out where to put new node */ | ||
177 | while (*new) { | ||
178 | this = container_of(*new, struct persistent_gnt, node); | ||
179 | |||
180 | parent = *new; | ||
181 | if (persistent_gnt->gnt < this->gnt) | ||
182 | new = &((*new)->rb_left); | ||
183 | else if (persistent_gnt->gnt > this->gnt) | ||
184 | new = &((*new)->rb_right); | ||
185 | else { | ||
186 | pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); | ||
187 | BUG(); | ||
188 | } | ||
189 | } | ||
190 | |||
191 | /* Add new node and rebalance tree. */ | ||
192 | rb_link_node(&(persistent_gnt->node), parent, new); | ||
193 | rb_insert_color(&(persistent_gnt->node), root); | ||
194 | } | ||
195 | |||
196 | static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | ||
197 | grant_ref_t gref) | ||
198 | { | ||
199 | struct persistent_gnt *data; | ||
200 | struct rb_node *node = root->rb_node; | ||
201 | |||
202 | while (node) { | ||
203 | data = container_of(node, struct persistent_gnt, node); | ||
204 | |||
205 | if (gref < data->gnt) | ||
206 | node = node->rb_left; | ||
207 | else if (gref > data->gnt) | ||
208 | node = node->rb_right; | ||
209 | else | ||
210 | return data; | ||
211 | } | ||
212 | return NULL; | ||
213 | } | ||
214 | |||
215 | static void free_persistent_gnts(struct rb_root *root, unsigned int num) | ||
216 | { | ||
217 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
218 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
219 | struct persistent_gnt *persistent_gnt; | ||
220 | int ret = 0; | ||
221 | int segs_to_unmap = 0; | ||
222 | |||
223 | foreach_grant(persistent_gnt, root, node) { | ||
224 | BUG_ON(persistent_gnt->handle == | ||
225 | BLKBACK_INVALID_HANDLE); | ||
226 | gnttab_set_unmap_op(&unmap[segs_to_unmap], | ||
227 | (unsigned long) pfn_to_kaddr(page_to_pfn( | ||
228 | persistent_gnt->page)), | ||
229 | GNTMAP_host_map, | ||
230 | persistent_gnt->handle); | ||
231 | |||
232 | pages[segs_to_unmap] = persistent_gnt->page; | ||
233 | rb_erase(&persistent_gnt->node, root); | ||
234 | kfree(persistent_gnt); | ||
235 | num--; | ||
236 | |||
237 | if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || | ||
238 | !rb_next(&persistent_gnt->node)) { | ||
239 | ret = gnttab_unmap_refs(unmap, NULL, pages, | ||
240 | segs_to_unmap); | ||
241 | BUG_ON(ret); | ||
242 | segs_to_unmap = 0; | ||
243 | } | ||
244 | } | ||
245 | BUG_ON(num != 0); | ||
246 | } | ||
247 | |||
132 | /* | 248 | /* |
133 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. | 249 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. |
134 | */ | 250 | */ |
@@ -302,6 +418,14 @@ int xen_blkif_schedule(void *arg) | |||
302 | print_stats(blkif); | 418 | print_stats(blkif); |
303 | } | 419 | } |
304 | 420 | ||
421 | /* Free all persistent grant pages */ | ||
422 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) | ||
423 | free_persistent_gnts(&blkif->persistent_gnts, | ||
424 | blkif->persistent_gnt_c); | ||
425 | |||
426 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); | ||
427 | blkif->persistent_gnt_c = 0; | ||
428 | |||
305 | if (log_stats) | 429 | if (log_stats) |
306 | print_stats(blkif); | 430 | print_stats(blkif); |
307 | 431 | ||
@@ -328,6 +452,8 @@ static void xen_blkbk_unmap(struct pending_req *req) | |||
328 | int ret; | 452 | int ret; |
329 | 453 | ||
330 | for (i = 0; i < req->nr_pages; i++) { | 454 | for (i = 0; i < req->nr_pages; i++) { |
455 | if (!test_bit(i, req->unmap_seg)) | ||
456 | continue; | ||
331 | handle = pending_handle(req, i); | 457 | handle = pending_handle(req, i); |
332 | if (handle == BLKBACK_INVALID_HANDLE) | 458 | if (handle == BLKBACK_INVALID_HANDLE) |
333 | continue; | 459 | continue; |
@@ -344,12 +470,26 @@ static void xen_blkbk_unmap(struct pending_req *req) | |||
344 | 470 | ||
345 | static int xen_blkbk_map(struct blkif_request *req, | 471 | static int xen_blkbk_map(struct blkif_request *req, |
346 | struct pending_req *pending_req, | 472 | struct pending_req *pending_req, |
347 | struct seg_buf seg[]) | 473 | struct seg_buf seg[], |
474 | struct page *pages[]) | ||
348 | { | 475 | { |
349 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 476 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
350 | int i; | 477 | struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
478 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
479 | struct persistent_gnt *persistent_gnt = NULL; | ||
480 | struct xen_blkif *blkif = pending_req->blkif; | ||
481 | phys_addr_t addr = 0; | ||
482 | int i, j; | ||
483 | bool new_map; | ||
351 | int nseg = req->u.rw.nr_segments; | 484 | int nseg = req->u.rw.nr_segments; |
485 | int segs_to_map = 0; | ||
352 | int ret = 0; | 486 | int ret = 0; |
487 | int use_persistent_gnts; | ||
488 | |||
489 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); | ||
490 | |||
491 | BUG_ON(blkif->persistent_gnt_c > | ||
492 | max_mapped_grant_pages(pending_req->blkif->blk_protocol)); | ||
353 | 493 | ||
354 | /* | 494 | /* |
355 | * Fill out preq.nr_sects with proper amount of sectors, and setup | 495 | * Fill out preq.nr_sects with proper amount of sectors, and setup |
@@ -359,36 +499,146 @@ static int xen_blkbk_map(struct blkif_request *req, | |||
359 | for (i = 0; i < nseg; i++) { | 499 | for (i = 0; i < nseg; i++) { |
360 | uint32_t flags; | 500 | uint32_t flags; |
361 | 501 | ||
362 | flags = GNTMAP_host_map; | 502 | if (use_persistent_gnts) |
363 | if (pending_req->operation != BLKIF_OP_READ) | 503 | persistent_gnt = get_persistent_gnt( |
364 | flags |= GNTMAP_readonly; | 504 | &blkif->persistent_gnts, |
365 | gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, | 505 | req->u.rw.seg[i].gref); |
366 | req->u.rw.seg[i].gref, | 506 | |
367 | pending_req->blkif->domid); | 507 | if (persistent_gnt) { |
508 | /* | ||
509 | * We are using persistent grants and | ||
510 | * the grant is already mapped | ||
511 | */ | ||
512 | new_map = false; | ||
513 | } else if (use_persistent_gnts && | ||
514 | blkif->persistent_gnt_c < | ||
515 | max_mapped_grant_pages(blkif->blk_protocol)) { | ||
516 | /* | ||
517 | * We are using persistent grants, the grant is | ||
518 | * not mapped but we have room for it | ||
519 | */ | ||
520 | new_map = true; | ||
521 | persistent_gnt = kmalloc( | ||
522 | sizeof(struct persistent_gnt), | ||
523 | GFP_KERNEL); | ||
524 | if (!persistent_gnt) | ||
525 | return -ENOMEM; | ||
526 | persistent_gnt->page = alloc_page(GFP_KERNEL); | ||
527 | if (!persistent_gnt->page) { | ||
528 | kfree(persistent_gnt); | ||
529 | return -ENOMEM; | ||
530 | } | ||
531 | persistent_gnt->gnt = req->u.rw.seg[i].gref; | ||
532 | persistent_gnt->handle = BLKBACK_INVALID_HANDLE; | ||
533 | |||
534 | pages_to_gnt[segs_to_map] = | ||
535 | persistent_gnt->page; | ||
536 | addr = (unsigned long) pfn_to_kaddr( | ||
537 | page_to_pfn(persistent_gnt->page)); | ||
538 | |||
539 | add_persistent_gnt(&blkif->persistent_gnts, | ||
540 | persistent_gnt); | ||
541 | blkif->persistent_gnt_c++; | ||
542 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
543 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
544 | max_mapped_grant_pages(blkif->blk_protocol)); | ||
545 | } else { | ||
546 | /* | ||
547 | * We are either using persistent grants and | ||
548 | * hit the maximum limit of grants mapped, | ||
549 | * or we are not using persistent grants. | ||
550 | */ | ||
551 | if (use_persistent_gnts && | ||
552 | !blkif->vbd.overflow_max_grants) { | ||
553 | blkif->vbd.overflow_max_grants = 1; | ||
554 | pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
555 | blkif->domid, blkif->vbd.handle); | ||
556 | } | ||
557 | new_map = true; | ||
558 | pages[i] = blkbk->pending_page(pending_req, i); | ||
559 | addr = vaddr(pending_req, i); | ||
560 | pages_to_gnt[segs_to_map] = | ||
561 | blkbk->pending_page(pending_req, i); | ||
562 | } | ||
563 | |||
564 | if (persistent_gnt) { | ||
565 | pages[i] = persistent_gnt->page; | ||
566 | persistent_gnts[i] = persistent_gnt; | ||
567 | } else { | ||
568 | persistent_gnts[i] = NULL; | ||
569 | } | ||
570 | |||
571 | if (new_map) { | ||
572 | flags = GNTMAP_host_map; | ||
573 | if (!persistent_gnt && | ||
574 | (pending_req->operation != BLKIF_OP_READ)) | ||
575 | flags |= GNTMAP_readonly; | ||
576 | gnttab_set_map_op(&map[segs_to_map++], addr, | ||
577 | flags, req->u.rw.seg[i].gref, | ||
578 | blkif->domid); | ||
579 | } | ||
368 | } | 580 | } |
369 | 581 | ||
370 | ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg); | 582 | if (segs_to_map) { |
371 | BUG_ON(ret); | 583 | ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); |
584 | BUG_ON(ret); | ||
585 | } | ||
372 | 586 | ||
373 | /* | 587 | /* |
374 | * Now swizzle the MFN in our domain with the MFN from the other domain | 588 | * Now swizzle the MFN in our domain with the MFN from the other domain |
375 | * so that when we access vaddr(pending_req,i) it has the contents of | 589 | * so that when we access vaddr(pending_req,i) it has the contents of |
376 | * the page from the other domain. | 590 | * the page from the other domain. |
377 | */ | 591 | */ |
378 | for (i = 0; i < nseg; i++) { | 592 | bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); |
379 | if (unlikely(map[i].status != 0)) { | 593 | for (i = 0, j = 0; i < nseg; i++) { |
380 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | 594 | if (!persistent_gnts[i] || |
381 | map[i].handle = BLKBACK_INVALID_HANDLE; | 595 | persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { |
382 | ret |= 1; | 596 | /* This is a newly mapped grant */ |
597 | BUG_ON(j >= segs_to_map); | ||
598 | if (unlikely(map[j].status != 0)) { | ||
599 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | ||
600 | map[j].handle = BLKBACK_INVALID_HANDLE; | ||
601 | ret |= 1; | ||
602 | if (persistent_gnts[i]) { | ||
603 | rb_erase(&persistent_gnts[i]->node, | ||
604 | &blkif->persistent_gnts); | ||
605 | blkif->persistent_gnt_c--; | ||
606 | kfree(persistent_gnts[i]); | ||
607 | persistent_gnts[i] = NULL; | ||
608 | } | ||
609 | } | ||
610 | } | ||
611 | if (persistent_gnts[i]) { | ||
612 | if (persistent_gnts[i]->handle == | ||
613 | BLKBACK_INVALID_HANDLE) { | ||
614 | /* | ||
615 | * If this is a new persistent grant | ||
616 | * save the handle | ||
617 | */ | ||
618 | persistent_gnts[i]->handle = map[j].handle; | ||
619 | persistent_gnts[i]->dev_bus_addr = | ||
620 | map[j++].dev_bus_addr; | ||
621 | } | ||
622 | pending_handle(pending_req, i) = | ||
623 | persistent_gnts[i]->handle; | ||
624 | |||
625 | if (ret) | ||
626 | continue; | ||
627 | |||
628 | seg[i].buf = persistent_gnts[i]->dev_bus_addr | | ||
629 | (req->u.rw.seg[i].first_sect << 9); | ||
630 | } else { | ||
631 | pending_handle(pending_req, i) = map[j].handle; | ||
632 | bitmap_set(pending_req->unmap_seg, i, 1); | ||
633 | |||
634 | if (ret) { | ||
635 | j++; | ||
636 | continue; | ||
637 | } | ||
638 | |||
639 | seg[i].buf = map[j++].dev_bus_addr | | ||
640 | (req->u.rw.seg[i].first_sect << 9); | ||
383 | } | 641 | } |
384 | |||
385 | pending_handle(pending_req, i) = map[i].handle; | ||
386 | |||
387 | if (ret) | ||
388 | continue; | ||
389 | |||
390 | seg[i].buf = map[i].dev_bus_addr | | ||
391 | (req->u.rw.seg[i].first_sect << 9); | ||
392 | } | 642 | } |
393 | return ret; | 643 | return ret; |
394 | } | 644 | } |
@@ -591,6 +841,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
591 | int operation; | 841 | int operation; |
592 | struct blk_plug plug; | 842 | struct blk_plug plug; |
593 | bool drain = false; | 843 | bool drain = false; |
844 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
594 | 845 | ||
595 | switch (req->operation) { | 846 | switch (req->operation) { |
596 | case BLKIF_OP_READ: | 847 | case BLKIF_OP_READ: |
@@ -677,7 +928,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
677 | * the hypercall to unmap the grants - that is all done in | 928 | * the hypercall to unmap the grants - that is all done in |
678 | * xen_blkbk_unmap. | 929 | * xen_blkbk_unmap. |
679 | */ | 930 | */ |
680 | if (xen_blkbk_map(req, pending_req, seg)) | 931 | if (xen_blkbk_map(req, pending_req, seg, pages)) |
681 | goto fail_flush; | 932 | goto fail_flush; |
682 | 933 | ||
683 | /* | 934 | /* |
@@ -689,7 +940,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
689 | for (i = 0; i < nseg; i++) { | 940 | for (i = 0; i < nseg; i++) { |
690 | while ((bio == NULL) || | 941 | while ((bio == NULL) || |
691 | (bio_add_page(bio, | 942 | (bio_add_page(bio, |
692 | blkbk->pending_page(pending_req, i), | 943 | pages[i], |
693 | seg[i].nsec << 9, | 944 | seg[i].nsec << 9, |
694 | seg[i].buf & ~PAGE_MASK) == 0)) { | 945 | seg[i].buf & ~PAGE_MASK) == 0)) { |
695 | 946 | ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 9a54623e52d7..6072390c7f57 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/wait.h> | 35 | #include <linux/wait.h> |
36 | #include <linux/io.h> | 36 | #include <linux/io.h> |
37 | #include <linux/rbtree.h> | ||
37 | #include <asm/setup.h> | 38 | #include <asm/setup.h> |
38 | #include <asm/pgalloc.h> | 39 | #include <asm/pgalloc.h> |
39 | #include <asm/hypervisor.h> | 40 | #include <asm/hypervisor.h> |
@@ -160,10 +161,21 @@ struct xen_vbd { | |||
160 | sector_t size; | 161 | sector_t size; |
161 | unsigned int flush_support:1; | 162 | unsigned int flush_support:1; |
162 | unsigned int discard_secure:1; | 163 | unsigned int discard_secure:1; |
164 | unsigned int feature_gnt_persistent:1; | ||
165 | unsigned int overflow_max_grants:1; | ||
163 | }; | 166 | }; |
164 | 167 | ||
165 | struct backend_info; | 168 | struct backend_info; |
166 | 169 | ||
170 | |||
171 | struct persistent_gnt { | ||
172 | struct page *page; | ||
173 | grant_ref_t gnt; | ||
174 | grant_handle_t handle; | ||
175 | uint64_t dev_bus_addr; | ||
176 | struct rb_node node; | ||
177 | }; | ||
178 | |||
167 | struct xen_blkif { | 179 | struct xen_blkif { |
168 | /* Unique identifier for this interface. */ | 180 | /* Unique identifier for this interface. */ |
169 | domid_t domid; | 181 | domid_t domid; |
@@ -190,6 +202,10 @@ struct xen_blkif { | |||
190 | struct task_struct *xenblkd; | 202 | struct task_struct *xenblkd; |
191 | unsigned int waiting_reqs; | 203 | unsigned int waiting_reqs; |
192 | 204 | ||
205 | /* tree to store persistent grants */ | ||
206 | struct rb_root persistent_gnts; | ||
207 | unsigned int persistent_gnt_c; | ||
208 | |||
193 | /* statistics */ | 209 | /* statistics */ |
194 | unsigned long st_print; | 210 | unsigned long st_print; |
195 | int st_rd_req; | 211 | int st_rd_req; |
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f58434c2617c..63980722db41 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c | |||
@@ -117,6 +117,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
117 | atomic_set(&blkif->drain, 0); | 117 | atomic_set(&blkif->drain, 0); |
118 | blkif->st_print = jiffies; | 118 | blkif->st_print = jiffies; |
119 | init_waitqueue_head(&blkif->waiting_to_free); | 119 | init_waitqueue_head(&blkif->waiting_to_free); |
120 | blkif->persistent_gnts.rb_node = NULL; | ||
120 | 121 | ||
121 | return blkif; | 122 | return blkif; |
122 | } | 123 | } |
@@ -672,6 +673,13 @@ again: | |||
672 | 673 | ||
673 | xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); | 674 | xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); |
674 | 675 | ||
676 | err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1); | ||
677 | if (err) { | ||
678 | xenbus_dev_fatal(dev, err, "writing %s/feature-persistent", | ||
679 | dev->nodename); | ||
680 | goto abort; | ||
681 | } | ||
682 | |||
675 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", | 683 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", |
676 | (unsigned long long)vbd_sz(&be->blkif->vbd)); | 684 | (unsigned long long)vbd_sz(&be->blkif->vbd)); |
677 | if (err) { | 685 | if (err) { |
@@ -720,6 +728,7 @@ static int connect_ring(struct backend_info *be) | |||
720 | struct xenbus_device *dev = be->dev; | 728 | struct xenbus_device *dev = be->dev; |
721 | unsigned long ring_ref; | 729 | unsigned long ring_ref; |
722 | unsigned int evtchn; | 730 | unsigned int evtchn; |
731 | unsigned int pers_grants; | ||
723 | char protocol[64] = ""; | 732 | char protocol[64] = ""; |
724 | int err; | 733 | int err; |
725 | 734 | ||
@@ -749,8 +758,18 @@ static int connect_ring(struct backend_info *be) | |||
749 | xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); | 758 | xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); |
750 | return -1; | 759 | return -1; |
751 | } | 760 | } |
752 | pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n", | 761 | err = xenbus_gather(XBT_NIL, dev->otherend, |
753 | ring_ref, evtchn, be->blkif->blk_protocol, protocol); | 762 | "feature-persistent", "%u", |
763 | &pers_grants, NULL); | ||
764 | if (err) | ||
765 | pers_grants = 0; | ||
766 | |||
767 | be->blkif->vbd.feature_gnt_persistent = pers_grants; | ||
768 | be->blkif->vbd.overflow_max_grants = 0; | ||
769 | |||
770 | pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", | ||
771 | ring_ref, evtchn, be->blkif->blk_protocol, protocol, | ||
772 | pers_grants ? "persistent grants" : ""); | ||
754 | 773 | ||
755 | /* Map the shared frame, irq etc. */ | 774 | /* Map the shared frame, irq etc. */ |
756 | err = xen_blkif_map(be->blkif, ring_ref, evtchn); | 775 | err = xen_blkif_map(be->blkif, ring_ref, evtchn); |
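The xenbus changes negotiate the feature over xenstore: the backend advertises feature-persistent in its own directory and reads the frontend's key with xenbus_gather(), treating a read failure simply as "frontend does not support it" rather than as a fatal error, and only then prints the "persistent grants" suffix. A small user-space sketch of that default-to-off negotiation is below; peer_read_int() and its key table are hypothetical stand-ins for the xenstore read, not a real API.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for reading an integer key from the peer's
 * xenstore directory; returns 0 on success, -1 if the key is absent. */
static int peer_read_int(const char *key, unsigned int *val)
{
    if (strcmp(key, "feature-persistent") == 0) {
        *val = 1;               /* pretend the frontend wrote "1" */
        return 0;
    }
    return -1;
}

/* Mirrors the logic added to connect_ring(): a missing or unreadable
 * key means the peer does not support the feature, so default to off
 * instead of failing the connection. */
static unsigned int negotiate_persistent_grants(void)
{
    unsigned int pers_grants;

    if (peer_read_int("feature-persistent", &pers_grants))
        pers_grants = 0;
    return pers_grants;
}

int main(void)
{
    printf("persistent grants: %s\n",
           negotiate_persistent_grants() ? "enabled" : "disabled");
    return 0;
}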
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 007db8986e84..96e9b00db081 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/mutex.h> | 44 | #include <linux/mutex.h> |
45 | #include <linux/scatterlist.h> | 45 | #include <linux/scatterlist.h> |
46 | #include <linux/bitmap.h> | 46 | #include <linux/bitmap.h> |
47 | #include <linux/llist.h> | ||
47 | 48 | ||
48 | #include <xen/xen.h> | 49 | #include <xen/xen.h> |
49 | #include <xen/xenbus.h> | 50 | #include <xen/xenbus.h> |
@@ -64,10 +65,17 @@ enum blkif_state { | |||
64 | BLKIF_STATE_SUSPENDED, | 65 | BLKIF_STATE_SUSPENDED, |
65 | }; | 66 | }; |
66 | 67 | ||
68 | struct grant { | ||
69 | grant_ref_t gref; | ||
70 | unsigned long pfn; | ||
71 | struct llist_node node; | ||
72 | }; | ||
73 | |||
67 | struct blk_shadow { | 74 | struct blk_shadow { |
68 | struct blkif_request req; | 75 | struct blkif_request req; |
69 | struct request *request; | 76 | struct request *request; |
70 | unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 77 | unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
78 | struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
71 | }; | 79 | }; |
72 | 80 | ||
73 | static DEFINE_MUTEX(blkfront_mutex); | 81 | static DEFINE_MUTEX(blkfront_mutex); |
@@ -97,6 +105,8 @@ struct blkfront_info | |||
97 | struct work_struct work; | 105 | struct work_struct work; |
98 | struct gnttab_free_callback callback; | 106 | struct gnttab_free_callback callback; |
99 | struct blk_shadow shadow[BLK_RING_SIZE]; | 107 | struct blk_shadow shadow[BLK_RING_SIZE]; |
108 | struct llist_head persistent_gnts; | ||
109 | unsigned int persistent_gnts_c; | ||
100 | unsigned long shadow_free; | 110 | unsigned long shadow_free; |
101 | unsigned int feature_flush; | 111 | unsigned int feature_flush; |
102 | unsigned int flush_op; | 112 | unsigned int flush_op; |
@@ -104,6 +114,7 @@ struct blkfront_info | |||
104 | unsigned int feature_secdiscard:1; | 114 | unsigned int feature_secdiscard:1; |
105 | unsigned int discard_granularity; | 115 | unsigned int discard_granularity; |
106 | unsigned int discard_alignment; | 116 | unsigned int discard_alignment; |
117 | unsigned int feature_persistent:1; | ||
107 | int is_ready; | 118 | int is_ready; |
108 | }; | 119 | }; |
109 | 120 | ||
@@ -287,21 +298,36 @@ static int blkif_queue_request(struct request *req) | |||
287 | unsigned long id; | 298 | unsigned long id; |
288 | unsigned int fsect, lsect; | 299 | unsigned int fsect, lsect; |
289 | int i, ref; | 300 | int i, ref; |
301 | |||
302 | /* | ||
303 | * Used to store if we are able to queue the request by just using | ||
304 | * existing persistent grants, or if we have to get new grants, | ||
305 | * as there are not sufficiently many free. | ||
306 | */ | ||
307 | bool new_persistent_gnts; | ||
290 | grant_ref_t gref_head; | 308 | grant_ref_t gref_head; |
309 | struct page *granted_page; | ||
310 | struct grant *gnt_list_entry = NULL; | ||
291 | struct scatterlist *sg; | 311 | struct scatterlist *sg; |
292 | 312 | ||
293 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 313 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
294 | return 1; | 314 | return 1; |
295 | 315 | ||
296 | if (gnttab_alloc_grant_references( | 316 | /* Check if we have enough grants to allocate a request */ |
297 | BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { | 317 | if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { |
298 | gnttab_request_free_callback( | 318 | new_persistent_gnts = 1; |
299 | &info->callback, | 319 | if (gnttab_alloc_grant_references( |
300 | blkif_restart_queue_callback, | 320 | BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, |
301 | info, | 321 | &gref_head) < 0) { |
302 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 322 | gnttab_request_free_callback( |
303 | return 1; | 323 | &info->callback, |
304 | } | 324 | blkif_restart_queue_callback, |
325 | info, | ||
326 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
327 | return 1; | ||
328 | } | ||
329 | } else | ||
330 | new_persistent_gnts = 0; | ||
305 | 331 | ||
306 | /* Fill out a communications ring structure. */ | 332 | /* Fill out a communications ring structure. */ |
307 | ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | 333 | ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); |
@@ -341,18 +367,73 @@ static int blkif_queue_request(struct request *req) | |||
341 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 367 | BLKIF_MAX_SEGMENTS_PER_REQUEST); |
342 | 368 | ||
343 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { | 369 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { |
344 | buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); | ||
345 | fsect = sg->offset >> 9; | 370 | fsect = sg->offset >> 9; |
346 | lsect = fsect + (sg->length >> 9) - 1; | 371 | lsect = fsect + (sg->length >> 9) - 1; |
347 | /* install a grant reference. */ | ||
348 | ref = gnttab_claim_grant_reference(&gref_head); | ||
349 | BUG_ON(ref == -ENOSPC); | ||
350 | 372 | ||
351 | gnttab_grant_foreign_access_ref( | 373 | if (info->persistent_gnts_c) { |
352 | ref, | 374 | BUG_ON(llist_empty(&info->persistent_gnts)); |
375 | gnt_list_entry = llist_entry( | ||
376 | llist_del_first(&info->persistent_gnts), | ||
377 | struct grant, node); | ||
378 | |||
379 | ref = gnt_list_entry->gref; | ||
380 | buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); | ||
381 | info->persistent_gnts_c--; | ||
382 | } else { | ||
383 | ref = gnttab_claim_grant_reference(&gref_head); | ||
384 | BUG_ON(ref == -ENOSPC); | ||
385 | |||
386 | gnt_list_entry = | ||
387 | kmalloc(sizeof(struct grant), | ||
388 | GFP_ATOMIC); | ||
389 | if (!gnt_list_entry) | ||
390 | return -ENOMEM; | ||
391 | |||
392 | granted_page = alloc_page(GFP_ATOMIC); | ||
393 | if (!granted_page) { | ||
394 | kfree(gnt_list_entry); | ||
395 | return -ENOMEM; | ||
396 | } | ||
397 | |||
398 | gnt_list_entry->pfn = | ||
399 | page_to_pfn(granted_page); | ||
400 | gnt_list_entry->gref = ref; | ||
401 | |||
402 | buffer_mfn = pfn_to_mfn(page_to_pfn( | ||
403 | granted_page)); | ||
404 | gnttab_grant_foreign_access_ref(ref, | ||
353 | info->xbdev->otherend_id, | 405 | info->xbdev->otherend_id, |
354 | buffer_mfn, | 406 | buffer_mfn, 0); |
355 | rq_data_dir(req)); | 407 | } |
408 | |||
409 | info->shadow[id].grants_used[i] = gnt_list_entry; | ||
410 | |||
411 | if (rq_data_dir(req)) { | ||
412 | char *bvec_data; | ||
413 | void *shared_data; | ||
414 | |||
415 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); | ||
416 | |||
417 | shared_data = kmap_atomic( | ||
418 | pfn_to_page(gnt_list_entry->pfn)); | ||
419 | bvec_data = kmap_atomic(sg_page(sg)); | ||
420 | |||
421 | /* | ||
422 | * this does not wipe data stored outside the | ||
423 | * range sg->offset..sg->offset+sg->length. | ||
424 | * Therefore, blkback *could* see data from | ||
425 | * previous requests. This is OK as long as | ||
426 | * persistent grants are shared with just one | ||
427 | * domain. It may need refactoring if this | ||
428 | * changes | ||
429 | */ | ||
430 | memcpy(shared_data + sg->offset, | ||
431 | bvec_data + sg->offset, | ||
432 | sg->length); | ||
433 | |||
434 | kunmap_atomic(bvec_data); | ||
435 | kunmap_atomic(shared_data); | ||
436 | } | ||
356 | 437 | ||
357 | info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); | 438 | info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); |
358 | ring_req->u.rw.seg[i] = | 439 | ring_req->u.rw.seg[i] = |
@@ -368,7 +449,8 @@ static int blkif_queue_request(struct request *req) | |||
368 | /* Keep a private copy so we can reissue requests when recovering. */ | 449 | /* Keep a private copy so we can reissue requests when recovering. */ |
369 | info->shadow[id].req = *ring_req; | 450 | info->shadow[id].req = *ring_req; |
370 | 451 | ||
371 | gnttab_free_grant_references(gref_head); | 452 | if (new_persistent_gnts) |
453 | gnttab_free_grant_references(gref_head); | ||
372 | 454 | ||
373 | return 0; | 455 | return 0; |
374 | } | 456 | } |
@@ -480,12 +562,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
480 | static void xlvbd_flush(struct blkfront_info *info) | 562 | static void xlvbd_flush(struct blkfront_info *info) |
481 | { | 563 | { |
482 | blk_queue_flush(info->rq, info->feature_flush); | 564 | blk_queue_flush(info->rq, info->feature_flush); |
483 | printk(KERN_INFO "blkfront: %s: %s: %s\n", | 565 | printk(KERN_INFO "blkfront: %s: %s: %s %s\n", |
484 | info->gd->disk_name, | 566 | info->gd->disk_name, |
485 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? | 567 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? |
486 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? | 568 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? |
487 | "flush diskcache" : "barrier or flush"), | 569 | "flush diskcache" : "barrier or flush"), |
488 | info->feature_flush ? "enabled" : "disabled"); | 570 | info->feature_flush ? "enabled" : "disabled", |
571 | info->feature_persistent ? "using persistent grants" : ""); | ||
489 | } | 572 | } |
490 | 573 | ||
491 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) | 574 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
@@ -707,6 +790,9 @@ static void blkif_restart_queue(struct work_struct *work) | |||
707 | 790 | ||
708 | static void blkif_free(struct blkfront_info *info, int suspend) | 791 | static void blkif_free(struct blkfront_info *info, int suspend) |
709 | { | 792 | { |
793 | struct llist_node *all_gnts; | ||
794 | struct grant *persistent_gnt; | ||
795 | |||
710 | /* Prevent new requests being issued until we fix things up. */ | 796 | /* Prevent new requests being issued until we fix things up. */ |
711 | spin_lock_irq(&info->io_lock); | 797 | spin_lock_irq(&info->io_lock); |
712 | info->connected = suspend ? | 798 | info->connected = suspend ? |
@@ -714,6 +800,18 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
714 | /* No more blkif_request(). */ | 800 | /* No more blkif_request(). */ |
715 | if (info->rq) | 801 | if (info->rq) |
716 | blk_stop_queue(info->rq); | 802 | blk_stop_queue(info->rq); |
803 | |||
804 | /* Remove all persistent grants */ | ||
805 | if (info->persistent_gnts_c) { | ||
806 | all_gnts = llist_del_all(&info->persistent_gnts); | ||
807 | llist_for_each_entry(persistent_gnt, all_gnts, node) { | ||
808 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
809 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
810 | kfree(persistent_gnt); | ||
811 | } | ||
812 | info->persistent_gnts_c = 0; | ||
813 | } | ||
814 | |||
717 | /* No more gnttab callback work. */ | 815 | /* No more gnttab callback work. */ |
718 | gnttab_cancel_free_callback(&info->callback); | 816 | gnttab_cancel_free_callback(&info->callback); |
719 | spin_unlock_irq(&info->io_lock); | 817 | spin_unlock_irq(&info->io_lock); |
@@ -734,13 +832,43 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
734 | 832 | ||
735 | } | 833 | } |
736 | 834 | ||
737 | static void blkif_completion(struct blk_shadow *s) | 835 | static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, |
836 | struct blkif_response *bret) | ||
738 | { | 837 | { |
739 | int i; | 838 | int i; |
740 | /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place | 839 | struct bio_vec *bvec; |
741 | * flag. */ | 840 | struct req_iterator iter; |
742 | for (i = 0; i < s->req.u.rw.nr_segments; i++) | 841 | unsigned long flags; |
743 | gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); | 842 | char *bvec_data; |
843 | void *shared_data; | ||
844 | unsigned int offset = 0; | ||
845 | |||
846 | if (bret->operation == BLKIF_OP_READ) { | ||
847 | /* | ||
848 | * Copy the data received from the backend into the bvec. | ||
849 | * Since bv_offset can be different from 0, and bv_len different | ||
850 | * from PAGE_SIZE, we have to keep track of the current offset, | ||
851 | * to be sure we are copying the data from the right shared page. | ||
852 | */ | ||
853 | rq_for_each_segment(bvec, s->request, iter) { | ||
854 | BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); | ||
855 | i = offset >> PAGE_SHIFT; | ||
856 | BUG_ON(i >= s->req.u.rw.nr_segments); | ||
857 | shared_data = kmap_atomic( | ||
858 | pfn_to_page(s->grants_used[i]->pfn)); | ||
859 | bvec_data = bvec_kmap_irq(bvec, &flags); | ||
860 | memcpy(bvec_data, shared_data + bvec->bv_offset, | ||
861 | bvec->bv_len); | ||
862 | bvec_kunmap_irq(bvec_data, &flags); | ||
863 | kunmap_atomic(shared_data); | ||
864 | offset += bvec->bv_len; | ||
865 | } | ||
866 | } | ||
867 | /* Add the persistent grant into the list of free grants */ | ||
868 | for (i = 0; i < s->req.u.rw.nr_segments; i++) { | ||
869 | llist_add(&s->grants_used[i]->node, &info->persistent_gnts); | ||
870 | info->persistent_gnts_c++; | ||
871 | } | ||
744 | } | 872 | } |
745 | 873 | ||
746 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 874 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
@@ -783,7 +911,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) | |||
783 | req = info->shadow[id].request; | 911 | req = info->shadow[id].request; |
784 | 912 | ||
785 | if (bret->operation != BLKIF_OP_DISCARD) | 913 | if (bret->operation != BLKIF_OP_DISCARD) |
786 | blkif_completion(&info->shadow[id]); | 914 | blkif_completion(&info->shadow[id], info, bret); |
787 | 915 | ||
788 | if (add_id_to_freelist(info, id)) { | 916 | if (add_id_to_freelist(info, id)) { |
789 | WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", | 917 | WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", |
@@ -942,6 +1070,11 @@ again: | |||
942 | message = "writing protocol"; | 1070 | message = "writing protocol"; |
943 | goto abort_transaction; | 1071 | goto abort_transaction; |
944 | } | 1072 | } |
1073 | err = xenbus_printf(xbt, dev->nodename, | ||
1074 | "feature-persistent", "%u", 1); | ||
1075 | if (err) | ||
1076 | dev_warn(&dev->dev, | ||
1077 | "writing persistent grants feature to xenbus"); | ||
945 | 1078 | ||
946 | err = xenbus_transaction_end(xbt, 0); | 1079 | err = xenbus_transaction_end(xbt, 0); |
947 | if (err) { | 1080 | if (err) { |
@@ -1029,6 +1162,8 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1029 | spin_lock_init(&info->io_lock); | 1162 | spin_lock_init(&info->io_lock); |
1030 | info->xbdev = dev; | 1163 | info->xbdev = dev; |
1031 | info->vdevice = vdevice; | 1164 | info->vdevice = vdevice; |
1165 | init_llist_head(&info->persistent_gnts); | ||
1166 | info->persistent_gnts_c = 0; | ||
1032 | info->connected = BLKIF_STATE_DISCONNECTED; | 1167 | info->connected = BLKIF_STATE_DISCONNECTED; |
1033 | INIT_WORK(&info->work, blkif_restart_queue); | 1168 | INIT_WORK(&info->work, blkif_restart_queue); |
1034 | 1169 | ||
@@ -1093,7 +1228,7 @@ static int blkif_recover(struct blkfront_info *info) | |||
1093 | req->u.rw.seg[j].gref, | 1228 | req->u.rw.seg[j].gref, |
1094 | info->xbdev->otherend_id, | 1229 | info->xbdev->otherend_id, |
1095 | pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), | 1230 | pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), |
1096 | rq_data_dir(info->shadow[req->u.rw.id].request)); | 1231 | 0); |
1097 | } | 1232 | } |
1098 | info->shadow[req->u.rw.id].req = *req; | 1233 | info->shadow[req->u.rw.id].req = *req; |
1099 | 1234 | ||
@@ -1225,7 +1360,7 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1225 | unsigned long sector_size; | 1360 | unsigned long sector_size; |
1226 | unsigned int binfo; | 1361 | unsigned int binfo; |
1227 | int err; | 1362 | int err; |
1228 | int barrier, flush, discard; | 1363 | int barrier, flush, discard, persistent; |
1229 | 1364 | ||
1230 | switch (info->connected) { | 1365 | switch (info->connected) { |
1231 | case BLKIF_STATE_CONNECTED: | 1366 | case BLKIF_STATE_CONNECTED: |
@@ -1303,6 +1438,14 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1303 | if (!err && discard) | 1438 | if (!err && discard) |
1304 | blkfront_setup_discard(info); | 1439 | blkfront_setup_discard(info); |
1305 | 1440 | ||
1441 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
1442 | "feature-persistent", "%u", &persistent, | ||
1443 | NULL); | ||
1444 | if (err) | ||
1445 | info->feature_persistent = 0; | ||
1446 | else | ||
1447 | info->feature_persistent = persistent; | ||
1448 | |||
1306 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); | 1449 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); |
1307 | if (err) { | 1450 | if (err) { |
1308 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | 1451 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |
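Taken together, the xen-blkfront hunks above implement persistent grants: each request segment is backed by a page whose grant is handed to the backend once and then kept, a write copies the bio data into that shared page before the request is posted, a read copies it back out in blkif_completion(), and the grant is pushed back onto info->persistent_gnts for the next request. The feature is negotiated over xenbus: the frontend writes "feature-persistent" in the transaction above and reads the backend's answer with xenbus_gather() in blkfront_connect(). A condensed sketch of the per-segment grant selection, pulled together from the blkif_queue_request() hunk; the helper function itself is illustrative and does not exist in the driver:

/*
 * Illustrative helper only: blkif_queue_request() open-codes this logic
 * per segment, as shown above.  Assumes struct grant / struct
 * blkfront_info from xen-blkfront.c plus <linux/llist.h> and
 * <xen/grant_table.h>.
 */
static struct grant *get_segment_grant(struct blkfront_info *info,
				       grant_ref_t *gref_head)
{
	struct grant *gnt_list_entry;
	struct page *granted_page;
	int ref;

	if (info->persistent_gnts_c) {
		/* Reuse a page the backend already has mapped. */
		gnt_list_entry = llist_entry(
			llist_del_first(&info->persistent_gnts),
			struct grant, node);
		info->persistent_gnts_c--;
		return gnt_list_entry;
	}

	/* No cached grant: allocate a fresh page and claim a new grant
	 * reference from the batch reserved at the top of the function. */
	gnt_list_entry = kmalloc(sizeof(struct grant), GFP_ATOMIC);
	if (!gnt_list_entry)
		return NULL;
	granted_page = alloc_page(GFP_ATOMIC);
	if (!granted_page) {
		kfree(gnt_list_entry);
		return NULL;
	}
	ref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(ref == -ENOSPC);
	gnt_list_entry->pfn = page_to_pfn(granted_page);
	gnt_list_entry->gref = ref;
	gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id,
					pfn_to_mfn(gnt_list_entry->pfn), 0);
	return gnt_list_entry;
}

For writes the caller then kmap_atomic()s both the bio page and the persistent page and memcpy()s sg->length bytes at sg->offset. The grant is made writable (last argument 0) regardless of the request direction, as blkif_recover() also shows, so that the same cached page can serve either a read or a write the next time it is reused.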
diff --git a/drivers/md/md.c b/drivers/md/md.c index bd8bf0953fe3..4843b004c558 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -452,7 +452,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) | |||
452 | spin_lock_irq(&mddev->write_lock); | 452 | spin_lock_irq(&mddev->write_lock); |
453 | wait_event_lock_irq(mddev->sb_wait, | 453 | wait_event_lock_irq(mddev->sb_wait, |
454 | !mddev->flush_bio, | 454 | !mddev->flush_bio, |
455 | mddev->write_lock, /*nothing*/); | 455 | mddev->write_lock); |
456 | mddev->flush_bio = bio; | 456 | mddev->flush_bio = bio; |
457 | spin_unlock_irq(&mddev->write_lock); | 457 | spin_unlock_irq(&mddev->write_lock); |
458 | 458 | ||
diff --git a/drivers/md/md.h b/drivers/md/md.h index af443ab868db..1e2fc3d9c74c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -551,32 +551,6 @@ struct md_thread { | |||
551 | 551 | ||
552 | #define THREAD_WAKEUP 0 | 552 | #define THREAD_WAKEUP 0 |
553 | 553 | ||
554 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
555 | do { \ | ||
556 | wait_queue_t __wait; \ | ||
557 | init_waitqueue_entry(&__wait, current); \ | ||
558 | \ | ||
559 | add_wait_queue(&wq, &__wait); \ | ||
560 | for (;;) { \ | ||
561 | set_current_state(TASK_UNINTERRUPTIBLE); \ | ||
562 | if (condition) \ | ||
563 | break; \ | ||
564 | spin_unlock_irq(&lock); \ | ||
565 | cmd; \ | ||
566 | schedule(); \ | ||
567 | spin_lock_irq(&lock); \ | ||
568 | } \ | ||
569 | current->state = TASK_RUNNING; \ | ||
570 | remove_wait_queue(&wq, &__wait); \ | ||
571 | } while (0) | ||
572 | |||
573 | #define wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
574 | do { \ | ||
575 | if (condition) \ | ||
576 | break; \ | ||
577 | __wait_event_lock_irq(wq, condition, lock, cmd); \ | ||
578 | } while (0) | ||
579 | |||
580 | static inline void safe_put_page(struct page *p) | 554 | static inline void safe_put_page(struct page *p) |
581 | { | 555 | { |
582 | if (p) put_page(p); | 556 | if (p) put_page(p); |
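The md.h hunk above removes the md-private __wait_event_lock_irq()/wait_event_lock_irq() macros: the same functionality is now provided by the generic wait code, split into a plain wait_event_lock_irq(wq, condition, lock) and a wait_event_lock_irq_cmd(wq, condition, lock, cmd) variant. That split is what the raid1/raid10/raid5 conversions below do mechanically, dropping the previously mandatory (and usually empty) cmd argument. The before/after call pattern, taken from the raid1 hunks:

/* Old md-private form: the trailing cmd argument was required even when
 * empty, hence the dangling ", )" removed in the hunks below. */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
		    conf->resync_lock, /* nothing */);

/* Generic replacement without a cmd ... */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
		    conf->resync_lock);

/* ... and the _cmd variant for callers such as freeze_array(), which must
 * keep flushing queued writes while they wait. */
wait_event_lock_irq_cmd(conf->wait_barrier,
			conf->nr_pending == conf->nr_queued + 1,
			conf->resync_lock,
			flush_pending_writes(conf));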
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a0f73092176e..d5bddfc4010e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -822,7 +822,7 @@ static void raise_barrier(struct r1conf *conf) | |||
822 | 822 | ||
823 | /* Wait until no block IO is waiting */ | 823 | /* Wait until no block IO is waiting */ |
824 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 824 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, |
825 | conf->resync_lock, ); | 825 | conf->resync_lock); |
826 | 826 | ||
827 | /* block any new IO from starting */ | 827 | /* block any new IO from starting */ |
828 | conf->barrier++; | 828 | conf->barrier++; |
@@ -830,7 +830,7 @@ static void raise_barrier(struct r1conf *conf) | |||
830 | /* Now wait for all pending IO to complete */ | 830 | /* Now wait for all pending IO to complete */ |
831 | wait_event_lock_irq(conf->wait_barrier, | 831 | wait_event_lock_irq(conf->wait_barrier, |
832 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 832 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
833 | conf->resync_lock, ); | 833 | conf->resync_lock); |
834 | 834 | ||
835 | spin_unlock_irq(&conf->resync_lock); | 835 | spin_unlock_irq(&conf->resync_lock); |
836 | } | 836 | } |
@@ -864,8 +864,7 @@ static void wait_barrier(struct r1conf *conf) | |||
864 | (conf->nr_pending && | 864 | (conf->nr_pending && |
865 | current->bio_list && | 865 | current->bio_list && |
866 | !bio_list_empty(current->bio_list)), | 866 | !bio_list_empty(current->bio_list)), |
867 | conf->resync_lock, | 867 | conf->resync_lock); |
868 | ); | ||
869 | conf->nr_waiting--; | 868 | conf->nr_waiting--; |
870 | } | 869 | } |
871 | conf->nr_pending++; | 870 | conf->nr_pending++; |
@@ -898,10 +897,10 @@ static void freeze_array(struct r1conf *conf) | |||
898 | spin_lock_irq(&conf->resync_lock); | 897 | spin_lock_irq(&conf->resync_lock); |
899 | conf->barrier++; | 898 | conf->barrier++; |
900 | conf->nr_waiting++; | 899 | conf->nr_waiting++; |
901 | wait_event_lock_irq(conf->wait_barrier, | 900 | wait_event_lock_irq_cmd(conf->wait_barrier, |
902 | conf->nr_pending == conf->nr_queued+1, | 901 | conf->nr_pending == conf->nr_queued+1, |
903 | conf->resync_lock, | 902 | conf->resync_lock, |
904 | flush_pending_writes(conf)); | 903 | flush_pending_writes(conf)); |
905 | spin_unlock_irq(&conf->resync_lock); | 904 | spin_unlock_irq(&conf->resync_lock); |
906 | } | 905 | } |
907 | static void unfreeze_array(struct r1conf *conf) | 906 | static void unfreeze_array(struct r1conf *conf) |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c9acbd717131..64d48249c03b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -952,7 +952,7 @@ static void raise_barrier(struct r10conf *conf, int force) | |||
952 | 952 | ||
953 | /* Wait until no block IO is waiting (unless 'force') */ | 953 | /* Wait until no block IO is waiting (unless 'force') */ |
954 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, | 954 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, |
955 | conf->resync_lock, ); | 955 | conf->resync_lock); |
956 | 956 | ||
957 | /* block any new IO from starting */ | 957 | /* block any new IO from starting */ |
958 | conf->barrier++; | 958 | conf->barrier++; |
@@ -960,7 +960,7 @@ static void raise_barrier(struct r10conf *conf, int force) | |||
960 | /* Now wait for all pending IO to complete */ | 960 | /* Now wait for all pending IO to complete */ |
961 | wait_event_lock_irq(conf->wait_barrier, | 961 | wait_event_lock_irq(conf->wait_barrier, |
962 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 962 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
963 | conf->resync_lock, ); | 963 | conf->resync_lock); |
964 | 964 | ||
965 | spin_unlock_irq(&conf->resync_lock); | 965 | spin_unlock_irq(&conf->resync_lock); |
966 | } | 966 | } |
@@ -993,8 +993,7 @@ static void wait_barrier(struct r10conf *conf) | |||
993 | (conf->nr_pending && | 993 | (conf->nr_pending && |
994 | current->bio_list && | 994 | current->bio_list && |
995 | !bio_list_empty(current->bio_list)), | 995 | !bio_list_empty(current->bio_list)), |
996 | conf->resync_lock, | 996 | conf->resync_lock); |
997 | ); | ||
998 | conf->nr_waiting--; | 997 | conf->nr_waiting--; |
999 | } | 998 | } |
1000 | conf->nr_pending++; | 999 | conf->nr_pending++; |
@@ -1027,10 +1026,10 @@ static void freeze_array(struct r10conf *conf) | |||
1027 | spin_lock_irq(&conf->resync_lock); | 1026 | spin_lock_irq(&conf->resync_lock); |
1028 | conf->barrier++; | 1027 | conf->barrier++; |
1029 | conf->nr_waiting++; | 1028 | conf->nr_waiting++; |
1030 | wait_event_lock_irq(conf->wait_barrier, | 1029 | wait_event_lock_irq_cmd(conf->wait_barrier, |
1031 | conf->nr_pending == conf->nr_queued+1, | 1030 | conf->nr_pending == conf->nr_queued+1, |
1032 | conf->resync_lock, | 1031 | conf->resync_lock, |
1033 | flush_pending_writes(conf)); | 1032 | flush_pending_writes(conf)); |
1034 | 1033 | ||
1035 | spin_unlock_irq(&conf->resync_lock); | 1034 | spin_unlock_irq(&conf->resync_lock); |
1036 | } | 1035 | } |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3380372c0393..8d8555bf3e1d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -466,7 +466,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
466 | do { | 466 | do { |
467 | wait_event_lock_irq(conf->wait_for_stripe, | 467 | wait_event_lock_irq(conf->wait_for_stripe, |
468 | conf->quiesce == 0 || noquiesce, | 468 | conf->quiesce == 0 || noquiesce, |
469 | conf->device_lock, /* nothing */); | 469 | conf->device_lock); |
470 | sh = __find_stripe(conf, sector, conf->generation - previous); | 470 | sh = __find_stripe(conf, sector, conf->generation - previous); |
471 | if (!sh) { | 471 | if (!sh) { |
472 | if (!conf->inactive_blocked) | 472 | if (!conf->inactive_blocked) |
@@ -480,8 +480,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
480 | (atomic_read(&conf->active_stripes) | 480 | (atomic_read(&conf->active_stripes) |
481 | < (conf->max_nr_stripes *3/4) | 481 | < (conf->max_nr_stripes *3/4) |
482 | || !conf->inactive_blocked), | 482 | || !conf->inactive_blocked), |
483 | conf->device_lock, | 483 | conf->device_lock); |
484 | ); | ||
485 | conf->inactive_blocked = 0; | 484 | conf->inactive_blocked = 0; |
486 | } else | 485 | } else |
487 | init_stripe(sh, sector, previous); | 486 | init_stripe(sh, sector, previous); |
@@ -1646,8 +1645,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1646 | spin_lock_irq(&conf->device_lock); | 1645 | spin_lock_irq(&conf->device_lock); |
1647 | wait_event_lock_irq(conf->wait_for_stripe, | 1646 | wait_event_lock_irq(conf->wait_for_stripe, |
1648 | !list_empty(&conf->inactive_list), | 1647 | !list_empty(&conf->inactive_list), |
1649 | conf->device_lock, | 1648 | conf->device_lock); |
1650 | ); | ||
1651 | osh = get_free_stripe(conf); | 1649 | osh = get_free_stripe(conf); |
1652 | spin_unlock_irq(&conf->device_lock); | 1650 | spin_unlock_irq(&conf->device_lock); |
1653 | atomic_set(&nsh->count, 1); | 1651 | atomic_set(&nsh->count, 1); |
@@ -4003,7 +4001,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
4003 | spin_lock_irq(&conf->device_lock); | 4001 | spin_lock_irq(&conf->device_lock); |
4004 | wait_event_lock_irq(conf->wait_for_stripe, | 4002 | wait_event_lock_irq(conf->wait_for_stripe, |
4005 | conf->quiesce == 0, | 4003 | conf->quiesce == 0, |
4006 | conf->device_lock, /* nothing */); | 4004 | conf->device_lock); |
4007 | atomic_inc(&conf->active_aligned_reads); | 4005 | atomic_inc(&conf->active_aligned_reads); |
4008 | spin_unlock_irq(&conf->device_lock); | 4006 | spin_unlock_irq(&conf->device_lock); |
4009 | 4007 | ||
@@ -6095,7 +6093,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
6095 | wait_event_lock_irq(conf->wait_for_stripe, | 6093 | wait_event_lock_irq(conf->wait_for_stripe, |
6096 | atomic_read(&conf->active_stripes) == 0 && | 6094 | atomic_read(&conf->active_stripes) == 0 && |
6097 | atomic_read(&conf->active_aligned_reads) == 0, | 6095 | atomic_read(&conf->active_aligned_reads) == 0, |
6098 | conf->device_lock, /* nothing */); | 6096 | conf->device_lock); |
6099 | conf->quiesce = 1; | 6097 | conf->quiesce = 1; |
6100 | spin_unlock_irq(&conf->device_lock); | 6098 | spin_unlock_irq(&conf->device_lock); |
6101 | /* allow reshape to continue */ | 6099 | /* allow reshape to continue */ |
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 47e3d4850584..0c5a18ec322c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -51,12 +51,11 @@ | |||
51 | 51 | ||
52 | #endif | 52 | #endif |
53 | 53 | ||
54 | |||
55 | extern const char *drbd_buildtag(void); | 54 | extern const char *drbd_buildtag(void); |
56 | #define REL_VERSION "8.3.13" | 55 | #define REL_VERSION "8.4.2" |
57 | #define API_VERSION 88 | 56 | #define API_VERSION 1 |
58 | #define PRO_VERSION_MIN 86 | 57 | #define PRO_VERSION_MIN 86 |
59 | #define PRO_VERSION_MAX 96 | 58 | #define PRO_VERSION_MAX 101 |
60 | 59 | ||
61 | 60 | ||
62 | enum drbd_io_error_p { | 61 | enum drbd_io_error_p { |
@@ -66,7 +65,8 @@ enum drbd_io_error_p { | |||
66 | }; | 65 | }; |
67 | 66 | ||
68 | enum drbd_fencing_p { | 67 | enum drbd_fencing_p { |
69 | FP_DONT_CARE, | 68 | FP_NOT_AVAIL = -1, /* Not a policy */ |
69 | FP_DONT_CARE = 0, | ||
70 | FP_RESOURCE, | 70 | FP_RESOURCE, |
71 | FP_STONITH | 71 | FP_STONITH |
72 | }; | 72 | }; |
@@ -102,6 +102,20 @@ enum drbd_on_congestion { | |||
102 | OC_DISCONNECT, | 102 | OC_DISCONNECT, |
103 | }; | 103 | }; |
104 | 104 | ||
105 | enum drbd_read_balancing { | ||
106 | RB_PREFER_LOCAL, | ||
107 | RB_PREFER_REMOTE, | ||
108 | RB_ROUND_ROBIN, | ||
109 | RB_LEAST_PENDING, | ||
110 | RB_CONGESTED_REMOTE, | ||
111 | RB_32K_STRIPING, | ||
112 | RB_64K_STRIPING, | ||
113 | RB_128K_STRIPING, | ||
114 | RB_256K_STRIPING, | ||
115 | RB_512K_STRIPING, | ||
116 | RB_1M_STRIPING, | ||
117 | }; | ||
118 | |||
105 | /* KEEP the order, do not delete or insert. Only append. */ | 119 | /* KEEP the order, do not delete or insert. Only append. */ |
106 | enum drbd_ret_code { | 120 | enum drbd_ret_code { |
107 | ERR_CODE_BASE = 100, | 121 | ERR_CODE_BASE = 100, |
@@ -122,7 +136,7 @@ enum drbd_ret_code { | |||
122 | ERR_AUTH_ALG = 120, | 136 | ERR_AUTH_ALG = 120, |
123 | ERR_AUTH_ALG_ND = 121, | 137 | ERR_AUTH_ALG_ND = 121, |
124 | ERR_NOMEM = 122, | 138 | ERR_NOMEM = 122, |
125 | ERR_DISCARD = 123, | 139 | ERR_DISCARD_IMPOSSIBLE = 123, |
126 | ERR_DISK_CONFIGURED = 124, | 140 | ERR_DISK_CONFIGURED = 124, |
127 | ERR_NET_CONFIGURED = 125, | 141 | ERR_NET_CONFIGURED = 125, |
128 | ERR_MANDATORY_TAG = 126, | 142 | ERR_MANDATORY_TAG = 126, |
@@ -130,8 +144,8 @@ enum drbd_ret_code { | |||
130 | ERR_INTR = 129, /* EINTR */ | 144 | ERR_INTR = 129, /* EINTR */ |
131 | ERR_RESIZE_RESYNC = 130, | 145 | ERR_RESIZE_RESYNC = 130, |
132 | ERR_NO_PRIMARY = 131, | 146 | ERR_NO_PRIMARY = 131, |
133 | ERR_SYNC_AFTER = 132, | 147 | ERR_RESYNC_AFTER = 132, |
134 | ERR_SYNC_AFTER_CYCLE = 133, | 148 | ERR_RESYNC_AFTER_CYCLE = 133, |
135 | ERR_PAUSE_IS_SET = 134, | 149 | ERR_PAUSE_IS_SET = 134, |
136 | ERR_PAUSE_IS_CLEAR = 135, | 150 | ERR_PAUSE_IS_CLEAR = 135, |
137 | ERR_PACKET_NR = 137, | 151 | ERR_PACKET_NR = 137, |
@@ -155,6 +169,14 @@ enum drbd_ret_code { | |||
155 | ERR_CONG_NOT_PROTO_A = 155, | 169 | ERR_CONG_NOT_PROTO_A = 155, |
156 | ERR_PIC_AFTER_DEP = 156, | 170 | ERR_PIC_AFTER_DEP = 156, |
157 | ERR_PIC_PEER_DEP = 157, | 171 | ERR_PIC_PEER_DEP = 157, |
172 | ERR_RES_NOT_KNOWN = 158, | ||
173 | ERR_RES_IN_USE = 159, | ||
174 | ERR_MINOR_CONFIGURED = 160, | ||
175 | ERR_MINOR_EXISTS = 161, | ||
176 | ERR_INVALID_REQUEST = 162, | ||
177 | ERR_NEED_APV_100 = 163, | ||
178 | ERR_NEED_ALLOW_TWO_PRI = 164, | ||
179 | ERR_MD_UNCLEAN = 165, | ||
158 | 180 | ||
159 | /* insert new ones above this line */ | 181 | /* insert new ones above this line */ |
160 | AFTER_LAST_ERR_CODE | 182 | AFTER_LAST_ERR_CODE |
@@ -296,7 +318,8 @@ enum drbd_state_rv { | |||
296 | SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ | 318 | SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ |
297 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ | 319 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ |
298 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ | 320 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ |
299 | SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ | 321 | SS_O_VOL_PEER_PRI = -20, |
322 | SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ | ||
300 | }; | 323 | }; |
301 | 324 | ||
302 | /* from drbd_strings.c */ | 325 | /* from drbd_strings.c */ |
@@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv); | |||
313 | #define MDF_FULL_SYNC (1 << 3) | 336 | #define MDF_FULL_SYNC (1 << 3) |
314 | #define MDF_WAS_UP_TO_DATE (1 << 4) | 337 | #define MDF_WAS_UP_TO_DATE (1 << 4) |
315 | #define MDF_PEER_OUT_DATED (1 << 5) | 338 | #define MDF_PEER_OUT_DATED (1 << 5) |
316 | #define MDF_CRASHED_PRIMARY (1 << 6) | 339 | #define MDF_CRASHED_PRIMARY (1 << 6) |
340 | #define MDF_AL_CLEAN (1 << 7) | ||
341 | #define MDF_AL_DISABLED (1 << 8) | ||
317 | 342 | ||
318 | enum drbd_uuid_index { | 343 | enum drbd_uuid_index { |
319 | UI_CURRENT, | 344 | UI_CURRENT, |
@@ -333,37 +358,23 @@ enum drbd_timeout_flag { | |||
333 | 358 | ||
334 | #define UUID_JUST_CREATED ((__u64)4) | 359 | #define UUID_JUST_CREATED ((__u64)4) |
335 | 360 | ||
361 | /* magic numbers used in meta data and network packets */ | ||
336 | #define DRBD_MAGIC 0x83740267 | 362 | #define DRBD_MAGIC 0x83740267 |
337 | #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) | ||
338 | #define DRBD_MAGIC_BIG 0x835a | 363 | #define DRBD_MAGIC_BIG 0x835a |
339 | #define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) | 364 | #define DRBD_MAGIC_100 0x8620ec20 |
365 | |||
366 | #define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) | ||
367 | #define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) | ||
368 | #define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) | ||
369 | |||
370 | |||
371 | /* how I came up with this magic? | ||
372 | * base64 decode "actlog==" ;) */ | ||
373 | #define DRBD_AL_MAGIC 0x69cb65a2 | ||
340 | 374 | ||
341 | /* these are of type "int" */ | 375 | /* these are of type "int" */ |
342 | #define DRBD_MD_INDEX_INTERNAL -1 | 376 | #define DRBD_MD_INDEX_INTERNAL -1 |
343 | #define DRBD_MD_INDEX_FLEX_EXT -2 | 377 | #define DRBD_MD_INDEX_FLEX_EXT -2 |
344 | #define DRBD_MD_INDEX_FLEX_INT -3 | 378 | #define DRBD_MD_INDEX_FLEX_INT -3 |
345 | 379 | ||
346 | /* Start of the new netlink/connector stuff */ | ||
347 | |||
348 | #define DRBD_NL_CREATE_DEVICE 0x01 | ||
349 | #define DRBD_NL_SET_DEFAULTS 0x02 | ||
350 | |||
351 | |||
352 | /* For searching a vacant cn_idx value */ | ||
353 | #define CN_IDX_STEP 6977 | ||
354 | |||
355 | struct drbd_nl_cfg_req { | ||
356 | int packet_type; | ||
357 | unsigned int drbd_minor; | ||
358 | int flags; | ||
359 | unsigned short tag_list[]; | ||
360 | }; | ||
361 | |||
362 | struct drbd_nl_cfg_reply { | ||
363 | int packet_type; | ||
364 | unsigned int minor; | ||
365 | int ret_code; /* enum ret_code or set_st_err_t */ | ||
366 | unsigned short tag_list[]; /* only used with get_* calls */ | ||
367 | }; | ||
368 | |||
369 | #endif | 380 | #endif |
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h new file mode 100644 index 000000000000..d0d8fac8a6e4 --- /dev/null +++ b/include/linux/drbd_genl.h | |||
@@ -0,0 +1,378 @@ | |||
1 | /* | ||
2 | * General overview: | ||
3 | * full generic netlink message: | ||
4 | * |nlmsghdr|genlmsghdr|<payload> | ||
5 | * | ||
6 | * payload: | ||
7 | * |optional fixed size family header|<sequence of netlink attributes> | ||
8 | * | ||
9 | * sequence of netlink attributes: | ||
10 | * I chose to have all "top level" attributes NLA_NESTED, | ||
11 | * corresponding to some real struct. | ||
12 | * So we have a sequence of |tla, len|<nested nla sequence> | ||
13 | * | ||
14 | * nested nla sequence: | ||
15 | * may be empty, or contain a sequence of netlink attributes | ||
16 | * representing the struct fields. | ||
17 | * | ||
18 | * The tag number of any field (regardless of containing struct) | ||
19 | * will be available as T_ ## field_name, | ||
20 | * so you cannot have the same field name in two different structs. | ||
21 | * | ||
22 | * The tag numbers themselves are per struct, though, | ||
23 | * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, | ||
24 | * which we won't use here). | ||
25 | * The tag numbers are used as index in the respective nla_policy array. | ||
26 | * | ||
27 | * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy | ||
28 | * genl_magic_struct.h | ||
29 | * generates the struct declaration, | ||
30 | * generates an entry in the tla enum, | ||
31 | * genl_magic_func.h | ||
32 | * generates an entry in the static tla policy | ||
33 | * with .type = NLA_NESTED | ||
34 | * generates the static <struct_name>_nl_policy definition, | ||
35 | * and static conversion functions | ||
36 | * | ||
37 | * genl_magic_func.h | ||
38 | * | ||
39 | * GENL_mc_group(group) | ||
40 | * genl_magic_struct.h | ||
41 | * does nothing | ||
42 | * genl_magic_func.h | ||
43 | * defines and registers the mcast group, | ||
44 | * and provides a send helper | ||
45 | * | ||
46 | * GENL_notification(op_name, op_num, mcast_group, tla list) | ||
47 | * These are notifications to userspace. | ||
48 | * | ||
49 | * genl_magic_struct.h | ||
50 | * generates an entry in the genl_ops enum, | ||
51 | * genl_magic_func.h | ||
52 | * does nothing | ||
53 | * | ||
54 | * mcast group: the name of the mcast group this notification should be | ||
55 | * expected on | ||
56 | * tla list: the list of expected top level attributes, | ||
57 | * for documentation and sanity checking. | ||
58 | * | ||
59 | * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" | ||
60 | * These are requests from userspace. | ||
61 | * | ||
62 | * _op and _notification share the same "number space", | ||
63 | * op_nr will be assigned to "genlmsghdr->cmd" | ||
64 | * | ||
65 | * genl_magic_struct.h | ||
66 | * generates an entry in the genl_ops enum, | ||
67 | * genl_magic_func.h | ||
68 | * generates an entry in the static genl_ops array, | ||
69 | * and static register/unregister functions to | ||
70 | * genl_register_family_with_ops(). | ||
71 | * | ||
72 | * flags and handler: | ||
73 | * GENL_op_init( .doit = x, .dumpit = y, .flags = something) | ||
74 | * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM | ||
75 | * tla list: the list of expected top level attributes, | ||
76 | * for documentation and sanity checking. | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * STRUCTS | ||
81 | */ | ||
82 | |||
83 | /* this is sent kernel -> userland on various error conditions, and contains | ||
84 | * informational textual info, which is supposedly human readable. | ||
85 | * The computer relevant return code is in the drbd_genlmsghdr. | ||
86 | */ | ||
87 | GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, | ||
88 | /* "arbitrary" size strings, nla_policy.len = 0 */ | ||
89 | __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) | ||
90 | ) | ||
91 | |||
92 | /* Configuration requests typically need a context to operate on. | ||
93 | * Possible keys are device minor (fits in the drbd_genlmsghdr), | ||
94 | * the replication link (aka connection) name, | ||
95 | * and/or the replication group (aka resource) name, | ||
96 | * and the volume id within the resource. */ | ||
97 | GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, | ||
98 | __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) | ||
99 | __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) | ||
100 | __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) | ||
101 | __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) | ||
102 | ) | ||
103 | |||
104 | GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, | ||
105 | __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) | ||
106 | __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) | ||
107 | __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) | ||
108 | |||
109 | /* use the resize command to try and change the disk_size */ | ||
110 | __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) | ||
111 | /* we could change the max_bio_bvecs, | ||
112 | * but it won't propagate through the stack */ | ||
113 | __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) | ||
114 | |||
115 | __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) | ||
116 | __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) | ||
117 | |||
118 | __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) | ||
119 | __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) | ||
120 | __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) | ||
121 | __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) | ||
122 | __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) | ||
123 | __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) | ||
124 | __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) | ||
125 | __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) | ||
126 | |||
127 | __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) | ||
128 | __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) | ||
129 | __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) | ||
130 | __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) | ||
131 | __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) | ||
132 | __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) | ||
133 | /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ | ||
134 | __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) | ||
135 | ) | ||
136 | |||
137 | GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, | ||
138 | __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) | ||
139 | __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) | ||
140 | ) | ||
141 | |||
142 | GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, | ||
143 | __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, | ||
144 | shared_secret, SHARED_SECRET_MAX) | ||
145 | __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) | ||
146 | __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) | ||
147 | __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) | ||
148 | __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) | ||
149 | __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) | ||
150 | __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) | ||
151 | __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) | ||
152 | __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) | ||
153 | __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) | ||
154 | __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) | ||
155 | __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) | ||
156 | __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) | ||
157 | __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) | ||
158 | __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) | ||
159 | __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) | ||
160 | __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) | ||
161 | __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) | ||
162 | __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) | ||
163 | __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) | ||
164 | __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) | ||
165 | __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) | ||
166 | __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) | ||
167 | __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) | ||
168 | __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) | ||
169 | __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) | ||
170 | __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) | ||
171 | __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) | ||
172 | __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) | ||
173 | /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ | ||
174 | ) | ||
175 | |||
176 | GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, | ||
177 | __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) | ||
178 | ) | ||
179 | |||
180 | GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, | ||
181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) | ||
182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) | ||
183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) | ||
184 | ) | ||
185 | |||
186 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, | ||
187 | /* the reason for the broadcast, | ||
188 | * if this is an event-triggered broadcast. */ | ||
189 | __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) | ||
190 | __u32_field(2, DRBD_F_REQUIRED, current_state) | ||
191 | __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) | ||
192 | __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) | ||
193 | |||
194 | /* These are for broadcast from after-state-change work. | ||
195 | * prev_state and new_state are from the moment the state change took | ||
196 | * place; new_state is not necessarily the same as current_state, as | ||
197 | * there may have been more state changes since, which will be | ||
198 | * broadcast soon, in their respective after-state-change work. */ | ||
199 | __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) | ||
200 | __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) | ||
201 | |||
202 | /* if we have a local disk: */ | ||
203 | __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) | ||
204 | __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) | ||
205 | __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) | ||
206 | __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) | ||
207 | /* and in case resync or online verify is active */ | ||
208 | __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) | ||
209 | __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) | ||
210 | |||
211 | /* for pre and post notifications of helper execution */ | ||
212 | __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) | ||
213 | __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) | ||
214 | |||
215 | __u64_field(15, 0, send_cnt) | ||
216 | __u64_field(16, 0, recv_cnt) | ||
217 | __u64_field(17, 0, read_cnt) | ||
218 | __u64_field(18, 0, writ_cnt) | ||
219 | __u64_field(19, 0, al_writ_cnt) | ||
220 | __u64_field(20, 0, bm_writ_cnt) | ||
221 | __u32_field(21, 0, ap_bio_cnt) | ||
222 | __u32_field(22, 0, ap_pending_cnt) | ||
223 | __u32_field(23, 0, rs_pending_cnt) | ||
224 | ) | ||
225 | |||
226 | GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, | ||
227 | __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) | ||
228 | __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) | ||
229 | ) | ||
230 | |||
231 | GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, | ||
232 | __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) | ||
233 | ) | ||
234 | |||
235 | GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, | ||
236 | __u32_field(1, DRBD_F_REQUIRED, timeout_type) | ||
237 | ) | ||
238 | |||
239 | GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, | ||
240 | __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) | ||
241 | ) | ||
242 | |||
243 | GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, | ||
244 | __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) | ||
245 | ) | ||
246 | |||
247 | /* | ||
248 | * Notifications and commands (genlmsghdr->cmd) | ||
249 | */ | ||
250 | GENL_mc_group(events) | ||
251 | |||
252 | /* kernel -> userspace announcement of changes */ | ||
253 | GENL_notification( | ||
254 | DRBD_EVENT, 1, events, | ||
255 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
256 | GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) | ||
257 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) | ||
258 | GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) | ||
259 | GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) | ||
260 | ) | ||
261 | |||
262 | /* query kernel for specific or all info */ | ||
263 | GENL_op( | ||
264 | DRBD_ADM_GET_STATUS, 2, | ||
265 | GENL_op_init( | ||
266 | .doit = drbd_adm_get_status, | ||
267 | .dumpit = drbd_adm_get_status_all, | ||
268 | /* anyone may ask for the status, | ||
269 | * it is broadcast anyway */ | ||
270 | ), | ||
271 | /* To select the object .doit. | ||
272 | * Or a subset of objects in .dumpit. */ | ||
273 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) | ||
274 | ) | ||
275 | |||
276 | /* add DRBD minor devices as volumes to resources */ | ||
277 | GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), | ||
278 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
279 | GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), | ||
280 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
281 | |||
282 | /* add or delete resources */ | ||
283 | GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), | ||
284 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
285 | GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), | ||
286 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
287 | |||
288 | GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, | ||
289 | GENL_doit(drbd_adm_resource_opts), | ||
290 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
291 | GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) | ||
292 | ) | ||
293 | |||
294 | GENL_op( | ||
295 | DRBD_ADM_CONNECT, 10, | ||
296 | GENL_doit(drbd_adm_connect), | ||
297 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
298 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) | ||
299 | ) | ||
300 | |||
301 | GENL_op( | ||
302 | DRBD_ADM_CHG_NET_OPTS, 29, | ||
303 | GENL_doit(drbd_adm_net_opts), | ||
304 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
305 | GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) | ||
306 | ) | ||
307 | |||
308 | GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), | ||
309 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
310 | |||
311 | GENL_op(DRBD_ADM_ATTACH, 12, | ||
312 | GENL_doit(drbd_adm_attach), | ||
313 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
314 | GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) | ||
315 | ) | ||
316 | |||
317 | GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, | ||
318 | GENL_doit(drbd_adm_disk_opts), | ||
319 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
320 | GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) | ||
321 | ) | ||
322 | |||
323 | GENL_op( | ||
324 | DRBD_ADM_RESIZE, 13, | ||
325 | GENL_doit(drbd_adm_resize), | ||
326 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
327 | GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) | ||
328 | ) | ||
329 | |||
330 | GENL_op( | ||
331 | DRBD_ADM_PRIMARY, 14, | ||
332 | GENL_doit(drbd_adm_set_role), | ||
333 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
334 | GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) | ||
335 | ) | ||
336 | |||
337 | GENL_op( | ||
338 | DRBD_ADM_SECONDARY, 15, | ||
339 | GENL_doit(drbd_adm_set_role), | ||
340 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
341 | GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) | ||
342 | ) | ||
343 | |||
344 | GENL_op( | ||
345 | DRBD_ADM_NEW_C_UUID, 16, | ||
346 | GENL_doit(drbd_adm_new_c_uuid), | ||
347 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
348 | GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) | ||
349 | ) | ||
350 | |||
351 | GENL_op( | ||
352 | DRBD_ADM_START_OV, 17, | ||
353 | GENL_doit(drbd_adm_start_ov), | ||
354 | GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) | ||
355 | ) | ||
356 | |||
357 | GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), | ||
358 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) | ||
359 | GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) | ||
360 | |||
361 | GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), | ||
362 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
363 | GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), | ||
364 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
365 | GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), | ||
366 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
367 | GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), | ||
368 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
369 | GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), | ||
370 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
371 | GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io), | ||
372 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
373 | GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), | ||
374 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
375 | GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), | ||
376 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
377 | GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), | ||
378 | GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) | ||
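The long comment at the top of this new header describes a two-pass macro scheme: genl_magic_struct.h expands every GENL_struct() into a C struct and a tag-enum entry, while genl_magic_func.h expands the same line into the static <struct_name>_nl_policy plus conversion helpers, and each GENL_op()/GENL_notification() becomes a genl_ops entry. As a rough sketch of the idea only (the real generated code comes from the genl_magic_* headers and differs in detail), a single entry such as DRBD_NLA_SET_ROLE_PARMS conceptually ends up as:

/* Sketch of the expansion described in the comment above; field layout,
 * enum shape and policy entries are illustrative, not the generated code. */

/* genl_magic_struct.h pass: the struct plus its top-level-attribute tag. */
struct set_role_parms {
	unsigned int assume_uptodate:1;		/* __flg_field(1, ...) */
};
enum {
	/* ... */
	DRBD_NLA_SET_ROLE_PARMS = 6,		/* tag_number from GENL_struct() */
	/* ... */
};

/* genl_magic_func.h pass: a per-struct netlink attribute policy, indexed
 * by the per-field tags that the comment says become T_<field_name>. */
static struct nla_policy set_role_parms_nl_policy[] = {
	[T_assume_uptodate] = { .type = NLA_FLAG },
};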
diff --git a/include/linux/drbd_genl_api.h b/include/linux/drbd_genl_api.h new file mode 100644 index 000000000000..9ef50d51e34e --- /dev/null +++ b/include/linux/drbd_genl_api.h | |||
@@ -0,0 +1,55 @@ | |||
1 | #ifndef DRBD_GENL_STRUCT_H | ||
2 | #define DRBD_GENL_STRUCT_H | ||
3 | |||
4 | /** | ||
5 | * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests | ||
6 | * @minor: | ||
7 | * For admin requests (user -> kernel): which minor device to operate on. | ||
8 | * For (unicast) replies or informational (broadcast) messages | ||
9 | * (kernel -> user): which minor device the information is about. | ||
10 | * If we do not operate on minors, but on connections or resources, | ||
11 | * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT | ||
12 | * is used instead. | ||
13 | * @flags: possible operation modifiers (relevant only for user->kernel): | ||
14 | * DRBD_GENL_F_SET_DEFAULTS | ||
15 | * @volume: | ||
16 | * When creating a new minor (adding it to a resource), the resource needs | ||
17 | * to know which volume number within the resource this is supposed to be. | ||
18 | * The volume number corresponds to the same volume number on the remote side, | ||
19 | * whereas the minor number on the remote side may be different | ||
20 | * (union with flags). | ||
21 | * @ret_code: kernel->userland unicast cfg reply return code (union with flags); | ||
22 | */ | ||
23 | struct drbd_genlmsghdr { | ||
24 | __u32 minor; | ||
25 | union { | ||
26 | __u32 flags; | ||
27 | __s32 ret_code; | ||
28 | }; | ||
29 | }; | ||
30 | |||
31 | /* To be used in drbd_genlmsghdr.flags */ | ||
32 | enum { | ||
33 | DRBD_GENL_F_SET_DEFAULTS = 1, | ||
34 | }; | ||
35 | |||
36 | enum drbd_state_info_bcast_reason { | ||
37 | SIB_GET_STATUS_REPLY = 1, | ||
38 | SIB_STATE_CHANGE = 2, | ||
39 | SIB_HELPER_PRE = 3, | ||
40 | SIB_HELPER_POST = 4, | ||
41 | SIB_SYNC_PROGRESS = 5, | ||
42 | }; | ||
43 | |||
44 | /* hack around predefined gcc/cpp "linux=1", | ||
45 | * we cannot possibly include <1/drbd_genl.h> */ | ||
46 | #undef linux | ||
47 | |||
48 | #include <linux/drbd.h> | ||
49 | #define GENL_MAGIC_VERSION API_VERSION | ||
50 | #define GENL_MAGIC_FAMILY drbd | ||
51 | #define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) | ||
52 | #define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h> | ||
53 | #include <linux/genl_magic_struct.h> | ||
54 | |||
55 | #endif | ||
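The drbd_genlmsghdr kernel-doc above is the complete fixed-size family header: minor selects the device, or is ~0 when the DRBD_NLA_CFG_CONTEXT attribute carries the resource/connection context instead, and the same 32 bits are flags on the way into the kernel and ret_code on the way back out. A hypothetical userspace fragment, shown only to illustrate how the fields from the comment are filled; the surrounding netlink/libnl plumbing is omitted:

/* Hypothetical example, not from the tree: family headers for a per-minor
 * request and for a resource-scoped one. */
struct drbd_genlmsghdr dh_minor = {
	.minor = 0,				/* operate on minor 0 */
	.flags = DRBD_GENL_F_SET_DEFAULTS,	/* reset unset options */
};

struct drbd_genlmsghdr dh_resource = {
	.minor = ~0U,	/* "not a minor": context comes from DRBD_NLA_CFG_CONTEXT */
};

/* On the unicast reply the kernel reuses the same storage: userspace reads
 * the result back as .ret_code instead of .flags. */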
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index fb670bf603f7..1fa19c5f5e64 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -16,29 +16,37 @@ | |||
16 | #define DEBUG_RANGE_CHECK 0 | 16 | #define DEBUG_RANGE_CHECK 0 |
17 | 17 | ||
18 | #define DRBD_MINOR_COUNT_MIN 1 | 18 | #define DRBD_MINOR_COUNT_MIN 1 |
19 | #define DRBD_MINOR_COUNT_MAX 256 | 19 | #define DRBD_MINOR_COUNT_MAX 255 |
20 | #define DRBD_MINOR_COUNT_DEF 32 | 20 | #define DRBD_MINOR_COUNT_DEF 32 |
21 | #define DRBD_MINOR_COUNT_SCALE '1' | ||
22 | |||
23 | #define DRBD_VOLUME_MAX 65535 | ||
21 | 24 | ||
22 | #define DRBD_DIALOG_REFRESH_MIN 0 | 25 | #define DRBD_DIALOG_REFRESH_MIN 0 |
23 | #define DRBD_DIALOG_REFRESH_MAX 600 | 26 | #define DRBD_DIALOG_REFRESH_MAX 600 |
27 | #define DRBD_DIALOG_REFRESH_SCALE '1' | ||
24 | 28 | ||
25 | /* valid port number */ | 29 | /* valid port number */ |
26 | #define DRBD_PORT_MIN 1 | 30 | #define DRBD_PORT_MIN 1 |
27 | #define DRBD_PORT_MAX 0xffff | 31 | #define DRBD_PORT_MAX 0xffff |
32 | #define DRBD_PORT_SCALE '1' | ||
28 | 33 | ||
29 | /* startup { */ | 34 | /* startup { */ |
30 | /* if you want more than 3.4 days, disable */ | 35 | /* if you want more than 3.4 days, disable */ |
31 | #define DRBD_WFC_TIMEOUT_MIN 0 | 36 | #define DRBD_WFC_TIMEOUT_MIN 0 |
32 | #define DRBD_WFC_TIMEOUT_MAX 300000 | 37 | #define DRBD_WFC_TIMEOUT_MAX 300000 |
33 | #define DRBD_WFC_TIMEOUT_DEF 0 | 38 | #define DRBD_WFC_TIMEOUT_DEF 0 |
39 | #define DRBD_WFC_TIMEOUT_SCALE '1' | ||
34 | 40 | ||
35 | #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 | 41 | #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 |
36 | #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 | 42 | #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 |
37 | #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 | 43 | #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 |
44 | #define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' | ||
38 | 45 | ||
39 | #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 | 46 | #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 |
40 | #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 | 47 | #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 |
41 | #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 | 48 | #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 |
49 | #define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' | ||
42 | /* }*/ | 50 | /* }*/ |
43 | 51 | ||
44 | /* net { */ | 52 | /* net { */ |
@@ -47,75 +55,91 @@ | |||
47 | #define DRBD_TIMEOUT_MIN 1 | 55 | #define DRBD_TIMEOUT_MIN 1 |
48 | #define DRBD_TIMEOUT_MAX 600 | 56 | #define DRBD_TIMEOUT_MAX 600 |
49 | #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ | 57 | #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ |
58 | #define DRBD_TIMEOUT_SCALE '1' | ||
50 | 59 | ||
51 | /* If backing disk takes longer than disk_timeout, mark the disk as failed */ | 60 | /* If backing disk takes longer than disk_timeout, mark the disk as failed */ |
52 | #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ | 61 | #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ |
53 | #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ | 62 | #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ |
54 | #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ | 63 | #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ |
64 | #define DRBD_DISK_TIMEOUT_SCALE '1' | ||
55 | 65 | ||
56 | /* active connection retries when C_WF_CONNECTION */ | 66 | /* active connection retries when C_WF_CONNECTION */ |
57 | #define DRBD_CONNECT_INT_MIN 1 | 67 | #define DRBD_CONNECT_INT_MIN 1 |
58 | #define DRBD_CONNECT_INT_MAX 120 | 68 | #define DRBD_CONNECT_INT_MAX 120 |
59 | #define DRBD_CONNECT_INT_DEF 10 /* seconds */ | 69 | #define DRBD_CONNECT_INT_DEF 10 /* seconds */ |
70 | #define DRBD_CONNECT_INT_SCALE '1' | ||
60 | 71 | ||
61 | /* keep-alive probes when idle */ | 72 | /* keep-alive probes when idle */ |
62 | #define DRBD_PING_INT_MIN 1 | 73 | #define DRBD_PING_INT_MIN 1 |
63 | #define DRBD_PING_INT_MAX 120 | 74 | #define DRBD_PING_INT_MAX 120 |
64 | #define DRBD_PING_INT_DEF 10 | 75 | #define DRBD_PING_INT_DEF 10 |
76 | #define DRBD_PING_INT_SCALE '1' | ||
65 | 77 | ||
66 | /* timeout for the ping packets.*/ | 78 | /* timeout for the ping packets.*/ |
67 | #define DRBD_PING_TIMEO_MIN 1 | 79 | #define DRBD_PING_TIMEO_MIN 1 |
68 | #define DRBD_PING_TIMEO_MAX 300 | 80 | #define DRBD_PING_TIMEO_MAX 300 |
69 | #define DRBD_PING_TIMEO_DEF 5 | 81 | #define DRBD_PING_TIMEO_DEF 5 |
82 | #define DRBD_PING_TIMEO_SCALE '1' | ||
70 | 83 | ||
71 | /* max number of write requests between write barriers */ | 84 | /* max number of write requests between write barriers */ |
72 | #define DRBD_MAX_EPOCH_SIZE_MIN 1 | 85 | #define DRBD_MAX_EPOCH_SIZE_MIN 1 |
73 | #define DRBD_MAX_EPOCH_SIZE_MAX 20000 | 86 | #define DRBD_MAX_EPOCH_SIZE_MAX 20000 |
74 | #define DRBD_MAX_EPOCH_SIZE_DEF 2048 | 87 | #define DRBD_MAX_EPOCH_SIZE_DEF 2048 |
88 | #define DRBD_MAX_EPOCH_SIZE_SCALE '1' | ||
75 | 89 | ||
76 | /* I don't think that a tcp send buffer of more than 10M is useful */ | 90 | /* I don't think that a tcp send buffer of more than 10M is useful */ |
77 | #define DRBD_SNDBUF_SIZE_MIN 0 | 91 | #define DRBD_SNDBUF_SIZE_MIN 0 |
78 | #define DRBD_SNDBUF_SIZE_MAX (10<<20) | 92 | #define DRBD_SNDBUF_SIZE_MAX (10<<20) |
79 | #define DRBD_SNDBUF_SIZE_DEF 0 | 93 | #define DRBD_SNDBUF_SIZE_DEF 0 |
94 | #define DRBD_SNDBUF_SIZE_SCALE '1' | ||
80 | 95 | ||
81 | #define DRBD_RCVBUF_SIZE_MIN 0 | 96 | #define DRBD_RCVBUF_SIZE_MIN 0 |
82 | #define DRBD_RCVBUF_SIZE_MAX (10<<20) | 97 | #define DRBD_RCVBUF_SIZE_MAX (10<<20) |
83 | #define DRBD_RCVBUF_SIZE_DEF 0 | 98 | #define DRBD_RCVBUF_SIZE_DEF 0 |
99 | #define DRBD_RCVBUF_SIZE_SCALE '1' | ||
84 | 100 | ||
85 | /* @4k PageSize -> 128kB - 512MB */ | 101 | /* @4k PageSize -> 128kB - 512MB */ |
86 | #define DRBD_MAX_BUFFERS_MIN 32 | 102 | #define DRBD_MAX_BUFFERS_MIN 32 |
87 | #define DRBD_MAX_BUFFERS_MAX 131072 | 103 | #define DRBD_MAX_BUFFERS_MAX 131072 |
88 | #define DRBD_MAX_BUFFERS_DEF 2048 | 104 | #define DRBD_MAX_BUFFERS_DEF 2048 |
105 | #define DRBD_MAX_BUFFERS_SCALE '1' | ||
89 | 106 | ||
90 | /* @4k PageSize -> 4kB - 512MB */ | 107 | /* @4k PageSize -> 4kB - 512MB */ |
91 | #define DRBD_UNPLUG_WATERMARK_MIN 1 | 108 | #define DRBD_UNPLUG_WATERMARK_MIN 1 |
92 | #define DRBD_UNPLUG_WATERMARK_MAX 131072 | 109 | #define DRBD_UNPLUG_WATERMARK_MAX 131072 |
93 | #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) | 110 | #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) |
111 | #define DRBD_UNPLUG_WATERMARK_SCALE '1' | ||
94 | 112 | ||
95 | /* 0 is disabled. | 113 | /* 0 is disabled. |
96 | * 200 should be more than enough even for very short timeouts */ | 114 | * 200 should be more than enough even for very short timeouts */ |
97 | #define DRBD_KO_COUNT_MIN 0 | 115 | #define DRBD_KO_COUNT_MIN 0 |
98 | #define DRBD_KO_COUNT_MAX 200 | 116 | #define DRBD_KO_COUNT_MAX 200 |
99 | #define DRBD_KO_COUNT_DEF 0 | 117 | #define DRBD_KO_COUNT_DEF 7 |
118 | #define DRBD_KO_COUNT_SCALE '1' | ||
100 | /* } */ | 119 | /* } */ |
101 | 120 | ||
102 | /* syncer { */ | 121 | /* syncer { */ |
103 | /* FIXME allow rate to be zero? */ | 122 | /* FIXME allow rate to be zero? */ |
104 | #define DRBD_RATE_MIN 1 | 123 | #define DRBD_RESYNC_RATE_MIN 1 |
105 | /* channel bonding 10 GbE, or other hardware */ | 124 | /* channel bonding 10 GbE, or other hardware */ |
106 | #define DRBD_RATE_MAX (4 << 20) | 125 | #define DRBD_RESYNC_RATE_MAX (4 << 20) |
107 | #define DRBD_RATE_DEF 250 /* kb/second */ | 126 | #define DRBD_RESYNC_RATE_DEF 250 |
127 | #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ | ||
108 | 128 | ||
109 | /* less than 7 would hit performance unnecessarily. | 129 | /* less than 7 would hit performance unnecessarily. |
110 | * 3833 is the largest prime that still does fit | 130 | * 919 slots of context information per transaction, |
111 | * into 64 sectors of activity log */ | 131 | * 32k activity log, 4k transaction size, |
132 | * one transaction in flight: | ||
133 | * 919 * 7 = 6433 */ | ||
112 | #define DRBD_AL_EXTENTS_MIN 7 | 134 | #define DRBD_AL_EXTENTS_MIN 7 |
113 | #define DRBD_AL_EXTENTS_MAX 3833 | 135 | #define DRBD_AL_EXTENTS_MAX 6433 |
114 | #define DRBD_AL_EXTENTS_DEF 127 | 136 | #define DRBD_AL_EXTENTS_DEF 1237 |
137 | #define DRBD_AL_EXTENTS_SCALE '1' | ||
115 | 138 | ||
116 | #define DRBD_AFTER_MIN -1 | 139 | #define DRBD_MINOR_NUMBER_MIN -1 |
117 | #define DRBD_AFTER_MAX 255 | 140 | #define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) |
118 | #define DRBD_AFTER_DEF -1 | 141 | #define DRBD_MINOR_NUMBER_DEF -1 |
142 | #define DRBD_MINOR_NUMBER_SCALE '1' | ||
119 | 143 | ||
120 | /* } */ | 144 | /* } */ |
121 | 145 | ||
@@ -124,11 +148,12 @@ | |||
124 | * the upper limit with 64bit kernel, enough ram and flexible meta data | 148 | * the upper limit with 64bit kernel, enough ram and flexible meta data |
125 | * is 1 PiB, currently. */ | 149 | * is 1 PiB, currently. */ |
126 | /* DRBD_MAX_SECTORS */ | 150 | /* DRBD_MAX_SECTORS */ |
127 | #define DRBD_DISK_SIZE_SECT_MIN 0 | 151 | #define DRBD_DISK_SIZE_MIN 0 |
128 | #define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) | 152 | #define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) |
129 | #define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ | 153 | #define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */ |
154 | #define DRBD_DISK_SIZE_SCALE 's' /* sectors */ | ||
130 | 155 | ||
131 | #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON | 156 | #define DRBD_ON_IO_ERROR_DEF EP_DETACH |
132 | #define DRBD_FENCING_DEF FP_DONT_CARE | 157 | #define DRBD_FENCING_DEF FP_DONT_CARE |
133 | #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT | 158 | #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT |
134 | #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT | 159 | #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT |
@@ -136,38 +161,59 @@ | |||
136 | #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT | 161 | #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT |
137 | #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR | 162 | #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR |
138 | #define DRBD_ON_CONGESTION_DEF OC_BLOCK | 163 | #define DRBD_ON_CONGESTION_DEF OC_BLOCK |
164 | #define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL | ||
139 | 165 | ||
140 | #define DRBD_MAX_BIO_BVECS_MIN 0 | 166 | #define DRBD_MAX_BIO_BVECS_MIN 0 |
141 | #define DRBD_MAX_BIO_BVECS_MAX 128 | 167 | #define DRBD_MAX_BIO_BVECS_MAX 128 |
142 | #define DRBD_MAX_BIO_BVECS_DEF 0 | 168 | #define DRBD_MAX_BIO_BVECS_DEF 0 |
169 | #define DRBD_MAX_BIO_BVECS_SCALE '1' | ||
143 | 170 | ||
144 | #define DRBD_C_PLAN_AHEAD_MIN 0 | 171 | #define DRBD_C_PLAN_AHEAD_MIN 0 |
145 | #define DRBD_C_PLAN_AHEAD_MAX 300 | 172 | #define DRBD_C_PLAN_AHEAD_MAX 300 |
146 | #define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ | 173 | #define DRBD_C_PLAN_AHEAD_DEF 20 |
174 | #define DRBD_C_PLAN_AHEAD_SCALE '1' | ||
147 | 175 | ||
148 | #define DRBD_C_DELAY_TARGET_MIN 1 | 176 | #define DRBD_C_DELAY_TARGET_MIN 1 |
149 | #define DRBD_C_DELAY_TARGET_MAX 100 | 177 | #define DRBD_C_DELAY_TARGET_MAX 100 |
150 | #define DRBD_C_DELAY_TARGET_DEF 10 | 178 | #define DRBD_C_DELAY_TARGET_DEF 10 |
179 | #define DRBD_C_DELAY_TARGET_SCALE '1' | ||
151 | 180 | ||
152 | #define DRBD_C_FILL_TARGET_MIN 0 | 181 | #define DRBD_C_FILL_TARGET_MIN 0 |
153 | #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ | 182 | #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ |
154 | #define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ | 183 | #define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ |
184 | #define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ | ||
155 | 185 | ||
156 | #define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ | 186 | #define DRBD_C_MAX_RATE_MIN 250 |
157 | #define DRBD_C_MAX_RATE_MAX (4 << 20) | 187 | #define DRBD_C_MAX_RATE_MAX (4 << 20) |
158 | #define DRBD_C_MAX_RATE_DEF 102400 | 188 | #define DRBD_C_MAX_RATE_DEF 102400 |
189 | #define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ | ||
159 | 190 | ||
160 | #define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ | 191 | #define DRBD_C_MIN_RATE_MIN 0 |
161 | #define DRBD_C_MIN_RATE_MAX (4 << 20) | 192 | #define DRBD_C_MIN_RATE_MAX (4 << 20) |
162 | #define DRBD_C_MIN_RATE_DEF 4096 | 193 | #define DRBD_C_MIN_RATE_DEF 250 |
194 | #define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ | ||
163 | 195 | ||
164 | #define DRBD_CONG_FILL_MIN 0 | 196 | #define DRBD_CONG_FILL_MIN 0 |
165 | #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ | 197 | #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ |
166 | #define DRBD_CONG_FILL_DEF 0 | 198 | #define DRBD_CONG_FILL_DEF 0 |
199 | #define DRBD_CONG_FILL_SCALE 's' /* sectors */ | ||
167 | 200 | ||
168 | #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN | 201 | #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN |
169 | #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX | 202 | #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX |
170 | #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF | 203 | #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF |
204 | #define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE | ||
205 | |||
206 | #define DRBD_PROTOCOL_DEF DRBD_PROT_C | ||
207 | |||
208 | #define DRBD_DISK_BARRIER_DEF 0 | ||
209 | #define DRBD_DISK_FLUSHES_DEF 1 | ||
210 | #define DRBD_DISK_DRAIN_DEF 1 | ||
211 | #define DRBD_MD_FLUSHES_DEF 1 | ||
212 | #define DRBD_TCP_CORK_DEF 1 | ||
213 | #define DRBD_AL_UPDATES_DEF 1 | ||
214 | |||
215 | #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 | ||
216 | #define DRBD_ALWAYS_ASBP_DEF 0 | ||
217 | #define DRBD_USE_RLE_DEF 1 | ||
171 | 218 | ||
172 | #undef RANGE | ||
173 | #endif | 219 | #endif |
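The new *_SCALE macros above only record a unit character ('1', 'k', or 's') per option; range checking still comes from the MIN/MAX/DEF triples. A minimal sketch of how such a triple might be consumed, assuming drbd_limits.h is included — clamp_resync_rate() is an invented helper, not DRBD code:

#include <linux/kernel.h>	/* clamp_t() */

static unsigned int clamp_resync_rate(unsigned int requested_kib)
{
	if (requested_kib == 0)
		return DRBD_RESYNC_RATE_DEF;	/* fall back to the default */
	return clamp_t(unsigned int, requested_kib,
		       DRBD_RESYNC_RATE_MIN, DRBD_RESYNC_RATE_MAX);
}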
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h deleted file mode 100644 index a8706f08ab36..000000000000 --- a/include/linux/drbd_nl.h +++ /dev/null | |||
@@ -1,163 +0,0 @@ | |||
1 | /* | ||
2 | PAKET( name, | ||
3 | TYPE ( pn, pr, member ) | ||
4 | ... | ||
5 | ) | ||
6 | |||
7 | You may never reissue one of the pn arguments | ||
8 | */ | ||
9 | |||
10 | #if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) | ||
11 | #error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" | ||
12 | #endif | ||
13 | |||
14 | NL_PACKET(primary, 1, | ||
15 | NL_BIT( 1, T_MAY_IGNORE, primary_force) | ||
16 | ) | ||
17 | |||
18 | NL_PACKET(secondary, 2, ) | ||
19 | |||
20 | NL_PACKET(disk_conf, 3, | ||
21 | NL_INT64( 2, T_MAY_IGNORE, disk_size) | ||
22 | NL_STRING( 3, T_MANDATORY, backing_dev, 128) | ||
23 | NL_STRING( 4, T_MANDATORY, meta_dev, 128) | ||
24 | NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) | ||
25 | NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) | ||
26 | NL_INTEGER( 7, T_MAY_IGNORE, fencing) | ||
27 | NL_BIT( 37, T_MAY_IGNORE, use_bmbv) | ||
28 | NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) | ||
29 | NL_BIT( 54, T_MAY_IGNORE, no_md_flush) | ||
30 | /* 55 max_bio_size was available in 8.2.6rc2 */ | ||
31 | NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) | ||
32 | NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) | ||
33 | NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) | ||
34 | NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) | ||
35 | ) | ||
36 | |||
37 | NL_PACKET(detach, 4, | ||
38 | NL_BIT( 88, T_MANDATORY, detach_force) | ||
39 | ) | ||
40 | |||
41 | NL_PACKET(net_conf, 5, | ||
42 | NL_STRING( 8, T_MANDATORY, my_addr, 128) | ||
43 | NL_STRING( 9, T_MANDATORY, peer_addr, 128) | ||
44 | NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) | ||
45 | NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) | ||
46 | NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) | ||
47 | NL_INTEGER( 14, T_MAY_IGNORE, timeout) | ||
48 | NL_INTEGER( 15, T_MANDATORY, wire_protocol) | ||
49 | NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) | ||
50 | NL_INTEGER( 17, T_MAY_IGNORE, ping_int) | ||
51 | NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) | ||
52 | NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) | ||
53 | NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) | ||
54 | NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) | ||
55 | NL_INTEGER( 22, T_MAY_IGNORE, ko_count) | ||
56 | NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) | ||
57 | NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) | ||
58 | NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) | ||
59 | NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) | ||
60 | NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) | ||
61 | NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) | ||
62 | NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) | ||
63 | NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) | ||
64 | NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) | ||
65 | /* 59 addr_family was available in GIT, never released */ | ||
66 | NL_BIT( 60, T_MANDATORY, mind_af) | ||
67 | NL_BIT( 27, T_MAY_IGNORE, want_lose) | ||
68 | NL_BIT( 28, T_MAY_IGNORE, two_primaries) | ||
69 | NL_BIT( 41, T_MAY_IGNORE, always_asbp) | ||
70 | NL_BIT( 61, T_MAY_IGNORE, no_cork) | ||
71 | NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) | ||
72 | NL_BIT( 70, T_MANDATORY, dry_run) | ||
73 | ) | ||
74 | |||
75 | NL_PACKET(disconnect, 6, | ||
76 | NL_BIT( 84, T_MAY_IGNORE, force) | ||
77 | ) | ||
78 | |||
79 | NL_PACKET(resize, 7, | ||
80 | NL_INT64( 29, T_MAY_IGNORE, resize_size) | ||
81 | NL_BIT( 68, T_MAY_IGNORE, resize_force) | ||
82 | NL_BIT( 69, T_MANDATORY, no_resync) | ||
83 | ) | ||
84 | |||
85 | NL_PACKET(syncer_conf, 8, | ||
86 | NL_INTEGER( 30, T_MAY_IGNORE, rate) | ||
87 | NL_INTEGER( 31, T_MAY_IGNORE, after) | ||
88 | NL_INTEGER( 32, T_MAY_IGNORE, al_extents) | ||
89 | /* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) | ||
90 | * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) | ||
91 | * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) | ||
92 | * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) | ||
93 | * feature will be reimplemented differently with 8.3.9 */ | ||
94 | NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) | ||
95 | NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) | ||
96 | NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) | ||
97 | NL_BIT( 65, T_MAY_IGNORE, use_rle) | ||
98 | NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) | ||
99 | NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) | ||
100 | NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) | ||
101 | NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) | ||
102 | NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) | ||
103 | NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) | ||
104 | ) | ||
105 | |||
106 | NL_PACKET(invalidate, 9, ) | ||
107 | NL_PACKET(invalidate_peer, 10, ) | ||
108 | NL_PACKET(pause_sync, 11, ) | ||
109 | NL_PACKET(resume_sync, 12, ) | ||
110 | NL_PACKET(suspend_io, 13, ) | ||
111 | NL_PACKET(resume_io, 14, ) | ||
112 | NL_PACKET(outdate, 15, ) | ||
113 | NL_PACKET(get_config, 16, ) | ||
114 | NL_PACKET(get_state, 17, | ||
115 | NL_INTEGER( 33, T_MAY_IGNORE, state_i) | ||
116 | ) | ||
117 | |||
118 | NL_PACKET(get_uuids, 18, | ||
119 | NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) | ||
120 | NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) | ||
121 | ) | ||
122 | |||
123 | NL_PACKET(get_timeout_flag, 19, | ||
124 | NL_BIT( 36, T_MAY_IGNORE, use_degraded) | ||
125 | ) | ||
126 | |||
127 | NL_PACKET(call_helper, 20, | ||
128 | NL_STRING( 38, T_MAY_IGNORE, helper, 32) | ||
129 | ) | ||
130 | |||
131 | /* Tag nr 42 already allocated in drbd-8.1 development. */ | ||
132 | |||
133 | NL_PACKET(sync_progress, 23, | ||
134 | NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) | ||
135 | ) | ||
136 | |||
137 | NL_PACKET(dump_ee, 24, | ||
138 | NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) | ||
139 | NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) | ||
140 | NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) | ||
141 | NL_INT64( 48, T_MAY_IGNORE, ee_sector) | ||
142 | NL_INT64( 49, T_MAY_IGNORE, ee_block_id) | ||
143 | NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) | ||
144 | ) | ||
145 | |||
146 | NL_PACKET(start_ov, 25, | ||
147 | NL_INT64( 66, T_MAY_IGNORE, start_sector) | ||
148 | ) | ||
149 | |||
150 | NL_PACKET(new_c_uuid, 26, | ||
151 | NL_BIT( 63, T_MANDATORY, clear_bm) | ||
152 | ) | ||
153 | |||
154 | #ifdef NL_RESPONSE | ||
155 | NL_RESPONSE(return_code_only, 27) | ||
156 | #endif | ||
157 | |||
158 | #undef NL_PACKET | ||
159 | #undef NL_INTEGER | ||
160 | #undef NL_INT64 | ||
161 | #undef NL_BIT | ||
162 | #undef NL_STRING | ||
163 | #undef NL_RESPONSE | ||
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h deleted file mode 100644 index 82de1f9e48b1..000000000000 --- a/include/linux/drbd_tag_magic.h +++ /dev/null | |||
@@ -1,84 +0,0 @@ | |||
1 | #ifndef DRBD_TAG_MAGIC_H | ||
2 | #define DRBD_TAG_MAGIC_H | ||
3 | |||
4 | #define TT_END 0 | ||
5 | #define TT_REMOVED 0xE000 | ||
6 | |||
7 | /* declare packet_type enums */ | ||
8 | enum packet_types { | ||
9 | #define NL_PACKET(name, number, fields) P_ ## name = number, | ||
10 | #define NL_RESPONSE(name, number) P_ ## name = number, | ||
11 | #define NL_INTEGER(pn, pr, member) | ||
12 | #define NL_INT64(pn, pr, member) | ||
13 | #define NL_BIT(pn, pr, member) | ||
14 | #define NL_STRING(pn, pr, member, len) | ||
15 | #include <linux/drbd_nl.h> | ||
16 | P_nl_after_last_packet, | ||
17 | }; | ||
18 | |||
19 | /* These struct are used to deduce the size of the tag lists: */ | ||
20 | #define NL_PACKET(name, number, fields) \ | ||
21 | struct name ## _tag_len_struct { fields }; | ||
22 | #define NL_INTEGER(pn, pr, member) \ | ||
23 | int member; int tag_and_len ## member; | ||
24 | #define NL_INT64(pn, pr, member) \ | ||
25 | __u64 member; int tag_and_len ## member; | ||
26 | #define NL_BIT(pn, pr, member) \ | ||
27 | unsigned char member:1; int tag_and_len ## member; | ||
28 | #define NL_STRING(pn, pr, member, len) \ | ||
29 | unsigned char member[len]; int member ## _len; \ | ||
30 | int tag_and_len ## member; | ||
31 | #include <linux/drbd_nl.h> | ||
32 | |||
33 | /* declare tag-list-sizes */ | ||
34 | static const int tag_list_sizes[] = { | ||
35 | #define NL_PACKET(name, number, fields) 2 fields , | ||
36 | #define NL_INTEGER(pn, pr, member) + 4 + 4 | ||
37 | #define NL_INT64(pn, pr, member) + 4 + 8 | ||
38 | #define NL_BIT(pn, pr, member) + 4 + 1 | ||
39 | #define NL_STRING(pn, pr, member, len) + 4 + (len) | ||
40 | #include <linux/drbd_nl.h> | ||
41 | }; | ||
42 | |||
43 | /* The two highest bits are used for the tag type */ | ||
44 | #define TT_MASK 0xC000 | ||
45 | #define TT_INTEGER 0x0000 | ||
46 | #define TT_INT64 0x4000 | ||
47 | #define TT_BIT 0x8000 | ||
48 | #define TT_STRING 0xC000 | ||
49 | /* The next bit indicates if processing of the tag is mandatory */ | ||
50 | #define T_MANDATORY 0x2000 | ||
51 | #define T_MAY_IGNORE 0x0000 | ||
52 | #define TN_MASK 0x1fff | ||
53 | /* The remaining 13 bits are used to enumerate the tags */ | ||
54 | |||
55 | #define tag_type(T) ((T) & TT_MASK) | ||
56 | #define tag_number(T) ((T) & TN_MASK) | ||
57 | |||
58 | /* declare tag enums */ | ||
59 | #define NL_PACKET(name, number, fields) fields | ||
60 | enum drbd_tags { | ||
61 | #define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , | ||
62 | #define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , | ||
63 | #define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , | ||
64 | #define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , | ||
65 | #include <linux/drbd_nl.h> | ||
66 | }; | ||
67 | |||
68 | struct tag { | ||
69 | const char *name; | ||
70 | int type_n_flags; | ||
71 | int max_len; | ||
72 | }; | ||
73 | |||
74 | /* declare tag names */ | ||
75 | #define NL_PACKET(name, number, fields) fields | ||
76 | static const struct tag tag_descriptions[] = { | ||
77 | #define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, | ||
78 | #define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, | ||
79 | #define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, | ||
80 | #define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, | ||
81 | #include <linux/drbd_nl.h> | ||
82 | }; | ||
83 | |||
84 | #endif | ||
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4f440b3e89fe..79b8bba19363 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
@@ -88,10 +88,14 @@ struct disk_stats { | |||
88 | }; | 88 | }; |
89 | 89 | ||
90 | #define PARTITION_META_INFO_VOLNAMELTH 64 | 90 | #define PARTITION_META_INFO_VOLNAMELTH 64 |
91 | #define PARTITION_META_INFO_UUIDLTH 16 | 91 | /* |
92 | * Enough for the string representation of any kind of UUID plus NULL. | ||
93 | * EFI UUID is 36 characters. MSDOS UUID is 11 characters. | ||
94 | */ | ||
95 | #define PARTITION_META_INFO_UUIDLTH 37 | ||
92 | 96 | ||
93 | struct partition_meta_info { | 97 | struct partition_meta_info { |
94 | u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ | 98 | char uuid[PARTITION_META_INFO_UUIDLTH]; |
95 | u8 volname[PARTITION_META_INFO_VOLNAMELTH]; | 99 | u8 volname[PARTITION_META_INFO_VOLNAMELTH]; |
96 | }; | 100 | }; |
97 | 101 | ||
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h new file mode 100644 index 000000000000..023bc346b877 --- /dev/null +++ b/include/linux/genl_magic_func.h | |||
@@ -0,0 +1,422 @@ | |||
1 | #ifndef GENL_MAGIC_FUNC_H | ||
2 | #define GENL_MAGIC_FUNC_H | ||
3 | |||
4 | #include <linux/genl_magic_struct.h> | ||
5 | |||
6 | /* | ||
7 | * Magic: declare tla policy {{{1 | ||
8 | * Magic: declare nested policies | ||
9 | * {{{2 | ||
10 | */ | ||
11 | #undef GENL_mc_group | ||
12 | #define GENL_mc_group(group) | ||
13 | |||
14 | #undef GENL_notification | ||
15 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
16 | |||
17 | #undef GENL_op | ||
18 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
19 | |||
20 | #undef GENL_struct | ||
21 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
22 | [tag_name] = { .type = NLA_NESTED }, | ||
23 | |||
24 | static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = { | ||
25 | #include GENL_MAGIC_INCLUDE_FILE | ||
26 | }; | ||
27 | |||
28 | #undef GENL_struct | ||
29 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
30 | static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ | ||
31 | { s_fields }; | ||
32 | |||
33 | #undef __field | ||
34 | #define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ | ||
35 | __put, __is_signed) \ | ||
36 | [attr_nr] = { .type = nla_type }, | ||
37 | |||
38 | #undef __array | ||
39 | #define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ | ||
40 | __get, __put, __is_signed) \ | ||
41 | [attr_nr] = { .type = nla_type, \ | ||
42 | .len = maxlen - (nla_type == NLA_NUL_STRING) }, | ||
43 | |||
44 | #include GENL_MAGIC_INCLUDE_FILE | ||
45 | |||
46 | #ifndef __KERNEL__ | ||
47 | #ifndef pr_info | ||
48 | #define pr_info(args...) fprintf(stderr, args); | ||
49 | #endif | ||
50 | #endif | ||
51 | |||
52 | #ifdef GENL_MAGIC_DEBUG | ||
53 | static void dprint_field(const char *dir, int nla_type, | ||
54 | const char *name, void *valp) | ||
55 | { | ||
56 | __u64 val = valp ? *(__u32 *)valp : 1; | ||
57 | switch (nla_type) { | ||
58 | case NLA_U8: val = (__u8)val; | ||
59 | case NLA_U16: val = (__u16)val; | ||
60 | case NLA_U32: val = (__u32)val; | ||
61 | pr_info("%s attr %s: %d 0x%08x\n", dir, | ||
62 | name, (int)val, (unsigned)val); | ||
63 | break; | ||
64 | case NLA_U64: | ||
65 | val = *(__u64*)valp; | ||
66 | pr_info("%s attr %s: %lld 0x%08llx\n", dir, | ||
67 | name, (long long)val, (unsigned long long)val); | ||
68 | break; | ||
69 | case NLA_FLAG: | ||
70 | if (val) | ||
71 | pr_info("%s attr %s: set\n", dir, name); | ||
72 | break; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static void dprint_array(const char *dir, int nla_type, | ||
77 | const char *name, const char *val, unsigned len) | ||
78 | { | ||
79 | switch (nla_type) { | ||
80 | case NLA_NUL_STRING: | ||
81 | if (len && val[len-1] == '\0') | ||
82 | len--; | ||
83 | pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); | ||
84 | break; | ||
85 | default: | ||
86 | /* we can always show 4 bytes, | ||
87 | * that's what nlattrs are aligned to. */ | ||
88 | pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", | ||
89 | dir, name, len, val[0], val[1], val[2], val[3]); | ||
90 | } | ||
91 | } | ||
92 | |||
93 | #define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); | ||
94 | |||
95 | /* Name is a member field name of the struct s. | ||
96 | * If s is NULL (only parsing, no copy requested in *_from_attrs()), | ||
97 | * nla is supposed to point to the attribute containing the information | ||
98 | * corresponding to that struct member. */ | ||
99 | #define DPRINT_FIELD(dir, nla_type, name, s, nla) \ | ||
100 | do { \ | ||
101 | if (s) \ | ||
102 | dprint_field(dir, nla_type, #name, &s->name); \ | ||
103 | else if (nla) \ | ||
104 | dprint_field(dir, nla_type, #name, \ | ||
105 | (nla_type == NLA_FLAG) ? NULL \ | ||
106 | : nla_data(nla)); \ | ||
107 | } while (0) | ||
108 | |||
109 | #define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ | ||
110 | do { \ | ||
111 | if (s) \ | ||
112 | dprint_array(dir, nla_type, #name, \ | ||
113 | s->name, s->name ## _len); \ | ||
114 | else if (nla) \ | ||
115 | dprint_array(dir, nla_type, #name, \ | ||
116 | nla_data(nla), nla_len(nla)); \ | ||
117 | } while (0) | ||
118 | #else | ||
119 | #define DPRINT_TLA(a, op, b) do {} while (0) | ||
120 | #define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) | ||
121 | #define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) | ||
122 | #endif | ||
123 | |||
124 | /* | ||
125 | * Magic: provide conversion functions {{{1 | ||
126 | * populate struct from attribute table: | ||
127 | * {{{2 | ||
128 | */ | ||
129 | |||
130 | /* processing of generic netlink messages is serialized. | ||
131 | * use one static buffer for parsing of nested attributes */ | ||
132 | static struct nlattr *nested_attr_tb[128]; | ||
133 | |||
134 | #ifndef BUILD_BUG_ON | ||
135 | /* Force a compilation error if condition is true */ | ||
136 | #define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) | ||
137 | /* Force a compilation error if condition is true, but also produce a | ||
138 | result (of value 0 and type size_t), so the expression can be used | ||
139 | e.g. in a structure initializer (or where-ever else comma expressions | ||
140 | aren't permitted). */ | ||
141 | #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) | ||
142 | #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) | ||
143 | #endif | ||
144 | |||
145 | #undef GENL_struct | ||
146 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
147 | /* *_from_attrs functions are static, but potentially unused */ \ | ||
148 | static int __ ## s_name ## _from_attrs(struct s_name *s, \ | ||
149 | struct genl_info *info, bool exclude_invariants) \ | ||
150 | { \ | ||
151 | const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ | ||
152 | struct nlattr *tla = info->attrs[tag_number]; \ | ||
153 | struct nlattr **ntb = nested_attr_tb; \ | ||
154 | struct nlattr *nla; \ | ||
155 | int err; \ | ||
156 | BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ | ||
157 | if (!tla) \ | ||
158 | return -ENOMSG; \ | ||
159 | DPRINT_TLA(#s_name, "<=-", #tag_name); \ | ||
160 | err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ | ||
161 | if (err) \ | ||
162 | return err; \ | ||
163 | \ | ||
164 | s_fields \ | ||
165 | return 0; \ | ||
166 | } __attribute__((unused)) \ | ||
167 | static int s_name ## _from_attrs(struct s_name *s, \ | ||
168 | struct genl_info *info) \ | ||
169 | { \ | ||
170 | return __ ## s_name ## _from_attrs(s, info, false); \ | ||
171 | } __attribute__((unused)) \ | ||
172 | static int s_name ## _from_attrs_for_change(struct s_name *s, \ | ||
173 | struct genl_info *info) \ | ||
174 | { \ | ||
175 | return __ ## s_name ## _from_attrs(s, info, true); \ | ||
176 | } __attribute__((unused)) \ | ||
177 | |||
178 | #define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \ | ||
179 | nla = ntb[attr_nr]; \ | ||
180 | if (nla) { \ | ||
181 | if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ | ||
182 | pr_info("<< must not change invariant attr: %s\n", #name); \ | ||
183 | return -EEXIST; \ | ||
184 | } \ | ||
185 | assignment; \ | ||
186 | } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ | ||
187 | /* attribute missing from payload, */ \ | ||
188 | /* which was expected */ \ | ||
189 | } else if ((attr_flag) & DRBD_F_REQUIRED) { \ | ||
190 | pr_info("<< missing attr: %s\n", #name); \ | ||
191 | return -ENOMSG; \ | ||
192 | } | ||
193 | |||
194 | #undef __field | ||
195 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
196 | __is_signed) \ | ||
197 | __assign(attr_nr, attr_flag, name, nla_type, type, \ | ||
198 | if (s) \ | ||
199 | s->name = __get(nla); \ | ||
200 | DPRINT_FIELD("<<", nla_type, name, s, nla)) | ||
201 | |||
202 | /* validate_nla() already checked nla_len <= maxlen appropriately. */ | ||
203 | #undef __array | ||
204 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
205 | __get, __put, __is_signed) \ | ||
206 | __assign(attr_nr, attr_flag, name, nla_type, type, \ | ||
207 | if (s) \ | ||
208 | s->name ## _len = \ | ||
209 | __get(s->name, nla, maxlen); \ | ||
210 | DPRINT_ARRAY("<<", nla_type, name, s, nla)) | ||
211 | |||
212 | #include GENL_MAGIC_INCLUDE_FILE | ||
213 | |||
214 | #undef GENL_struct | ||
215 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
216 | |||
217 | /* | ||
218 | * Magic: define op number to op name mapping {{{1 | ||
219 | * {{{2 | ||
220 | */ | ||
221 | const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) | ||
222 | { | ||
223 | switch (cmd) { | ||
224 | #undef GENL_op | ||
225 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
226 | case op_num: return #op_name; | ||
227 | #include GENL_MAGIC_INCLUDE_FILE | ||
228 | default: | ||
229 | return "unknown"; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | #ifdef __KERNEL__ | ||
234 | #include <linux/stringify.h> | ||
235 | /* | ||
236 | * Magic: define genl_ops {{{1 | ||
237 | * {{{2 | ||
238 | */ | ||
239 | |||
240 | #undef GENL_op | ||
241 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
242 | { \ | ||
243 | handler \ | ||
244 | .cmd = op_name, \ | ||
245 | .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ | ||
246 | }, | ||
247 | |||
248 | #define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) | ||
249 | static struct genl_ops ZZZ_genl_ops[] __read_mostly = { | ||
250 | #include GENL_MAGIC_INCLUDE_FILE | ||
251 | }; | ||
252 | |||
253 | #undef GENL_op | ||
254 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
255 | |||
256 | /* | ||
257 | * Define the genl_family, multicast groups, {{{1 | ||
258 | * and provide register/unregister functions. | ||
259 | * {{{2 | ||
260 | */ | ||
261 | #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) | ||
262 | static struct genl_family ZZZ_genl_family __read_mostly = { | ||
263 | .id = GENL_ID_GENERATE, | ||
264 | .name = __stringify(GENL_MAGIC_FAMILY), | ||
265 | .version = GENL_MAGIC_VERSION, | ||
266 | #ifdef GENL_MAGIC_FAMILY_HDRSZ | ||
267 | .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), | ||
268 | #endif | ||
269 | .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, | ||
270 | }; | ||
271 | |||
272 | /* | ||
273 | * Magic: define multicast groups | ||
274 | * Magic: define multicast group registration helper | ||
275 | */ | ||
276 | #undef GENL_mc_group | ||
277 | #define GENL_mc_group(group) \ | ||
278 | static struct genl_multicast_group \ | ||
279 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ | ||
280 | .name = #group, \ | ||
281 | }; \ | ||
282 | static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ | ||
283 | struct sk_buff *skb, gfp_t flags) \ | ||
284 | { \ | ||
285 | unsigned int group_id = \ | ||
286 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ | ||
287 | if (!group_id) \ | ||
288 | return -EINVAL; \ | ||
289 | return genlmsg_multicast(skb, 0, group_id, flags); \ | ||
290 | } | ||
291 | |||
292 | #include GENL_MAGIC_INCLUDE_FILE | ||
293 | |||
294 | int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) | ||
295 | { | ||
296 | int err = genl_register_family_with_ops(&ZZZ_genl_family, | ||
297 | ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); | ||
298 | if (err) | ||
299 | return err; | ||
300 | #undef GENL_mc_group | ||
301 | #define GENL_mc_group(group) \ | ||
302 | err = genl_register_mc_group(&ZZZ_genl_family, \ | ||
303 | &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ | ||
304 | if (err) \ | ||
305 | goto fail; \ | ||
306 | else \ | ||
307 | pr_info("%s: mcg %s: %u\n", #group, \ | ||
308 | __stringify(GENL_MAGIC_FAMILY), \ | ||
309 | CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); | ||
310 | |||
311 | #include GENL_MAGIC_INCLUDE_FILE | ||
312 | |||
313 | #undef GENL_mc_group | ||
314 | #define GENL_mc_group(group) | ||
315 | return 0; | ||
316 | fail: | ||
317 | genl_unregister_family(&ZZZ_genl_family); | ||
318 | return err; | ||
319 | } | ||
320 | |||
321 | void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) | ||
322 | { | ||
323 | genl_unregister_family(&ZZZ_genl_family); | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Magic: provide conversion functions {{{1 | ||
328 | * populate skb from struct. | ||
329 | * {{{2 | ||
330 | */ | ||
331 | |||
332 | #undef GENL_op | ||
333 | #define GENL_op(op_name, op_num, handler, tla_list) | ||
334 | |||
335 | #undef GENL_struct | ||
336 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
337 | static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ | ||
338 | const bool exclude_sensitive) \ | ||
339 | { \ | ||
340 | struct nlattr *tla = nla_nest_start(skb, tag_number); \ | ||
341 | if (!tla) \ | ||
342 | goto nla_put_failure; \ | ||
343 | DPRINT_TLA(#s_name, "-=>", #tag_name); \ | ||
344 | s_fields \ | ||
345 | nla_nest_end(skb, tla); \ | ||
346 | return 0; \ | ||
347 | \ | ||
348 | nla_put_failure: \ | ||
349 | if (tla) \ | ||
350 | nla_nest_cancel(skb, tla); \ | ||
351 | return -EMSGSIZE; \ | ||
352 | } \ | ||
353 | static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ | ||
354 | struct s_name *s) \ | ||
355 | { \ | ||
356 | return s_name ## _to_skb(skb, s, 0); \ | ||
357 | } \ | ||
358 | static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ | ||
359 | struct s_name *s) \ | ||
360 | { \ | ||
361 | return s_name ## _to_skb(skb, s, 1); \ | ||
362 | } | ||
363 | |||
364 | |||
365 | #undef __field | ||
366 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
367 | __is_signed) \ | ||
368 | if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ | ||
369 | DPRINT_FIELD(">>", nla_type, name, s, NULL); \ | ||
370 | if (__put(skb, attr_nr, s->name)) \ | ||
371 | goto nla_put_failure; \ | ||
372 | } | ||
373 | |||
374 | #undef __array | ||
375 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
376 | __get, __put, __is_signed) \ | ||
377 | if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ | ||
378 | DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ | ||
379 | if (__put(skb, attr_nr, min_t(int, maxlen, \ | ||
380 | s->name ## _len + (nla_type == NLA_NUL_STRING)),\ | ||
381 | s->name)) \ | ||
382 | goto nla_put_failure; \ | ||
383 | } | ||
384 | |||
385 | #include GENL_MAGIC_INCLUDE_FILE | ||
386 | |||
387 | |||
388 | /* Functions for initializing structs to default values. */ | ||
389 | |||
390 | #undef __field | ||
391 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
392 | __is_signed) | ||
393 | #undef __array | ||
394 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
395 | __get, __put, __is_signed) | ||
396 | #undef __u32_field_def | ||
397 | #define __u32_field_def(attr_nr, attr_flag, name, default) \ | ||
398 | x->name = default; | ||
399 | #undef __s32_field_def | ||
400 | #define __s32_field_def(attr_nr, attr_flag, name, default) \ | ||
401 | x->name = default; | ||
402 | #undef __flg_field_def | ||
403 | #define __flg_field_def(attr_nr, attr_flag, name, default) \ | ||
404 | x->name = default; | ||
405 | #undef __str_field_def | ||
406 | #define __str_field_def(attr_nr, attr_flag, name, maxlen) \ | ||
407 | memset(x->name, 0, sizeof(x->name)); \ | ||
408 | x->name ## _len = 0; | ||
409 | #undef GENL_struct | ||
410 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
411 | static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ | ||
412 | static void set_ ## s_name ## _defaults(struct s_name *x) { \ | ||
413 | s_fields \ | ||
414 | } | ||
415 | |||
416 | #include GENL_MAGIC_INCLUDE_FILE | ||
417 | |||
418 | #endif /* __KERNEL__ */ | ||
419 | |||
420 | /* }}}1 */ | ||
421 | #endif /* GENL_MAGIC_FUNC_H */ | ||
422 | /* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ | ||
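Including genl_magic_func.h with a suitable GENL_MAGIC_INCLUDE_FILE generates, per GENL_struct(), an nla_policy plus <struct>_from_attrs(), <struct>_to_skb() and set_<struct>_defaults() helpers. A hedged sketch of how a doit handler might use them, assuming some definition file declared GENL_struct(MY_NLA_CFG, 1, my_cfg, ...) — all names here are invented:

static int my_adm_set(struct sk_buff *skb, struct genl_info *info)
{
	struct my_cfg cfg;
	int err;

	set_my_cfg_defaults(&cfg);		/* start from the declared defaults */
	err = my_cfg_from_attrs(&cfg, info);	/* parse the nested attributes */
	if (err)
		return err;
	/* ... apply cfg to the object identified by info ... */
	return 0;
}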
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h new file mode 100644 index 000000000000..eecd19b37001 --- /dev/null +++ b/include/linux/genl_magic_struct.h | |||
@@ -0,0 +1,277 @@ | |||
1 | #ifndef GENL_MAGIC_STRUCT_H | ||
2 | #define GENL_MAGIC_STRUCT_H | ||
3 | |||
4 | #ifndef GENL_MAGIC_FAMILY | ||
5 | # error "you need to define GENL_MAGIC_FAMILY before inclusion" | ||
6 | #endif | ||
7 | |||
8 | #ifndef GENL_MAGIC_VERSION | ||
9 | # error "you need to define GENL_MAGIC_VERSION before inclusion" | ||
10 | #endif | ||
11 | |||
12 | #ifndef GENL_MAGIC_INCLUDE_FILE | ||
13 | # error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" | ||
14 | #endif | ||
15 | |||
16 | #include <linux/genetlink.h> | ||
17 | #include <linux/types.h> | ||
18 | |||
19 | #define CONCAT__(a,b) a ## b | ||
20 | #define CONCAT_(a,b) CONCAT__(a,b) | ||
21 | |||
22 | extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); | ||
23 | extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void); | ||
24 | |||
25 | /* | ||
26 | * Extension of genl attribute validation policies {{{2 | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not | ||
31 | * know about. This flag can be set in nlattr->nla_type to indicate that this | ||
32 | * attribute must not be ignored. | ||
33 | * | ||
34 | * We check and remove this flag in drbd_nla_check_mandatory() before | ||
35 | * validating the attribute types and lengths via nla_parse_nested(). | ||
36 | */ | ||
37 | #define DRBD_GENLA_F_MANDATORY (1 << 14) | ||
38 | |||
39 | /* | ||
40 | * Flags specific to drbd and not visible at the netlink layer, used in | ||
41 | * <struct>_from_attrs and <struct>_to_skb: | ||
42 | * | ||
43 | * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is | ||
44 | * invalid. | ||
45 | * | ||
46 | * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be | ||
47 | * included in unprivileged get requests or broadcasts. | ||
48 | * | ||
49 | * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but | ||
50 | * cannot subsequently be changed. | ||
51 | */ | ||
52 | #define DRBD_F_REQUIRED (1 << 0) | ||
53 | #define DRBD_F_SENSITIVE (1 << 1) | ||
54 | #define DRBD_F_INVARIANT (1 << 2) | ||
55 | |||
56 | #define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) | ||
57 | |||
58 | /* }}}1 | ||
59 | * MAGIC | ||
60 | * multi-include macro expansion magic starts here | ||
61 | */ | ||
62 | |||
63 | /* MAGIC helpers {{{2 */ | ||
64 | |||
65 | /* possible field types */ | ||
66 | #define __flg_field(attr_nr, attr_flag, name) \ | ||
67 | __field(attr_nr, attr_flag, name, NLA_U8, char, \ | ||
68 | nla_get_u8, nla_put_u8, false) | ||
69 | #define __u8_field(attr_nr, attr_flag, name) \ | ||
70 | __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ | ||
71 | nla_get_u8, nla_put_u8, false) | ||
72 | #define __u16_field(attr_nr, attr_flag, name) \ | ||
73 | __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ | ||
74 | nla_get_u16, nla_put_u16, false) | ||
75 | #define __u32_field(attr_nr, attr_flag, name) \ | ||
76 | __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ | ||
77 | nla_get_u32, nla_put_u32, false) | ||
78 | #define __s32_field(attr_nr, attr_flag, name) \ | ||
79 | __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ | ||
80 | nla_get_u32, nla_put_u32, true) | ||
81 | #define __u64_field(attr_nr, attr_flag, name) \ | ||
82 | __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ | ||
83 | nla_get_u64, nla_put_u64, false) | ||
84 | #define __str_field(attr_nr, attr_flag, name, maxlen) \ | ||
85 | __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ | ||
86 | nla_strlcpy, nla_put, false) | ||
87 | #define __bin_field(attr_nr, attr_flag, name, maxlen) \ | ||
88 | __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ | ||
89 | nla_memcpy, nla_put, false) | ||
90 | |||
91 | /* fields with default values */ | ||
92 | #define __flg_field_def(attr_nr, attr_flag, name, default) \ | ||
93 | __flg_field(attr_nr, attr_flag, name) | ||
94 | #define __u32_field_def(attr_nr, attr_flag, name, default) \ | ||
95 | __u32_field(attr_nr, attr_flag, name) | ||
96 | #define __s32_field_def(attr_nr, attr_flag, name, default) \ | ||
97 | __s32_field(attr_nr, attr_flag, name) | ||
98 | #define __str_field_def(attr_nr, attr_flag, name, maxlen) \ | ||
99 | __str_field(attr_nr, attr_flag, name, maxlen) | ||
100 | |||
101 | #define GENL_op_init(args...) args | ||
102 | #define GENL_doit(handler) \ | ||
103 | .doit = handler, \ | ||
104 | .flags = GENL_ADMIN_PERM, | ||
105 | #define GENL_dumpit(handler) \ | ||
106 | .dumpit = handler, \ | ||
107 | .flags = GENL_ADMIN_PERM, | ||
108 | |||
109 | /* }}}1 | ||
110 | * Magic: define the enum symbols for genl_ops | ||
111 | * Magic: define the enum symbols for top level attributes | ||
112 | * Magic: define the enum symbols for nested attributes | ||
113 | * {{{2 | ||
114 | */ | ||
115 | |||
116 | #undef GENL_struct | ||
117 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
118 | |||
119 | #undef GENL_mc_group | ||
120 | #define GENL_mc_group(group) | ||
121 | |||
122 | #undef GENL_notification | ||
123 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) \ | ||
124 | op_name = op_num, | ||
125 | |||
126 | #undef GENL_op | ||
127 | #define GENL_op(op_name, op_num, handler, tla_list) \ | ||
128 | op_name = op_num, | ||
129 | |||
130 | enum { | ||
131 | #include GENL_MAGIC_INCLUDE_FILE | ||
132 | }; | ||
133 | |||
134 | #undef GENL_notification | ||
135 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
136 | |||
137 | #undef GENL_op | ||
138 | #define GENL_op(op_name, op_num, handler, attr_list) | ||
139 | |||
140 | #undef GENL_struct | ||
141 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
142 | tag_name = tag_number, | ||
143 | |||
144 | enum { | ||
145 | #include GENL_MAGIC_INCLUDE_FILE | ||
146 | }; | ||
147 | |||
148 | #undef GENL_struct | ||
149 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
150 | enum { \ | ||
151 | s_fields \ | ||
152 | }; | ||
153 | |||
154 | #undef __field | ||
155 | #define __field(attr_nr, attr_flag, name, nla_type, type, \ | ||
156 | __get, __put, __is_signed) \ | ||
157 | T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), | ||
158 | |||
159 | #undef __array | ||
160 | #define __array(attr_nr, attr_flag, name, nla_type, type, \ | ||
161 | maxlen, __get, __put, __is_signed) \ | ||
162 | T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), | ||
163 | |||
164 | #include GENL_MAGIC_INCLUDE_FILE | ||
165 | |||
166 | /* }}}1 | ||
167 | * Magic: compile time assert unique numbers for operations | ||
168 | * Magic: -"- unique numbers for top level attributes | ||
169 | * Magic: -"- unique numbers for nested attributes | ||
170 | * {{{2 | ||
171 | */ | ||
172 | |||
173 | #undef GENL_struct | ||
174 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) | ||
175 | |||
176 | #undef GENL_op | ||
177 | #define GENL_op(op_name, op_num, handler, attr_list) \ | ||
178 | case op_name: | ||
179 | |||
180 | #undef GENL_notification | ||
181 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) \ | ||
182 | case op_name: | ||
183 | |||
184 | static inline void ct_assert_unique_operations(void) | ||
185 | { | ||
186 | switch (0) { | ||
187 | #include GENL_MAGIC_INCLUDE_FILE | ||
188 | ; | ||
189 | } | ||
190 | } | ||
191 | |||
192 | #undef GENL_op | ||
193 | #define GENL_op(op_name, op_num, handler, attr_list) | ||
194 | |||
195 | #undef GENL_notification | ||
196 | #define GENL_notification(op_name, op_num, mcast_group, tla_list) | ||
197 | |||
198 | #undef GENL_struct | ||
199 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
200 | case tag_number: | ||
201 | |||
202 | static inline void ct_assert_unique_top_level_attributes(void) | ||
203 | { | ||
204 | switch (0) { | ||
205 | #include GENL_MAGIC_INCLUDE_FILE | ||
206 | ; | ||
207 | } | ||
208 | } | ||
209 | |||
210 | #undef GENL_struct | ||
211 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
212 | static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ | ||
213 | { \ | ||
214 | switch (0) { \ | ||
215 | s_fields \ | ||
216 | ; \ | ||
217 | } \ | ||
218 | } | ||
219 | |||
220 | #undef __field | ||
221 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
222 | __is_signed) \ | ||
223 | case attr_nr: | ||
224 | |||
225 | #undef __array | ||
226 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
227 | __get, __put, __is_signed) \ | ||
228 | case attr_nr: | ||
229 | |||
230 | #include GENL_MAGIC_INCLUDE_FILE | ||
231 | |||
232 | /* }}}1 | ||
233 | * Magic: declare structs | ||
234 | * struct <name> { | ||
235 | * fields | ||
236 | * }; | ||
237 | * {{{2 | ||
238 | */ | ||
239 | |||
240 | #undef GENL_struct | ||
241 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
242 | struct s_name { s_fields }; | ||
243 | |||
244 | #undef __field | ||
245 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
246 | __is_signed) \ | ||
247 | type name; | ||
248 | |||
249 | #undef __array | ||
250 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
251 | __get, __put, __is_signed) \ | ||
252 | type name[maxlen]; \ | ||
253 | __u32 name ## _len; | ||
254 | |||
255 | #include GENL_MAGIC_INCLUDE_FILE | ||
256 | |||
257 | #undef GENL_struct | ||
258 | #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ | ||
259 | enum { \ | ||
260 | s_fields \ | ||
261 | }; | ||
262 | |||
263 | #undef __field | ||
264 | #define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ | ||
265 | is_signed) \ | ||
266 | F_ ## name ## _IS_SIGNED = is_signed, | ||
267 | |||
268 | #undef __array | ||
269 | #define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ | ||
270 | __get, __put, is_signed) \ | ||
271 | F_ ## name ## _IS_SIGNED = is_signed, | ||
272 | |||
273 | #include GENL_MAGIC_INCLUDE_FILE | ||
274 | |||
275 | /* }}}1 */ | ||
276 | #endif /* GENL_MAGIC_STRUCT_H */ | ||
277 | /* vim: set foldmethod=marker nofoldenable : */ | ||
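The multi-include machinery above expects a definition file (drbd_genl.h is the in-tree user) that is pulled in repeatedly with different macro definitions in effect. A hedged sketch of what such a GENL_MAGIC_INCLUDE_FILE could contain — every name below is invented for illustration:

GENL_mc_group(example_events)

GENL_struct(EXAMPLE_NLA_CFG, 1, example_cfg,
	__u32_field_def(1, DRBD_F_REQUIRED, limit, 100)
	__flg_field_def(2, 0, enabled, 1)
	__str_field_def(3, 0, label, 32)
)

GENL_op(EXAMPLE_ADM_SET, 2, GENL_doit(example_adm_set), )

Depending on whether genl_magic_struct.h or genl_magic_func.h is included last, the same lines expand into struct example_cfg, the attribute enums, an nla_policy, or the conversion helpers.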
diff --git a/include/linux/idr.h b/include/linux/idr.h index 87259a44c251..de7e190f1af4 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h | |||
@@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id); | |||
152 | 152 | ||
153 | void __init idr_init_cache(void); | 153 | void __init idr_init_cache(void); |
154 | 154 | ||
155 | /** | ||
156 | * idr_for_each_entry - iterate over an idr's elements of a given type | ||
157 | * @idp: idr handle | ||
158 | * @entry: the type * to use as cursor | ||
159 | * @id: id entry's key | ||
160 | */ | ||
161 | #define idr_for_each_entry(idp, entry, id) \ | ||
162 | for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ | ||
163 | entry != NULL; \ | ||
164 | ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) | ||
165 | |||
155 | #endif /* __IDR_H__ */ | 166 | #endif /* __IDR_H__ */ |
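A hedged usage sketch for the new idr_for_each_entry() iterator; 'minors' and 'struct drbd_conf' stand in for whatever idr and element type a real caller tracks:

#include <linux/idr.h>
#include <linux/printk.h>

static DEFINE_IDR(minors);

static void dump_minors(void)
{
	struct drbd_conf *mdev;
	int minor;

	/* visits every allocated (id, pointer) pair in ascending id order */
	idr_for_each_entry(&minors, mdev, minor)
		pr_info("minor %d -> %p\n", minor, mdev);
}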
diff --git a/include/linux/loop.h b/include/linux/loop.h index 6492181bcb1d..460b60fa7adf 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h | |||
@@ -53,10 +53,13 @@ struct loop_device { | |||
53 | 53 | ||
54 | spinlock_t lo_lock; | 54 | spinlock_t lo_lock; |
55 | struct bio_list lo_bio_list; | 55 | struct bio_list lo_bio_list; |
56 | unsigned int lo_bio_count; | ||
56 | int lo_state; | 57 | int lo_state; |
57 | struct mutex lo_ctl_mutex; | 58 | struct mutex lo_ctl_mutex; |
58 | struct task_struct *lo_thread; | 59 | struct task_struct *lo_thread; |
59 | wait_queue_head_t lo_event; | 60 | wait_queue_head_t lo_event; |
61 | /* wait queue for incoming requests */ | ||
62 | wait_queue_head_t lo_req_wait; | ||
60 | 63 | ||
61 | struct request_queue *lo_queue; | 64 | struct request_queue *lo_queue; |
62 | struct gendisk *lo_disk; | 65 | struct gendisk *lo_disk; |
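The new lo_bio_count and lo_req_wait fields support the "limit the number of requests in the bio list" change from this pull: the producer can block once the list grows past some threshold and is woken when the worker drains it. A hedged sketch using the wait_event_lock_irq() helper added later in this series (LO_BIO_LIMIT is an illustrative constant, not the driver's actual threshold):

#define LO_BIO_LIMIT	128	/* illustrative threshold */

spin_lock_irq(&lo->lo_lock);
/* sleep with the lock dropped until the worker has drained the list */
wait_event_lock_irq(lo->lo_req_wait,
		    lo->lo_bio_count < LO_BIO_LIMIT,
		    lo->lo_lock);
bio_list_add(&lo->lo_bio_list, bio);
lo->lo_bio_count++;
spin_unlock_irq(&lo->lo_lock);

The consumer side would do the reverse: pop a bio, decrement lo_bio_count, and wake_up(&lo->lo_req_wait) once the count falls back under the limit.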
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index cafc7f99e124..4019013c6593 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h | |||
@@ -166,9 +166,11 @@ struct lc_element { | |||
166 | /* if we want to track a larger set of objects, | 166 | /* if we want to track a larger set of objects, |
167 | * it needs to become arch independent u64 */ | 167 | * it needs to become arch independent u64 */ |
168 | unsigned lc_number; | 168 | unsigned lc_number; |
169 | |||
170 | /* special label when on free list */ | 169 | /* special label when on free list */ |
171 | #define LC_FREE (~0U) | 170 | #define LC_FREE (~0U) |
171 | |||
172 | /* for pending changes */ | ||
173 | unsigned lc_new_number; | ||
172 | }; | 174 | }; |
173 | 175 | ||
174 | struct lru_cache { | 176 | struct lru_cache { |
@@ -176,6 +178,7 @@ struct lru_cache { | |||
176 | struct list_head lru; | 178 | struct list_head lru; |
177 | struct list_head free; | 179 | struct list_head free; |
178 | struct list_head in_use; | 180 | struct list_head in_use; |
181 | struct list_head to_be_changed; | ||
179 | 182 | ||
180 | /* the pre-created kmem cache to allocate the objects from */ | 183 | /* the pre-created kmem cache to allocate the objects from */ |
181 | struct kmem_cache *lc_cache; | 184 | struct kmem_cache *lc_cache; |
@@ -186,7 +189,7 @@ struct lru_cache { | |||
186 | size_t element_off; | 189 | size_t element_off; |
187 | 190 | ||
188 | /* number of elements (indices) */ | 191 | /* number of elements (indices) */ |
189 | unsigned int nr_elements; | 192 | unsigned int nr_elements; |
190 | /* Arbitrary limit on maximum tracked objects. Practical limit is much | 193 | /* Arbitrary limit on maximum tracked objects. Practical limit is much |
191 | * lower due to allocation failures, probably. For typical use cases, | 194 | * lower due to allocation failures, probably. For typical use cases, |
192 | * nr_elements should be a few thousand at most. | 195 | * nr_elements should be a few thousand at most. |
@@ -194,18 +197,19 @@ struct lru_cache { | |||
194 | * 8 high bits of .lc_index to be overloaded with flags in the future. */ | 197 | * 8 high bits of .lc_index to be overloaded with flags in the future. */ |
195 | #define LC_MAX_ACTIVE (1<<24) | 198 | #define LC_MAX_ACTIVE (1<<24) |
196 | 199 | ||
200 | /* allow to accumulate a few (index:label) changes, | ||
201 | * but no more than max_pending_changes */ | ||
202 | unsigned int max_pending_changes; | ||
203 | /* number of elements currently on to_be_changed list */ | ||
204 | unsigned int pending_changes; | ||
205 | |||
197 | /* statistics */ | 206 | /* statistics */ |
198 | unsigned used; /* number of lelements currently on in_use list */ | 207 | unsigned used; /* number of elements currently on in_use list */ |
199 | unsigned long hits, misses, starving, dirty, changed; | 208 | unsigned long hits, misses, starving, locked, changed; |
200 | 209 | ||
201 | /* see below: flag-bits for lru_cache */ | 210 | /* see below: flag-bits for lru_cache */ |
202 | unsigned long flags; | 211 | unsigned long flags; |
203 | 212 | ||
204 | /* when changing the label of an index element */ | ||
205 | unsigned int new_number; | ||
206 | |||
207 | /* for paranoia when changing the label of an index element */ | ||
208 | struct lc_element *changing_element; | ||
209 | 213 | ||
210 | void *lc_private; | 214 | void *lc_private; |
211 | const char *name; | 215 | const char *name; |
@@ -221,10 +225,15 @@ enum { | |||
221 | /* debugging aid, to catch concurrent access early. | 225 | /* debugging aid, to catch concurrent access early. |
222 | * user needs to guarantee exclusive access by proper locking! */ | 226 | * user needs to guarantee exclusive access by proper locking! */ |
223 | __LC_PARANOIA, | 227 | __LC_PARANOIA, |
224 | /* if we need to change the set, but currently there is a changing | 228 | |
225 | * transaction pending, we are "dirty", and must deferr further | 229 | /* annotate that the set is "dirty", possibly accumulating further |
226 | * changing requests */ | 230 | * changes, until a transaction is finally triggered */ |
227 | __LC_DIRTY, | 231 | __LC_DIRTY, |
232 | |||
233 | /* Locked, no further changes allowed. | ||
234 | * Also used to serialize changing transactions. */ | ||
235 | __LC_LOCKED, | ||
236 | |||
228 | /* if we need to change the set, but currently there is no free nor | 237 | /* if we need to change the set, but currently there is no free nor |
229 | * unused element available, we are "starving", and must not give out | 238 | * unused element available, we are "starving", and must not give out |
230 | * further references, to guarantee that eventually some refcnt will | 239 | * further references, to guarantee that eventually some refcnt will |
@@ -236,9 +245,11 @@ enum { | |||
236 | }; | 245 | }; |
237 | #define LC_PARANOIA (1<<__LC_PARANOIA) | 246 | #define LC_PARANOIA (1<<__LC_PARANOIA) |
238 | #define LC_DIRTY (1<<__LC_DIRTY) | 247 | #define LC_DIRTY (1<<__LC_DIRTY) |
248 | #define LC_LOCKED (1<<__LC_LOCKED) | ||
239 | #define LC_STARVING (1<<__LC_STARVING) | 249 | #define LC_STARVING (1<<__LC_STARVING) |
240 | 250 | ||
241 | extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | 251 | extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, |
252 | unsigned max_pending_changes, | ||
242 | unsigned e_count, size_t e_size, size_t e_off); | 253 | unsigned e_count, size_t e_size, size_t e_off); |
243 | extern void lc_reset(struct lru_cache *lc); | 254 | extern void lc_reset(struct lru_cache *lc); |
244 | extern void lc_destroy(struct lru_cache *lc); | 255 | extern void lc_destroy(struct lru_cache *lc); |
@@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); | |||
249 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); | 260 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); |
250 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); | 261 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); |
251 | extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); | 262 | extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); |
252 | extern void lc_changed(struct lru_cache *lc, struct lc_element *e); | 263 | extern void lc_committed(struct lru_cache *lc); |
253 | 264 | ||
254 | struct seq_file; | 265 | struct seq_file; |
255 | extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); | 266 | extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); |
@@ -258,32 +269,40 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char | |||
258 | void (*detail) (struct seq_file *, struct lc_element *)); | 269 | void (*detail) (struct seq_file *, struct lc_element *)); |
259 | 270 | ||
260 | /** | 271 | /** |
261 | * lc_try_lock - can be used to stop lc_get() from changing the tracked set | 272 | * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set |
262 | * @lc: the lru cache to operate on | 273 | * @lc: the lru cache to operate on |
263 | * | 274 | * |
264 | * Note that the reference counts and order on the active and lru lists may | 275 | * Allows (expects) the set to be "dirty". Note that the reference counts and |
265 | * still change. Returns true if we acquired the lock. | 276 | * order on the active and lru lists may still change. Used to serialize |
277 | * changing transactions. Returns true if we acquired the lock. | ||
266 | */ | 278 | */ |
267 | static inline int lc_try_lock(struct lru_cache *lc) | 279 | static inline int lc_try_lock_for_transaction(struct lru_cache *lc) |
268 | { | 280 | { |
269 | return !test_and_set_bit(__LC_DIRTY, &lc->flags); | 281 | return !test_and_set_bit(__LC_LOCKED, &lc->flags); |
270 | } | 282 | } |
271 | 283 | ||
272 | /** | 284 | /** |
285 | * lc_try_lock - variant to stop lc_get() from changing the tracked set | ||
286 | * @lc: the lru cache to operate on | ||
287 | * | ||
288 | * Note that the reference counts and order on the active and lru lists may | ||
289 | * still change. Only works on a "clean" set. Returns true if we acquired the | ||
290 | * lock, which means there are no pending changes, and any further attempt to | ||
291 | * change the set will not succeed until the next lc_unlock(). | ||
292 | */ | ||
293 | extern int lc_try_lock(struct lru_cache *lc); | ||
294 | |||
295 | /** | ||
273 | * lc_unlock - unlock @lc, allow lc_get() to change the set again | 296 | * lc_unlock - unlock @lc, allow lc_get() to change the set again |
274 | * @lc: the lru cache to operate on | 297 | * @lc: the lru cache to operate on |
275 | */ | 298 | */ |
276 | static inline void lc_unlock(struct lru_cache *lc) | 299 | static inline void lc_unlock(struct lru_cache *lc) |
277 | { | 300 | { |
278 | clear_bit(__LC_DIRTY, &lc->flags); | 301 | clear_bit(__LC_DIRTY, &lc->flags); |
279 | smp_mb__after_clear_bit(); | 302 | clear_bit_unlock(__LC_LOCKED, &lc->flags); |
280 | } | 303 | } |
281 | 304 | ||
282 | static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) | 305 | extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); |
283 | { | ||
284 | struct lc_element *e = lc_find(lc, enr); | ||
285 | return e && e->refcnt; | ||
286 | } | ||
287 | 306 | ||
288 | #define lc_entry(ptr, type, member) \ | 307 | #define lc_entry(ptr, type, member) \ |
289 | container_of(ptr, type, member) | 308 | container_of(ptr, type, member) |
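The reworked interface replaces lc_changed() with a batched transaction model: lc_get() may queue label changes on to_be_changed, and lc_committed() applies everything queued once the caller has persisted it. A hedged sketch of the flow (persist_al_transaction() is a placeholder for the caller's own I/O step, and locking around the cache remains the caller's responsibility):

if (lc_try_lock_for_transaction(cache)) {
	struct lc_element *e = lc_get(cache, enr);

	if (e) {
		persist_al_transaction(cache);	/* placeholder I/O step */
		lc_committed(cache);		/* apply the queued label changes */
	}
	lc_unlock(cache);			/* clears LC_DIRTY and LC_LOCKED */
}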
diff --git a/include/linux/wait.h b/include/linux/wait.h index 168dfe122dd3..7cb64d4b499d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -550,6 +550,170 @@ do { \ | |||
550 | __ret; \ | 550 | __ret; \ |
551 | }) | 551 | }) |
552 | 552 | ||
553 | |||
554 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
555 | do { \ | ||
556 | DEFINE_WAIT(__wait); \ | ||
557 | \ | ||
558 | for (;;) { \ | ||
559 | prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ | ||
560 | if (condition) \ | ||
561 | break; \ | ||
562 | spin_unlock_irq(&lock); \ | ||
563 | cmd; \ | ||
564 | schedule(); \ | ||
565 | spin_lock_irq(&lock); \ | ||
566 | } \ | ||
567 | finish_wait(&wq, &__wait); \ | ||
568 | } while (0) | ||
569 | |||
570 | /** | ||
571 | * wait_event_lock_irq_cmd - sleep until a condition gets true. The | ||
572 | * condition is checked under the lock. This | ||
573 | * is expected to be called with the lock | ||
574 | * taken. | ||
575 | * @wq: the waitqueue to wait on | ||
576 | * @condition: a C expression for the event to wait for | ||
577 | * @lock: a locked spinlock_t, which will be released before cmd | ||
578 | * and schedule() and reacquired afterwards. | ||
579 | * @cmd: a command which is invoked outside the critical section before | ||
580 | * sleep | ||
581 | * | ||
582 | * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||
583 | * @condition evaluates to true. The @condition is checked each time | ||
584 | * the waitqueue @wq is woken up. | ||
585 | * | ||
586 | * wake_up() has to be called after changing any variable that could | ||
587 | * change the result of the wait condition. | ||
588 | * | ||
589 | * This is supposed to be called while holding the lock. The lock is | ||
590 | * dropped before invoking the cmd and going to sleep and is reacquired | ||
591 | * afterwards. | ||
592 | */ | ||
593 | #define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \ | ||
594 | do { \ | ||
595 | if (condition) \ | ||
596 | break; \ | ||
597 | __wait_event_lock_irq(wq, condition, lock, cmd); \ | ||
598 | } while (0) | ||
599 | |||
600 | /** | ||
601 | * wait_event_lock_irq - sleep until a condition gets true. The | ||
602 | * condition is checked under the lock. This | ||
603 | * is expected to be called with the lock | ||
604 | * taken. | ||
605 | * @wq: the waitqueue to wait on | ||
606 | * @condition: a C expression for the event to wait for | ||
607 | * @lock: a locked spinlock_t, which will be released before schedule() | ||
608 | * and reacquired afterwards. | ||
609 | * | ||
610 | * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||
611 | * @condition evaluates to true. The @condition is checked each time | ||
612 | * the waitqueue @wq is woken up. | ||
613 | * | ||
614 | * wake_up() has to be called after changing any variable that could | ||
615 | * change the result of the wait condition. | ||
616 | * | ||
617 | * This is supposed to be called while holding the lock. The lock is | ||
618 | * dropped before going to sleep and is reacquired afterwards. | ||
619 | */ | ||
620 | #define wait_event_lock_irq(wq, condition, lock) \ | ||
621 | do { \ | ||
622 | if (condition) \ | ||
623 | break; \ | ||
624 | __wait_event_lock_irq(wq, condition, lock, ); \ | ||
625 | } while (0) | ||
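Since wait_event_lock_irq() is new here, a short usage sketch may help: the caller enters with the spinlock held, the condition is re-evaluated under that lock, and the lock is dropped only around schedule(). All names below (the pending list, its lock, the wait queue, struct work_item) are illustrative and not taken from this patch; the _cmd variant above would simply add a fourth argument that runs with the lock dropped just before sleeping.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct work_item {
	struct list_head list;
	/* payload ... */
};

static LIST_HEAD(pending);			/* example producer/consumer queue */
static DEFINE_SPINLOCK(pending_lock);		/* protects "pending" */
static DECLARE_WAIT_QUEUE_HEAD(pending_wq);

static struct work_item *dequeue_one(void)
{
	struct work_item *w;

	spin_lock_irq(&pending_lock);
	/* Sleeps in TASK_UNINTERRUPTIBLE until the list is non-empty; the
	 * condition is checked under pending_lock, which is released around
	 * schedule() and re-taken before this macro returns. */
	wait_event_lock_irq(pending_wq, !list_empty(&pending), pending_lock);
	w = list_first_entry(&pending, struct work_item, list);
	list_del(&w->list);
	spin_unlock_irq(&pending_lock);
	return w;
}

A producer would add to the list under pending_lock and then call wake_up(&pending_wq), as the comment above requires for anything that can change the wait condition.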
626 | |||
627 | |||
628 | #define __wait_event_interruptible_lock_irq(wq, condition, \ | ||
629 | lock, ret, cmd) \ | ||
630 | do { \ | ||
631 | DEFINE_WAIT(__wait); \ | ||
632 | \ | ||
633 | for (;;) { \ | ||
634 | prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ | ||
635 | if (condition) \ | ||
636 | break; \ | ||
637 | if (signal_pending(current)) { \ | ||
638 | ret = -ERESTARTSYS; \ | ||
639 | break; \ | ||
640 | } \ | ||
641 | spin_unlock_irq(&lock); \ | ||
642 | cmd; \ | ||
643 | schedule(); \ | ||
644 | spin_lock_irq(&lock); \ | ||
645 | } \ | ||
646 | finish_wait(&wq, &__wait); \ | ||
647 | } while (0) | ||
648 | |||
649 | /** | ||
650 | * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. | ||
651 | * The condition is checked under the lock. This is expected to | ||
652 | * be called with the lock taken. | ||
653 | * @wq: the waitqueue to wait on | ||
654 | * @condition: a C expression for the event to wait for | ||
655 | * @lock: a locked spinlock_t, which will be released before cmd and | ||
656 | * schedule() and reacquired afterwards. | ||
657 | * @cmd: a command which is invoked outside the critical section before | ||
658 | * sleep | ||
659 | * | ||
660 | * The process is put to sleep (TASK_INTERRUPTIBLE) until the | ||
661 | * @condition evaluates to true or a signal is received. The @condition is | ||
662 | * checked each time the waitqueue @wq is woken up. | ||
663 | * | ||
664 | * wake_up() has to be called after changing any variable that could | ||
665 | * change the result of the wait condition. | ||
666 | * | ||
667 | * This is supposed to be called while holding the lock. The lock is | ||
668 | * dropped before invoking the cmd and going to sleep and is reacquired | ||
669 | * afterwards. | ||
670 | * | ||
671 | * The macro will return -ERESTARTSYS if it was interrupted by a signal | ||
672 | * and 0 if @condition evaluated to true. | ||
673 | */ | ||
674 | #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ | ||
675 | ({ \ | ||
676 | int __ret = 0; \ | ||
677 | \ | ||
678 | if (!(condition)) \ | ||
679 | __wait_event_interruptible_lock_irq(wq, condition, \ | ||
680 | lock, __ret, cmd); \ | ||
681 | __ret; \ | ||
682 | }) | ||
683 | |||
684 | /** | ||
685 | * wait_event_interruptible_lock_irq - sleep until a condition gets true. | ||
686 | * The condition is checked under the lock. This is expected | ||
687 | * to be called with the lock taken. | ||
688 | * @wq: the waitqueue to wait on | ||
689 | * @condition: a C expression for the event to wait for | ||
690 | * @lock: a locked spinlock_t, which will be released before schedule() | ||
691 | * and reacquired afterwards. | ||
692 | * | ||
693 | * The process is put to sleep (TASK_INTERRUPTIBLE) until the | ||
694 | * @condition evaluates to true or a signal is received. The @condition is | ||
695 | * checked each time the waitqueue @wq is woken up. | ||
696 | * | ||
697 | * wake_up() has to be called after changing any variable that could | ||
698 | * change the result of the wait condition. | ||
699 | * | ||
700 | * This is supposed to be called while holding the lock. The lock is | ||
701 | * dropped before going to sleep and is reacquired afterwards. | ||
702 | * | ||
703 | * The macro will return -ERESTARTSYS if it was interrupted by a signal | ||
704 | * and 0 if @condition evaluated to true. | ||
705 | */ | ||
706 | #define wait_event_interruptible_lock_irq(wq, condition, lock) \ | ||
707 | ({ \ | ||
708 | int __ret = 0; \ | ||
709 | \ | ||
710 | if (!(condition)) \ | ||
711 | __wait_event_interruptible_lock_irq(wq, condition, \ | ||
712 | lock, __ret, ); \ | ||
713 | __ret; \ | ||
714 | }) | ||
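And the interruptible counterpart, continuing the illustrative names from the sketch above; the only differences are the TASK_INTERRUPTIBLE sleep and the -ERESTARTSYS return value that the caller has to propagate:

static int dequeue_one_interruptible(struct work_item **out)
{
	int ret;

	spin_lock_irq(&pending_lock);
	ret = wait_event_interruptible_lock_irq(pending_wq,
						!list_empty(&pending),
						pending_lock);
	if (ret == 0) {
		*out = list_first_entry(&pending, struct work_item, list);
		list_del(&(*out)->list);
	}
	spin_unlock_irq(&pending_lock);
	return ret;	/* 0 on success, -ERESTARTSYS if a signal arrived first */
}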
715 | |||
716 | |||
553 | /* | 717 | /* |
554 | * These are the old interfaces to sleep waiting for an event. | 718 | * These are the old interfaces to sleep waiting for an event. |
555 | * They are racy. DO NOT use them, use the wait_event* interfaces above. | 719 | * They are racy. DO NOT use them, use the wait_event* interfaces above. |
diff --git a/init/do_mounts.c b/init/do_mounts.c index f8a66424360d..1d1b6348f903 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c | |||
@@ -69,23 +69,28 @@ __setup("ro", readonly); | |||
69 | __setup("rw", readwrite); | 69 | __setup("rw", readwrite); |
70 | 70 | ||
71 | #ifdef CONFIG_BLOCK | 71 | #ifdef CONFIG_BLOCK |
72 | struct uuidcmp { | ||
73 | const char *uuid; | ||
74 | int len; | ||
75 | }; | ||
76 | |||
72 | /** | 77 | /** |
73 | * match_dev_by_uuid - callback for finding a partition using its uuid | 78 | * match_dev_by_uuid - callback for finding a partition using its uuid |
74 | * @dev: device passed in by the caller | 79 | * @dev: device passed in by the caller |
75 | * @data: opaque pointer to a 36 byte char array with a UUID | 80 | * @data: opaque pointer to the desired struct uuidcmp to match |
76 | * | 81 | * |
77 | * Returns 1 if the device matches, and 0 otherwise. | 82 | * Returns 1 if the device matches, and 0 otherwise. |
78 | */ | 83 | */ |
79 | static int match_dev_by_uuid(struct device *dev, void *data) | 84 | static int match_dev_by_uuid(struct device *dev, void *data) |
80 | { | 85 | { |
81 | u8 *uuid = data; | 86 | struct uuidcmp *cmp = data; |
82 | struct hd_struct *part = dev_to_part(dev); | 87 | struct hd_struct *part = dev_to_part(dev); |
83 | 88 | ||
84 | if (!part->info) | 89 | if (!part->info) |
85 | goto no_match; | 90 | goto no_match; |
86 | 91 | ||
87 | if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) | 92 | if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) |
88 | goto no_match; | 93 | goto no_match; |
89 | 94 | ||
90 | return 1; | 95 | return 1; |
91 | no_match: | 96 | no_match: |
@@ -95,7 +100,7 @@ no_match: | |||
95 | 100 | ||
96 | /** | 101 | /** |
97 | * devt_from_partuuid - looks up the dev_t of a partition by its UUID | 102 | * devt_from_partuuid - looks up the dev_t of a partition by its UUID |
98 | * @uuid: min 36 byte char array containing a hex ascii UUID | 103 | * @uuid: char array containing ascii UUID |
99 | * | 104 | * |
100 | * The function will return the first partition which contains a matching | 105 | * The function will return the first partition which contains a matching |
101 | * UUID value in its partition_meta_info struct. This does not search | 106 | * UUID value in its partition_meta_info struct. This does not search |
@@ -106,38 +111,41 @@ no_match: | |||
106 | * | 111 | * |
107 | * Returns the matching dev_t on success or 0 on failure. | 112 | * Returns the matching dev_t on success or 0 on failure. |
108 | */ | 113 | */ |
109 | static dev_t devt_from_partuuid(char *uuid_str) | 114 | static dev_t devt_from_partuuid(const char *uuid_str) |
110 | { | 115 | { |
111 | dev_t res = 0; | 116 | dev_t res = 0; |
117 | struct uuidcmp cmp; | ||
112 | struct device *dev = NULL; | 118 | struct device *dev = NULL; |
113 | u8 uuid[16]; | ||
114 | struct gendisk *disk; | 119 | struct gendisk *disk; |
115 | struct hd_struct *part; | 120 | struct hd_struct *part; |
116 | int offset = 0; | 121 | int offset = 0; |
122 | bool clear_root_wait = false; | ||
123 | char *slash; | ||
117 | 124 | ||
118 | if (strlen(uuid_str) < 36) | 125 | cmp.uuid = uuid_str; |
119 | goto done; | ||
120 | 126 | ||
127 | slash = strchr(uuid_str, '/'); | ||
121 | /* Check for optional partition number offset attributes. */ | 128 | /* Check for optional partition number offset attributes. */ |
122 | if (uuid_str[36]) { | 129 | if (slash) { |
123 | char c = 0; | 130 | char c = 0; |
124 | /* Explicitly fail on poor PARTUUID syntax. */ | 131 | /* Explicitly fail on poor PARTUUID syntax. */ |
125 | if (sscanf(&uuid_str[36], | 132 | if (sscanf(slash + 1, |
126 | "/PARTNROFF=%d%c", &offset, &c) != 1) { | 133 | "PARTNROFF=%d%c", &offset, &c) != 1) { |
127 | printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" | 134 | clear_root_wait = true; |
128 | "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); | ||
129 | if (root_wait) | ||
130 | printk(KERN_ERR | ||
131 | "Disabling rootwait; root= is invalid.\n"); | ||
132 | root_wait = 0; | ||
133 | goto done; | 135 | goto done; |
134 | } | 136 | } |
137 | cmp.len = slash - uuid_str; | ||
138 | } else { | ||
139 | cmp.len = strlen(uuid_str); | ||
135 | } | 140 | } |
136 | 141 | ||
137 | /* Pack the requested UUID in the expected format. */ | 142 | if (!cmp.len) { |
138 | part_pack_uuid(uuid_str, uuid); | 143 | clear_root_wait = true; |
144 | goto done; | ||
145 | } | ||
139 | 146 | ||
140 | dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); | 147 | dev = class_find_device(&block_class, NULL, &cmp, |
148 | &match_dev_by_uuid); | ||
141 | if (!dev) | 149 | if (!dev) |
142 | goto done; | 150 | goto done; |
143 | 151 | ||
@@ -158,6 +166,13 @@ static dev_t devt_from_partuuid(char *uuid_str) | |||
158 | no_offset: | 166 | no_offset: |
159 | put_device(dev); | 167 | put_device(dev); |
160 | done: | 168 | done: |
169 | if (clear_root_wait) { | ||
170 | pr_err("VFS: PARTUUID= is invalid.\n" | ||
171 | "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); | ||
172 | if (root_wait) | ||
173 | pr_err("Disabling rootwait; root= is invalid.\n"); | ||
174 | root_wait = 0; | ||
175 | } | ||
161 | return res; | 176 | return res; |
162 | } | 177 | } |
163 | #endif | 178 | #endif |
@@ -174,6 +189,10 @@ done: | |||
174 | * used when disk name of partitioned disk ends on a digit. | 189 | * used when disk name of partitioned disk ends on a digit. |
175 | * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the | 190 | * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the |
176 | * unique id of a partition if the partition table provides it. | 191 | * unique id of a partition if the partition table provides it. |
192 | * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS | ||
193 | * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- | ||
194 | * filled hex representation of the 32-bit "NT disk signature", and PP | ||
195 | * is a zero-filled hex representation of the 1-based partition number. | ||
177 | * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to | 196 | * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to |
178 | * a partition with a known unique id. | 197 | * a partition with a known unique id. |
179 | * | 198 | * |
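The two PARTUUID forms documented in item 6) and 7) above can be illustrated with a small userspace snippet; the NT disk signature and partition numbers below are invented example values. Note also that match_dev_by_uuid() above compares with strncasecmp() over the length of the supplied string, so the match is case-insensitive and effectively a prefix match.

/* Illustration only: prints the two kernel command-line forms described above. */
#include <stdio.h>

int main(void)
{
	unsigned int nt_signature = 0x2d4e41cf;	/* 32-bit "NT disk signature" (example) */
	unsigned int partno = 3;		/* 1-based partition number (example) */
	char buf[64];

	/* MSDOS form: SSSSSSSS-PP, both fields zero-filled hex */
	snprintf(buf, sizeof(buf), "root=PARTUUID=%08x-%02x", nt_signature, partno);
	puts(buf);	/* -> root=PARTUUID=2d4e41cf-03 */

	/* EFI/GPT form, optionally relative to a known partition */
	puts("root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF");
	puts("root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=1");
	return 0;
}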
diff --git a/lib/lru_cache.c b/lib/lru_cache.c index a07e7268d7ed..d71d89498943 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c | |||
@@ -44,8 +44,8 @@ MODULE_LICENSE("GPL"); | |||
44 | } while (0) | 44 | } while (0) |
45 | 45 | ||
46 | #define RETURN(x...) do { \ | 46 | #define RETURN(x...) do { \ |
47 | clear_bit(__LC_PARANOIA, &lc->flags); \ | 47 | clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ |
48 | smp_mb__after_clear_bit(); return x ; } while (0) | 48 | return x ; } while (0) |
49 | 49 | ||
50 | /* BUG() if e is not one of the elements tracked by lc */ | 50 | /* BUG() if e is not one of the elements tracked by lc */ |
51 | #define PARANOIA_LC_ELEMENT(lc, e) do { \ | 51 | #define PARANOIA_LC_ELEMENT(lc, e) do { \ |
@@ -55,9 +55,40 @@ MODULE_LICENSE("GPL"); | |||
55 | BUG_ON(i >= lc_->nr_elements); \ | 55 | BUG_ON(i >= lc_->nr_elements); \ |
56 | BUG_ON(lc_->lc_element[i] != e_); } while (0) | 56 | BUG_ON(lc_->lc_element[i] != e_); } while (0) |
57 | 57 | ||
58 | |||
59 | /* We need to atomically | ||
60 | * - try to grab the lock (set LC_LOCKED) | ||
61 | * - only if there is no pending transaction | ||
62 | * (neither LC_DIRTY nor LC_STARVING is set) | ||
63 | * Because of PARANOIA_ENTRY() above abusing lc->flags as well, | ||
64 | * it is not sufficient to just say | ||
65 | * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); | ||
66 | */ | ||
67 | int lc_try_lock(struct lru_cache *lc) | ||
68 | { | ||
69 | unsigned long val; | ||
70 | do { | ||
71 | val = cmpxchg(&lc->flags, 0, LC_LOCKED); | ||
72 | } while (unlikely (val == LC_PARANOIA)); | ||
73 | /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */ | ||
74 | return 0 == val; | ||
75 | #if 0 | ||
76 | /* Alternative approach, spin in case someone enters or leaves a | ||
77 | * PARANOIA_ENTRY()/RETURN() section. */ | ||
78 | unsigned long old, new, val; | ||
79 | do { | ||
80 | old = lc->flags & LC_PARANOIA; | ||
81 | new = old | LC_LOCKED; | ||
82 | val = cmpxchg(&lc->flags, old, new); | ||
83 | } while (unlikely (val == (old ^ LC_PARANOIA))); | ||
84 | return old == val; | ||
85 | #endif | ||
86 | } | ||
87 | |||
58 | /** | 88 | /** |
59 | * lc_create - prepares to track objects in an active set | 89 | * lc_create - prepares to track objects in an active set |
60 | * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details | 90 | * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details |
91 | * @max_pending_changes: maximum changes to accumulate until a transaction is required | ||
61 | * @e_count: number of elements allowed to be active simultaneously | 92 | * @e_count: number of elements allowed to be active simultaneously |
62 | * @e_size: size of the tracked objects | 93 | * @e_size: size of the tracked objects |
63 | * @e_off: offset to the &struct lc_element member in a tracked object | 94 | * @e_off: offset to the &struct lc_element member in a tracked object |
@@ -66,6 +97,7 @@ MODULE_LICENSE("GPL"); | |||
66 | * or NULL on (allocation) failure. | 97 | * or NULL on (allocation) failure. |
67 | */ | 98 | */ |
68 | struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | 99 | struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, |
100 | unsigned max_pending_changes, | ||
69 | unsigned e_count, size_t e_size, size_t e_off) | 101 | unsigned e_count, size_t e_size, size_t e_off) |
70 | { | 102 | { |
71 | struct hlist_head *slot = NULL; | 103 | struct hlist_head *slot = NULL; |
@@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | |||
98 | INIT_LIST_HEAD(&lc->in_use); | 130 | INIT_LIST_HEAD(&lc->in_use); |
99 | INIT_LIST_HEAD(&lc->lru); | 131 | INIT_LIST_HEAD(&lc->lru); |
100 | INIT_LIST_HEAD(&lc->free); | 132 | INIT_LIST_HEAD(&lc->free); |
133 | INIT_LIST_HEAD(&lc->to_be_changed); | ||
101 | 134 | ||
102 | lc->name = name; | 135 | lc->name = name; |
103 | lc->element_size = e_size; | 136 | lc->element_size = e_size; |
104 | lc->element_off = e_off; | 137 | lc->element_off = e_off; |
105 | lc->nr_elements = e_count; | 138 | lc->nr_elements = e_count; |
106 | lc->new_number = LC_FREE; | 139 | lc->max_pending_changes = max_pending_changes; |
107 | lc->lc_cache = cache; | 140 | lc->lc_cache = cache; |
108 | lc->lc_element = element; | 141 | lc->lc_element = element; |
109 | lc->lc_slot = slot; | 142 | lc->lc_slot = slot; |
@@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, | |||
117 | e = p + e_off; | 150 | e = p + e_off; |
118 | e->lc_index = i; | 151 | e->lc_index = i; |
119 | e->lc_number = LC_FREE; | 152 | e->lc_number = LC_FREE; |
153 | e->lc_new_number = LC_FREE; | ||
120 | list_add(&e->list, &lc->free); | 154 | list_add(&e->list, &lc->free); |
121 | element[i] = e; | 155 | element[i] = e; |
122 | } | 156 | } |
@@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc) | |||
175 | INIT_LIST_HEAD(&lc->in_use); | 209 | INIT_LIST_HEAD(&lc->in_use); |
176 | INIT_LIST_HEAD(&lc->lru); | 210 | INIT_LIST_HEAD(&lc->lru); |
177 | INIT_LIST_HEAD(&lc->free); | 211 | INIT_LIST_HEAD(&lc->free); |
212 | INIT_LIST_HEAD(&lc->to_be_changed); | ||
178 | lc->used = 0; | 213 | lc->used = 0; |
179 | lc->hits = 0; | 214 | lc->hits = 0; |
180 | lc->misses = 0; | 215 | lc->misses = 0; |
181 | lc->starving = 0; | 216 | lc->starving = 0; |
182 | lc->dirty = 0; | 217 | lc->locked = 0; |
183 | lc->changed = 0; | 218 | lc->changed = 0; |
219 | lc->pending_changes = 0; | ||
184 | lc->flags = 0; | 220 | lc->flags = 0; |
185 | lc->changing_element = NULL; | ||
186 | lc->new_number = LC_FREE; | ||
187 | memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); | 221 | memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); |
188 | 222 | ||
189 | for (i = 0; i < lc->nr_elements; i++) { | 223 | for (i = 0; i < lc->nr_elements; i++) { |
@@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc) | |||
194 | /* re-init it */ | 228 | /* re-init it */ |
195 | e->lc_index = i; | 229 | e->lc_index = i; |
196 | e->lc_number = LC_FREE; | 230 | e->lc_number = LC_FREE; |
231 | e->lc_new_number = LC_FREE; | ||
197 | list_add(&e->list, &lc->free); | 232 | list_add(&e->list, &lc->free); |
198 | } | 233 | } |
199 | } | 234 | } |
@@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) | |||
208 | /* NOTE: | 243 | /* NOTE: |
209 | * total calls to lc_get are | 244 | * total calls to lc_get are |
210 | * (starving + hits + misses) | 245 | * (starving + hits + misses) |
211 | * misses include "dirty" count (update from another thread in | 246 | * misses include "locked" count (update from another thread in |
212 | * progress) and "changed", when this in fact led to a successful | 247 | * progress) and "changed", when this in fact led to a successful |
213 | * update of the cache. | 248 | * update of the cache. |
214 | */ | 249 | */ |
215 | return seq_printf(seq, "\t%s: used:%u/%u " | 250 | return seq_printf(seq, "\t%s: used:%u/%u " |
216 | "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", | 251 | "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", |
217 | lc->name, lc->used, lc->nr_elements, | 252 | lc->name, lc->used, lc->nr_elements, |
218 | lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); | 253 | lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); |
219 | } | 254 | } |
220 | 255 | ||
221 | static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) | 256 | static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) |
@@ -224,16 +259,8 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) | |||
224 | } | 259 | } |
225 | 260 | ||
226 | 261 | ||
227 | /** | 262 | static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, |
228 | * lc_find - find element by label, if present in the hash table | 263 | bool include_changing) |
229 | * @lc: The lru_cache object | ||
230 | * @enr: element number | ||
231 | * | ||
232 | * Returns the pointer to an element, if the element with the requested | ||
233 | * "label" or element number is present in the hash table, | ||
234 | * or NULL if not found. Does not change the refcnt. | ||
235 | */ | ||
236 | struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | ||
237 | { | 264 | { |
238 | struct hlist_node *n; | 265 | struct hlist_node *n; |
239 | struct lc_element *e; | 266 | struct lc_element *e; |
@@ -241,29 +268,48 @@ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | |||
241 | BUG_ON(!lc); | 268 | BUG_ON(!lc); |
242 | BUG_ON(!lc->nr_elements); | 269 | BUG_ON(!lc->nr_elements); |
243 | hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { | 270 | hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { |
244 | if (e->lc_number == enr) | 271 | /* "about to be changed" elements, pending transaction commit, |
272 | * are hashed by their "new number". "Normal" elements have | ||
273 | * lc_number == lc_new_number. */ | ||
274 | if (e->lc_new_number != enr) | ||
275 | continue; | ||
276 | if (e->lc_new_number == e->lc_number || include_changing) | ||
245 | return e; | 277 | return e; |
278 | break; | ||
246 | } | 279 | } |
247 | return NULL; | 280 | return NULL; |
248 | } | 281 | } |
249 | 282 | ||
250 | /* returned element will be "recycled" immediately */ | 283 | /** |
251 | static struct lc_element *lc_evict(struct lru_cache *lc) | 284 | * lc_find - find element by label, if present in the hash table |
285 | * @lc: The lru_cache object | ||
286 | * @enr: element number | ||
287 | * | ||
288 | * Returns the pointer to an element, if the element with the requested | ||
289 | * "label" or element number is present in the hash table, | ||
290 | * or NULL if not found. Does not change the refcnt. | ||
291 | * Ignores elements that are "about to be used", i.e. not yet in the active | ||
292 | * set, but still pending transaction commit. | ||
293 | */ | ||
294 | struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) | ||
252 | { | 295 | { |
253 | struct list_head *n; | 296 | return __lc_find(lc, enr, 0); |
254 | struct lc_element *e; | 297 | } |
255 | |||
256 | if (list_empty(&lc->lru)) | ||
257 | return NULL; | ||
258 | |||
259 | n = lc->lru.prev; | ||
260 | e = list_entry(n, struct lc_element, list); | ||
261 | |||
262 | PARANOIA_LC_ELEMENT(lc, e); | ||
263 | 298 | ||
264 | list_del(&e->list); | 299 | /** |
265 | hlist_del(&e->colision); | 300 | * lc_is_used - find element by label |
266 | return e; | 301 | * @lc: The lru_cache object |
302 | * @enr: element number | ||
303 | * | ||
304 | * Returns true, if the element with the requested "label" or element number is | ||
305 | * present in the hash table, and is used (refcnt > 0). | ||
306 | * Also finds elements that are not _currently_ used but only "about to be | ||
307 | * used", i.e. on the "to_be_changed" list, pending transaction commit. | ||
308 | */ | ||
309 | bool lc_is_used(struct lru_cache *lc, unsigned int enr) | ||
310 | { | ||
311 | struct lc_element *e = __lc_find(lc, enr, 1); | ||
312 | return e && e->refcnt; | ||
267 | } | 313 | } |
268 | 314 | ||
269 | /** | 315 | /** |
@@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) | |||
280 | PARANOIA_LC_ELEMENT(lc, e); | 326 | PARANOIA_LC_ELEMENT(lc, e); |
281 | BUG_ON(e->refcnt); | 327 | BUG_ON(e->refcnt); |
282 | 328 | ||
283 | e->lc_number = LC_FREE; | 329 | e->lc_number = e->lc_new_number = LC_FREE; |
284 | hlist_del_init(&e->colision); | 330 | hlist_del_init(&e->colision); |
285 | list_move(&e->list, &lc->free); | 331 | list_move(&e->list, &lc->free); |
286 | RETURN(); | 332 | RETURN(); |
287 | } | 333 | } |
288 | 334 | ||
289 | static struct lc_element *lc_get_unused_element(struct lru_cache *lc) | 335 | static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) |
290 | { | 336 | { |
291 | struct list_head *n; | 337 | struct list_head *n; |
338 | struct lc_element *e; | ||
339 | |||
340 | if (!list_empty(&lc->free)) | ||
341 | n = lc->free.next; | ||
342 | else if (!list_empty(&lc->lru)) | ||
343 | n = lc->lru.prev; | ||
344 | else | ||
345 | return NULL; | ||
346 | |||
347 | e = list_entry(n, struct lc_element, list); | ||
348 | PARANOIA_LC_ELEMENT(lc, e); | ||
292 | 349 | ||
293 | if (list_empty(&lc->free)) | 350 | e->lc_new_number = new_number; |
294 | return lc_evict(lc); | 351 | if (!hlist_unhashed(&e->colision)) |
352 | __hlist_del(&e->colision); | ||
353 | hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); | ||
354 | list_move(&e->list, &lc->to_be_changed); | ||
295 | 355 | ||
296 | n = lc->free.next; | 356 | return e; |
297 | list_del(n); | ||
298 | return list_entry(n, struct lc_element, list); | ||
299 | } | 357 | } |
300 | 358 | ||
301 | static int lc_unused_element_available(struct lru_cache *lc) | 359 | static int lc_unused_element_available(struct lru_cache *lc) |
@@ -308,45 +366,7 @@ static int lc_unused_element_available(struct lru_cache *lc) | |||
308 | return 0; | 366 | return 0; |
309 | } | 367 | } |
310 | 368 | ||
311 | 369 | static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) | |
312 | /** | ||
313 | * lc_get - get element by label, maybe change the active set | ||
314 | * @lc: the lru cache to operate on | ||
315 | * @enr: the label to look up | ||
316 | * | ||
317 | * Finds an element in the cache, increases its usage count, | ||
318 | * "touches" and returns it. | ||
319 | * | ||
320 | * In case the requested number is not present, it needs to be added to the | ||
321 | * cache. Therefore it is possible that an other element becomes evicted from | ||
322 | * the cache. In either case, the user is notified so he is able to e.g. keep | ||
323 | * a persistent log of the cache changes, and therefore the objects in use. | ||
324 | * | ||
325 | * Return values: | ||
326 | * NULL | ||
327 | * The cache was marked %LC_STARVING, | ||
328 | * or the requested label was not in the active set | ||
329 | * and a changing transaction is still pending (@lc was marked %LC_DIRTY). | ||
330 | * Or no unused or free element could be recycled (@lc will be marked as | ||
331 | * %LC_STARVING, blocking further lc_get() operations). | ||
332 | * | ||
333 | * pointer to the element with the REQUESTED element number. | ||
334 | * In this case, it can be used right away | ||
335 | * | ||
336 | * pointer to an UNUSED element with some different element number, | ||
337 | * where that different number may also be %LC_FREE. | ||
338 | * | ||
339 | * In this case, the cache is marked %LC_DIRTY (blocking further changes), | ||
340 | * and the returned element pointer is removed from the lru list and | ||
341 | * hash collision chains. The user now should do whatever housekeeping | ||
342 | * is necessary. | ||
343 | * Then he must call lc_changed(lc,element_pointer), to finish | ||
344 | * the change. | ||
345 | * | ||
346 | * NOTE: The user needs to check the lc_number on EACH use, so he recognizes | ||
347 | * any cache set change. | ||
348 | */ | ||
349 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | ||
350 | { | 370 | { |
351 | struct lc_element *e; | 371 | struct lc_element *e; |
352 | 372 | ||
@@ -356,8 +376,12 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
356 | RETURN(NULL); | 376 | RETURN(NULL); |
357 | } | 377 | } |
358 | 378 | ||
359 | e = lc_find(lc, enr); | 379 | e = __lc_find(lc, enr, 1); |
360 | if (e) { | 380 | /* if lc_new_number != lc_number, |
381 | * this enr is currently being pulled in already, | ||
382 | * and will be available once the pending transaction | ||
383 | * has been committed. */ | ||
384 | if (e && e->lc_new_number == e->lc_number) { | ||
361 | ++lc->hits; | 385 | ++lc->hits; |
362 | if (e->refcnt++ == 0) | 386 | if (e->refcnt++ == 0) |
363 | lc->used++; | 387 | lc->used++; |
@@ -366,6 +390,26 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
366 | } | 390 | } |
367 | 391 | ||
368 | ++lc->misses; | 392 | ++lc->misses; |
393 | if (!may_change) | ||
394 | RETURN(NULL); | ||
395 | |||
396 | /* It has been found above, but on the "to_be_changed" list, not yet | ||
397 | * committed. Don't pull it in twice, wait for the transaction, then | ||
398 | * try again */ | ||
399 | if (e) | ||
400 | RETURN(NULL); | ||
401 | |||
402 | /* To avoid races with lc_try_lock(), first, mark us dirty | ||
403 | * (using test_and_set_bit, as it implies memory barriers), ... */ | ||
404 | test_and_set_bit(__LC_DIRTY, &lc->flags); | ||
405 | |||
406 | /* ... only then check if it is locked anyways. If lc_unlock clears | ||
407 | * the dirty bit again, that's not a problem, we will come here again. | ||
408 | */ | ||
409 | if (test_bit(__LC_LOCKED, &lc->flags)) { | ||
410 | ++lc->locked; | ||
411 | RETURN(NULL); | ||
412 | } | ||
369 | 413 | ||
370 | /* In case there is nothing available and we can not kick out | 414 | /* In case there is nothing available and we can not kick out |
371 | * the LRU element, we have to wait ... | 415 | * the LRU element, we have to wait ... |
@@ -375,71 +419,109 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | |||
375 | RETURN(NULL); | 419 | RETURN(NULL); |
376 | } | 420 | } |
377 | 421 | ||
378 | /* it was not present in the active set. | 422 | /* It was not present in the active set. We are going to recycle an |
379 | * we are going to recycle an unused (or even "free") element. | 423 | * unused (or even "free") element, but we won't accumulate more than |
380 | * user may need to commit a transaction to record that change. | 424 | * max_pending_changes changes. */ |
381 | * we serialize on flags & TF_DIRTY */ | 425 | if (lc->pending_changes >= lc->max_pending_changes) |
382 | if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { | ||
383 | ++lc->dirty; | ||
384 | RETURN(NULL); | 426 | RETURN(NULL); |
385 | } | ||
386 | 427 | ||
387 | e = lc_get_unused_element(lc); | 428 | e = lc_prepare_for_change(lc, enr); |
388 | BUG_ON(!e); | 429 | BUG_ON(!e); |
389 | 430 | ||
390 | clear_bit(__LC_STARVING, &lc->flags); | 431 | clear_bit(__LC_STARVING, &lc->flags); |
391 | BUG_ON(++e->refcnt != 1); | 432 | BUG_ON(++e->refcnt != 1); |
392 | lc->used++; | 433 | lc->used++; |
393 | 434 | lc->pending_changes++; | |
394 | lc->changing_element = e; | ||
395 | lc->new_number = enr; | ||
396 | 435 | ||
397 | RETURN(e); | 436 | RETURN(e); |
398 | } | 437 | } |
399 | 438 | ||
400 | /* similar to lc_get, | 439 | /** |
401 | * but only gets a new reference on an existing element. | 440 | * lc_get - get element by label, maybe change the active set |
402 | * you either get the requested element, or NULL. | 441 | * @lc: the lru cache to operate on |
403 | * will be consolidated into one function. | 442 | * @enr: the label to look up |
443 | * | ||
444 | * Finds an element in the cache, increases its usage count, | ||
445 | * "touches" and returns it. | ||
446 | * | ||
447 | * In case the requested number is not present, it needs to be added to the | ||
448 | * cache. Therefore it is possible that another element becomes evicted from | ||
449 | * the cache. In either case, the user is notified so he is able to e.g. keep | ||
450 | * a persistent log of the cache changes, and therefore the objects in use. | ||
451 | * | ||
452 | * Return values: | ||
453 | * NULL | ||
454 | * The cache was marked %LC_STARVING, | ||
455 | * or the requested label was not in the active set | ||
456 | * and a changing transaction is still pending (@lc was marked %LC_DIRTY). | ||
457 | * Or no unused or free element could be recycled (@lc will be marked as | ||
458 | * %LC_STARVING, blocking further lc_get() operations). | ||
459 | * | ||
460 | * pointer to the element with the REQUESTED element number. | ||
461 | * In this case, it can be used right away | ||
462 | * | ||
463 | * pointer to an UNUSED element with some different element number, | ||
464 | * where that different number may also be %LC_FREE. | ||
465 | * | ||
466 | * In this case, the cache is marked %LC_DIRTY, | ||
467 | * so lc_try_lock() will no longer succeed. | ||
468 | * The returned element pointer is moved to the "to_be_changed" list, | ||
469 | * and registered with the new element number on the hash collision chains, | ||
470 | * so it is possible to pick it up from lc_is_used(). | ||
471 | * Up to "max_pending_changes" (see lc_create()) can be accumulated. | ||
472 | * The user now should do whatever housekeeping is necessary, | ||
473 | * typically serialize on lc_try_lock_for_transaction(), then call | ||
474 | * lc_committed(lc) and lc_unlock(), to finish the change. | ||
475 | * | ||
476 | * NOTE: The user needs to check the lc_number on EACH use, so he recognizes | ||
477 | * any cache set change. | ||
404 | */ | 478 | */ |
405 | struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) | 479 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) |
406 | { | 480 | { |
407 | struct lc_element *e; | 481 | return __lc_get(lc, enr, 1); |
408 | 482 | } | |
409 | PARANOIA_ENTRY(); | ||
410 | if (lc->flags & LC_STARVING) { | ||
411 | ++lc->starving; | ||
412 | RETURN(NULL); | ||
413 | } | ||
414 | 483 | ||
415 | e = lc_find(lc, enr); | 484 | /** |
416 | if (e) { | 485 | * lc_try_get - get element by label, if present; do not change the active set |
417 | ++lc->hits; | 486 | * @lc: the lru cache to operate on |
418 | if (e->refcnt++ == 0) | 487 | * @enr: the label to look up |
419 | lc->used++; | 488 | * |
420 | list_move(&e->list, &lc->in_use); /* Not evictable... */ | 489 | * Finds an element in the cache, increases its usage count, |
421 | } | 490 | * "touches" and returns it. |
422 | RETURN(e); | 491 | * |
492 | * Return values: | ||
493 | * NULL | ||
494 | * The cache was marked %LC_STARVING, | ||
495 | * or the requested label was not in the active set | ||
496 | * | ||
497 | * pointer to the element with the REQUESTED element number. | ||
498 | * In this case, it can be used right away | ||
499 | */ | ||
500 | struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) | ||
501 | { | ||
502 | return __lc_get(lc, enr, 0); | ||
423 | } | 503 | } |
424 | 504 | ||
425 | /** | 505 | /** |
426 | * lc_changed - tell @lc that the change has been recorded | 506 | * lc_committed - tell @lc that pending changes have been recorded |
427 | * @lc: the lru cache to operate on | 507 | * @lc: the lru cache to operate on |
428 | * @e: the element pending label change | 508 | * |
509 | * User is expected to serialize on explicit lc_try_lock_for_transaction() | ||
510 | * before the transaction is started, and later needs to lc_unlock() explicitly | ||
511 | * as well. | ||
429 | */ | 512 | */ |
430 | void lc_changed(struct lru_cache *lc, struct lc_element *e) | 513 | void lc_committed(struct lru_cache *lc) |
431 | { | 514 | { |
515 | struct lc_element *e, *tmp; | ||
516 | |||
432 | PARANOIA_ENTRY(); | 517 | PARANOIA_ENTRY(); |
433 | BUG_ON(e != lc->changing_element); | 518 | list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { |
434 | PARANOIA_LC_ELEMENT(lc, e); | 519 | /* count number of changes, not number of transactions */ |
435 | ++lc->changed; | 520 | ++lc->changed; |
436 | e->lc_number = lc->new_number; | 521 | e->lc_number = e->lc_new_number; |
437 | list_add(&e->list, &lc->in_use); | 522 | list_move(&e->list, &lc->in_use); |
438 | hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); | 523 | } |
439 | lc->changing_element = NULL; | 524 | lc->pending_changes = 0; |
440 | lc->new_number = LC_FREE; | ||
441 | clear_bit(__LC_DIRTY, &lc->flags); | ||
442 | smp_mb__after_clear_bit(); | ||
443 | RETURN(); | 525 | RETURN(); |
444 | } | 526 | } |
445 | 527 | ||
@@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) | |||
458 | PARANOIA_ENTRY(); | 540 | PARANOIA_ENTRY(); |
459 | PARANOIA_LC_ELEMENT(lc, e); | 541 | PARANOIA_LC_ELEMENT(lc, e); |
460 | BUG_ON(e->refcnt == 0); | 542 | BUG_ON(e->refcnt == 0); |
461 | BUG_ON(e == lc->changing_element); | 543 | BUG_ON(e->lc_number != e->lc_new_number); |
462 | if (--e->refcnt == 0) { | 544 | if (--e->refcnt == 0) { |
463 | /* move it to the front of LRU. */ | 545 | /* move it to the front of LRU. */ |
464 | list_move(&e->list, &lc->lru); | 546 | list_move(&e->list, &lc->lru); |
465 | lc->used--; | 547 | lc->used--; |
466 | clear_bit(__LC_STARVING, &lc->flags); | 548 | clear_bit_unlock(__LC_STARVING, &lc->flags); |
467 | smp_mb__after_clear_bit(); | ||
468 | } | 549 | } |
469 | RETURN(e->refcnt); | 550 | RETURN(e->refcnt); |
470 | } | 551 | } |
@@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) | |||
504 | void lc_set(struct lru_cache *lc, unsigned int enr, int index) | 585 | void lc_set(struct lru_cache *lc, unsigned int enr, int index) |
505 | { | 586 | { |
506 | struct lc_element *e; | 587 | struct lc_element *e; |
588 | struct list_head *lh; | ||
507 | 589 | ||
508 | if (index < 0 || index >= lc->nr_elements) | 590 | if (index < 0 || index >= lc->nr_elements) |
509 | return; | 591 | return; |
510 | 592 | ||
511 | e = lc_element_by_index(lc, index); | 593 | e = lc_element_by_index(lc, index); |
512 | e->lc_number = enr; | 594 | BUG_ON(e->lc_number != e->lc_new_number); |
595 | BUG_ON(e->refcnt != 0); | ||
513 | 596 | ||
597 | e->lc_number = e->lc_new_number = enr; | ||
514 | hlist_del_init(&e->colision); | 598 | hlist_del_init(&e->colision); |
515 | hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); | 599 | if (enr == LC_FREE) |
516 | list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); | 600 | lh = &lc->free; |
601 | else { | ||
602 | hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); | ||
603 | lh = &lc->lru; | ||
604 | } | ||
605 | list_move(&e->list, lh); | ||
517 | } | 606 | } |
518 | 607 | ||
519 | /** | 608 | /** |
@@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get); | |||
553 | EXPORT_SYMBOL(lc_find); | 642 | EXPORT_SYMBOL(lc_find); |
554 | EXPORT_SYMBOL(lc_get); | 643 | EXPORT_SYMBOL(lc_get); |
555 | EXPORT_SYMBOL(lc_put); | 644 | EXPORT_SYMBOL(lc_put); |
556 | EXPORT_SYMBOL(lc_changed); | 645 | EXPORT_SYMBOL(lc_committed); |
557 | EXPORT_SYMBOL(lc_element_by_index); | 646 | EXPORT_SYMBOL(lc_element_by_index); |
558 | EXPORT_SYMBOL(lc_index_of); | 647 | EXPORT_SYMBOL(lc_index_of); |
559 | EXPORT_SYMBOL(lc_seq_printf_stats); | 648 | EXPORT_SYMBOL(lc_seq_printf_stats); |
560 | EXPORT_SYMBOL(lc_seq_dump_details); | 649 | EXPORT_SYMBOL(lc_seq_dump_details); |
650 | EXPORT_SYMBOL(lc_try_lock); | ||
651 | EXPORT_SYMBOL(lc_is_used); | ||