author | NeilBrown <neilb@suse.de> | 2015-04-21 18:00:20 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2015-04-21 18:00:20 -0400 |
commit | d51e4fe6d68098d4361a6b6d41d8da727b1f1af4 (patch) | |
tree | 6c070a877db9697a2b68e84b2b2d0aa299bb6216 | |
parent | 47d68979cc968535cb87f3e5f2e6a3533ea48fbd (diff) | |
parent | 97f6cd39da227459cb46ed4088d37d5d8db51c50 (diff) |
Merge branch 'cluster' into for-next
-rw-r--r-- | Documentation/md-cluster.txt | 176
-rw-r--r-- | drivers/md/Kconfig | 16
-rw-r--r-- | drivers/md/Makefile | 1
-rw-r--r-- | drivers/md/bitmap.c | 189
-rw-r--r-- | drivers/md/bitmap.h | 10
-rw-r--r-- | drivers/md/md-cluster.c | 965
-rw-r--r-- | drivers/md/md-cluster.h | 29
-rw-r--r-- | drivers/md/md.c | 353
-rw-r--r-- | drivers/md/md.h | 24
-rw-r--r-- | drivers/md/raid1.c | 20
-rw-r--r-- | include/uapi/linux/raid/md_p.h | 7
-rw-r--r-- | include/uapi/linux/raid/md_u.h | 1
12 files changed, 1709 insertions, 82 deletions
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt new file mode 100644 index 000000000000..de1af7db3355 --- /dev/null +++ b/Documentation/md-cluster.txt | |||
@@ -0,0 +1,176 @@ | |||
1 | The cluster MD is a shared-device RAID for a cluster. | ||
2 | |||
3 | |||
4 | 1. On-disk format | ||
5 | |||
6 | Separate write-intent bitmaps are used, one for each cluster node. | ||
7 | The bitmaps record all writes that may have been started on that node, | ||
8 | and may not yet have finished. The on-disk layout is: | ||
9 | |||
10 | 0 4k 8k 12k | ||
11 | ------------------------------------------------------------------- | ||
12 | | idle | md super | bm super [0] + bits | | ||
13 | | bm bits[0, contd] | bm super[1] + bits | bm bits[1, contd] | | ||
14 | | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits | | ||
15 | | bm bits [3, contd] | | | | ||
16 | |||
17 | During "normal" functioning we assume the filesystem ensures that only one | ||
18 | node writes to any given block at a time, so a write | ||
19 | request will (sketched in C at the end of this section): | ||
20 | - set the appropriate bit (if not already set) | ||
21 | - commit the write to all mirrors | ||
22 | - schedule the bit to be cleared after a timeout. | ||
23 | |||
24 | Reads are just handled normally. It is up to the filesystem to | ||
25 | ensure one node doesn't read from a location where another node (or the same | ||
26 | node) is writing. | ||
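A minimal C sketch of this write sequence, using hypothetical helper names (bitmap_startwrite_sync, write_all_mirrors, schedule_bit_clear) rather than the driver's real entry points:

```c
/* Hypothetical sketch of the three-step write protocol above; the helper
 * names are illustrative, not the driver's actual API. */
static int clustered_raid_write(struct mddev *mddev, sector_t sector,
				struct bio *bio)
{
	int err;

	/* 1. set the appropriate bit in this node's bitmap (if not set) */
	err = bitmap_startwrite_sync(mddev->bitmap, sector);
	if (err)
		return err;

	/* 2. commit the write to all mirrors */
	err = write_all_mirrors(mddev, sector, bio);
	if (err)
		return err;

	/* 3. schedule the bit to be cleared after a timeout */
	schedule_bit_clear(mddev->bitmap, sector);
	return 0;
}
```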
27 | |||
28 | |||
29 | 2. DLM Locks for management | ||
30 | |||
31 | The following DLM locks are used for managing the device: | ||
32 | |||
33 | 2.1 Bitmap lock resource (bm_lockres) | ||
34 | |||
35 | The bm_lockres protects individual node bitmaps. They are named in the | ||
36 | form bitmap001 for node 1, bitmap002 for node 2, and so on. When a node | ||
37 | joins the cluster, it acquires the lock in PW mode and holds it in that | ||
38 | mode for as long as the node is part of the cluster. The lock resource | ||
39 | number is based on the slot number returned by the DLM subsystem. Since | ||
40 | DLM starts node count from one and bitmap slots start from zero, one is | ||
41 | subtracted from the DLM slot number to arrive at the bitmap slot number. | ||
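The mapping is easy to see in the driver code later in this commit, which generates the names with a "bitmap%04d" format (zero-padded to four digits, so the doc's "bitmap001" spelling is approximate). A quick standalone illustration:

```c
#include <stdio.h>

int main(void)
{
	char name[64];
	int dlm_slot = 3;	/* 1-based slot number handed out by the DLM */

	/* bitmap slots are 0-based, so subtract one, as described above */
	snprintf(name, sizeof(name), "bitmap%04d", dlm_slot - 1);
	puts(name);		/* prints "bitmap0002" */
	return 0;
}
```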
42 | |||
43 | 3. Communication | ||
44 | |||
45 | Each node has to communicate with the other nodes when starting or ending | ||
46 | a resync, and when updating the metadata superblock. | ||
47 | |||
48 | 3.1 Message Types | ||
49 | |||
50 | The following types of messages are passed: | ||
51 | |||
52 | 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been | ||
53 | updated, and the node must re-read the md superblock. This is performed | ||
54 | synchronously. | ||
55 | |||
56 | 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended | ||
57 | so that each node may suspend or resume the region. | ||
58 | |||
59 | 3.2 Communication mechanism | ||
60 | |||
61 | The DLM LVB is used to communicate between the nodes of the cluster. There | ||
62 | are three resources used for the purpose: | ||
63 | |||
64 | 3.2.1 Token: The resource which protects the entire communication | ||
65 | system. The node holding the token resource is allowed to | ||
66 | communicate. | ||
67 | |||
68 | 3.2.2 Message: The lock resource which carries the data to | ||
69 | communicate. | ||
70 | |||
71 | 3.2.3 Ack: The resource; acquiring it in EX means the message has been | ||
72 | acknowledged by all nodes in the cluster. The BAST of the resource | ||
73 | is used to inform the receiving nodes that a node wants to communicate. | ||
74 | |||
75 | The algorithm is: | ||
76 | |||
77 | 1. receive status | ||
78 | |||
79 | sender receiver receiver | ||
80 | ACK:CR ACK:CR ACK:CR | ||
81 | |||
82 | 2. sender get EX of TOKEN | ||
83 | sender get EX of MESSAGE | ||
84 | sender receiver receiver | ||
85 | TOKEN:EX ACK:CR ACK:CR | ||
86 | MESSAGE:EX | ||
87 | ACK:CR | ||
88 | |||
89 | Sender checks that it still needs to send a message. Messages received | ||
90 | or other events that happened while waiting for the TOKEN may have made | ||
91 | this message inappropriate or redundant. | ||
92 | |||
93 | 3. sender write LVB. | ||
94 | sender down-convert MESSAGE from EX to CR | ||
95 | sender try to get EX of ACK | ||
96 | [ wait until all receivers have *processed* the MESSAGE ] | ||
97 | |||
98 | [ triggered by bast of ACK ] | ||
99 | receiver get CR of MESSAGE | ||
100 | receiver read LVB | ||
101 | receiver processes the message | ||
102 | [ wait finish ] | ||
103 | receiver release ACK | ||
104 | |||
105 | sender receiver receiver | ||
106 | TOKEN:EX MESSAGE:CR MESSAGE:CR | ||
107 | MESSAGE:CR | ||
108 | ACK:EX | ||
109 | |||
110 | 4. triggered by grant of EX on ACK (indicating all receivers have processed | ||
111 | message) | ||
112 | sender down-convert ACK from EX to CR | ||
113 | sender release MESSAGE | ||
114 | sender release TOKEN | ||
115 | receiver upconvert to EX of MESSAGE | ||
116 | receiver get CR of ACK | ||
117 | receiver release MESSAGE | ||
118 | |||
119 | sender receiver receiver | ||
120 | ACK:CR ACK:CR ACK:CR | ||
121 | |||
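For reference, the lock modes in these diagrams are standard DLM modes; their compatibility rules explain why the choreography works:

```c
#include <linux/dlmconstants.h>

/*
 * Lock modes used in the diagrams above, with their standard DLM
 * compatibility rules:
 *
 *   DLM_LOCK_NL - null: grants no access; only keeps the resource alive.
 *   DLM_LOCK_CR - concurrent read: compatible with every mode except EX.
 *   DLM_LOCK_PW - protected write: tolerates CR holders but excludes
 *                 other PW and EX holders.
 *   DLM_LOCK_EX - exclusive: compatible only with NL.
 *
 * So the sender's EX request on ACK conflicts with every receiver's CR,
 * firing each receiver's BAST (how receivers learn a message is pending),
 * and is granted only once all receivers have dropped their CR locks.
 */
```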
122 | |||
123 | 4. Handling Failures | ||
124 | |||
125 | 4.1 Node Failure | ||
126 | When a node fails, the DLM informs the cluster of the failed slot number. The node | ||
127 | starts a cluster recovery thread. The cluster recovery thread: | ||
128 | - acquires the bitmap<number> lock of the failed node | ||
129 | - opens the bitmap | ||
130 | - reads the bitmap of the failed node | ||
131 | - copies the set bits to the local node's bitmap | ||
132 | - cleans the bitmap of the failed node | ||
133 | - releases bitmap<number> lock of the failed node | ||
134 | - initiates resync of the bitmap on the current node | ||
135 | |||
136 | The resync process is the regular md resync. However, in a clustered | ||
137 | environment, the node performing the resync needs to tell the other | ||
138 | nodes which areas are suspended. Before a resync starts, the node | ||
139 | sends out RESYNC_START with the (lo,hi) range of the area which needs | ||
140 | to be suspended. Each node maintains a suspend_list, which contains | ||
141 | the list of ranges which are currently suspended. On receiving | ||
142 | RESYNC_START, the node adds the range to the suspend_list. Similarly, | ||
143 | when the node performing the resync finishes, it sends RESYNC_FINISHED | ||
144 | to the other nodes, and they remove the corresponding entry from | ||
145 | the suspend_list. | ||
146 | |||
147 | A helper function, should_suspend(), can be used to check whether a | ||
148 | particular I/O range should be suspended (a sketch follows below). | ||
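A sketch of what such a helper could look like, assuming the suspend_info list and locking from md-cluster.c later in this commit (the function name comes from the text above; the shipped driver may structure this check differently):

```c
/* Hedged sketch: returns true if [lo,hi) overlaps any suspended range. */
static bool should_suspend(struct md_cluster_info *cinfo,
			   sector_t lo, sector_t hi)
{
	struct suspend_info *s;
	bool ret = false;

	spin_lock_irq(&cinfo->suspend_lock);
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {	/* ranges overlap */
			ret = true;
			break;
		}
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}
```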
149 | |||
150 | 4.2 Device Failure | ||
151 | Device failures are handled and communicated with the metadata update | ||
152 | routine. | ||
153 | |||
154 | 5. Adding a new Device | ||
155 | For adding a new device, it is necessary that all nodes "see" the new device | ||
156 | to be added. For this, the following algorithm is used: | ||
157 | |||
158 | 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues | ||
159 | ioctl(ADD_NEW_DISC with disc.state set to MD_DISK_CLUSTER_ADD) | ||
160 | 2. Node 1 sends NEWDISK with uuid and slot number | ||
161 | 3. Other nodes issue kobject_uevent_env with uuid and slot number | ||
162 | (Steps 4,5 could be a udev rule) | ||
163 | 4. In userspace, the node searches for the disk, perhaps | ||
164 | using blkid -t SUB_UUID="" | ||
165 | 5. Other nodes issue either of the following depending on whether the disk | ||
166 | was found: | ||
167 | ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and | ||
168 | disc.number set to slot number) | ||
169 | ioctl(CLUSTERED_DISK_NACK) | ||
170 | 6. Other nodes drop lock on no-new-devs (CR) if device is found | ||
171 | 7. Node 1 attempts EX lock on no-new-devs | ||
172 | 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk | ||
173 | as SpareLocal | ||
174 | 9. If node 1 cannot get the no-new-dev lock, it fails the operation and sends METADATA_UPDATED | ||
175 | 10. Other nodes learn whether the disk was added or not | ||
176 | from the following METADATA_UPDATED message. | ||
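As a hedged userspace illustration of step 5's "disk found" branch (error handling trimmed; MD_DISK_CANDIDATE comes from the md_p.h change in this series, and this is not mdadm's actual code):

```c
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>	/* ADD_NEW_DISK, mdu_disk_info_t */
#include <linux/raid/md_p.h>	/* MD_DISK_CANDIDATE, added by this series */

/* Illustrative sketch of step 5 above for the "disk found" case. */
int add_candidate_disk(const char *md_path, int slot, int major, int minor)
{
	mdu_disk_info_t info = {
		.number = slot,   /* slot number from the NEWDISK message */
		.major  = major,  /* device numbers of the disk found by blkid */
		.minor  = minor,
		.state  = (1 << MD_DISK_CANDIDATE),
	};
	int fd = open(md_path, O_RDWR);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, ADD_NEW_DISK, &info);
	close(fd);
	return ret;
}
```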
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 63e05e32b462..eed1fec2d97b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -175,6 +175,22 @@ config MD_FAULTY | |||
175 | 175 | ||
176 | In unsure, say N. | 176 | In unsure, say N. |
177 | 177 | ||
178 | |||
179 | config MD_CLUSTER | ||
180 | tristate "Cluster Support for MD (EXPERIMENTAL)" | ||
181 | depends on BLK_DEV_MD | ||
182 | depends on DLM | ||
183 | default n | ||
184 | ---help--- | ||
185 | Clustering support for MD devices. This enables locking and | ||
186 | synchronization across multiple systems on the cluster, so all | ||
187 | nodes in the cluster can access the MD devices simultaneously. | ||
188 | |||
189 | This brings the redundancy (and uptime) of RAID levels across the | ||
190 | nodes of the cluster. | ||
191 | |||
192 | If unsure, say N. | ||
193 | |||
178 | source "drivers/md/bcache/Kconfig" | 194 | source "drivers/md/bcache/Kconfig" |
179 | 195 | ||
180 | config BLK_DEV_DM_BUILTIN | 196 | config BLK_DEV_DM_BUILTIN |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a2da532b1c2b..7ed86876f3b7 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o | |||
30 | obj-$(CONFIG_MD_RAID456) += raid456.o | 30 | obj-$(CONFIG_MD_RAID456) += raid456.o |
31 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | 31 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o |
32 | obj-$(CONFIG_MD_FAULTY) += faulty.o | 32 | obj-$(CONFIG_MD_FAULTY) += faulty.o |
33 | obj-$(CONFIG_MD_CLUSTER) += md-cluster.o | ||
33 | obj-$(CONFIG_BCACHE) += bcache/ | 34 | obj-$(CONFIG_BCACHE) += bcache/ |
34 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | 35 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o |
35 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 36 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3a5767968ba0..2bc56e2a3526 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
205 | struct block_device *bdev; | 205 | struct block_device *bdev; |
206 | struct mddev *mddev = bitmap->mddev; | 206 | struct mddev *mddev = bitmap->mddev; |
207 | struct bitmap_storage *store = &bitmap->storage; | 207 | struct bitmap_storage *store = &bitmap->storage; |
208 | int node_offset = 0; | ||
209 | |||
210 | if (mddev_is_clustered(bitmap->mddev)) | ||
211 | node_offset = bitmap->cluster_slot * store->file_pages; | ||
208 | 212 | ||
209 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 213 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
210 | int size = PAGE_SIZE; | 214 | int size = PAGE_SIZE; |
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
433 | /* This might have been changed by a reshape */ | 437 | /* This might have been changed by a reshape */ |
434 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | 438 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); |
435 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); | 439 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); |
440 | sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); | ||
436 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> | 441 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> |
437 | bitmap_info.space); | 442 | bitmap_info.space); |
438 | kunmap_atomic(sb); | 443 | kunmap_atomic(sb); |
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
544 | bitmap_super_t *sb; | 549 | bitmap_super_t *sb; |
545 | unsigned long chunksize, daemon_sleep, write_behind; | 550 | unsigned long chunksize, daemon_sleep, write_behind; |
546 | unsigned long long events; | 551 | unsigned long long events; |
552 | int nodes = 0; | ||
547 | unsigned long sectors_reserved = 0; | 553 | unsigned long sectors_reserved = 0; |
548 | int err = -EINVAL; | 554 | int err = -EINVAL; |
549 | struct page *sb_page; | 555 | struct page *sb_page; |
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
562 | return -ENOMEM; | 568 | return -ENOMEM; |
563 | bitmap->storage.sb_page = sb_page; | 569 | bitmap->storage.sb_page = sb_page; |
564 | 570 | ||
571 | re_read: | ||
572 | /* If cluster_slot is set, the cluster is setup */ | ||
573 | if (bitmap->cluster_slot >= 0) { | ||
574 | sector_t bm_blocks = bitmap->mddev->resync_max_sectors; | ||
575 | |||
576 | sector_div(bm_blocks, | ||
577 | bitmap->mddev->bitmap_info.chunksize >> 9); | ||
578 | /* bits to bytes */ | ||
579 | bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); | ||
580 | /* to 4k blocks */ | ||
581 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); | ||
582 | bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); | ||
583 | pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, | ||
584 | bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); | ||
585 | } | ||
586 | |||
565 | if (bitmap->storage.file) { | 587 | if (bitmap->storage.file) { |
566 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); | 588 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); |
567 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; | 589 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; |
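Worked example of the offset arithmetic in the hunk above: a 1 TiB array with a 64 MiB bitmap chunk has 16384 chunks, so each slot's bitmap fits in a single 4 KiB block and slot 2 starts 16 sectors into the bitmap area. A userspace re-computation (256 standing in for sizeof(bitmap_super_t)):

```c
#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	uint64_t resync_max_sectors = 2147483648ULL; /* 1 TiB in 512B sectors */
	uint32_t chunksize = 64 * 1024 * 1024;       /* 64 MiB bitmap chunk */
	int slot = 2;                                /* this node's bitmap slot */

	uint64_t bm_blocks = resync_max_sectors / (chunksize >> 9); /* bits */
	bm_blocks = ((bm_blocks + 7) >> 3) + 256;  /* bits -> bytes, + super */
	bm_blocks = DIV_ROUND_UP(bm_blocks, 4096); /* -> 4 KiB blocks */

	/* each slot owns bm_blocks 4 KiB blocks; offsets count 512B sectors */
	printf("slot %d starts %llu sectors into the bitmap area\n",
	       slot, (unsigned long long)(slot * (bm_blocks << 3)));
	return 0;
}
```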
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
577 | if (err) | 599 | if (err) |
578 | return err; | 600 | return err; |
579 | 601 | ||
602 | err = -EINVAL; | ||
580 | sb = kmap_atomic(sb_page); | 603 | sb = kmap_atomic(sb_page); |
581 | 604 | ||
582 | chunksize = le32_to_cpu(sb->chunksize); | 605 | chunksize = le32_to_cpu(sb->chunksize); |
583 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; | 606 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
584 | write_behind = le32_to_cpu(sb->write_behind); | 607 | write_behind = le32_to_cpu(sb->write_behind); |
585 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); | 608 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); |
609 | nodes = le32_to_cpu(sb->nodes); | ||
610 | strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); | ||
586 | 611 | ||
587 | /* verify that the bitmap-specific fields are valid */ | 612 | /* verify that the bitmap-specific fields are valid */ |
588 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) | 613 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) |
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
619 | goto out; | 644 | goto out; |
620 | } | 645 | } |
621 | events = le64_to_cpu(sb->events); | 646 | events = le64_to_cpu(sb->events); |
622 | if (events < bitmap->mddev->events) { | 647 | if (!nodes && (events < bitmap->mddev->events)) { |
623 | printk(KERN_INFO | 648 | printk(KERN_INFO |
624 | "%s: bitmap file is out of date (%llu < %llu) " | 649 | "%s: bitmap file is out of date (%llu < %llu) " |
625 | "-- forcing full recovery\n", | 650 | "-- forcing full recovery\n", |
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
634 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 659 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
635 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); | 660 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); |
636 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 661 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
662 | strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); | ||
637 | err = 0; | 663 | err = 0; |
664 | |||
638 | out: | 665 | out: |
639 | kunmap_atomic(sb); | 666 | kunmap_atomic(sb); |
667 | /* Assigning chunksize is required for "re_read" */ | ||
668 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
669 | if (nodes && (bitmap->cluster_slot < 0)) { | ||
670 | err = md_setup_cluster(bitmap->mddev, nodes); | ||
671 | if (err) { | ||
672 | pr_err("%s: Could not setup cluster service (%d)\n", | ||
673 | bmname(bitmap), err); | ||
674 | goto out_no_sb; | ||
675 | } | ||
676 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); | ||
677 | goto re_read; | ||
678 | } | ||
679 | |||
680 | |||
640 | out_no_sb: | 681 | out_no_sb: |
641 | if (test_bit(BITMAP_STALE, &bitmap->flags)) | 682 | if (test_bit(BITMAP_STALE, &bitmap->flags)) |
642 | bitmap->events_cleared = bitmap->mddev->events; | 683 | bitmap->events_cleared = bitmap->mddev->events; |
643 | bitmap->mddev->bitmap_info.chunksize = chunksize; | 684 | bitmap->mddev->bitmap_info.chunksize = chunksize; |
644 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | 685 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; |
645 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | 686 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; |
687 | bitmap->mddev->bitmap_info.nodes = nodes; | ||
646 | if (bitmap->mddev->bitmap_info.space == 0 || | 688 | if (bitmap->mddev->bitmap_info.space == 0 || |
647 | bitmap->mddev->bitmap_info.space > sectors_reserved) | 689 | bitmap->mddev->bitmap_info.space > sectors_reserved) |
648 | bitmap->mddev->bitmap_info.space = sectors_reserved; | 690 | bitmap->mddev->bitmap_info.space = sectors_reserved; |
649 | if (err) | 691 | if (err) { |
650 | bitmap_print_sb(bitmap); | 692 | bitmap_print_sb(bitmap); |
693 | if (bitmap->cluster_slot < 0) | ||
694 | md_cluster_stop(bitmap->mddev); | ||
695 | } | ||
651 | return err; | 696 | return err; |
652 | } | 697 | } |
653 | 698 | ||
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store, | |||
692 | } | 737 | } |
693 | 738 | ||
694 | static int bitmap_storage_alloc(struct bitmap_storage *store, | 739 | static int bitmap_storage_alloc(struct bitmap_storage *store, |
695 | unsigned long chunks, int with_super) | 740 | unsigned long chunks, int with_super, |
741 | int slot_number) | ||
696 | { | 742 | { |
697 | int pnum; | 743 | int pnum, offset = 0; |
698 | unsigned long num_pages; | 744 | unsigned long num_pages; |
699 | unsigned long bytes; | 745 | unsigned long bytes; |
700 | 746 | ||
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, | |||
703 | bytes += sizeof(bitmap_super_t); | 749 | bytes += sizeof(bitmap_super_t); |
704 | 750 | ||
705 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | 751 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); |
752 | offset = slot_number * (num_pages - 1); | ||
706 | 753 | ||
707 | store->filemap = kmalloc(sizeof(struct page *) | 754 | store->filemap = kmalloc(sizeof(struct page *) |
708 | * num_pages, GFP_KERNEL); | 755 | * num_pages, GFP_KERNEL); |
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, | |||
713 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); | 760 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); |
714 | if (store->sb_page == NULL) | 761 | if (store->sb_page == NULL) |
715 | return -ENOMEM; | 762 | return -ENOMEM; |
716 | store->sb_page->index = 0; | ||
717 | } | 763 | } |
764 | |||
718 | pnum = 0; | 765 | pnum = 0; |
719 | if (store->sb_page) { | 766 | if (store->sb_page) { |
720 | store->filemap[0] = store->sb_page; | 767 | store->filemap[0] = store->sb_page; |
721 | pnum = 1; | 768 | pnum = 1; |
769 | store->sb_page->index = offset; | ||
722 | } | 770 | } |
771 | |||
723 | for ( ; pnum < num_pages; pnum++) { | 772 | for ( ; pnum < num_pages; pnum++) { |
724 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); | 773 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); |
725 | if (!store->filemap[pnum]) { | 774 | if (!store->filemap[pnum]) { |
726 | store->file_pages = pnum; | 775 | store->file_pages = pnum; |
727 | return -ENOMEM; | 776 | return -ENOMEM; |
728 | } | 777 | } |
729 | store->filemap[pnum]->index = pnum; | 778 | store->filemap[pnum]->index = pnum + offset; |
730 | } | 779 | } |
731 | store->file_pages = pnum; | 780 | store->file_pages = pnum; |
732 | 781 | ||
@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) | |||
885 | } | 934 | } |
886 | } | 935 | } |
887 | 936 | ||
937 | static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) | ||
938 | { | ||
939 | unsigned long bit; | ||
940 | struct page *page; | ||
941 | void *paddr; | ||
942 | unsigned long chunk = block >> bitmap->counts.chunkshift; | ||
943 | int set = 0; | ||
944 | |||
945 | page = filemap_get_page(&bitmap->storage, chunk); | ||
946 | if (!page) | ||
947 | return -EINVAL; | ||
948 | bit = file_page_offset(&bitmap->storage, chunk); | ||
949 | paddr = kmap_atomic(page); | ||
950 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) | ||
951 | set = test_bit(bit, paddr); | ||
952 | else | ||
953 | set = test_bit_le(bit, paddr); | ||
954 | kunmap_atomic(paddr); | ||
955 | return set; | ||
956 | } | ||
957 | |||
958 | |||
888 | /* this gets called when the md device is ready to unplug its underlying | 959 | /* this gets called when the md device is ready to unplug its underlying |
889 | * (slave) device queues -- before we let any writes go down, we need to | 960 | * (slave) device queues -- before we let any writes go down, we need to |
890 | * sync the dirty pages of the bitmap file to disk */ | 961 | * sync the dirty pages of the bitmap file to disk */ |
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
935 | */ | 1006 | */ |
936 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | 1007 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) |
937 | { | 1008 | { |
938 | unsigned long i, chunks, index, oldindex, bit; | 1009 | unsigned long i, chunks, index, oldindex, bit, node_offset = 0; |
939 | struct page *page = NULL; | 1010 | struct page *page = NULL; |
940 | unsigned long bit_cnt = 0; | 1011 | unsigned long bit_cnt = 0; |
941 | struct file *file; | 1012 | struct file *file; |
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
981 | if (!bitmap->mddev->bitmap_info.external) | 1052 | if (!bitmap->mddev->bitmap_info.external) |
982 | offset = sizeof(bitmap_super_t); | 1053 | offset = sizeof(bitmap_super_t); |
983 | 1054 | ||
1055 | if (mddev_is_clustered(bitmap->mddev)) | ||
1056 | node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); | ||
1057 | |||
984 | for (i = 0; i < chunks; i++) { | 1058 | for (i = 0; i < chunks; i++) { |
985 | int b; | 1059 | int b; |
986 | index = file_page_index(&bitmap->storage, i); | 1060 | index = file_page_index(&bitmap->storage, i); |
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1001 | bitmap->mddev, | 1075 | bitmap->mddev, |
1002 | bitmap->mddev->bitmap_info.offset, | 1076 | bitmap->mddev->bitmap_info.offset, |
1003 | page, | 1077 | page, |
1004 | index, count); | 1078 | index + node_offset, count); |
1005 | 1079 | ||
1006 | if (ret) | 1080 | if (ret) |
1007 | goto err; | 1081 | goto err; |
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1207 | j < bitmap->storage.file_pages | 1281 | j < bitmap->storage.file_pages |
1208 | && !test_bit(BITMAP_STALE, &bitmap->flags); | 1282 | && !test_bit(BITMAP_STALE, &bitmap->flags); |
1209 | j++) { | 1283 | j++) { |
1210 | |||
1211 | if (test_page_attr(bitmap, j, | 1284 | if (test_page_attr(bitmap, j, |
1212 | BITMAP_PAGE_DIRTY)) | 1285 | BITMAP_PAGE_DIRTY)) |
1213 | /* bitmap_unplug will handle the rest */ | 1286 | /* bitmap_unplug will handle the rest */ |
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1530 | return; | 1603 | return; |
1531 | } | 1604 | } |
1532 | if (!*bmc) { | 1605 | if (!*bmc) { |
1533 | *bmc = 2 | (needed ? NEEDED_MASK : 0); | 1606 | *bmc = 2; |
1534 | bitmap_count_page(&bitmap->counts, offset, 1); | 1607 | bitmap_count_page(&bitmap->counts, offset, 1); |
1535 | bitmap_set_pending(&bitmap->counts, offset); | 1608 | bitmap_set_pending(&bitmap->counts, offset); |
1536 | bitmap->allclean = 0; | 1609 | bitmap->allclean = 0; |
1537 | } | 1610 | } |
1611 | if (needed) | ||
1612 | *bmc |= NEEDED_MASK; | ||
1538 | spin_unlock_irq(&bitmap->counts.lock); | 1613 | spin_unlock_irq(&bitmap->counts.lock); |
1539 | } | 1614 | } |
1540 | 1615 | ||
@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap) | |||
1591 | if (!bitmap) /* there was no bitmap */ | 1666 | if (!bitmap) /* there was no bitmap */ |
1592 | return; | 1667 | return; |
1593 | 1668 | ||
1669 | if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && | ||
1670 | bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) | ||
1671 | md_cluster_stop(bitmap->mddev); | ||
1672 | |||
1594 | /* Shouldn't be needed - but just in case.... */ | 1673 | /* Shouldn't be needed - but just in case.... */ |
1595 | wait_event(bitmap->write_wait, | 1674 | wait_event(bitmap->write_wait, |
1596 | atomic_read(&bitmap->pending_writes) == 0); | 1675 | atomic_read(&bitmap->pending_writes) == 0); |
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev) | |||
1636 | * initialize the bitmap structure | 1715 | * initialize the bitmap structure |
1637 | * if this returns an error, bitmap_destroy must be called to do clean up | 1716 | * if this returns an error, bitmap_destroy must be called to do clean up |
1638 | */ | 1717 | */ |
1639 | int bitmap_create(struct mddev *mddev) | 1718 | struct bitmap *bitmap_create(struct mddev *mddev, int slot) |
1640 | { | 1719 | { |
1641 | struct bitmap *bitmap; | 1720 | struct bitmap *bitmap; |
1642 | sector_t blocks = mddev->resync_max_sectors; | 1721 | sector_t blocks = mddev->resync_max_sectors; |
@@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev) | |||
1650 | 1729 | ||
1651 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1730 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1652 | if (!bitmap) | 1731 | if (!bitmap) |
1653 | return -ENOMEM; | 1732 | return ERR_PTR(-ENOMEM); |
1654 | 1733 | ||
1655 | spin_lock_init(&bitmap->counts.lock); | 1734 | spin_lock_init(&bitmap->counts.lock); |
1656 | atomic_set(&bitmap->pending_writes, 0); | 1735 | atomic_set(&bitmap->pending_writes, 0); |
@@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev) | |||
1659 | init_waitqueue_head(&bitmap->behind_wait); | 1738 | init_waitqueue_head(&bitmap->behind_wait); |
1660 | 1739 | ||
1661 | bitmap->mddev = mddev; | 1740 | bitmap->mddev = mddev; |
1741 | bitmap->cluster_slot = slot; | ||
1662 | 1742 | ||
1663 | if (mddev->kobj.sd) | 1743 | if (mddev->kobj.sd) |
1664 | bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); | 1744 | bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); |
@@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev) | |||
1706 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1786 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", |
1707 | bitmap->counts.pages, bmname(bitmap)); | 1787 | bitmap->counts.pages, bmname(bitmap)); |
1708 | 1788 | ||
1709 | mddev->bitmap = bitmap; | 1789 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; |
1710 | return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | 1790 | if (err) |
1791 | goto error; | ||
1711 | 1792 | ||
1793 | return bitmap; | ||
1712 | error: | 1794 | error: |
1713 | bitmap_free(bitmap); | 1795 | bitmap_free(bitmap); |
1714 | return err; | 1796 | return ERR_PTR(err); |
1715 | } | 1797 | } |
1716 | 1798 | ||
1717 | int bitmap_load(struct mddev *mddev) | 1799 | int bitmap_load(struct mddev *mddev) |
@@ -1765,6 +1847,60 @@ out: | |||
1765 | } | 1847 | } |
1766 | EXPORT_SYMBOL_GPL(bitmap_load); | 1848 | EXPORT_SYMBOL_GPL(bitmap_load); |
1767 | 1849 | ||
1850 | /* Loads the bitmap associated with slot and copies the resync information | ||
1851 | * to our bitmap | ||
1852 | */ | ||
1853 | int bitmap_copy_from_slot(struct mddev *mddev, int slot, | ||
1854 | sector_t *low, sector_t *high, bool clear_bits) | ||
1855 | { | ||
1856 | int rv = 0, i, j; | ||
1857 | sector_t block, lo = 0, hi = 0; | ||
1858 | struct bitmap_counts *counts; | ||
1859 | struct bitmap *bitmap = bitmap_create(mddev, slot); | ||
1860 | |||
1861 | if (IS_ERR(bitmap)) | ||
1862 | return PTR_ERR(bitmap); | ||
1863 | |||
1864 | rv = bitmap_read_sb(bitmap); | ||
1865 | if (rv) | ||
1866 | goto err; | ||
1867 | |||
1868 | rv = bitmap_init_from_disk(bitmap, 0); | ||
1869 | if (rv) | ||
1870 | goto err; | ||
1871 | |||
1872 | counts = &bitmap->counts; | ||
1873 | for (j = 0; j < counts->chunks; j++) { | ||
1874 | block = (sector_t)j << counts->chunkshift; | ||
1875 | if (bitmap_file_test_bit(bitmap, block)) { | ||
1876 | if (!lo) | ||
1877 | lo = block; | ||
1878 | hi = block; | ||
1879 | bitmap_file_clear_bit(bitmap, block); | ||
1880 | bitmap_set_memory_bits(mddev->bitmap, block, 1); | ||
1881 | bitmap_file_set_bit(mddev->bitmap, block); | ||
1882 | } | ||
1883 | } | ||
1884 | |||
1885 | if (clear_bits) { | ||
1886 | bitmap_update_sb(bitmap); | ||
1887 | /* Setting this for the ev_page should be enough. | ||
1888 | * And we do not require both write_all and PAGE_DIRTY either | ||
1889 | */ | ||
1890 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
1891 | set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
1892 | bitmap_write_all(bitmap); | ||
1893 | bitmap_unplug(bitmap); | ||
1894 | } | ||
1895 | *low = lo; | ||
1896 | *high = hi; | ||
1897 | err: | ||
1898 | bitmap_free(bitmap); | ||
1899 | return rv; | ||
1900 | } | ||
1901 | EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); | ||
1902 | |||
1903 | |||
1768 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) | 1904 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) |
1769 | { | 1905 | { |
1770 | unsigned long chunk_kb; | 1906 | unsigned long chunk_kb; |
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
1849 | memset(&store, 0, sizeof(store)); | 1985 | memset(&store, 0, sizeof(store)); |
1850 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) | 1986 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) |
1851 | ret = bitmap_storage_alloc(&store, chunks, | 1987 | ret = bitmap_storage_alloc(&store, chunks, |
1852 | !bitmap->mddev->bitmap_info.external); | 1988 | !bitmap->mddev->bitmap_info.external, |
1989 | bitmap->cluster_slot); | ||
1853 | if (ret) | 1990 | if (ret) |
1854 | goto err; | 1991 | goto err; |
1855 | 1992 | ||
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
2021 | return -EINVAL; | 2158 | return -EINVAL; |
2022 | mddev->bitmap_info.offset = offset; | 2159 | mddev->bitmap_info.offset = offset; |
2023 | if (mddev->pers) { | 2160 | if (mddev->pers) { |
2161 | struct bitmap *bitmap; | ||
2024 | mddev->pers->quiesce(mddev, 1); | 2162 | mddev->pers->quiesce(mddev, 1); |
2025 | rv = bitmap_create(mddev); | 2163 | bitmap = bitmap_create(mddev, -1); |
2026 | if (!rv) | 2164 | if (IS_ERR(bitmap)) |
2165 | rv = PTR_ERR(bitmap); | ||
2166 | else { | ||
2167 | mddev->bitmap = bitmap; | ||
2027 | rv = bitmap_load(mddev); | 2168 | rv = bitmap_load(mddev); |
2028 | if (rv) { | 2169 | if (rv) { |
2029 | bitmap_destroy(mddev); | 2170 | bitmap_destroy(mddev); |
2030 | mddev->bitmap_info.offset = 0; | 2171 | mddev->bitmap_info.offset = 0; |
2172 | } | ||
2031 | } | 2173 | } |
2032 | mddev->pers->quiesce(mddev, 0); | 2174 | mddev->pers->quiesce(mddev, 0); |
2033 | if (rv) | 2175 | if (rv) |
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); | |||
2186 | 2328 | ||
2187 | static ssize_t metadata_show(struct mddev *mddev, char *page) | 2329 | static ssize_t metadata_show(struct mddev *mddev, char *page) |
2188 | { | 2330 | { |
2331 | if (mddev_is_clustered(mddev)) | ||
2332 | return sprintf(page, "clustered\n"); | ||
2189 | return sprintf(page, "%s\n", (mddev->bitmap_info.external | 2333 | return sprintf(page, "%s\n", (mddev->bitmap_info.external |
2190 | ? "external" : "internal")); | 2334 | ? "external" : "internal")); |
2191 | } | 2335 | } |
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) | |||
2198 | return -EBUSY; | 2342 | return -EBUSY; |
2199 | if (strncmp(buf, "external", 8) == 0) | 2343 | if (strncmp(buf, "external", 8) == 0) |
2200 | mddev->bitmap_info.external = 1; | 2344 | mddev->bitmap_info.external = 1; |
2201 | else if (strncmp(buf, "internal", 8) == 0) | 2345 | else if ((strncmp(buf, "internal", 8) == 0) || |
2346 | (strncmp(buf, "clustered", 9) == 0)) | ||
2202 | mddev->bitmap_info.external = 0; | 2347 | mddev->bitmap_info.external = 0; |
2203 | else | 2348 | else |
2204 | return -EINVAL; | 2349 | return -EINVAL; |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 30210b9c4ef9..f1f4dd01090d 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s { | |||
130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ | 130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ |
131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are | 131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are |
132 | * reserved for the bitmap. */ | 132 | * reserved for the bitmap. */ |
133 | 133 | __le32 nodes; /* 68 the maximum number of nodes in cluster. */ | |
134 | __u8 pad[256 - 68]; /* set to zero */ | 134 | __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ |
135 | __u8 pad[256 - 136]; /* set to zero */ | ||
135 | } bitmap_super_t; | 136 | } bitmap_super_t; |
136 | 137 | ||
137 | /* notes: | 138 | /* notes: |
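The byte offsets in those field comments can be sanity-checked with a standalone mirror of the structure's tail (a layout check only, not the kernel header):

```c
#include <stdint.h>
#include <stddef.h>

/* Standalone mirror of the tail of bitmap_super_t as laid out above. */
struct bitmap_super_tail {
	uint8_t  head[60];         /* magic .. daemon_sleep, bytes 0-59 */
	uint32_t write_behind;     /* 60 */
	uint32_t sectors_reserved; /* 64 */
	uint32_t nodes;            /* 68 */
	uint8_t  cluster_name[64]; /* 72 */
	uint8_t  pad[256 - 136];   /* 136: zero padding up to 256 bytes */
};

_Static_assert(offsetof(struct bitmap_super_tail, nodes) == 68,
	       "nodes at byte 68");
_Static_assert(offsetof(struct bitmap_super_tail, cluster_name) == 72,
	       "cluster_name at byte 72");
_Static_assert(sizeof(struct bitmap_super_tail) == 256,
	       "256-byte superblock");
```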
@@ -226,12 +227,13 @@ struct bitmap { | |||
226 | wait_queue_head_t behind_wait; | 227 | wait_queue_head_t behind_wait; |
227 | 228 | ||
228 | struct kernfs_node *sysfs_can_clear; | 229 | struct kernfs_node *sysfs_can_clear; |
230 | int cluster_slot; /* Slot offset for clustered env */ | ||
229 | }; | 231 | }; |
230 | 232 | ||
231 | /* the bitmap API */ | 233 | /* the bitmap API */ |
232 | 234 | ||
233 | /* these are used only by md/bitmap */ | 235 | /* these are used only by md/bitmap */ |
234 | int bitmap_create(struct mddev *mddev); | 236 | struct bitmap *bitmap_create(struct mddev *mddev, int slot); |
235 | int bitmap_load(struct mddev *mddev); | 237 | int bitmap_load(struct mddev *mddev); |
236 | void bitmap_flush(struct mddev *mddev); | 238 | void bitmap_flush(struct mddev *mddev); |
237 | void bitmap_destroy(struct mddev *mddev); | 239 | void bitmap_destroy(struct mddev *mddev); |
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev); | |||
260 | 262 | ||
261 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | 263 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, |
262 | int chunksize, int init); | 264 | int chunksize, int init); |
265 | int bitmap_copy_from_slot(struct mddev *mddev, int slot, | ||
266 | sector_t *lo, sector_t *hi, bool clear_bits); | ||
263 | #endif | 267 | #endif |
264 | 268 | ||
265 | #endif | 269 | #endif |
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c new file mode 100644 index 000000000000..fcfc4b9b2672 --- /dev/null +++ b/drivers/md/md-cluster.c | |||
@@ -0,0 +1,965 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2015, SUSE | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/dlm.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/raid/md_p.h> | ||
16 | #include "md.h" | ||
17 | #include "bitmap.h" | ||
18 | #include "md-cluster.h" | ||
19 | |||
20 | #define LVB_SIZE 64 | ||
21 | #define NEW_DEV_TIMEOUT 5000 | ||
22 | |||
23 | struct dlm_lock_resource { | ||
24 | dlm_lockspace_t *ls; | ||
25 | struct dlm_lksb lksb; | ||
26 | char *name; /* lock name. */ | ||
27 | uint32_t flags; /* flags to pass to dlm_lock() */ | ||
28 | struct completion completion; /* completion for synchronized locking */ | ||
29 | void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ | ||
30 | struct mddev *mddev; /* pointing back to mddev. */ | ||
31 | }; | ||
32 | |||
33 | struct suspend_info { | ||
34 | int slot; | ||
35 | sector_t lo; | ||
36 | sector_t hi; | ||
37 | struct list_head list; | ||
38 | }; | ||
39 | |||
40 | struct resync_info { | ||
41 | __le64 lo; | ||
42 | __le64 hi; | ||
43 | }; | ||
44 | |||
45 | /* md_cluster_info flags */ | ||
46 | #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 | ||
47 | |||
48 | |||
49 | struct md_cluster_info { | ||
50 | /* dlm lock space and resources for clustered raid. */ | ||
51 | dlm_lockspace_t *lockspace; | ||
52 | int slot_number; | ||
53 | struct completion completion; | ||
54 | struct dlm_lock_resource *sb_lock; | ||
55 | struct mutex sb_mutex; | ||
56 | struct dlm_lock_resource *bitmap_lockres; | ||
57 | struct list_head suspend_list; | ||
58 | spinlock_t suspend_lock; | ||
59 | struct md_thread *recovery_thread; | ||
60 | unsigned long recovery_map; | ||
61 | /* communication lock resources */ | ||
62 | struct dlm_lock_resource *ack_lockres; | ||
63 | struct dlm_lock_resource *message_lockres; | ||
64 | struct dlm_lock_resource *token_lockres; | ||
65 | struct dlm_lock_resource *no_new_dev_lockres; | ||
66 | struct md_thread *recv_thread; | ||
67 | struct completion newdisk_completion; | ||
68 | unsigned long state; | ||
69 | }; | ||
70 | |||
71 | enum msg_type { | ||
72 | METADATA_UPDATED = 0, | ||
73 | RESYNCING, | ||
74 | NEWDISK, | ||
75 | REMOVE, | ||
76 | RE_ADD, | ||
77 | }; | ||
78 | |||
79 | struct cluster_msg { | ||
80 | int type; | ||
81 | int slot; | ||
82 | /* TODO: Unionize this for smaller footprint */ | ||
83 | sector_t low; | ||
84 | sector_t high; | ||
85 | char uuid[16]; | ||
86 | int raid_slot; | ||
87 | }; | ||
88 | |||
89 | static void sync_ast(void *arg) | ||
90 | { | ||
91 | struct dlm_lock_resource *res; | ||
92 | |||
93 | res = (struct dlm_lock_resource *) arg; | ||
94 | complete(&res->completion); | ||
95 | } | ||
96 | |||
97 | static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) | ||
98 | { | ||
99 | int ret = 0; | ||
100 | |||
101 | init_completion(&res->completion); | ||
102 | ret = dlm_lock(res->ls, mode, &res->lksb, | ||
103 | res->flags, res->name, strlen(res->name), | ||
104 | 0, sync_ast, res, res->bast); | ||
105 | if (ret) | ||
106 | return ret; | ||
107 | wait_for_completion(&res->completion); | ||
108 | return res->lksb.sb_status; | ||
109 | } | ||
110 | |||
111 | static int dlm_unlock_sync(struct dlm_lock_resource *res) | ||
112 | { | ||
113 | return dlm_lock_sync(res, DLM_LOCK_NL); | ||
114 | } | ||
115 | |||
116 | static struct dlm_lock_resource *lockres_init(struct mddev *mddev, | ||
117 | char *name, void (*bastfn)(void *arg, int mode), int with_lvb) | ||
118 | { | ||
119 | struct dlm_lock_resource *res = NULL; | ||
120 | int ret, namelen; | ||
121 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
122 | |||
123 | res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); | ||
124 | if (!res) | ||
125 | return NULL; | ||
126 | res->ls = cinfo->lockspace; | ||
127 | res->mddev = mddev; | ||
128 | namelen = strlen(name); | ||
129 | res->name = kzalloc(namelen + 1, GFP_KERNEL); | ||
130 | if (!res->name) { | ||
131 | pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); | ||
132 | goto out_err; | ||
133 | } | ||
134 | strlcpy(res->name, name, namelen + 1); | ||
135 | if (with_lvb) { | ||
136 | res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); | ||
137 | if (!res->lksb.sb_lvbptr) { | ||
138 | pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); | ||
139 | goto out_err; | ||
140 | } | ||
141 | res->flags = DLM_LKF_VALBLK; | ||
142 | } | ||
143 | |||
144 | if (bastfn) | ||
145 | res->bast = bastfn; | ||
146 | |||
147 | res->flags |= DLM_LKF_EXPEDITE; | ||
148 | |||
149 | ret = dlm_lock_sync(res, DLM_LOCK_NL); | ||
150 | if (ret) { | ||
151 | pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); | ||
152 | goto out_err; | ||
153 | } | ||
154 | res->flags &= ~DLM_LKF_EXPEDITE; | ||
155 | res->flags |= DLM_LKF_CONVERT; | ||
156 | |||
157 | return res; | ||
158 | out_err: | ||
159 | kfree(res->lksb.sb_lvbptr); | ||
160 | kfree(res->name); | ||
161 | kfree(res); | ||
162 | return NULL; | ||
163 | } | ||
164 | |||
165 | static void lockres_free(struct dlm_lock_resource *res) | ||
166 | { | ||
167 | if (!res) | ||
168 | return; | ||
169 | |||
170 | init_completion(&res->completion); | ||
171 | dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); | ||
172 | wait_for_completion(&res->completion); | ||
173 | |||
174 | kfree(res->name); | ||
175 | kfree(res->lksb.sb_lvbptr); | ||
176 | kfree(res); | ||
177 | } | ||
178 | |||
179 | static char *pretty_uuid(char *dest, char *src) | ||
180 | { | ||
181 | int i, len = 0; | ||
182 | |||
183 | for (i = 0; i < 16; i++) { | ||
184 | if (i == 4 || i == 6 || i == 8 || i == 10) | ||
185 | len += sprintf(dest + len, "-"); | ||
186 | len += sprintf(dest + len, "%02x", (__u8)src[i]); | ||
187 | } | ||
188 | return dest; | ||
189 | } | ||
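Since pretty_uuid() inserts a dash before bytes 4, 6, 8 and 10, the output is the familiar 8-4-4-4-12 form, so dest needs at least 37 bytes (36 characters plus the NUL that sprintf() appends). A usage fragment:

```c
/* e.g. bytes 00 11 22 ... ff render as
 * "00112233-4455-6677-8899-aabbccddeeff" */
char str[37];

pretty_uuid(str, mddev->uuid);
```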
190 | |||
191 | static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, | ||
192 | sector_t lo, sector_t hi) | ||
193 | { | ||
194 | struct resync_info *ri; | ||
195 | |||
196 | ri = (struct resync_info *)lockres->lksb.sb_lvbptr; | ||
197 | ri->lo = cpu_to_le64(lo); | ||
198 | ri->hi = cpu_to_le64(hi); | ||
199 | } | ||
200 | |||
201 | static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) | ||
202 | { | ||
203 | struct resync_info ri; | ||
204 | struct suspend_info *s = NULL; | ||
205 | sector_t hi = 0; | ||
206 | |||
207 | dlm_lock_sync(lockres, DLM_LOCK_CR); | ||
208 | memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); | ||
209 | hi = le64_to_cpu(ri.hi); | ||
210 | if (ri.hi > 0) { | ||
211 | s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); | ||
212 | if (!s) | ||
213 | goto out; | ||
214 | s->hi = hi; | ||
215 | s->lo = le64_to_cpu(ri.lo); | ||
216 | } | ||
217 | dlm_unlock_sync(lockres); | ||
218 | out: | ||
219 | return s; | ||
220 | } | ||
221 | |||
222 | static void recover_bitmaps(struct md_thread *thread) | ||
223 | { | ||
224 | struct mddev *mddev = thread->mddev; | ||
225 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
226 | struct dlm_lock_resource *bm_lockres; | ||
227 | char str[64]; | ||
228 | int slot, ret; | ||
229 | struct suspend_info *s, *tmp; | ||
230 | sector_t lo, hi; | ||
231 | |||
232 | while (cinfo->recovery_map) { | ||
233 | slot = fls64((u64)cinfo->recovery_map) - 1; | ||
234 | |||
235 | /* Clear suspend_area associated with the bitmap */ | ||
236 | spin_lock_irq(&cinfo->suspend_lock); | ||
237 | list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) | ||
238 | if (slot == s->slot) { | ||
239 | list_del(&s->list); | ||
240 | kfree(s); | ||
241 | } | ||
242 | spin_unlock_irq(&cinfo->suspend_lock); | ||
243 | |||
244 | snprintf(str, 64, "bitmap%04d", slot); | ||
245 | bm_lockres = lockres_init(mddev, str, NULL, 1); | ||
246 | if (!bm_lockres) { | ||
247 | pr_err("md-cluster: Cannot initialize bitmaps\n"); | ||
248 | goto clear_bit; | ||
249 | } | ||
250 | |||
251 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | ||
252 | if (ret) { | ||
253 | pr_err("md-cluster: Could not DLM lock %s: %d\n", | ||
254 | str, ret); | ||
255 | goto clear_bit; | ||
256 | } | ||
257 | ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); | ||
258 | if (ret) { | ||
259 | pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); | ||
260 | goto dlm_unlock; | ||
261 | } | ||
262 | if (hi > 0) { | ||
263 | /* TODO:Wait for current resync to get over */ | ||
264 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
265 | if (lo < mddev->recovery_cp) | ||
266 | mddev->recovery_cp = lo; | ||
267 | md_check_recovery(mddev); | ||
268 | } | ||
269 | dlm_unlock: | ||
270 | dlm_unlock_sync(bm_lockres); | ||
271 | clear_bit: | ||
272 | clear_bit(slot, &cinfo->recovery_map); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | static void recover_prep(void *arg) | ||
277 | { | ||
278 | } | ||
279 | |||
280 | static void recover_slot(void *arg, struct dlm_slot *slot) | ||
281 | { | ||
282 | struct mddev *mddev = arg; | ||
283 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
284 | |||
285 | pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", | ||
286 | mddev->bitmap_info.cluster_name, | ||
287 | slot->nodeid, slot->slot, | ||
288 | cinfo->slot_number); | ||
289 | set_bit(slot->slot - 1, &cinfo->recovery_map); | ||
290 | if (!cinfo->recovery_thread) { | ||
291 | cinfo->recovery_thread = md_register_thread(recover_bitmaps, | ||
292 | mddev, "recover"); | ||
293 | if (!cinfo->recovery_thread) { | ||
294 | pr_warn("md-cluster: Could not create recovery thread\n"); | ||
295 | return; | ||
296 | } | ||
297 | } | ||
298 | md_wakeup_thread(cinfo->recovery_thread); | ||
299 | } | ||
300 | |||
301 | static void recover_done(void *arg, struct dlm_slot *slots, | ||
302 | int num_slots, int our_slot, | ||
303 | uint32_t generation) | ||
304 | { | ||
305 | struct mddev *mddev = arg; | ||
306 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
307 | |||
308 | cinfo->slot_number = our_slot; | ||
309 | complete(&cinfo->completion); | ||
310 | } | ||
311 | |||
312 | static const struct dlm_lockspace_ops md_ls_ops = { | ||
313 | .recover_prep = recover_prep, | ||
314 | .recover_slot = recover_slot, | ||
315 | .recover_done = recover_done, | ||
316 | }; | ||
317 | |||
318 | /* | ||
319 | * The BAST function for the ack lock resource | ||
320 | * This function wakes up the receive thread in | ||
321 | * order to receive and process the message. | ||
322 | */ | ||
323 | static void ack_bast(void *arg, int mode) | ||
324 | { | ||
325 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; | ||
326 | struct md_cluster_info *cinfo = res->mddev->cluster_info; | ||
327 | |||
328 | if (mode == DLM_LOCK_EX) | ||
329 | md_wakeup_thread(cinfo->recv_thread); | ||
330 | } | ||
331 | |||
332 | static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) | ||
333 | { | ||
334 | struct suspend_info *s, *tmp; | ||
335 | |||
336 | list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) | ||
337 | if (slot == s->slot) { | ||
338 | pr_info("%s:%d Deleting suspend_info: %d\n", | ||
339 | __func__, __LINE__, slot); | ||
340 | list_del(&s->list); | ||
341 | kfree(s); | ||
342 | break; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) | ||
347 | { | ||
348 | spin_lock_irq(&cinfo->suspend_lock); | ||
349 | __remove_suspend_info(cinfo, slot); | ||
350 | spin_unlock_irq(&cinfo->suspend_lock); | ||
351 | } | ||
352 | |||
353 | |||
354 | static void process_suspend_info(struct md_cluster_info *cinfo, | ||
355 | int slot, sector_t lo, sector_t hi) | ||
356 | { | ||
357 | struct suspend_info *s; | ||
358 | |||
359 | if (!hi) { | ||
360 | remove_suspend_info(cinfo, slot); | ||
361 | return; | ||
362 | } | ||
363 | s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); | ||
364 | if (!s) | ||
365 | return; | ||
366 | s->slot = slot; | ||
367 | s->lo = lo; | ||
368 | s->hi = hi; | ||
369 | spin_lock_irq(&cinfo->suspend_lock); | ||
370 | /* Remove existing entry (if exists) before adding */ | ||
371 | __remove_suspend_info(cinfo, slot); | ||
372 | list_add(&s->list, &cinfo->suspend_list); | ||
373 | spin_unlock_irq(&cinfo->suspend_lock); | ||
374 | } | ||
375 | |||
376 | static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) | ||
377 | { | ||
378 | char disk_uuid[64]; | ||
379 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
380 | char event_name[] = "EVENT=ADD_DEVICE"; | ||
381 | char raid_slot[16]; | ||
382 | char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; | ||
383 | int len; | ||
384 | |||
385 | len = snprintf(disk_uuid, 64, "DEVICE_UUID="); | ||
386 | pretty_uuid(disk_uuid + len, cmsg->uuid); | ||
387 | snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); | ||
388 | pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); | ||
389 | init_completion(&cinfo->newdisk_completion); | ||
390 | set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); | ||
391 | kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); | ||
392 | wait_for_completion_timeout(&cinfo->newdisk_completion, | ||
393 | NEW_DEV_TIMEOUT); | ||
394 | clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); | ||
395 | } | ||
396 | |||
397 | |||
398 | static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) | ||
399 | { | ||
400 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
401 | |||
402 | md_reload_sb(mddev); | ||
403 | dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); | ||
404 | } | ||
405 | |||
406 | static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) | ||
407 | { | ||
408 | struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); | ||
409 | |||
410 | if (rdev) | ||
411 | md_kick_rdev_from_array(rdev); | ||
412 | else | ||
413 | pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); | ||
414 | } | ||
415 | |||
416 | static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) | ||
417 | { | ||
418 | struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); | ||
419 | |||
420 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
421 | clear_bit(Faulty, &rdev->flags); | ||
422 | else | ||
423 | pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); | ||
424 | } | ||
425 | |||
426 | static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) | ||
427 | { | ||
428 | switch (msg->type) { | ||
429 | case METADATA_UPDATED: | ||
430 | pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", | ||
431 | __func__, __LINE__, msg->slot); | ||
432 | process_metadata_update(mddev, msg); | ||
433 | break; | ||
434 | case RESYNCING: | ||
435 | pr_info("%s: %d Received message: RESYNCING from %d\n", | ||
436 | __func__, __LINE__, msg->slot); | ||
437 | process_suspend_info(mddev->cluster_info, msg->slot, | ||
438 | msg->low, msg->high); | ||
439 | break; | ||
440 | case NEWDISK: | ||
441 | pr_info("%s: %d Received message: NEWDISK from %d\n", | ||
442 | __func__, __LINE__, msg->slot); | ||
443 | process_add_new_disk(mddev, msg); | ||
444 | break; | ||
445 | case REMOVE: | ||
446 | pr_info("%s: %d Received REMOVE from %d\n", | ||
447 | __func__, __LINE__, msg->slot); | ||
448 | process_remove_disk(mddev, msg); | ||
449 | break; | ||
450 | case RE_ADD: | ||
451 | pr_info("%s: %d Received RE_ADD from %d\n", | ||
452 | __func__, __LINE__, msg->slot); | ||
453 | process_readd_disk(mddev, msg); | ||
454 | break; | ||
455 | default: | ||
456 | pr_warn("%s:%d Received unknown message from %d\n", | ||
457 | __func__, __LINE__, msg->slot); | ||
458 | } | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * thread for receiving message | ||
463 | */ | ||
464 | static void recv_daemon(struct md_thread *thread) | ||
465 | { | ||
466 | struct md_cluster_info *cinfo = thread->mddev->cluster_info; | ||
467 | struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; | ||
468 | struct dlm_lock_resource *message_lockres = cinfo->message_lockres; | ||
469 | struct cluster_msg msg; | ||
470 | |||
471 | /*get CR on Message*/ | ||
472 | if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { | ||
473 | pr_err("md/raid1:failed to get CR on MESSAGE\n"); | ||
474 | return; | ||
475 | } | ||
476 | |||
477 | /* read lvb and wake up thread to process this message_lockres */ | ||
478 | memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); | ||
479 | process_recvd_msg(thread->mddev, &msg); | ||
480 | |||
481 | /*release CR on ack_lockres*/ | ||
482 | dlm_unlock_sync(ack_lockres); | ||
483 | /*up-convert to EX on message_lockres*/ | ||
484 | dlm_lock_sync(message_lockres, DLM_LOCK_EX); | ||
485 | /*get CR on ack_lockres again*/ | ||
486 | dlm_lock_sync(ack_lockres, DLM_LOCK_CR); | ||
487 | /*release CR on message_lockres*/ | ||
488 | dlm_unlock_sync(message_lockres); | ||
489 | } | ||
490 | |||
491 | /* lock_comm() | ||
492 | * Takes the lock on the TOKEN lock resource so no other | ||
493 | * node can communicate while the operation is underway. | ||
494 | */ | ||
495 | static int lock_comm(struct md_cluster_info *cinfo) | ||
496 | { | ||
497 | int error; | ||
498 | |||
499 | error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); | ||
500 | if (error) | ||
501 | pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", | ||
502 | __func__, __LINE__, error); | ||
503 | return error; | ||
504 | } | ||
505 | |||
506 | static void unlock_comm(struct md_cluster_info *cinfo) | ||
507 | { | ||
508 | dlm_unlock_sync(cinfo->token_lockres); | ||
509 | } | ||
510 | |||
511 | /* __sendmsg() | ||
512 | * This function performs the actual sending of the message. This function is | ||
513 | * usually called after performing the encompassing operation | ||
514 | * The function: | ||
515 | * 1. Grabs the message lockresource in EX mode | ||
516 | * 2. Copies the message to the message LVB | ||
517 | * 3. Downconverts message lockresource to CR | ||
518 | * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes | ||
519 | * and the other nodes read the message. The thread will wait here until all other | ||
520 | * nodes have released ack lock resource. | ||
521 | * 5. Downconvert ack lockresource to CR | ||
522 | */ | ||
523 | static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) | ||
524 | { | ||
525 | int error; | ||
526 | int slot = cinfo->slot_number - 1; | ||
527 | |||
528 | cmsg->slot = cpu_to_le32(slot); | ||
529 | /*get EX on Message*/ | ||
530 | error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); | ||
531 | if (error) { | ||
532 | pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); | ||
533 | goto failed_message; | ||
534 | } | ||
535 | |||
536 | memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, | ||
537 | sizeof(struct cluster_msg)); | ||
538 | /*down-convert EX to CR on Message*/ | ||
539 | error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); | ||
540 | if (error) { | ||
541 | pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", | ||
542 | error); | ||
543 | goto failed_message; | ||
544 | } | ||
545 | |||
546 | /*up-convert CR to EX on Ack*/ | ||
547 | error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); | ||
548 | if (error) { | ||
549 | pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", | ||
550 | error); | ||
551 | goto failed_ack; | ||
552 | } | ||
553 | |||
554 | /*down-convert EX to CR on Ack*/ | ||
555 | error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); | ||
556 | if (error) { | ||
557 | pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", | ||
558 | error); | ||
559 | goto failed_ack; | ||
560 | } | ||
561 | |||
562 | failed_ack: | ||
563 | dlm_unlock_sync(cinfo->message_lockres); | ||
564 | failed_message: | ||
565 | return error; | ||
566 | } | ||
567 | |||
568 | static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) | ||
569 | { | ||
570 | int ret; | ||
571 | |||
572 | lock_comm(cinfo); | ||
573 | ret = __sendmsg(cinfo, cmsg); | ||
574 | unlock_comm(cinfo); | ||
575 | return ret; | ||
576 | } | ||
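
Taken together, sendmsg() composes the whole send side: lock_comm() takes TOKEN, __sendmsg() runs the MESSAGE/ACK dance, and unlock_comm() drops TOKEN. The standalone sketch below (plain userspace C, not part of the patch) prints a paraphrase of the lock modes held at each stage, derived from the __sendmsg() comment and the receive-path lines at the top of this hunk:

#include <stdio.h>

/* Paraphrase of the md-cluster messaging choreography; not kernel code.
 * "sender" is the node calling sendmsg(); "peers" are all other nodes,
 * which idle holding CR on the ack lock resource. */
struct step {
	const char *sender;
	const char *peers;
};

int main(void)
{
	static const struct step steps[] = {
		{ "TOKEN:EX (lock_comm)",                 "ACK:CR" },
		{ "MESSAGE:EX, message copied into LVB",  "ACK:CR" },
		{ "MESSAGE down-converted EX->CR",        "ACK:CR" },
		{ "ACK CR->EX requested, sender blocks",  "BAST fires: read MESSAGE in CR, drop ACK" },
		{ "ACK:EX granted, down-converted to CR", "re-acquire ACK:CR" },
		{ "MESSAGE and TOKEN unlocked, ACK kept at CR", "ACK:CR (idle again)" },
	};

	for (unsigned i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
		printf("step %u: sender [%s], peers [%s]\n",
		       i + 1, steps[i].sender, steps[i].peers);
	return 0;
}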
577 | |||
578 | static int gather_all_resync_info(struct mddev *mddev, int total_slots) | ||
579 | { | ||
580 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
581 | int i, ret = 0; | ||
582 | struct dlm_lock_resource *bm_lockres; | ||
583 | struct suspend_info *s; | ||
584 | char str[64]; | ||
585 | |||
586 | |||
587 | for (i = 0; i < total_slots; i++) { | ||
588 | memset(str, '\0', 64); | ||
589 | snprintf(str, 64, "bitmap%04d", i); | ||
590 | bm_lockres = lockres_init(mddev, str, NULL, 1); | ||
591 | if (!bm_lockres) | ||
592 | return -ENOMEM; | ||
593 | if (i == (cinfo->slot_number - 1)) | ||
594 | continue; | ||
595 | |||
596 | bm_lockres->flags |= DLM_LKF_NOQUEUE; | ||
597 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | ||
598 | if (ret == -EAGAIN) { | ||
599 | memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); | ||
600 | s = read_resync_info(mddev, bm_lockres); | ||
601 | if (s) { | ||
602 | pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", | ||
603 | __func__, __LINE__, | ||
604 | (unsigned long long) s->lo, | ||
605 | (unsigned long long) s->hi, i); | ||
606 | spin_lock_irq(&cinfo->suspend_lock); | ||
607 | s->slot = i; | ||
608 | list_add(&s->list, &cinfo->suspend_list); | ||
609 | spin_unlock_irq(&cinfo->suspend_lock); | ||
610 | } | ||
611 | ret = 0; | ||
612 | lockres_free(bm_lockres); | ||
613 | continue; | ||
614 | } | ||
615 | if (ret) | ||
616 | goto out; | ||
617 | /* TODO: Read the disk bitmap sb and check if it needs recovery */ | ||
618 | dlm_unlock_sync(bm_lockres); | ||
619 | lockres_free(bm_lockres); | ||
620 | } | ||
621 | out: | ||
622 | return ret; | ||
623 | } | ||
624 | |||
625 | static int join(struct mddev *mddev, int nodes) | ||
626 | { | ||
627 | struct md_cluster_info *cinfo; | ||
628 | int ret, ops_rv; | ||
629 | char str[64]; | ||
630 | |||
631 | if (!try_module_get(THIS_MODULE)) | ||
632 | return -ENOENT; | ||
633 | |||
634 | cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); | ||
635 | if (!cinfo) { | ||
636 | module_put(THIS_MODULE); /* don't leak the ref taken above */ | ||
637 | return -ENOMEM; | ||
638 | } | ||
639 | init_completion(&cinfo->completion); | ||
640 | mutex_init(&cinfo->sb_mutex); | ||
641 | mddev->cluster_info = cinfo; | ||
642 | |||
643 | memset(str, 0, 64); | ||
644 | pretty_uuid(str, mddev->uuid); | ||
645 | ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, | ||
646 | DLM_LSFL_FS, LVB_SIZE, | ||
647 | &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); | ||
648 | if (ret) | ||
649 | goto err; | ||
650 | wait_for_completion(&cinfo->completion); | ||
651 | if (nodes < cinfo->slot_number) { | ||
652 | pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).", | ||
653 | cinfo->slot_number, nodes); | ||
654 | ret = -ERANGE; | ||
655 | goto err; | ||
656 | } | ||
657 | cinfo->sb_lock = lockres_init(mddev, "cmd-super", | ||
658 | NULL, 0); | ||
659 | if (!cinfo->sb_lock) { | ||
660 | ret = -ENOMEM; | ||
661 | goto err; | ||
662 | } | ||
663 | /* Initiate the communication resources */ | ||
664 | ret = -ENOMEM; | ||
665 | cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); | ||
666 | if (!cinfo->recv_thread) { | ||
667 | pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); | ||
668 | goto err; | ||
669 | } | ||
670 | cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1); | ||
671 | if (!cinfo->message_lockres) | ||
672 | goto err; | ||
673 | cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); | ||
674 | if (!cinfo->token_lockres) | ||
675 | goto err; | ||
676 | cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); | ||
677 | if (!cinfo->ack_lockres) | ||
678 | goto err; | ||
679 | cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); | ||
680 | if (!cinfo->no_new_dev_lockres) | ||
681 | goto err; | ||
682 | |||
683 | /* get sync CR lock on ACK. */ | ||
684 | if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) | ||
685 | pr_err("md-cluster: failed to get a sync CR lock on ACK!\n"); | ||
686 | /* note: 'ret' still holds -ENOMEM for the error paths below */ | ||
687 | /* get sync CR lock on no-new-dev. */ | ||
688 | if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) | ||
689 | pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!\n"); | ||
690 | |||
691 | |||
692 | pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); | ||
693 | snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); | ||
694 | cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); | ||
695 | if (!cinfo->bitmap_lockres) | ||
696 | goto err; | ||
697 | if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { | ||
698 | pr_err("Failed to get bitmap lock\n"); | ||
699 | ret = -EINVAL; | ||
700 | goto err; | ||
701 | } | ||
702 | |||
703 | INIT_LIST_HEAD(&cinfo->suspend_list); | ||
704 | spin_lock_init(&cinfo->suspend_lock); | ||
705 | |||
706 | ret = gather_all_resync_info(mddev, nodes); | ||
707 | if (ret) | ||
708 | goto err; | ||
709 | |||
710 | return 0; | ||
711 | err: | ||
712 | lockres_free(cinfo->message_lockres); | ||
713 | lockres_free(cinfo->token_lockres); | ||
714 | lockres_free(cinfo->ack_lockres); | ||
715 | lockres_free(cinfo->no_new_dev_lockres); | ||
716 | lockres_free(cinfo->bitmap_lockres); | ||
717 | lockres_free(cinfo->sb_lock); | ||
718 | if (cinfo->lockspace) | ||
719 | dlm_release_lockspace(cinfo->lockspace, 2); | ||
720 | mddev->cluster_info = NULL; | ||
721 | kfree(cinfo); | ||
722 | module_put(THIS_MODULE); | ||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | static int leave(struct mddev *mddev) | ||
727 | { | ||
728 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
729 | |||
730 | if (!cinfo) | ||
731 | return 0; | ||
732 | md_unregister_thread(&cinfo->recovery_thread); | ||
733 | md_unregister_thread(&cinfo->recv_thread); | ||
734 | lockres_free(cinfo->message_lockres); | ||
735 | lockres_free(cinfo->token_lockres); | ||
736 | lockres_free(cinfo->ack_lockres); | ||
737 | lockres_free(cinfo->no_new_dev_lockres); | ||
738 | lockres_free(cinfo->sb_lock); | ||
739 | lockres_free(cinfo->bitmap_lockres); | ||
740 | dlm_release_lockspace(cinfo->lockspace, 2); | ||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | /* slot_number(): Returns the MD slot number to use | ||
745 | * DLM starts the slot numbers from 1, whereas cluster-md | ||
746 | * wants the numbers to start from zero, so we subtract one | ||
747 | */ | ||
748 | static int slot_number(struct mddev *mddev) | ||
749 | { | ||
750 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
751 | |||
752 | return cinfo->slot_number - 1; | ||
753 | } | ||
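
A quick worked example of the mapping, plus the "bitmap%04d" naming used in join() and gather_all_resync_info(). This is standalone userspace C, not part of the patch, and the DLM slot value is invented for the demo:

#include <stdio.h>

int main(void)
{
	int dlm_slot = 3;                 /* assumed: value handed out by the DLM */
	int bitmap_slot = dlm_slot - 1;   /* what slot_number() would return: 2 */
	char name[64];

	snprintf(name, sizeof(name), "bitmap%04d", bitmap_slot);
	printf("%s\n", name);             /* prints "bitmap0002" */
	return 0;
}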
754 | |||
755 | static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) | ||
756 | { | ||
757 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
758 | |||
759 | add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); | ||
760 | /* Re-acquire the lock to refresh LVB */ | ||
761 | dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); | ||
762 | } | ||
763 | |||
764 | static int metadata_update_start(struct mddev *mddev) | ||
765 | { | ||
766 | return lock_comm(mddev->cluster_info); | ||
767 | } | ||
768 | |||
769 | static int metadata_update_finish(struct mddev *mddev) | ||
770 | { | ||
771 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
772 | struct cluster_msg cmsg; | ||
773 | int ret; | ||
774 | |||
775 | memset(&cmsg, 0, sizeof(cmsg)); | ||
776 | cmsg.type = cpu_to_le32(METADATA_UPDATED); | ||
777 | ret = __sendmsg(cinfo, &cmsg); | ||
778 | unlock_comm(cinfo); | ||
779 | return ret; | ||
780 | } | ||
781 | |||
782 | static int metadata_update_cancel(struct mddev *mddev) | ||
783 | { | ||
784 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
785 | |||
786 | return dlm_unlock_sync(cinfo->token_lockres); | ||
787 | } | ||
788 | |||
789 | static int resync_send(struct mddev *mddev, enum msg_type type, | ||
790 | sector_t lo, sector_t hi) | ||
791 | { | ||
792 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
793 | struct cluster_msg cmsg; | ||
794 | int slot = cinfo->slot_number - 1; | ||
795 | |||
796 | pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, | ||
797 | (unsigned long long)lo, | ||
798 | (unsigned long long)hi); | ||
799 | resync_info_update(mddev, lo, hi); | ||
800 | cmsg.type = cpu_to_le32(type); | ||
801 | cmsg.slot = cpu_to_le32(slot); | ||
802 | cmsg.low = cpu_to_le64(lo); | ||
803 | cmsg.high = cpu_to_le64(hi); | ||
804 | return sendmsg(cinfo, &cmsg); | ||
805 | } | ||
806 | |||
807 | static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) | ||
808 | { | ||
809 | pr_info("%s:%d\n", __func__, __LINE__); | ||
810 | return resync_send(mddev, RESYNCING, lo, hi); | ||
811 | } | ||
812 | |||
813 | static void resync_finish(struct mddev *mddev) | ||
814 | { | ||
815 | pr_info("%s:%d\n", __func__, __LINE__); | ||
816 | resync_send(mddev, RESYNCING, 0, 0); | ||
817 | } | ||
818 | |||
819 | static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) | ||
820 | { | ||
821 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
822 | int ret = 0; | ||
823 | struct suspend_info *s; | ||
824 | |||
825 | spin_lock_irq(&cinfo->suspend_lock); | ||
826 | if (list_empty(&cinfo->suspend_list)) | ||
827 | goto out; | ||
828 | list_for_each_entry(s, &cinfo->suspend_list, list) | ||
829 | if (hi > s->lo && lo < s->hi) { | ||
830 | ret = 1; | ||
831 | break; | ||
832 | } | ||
833 | out: | ||
834 | spin_unlock_irq(&cinfo->suspend_lock); | ||
835 | return ret; | ||
836 | } | ||
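
The list walk above is the classic interval-overlap test: treating both ranges as half-open, a request [lo, hi) touches a suspended window [s->lo, s->hi) exactly when hi > s->lo and lo < s->hi. A standalone check with invented sector values:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Same predicate as area_resyncing() applies per suspend_info entry. */
static int overlaps(sector_t lo, sector_t hi, sector_t s_lo, sector_t s_hi)
{
	return hi > s_lo && lo < s_hi;
}

int main(void)
{
	/* suppose another node is resyncing [1000, 2000) */
	printf("%d\n", overlaps(500, 999, 1000, 2000));   /* 0: entirely below */
	printf("%d\n", overlaps(1500, 1600, 1000, 2000)); /* 1: inside */
	printf("%d\n", overlaps(1990, 2500, 1000, 2000)); /* 1: straddles the end */
	return 0;
}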
837 | |||
838 | static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) | ||
839 | { | ||
840 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
841 | struct cluster_msg cmsg; | ||
842 | int ret = 0; | ||
843 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); | ||
844 | char *uuid = sb->device_uuid; | ||
845 | |||
846 | memset(&cmsg, 0, sizeof(cmsg)); | ||
847 | cmsg.type = cpu_to_le32(NEWDISK); | ||
848 | memcpy(cmsg.uuid, uuid, 16); | ||
849 | cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); | ||
850 | lock_comm(cinfo); | ||
851 | ret = __sendmsg(cinfo, &cmsg); | ||
852 | if (ret) | ||
853 | return ret; | ||
854 | cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; | ||
855 | ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); | ||
856 | cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; | ||
857 | /* Some node does not "see" the device */ | ||
858 | if (ret == -EAGAIN) | ||
859 | ret = -ENOENT; | ||
860 | else | ||
861 | dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); | ||
862 | return ret; | ||
863 | } | ||
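
The handshake above leans on DLM_LKF_NOQUEUE turning a blocking request into a trylock: every joined node sits on CR for no-new-dev, so the adding node's EX request is granted only after all peers have dropped CR (that is, acked the NEWDISK message), and -EAGAIN means "some node has not acked". A toy model of just that decision, userspace only, all names hypothetical:

#include <stdio.h>
#include <errno.h>

/* Stand-in for dlm_lock_sync(no_new_dev, EX) with NOQUEUE set:
 * EX is incompatible with any remaining CR holder. */
static int try_ex_noqueue(int peers_still_holding_cr)
{
	return peers_still_holding_cr ? -EAGAIN : 0;
}

int main(void)
{
	int ret = try_ex_noqueue(1);   /* pretend one peer never acked */

	if (ret == -EAGAIN)
		ret = -ENOENT;         /* some node does not "see" the device */
	printf("ret = %d\n", ret);     /* -2 (ENOENT) on Linux */
	return 0;
}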
864 | |||
865 | static int add_new_disk_finish(struct mddev *mddev) | ||
866 | { | ||
867 | struct cluster_msg cmsg; | ||
868 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
869 | int ret; | ||
870 | /* Write sb and inform others */ | ||
871 | md_update_sb(mddev, 1); | ||
872 | cmsg.type = cpu_to_le32(METADATA_UPDATED); | ||
873 | ret = __sendmsg(cinfo, &cmsg); | ||
874 | unlock_comm(cinfo); | ||
875 | return ret; | ||
876 | } | ||
877 | |||
878 | static int new_disk_ack(struct mddev *mddev, bool ack) | ||
879 | { | ||
880 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
881 | |||
882 | if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { | ||
883 | pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); | ||
884 | return -EINVAL; | ||
885 | } | ||
886 | |||
887 | if (ack) | ||
888 | dlm_unlock_sync(cinfo->no_new_dev_lockres); | ||
889 | complete(&cinfo->newdisk_completion); | ||
890 | return 0; | ||
891 | } | ||
892 | |||
893 | static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) | ||
894 | { | ||
895 | struct cluster_msg cmsg; | ||
896 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
897 | cmsg.type = cpu_to_le32(REMOVE); | ||
898 | cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); | ||
899 | return __sendmsg(cinfo, &cmsg); | ||
900 | } | ||
901 | |||
902 | static int gather_bitmaps(struct md_rdev *rdev) | ||
903 | { | ||
904 | int sn, err; | ||
905 | sector_t lo, hi; | ||
906 | struct cluster_msg cmsg; | ||
907 | struct mddev *mddev = rdev->mddev; | ||
908 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
909 | |||
910 | cmsg.type = cpu_to_le32(RE_ADD); | ||
911 | cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); | ||
912 | err = sendmsg(cinfo, &cmsg); | ||
913 | if (err) | ||
914 | goto out; | ||
915 | |||
916 | for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { | ||
917 | if (sn == (cinfo->slot_number - 1)) | ||
918 | continue; | ||
919 | err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); | ||
920 | if (err) { | ||
921 | pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); | ||
922 | goto out; | ||
923 | } | ||
924 | if ((hi > 0) && (lo < mddev->recovery_cp)) | ||
925 | mddev->recovery_cp = lo; | ||
926 | } | ||
927 | out: | ||
928 | return err; | ||
929 | } | ||
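
The recovery_cp update in the loop above pulls the local resync checkpoint back to the lowest dirty offset found in any peer's bitmap, so the next resync covers every region another node may have left inconsistent. A standalone sketch of that reduction, with invented numbers:

#include <stdio.h>

typedef unsigned long long sector_t;
#define MaxSector (~(sector_t)0)

int main(void)
{
	sector_t recovery_cp = MaxSector;        /* array believed fully in sync */
	sector_t lo[2] = { 4096, 1024 };         /* per-slot dirty range starts */
	sector_t hi[2] = { 8192, 2048 };         /* per-slot dirty range ends */

	for (int sn = 0; sn < 2; sn++)
		if (hi[sn] > 0 && lo[sn] < recovery_cp)
			recovery_cp = lo[sn];    /* same rule as gather_bitmaps() */
	printf("recovery_cp = %llu\n", recovery_cp);  /* 1024 */
	return 0;
}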
930 | |||
931 | static struct md_cluster_operations cluster_ops = { | ||
932 | .join = join, | ||
933 | .leave = leave, | ||
934 | .slot_number = slot_number, | ||
935 | .resync_info_update = resync_info_update, | ||
936 | .resync_start = resync_start, | ||
937 | .resync_finish = resync_finish, | ||
938 | .metadata_update_start = metadata_update_start, | ||
939 | .metadata_update_finish = metadata_update_finish, | ||
940 | .metadata_update_cancel = metadata_update_cancel, | ||
941 | .area_resyncing = area_resyncing, | ||
942 | .add_new_disk_start = add_new_disk_start, | ||
943 | .add_new_disk_finish = add_new_disk_finish, | ||
944 | .new_disk_ack = new_disk_ack, | ||
945 | .remove_disk = remove_disk, | ||
946 | .gather_bitmaps = gather_bitmaps, | ||
947 | }; | ||
948 | |||
949 | static int __init cluster_init(void) | ||
950 | { | ||
951 | pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); | ||
952 | pr_info("Registering Cluster MD functions\n"); | ||
953 | register_md_cluster_operations(&cluster_ops, THIS_MODULE); | ||
954 | return 0; | ||
955 | } | ||
956 | |||
957 | static void cluster_exit(void) | ||
958 | { | ||
959 | unregister_md_cluster_operations(); | ||
960 | } | ||
961 | |||
962 | module_init(cluster_init); | ||
963 | module_exit(cluster_exit); | ||
964 | MODULE_LICENSE("GPL"); | ||
965 | MODULE_DESCRIPTION("Clustering support for MD"); | ||
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h new file mode 100644 index 000000000000..6817ee00e053 --- /dev/null +++ b/drivers/md/md-cluster.h | |||
@@ -0,0 +1,29 @@ | |||
1 | |||
2 | |||
3 | #ifndef _MD_CLUSTER_H | ||
4 | #define _MD_CLUSTER_H | ||
5 | |||
6 | #include "md.h" | ||
7 | |||
8 | struct mddev; | ||
9 | struct md_rdev; | ||
10 | |||
11 | struct md_cluster_operations { | ||
12 | int (*join)(struct mddev *mddev, int nodes); | ||
13 | int (*leave)(struct mddev *mddev); | ||
14 | int (*slot_number)(struct mddev *mddev); | ||
15 | void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
16 | int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
17 | void (*resync_finish)(struct mddev *mddev); | ||
18 | int (*metadata_update_start)(struct mddev *mddev); | ||
19 | int (*metadata_update_finish)(struct mddev *mddev); | ||
20 | int (*metadata_update_cancel)(struct mddev *mddev); | ||
21 | int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
22 | int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); | ||
23 | int (*add_new_disk_finish)(struct mddev *mddev); | ||
24 | int (*new_disk_ack)(struct mddev *mddev, bool ack); | ||
25 | int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); | ||
26 | int (*gather_bitmaps)(struct md_rdev *rdev); | ||
27 | }; | ||
28 | |||
29 | #endif /* _MD_CLUSTER_H */ | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index e6178787ce3d..0d8968535976 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
54 | #include "md.h" | 54 | #include "md.h" |
55 | #include "bitmap.h" | 55 | #include "bitmap.h" |
56 | #include "md-cluster.h" | ||
56 | 57 | ||
57 | #ifndef MODULE | 58 | #ifndef MODULE |
58 | static void autostart_arrays(int part); | 59 | static void autostart_arrays(int part); |
@@ -66,6 +67,11 @@ static void autostart_arrays(int part); | |||
66 | static LIST_HEAD(pers_list); | 67 | static LIST_HEAD(pers_list); |
67 | static DEFINE_SPINLOCK(pers_lock); | 68 | static DEFINE_SPINLOCK(pers_lock); |
68 | 69 | ||
70 | struct md_cluster_operations *md_cluster_ops; | ||
71 | EXPORT_SYMBOL(md_cluster_ops); | ||
72 | struct module *md_cluster_mod; | ||
73 | EXPORT_SYMBOL(md_cluster_mod); | ||
74 | |||
69 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | 75 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
70 | static struct workqueue_struct *md_wq; | 76 | static struct workqueue_struct *md_wq; |
71 | static struct workqueue_struct *md_misc_wq; | 77 | static struct workqueue_struct *md_misc_wq; |
@@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev) | |||
640 | } | 646 | } |
641 | EXPORT_SYMBOL_GPL(mddev_unlock); | 647 | EXPORT_SYMBOL_GPL(mddev_unlock); |
642 | 648 | ||
643 | static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) | 649 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
644 | { | 650 | { |
645 | struct md_rdev *rdev; | 651 | struct md_rdev *rdev; |
646 | 652 | ||
@@ -650,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) | |||
650 | 656 | ||
651 | return NULL; | 657 | return NULL; |
652 | } | 658 | } |
659 | EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); | ||
653 | 660 | ||
654 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) | 661 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
655 | { | 662 | { |
@@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
2047 | int choice = 0; | 2054 | int choice = 0; |
2048 | if (mddev->pers) | 2055 | if (mddev->pers) |
2049 | choice = mddev->raid_disks; | 2056 | choice = mddev->raid_disks; |
2050 | while (find_rdev_nr_rcu(mddev, choice)) | 2057 | while (md_find_rdev_nr_rcu(mddev, choice)) |
2051 | choice++; | 2058 | choice++; |
2052 | rdev->desc_nr = choice; | 2059 | rdev->desc_nr = choice; |
2053 | } else { | 2060 | } else { |
2054 | if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { | 2061 | if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { |
2055 | rcu_read_unlock(); | 2062 | rcu_read_unlock(); |
2056 | return -EBUSY; | 2063 | return -EBUSY; |
2057 | } | 2064 | } |
@@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev) | |||
2166 | kobject_put(&rdev->kobj); | 2173 | kobject_put(&rdev->kobj); |
2167 | } | 2174 | } |
2168 | 2175 | ||
2169 | static void kick_rdev_from_array(struct md_rdev *rdev) | 2176 | void md_kick_rdev_from_array(struct md_rdev *rdev) |
2170 | { | 2177 | { |
2171 | unbind_rdev_from_array(rdev); | 2178 | unbind_rdev_from_array(rdev); |
2172 | export_rdev(rdev); | 2179 | export_rdev(rdev); |
2173 | } | 2180 | } |
2181 | EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); | ||
2174 | 2182 | ||
2175 | static void export_array(struct mddev *mddev) | 2183 | static void export_array(struct mddev *mddev) |
2176 | { | 2184 | { |
@@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev) | |||
2179 | while (!list_empty(&mddev->disks)) { | 2187 | while (!list_empty(&mddev->disks)) { |
2180 | rdev = list_first_entry(&mddev->disks, struct md_rdev, | 2188 | rdev = list_first_entry(&mddev->disks, struct md_rdev, |
2181 | same_set); | 2189 | same_set); |
2182 | kick_rdev_from_array(rdev); | 2190 | md_kick_rdev_from_array(rdev); |
2183 | } | 2191 | } |
2184 | mddev->raid_disks = 0; | 2192 | mddev->raid_disks = 0; |
2185 | mddev->major_version = 0; | 2193 | mddev->major_version = 0; |
@@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares) | |||
2208 | } | 2216 | } |
2209 | } | 2217 | } |
2210 | 2218 | ||
2211 | static void md_update_sb(struct mddev *mddev, int force_change) | 2219 | void md_update_sb(struct mddev *mddev, int force_change) |
2212 | { | 2220 | { |
2213 | struct md_rdev *rdev; | 2221 | struct md_rdev *rdev; |
2214 | int sync_req; | 2222 | int sync_req; |
@@ -2369,6 +2377,37 @@ repeat: | |||
2369 | wake_up(&rdev->blocked_wait); | 2377 | wake_up(&rdev->blocked_wait); |
2370 | } | 2378 | } |
2371 | } | 2379 | } |
2380 | EXPORT_SYMBOL(md_update_sb); | ||
2381 | |||
2382 | static int add_bound_rdev(struct md_rdev *rdev) | ||
2383 | { | ||
2384 | struct mddev *mddev = rdev->mddev; | ||
2385 | int err = 0; | ||
2386 | |||
2387 | if (!mddev->pers->hot_remove_disk) { | ||
2388 | /* If there is hot_add_disk but no hot_remove_disk | ||
2389 | * then added disks for geometry changes, | ||
2390 | * and should be added immediately. | ||
2391 | */ | ||
2392 | super_types[mddev->major_version]. | ||
2393 | validate_super(mddev, rdev); | ||
2394 | err = mddev->pers->hot_add_disk(mddev, rdev); | ||
2395 | if (err) { | ||
2396 | unbind_rdev_from_array(rdev); | ||
2397 | export_rdev(rdev); | ||
2398 | return err; | ||
2399 | } | ||
2400 | } | ||
2401 | sysfs_notify_dirent_safe(rdev->sysfs_state); | ||
2402 | |||
2403 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
2404 | if (mddev->degraded) | ||
2405 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
2406 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2407 | md_new_event(mddev); | ||
2408 | md_wakeup_thread(mddev->thread); | ||
2409 | return 0; | ||
2410 | } | ||
2372 | 2411 | ||
2373 | /* words written to sysfs files may, or may not, be \n terminated. | 2412 | /* words written to sysfs files may, or may not, be \n terminated. |
2374 | * We want to accept with case. For this we use cmd_match. | 2413 | * We want to accept with case. For this we use cmd_match. |
@@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2471 | err = -EBUSY; | 2510 | err = -EBUSY; |
2472 | else { | 2511 | else { |
2473 | struct mddev *mddev = rdev->mddev; | 2512 | struct mddev *mddev = rdev->mddev; |
2474 | kick_rdev_from_array(rdev); | 2513 | if (mddev_is_clustered(mddev)) |
2514 | md_cluster_ops->remove_disk(mddev, rdev); | ||
2515 | md_kick_rdev_from_array(rdev); | ||
2516 | if (mddev_is_clustered(mddev)) | ||
2517 | md_cluster_ops->metadata_update_start(mddev); | ||
2475 | if (mddev->pers) | 2518 | if (mddev->pers) |
2476 | md_update_sb(mddev, 1); | 2519 | md_update_sb(mddev, 1); |
2477 | md_new_event(mddev); | 2520 | md_new_event(mddev); |
2521 | if (mddev_is_clustered(mddev)) | ||
2522 | md_cluster_ops->metadata_update_finish(mddev); | ||
2478 | err = 0; | 2523 | err = 0; |
2479 | } | 2524 | } |
2480 | } else if (cmd_match(buf, "writemostly")) { | 2525 | } else if (cmd_match(buf, "writemostly")) { |
@@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2553 | clear_bit(Replacement, &rdev->flags); | 2598 | clear_bit(Replacement, &rdev->flags); |
2554 | err = 0; | 2599 | err = 0; |
2555 | } | 2600 | } |
2601 | } else if (cmd_match(buf, "re-add")) { | ||
2602 | if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { | ||
2603 | /* clear_bit is performed _after_ all the devices | ||
2604 | * have their local Faulty bit cleared. If any writes | ||
2605 | * happen in the meantime in the local node, they | ||
2606 | * will land in the local bitmap, which will be synced | ||
2607 | * by this node eventually | ||
2608 | */ | ||
2609 | if (!mddev_is_clustered(rdev->mddev) || | ||
2610 | (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { | ||
2611 | clear_bit(Faulty, &rdev->flags); | ||
2612 | err = add_bound_rdev(rdev); | ||
2613 | } | ||
2614 | } else | ||
2615 | err = -EBUSY; | ||
2556 | } | 2616 | } |
2557 | if (!err) | 2617 | if (!err) |
2558 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2618 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev) | |||
3127 | "md: fatal superblock inconsistency in %s" | 3187 | "md: fatal superblock inconsistency in %s" |
3128 | " -- removing from array\n", | 3188 | " -- removing from array\n", |
3129 | bdevname(rdev->bdev,b)); | 3189 | bdevname(rdev->bdev,b)); |
3130 | kick_rdev_from_array(rdev); | 3190 | md_kick_rdev_from_array(rdev); |
3131 | } | 3191 | } |
3132 | 3192 | ||
3133 | super_types[mddev->major_version]. | 3193 | super_types[mddev->major_version]. |
@@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev) | |||
3142 | "md: %s: %s: only %d devices permitted\n", | 3202 | "md: %s: %s: only %d devices permitted\n", |
3143 | mdname(mddev), bdevname(rdev->bdev, b), | 3203 | mdname(mddev), bdevname(rdev->bdev, b), |
3144 | mddev->max_disks); | 3204 | mddev->max_disks); |
3145 | kick_rdev_from_array(rdev); | 3205 | md_kick_rdev_from_array(rdev); |
3146 | continue; | 3206 | continue; |
3147 | } | 3207 | } |
3148 | if (rdev != freshest) | 3208 | if (rdev != freshest) { |
3149 | if (super_types[mddev->major_version]. | 3209 | if (super_types[mddev->major_version]. |
3150 | validate_super(mddev, rdev)) { | 3210 | validate_super(mddev, rdev)) { |
3151 | printk(KERN_WARNING "md: kicking non-fresh %s" | 3211 | printk(KERN_WARNING "md: kicking non-fresh %s" |
3152 | " from array!\n", | 3212 | " from array!\n", |
3153 | bdevname(rdev->bdev,b)); | 3213 | bdevname(rdev->bdev,b)); |
3154 | kick_rdev_from_array(rdev); | 3214 | md_kick_rdev_from_array(rdev); |
3155 | continue; | 3215 | continue; |
3156 | } | 3216 | } |
3217 | /* No device should have a Candidate flag | ||
3218 | * when reading devices | ||
3219 | */ | ||
3220 | if (test_bit(Candidate, &rdev->flags)) { | ||
3221 | pr_info("md: kicking Cluster Candidate %s from array!\n", | ||
3222 | bdevname(rdev->bdev, b)); | ||
3223 | md_kick_rdev_from_array(rdev); | ||
3224 | } | ||
3225 | } | ||
3157 | if (mddev->level == LEVEL_MULTIPATH) { | 3226 | if (mddev->level == LEVEL_MULTIPATH) { |
3158 | rdev->desc_nr = i++; | 3227 | rdev->desc_nr = i++; |
3159 | rdev->raid_disk = rdev->desc_nr; | 3228 | rdev->raid_disk = rdev->desc_nr; |
@@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len) | |||
4008 | if (err) | 4077 | if (err) |
4009 | return err; | 4078 | return err; |
4010 | if (mddev->pers) { | 4079 | if (mddev->pers) { |
4080 | if (mddev_is_clustered(mddev)) | ||
4081 | md_cluster_ops->metadata_update_start(mddev); | ||
4011 | err = update_size(mddev, sectors); | 4082 | err = update_size(mddev, sectors); |
4012 | md_update_sb(mddev, 1); | 4083 | md_update_sb(mddev, 1); |
4084 | if (mddev_is_clustered(mddev)) | ||
4085 | md_cluster_ops->metadata_update_finish(mddev); | ||
4013 | } else { | 4086 | } else { |
4014 | if (mddev->dev_sectors == 0 || | 4087 | if (mddev->dev_sectors == 0 || |
4015 | mddev->dev_sectors > sectors) | 4088 | mddev->dev_sectors > sectors) |
@@ -5077,10 +5150,16 @@ int md_run(struct mddev *mddev) | |||
5077 | } | 5150 | } |
5078 | if (err == 0 && pers->sync_request && | 5151 | if (err == 0 && pers->sync_request && |
5079 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { | 5152 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { |
5080 | err = bitmap_create(mddev); | 5153 | struct bitmap *bitmap; |
5081 | if (err) | 5154 | |
5155 | bitmap = bitmap_create(mddev, -1); | ||
5156 | if (IS_ERR(bitmap)) { | ||
5157 | err = PTR_ERR(bitmap); | ||
5082 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5158 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
5083 | mdname(mddev), err); | 5159 | mdname(mddev), err); |
5160 | } else | ||
5161 | mddev->bitmap = bitmap; | ||
5162 | |||
5084 | } | 5163 | } |
5085 | if (err) { | 5164 | if (err) { |
5086 | mddev_detach(mddev); | 5165 | mddev_detach(mddev); |
@@ -5232,6 +5311,8 @@ static void md_clean(struct mddev *mddev) | |||
5232 | 5311 | ||
5233 | static void __md_stop_writes(struct mddev *mddev) | 5312 | static void __md_stop_writes(struct mddev *mddev) |
5234 | { | 5313 | { |
5314 | if (mddev_is_clustered(mddev)) | ||
5315 | md_cluster_ops->metadata_update_start(mddev); | ||
5235 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5316 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
5236 | flush_workqueue(md_misc_wq); | 5317 | flush_workqueue(md_misc_wq); |
5237 | if (mddev->sync_thread) { | 5318 | if (mddev->sync_thread) { |
@@ -5250,6 +5331,8 @@ static void __md_stop_writes(struct mddev *mddev) | |||
5250 | mddev->in_sync = 1; | 5331 | mddev->in_sync = 1; |
5251 | md_update_sb(mddev, 1); | 5332 | md_update_sb(mddev, 1); |
5252 | } | 5333 | } |
5334 | if (mddev_is_clustered(mddev)) | ||
5335 | md_cluster_ops->metadata_update_finish(mddev); | ||
5253 | } | 5336 | } |
5254 | 5337 | ||
5255 | void md_stop_writes(struct mddev *mddev) | 5338 | void md_stop_writes(struct mddev *mddev) |
@@ -5636,6 +5719,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg) | |||
5636 | info.state = (1<<MD_SB_CLEAN); | 5719 | info.state = (1<<MD_SB_CLEAN); |
5637 | if (mddev->bitmap && mddev->bitmap_info.offset) | 5720 | if (mddev->bitmap && mddev->bitmap_info.offset) |
5638 | info.state |= (1<<MD_SB_BITMAP_PRESENT); | 5721 | info.state |= (1<<MD_SB_BITMAP_PRESENT); |
5722 | if (mddev_is_clustered(mddev)) | ||
5723 | info.state |= (1<<MD_SB_CLUSTERED); | ||
5639 | info.active_disks = insync; | 5724 | info.active_disks = insync; |
5640 | info.working_disks = working; | 5725 | info.working_disks = working; |
5641 | info.failed_disks = failed; | 5726 | info.failed_disks = failed; |
@@ -5691,7 +5776,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) | |||
5691 | return -EFAULT; | 5776 | return -EFAULT; |
5692 | 5777 | ||
5693 | rcu_read_lock(); | 5778 | rcu_read_lock(); |
5694 | rdev = find_rdev_nr_rcu(mddev, info.number); | 5779 | rdev = md_find_rdev_nr_rcu(mddev, info.number); |
5695 | if (rdev) { | 5780 | if (rdev) { |
5696 | info.major = MAJOR(rdev->bdev->bd_dev); | 5781 | info.major = MAJOR(rdev->bdev->bd_dev); |
5697 | info.minor = MINOR(rdev->bdev->bd_dev); | 5782 | info.minor = MINOR(rdev->bdev->bd_dev); |
@@ -5724,6 +5809,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5724 | struct md_rdev *rdev; | 5809 | struct md_rdev *rdev; |
5725 | dev_t dev = MKDEV(info->major,info->minor); | 5810 | dev_t dev = MKDEV(info->major,info->minor); |
5726 | 5811 | ||
5812 | if (mddev_is_clustered(mddev) && | ||
5813 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { | ||
5814 | pr_err("%s: Cannot add to clustered mddev.\n", | ||
5815 | mdname(mddev)); | ||
5816 | return -EINVAL; | ||
5817 | } | ||
5818 | |||
5727 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) | 5819 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) |
5728 | return -EOVERFLOW; | 5820 | return -EOVERFLOW; |
5729 | 5821 | ||
@@ -5810,31 +5902,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5810 | else | 5902 | else |
5811 | clear_bit(WriteMostly, &rdev->flags); | 5903 | clear_bit(WriteMostly, &rdev->flags); |
5812 | 5904 | ||
5905 | /* | ||
5906 | * check whether the device shows up in other nodes | ||
5907 | */ | ||
5908 | if (mddev_is_clustered(mddev)) { | ||
5909 | if (info->state & (1 << MD_DISK_CANDIDATE)) { | ||
5910 | /* Through --cluster-confirm */ | ||
5911 | set_bit(Candidate, &rdev->flags); | ||
5912 | err = md_cluster_ops->new_disk_ack(mddev, true); | ||
5913 | if (err) { | ||
5914 | export_rdev(rdev); | ||
5915 | return err; | ||
5916 | } | ||
5917 | } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { | ||
5918 | /* --add initiated by this node */ | ||
5919 | err = md_cluster_ops->add_new_disk_start(mddev, rdev); | ||
5920 | if (err) { | ||
5921 | md_cluster_ops->add_new_disk_finish(mddev); | ||
5922 | export_rdev(rdev); | ||
5923 | return err; | ||
5924 | } | ||
5925 | } | ||
5926 | } | ||
5927 | |||
5813 | rdev->raid_disk = -1; | 5928 | rdev->raid_disk = -1; |
5814 | err = bind_rdev_to_array(rdev, mddev); | 5929 | err = bind_rdev_to_array(rdev, mddev); |
5815 | if (!err && !mddev->pers->hot_remove_disk) { | ||
5816 | /* If there is hot_add_disk but no hot_remove_disk | ||
5817 | * then added disks for geometry changes, | ||
5818 | * and should be added immediately. | ||
5819 | */ | ||
5820 | super_types[mddev->major_version]. | ||
5821 | validate_super(mddev, rdev); | ||
5822 | err = mddev->pers->hot_add_disk(mddev, rdev); | ||
5823 | if (err) | ||
5824 | unbind_rdev_from_array(rdev); | ||
5825 | } | ||
5826 | if (err) | 5930 | if (err) |
5827 | export_rdev(rdev); | 5931 | export_rdev(rdev); |
5828 | else | 5932 | else |
5829 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 5933 | err = add_bound_rdev(rdev); |
5830 | 5934 | if (mddev_is_clustered(mddev) && | |
5831 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5935 | (info->state & (1 << MD_DISK_CLUSTER_ADD))) |
5832 | if (mddev->degraded) | 5936 | md_cluster_ops->add_new_disk_finish(mddev); |
5833 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5834 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
5835 | if (!err) | ||
5836 | md_new_event(mddev); | ||
5837 | md_wakeup_thread(mddev->thread); | ||
5838 | return err; | 5937 | return err; |
5839 | } | 5938 | } |
5840 | 5939 | ||
@@ -5895,18 +5994,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) | |||
5895 | if (!rdev) | 5994 | if (!rdev) |
5896 | return -ENXIO; | 5995 | return -ENXIO; |
5897 | 5996 | ||
5997 | if (mddev_is_clustered(mddev)) | ||
5998 | md_cluster_ops->metadata_update_start(mddev); | ||
5999 | |||
5898 | clear_bit(Blocked, &rdev->flags); | 6000 | clear_bit(Blocked, &rdev->flags); |
5899 | remove_and_add_spares(mddev, rdev); | 6001 | remove_and_add_spares(mddev, rdev); |
5900 | 6002 | ||
5901 | if (rdev->raid_disk >= 0) | 6003 | if (rdev->raid_disk >= 0) |
5902 | goto busy; | 6004 | goto busy; |
5903 | 6005 | ||
5904 | kick_rdev_from_array(rdev); | 6006 | if (mddev_is_clustered(mddev)) |
6007 | md_cluster_ops->remove_disk(mddev, rdev); | ||
6008 | |||
6009 | md_kick_rdev_from_array(rdev); | ||
5905 | md_update_sb(mddev, 1); | 6010 | md_update_sb(mddev, 1); |
5906 | md_new_event(mddev); | 6011 | md_new_event(mddev); |
5907 | 6012 | ||
6013 | if (mddev_is_clustered(mddev)) | ||
6014 | md_cluster_ops->metadata_update_finish(mddev); | ||
6015 | |||
5908 | return 0; | 6016 | return 0; |
5909 | busy: | 6017 | busy: |
6018 | if (mddev_is_clustered(mddev)) | ||
6019 | md_cluster_ops->metadata_update_cancel(mddev); | ||
5910 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", | 6020 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", |
5911 | bdevname(rdev->bdev,b), mdname(mddev)); | 6021 | bdevname(rdev->bdev,b), mdname(mddev)); |
5912 | return -EBUSY; | 6022 | return -EBUSY; |
@@ -5956,12 +6066,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5956 | err = -EINVAL; | 6066 | err = -EINVAL; |
5957 | goto abort_export; | 6067 | goto abort_export; |
5958 | } | 6068 | } |
6069 | |||
6070 | if (mddev_is_clustered(mddev)) | ||
6071 | md_cluster_ops->metadata_update_start(mddev); | ||
5959 | clear_bit(In_sync, &rdev->flags); | 6072 | clear_bit(In_sync, &rdev->flags); |
5960 | rdev->desc_nr = -1; | 6073 | rdev->desc_nr = -1; |
5961 | rdev->saved_raid_disk = -1; | 6074 | rdev->saved_raid_disk = -1; |
5962 | err = bind_rdev_to_array(rdev, mddev); | 6075 | err = bind_rdev_to_array(rdev, mddev); |
5963 | if (err) | 6076 | if (err) |
5964 | goto abort_export; | 6077 | goto abort_clustered; |
5965 | 6078 | ||
5966 | /* | 6079 | /* |
5967 | * The rest should better be atomic, we can have disk failures | 6080 | * The rest should better be atomic, we can have disk failures |
@@ -5972,6 +6085,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5972 | 6085 | ||
5973 | md_update_sb(mddev, 1); | 6086 | md_update_sb(mddev, 1); |
5974 | 6087 | ||
6088 | if (mddev_is_clustered(mddev)) | ||
6089 | md_cluster_ops->metadata_update_finish(mddev); | ||
5975 | /* | 6090 | /* |
5976 | * Kick recovery, maybe this spare has to be added to the | 6091 | * Kick recovery, maybe this spare has to be added to the |
5977 | * array immediately. | 6092 | * array immediately. |
@@ -5981,6 +6096,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5981 | md_new_event(mddev); | 6096 | md_new_event(mddev); |
5982 | return 0; | 6097 | return 0; |
5983 | 6098 | ||
6099 | abort_clustered: | ||
6100 | if (mddev_is_clustered(mddev)) | ||
6101 | md_cluster_ops->metadata_update_cancel(mddev); | ||
5984 | abort_export: | 6102 | abort_export: |
5985 | export_rdev(rdev); | 6103 | export_rdev(rdev); |
5986 | return err; | 6104 | return err; |
@@ -6038,9 +6156,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd) | |||
6038 | if (mddev->pers) { | 6156 | if (mddev->pers) { |
6039 | mddev->pers->quiesce(mddev, 1); | 6157 | mddev->pers->quiesce(mddev, 1); |
6040 | if (fd >= 0) { | 6158 | if (fd >= 0) { |
6041 | err = bitmap_create(mddev); | 6159 | struct bitmap *bitmap; |
6042 | if (!err) | 6160 | |
6161 | bitmap = bitmap_create(mddev, -1); | ||
6162 | if (!IS_ERR(bitmap)) { | ||
6163 | mddev->bitmap = bitmap; | ||
6043 | err = bitmap_load(mddev); | 6164 | err = bitmap_load(mddev); |
6165 | } else | ||
6166 | err = PTR_ERR(bitmap); | ||
6044 | } | 6167 | } |
6045 | if (fd < 0 || err) { | 6168 | if (fd < 0 || err) { |
6046 | bitmap_destroy(mddev); | 6169 | bitmap_destroy(mddev); |
@@ -6293,6 +6416,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6293 | return rv; | 6416 | return rv; |
6294 | } | 6417 | } |
6295 | } | 6418 | } |
6419 | if (mddev_is_clustered(mddev)) | ||
6420 | md_cluster_ops->metadata_update_start(mddev); | ||
6296 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) | 6421 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
6297 | rv = update_size(mddev, (sector_t)info->size * 2); | 6422 | rv = update_size(mddev, (sector_t)info->size * 2); |
6298 | 6423 | ||
@@ -6300,33 +6425,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6300 | rv = update_raid_disks(mddev, info->raid_disks); | 6425 | rv = update_raid_disks(mddev, info->raid_disks); |
6301 | 6426 | ||
6302 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { | 6427 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { |
6303 | if (mddev->pers->quiesce == NULL || mddev->thread == NULL) | 6428 | if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { |
6304 | return -EINVAL; | 6429 | rv = -EINVAL; |
6305 | if (mddev->recovery || mddev->sync_thread) | 6430 | goto err; |
6306 | return -EBUSY; | 6431 | } |
6432 | if (mddev->recovery || mddev->sync_thread) { | ||
6433 | rv = -EBUSY; | ||
6434 | goto err; | ||
6435 | } | ||
6307 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { | 6436 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { |
6437 | struct bitmap *bitmap; | ||
6308 | /* add the bitmap */ | 6438 | /* add the bitmap */ |
6309 | if (mddev->bitmap) | 6439 | if (mddev->bitmap) { |
6310 | return -EEXIST; | 6440 | rv = -EEXIST; |
6311 | if (mddev->bitmap_info.default_offset == 0) | 6441 | goto err; |
6312 | return -EINVAL; | 6442 | } |
6443 | if (mddev->bitmap_info.default_offset == 0) { | ||
6444 | rv = -EINVAL; | ||
6445 | goto err; | ||
6446 | } | ||
6313 | mddev->bitmap_info.offset = | 6447 | mddev->bitmap_info.offset = |
6314 | mddev->bitmap_info.default_offset; | 6448 | mddev->bitmap_info.default_offset; |
6315 | mddev->bitmap_info.space = | 6449 | mddev->bitmap_info.space = |
6316 | mddev->bitmap_info.default_space; | 6450 | mddev->bitmap_info.default_space; |
6317 | mddev->pers->quiesce(mddev, 1); | 6451 | mddev->pers->quiesce(mddev, 1); |
6318 | rv = bitmap_create(mddev); | 6452 | bitmap = bitmap_create(mddev, -1); |
6319 | if (!rv) | 6453 | if (!IS_ERR(bitmap)) { |
6454 | mddev->bitmap = bitmap; | ||
6320 | rv = bitmap_load(mddev); | 6455 | rv = bitmap_load(mddev); |
6456 | } else | ||
6457 | rv = PTR_ERR(bitmap); | ||
6321 | if (rv) | 6458 | if (rv) |
6322 | bitmap_destroy(mddev); | 6459 | bitmap_destroy(mddev); |
6323 | mddev->pers->quiesce(mddev, 0); | 6460 | mddev->pers->quiesce(mddev, 0); |
6324 | } else { | 6461 | } else { |
6325 | /* remove the bitmap */ | 6462 | /* remove the bitmap */ |
6326 | if (!mddev->bitmap) | 6463 | if (!mddev->bitmap) { |
6327 | return -ENOENT; | 6464 | rv = -ENOENT; |
6328 | if (mddev->bitmap->storage.file) | 6465 | goto err; |
6329 | return -EINVAL; | 6466 | } |
6467 | if (mddev->bitmap->storage.file) { | ||
6468 | rv = -EINVAL; | ||
6469 | goto err; | ||
6470 | } | ||
6330 | mddev->pers->quiesce(mddev, 1); | 6471 | mddev->pers->quiesce(mddev, 1); |
6331 | bitmap_destroy(mddev); | 6472 | bitmap_destroy(mddev); |
6332 | mddev->pers->quiesce(mddev, 0); | 6473 | mddev->pers->quiesce(mddev, 0); |
@@ -6334,6 +6475,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6334 | } | 6475 | } |
6335 | } | 6476 | } |
6336 | md_update_sb(mddev, 1); | 6477 | md_update_sb(mddev, 1); |
6478 | if (mddev_is_clustered(mddev)) | ||
6479 | md_cluster_ops->metadata_update_finish(mddev); | ||
6480 | return rv; | ||
6481 | err: | ||
6482 | if (mddev_is_clustered(mddev)) | ||
6483 | md_cluster_ops->metadata_update_cancel(mddev); | ||
6337 | return rv; | 6484 | return rv; |
6338 | } | 6485 | } |
6339 | 6486 | ||
@@ -6393,6 +6540,7 @@ static inline bool md_ioctl_valid(unsigned int cmd) | |||
6393 | case SET_DISK_FAULTY: | 6540 | case SET_DISK_FAULTY: |
6394 | case STOP_ARRAY: | 6541 | case STOP_ARRAY: |
6395 | case STOP_ARRAY_RO: | 6542 | case STOP_ARRAY_RO: |
6543 | case CLUSTERED_DISK_NACK: | ||
6396 | return true; | 6544 | return true; |
6397 | default: | 6545 | default: |
6398 | return false; | 6546 | return false; |
@@ -6665,6 +6813,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6665 | goto unlock; | 6813 | goto unlock; |
6666 | } | 6814 | } |
6667 | 6815 | ||
6816 | case CLUSTERED_DISK_NACK: | ||
6817 | if (mddev_is_clustered(mddev)) | ||
6818 | md_cluster_ops->new_disk_ack(mddev, false); | ||
6819 | else | ||
6820 | err = -EINVAL; | ||
6821 | goto unlock; | ||
6822 | |||
6668 | case HOT_ADD_DISK: | 6823 | case HOT_ADD_DISK: |
6669 | err = hot_add_disk(mddev, new_decode_dev(arg)); | 6824 | err = hot_add_disk(mddev, new_decode_dev(arg)); |
6670 | goto unlock; | 6825 | goto unlock; |
@@ -7238,6 +7393,55 @@ int unregister_md_personality(struct md_personality *p) | |||
7238 | } | 7393 | } |
7239 | EXPORT_SYMBOL(unregister_md_personality); | 7394 | EXPORT_SYMBOL(unregister_md_personality); |
7240 | 7395 | ||
7396 | int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) | ||
7397 | { | ||
7398 | if (md_cluster_ops != NULL) | ||
7399 | return -EALREADY; | ||
7400 | spin_lock(&pers_lock); | ||
7401 | md_cluster_ops = ops; | ||
7402 | md_cluster_mod = module; | ||
7403 | spin_unlock(&pers_lock); | ||
7404 | return 0; | ||
7405 | } | ||
7406 | EXPORT_SYMBOL(register_md_cluster_operations); | ||
7407 | |||
7408 | int unregister_md_cluster_operations(void) | ||
7409 | { | ||
7410 | spin_lock(&pers_lock); | ||
7411 | md_cluster_ops = NULL; | ||
7412 | spin_unlock(&pers_lock); | ||
7413 | return 0; | ||
7414 | } | ||
7415 | EXPORT_SYMBOL(unregister_md_cluster_operations); | ||
7416 | |||
7417 | int md_setup_cluster(struct mddev *mddev, int nodes) | ||
7418 | { | ||
7419 | int err; | ||
7420 | |||
7421 | err = request_module("md-cluster"); | ||
7422 | if (err) { | ||
7423 | pr_err("md-cluster module not found.\n"); | ||
7424 | return err; | ||
7425 | } | ||
7426 | |||
7427 | spin_lock(&pers_lock); | ||
7428 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { | ||
7429 | spin_unlock(&pers_lock); | ||
7430 | return -ENOENT; | ||
7431 | } | ||
7432 | spin_unlock(&pers_lock); | ||
7433 | |||
7434 | return md_cluster_ops->join(mddev, nodes); | ||
7435 | } | ||
7436 | |||
7437 | void md_cluster_stop(struct mddev *mddev) | ||
7438 | { | ||
7439 | if (!md_cluster_ops) | ||
7440 | return; | ||
7441 | md_cluster_ops->leave(mddev); | ||
7442 | module_put(md_cluster_mod); | ||
7443 | } | ||
7444 | |||
7241 | static int is_mddev_idle(struct mddev *mddev, int init) | 7445 | static int is_mddev_idle(struct mddev *mddev, int init) |
7242 | { | 7446 | { |
7243 | struct md_rdev *rdev; | 7447 | struct md_rdev *rdev; |
@@ -7375,7 +7579,11 @@ int md_allow_write(struct mddev *mddev) | |||
7375 | mddev->safemode == 0) | 7579 | mddev->safemode == 0) |
7376 | mddev->safemode = 1; | 7580 | mddev->safemode = 1; |
7377 | spin_unlock(&mddev->lock); | 7581 | spin_unlock(&mddev->lock); |
7582 | if (mddev_is_clustered(mddev)) | ||
7583 | md_cluster_ops->metadata_update_start(mddev); | ||
7378 | md_update_sb(mddev, 0); | 7584 | md_update_sb(mddev, 0); |
7585 | if (mddev_is_clustered(mddev)) | ||
7586 | md_cluster_ops->metadata_update_finish(mddev); | ||
7379 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 7587 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
7380 | } else | 7588 | } else |
7381 | spin_unlock(&mddev->lock); | 7589 | spin_unlock(&mddev->lock); |
@@ -7576,6 +7784,9 @@ void md_do_sync(struct md_thread *thread) | |||
7576 | md_new_event(mddev); | 7784 | md_new_event(mddev); |
7577 | update_time = jiffies; | 7785 | update_time = jiffies; |
7578 | 7786 | ||
7787 | if (mddev_is_clustered(mddev)) | ||
7788 | md_cluster_ops->resync_start(mddev, j, max_sectors); | ||
7789 | |||
7579 | blk_start_plug(&plug); | 7790 | blk_start_plug(&plug); |
7580 | while (j < max_sectors) { | 7791 | while (j < max_sectors) { |
7581 | sector_t sectors; | 7792 | sector_t sectors; |
@@ -7636,6 +7847,8 @@ void md_do_sync(struct md_thread *thread) | |||
7636 | j += sectors; | 7847 | j += sectors; |
7637 | if (j > 2) | 7848 | if (j > 2) |
7638 | mddev->curr_resync = j; | 7849 | mddev->curr_resync = j; |
7850 | if (mddev_is_clustered(mddev)) | ||
7851 | md_cluster_ops->resync_info_update(mddev, j, max_sectors); | ||
7639 | mddev->curr_mark_cnt = io_sectors; | 7852 | mddev->curr_mark_cnt = io_sectors; |
7640 | if (last_check == 0) | 7853 | if (last_check == 0) |
7641 | /* this is the earliest that rebuild will be | 7854 | /* this is the earliest that rebuild will be |
@@ -7696,6 +7909,9 @@ void md_do_sync(struct md_thread *thread) | |||
7696 | /* tell personality that we are finished */ | 7909 | /* tell personality that we are finished */ |
7697 | mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); | 7910 | mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); |
7698 | 7911 | ||
7912 | if (mddev_is_clustered(mddev)) | ||
7913 | md_cluster_ops->resync_finish(mddev); | ||
7914 | |||
7699 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && | 7915 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && |
7700 | mddev->curr_resync > 2) { | 7916 | mddev->curr_resync > 2) { |
7701 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 7917 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
@@ -7925,8 +8141,13 @@ void md_check_recovery(struct mddev *mddev) | |||
7925 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 8141 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
7926 | } | 8142 | } |
7927 | 8143 | ||
7928 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 8144 | if (mddev->flags & MD_UPDATE_SB_FLAGS) { |
8145 | if (mddev_is_clustered(mddev)) | ||
8146 | md_cluster_ops->metadata_update_start(mddev); | ||
7929 | md_update_sb(mddev, 0); | 8147 | md_update_sb(mddev, 0); |
8148 | if (mddev_is_clustered(mddev)) | ||
8149 | md_cluster_ops->metadata_update_finish(mddev); | ||
8150 | } | ||
7930 | 8151 | ||
7931 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 8152 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
7932 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 8153 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
@@ -8024,6 +8245,8 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8024 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8245 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
8025 | } | 8246 | } |
8026 | } | 8247 | } |
8248 | if (mddev_is_clustered(mddev)) | ||
8249 | md_cluster_ops->metadata_update_start(mddev); | ||
8027 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 8250 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
8028 | mddev->pers->finish_reshape) | 8251 | mddev->pers->finish_reshape) |
8029 | mddev->pers->finish_reshape(mddev); | 8252 | mddev->pers->finish_reshape(mddev); |
@@ -8036,6 +8259,8 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8036 | rdev->saved_raid_disk = -1; | 8259 | rdev->saved_raid_disk = -1; |
8037 | 8260 | ||
8038 | md_update_sb(mddev, 1); | 8261 | md_update_sb(mddev, 1); |
8262 | if (mddev_is_clustered(mddev)) | ||
8263 | md_cluster_ops->metadata_update_finish(mddev); | ||
8039 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 8264 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
8040 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 8265 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
8041 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 8266 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
@@ -8656,6 +8881,28 @@ err_wq: | |||
8656 | return ret; | 8881 | return ret; |
8657 | } | 8882 | } |
8658 | 8883 | ||
8884 | void md_reload_sb(struct mddev *mddev) | ||
8885 | { | ||
8886 | struct md_rdev *rdev, *tmp; | ||
8887 | |||
8888 | rdev_for_each_safe(rdev, tmp, mddev) { | ||
8889 | rdev->sb_loaded = 0; | ||
8890 | ClearPageUptodate(rdev->sb_page); | ||
8891 | } | ||
8892 | mddev->raid_disks = 0; | ||
8893 | analyze_sbs(mddev); | ||
8894 | rdev_for_each_safe(rdev, tmp, mddev) { | ||
8895 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); | ||
8896 | /* since we don't write to faulty devices, we figure out if the | ||
8897 | * disk is faulty by comparing events | ||
8898 | */ | ||
8899 | if (mddev->events > sb->events) | ||
8900 | set_bit(Faulty, &rdev->flags); | ||
8901 | } | ||
8902 | |||
8903 | } | ||
8904 | EXPORT_SYMBOL(md_reload_sb); | ||
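
md_reload_sb() classifies devices purely by event counts: a superblock whose events lag the array's must have missed updates, so the device is marked Faulty. A toy version of the comparison, with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long array_events = 42;
	unsigned long long sb_events[3] = { 42, 42, 37 };  /* per-device counters */

	for (int i = 0; i < 3; i++)          /* mirrors the rdev_for_each loop */
		printf("rdev %d: %s\n", i,
		       array_events > sb_events[i] ? "Faulty" : "ok");
	return 0;
}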
8905 | |||
8659 | #ifndef MODULE | 8906 | #ifndef MODULE |
8660 | 8907 | ||
8661 | /* | 8908 | /* |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 318ca8fd430f..ecdce36ec6b8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/timer.h> | 23 | #include <linux/timer.h> |
24 | #include <linux/wait.h> | 24 | #include <linux/wait.h> |
25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
26 | #include "md-cluster.h" | ||
26 | 27 | ||
27 | #define MaxSector (~(sector_t)0) | 28 | #define MaxSector (~(sector_t)0) |
28 | 29 | ||
@@ -170,6 +171,10 @@ enum flag_bits { | |||
170 | * a want_replacement device with same | 171 | * a want_replacement device with same |
171 | * raid_disk number. | 172 | * raid_disk number. |
172 | */ | 173 | */ |
174 | Candidate, /* For clustered environments only: | ||
175 | * This device is seen locally but not | ||
176 | * by the whole cluster | ||
177 | */ | ||
173 | }; | 178 | }; |
174 | 179 | ||
175 | #define BB_LEN_MASK (0x00000000000001FFULL) | 180 | #define BB_LEN_MASK (0x00000000000001FFULL) |
@@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
202 | int is_new); | 207 | int is_new); |
203 | extern void md_ack_all_badblocks(struct badblocks *bb); | 208 | extern void md_ack_all_badblocks(struct badblocks *bb); |
204 | 209 | ||
210 | struct md_cluster_info; | ||
211 | |||
205 | struct mddev { | 212 | struct mddev { |
206 | void *private; | 213 | void *private; |
207 | struct md_personality *pers; | 214 | struct md_personality *pers; |
@@ -430,6 +437,8 @@ struct mddev { | |||
430 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 437 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
431 | unsigned long max_write_behind; /* write-behind mode */ | 438 | unsigned long max_write_behind; /* write-behind mode */ |
432 | int external; | 439 | int external; |
440 | int nodes; /* Maximum number of nodes in the cluster */ | ||
441 | char cluster_name[64]; /* Name of the cluster */ | ||
433 | } bitmap_info; | 442 | } bitmap_info; |
434 | 443 | ||
435 | atomic_t max_corr_read_errors; /* max read retries */ | 444 | atomic_t max_corr_read_errors; /* max read retries */ |
@@ -448,6 +457,7 @@ struct mddev { | |||
448 | struct work_struct flush_work; | 457 | struct work_struct flush_work; |
449 | struct work_struct event_work; /* used by dm to report failure event */ | 458 | struct work_struct event_work; /* used by dm to report failure event */ |
450 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); | 459 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); |
460 | struct md_cluster_info *cluster_info; | ||
451 | }; | 461 | }; |
452 | 462 | ||
453 | static inline int __must_check mddev_lock(struct mddev *mddev) | 463 | static inline int __must_check mddev_lock(struct mddev *mddev) |
@@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p) | |||
608 | 618 | ||
609 | extern int register_md_personality(struct md_personality *p); | 619 | extern int register_md_personality(struct md_personality *p); |
610 | extern int unregister_md_personality(struct md_personality *p); | 620 | extern int unregister_md_personality(struct md_personality *p); |
621 | extern int register_md_cluster_operations(struct md_cluster_operations *ops, | ||
622 | struct module *module); | ||
623 | extern int unregister_md_cluster_operations(void); | ||
624 | extern int md_setup_cluster(struct mddev *mddev, int nodes); | ||
625 | extern void md_cluster_stop(struct mddev *mddev); | ||
611 | extern struct md_thread *md_register_thread( | 626 | extern struct md_thread *md_register_thread( |
612 | void (*run)(struct md_thread *thread), | 627 | void (*run)(struct md_thread *thread), |
613 | struct mddev *mddev, | 628 | struct mddev *mddev, |
@@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | |||
654 | struct mddev *mddev); | 669 | struct mddev *mddev); |
655 | 670 | ||
656 | extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); | 671 | extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); |
672 | extern void md_reload_sb(struct mddev *mddev); | ||
673 | extern void md_update_sb(struct mddev *mddev, int force); | ||
674 | extern void md_kick_rdev_from_array(struct md_rdev * rdev); | ||
675 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); | ||
657 | static inline int mddev_check_plugged(struct mddev *mddev) | 676 | static inline int mddev_check_plugged(struct mddev *mddev) |
658 | { | 677 | { |
659 | return !!blk_check_plugged(md_unplug, mddev, | 678 | return !!blk_check_plugged(md_unplug, mddev, |
@@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) | |||
669 | } | 688 | } |
670 | } | 689 | } |
671 | 690 | ||
691 | extern struct md_cluster_operations *md_cluster_ops; | ||
692 | static inline int mddev_is_clustered(struct mddev *mddev) | ||
693 | { | ||
694 | return mddev->cluster_info && mddev->bitmap_info.nodes > 1; | ||
695 | } | ||
672 | #endif /* _MD_MD_H */ | 696 | #endif /* _MD_MD_H */ |
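
mddev_is_clustered() is the single gate every md.c hunk above tests before calling through md_cluster_ops. A minimal userspace mock of the predicate, with stand-in types (the real helper reads mddev->cluster_info and mddev->bitmap_info.nodes):

#include <stdio.h>

struct bitmap_info_mock { int nodes; };
struct mddev_mock {
	void *cluster_info;                  /* set once join() succeeds */
	struct bitmap_info_mock bitmap_info;
};

static int mddev_is_clustered(const struct mddev_mock *mddev)
{
	return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
}

int main(void)
{
	struct mddev_mock local = { NULL, { 0 } };
	struct mddev_mock clustered = { (void *)1, { 4 } };  /* dummy non-NULL */

	printf("local=%d clustered=%d\n",
	       mddev_is_clustered(&local), mddev_is_clustered(&clustered));
	return 0;
}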
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d34e238afa54..4efa50186a2a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
539 | has_nonrot_disk = 0; | 539 | has_nonrot_disk = 0; |
540 | choose_next_idle = 0; | 540 | choose_next_idle = 0; |
541 | 541 | ||
542 | choose_first = (conf->mddev->recovery_cp < this_sector + sectors); | 542 | if ((conf->mddev->recovery_cp < this_sector + sectors) || |
543 | (mddev_is_clustered(conf->mddev) && | ||
544 | md_cluster_ops->area_resyncing(conf->mddev, this_sector, | ||
545 | this_sector + sectors))) | ||
546 | choose_first = 1; | ||
547 | else | ||
548 | choose_first = 0; | ||
543 | 549 | ||
544 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { | 550 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { |
545 | sector_t dist; | 551 | sector_t dist; |
@@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1102 | md_write_start(mddev, bio); /* wait on superblock update early */ | 1108 | md_write_start(mddev, bio); /* wait on superblock update early */ |
1103 | 1109 | ||
1104 | if (bio_data_dir(bio) == WRITE && | 1110 | if (bio_data_dir(bio) == WRITE && |
1105 | bio_end_sector(bio) > mddev->suspend_lo && | 1111 | ((bio_end_sector(bio) > mddev->suspend_lo && |
1106 | bio->bi_iter.bi_sector < mddev->suspend_hi) { | 1112 | bio->bi_iter.bi_sector < mddev->suspend_hi) || |
1113 | (mddev_is_clustered(mddev) && | ||
1114 | md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) { | ||
1107 | /* As the suspend_* range is controlled by | 1115 | /* As the suspend_* range is controlled by |
1108 | * userspace, we want an interruptible | 1116 | * userspace, we want an interruptible |
1109 | * wait. | 1117 | * wait. |
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1114 | prepare_to_wait(&conf->wait_barrier, | 1122 | prepare_to_wait(&conf->wait_barrier, |
1115 | &w, TASK_INTERRUPTIBLE); | 1123 | &w, TASK_INTERRUPTIBLE); |
1116 | if (bio_end_sector(bio) <= mddev->suspend_lo || | 1124 | if (bio_end_sector(bio) <= mddev->suspend_lo || |
1117 | bio->bi_iter.bi_sector >= mddev->suspend_hi) | 1125 | bio->bi_iter.bi_sector >= mddev->suspend_hi || |
1126 | (mddev_is_clustered(mddev) && | ||
1127 | !md_cluster_ops->area_resyncing(mddev, | ||
1128 | bio->bi_iter.bi_sector, bio_end_sector(bio)))) | ||
1118 | break; | 1129 | break; |
1119 | schedule(); | 1130 | schedule(); |
1120 | } | 1131 | } |
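On the write side, make_request() now treats a remote resync window the
same way as the local suspend_lo/suspend_hi window: the writer parks on
conf->wait_barrier and retests the range after every wakeup. A
simplified, deliberately conservative restatement of when the write may
proceed (the helper is illustrative, not part of the patch):

	/* Illustrative: the write must stall while it overlaps either
	 * the local suspend window or a remote resync window.
	 */
	static bool write_may_proceed(struct mddev *mddev, struct bio *bio)
	{
		sector_t lo = bio->bi_iter.bi_sector;
		sector_t hi = bio_end_sector(bio);

		if (hi > mddev->suspend_lo && lo < mddev->suspend_hi)
			return false;		/* locally suspended */
		if (mddev_is_clustered(mddev) &&
		    md_cluster_ops->area_resyncing(mddev, lo, hi))
			return false;		/* remote node resyncing */
		return true;
	}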
@@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev) | |||
1561 | struct md_rdev *rdev = conf->mirrors[i].rdev; | 1572 | struct md_rdev *rdev = conf->mirrors[i].rdev; |
1562 | struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; | 1573 | struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; |
1563 | if (repl | 1574 | if (repl |
1575 | && !test_bit(Candidate, &repl->flags) | ||
1564 | && repl->recovery_offset == MaxSector | 1576 | && repl->recovery_offset == MaxSector |
1565 | && !test_bit(Faulty, &repl->flags) | 1577 | && !test_bit(Faulty, &repl->flags) |
1566 | && !test_and_set_bit(In_sync, &repl->flags)) { | 1578 | && !test_and_set_bit(In_sync, &repl->flags)) { |
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 49f4210d4394..2ae6131e69a5 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h | |||
@@ -78,6 +78,12 @@ | |||
78 | #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ | 78 | #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ |
79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ | 79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ |
80 | #define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ | 80 | #define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ |
81 | #define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster | ||
82 | * For clustered environments only. | ||
83 | */ | ||
84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed | ||
85 | * For clustered environments only. | ||
86 | */ | ||
81 | 87 | ||
82 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. | 88 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. |
83 | * read requests will only be sent here in | 89 | * read requests will only be sent here in |
@@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s { | |||
101 | #define MD_SB_CLEAN 0 | 107 | #define MD_SB_CLEAN 0 |
102 | #define MD_SB_ERRORS 1 | 108 | #define MD_SB_ERRORS 1 |
103 | 109 | ||
110 | #define MD_SB_CLUSTERED 5 /* MD is clustered */ | ||
104 | #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ | 111 | #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ |
105 | 112 | ||
106 | /* | 113 | /* |
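MD_DISK_CLUSTER_ADD and MD_DISK_CANDIDATE are ordinary disk-state bits in
mdu_disk_info_t, so userspace requests a cluster-wide add simply by
setting the bit before issuing ADD_NEW_DISK. A hypothetical mdadm-style
sketch (the function name and error handling are illustrative):

	#include <sys/ioctl.h>
	#include <sys/sysmacros.h>
	#include <linux/raid/md_p.h>
	#include <linux/raid/md_u.h>

	/* Ask the kernel to propagate this disk add to every node. */
	static int add_disk_clustered(int md_fd, dev_t dev)
	{
		mdu_disk_info_t info = { 0 };

		info.major = major(dev);
		info.minor = minor(dev);
		info.state = 1 << MD_DISK_CLUSTER_ADD;
		return ioctl(md_fd, ADD_NEW_DISK, &info);
	}

On the other nodes the device is held as MD_DISK_CANDIDATE (mirrored in
the kernel's Candidate rdev flag) until every node acknowledges it, which
is why the raid1_spare_active() hunk above refuses to promote a Candidate
replacement to In_sync.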
diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 74e7c60c4716..1cb8aa6850b5 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h | |||
@@ -62,6 +62,7 @@ | |||
62 | #define STOP_ARRAY _IO (MD_MAJOR, 0x32) | 62 | #define STOP_ARRAY _IO (MD_MAJOR, 0x32) |
63 | #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) | 63 | #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) |
64 | #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) | 64 | #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) |
65 | #define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) | ||
65 | 66 | ||
66 | /* 63 partitions with the alternate major number (mdp) */ | 67 | /* 63 partitions with the alternate major number (mdp) */ |
67 | #define MdpMinorShift 6 | 68 | #define MdpMinorShift 6 |
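CLUSTERED_DISK_NACK is a bare _IO ioctl that carries no argument; a node
that cannot open or validate a candidate device would use it to veto the
pending add. A minimal userspace sketch (assumes md_fd is an open md
device node; the function name is illustrative):

	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>

	/* Reject the pending candidate disk on this node. */
	static int nack_candidate_disk(int md_fd)
	{
		return ioctl(md_fd, CLUSTERED_DISK_NACK);
	}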