-rw-r--r-- | Documentation/md-cluster.txt | 176 | ||||
-rw-r--r-- | crypto/async_tx/async_pq.c | 19 | ||||
-rw-r--r-- | drivers/md/Kconfig | 16 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 189 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 10 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 965 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 29 | ||||
-rw-r--r-- | drivers/md/md.c | 382 | ||||
-rw-r--r-- | drivers/md/md.h | 26 | ||||
-rw-r--r-- | drivers/md/raid0.c | 48 | ||||
-rw-r--r-- | drivers/md/raid1.c | 29 | ||||
-rw-r--r-- | drivers/md/raid10.c | 8 | ||||
-rw-r--r-- | drivers/md/raid5.c | 826 | ||||
-rw-r--r-- | drivers/md/raid5.h | 59 | ||||
-rw-r--r-- | include/linux/async_tx.h | 3 | ||||
-rw-r--r-- | include/linux/raid/pq.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/raid/md_p.h | 7 | ||||
-rw-r--r-- | include/uapi/linux/raid/md_u.h | 1 | ||||
-rw-r--r-- | lib/raid6/algos.c | 41 | ||||
-rw-r--r-- | lib/raid6/altivec.uc | 1 | ||||
-rw-r--r-- | lib/raid6/avx2.c | 3 | ||||
-rw-r--r-- | lib/raid6/int.uc | 41 | ||||
-rw-r--r-- | lib/raid6/mmx.c | 2 | ||||
-rw-r--r-- | lib/raid6/neon.c | 1 | ||||
-rw-r--r-- | lib/raid6/sse1.c | 2 | ||||
-rw-r--r-- | lib/raid6/sse2.c | 227 | ||||
-rw-r--r-- | lib/raid6/test/test.c | 51 | ||||
-rw-r--r-- | lib/raid6/tilegx.uc | 1 |
29 files changed, 2860 insertions, 305 deletions
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt new file mode 100644 index 000000000000..de1af7db3355 --- /dev/null +++ b/Documentation/md-cluster.txt | |||
@@ -0,0 +1,176 @@ | |||
1 | The cluster MD is a shared-device RAID for a cluster. | ||
2 | |||
3 | |||
4 | 1. On-disk format | ||
5 | |||
6 | Separate write-intent bitmaps are used for each cluster node. | ||
7 | The bitmaps record all writes that may have been started on that node, | ||
8 | and may not yet have finished. The on-disk layout is: | ||
9 | |||
10 | 0 4k 8k 12k | ||
11 | ------------------------------------------------------------------- | ||
12 | | idle | md super | bm super [0] + bits | | ||
13 | | bm bits[0, contd] | bm super[1] + bits | bm bits[1, contd] | | ||
14 | | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits | | ||
15 | | bm bits [3, contd] | | | | ||
16 | |||
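For illustration, the start of a given slot's bitmap region can be computed as in the following sketch (a hypothetical helper that mirrors the bitmap_read_sb() "re_read" hunk later in this series):

/* Hypothetical helper: where slot's bitmap starts, in 512-byte sectors
 * from the start of the device, given the base bitmap offset.
 */
static sector_t bitmap_slot_offset(struct mddev *mddev, int slot)
{
	sector_t bm_blocks = mddev->resync_max_sectors;

	/* number of bitmap chunks, i.e. number of bits */
	sector_div(bm_blocks, mddev->bitmap_info.chunksize >> 9);
	/* bits to bytes, plus the per-slot bitmap superblock */
	bm_blocks = ((bm_blocks + 7) >> 3) + sizeof(bitmap_super_t);
	/* bytes to 4k blocks */
	bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
	/* each 4k block is 8 sectors; add to the base bitmap offset */
	return mddev->bitmap_info.offset + slot * (bm_blocks << 3);
}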
17 | During "normal" functioning we assume the filesystem ensures that only one | ||
18 | node writes to any given block at a time, so a write | ||
19 | request will | ||
20 | - set the appropriate bit (if not already set) | ||
21 | - commit the write to all mirrors | ||
22 | - schedule the bit to be cleared after a timeout. | ||
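As a rough sketch of that sequence (assuming the existing md bitmap API declared in drivers/md/bitmap.h, which this series does not change), a mirrored personality brackets each write roughly as follows; error handling is omitted:

	/* sets the bit covering the range being written */
	bitmap_startwrite(mddev->bitmap, bio->bi_iter.bi_sector,
			  bio_sectors(bio), 0 /* no write-behind */);
	/* ... commit the write to all mirrors and wait for completion ... */
	/* schedules the bit to be cleared by the bitmap daemon after a delay */
	bitmap_endwrite(mddev->bitmap, bio->bi_iter.bi_sector,
			bio_sectors(bio), 1 /* success */, 0);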
23 | |||
24 | Reads are just handled normally. It is up to the filesystem to | ||
25 | ensure one node doesn't read from a location where another node (or the same | ||
26 | node) is writing. | ||
27 | |||
28 | |||
29 | 2. DLM Locks for management | ||
30 | |||
31 | The following locks are used for managing the device: | ||
32 | |||
33 | 2.1 Bitmap lock resource (bm_lockres) | ||
34 | |||
35 | The bm_lockres protects the individual node bitmaps. They are named in | ||
36 | the form bitmap0000 for node 1, bitmap0001 for node 2, and so on. When | ||
37 | a node joins the cluster, it acquires the lock in PW mode and holds it | ||
38 | for as long as the node is part of the cluster. The lock resource | ||
39 | number is based on the slot number returned by the DLM subsystem. Since | ||
40 | DLM starts node count from one and bitmap slots start from zero, one is | ||
41 | subtracted from the DLM slot number to arrive at the bitmap slot number. | ||
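For example, the lock resource name can be derived from the DLM slot number as in this illustrative helper (the code added by this series formats the name inline with "bitmap%04d" instead):

/* Illustrative only: DLM slots are 1-based, bitmap slots are 0-based. */
static void bitmap_lockres_name(char *buf, size_t len, int dlm_slot)
{
	snprintf(buf, len, "bitmap%04d", dlm_slot - 1);
}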
42 | |||
43 | 3. Communication | ||
44 | |||
45 | Each node has to communicate with other nodes when starting or ending | ||
46 | resync, and metadata superblock updates. | ||
47 | |||
48 | 3.1 Message Types | ||
49 | |||
50 | There are three types of messages which are passed: | ||
51 | |||
52 | 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been | ||
53 | updated, and the node must re-read the md superblock. This is performed | ||
54 | synchronously. | ||
55 | |||
56 | 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended | ||
57 | so that each node may suspend or resume the region. | ||
58 | 3.1.3 NEWDISK: informs other nodes that a device is being added to the array; see section 5. | ||
59 | 3.2 Communication mechanism | ||
60 | |||
61 | The DLM LVB is used to communicate between the nodes of the cluster. There | ||
62 | are three resources used for the purpose: | ||
63 | |||
64 | 3.2.1 Token: The resource which protects the entire communication | ||
65 | system. The node having the token resource is allowed to | ||
66 | communicate. | ||
67 | |||
68 | 3.2.2 Message: The lock resource which carries the data to | ||
69 | communicate. | ||
70 | |||
71 | 3.2.3 Ack: The resource whose acquisition means the message has been | ||
72 | acknowledged by all nodes in the cluster. The BAST of the resource | ||
73 | is used to inform the receiving nodes that a node wants to communicate. | ||
74 | |||
75 | The algorithm is: | ||
76 | |||
77 | 1. receive status | ||
78 | |||
79 | sender receiver receiver | ||
80 | ACK:CR ACK:CR ACK:CR | ||
81 | |||
82 | 2. sender get EX of TOKEN | ||
83 | sender get EX of MESSAGE | ||
84 | sender receiver receiver | ||
85 | TOKEN:EX ACK:CR ACK:CR | ||
86 | MESSAGE:EX | ||
87 | ACK:CR | ||
88 | |||
89 | Sender checks that it still needs to send a message. Messages received | ||
90 | or other events that happened while waiting for the TOKEN may have made | ||
91 | this message inappropriate or redundant. | ||
92 | |||
93 | 3. sender write LVB. | ||
94 | sender down-convert MESSAGE from EX to CR | ||
95 | sender try to get EX of ACK | ||
96 | [ wait until all receivers have *processed* the MESSAGE ] | ||
97 | |||
98 | [ triggered by bast of ACK ] | ||
99 | receiver get CR of MESSAGE | ||
100 | receiver read LVB | ||
101 | receiver processes the message | ||
102 | [ wait finish ] | ||
103 | receiver release ACK | ||
104 | |||
105 | sender receiver receiver | ||
106 | TOKEN:EX MESSAGE:CR MESSAGE:CR | ||
107 | MESSAGE:CR | ||
108 | ACK:EX | ||
109 | |||
110 | 4. triggered by grant of EX on ACK (indicating all receivers have processed | ||
111 | message) | ||
112 | sender down-convert ACK from EX to CR | ||
113 | sender release MESSAGE | ||
114 | sender release TOKEN | ||
115 | receiver upconvert to EX of MESSAGE | ||
116 | receiver get CR of ACK | ||
117 | receiver release MESSAGE | ||
118 | |||
119 | sender receiver receiver | ||
120 | ACK:CR ACK:CR ACK:CR | ||
121 | |||
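In kernel terms, the sender side condenses to the following sequence of synchronous DLM calls (a sketch of the lock_comm()/__sendmsg() code added later in this series; the wrapper name is illustrative and error handling is omitted):

static int send_cluster_msg(struct md_cluster_info *cinfo,
			    struct cluster_msg *cmsg)
{
	dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);   /* step 2: TOKEN:EX */
	dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); /* step 2: MESSAGE:EX */
	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, cmsg,
	       sizeof(*cmsg));                              /* step 3: write LVB */
	dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); /* step 3: MESSAGE EX->CR */
	dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);     /* step 3: wait for all ACKs */
	dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);     /* step 4: ACK back to CR */
	dlm_unlock_sync(cinfo->message_lockres);            /* step 4: release MESSAGE */
	dlm_unlock_sync(cinfo->token_lockres);               /* step 4: release TOKEN */
	return 0;
}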
122 | |||
123 | 4. Handling Failures | ||
124 | |||
125 | 4.1 Node Failure | ||
126 | When a node fails, the DLM informs the cluster with the slot number of the | ||
127 | failed node. Each surviving node starts a cluster recovery thread, which: | ||
128 | - acquires the bitmap<number> lock of the failed node | ||
129 | - opens the bitmap | ||
130 | - reads the bitmap of the failed node | ||
131 | - copies the set bits to the local node's bitmap | ||
132 | - cleans the bitmap of the failed node | ||
133 | - releases bitmap<number> lock of the failed node | ||
134 | - initiates resync of the bitmap on the current node | ||
135 | |||
136 | The resync process is the regular md resync. However, in a clustered | ||
137 | environment, the node performing a resync needs to tell the other nodes | ||
138 | which areas are suspended. Before a resync starts, the node sends out | ||
139 | RESYNC_START with the (lo,hi) range of the area which needs to be | ||
140 | suspended. Each node maintains a suspend_list, which contains the | ||
141 | ranges which are currently suspended. On receiving RESYNC_START, the | ||
142 | node adds the range to the suspend_list. Similarly, when the node | ||
143 | performing the resync finishes, it sends RESYNC_FINISHED to the other | ||
144 | nodes, which then remove the corresponding entry from the | ||
145 | suspend_list. | ||
146 | |||
147 | A helper function, should_suspend(), can be used to check whether a | ||
148 | particular I/O range should be suspended or not. | ||
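A possible shape for that helper, assuming the suspend_list/suspend_lock fields of struct md_cluster_info introduced by this series (the function name follows this document; the in-tree implementation may differ):

static bool should_suspend(struct md_cluster_info *cinfo,
			   sector_t lo, sector_t hi)
{
	struct suspend_info *s;
	bool ret = false;

	spin_lock_irq(&cinfo->suspend_lock);
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			/* overlaps a range another node has suspended */
			ret = true;
			break;
		}
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}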
149 | |||
150 | 4.2 Device Failure | ||
151 | Device failures are handled and communicated via the metadata update | ||
152 | routine. | ||
153 | |||
154 | 5. Adding a new Device | ||
155 | For adding a new device, it is necessary that all nodes "see" the new device | ||
156 | to be added. For this, the following algorithm is used: | ||
157 | |||
158 | 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY, which issues | ||
159 | ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD) | ||
160 | 2. Node 1 sends NEWDISK with uuid and slot number | ||
161 | 3. Other nodes issue kobject_uevent_env with uuid and slot number | ||
162 | (Steps 4,5 could be a udev rule) | ||
163 | 4. In userspace, the node searches for the disk, perhaps | ||
164 | using blkid -t SUB_UUID="" | ||
165 | 5. Other nodes issue either of the following, depending on whether the | ||
166 | disk was found (a hypothetical userspace sketch follows this list): | ||
167 | ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and | ||
168 | disc.number set to slot number) | ||
169 | ioctl(CLUSTERED_DISK_NACK) | ||
170 | 6. Other nodes drop lock on no-new-devs (CR) if device is found | ||
171 | 7. Node 1 attempts EX lock on no-new-devs | ||
172 | 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk | ||
173 | as SpareLocal | ||
174 | 9. If node 1 does not get the lock, it fails the operation and sends METADATA_UPDATED | ||
175 | 10. Other nodes learn whether the disk was added or not from the | ||
176 | METADATA_UPDATED message that follows. | ||
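A hypothetical userspace sketch for step 5 on a node that found the device (the helper name is illustrative; ADD_NEW_DISK and mdu_disk_info_t come from <linux/raid/md_u.h>, MD_DISK_CANDIDATE from the md_p.h change in this series):

#include <sys/ioctl.h>
#include <linux/raid/md_u.h>
#include <linux/raid/md_p.h>

/* Acknowledge the NEWDISK message by adding the device as a cluster
 * candidate, using the slot number received in the message.
 */
static int add_candidate_disk(int md_fd, int dev_major, int dev_minor, int slot)
{
	mdu_disk_info_t info = {
		.number = slot,
		.major  = dev_major,
		.minor  = dev_minor,
		.state  = 1 << MD_DISK_CANDIDATE,
	};

	return ioctl(md_fd, ADD_NEW_DISK, &info);
}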
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index d05327caf69d..5d355e0c2633 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c | |||
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, | |||
124 | { | 124 | { |
125 | void **srcs; | 125 | void **srcs; |
126 | int i; | 126 | int i; |
127 | int start = -1, stop = disks - 3; | ||
127 | 128 | ||
128 | if (submit->scribble) | 129 | if (submit->scribble) |
129 | srcs = submit->scribble; | 130 | srcs = submit->scribble; |
@@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, | |||
134 | if (blocks[i] == NULL) { | 135 | if (blocks[i] == NULL) { |
135 | BUG_ON(i > disks - 3); /* P or Q can't be zero */ | 136 | BUG_ON(i > disks - 3); /* P or Q can't be zero */ |
136 | srcs[i] = (void*)raid6_empty_zero_page; | 137 | srcs[i] = (void*)raid6_empty_zero_page; |
137 | } else | 138 | } else { |
138 | srcs[i] = page_address(blocks[i]) + offset; | 139 | srcs[i] = page_address(blocks[i]) + offset; |
140 | if (i < disks - 2) { | ||
141 | stop = i; | ||
142 | if (start == -1) | ||
143 | start = i; | ||
144 | } | ||
145 | } | ||
139 | } | 146 | } |
140 | raid6_call.gen_syndrome(disks, len, srcs); | 147 | if (submit->flags & ASYNC_TX_PQ_XOR_DST) { |
148 | BUG_ON(!raid6_call.xor_syndrome); | ||
149 | if (start >= 0) | ||
150 | raid6_call.xor_syndrome(disks, start, stop, len, srcs); | ||
151 | } else | ||
152 | raid6_call.gen_syndrome(disks, len, srcs); | ||
141 | async_tx_sync_epilog(submit); | 153 | async_tx_sync_epilog(submit); |
142 | } | 154 | } |
143 | 155 | ||
@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, | |||
178 | if (device) | 190 | if (device) |
179 | unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); | 191 | unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); |
180 | 192 | ||
181 | if (unmap && | 193 | /* XORing P/Q is only implemented in software */ |
194 | if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) && | ||
182 | (src_cnt <= dma_maxpq(device, 0) || | 195 | (src_cnt <= dma_maxpq(device, 0) || |
183 | dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && | 196 | dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && |
184 | is_dma_pq_aligned(device, offset, 0, len)) { | 197 | is_dma_pq_aligned(device, offset, 0, len)) { |
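For context, a hedged sketch of how a caller might use the new flag (the surrounding variables are assumptions, not code from this hunk): with ASYNC_TX_PQ_XOR_DST set, P and Q are treated as already holding a syndrome, and only the non-NULL data blocks are XOR-folded into them via xor_syndrome() rather than the whole syndrome being regenerated.

	init_async_submit(&submit, ASYNC_TX_PQ_XOR_DST, tx,
			  callback, callback_arg, scribble);
	/* NULL entries in blocks[] contribute nothing; the P/Q pages at the
	 * end of blocks[] are updated in place rather than overwritten.
	 */
	tx = async_gen_syndrome(blocks, 0, disks, len, &submit);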
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6ddc983417d5..edcf4ab66e00 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -175,6 +175,22 @@ config MD_FAULTY | |||
175 | 175 | ||
176 | In unsure, say N. | 176 | In unsure, say N. |
177 | 177 | ||
178 | |||
179 | config MD_CLUSTER | ||
180 | tristate "Cluster Support for MD (EXPERIMENTAL)" | ||
181 | depends on BLK_DEV_MD | ||
182 | depends on DLM | ||
183 | default n | ||
184 | ---help--- | ||
185 | Clustering support for MD devices. This enables locking and | ||
186 | synchronization across multiple systems on the cluster, so all | ||
187 | nodes in the cluster can access the MD devices simultaneously. | ||
188 | |||
189 | This brings the redundancy (and uptime) of RAID levels across the | ||
190 | nodes of the cluster. | ||
191 | |||
192 | If unsure, say N. | ||
193 | |||
178 | source "drivers/md/bcache/Kconfig" | 194 | source "drivers/md/bcache/Kconfig" |
179 | 195 | ||
180 | config BLK_DEV_DM_BUILTIN | 196 | config BLK_DEV_DM_BUILTIN |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1863feaa5846..dba4db5985fb 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o | |||
30 | obj-$(CONFIG_MD_RAID456) += raid456.o | 30 | obj-$(CONFIG_MD_RAID456) += raid456.o |
31 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | 31 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o |
32 | obj-$(CONFIG_MD_FAULTY) += faulty.o | 32 | obj-$(CONFIG_MD_FAULTY) += faulty.o |
33 | obj-$(CONFIG_MD_CLUSTER) += md-cluster.o | ||
33 | obj-$(CONFIG_BCACHE) += bcache/ | 34 | obj-$(CONFIG_BCACHE) += bcache/ |
34 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | 35 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o |
35 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 36 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3a5767968ba0..2bc56e2a3526 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
205 | struct block_device *bdev; | 205 | struct block_device *bdev; |
206 | struct mddev *mddev = bitmap->mddev; | 206 | struct mddev *mddev = bitmap->mddev; |
207 | struct bitmap_storage *store = &bitmap->storage; | 207 | struct bitmap_storage *store = &bitmap->storage; |
208 | int node_offset = 0; | ||
209 | |||
210 | if (mddev_is_clustered(bitmap->mddev)) | ||
211 | node_offset = bitmap->cluster_slot * store->file_pages; | ||
208 | 212 | ||
209 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 213 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
210 | int size = PAGE_SIZE; | 214 | int size = PAGE_SIZE; |
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
433 | /* This might have been changed by a reshape */ | 437 | /* This might have been changed by a reshape */ |
434 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | 438 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); |
435 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); | 439 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); |
440 | sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); | ||
436 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> | 441 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> |
437 | bitmap_info.space); | 442 | bitmap_info.space); |
438 | kunmap_atomic(sb); | 443 | kunmap_atomic(sb); |
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
544 | bitmap_super_t *sb; | 549 | bitmap_super_t *sb; |
545 | unsigned long chunksize, daemon_sleep, write_behind; | 550 | unsigned long chunksize, daemon_sleep, write_behind; |
546 | unsigned long long events; | 551 | unsigned long long events; |
552 | int nodes = 0; | ||
547 | unsigned long sectors_reserved = 0; | 553 | unsigned long sectors_reserved = 0; |
548 | int err = -EINVAL; | 554 | int err = -EINVAL; |
549 | struct page *sb_page; | 555 | struct page *sb_page; |
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
562 | return -ENOMEM; | 568 | return -ENOMEM; |
563 | bitmap->storage.sb_page = sb_page; | 569 | bitmap->storage.sb_page = sb_page; |
564 | 570 | ||
571 | re_read: | ||
572 | /* If cluster_slot is set, the cluster is setup */ | ||
573 | if (bitmap->cluster_slot >= 0) { | ||
574 | sector_t bm_blocks = bitmap->mddev->resync_max_sectors; | ||
575 | |||
576 | sector_div(bm_blocks, | ||
577 | bitmap->mddev->bitmap_info.chunksize >> 9); | ||
578 | /* bits to bytes */ | ||
579 | bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); | ||
580 | /* to 4k blocks */ | ||
581 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); | ||
582 | bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); | ||
583 | pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, | ||
584 | bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); | ||
585 | } | ||
586 | |||
565 | if (bitmap->storage.file) { | 587 | if (bitmap->storage.file) { |
566 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); | 588 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); |
567 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; | 589 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; |
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
577 | if (err) | 599 | if (err) |
578 | return err; | 600 | return err; |
579 | 601 | ||
602 | err = -EINVAL; | ||
580 | sb = kmap_atomic(sb_page); | 603 | sb = kmap_atomic(sb_page); |
581 | 604 | ||
582 | chunksize = le32_to_cpu(sb->chunksize); | 605 | chunksize = le32_to_cpu(sb->chunksize); |
583 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; | 606 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
584 | write_behind = le32_to_cpu(sb->write_behind); | 607 | write_behind = le32_to_cpu(sb->write_behind); |
585 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); | 608 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); |
609 | nodes = le32_to_cpu(sb->nodes); | ||
610 | strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); | ||
586 | 611 | ||
587 | /* verify that the bitmap-specific fields are valid */ | 612 | /* verify that the bitmap-specific fields are valid */ |
588 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) | 613 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) |
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
619 | goto out; | 644 | goto out; |
620 | } | 645 | } |
621 | events = le64_to_cpu(sb->events); | 646 | events = le64_to_cpu(sb->events); |
622 | if (events < bitmap->mddev->events) { | 647 | if (!nodes && (events < bitmap->mddev->events)) { |
623 | printk(KERN_INFO | 648 | printk(KERN_INFO |
624 | "%s: bitmap file is out of date (%llu < %llu) " | 649 | "%s: bitmap file is out of date (%llu < %llu) " |
625 | "-- forcing full recovery\n", | 650 | "-- forcing full recovery\n", |
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
634 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 659 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
635 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); | 660 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); |
636 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 661 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
662 | strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); | ||
637 | err = 0; | 663 | err = 0; |
664 | |||
638 | out: | 665 | out: |
639 | kunmap_atomic(sb); | 666 | kunmap_atomic(sb); |
667 | /* Assigning chunksize is required for "re_read" */ | ||
668 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
669 | if (nodes && (bitmap->cluster_slot < 0)) { | ||
670 | err = md_setup_cluster(bitmap->mddev, nodes); | ||
671 | if (err) { | ||
672 | pr_err("%s: Could not setup cluster service (%d)\n", | ||
673 | bmname(bitmap), err); | ||
674 | goto out_no_sb; | ||
675 | } | ||
676 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); | ||
677 | goto re_read; | ||
678 | } | ||
679 | |||
680 | |||
640 | out_no_sb: | 681 | out_no_sb: |
641 | if (test_bit(BITMAP_STALE, &bitmap->flags)) | 682 | if (test_bit(BITMAP_STALE, &bitmap->flags)) |
642 | bitmap->events_cleared = bitmap->mddev->events; | 683 | bitmap->events_cleared = bitmap->mddev->events; |
643 | bitmap->mddev->bitmap_info.chunksize = chunksize; | 684 | bitmap->mddev->bitmap_info.chunksize = chunksize; |
644 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | 685 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; |
645 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | 686 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; |
687 | bitmap->mddev->bitmap_info.nodes = nodes; | ||
646 | if (bitmap->mddev->bitmap_info.space == 0 || | 688 | if (bitmap->mddev->bitmap_info.space == 0 || |
647 | bitmap->mddev->bitmap_info.space > sectors_reserved) | 689 | bitmap->mddev->bitmap_info.space > sectors_reserved) |
648 | bitmap->mddev->bitmap_info.space = sectors_reserved; | 690 | bitmap->mddev->bitmap_info.space = sectors_reserved; |
649 | if (err) | 691 | if (err) { |
650 | bitmap_print_sb(bitmap); | 692 | bitmap_print_sb(bitmap); |
693 | if (bitmap->cluster_slot < 0) | ||
694 | md_cluster_stop(bitmap->mddev); | ||
695 | } | ||
651 | return err; | 696 | return err; |
652 | } | 697 | } |
653 | 698 | ||
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store, | |||
692 | } | 737 | } |
693 | 738 | ||
694 | static int bitmap_storage_alloc(struct bitmap_storage *store, | 739 | static int bitmap_storage_alloc(struct bitmap_storage *store, |
695 | unsigned long chunks, int with_super) | 740 | unsigned long chunks, int with_super, |
741 | int slot_number) | ||
696 | { | 742 | { |
697 | int pnum; | 743 | int pnum, offset = 0; |
698 | unsigned long num_pages; | 744 | unsigned long num_pages; |
699 | unsigned long bytes; | 745 | unsigned long bytes; |
700 | 746 | ||
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, | |||
703 | bytes += sizeof(bitmap_super_t); | 749 | bytes += sizeof(bitmap_super_t); |
704 | 750 | ||
705 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | 751 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); |
752 | offset = slot_number * (num_pages - 1); | ||
706 | 753 | ||
707 | store->filemap = kmalloc(sizeof(struct page *) | 754 | store->filemap = kmalloc(sizeof(struct page *) |
708 | * num_pages, GFP_KERNEL); | 755 | * num_pages, GFP_KERNEL); |
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, | |||
713 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); | 760 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); |
714 | if (store->sb_page == NULL) | 761 | if (store->sb_page == NULL) |
715 | return -ENOMEM; | 762 | return -ENOMEM; |
716 | store->sb_page->index = 0; | ||
717 | } | 763 | } |
764 | |||
718 | pnum = 0; | 765 | pnum = 0; |
719 | if (store->sb_page) { | 766 | if (store->sb_page) { |
720 | store->filemap[0] = store->sb_page; | 767 | store->filemap[0] = store->sb_page; |
721 | pnum = 1; | 768 | pnum = 1; |
769 | store->sb_page->index = offset; | ||
722 | } | 770 | } |
771 | |||
723 | for ( ; pnum < num_pages; pnum++) { | 772 | for ( ; pnum < num_pages; pnum++) { |
724 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); | 773 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); |
725 | if (!store->filemap[pnum]) { | 774 | if (!store->filemap[pnum]) { |
726 | store->file_pages = pnum; | 775 | store->file_pages = pnum; |
727 | return -ENOMEM; | 776 | return -ENOMEM; |
728 | } | 777 | } |
729 | store->filemap[pnum]->index = pnum; | 778 | store->filemap[pnum]->index = pnum + offset; |
730 | } | 779 | } |
731 | store->file_pages = pnum; | 780 | store->file_pages = pnum; |
732 | 781 | ||
@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) | |||
885 | } | 934 | } |
886 | } | 935 | } |
887 | 936 | ||
937 | static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) | ||
938 | { | ||
939 | unsigned long bit; | ||
940 | struct page *page; | ||
941 | void *paddr; | ||
942 | unsigned long chunk = block >> bitmap->counts.chunkshift; | ||
943 | int set = 0; | ||
944 | |||
945 | page = filemap_get_page(&bitmap->storage, chunk); | ||
946 | if (!page) | ||
947 | return -EINVAL; | ||
948 | bit = file_page_offset(&bitmap->storage, chunk); | ||
949 | paddr = kmap_atomic(page); | ||
950 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) | ||
951 | set = test_bit(bit, paddr); | ||
952 | else | ||
953 | set = test_bit_le(bit, paddr); | ||
954 | kunmap_atomic(paddr); | ||
955 | return set; | ||
956 | } | ||
957 | |||
958 | |||
888 | /* this gets called when the md device is ready to unplug its underlying | 959 | /* this gets called when the md device is ready to unplug its underlying |
889 | * (slave) device queues -- before we let any writes go down, we need to | 960 | * (slave) device queues -- before we let any writes go down, we need to |
890 | * sync the dirty pages of the bitmap file to disk */ | 961 | * sync the dirty pages of the bitmap file to disk */ |
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
935 | */ | 1006 | */ |
936 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | 1007 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) |
937 | { | 1008 | { |
938 | unsigned long i, chunks, index, oldindex, bit; | 1009 | unsigned long i, chunks, index, oldindex, bit, node_offset = 0; |
939 | struct page *page = NULL; | 1010 | struct page *page = NULL; |
940 | unsigned long bit_cnt = 0; | 1011 | unsigned long bit_cnt = 0; |
941 | struct file *file; | 1012 | struct file *file; |
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
981 | if (!bitmap->mddev->bitmap_info.external) | 1052 | if (!bitmap->mddev->bitmap_info.external) |
982 | offset = sizeof(bitmap_super_t); | 1053 | offset = sizeof(bitmap_super_t); |
983 | 1054 | ||
1055 | if (mddev_is_clustered(bitmap->mddev)) | ||
1056 | node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); | ||
1057 | |||
984 | for (i = 0; i < chunks; i++) { | 1058 | for (i = 0; i < chunks; i++) { |
985 | int b; | 1059 | int b; |
986 | index = file_page_index(&bitmap->storage, i); | 1060 | index = file_page_index(&bitmap->storage, i); |
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1001 | bitmap->mddev, | 1075 | bitmap->mddev, |
1002 | bitmap->mddev->bitmap_info.offset, | 1076 | bitmap->mddev->bitmap_info.offset, |
1003 | page, | 1077 | page, |
1004 | index, count); | 1078 | index + node_offset, count); |
1005 | 1079 | ||
1006 | if (ret) | 1080 | if (ret) |
1007 | goto err; | 1081 | goto err; |
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1207 | j < bitmap->storage.file_pages | 1281 | j < bitmap->storage.file_pages |
1208 | && !test_bit(BITMAP_STALE, &bitmap->flags); | 1282 | && !test_bit(BITMAP_STALE, &bitmap->flags); |
1209 | j++) { | 1283 | j++) { |
1210 | |||
1211 | if (test_page_attr(bitmap, j, | 1284 | if (test_page_attr(bitmap, j, |
1212 | BITMAP_PAGE_DIRTY)) | 1285 | BITMAP_PAGE_DIRTY)) |
1213 | /* bitmap_unplug will handle the rest */ | 1286 | /* bitmap_unplug will handle the rest */ |
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1530 | return; | 1603 | return; |
1531 | } | 1604 | } |
1532 | if (!*bmc) { | 1605 | if (!*bmc) { |
1533 | *bmc = 2 | (needed ? NEEDED_MASK : 0); | 1606 | *bmc = 2; |
1534 | bitmap_count_page(&bitmap->counts, offset, 1); | 1607 | bitmap_count_page(&bitmap->counts, offset, 1); |
1535 | bitmap_set_pending(&bitmap->counts, offset); | 1608 | bitmap_set_pending(&bitmap->counts, offset); |
1536 | bitmap->allclean = 0; | 1609 | bitmap->allclean = 0; |
1537 | } | 1610 | } |
1611 | if (needed) | ||
1612 | *bmc |= NEEDED_MASK; | ||
1538 | spin_unlock_irq(&bitmap->counts.lock); | 1613 | spin_unlock_irq(&bitmap->counts.lock); |
1539 | } | 1614 | } |
1540 | 1615 | ||
@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap) | |||
1591 | if (!bitmap) /* there was no bitmap */ | 1666 | if (!bitmap) /* there was no bitmap */ |
1592 | return; | 1667 | return; |
1593 | 1668 | ||
1669 | if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && | ||
1670 | bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) | ||
1671 | md_cluster_stop(bitmap->mddev); | ||
1672 | |||
1594 | /* Shouldn't be needed - but just in case.... */ | 1673 | /* Shouldn't be needed - but just in case.... */ |
1595 | wait_event(bitmap->write_wait, | 1674 | wait_event(bitmap->write_wait, |
1596 | atomic_read(&bitmap->pending_writes) == 0); | 1675 | atomic_read(&bitmap->pending_writes) == 0); |
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev) | |||
1636 | * initialize the bitmap structure | 1715 | * initialize the bitmap structure |
1637 | * if this returns an error, bitmap_destroy must be called to do clean up | 1716 | * if this returns an error, bitmap_destroy must be called to do clean up |
1638 | */ | 1717 | */ |
1639 | int bitmap_create(struct mddev *mddev) | 1718 | struct bitmap *bitmap_create(struct mddev *mddev, int slot) |
1640 | { | 1719 | { |
1641 | struct bitmap *bitmap; | 1720 | struct bitmap *bitmap; |
1642 | sector_t blocks = mddev->resync_max_sectors; | 1721 | sector_t blocks = mddev->resync_max_sectors; |
@@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev) | |||
1650 | 1729 | ||
1651 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1730 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1652 | if (!bitmap) | 1731 | if (!bitmap) |
1653 | return -ENOMEM; | 1732 | return ERR_PTR(-ENOMEM); |
1654 | 1733 | ||
1655 | spin_lock_init(&bitmap->counts.lock); | 1734 | spin_lock_init(&bitmap->counts.lock); |
1656 | atomic_set(&bitmap->pending_writes, 0); | 1735 | atomic_set(&bitmap->pending_writes, 0); |
@@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev) | |||
1659 | init_waitqueue_head(&bitmap->behind_wait); | 1738 | init_waitqueue_head(&bitmap->behind_wait); |
1660 | 1739 | ||
1661 | bitmap->mddev = mddev; | 1740 | bitmap->mddev = mddev; |
1741 | bitmap->cluster_slot = slot; | ||
1662 | 1742 | ||
1663 | if (mddev->kobj.sd) | 1743 | if (mddev->kobj.sd) |
1664 | bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); | 1744 | bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); |
@@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev) | |||
1706 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1786 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", |
1707 | bitmap->counts.pages, bmname(bitmap)); | 1787 | bitmap->counts.pages, bmname(bitmap)); |
1708 | 1788 | ||
1709 | mddev->bitmap = bitmap; | 1789 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; |
1710 | return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | 1790 | if (err) |
1791 | goto error; | ||
1711 | 1792 | ||
1793 | return bitmap; | ||
1712 | error: | 1794 | error: |
1713 | bitmap_free(bitmap); | 1795 | bitmap_free(bitmap); |
1714 | return err; | 1796 | return ERR_PTR(err); |
1715 | } | 1797 | } |
1716 | 1798 | ||
1717 | int bitmap_load(struct mddev *mddev) | 1799 | int bitmap_load(struct mddev *mddev) |
@@ -1765,6 +1847,60 @@ out: | |||
1765 | } | 1847 | } |
1766 | EXPORT_SYMBOL_GPL(bitmap_load); | 1848 | EXPORT_SYMBOL_GPL(bitmap_load); |
1767 | 1849 | ||
1850 | /* Loads the bitmap associated with slot and copies the resync information | ||
1851 | * to our bitmap | ||
1852 | */ | ||
1853 | int bitmap_copy_from_slot(struct mddev *mddev, int slot, | ||
1854 | sector_t *low, sector_t *high, bool clear_bits) | ||
1855 | { | ||
1856 | int rv = 0, i, j; | ||
1857 | sector_t block, lo = 0, hi = 0; | ||
1858 | struct bitmap_counts *counts; | ||
1859 | struct bitmap *bitmap = bitmap_create(mddev, slot); | ||
1860 | |||
1861 | if (IS_ERR(bitmap)) | ||
1862 | return PTR_ERR(bitmap); | ||
1863 | |||
1864 | rv = bitmap_read_sb(bitmap); | ||
1865 | if (rv) | ||
1866 | goto err; | ||
1867 | |||
1868 | rv = bitmap_init_from_disk(bitmap, 0); | ||
1869 | if (rv) | ||
1870 | goto err; | ||
1871 | |||
1872 | counts = &bitmap->counts; | ||
1873 | for (j = 0; j < counts->chunks; j++) { | ||
1874 | block = (sector_t)j << counts->chunkshift; | ||
1875 | if (bitmap_file_test_bit(bitmap, block)) { | ||
1876 | if (!lo) | ||
1877 | lo = block; | ||
1878 | hi = block; | ||
1879 | bitmap_file_clear_bit(bitmap, block); | ||
1880 | bitmap_set_memory_bits(mddev->bitmap, block, 1); | ||
1881 | bitmap_file_set_bit(mddev->bitmap, block); | ||
1882 | } | ||
1883 | } | ||
1884 | |||
1885 | if (clear_bits) { | ||
1886 | bitmap_update_sb(bitmap); | ||
1887 | /* Setting this for the ev_page should be enough. | ||
1888 | * And we do not require both write_all and PAGE_DIRT either | ||
1889 | */ | ||
1890 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
1891 | set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
1892 | bitmap_write_all(bitmap); | ||
1893 | bitmap_unplug(bitmap); | ||
1894 | } | ||
1895 | *low = lo; | ||
1896 | *high = hi; | ||
1897 | err: | ||
1898 | bitmap_free(bitmap); | ||
1899 | return rv; | ||
1900 | } | ||
1901 | EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); | ||
1902 | |||
1903 | |||
1768 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) | 1904 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) |
1769 | { | 1905 | { |
1770 | unsigned long chunk_kb; | 1906 | unsigned long chunk_kb; |
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
1849 | memset(&store, 0, sizeof(store)); | 1985 | memset(&store, 0, sizeof(store)); |
1850 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) | 1986 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) |
1851 | ret = bitmap_storage_alloc(&store, chunks, | 1987 | ret = bitmap_storage_alloc(&store, chunks, |
1852 | !bitmap->mddev->bitmap_info.external); | 1988 | !bitmap->mddev->bitmap_info.external, |
1989 | bitmap->cluster_slot); | ||
1853 | if (ret) | 1990 | if (ret) |
1854 | goto err; | 1991 | goto err; |
1855 | 1992 | ||
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
2021 | return -EINVAL; | 2158 | return -EINVAL; |
2022 | mddev->bitmap_info.offset = offset; | 2159 | mddev->bitmap_info.offset = offset; |
2023 | if (mddev->pers) { | 2160 | if (mddev->pers) { |
2161 | struct bitmap *bitmap; | ||
2024 | mddev->pers->quiesce(mddev, 1); | 2162 | mddev->pers->quiesce(mddev, 1); |
2025 | rv = bitmap_create(mddev); | 2163 | bitmap = bitmap_create(mddev, -1); |
2026 | if (!rv) | 2164 | if (IS_ERR(bitmap)) |
2165 | rv = PTR_ERR(bitmap); | ||
2166 | else { | ||
2167 | mddev->bitmap = bitmap; | ||
2027 | rv = bitmap_load(mddev); | 2168 | rv = bitmap_load(mddev); |
2028 | if (rv) { | 2169 | if (rv) { |
2029 | bitmap_destroy(mddev); | 2170 | bitmap_destroy(mddev); |
2030 | mddev->bitmap_info.offset = 0; | 2171 | mddev->bitmap_info.offset = 0; |
2172 | } | ||
2031 | } | 2173 | } |
2032 | mddev->pers->quiesce(mddev, 0); | 2174 | mddev->pers->quiesce(mddev, 0); |
2033 | if (rv) | 2175 | if (rv) |
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); | |||
2186 | 2328 | ||
2187 | static ssize_t metadata_show(struct mddev *mddev, char *page) | 2329 | static ssize_t metadata_show(struct mddev *mddev, char *page) |
2188 | { | 2330 | { |
2331 | if (mddev_is_clustered(mddev)) | ||
2332 | return sprintf(page, "clustered\n"); | ||
2189 | return sprintf(page, "%s\n", (mddev->bitmap_info.external | 2333 | return sprintf(page, "%s\n", (mddev->bitmap_info.external |
2190 | ? "external" : "internal")); | 2334 | ? "external" : "internal")); |
2191 | } | 2335 | } |
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) | |||
2198 | return -EBUSY; | 2342 | return -EBUSY; |
2199 | if (strncmp(buf, "external", 8) == 0) | 2343 | if (strncmp(buf, "external", 8) == 0) |
2200 | mddev->bitmap_info.external = 1; | 2344 | mddev->bitmap_info.external = 1; |
2201 | else if (strncmp(buf, "internal", 8) == 0) | 2345 | else if ((strncmp(buf, "internal", 8) == 0) || |
2346 | (strncmp(buf, "clustered", 9) == 0)) | ||
2202 | mddev->bitmap_info.external = 0; | 2347 | mddev->bitmap_info.external = 0; |
2203 | else | 2348 | else |
2204 | return -EINVAL; | 2349 | return -EINVAL; |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 30210b9c4ef9..f1f4dd01090d 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s { | |||
130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ | 130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ |
131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are | 131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are |
132 | * reserved for the bitmap. */ | 132 | * reserved for the bitmap. */ |
133 | 133 | __le32 nodes; /* 68 the maximum number of nodes in cluster. */ | |
134 | __u8 pad[256 - 68]; /* set to zero */ | 134 | __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ |
135 | __u8 pad[256 - 136]; /* set to zero */ | ||
135 | } bitmap_super_t; | 136 | } bitmap_super_t; |
136 | 137 | ||
137 | /* notes: | 138 | /* notes: |
@@ -226,12 +227,13 @@ struct bitmap { | |||
226 | wait_queue_head_t behind_wait; | 227 | wait_queue_head_t behind_wait; |
227 | 228 | ||
228 | struct kernfs_node *sysfs_can_clear; | 229 | struct kernfs_node *sysfs_can_clear; |
230 | int cluster_slot; /* Slot offset for clustered env */ | ||
229 | }; | 231 | }; |
230 | 232 | ||
231 | /* the bitmap API */ | 233 | /* the bitmap API */ |
232 | 234 | ||
233 | /* these are used only by md/bitmap */ | 235 | /* these are used only by md/bitmap */ |
234 | int bitmap_create(struct mddev *mddev); | 236 | struct bitmap *bitmap_create(struct mddev *mddev, int slot); |
235 | int bitmap_load(struct mddev *mddev); | 237 | int bitmap_load(struct mddev *mddev); |
236 | void bitmap_flush(struct mddev *mddev); | 238 | void bitmap_flush(struct mddev *mddev); |
237 | void bitmap_destroy(struct mddev *mddev); | 239 | void bitmap_destroy(struct mddev *mddev); |
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev); | |||
260 | 262 | ||
261 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | 263 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, |
262 | int chunksize, int init); | 264 | int chunksize, int init); |
265 | int bitmap_copy_from_slot(struct mddev *mddev, int slot, | ||
266 | sector_t *lo, sector_t *hi, bool clear_bits); | ||
263 | #endif | 267 | #endif |
264 | 268 | ||
265 | #endif | 269 | #endif |
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c new file mode 100644 index 000000000000..fcfc4b9b2672 --- /dev/null +++ b/drivers/md/md-cluster.c | |||
@@ -0,0 +1,965 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2015, SUSE | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/dlm.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/raid/md_p.h> | ||
16 | #include "md.h" | ||
17 | #include "bitmap.h" | ||
18 | #include "md-cluster.h" | ||
19 | |||
20 | #define LVB_SIZE 64 | ||
21 | #define NEW_DEV_TIMEOUT 5000 | ||
22 | |||
23 | struct dlm_lock_resource { | ||
24 | dlm_lockspace_t *ls; | ||
25 | struct dlm_lksb lksb; | ||
26 | char *name; /* lock name. */ | ||
27 | uint32_t flags; /* flags to pass to dlm_lock() */ | ||
28 | struct completion completion; /* completion for synchronized locking */ | ||
29 | void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ | ||
30 | struct mddev *mddev; /* pointing back to mddev. */ | ||
31 | }; | ||
32 | |||
33 | struct suspend_info { | ||
34 | int slot; | ||
35 | sector_t lo; | ||
36 | sector_t hi; | ||
37 | struct list_head list; | ||
38 | }; | ||
39 | |||
40 | struct resync_info { | ||
41 | __le64 lo; | ||
42 | __le64 hi; | ||
43 | }; | ||
44 | |||
45 | /* md_cluster_info flags */ | ||
46 | #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 | ||
47 | |||
48 | |||
49 | struct md_cluster_info { | ||
50 | /* dlm lock space and resources for clustered raid. */ | ||
51 | dlm_lockspace_t *lockspace; | ||
52 | int slot_number; | ||
53 | struct completion completion; | ||
54 | struct dlm_lock_resource *sb_lock; | ||
55 | struct mutex sb_mutex; | ||
56 | struct dlm_lock_resource *bitmap_lockres; | ||
57 | struct list_head suspend_list; | ||
58 | spinlock_t suspend_lock; | ||
59 | struct md_thread *recovery_thread; | ||
60 | unsigned long recovery_map; | ||
61 | /* communication lock resources */ | ||
62 | struct dlm_lock_resource *ack_lockres; | ||
63 | struct dlm_lock_resource *message_lockres; | ||
64 | struct dlm_lock_resource *token_lockres; | ||
65 | struct dlm_lock_resource *no_new_dev_lockres; | ||
66 | struct md_thread *recv_thread; | ||
67 | struct completion newdisk_completion; | ||
68 | unsigned long state; | ||
69 | }; | ||
70 | |||
71 | enum msg_type { | ||
72 | METADATA_UPDATED = 0, | ||
73 | RESYNCING, | ||
74 | NEWDISK, | ||
75 | REMOVE, | ||
76 | RE_ADD, | ||
77 | }; | ||
78 | |||
79 | struct cluster_msg { | ||
80 | int type; | ||
81 | int slot; | ||
82 | /* TODO: Unionize this for smaller footprint */ | ||
83 | sector_t low; | ||
84 | sector_t high; | ||
85 | char uuid[16]; | ||
86 | int raid_slot; | ||
87 | }; | ||
88 | |||
89 | static void sync_ast(void *arg) | ||
90 | { | ||
91 | struct dlm_lock_resource *res; | ||
92 | |||
93 | res = (struct dlm_lock_resource *) arg; | ||
94 | complete(&res->completion); | ||
95 | } | ||
96 | |||
97 | static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) | ||
98 | { | ||
99 | int ret = 0; | ||
100 | |||
101 | init_completion(&res->completion); | ||
102 | ret = dlm_lock(res->ls, mode, &res->lksb, | ||
103 | res->flags, res->name, strlen(res->name), | ||
104 | 0, sync_ast, res, res->bast); | ||
105 | if (ret) | ||
106 | return ret; | ||
107 | wait_for_completion(&res->completion); | ||
108 | return res->lksb.sb_status; | ||
109 | } | ||
110 | |||
111 | static int dlm_unlock_sync(struct dlm_lock_resource *res) | ||
112 | { | ||
113 | return dlm_lock_sync(res, DLM_LOCK_NL); | ||
114 | } | ||
115 | |||
116 | static struct dlm_lock_resource *lockres_init(struct mddev *mddev, | ||
117 | char *name, void (*bastfn)(void *arg, int mode), int with_lvb) | ||
118 | { | ||
119 | struct dlm_lock_resource *res = NULL; | ||
120 | int ret, namelen; | ||
121 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
122 | |||
123 | res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); | ||
124 | if (!res) | ||
125 | return NULL; | ||
126 | res->ls = cinfo->lockspace; | ||
127 | res->mddev = mddev; | ||
128 | namelen = strlen(name); | ||
129 | res->name = kzalloc(namelen + 1, GFP_KERNEL); | ||
130 | if (!res->name) { | ||
131 | pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); | ||
132 | goto out_err; | ||
133 | } | ||
134 | strlcpy(res->name, name, namelen + 1); | ||
135 | if (with_lvb) { | ||
136 | res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); | ||
137 | if (!res->lksb.sb_lvbptr) { | ||
138 | pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); | ||
139 | goto out_err; | ||
140 | } | ||
141 | res->flags = DLM_LKF_VALBLK; | ||
142 | } | ||
143 | |||
144 | if (bastfn) | ||
145 | res->bast = bastfn; | ||
146 | |||
147 | res->flags |= DLM_LKF_EXPEDITE; | ||
148 | |||
149 | ret = dlm_lock_sync(res, DLM_LOCK_NL); | ||
150 | if (ret) { | ||
151 | pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); | ||
152 | goto out_err; | ||
153 | } | ||
154 | res->flags &= ~DLM_LKF_EXPEDITE; | ||
155 | res->flags |= DLM_LKF_CONVERT; | ||
156 | |||
157 | return res; | ||
158 | out_err: | ||
159 | kfree(res->lksb.sb_lvbptr); | ||
160 | kfree(res->name); | ||
161 | kfree(res); | ||
162 | return NULL; | ||
163 | } | ||
164 | |||
165 | static void lockres_free(struct dlm_lock_resource *res) | ||
166 | { | ||
167 | if (!res) | ||
168 | return; | ||
169 | |||
170 | init_completion(&res->completion); | ||
171 | dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); | ||
172 | wait_for_completion(&res->completion); | ||
173 | |||
174 | kfree(res->name); | ||
175 | kfree(res->lksb.sb_lvbptr); | ||
176 | kfree(res); | ||
177 | } | ||
178 | |||
179 | static char *pretty_uuid(char *dest, char *src) | ||
180 | { | ||
181 | int i, len = 0; | ||
182 | |||
183 | for (i = 0; i < 16; i++) { | ||
184 | if (i == 4 || i == 6 || i == 8 || i == 10) | ||
185 | len += sprintf(dest + len, "-"); | ||
186 | len += sprintf(dest + len, "%02x", (__u8)src[i]); | ||
187 | } | ||
188 | return dest; | ||
189 | } | ||
190 | |||
191 | static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, | ||
192 | sector_t lo, sector_t hi) | ||
193 | { | ||
194 | struct resync_info *ri; | ||
195 | |||
196 | ri = (struct resync_info *)lockres->lksb.sb_lvbptr; | ||
197 | ri->lo = cpu_to_le64(lo); | ||
198 | ri->hi = cpu_to_le64(hi); | ||
199 | } | ||
200 | |||
201 | static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) | ||
202 | { | ||
203 | struct resync_info ri; | ||
204 | struct suspend_info *s = NULL; | ||
205 | sector_t hi = 0; | ||
206 | |||
207 | dlm_lock_sync(lockres, DLM_LOCK_CR); | ||
208 | memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); | ||
209 | hi = le64_to_cpu(ri.hi); | ||
210 | if (ri.hi > 0) { | ||
211 | s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); | ||
212 | if (!s) | ||
213 | goto out; | ||
214 | s->hi = hi; | ||
215 | s->lo = le64_to_cpu(ri.lo); | ||
216 | } | ||
217 | dlm_unlock_sync(lockres); | ||
218 | out: | ||
219 | return s; | ||
220 | } | ||
221 | |||
222 | static void recover_bitmaps(struct md_thread *thread) | ||
223 | { | ||
224 | struct mddev *mddev = thread->mddev; | ||
225 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
226 | struct dlm_lock_resource *bm_lockres; | ||
227 | char str[64]; | ||
228 | int slot, ret; | ||
229 | struct suspend_info *s, *tmp; | ||
230 | sector_t lo, hi; | ||
231 | |||
232 | while (cinfo->recovery_map) { | ||
233 | slot = fls64((u64)cinfo->recovery_map) - 1; | ||
234 | |||
235 | /* Clear suspend_area associated with the bitmap */ | ||
236 | spin_lock_irq(&cinfo->suspend_lock); | ||
237 | list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) | ||
238 | if (slot == s->slot) { | ||
239 | list_del(&s->list); | ||
240 | kfree(s); | ||
241 | } | ||
242 | spin_unlock_irq(&cinfo->suspend_lock); | ||
243 | |||
244 | snprintf(str, 64, "bitmap%04d", slot); | ||
245 | bm_lockres = lockres_init(mddev, str, NULL, 1); | ||
246 | if (!bm_lockres) { | ||
247 | pr_err("md-cluster: Cannot initialize bitmaps\n"); | ||
248 | goto clear_bit; | ||
249 | } | ||
250 | |||
251 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | ||
252 | if (ret) { | ||
253 | pr_err("md-cluster: Could not DLM lock %s: %d\n", | ||
254 | str, ret); | ||
255 | goto clear_bit; | ||
256 | } | ||
257 | ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); | ||
258 | if (ret) { | ||
259 | pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); | ||
260 | goto dlm_unlock; | ||
261 | } | ||
262 | if (hi > 0) { | ||
263 | /* TODO:Wait for current resync to get over */ | ||
264 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
265 | if (lo < mddev->recovery_cp) | ||
266 | mddev->recovery_cp = lo; | ||
267 | md_check_recovery(mddev); | ||
268 | } | ||
269 | dlm_unlock: | ||
270 | dlm_unlock_sync(bm_lockres); | ||
271 | clear_bit: | ||
272 | clear_bit(slot, &cinfo->recovery_map); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | static void recover_prep(void *arg) | ||
277 | { | ||
278 | } | ||
279 | |||
280 | static void recover_slot(void *arg, struct dlm_slot *slot) | ||
281 | { | ||
282 | struct mddev *mddev = arg; | ||
283 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
284 | |||
285 | pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", | ||
286 | mddev->bitmap_info.cluster_name, | ||
287 | slot->nodeid, slot->slot, | ||
288 | cinfo->slot_number); | ||
289 | set_bit(slot->slot - 1, &cinfo->recovery_map); | ||
290 | if (!cinfo->recovery_thread) { | ||
291 | cinfo->recovery_thread = md_register_thread(recover_bitmaps, | ||
292 | mddev, "recover"); | ||
293 | if (!cinfo->recovery_thread) { | ||
294 | pr_warn("md-cluster: Could not create recovery thread\n"); | ||
295 | return; | ||
296 | } | ||
297 | } | ||
298 | md_wakeup_thread(cinfo->recovery_thread); | ||
299 | } | ||
300 | |||
301 | static void recover_done(void *arg, struct dlm_slot *slots, | ||
302 | int num_slots, int our_slot, | ||
303 | uint32_t generation) | ||
304 | { | ||
305 | struct mddev *mddev = arg; | ||
306 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
307 | |||
308 | cinfo->slot_number = our_slot; | ||
309 | complete(&cinfo->completion); | ||
310 | } | ||
311 | |||
312 | static const struct dlm_lockspace_ops md_ls_ops = { | ||
313 | .recover_prep = recover_prep, | ||
314 | .recover_slot = recover_slot, | ||
315 | .recover_done = recover_done, | ||
316 | }; | ||
317 | |||
318 | /* | ||
319 | * The BAST function for the ack lock resource | ||
320 | * This function wakes up the receive thread in | ||
321 | * order to receive and process the message. | ||
322 | */ | ||
323 | static void ack_bast(void *arg, int mode) | ||
324 | { | ||
325 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; | ||
326 | struct md_cluster_info *cinfo = res->mddev->cluster_info; | ||
327 | |||
328 | if (mode == DLM_LOCK_EX) | ||
329 | md_wakeup_thread(cinfo->recv_thread); | ||
330 | } | ||
331 | |||
332 | static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) | ||
333 | { | ||
334 | struct suspend_info *s, *tmp; | ||
335 | |||
336 | list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) | ||
337 | if (slot == s->slot) { | ||
338 | pr_info("%s:%d Deleting suspend_info: %d\n", | ||
339 | __func__, __LINE__, slot); | ||
340 | list_del(&s->list); | ||
341 | kfree(s); | ||
342 | break; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) | ||
347 | { | ||
348 | spin_lock_irq(&cinfo->suspend_lock); | ||
349 | __remove_suspend_info(cinfo, slot); | ||
350 | spin_unlock_irq(&cinfo->suspend_lock); | ||
351 | } | ||
352 | |||
353 | |||
354 | static void process_suspend_info(struct md_cluster_info *cinfo, | ||
355 | int slot, sector_t lo, sector_t hi) | ||
356 | { | ||
357 | struct suspend_info *s; | ||
358 | |||
359 | if (!hi) { | ||
360 | remove_suspend_info(cinfo, slot); | ||
361 | return; | ||
362 | } | ||
363 | s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); | ||
364 | if (!s) | ||
365 | return; | ||
366 | s->slot = slot; | ||
367 | s->lo = lo; | ||
368 | s->hi = hi; | ||
369 | spin_lock_irq(&cinfo->suspend_lock); | ||
370 | /* Remove existing entry (if exists) before adding */ | ||
371 | __remove_suspend_info(cinfo, slot); | ||
372 | list_add(&s->list, &cinfo->suspend_list); | ||
373 | spin_unlock_irq(&cinfo->suspend_lock); | ||
374 | } | ||
375 | |||
376 | static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) | ||
377 | { | ||
378 | char disk_uuid[64]; | ||
379 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
380 | char event_name[] = "EVENT=ADD_DEVICE"; | ||
381 | char raid_slot[16]; | ||
382 | char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; | ||
383 | int len; | ||
384 | |||
385 | len = snprintf(disk_uuid, 64, "DEVICE_UUID="); | ||
386 | pretty_uuid(disk_uuid + len, cmsg->uuid); | ||
387 | snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); | ||
388 | pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); | ||
389 | init_completion(&cinfo->newdisk_completion); | ||
390 | set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); | ||
391 | kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); | ||
392 | wait_for_completion_timeout(&cinfo->newdisk_completion, | ||
393 | NEW_DEV_TIMEOUT); | ||
394 | clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); | ||
395 | } | ||
396 | |||
397 | |||
398 | static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) | ||
399 | { | ||
400 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
401 | |||
402 | md_reload_sb(mddev); | ||
403 | dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); | ||
404 | } | ||
405 | |||
406 | static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) | ||
407 | { | ||
408 | struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); | ||
409 | |||
410 | if (rdev) | ||
411 | md_kick_rdev_from_array(rdev); | ||
412 | else | ||
413 | pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); | ||
414 | } | ||
415 | |||
416 | static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) | ||
417 | { | ||
418 | struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); | ||
419 | |||
420 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
421 | clear_bit(Faulty, &rdev->flags); | ||
422 | else | ||
423 | pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); | ||
424 | } | ||
425 | |||
426 | static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) | ||
427 | { | ||
428 | switch (msg->type) { | ||
429 | case METADATA_UPDATED: | ||
430 | pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", | ||
431 | __func__, __LINE__, msg->slot); | ||
432 | process_metadata_update(mddev, msg); | ||
433 | break; | ||
434 | case RESYNCING: | ||
435 | pr_info("%s: %d Received message: RESYNCING from %d\n", | ||
436 | __func__, __LINE__, msg->slot); | ||
437 | process_suspend_info(mddev->cluster_info, msg->slot, | ||
438 | msg->low, msg->high); | ||
439 | break; | ||
440 | case NEWDISK: | ||
441 | pr_info("%s: %d Received message: NEWDISK from %d\n", | ||
442 | __func__, __LINE__, msg->slot); | ||
443 | process_add_new_disk(mddev, msg); | ||
444 | break; | ||
445 | case REMOVE: | ||
446 | pr_info("%s: %d Received REMOVE from %d\n", | ||
447 | __func__, __LINE__, msg->slot); | ||
448 | process_remove_disk(mddev, msg); | ||
449 | break; | ||
450 | case RE_ADD: | ||
451 | pr_info("%s: %d Received RE_ADD from %d\n", | ||
452 | __func__, __LINE__, msg->slot); | ||
453 | process_readd_disk(mddev, msg); | ||
454 | break; | ||
455 | default: | ||
456 | pr_warn("%s:%d Received unknown message from %d\n", | ||
457 | __func__, __LINE__, msg->slot); | ||
458 | } | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * thread for receiving message | ||
463 | */ | ||
464 | static void recv_daemon(struct md_thread *thread) | ||
465 | { | ||
466 | struct md_cluster_info *cinfo = thread->mddev->cluster_info; | ||
467 | struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; | ||
468 | struct dlm_lock_resource *message_lockres = cinfo->message_lockres; | ||
469 | struct cluster_msg msg; | ||
470 | |||
471 | /*get CR on Message*/ | ||
472 | if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { | ||
473 | pr_err("md/raid1:failed to get CR on MESSAGE\n"); | ||
474 | return; | ||
475 | } | ||
476 | |||
477 | /* read lvb and wake up thread to process this message_lockres */ | ||
478 | memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); | ||
479 | process_recvd_msg(thread->mddev, &msg); | ||
480 | |||
481 | /*release CR on ack_lockres*/ | ||
482 | dlm_unlock_sync(ack_lockres); | ||
483 | /*up-convert to EX on message_lockres*/ | ||
484 | dlm_lock_sync(message_lockres, DLM_LOCK_EX); | ||
485 | /*get CR on ack_lockres again*/ | ||
486 | dlm_lock_sync(ack_lockres, DLM_LOCK_CR); | ||
487 | /*release CR on message_lockres*/ | ||
488 | dlm_unlock_sync(message_lockres); | ||
489 | } | ||
490 | |||
491 | /* lock_comm() | ||
492 | * Takes the lock on the TOKEN lock resource so no other | ||
493 | * node can communicate while the operation is underway. | ||
494 | */ | ||
495 | static int lock_comm(struct md_cluster_info *cinfo) | ||
496 | { | ||
497 | int error; | ||
498 | |||
499 | error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); | ||
500 | if (error) | ||
501 | pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", | ||
502 | __func__, __LINE__, error); | ||
503 | return error; | ||
504 | } | ||
505 | |||
506 | static void unlock_comm(struct md_cluster_info *cinfo) | ||
507 | { | ||
508 | dlm_unlock_sync(cinfo->token_lockres); | ||
509 | } | ||
510 | |||
511 | /* __sendmsg() | ||
512 | * This function performs the actual sending of the message. It is | ||
513 | * usually called after performing the encompassing operation. | ||
514 | * The function: | ||
515 | * 1. Grabs the message lock resource in EX mode | ||
516 | * 2. Copies the message to the message LVB | ||
517 | * 3. Down-converts the message lock resource to CR | ||
518 | * 4. Up-converts the ack lock resource from CR to EX. This forces the BAST on the | ||
519 | * other nodes, which then read the message. The sending thread waits here until all | ||
520 | * other nodes have released their CR on the ack lock resource. | ||
521 | * 5. Down-converts the ack lock resource back to CR | ||
522 | */ | ||
523 | static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) | ||
524 | { | ||
525 | int error; | ||
526 | int slot = cinfo->slot_number - 1; | ||
527 | |||
528 | cmsg->slot = cpu_to_le32(slot); | ||
529 | /*get EX on Message*/ | ||
530 | error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); | ||
531 | if (error) { | ||
532 | pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); | ||
533 | goto failed_message; | ||
534 | } | ||
535 | |||
536 | memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, | ||
537 | sizeof(struct cluster_msg)); | ||
538 | /*down-convert EX to CR on Message*/ | ||
539 | error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); | ||
540 | if (error) { | ||
541 | pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", | ||
542 | error); | ||
543 | goto failed_message; | ||
544 | } | ||
545 | |||
546 | /*up-convert CR to EX on Ack*/ | ||
547 | error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); | ||
548 | if (error) { | ||
549 | pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", | ||
550 | error); | ||
551 | goto failed_ack; | ||
552 | } | ||
553 | |||
554 | /*down-convert EX to CR on Ack*/ | ||
555 | error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); | ||
556 | if (error) { | ||
557 | pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", | ||
558 | error); | ||
559 | goto failed_ack; | ||
560 | } | ||
561 | |||
562 | failed_ack: | ||
563 | dlm_unlock_sync(cinfo->message_lockres); | ||
564 | failed_message: | ||
565 | return error; | ||
566 | } | ||
567 | |||
568 | static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) | ||
569 | { | ||
570 | int ret; | ||
571 | |||
572 | lock_comm(cinfo); | ||
573 | ret = __sendmsg(cinfo, cmsg); | ||
574 | unlock_comm(cinfo); | ||
575 | return ret; | ||
576 | } | ||
577 | |||
578 | static int gather_all_resync_info(struct mddev *mddev, int total_slots) | ||
579 | { | ||
580 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
581 | int i, ret = 0; | ||
582 | struct dlm_lock_resource *bm_lockres; | ||
583 | struct suspend_info *s; | ||
584 | char str[64]; | ||
585 | |||
586 | |||
587 | for (i = 0; i < total_slots; i++) { | ||
588 | memset(str, '\0', 64); | ||
589 | snprintf(str, 64, "bitmap%04d", i); | ||
590 | bm_lockres = lockres_init(mddev, str, NULL, 1); | ||
591 | if (!bm_lockres) | ||
592 | return -ENOMEM; | ||
593 | if (i == (cinfo->slot_number - 1)) | ||
594 | continue; | ||
595 | |||
596 | bm_lockres->flags |= DLM_LKF_NOQUEUE; | ||
597 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | ||
598 | if (ret == -EAGAIN) { | ||
599 | memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); | ||
600 | s = read_resync_info(mddev, bm_lockres); | ||
601 | if (s) { | ||
602 | pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", | ||
603 | __func__, __LINE__, | ||
604 | (unsigned long long) s->lo, | ||
605 | (unsigned long long) s->hi, i); | ||
606 | spin_lock_irq(&cinfo->suspend_lock); | ||
607 | s->slot = i; | ||
608 | list_add(&s->list, &cinfo->suspend_list); | ||
609 | spin_unlock_irq(&cinfo->suspend_lock); | ||
610 | } | ||
611 | ret = 0; | ||
612 | lockres_free(bm_lockres); | ||
613 | continue; | ||
614 | } | ||
615 | if (ret) | ||
616 | goto out; | ||
617 | /* TODO: Read the disk bitmap sb and check if it needs recovery */ | ||
618 | dlm_unlock_sync(bm_lockres); | ||
619 | lockres_free(bm_lockres); | ||
620 | } | ||
621 | out: | ||
622 | return ret; | ||
623 | } | ||
624 | |||
625 | static int join(struct mddev *mddev, int nodes) | ||
626 | { | ||
627 | struct md_cluster_info *cinfo; | ||
628 | int ret, ops_rv; | ||
629 | char str[64]; | ||
630 | |||
631 | if (!try_module_get(THIS_MODULE)) | ||
632 | return -ENOENT; | ||
633 | |||
634 | cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); | ||
635 | if (!cinfo) | ||
636 | return -ENOMEM; | ||
637 | |||
638 | init_completion(&cinfo->completion); | ||
639 | |||
640 | mutex_init(&cinfo->sb_mutex); | ||
641 | mddev->cluster_info = cinfo; | ||
642 | |||
643 | memset(str, 0, 64); | ||
644 | pretty_uuid(str, mddev->uuid); | ||
645 | ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, | ||
646 | DLM_LSFL_FS, LVB_SIZE, | ||
647 | &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); | ||
648 | if (ret) | ||
649 | goto err; | ||
650 | wait_for_completion(&cinfo->completion); | ||
651 | if (nodes < cinfo->slot_number) { | ||
652 | pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).\n", | ||
653 | cinfo->slot_number, nodes); | ||
654 | ret = -ERANGE; | ||
655 | goto err; | ||
656 | } | ||
657 | cinfo->sb_lock = lockres_init(mddev, "cmd-super", | ||
658 | NULL, 0); | ||
659 | if (!cinfo->sb_lock) { | ||
660 | ret = -ENOMEM; | ||
661 | goto err; | ||
662 | } | ||
663 | /* Initiate the communication resources */ | ||
664 | ret = -ENOMEM; | ||
665 | cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); | ||
666 | if (!cinfo->recv_thread) { | ||
667 | pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); | ||
668 | goto err; | ||
669 | } | ||
670 | cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1); | ||
671 | if (!cinfo->message_lockres) | ||
672 | goto err; | ||
673 | cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); | ||
674 | if (!cinfo->token_lockres) | ||
675 | goto err; | ||
676 | cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); | ||
677 | if (!cinfo->ack_lockres) | ||
678 | goto err; | ||
679 | cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); | ||
680 | if (!cinfo->no_new_dev_lockres) | ||
681 | goto err; | ||
682 | |||
683 | /* get sync CR lock on ACK. */ | ||
684 | if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) | ||
685 | pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", | ||
686 | ret); | ||
687 | /* get sync CR lock on no-new-dev. */ | ||
688 | if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) | ||
689 | pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); | ||
690 | |||
691 | |||
692 | pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); | ||
693 | snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); | ||
694 | cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); | ||
695 | if (!cinfo->bitmap_lockres) | ||
696 | goto err; | ||
697 | if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { | ||
698 | pr_err("Failed to get bitmap lock\n"); | ||
699 | ret = -EINVAL; | ||
700 | goto err; | ||
701 | } | ||
702 | |||
703 | INIT_LIST_HEAD(&cinfo->suspend_list); | ||
704 | spin_lock_init(&cinfo->suspend_lock); | ||
705 | |||
706 | ret = gather_all_resync_info(mddev, nodes); | ||
707 | if (ret) | ||
708 | goto err; | ||
709 | |||
710 | return 0; | ||
711 | err: | ||
712 | lockres_free(cinfo->message_lockres); | ||
713 | lockres_free(cinfo->token_lockres); | ||
714 | lockres_free(cinfo->ack_lockres); | ||
715 | lockres_free(cinfo->no_new_dev_lockres); | ||
716 | lockres_free(cinfo->bitmap_lockres); | ||
717 | lockres_free(cinfo->sb_lock); | ||
718 | if (cinfo->lockspace) | ||
719 | dlm_release_lockspace(cinfo->lockspace, 2); | ||
720 | mddev->cluster_info = NULL; | ||
721 | kfree(cinfo); | ||
722 | module_put(THIS_MODULE); | ||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | static int leave(struct mddev *mddev) | ||
727 | { | ||
728 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
729 | |||
730 | if (!cinfo) | ||
731 | return 0; | ||
732 | md_unregister_thread(&cinfo->recovery_thread); | ||
733 | md_unregister_thread(&cinfo->recv_thread); | ||
734 | lockres_free(cinfo->message_lockres); | ||
735 | lockres_free(cinfo->token_lockres); | ||
736 | lockres_free(cinfo->ack_lockres); | ||
737 | lockres_free(cinfo->no_new_dev_lockres); | ||
738 | lockres_free(cinfo->sb_lock); | ||
739 | lockres_free(cinfo->bitmap_lockres); | ||
740 | dlm_release_lockspace(cinfo->lockspace, 2); | ||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | /* slot_number(): Returns the MD slot number to use | ||
745 | * DLM starts the slot numbers from 1, whereas cluster-md | ||
746 | * wants the number to start from zero, so we subtract one | ||
747 | */ | ||
748 | static int slot_number(struct mddev *mddev) | ||
749 | { | ||
750 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
751 | |||
752 | return cinfo->slot_number - 1; | ||
753 | } | ||
754 | |||
755 | static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) | ||
756 | { | ||
757 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
758 | |||
759 | add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); | ||
760 | /* Re-acquire the lock to refresh LVB */ | ||
761 | dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); | ||
762 | } | ||
763 | |||
764 | static int metadata_update_start(struct mddev *mddev) | ||
765 | { | ||
766 | return lock_comm(mddev->cluster_info); | ||
767 | } | ||
768 | |||
769 | static int metadata_update_finish(struct mddev *mddev) | ||
770 | { | ||
771 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
772 | struct cluster_msg cmsg; | ||
773 | int ret; | ||
774 | |||
775 | memset(&cmsg, 0, sizeof(cmsg)); | ||
776 | cmsg.type = cpu_to_le32(METADATA_UPDATED); | ||
777 | ret = __sendmsg(cinfo, &cmsg); | ||
778 | unlock_comm(cinfo); | ||
779 | return ret; | ||
780 | } | ||
781 | |||
782 | static int metadata_update_cancel(struct mddev *mddev) | ||
783 | { | ||
784 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
785 | |||
786 | return dlm_unlock_sync(cinfo->token_lockres); | ||
787 | } | ||
788 | |||
789 | static int resync_send(struct mddev *mddev, enum msg_type type, | ||
790 | sector_t lo, sector_t hi) | ||
791 | { | ||
792 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
793 | struct cluster_msg cmsg; | ||
794 | int slot = cinfo->slot_number - 1; | ||
795 | |||
796 | pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, | ||
797 | (unsigned long long)lo, | ||
798 | (unsigned long long)hi); | ||
799 | resync_info_update(mddev, lo, hi); | ||
800 | cmsg.type = cpu_to_le32(type); | ||
801 | cmsg.slot = cpu_to_le32(slot); | ||
802 | cmsg.low = cpu_to_le64(lo); | ||
803 | cmsg.high = cpu_to_le64(hi); | ||
804 | return sendmsg(cinfo, &cmsg); | ||
805 | } | ||
806 | |||
807 | static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) | ||
808 | { | ||
809 | pr_info("%s:%d\n", __func__, __LINE__); | ||
810 | return resync_send(mddev, RESYNCING, lo, hi); | ||
811 | } | ||
812 | |||
813 | static void resync_finish(struct mddev *mddev) | ||
814 | { | ||
815 | pr_info("%s:%d\n", __func__, __LINE__); | ||
816 | resync_send(mddev, RESYNCING, 0, 0); | ||
817 | } | ||
818 | |||
819 | static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) | ||
820 | { | ||
821 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
822 | int ret = 0; | ||
823 | struct suspend_info *s; | ||
824 | |||
825 | spin_lock_irq(&cinfo->suspend_lock); | ||
826 | if (list_empty(&cinfo->suspend_list)) | ||
827 | goto out; | ||
828 | list_for_each_entry(s, &cinfo->suspend_list, list) | ||
829 | if (hi > s->lo && lo < s->hi) { | ||
830 | ret = 1; | ||
831 | break; | ||
832 | } | ||
833 | out: | ||
834 | spin_unlock_irq(&cinfo->suspend_lock); | ||
835 | return ret; | ||
836 | } | ||
837 | |||
838 | static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) | ||
839 | { | ||
840 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
841 | struct cluster_msg cmsg; | ||
842 | int ret = 0; | ||
843 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); | ||
844 | char *uuid = sb->device_uuid; | ||
845 | |||
846 | memset(&cmsg, 0, sizeof(cmsg)); | ||
847 | cmsg.type = cpu_to_le32(NEWDISK); | ||
848 | memcpy(cmsg.uuid, uuid, 16); | ||
849 | cmsg.raid_slot = rdev->desc_nr; | ||
850 | lock_comm(cinfo); | ||
851 | ret = __sendmsg(cinfo, &cmsg); | ||
852 | if (ret) | ||
853 | return ret; | ||
854 | cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; | ||
855 | ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); | ||
856 | cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; | ||
857 | /* Some node does not "see" the device */ | ||
858 | if (ret == -EAGAIN) | ||
859 | ret = -ENOENT; | ||
860 | else | ||
861 | dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); | ||
862 | return ret; | ||
863 | } | ||
864 | |||
865 | static int add_new_disk_finish(struct mddev *mddev) | ||
866 | { | ||
867 | struct cluster_msg cmsg; | ||
868 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
869 | int ret; | ||
870 | /* Write sb and inform others */ | ||
871 | md_update_sb(mddev, 1); | ||
872 | cmsg.type = METADATA_UPDATED; | ||
873 | ret = __sendmsg(cinfo, &cmsg); | ||
874 | unlock_comm(cinfo); | ||
875 | return ret; | ||
876 | } | ||
877 | |||
878 | static int new_disk_ack(struct mddev *mddev, bool ack) | ||
879 | { | ||
880 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
881 | |||
882 | if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { | ||
883 | pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); | ||
884 | return -EINVAL; | ||
885 | } | ||
886 | |||
887 | if (ack) | ||
888 | dlm_unlock_sync(cinfo->no_new_dev_lockres); | ||
889 | complete(&cinfo->newdisk_completion); | ||
890 | return 0; | ||
891 | } | ||
892 | |||
893 | static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) | ||
894 | { | ||
895 | struct cluster_msg cmsg; | ||
896 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
897 | cmsg.type = REMOVE; | ||
898 | cmsg.raid_slot = rdev->desc_nr; | ||
899 | return __sendmsg(cinfo, &cmsg); | ||
900 | } | ||
901 | |||
902 | static int gather_bitmaps(struct md_rdev *rdev) | ||
903 | { | ||
904 | int sn, err; | ||
905 | sector_t lo, hi; | ||
906 | struct cluster_msg cmsg; | ||
907 | struct mddev *mddev = rdev->mddev; | ||
908 | struct md_cluster_info *cinfo = mddev->cluster_info; | ||
909 | |||
910 | cmsg.type = RE_ADD; | ||
911 | cmsg.raid_slot = rdev->desc_nr; | ||
912 | err = sendmsg(cinfo, &cmsg); | ||
913 | if (err) | ||
914 | goto out; | ||
915 | |||
916 | for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { | ||
917 | if (sn == (cinfo->slot_number - 1)) | ||
918 | continue; | ||
919 | err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); | ||
920 | if (err) { | ||
921 | pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", sn); | ||
922 | goto out; | ||
923 | } | ||
924 | if ((hi > 0) && (lo < mddev->recovery_cp)) | ||
925 | mddev->recovery_cp = lo; | ||
926 | } | ||
927 | out: | ||
928 | return err; | ||
929 | } | ||
930 | |||
931 | static struct md_cluster_operations cluster_ops = { | ||
932 | .join = join, | ||
933 | .leave = leave, | ||
934 | .slot_number = slot_number, | ||
935 | .resync_info_update = resync_info_update, | ||
936 | .resync_start = resync_start, | ||
937 | .resync_finish = resync_finish, | ||
938 | .metadata_update_start = metadata_update_start, | ||
939 | .metadata_update_finish = metadata_update_finish, | ||
940 | .metadata_update_cancel = metadata_update_cancel, | ||
941 | .area_resyncing = area_resyncing, | ||
942 | .add_new_disk_start = add_new_disk_start, | ||
943 | .add_new_disk_finish = add_new_disk_finish, | ||
944 | .new_disk_ack = new_disk_ack, | ||
945 | .remove_disk = remove_disk, | ||
946 | .gather_bitmaps = gather_bitmaps, | ||
947 | }; | ||
948 | |||
949 | static int __init cluster_init(void) | ||
950 | { | ||
951 | pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); | ||
952 | pr_info("Registering Cluster MD functions\n"); | ||
953 | register_md_cluster_operations(&cluster_ops, THIS_MODULE); | ||
954 | return 0; | ||
955 | } | ||
956 | |||
957 | static void cluster_exit(void) | ||
958 | { | ||
959 | unregister_md_cluster_operations(); | ||
960 | } | ||
961 | |||
962 | module_init(cluster_init); | ||
963 | module_exit(cluster_exit); | ||
964 | MODULE_LICENSE("GPL"); | ||
965 | MODULE_DESCRIPTION("Clustering support for MD"); | ||
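
The comment above __sendmsg() describes the messaging protocol step by step; the sketch below restates one complete send, as seen from the sending node, as a single sequence. It is a reading aid only: it reuses the file-local helpers (dlm_lock_sync(), dlm_unlock_sync()) and md_cluster_info fields shown above, the function name is made up, and the error handling performed by the real lock_comm()/__sendmsg()/unlock_comm() path is elided.

    /*
     * Reading aid only: one complete message send from the sending node's
     * point of view, with error handling elided.  The real path is
     * lock_comm() + __sendmsg() + unlock_comm() above.
     */
    static void send_one_message(struct md_cluster_info *cinfo,
                                 struct cluster_msg *cmsg)
    {
            /* Serialise senders cluster-wide: EX on TOKEN (lock_comm()). */
            dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);

            /* Steps 1-3: take EX on MESSAGE, fill its LVB, then down-convert
             * to CR so the receivers can read it.
             */
            cmsg->slot = cpu_to_le32(cinfo->slot_number - 1);
            dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
            memcpy(cinfo->message_lockres->lksb.sb_lvbptr, cmsg, sizeof(*cmsg));
            dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);

            /* Steps 4-5: converting CR -> EX on ACK fires the BAST on every
             * other node; the conversion is granted only after they have all
             * dropped their CR on ACK, i.e. consumed the message.  Then fall
             * back to CR so the next sender can do the same.
             */
            dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
            dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);

            /* Release MESSAGE and TOKEN (unlock_comm()). */
            dlm_unlock_sync(cinfo->message_lockres);
            dlm_unlock_sync(cinfo->token_lockres);
    }
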
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h new file mode 100644 index 000000000000..6817ee00e053 --- /dev/null +++ b/drivers/md/md-cluster.h | |||
@@ -0,0 +1,29 @@ | |||
1 | |||
2 | |||
3 | #ifndef _MD_CLUSTER_H | ||
4 | #define _MD_CLUSTER_H | ||
5 | |||
6 | #include "md.h" | ||
7 | |||
8 | struct mddev; | ||
9 | struct md_rdev; | ||
10 | |||
11 | struct md_cluster_operations { | ||
12 | int (*join)(struct mddev *mddev, int nodes); | ||
13 | int (*leave)(struct mddev *mddev); | ||
14 | int (*slot_number)(struct mddev *mddev); | ||
15 | void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
16 | int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
17 | void (*resync_finish)(struct mddev *mddev); | ||
18 | int (*metadata_update_start)(struct mddev *mddev); | ||
19 | int (*metadata_update_finish)(struct mddev *mddev); | ||
20 | int (*metadata_update_cancel)(struct mddev *mddev); | ||
21 | int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); | ||
22 | int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); | ||
23 | int (*add_new_disk_finish)(struct mddev *mddev); | ||
24 | int (*new_disk_ack)(struct mddev *mddev, bool ack); | ||
25 | int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); | ||
26 | int (*gather_bitmaps)(struct md_rdev *rdev); | ||
27 | }; | ||
28 | |||
29 | #endif /* _MD_CLUSTER_H */ | ||
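
These operations are driven from the md core. The md.c hunks that follow wrap every superblock write with metadata_update_start()/metadata_update_finish(), call metadata_update_cancel() on error paths, and always guard the calls with mddev_is_clustered(). A minimal sketch of that calling pattern; the helper name is made up for illustration, the real call sites are spread through md.c below:

    /* Illustrative sketch only; "clustered_sb_update" is not a real function. */
    static void clustered_sb_update(struct mddev *mddev)
    {
            if (mddev_is_clustered(mddev))
                    md_cluster_ops->metadata_update_start(mddev);  /* takes the TOKEN lock */

            md_update_sb(mddev, 1);

            if (mddev_is_clustered(mddev))
                    md_cluster_ops->metadata_update_finish(mddev); /* sends METADATA_UPDATED, drops TOKEN */
    }
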
diff --git a/drivers/md/md.c b/drivers/md/md.c index e6178787ce3d..d4f31e195e26 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
54 | #include "md.h" | 54 | #include "md.h" |
55 | #include "bitmap.h" | 55 | #include "bitmap.h" |
56 | #include "md-cluster.h" | ||
56 | 57 | ||
57 | #ifndef MODULE | 58 | #ifndef MODULE |
58 | static void autostart_arrays(int part); | 59 | static void autostart_arrays(int part); |
@@ -66,6 +67,11 @@ static void autostart_arrays(int part); | |||
66 | static LIST_HEAD(pers_list); | 67 | static LIST_HEAD(pers_list); |
67 | static DEFINE_SPINLOCK(pers_lock); | 68 | static DEFINE_SPINLOCK(pers_lock); |
68 | 69 | ||
70 | struct md_cluster_operations *md_cluster_ops; | ||
71 | EXPORT_SYMBOL(md_cluster_ops); | ||
72 | struct module *md_cluster_mod; | ||
73 | EXPORT_SYMBOL(md_cluster_mod); | ||
74 | |||
69 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | 75 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
70 | static struct workqueue_struct *md_wq; | 76 | static struct workqueue_struct *md_wq; |
71 | static struct workqueue_struct *md_misc_wq; | 77 | static struct workqueue_struct *md_misc_wq; |
@@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev) | |||
640 | } | 646 | } |
641 | EXPORT_SYMBOL_GPL(mddev_unlock); | 647 | EXPORT_SYMBOL_GPL(mddev_unlock); |
642 | 648 | ||
643 | static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) | 649 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
644 | { | 650 | { |
645 | struct md_rdev *rdev; | 651 | struct md_rdev *rdev; |
646 | 652 | ||
@@ -650,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) | |||
650 | 656 | ||
651 | return NULL; | 657 | return NULL; |
652 | } | 658 | } |
659 | EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); | ||
653 | 660 | ||
654 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) | 661 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
655 | { | 662 | { |
@@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
2047 | int choice = 0; | 2054 | int choice = 0; |
2048 | if (mddev->pers) | 2055 | if (mddev->pers) |
2049 | choice = mddev->raid_disks; | 2056 | choice = mddev->raid_disks; |
2050 | while (find_rdev_nr_rcu(mddev, choice)) | 2057 | while (md_find_rdev_nr_rcu(mddev, choice)) |
2051 | choice++; | 2058 | choice++; |
2052 | rdev->desc_nr = choice; | 2059 | rdev->desc_nr = choice; |
2053 | } else { | 2060 | } else { |
2054 | if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { | 2061 | if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { |
2055 | rcu_read_unlock(); | 2062 | rcu_read_unlock(); |
2056 | return -EBUSY; | 2063 | return -EBUSY; |
2057 | } | 2064 | } |
@@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev) | |||
2166 | kobject_put(&rdev->kobj); | 2173 | kobject_put(&rdev->kobj); |
2167 | } | 2174 | } |
2168 | 2175 | ||
2169 | static void kick_rdev_from_array(struct md_rdev *rdev) | 2176 | void md_kick_rdev_from_array(struct md_rdev *rdev) |
2170 | { | 2177 | { |
2171 | unbind_rdev_from_array(rdev); | 2178 | unbind_rdev_from_array(rdev); |
2172 | export_rdev(rdev); | 2179 | export_rdev(rdev); |
2173 | } | 2180 | } |
2181 | EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); | ||
2174 | 2182 | ||
2175 | static void export_array(struct mddev *mddev) | 2183 | static void export_array(struct mddev *mddev) |
2176 | { | 2184 | { |
@@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev) | |||
2179 | while (!list_empty(&mddev->disks)) { | 2187 | while (!list_empty(&mddev->disks)) { |
2180 | rdev = list_first_entry(&mddev->disks, struct md_rdev, | 2188 | rdev = list_first_entry(&mddev->disks, struct md_rdev, |
2181 | same_set); | 2189 | same_set); |
2182 | kick_rdev_from_array(rdev); | 2190 | md_kick_rdev_from_array(rdev); |
2183 | } | 2191 | } |
2184 | mddev->raid_disks = 0; | 2192 | mddev->raid_disks = 0; |
2185 | mddev->major_version = 0; | 2193 | mddev->major_version = 0; |
@@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares) | |||
2208 | } | 2216 | } |
2209 | } | 2217 | } |
2210 | 2218 | ||
2211 | static void md_update_sb(struct mddev *mddev, int force_change) | 2219 | void md_update_sb(struct mddev *mddev, int force_change) |
2212 | { | 2220 | { |
2213 | struct md_rdev *rdev; | 2221 | struct md_rdev *rdev; |
2214 | int sync_req; | 2222 | int sync_req; |
@@ -2369,6 +2377,37 @@ repeat: | |||
2369 | wake_up(&rdev->blocked_wait); | 2377 | wake_up(&rdev->blocked_wait); |
2370 | } | 2378 | } |
2371 | } | 2379 | } |
2380 | EXPORT_SYMBOL(md_update_sb); | ||
2381 | |||
2382 | static int add_bound_rdev(struct md_rdev *rdev) | ||
2383 | { | ||
2384 | struct mddev *mddev = rdev->mddev; | ||
2385 | int err = 0; | ||
2386 | |||
2387 | if (!mddev->pers->hot_remove_disk) { | ||
2388 | /* If there is hot_add_disk but no hot_remove_disk | ||
2389 | * then added disks are for geometry changes, | ||
2390 | * and should be added immediately. | ||
2391 | */ | ||
2392 | super_types[mddev->major_version]. | ||
2393 | validate_super(mddev, rdev); | ||
2394 | err = mddev->pers->hot_add_disk(mddev, rdev); | ||
2395 | if (err) { | ||
2396 | unbind_rdev_from_array(rdev); | ||
2397 | export_rdev(rdev); | ||
2398 | return err; | ||
2399 | } | ||
2400 | } | ||
2401 | sysfs_notify_dirent_safe(rdev->sysfs_state); | ||
2402 | |||
2403 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
2404 | if (mddev->degraded) | ||
2405 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
2406 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2407 | md_new_event(mddev); | ||
2408 | md_wakeup_thread(mddev->thread); | ||
2409 | return 0; | ||
2410 | } | ||
2372 | 2411 | ||
2373 | /* words written to sysfs files may, or may not, be \n terminated. | 2412 | /* words written to sysfs files may, or may not, be \n terminated. |
2374 | * We want to accept with case. For this we use cmd_match. | 2413 | * We want to accept with case. For this we use cmd_match. |
@@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2471 | err = -EBUSY; | 2510 | err = -EBUSY; |
2472 | else { | 2511 | else { |
2473 | struct mddev *mddev = rdev->mddev; | 2512 | struct mddev *mddev = rdev->mddev; |
2474 | kick_rdev_from_array(rdev); | 2513 | if (mddev_is_clustered(mddev)) |
2514 | md_cluster_ops->remove_disk(mddev, rdev); | ||
2515 | md_kick_rdev_from_array(rdev); | ||
2516 | if (mddev_is_clustered(mddev)) | ||
2517 | md_cluster_ops->metadata_update_start(mddev); | ||
2475 | if (mddev->pers) | 2518 | if (mddev->pers) |
2476 | md_update_sb(mddev, 1); | 2519 | md_update_sb(mddev, 1); |
2477 | md_new_event(mddev); | 2520 | md_new_event(mddev); |
2521 | if (mddev_is_clustered(mddev)) | ||
2522 | md_cluster_ops->metadata_update_finish(mddev); | ||
2478 | err = 0; | 2523 | err = 0; |
2479 | } | 2524 | } |
2480 | } else if (cmd_match(buf, "writemostly")) { | 2525 | } else if (cmd_match(buf, "writemostly")) { |
@@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2553 | clear_bit(Replacement, &rdev->flags); | 2598 | clear_bit(Replacement, &rdev->flags); |
2554 | err = 0; | 2599 | err = 0; |
2555 | } | 2600 | } |
2601 | } else if (cmd_match(buf, "re-add")) { | ||
2602 | if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { | ||
2603 | /* clear_bit is performed _after_ all the devices | ||
2604 | * have their local Faulty bit cleared. If any writes | ||
2605 | * happen in the meantime in the local node, they | ||
2606 | * will land in the local bitmap, which will be synced | ||
2607 | * by this node eventually | ||
2608 | */ | ||
2609 | if (!mddev_is_clustered(rdev->mddev) || | ||
2610 | (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { | ||
2611 | clear_bit(Faulty, &rdev->flags); | ||
2612 | err = add_bound_rdev(rdev); | ||
2613 | } | ||
2614 | } else | ||
2615 | err = -EBUSY; | ||
2556 | } | 2616 | } |
2557 | if (!err) | 2617 | if (!err) |
2558 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2618 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev) | |||
3127 | "md: fatal superblock inconsistency in %s" | 3187 | "md: fatal superblock inconsistency in %s" |
3128 | " -- removing from array\n", | 3188 | " -- removing from array\n", |
3129 | bdevname(rdev->bdev,b)); | 3189 | bdevname(rdev->bdev,b)); |
3130 | kick_rdev_from_array(rdev); | 3190 | md_kick_rdev_from_array(rdev); |
3131 | } | 3191 | } |
3132 | 3192 | ||
3133 | super_types[mddev->major_version]. | 3193 | super_types[mddev->major_version]. |
@@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev) | |||
3142 | "md: %s: %s: only %d devices permitted\n", | 3202 | "md: %s: %s: only %d devices permitted\n", |
3143 | mdname(mddev), bdevname(rdev->bdev, b), | 3203 | mdname(mddev), bdevname(rdev->bdev, b), |
3144 | mddev->max_disks); | 3204 | mddev->max_disks); |
3145 | kick_rdev_from_array(rdev); | 3205 | md_kick_rdev_from_array(rdev); |
3146 | continue; | 3206 | continue; |
3147 | } | 3207 | } |
3148 | if (rdev != freshest) | 3208 | if (rdev != freshest) { |
3149 | if (super_types[mddev->major_version]. | 3209 | if (super_types[mddev->major_version]. |
3150 | validate_super(mddev, rdev)) { | 3210 | validate_super(mddev, rdev)) { |
3151 | printk(KERN_WARNING "md: kicking non-fresh %s" | 3211 | printk(KERN_WARNING "md: kicking non-fresh %s" |
3152 | " from array!\n", | 3212 | " from array!\n", |
3153 | bdevname(rdev->bdev,b)); | 3213 | bdevname(rdev->bdev,b)); |
3154 | kick_rdev_from_array(rdev); | 3214 | md_kick_rdev_from_array(rdev); |
3155 | continue; | 3215 | continue; |
3156 | } | 3216 | } |
3217 | /* No device should have a Candidate flag | ||
3218 | * when reading devices | ||
3219 | */ | ||
3220 | if (test_bit(Candidate, &rdev->flags)) { | ||
3221 | pr_info("md: kicking Cluster Candidate %s from array!\n", | ||
3222 | bdevname(rdev->bdev, b)); | ||
3223 | md_kick_rdev_from_array(rdev); | ||
3224 | } | ||
3225 | } | ||
3157 | if (mddev->level == LEVEL_MULTIPATH) { | 3226 | if (mddev->level == LEVEL_MULTIPATH) { |
3158 | rdev->desc_nr = i++; | 3227 | rdev->desc_nr = i++; |
3159 | rdev->raid_disk = rdev->desc_nr; | 3228 | rdev->raid_disk = rdev->desc_nr; |
@@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len) | |||
4008 | if (err) | 4077 | if (err) |
4009 | return err; | 4078 | return err; |
4010 | if (mddev->pers) { | 4079 | if (mddev->pers) { |
4080 | if (mddev_is_clustered(mddev)) | ||
4081 | md_cluster_ops->metadata_update_start(mddev); | ||
4011 | err = update_size(mddev, sectors); | 4082 | err = update_size(mddev, sectors); |
4012 | md_update_sb(mddev, 1); | 4083 | md_update_sb(mddev, 1); |
4084 | if (mddev_is_clustered(mddev)) | ||
4085 | md_cluster_ops->metadata_update_finish(mddev); | ||
4013 | } else { | 4086 | } else { |
4014 | if (mddev->dev_sectors == 0 || | 4087 | if (mddev->dev_sectors == 0 || |
4015 | mddev->dev_sectors > sectors) | 4088 | mddev->dev_sectors > sectors) |
@@ -4354,7 +4427,6 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) | |||
4354 | { | 4427 | { |
4355 | unsigned long long min; | 4428 | unsigned long long min; |
4356 | int err; | 4429 | int err; |
4357 | int chunk; | ||
4358 | 4430 | ||
4359 | if (kstrtoull(buf, 10, &min)) | 4431 | if (kstrtoull(buf, 10, &min)) |
4360 | return -EINVAL; | 4432 | return -EINVAL; |
@@ -4368,16 +4440,8 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) | |||
4368 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 4440 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
4369 | goto out_unlock; | 4441 | goto out_unlock; |
4370 | 4442 | ||
4371 | /* Must be a multiple of chunk_size */ | 4443 | /* Round down to multiple of 4K for safety */ |
4372 | chunk = mddev->chunk_sectors; | 4444 | mddev->resync_min = round_down(min, 8); |
4373 | if (chunk) { | ||
4374 | sector_t temp = min; | ||
4375 | |||
4376 | err = -EINVAL; | ||
4377 | if (sector_div(temp, chunk)) | ||
4378 | goto out_unlock; | ||
4379 | } | ||
4380 | mddev->resync_min = min; | ||
4381 | err = 0; | 4445 | err = 0; |
4382 | 4446 | ||
4383 | out_unlock: | 4447 | out_unlock: |
@@ -5077,10 +5141,16 @@ int md_run(struct mddev *mddev) | |||
5077 | } | 5141 | } |
5078 | if (err == 0 && pers->sync_request && | 5142 | if (err == 0 && pers->sync_request && |
5079 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { | 5143 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { |
5080 | err = bitmap_create(mddev); | 5144 | struct bitmap *bitmap; |
5081 | if (err) | 5145 | |
5146 | bitmap = bitmap_create(mddev, -1); | ||
5147 | if (IS_ERR(bitmap)) { | ||
5148 | err = PTR_ERR(bitmap); | ||
5082 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5149 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
5083 | mdname(mddev), err); | 5150 | mdname(mddev), err); |
5151 | } else | ||
5152 | mddev->bitmap = bitmap; | ||
5153 | |||
5084 | } | 5154 | } |
5085 | if (err) { | 5155 | if (err) { |
5086 | mddev_detach(mddev); | 5156 | mddev_detach(mddev); |
@@ -5232,6 +5302,8 @@ static void md_clean(struct mddev *mddev) | |||
5232 | 5302 | ||
5233 | static void __md_stop_writes(struct mddev *mddev) | 5303 | static void __md_stop_writes(struct mddev *mddev) |
5234 | { | 5304 | { |
5305 | if (mddev_is_clustered(mddev)) | ||
5306 | md_cluster_ops->metadata_update_start(mddev); | ||
5235 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5307 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
5236 | flush_workqueue(md_misc_wq); | 5308 | flush_workqueue(md_misc_wq); |
5237 | if (mddev->sync_thread) { | 5309 | if (mddev->sync_thread) { |
@@ -5250,6 +5322,8 @@ static void __md_stop_writes(struct mddev *mddev) | |||
5250 | mddev->in_sync = 1; | 5322 | mddev->in_sync = 1; |
5251 | md_update_sb(mddev, 1); | 5323 | md_update_sb(mddev, 1); |
5252 | } | 5324 | } |
5325 | if (mddev_is_clustered(mddev)) | ||
5326 | md_cluster_ops->metadata_update_finish(mddev); | ||
5253 | } | 5327 | } |
5254 | 5328 | ||
5255 | void md_stop_writes(struct mddev *mddev) | 5329 | void md_stop_writes(struct mddev *mddev) |
@@ -5636,6 +5710,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg) | |||
5636 | info.state = (1<<MD_SB_CLEAN); | 5710 | info.state = (1<<MD_SB_CLEAN); |
5637 | if (mddev->bitmap && mddev->bitmap_info.offset) | 5711 | if (mddev->bitmap && mddev->bitmap_info.offset) |
5638 | info.state |= (1<<MD_SB_BITMAP_PRESENT); | 5712 | info.state |= (1<<MD_SB_BITMAP_PRESENT); |
5713 | if (mddev_is_clustered(mddev)) | ||
5714 | info.state |= (1<<MD_SB_CLUSTERED); | ||
5639 | info.active_disks = insync; | 5715 | info.active_disks = insync; |
5640 | info.working_disks = working; | 5716 | info.working_disks = working; |
5641 | info.failed_disks = failed; | 5717 | info.failed_disks = failed; |
@@ -5691,7 +5767,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) | |||
5691 | return -EFAULT; | 5767 | return -EFAULT; |
5692 | 5768 | ||
5693 | rcu_read_lock(); | 5769 | rcu_read_lock(); |
5694 | rdev = find_rdev_nr_rcu(mddev, info.number); | 5770 | rdev = md_find_rdev_nr_rcu(mddev, info.number); |
5695 | if (rdev) { | 5771 | if (rdev) { |
5696 | info.major = MAJOR(rdev->bdev->bd_dev); | 5772 | info.major = MAJOR(rdev->bdev->bd_dev); |
5697 | info.minor = MINOR(rdev->bdev->bd_dev); | 5773 | info.minor = MINOR(rdev->bdev->bd_dev); |
@@ -5724,6 +5800,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5724 | struct md_rdev *rdev; | 5800 | struct md_rdev *rdev; |
5725 | dev_t dev = MKDEV(info->major,info->minor); | 5801 | dev_t dev = MKDEV(info->major,info->minor); |
5726 | 5802 | ||
5803 | if (mddev_is_clustered(mddev) && | ||
5804 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { | ||
5805 | pr_err("%s: Cannot add to clustered mddev.\n", | ||
5806 | mdname(mddev)); | ||
5807 | return -EINVAL; | ||
5808 | } | ||
5809 | |||
5727 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) | 5810 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) |
5728 | return -EOVERFLOW; | 5811 | return -EOVERFLOW; |
5729 | 5812 | ||
@@ -5810,31 +5893,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5810 | else | 5893 | else |
5811 | clear_bit(WriteMostly, &rdev->flags); | 5894 | clear_bit(WriteMostly, &rdev->flags); |
5812 | 5895 | ||
5896 | /* | ||
5897 | * check whether the device shows up in other nodes | ||
5898 | */ | ||
5899 | if (mddev_is_clustered(mddev)) { | ||
5900 | if (info->state & (1 << MD_DISK_CANDIDATE)) { | ||
5901 | /* Through --cluster-confirm */ | ||
5902 | set_bit(Candidate, &rdev->flags); | ||
5903 | err = md_cluster_ops->new_disk_ack(mddev, true); | ||
5904 | if (err) { | ||
5905 | export_rdev(rdev); | ||
5906 | return err; | ||
5907 | } | ||
5908 | } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { | ||
5909 | /* --add initiated by this node */ | ||
5910 | err = md_cluster_ops->add_new_disk_start(mddev, rdev); | ||
5911 | if (err) { | ||
5912 | md_cluster_ops->add_new_disk_finish(mddev); | ||
5913 | export_rdev(rdev); | ||
5914 | return err; | ||
5915 | } | ||
5916 | } | ||
5917 | } | ||
5918 | |||
5813 | rdev->raid_disk = -1; | 5919 | rdev->raid_disk = -1; |
5814 | err = bind_rdev_to_array(rdev, mddev); | 5920 | err = bind_rdev_to_array(rdev, mddev); |
5815 | if (!err && !mddev->pers->hot_remove_disk) { | ||
5816 | /* If there is hot_add_disk but no hot_remove_disk | ||
5817 | * then added disks for geometry changes, | ||
5818 | * and should be added immediately. | ||
5819 | */ | ||
5820 | super_types[mddev->major_version]. | ||
5821 | validate_super(mddev, rdev); | ||
5822 | err = mddev->pers->hot_add_disk(mddev, rdev); | ||
5823 | if (err) | ||
5824 | unbind_rdev_from_array(rdev); | ||
5825 | } | ||
5826 | if (err) | 5921 | if (err) |
5827 | export_rdev(rdev); | 5922 | export_rdev(rdev); |
5828 | else | 5923 | else |
5829 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 5924 | err = add_bound_rdev(rdev); |
5830 | 5925 | if (mddev_is_clustered(mddev) && | |
5831 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5926 | (info->state & (1 << MD_DISK_CLUSTER_ADD))) |
5832 | if (mddev->degraded) | 5927 | md_cluster_ops->add_new_disk_finish(mddev); |
5833 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5834 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
5835 | if (!err) | ||
5836 | md_new_event(mddev); | ||
5837 | md_wakeup_thread(mddev->thread); | ||
5838 | return err; | 5928 | return err; |
5839 | } | 5929 | } |
5840 | 5930 | ||
@@ -5895,18 +5985,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) | |||
5895 | if (!rdev) | 5985 | if (!rdev) |
5896 | return -ENXIO; | 5986 | return -ENXIO; |
5897 | 5987 | ||
5988 | if (mddev_is_clustered(mddev)) | ||
5989 | md_cluster_ops->metadata_update_start(mddev); | ||
5990 | |||
5898 | clear_bit(Blocked, &rdev->flags); | 5991 | clear_bit(Blocked, &rdev->flags); |
5899 | remove_and_add_spares(mddev, rdev); | 5992 | remove_and_add_spares(mddev, rdev); |
5900 | 5993 | ||
5901 | if (rdev->raid_disk >= 0) | 5994 | if (rdev->raid_disk >= 0) |
5902 | goto busy; | 5995 | goto busy; |
5903 | 5996 | ||
5904 | kick_rdev_from_array(rdev); | 5997 | if (mddev_is_clustered(mddev)) |
5998 | md_cluster_ops->remove_disk(mddev, rdev); | ||
5999 | |||
6000 | md_kick_rdev_from_array(rdev); | ||
5905 | md_update_sb(mddev, 1); | 6001 | md_update_sb(mddev, 1); |
5906 | md_new_event(mddev); | 6002 | md_new_event(mddev); |
5907 | 6003 | ||
6004 | if (mddev_is_clustered(mddev)) | ||
6005 | md_cluster_ops->metadata_update_finish(mddev); | ||
6006 | |||
5908 | return 0; | 6007 | return 0; |
5909 | busy: | 6008 | busy: |
6009 | if (mddev_is_clustered(mddev)) | ||
6010 | md_cluster_ops->metadata_update_cancel(mddev); | ||
5910 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", | 6011 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", |
5911 | bdevname(rdev->bdev,b), mdname(mddev)); | 6012 | bdevname(rdev->bdev,b), mdname(mddev)); |
5912 | return -EBUSY; | 6013 | return -EBUSY; |
@@ -5956,12 +6057,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5956 | err = -EINVAL; | 6057 | err = -EINVAL; |
5957 | goto abort_export; | 6058 | goto abort_export; |
5958 | } | 6059 | } |
6060 | |||
6061 | if (mddev_is_clustered(mddev)) | ||
6062 | md_cluster_ops->metadata_update_start(mddev); | ||
5959 | clear_bit(In_sync, &rdev->flags); | 6063 | clear_bit(In_sync, &rdev->flags); |
5960 | rdev->desc_nr = -1; | 6064 | rdev->desc_nr = -1; |
5961 | rdev->saved_raid_disk = -1; | 6065 | rdev->saved_raid_disk = -1; |
5962 | err = bind_rdev_to_array(rdev, mddev); | 6066 | err = bind_rdev_to_array(rdev, mddev); |
5963 | if (err) | 6067 | if (err) |
5964 | goto abort_export; | 6068 | goto abort_clustered; |
5965 | 6069 | ||
5966 | /* | 6070 | /* |
5967 | * The rest should better be atomic, we can have disk failures | 6071 | * The rest should better be atomic, we can have disk failures |
@@ -5972,6 +6076,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5972 | 6076 | ||
5973 | md_update_sb(mddev, 1); | 6077 | md_update_sb(mddev, 1); |
5974 | 6078 | ||
6079 | if (mddev_is_clustered(mddev)) | ||
6080 | md_cluster_ops->metadata_update_finish(mddev); | ||
5975 | /* | 6081 | /* |
5976 | * Kick recovery, maybe this spare has to be added to the | 6082 | * Kick recovery, maybe this spare has to be added to the |
5977 | * array immediately. | 6083 | * array immediately. |
@@ -5981,6 +6087,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
5981 | md_new_event(mddev); | 6087 | md_new_event(mddev); |
5982 | return 0; | 6088 | return 0; |
5983 | 6089 | ||
6090 | abort_clustered: | ||
6091 | if (mddev_is_clustered(mddev)) | ||
6092 | md_cluster_ops->metadata_update_cancel(mddev); | ||
5984 | abort_export: | 6093 | abort_export: |
5985 | export_rdev(rdev); | 6094 | export_rdev(rdev); |
5986 | return err; | 6095 | return err; |
@@ -6038,9 +6147,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd) | |||
6038 | if (mddev->pers) { | 6147 | if (mddev->pers) { |
6039 | mddev->pers->quiesce(mddev, 1); | 6148 | mddev->pers->quiesce(mddev, 1); |
6040 | if (fd >= 0) { | 6149 | if (fd >= 0) { |
6041 | err = bitmap_create(mddev); | 6150 | struct bitmap *bitmap; |
6042 | if (!err) | 6151 | |
6152 | bitmap = bitmap_create(mddev, -1); | ||
6153 | if (!IS_ERR(bitmap)) { | ||
6154 | mddev->bitmap = bitmap; | ||
6043 | err = bitmap_load(mddev); | 6155 | err = bitmap_load(mddev); |
6156 | } else | ||
6157 | err = PTR_ERR(bitmap); | ||
6044 | } | 6158 | } |
6045 | if (fd < 0 || err) { | 6159 | if (fd < 0 || err) { |
6046 | bitmap_destroy(mddev); | 6160 | bitmap_destroy(mddev); |
@@ -6293,6 +6407,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6293 | return rv; | 6407 | return rv; |
6294 | } | 6408 | } |
6295 | } | 6409 | } |
6410 | if (mddev_is_clustered(mddev)) | ||
6411 | md_cluster_ops->metadata_update_start(mddev); | ||
6296 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) | 6412 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
6297 | rv = update_size(mddev, (sector_t)info->size * 2); | 6413 | rv = update_size(mddev, (sector_t)info->size * 2); |
6298 | 6414 | ||
@@ -6300,33 +6416,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6300 | rv = update_raid_disks(mddev, info->raid_disks); | 6416 | rv = update_raid_disks(mddev, info->raid_disks); |
6301 | 6417 | ||
6302 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { | 6418 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { |
6303 | if (mddev->pers->quiesce == NULL || mddev->thread == NULL) | 6419 | if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { |
6304 | return -EINVAL; | 6420 | rv = -EINVAL; |
6305 | if (mddev->recovery || mddev->sync_thread) | 6421 | goto err; |
6306 | return -EBUSY; | 6422 | } |
6423 | if (mddev->recovery || mddev->sync_thread) { | ||
6424 | rv = -EBUSY; | ||
6425 | goto err; | ||
6426 | } | ||
6307 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { | 6427 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { |
6428 | struct bitmap *bitmap; | ||
6308 | /* add the bitmap */ | 6429 | /* add the bitmap */ |
6309 | if (mddev->bitmap) | 6430 | if (mddev->bitmap) { |
6310 | return -EEXIST; | 6431 | rv = -EEXIST; |
6311 | if (mddev->bitmap_info.default_offset == 0) | 6432 | goto err; |
6312 | return -EINVAL; | 6433 | } |
6434 | if (mddev->bitmap_info.default_offset == 0) { | ||
6435 | rv = -EINVAL; | ||
6436 | goto err; | ||
6437 | } | ||
6313 | mddev->bitmap_info.offset = | 6438 | mddev->bitmap_info.offset = |
6314 | mddev->bitmap_info.default_offset; | 6439 | mddev->bitmap_info.default_offset; |
6315 | mddev->bitmap_info.space = | 6440 | mddev->bitmap_info.space = |
6316 | mddev->bitmap_info.default_space; | 6441 | mddev->bitmap_info.default_space; |
6317 | mddev->pers->quiesce(mddev, 1); | 6442 | mddev->pers->quiesce(mddev, 1); |
6318 | rv = bitmap_create(mddev); | 6443 | bitmap = bitmap_create(mddev, -1); |
6319 | if (!rv) | 6444 | if (!IS_ERR(bitmap)) { |
6445 | mddev->bitmap = bitmap; | ||
6320 | rv = bitmap_load(mddev); | 6446 | rv = bitmap_load(mddev); |
6447 | } else | ||
6448 | rv = PTR_ERR(bitmap); | ||
6321 | if (rv) | 6449 | if (rv) |
6322 | bitmap_destroy(mddev); | 6450 | bitmap_destroy(mddev); |
6323 | mddev->pers->quiesce(mddev, 0); | 6451 | mddev->pers->quiesce(mddev, 0); |
6324 | } else { | 6452 | } else { |
6325 | /* remove the bitmap */ | 6453 | /* remove the bitmap */ |
6326 | if (!mddev->bitmap) | 6454 | if (!mddev->bitmap) { |
6327 | return -ENOENT; | 6455 | rv = -ENOENT; |
6328 | if (mddev->bitmap->storage.file) | 6456 | goto err; |
6329 | return -EINVAL; | 6457 | } |
6458 | if (mddev->bitmap->storage.file) { | ||
6459 | rv = -EINVAL; | ||
6460 | goto err; | ||
6461 | } | ||
6330 | mddev->pers->quiesce(mddev, 1); | 6462 | mddev->pers->quiesce(mddev, 1); |
6331 | bitmap_destroy(mddev); | 6463 | bitmap_destroy(mddev); |
6332 | mddev->pers->quiesce(mddev, 0); | 6464 | mddev->pers->quiesce(mddev, 0); |
@@ -6334,6 +6466,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6334 | } | 6466 | } |
6335 | } | 6467 | } |
6336 | md_update_sb(mddev, 1); | 6468 | md_update_sb(mddev, 1); |
6469 | if (mddev_is_clustered(mddev)) | ||
6470 | md_cluster_ops->metadata_update_finish(mddev); | ||
6471 | return rv; | ||
6472 | err: | ||
6473 | if (mddev_is_clustered(mddev)) | ||
6474 | md_cluster_ops->metadata_update_cancel(mddev); | ||
6337 | return rv; | 6475 | return rv; |
6338 | } | 6476 | } |
6339 | 6477 | ||
@@ -6393,6 +6531,7 @@ static inline bool md_ioctl_valid(unsigned int cmd) | |||
6393 | case SET_DISK_FAULTY: | 6531 | case SET_DISK_FAULTY: |
6394 | case STOP_ARRAY: | 6532 | case STOP_ARRAY: |
6395 | case STOP_ARRAY_RO: | 6533 | case STOP_ARRAY_RO: |
6534 | case CLUSTERED_DISK_NACK: | ||
6396 | return true; | 6535 | return true; |
6397 | default: | 6536 | default: |
6398 | return false; | 6537 | return false; |
@@ -6665,6 +6804,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6665 | goto unlock; | 6804 | goto unlock; |
6666 | } | 6805 | } |
6667 | 6806 | ||
6807 | case CLUSTERED_DISK_NACK: | ||
6808 | if (mddev_is_clustered(mddev)) | ||
6809 | md_cluster_ops->new_disk_ack(mddev, false); | ||
6810 | else | ||
6811 | err = -EINVAL; | ||
6812 | goto unlock; | ||
6813 | |||
6668 | case HOT_ADD_DISK: | 6814 | case HOT_ADD_DISK: |
6669 | err = hot_add_disk(mddev, new_decode_dev(arg)); | 6815 | err = hot_add_disk(mddev, new_decode_dev(arg)); |
6670 | goto unlock; | 6816 | goto unlock; |
@@ -7238,6 +7384,55 @@ int unregister_md_personality(struct md_personality *p) | |||
7238 | } | 7384 | } |
7239 | EXPORT_SYMBOL(unregister_md_personality); | 7385 | EXPORT_SYMBOL(unregister_md_personality); |
7240 | 7386 | ||
7387 | int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) | ||
7388 | { | ||
7389 | if (md_cluster_ops != NULL) | ||
7390 | return -EALREADY; | ||
7391 | spin_lock(&pers_lock); | ||
7392 | md_cluster_ops = ops; | ||
7393 | md_cluster_mod = module; | ||
7394 | spin_unlock(&pers_lock); | ||
7395 | return 0; | ||
7396 | } | ||
7397 | EXPORT_SYMBOL(register_md_cluster_operations); | ||
7398 | |||
7399 | int unregister_md_cluster_operations(void) | ||
7400 | { | ||
7401 | spin_lock(&pers_lock); | ||
7402 | md_cluster_ops = NULL; | ||
7403 | spin_unlock(&pers_lock); | ||
7404 | return 0; | ||
7405 | } | ||
7406 | EXPORT_SYMBOL(unregister_md_cluster_operations); | ||
7407 | |||
7408 | int md_setup_cluster(struct mddev *mddev, int nodes) | ||
7409 | { | ||
7410 | int err; | ||
7411 | |||
7412 | err = request_module("md-cluster"); | ||
7413 | if (err) { | ||
7414 | pr_err("md-cluster module not found.\n"); | ||
7415 | return err; | ||
7416 | } | ||
7417 | |||
7418 | spin_lock(&pers_lock); | ||
7419 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { | ||
7420 | spin_unlock(&pers_lock); | ||
7421 | return -ENOENT; | ||
7422 | } | ||
7423 | spin_unlock(&pers_lock); | ||
7424 | |||
7425 | return md_cluster_ops->join(mddev, nodes); | ||
7426 | } | ||
7427 | |||
7428 | void md_cluster_stop(struct mddev *mddev) | ||
7429 | { | ||
7430 | if (!md_cluster_ops) | ||
7431 | return; | ||
7432 | md_cluster_ops->leave(mddev); | ||
7433 | module_put(md_cluster_mod); | ||
7434 | } | ||
7435 | |||
7241 | static int is_mddev_idle(struct mddev *mddev, int init) | 7436 | static int is_mddev_idle(struct mddev *mddev, int init) |
7242 | { | 7437 | { |
7243 | struct md_rdev *rdev; | 7438 | struct md_rdev *rdev; |
@@ -7375,7 +7570,11 @@ int md_allow_write(struct mddev *mddev) | |||
7375 | mddev->safemode == 0) | 7570 | mddev->safemode == 0) |
7376 | mddev->safemode = 1; | 7571 | mddev->safemode = 1; |
7377 | spin_unlock(&mddev->lock); | 7572 | spin_unlock(&mddev->lock); |
7573 | if (mddev_is_clustered(mddev)) | ||
7574 | md_cluster_ops->metadata_update_start(mddev); | ||
7378 | md_update_sb(mddev, 0); | 7575 | md_update_sb(mddev, 0); |
7576 | if (mddev_is_clustered(mddev)) | ||
7577 | md_cluster_ops->metadata_update_finish(mddev); | ||
7379 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 7578 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
7380 | } else | 7579 | } else |
7381 | spin_unlock(&mddev->lock); | 7580 | spin_unlock(&mddev->lock); |
@@ -7576,6 +7775,9 @@ void md_do_sync(struct md_thread *thread) | |||
7576 | md_new_event(mddev); | 7775 | md_new_event(mddev); |
7577 | update_time = jiffies; | 7776 | update_time = jiffies; |
7578 | 7777 | ||
7778 | if (mddev_is_clustered(mddev)) | ||
7779 | md_cluster_ops->resync_start(mddev, j, max_sectors); | ||
7780 | |||
7579 | blk_start_plug(&plug); | 7781 | blk_start_plug(&plug); |
7580 | while (j < max_sectors) { | 7782 | while (j < max_sectors) { |
7581 | sector_t sectors; | 7783 | sector_t sectors; |
@@ -7618,8 +7820,7 @@ void md_do_sync(struct md_thread *thread) | |||
7618 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 7820 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
7619 | break; | 7821 | break; |
7620 | 7822 | ||
7621 | sectors = mddev->pers->sync_request(mddev, j, &skipped, | 7823 | sectors = mddev->pers->sync_request(mddev, j, &skipped); |
7622 | currspeed < speed_min(mddev)); | ||
7623 | if (sectors == 0) { | 7824 | if (sectors == 0) { |
7624 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 7825 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
7625 | break; | 7826 | break; |
@@ -7636,6 +7837,8 @@ void md_do_sync(struct md_thread *thread) | |||
7636 | j += sectors; | 7837 | j += sectors; |
7637 | if (j > 2) | 7838 | if (j > 2) |
7638 | mddev->curr_resync = j; | 7839 | mddev->curr_resync = j; |
7840 | if (mddev_is_clustered(mddev)) | ||
7841 | md_cluster_ops->resync_info_update(mddev, j, max_sectors); | ||
7639 | mddev->curr_mark_cnt = io_sectors; | 7842 | mddev->curr_mark_cnt = io_sectors; |
7640 | if (last_check == 0) | 7843 | if (last_check == 0) |
7641 | /* this is the earliest that rebuild will be | 7844 | /* this is the earliest that rebuild will be |
@@ -7677,11 +7880,18 @@ void md_do_sync(struct md_thread *thread) | |||
7677 | /((jiffies-mddev->resync_mark)/HZ +1) +1; | 7880 | /((jiffies-mddev->resync_mark)/HZ +1) +1; |
7678 | 7881 | ||
7679 | if (currspeed > speed_min(mddev)) { | 7882 | if (currspeed > speed_min(mddev)) { |
7680 | if ((currspeed > speed_max(mddev)) || | 7883 | if (currspeed > speed_max(mddev)) { |
7681 | !is_mddev_idle(mddev, 0)) { | ||
7682 | msleep(500); | 7884 | msleep(500); |
7683 | goto repeat; | 7885 | goto repeat; |
7684 | } | 7886 | } |
7887 | if (!is_mddev_idle(mddev, 0)) { | ||
7888 | /* | ||
7889 | * Give other IO more of a chance. | ||
7890 | * The faster the devices, the less we wait. | ||
7891 | */ | ||
7892 | wait_event(mddev->recovery_wait, | ||
7893 | !atomic_read(&mddev->recovery_active)); | ||
7894 | } | ||
7685 | } | 7895 | } |
7686 | } | 7896 | } |
7687 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, | 7897 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, |
@@ -7694,7 +7904,10 @@ void md_do_sync(struct md_thread *thread) | |||
7694 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7904 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
7695 | 7905 | ||
7696 | /* tell personality that we are finished */ | 7906 | /* tell personality that we are finished */ |
7697 | mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); | 7907 | mddev->pers->sync_request(mddev, max_sectors, &skipped); |
7908 | |||
7909 | if (mddev_is_clustered(mddev)) | ||
7910 | md_cluster_ops->resync_finish(mddev); | ||
7698 | 7911 | ||
7699 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && | 7912 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && |
7700 | mddev->curr_resync > 2) { | 7913 | mddev->curr_resync > 2) { |
@@ -7925,8 +8138,13 @@ void md_check_recovery(struct mddev *mddev) | |||
7925 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 8138 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
7926 | } | 8139 | } |
7927 | 8140 | ||
7928 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 8141 | if (mddev->flags & MD_UPDATE_SB_FLAGS) { |
8142 | if (mddev_is_clustered(mddev)) | ||
8143 | md_cluster_ops->metadata_update_start(mddev); | ||
7929 | md_update_sb(mddev, 0); | 8144 | md_update_sb(mddev, 0); |
8145 | if (mddev_is_clustered(mddev)) | ||
8146 | md_cluster_ops->metadata_update_finish(mddev); | ||
8147 | } | ||
7930 | 8148 | ||
7931 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 8149 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
7932 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 8150 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
@@ -8024,6 +8242,8 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8024 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8242 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
8025 | } | 8243 | } |
8026 | } | 8244 | } |
8245 | if (mddev_is_clustered(mddev)) | ||
8246 | md_cluster_ops->metadata_update_start(mddev); | ||
8027 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 8247 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
8028 | mddev->pers->finish_reshape) | 8248 | mddev->pers->finish_reshape) |
8029 | mddev->pers->finish_reshape(mddev); | 8249 | mddev->pers->finish_reshape(mddev); |
@@ -8036,6 +8256,8 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8036 | rdev->saved_raid_disk = -1; | 8256 | rdev->saved_raid_disk = -1; |
8037 | 8257 | ||
8038 | md_update_sb(mddev, 1); | 8258 | md_update_sb(mddev, 1); |
8259 | if (mddev_is_clustered(mddev)) | ||
8260 | md_cluster_ops->metadata_update_finish(mddev); | ||
8039 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 8261 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
8040 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 8262 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
8041 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 8263 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
@@ -8656,6 +8878,28 @@ err_wq: | |||
8656 | return ret; | 8878 | return ret; |
8657 | } | 8879 | } |
8658 | 8880 | ||
8881 | void md_reload_sb(struct mddev *mddev) | ||
8882 | { | ||
8883 | struct md_rdev *rdev, *tmp; | ||
8884 | |||
8885 | rdev_for_each_safe(rdev, tmp, mddev) { | ||
8886 | rdev->sb_loaded = 0; | ||
8887 | ClearPageUptodate(rdev->sb_page); | ||
8888 | } | ||
8889 | mddev->raid_disks = 0; | ||
8890 | analyze_sbs(mddev); | ||
8891 | rdev_for_each_safe(rdev, tmp, mddev) { | ||
8892 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); | ||
8893 | /* since we don't write to faulty devices, we figure out if the | ||
8894 | * disk is faulty by comparing events | ||
8895 | */ | ||
8896 | if (mddev->events > sb->events) | ||
8897 | set_bit(Faulty, &rdev->flags); | ||
8898 | } | ||
8899 | |||
8900 | } | ||
8901 | EXPORT_SYMBOL(md_reload_sb); | ||
8902 | |||
8659 | #ifndef MODULE | 8903 | #ifndef MODULE |
8660 | 8904 | ||
8661 | /* | 8905 | /* |
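
Taken together, the md_do_sync() hunks above bracket a resync with the cluster hooks: resync_start() announces the range before the main loop, resync_info_update() refreshes the range as progress is made, and resync_finish() (which sends RESYNCING with lo = hi = 0) clears it at the end. A condensed, illustrative skeleton of that sequence, with the loop body elided:

    /* Illustrative skeleton only -- the real loop lives in md_do_sync(). */
    if (mddev_is_clustered(mddev))
            md_cluster_ops->resync_start(mddev, j, max_sectors);

    while (j < max_sectors) {
            /* ... mddev->pers->sync_request(), throttling, accounting ... */
            if (mddev_is_clustered(mddev))
                    md_cluster_ops->resync_info_update(mddev, j, max_sectors);
    }

    if (mddev_is_clustered(mddev))
            md_cluster_ops->resync_finish(mddev);
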
diff --git a/drivers/md/md.h b/drivers/md/md.h index 318ca8fd430f..4046a6c6f223 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/timer.h> | 23 | #include <linux/timer.h> |
24 | #include <linux/wait.h> | 24 | #include <linux/wait.h> |
25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
26 | #include "md-cluster.h" | ||
26 | 27 | ||
27 | #define MaxSector (~(sector_t)0) | 28 | #define MaxSector (~(sector_t)0) |
28 | 29 | ||
@@ -170,6 +171,10 @@ enum flag_bits { | |||
170 | * a want_replacement device with same | 171 | * a want_replacement device with same |
171 | * raid_disk number. | 172 | * raid_disk number. |
172 | */ | 173 | */ |
174 | Candidate, /* For clustered environments only: | ||
175 | * This device is seen locally but not | ||
176 | * by the whole cluster | ||
177 | */ | ||
173 | }; | 178 | }; |
174 | 179 | ||
175 | #define BB_LEN_MASK (0x00000000000001FFULL) | 180 | #define BB_LEN_MASK (0x00000000000001FFULL) |
@@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
202 | int is_new); | 207 | int is_new); |
203 | extern void md_ack_all_badblocks(struct badblocks *bb); | 208 | extern void md_ack_all_badblocks(struct badblocks *bb); |
204 | 209 | ||
210 | struct md_cluster_info; | ||
211 | |||
205 | struct mddev { | 212 | struct mddev { |
206 | void *private; | 213 | void *private; |
207 | struct md_personality *pers; | 214 | struct md_personality *pers; |
@@ -430,6 +437,8 @@ struct mddev { | |||
430 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 437 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
431 | unsigned long max_write_behind; /* write-behind mode */ | 438 | unsigned long max_write_behind; /* write-behind mode */ |
432 | int external; | 439 | int external; |
440 | int nodes; /* Maximum number of nodes in the cluster */ | ||
441 | char cluster_name[64]; /* Name of the cluster */ | ||
433 | } bitmap_info; | 442 | } bitmap_info; |
434 | 443 | ||
435 | atomic_t max_corr_read_errors; /* max read retries */ | 444 | atomic_t max_corr_read_errors; /* max read retries */ |
@@ -448,6 +457,7 @@ struct mddev { | |||
448 | struct work_struct flush_work; | 457 | struct work_struct flush_work; |
449 | struct work_struct event_work; /* used by dm to report failure event */ | 458 | struct work_struct event_work; /* used by dm to report failure event */ |
450 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); | 459 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); |
460 | struct md_cluster_info *cluster_info; | ||
451 | }; | 461 | }; |
452 | 462 | ||
453 | static inline int __must_check mddev_lock(struct mddev *mddev) | 463 | static inline int __must_check mddev_lock(struct mddev *mddev) |
@@ -496,7 +506,7 @@ struct md_personality | |||
496 | int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); | 506 | int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); |
497 | int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); | 507 | int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); |
498 | int (*spare_active) (struct mddev *mddev); | 508 | int (*spare_active) (struct mddev *mddev); |
499 | sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); | 509 | sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); |
500 | int (*resize) (struct mddev *mddev, sector_t sectors); | 510 | int (*resize) (struct mddev *mddev, sector_t sectors); |
501 | sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); | 511 | sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); |
502 | int (*check_reshape) (struct mddev *mddev); | 512 | int (*check_reshape) (struct mddev *mddev); |
@@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p) | |||
608 | 618 | ||
609 | extern int register_md_personality(struct md_personality *p); | 619 | extern int register_md_personality(struct md_personality *p); |
610 | extern int unregister_md_personality(struct md_personality *p); | 620 | extern int unregister_md_personality(struct md_personality *p); |
621 | extern int register_md_cluster_operations(struct md_cluster_operations *ops, | ||
622 | struct module *module); | ||
623 | extern int unregister_md_cluster_operations(void); | ||
624 | extern int md_setup_cluster(struct mddev *mddev, int nodes); | ||
625 | extern void md_cluster_stop(struct mddev *mddev); | ||
611 | extern struct md_thread *md_register_thread( | 626 | extern struct md_thread *md_register_thread( |
612 | void (*run)(struct md_thread *thread), | 627 | void (*run)(struct md_thread *thread), |
613 | struct mddev *mddev, | 628 | struct mddev *mddev, |
@@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | |||
654 | struct mddev *mddev); | 669 | struct mddev *mddev); |
655 | 670 | ||
656 | extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); | 671 | extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); |
672 | extern void md_reload_sb(struct mddev *mddev); | ||
673 | extern void md_update_sb(struct mddev *mddev, int force); | ||
674 | extern void md_kick_rdev_from_array(struct md_rdev * rdev); | ||
675 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); | ||
657 | static inline int mddev_check_plugged(struct mddev *mddev) | 676 | static inline int mddev_check_plugged(struct mddev *mddev) |
658 | { | 677 | { |
659 | return !!blk_check_plugged(md_unplug, mddev, | 678 | return !!blk_check_plugged(md_unplug, mddev, |
@@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) | |||
669 | } | 688 | } |
670 | } | 689 | } |
671 | 690 | ||
691 | extern struct md_cluster_operations *md_cluster_ops; | ||
692 | static inline int mddev_is_clustered(struct mddev *mddev) | ||
693 | { | ||
694 | return mddev->cluster_info && mddev->bitmap_info.nodes > 1; | ||
695 | } | ||
672 | #endif /* _MD_MD_H */ | 696 | #endif /* _MD_MD_H */ |
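
Taken together, the md.h additions above give the rest of the driver a cheap guard for cluster awareness: test mddev_is_clustered() and, when it is true, bracket any superblock write with the cluster metadata hooks. The sketch below only restates the pattern already visible in the md_reap_sync_thread() hunk earlier in this diff; the wrapper function itself is illustrative, not part of the patch.

/* Illustrative wrapper (not part of the patch): the metadata-update
 * bracketing pattern used throughout the md.c hunks in this series. */
static void clustered_update_sb(struct mddev *mddev)
{
	if (mddev_is_clustered(mddev))
		md_cluster_ops->metadata_update_start(mddev);

	md_update_sb(mddev, 1);	/* write out the superblocks */

	if (mddev_is_clustered(mddev))
		md_cluster_ops->metadata_update_finish(mddev);
}
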
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 3b5d7f704aa3..2cb59a641cd2 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -271,14 +271,16 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
271 | goto abort; | 271 | goto abort; |
272 | } | 272 | } |
273 | 273 | ||
274 | blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); | 274 | if (mddev->queue) { |
275 | blk_queue_io_opt(mddev->queue, | 275 | blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); |
276 | (mddev->chunk_sectors << 9) * mddev->raid_disks); | 276 | blk_queue_io_opt(mddev->queue, |
277 | 277 | (mddev->chunk_sectors << 9) * mddev->raid_disks); | |
278 | if (!discard_supported) | 278 | |
279 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | 279 | if (!discard_supported) |
280 | else | 280 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); |
281 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | 281 | else |
282 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | ||
283 | } | ||
282 | 284 | ||
283 | pr_debug("md/raid0:%s: done.\n", mdname(mddev)); | 285 | pr_debug("md/raid0:%s: done.\n", mdname(mddev)); |
284 | *private_conf = conf; | 286 | *private_conf = conf; |
@@ -429,9 +431,12 @@ static int raid0_run(struct mddev *mddev) | |||
429 | } | 431 | } |
430 | if (md_check_no_bitmap(mddev)) | 432 | if (md_check_no_bitmap(mddev)) |
431 | return -EINVAL; | 433 | return -EINVAL; |
432 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); | 434 | |
433 | blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); | 435 | if (mddev->queue) { |
434 | blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); | 436 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); |
437 | blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); | ||
438 | blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); | ||
439 | } | ||
435 | 440 | ||
436 | /* if private is not null, we are here after takeover */ | 441 | /* if private is not null, we are here after takeover */ |
437 | if (mddev->private == NULL) { | 442 | if (mddev->private == NULL) { |
@@ -448,16 +453,17 @@ static int raid0_run(struct mddev *mddev) | |||
448 | printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", | 453 | printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", |
449 | mdname(mddev), | 454 | mdname(mddev), |
450 | (unsigned long long)mddev->array_sectors); | 455 | (unsigned long long)mddev->array_sectors); |
451 | /* calculate the max read-ahead size. | 456 | |
452 | * For read-ahead of large files to be effective, we need to | 457 | if (mddev->queue) { |
453 | * readahead at least twice a whole stripe. i.e. number of devices | 458 | /* calculate the max read-ahead size. |
454 | * multiplied by chunk size times 2. | 459 | * For read-ahead of large files to be effective, we need to |
455 | * If an individual device has an ra_pages greater than the | 460 | * readahead at least twice a whole stripe. i.e. number of devices |
456 | * chunk size, then we will not drive that device as hard as it | 461 | * multiplied by chunk size times 2. |
457 | * wants. We consider this a configuration error: a larger | 462 | * If an individual device has an ra_pages greater than the |
458 | * chunksize should be used in that case. | 463 | * chunk size, then we will not drive that device as hard as it |
459 | */ | 464 | * wants. We consider this a configuration error: a larger |
460 | { | 465 | * chunksize should be used in that case. |
466 | */ | ||
461 | int stripe = mddev->raid_disks * | 467 | int stripe = mddev->raid_disks * |
462 | (mddev->chunk_sectors << 9) / PAGE_SIZE; | 468 | (mddev->chunk_sectors << 9) / PAGE_SIZE; |
463 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 469 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d34e238afa54..9157a29c8dbf 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
539 | has_nonrot_disk = 0; | 539 | has_nonrot_disk = 0; |
540 | choose_next_idle = 0; | 540 | choose_next_idle = 0; |
541 | 541 | ||
542 | choose_first = (conf->mddev->recovery_cp < this_sector + sectors); | 542 | if ((conf->mddev->recovery_cp < this_sector + sectors) || |
543 | (mddev_is_clustered(conf->mddev) && | ||
544 | md_cluster_ops->area_resyncing(conf->mddev, this_sector, | ||
545 | this_sector + sectors))) | ||
546 | choose_first = 1; | ||
547 | else | ||
548 | choose_first = 0; | ||
543 | 549 | ||
544 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { | 550 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { |
545 | sector_t dist; | 551 | sector_t dist; |
@@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1102 | md_write_start(mddev, bio); /* wait on superblock update early */ | 1108 | md_write_start(mddev, bio); /* wait on superblock update early */ |
1103 | 1109 | ||
1104 | if (bio_data_dir(bio) == WRITE && | 1110 | if (bio_data_dir(bio) == WRITE && |
1105 | bio_end_sector(bio) > mddev->suspend_lo && | 1111 | ((bio_end_sector(bio) > mddev->suspend_lo && |
1106 | bio->bi_iter.bi_sector < mddev->suspend_hi) { | 1112 | bio->bi_iter.bi_sector < mddev->suspend_hi) || |
1113 | (mddev_is_clustered(mddev) && | ||
1114 | md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) { | ||
1107 | /* As the suspend_* range is controlled by | 1115 | /* As the suspend_* range is controlled by |
1108 | * userspace, we want an interruptible | 1116 | * userspace, we want an interruptible |
1109 | * wait. | 1117 | * wait. |
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1114 | prepare_to_wait(&conf->wait_barrier, | 1122 | prepare_to_wait(&conf->wait_barrier, |
1115 | &w, TASK_INTERRUPTIBLE); | 1123 | &w, TASK_INTERRUPTIBLE); |
1116 | if (bio_end_sector(bio) <= mddev->suspend_lo || | 1124 | if (bio_end_sector(bio) <= mddev->suspend_lo || |
1117 | bio->bi_iter.bi_sector >= mddev->suspend_hi) | 1125 | bio->bi_iter.bi_sector >= mddev->suspend_hi || |
1126 | (mddev_is_clustered(mddev) && | ||
1127 | !md_cluster_ops->area_resyncing(mddev, | ||
1128 | bio->bi_iter.bi_sector, bio_end_sector(bio)))) | ||
1118 | break; | 1129 | break; |
1119 | schedule(); | 1130 | schedule(); |
1120 | } | 1131 | } |
@@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev) | |||
1561 | struct md_rdev *rdev = conf->mirrors[i].rdev; | 1572 | struct md_rdev *rdev = conf->mirrors[i].rdev; |
1562 | struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; | 1573 | struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; |
1563 | if (repl | 1574 | if (repl |
1575 | && !test_bit(Candidate, &repl->flags) | ||
1564 | && repl->recovery_offset == MaxSector | 1576 | && repl->recovery_offset == MaxSector |
1565 | && !test_bit(Faulty, &repl->flags) | 1577 | && !test_bit(Faulty, &repl->flags) |
1566 | && !test_and_set_bit(In_sync, &repl->flags)) { | 1578 | && !test_and_set_bit(In_sync, &repl->flags)) { |
@@ -2468,7 +2480,7 @@ static int init_resync(struct r1conf *conf) | |||
2468 | * that can be installed to exclude normal IO requests. | 2480 | * that can be installed to exclude normal IO requests. |
2469 | */ | 2481 | */ |
2470 | 2482 | ||
2471 | static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) | 2483 | static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) |
2472 | { | 2484 | { |
2473 | struct r1conf *conf = mddev->private; | 2485 | struct r1conf *conf = mddev->private; |
2474 | struct r1bio *r1_bio; | 2486 | struct r1bio *r1_bio; |
@@ -2521,13 +2533,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2521 | *skipped = 1; | 2533 | *skipped = 1; |
2522 | return sync_blocks; | 2534 | return sync_blocks; |
2523 | } | 2535 | } |
2524 | /* | ||
2525 | * If there is non-resync activity waiting for a turn, | ||
2526 | * and resync is going fast enough, | ||
2527 | * then let it through before starting on this new sync request. | ||
2528 | */ | ||
2529 | if (!go_faster && conf->nr_waiting) | ||
2530 | msleep_interruptible(1000); | ||
2531 | 2536 | ||
2532 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 2537 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
2533 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | 2538 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a7196c49d15d..e793ab6b3570 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -2889,7 +2889,7 @@ static int init_resync(struct r10conf *conf) | |||
2889 | */ | 2889 | */ |
2890 | 2890 | ||
2891 | static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | 2891 | static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, |
2892 | int *skipped, int go_faster) | 2892 | int *skipped) |
2893 | { | 2893 | { |
2894 | struct r10conf *conf = mddev->private; | 2894 | struct r10conf *conf = mddev->private; |
2895 | struct r10bio *r10_bio; | 2895 | struct r10bio *r10_bio; |
@@ -2994,12 +2994,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2994 | if (conf->geo.near_copies < conf->geo.raid_disks && | 2994 | if (conf->geo.near_copies < conf->geo.raid_disks && |
2995 | max_sector > (sector_nr | chunk_mask)) | 2995 | max_sector > (sector_nr | chunk_mask)) |
2996 | max_sector = (sector_nr | chunk_mask) + 1; | 2996 | max_sector = (sector_nr | chunk_mask) + 1; |
2997 | /* | ||
2998 | * If there is non-resync activity waiting for us then | ||
2999 | * put in a delay to throttle resync. | ||
3000 | */ | ||
3001 | if (!go_faster && conf->nr_waiting) | ||
3002 | msleep_interruptible(1000); | ||
3003 | 2997 | ||
3004 | /* Again, very different code for resync and recovery. | 2998 | /* Again, very different code for resync and recovery. |
3005 | * Both must result in an r10bio with a list of bios that | 2999 | * Both must result in an r10bio with a list of bios that |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cd2f96b2c572..77dfd720aaa0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
55 | #include <linux/ratelimit.h> | 55 | #include <linux/ratelimit.h> |
56 | #include <linux/nodemask.h> | 56 | #include <linux/nodemask.h> |
57 | #include <linux/flex_array.h> | ||
57 | #include <trace/events/block.h> | 58 | #include <trace/events/block.h> |
58 | 59 | ||
59 | #include "md.h" | 60 | #include "md.h" |
@@ -496,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh) | |||
496 | } | 497 | } |
497 | } | 498 | } |
498 | 499 | ||
499 | static int grow_buffers(struct stripe_head *sh) | 500 | static int grow_buffers(struct stripe_head *sh, gfp_t gfp) |
500 | { | 501 | { |
501 | int i; | 502 | int i; |
502 | int num = sh->raid_conf->pool_size; | 503 | int num = sh->raid_conf->pool_size; |
@@ -504,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh) | |||
504 | for (i = 0; i < num; i++) { | 505 | for (i = 0; i < num; i++) { |
505 | struct page *page; | 506 | struct page *page; |
506 | 507 | ||
507 | if (!(page = alloc_page(GFP_KERNEL))) { | 508 | if (!(page = alloc_page(gfp))) { |
508 | return 1; | 509 | return 1; |
509 | } | 510 | } |
510 | sh->dev[i].page = page; | 511 | sh->dev[i].page = page; |
@@ -525,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
525 | BUG_ON(atomic_read(&sh->count) != 0); | 526 | BUG_ON(atomic_read(&sh->count) != 0); |
526 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 527 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
527 | BUG_ON(stripe_operations_active(sh)); | 528 | BUG_ON(stripe_operations_active(sh)); |
529 | BUG_ON(sh->batch_head); | ||
528 | 530 | ||
529 | pr_debug("init_stripe called, stripe %llu\n", | 531 | pr_debug("init_stripe called, stripe %llu\n", |
530 | (unsigned long long)sector); | 532 | (unsigned long long)sector); |
@@ -552,8 +554,10 @@ retry: | |||
552 | } | 554 | } |
553 | if (read_seqcount_retry(&conf->gen_lock, seq)) | 555 | if (read_seqcount_retry(&conf->gen_lock, seq)) |
554 | goto retry; | 556 | goto retry; |
557 | sh->overwrite_disks = 0; | ||
555 | insert_hash(conf, sh); | 558 | insert_hash(conf, sh); |
556 | sh->cpu = smp_processor_id(); | 559 | sh->cpu = smp_processor_id(); |
560 | set_bit(STRIPE_BATCH_READY, &sh->state); | ||
557 | } | 561 | } |
558 | 562 | ||
559 | static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | 563 | static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, |
@@ -668,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
668 | *(conf->hash_locks + hash)); | 672 | *(conf->hash_locks + hash)); |
669 | sh = __find_stripe(conf, sector, conf->generation - previous); | 673 | sh = __find_stripe(conf, sector, conf->generation - previous); |
670 | if (!sh) { | 674 | if (!sh) { |
671 | if (!conf->inactive_blocked) | 675 | if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { |
672 | sh = get_free_stripe(conf, hash); | 676 | sh = get_free_stripe(conf, hash); |
677 | if (!sh && llist_empty(&conf->released_stripes) && | ||
678 | !test_bit(R5_DID_ALLOC, &conf->cache_state)) | ||
679 | set_bit(R5_ALLOC_MORE, | ||
680 | &conf->cache_state); | ||
681 | } | ||
673 | if (noblock && sh == NULL) | 682 | if (noblock && sh == NULL) |
674 | break; | 683 | break; |
675 | if (!sh) { | 684 | if (!sh) { |
676 | conf->inactive_blocked = 1; | 685 | set_bit(R5_INACTIVE_BLOCKED, |
686 | &conf->cache_state); | ||
677 | wait_event_lock_irq( | 687 | wait_event_lock_irq( |
678 | conf->wait_for_stripe, | 688 | conf->wait_for_stripe, |
679 | !list_empty(conf->inactive_list + hash) && | 689 | !list_empty(conf->inactive_list + hash) && |
680 | (atomic_read(&conf->active_stripes) | 690 | (atomic_read(&conf->active_stripes) |
681 | < (conf->max_nr_stripes * 3 / 4) | 691 | < (conf->max_nr_stripes * 3 / 4) |
682 | || !conf->inactive_blocked), | 692 | || !test_bit(R5_INACTIVE_BLOCKED, |
693 | &conf->cache_state)), | ||
683 | *(conf->hash_locks + hash)); | 694 | *(conf->hash_locks + hash)); |
684 | conf->inactive_blocked = 0; | 695 | clear_bit(R5_INACTIVE_BLOCKED, |
696 | &conf->cache_state); | ||
685 | } else { | 697 | } else { |
686 | init_stripe(sh, sector, previous); | 698 | init_stripe(sh, sector, previous); |
687 | atomic_inc(&sh->count); | 699 | atomic_inc(&sh->count); |
@@ -708,6 +720,130 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
708 | return sh; | 720 | return sh; |
709 | } | 721 | } |
710 | 722 | ||
723 | static bool is_full_stripe_write(struct stripe_head *sh) | ||
724 | { | ||
725 | BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); | ||
726 | return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); | ||
727 | } | ||
728 | |||
729 | static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) | ||
730 | { | ||
731 | local_irq_disable(); | ||
732 | if (sh1 > sh2) { | ||
733 | spin_lock(&sh2->stripe_lock); | ||
734 | spin_lock_nested(&sh1->stripe_lock, 1); | ||
735 | } else { | ||
736 | spin_lock(&sh1->stripe_lock); | ||
737 | spin_lock_nested(&sh2->stripe_lock, 1); | ||
738 | } | ||
739 | } | ||
740 | |||
741 | static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) | ||
742 | { | ||
743 | spin_unlock(&sh1->stripe_lock); | ||
744 | spin_unlock(&sh2->stripe_lock); | ||
745 | local_irq_enable(); | ||
746 | } | ||
747 | |||
748 | /* Only a freshly initialised, full-stripe normal write can be added to a batch list */ | ||
749 | static bool stripe_can_batch(struct stripe_head *sh) | ||
750 | { | ||
751 | return test_bit(STRIPE_BATCH_READY, &sh->state) && | ||
752 | is_full_stripe_write(sh); | ||
753 | } | ||
754 | |||
755 | /* we only search backwards (towards lower sectors) for a stripe to batch with */ | ||
756 | static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) | ||
757 | { | ||
758 | struct stripe_head *head; | ||
759 | sector_t head_sector, tmp_sec; | ||
760 | int hash; | ||
761 | int dd_idx; | ||
762 | |||
763 | if (!stripe_can_batch(sh)) | ||
764 | return; | ||
765 | /* Don't cross chunk boundaries, so every stripe in the batch shares pd_idx/qd_idx */ | ||
766 | tmp_sec = sh->sector; | ||
767 | if (!sector_div(tmp_sec, conf->chunk_sectors)) | ||
768 | return; | ||
769 | head_sector = sh->sector - STRIPE_SECTORS; | ||
770 | |||
771 | hash = stripe_hash_locks_hash(head_sector); | ||
772 | spin_lock_irq(conf->hash_locks + hash); | ||
773 | head = __find_stripe(conf, head_sector, conf->generation); | ||
774 | if (head && !atomic_inc_not_zero(&head->count)) { | ||
775 | spin_lock(&conf->device_lock); | ||
776 | if (!atomic_read(&head->count)) { | ||
777 | if (!test_bit(STRIPE_HANDLE, &head->state)) | ||
778 | atomic_inc(&conf->active_stripes); | ||
779 | BUG_ON(list_empty(&head->lru) && | ||
780 | !test_bit(STRIPE_EXPANDING, &head->state)); | ||
781 | list_del_init(&head->lru); | ||
782 | if (head->group) { | ||
783 | head->group->stripes_cnt--; | ||
784 | head->group = NULL; | ||
785 | } | ||
786 | } | ||
787 | atomic_inc(&head->count); | ||
788 | spin_unlock(&conf->device_lock); | ||
789 | } | ||
790 | spin_unlock_irq(conf->hash_locks + hash); | ||
791 | |||
792 | if (!head) | ||
793 | return; | ||
794 | if (!stripe_can_batch(head)) | ||
795 | goto out; | ||
796 | |||
797 | lock_two_stripes(head, sh); | ||
798 | /* clear_batch_ready() clears the flag, so re-check both stripes under their locks */ | ||
799 | if (!stripe_can_batch(head) || !stripe_can_batch(sh)) | ||
800 | goto unlock_out; | ||
801 | |||
802 | if (sh->batch_head) | ||
803 | goto unlock_out; | ||
804 | |||
805 | dd_idx = 0; | ||
806 | while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) | ||
807 | dd_idx++; | ||
808 | if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw) | ||
809 | goto unlock_out; | ||
810 | |||
811 | if (head->batch_head) { | ||
812 | spin_lock(&head->batch_head->batch_lock); | ||
813 | /* This batch list is already running */ | ||
814 | if (!stripe_can_batch(head)) { | ||
815 | spin_unlock(&head->batch_head->batch_lock); | ||
816 | goto unlock_out; | ||
817 | } | ||
818 | |||
819 | /* | ||
820 | * at this point, head's BATCH_READY could be cleared, but we | ||
821 | * can still add the stripe to the batch list | ||
822 | */ | ||
823 | list_add(&sh->batch_list, &head->batch_list); | ||
824 | spin_unlock(&head->batch_head->batch_lock); | ||
825 | |||
826 | sh->batch_head = head->batch_head; | ||
827 | } else { | ||
828 | head->batch_head = head; | ||
829 | sh->batch_head = head->batch_head; | ||
830 | spin_lock(&head->batch_lock); | ||
831 | list_add_tail(&sh->batch_list, &head->batch_list); | ||
832 | spin_unlock(&head->batch_lock); | ||
833 | } | ||
834 | |||
835 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
836 | if (atomic_dec_return(&conf->preread_active_stripes) | ||
837 | < IO_THRESHOLD) | ||
838 | md_wakeup_thread(conf->mddev->thread); | ||
839 | |||
840 | atomic_inc(&sh->count); | ||
841 | unlock_out: | ||
842 | unlock_two_stripes(head, sh); | ||
843 | out: | ||
844 | release_stripe(head); | ||
845 | } | ||
846 | |||
711 | /* Determine if 'data_offset' or 'new_data_offset' should be used | 847 | /* Determine if 'data_offset' or 'new_data_offset' should be used |
712 | * in this stripe_head. | 848 | * in this stripe_head. |
713 | */ | 849 | */ |
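
lock_two_stripes() above always locks the lower-addressed stripe_head first and marks the second acquisition as nested for lockdep, which is what prevents an ABBA deadlock when two CPUs try to batch the same pair of stripes in opposite order. The generic form of that idiom is sketched below for reference; interrupt disabling is left out here, whereas the patch wraps the pair in local_irq_disable()/local_irq_enable().

/* Illustrative only: the address-ordered double-lock idiom that
 * lock_two_stripes() follows. */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
	if (a > b)
		swap(a, b);	/* impose a global locking order */
	spin_lock(a);
	spin_lock_nested(b, SINGLE_DEPTH_NESTING);
}
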
@@ -738,6 +874,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
738 | { | 874 | { |
739 | struct r5conf *conf = sh->raid_conf; | 875 | struct r5conf *conf = sh->raid_conf; |
740 | int i, disks = sh->disks; | 876 | int i, disks = sh->disks; |
877 | struct stripe_head *head_sh = sh; | ||
741 | 878 | ||
742 | might_sleep(); | 879 | might_sleep(); |
743 | 880 | ||
@@ -746,6 +883,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
746 | int replace_only = 0; | 883 | int replace_only = 0; |
747 | struct bio *bi, *rbi; | 884 | struct bio *bi, *rbi; |
748 | struct md_rdev *rdev, *rrdev = NULL; | 885 | struct md_rdev *rdev, *rrdev = NULL; |
886 | |||
887 | sh = head_sh; | ||
749 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | 888 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
750 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) | 889 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
751 | rw = WRITE_FUA; | 890 | rw = WRITE_FUA; |
@@ -764,6 +903,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
764 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) | 903 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) |
765 | rw |= REQ_SYNC; | 904 | rw |= REQ_SYNC; |
766 | 905 | ||
906 | again: | ||
767 | bi = &sh->dev[i].req; | 907 | bi = &sh->dev[i].req; |
768 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | 908 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ |
769 | 909 | ||
@@ -782,7 +922,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
782 | /* We raced and saw duplicates */ | 922 | /* We raced and saw duplicates */ |
783 | rrdev = NULL; | 923 | rrdev = NULL; |
784 | } else { | 924 | } else { |
785 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) | 925 | if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) |
786 | rdev = rrdev; | 926 | rdev = rrdev; |
787 | rrdev = NULL; | 927 | rrdev = NULL; |
788 | } | 928 | } |
@@ -853,13 +993,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
853 | __func__, (unsigned long long)sh->sector, | 993 | __func__, (unsigned long long)sh->sector, |
854 | bi->bi_rw, i); | 994 | bi->bi_rw, i); |
855 | atomic_inc(&sh->count); | 995 | atomic_inc(&sh->count); |
996 | if (sh != head_sh) | ||
997 | atomic_inc(&head_sh->count); | ||
856 | if (use_new_offset(conf, sh)) | 998 | if (use_new_offset(conf, sh)) |
857 | bi->bi_iter.bi_sector = (sh->sector | 999 | bi->bi_iter.bi_sector = (sh->sector |
858 | + rdev->new_data_offset); | 1000 | + rdev->new_data_offset); |
859 | else | 1001 | else |
860 | bi->bi_iter.bi_sector = (sh->sector | 1002 | bi->bi_iter.bi_sector = (sh->sector |
861 | + rdev->data_offset); | 1003 | + rdev->data_offset); |
862 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 1004 | if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) |
863 | bi->bi_rw |= REQ_NOMERGE; | 1005 | bi->bi_rw |= REQ_NOMERGE; |
864 | 1006 | ||
865 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) | 1007 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) |
@@ -903,6 +1045,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
903 | __func__, (unsigned long long)sh->sector, | 1045 | __func__, (unsigned long long)sh->sector, |
904 | rbi->bi_rw, i); | 1046 | rbi->bi_rw, i); |
905 | atomic_inc(&sh->count); | 1047 | atomic_inc(&sh->count); |
1048 | if (sh != head_sh) | ||
1049 | atomic_inc(&head_sh->count); | ||
906 | if (use_new_offset(conf, sh)) | 1050 | if (use_new_offset(conf, sh)) |
907 | rbi->bi_iter.bi_sector = (sh->sector | 1051 | rbi->bi_iter.bi_sector = (sh->sector |
908 | + rrdev->new_data_offset); | 1052 | + rrdev->new_data_offset); |
@@ -934,8 +1078,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
934 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 1078 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
935 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1079 | bi->bi_rw, i, (unsigned long long)sh->sector); |
936 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1080 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
1081 | if (sh->batch_head) | ||
1082 | set_bit(STRIPE_BATCH_ERR, | ||
1083 | &sh->batch_head->state); | ||
937 | set_bit(STRIPE_HANDLE, &sh->state); | 1084 | set_bit(STRIPE_HANDLE, &sh->state); |
938 | } | 1085 | } |
1086 | |||
1087 | if (!head_sh->batch_head) | ||
1088 | continue; | ||
1089 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
1090 | batch_list); | ||
1091 | if (sh != head_sh) | ||
1092 | goto again; | ||
939 | } | 1093 | } |
940 | } | 1094 | } |
941 | 1095 | ||
@@ -1051,6 +1205,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
1051 | struct async_submit_ctl submit; | 1205 | struct async_submit_ctl submit; |
1052 | int i; | 1206 | int i; |
1053 | 1207 | ||
1208 | BUG_ON(sh->batch_head); | ||
1054 | pr_debug("%s: stripe %llu\n", __func__, | 1209 | pr_debug("%s: stripe %llu\n", __func__, |
1055 | (unsigned long long)sh->sector); | 1210 | (unsigned long long)sh->sector); |
1056 | 1211 | ||
@@ -1109,16 +1264,28 @@ static void ops_complete_compute(void *stripe_head_ref) | |||
1109 | 1264 | ||
1110 | /* return a pointer to the address conversion region of the scribble buffer */ | 1265 | /* return a pointer to the address conversion region of the scribble buffer */ |
1111 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | 1266 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, |
1112 | struct raid5_percpu *percpu) | 1267 | struct raid5_percpu *percpu, int i) |
1113 | { | 1268 | { |
1114 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | 1269 | void *addr; |
1270 | |||
1271 | addr = flex_array_get(percpu->scribble, i); | ||
1272 | return addr + sizeof(struct page *) * (sh->disks + 2); | ||
1273 | } | ||
1274 | |||
1275 | /* return a pointer to the address conversion region of the scribble buffer */ | ||
1276 | static struct page **to_addr_page(struct raid5_percpu *percpu, int i) | ||
1277 | { | ||
1278 | void *addr; | ||
1279 | |||
1280 | addr = flex_array_get(percpu->scribble, i); | ||
1281 | return addr; | ||
1115 | } | 1282 | } |
1116 | 1283 | ||
1117 | static struct dma_async_tx_descriptor * | 1284 | static struct dma_async_tx_descriptor * |
1118 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | 1285 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) |
1119 | { | 1286 | { |
1120 | int disks = sh->disks; | 1287 | int disks = sh->disks; |
1121 | struct page **xor_srcs = percpu->scribble; | 1288 | struct page **xor_srcs = to_addr_page(percpu, 0); |
1122 | int target = sh->ops.target; | 1289 | int target = sh->ops.target; |
1123 | struct r5dev *tgt = &sh->dev[target]; | 1290 | struct r5dev *tgt = &sh->dev[target]; |
1124 | struct page *xor_dest = tgt->page; | 1291 | struct page *xor_dest = tgt->page; |
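
With the flex_array conversion above, each per-cpu scribble element packs a source-page list and an address-conversion area back to back, and the ops that walk a batch index one element per stripe (the i/j argument). The layout implied by to_addr_page()/to_addr_conv() is sketched below; the helper is illustrative and only mirrors the offset arithmetic in the hunk, while the actual allocation happens elsewhere in this patch.

/* Illustrative only: size of one scribble element as implied by the
 * offsets used in to_addr_page()/to_addr_conv() above --
 *   [ (disks + 2) x struct page * ][ (disks + 2) x addr_conv_t ]
 */
static size_t scribble_element_size(int disks)
{
	return (disks + 2) * (sizeof(struct page *) + sizeof(addr_conv_t));
}
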
@@ -1127,6 +1294,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1127 | struct async_submit_ctl submit; | 1294 | struct async_submit_ctl submit; |
1128 | int i; | 1295 | int i; |
1129 | 1296 | ||
1297 | BUG_ON(sh->batch_head); | ||
1298 | |||
1130 | pr_debug("%s: stripe %llu block: %d\n", | 1299 | pr_debug("%s: stripe %llu block: %d\n", |
1131 | __func__, (unsigned long long)sh->sector, target); | 1300 | __func__, (unsigned long long)sh->sector, target); |
1132 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 1301 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
@@ -1138,7 +1307,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1138 | atomic_inc(&sh->count); | 1307 | atomic_inc(&sh->count); |
1139 | 1308 | ||
1140 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, | 1309 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, |
1141 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | 1310 | ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); |
1142 | if (unlikely(count == 1)) | 1311 | if (unlikely(count == 1)) |
1143 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); | 1312 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
1144 | else | 1313 | else |
@@ -1156,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1156 | * destination buffer is recorded in srcs[count] and the Q destination | 1325 | * destination buffer is recorded in srcs[count] and the Q destination |
1157 | * is recorded in srcs[count+1]. | 1326 | * is recorded in srcs[count+1]. |
1158 | */ | 1327 | */ |
1159 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | 1328 | static int set_syndrome_sources(struct page **srcs, |
1329 | struct stripe_head *sh, | ||
1330 | int srctype) | ||
1160 | { | 1331 | { |
1161 | int disks = sh->disks; | 1332 | int disks = sh->disks; |
1162 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | 1333 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); |
@@ -1171,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | |||
1171 | i = d0_idx; | 1342 | i = d0_idx; |
1172 | do { | 1343 | do { |
1173 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | 1344 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); |
1345 | struct r5dev *dev = &sh->dev[i]; | ||
1174 | 1346 | ||
1175 | srcs[slot] = sh->dev[i].page; | 1347 | if (i == sh->qd_idx || i == sh->pd_idx || |
1348 | (srctype == SYNDROME_SRC_ALL) || | ||
1349 | (srctype == SYNDROME_SRC_WANT_DRAIN && | ||
1350 | test_bit(R5_Wantdrain, &dev->flags)) || | ||
1351 | (srctype == SYNDROME_SRC_WRITTEN && | ||
1352 | dev->written)) | ||
1353 | srcs[slot] = sh->dev[i].page; | ||
1176 | i = raid6_next_disk(i, disks); | 1354 | i = raid6_next_disk(i, disks); |
1177 | } while (i != d0_idx); | 1355 | } while (i != d0_idx); |
1178 | 1356 | ||
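
set_syndrome_sources() now takes a srctype selector so the RAID6 paths can hand the syndrome engine different subsets of the stripe's pages. The three selector names used at the call sites in this file come from drivers/md/raid5.h, which this patch also changes but which is not shown in this section; the listing below is only a sketch of that declaration, reconstructed from the names used here.

/* Sketch of the selector values referenced above; the authoritative
 * definition is the one added to drivers/md/raid5.h by this patch. */
enum {
	SYNDROME_SRC_ALL,	 /* every data block: checks and full rebuilds */
	SYNDROME_SRC_WANT_DRAIN, /* blocks flagged R5_Wantdrain: rmw prexor step */
	SYNDROME_SRC_WRITTEN,	 /* blocks just written: final reconstruct step */
};
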
@@ -1183,7 +1361,7 @@ static struct dma_async_tx_descriptor * | |||
1183 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | 1361 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) |
1184 | { | 1362 | { |
1185 | int disks = sh->disks; | 1363 | int disks = sh->disks; |
1186 | struct page **blocks = percpu->scribble; | 1364 | struct page **blocks = to_addr_page(percpu, 0); |
1187 | int target; | 1365 | int target; |
1188 | int qd_idx = sh->qd_idx; | 1366 | int qd_idx = sh->qd_idx; |
1189 | struct dma_async_tx_descriptor *tx; | 1367 | struct dma_async_tx_descriptor *tx; |
@@ -1193,6 +1371,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1193 | int i; | 1371 | int i; |
1194 | int count; | 1372 | int count; |
1195 | 1373 | ||
1374 | BUG_ON(sh->batch_head); | ||
1196 | if (sh->ops.target < 0) | 1375 | if (sh->ops.target < 0) |
1197 | target = sh->ops.target2; | 1376 | target = sh->ops.target2; |
1198 | else if (sh->ops.target2 < 0) | 1377 | else if (sh->ops.target2 < 0) |
@@ -1211,12 +1390,12 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1211 | atomic_inc(&sh->count); | 1390 | atomic_inc(&sh->count); |
1212 | 1391 | ||
1213 | if (target == qd_idx) { | 1392 | if (target == qd_idx) { |
1214 | count = set_syndrome_sources(blocks, sh); | 1393 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); |
1215 | blocks[count] = NULL; /* regenerating p is not necessary */ | 1394 | blocks[count] = NULL; /* regenerating p is not necessary */ |
1216 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | 1395 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ |
1217 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1396 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
1218 | ops_complete_compute, sh, | 1397 | ops_complete_compute, sh, |
1219 | to_addr_conv(sh, percpu)); | 1398 | to_addr_conv(sh, percpu, 0)); |
1220 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | 1399 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); |
1221 | } else { | 1400 | } else { |
1222 | /* Compute any data- or p-drive using XOR */ | 1401 | /* Compute any data- or p-drive using XOR */ |
@@ -1229,7 +1408,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1229 | 1408 | ||
1230 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | 1409 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, |
1231 | NULL, ops_complete_compute, sh, | 1410 | NULL, ops_complete_compute, sh, |
1232 | to_addr_conv(sh, percpu)); | 1411 | to_addr_conv(sh, percpu, 0)); |
1233 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | 1412 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); |
1234 | } | 1413 | } |
1235 | 1414 | ||
@@ -1248,9 +1427,10 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1248 | struct r5dev *tgt = &sh->dev[target]; | 1427 | struct r5dev *tgt = &sh->dev[target]; |
1249 | struct r5dev *tgt2 = &sh->dev[target2]; | 1428 | struct r5dev *tgt2 = &sh->dev[target2]; |
1250 | struct dma_async_tx_descriptor *tx; | 1429 | struct dma_async_tx_descriptor *tx; |
1251 | struct page **blocks = percpu->scribble; | 1430 | struct page **blocks = to_addr_page(percpu, 0); |
1252 | struct async_submit_ctl submit; | 1431 | struct async_submit_ctl submit; |
1253 | 1432 | ||
1433 | BUG_ON(sh->batch_head); | ||
1254 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | 1434 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", |
1255 | __func__, (unsigned long long)sh->sector, target, target2); | 1435 | __func__, (unsigned long long)sh->sector, target, target2); |
1256 | BUG_ON(target < 0 || target2 < 0); | 1436 | BUG_ON(target < 0 || target2 < 0); |
@@ -1290,7 +1470,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1290 | /* Missing P+Q, just recompute */ | 1470 | /* Missing P+Q, just recompute */ |
1291 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1471 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
1292 | ops_complete_compute, sh, | 1472 | ops_complete_compute, sh, |
1293 | to_addr_conv(sh, percpu)); | 1473 | to_addr_conv(sh, percpu, 0)); |
1294 | return async_gen_syndrome(blocks, 0, syndrome_disks+2, | 1474 | return async_gen_syndrome(blocks, 0, syndrome_disks+2, |
1295 | STRIPE_SIZE, &submit); | 1475 | STRIPE_SIZE, &submit); |
1296 | } else { | 1476 | } else { |
@@ -1314,21 +1494,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1314 | init_async_submit(&submit, | 1494 | init_async_submit(&submit, |
1315 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | 1495 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, |
1316 | NULL, NULL, NULL, | 1496 | NULL, NULL, NULL, |
1317 | to_addr_conv(sh, percpu)); | 1497 | to_addr_conv(sh, percpu, 0)); |
1318 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | 1498 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, |
1319 | &submit); | 1499 | &submit); |
1320 | 1500 | ||
1321 | count = set_syndrome_sources(blocks, sh); | 1501 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); |
1322 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, | 1502 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, |
1323 | ops_complete_compute, sh, | 1503 | ops_complete_compute, sh, |
1324 | to_addr_conv(sh, percpu)); | 1504 | to_addr_conv(sh, percpu, 0)); |
1325 | return async_gen_syndrome(blocks, 0, count+2, | 1505 | return async_gen_syndrome(blocks, 0, count+2, |
1326 | STRIPE_SIZE, &submit); | 1506 | STRIPE_SIZE, &submit); |
1327 | } | 1507 | } |
1328 | } else { | 1508 | } else { |
1329 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1509 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
1330 | ops_complete_compute, sh, | 1510 | ops_complete_compute, sh, |
1331 | to_addr_conv(sh, percpu)); | 1511 | to_addr_conv(sh, percpu, 0)); |
1332 | if (failb == syndrome_disks) { | 1512 | if (failb == syndrome_disks) { |
1333 | /* We're missing D+P. */ | 1513 | /* We're missing D+P. */ |
1334 | return async_raid6_datap_recov(syndrome_disks+2, | 1514 | return async_raid6_datap_recov(syndrome_disks+2, |
@@ -1352,17 +1532,18 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
1352 | } | 1532 | } |
1353 | 1533 | ||
1354 | static struct dma_async_tx_descriptor * | 1534 | static struct dma_async_tx_descriptor * |
1355 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, | 1535 | ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, |
1356 | struct dma_async_tx_descriptor *tx) | 1536 | struct dma_async_tx_descriptor *tx) |
1357 | { | 1537 | { |
1358 | int disks = sh->disks; | 1538 | int disks = sh->disks; |
1359 | struct page **xor_srcs = percpu->scribble; | 1539 | struct page **xor_srcs = to_addr_page(percpu, 0); |
1360 | int count = 0, pd_idx = sh->pd_idx, i; | 1540 | int count = 0, pd_idx = sh->pd_idx, i; |
1361 | struct async_submit_ctl submit; | 1541 | struct async_submit_ctl submit; |
1362 | 1542 | ||
1363 | /* existing parity data subtracted */ | 1543 | /* existing parity data subtracted */ |
1364 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1544 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
1365 | 1545 | ||
1546 | BUG_ON(sh->batch_head); | ||
1366 | pr_debug("%s: stripe %llu\n", __func__, | 1547 | pr_debug("%s: stripe %llu\n", __func__, |
1367 | (unsigned long long)sh->sector); | 1548 | (unsigned long long)sh->sector); |
1368 | 1549 | ||
@@ -1374,31 +1555,56 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1374 | } | 1555 | } |
1375 | 1556 | ||
1376 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, | 1557 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, |
1377 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); | 1558 | ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); |
1378 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); | 1559 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
1379 | 1560 | ||
1380 | return tx; | 1561 | return tx; |
1381 | } | 1562 | } |
1382 | 1563 | ||
1383 | static struct dma_async_tx_descriptor * | 1564 | static struct dma_async_tx_descriptor * |
1565 | ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
1566 | struct dma_async_tx_descriptor *tx) | ||
1567 | { | ||
1568 | struct page **blocks = to_addr_page(percpu, 0); | ||
1569 | int count; | ||
1570 | struct async_submit_ctl submit; | ||
1571 | |||
1572 | pr_debug("%s: stripe %llu\n", __func__, | ||
1573 | (unsigned long long)sh->sector); | ||
1574 | |||
1575 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); | ||
1576 | |||
1577 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, | ||
1578 | ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); | ||
1579 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
1580 | |||
1581 | return tx; | ||
1582 | } | ||
1583 | |||
1584 | static struct dma_async_tx_descriptor * | ||
1384 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1585 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
1385 | { | 1586 | { |
1386 | int disks = sh->disks; | 1587 | int disks = sh->disks; |
1387 | int i; | 1588 | int i; |
1589 | struct stripe_head *head_sh = sh; | ||
1388 | 1590 | ||
1389 | pr_debug("%s: stripe %llu\n", __func__, | 1591 | pr_debug("%s: stripe %llu\n", __func__, |
1390 | (unsigned long long)sh->sector); | 1592 | (unsigned long long)sh->sector); |
1391 | 1593 | ||
1392 | for (i = disks; i--; ) { | 1594 | for (i = disks; i--; ) { |
1393 | struct r5dev *dev = &sh->dev[i]; | 1595 | struct r5dev *dev; |
1394 | struct bio *chosen; | 1596 | struct bio *chosen; |
1395 | 1597 | ||
1396 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1598 | sh = head_sh; |
1599 | if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { | ||
1397 | struct bio *wbi; | 1600 | struct bio *wbi; |
1398 | 1601 | ||
1602 | again: | ||
1603 | dev = &sh->dev[i]; | ||
1399 | spin_lock_irq(&sh->stripe_lock); | 1604 | spin_lock_irq(&sh->stripe_lock); |
1400 | chosen = dev->towrite; | 1605 | chosen = dev->towrite; |
1401 | dev->towrite = NULL; | 1606 | dev->towrite = NULL; |
1607 | sh->overwrite_disks = 0; | ||
1402 | BUG_ON(dev->written); | 1608 | BUG_ON(dev->written); |
1403 | wbi = dev->written = chosen; | 1609 | wbi = dev->written = chosen; |
1404 | spin_unlock_irq(&sh->stripe_lock); | 1610 | spin_unlock_irq(&sh->stripe_lock); |
@@ -1423,6 +1629,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1423 | } | 1629 | } |
1424 | wbi = r5_next_bio(wbi, dev->sector); | 1630 | wbi = r5_next_bio(wbi, dev->sector); |
1425 | } | 1631 | } |
1632 | |||
1633 | if (head_sh->batch_head) { | ||
1634 | sh = list_first_entry(&sh->batch_list, | ||
1635 | struct stripe_head, | ||
1636 | batch_list); | ||
1637 | if (sh == head_sh) | ||
1638 | continue; | ||
1639 | goto again; | ||
1640 | } | ||
1426 | } | 1641 | } |
1427 | } | 1642 | } |
1428 | 1643 | ||
@@ -1478,12 +1693,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1478 | struct dma_async_tx_descriptor *tx) | 1693 | struct dma_async_tx_descriptor *tx) |
1479 | { | 1694 | { |
1480 | int disks = sh->disks; | 1695 | int disks = sh->disks; |
1481 | struct page **xor_srcs = percpu->scribble; | 1696 | struct page **xor_srcs; |
1482 | struct async_submit_ctl submit; | 1697 | struct async_submit_ctl submit; |
1483 | int count = 0, pd_idx = sh->pd_idx, i; | 1698 | int count, pd_idx = sh->pd_idx, i; |
1484 | struct page *xor_dest; | 1699 | struct page *xor_dest; |
1485 | int prexor = 0; | 1700 | int prexor = 0; |
1486 | unsigned long flags; | 1701 | unsigned long flags; |
1702 | int j = 0; | ||
1703 | struct stripe_head *head_sh = sh; | ||
1704 | int last_stripe; | ||
1487 | 1705 | ||
1488 | pr_debug("%s: stripe %llu\n", __func__, | 1706 | pr_debug("%s: stripe %llu\n", __func__, |
1489 | (unsigned long long)sh->sector); | 1707 | (unsigned long long)sh->sector); |
@@ -1500,15 +1718,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1500 | ops_complete_reconstruct(sh); | 1718 | ops_complete_reconstruct(sh); |
1501 | return; | 1719 | return; |
1502 | } | 1720 | } |
1721 | again: | ||
1722 | count = 0; | ||
1723 | xor_srcs = to_addr_page(percpu, j); | ||
1503 | /* check if prexor is active which means only process blocks | 1724 | /* check if prexor is active which means only process blocks |
1504 | * that are part of a read-modify-write (written) | 1725 | * that are part of a read-modify-write (written) |
1505 | */ | 1726 | */ |
1506 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { | 1727 | if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
1507 | prexor = 1; | 1728 | prexor = 1; |
1508 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1729 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
1509 | for (i = disks; i--; ) { | 1730 | for (i = disks; i--; ) { |
1510 | struct r5dev *dev = &sh->dev[i]; | 1731 | struct r5dev *dev = &sh->dev[i]; |
1511 | if (dev->written) | 1732 | if (head_sh->dev[i].written) |
1512 | xor_srcs[count++] = dev->page; | 1733 | xor_srcs[count++] = dev->page; |
1513 | } | 1734 | } |
1514 | } else { | 1735 | } else { |
@@ -1525,17 +1746,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1525 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1746 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
1526 | * for the synchronous xor case | 1747 | * for the synchronous xor case |
1527 | */ | 1748 | */ |
1528 | flags = ASYNC_TX_ACK | | 1749 | last_stripe = !head_sh->batch_head || |
1529 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1750 | list_first_entry(&sh->batch_list, |
1530 | 1751 | struct stripe_head, batch_list) == head_sh; | |
1531 | atomic_inc(&sh->count); | 1752 | if (last_stripe) { |
1753 | flags = ASYNC_TX_ACK | | ||
1754 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | ||
1755 | |||
1756 | atomic_inc(&head_sh->count); | ||
1757 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, | ||
1758 | to_addr_conv(sh, percpu, j)); | ||
1759 | } else { | ||
1760 | flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; | ||
1761 | init_async_submit(&submit, flags, tx, NULL, NULL, | ||
1762 | to_addr_conv(sh, percpu, j)); | ||
1763 | } | ||
1532 | 1764 | ||
1533 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, | ||
1534 | to_addr_conv(sh, percpu)); | ||
1535 | if (unlikely(count == 1)) | 1765 | if (unlikely(count == 1)) |
1536 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); | 1766 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
1537 | else | 1767 | else |
1538 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); | 1768 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
1769 | if (!last_stripe) { | ||
1770 | j++; | ||
1771 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
1772 | batch_list); | ||
1773 | goto again; | ||
1774 | } | ||
1539 | } | 1775 | } |
1540 | 1776 | ||
1541 | static void | 1777 | static void |
@@ -1543,8 +1779,12 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1543 | struct dma_async_tx_descriptor *tx) | 1779 | struct dma_async_tx_descriptor *tx) |
1544 | { | 1780 | { |
1545 | struct async_submit_ctl submit; | 1781 | struct async_submit_ctl submit; |
1546 | struct page **blocks = percpu->scribble; | 1782 | struct page **blocks; |
1547 | int count, i; | 1783 | int count, i, j = 0; |
1784 | struct stripe_head *head_sh = sh; | ||
1785 | int last_stripe; | ||
1786 | int synflags; | ||
1787 | unsigned long txflags; | ||
1548 | 1788 | ||
1549 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | 1789 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); |
1550 | 1790 | ||
@@ -1562,13 +1802,36 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1562 | return; | 1802 | return; |
1563 | } | 1803 | } |
1564 | 1804 | ||
1565 | count = set_syndrome_sources(blocks, sh); | 1805 | again: |
1806 | blocks = to_addr_page(percpu, j); | ||
1566 | 1807 | ||
1567 | atomic_inc(&sh->count); | 1808 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
1809 | synflags = SYNDROME_SRC_WRITTEN; | ||
1810 | txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; | ||
1811 | } else { | ||
1812 | synflags = SYNDROME_SRC_ALL; | ||
1813 | txflags = ASYNC_TX_ACK; | ||
1814 | } | ||
1815 | |||
1816 | count = set_syndrome_sources(blocks, sh, synflags); | ||
1817 | last_stripe = !head_sh->batch_head || | ||
1818 | list_first_entry(&sh->batch_list, | ||
1819 | struct stripe_head, batch_list) == head_sh; | ||
1568 | 1820 | ||
1569 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | 1821 | if (last_stripe) { |
1570 | sh, to_addr_conv(sh, percpu)); | 1822 | atomic_inc(&head_sh->count); |
1823 | init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, | ||
1824 | head_sh, to_addr_conv(sh, percpu, j)); | ||
1825 | } else | ||
1826 | init_async_submit(&submit, 0, tx, NULL, NULL, | ||
1827 | to_addr_conv(sh, percpu, j)); | ||
1571 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | 1828 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); |
1829 | if (!last_stripe) { | ||
1830 | j++; | ||
1831 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
1832 | batch_list); | ||
1833 | goto again; | ||
1834 | } | ||
1572 | } | 1835 | } |
1573 | 1836 | ||
1574 | static void ops_complete_check(void *stripe_head_ref) | 1837 | static void ops_complete_check(void *stripe_head_ref) |
@@ -1589,7 +1852,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1589 | int pd_idx = sh->pd_idx; | 1852 | int pd_idx = sh->pd_idx; |
1590 | int qd_idx = sh->qd_idx; | 1853 | int qd_idx = sh->qd_idx; |
1591 | struct page *xor_dest; | 1854 | struct page *xor_dest; |
1592 | struct page **xor_srcs = percpu->scribble; | 1855 | struct page **xor_srcs = to_addr_page(percpu, 0); |
1593 | struct dma_async_tx_descriptor *tx; | 1856 | struct dma_async_tx_descriptor *tx; |
1594 | struct async_submit_ctl submit; | 1857 | struct async_submit_ctl submit; |
1595 | int count; | 1858 | int count; |
@@ -1598,6 +1861,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1598 | pr_debug("%s: stripe %llu\n", __func__, | 1861 | pr_debug("%s: stripe %llu\n", __func__, |
1599 | (unsigned long long)sh->sector); | 1862 | (unsigned long long)sh->sector); |
1600 | 1863 | ||
1864 | BUG_ON(sh->batch_head); | ||
1601 | count = 0; | 1865 | count = 0; |
1602 | xor_dest = sh->dev[pd_idx].page; | 1866 | xor_dest = sh->dev[pd_idx].page; |
1603 | xor_srcs[count++] = xor_dest; | 1867 | xor_srcs[count++] = xor_dest; |
@@ -1608,7 +1872,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1608 | } | 1872 | } |
1609 | 1873 | ||
1610 | init_async_submit(&submit, 0, NULL, NULL, NULL, | 1874 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
1611 | to_addr_conv(sh, percpu)); | 1875 | to_addr_conv(sh, percpu, 0)); |
1612 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1876 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
1613 | &sh->ops.zero_sum_result, &submit); | 1877 | &sh->ops.zero_sum_result, &submit); |
1614 | 1878 | ||
@@ -1619,20 +1883,21 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
1619 | 1883 | ||
1620 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | 1884 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) |
1621 | { | 1885 | { |
1622 | struct page **srcs = percpu->scribble; | 1886 | struct page **srcs = to_addr_page(percpu, 0); |
1623 | struct async_submit_ctl submit; | 1887 | struct async_submit_ctl submit; |
1624 | int count; | 1888 | int count; |
1625 | 1889 | ||
1626 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | 1890 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, |
1627 | (unsigned long long)sh->sector, checkp); | 1891 | (unsigned long long)sh->sector, checkp); |
1628 | 1892 | ||
1629 | count = set_syndrome_sources(srcs, sh); | 1893 | BUG_ON(sh->batch_head); |
1894 | count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); | ||
1630 | if (!checkp) | 1895 | if (!checkp) |
1631 | srcs[count] = NULL; | 1896 | srcs[count] = NULL; |
1632 | 1897 | ||
1633 | atomic_inc(&sh->count); | 1898 | atomic_inc(&sh->count); |
1634 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, | 1899 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
1635 | sh, to_addr_conv(sh, percpu)); | 1900 | sh, to_addr_conv(sh, percpu, 0)); |
1636 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | 1901 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, |
1637 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | 1902 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); |
1638 | } | 1903 | } |
@@ -1667,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1667 | async_tx_ack(tx); | 1932 | async_tx_ack(tx); |
1668 | } | 1933 | } |
1669 | 1934 | ||
1670 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1935 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { |
1671 | tx = ops_run_prexor(sh, percpu, tx); | 1936 | if (level < 6) |
1937 | tx = ops_run_prexor5(sh, percpu, tx); | ||
1938 | else | ||
1939 | tx = ops_run_prexor6(sh, percpu, tx); | ||
1940 | } | ||
1672 | 1941 | ||
1673 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1942 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
1674 | tx = ops_run_biodrain(sh, tx); | 1943 | tx = ops_run_biodrain(sh, tx); |
@@ -1693,7 +1962,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1693 | BUG(); | 1962 | BUG(); |
1694 | } | 1963 | } |
1695 | 1964 | ||
1696 | if (overlap_clear) | 1965 | if (overlap_clear && !sh->batch_head) |
1697 | for (i = disks; i--; ) { | 1966 | for (i = disks; i--; ) { |
1698 | struct r5dev *dev = &sh->dev[i]; | 1967 | struct r5dev *dev = &sh->dev[i]; |
1699 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1968 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
@@ -1702,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1702 | put_cpu(); | 1971 | put_cpu(); |
1703 | } | 1972 | } |
1704 | 1973 | ||
1705 | static int grow_one_stripe(struct r5conf *conf, int hash) | 1974 | static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) |
1706 | { | 1975 | { |
1707 | struct stripe_head *sh; | 1976 | struct stripe_head *sh; |
1708 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | 1977 | sh = kmem_cache_zalloc(conf->slab_cache, gfp); |
1709 | if (!sh) | 1978 | if (!sh) |
1710 | return 0; | 1979 | return 0; |
1711 | 1980 | ||
@@ -1713,17 +1982,23 @@ static int grow_one_stripe(struct r5conf *conf, int hash) | |||
1713 | 1982 | ||
1714 | spin_lock_init(&sh->stripe_lock); | 1983 | spin_lock_init(&sh->stripe_lock); |
1715 | 1984 | ||
1716 | if (grow_buffers(sh)) { | 1985 | if (grow_buffers(sh, gfp)) { |
1717 | shrink_buffers(sh); | 1986 | shrink_buffers(sh); |
1718 | kmem_cache_free(conf->slab_cache, sh); | 1987 | kmem_cache_free(conf->slab_cache, sh); |
1719 | return 0; | 1988 | return 0; |
1720 | } | 1989 | } |
1721 | sh->hash_lock_index = hash; | 1990 | sh->hash_lock_index = |
1991 | conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||
1722 | /* we just created an active stripe so... */ | 1992 | /* we just created an active stripe so... */ |
1723 | atomic_set(&sh->count, 1); | 1993 | atomic_set(&sh->count, 1); |
1724 | atomic_inc(&conf->active_stripes); | 1994 | atomic_inc(&conf->active_stripes); |
1725 | INIT_LIST_HEAD(&sh->lru); | 1995 | INIT_LIST_HEAD(&sh->lru); |
1996 | |||
1997 | spin_lock_init(&sh->batch_lock); | ||
1998 | INIT_LIST_HEAD(&sh->batch_list); | ||
1999 | sh->batch_head = NULL; | ||
1726 | release_stripe(sh); | 2000 | release_stripe(sh); |
2001 | conf->max_nr_stripes++; | ||
1727 | return 1; | 2002 | return 1; |
1728 | } | 2003 | } |
1729 | 2004 | ||
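grow_one_stripe() now derives the hash lock index from conf->max_nr_stripes at the moment of allocation, and the counter is bumped just before returning, so successive allocations still spread evenly across the hash locks. A minimal user-space sketch of that distribution, assuming NR_STRIPE_HASH_LOCKS is 8 as in raid5.h:

#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
	int max_nr_stripes = 0;
	int counts[NR_STRIPE_HASH_LOCKS] = { 0 };
	int i;

	/* mimic repeated grow_one_stripe() calls */
	for (i = 0; i < 256; i++) {
		int hash = max_nr_stripes % NR_STRIPE_HASH_LOCKS;

		counts[hash]++;		/* sh->hash_lock_index = hash */
		max_nr_stripes++;	/* bumped just before "return 1" */
	}
	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		printf("hash %d: %d stripes\n", i, counts[i]);
	return 0;
}

Every lock index ends up with 32 stripes here, matching what the old per-caller round-robin in grow_stripes() achieved.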
@@ -1731,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1731 | { | 2006 | { |
1732 | struct kmem_cache *sc; | 2007 | struct kmem_cache *sc; |
1733 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | 2008 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
1734 | int hash; | ||
1735 | 2009 | ||
1736 | if (conf->mddev->gendisk) | 2010 | if (conf->mddev->gendisk) |
1737 | sprintf(conf->cache_name[0], | 2011 | sprintf(conf->cache_name[0], |
@@ -1749,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1749 | return 1; | 2023 | return 1; |
1750 | conf->slab_cache = sc; | 2024 | conf->slab_cache = sc; |
1751 | conf->pool_size = devs; | 2025 | conf->pool_size = devs; |
1752 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | 2026 | while (num--) |
1753 | while (num--) { | 2027 | if (!grow_one_stripe(conf, GFP_KERNEL)) |
1754 | if (!grow_one_stripe(conf, hash)) | ||
1755 | return 1; | 2028 | return 1; |
1756 | conf->max_nr_stripes++; | 2029 | |
1757 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
1758 | } | ||
1759 | return 0; | 2030 | return 0; |
1760 | } | 2031 | } |
1761 | 2032 | ||
@@ -1772,13 +2043,21 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1772 | * calculate over all devices (not just the data blocks), using zeros in place | 2043 | * calculate over all devices (not just the data blocks), using zeros in place |
1773 | * of the P and Q blocks. | 2044 | * of the P and Q blocks. |
1774 | */ | 2045 | */ |
1775 | static size_t scribble_len(int num) | 2046 | static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) |
1776 | { | 2047 | { |
2048 | struct flex_array *ret; | ||
1777 | size_t len; | 2049 | size_t len; |
1778 | 2050 | ||
1779 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | 2051 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); |
1780 | 2052 | ret = flex_array_alloc(len, cnt, flags); | |
1781 | return len; | 2053 | if (!ret) |
2054 | return NULL; | ||
2055 | /* always prealloc all elements, so no locking is required */ | ||
2056 | if (flex_array_prealloc(ret, 0, cnt, flags)) { | ||
2057 | flex_array_free(ret); | ||
2058 | return NULL; | ||
2059 | } | ||
2060 | return ret; | ||
1782 | } | 2061 | } |
1783 | 2062 | ||
1784 | static int resize_stripes(struct r5conf *conf, int newsize) | 2063 | static int resize_stripes(struct r5conf *conf, int newsize) |
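scribble_alloc() keeps the old per-stripe element layout of (num + 2) page pointers for the source/destination list followed by (num + 2) addr_conv_t slots, but now allocates one such element per stripe that fits in a chunk (chunk_sectors / STRIPE_SECTORS), using a fully preallocated flex_array so later lookups need no locking. A rough, self-contained sketch of the sizing arithmetic (the concrete numbers are only examples):

#include <stdio.h>
#include <stddef.h>

typedef void *addr_conv_t;	/* stand-in for the dmaengine type */
struct page;			/* opaque here, used as a pointer only */

static size_t scribble_element_len(int num)
{
	return sizeof(struct page *) * (num + 2) +
	       sizeof(addr_conv_t) * (num + 2);
}

int main(void)
{
	int raid_disks = 8;		/* hypothetical array */
	int chunk_sectors = 1024;	/* 512 KiB chunks */
	int stripe_sectors = 8;		/* STRIPE_SIZE >> 9 with 4 KiB pages */
	int elements = chunk_sectors / stripe_sectors;

	printf("%zu bytes per element, %d elements per CPU\n",
	       scribble_element_len(raid_disks), elements);
	return 0;
}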
@@ -1896,16 +2175,16 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1896 | err = -ENOMEM; | 2175 | err = -ENOMEM; |
1897 | 2176 | ||
1898 | get_online_cpus(); | 2177 | get_online_cpus(); |
1899 | conf->scribble_len = scribble_len(newsize); | ||
1900 | for_each_present_cpu(cpu) { | 2178 | for_each_present_cpu(cpu) { |
1901 | struct raid5_percpu *percpu; | 2179 | struct raid5_percpu *percpu; |
1902 | void *scribble; | 2180 | struct flex_array *scribble; |
1903 | 2181 | ||
1904 | percpu = per_cpu_ptr(conf->percpu, cpu); | 2182 | percpu = per_cpu_ptr(conf->percpu, cpu); |
1905 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | 2183 | scribble = scribble_alloc(newsize, conf->chunk_sectors / |
2184 | STRIPE_SECTORS, GFP_NOIO); | ||
1906 | 2185 | ||
1907 | if (scribble) { | 2186 | if (scribble) { |
1908 | kfree(percpu->scribble); | 2187 | flex_array_free(percpu->scribble); |
1909 | percpu->scribble = scribble; | 2188 | percpu->scribble = scribble; |
1910 | } else { | 2189 | } else { |
1911 | err = -ENOMEM; | 2190 | err = -ENOMEM; |
@@ -1937,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1937 | return err; | 2216 | return err; |
1938 | } | 2217 | } |
1939 | 2218 | ||
1940 | static int drop_one_stripe(struct r5conf *conf, int hash) | 2219 | static int drop_one_stripe(struct r5conf *conf) |
1941 | { | 2220 | { |
1942 | struct stripe_head *sh; | 2221 | struct stripe_head *sh; |
2222 | int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||
1943 | 2223 | ||
1944 | spin_lock_irq(conf->hash_locks + hash); | 2224 | spin_lock_irq(conf->hash_locks + hash); |
1945 | sh = get_free_stripe(conf, hash); | 2225 | sh = get_free_stripe(conf, hash); |
@@ -1950,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash) | |||
1950 | shrink_buffers(sh); | 2230 | shrink_buffers(sh); |
1951 | kmem_cache_free(conf->slab_cache, sh); | 2231 | kmem_cache_free(conf->slab_cache, sh); |
1952 | atomic_dec(&conf->active_stripes); | 2232 | atomic_dec(&conf->active_stripes); |
2233 | conf->max_nr_stripes--; | ||
1953 | return 1; | 2234 | return 1; |
1954 | } | 2235 | } |
1955 | 2236 | ||
1956 | static void shrink_stripes(struct r5conf *conf) | 2237 | static void shrink_stripes(struct r5conf *conf) |
1957 | { | 2238 | { |
1958 | int hash; | 2239 | while (conf->max_nr_stripes && |
1959 | for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) | 2240 | drop_one_stripe(conf)) |
1960 | while (drop_one_stripe(conf, hash)) | 2241 | ; |
1961 | ; | ||
1962 | 2242 | ||
1963 | if (conf->slab_cache) | 2243 | if (conf->slab_cache) |
1964 | kmem_cache_destroy(conf->slab_cache); | 2244 | kmem_cache_destroy(conf->slab_cache); |
@@ -2154,10 +2434,16 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
2154 | } | 2434 | } |
2155 | rdev_dec_pending(rdev, conf->mddev); | 2435 | rdev_dec_pending(rdev, conf->mddev); |
2156 | 2436 | ||
2437 | if (sh->batch_head && !uptodate) | ||
2438 | set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); | ||
2439 | |||
2157 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) | 2440 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
2158 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 2441 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
2159 | set_bit(STRIPE_HANDLE, &sh->state); | 2442 | set_bit(STRIPE_HANDLE, &sh->state); |
2160 | release_stripe(sh); | 2443 | release_stripe(sh); |
2444 | |||
2445 | if (sh->batch_head && sh != sh->batch_head) | ||
2446 | release_stripe(sh->batch_head); | ||
2161 | } | 2447 | } |
2162 | 2448 | ||
2163 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 2449 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
@@ -2535,7 +2821,7 @@ static void | |||
2535 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | 2821 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
2536 | int rcw, int expand) | 2822 | int rcw, int expand) |
2537 | { | 2823 | { |
2538 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 2824 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; |
2539 | struct r5conf *conf = sh->raid_conf; | 2825 | struct r5conf *conf = sh->raid_conf; |
2540 | int level = conf->level; | 2826 | int level = conf->level; |
2541 | 2827 | ||
@@ -2571,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
2571 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 2857 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
2572 | atomic_inc(&conf->pending_full_writes); | 2858 | atomic_inc(&conf->pending_full_writes); |
2573 | } else { | 2859 | } else { |
2574 | BUG_ON(level == 6); | ||
2575 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 2860 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
2576 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 2861 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
2862 | BUG_ON(level == 6 && | ||
2863 | (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || | ||
2864 | test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); | ||
2577 | 2865 | ||
2578 | for (i = disks; i--; ) { | 2866 | for (i = disks; i--; ) { |
2579 | struct r5dev *dev = &sh->dev[i]; | 2867 | struct r5dev *dev = &sh->dev[i]; |
2580 | if (i == pd_idx) | 2868 | if (i == pd_idx || i == qd_idx) |
2581 | continue; | 2869 | continue; |
2582 | 2870 | ||
2583 | if (dev->towrite && | 2871 | if (dev->towrite && |
@@ -2624,7 +2912,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
2624 | * toread/towrite point to the first in a chain. | 2912 | * toread/towrite point to the first in a chain. |
2625 | * The bi_next chain must be in order. | 2913 | * The bi_next chain must be in order. |
2626 | */ | 2914 | */ |
2627 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | 2915 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, |
2916 | int forwrite, int previous) | ||
2628 | { | 2917 | { |
2629 | struct bio **bip; | 2918 | struct bio **bip; |
2630 | struct r5conf *conf = sh->raid_conf; | 2919 | struct r5conf *conf = sh->raid_conf; |
@@ -2643,6 +2932,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2643 | * protect it. | 2932 | * protect it. |
2644 | */ | 2933 | */ |
2645 | spin_lock_irq(&sh->stripe_lock); | 2934 | spin_lock_irq(&sh->stripe_lock); |
2935 | /* Don't allow new IO to be added to stripes in a batch list */ | ||
2936 | if (sh->batch_head) | ||
2937 | goto overlap; | ||
2646 | if (forwrite) { | 2938 | if (forwrite) { |
2647 | bip = &sh->dev[dd_idx].towrite; | 2939 | bip = &sh->dev[dd_idx].towrite; |
2648 | if (*bip == NULL) | 2940 | if (*bip == NULL) |
@@ -2657,6 +2949,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2657 | if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) | 2949 | if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) |
2658 | goto overlap; | 2950 | goto overlap; |
2659 | 2951 | ||
2952 | if (!forwrite || previous) | ||
2953 | clear_bit(STRIPE_BATCH_READY, &sh->state); | ||
2954 | |||
2660 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); | 2955 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); |
2661 | if (*bip) | 2956 | if (*bip) |
2662 | bi->bi_next = *bip; | 2957 | bi->bi_next = *bip; |
@@ -2674,7 +2969,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2674 | sector = bio_end_sector(bi); | 2969 | sector = bio_end_sector(bi); |
2675 | } | 2970 | } |
2676 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2971 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
2677 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2972 | if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) |
2973 | sh->overwrite_disks++; | ||
2678 | } | 2974 | } |
2679 | 2975 | ||
2680 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2976 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
@@ -2688,6 +2984,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2688 | sh->bm_seq = conf->seq_flush+1; | 2984 | sh->bm_seq = conf->seq_flush+1; |
2689 | set_bit(STRIPE_BIT_DELAY, &sh->state); | 2985 | set_bit(STRIPE_BIT_DELAY, &sh->state); |
2690 | } | 2986 | } |
2987 | |||
2988 | if (stripe_can_batch(sh)) | ||
2989 | stripe_add_to_batch_list(conf, sh); | ||
2691 | return 1; | 2990 | return 1; |
2692 | 2991 | ||
2693 | overlap: | 2992 | overlap: |
@@ -2720,6 +3019,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2720 | struct bio **return_bi) | 3019 | struct bio **return_bi) |
2721 | { | 3020 | { |
2722 | int i; | 3021 | int i; |
3022 | BUG_ON(sh->batch_head); | ||
2723 | for (i = disks; i--; ) { | 3023 | for (i = disks; i--; ) { |
2724 | struct bio *bi; | 3024 | struct bio *bi; |
2725 | int bitmap_end = 0; | 3025 | int bitmap_end = 0; |
@@ -2746,6 +3046,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2746 | /* fail all writes first */ | 3046 | /* fail all writes first */ |
2747 | bi = sh->dev[i].towrite; | 3047 | bi = sh->dev[i].towrite; |
2748 | sh->dev[i].towrite = NULL; | 3048 | sh->dev[i].towrite = NULL; |
3049 | sh->overwrite_disks = 0; | ||
2749 | spin_unlock_irq(&sh->stripe_lock); | 3050 | spin_unlock_irq(&sh->stripe_lock); |
2750 | if (bi) | 3051 | if (bi) |
2751 | bitmap_end = 1; | 3052 | bitmap_end = 1; |
@@ -2834,6 +3135,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2834 | int abort = 0; | 3135 | int abort = 0; |
2835 | int i; | 3136 | int i; |
2836 | 3137 | ||
3138 | BUG_ON(sh->batch_head); | ||
2837 | clear_bit(STRIPE_SYNCING, &sh->state); | 3139 | clear_bit(STRIPE_SYNCING, &sh->state); |
2838 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) | 3140 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) |
2839 | wake_up(&conf->wait_for_overlap); | 3141 | wake_up(&conf->wait_for_overlap); |
@@ -3064,6 +3366,7 @@ static void handle_stripe_fill(struct stripe_head *sh, | |||
3064 | { | 3366 | { |
3065 | int i; | 3367 | int i; |
3066 | 3368 | ||
3369 | BUG_ON(sh->batch_head); | ||
3067 | /* look for blocks to read/compute, skip this if a compute | 3370 | /* look for blocks to read/compute, skip this if a compute |
3068 | * is already in flight, or if the stripe contents are in the | 3371 | * is already in flight, or if the stripe contents are in the |
3069 | * midst of changing due to a write | 3372 | * midst of changing due to a write |
@@ -3087,6 +3390,9 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3087 | int i; | 3390 | int i; |
3088 | struct r5dev *dev; | 3391 | struct r5dev *dev; |
3089 | int discard_pending = 0; | 3392 | int discard_pending = 0; |
3393 | struct stripe_head *head_sh = sh; | ||
3394 | bool do_endio = false; | ||
3395 | int wakeup_nr = 0; | ||
3090 | 3396 | ||
3091 | for (i = disks; i--; ) | 3397 | for (i = disks; i--; ) |
3092 | if (sh->dev[i].written) { | 3398 | if (sh->dev[i].written) { |
@@ -3102,8 +3408,11 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3102 | clear_bit(R5_UPTODATE, &dev->flags); | 3408 | clear_bit(R5_UPTODATE, &dev->flags); |
3103 | if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { | 3409 | if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { |
3104 | WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); | 3410 | WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); |
3105 | dev->page = dev->orig_page; | ||
3106 | } | 3411 | } |
3412 | do_endio = true; | ||
3413 | |||
3414 | returnbi: | ||
3415 | dev->page = dev->orig_page; | ||
3107 | wbi = dev->written; | 3416 | wbi = dev->written; |
3108 | dev->written = NULL; | 3417 | dev->written = NULL; |
3109 | while (wbi && wbi->bi_iter.bi_sector < | 3418 | while (wbi && wbi->bi_iter.bi_sector < |
@@ -3120,6 +3429,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3120 | STRIPE_SECTORS, | 3429 | STRIPE_SECTORS, |
3121 | !test_bit(STRIPE_DEGRADED, &sh->state), | 3430 | !test_bit(STRIPE_DEGRADED, &sh->state), |
3122 | 0); | 3431 | 0); |
3432 | if (head_sh->batch_head) { | ||
3433 | sh = list_first_entry(&sh->batch_list, | ||
3434 | struct stripe_head, | ||
3435 | batch_list); | ||
3436 | if (sh != head_sh) { | ||
3437 | dev = &sh->dev[i]; | ||
3438 | goto returnbi; | ||
3439 | } | ||
3440 | } | ||
3441 | sh = head_sh; | ||
3442 | dev = &sh->dev[i]; | ||
3123 | } else if (test_bit(R5_Discard, &dev->flags)) | 3443 | } else if (test_bit(R5_Discard, &dev->flags)) |
3124 | discard_pending = 1; | 3444 | discard_pending = 1; |
3125 | WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); | 3445 | WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); |
@@ -3141,8 +3461,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3141 | * will be reinitialized | 3461 | * will be reinitialized |
3142 | */ | 3462 | */ |
3143 | spin_lock_irq(&conf->device_lock); | 3463 | spin_lock_irq(&conf->device_lock); |
3464 | unhash: | ||
3144 | remove_hash(sh); | 3465 | remove_hash(sh); |
3466 | if (head_sh->batch_head) { | ||
3467 | sh = list_first_entry(&sh->batch_list, | ||
3468 | struct stripe_head, batch_list); | ||
3469 | if (sh != head_sh) | ||
3470 | goto unhash; | ||
3471 | } | ||
3145 | spin_unlock_irq(&conf->device_lock); | 3472 | spin_unlock_irq(&conf->device_lock); |
3473 | sh = head_sh; | ||
3474 | |||
3146 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) | 3475 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) |
3147 | set_bit(STRIPE_HANDLE, &sh->state); | 3476 | set_bit(STRIPE_HANDLE, &sh->state); |
3148 | 3477 | ||
@@ -3151,6 +3480,45 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3151 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 3480 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
3152 | if (atomic_dec_and_test(&conf->pending_full_writes)) | 3481 | if (atomic_dec_and_test(&conf->pending_full_writes)) |
3153 | md_wakeup_thread(conf->mddev->thread); | 3482 | md_wakeup_thread(conf->mddev->thread); |
3483 | |||
3484 | if (!head_sh->batch_head || !do_endio) | ||
3485 | return; | ||
3486 | for (i = 0; i < head_sh->disks; i++) { | ||
3487 | if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) | ||
3488 | wakeup_nr++; | ||
3489 | } | ||
3490 | while (!list_empty(&head_sh->batch_list)) { | ||
3491 | int i; | ||
3492 | sh = list_first_entry(&head_sh->batch_list, | ||
3493 | struct stripe_head, batch_list); | ||
3494 | list_del_init(&sh->batch_list); | ||
3495 | |||
3496 | set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, | ||
3497 | head_sh->state & ~((1 << STRIPE_ACTIVE) | | ||
3498 | (1 << STRIPE_PREREAD_ACTIVE) | | ||
3499 | STRIPE_EXPAND_SYNC_FLAG)); | ||
3500 | sh->check_state = head_sh->check_state; | ||
3501 | sh->reconstruct_state = head_sh->reconstruct_state; | ||
3502 | for (i = 0; i < sh->disks; i++) { | ||
3503 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
3504 | wakeup_nr++; | ||
3505 | sh->dev[i].flags = head_sh->dev[i].flags; | ||
3506 | } | ||
3507 | |||
3508 | spin_lock_irq(&sh->stripe_lock); | ||
3509 | sh->batch_head = NULL; | ||
3510 | spin_unlock_irq(&sh->stripe_lock); | ||
3511 | if (sh->state & STRIPE_EXPAND_SYNC_FLAG) | ||
3512 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3513 | release_stripe(sh); | ||
3514 | } | ||
3515 | |||
3516 | spin_lock_irq(&head_sh->stripe_lock); | ||
3517 | head_sh->batch_head = NULL; | ||
3518 | spin_unlock_irq(&head_sh->stripe_lock); | ||
3519 | wake_up_nr(&conf->wait_for_overlap, wakeup_nr); | ||
3520 | if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) | ||
3521 | set_bit(STRIPE_HANDLE, &head_sh->state); | ||
3154 | } | 3522 | } |
3155 | 3523 | ||
3156 | static void handle_stripe_dirtying(struct r5conf *conf, | 3524 | static void handle_stripe_dirtying(struct r5conf *conf, |
@@ -3161,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3161 | int rmw = 0, rcw = 0, i; | 3529 | int rmw = 0, rcw = 0, i; |
3162 | sector_t recovery_cp = conf->mddev->recovery_cp; | 3530 | sector_t recovery_cp = conf->mddev->recovery_cp; |
3163 | 3531 | ||
3164 | /* RAID6 requires 'rcw' in current implementation. | 3532 | /* Check whether resync is now happening or should start. |
3165 | * Otherwise, check whether resync is now happening or should start. | ||
3166 | * If yes, then the array is dirty (after unclean shutdown or | 3533 | * If yes, then the array is dirty (after unclean shutdown or |
3167 | * initial creation), so parity in some stripes might be inconsistent. | 3534 | * initial creation), so parity in some stripes might be inconsistent. |
3168 | * In this case, we need to always do reconstruct-write, to ensure | 3535 | * In this case, we need to always do reconstruct-write, to ensure |
3169 | * that in case of drive failure or read-error correction, we | 3536 | * that in case of drive failure or read-error correction, we |
3170 | * generate correct data from the parity. | 3537 | * generate correct data from the parity. |
3171 | */ | 3538 | */ |
3172 | if (conf->max_degraded == 2 || | 3539 | if (conf->rmw_level == PARITY_DISABLE_RMW || |
3173 | (recovery_cp < MaxSector && sh->sector >= recovery_cp && | 3540 | (recovery_cp < MaxSector && sh->sector >= recovery_cp && |
3174 | s->failed == 0)) { | 3541 | s->failed == 0)) { |
3175 | /* Calculate the real rcw later - for now make it | 3542 | /* Calculate the real rcw later - for now make it |
3176 | * look like rcw is cheaper | 3543 | * look like rcw is cheaper |
3177 | */ | 3544 | */ |
3178 | rcw = 1; rmw = 2; | 3545 | rcw = 1; rmw = 2; |
3179 | pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", | 3546 | pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", |
3180 | conf->max_degraded, (unsigned long long)recovery_cp, | 3547 | conf->rmw_level, (unsigned long long)recovery_cp, |
3181 | (unsigned long long)sh->sector); | 3548 | (unsigned long long)sh->sector); |
3182 | } else for (i = disks; i--; ) { | 3549 | } else for (i = disks; i--; ) { |
3183 | /* would I have to read this buffer for read_modify_write */ | 3550 | /* would I have to read this buffer for read_modify_write */ |
3184 | struct r5dev *dev = &sh->dev[i]; | 3551 | struct r5dev *dev = &sh->dev[i]; |
3185 | if ((dev->towrite || i == sh->pd_idx) && | 3552 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && |
3186 | !test_bit(R5_LOCKED, &dev->flags) && | 3553 | !test_bit(R5_LOCKED, &dev->flags) && |
3187 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3554 | !(test_bit(R5_UPTODATE, &dev->flags) || |
3188 | test_bit(R5_Wantcompute, &dev->flags))) { | 3555 | test_bit(R5_Wantcompute, &dev->flags))) { |
@@ -3192,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3192 | rmw += 2*disks; /* cannot read it */ | 3559 | rmw += 2*disks; /* cannot read it */ |
3193 | } | 3560 | } |
3194 | /* Would I have to read this buffer for reconstruct_write */ | 3561 | /* Would I have to read this buffer for reconstruct_write */ |
3195 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | 3562 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
3563 | i != sh->pd_idx && i != sh->qd_idx && | ||
3196 | !test_bit(R5_LOCKED, &dev->flags) && | 3564 | !test_bit(R5_LOCKED, &dev->flags) && |
3197 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3565 | !(test_bit(R5_UPTODATE, &dev->flags) || |
3198 | test_bit(R5_Wantcompute, &dev->flags))) { | 3566 | test_bit(R5_Wantcompute, &dev->flags))) { |
@@ -3205,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3205 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", | 3573 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", |
3206 | (unsigned long long)sh->sector, rmw, rcw); | 3574 | (unsigned long long)sh->sector, rmw, rcw); |
3207 | set_bit(STRIPE_HANDLE, &sh->state); | 3575 | set_bit(STRIPE_HANDLE, &sh->state); |
3208 | if (rmw < rcw && rmw > 0) { | 3576 | if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { |
3209 | /* prefer read-modify-write, but need to get some data */ | 3577 | /* prefer read-modify-write, but need to get some data */ |
3210 | if (conf->mddev->queue) | 3578 | if (conf->mddev->queue) |
3211 | blk_add_trace_msg(conf->mddev->queue, | 3579 | blk_add_trace_msg(conf->mddev->queue, |
@@ -3213,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3213 | (unsigned long long)sh->sector, rmw); | 3581 | (unsigned long long)sh->sector, rmw); |
3214 | for (i = disks; i--; ) { | 3582 | for (i = disks; i--; ) { |
3215 | struct r5dev *dev = &sh->dev[i]; | 3583 | struct r5dev *dev = &sh->dev[i]; |
3216 | if ((dev->towrite || i == sh->pd_idx) && | 3584 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && |
3217 | !test_bit(R5_LOCKED, &dev->flags) && | 3585 | !test_bit(R5_LOCKED, &dev->flags) && |
3218 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3586 | !(test_bit(R5_UPTODATE, &dev->flags) || |
3219 | test_bit(R5_Wantcompute, &dev->flags)) && | 3587 | test_bit(R5_Wantcompute, &dev->flags)) && |
@@ -3232,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3232 | } | 3600 | } |
3233 | } | 3601 | } |
3234 | } | 3602 | } |
3235 | if (rcw <= rmw && rcw > 0) { | 3603 | if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { |
3236 | /* want reconstruct write, but need to get some data */ | 3604 | /* want reconstruct write, but need to get some data */ |
3237 | int qread =0; | 3605 | int qread =0; |
3238 | rcw = 0; | 3606 | rcw = 0; |
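With both branches rewritten, the choice between read-modify-write and reconstruct-write now hinges on the per-stripe read counts and conf->rmw_level rather than on max_degraded. A compact restatement of the condition as it reads after this hunk (illustrative only; PARITY_DISABLE_RMW never reaches this comparison because rcw is forced cheaper earlier in the function, and the rcw branch additionally requires rcw > 0):

enum { PARITY_DISABLE_RMW = 0, PARITY_ENABLE_RMW, PARITY_PREFER_RMW };

/* Returns 1 when the read-modify-write path is taken. */
static int choose_rmw(int rmw, int rcw, int rmw_level)
{
	if (rmw == 0)
		return 0;
	if (rmw < rcw)
		return 1;
	/* on a tie, only PARITY_ENABLE_RMW breaks toward rmw;
	 * PARITY_PREFER_RMW still falls through to reconstruct-write */
	return rmw == rcw && rmw_level == PARITY_ENABLE_RMW;
}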
@@ -3290,6 +3658,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | |||
3290 | { | 3658 | { |
3291 | struct r5dev *dev = NULL; | 3659 | struct r5dev *dev = NULL; |
3292 | 3660 | ||
3661 | BUG_ON(sh->batch_head); | ||
3293 | set_bit(STRIPE_HANDLE, &sh->state); | 3662 | set_bit(STRIPE_HANDLE, &sh->state); |
3294 | 3663 | ||
3295 | switch (sh->check_state) { | 3664 | switch (sh->check_state) { |
@@ -3380,6 +3749,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, | |||
3380 | int qd_idx = sh->qd_idx; | 3749 | int qd_idx = sh->qd_idx; |
3381 | struct r5dev *dev; | 3750 | struct r5dev *dev; |
3382 | 3751 | ||
3752 | BUG_ON(sh->batch_head); | ||
3383 | set_bit(STRIPE_HANDLE, &sh->state); | 3753 | set_bit(STRIPE_HANDLE, &sh->state); |
3384 | 3754 | ||
3385 | BUG_ON(s->failed > 2); | 3755 | BUG_ON(s->failed > 2); |
@@ -3543,6 +3913,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) | |||
3543 | * copy some of them into a target stripe for expand. | 3913 | * copy some of them into a target stripe for expand. |
3544 | */ | 3914 | */ |
3545 | struct dma_async_tx_descriptor *tx = NULL; | 3915 | struct dma_async_tx_descriptor *tx = NULL; |
3916 | BUG_ON(sh->batch_head); | ||
3546 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3917 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
3547 | for (i = 0; i < sh->disks; i++) | 3918 | for (i = 0; i < sh->disks; i++) |
3548 | if (i != sh->pd_idx && i != sh->qd_idx) { | 3919 | if (i != sh->pd_idx && i != sh->qd_idx) { |
@@ -3615,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3615 | 3986 | ||
3616 | memset(s, 0, sizeof(*s)); | 3987 | memset(s, 0, sizeof(*s)); |
3617 | 3988 | ||
3618 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3989 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; |
3619 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 3990 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; |
3620 | s->failed_num[0] = -1; | 3991 | s->failed_num[0] = -1; |
3621 | s->failed_num[1] = -1; | 3992 | s->failed_num[1] = -1; |
3622 | 3993 | ||
@@ -3786,6 +4157,80 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3786 | rcu_read_unlock(); | 4157 | rcu_read_unlock(); |
3787 | } | 4158 | } |
3788 | 4159 | ||
4160 | static int clear_batch_ready(struct stripe_head *sh) | ||
4161 | { | ||
4162 | struct stripe_head *tmp; | ||
4163 | if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) | ||
4164 | return 0; | ||
4165 | spin_lock(&sh->stripe_lock); | ||
4166 | if (!sh->batch_head) { | ||
4167 | spin_unlock(&sh->stripe_lock); | ||
4168 | return 0; | ||
4169 | } | ||
4170 | |||
4171 | /* | ||
4172 | * this stripe could be added to a batch list before we check | ||
4173 | * BATCH_READY, skip it | ||
4174 | */ | ||
4175 | if (sh->batch_head != sh) { | ||
4176 | spin_unlock(&sh->stripe_lock); | ||
4177 | return 1; | ||
4178 | } | ||
4179 | spin_lock(&sh->batch_lock); | ||
4180 | list_for_each_entry(tmp, &sh->batch_list, batch_list) | ||
4181 | clear_bit(STRIPE_BATCH_READY, &tmp->state); | ||
4182 | spin_unlock(&sh->batch_lock); | ||
4183 | spin_unlock(&sh->stripe_lock); | ||
4184 | |||
4185 | /* | ||
4186 | * BATCH_READY is cleared, no new stripes can be added. | ||
4187 | * batch_list can be accessed without lock | ||
4188 | */ | ||
4189 | return 0; | ||
4190 | } | ||
4191 | |||
4192 | static void check_break_stripe_batch_list(struct stripe_head *sh) | ||
4193 | { | ||
4194 | struct stripe_head *head_sh, *next; | ||
4195 | int i; | ||
4196 | |||
4197 | if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) | ||
4198 | return; | ||
4199 | |||
4200 | head_sh = sh; | ||
4201 | do { | ||
4202 | sh = list_first_entry(&sh->batch_list, | ||
4203 | struct stripe_head, batch_list); | ||
4204 | BUG_ON(sh == head_sh); | ||
4205 | } while (!test_bit(STRIPE_DEGRADED, &sh->state)); | ||
4206 | |||
4207 | while (sh != head_sh) { | ||
4208 | next = list_first_entry(&sh->batch_list, | ||
4209 | struct stripe_head, batch_list); | ||
4210 | list_del_init(&sh->batch_list); | ||
4211 | |||
4212 | set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, | ||
4213 | head_sh->state & ~((1 << STRIPE_ACTIVE) | | ||
4214 | (1 << STRIPE_PREREAD_ACTIVE) | | ||
4215 | (1 << STRIPE_DEGRADED) | | ||
4216 | STRIPE_EXPAND_SYNC_FLAG)); | ||
4217 | sh->check_state = head_sh->check_state; | ||
4218 | sh->reconstruct_state = head_sh->reconstruct_state; | ||
4219 | for (i = 0; i < sh->disks; i++) | ||
4220 | sh->dev[i].flags = head_sh->dev[i].flags & | ||
4221 | (~((1 << R5_WriteError) | (1 << R5_Overlap))); | ||
4222 | |||
4223 | spin_lock_irq(&sh->stripe_lock); | ||
4224 | sh->batch_head = NULL; | ||
4225 | spin_unlock_irq(&sh->stripe_lock); | ||
4226 | |||
4227 | set_bit(STRIPE_HANDLE, &sh->state); | ||
4228 | release_stripe(sh); | ||
4229 | |||
4230 | sh = next; | ||
4231 | } | ||
4232 | } | ||
4233 | |||
3789 | static void handle_stripe(struct stripe_head *sh) | 4234 | static void handle_stripe(struct stripe_head *sh) |
3790 | { | 4235 | { |
3791 | struct stripe_head_state s; | 4236 | struct stripe_head_state s; |
@@ -3803,7 +4248,14 @@ static void handle_stripe(struct stripe_head *sh) | |||
3803 | return; | 4248 | return; |
3804 | } | 4249 | } |
3805 | 4250 | ||
3806 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 4251 | if (clear_batch_ready(sh) ) { |
4252 | clear_bit_unlock(STRIPE_ACTIVE, &sh->state); | ||
4253 | return; | ||
4254 | } | ||
4255 | |||
4256 | check_break_stripe_batch_list(sh); | ||
4257 | |||
4258 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { | ||
3807 | spin_lock(&sh->stripe_lock); | 4259 | spin_lock(&sh->stripe_lock); |
3808 | /* Cannot process 'sync' concurrently with 'discard' */ | 4260 | /* Cannot process 'sync' concurrently with 'discard' */ |
3809 | if (!test_bit(STRIPE_DISCARD, &sh->state) && | 4261 | if (!test_bit(STRIPE_DISCARD, &sh->state) && |
@@ -4158,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits) | |||
4158 | * how busy the stripe_cache is | 4610 | * how busy the stripe_cache is |
4159 | */ | 4611 | */ |
4160 | 4612 | ||
4161 | if (conf->inactive_blocked) | 4613 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) |
4162 | return 1; | 4614 | return 1; |
4163 | if (conf->quiesce) | 4615 | if (conf->quiesce) |
4164 | return 1; | 4616 | return 1; |
@@ -4180,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev, | |||
4180 | unsigned int chunk_sectors = mddev->chunk_sectors; | 4632 | unsigned int chunk_sectors = mddev->chunk_sectors; |
4181 | unsigned int bio_sectors = bvm->bi_size >> 9; | 4633 | unsigned int bio_sectors = bvm->bi_size >> 9; |
4182 | 4634 | ||
4183 | if ((bvm->bi_rw & 1) == WRITE) | 4635 | /* |
4184 | return biovec->bv_len; /* always allow writes to be mergeable */ | 4636 | * always allow writes to be mergeable, read as well if array |
4637 | * is degraded as we'll go through stripe cache anyway. | ||
4638 | */ | ||
4639 | if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) | ||
4640 | return biovec->bv_len; | ||
4185 | 4641 | ||
4186 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) | 4642 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
4187 | chunk_sectors = mddev->new_chunk_sectors; | 4643 | chunk_sectors = mddev->new_chunk_sectors; |
@@ -4603,12 +5059,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) | |||
4603 | } | 5059 | } |
4604 | set_bit(STRIPE_DISCARD, &sh->state); | 5060 | set_bit(STRIPE_DISCARD, &sh->state); |
4605 | finish_wait(&conf->wait_for_overlap, &w); | 5061 | finish_wait(&conf->wait_for_overlap, &w); |
5062 | sh->overwrite_disks = 0; | ||
4606 | for (d = 0; d < conf->raid_disks; d++) { | 5063 | for (d = 0; d < conf->raid_disks; d++) { |
4607 | if (d == sh->pd_idx || d == sh->qd_idx) | 5064 | if (d == sh->pd_idx || d == sh->qd_idx) |
4608 | continue; | 5065 | continue; |
4609 | sh->dev[d].towrite = bi; | 5066 | sh->dev[d].towrite = bi; |
4610 | set_bit(R5_OVERWRITE, &sh->dev[d].flags); | 5067 | set_bit(R5_OVERWRITE, &sh->dev[d].flags); |
4611 | raid5_inc_bi_active_stripes(bi); | 5068 | raid5_inc_bi_active_stripes(bi); |
5069 | sh->overwrite_disks++; | ||
4612 | } | 5070 | } |
4613 | spin_unlock_irq(&sh->stripe_lock); | 5071 | spin_unlock_irq(&sh->stripe_lock); |
4614 | if (conf->mddev->bitmap) { | 5072 | if (conf->mddev->bitmap) { |
@@ -4656,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4656 | 5114 | ||
4657 | md_write_start(mddev, bi); | 5115 | md_write_start(mddev, bi); |
4658 | 5116 | ||
4659 | if (rw == READ && | 5117 | /* |
5118 | * If array is degraded, better not do chunk aligned read because | ||
5119 | * later we might have to read it again in order to reconstruct | ||
5120 | * data on failed drives. | ||
5121 | */ | ||
5122 | if (rw == READ && mddev->degraded == 0 && | ||
4660 | mddev->reshape_position == MaxSector && | 5123 | mddev->reshape_position == MaxSector && |
4661 | chunk_aligned_read(mddev,bi)) | 5124 | chunk_aligned_read(mddev,bi)) |
4662 | return; | 5125 | return; |
@@ -4772,7 +5235,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4772 | } | 5235 | } |
4773 | 5236 | ||
4774 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 5237 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
4775 | !add_stripe_bio(sh, bi, dd_idx, rw)) { | 5238 | !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { |
4776 | /* Stripe is busy expanding or | 5239 | /* Stripe is busy expanding or |
4777 | * add failed due to overlap. Flush everything | 5240 | * add failed due to overlap. Flush everything |
4778 | * and wait a while | 5241 | * and wait a while |
@@ -4785,7 +5248,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4785 | } | 5248 | } |
4786 | set_bit(STRIPE_HANDLE, &sh->state); | 5249 | set_bit(STRIPE_HANDLE, &sh->state); |
4787 | clear_bit(STRIPE_DELAYED, &sh->state); | 5250 | clear_bit(STRIPE_DELAYED, &sh->state); |
4788 | if ((bi->bi_rw & REQ_SYNC) && | 5251 | if ((!sh->batch_head || sh == sh->batch_head) && |
5252 | (bi->bi_rw & REQ_SYNC) && | ||
4789 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 5253 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
4790 | atomic_inc(&conf->preread_active_stripes); | 5254 | atomic_inc(&conf->preread_active_stripes); |
4791 | release_stripe_plug(mddev, sh); | 5255 | release_stripe_plug(mddev, sh); |
@@ -5050,8 +5514,7 @@ ret: | |||
5050 | return reshape_sectors; | 5514 | return reshape_sectors; |
5051 | } | 5515 | } |
5052 | 5516 | ||
5053 | /* FIXME go_faster isn't used */ | 5517 | static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) |
5054 | static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) | ||
5055 | { | 5518 | { |
5056 | struct r5conf *conf = mddev->private; | 5519 | struct r5conf *conf = mddev->private; |
5057 | struct stripe_head *sh; | 5520 | struct stripe_head *sh; |
@@ -5186,7 +5649,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
5186 | return handled; | 5649 | return handled; |
5187 | } | 5650 | } |
5188 | 5651 | ||
5189 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 5652 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { |
5190 | release_stripe(sh); | 5653 | release_stripe(sh); |
5191 | raid5_set_bi_processed_stripes(raid_bio, scnt); | 5654 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
5192 | conf->retry_read_aligned = raid_bio; | 5655 | conf->retry_read_aligned = raid_bio; |
@@ -5312,6 +5775,8 @@ static void raid5d(struct md_thread *thread) | |||
5312 | int batch_size, released; | 5775 | int batch_size, released; |
5313 | 5776 | ||
5314 | released = release_stripe_list(conf, conf->temp_inactive_list); | 5777 | released = release_stripe_list(conf, conf->temp_inactive_list); |
5778 | if (released) | ||
5779 | clear_bit(R5_DID_ALLOC, &conf->cache_state); | ||
5315 | 5780 | ||
5316 | if ( | 5781 | if ( |
5317 | !list_empty(&conf->bitmap_list)) { | 5782 | !list_empty(&conf->bitmap_list)) { |
@@ -5350,6 +5815,13 @@ static void raid5d(struct md_thread *thread) | |||
5350 | pr_debug("%d stripes handled\n", handled); | 5815 | pr_debug("%d stripes handled\n", handled); |
5351 | 5816 | ||
5352 | spin_unlock_irq(&conf->device_lock); | 5817 | spin_unlock_irq(&conf->device_lock); |
5818 | if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { | ||
5819 | grow_one_stripe(conf, __GFP_NOWARN); | ||
5820 | /* Set flag even if allocation failed. This helps | ||
5821 | * slow down allocation requests when mem is short | ||
5822 | */ | ||
5823 | set_bit(R5_DID_ALLOC, &conf->cache_state); | ||
5824 | } | ||
5353 | 5825 | ||
5354 | async_tx_issue_pending_all(); | 5826 | async_tx_issue_pending_all(); |
5355 | blk_finish_plug(&plug); | 5827 | blk_finish_plug(&plug); |
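The tail of raid5d() now grows at most one stripe per wakeup when R5_ALLOC_MORE is set, and release_stripe_list() clearing R5_DID_ALLOC is what re-arms the next allocation. A toy user-space model of that handshake (how R5_ALLOC_MORE gets set is not visible in this hunk, so it is only assumed to happen on the allocation-starved path):

#include <stdbool.h>
#include <stdio.h>

struct toy_cache_state {
	bool alloc_more;	/* R5_ALLOC_MORE */
	bool did_alloc;		/* R5_DID_ALLOC  */
	int nr_stripes;
};

/* mirrors the release path: any released stripe re-arms allocation */
static void toy_release(struct toy_cache_state *s, int released)
{
	if (released)
		s->did_alloc = false;
}

/* mirrors the end of raid5d(): grow one stripe, then throttle */
static void toy_raid5d_tail(struct toy_cache_state *s)
{
	if (s->alloc_more) {
		s->alloc_more = false;
		s->nr_stripes++;	/* grow_one_stripe(conf, __GFP_NOWARN) */
		s->did_alloc = true;	/* set even if the allocation failed */
	}
}

int main(void)
{
	struct toy_cache_state s = { .alloc_more = true, .nr_stripes = 256 };

	toy_raid5d_tail(&s);	/* grows to 257, did_alloc set */
	toy_raid5d_tail(&s);	/* no growth: alloc_more was consumed */
	toy_release(&s, 1);	/* did_alloc cleared, growth can resume */
	printf("stripes: %d, did_alloc: %d\n", s.nr_stripes, s.did_alloc);
	return 0;
}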
@@ -5365,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) | |||
5365 | spin_lock(&mddev->lock); | 5837 | spin_lock(&mddev->lock); |
5366 | conf = mddev->private; | 5838 | conf = mddev->private; |
5367 | if (conf) | 5839 | if (conf) |
5368 | ret = sprintf(page, "%d\n", conf->max_nr_stripes); | 5840 | ret = sprintf(page, "%d\n", conf->min_nr_stripes); |
5369 | spin_unlock(&mddev->lock); | 5841 | spin_unlock(&mddev->lock); |
5370 | return ret; | 5842 | return ret; |
5371 | } | 5843 | } |
@@ -5375,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size) | |||
5375 | { | 5847 | { |
5376 | struct r5conf *conf = mddev->private; | 5848 | struct r5conf *conf = mddev->private; |
5377 | int err; | 5849 | int err; |
5378 | int hash; | ||
5379 | 5850 | ||
5380 | if (size <= 16 || size > 32768) | 5851 | if (size <= 16 || size > 32768) |
5381 | return -EINVAL; | 5852 | return -EINVAL; |
5382 | hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | 5853 | |
5383 | while (size < conf->max_nr_stripes) { | 5854 | conf->min_nr_stripes = size; |
5384 | if (drop_one_stripe(conf, hash)) | 5855 | while (size < conf->max_nr_stripes && |
5385 | conf->max_nr_stripes--; | 5856 | drop_one_stripe(conf)) |
5386 | else | 5857 | ; |
5387 | break; | 5858 | |
5388 | hash--; | 5859 | |
5389 | if (hash < 0) | ||
5390 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
5391 | } | ||
5392 | err = md_allow_write(mddev); | 5860 | err = md_allow_write(mddev); |
5393 | if (err) | 5861 | if (err) |
5394 | return err; | 5862 | return err; |
5395 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | 5863 | |
5396 | while (size > conf->max_nr_stripes) { | 5864 | while (size > conf->max_nr_stripes) |
5397 | if (grow_one_stripe(conf, hash)) | 5865 | if (!grow_one_stripe(conf, GFP_KERNEL)) |
5398 | conf->max_nr_stripes++; | 5866 | break; |
5399 | else break; | 5867 | |
5400 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
5401 | } | ||
5402 | return 0; | 5868 | return 0; |
5403 | } | 5869 | } |
5404 | EXPORT_SYMBOL(raid5_set_cache_size); | 5870 | EXPORT_SYMBOL(raid5_set_cache_size); |
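raid5_set_cache_size() now records the request in conf->min_nr_stripes and only trims the cache down toward that floor; growing simply calls grow_one_stripe() until the target is met. For reference, a hypothetical user-space way of driving this setter through sysfs (the md device name is an example):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/stripe_cache_size", "w");

	if (!f) {
		perror("stripe_cache_size");
		return 1;
	}
	/* becomes conf->min_nr_stripes; the cache can still grow beyond
	 * this on demand, and the shrinker trims it back toward this
	 * floor under memory pressure */
	fprintf(f, "%d\n", 4096);
	return fclose(f) ? 1 : 0;
}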
@@ -5433,6 +5899,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
5433 | raid5_store_stripe_cache_size); | 5899 | raid5_store_stripe_cache_size); |
5434 | 5900 | ||
5435 | static ssize_t | 5901 | static ssize_t |
5902 | raid5_show_rmw_level(struct mddev *mddev, char *page) | ||
5903 | { | ||
5904 | struct r5conf *conf = mddev->private; | ||
5905 | if (conf) | ||
5906 | return sprintf(page, "%d\n", conf->rmw_level); | ||
5907 | else | ||
5908 | return 0; | ||
5909 | } | ||
5910 | |||
5911 | static ssize_t | ||
5912 | raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) | ||
5913 | { | ||
5914 | struct r5conf *conf = mddev->private; | ||
5915 | unsigned long new; | ||
5916 | |||
5917 | if (!conf) | ||
5918 | return -ENODEV; | ||
5919 | |||
5920 | if (len >= PAGE_SIZE) | ||
5921 | return -EINVAL; | ||
5922 | |||
5923 | if (kstrtoul(page, 10, &new)) | ||
5924 | return -EINVAL; | ||
5925 | |||
5926 | if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) | ||
5927 | return -EINVAL; | ||
5928 | |||
5929 | if (new != PARITY_DISABLE_RMW && | ||
5930 | new != PARITY_ENABLE_RMW && | ||
5931 | new != PARITY_PREFER_RMW) | ||
5932 | return -EINVAL; | ||
5933 | |||
5934 | conf->rmw_level = new; | ||
5935 | return len; | ||
5936 | } | ||
5937 | |||
5938 | static struct md_sysfs_entry | ||
5939 | raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, | ||
5940 | raid5_show_rmw_level, | ||
5941 | raid5_store_rmw_level); | ||
5942 | |||
5943 | |||
5944 | static ssize_t | ||
5436 | raid5_show_preread_threshold(struct mddev *mddev, char *page) | 5945 | raid5_show_preread_threshold(struct mddev *mddev, char *page) |
5437 | { | 5946 | { |
5438 | struct r5conf *conf; | 5947 | struct r5conf *conf; |
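The store handler above only accepts an rmw-enabling value when raid6_call.xor_syndrome is available, i.e. when the active raid6 algorithm can fold a partial P/Q update for a window of data disks into existing parity. A byte-wise reference sketch of the semantics such a hook is assumed to provide (GF(2^8) generator 0x1d as used by lib/raid6; deliberately unoptimized and not the kernel implementation):

#include <stddef.h>
#include <stdint.h>

static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* ptrs[0..disks-3] are data blocks, ptrs[disks-2] is P, ptrs[disks-1] is Q.
 * XOR the contribution of data disks [start, stop] into the existing P/Q. */
static void xor_syndrome_ref(int disks, int start, int stop,
			     size_t bytes, void **ptrs)
{
	uint8_t **d = (uint8_t **)ptrs;
	uint8_t *p = d[disks - 2], *q = d[disks - 1];
	size_t i;
	int z;

	for (i = 0; i < bytes; i++) {
		uint8_t wp = 0, wq = 0;

		/* accumulate the window, highest data disk first */
		for (z = stop; z >= start; z--) {
			wp ^= d[z][i];
			wq = gf_mul2(wq) ^ d[z][i];
		}
		/* shift the Q contribution down to disk index 0 */
		for (z = start; z > 0; z--)
			wq = gf_mul2(wq);
		p[i] ^= wp;
		q[i] ^= wq;
	}
}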
@@ -5463,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) | |||
5463 | conf = mddev->private; | 5972 | conf = mddev->private; |
5464 | if (!conf) | 5973 | if (!conf) |
5465 | err = -ENODEV; | 5974 | err = -ENODEV; |
5466 | else if (new > conf->max_nr_stripes) | 5975 | else if (new > conf->min_nr_stripes) |
5467 | err = -EINVAL; | 5976 | err = -EINVAL; |
5468 | else | 5977 | else |
5469 | conf->bypass_threshold = new; | 5978 | conf->bypass_threshold = new; |
@@ -5618,6 +6127,7 @@ static struct attribute *raid5_attrs[] = { | |||
5618 | &raid5_preread_bypass_threshold.attr, | 6127 | &raid5_preread_bypass_threshold.attr, |
5619 | &raid5_group_thread_cnt.attr, | 6128 | &raid5_group_thread_cnt.attr, |
5620 | &raid5_skip_copy.attr, | 6129 | &raid5_skip_copy.attr, |
6130 | &raid5_rmw_level.attr, | ||
5621 | NULL, | 6131 | NULL, |
5622 | }; | 6132 | }; |
5623 | static struct attribute_group raid5_attrs_group = { | 6133 | static struct attribute_group raid5_attrs_group = { |
@@ -5699,7 +6209,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
5699 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | 6209 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) |
5700 | { | 6210 | { |
5701 | safe_put_page(percpu->spare_page); | 6211 | safe_put_page(percpu->spare_page); |
5702 | kfree(percpu->scribble); | 6212 | if (percpu->scribble) |
6213 | flex_array_free(percpu->scribble); | ||
5703 | percpu->spare_page = NULL; | 6214 | percpu->spare_page = NULL; |
5704 | percpu->scribble = NULL; | 6215 | percpu->scribble = NULL; |
5705 | } | 6216 | } |
@@ -5709,7 +6220,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu | |||
5709 | if (conf->level == 6 && !percpu->spare_page) | 6220 | if (conf->level == 6 && !percpu->spare_page) |
5710 | percpu->spare_page = alloc_page(GFP_KERNEL); | 6221 | percpu->spare_page = alloc_page(GFP_KERNEL); |
5711 | if (!percpu->scribble) | 6222 | if (!percpu->scribble) |
5712 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | 6223 | percpu->scribble = scribble_alloc(max(conf->raid_disks, |
6224 | conf->previous_raid_disks), conf->chunk_sectors / | ||
6225 | STRIPE_SECTORS, GFP_KERNEL); | ||
5713 | 6226 | ||
5714 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { | 6227 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { |
5715 | free_scratch_buffer(conf, percpu); | 6228 | free_scratch_buffer(conf, percpu); |
@@ -5740,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf) | |||
5740 | 6253 | ||
5741 | static void free_conf(struct r5conf *conf) | 6254 | static void free_conf(struct r5conf *conf) |
5742 | { | 6255 | { |
6256 | if (conf->shrinker.seeks) | ||
6257 | unregister_shrinker(&conf->shrinker); | ||
5743 | free_thread_groups(conf); | 6258 | free_thread_groups(conf); |
5744 | shrink_stripes(conf); | 6259 | shrink_stripes(conf); |
5745 | raid5_free_percpu(conf); | 6260 | raid5_free_percpu(conf); |
@@ -5807,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf) | |||
5807 | return err; | 6322 | return err; |
5808 | } | 6323 | } |
5809 | 6324 | ||
6325 | static unsigned long raid5_cache_scan(struct shrinker *shrink, | ||
6326 | struct shrink_control *sc) | ||
6327 | { | ||
6328 | struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); | ||
6329 | int ret = 0; | ||
6330 | while (ret < sc->nr_to_scan) { | ||
6331 | if (drop_one_stripe(conf) == 0) | ||
6332 | return SHRINK_STOP; | ||
6333 | ret++; | ||
6334 | } | ||
6335 | return ret; | ||
6336 | } | ||
6337 | |||
6338 | static unsigned long raid5_cache_count(struct shrinker *shrink, | ||
6339 | struct shrink_control *sc) | ||
6340 | { | ||
6341 | struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); | ||
6342 | |||
6343 | if (conf->max_nr_stripes < conf->min_nr_stripes) | ||
6344 | /* unlikely, but not impossible */ | ||
6345 | return 0; | ||
6346 | return conf->max_nr_stripes - conf->min_nr_stripes; | ||
6347 | } | ||
6348 | |||
5810 | static struct r5conf *setup_conf(struct mddev *mddev) | 6349 | static struct r5conf *setup_conf(struct mddev *mddev) |
5811 | { | 6350 | { |
5812 | struct r5conf *conf; | 6351 | struct r5conf *conf; |
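raid5_cache_count() reports only the stripes allocated above conf->min_nr_stripes as reclaimable, and raid5_cache_scan() frees them one at a time, bailing out with SHRINK_STOP as soon as drop_one_stripe() finds nothing to drop. A toy user-space model of that count/scan contract (greatly simplified; the real drop path depends on the inactive lists):

#include <stdio.h>

#define SHRINK_STOP (~0UL)

struct toy_cache {
	unsigned long min_nr;	/* conf->min_nr_stripes */
	unsigned long max_nr;	/* conf->max_nr_stripes */
};

static unsigned long toy_count(const struct toy_cache *c)
{
	return c->max_nr < c->min_nr ? 0 : c->max_nr - c->min_nr;
}

static unsigned long toy_scan(struct toy_cache *c, unsigned long nr_to_scan)
{
	unsigned long freed = 0;

	while (freed < nr_to_scan) {
		if (c->max_nr == 0)	/* drop_one_stripe() returned 0 */
			return SHRINK_STOP;
		c->max_nr--;
		freed++;
	}
	return freed;
}

int main(void)
{
	struct toy_cache c = { .min_nr = 256, .max_nr = 300 };

	printf("reclaimable: %lu\n", toy_count(&c));
	printf("freed: %lu, stripes left: %lu\n", toy_scan(&c, 16), c.max_nr);
	return 0;
}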
@@ -5879,7 +6418,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5879 | else | 6418 | else |
5880 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; | 6419 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; |
5881 | max_disks = max(conf->raid_disks, conf->previous_raid_disks); | 6420 | max_disks = max(conf->raid_disks, conf->previous_raid_disks); |
5882 | conf->scribble_len = scribble_len(max_disks); | ||
5883 | 6421 | ||
5884 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), | 6422 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), |
5885 | GFP_KERNEL); | 6423 | GFP_KERNEL); |
@@ -5907,6 +6445,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5907 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | 6445 | INIT_LIST_HEAD(conf->temp_inactive_list + i); |
5908 | 6446 | ||
5909 | conf->level = mddev->new_level; | 6447 | conf->level = mddev->new_level; |
6448 | conf->chunk_sectors = mddev->new_chunk_sectors; | ||
5910 | if (raid5_alloc_percpu(conf) != 0) | 6449 | if (raid5_alloc_percpu(conf) != 0) |
5911 | goto abort; | 6450 | goto abort; |
5912 | 6451 | ||
@@ -5939,12 +6478,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5939 | conf->fullsync = 1; | 6478 | conf->fullsync = 1; |
5940 | } | 6479 | } |
5941 | 6480 | ||
5942 | conf->chunk_sectors = mddev->new_chunk_sectors; | ||
5943 | conf->level = mddev->new_level; | 6481 | conf->level = mddev->new_level; |
5944 | if (conf->level == 6) | 6482 | if (conf->level == 6) { |
5945 | conf->max_degraded = 2; | 6483 | conf->max_degraded = 2; |
5946 | else | 6484 | if (raid6_call.xor_syndrome) |
6485 | conf->rmw_level = PARITY_ENABLE_RMW; | ||
6486 | else | ||
6487 | conf->rmw_level = PARITY_DISABLE_RMW; | ||
6488 | } else { | ||
5947 | conf->max_degraded = 1; | 6489 | conf->max_degraded = 1; |
6490 | conf->rmw_level = PARITY_ENABLE_RMW; | ||
6491 | } | ||
5948 | conf->algorithm = mddev->new_layout; | 6492 | conf->algorithm = mddev->new_layout; |
5949 | conf->reshape_progress = mddev->reshape_position; | 6493 | conf->reshape_progress = mddev->reshape_position; |
5950 | if (conf->reshape_progress != MaxSector) { | 6494 | if (conf->reshape_progress != MaxSector) { |
@@ -5952,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5952 | conf->prev_algo = mddev->layout; | 6496 | conf->prev_algo = mddev->layout; |
5953 | } | 6497 | } |
5954 | 6498 | ||
5955 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 6499 | conf->min_nr_stripes = NR_STRIPES; |
6500 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + | ||
5956 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 6501 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
5957 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | 6502 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
5958 | if (grow_stripes(conf, NR_STRIPES)) { | 6503 | if (grow_stripes(conf, conf->min_nr_stripes)) { |
5959 | printk(KERN_ERR | 6504 | printk(KERN_ERR |
5960 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 6505 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
5961 | mdname(mddev), memory); | 6506 | mdname(mddev), memory); |
@@ -5963,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5963 | } else | 6508 | } else |
5964 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", | 6509 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", |
5965 | mdname(mddev), memory); | 6510 | mdname(mddev), memory); |
6511 | /* | ||
6512 | * Losing a stripe head costs more than the time to refill it, | ||
6513 | * it reduces the queue depth and so can hurt throughput. | ||
6514 | * So set it rather large, scaled by number of devices. | ||
6515 | */ | ||
6516 | conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; | ||
6517 | conf->shrinker.scan_objects = raid5_cache_scan; | ||
6518 | conf->shrinker.count_objects = raid5_cache_count; | ||
6519 | conf->shrinker.batch = 128; | ||
6520 | conf->shrinker.flags = 0; | ||
6521 | register_shrinker(&conf->shrinker); | ||
5966 | 6522 | ||
5967 | sprintf(pers_name, "raid%d", mddev->new_level); | 6523 | sprintf(pers_name, "raid%d", mddev->new_level); |
5968 | conf->thread = md_register_thread(raid5d, mddev, pers_name); | 6524 | conf->thread = md_register_thread(raid5d, mddev, pers_name); |
@@ -6604,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev) | |||
6604 | */ | 7160 | */ |
6605 | struct r5conf *conf = mddev->private; | 7161 | struct r5conf *conf = mddev->private; |
6606 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7162 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 |
6607 | > conf->max_nr_stripes || | 7163 | > conf->min_nr_stripes || |
6608 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7164 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 |
6609 | > conf->max_nr_stripes) { | 7165 | > conf->min_nr_stripes) { |
6610 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", | 7166 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", |
6611 | mdname(mddev), | 7167 | mdname(mddev), |
6612 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | 7168 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 983e18a83db1..7dc0dd86074b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -210,11 +210,19 @@ struct stripe_head { | |||
210 | atomic_t count; /* nr of active thread/requests */ | 210 | atomic_t count; /* nr of active thread/requests */ |
211 | int bm_seq; /* sequence number for bitmap flushes */ | 211 | int bm_seq; /* sequence number for bitmap flushes */ |
212 | int disks; /* disks in stripe */ | 212 | int disks; /* disks in stripe */ |
213 | int overwrite_disks; /* total overwrite disks in stripe, | ||
214 | * this is only checked when stripe | ||
215 | * has STRIPE_BATCH_READY | ||
216 | */ | ||
213 | enum check_states check_state; | 217 | enum check_states check_state; |
214 | enum reconstruct_states reconstruct_state; | 218 | enum reconstruct_states reconstruct_state; |
215 | spinlock_t stripe_lock; | 219 | spinlock_t stripe_lock; |
216 | int cpu; | 220 | int cpu; |
217 | struct r5worker_group *group; | 221 | struct r5worker_group *group; |
222 | |||
223 | struct stripe_head *batch_head; /* protected by stripe lock */ | ||
224 | spinlock_t batch_lock; /* only header's lock is useful */ | ||
225 | struct list_head batch_list; /* protected by head's batch lock*/ | ||
218 | /** | 226 | /** |
219 | * struct stripe_operations | 227 | * struct stripe_operations |
220 | * @target - STRIPE_OP_COMPUTE_BLK target | 228 | * @target - STRIPE_OP_COMPUTE_BLK target |
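The three new fields describe one relationship: every stripe in a batch points at the first stripe through batch_head (the head points at itself), while only the head's batch_list actually threads through the members, guarded by the head's batch_lock. A self-contained user-space sketch of that shape (a simplified singly linked list stands in for struct list_head):

#include <stdio.h>

struct toy_stripe {
	unsigned long long sector;
	struct toy_stripe *batch_head;	/* NULL while not batched */
	struct toy_stripe *next;	/* stand-in for batch_list */
};

static void toy_add_to_batch(struct toy_stripe *head, struct toy_stripe *sh)
{
	if (!head->batch_head)
		head->batch_head = head;	/* head points at itself */
	sh->batch_head = head;
	sh->next = head->next;			/* link after the head */
	head->next = sh;
}

int main(void)
{
	struct toy_stripe head = { .sector = 0 };
	struct toy_stripe a = { .sector = 8 }, b = { .sector = 16 };
	const struct toy_stripe *sh;

	toy_add_to_batch(&head, &a);
	toy_add_to_batch(&head, &b);
	for (sh = &head; sh; sh = sh->next)
		printf("stripe %llu -> head %llu\n", sh->sector,
		       sh->batch_head->sector);
	return 0;
}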
@@ -327,8 +335,15 @@ enum { | |||
327 | STRIPE_ON_UNPLUG_LIST, | 335 | STRIPE_ON_UNPLUG_LIST, |
328 | STRIPE_DISCARD, | 336 | STRIPE_DISCARD, |
329 | STRIPE_ON_RELEASE_LIST, | 337 | STRIPE_ON_RELEASE_LIST, |
338 | STRIPE_BATCH_READY, | ||
339 | STRIPE_BATCH_ERR, | ||
330 | }; | 340 | }; |
331 | 341 | ||
342 | #define STRIPE_EXPAND_SYNC_FLAG \ | ||
343 | ((1 << STRIPE_EXPAND_SOURCE) |\ | ||
344 | (1 << STRIPE_EXPAND_READY) |\ | ||
345 | (1 << STRIPE_EXPANDING) |\ | ||
346 | (1 << STRIPE_SYNC_REQUESTED)) | ||
332 | /* | 347 | /* |
333 | * Operation request flags | 348 | * Operation request flags |
334 | */ | 349 | */ |
@@ -340,6 +355,24 @@ enum { | |||
340 | STRIPE_OP_RECONSTRUCT, | 355 | STRIPE_OP_RECONSTRUCT, |
341 | STRIPE_OP_CHECK, | 356 | STRIPE_OP_CHECK, |
342 | }; | 357 | }; |
358 | |||
359 | /* | ||
360 | * RAID parity calculation preferences | ||
361 | */ | ||
362 | enum { | ||
363 | PARITY_DISABLE_RMW = 0, | ||
364 | PARITY_ENABLE_RMW, | ||
365 | PARITY_PREFER_RMW, | ||
366 | }; | ||
367 | |||
368 | /* | ||
369 | * Pages requested from set_syndrome_sources() | ||
370 | */ | ||
371 | enum { | ||
372 | SYNDROME_SRC_ALL, | ||
373 | SYNDROME_SRC_WANT_DRAIN, | ||
374 | SYNDROME_SRC_WRITTEN, | ||
375 | }; | ||
343 | /* | 376 | /* |
344 | * Plugging: | 377 | * Plugging: |
345 | * | 378 | * |
@@ -396,10 +429,11 @@ struct r5conf { | |||
396 | spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; | 429 | spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; |
397 | struct mddev *mddev; | 430 | struct mddev *mddev; |
398 | int chunk_sectors; | 431 | int chunk_sectors; |
399 | int level, algorithm; | 432 | int level, algorithm, rmw_level; |
400 | int max_degraded; | 433 | int max_degraded; |
401 | int raid_disks; | 434 | int raid_disks; |
402 | int max_nr_stripes; | 435 | int max_nr_stripes; |
436 | int min_nr_stripes; | ||
403 | 437 | ||
404 | /* reshape_progress is the leading edge of a 'reshape' | 438 | /* reshape_progress is the leading edge of a 'reshape' |
405 | * It has value MaxSector when no reshape is happening | 439 | * It has value MaxSector when no reshape is happening |
@@ -458,15 +492,11 @@ struct r5conf { | |||
458 | /* per cpu variables */ | 492 | /* per cpu variables */ |
459 | struct raid5_percpu { | 493 | struct raid5_percpu { |
460 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 494 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |
461 | void *scribble; /* space for constructing buffer | 495 | struct flex_array *scribble; /* space for constructing buffer |
462 | * lists and performing address | 496 | * lists and performing address |
463 | * conversions | 497 | * conversions |
464 | */ | 498 | */ |
465 | } __percpu *percpu; | 499 | } __percpu *percpu; |
466 | size_t scribble_len; /* size of scribble region must be | ||
467 | * associated with conf to handle | ||
468 | * cpu hotplug while reshaping | ||
469 | */ | ||
470 | #ifdef CONFIG_HOTPLUG_CPU | 500 | #ifdef CONFIG_HOTPLUG_CPU |
471 | struct notifier_block cpu_notify; | 501 | struct notifier_block cpu_notify; |
472 | #endif | 502 | #endif |
@@ -480,9 +510,19 @@ struct r5conf { | |||
480 | struct llist_head released_stripes; | 510 | struct llist_head released_stripes; |
481 | wait_queue_head_t wait_for_stripe; | 511 | wait_queue_head_t wait_for_stripe; |
482 | wait_queue_head_t wait_for_overlap; | 512 | wait_queue_head_t wait_for_overlap; |
483 | int inactive_blocked; /* release of inactive stripes blocked, | 513 | unsigned long cache_state; |
484 | * waiting for 25% to be free | 514 | #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, |
485 | */ | 515 | * waiting for 25% to be free |
516 | */ | ||
517 | #define R5_ALLOC_MORE 2 /* It might help to allocate another | ||
518 | * stripe. | ||
519 | */ | ||
520 | #define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate | ||
521 | * more until at least one has been | ||
522 | * released. This avoids flooding | ||
523 | * the cache. | ||
524 | */ | ||
525 | struct shrinker shrinker; | ||
486 | int pool_size; /* number of disks in stripeheads in pool */ | 526 | int pool_size; /* number of disks in stripeheads in pool */ |
487 | spinlock_t device_lock; | 527 | spinlock_t device_lock; |
488 | struct disk_info *disks; | 528 | struct disk_info *disks; |
@@ -497,6 +537,7 @@ struct r5conf { | |||
497 | int worker_cnt_per_group; | 537 | int worker_cnt_per_group; |
498 | }; | 538 | }; |
499 | 539 | ||
540 | |||
500 | /* | 541 | /* |
501 | * Our supported algorithms | 542 | * Our supported algorithms |
502 | */ | 543 | */ |
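
The new rmw_level field and the PARITY_{DISABLE,ENABLE,PREFER}_RMW values let RAID6 choose between a read-modify-write (RMW) and a reconstruct-write (RCW) parity update per stripe. The sketch below only illustrates the I/O-count comparison such a policy rests on; the function and names here are hypothetical, and the in-tree policy (handle_stripe_dirtying() in raid5.c) also weighs rmw_level, what is already cached, and device state.

/*
 * Illustrative sketch only: compare the device reads needed for a
 * read-modify-write (RMW) parity update against a reconstruct-write
 * (RCW) for one RAID6 stripe.  Not the in-tree heuristic.
 */
#include <stdio.h>

enum update_path { PATH_RMW, PATH_RCW };

/* data_disks: data devices in the stripe; dirty: blocks being overwritten */
static enum update_path cheaper_path(int data_disks, int dirty)
{
	int rmw_reads = dirty + 2;		/* old data blocks + old P + old Q */
	int rcw_reads = data_disks - dirty;	/* the data blocks we are not writing */

	return rmw_reads < rcw_reads ? PATH_RMW : PATH_RCW;
}

int main(void)
{
	int disks, dirty;

	for (disks = 4; disks <= 8; disks++)
		for (dirty = 1; dirty < disks; dirty++)
			printf("%d data disks, %d dirty -> %s\n", disks, dirty,
			       cheaper_path(disks, dirty) == PATH_RMW ? "RMW" : "RCW");
	return 0;
}
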
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 179b38ffd351..388574ea38ed 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h | |||
@@ -60,12 +60,15 @@ struct dma_chan_ref { | |||
60 | * dependency chain | 60 | * dependency chain |
61 | * @ASYNC_TX_FENCE: specify that the next operation in the dependency | 61 | * @ASYNC_TX_FENCE: specify that the next operation in the dependency |
62 | * chain uses this operation's result as an input | 62 | * chain uses this operation's result as an input |
63 | * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the | ||
64 | * input data. Required for rmw case. | ||
63 | */ | 65 | */ |
64 | enum async_tx_flags { | 66 | enum async_tx_flags { |
65 | ASYNC_TX_XOR_ZERO_DST = (1 << 0), | 67 | ASYNC_TX_XOR_ZERO_DST = (1 << 0), |
66 | ASYNC_TX_XOR_DROP_DST = (1 << 1), | 68 | ASYNC_TX_XOR_DROP_DST = (1 << 1), |
67 | ASYNC_TX_ACK = (1 << 2), | 69 | ASYNC_TX_ACK = (1 << 2), |
68 | ASYNC_TX_FENCE = (1 << 3), | 70 | ASYNC_TX_FENCE = (1 << 3), |
71 | ASYNC_TX_PQ_XOR_DST = (1 << 4), | ||
69 | }; | 72 | }; |
70 | 73 | ||
71 | /** | 74 | /** |
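
ASYNC_TX_PQ_XOR_DST tells async_gen_syndrome() to XOR the freshly computed syndrome into the existing P/Q pages instead of overwriting them, which is what an RMW update needs. Below is a hedged sketch of a caller setting the flag, assuming the 4.x-era init_async_submit()/async_gen_syndrome() signatures; the blocks[] layout and the scribble space are set up by the (omitted) caller and the helper name is invented for illustration.

/*
 * Hedged sketch, not from the tree: ask async_gen_syndrome() to XOR the
 * computed syndrome into the existing P/Q pages (RMW-style update).
 * blocks[] holds the sources followed by the P and Q destinations.
 */
#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *
xor_into_pq(struct page **blocks, int count, size_t len,
	    struct dma_async_tx_descriptor *depend_tx, addr_conv_t *scribble)
{
	struct async_submit_ctl submit;

	init_async_submit(&submit,
			  ASYNC_TX_FENCE | ASYNC_TX_PQ_XOR_DST,
			  depend_tx, NULL, NULL, scribble);
	return async_gen_syndrome(blocks, 0, count, len, &submit);
}
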
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 73069cb6c54a..a7a06d1dcf9c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h | |||
@@ -72,6 +72,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; | |||
72 | /* Routine choices */ | 72 | /* Routine choices */ |
73 | struct raid6_calls { | 73 | struct raid6_calls { |
74 | void (*gen_syndrome)(int, size_t, void **); | 74 | void (*gen_syndrome)(int, size_t, void **); |
75 | void (*xor_syndrome)(int, int, int, size_t, void **); | ||
75 | int (*valid)(void); /* Returns 1 if this routine set is usable */ | 76 | int (*valid)(void); /* Returns 1 if this routine set is usable */ |
76 | const char *name; /* Name of this routine set */ | 77 | const char *name; /* Name of this routine set */ |
77 | int prefer; /* Has special performance attribute */ | 78 | int prefer; /* Has special performance attribute */ |
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 49f4210d4394..2ae6131e69a5 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h | |||
@@ -78,6 +78,12 @@ | |||
78 | #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ | 78 | #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ |
79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ | 79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ |
80 | #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ | 80 | #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ |
81 | #define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster | ||
82 | * For clustered environments only. | ||
83 | */ | ||
84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed | ||
85 | * For clustered environments only. | ||
86 | */ | ||
81 | 87 | ||
82 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. | 88 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. |
83 | * read requests will only be sent here in | 89 | * read requests will only be sent here in |
@@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s { | |||
101 | #define MD_SB_CLEAN 0 | 107 | #define MD_SB_CLEAN 0 |
102 | #define MD_SB_ERRORS 1 | 108 | #define MD_SB_ERRORS 1 |
103 | 109 | ||
110 | #define MD_SB_CLUSTERED 5 /* MD is clustered */ | ||
104 | #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ | 111 | #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ |
105 | 112 | ||
106 | /* | 113 | /* |
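
Like the existing MD_DISK_* flags, the new values are bit numbers in the descriptor's state word. A minimal user-space sketch of testing them; the helper below is illustrative, not part of the tree.

/*
 * Minimal sketch: the new flags are bit numbers in the mdp_disk_t
 * 'state' word, tested like the existing MD_DISK_* bits.
 */
#include <stdio.h>
#include <linux/raid/md_p.h>

static void describe_disk(const mdp_disk_t *d)
{
	if (d->state & (1 << MD_DISK_CANDIDATE))
		printf("disk %u: candidate spare, waiting for cluster-wide confirmation\n",
		       d->number);
	if (d->state & (1 << MD_DISK_CLUSTER_ADD))
		printf("disk %u: add is being propagated across the cluster\n",
		       d->number);
}

int main(void)
{
	mdp_disk_t d = { .number = 3, .state = 1 << MD_DISK_CANDIDATE };

	describe_disk(&d);
	return 0;
}
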
diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 74e7c60c4716..1cb8aa6850b5 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h | |||
@@ -62,6 +62,7 @@ | |||
62 | #define STOP_ARRAY _IO (MD_MAJOR, 0x32) | 62 | #define STOP_ARRAY _IO (MD_MAJOR, 0x32) |
63 | #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) | 63 | #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) |
64 | #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) | 64 | #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) |
65 | #define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) | ||
65 | 66 | ||
66 | /* 63 partitions with the alternate major number (mdp) */ | 67 | /* 63 partitions with the alternate major number (mdp) */ |
67 | #define MdpMinorShift 6 | 68 | #define MdpMinorShift 6 |
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index dbef2314901e..975c6e0434bd 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
@@ -131,11 +131,12 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) | |||
131 | static inline const struct raid6_calls *raid6_choose_gen( | 131 | static inline const struct raid6_calls *raid6_choose_gen( |
132 | void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) | 132 | void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) |
133 | { | 133 | { |
134 | unsigned long perf, bestperf, j0, j1; | 134 | unsigned long perf, bestgenperf, bestxorperf, j0, j1; |
135 | int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ | ||
135 | const struct raid6_calls *const *algo; | 136 | const struct raid6_calls *const *algo; |
136 | const struct raid6_calls *best; | 137 | const struct raid6_calls *best; |
137 | 138 | ||
138 | for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { | 139 | for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { |
139 | if (!best || (*algo)->prefer >= best->prefer) { | 140 | if (!best || (*algo)->prefer >= best->prefer) { |
140 | if ((*algo)->valid && !(*algo)->valid()) | 141 | if ((*algo)->valid && !(*algo)->valid()) |
141 | continue; | 142 | continue; |
@@ -153,19 +154,45 @@ static inline const struct raid6_calls *raid6_choose_gen( | |||
153 | } | 154 | } |
154 | preempt_enable(); | 155 | preempt_enable(); |
155 | 156 | ||
156 | if (perf > bestperf) { | 157 | if (perf > bestgenperf) { |
157 | bestperf = perf; | 158 | bestgenperf = perf; |
158 | best = *algo; | 159 | best = *algo; |
159 | } | 160 | } |
160 | pr_info("raid6: %-8s %5ld MB/s\n", (*algo)->name, | 161 | pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, |
161 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | 162 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); |
163 | |||
164 | if (!(*algo)->xor_syndrome) | ||
165 | continue; | ||
166 | |||
167 | perf = 0; | ||
168 | |||
169 | preempt_disable(); | ||
170 | j0 = jiffies; | ||
171 | while ((j1 = jiffies) == j0) | ||
172 | cpu_relax(); | ||
173 | while (time_before(jiffies, | ||
174 | j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { | ||
175 | (*algo)->xor_syndrome(disks, start, stop, | ||
176 | PAGE_SIZE, *dptrs); | ||
177 | perf++; | ||
178 | } | ||
179 | preempt_enable(); | ||
180 | |||
181 | if (best == *algo) | ||
182 | bestxorperf = perf; | ||
183 | |||
184 | pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, | ||
185 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); | ||
162 | } | 186 | } |
163 | } | 187 | } |
164 | 188 | ||
165 | if (best) { | 189 | if (best) { |
166 | pr_info("raid6: using algorithm %s (%ld MB/s)\n", | 190 | pr_info("raid6: using algorithm %s gen() %ld MB/s\n", |
167 | best->name, | 191 | best->name, |
168 | (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | 192 | (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); |
193 | if (best->xor_syndrome) | ||
194 | pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", | ||
195 | (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); | ||
169 | raid6_call = *best; | 196 | raid6_call = *best; |
170 | } else | 197 | } else |
171 | pr_err("raid6: Yikes! No algorithm found!\n"); | 198 | pr_err("raid6: Yikes! No algorithm found!\n"); |
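
raid6_choose_gen() now times xor_syndrome() with the same fixed-window loop it already uses for gen_syndrome(): spin until a jiffies edge, then count calls for 1<<RAID6_TIME_JIFFIES_LG2 jiffies. The user-space sketch below shows the same fixed-window technique with CLOCK_MONOTONIC standing in for jiffies; the names and the window length are illustrative.

/*
 * User-space analogue of the raid6_choose_gen() benchmark loop: run the
 * candidate routine repeatedly for a fixed wall-clock window and report
 * how many iterations fit.
 */
#include <stdio.h>
#include <time.h>

#define WINDOW_NS 16000000LL	/* ~16 ms; stands in for 1<<RAID6_TIME_JIFFIES_LG2 jiffies */

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static unsigned long bench(void (*candidate)(void))
{
	unsigned long iterations = 0;
	long long start = now_ns();

	while (now_ns() - start < WINDOW_NS) {
		candidate();
		iterations++;
	}
	return iterations;
}

static void dummy(void) { /* stand-in for (*algo)->gen_syndrome() or ->xor_syndrome() */ }

int main(void)
{
	printf("%lu iterations in the window\n", bench(dummy));
	return 0;
}
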
diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index 7cc12b532e95..bec27fce7501 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc | |||
@@ -119,6 +119,7 @@ int raid6_have_altivec(void) | |||
119 | 119 | ||
120 | const struct raid6_calls raid6_altivec$# = { | 120 | const struct raid6_calls raid6_altivec$# = { |
121 | raid6_altivec$#_gen_syndrome, | 121 | raid6_altivec$#_gen_syndrome, |
122 | NULL, /* XOR not yet implemented */ | ||
122 | raid6_have_altivec, | 123 | raid6_have_altivec, |
123 | "altivecx$#", | 124 | "altivecx$#", |
124 | 0 | 125 | 0 |
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index bc3b1dd436eb..76734004358d 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c | |||
@@ -89,6 +89,7 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
89 | 89 | ||
90 | const struct raid6_calls raid6_avx2x1 = { | 90 | const struct raid6_calls raid6_avx2x1 = { |
91 | raid6_avx21_gen_syndrome, | 91 | raid6_avx21_gen_syndrome, |
92 | NULL, /* XOR not yet implemented */ | ||
92 | raid6_have_avx2, | 93 | raid6_have_avx2, |
93 | "avx2x1", | 94 | "avx2x1", |
94 | 1 /* Has cache hints */ | 95 | 1 /* Has cache hints */ |
@@ -150,6 +151,7 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
150 | 151 | ||
151 | const struct raid6_calls raid6_avx2x2 = { | 152 | const struct raid6_calls raid6_avx2x2 = { |
152 | raid6_avx22_gen_syndrome, | 153 | raid6_avx22_gen_syndrome, |
154 | NULL, /* XOR not yet implemented */ | ||
153 | raid6_have_avx2, | 155 | raid6_have_avx2, |
154 | "avx2x2", | 156 | "avx2x2", |
155 | 1 /* Has cache hints */ | 157 | 1 /* Has cache hints */ |
@@ -242,6 +244,7 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
242 | 244 | ||
243 | const struct raid6_calls raid6_avx2x4 = { | 245 | const struct raid6_calls raid6_avx2x4 = { |
244 | raid6_avx24_gen_syndrome, | 246 | raid6_avx24_gen_syndrome, |
247 | NULL, /* XOR not yet implemented */ | ||
245 | raid6_have_avx2, | 248 | raid6_have_avx2, |
246 | "avx2x4", | 249 | "avx2x4", |
247 | 1 /* Has cache hints */ | 250 | 1 /* Has cache hints */ |
diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 5b50f8dfc5d2..558aeac9342a 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc | |||
@@ -107,9 +107,48 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
107 | } | 107 | } |
108 | } | 108 | } |
109 | 109 | ||
110 | static void raid6_int$#_xor_syndrome(int disks, int start, int stop, | ||
111 | size_t bytes, void **ptrs) | ||
112 | { | ||
113 | u8 **dptr = (u8 **)ptrs; | ||
114 | u8 *p, *q; | ||
115 | int d, z, z0; | ||
116 | |||
117 | unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
118 | |||
119 | z0 = stop; /* P/Q right side optimization */ | ||
120 | p = dptr[disks-2]; /* XOR parity */ | ||
121 | q = dptr[disks-1]; /* RS syndrome */ | ||
122 | |||
123 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
124 | /* P/Q data pages */ | ||
125 | wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; | ||
126 | for ( z = z0-1 ; z >= start ; z-- ) { | ||
127 | wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; | ||
128 | wp$$ ^= wd$$; | ||
129 | w2$$ = MASK(wq$$); | ||
130 | w1$$ = SHLBYTE(wq$$); | ||
131 | w2$$ &= NBYTES(0x1d); | ||
132 | w1$$ ^= w2$$; | ||
133 | wq$$ = w1$$ ^ wd$$; | ||
134 | } | ||
135 | /* P/Q left side optimization */ | ||
136 | for ( z = start-1 ; z >= 0 ; z-- ) { | ||
137 | w2$$ = MASK(wq$$); | ||
138 | w1$$ = SHLBYTE(wq$$); | ||
139 | w2$$ &= NBYTES(0x1d); | ||
140 | wq$$ = w1$$ ^ w2$$; | ||
141 | } | ||
142 | *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; | ||
143 | *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; | ||
144 | } | ||
145 | |||
146 | } | ||
147 | |||
110 | const struct raid6_calls raid6_intx$# = { | 148 | const struct raid6_calls raid6_intx$# = { |
111 | raid6_int$#_gen_syndrome, | 149 | raid6_int$#_gen_syndrome, |
112 | NULL, /* always valid */ | 150 | raid6_int$#_xor_syndrome, |
151 | NULL, /* always valid */ | ||
113 | "int" NSTRING "x$#", | 152 | "int" NSTRING "x$#", |
114 | 0 | 153 | 0 |
115 | }; | 154 | }; |
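
The template above is unrolled $# ways and works on machine words; a byte-at-a-time C reference of the same xor_syndrome() contract may be easier to read. It folds the data disks in [start, stop] into the existing P and Q, multiplying Q by x once per remaining disk position (the SHLBYTE/MASK/0x1d sequence). This is a standalone sketch, not kernel code.

/*
 * Byte-wise reference for the xor_syndrome() contract.  disks includes
 * the P and Q devices at the end; only data disks start..stop carry
 * new data, exactly as in the generated int$#.c code above.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* multiply one byte by x (i.e. by 2) in GF(2^8), 0x1d reduction */
static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static void xor_syndrome_ref(int disks, int start, int stop,
			     size_t bytes, uint8_t **ptrs)
{
	uint8_t *p = ptrs[disks - 2];	/* XOR parity */
	uint8_t *q = ptrs[disks - 1];	/* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		uint8_t wp = ptrs[stop][d];
		uint8_t wq = ptrs[stop][d];

		/* data disks that carry new content ("right side") */
		for (z = stop - 1; z >= start; z--) {
			wp ^= ptrs[z][d];
			wq = gf_mul2(wq) ^ ptrs[z][d];
		}
		/* disks below 'start' only shift Q further ("left side") */
		for (z = start - 1; z >= 0; z--)
			wq = gf_mul2(wq);

		p[d] ^= wp;
		q[d] ^= wq;
	}
}

int main(void)
{
	static uint8_t d0[4], d1[4] = { 1, 2, 3, 4 }, p[4], q[4];
	uint8_t *ptrs[4] = { d0, d1, p, q };

	xor_syndrome_ref(4, 0, 1, sizeof(d0), ptrs);	/* fold d0..d1 into P/Q */
	printf("updated Q[0] = 0x%02x\n", q[0]);
	return 0;
}
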
diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 590c71c9e200..b3b0e1fcd3af 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c | |||
@@ -76,6 +76,7 @@ static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
76 | 76 | ||
77 | const struct raid6_calls raid6_mmxx1 = { | 77 | const struct raid6_calls raid6_mmxx1 = { |
78 | raid6_mmx1_gen_syndrome, | 78 | raid6_mmx1_gen_syndrome, |
79 | NULL, /* XOR not yet implemented */ | ||
79 | raid6_have_mmx, | 80 | raid6_have_mmx, |
80 | "mmxx1", | 81 | "mmxx1", |
81 | 0 | 82 | 0 |
@@ -134,6 +135,7 @@ static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
134 | 135 | ||
135 | const struct raid6_calls raid6_mmxx2 = { | 136 | const struct raid6_calls raid6_mmxx2 = { |
136 | raid6_mmx2_gen_syndrome, | 137 | raid6_mmx2_gen_syndrome, |
138 | NULL, /* XOR not yet implemented */ | ||
137 | raid6_have_mmx, | 139 | raid6_have_mmx, |
138 | "mmxx2", | 140 | "mmxx2", |
139 | 0 | 141 | 0 |
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c index 36ad4705df1a..d9ad6ee284f4 100644 --- a/lib/raid6/neon.c +++ b/lib/raid6/neon.c | |||
@@ -42,6 +42,7 @@ | |||
42 | } \ | 42 | } \ |
43 | struct raid6_calls const raid6_neonx ## _n = { \ | 43 | struct raid6_calls const raid6_neonx ## _n = { \ |
44 | raid6_neon ## _n ## _gen_syndrome, \ | 44 | raid6_neon ## _n ## _gen_syndrome, \ |
45 | NULL, /* XOR not yet implemented */ \ | ||
45 | raid6_have_neon, \ | 46 | raid6_have_neon, \ |
46 | "neonx" #_n, \ | 47 | "neonx" #_n, \ |
47 | 0 \ | 48 | 0 \ |
diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index f76297139445..9025b8ca9aa3 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c | |||
@@ -92,6 +92,7 @@ static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
92 | 92 | ||
93 | const struct raid6_calls raid6_sse1x1 = { | 93 | const struct raid6_calls raid6_sse1x1 = { |
94 | raid6_sse11_gen_syndrome, | 94 | raid6_sse11_gen_syndrome, |
95 | NULL, /* XOR not yet implemented */ | ||
95 | raid6_have_sse1_or_mmxext, | 96 | raid6_have_sse1_or_mmxext, |
96 | "sse1x1", | 97 | "sse1x1", |
97 | 1 /* Has cache hints */ | 98 | 1 /* Has cache hints */ |
@@ -154,6 +155,7 @@ static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
154 | 155 | ||
155 | const struct raid6_calls raid6_sse1x2 = { | 156 | const struct raid6_calls raid6_sse1x2 = { |
156 | raid6_sse12_gen_syndrome, | 157 | raid6_sse12_gen_syndrome, |
158 | NULL, /* XOR not yet implemented */ | ||
157 | raid6_have_sse1_or_mmxext, | 159 | raid6_have_sse1_or_mmxext, |
158 | "sse1x2", | 160 | "sse1x2", |
159 | 1 /* Has cache hints */ | 161 | 1 /* Has cache hints */ |
diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 85b82c85f28e..1d2276b007ee 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c | |||
@@ -88,8 +88,58 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
88 | kernel_fpu_end(); | 88 | kernel_fpu_end(); |
89 | } | 89 | } |
90 | 90 | ||
91 | |||
92 | static void raid6_sse21_xor_syndrome(int disks, int start, int stop, | ||
93 | size_t bytes, void **ptrs) | ||
94 | { | ||
95 | u8 **dptr = (u8 **)ptrs; | ||
96 | u8 *p, *q; | ||
97 | int d, z, z0; | ||
98 | |||
99 | z0 = stop; /* P/Q right side optimization */ | ||
100 | p = dptr[disks-2]; /* XOR parity */ | ||
101 | q = dptr[disks-1]; /* RS syndrome */ | ||
102 | |||
103 | kernel_fpu_begin(); | ||
104 | |||
105 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
106 | |||
107 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
108 | asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); | ||
109 | asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); | ||
110 | asm volatile("pxor %xmm4,%xmm2"); | ||
111 | /* P/Q data pages */ | ||
112 | for ( z = z0-1 ; z >= start ; z-- ) { | ||
113 | asm volatile("pxor %xmm5,%xmm5"); | ||
114 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
115 | asm volatile("paddb %xmm4,%xmm4"); | ||
116 | asm volatile("pand %xmm0,%xmm5"); | ||
117 | asm volatile("pxor %xmm5,%xmm4"); | ||
118 | asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); | ||
119 | asm volatile("pxor %xmm5,%xmm2"); | ||
120 | asm volatile("pxor %xmm5,%xmm4"); | ||
121 | } | ||
122 | /* P/Q left side optimization */ | ||
123 | for ( z = start-1 ; z >= 0 ; z-- ) { | ||
124 | asm volatile("pxor %xmm5,%xmm5"); | ||
125 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
126 | asm volatile("paddb %xmm4,%xmm4"); | ||
127 | asm volatile("pand %xmm0,%xmm5"); | ||
128 | asm volatile("pxor %xmm5,%xmm4"); | ||
129 | } | ||
130 | asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); | ||
131 | /* Don't use movntdq for r/w memory area < cache line */ | ||
132 | asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); | ||
133 | asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); | ||
134 | } | ||
135 | |||
136 | asm volatile("sfence" : : : "memory"); | ||
137 | kernel_fpu_end(); | ||
138 | } | ||
139 | |||
91 | const struct raid6_calls raid6_sse2x1 = { | 140 | const struct raid6_calls raid6_sse2x1 = { |
92 | raid6_sse21_gen_syndrome, | 141 | raid6_sse21_gen_syndrome, |
142 | raid6_sse21_xor_syndrome, | ||
93 | raid6_have_sse2, | 143 | raid6_have_sse2, |
94 | "sse2x1", | 144 | "sse2x1", |
95 | 1 /* Has cache hints */ | 145 | 1 /* Has cache hints */ |
@@ -150,8 +200,76 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
150 | kernel_fpu_end(); | 200 | kernel_fpu_end(); |
151 | } | 201 | } |
152 | 202 | ||
203 | static void raid6_sse22_xor_syndrome(int disks, int start, int stop, | ||
204 | size_t bytes, void **ptrs) | ||
205 | { | ||
206 | u8 **dptr = (u8 **)ptrs; | ||
207 | u8 *p, *q; | ||
208 | int d, z, z0; | ||
209 | |||
210 | z0 = stop; /* P/Q right side optimization */ | ||
211 | p = dptr[disks-2]; /* XOR parity */ | ||
212 | q = dptr[disks-1]; /* RS syndrome */ | ||
213 | |||
214 | kernel_fpu_begin(); | ||
215 | |||
216 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
217 | |||
218 | for ( d = 0 ; d < bytes ; d += 32 ) { | ||
219 | asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); | ||
220 | asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); | ||
221 | asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); | ||
222 | asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); | ||
223 | asm volatile("pxor %xmm4,%xmm2"); | ||
224 | asm volatile("pxor %xmm6,%xmm3"); | ||
225 | /* P/Q data pages */ | ||
226 | for ( z = z0-1 ; z >= start ; z-- ) { | ||
227 | asm volatile("pxor %xmm5,%xmm5"); | ||
228 | asm volatile("pxor %xmm7,%xmm7"); | ||
229 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
230 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
231 | asm volatile("paddb %xmm4,%xmm4"); | ||
232 | asm volatile("paddb %xmm6,%xmm6"); | ||
233 | asm volatile("pand %xmm0,%xmm5"); | ||
234 | asm volatile("pand %xmm0,%xmm7"); | ||
235 | asm volatile("pxor %xmm5,%xmm4"); | ||
236 | asm volatile("pxor %xmm7,%xmm6"); | ||
237 | asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); | ||
238 | asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); | ||
239 | asm volatile("pxor %xmm5,%xmm2"); | ||
240 | asm volatile("pxor %xmm7,%xmm3"); | ||
241 | asm volatile("pxor %xmm5,%xmm4"); | ||
242 | asm volatile("pxor %xmm7,%xmm6"); | ||
243 | } | ||
244 | /* P/Q left side optimization */ | ||
245 | for ( z = start-1 ; z >= 0 ; z-- ) { | ||
246 | asm volatile("pxor %xmm5,%xmm5"); | ||
247 | asm volatile("pxor %xmm7,%xmm7"); | ||
248 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
249 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
250 | asm volatile("paddb %xmm4,%xmm4"); | ||
251 | asm volatile("paddb %xmm6,%xmm6"); | ||
252 | asm volatile("pand %xmm0,%xmm5"); | ||
253 | asm volatile("pand %xmm0,%xmm7"); | ||
254 | asm volatile("pxor %xmm5,%xmm4"); | ||
255 | asm volatile("pxor %xmm7,%xmm6"); | ||
256 | } | ||
257 | asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); | ||
258 | asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); | ||
259 | /* Don't use movntdq for r/w memory area < cache line */ | ||
260 | asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); | ||
261 | asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); | ||
262 | asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); | ||
263 | asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); | ||
264 | } | ||
265 | |||
266 | asm volatile("sfence" : : : "memory"); | ||
267 | kernel_fpu_end(); | ||
268 | } | ||
269 | |||
153 | const struct raid6_calls raid6_sse2x2 = { | 270 | const struct raid6_calls raid6_sse2x2 = { |
154 | raid6_sse22_gen_syndrome, | 271 | raid6_sse22_gen_syndrome, |
272 | raid6_sse22_xor_syndrome, | ||
155 | raid6_have_sse2, | 273 | raid6_have_sse2, |
156 | "sse2x2", | 274 | "sse2x2", |
157 | 1 /* Has cache hints */ | 275 | 1 /* Has cache hints */ |
@@ -248,8 +366,117 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
248 | kernel_fpu_end(); | 366 | kernel_fpu_end(); |
249 | } | 367 | } |
250 | 368 | ||
369 | static void raid6_sse24_xor_syndrome(int disks, int start, int stop, | ||
370 | size_t bytes, void **ptrs) | ||
371 | { | ||
372 | u8 **dptr = (u8 **)ptrs; | ||
373 | u8 *p, *q; | ||
374 | int d, z, z0; | ||
375 | |||
376 | z0 = stop; /* P/Q right side optimization */ | ||
377 | p = dptr[disks-2]; /* XOR parity */ | ||
378 | q = dptr[disks-1]; /* RS syndrome */ | ||
379 | |||
380 | kernel_fpu_begin(); | ||
381 | |||
382 | asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); | ||
383 | |||
384 | for ( d = 0 ; d < bytes ; d += 64 ) { | ||
385 | asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); | ||
386 | asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); | ||
387 | asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); | ||
388 | asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); | ||
389 | asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); | ||
390 | asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); | ||
391 | asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); | ||
392 | asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); | ||
393 | asm volatile("pxor %xmm4,%xmm2"); | ||
394 | asm volatile("pxor %xmm6,%xmm3"); | ||
395 | asm volatile("pxor %xmm12,%xmm10"); | ||
396 | asm volatile("pxor %xmm14,%xmm11"); | ||
397 | /* P/Q data pages */ | ||
398 | for ( z = z0-1 ; z >= start ; z-- ) { | ||
399 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); | ||
400 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); | ||
401 | asm volatile("pxor %xmm5,%xmm5"); | ||
402 | asm volatile("pxor %xmm7,%xmm7"); | ||
403 | asm volatile("pxor %xmm13,%xmm13"); | ||
404 | asm volatile("pxor %xmm15,%xmm15"); | ||
405 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
406 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
407 | asm volatile("pcmpgtb %xmm12,%xmm13"); | ||
408 | asm volatile("pcmpgtb %xmm14,%xmm15"); | ||
409 | asm volatile("paddb %xmm4,%xmm4"); | ||
410 | asm volatile("paddb %xmm6,%xmm6"); | ||
411 | asm volatile("paddb %xmm12,%xmm12"); | ||
412 | asm volatile("paddb %xmm14,%xmm14"); | ||
413 | asm volatile("pand %xmm0,%xmm5"); | ||
414 | asm volatile("pand %xmm0,%xmm7"); | ||
415 | asm volatile("pand %xmm0,%xmm13"); | ||
416 | asm volatile("pand %xmm0,%xmm15"); | ||
417 | asm volatile("pxor %xmm5,%xmm4"); | ||
418 | asm volatile("pxor %xmm7,%xmm6"); | ||
419 | asm volatile("pxor %xmm13,%xmm12"); | ||
420 | asm volatile("pxor %xmm15,%xmm14"); | ||
421 | asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); | ||
422 | asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); | ||
423 | asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); | ||
424 | asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); | ||
425 | asm volatile("pxor %xmm5,%xmm2"); | ||
426 | asm volatile("pxor %xmm7,%xmm3"); | ||
427 | asm volatile("pxor %xmm13,%xmm10"); | ||
428 | asm volatile("pxor %xmm15,%xmm11"); | ||
429 | asm volatile("pxor %xmm5,%xmm4"); | ||
430 | asm volatile("pxor %xmm7,%xmm6"); | ||
431 | asm volatile("pxor %xmm13,%xmm12"); | ||
432 | asm volatile("pxor %xmm15,%xmm14"); | ||
433 | } | ||
434 | asm volatile("prefetchnta %0" :: "m" (q[d])); | ||
435 | asm volatile("prefetchnta %0" :: "m" (q[d+32])); | ||
436 | /* P/Q left side optimization */ | ||
437 | for ( z = start-1 ; z >= 0 ; z-- ) { | ||
438 | asm volatile("pxor %xmm5,%xmm5"); | ||
439 | asm volatile("pxor %xmm7,%xmm7"); | ||
440 | asm volatile("pxor %xmm13,%xmm13"); | ||
441 | asm volatile("pxor %xmm15,%xmm15"); | ||
442 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
443 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
444 | asm volatile("pcmpgtb %xmm12,%xmm13"); | ||
445 | asm volatile("pcmpgtb %xmm14,%xmm15"); | ||
446 | asm volatile("paddb %xmm4,%xmm4"); | ||
447 | asm volatile("paddb %xmm6,%xmm6"); | ||
448 | asm volatile("paddb %xmm12,%xmm12"); | ||
449 | asm volatile("paddb %xmm14,%xmm14"); | ||
450 | asm volatile("pand %xmm0,%xmm5"); | ||
451 | asm volatile("pand %xmm0,%xmm7"); | ||
452 | asm volatile("pand %xmm0,%xmm13"); | ||
453 | asm volatile("pand %xmm0,%xmm15"); | ||
454 | asm volatile("pxor %xmm5,%xmm4"); | ||
455 | asm volatile("pxor %xmm7,%xmm6"); | ||
456 | asm volatile("pxor %xmm13,%xmm12"); | ||
457 | asm volatile("pxor %xmm15,%xmm14"); | ||
458 | } | ||
459 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
460 | asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); | ||
461 | asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); | ||
462 | asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); | ||
463 | asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); | ||
464 | asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); | ||
465 | asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); | ||
466 | asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); | ||
467 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
468 | asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); | ||
469 | asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); | ||
470 | asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); | ||
471 | } | ||
472 | asm volatile("sfence" : : : "memory"); | ||
473 | kernel_fpu_end(); | ||
474 | } | ||
475 | |||
476 | |||
251 | const struct raid6_calls raid6_sse2x4 = { | 477 | const struct raid6_calls raid6_sse2x4 = { |
252 | raid6_sse24_gen_syndrome, | 478 | raid6_sse24_gen_syndrome, |
479 | raid6_sse24_xor_syndrome, | ||
253 | raid6_have_sse2, | 480 | raid6_have_sse2, |
254 | "sse2x4", | 481 | "sse2x4", |
255 | 1 /* Has cache hints */ | 482 | 1 /* Has cache hints */ |
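
The SSE2 xor_syndrome() bodies perform the same per-byte GF(2^8) doubling as the portable template, just on whole XMM registers. The intrinsics transcription of one inner-loop step below is only illustrative; the kernel keeps this as inline asm bracketed by kernel_fpu_begin()/kernel_fpu_end() because ordinary kernel C must not touch the FPU/SSE state.

/*
 * Illustrative only: the pcmpgtb/paddb/pand/pxor sequence from the inner
 * loops above, rewritten with SSE2 intrinsics for one 16-byte lane.
 */
#include <emmintrin.h>

/* multiply every byte of q by x (by 2) in GF(2^8), 0x1d reduction */
__m128i gf_mul2_x16(__m128i q, __m128i x1d /* sixteen 0x1d bytes */)
{
	__m128i zero = _mm_setzero_si128();
	/* 0xff in bytes whose top bit is set: pcmpgtb against zero */
	__m128i carry = _mm_cmpgt_epi8(zero, q);

	q = _mm_add_epi8(q, q);			/* paddb: byte-wise shift left by 1 */
	carry = _mm_and_si128(carry, x1d);	/* pand with the 0x1d constant */
	return _mm_xor_si128(q, carry);		/* pxor */
}

/* one step of the "P/Q data pages" loop for a 16-byte chunk */
void pq_step(__m128i *wp, __m128i *wq, const void *data, __m128i x1d)
{
	__m128i wd = _mm_loadu_si128((const __m128i *)data);

	*wp = _mm_xor_si128(*wp, wd);
	*wq = _mm_xor_si128(gf_mul2_x16(*wq, x1d), wd);
}
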
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 5a485b7a7d3c..3bebbabdb510 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c | |||
@@ -28,11 +28,11 @@ char *dataptrs[NDISKS]; | |||
28 | char data[NDISKS][PAGE_SIZE]; | 28 | char data[NDISKS][PAGE_SIZE]; |
29 | char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; | 29 | char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; |
30 | 30 | ||
31 | static void makedata(void) | 31 | static void makedata(int start, int stop) |
32 | { | 32 | { |
33 | int i, j; | 33 | int i, j; |
34 | 34 | ||
35 | for (i = 0; i < NDISKS; i++) { | 35 | for (i = start; i <= stop; i++) { |
36 | for (j = 0; j < PAGE_SIZE; j++) | 36 | for (j = 0; j < PAGE_SIZE; j++) |
37 | data[i][j] = rand(); | 37 | data[i][j] = rand(); |
38 | 38 | ||
@@ -91,34 +91,55 @@ int main(int argc, char *argv[]) | |||
91 | { | 91 | { |
92 | const struct raid6_calls *const *algo; | 92 | const struct raid6_calls *const *algo; |
93 | const struct raid6_recov_calls *const *ra; | 93 | const struct raid6_recov_calls *const *ra; |
94 | int i, j; | 94 | int i, j, p1, p2; |
95 | int err = 0; | 95 | int err = 0; |
96 | 96 | ||
97 | makedata(); | 97 | makedata(0, NDISKS-1); |
98 | 98 | ||
99 | for (ra = raid6_recov_algos; *ra; ra++) { | 99 | for (ra = raid6_recov_algos; *ra; ra++) { |
100 | if ((*ra)->valid && !(*ra)->valid()) | 100 | if ((*ra)->valid && !(*ra)->valid()) |
101 | continue; | 101 | continue; |
102 | |||
102 | raid6_2data_recov = (*ra)->data2; | 103 | raid6_2data_recov = (*ra)->data2; |
103 | raid6_datap_recov = (*ra)->datap; | 104 | raid6_datap_recov = (*ra)->datap; |
104 | 105 | ||
105 | printf("using recovery %s\n", (*ra)->name); | 106 | printf("using recovery %s\n", (*ra)->name); |
106 | 107 | ||
107 | for (algo = raid6_algos; *algo; algo++) { | 108 | for (algo = raid6_algos; *algo; algo++) { |
108 | if (!(*algo)->valid || (*algo)->valid()) { | 109 | if ((*algo)->valid && !(*algo)->valid()) |
109 | raid6_call = **algo; | 110 | continue; |
111 | |||
112 | raid6_call = **algo; | ||
113 | |||
114 | /* Nuke syndromes */ | ||
115 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | ||
116 | |||
117 | /* Generate assumed good syndrome */ | ||
118 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | ||
119 | (void **)&dataptrs); | ||
120 | |||
121 | for (i = 0; i < NDISKS-1; i++) | ||
122 | for (j = i+1; j < NDISKS; j++) | ||
123 | err += test_disks(i, j); | ||
124 | |||
125 | if (!raid6_call.xor_syndrome) | ||
126 | continue; | ||
127 | |||
128 | for (p1 = 0; p1 < NDISKS-2; p1++) | ||
129 | for (p2 = p1; p2 < NDISKS-2; p2++) { | ||
110 | 130 | ||
111 | /* Nuke syndromes */ | 131 | /* Simulate rmw run */ |
112 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | 132 | raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, |
133 | (void **)&dataptrs); | ||
134 | makedata(p1, p2); | ||
135 | raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, | ||
136 | (void **)&dataptrs); | ||
113 | 137 | ||
114 | /* Generate assumed good syndrome */ | 138 | for (i = 0; i < NDISKS-1; i++) |
115 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | 139 | for (j = i+1; j < NDISKS; j++) |
116 | (void **)&dataptrs); | 140 | err += test_disks(i, j); |
141 | } | ||
117 | 142 | ||
118 | for (i = 0; i < NDISKS-1; i++) | ||
119 | for (j = i+1; j < NDISKS; j++) | ||
120 | err += test_disks(i, j); | ||
121 | } | ||
122 | } | 143 | } |
123 | printf("\n"); | 144 | printf("\n"); |
124 | } | 145 | } |
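
The "Simulate rmw run" block works because applying xor_syndrome() with the old contents of disks p1..p2 and then again with their new contents leaves P/Q consistent with the new data. A tiny scalar demonstration of that identity for the XOR parity (Q follows the same pattern with the GF(2^8) weights):

/*
 * Minimal demonstration (XOR parity only) of the identity the test
 * relies on: P_new = P_old ^ D_old ^ D_new.  Values are arbitrary.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t d[3] = { 0x11, 0x22, 0x33 };
	uint8_t p = d[0] ^ d[1] ^ d[2];		/* parity of the old data */

	p ^= d[1];				/* xor_syndrome() with old d[1] */
	d[1] = 0x5a;				/* makedata() over the rmw range */
	p ^= d[1];				/* xor_syndrome() with new d[1] */

	assert(p == (d[0] ^ d[1] ^ d[2]));	/* matches a full regeneration */
	printf("rmw-updated parity: 0x%02x\n", p);
	return 0;
}
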
diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc index e7c29459cbcd..2dd291a11264 100644 --- a/lib/raid6/tilegx.uc +++ b/lib/raid6/tilegx.uc | |||
@@ -80,6 +80,7 @@ void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
80 | 80 | ||
81 | const struct raid6_calls raid6_tilegx$# = { | 81 | const struct raid6_calls raid6_tilegx$# = { |
82 | raid6_tilegx$#_gen_syndrome, | 82 | raid6_tilegx$#_gen_syndrome, |
83 | NULL, /* XOR not yet implemented */ | ||
83 | NULL, | 84 | NULL, |
84 | "tilegx$#", | 85 | "tilegx$#", |
85 | 0 | 86 | 0 |