author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /drivers/md
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 66
-rw-r--r--  drivers/md/Makefile | 6
-rw-r--r--  drivers/md/bitmap.c | 1370
-rw-r--r--  drivers/md/bitmap.h | 95
-rw-r--r--  drivers/md/dm-bio-prison.c | 390
-rw-r--r--  drivers/md/dm-bio-prison.h | 71
-rw-r--r--  drivers/md/dm-bufio.c | 1750
-rw-r--r--  drivers/md/dm-bufio.h | 120
-rw-r--r--  drivers/md/dm-crypt.c | 721
-rw-r--r--  drivers/md/dm-delay.c | 16
-rw-r--r--  drivers/md/dm-exception-store.c | 16
-rw-r--r--  drivers/md/dm-flakey.c | 37
-rw-r--r--  drivers/md/dm-io.c | 47
-rw-r--r--  drivers/md/dm-ioctl.c | 87
-rw-r--r--  drivers/md/dm-kcopyd.c | 47
-rw-r--r--  drivers/md/dm-linear.c | 23
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 38
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c | 2
-rw-r--r--  drivers/md/dm-log.c | 16
-rw-r--r--  drivers/md/dm-mpath.c | 172
-rw-r--r--  drivers/md/dm-path-selector.c | 1
-rw-r--r--  drivers/md/dm-queue-length.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 412
-rw-r--r--  drivers/md/dm-raid1.c | 100
-rw-r--r--  drivers/md/dm-region-hash.c | 5
-rw-r--r--  drivers/md/dm-round-robin.c | 4
-rw-r--r--  drivers/md/dm-service-time.c | 6
-rw-r--r--  drivers/md/dm-snap-persistent.c | 1
-rw-r--r--  drivers/md/dm-snap-transient.c | 1
-rw-r--r--  drivers/md/dm-snap.c | 124
-rw-r--r--  drivers/md/dm-stripe.c | 112
-rw-r--r--  drivers/md/dm-table.c | 185
-rw-r--r--  drivers/md/dm-target.c | 5
-rw-r--r--  drivers/md/dm-thin-metadata.c | 1686
-rw-r--r--  drivers/md/dm-thin-metadata.h | 197
-rw-r--r--  drivers/md/dm-thin.c | 2818
-rw-r--r--  drivers/md/dm-uevent.c | 1
-rw-r--r--  drivers/md/dm-verity.c | 898
-rw-r--r--  drivers/md/dm-zero.c | 5
-rw-r--r--  drivers/md/dm.c | 419
-rw-r--r--  drivers/md/dm.h | 10
-rw-r--r--  drivers/md/faulty.c | 56
-rw-r--r--  drivers/md/linear.c | 125
-rw-r--r--  drivers/md/linear.h | 12
-rw-r--r--  drivers/md/md.c | 1848
-rw-r--r--  drivers/md/md.h | 296
-rw-r--r--  drivers/md/multipath.c | 80
-rw-r--r--  drivers/md/multipath.h | 10
-rw-r--r--  drivers/md/persistent-data/Kconfig | 8
-rw-r--r--  drivers/md/persistent-data/Makefile | 10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 635
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 128
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 134
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 590
-rw-r--r--  drivers/md/persistent-data/dm-btree-spine.c | 244
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 809
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 145
-rw-r--r--  drivers/md/persistent-data/dm-persistent-data-internal.h | 19
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 712
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.h | 127
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c | 318
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.h | 25
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 596
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.h | 33
-rw-r--r--  drivers/md/persistent-data/dm-space-map.h | 134
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 382
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h | 131
-rw-r--r--  drivers/md/raid0.c | 417
-rw-r--r--  drivers/md/raid0.h | 19
-rw-r--r--  drivers/md/raid1.c | 917
-rw-r--r--  drivers/md/raid1.h | 108
-rw-r--r--  drivers/md/raid10.c | 2599
-rw-r--r--  drivers/md/raid10.h | 128
-rw-r--r--  drivers/md/raid5.c | 1921
-rw-r--r--  drivers/md/raid5.h | 129
75 files changed, 4259 insertions, 21669 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb31..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -208,23 +208,6 @@ config DM_DEBUG
 
 	  If unsure, say N.
 
-config DM_BUFIO
-	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	---help---
-	  This interface allows you to do buffered I/O on a device and acts
-	  as a cache, holding recently-read blocks in memory and performing
-	  delayed writes.
-
-config DM_BIO_PRISON
-	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	---help---
-	  Some bio locking schemes used by other device-mapper targets
-	  including thin provisioning.
-
-source "drivers/md/persistent-data/Kconfig"
-
 config DM_CRYPT
 	tristate "Crypt target support"
 	depends on BLK_DEV_DM
@@ -250,24 +233,6 @@ config DM_SNAPSHOT
 	---help---
 	  Allow volume managers to take writable snapshots of a device.
 
-config DM_THIN_PROVISIONING
-	tristate "Thin provisioning target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	select DM_PERSISTENT_DATA
-	select DM_BIO_PRISON
-	---help---
-	  Provides thin provisioning and snapshots that share a data store.
-
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of thin provisioning block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning.
-
-	  If unsure, say N.
-
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
@@ -276,14 +241,13 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6/10 target"
-	depends on BLK_DEV_DM
+	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
 	select MD_RAID1
-	select MD_RAID10
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	  A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
+	  A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
@@ -359,8 +323,8 @@ config DM_DELAY
 	  If unsure, say N.
 
 config DM_UEVENT
-	bool "DM uevents"
-	depends on BLK_DEV_DM
+	bool "DM uevents (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
 	---help---
 	  Generate udev events for DM events.
 
@@ -370,24 +334,4 @@ config DM_FLAKEY
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
-config DM_VERITY
-	tristate "Verity target support (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
-	select CRYPTO
-	select CRYPTO_HASH
-	select DM_BUFIO
-	---help---
-	  This device-mapper target creates a read-only device that
-	  transparently validates the data on one underlying device against
-	  a pre-generated tree of cryptographic checksums stored on a second
-	  device.
-
-	  You'll need to activate the digests you're going to use in the
-	  cryptoapi configuration.
-
-	  To compile this code as a module, choose M here: the module will
-	  be called dm-verity.
-
-	  If unsure, say N.
-
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b4932..448838b1f92 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,7 +10,6 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
 dm-mirror-y += dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
-dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
 md-mod-y += md.o bitmap.o
 raid456-y += raid5.o
 
@@ -28,8 +27,6 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o
 obj-$(CONFIG_MD_FAULTY) += faulty.o
 obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
-obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
-obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
 obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
 obj-$(CONFIG_DM_DELAY) += dm-delay.o
 obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
@@ -37,13 +34,10 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
-obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO) += dm-zero.o
 obj-$(CONFIG_DM_RAID) += dm-raid.o
-obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
-obj-$(CONFIG_DM_VERITY) += dm-verity.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7155945f8eb..0dc6546b77a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,16 +26,73 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/seq_file.h>
 #include "md.h"
 #include "bitmap.h"
 
+/* debug macros */
+
+#define DEBUG 0
+
+#if DEBUG
+/* these are for debugging purposes only! */
+
+/* define one and only one of these */
+#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
+#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
+#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
+#define INJECT_FAULTS_4 0 /* undef */
+#define INJECT_FAULTS_5 0 /* undef */
+#define INJECT_FAULTS_6 0
+
+/* if these are defined, the driver will fail! debug only */
+#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
+#define INJECT_FATAL_FAULT_2 0 /* undef */
+#define INJECT_FATAL_FAULT_3 0 /* undef */
+#endif
+
+#ifndef PRINTK
+# if DEBUG > 0
+# define PRINTK(x...) printk(KERN_DEBUG x)
+# else
+# define PRINTK(x...)
+# endif
+#endif
+
 static inline char *bmname(struct bitmap *bitmap)
 {
 	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
 }
 
 /*
+ * just a placeholder - calls kmalloc for bitmap pages
+ */
+static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
+{
+	unsigned char *page;
+
+#ifdef INJECT_FAULTS_1
+	page = NULL;
+#else
+	page = kzalloc(PAGE_SIZE, GFP_NOIO);
+#endif
+	if (!page)
+		printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
+	else
+		PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
+			bmname(bitmap), page);
+	return page;
+}
+
+/*
+ * for now just a placeholder -- just calls kfree for bitmap pages
+ */
+static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
+{
+	PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
+	kfree(page);
+}
+
+/*
  * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
  *
  * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -45,7 +102,7 @@ static inline char *bmname(struct bitmap *bitmap)
  * if we find our page, we increment the page's refcount so that it stays
  * allocated while we're using it
  */
-static int bitmap_checkpage(struct bitmap_counts *bitmap,
+static int bitmap_checkpage(struct bitmap *bitmap,
 			    unsigned long page, int create)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
@@ -72,11 +129,12 @@ __acquires(bitmap->lock)
 	/* this page has not been allocated yet */
 
 	spin_unlock_irq(&bitmap->lock);
-	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
+	mappage = bitmap_alloc_page(bitmap);
 	spin_lock_irq(&bitmap->lock);
 
 	if (mappage == NULL) {
-		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
+		PRINTK("%s: bitmap map page allocation failed, hijacking\n",
+			bmname(bitmap));
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -84,7 +142,7 @@ __acquires(bitmap->lock)
 	} else if (bitmap->bp[page].map ||
 		   bitmap->bp[page].hijacked) {
 		/* somebody beat us to getting the page */
-		kfree(mappage);
+		bitmap_free_page(bitmap, mappage);
 		return 0;
 	} else {
 
@@ -99,7 +157,7 @@ __acquires(bitmap->lock)
 /* if page is completely empty, put it back on the free list, or dealloc it */
 /* if page was hijacked, unmark the flag so it might get alloced next time */
 /* Note: lock should be held when calling this */
-static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
+static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
 {
 	char *ptr;
 
@@ -116,7 +174,7 @@ static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
 		ptr = bitmap->bp[page].map;
 		bitmap->bp[page].map = NULL;
 		bitmap->missing_pages++;
-		kfree(ptr);
+		bitmap_free_page(bitmap, ptr);
 	}
 }
 
@@ -129,16 +187,24 @@ static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static int read_sb_page(struct mddev *mddev, loff_t offset,
+static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
 			struct page *page,
 			unsigned long index, int size)
 {
 	/* choose a good rdev and read the page from there */
 
-	struct md_rdev *rdev;
+	mdk_rdev_t *rdev;
 	sector_t target;
+	int did_alloc = 0;
+
+	if (!page) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			return ERR_PTR(-ENOMEM);
+		did_alloc = 1;
+	}
 
-	rdev_for_each(rdev, mddev) {
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (! test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
@@ -149,13 +215,18 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
 		    roundup(size, bdev_logical_block_size(rdev->bdev)),
 		    page, READ, true)) {
 			page->index = index;
-			return 0;
+			attach_page_buffers(page, NULL); /* so that free_buffer will
+							  * quietly no-op */
+			return page;
 		}
 	}
-	return -EIO;
+	if (did_alloc)
+		put_page(page);
+	return ERR_PTR(-EIO);
+
 }
 
-static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
+static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
 	/* Iterate the disks of an mddev, using rcu to protect access to the
 	 * linked list, and raising the refcount of devices we return to ensure
@@ -163,17 +234,20 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	 * As devices are only added or removed when raid_disk is < 0 and
 	 * nr_pending is 0 and In_sync is clear, the entries we return will
 	 * still be in the same position on the list when we re-enter
-	 * list_for_each_entry_continue_rcu.
+	 * list_for_each_continue_rcu.
 	 */
+	struct list_head *pos;
 	rcu_read_lock();
 	if (rdev == NULL)
 		/* start at the beginning */
-		rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
+		pos = &mddev->disks;
 	else {
 		/* release the previous rdev and start from there. */
 		rdev_dec_pending(rdev, mddev);
+		pos = &rdev->same_set;
 	}
-	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
+	list_for_each_continue_rcu(pos, &mddev->disks) {
+		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
@@ -188,10 +262,9 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	mdk_rdev_t *rdev = NULL;
 	struct block_device *bdev;
-	struct mddev *mddev = bitmap->mddev;
-	struct bitmap_storage *store = &bitmap->storage;
+	mddev_t *mddev = bitmap->mddev;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -199,13 +272,9 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 
-		if (page->index == store->file_pages-1) {
-			int last_page_size = store->bytes & (PAGE_SIZE-1);
-			if (last_page_size == 0)
-				last_page_size = PAGE_SIZE;
-			size = roundup(last_page_size,
+		if (page->index == bitmap->file_pages-1)
+			size = roundup(bitmap->last_page_size,
 				       bdev_logical_block_size(bdev));
-		}
 		/* Just make sure we aren't corrupting data or
 		 * metadata
 		 */
@@ -264,10 +333,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh;
 
-	if (bitmap->storage.file == NULL) {
+	if (bitmap->file == NULL) {
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
-			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+			bitmap->flags |= BITMAP_WRITE_ERROR;
 		}
 	} else {
 
@@ -285,16 +354,20 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 		wait_event(bitmap->write_wait,
 			   atomic_read(&bitmap->pending_writes)==0);
 	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
 {
 	struct bitmap *bitmap = bh->b_private;
+	unsigned long flags;
 
-	if (!uptodate)
-		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+	if (!uptodate) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		bitmap->flags |= BITMAP_WRITE_ERROR;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+	}
 	if (atomic_dec_and_test(&bitmap->pending_writes))
 		wake_up(&bitmap->write_wait);
 }
@@ -309,12 +382,8 @@ __clear_page_buffers(struct page *page)
 }
 static void free_buffers(struct page *page)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = page_buffers(page);
 
-	if (!PagePrivate(page))
-		return;
-
-	bh = page_buffers(page);
 	while (bh) {
 		struct buffer_head *next = bh->b_this_page;
 		free_buffer_head(bh);
@@ -331,22 +400,28 @@ static void free_buffers(struct page *page)
  * This usage is similar to how swap files are handled, and allows us
  * to write to a file with no concerns of memory allocation failing.
  */
-static int read_page(struct file *file, unsigned long index,
+static struct page *read_page(struct file *file, unsigned long index,
 		     struct bitmap *bitmap,
-		     unsigned long count,
-		     struct page *page)
+		     unsigned long count)
 {
-	int ret = 0;
+	struct page *page = NULL;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct buffer_head *bh;
 	sector_t block;
 
-	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
+	PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 		(unsigned long long)index << PAGE_SHIFT);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		page = ERR_PTR(-ENOMEM);
+	if (IS_ERR(page))
+		goto out;
 
 	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
 	if (!bh) {
-		ret = -ENOMEM;
+		put_page(page);
+		page = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 	attach_page_buffers(page, bh);
@@ -358,7 +433,8 @@ static int read_page(struct file *file, unsigned long index,
 		bh->b_blocknr = bmap(inode, block);
 		if (bh->b_blocknr == 0) {
 			/* Cannot use this file! */
-			ret = -EINVAL;
+			free_buffers(page);
+			page = ERR_PTR(-EINVAL);
 			goto out;
 		}
 		bh->b_bdev = inode->i_sb->s_bdev;
@@ -381,15 +457,17 @@ static int read_page(struct file *file, unsigned long index,
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
-		ret = -EIO;
+	if (bitmap->flags & BITMAP_WRITE_ERROR) {
+		free_buffers(page);
+		page = ERR_PTR(-EIO);
+	}
 out:
-	if (ret)
-		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
+	if (IS_ERR(page))
+		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n",
 			(int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT,
-			ret);
-	return ret;
+			PTR_ERR(page));
+	return page;
 }
 
 /*
@@ -400,14 +478,19 @@ out:
 void bitmap_update_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
+	unsigned long flags;
 
 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
 		return;
 	if (bitmap->mddev->bitmap_info.external)
 		return;
-	if (!bitmap->storage.sb_page) /* no superblock */
+	spin_lock_irqsave(&bitmap->lock, flags);
+	if (!bitmap->sb_page) { /* no superblock */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
 		return;
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	}
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
@@ -417,13 +500,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
-	/* This might have been changed by a reshape */
-	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
-	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
-	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
-					   bitmap_info.space);
-	kunmap_atomic(sb);
-	write_page(bitmap, bitmap->storage.sb_page, 1);
+	kunmap_atomic(sb, KM_USER0);
+	write_page(bitmap, bitmap->sb_page, 1);
 }
 
 /* print out the bitmap file superblock */
@@ -431,9 +509,9 @@ void bitmap_print_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 
-	if (!bitmap || !bitmap->storage.sb_page)
+	if (!bitmap || !bitmap->sb_page)
 		return;
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -452,7 +530,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	printk(KERN_DEBUG " sync size: %llu KB\n",
 	       (unsigned long long)le64_to_cpu(sb->sync_size)/2);
 	printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
-	kunmap_atomic(sb);
+	kunmap_atomic(sb, KM_USER0);
 }
 
 /*
@@ -470,13 +548,17 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
+	int err = -EINVAL;
 
-	bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
-	if (bitmap->storage.sb_page == NULL)
-		return -ENOMEM;
-	bitmap->storage.sb_page->index = 0;
+	bitmap->sb_page = alloc_page(GFP_KERNEL);
+	if (IS_ERR(bitmap->sb_page)) {
+		err = PTR_ERR(bitmap->sb_page);
+		bitmap->sb_page = NULL;
+		return err;
+	}
+	bitmap->sb_page->index = 0;
 
-	sb = kmap_atomic(bitmap->storage.sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	sb->magic = cpu_to_le32(BITMAP_MAGIC);
 	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -484,7 +566,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	chunksize = bitmap->mddev->bitmap_info.chunksize;
 	BUG_ON(!chunksize);
 	if (!is_power_of_2(chunksize)) {
-		kunmap_atomic(sb);
+		kunmap_atomic(sb, KM_USER0);
 		printk(KERN_ERR "bitmap chunksize not a power of 2\n");
 		return -EINVAL;
 	}
@@ -514,12 +596,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 
 	memcpy(sb->uuid, bitmap->mddev->uuid, 16);
 
-	set_bit(BITMAP_STALE, &bitmap->flags);
-	sb->state = cpu_to_le32(bitmap->flags);
+	bitmap->flags |= BITMAP_STALE;
+	sb->state |= cpu_to_le32(BITMAP_STALE);
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
 
-	kunmap_atomic(sb);
+	bitmap->flags |= BITMAP_HOSTENDIAN;
+	sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
+
+	kunmap_atomic(sb, KM_USER0);
 
 	return 0;
 }
@@ -531,45 +616,31 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
-	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
-	struct page *sb_page;
 
-	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
-		chunksize = 128 * 1024 * 1024;
-		daemon_sleep = 5 * HZ;
-		write_behind = 0;
-		set_bit(BITMAP_STALE, &bitmap->flags);
-		err = 0;
-		goto out_no_sb;
-	}
 	/* page 0 is the superblock, read it... */
-	sb_page = alloc_page(GFP_KERNEL);
-	if (!sb_page)
-		return -ENOMEM;
-	bitmap->storage.sb_page = sb_page;
-
-	if (bitmap->storage.file) {
-		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
+	if (bitmap->file) {
+		loff_t isize = i_size_read(bitmap->file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
 
-		err = read_page(bitmap->storage.file, 0,
-				bitmap, bytes, sb_page);
+		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
 	} else {
-		err = read_sb_page(bitmap->mddev,
+		bitmap->sb_page = read_sb_page(bitmap->mddev,
 				   bitmap->mddev->bitmap_info.offset,
-				   sb_page,
+				   NULL,
 				   0, sizeof(bitmap_super_t));
 	}
-	if (err)
+	if (IS_ERR(bitmap->sb_page)) {
+		err = PTR_ERR(bitmap->sb_page);
+		bitmap->sb_page = NULL;
 		return err;
+	}
 
-	sb = kmap_atomic(sb_page);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
-	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -594,50 +665,81 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	/* keep the array size field of the bitmap superblock up to date */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 
-	if (bitmap->mddev->persistent) {
-		/*
-		 * We have a persistent array superblock, so compare the
-		 * bitmap's UUID and event counter to the mddev's
-		 */
-		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-			printk(KERN_INFO
-			       "%s: bitmap superblock UUID mismatch\n",
-			       bmname(bitmap));
-			goto out;
-		}
-		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
-			printk(KERN_INFO
-			       "%s: bitmap file is out of date (%llu < %llu) "
-			       "-- forcing full recovery\n",
-			       bmname(bitmap), events,
-			       (unsigned long long) bitmap->mddev->events);
-			set_bit(BITMAP_STALE, &bitmap->flags);
-		}
-	}
+	if (!bitmap->mddev->persistent)
+		goto success;
 
+	/*
+	 * if we have a persistent array superblock, compare the
+	 * bitmap's UUID and event counter to the mddev's
+	 */
+	if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+		printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
+			bmname(bitmap));
+		goto out;
+	}
+	events = le64_to_cpu(sb->events);
+	if (events < bitmap->mddev->events) {
+		printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
+			"-- forcing full recovery\n", bmname(bitmap), events,
+			(unsigned long long) bitmap->mddev->events);
+		sb->state |= cpu_to_le32(BITMAP_STALE);
+	}
+success:
 	/* assign fields using values from superblock */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
-		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
+		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	if (bitmap->flags & BITMAP_STALE)
+		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
-	kunmap_atomic(sb);
-out_no_sb:
-	if (test_bit(BITMAP_STALE, &bitmap->flags))
-		bitmap->events_cleared = bitmap->mddev->events;
-	bitmap->mddev->bitmap_info.chunksize = chunksize;
-	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
-	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
-	if (bitmap->mddev->bitmap_info.space == 0 ||
-	    bitmap->mddev->bitmap_info.space > sectors_reserved)
-		bitmap->mddev->bitmap_info.space = sectors_reserved;
+	kunmap_atomic(sb, KM_USER0);
 	if (err)
 		bitmap_print_sb(bitmap);
 	return err;
 }
 
+enum bitmap_mask_op {
+	MASK_SET,
+	MASK_UNSET
+};
+
+/* record the state of the bitmap in the superblock. Return the old value */
+static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
+			     enum bitmap_mask_op op)
+{
+	bitmap_super_t *sb;
+	unsigned long flags;
+	int old;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	if (!bitmap->sb_page) { /* can't set the state */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+		return 0;
+	}
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+	old = le32_to_cpu(sb->state) & bits;
+	switch (op) {
+	case MASK_SET:
+		sb->state |= cpu_to_le32(bits);
+		bitmap->flags |= bits;
+		break;
+	case MASK_UNSET:
+		sb->state &= cpu_to_le32(~bits);
+		bitmap->flags &= ~bits;
+		break;
+	default:
+		BUG();
+	}
+	kunmap_atomic(sb, KM_USER0);
+	return old;
+}
+
 /*
  * general bitmap file operations
  */
@@ -649,19 +751,17 @@ out_no_sb:
  * file a page at a time. There's a superblock at the start of the file.
  */
 /* calculate the index of the page that contains this bit */
-static inline unsigned long file_page_index(struct bitmap_storage *store,
-					    unsigned long chunk)
+static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
 {
-	if (store->sb_page)
+	if (!bitmap->mddev->bitmap_info.external)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk >> PAGE_BIT_SHIFT;
 }
 
 /* calculate the (bit) offset of this bit within a page */
-static inline unsigned long file_page_offset(struct bitmap_storage *store,
-					     unsigned long chunk)
+static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
 {
-	if (store->sb_page)
+	if (!bitmap->mddev->bitmap_info.external)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk & (PAGE_BITS - 1);
 }
@@ -673,86 +773,57 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
  * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
  * 0 or page 1
  */
-static inline struct page *filemap_get_page(struct bitmap_storage *store,
+static inline struct page *filemap_get_page(struct bitmap *bitmap,
 					    unsigned long chunk)
 {
-	if (file_page_index(store, chunk) >= store->file_pages)
+	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
 		return NULL;
-	return store->filemap[file_page_index(store, chunk)
-			      - file_page_index(store, 0)];
-}
-
-static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
-{
-	int pnum;
-	unsigned long num_pages;
-	unsigned long bytes;
-
-	bytes = DIV_ROUND_UP(chunks, 8);
-	if (with_super)
-		bytes += sizeof(bitmap_super_t);
-
-	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
-
-	store->filemap = kmalloc(sizeof(struct page *)
-				 * num_pages, GFP_KERNEL);
-	if (!store->filemap)
-		return -ENOMEM;
-
-	if (with_super && !store->sb_page) {
-		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (store->sb_page == NULL)
-			return -ENOMEM;
-		store->sb_page->index = 0;
-	}
-	pnum = 0;
-	if (store->sb_page) {
-		store->filemap[0] = store->sb_page;
-		pnum = 1;
-	}
-	for ( ; pnum < num_pages; pnum++) {
-		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!store->filemap[pnum]) {
-			store->file_pages = pnum;
-			return -ENOMEM;
-		}
-		store->filemap[pnum]->index = pnum;
-	}
-	store->file_pages = pnum;
-
-	/* We need 4 bits per page, rounded up to a multiple
-	 * of sizeof(unsigned long) */
-	store->filemap_attr = kzalloc(
-		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
-		GFP_KERNEL);
-	if (!store->filemap_attr)
-		return -ENOMEM;
-
-	store->bytes = bytes;
-
-	return 0;
+	return bitmap->filemap[file_page_index(bitmap, chunk)
+			       - file_page_index(bitmap, 0)];
 }
 
-static void bitmap_file_unmap(struct bitmap_storage *store)
+static void bitmap_file_unmap(struct bitmap *bitmap)
 {
 	struct page **map, *sb_page;
+	unsigned long *attr;
 	int pages;
-	struct file *file;
+	unsigned long flags;
 
-	file = store->file;
-	map = store->filemap;
-	pages = store->file_pages;
-	sb_page = store->sb_page;
+	spin_lock_irqsave(&bitmap->lock, flags);
+	map = bitmap->filemap;
+	bitmap->filemap = NULL;
+	attr = bitmap->filemap_attr;
+	bitmap->filemap_attr = NULL;
+	pages = bitmap->file_pages;
+	bitmap->file_pages = 0;
+	sb_page = bitmap->sb_page;
+	bitmap->sb_page = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
 			free_buffers(map[pages]);
 	kfree(map);
-	kfree(store->filemap_attr);
+	kfree(attr);
 
 	if (sb_page)
 		free_buffers(sb_page);
+}
+
+static void bitmap_file_put(struct bitmap *bitmap)
+{
+	struct file *file;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	file = bitmap->file;
+	bitmap->file = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
+	if (file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
+	bitmap_file_unmap(bitmap);
 
 	if (file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
@@ -770,14 +841,14 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 {
 	char *path, *ptr = NULL;
 
-	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
+	if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) {
 		bitmap_update_sb(bitmap);
 
-		if (bitmap->storage.file) {
+		if (bitmap->file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->storage.file->f_path,
-					     path, PAGE_SIZE);
+				ptr = d_path(&bitmap->file->f_path, path,
+					     PAGE_SIZE);
 
 			printk(KERN_ALERT
 			       "%s: kicking failed bitmap file %s from array!\n",
@@ -789,39 +860,36 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 			       "%s: disabling internal bitmap due to errors\n",
 			       bmname(bitmap));
 		}
+
+	bitmap_file_put(bitmap);
+
+	return;
 }
 
 enum bitmap_page_attr {
 	BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */
-	BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned.
-				  * i.e. counter is 1 or 2. */
+	BITMAP_PAGE_CLEAN = 1, /* there are bits that might need to be cleared */
 	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
 };
 
-static inline void set_page_attr(struct bitmap *bitmap, int pnum,
+static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
-{
-	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
-}
-
-static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
-				   enum bitmap_page_attr attr)
 {
-	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline int test_page_attr(struct bitmap *bitmap, int pnum,
-				 enum bitmap_page_attr attr)
+static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
+				enum bitmap_page_attr attr)
 {
-	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
-					   enum bitmap_page_attr attr)
+static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+					   enum bitmap_page_attr attr)
 {
-	return test_and_clear_bit((pnum<<2) + attr,
-				  bitmap->storage.filemap_attr);
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
+
 /*
  * bitmap_file_set_bit -- called before performing a write to the md device
  * to set (and eventually sync) a particular bit in the bitmap file
@@ -834,46 +902,26 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	unsigned long bit;
 	struct page *page;
 	void *kaddr;
-	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+
+	if (!bitmap->filemap)
+		return;
 
-	page = filemap_get_page(&bitmap->storage, chunk);
+	page = filemap_get_page(bitmap, chunk);
 	if (!page)
 		return;
-	bit = file_page_offset(&bitmap->storage, chunk);
+	bit = file_page_offset(bitmap, chunk);
 
 	/* set the bit */
-	kaddr = kmap_atomic(page);
-	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
 		set_bit(bit, kaddr);
 	else
-		test_and_set_bit_le(bit, kaddr);
-	kunmap_atomic(kaddr);
-	pr_debug("set file bit %lu page %lu\n", bit, page->index);
+		__set_bit_le(bit, kaddr);
+	kunmap_atomic(kaddr, KM_USER0);
+	PRINTK("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
-}
-
-static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
-{
-	unsigned long bit;
-	struct page *page;
-	void *paddr;
-	unsigned long chunk = block >> bitmap->counts.chunkshift;
-
-	page = filemap_get_page(&bitmap->storage, chunk);
-	if (!page)
-		return;
-	bit = file_page_offset(&bitmap->storage, chunk);
-	paddr = kmap_atomic(page);
-	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
-		clear_bit(bit, paddr);
-	else
-		test_and_clear_bit_le(bit, paddr);
-	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
-		bitmap->allclean = 0;
-	}
+	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
 
 /* this gets called when the md device is ready to unplug its underlying
@@ -881,37 +929,42 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 void bitmap_unplug(struct bitmap *bitmap)
 {
-	unsigned long i;
+	unsigned long i, flags;
 	int dirty, need_write;
+	struct page *page;
 	int wait = 0;
 
-	if (!bitmap || !bitmap->storage.filemap ||
-	    test_bit(BITMAP_STALE, &bitmap->flags))
+	if (!bitmap)
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
-	for (i = 0; i < bitmap->storage.file_pages; i++) {
-		if (!bitmap->storage.filemap)
+	for (i = 0; i < bitmap->file_pages; i++) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		if (!bitmap->filemap) {
+			spin_unlock_irqrestore(&bitmap->lock, flags);
 			return;
-		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		need_write = test_and_clear_page_attr(bitmap, i,
-						      BITMAP_PAGE_NEEDWRITE);
-		if (dirty || need_write) {
-			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
-			write_page(bitmap, bitmap->storage.filemap[i], 0);
 		}
+		page = bitmap->filemap[i];
+		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 		if (dirty)
 			wait = 1;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+
+		if (dirty || need_write)
+			write_page(bitmap, page, 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->storage.file)
+		if (bitmap->file)
 			wait_event(bitmap->write_wait,
 				   atomic_read(&bitmap->pending_writes)==0);
 		else
 			md_super_wait(bitmap->mddev);
 	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
 EXPORT_SYMBOL(bitmap_unplug);
@@ -931,117 +984,149 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
931static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) 984static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
932{ 985{
933 unsigned long i, chunks, index, oldindex, bit; 986 unsigned long i, chunks, index, oldindex, bit;
934 struct page *page = NULL; 987 struct page *page = NULL, *oldpage = NULL;
935 unsigned long bit_cnt = 0; 988 unsigned long num_pages, bit_cnt = 0;
936 struct file *file; 989 struct file *file;
937 unsigned long offset; 990 unsigned long bytes, offset;
938 int outofdate; 991 int outofdate;
939 int ret = -ENOSPC; 992 int ret = -ENOSPC;
940 void *paddr; 993 void *paddr;
941 struct bitmap_storage *store = &bitmap->storage;
942 994
943 chunks = bitmap->counts.chunks; 995 chunks = bitmap->chunks;
944 file = store->file; 996 file = bitmap->file;
945 997
946 if (!file && !bitmap->mddev->bitmap_info.offset) { 998 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
947 /* No permanent bitmap - fill with '1s'. */
948 store->filemap = NULL;
949 store->file_pages = 0;
950 for (i = 0; i < chunks ; i++) {
951 /* if the disk bit is set, set the memory bit */
952 int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
953 >= start);
954 bitmap_set_memory_bits(bitmap,
955 (sector_t)i << bitmap->counts.chunkshift,
956 needed);
957 }
958 return 0;
959 }
960 999
961 outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 1000#ifdef INJECT_FAULTS_3
1001 outofdate = 1;
1002#else
1003 outofdate = bitmap->flags & BITMAP_STALE;
1004#endif
962 if (outofdate) 1005 if (outofdate)
963 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 1006 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
964 "recovery\n", bmname(bitmap)); 1007 "recovery\n", bmname(bitmap));
965 1008
966 if (file && i_size_read(file->f_mapping->host) < store->bytes) { 1009 bytes = DIV_ROUND_UP(bitmap->chunks, 8);
1010 if (!bitmap->mddev->bitmap_info.external)
1011 bytes += sizeof(bitmap_super_t);
1012
1013 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
1014
1015 if (file && i_size_read(file->f_mapping->host) < bytes) {
967 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 1016 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
968 bmname(bitmap), 1017 bmname(bitmap),
969 (unsigned long) i_size_read(file->f_mapping->host), 1018 (unsigned long) i_size_read(file->f_mapping->host),
970 store->bytes); 1019 bytes);
971 goto err; 1020 goto err;
972 } 1021 }
973 1022
1023 ret = -ENOMEM;
1024
1025 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
1026 if (!bitmap->filemap)
1027 goto err;
1028
1029 /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
1030 bitmap->filemap_attr = kzalloc(
1031 roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
1032 GFP_KERNEL);
1033 if (!bitmap->filemap_attr)
1034 goto err;
1035
974 oldindex = ~0L; 1036 oldindex = ~0L;
975 offset = 0;
976 if (!bitmap->mddev->bitmap_info.external)
977 offset = sizeof(bitmap_super_t);
978 1037
979 for (i = 0; i < chunks; i++) { 1038 for (i = 0; i < chunks; i++) {
980 int b; 1039 int b;
981 index = file_page_index(&bitmap->storage, i); 1040 index = file_page_index(bitmap, i);
982 bit = file_page_offset(&bitmap->storage, i); 1041 bit = file_page_offset(bitmap, i);
983 if (index != oldindex) { /* this is a new page, read it in */ 1042 if (index != oldindex) { /* this is a new page, read it in */
984 int count; 1043 int count;
985 /* unmap the old page, we're done with it */ 1044 /* unmap the old page, we're done with it */
986 if (index == store->file_pages-1) 1045 if (index == num_pages-1)
987 count = store->bytes - index * PAGE_SIZE; 1046 count = bytes - index * PAGE_SIZE;
988 else 1047 else
989 count = PAGE_SIZE; 1048 count = PAGE_SIZE;
990 page = store->filemap[index]; 1049 if (index == 0 && bitmap->sb_page) {
991 if (file) 1050 /*
992 ret = read_page(file, index, bitmap, 1051 * if we're here then the superblock page
993 count, page); 1052 * contains some bits (PAGE_SIZE != sizeof sb)
994 else 1053 * we've already read it in, so just use it
995 ret = read_sb_page( 1054 */
996 bitmap->mddev, 1055 page = bitmap->sb_page;
997 bitmap->mddev->bitmap_info.offset, 1056 offset = sizeof(bitmap_super_t);
998 page, 1057 if (!file)
999 index, count); 1058 page = read_sb_page(
1000 1059 bitmap->mddev,
1001 if (ret) 1060 bitmap->mddev->bitmap_info.offset,
1061 page,
1062 index, count);
1063 } else if (file) {
1064 page = read_page(file, index, bitmap, count);
1065 offset = 0;
1066 } else {
1067 page = read_sb_page(bitmap->mddev,
1068 bitmap->mddev->bitmap_info.offset,
1069 NULL,
1070 index, count);
1071 offset = 0;
1072 }
1073 if (IS_ERR(page)) { /* read error */
1074 ret = PTR_ERR(page);
1002 goto err; 1075 goto err;
1076 }
1003 1077
1004 oldindex = index; 1078 oldindex = index;
1079 oldpage = page;
1080
1081 bitmap->filemap[bitmap->file_pages++] = page;
1082 bitmap->last_page_size = count;
1005 1083
1006 if (outofdate) { 1084 if (outofdate) {
1007 /* 1085 /*
1008 * if bitmap is out of date, dirty the 1086 * if bitmap is out of date, dirty the
1009 * whole page and write it out 1087 * whole page and write it out
1010 */ 1088 */
1011 paddr = kmap_atomic(page); 1089 paddr = kmap_atomic(page, KM_USER0);
1012 memset(paddr + offset, 0xff, 1090 memset(paddr + offset, 0xff,
1013 PAGE_SIZE - offset); 1091 PAGE_SIZE - offset);
1014 kunmap_atomic(paddr); 1092 kunmap_atomic(paddr, KM_USER0);
1015 write_page(bitmap, page, 1); 1093 write_page(bitmap, page, 1);
1016 1094
1017 ret = -EIO; 1095 ret = -EIO;
1018 if (test_bit(BITMAP_WRITE_ERROR, 1096 if (bitmap->flags & BITMAP_WRITE_ERROR)
1019 &bitmap->flags))
1020 goto err; 1097 goto err;
1021 } 1098 }
1022 } 1099 }
1023 paddr = kmap_atomic(page); 1100 paddr = kmap_atomic(page, KM_USER0);
1024 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 1101 if (bitmap->flags & BITMAP_HOSTENDIAN)
1025 b = test_bit(bit, paddr); 1102 b = test_bit(bit, paddr);
1026 else 1103 else
1027 b = test_bit_le(bit, paddr); 1104 b = test_bit_le(bit, paddr);
1028 kunmap_atomic(paddr); 1105 kunmap_atomic(paddr, KM_USER0);
1029 if (b) { 1106 if (b) {
1030 /* if the disk bit is set, set the memory bit */ 1107 /* if the disk bit is set, set the memory bit */
1031 int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift 1108 int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
1032 >= start); 1109 >= start);
1033 bitmap_set_memory_bits(bitmap, 1110 bitmap_set_memory_bits(bitmap,
1034 (sector_t)i << bitmap->counts.chunkshift, 1111 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
1035 needed); 1112 needed);
1036 bit_cnt++; 1113 bit_cnt++;
1114 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1037 } 1115 }
1038 offset = 0; 1116 }
1117
1118 /* everything went OK */
1119 ret = 0;
1120 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
1121
1122 if (bit_cnt) { /* Kick recovery if any bits were set */
1123 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1124 md_wakeup_thread(bitmap->mddev->thread);
1039 } 1125 }
1040 1126
1041 printk(KERN_INFO "%s: bitmap initialized from disk: " 1127 printk(KERN_INFO "%s: bitmap initialized from disk: "
1042 "read %lu pages, set %lu of %lu bits\n", 1128 "read %lu/%lu pages, set %lu of %lu bits\n",
1043 bmname(bitmap), store->file_pages, 1129 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks);
1044 bit_cnt, chunks);
1045 1130
1046 return 0; 1131 return 0;
1047 1132
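The sizing arithmetic at the top of this hunk is easy to sanity-check outside the kernel: one bit per chunk, an optional 256-byte superblock (the BUILD_BUG_ON elsewhere in the patch pins sizeof(bitmap_super_t) at 256), rounded up to whole pages. A minimal user-space sketch; bitmap_file_pages() is an illustrative helper name and the 4 KiB PAGE_SIZE is an assumption, not something taken from the patch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define PAGE_SIZE           4096UL              /* assumed; arch dependent */
#define BITMAP_SUPER_SIZE   256UL               /* sizeof(bitmap_super_t), per the BUILD_BUG_ON */

static unsigned long bitmap_file_pages(unsigned long chunks, int internal_sb)
{
	unsigned long bytes = DIV_ROUND_UP(chunks, 8);   /* one bit per chunk */

	if (internal_sb)
		bytes += BITMAP_SUPER_SIZE;              /* superblock shares page 0 */

	return DIV_ROUND_UP(bytes, PAGE_SIZE);
}

int main(void)
{
	/* e.g. 16384 chunks (1 TiB array, 64 MiB chunks) -> 2304 bytes -> 1 page */
	printf("%lu page(s)\n", bitmap_file_pages(16384, 1));
	return 0;
}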
@@ -1058,38 +1143,19 @@ void bitmap_write_all(struct bitmap *bitmap)
1058 */ 1143 */
1059 int i; 1144 int i;
1060 1145
1061 if (!bitmap || !bitmap->storage.filemap) 1146 for (i = 0; i < bitmap->file_pages; i++)
1062 return; 1147 set_page_attr(bitmap, bitmap->filemap[i],
1063 if (bitmap->storage.file)
1064 /* Only one copy, so nothing needed */
1065 return;
1066
1067 for (i = 0; i < bitmap->storage.file_pages; i++)
1068 set_page_attr(bitmap, i,
1069 BITMAP_PAGE_NEEDWRITE); 1148 BITMAP_PAGE_NEEDWRITE);
1070 bitmap->allclean = 0;
1071} 1149}
1072 1150
1073static void bitmap_count_page(struct bitmap_counts *bitmap, 1151static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1074 sector_t offset, int inc)
1075{ 1152{
1076 sector_t chunk = offset >> bitmap->chunkshift; 1153 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1077 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1154 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1078 bitmap->bp[page].count += inc; 1155 bitmap->bp[page].count += inc;
1079 bitmap_checkfree(bitmap, page); 1156 bitmap_checkfree(bitmap, page);
1080} 1157}
1081 1158static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1082static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
1083{
1084 sector_t chunk = offset >> bitmap->chunkshift;
1085 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1086 struct bitmap_page *bp = &bitmap->bp[page];
1087
1088 if (!bp->pending)
1089 bp->pending = 1;
1090}
1091
1092static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1093 sector_t offset, sector_t *blocks, 1159 sector_t offset, sector_t *blocks,
1094 int create); 1160 int create);
1095 1161
@@ -1098,13 +1164,14 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1098 * out to disk 1164 * out to disk
1099 */ 1165 */
1100 1166
1101void bitmap_daemon_work(struct mddev *mddev) 1167void bitmap_daemon_work(mddev_t *mddev)
1102{ 1168{
1103 struct bitmap *bitmap; 1169 struct bitmap *bitmap;
1104 unsigned long j; 1170 unsigned long j;
1105 unsigned long nextpage; 1171 unsigned long flags;
1172 struct page *page = NULL, *lastpage = NULL;
1106 sector_t blocks; 1173 sector_t blocks;
1107 struct bitmap_counts *counts; 1174 void *paddr;
1108 1175
1109 /* Use a mutex to guard daemon_work against 1176 /* Use a mutex to guard daemon_work against
1110 * bitmap_destroy. 1177 * bitmap_destroy.
@@ -1116,111 +1183,129 @@ void bitmap_daemon_work(struct mddev *mddev)
1116 return; 1183 return;
1117 } 1184 }
1118 if (time_before(jiffies, bitmap->daemon_lastrun 1185 if (time_before(jiffies, bitmap->daemon_lastrun
1119 + mddev->bitmap_info.daemon_sleep)) 1186 + bitmap->mddev->bitmap_info.daemon_sleep))
1120 goto done; 1187 goto done;
1121 1188
1122 bitmap->daemon_lastrun = jiffies; 1189 bitmap->daemon_lastrun = jiffies;
1123 if (bitmap->allclean) { 1190 if (bitmap->allclean) {
1124 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1191 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1125 goto done; 1192 goto done;
1126 } 1193 }
1127 bitmap->allclean = 1; 1194 bitmap->allclean = 1;
1128 1195
1129 /* Any file-page which is PENDING now needs to be written. 1196 spin_lock_irqsave(&bitmap->lock, flags);
1130 * So set NEEDWRITE now, then after we make any last-minute changes 1197 for (j = 0; j < bitmap->chunks; j++) {
1131 * we will write it.
1132 */
1133 for (j = 0; j < bitmap->storage.file_pages; j++)
1134 if (test_and_clear_page_attr(bitmap, j,
1135 BITMAP_PAGE_PENDING))
1136 set_page_attr(bitmap, j,
1137 BITMAP_PAGE_NEEDWRITE);
1138
1139 if (bitmap->need_sync &&
1140 mddev->bitmap_info.external == 0) {
1141 /* Arrange for superblock update as well as
1142 * other changes */
1143 bitmap_super_t *sb;
1144 bitmap->need_sync = 0;
1145 if (bitmap->storage.filemap) {
1146 sb = kmap_atomic(bitmap->storage.sb_page);
1147 sb->events_cleared =
1148 cpu_to_le64(bitmap->events_cleared);
1149 kunmap_atomic(sb);
1150 set_page_attr(bitmap, 0,
1151 BITMAP_PAGE_NEEDWRITE);
1152 }
1153 }
1154 /* Now look at the bitmap counters and if any are '2' or '1',
1155 * decrement and handle accordingly.
1156 */
1157 counts = &bitmap->counts;
1158 spin_lock_irq(&counts->lock);
1159 nextpage = 0;
1160 for (j = 0; j < counts->chunks; j++) {
1161 bitmap_counter_t *bmc; 1198 bitmap_counter_t *bmc;
1162 sector_t block = (sector_t)j << counts->chunkshift; 1199 if (!bitmap->filemap)
1200 /* error or shutdown */
1201 break;
1202
1203 page = filemap_get_page(bitmap, j);
1204
1205 if (page != lastpage) {
1206 /* skip this page unless it's marked as needing cleaning */
1207 if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
1208 int need_write = test_page_attr(bitmap, page,
1209 BITMAP_PAGE_NEEDWRITE);
1210 if (need_write)
1211 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1163 1212
1164 if (j == nextpage) { 1213 spin_unlock_irqrestore(&bitmap->lock, flags);
1165 nextpage += PAGE_COUNTER_RATIO; 1214 if (need_write) {
1166 if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { 1215 write_page(bitmap, page, 0);
1167 j |= PAGE_COUNTER_MASK; 1216 bitmap->allclean = 0;
1217 }
1218 spin_lock_irqsave(&bitmap->lock, flags);
1219 j |= (PAGE_BITS - 1);
1168 continue; 1220 continue;
1169 } 1221 }
1170 counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; 1222
1223 /* grab the new page, sync and release the old */
1224 if (lastpage != NULL) {
1225 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1226 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1227 spin_unlock_irqrestore(&bitmap->lock, flags);
1228 write_page(bitmap, lastpage, 0);
1229 } else {
1230 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1231 spin_unlock_irqrestore(&bitmap->lock, flags);
1232 }
1233 } else
1234 spin_unlock_irqrestore(&bitmap->lock, flags);
1235 lastpage = page;
1236
1237 /* We are possibly going to clear some bits, so make
1238 * sure that events_cleared is up-to-date.
1239 */
1240 if (bitmap->need_sync &&
1241 bitmap->mddev->bitmap_info.external == 0) {
1242 bitmap_super_t *sb;
1243 bitmap->need_sync = 0;
1244 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
1245 sb->events_cleared =
1246 cpu_to_le64(bitmap->events_cleared);
1247 kunmap_atomic(sb, KM_USER0);
1248 write_page(bitmap, bitmap->sb_page, 1);
1249 }
1250 spin_lock_irqsave(&bitmap->lock, flags);
1251 if (!bitmap->need_sync)
1252 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1171 } 1253 }
1172 bmc = bitmap_get_counter(counts, 1254 bmc = bitmap_get_counter(bitmap,
1173 block, 1255 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
1174 &blocks, 0); 1256 &blocks, 0);
1257 if (bmc) {
1258 if (*bmc)
1259 bitmap->allclean = 0;
1175 1260
1176 if (!bmc) { 1261 if (*bmc == 2) {
1262 *bmc = 1; /* maybe clear the bit next time */
1263 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1264 } else if (*bmc == 1 && !bitmap->need_sync) {
1265 /* we can clear the bit */
1266 *bmc = 0;
1267 bitmap_count_page(bitmap,
1268 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
1269 -1);
1270
1271 /* clear the bit */
1272 paddr = kmap_atomic(page, KM_USER0);
1273 if (bitmap->flags & BITMAP_HOSTENDIAN)
1274 clear_bit(file_page_offset(bitmap, j),
1275 paddr);
1276 else
1277 __clear_bit_le(
1278 file_page_offset(bitmap,
1279 j),
1280 paddr);
1281 kunmap_atomic(paddr, KM_USER0);
1282 }
1283 } else
1177 j |= PAGE_COUNTER_MASK; 1284 j |= PAGE_COUNTER_MASK;
1178 continue;
1179 }
1180 if (*bmc == 1 && !bitmap->need_sync) {
1181 /* We can clear the bit */
1182 *bmc = 0;
1183 bitmap_count_page(counts, block, -1);
1184 bitmap_file_clear_bit(bitmap, block);
1185 } else if (*bmc && *bmc <= 2) {
1186 *bmc = 1;
1187 bitmap_set_pending(counts, block);
1188 bitmap->allclean = 0;
1189 }
1190 } 1285 }
1191 spin_unlock_irq(&counts->lock); 1286 spin_unlock_irqrestore(&bitmap->lock, flags);
1192 1287
1193 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. 1288 /* now sync the final page */
1194 * DIRTY pages need to be written by bitmap_unplug so it can wait 1289 if (lastpage != NULL) {
1195 * for them. 1290 spin_lock_irqsave(&bitmap->lock, flags);
1196 * If we find any DIRTY page we stop there and let bitmap_unplug 1291 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
1197 * handle all the rest. This is important in the case where 1292 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1198 * the first blocking holds the superblock and it has been updated. 1293 spin_unlock_irqrestore(&bitmap->lock, flags);
1199 * We mustn't write any other blocks before the superblock. 1294 write_page(bitmap, lastpage, 0);
1200 */ 1295 } else {
1201 for (j = 0; 1296 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1202 j < bitmap->storage.file_pages 1297 spin_unlock_irqrestore(&bitmap->lock, flags);
1203 && !test_bit(BITMAP_STALE, &bitmap->flags);
1204 j++) {
1205
1206 if (test_page_attr(bitmap, j,
1207 BITMAP_PAGE_DIRTY))
1208 /* bitmap_unplug will handle the rest */
1209 break;
1210 if (test_and_clear_page_attr(bitmap, j,
1211 BITMAP_PAGE_NEEDWRITE)) {
1212 write_page(bitmap, bitmap->storage.filemap[j], 0);
1213 } 1298 }
1214 } 1299 }
1215 1300
1216 done: 1301 done:
1217 if (bitmap->allclean == 0) 1302 if (bitmap->allclean == 0)
1218 mddev->thread->timeout = 1303 bitmap->mddev->thread->timeout =
1219 mddev->bitmap_info.daemon_sleep; 1304 bitmap->mddev->bitmap_info.daemon_sleep;
1220 mutex_unlock(&mddev->bitmap_info.mutex); 1305 mutex_unlock(&mddev->bitmap_info.mutex);
1221} 1306}
1222 1307
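The per-chunk counters handled by bitmap_daemon_work() above and by bitmap_startwrite()/bitmap_endwrite() further down follow a small state machine: starting a write pushes the counter to at least 3, completion drops it back to 2, and two idle daemon passes (2 -> 1 -> 0) are needed before the on-disk bit may be cleared. A rough user-space model of just that behaviour (helper names are illustrative, and the NEEDED/RESYNC flag bits the real counter also carries are ignored):

#include <stdio.h>

typedef unsigned short counter_t;   /* stands in for bitmap_counter_t (__u16) */

static void start_write(counter_t *bmc)
{
	if (*bmc < 2)
		*bmc = 2;       /* chunk becomes dirty on disk */
	(*bmc)++;               /* one more write in flight */
}

static void end_write(counter_t *bmc)
{
	(*bmc)--;
}

/* one daemon pass: 2 -> 1 ("maybe clear next time"), 1 -> 0 (clear the disk bit) */
static void daemon_pass(counter_t *bmc)
{
	if (*bmc == 2)
		*bmc = 1;
	else if (*bmc == 1) {
		*bmc = 0;
		printf("on-disk bit cleared\n");
	}
}

int main(void)
{
	counter_t bmc = 0;

	start_write(&bmc);      /* bmc == 3 */
	end_write(&bmc);        /* bmc == 2: dirty but idle */
	daemon_pass(&bmc);      /* bmc == 1 */
	daemon_pass(&bmc);      /* bmc == 0, bit cleared */
	printf("final counter value %u\n", (unsigned)bmc);
	return 0;
}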
1223static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, 1308static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1224 sector_t offset, sector_t *blocks, 1309 sector_t offset, sector_t *blocks,
1225 int create) 1310 int create)
1226__releases(bitmap->lock) 1311__releases(bitmap->lock)
@@ -1230,7 +1315,7 @@ __acquires(bitmap->lock)
1230 * The lock must have been taken with interrupts enabled. 1315 * The lock must have been taken with interrupts enabled.
1231 * If !create, we don't release the lock. 1316 * If !create, we don't release the lock.
1232 */ 1317 */
1233 sector_t chunk = offset >> bitmap->chunkshift; 1318 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1234 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1319 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1235 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1320 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1236 sector_t csize; 1321 sector_t csize;
@@ -1240,10 +1325,10 @@ __acquires(bitmap->lock)
1240 1325
1241 if (bitmap->bp[page].hijacked || 1326 if (bitmap->bp[page].hijacked ||
1242 bitmap->bp[page].map == NULL) 1327 bitmap->bp[page].map == NULL)
1243 csize = ((sector_t)1) << (bitmap->chunkshift + 1328 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
1244 PAGE_COUNTER_SHIFT - 1); 1329 PAGE_COUNTER_SHIFT - 1);
1245 else 1330 else
1246 csize = ((sector_t)1) << bitmap->chunkshift; 1331 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1247 *blocks = csize - (offset & (csize - 1)); 1332 *blocks = csize - (offset & (csize - 1));
1248 1333
1249 if (err < 0) 1334 if (err < 0)
@@ -1274,18 +1359,18 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1274 if (bw > bitmap->behind_writes_used) 1359 if (bw > bitmap->behind_writes_used)
1275 bitmap->behind_writes_used = bw; 1360 bitmap->behind_writes_used = bw;
1276 1361
1277 pr_debug("inc write-behind count %d/%lu\n", 1362 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1278 bw, bitmap->mddev->bitmap_info.max_write_behind); 1363 bw, bitmap->max_write_behind);
1279 } 1364 }
1280 1365
1281 while (sectors) { 1366 while (sectors) {
1282 sector_t blocks; 1367 sector_t blocks;
1283 bitmap_counter_t *bmc; 1368 bitmap_counter_t *bmc;
1284 1369
1285 spin_lock_irq(&bitmap->counts.lock); 1370 spin_lock_irq(&bitmap->lock);
1286 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); 1371 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
1287 if (!bmc) { 1372 if (!bmc) {
1288 spin_unlock_irq(&bitmap->counts.lock); 1373 spin_unlock_irq(&bitmap->lock);
1289 return 0; 1374 return 0;
1290 } 1375 }
1291 1376
@@ -1297,8 +1382,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1297 */ 1382 */
1298 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1383 prepare_to_wait(&bitmap->overflow_wait, &__wait,
1299 TASK_UNINTERRUPTIBLE); 1384 TASK_UNINTERRUPTIBLE);
1300 spin_unlock_irq(&bitmap->counts.lock); 1385 spin_unlock_irq(&bitmap->lock);
1301 schedule(); 1386 io_schedule();
1302 finish_wait(&bitmap->overflow_wait, &__wait); 1387 finish_wait(&bitmap->overflow_wait, &__wait);
1303 continue; 1388 continue;
1304 } 1389 }
@@ -1306,7 +1391,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1306 switch (*bmc) { 1391 switch (*bmc) {
1307 case 0: 1392 case 0:
1308 bitmap_file_set_bit(bitmap, offset); 1393 bitmap_file_set_bit(bitmap, offset);
1309 bitmap_count_page(&bitmap->counts, offset, 1); 1394 bitmap_count_page(bitmap, offset, 1);
1310 /* fall through */ 1395 /* fall through */
1311 case 1: 1396 case 1:
1312 *bmc = 2; 1397 *bmc = 2;
@@ -1314,7 +1399,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1314 1399
1315 (*bmc)++; 1400 (*bmc)++;
1316 1401
1317 spin_unlock_irq(&bitmap->counts.lock); 1402 spin_unlock_irq(&bitmap->lock);
1318 1403
1319 offset += blocks; 1404 offset += blocks;
1320 if (sectors > blocks) 1405 if (sectors > blocks)
@@ -1322,6 +1407,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1322 else 1407 else
1323 sectors = 0; 1408 sectors = 0;
1324 } 1409 }
1410 bitmap->allclean = 0;
1325 return 0; 1411 return 0;
1326} 1412}
1327EXPORT_SYMBOL(bitmap_startwrite); 1413EXPORT_SYMBOL(bitmap_startwrite);
@@ -1334,24 +1420,26 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1334 if (behind) { 1420 if (behind) {
1335 if (atomic_dec_and_test(&bitmap->behind_writes)) 1421 if (atomic_dec_and_test(&bitmap->behind_writes))
1336 wake_up(&bitmap->behind_wait); 1422 wake_up(&bitmap->behind_wait);
1337 pr_debug("dec write-behind count %d/%lu\n", 1423 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1338 atomic_read(&bitmap->behind_writes), 1424 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1339 bitmap->mddev->bitmap_info.max_write_behind);
1340 } 1425 }
1426 if (bitmap->mddev->degraded)
1427 /* Never clear bits or update events_cleared when degraded */
1428 success = 0;
1341 1429
1342 while (sectors) { 1430 while (sectors) {
1343 sector_t blocks; 1431 sector_t blocks;
1344 unsigned long flags; 1432 unsigned long flags;
1345 bitmap_counter_t *bmc; 1433 bitmap_counter_t *bmc;
1346 1434
1347 spin_lock_irqsave(&bitmap->counts.lock, flags); 1435 spin_lock_irqsave(&bitmap->lock, flags);
1348 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); 1436 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
1349 if (!bmc) { 1437 if (!bmc) {
1350 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1438 spin_unlock_irqrestore(&bitmap->lock, flags);
1351 return; 1439 return;
1352 } 1440 }
1353 1441
1354 if (success && !bitmap->mddev->degraded && 1442 if (success &&
1355 bitmap->events_cleared < bitmap->mddev->events) { 1443 bitmap->events_cleared < bitmap->mddev->events) {
1356 bitmap->events_cleared = bitmap->mddev->events; 1444 bitmap->events_cleared = bitmap->mddev->events;
1357 bitmap->need_sync = 1; 1445 bitmap->need_sync = 1;
@@ -1365,11 +1453,14 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1365 wake_up(&bitmap->overflow_wait); 1453 wake_up(&bitmap->overflow_wait);
1366 1454
1367 (*bmc)--; 1455 (*bmc)--;
1368 if (*bmc <= 2) { 1456 if (*bmc <= 2)
1369 bitmap_set_pending(&bitmap->counts, offset); 1457 set_page_attr(bitmap,
1370 bitmap->allclean = 0; 1458 filemap_get_page(
1371 } 1459 bitmap,
1372 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1460 offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1461 BITMAP_PAGE_CLEAN);
1462
1463 spin_unlock_irqrestore(&bitmap->lock, flags);
1373 offset += blocks; 1464 offset += blocks;
1374 if (sectors > blocks) 1465 if (sectors > blocks)
1375 sectors -= blocks; 1466 sectors -= blocks;
@@ -1388,8 +1479,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1388 *blocks = 1024; 1479 *blocks = 1024;
1389 return 1; /* always resync if no bitmap */ 1480 return 1; /* always resync if no bitmap */
1390 } 1481 }
1391 spin_lock_irq(&bitmap->counts.lock); 1482 spin_lock_irq(&bitmap->lock);
1392 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1483 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1393 rv = 0; 1484 rv = 0;
1394 if (bmc) { 1485 if (bmc) {
1395 /* locked */ 1486 /* locked */
@@ -1403,7 +1494,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1403 } 1494 }
1404 } 1495 }
1405 } 1496 }
1406 spin_unlock_irq(&bitmap->counts.lock); 1497 spin_unlock_irq(&bitmap->lock);
1498 bitmap->allclean = 0;
1407 return rv; 1499 return rv;
1408} 1500}
1409 1501
@@ -1440,8 +1532,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1440 *blocks = 1024; 1532 *blocks = 1024;
1441 return; 1533 return;
1442 } 1534 }
1443 spin_lock_irqsave(&bitmap->counts.lock, flags); 1535 spin_lock_irqsave(&bitmap->lock, flags);
1444 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1536 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1445 if (bmc == NULL) 1537 if (bmc == NULL)
1446 goto unlock; 1538 goto unlock;
1447 /* locked */ 1539 /* locked */
@@ -1451,14 +1543,15 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1451 if (!NEEDED(*bmc) && aborted) 1543 if (!NEEDED(*bmc) && aborted)
1452 *bmc |= NEEDED_MASK; 1544 *bmc |= NEEDED_MASK;
1453 else { 1545 else {
1454 if (*bmc <= 2) { 1546 if (*bmc <= 2)
1455 bitmap_set_pending(&bitmap->counts, offset); 1547 set_page_attr(bitmap,
1456 bitmap->allclean = 0; 1548 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1457 } 1549 BITMAP_PAGE_CLEAN);
1458 } 1550 }
1459 } 1551 }
1460 unlock: 1552 unlock:
1461 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1553 spin_unlock_irqrestore(&bitmap->lock, flags);
1554 bitmap->allclean = 0;
1462} 1555}
1463EXPORT_SYMBOL(bitmap_end_sync); 1556EXPORT_SYMBOL(bitmap_end_sync);
1464 1557
@@ -1498,7 +1591,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1498 1591
1499 bitmap->mddev->curr_resync_completed = sector; 1592 bitmap->mddev->curr_resync_completed = sector;
1500 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1593 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1501 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); 1594 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1502 s = 0; 1595 s = 0;
1503 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1596 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1504 bitmap_end_sync(bitmap, s, &blocks, 0); 1597 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1512,25 +1605,27 @@ EXPORT_SYMBOL(bitmap_cond_end_sync);
1512static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1605static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1513{ 1606{
1514 /* For each chunk covered by any of these sectors, set the 1607 /* For each chunk covered by any of these sectors, set the
1515 * counter to 2 and possibly set resync_needed. They should all 1608 * counter to 1 and set resync_needed. They should all
1516 * be 0 at this point 1609 * be 0 at this point
1517 */ 1610 */
1518 1611
1519 sector_t secs; 1612 sector_t secs;
1520 bitmap_counter_t *bmc; 1613 bitmap_counter_t *bmc;
1521 spin_lock_irq(&bitmap->counts.lock); 1614 spin_lock_irq(&bitmap->lock);
1522 bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); 1615 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
1523 if (!bmc) { 1616 if (!bmc) {
1524 spin_unlock_irq(&bitmap->counts.lock); 1617 spin_unlock_irq(&bitmap->lock);
1525 return; 1618 return;
1526 } 1619 }
1527 if (!*bmc) { 1620 if (!*bmc) {
1528 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1621 struct page *page;
1529 bitmap_count_page(&bitmap->counts, offset, 1); 1622 *bmc = 1 | (needed ? NEEDED_MASK : 0);
1530 bitmap_set_pending(&bitmap->counts, offset); 1623 bitmap_count_page(bitmap, offset, 1);
1531 bitmap->allclean = 0; 1624 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
1625 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1532 } 1626 }
1533 spin_unlock_irq(&bitmap->counts.lock); 1627 spin_unlock_irq(&bitmap->lock);
1628 bitmap->allclean = 0;
1534} 1629}
1535 1630
1536/* dirty the memory and file bits for bitmap chunks "s" to "e" */ 1631/* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1539,7 +1634,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1539 unsigned long chunk; 1634 unsigned long chunk;
1540 1635
1541 for (chunk = s; chunk <= e; chunk++) { 1636 for (chunk = s; chunk <= e; chunk++) {
1542 sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; 1637 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
1543 bitmap_set_memory_bits(bitmap, sec, 1); 1638 bitmap_set_memory_bits(bitmap, sec, 1);
1544 bitmap_file_set_bit(bitmap, sec); 1639 bitmap_file_set_bit(bitmap, sec);
1545 if (sec < bitmap->mddev->recovery_cp) 1640 if (sec < bitmap->mddev->recovery_cp)
@@ -1554,7 +1649,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1554/* 1649/*
1555 * flush out any pending updates 1650 * flush out any pending updates
1556 */ 1651 */
1557void bitmap_flush(struct mddev *mddev) 1652void bitmap_flush(mddev_t *mddev)
1558{ 1653{
1559 struct bitmap *bitmap = mddev->bitmap; 1654 struct bitmap *bitmap = mddev->bitmap;
1560 long sleep; 1655 long sleep;
@@ -1586,15 +1681,11 @@ static void bitmap_free(struct bitmap *bitmap)
1586 if (!bitmap) /* there was no bitmap */ 1681 if (!bitmap) /* there was no bitmap */
1587 return; 1682 return;
1588 1683
1589 /* Shouldn't be needed - but just in case.... */ 1684 /* release the bitmap file and kill the daemon */
1590 wait_event(bitmap->write_wait, 1685 bitmap_file_put(bitmap);
1591 atomic_read(&bitmap->pending_writes) == 0);
1592
1593 /* release the bitmap file */
1594 bitmap_file_unmap(&bitmap->storage);
1595 1686
1596 bp = bitmap->counts.bp; 1687 bp = bitmap->bp;
1597 pages = bitmap->counts.pages; 1688 pages = bitmap->pages;
1598 1689
1599 /* free all allocated memory */ 1690 /* free all allocated memory */
1600 1691
@@ -1606,7 +1697,7 @@ static void bitmap_free(struct bitmap *bitmap)
1606 kfree(bitmap); 1697 kfree(bitmap);
1607} 1698}
1608 1699
1609void bitmap_destroy(struct mddev *mddev) 1700void bitmap_destroy(mddev_t *mddev)
1610{ 1701{
1611 struct bitmap *bitmap = mddev->bitmap; 1702 struct bitmap *bitmap = mddev->bitmap;
1612 1703
@@ -1629,23 +1720,29 @@ void bitmap_destroy(struct mddev *mddev)
1629 * initialize the bitmap structure 1720 * initialize the bitmap structure
1630 * if this returns an error, bitmap_destroy must be called to do clean up 1721 * if this returns an error, bitmap_destroy must be called to do clean up
1631 */ 1722 */
1632int bitmap_create(struct mddev *mddev) 1723int bitmap_create(mddev_t *mddev)
1633{ 1724{
1634 struct bitmap *bitmap; 1725 struct bitmap *bitmap;
1635 sector_t blocks = mddev->resync_max_sectors; 1726 sector_t blocks = mddev->resync_max_sectors;
1727 unsigned long chunks;
1728 unsigned long pages;
1636 struct file *file = mddev->bitmap_info.file; 1729 struct file *file = mddev->bitmap_info.file;
1637 int err; 1730 int err;
1638 struct sysfs_dirent *bm = NULL; 1731 struct sysfs_dirent *bm = NULL;
1639 1732
1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1733 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1641 1734
1735 if (!file
1736 && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1737 return 0;
1738
1642 BUG_ON(file && mddev->bitmap_info.offset); 1739 BUG_ON(file && mddev->bitmap_info.offset);
1643 1740
1644 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1741 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1645 if (!bitmap) 1742 if (!bitmap)
1646 return -ENOMEM; 1743 return -ENOMEM;
1647 1744
1648 spin_lock_init(&bitmap->counts.lock); 1745 spin_lock_init(&bitmap->lock);
1649 atomic_set(&bitmap->pending_writes, 0); 1746 atomic_set(&bitmap->pending_writes, 0);
1650 init_waitqueue_head(&bitmap->write_wait); 1747 init_waitqueue_head(&bitmap->write_wait);
1651 init_waitqueue_head(&bitmap->overflow_wait); 1748 init_waitqueue_head(&bitmap->overflow_wait);
@@ -1661,7 +1758,7 @@ int bitmap_create(struct mddev *mddev)
1661 } else 1758 } else
1662 bitmap->sysfs_can_clear = NULL; 1759 bitmap->sysfs_can_clear = NULL;
1663 1760
1664 bitmap->storage.file = file; 1761 bitmap->file = file;
1665 if (file) { 1762 if (file) {
1666 get_file(file); 1763 get_file(file);
1667 /* As future accesses to this file will use bmap, 1764 /* As future accesses to this file will use bmap,
@@ -1692,22 +1789,42 @@ int bitmap_create(struct mddev *mddev)
1692 goto error; 1789 goto error;
1693 1790
1694 bitmap->daemon_lastrun = jiffies; 1791 bitmap->daemon_lastrun = jiffies;
1695 err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); 1792 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
1696 if (err) 1793
1794 /* now that chunksize and chunkshift are set, we can use these macros */
1795 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
1796 CHUNK_BLOCK_SHIFT(bitmap);
1797 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1798
1799 BUG_ON(!pages);
1800
1801 bitmap->chunks = chunks;
1802 bitmap->pages = pages;
1803 bitmap->missing_pages = pages;
1804
1805#ifdef INJECT_FATAL_FAULT_1
1806 bitmap->bp = NULL;
1807#else
1808 bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1809#endif
1810 err = -ENOMEM;
1811 if (!bitmap->bp)
1697 goto error; 1812 goto error;
1698 1813
1699 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1814 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1700 bitmap->counts.pages, bmname(bitmap)); 1815 pages, bmname(bitmap));
1701 1816
1702 mddev->bitmap = bitmap; 1817 mddev->bitmap = bitmap;
1703 return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; 1818
1819
1820 return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
1704 1821
1705 error: 1822 error:
1706 bitmap_free(bitmap); 1823 bitmap_free(bitmap);
1707 return err; 1824 return err;
1708} 1825}
1709 1826
1710int bitmap_load(struct mddev *mddev) 1827int bitmap_load(mddev_t *mddev)
1711{ 1828{
1712 int err = 0; 1829 int err = 0;
1713 sector_t start = 0; 1830 sector_t start = 0;
@@ -1735,222 +1852,25 @@ int bitmap_load(struct mddev *mddev)
1735 * re-add of a missing device */ 1852 * re-add of a missing device */
1736 start = mddev->recovery_cp; 1853 start = mddev->recovery_cp;
1737 1854
1738 mutex_lock(&mddev->bitmap_info.mutex);
1739 err = bitmap_init_from_disk(bitmap, start); 1855 err = bitmap_init_from_disk(bitmap, start);
1740 mutex_unlock(&mddev->bitmap_info.mutex);
1741 1856
1742 if (err) 1857 if (err)
1743 goto out; 1858 goto out;
1744 clear_bit(BITMAP_STALE, &bitmap->flags);
1745
1746 /* Kick recovery in case any bits were set */
1747 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1748 1859
1749 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1860 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1750 md_wakeup_thread(mddev->thread); 1861 md_wakeup_thread(mddev->thread);
1751 1862
1752 bitmap_update_sb(bitmap); 1863 bitmap_update_sb(bitmap);
1753 1864
1754 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 1865 if (bitmap->flags & BITMAP_WRITE_ERROR)
1755 err = -EIO; 1866 err = -EIO;
1756out: 1867out:
1757 return err; 1868 return err;
1758} 1869}
1759EXPORT_SYMBOL_GPL(bitmap_load); 1870EXPORT_SYMBOL_GPL(bitmap_load);
1760 1871
1761void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1762{
1763 unsigned long chunk_kb;
1764 struct bitmap_counts *counts;
1765
1766 if (!bitmap)
1767 return;
1768
1769 counts = &bitmap->counts;
1770
1771 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1772 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1773 "%lu%s chunk",
1774 counts->pages - counts->missing_pages,
1775 counts->pages,
1776 (counts->pages - counts->missing_pages)
1777 << (PAGE_SHIFT - 10),
1778 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1779 chunk_kb ? "KB" : "B");
1780 if (bitmap->storage.file) {
1781 seq_printf(seq, ", file: ");
1782 seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
1783 }
1784
1785 seq_printf(seq, "\n");
1786}
1787
1788int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
1789 int chunksize, int init)
1790{
1791 /* If chunk_size is 0, choose an appropriate chunk size.
1792 * Then possibly allocate new storage space.
1793 * Then quiesce, copy bits, replace bitmap, and re-start
1794 *
1795 * This function is called both to set up the initial bitmap
1796 * and to resize the bitmap while the array is active.
1797 * If this happens as a result of the array being resized,
1798 * chunksize will be zero, and we need to choose a suitable
1799 * chunksize, otherwise we use what we are given.
1800 */
1801 struct bitmap_storage store;
1802 struct bitmap_counts old_counts;
1803 unsigned long chunks;
1804 sector_t block;
1805 sector_t old_blocks, new_blocks;
1806 int chunkshift;
1807 int ret = 0;
1808 long pages;
1809 struct bitmap_page *new_bp;
1810
1811 if (chunksize == 0) {
1812 /* If there is enough space, leave the chunk size unchanged,
1813 * else increase by factor of two until there is enough space.
1814 */
1815 long bytes;
1816 long space = bitmap->mddev->bitmap_info.space;
1817
1818 if (space == 0) {
1819 /* We don't know how much space there is, so limit
1820 * to current size - in sectors.
1821 */
1822 bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
1823 if (!bitmap->mddev->bitmap_info.external)
1824 bytes += sizeof(bitmap_super_t);
1825 space = DIV_ROUND_UP(bytes, 512);
1826 bitmap->mddev->bitmap_info.space = space;
1827 }
1828 chunkshift = bitmap->counts.chunkshift;
1829 chunkshift--;
1830 do {
1831 /* 'chunkshift' is shift from block size to chunk size */
1832 chunkshift++;
1833 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1834 bytes = DIV_ROUND_UP(chunks, 8);
1835 if (!bitmap->mddev->bitmap_info.external)
1836 bytes += sizeof(bitmap_super_t);
1837 } while (bytes > (space << 9));
1838 } else
1839 chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
1840
1841 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1842 memset(&store, 0, sizeof(store));
1843 if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
1844 ret = bitmap_storage_alloc(&store, chunks,
1845 !bitmap->mddev->bitmap_info.external);
1846 if (ret)
1847 goto err;
1848
1849 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
1850
1851 new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
1852 ret = -ENOMEM;
1853 if (!new_bp) {
1854 bitmap_file_unmap(&store);
1855 goto err;
1856 }
1857
1858 if (!init)
1859 bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
1860
1861 store.file = bitmap->storage.file;
1862 bitmap->storage.file = NULL;
1863
1864 if (store.sb_page && bitmap->storage.sb_page)
1865 memcpy(page_address(store.sb_page),
1866 page_address(bitmap->storage.sb_page),
1867 sizeof(bitmap_super_t));
1868 bitmap_file_unmap(&bitmap->storage);
1869 bitmap->storage = store;
1870
1871 old_counts = bitmap->counts;
1872 bitmap->counts.bp = new_bp;
1873 bitmap->counts.pages = pages;
1874 bitmap->counts.missing_pages = pages;
1875 bitmap->counts.chunkshift = chunkshift;
1876 bitmap->counts.chunks = chunks;
1877 bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
1878 BITMAP_BLOCK_SHIFT);
1879
1880 blocks = min(old_counts.chunks << old_counts.chunkshift,
1881 chunks << chunkshift);
1882
1883 spin_lock_irq(&bitmap->counts.lock);
1884 for (block = 0; block < blocks; ) {
1885 bitmap_counter_t *bmc_old, *bmc_new;
1886 int set;
1887
1888 bmc_old = bitmap_get_counter(&old_counts, block,
1889 &old_blocks, 0);
1890 set = bmc_old && NEEDED(*bmc_old);
1891
1892 if (set) {
1893 bmc_new = bitmap_get_counter(&bitmap->counts, block,
1894 &new_blocks, 1);
1895 if (*bmc_new == 0) {
1896 /* need to set on-disk bits too. */
1897 sector_t end = block + new_blocks;
1898 sector_t start = block >> chunkshift;
1899 start <<= chunkshift;
1900 while (start < end) {
1901 bitmap_file_set_bit(bitmap, block);
1902 start += 1 << chunkshift;
1903 }
1904 *bmc_new = 2;
1905 bitmap_count_page(&bitmap->counts,
1906 block, 1);
1907 bitmap_set_pending(&bitmap->counts,
1908 block);
1909 }
1910 *bmc_new |= NEEDED_MASK;
1911 if (new_blocks < old_blocks)
1912 old_blocks = new_blocks;
1913 }
1914 block += old_blocks;
1915 }
1916
1917 if (!init) {
1918 int i;
1919 while (block < (chunks << chunkshift)) {
1920 bitmap_counter_t *bmc;
1921 bmc = bitmap_get_counter(&bitmap->counts, block,
1922 &new_blocks, 1);
1923 if (bmc) {
1924 /* new space. It needs to be resynced, so
1925 * we set NEEDED_MASK.
1926 */
1927 if (*bmc == 0) {
1928 *bmc = NEEDED_MASK | 2;
1929 bitmap_count_page(&bitmap->counts,
1930 block, 1);
1931 bitmap_set_pending(&bitmap->counts,
1932 block);
1933 }
1934 }
1935 block += new_blocks;
1936 }
1937 for (i = 0; i < bitmap->storage.file_pages; i++)
1938 set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
1939 }
1940 spin_unlock_irq(&bitmap->counts.lock);
1941
1942 if (!init) {
1943 bitmap_unplug(bitmap);
1944 bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
1945 }
1946 ret = 0;
1947err:
1948 return ret;
1949}
1950EXPORT_SYMBOL_GPL(bitmap_resize);
1951
1952static ssize_t 1872static ssize_t
1953location_show(struct mddev *mddev, char *page) 1873location_show(mddev_t *mddev, char *page)
1954{ 1874{
1955 ssize_t len; 1875 ssize_t len;
1956 if (mddev->bitmap_info.file) 1876 if (mddev->bitmap_info.file)
@@ -1964,7 +1884,7 @@ location_show(struct mddev *mddev, char *page)
1964} 1884}
1965 1885
1966static ssize_t 1886static ssize_t
1967location_store(struct mddev *mddev, const char *buf, size_t len) 1887location_store(mddev_t *mddev, const char *buf, size_t len)
1968{ 1888{
1969 1889
1970 if (mddev->pers) { 1890 if (mddev->pers) {
@@ -2017,8 +1937,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
2017 if (mddev->pers) { 1937 if (mddev->pers) {
2018 mddev->pers->quiesce(mddev, 1); 1938 mddev->pers->quiesce(mddev, 1);
2019 rv = bitmap_create(mddev); 1939 rv = bitmap_create(mddev);
2020 if (!rv)
2021 rv = bitmap_load(mddev);
2022 if (rv) { 1940 if (rv) {
2023 bitmap_destroy(mddev); 1941 bitmap_destroy(mddev);
2024 mddev->bitmap_info.offset = 0; 1942 mddev->bitmap_info.offset = 0;
@@ -2042,45 +1960,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
2042static struct md_sysfs_entry bitmap_location = 1960static struct md_sysfs_entry bitmap_location =
2043__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); 1961__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
2044 1962
2045/* 'bitmap/space' is the space available at 'location' for the
2046 * bitmap. This allows the kernel to know when it is safe to
2047 * resize the bitmap to match a resized array.
2048 */
2049static ssize_t
2050space_show(struct mddev *mddev, char *page)
2051{
2052 return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2053}
2054
2055static ssize_t
2056space_store(struct mddev *mddev, const char *buf, size_t len)
2057{
2058 unsigned long sectors;
2059 int rv;
2060
2061 rv = kstrtoul(buf, 10, &sectors);
2062 if (rv)
2063 return rv;
2064
2065 if (sectors == 0)
2066 return -EINVAL;
2067
2068 if (mddev->bitmap &&
2069 sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
2070 return -EFBIG; /* Bitmap is too big for this small space */
2071
2072 /* could make sure it isn't too big, but that isn't really
2073 * needed - user-space should be careful.
2074 */
2075 mddev->bitmap_info.space = sectors;
2076 return len;
2077}
2078
2079static struct md_sysfs_entry bitmap_space =
2080__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2081
2082static ssize_t 1963static ssize_t
2083timeout_show(struct mddev *mddev, char *page) 1964timeout_show(mddev_t *mddev, char *page)
2084{ 1965{
2085 ssize_t len; 1966 ssize_t len;
2086 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; 1967 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
@@ -2094,7 +1975,7 @@ timeout_show(struct mddev *mddev, char *page)
2094} 1975}
2095 1976
2096static ssize_t 1977static ssize_t
2097timeout_store(struct mddev *mddev, const char *buf, size_t len) 1978timeout_store(mddev_t *mddev, const char *buf, size_t len)
2098{ 1979{
2099 /* timeout can be set at any time */ 1980 /* timeout can be set at any time */
2100 unsigned long timeout; 1981 unsigned long timeout;
@@ -2130,13 +2011,13 @@ static struct md_sysfs_entry bitmap_timeout =
2130__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); 2011__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
2131 2012
2132static ssize_t 2013static ssize_t
2133backlog_show(struct mddev *mddev, char *page) 2014backlog_show(mddev_t *mddev, char *page)
2134{ 2015{
2135 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); 2016 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
2136} 2017}
2137 2018
2138static ssize_t 2019static ssize_t
2139backlog_store(struct mddev *mddev, const char *buf, size_t len) 2020backlog_store(mddev_t *mddev, const char *buf, size_t len)
2140{ 2021{
2141 unsigned long backlog; 2022 unsigned long backlog;
2142 int rv = strict_strtoul(buf, 10, &backlog); 2023 int rv = strict_strtoul(buf, 10, &backlog);
@@ -2152,13 +2033,13 @@ static struct md_sysfs_entry bitmap_backlog =
2152__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); 2033__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
2153 2034
2154static ssize_t 2035static ssize_t
2155chunksize_show(struct mddev *mddev, char *page) 2036chunksize_show(mddev_t *mddev, char *page)
2156{ 2037{
2157 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); 2038 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
2158} 2039}
2159 2040
2160static ssize_t 2041static ssize_t
2161chunksize_store(struct mddev *mddev, const char *buf, size_t len) 2042chunksize_store(mddev_t *mddev, const char *buf, size_t len)
2162{ 2043{
2163 /* Can only be changed when no bitmap is active */ 2044 /* Can only be changed when no bitmap is active */
2164 int rv; 2045 int rv;
@@ -2178,13 +2059,13 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
2178static struct md_sysfs_entry bitmap_chunksize = 2059static struct md_sysfs_entry bitmap_chunksize =
2179__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); 2060__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
2180 2061
2181static ssize_t metadata_show(struct mddev *mddev, char *page) 2062static ssize_t metadata_show(mddev_t *mddev, char *page)
2182{ 2063{
2183 return sprintf(page, "%s\n", (mddev->bitmap_info.external 2064 return sprintf(page, "%s\n", (mddev->bitmap_info.external
2184 ? "external" : "internal")); 2065 ? "external" : "internal"));
2185} 2066}
2186 2067
2187static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) 2068static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
2188{ 2069{
2189 if (mddev->bitmap || 2070 if (mddev->bitmap ||
2190 mddev->bitmap_info.file || 2071 mddev->bitmap_info.file ||
@@ -2202,7 +2083,7 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
2202static struct md_sysfs_entry bitmap_metadata = 2083static struct md_sysfs_entry bitmap_metadata =
2203__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2084__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2204 2085
2205static ssize_t can_clear_show(struct mddev *mddev, char *page) 2086static ssize_t can_clear_show(mddev_t *mddev, char *page)
2206{ 2087{
2207 int len; 2088 int len;
2208 if (mddev->bitmap) 2089 if (mddev->bitmap)
@@ -2213,7 +2094,7 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
2213 return len; 2094 return len;
2214} 2095}
2215 2096
2216static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) 2097static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
2217{ 2098{
2218 if (mddev->bitmap == NULL) 2099 if (mddev->bitmap == NULL)
2219 return -ENOENT; 2100 return -ENOENT;
@@ -2232,7 +2113,7 @@ static struct md_sysfs_entry bitmap_can_clear =
2232__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); 2113__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2233 2114
2234static ssize_t 2115static ssize_t
2235behind_writes_used_show(struct mddev *mddev, char *page) 2116behind_writes_used_show(mddev_t *mddev, char *page)
2236{ 2117{
2237 if (mddev->bitmap == NULL) 2118 if (mddev->bitmap == NULL)
2238 return sprintf(page, "0\n"); 2119 return sprintf(page, "0\n");
@@ -2241,7 +2122,7 @@ behind_writes_used_show(struct mddev *mddev, char *page)
2241} 2122}
2242 2123
2243static ssize_t 2124static ssize_t
2244behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) 2125behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
2245{ 2126{
2246 if (mddev->bitmap) 2127 if (mddev->bitmap)
2247 mddev->bitmap->behind_writes_used = 0; 2128 mddev->bitmap->behind_writes_used = 0;
@@ -2254,7 +2135,6 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2254 2135
2255static struct attribute *md_bitmap_attrs[] = { 2136static struct attribute *md_bitmap_attrs[] = {
2256 &bitmap_location.attr, 2137 &bitmap_location.attr,
2257 &bitmap_space.attr,
2258 &bitmap_timeout.attr, 2138 &bitmap_timeout.attr,
2259 &bitmap_backlog.attr, 2139 &bitmap_backlog.attr,
2260 &bitmap_chunksize.attr, 2140 &bitmap_chunksize.attr,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f..a28f2e5588c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,6 +13,8 @@
13#define BITMAP_MAJOR_HI 4 13#define BITMAP_MAJOR_HI 4
14#define BITMAP_MAJOR_HOSTENDIAN 3 14#define BITMAP_MAJOR_HOSTENDIAN 3
15 15
16#define BITMAP_MINOR 39
17
16/* 18/*
17 * in-memory bitmap: 19 * in-memory bitmap:
18 * 20 *
@@ -99,8 +101,22 @@ typedef __u16 bitmap_counter_t;
99/* same, except a mask value for more efficient bitops */ 101/* same, except a mask value for more efficient bitops */
100#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 102#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
101 103
104#define BITMAP_BLOCK_SIZE 512
102#define BITMAP_BLOCK_SHIFT 9 105#define BITMAP_BLOCK_SHIFT 9
103 106
107/* how many blocks per chunk? (this is variable) */
108#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
109#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
111
112/* when hijacked, the counters and bits represent even larger "chunks" */
113/* there will be 1024 chunks represented by each counter in the page pointers */
114#define PAGEPTR_BLOCK_RATIO(bitmap) \
115 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
116#define PAGEPTR_BLOCK_SHIFT(bitmap) \
117 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
118#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
119
104#endif 120#endif
105 121
106/* 122/*
@@ -111,9 +127,9 @@ typedef __u16 bitmap_counter_t;
111 127
112/* use these for bitmap->flags and bitmap->sb->state bit-fields */ 128/* use these for bitmap->flags and bitmap->sb->state bit-fields */
113enum bitmap_state { 129enum bitmap_state {
114 BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 130 BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
115 BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ 131 BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
116 BITMAP_HOSTENDIAN =15, 132 BITMAP_HOSTENDIAN = 0x8000,
117}; 133};
118 134
119/* the superblock at the front of the bitmap file -- little endian */ 135/* the superblock at the front of the bitmap file -- little endian */
@@ -128,10 +144,8 @@ typedef struct bitmap_super_s {
128 __le32 chunksize; /* 52 the bitmap chunk size in bytes */ 144 __le32 chunksize; /* 52 the bitmap chunk size in bytes */
129 __le32 daemon_sleep; /* 56 seconds between disk flushes */ 145 __le32 daemon_sleep; /* 56 seconds between disk flushes */
130 __le32 write_behind; /* 60 number of outstanding write-behind writes */ 146 __le32 write_behind; /* 60 number of outstanding write-behind writes */
131 __le32 sectors_reserved; /* 64 number of 512-byte sectors that are
132 * reserved for the bitmap. */
133 147
134 __u8 pad[256 - 68]; /* set to zero */ 148 __u8 pad[256 - 64]; /* set to zero */
135} bitmap_super_t; 149} bitmap_super_t;
136 150
137/* notes: 151/* notes:
@@ -162,48 +176,41 @@ struct bitmap_page {
162 */ 176 */
163 unsigned int hijacked:1; 177 unsigned int hijacked:1;
164 /* 178 /*
165 * If any counter in this page is '1' or '2' - and so could be
166 * cleared then that page is marked as 'pending'
167 */
168 unsigned int pending:1;
169 /*
170 * count of dirty bits on the page 179 * count of dirty bits on the page
171 */ 180 */
172 unsigned int count:30; 181 unsigned int count:31;
182};
183
184/* keep track of bitmap file pages that have pending writes on them */
185struct page_list {
186 struct list_head list;
187 struct page *page;
173}; 188};
174 189
175/* the main bitmap structure - one per mddev */ 190/* the main bitmap structure - one per mddev */
176struct bitmap { 191struct bitmap {
192 struct bitmap_page *bp;
193 unsigned long pages; /* total number of pages in the bitmap */
194 unsigned long missing_pages; /* number of pages not yet allocated */
195
196 mddev_t *mddev; /* the md device that the bitmap is for */
177 197
178 struct bitmap_counts { 198 /* bitmap chunksize -- how much data does each bit represent? */
179 spinlock_t lock; 199 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
180 struct bitmap_page *bp; 200 unsigned long chunks; /* total number of data chunks for the array */
181 unsigned long pages; /* total number of pages
182 * in the bitmap */
183 unsigned long missing_pages; /* number of pages
184 * not yet allocated */
185 unsigned long chunkshift; /* chunksize = 2^chunkshift
186 * (for bitops) */
187 unsigned long chunks; /* Total number of data
188 * chunks for the array */
189 } counts;
190
191 struct mddev *mddev; /* the md device that the bitmap is for */
192 201
193 __u64 events_cleared; 202 __u64 events_cleared;
194 int need_sync; 203 int need_sync;
195 204
196 struct bitmap_storage { 205 /* bitmap spinlock */
197 struct file *file; /* backing disk file */ 206 spinlock_t lock;
198 struct page *sb_page; /* cached copy of the bitmap 207
199 * file superblock */ 208 struct file *file; /* backing disk file */
200 struct page **filemap; /* list of cache pages for 209 struct page *sb_page; /* cached copy of the bitmap file superblock */
201 * the file */ 210 struct page **filemap; /* list of cache pages for the file */
202 unsigned long *filemap_attr; /* attributes associated 211 unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
203 * w/ filemap pages */ 212 unsigned long file_pages; /* number of pages in the file */
204 unsigned long file_pages; /* number of pages in the file*/ 213 int last_page_size; /* bytes in the last page */
205 unsigned long bytes; /* total bytes in the bitmap */
206 } storage;
207 214
208 unsigned long flags; 215 unsigned long flags;
209 216
@@ -231,14 +238,13 @@ struct bitmap {
231/* the bitmap API */ 238/* the bitmap API */
232 239
233/* these are used only by md/bitmap */ 240/* these are used only by md/bitmap */
234int bitmap_create(struct mddev *mddev); 241int bitmap_create(mddev_t *mddev);
235int bitmap_load(struct mddev *mddev); 242int bitmap_load(mddev_t *mddev);
236void bitmap_flush(struct mddev *mddev); 243void bitmap_flush(mddev_t *mddev);
237void bitmap_destroy(struct mddev *mddev); 244void bitmap_destroy(mddev_t *mddev);
238 245
239void bitmap_print_sb(struct bitmap *bitmap); 246void bitmap_print_sb(struct bitmap *bitmap);
240void bitmap_update_sb(struct bitmap *bitmap); 247void bitmap_update_sb(struct bitmap *bitmap);
241void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
242 248
243int bitmap_setallbits(struct bitmap *bitmap); 249int bitmap_setallbits(struct bitmap *bitmap);
244void bitmap_write_all(struct bitmap *bitmap); 250void bitmap_write_all(struct bitmap *bitmap);
@@ -256,10 +262,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
256void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 262void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
257 263
258void bitmap_unplug(struct bitmap *bitmap); 264void bitmap_unplug(struct bitmap *bitmap);
259void bitmap_daemon_work(struct mddev *mddev); 265void bitmap_daemon_work(mddev_t *mddev);
260
261int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
262 int chunksize, int init);
263#endif 266#endif
264 267
265#endif 268#endif
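The CHUNK_BLOCK_* macros reinstated in the header above turn a sector offset into a chunk index, and the PAGE_COUNTER_* constants then locate the 16-bit counter for that chunk. A small stand-alone sketch of that arithmetic; the 64 MiB chunk size and 4 KiB page size are example assumptions, while the 512-byte block shift and the __u16 counter width do come from the header:

#include <stdio.h>
#include <stdint.h>

#define BITMAP_BLOCK_SHIFT 9                                /* 512-byte blocks, from the header */
#define PAGE_SIZE          4096UL                           /* assumed for the example */
#define COUNTERS_PER_PAGE  (PAGE_SIZE / sizeof(uint16_t))   /* bitmap_counter_t is __u16 */

int main(void)
{
	unsigned long chunkshift = 26;                      /* 2^26 = 64 MiB chunks (example) */
	unsigned long chunk_block_shift = chunkshift - BITMAP_BLOCK_SHIFT;  /* CHUNK_BLOCK_SHIFT() */

	uint64_t offset = 123456789;                        /* array offset in 512-byte sectors */
	uint64_t chunk  = offset >> chunk_block_shift;      /* which chunk covers this sector */
	uint64_t page   = chunk / COUNTERS_PER_PAGE;        /* which counter page */
	uint64_t slot   = chunk % COUNTERS_PER_PAGE;        /* counter index within that page */

	printf("sector %llu -> chunk %llu (page %llu, slot %llu)\n",
	       (unsigned long long)offset, (unsigned long long)chunk,
	       (unsigned long long)page, (unsigned long long)slot);
	return 0;
}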
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
deleted file mode 100644
index aefb78e3cbf..00000000000
--- a/drivers/md/dm-bio-prison.c
+++ /dev/null
@@ -1,390 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9
10#include <linux/spinlock.h>
11#include <linux/mempool.h>
12#include <linux/module.h>
13#include <linux/slab.h>
14
15/*----------------------------------------------------------------*/
16
17struct dm_bio_prison_cell {
18 struct hlist_node list;
19 struct dm_bio_prison *prison;
20 struct dm_cell_key key;
21 struct bio *holder;
22 struct bio_list bios;
23};
24
25struct dm_bio_prison {
26 spinlock_t lock;
27 mempool_t *cell_pool;
28
29 unsigned nr_buckets;
30 unsigned hash_mask;
31 struct hlist_head *cells;
32};
33
34/*----------------------------------------------------------------*/
35
36static uint32_t calc_nr_buckets(unsigned nr_cells)
37{
38 uint32_t n = 128;
39
40 nr_cells /= 4;
41 nr_cells = min(nr_cells, 8192u);
42
43 while (n < nr_cells)
44 n <<= 1;
45
46 return n;
47}
48
49static struct kmem_cache *_cell_cache;
50
51/*
52 * @nr_cells should be the number of cells you want in use _concurrently_.
53 * Don't confuse it with the number of distinct keys.
54 */
55struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
56{
57 unsigned i;
58 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
59 size_t len = sizeof(struct dm_bio_prison) +
60 (sizeof(struct hlist_head) * nr_buckets);
61 struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
62
63 if (!prison)
64 return NULL;
65
66 spin_lock_init(&prison->lock);
67 prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
68 if (!prison->cell_pool) {
69 kfree(prison);
70 return NULL;
71 }
72
73 prison->nr_buckets = nr_buckets;
74 prison->hash_mask = nr_buckets - 1;
75 prison->cells = (struct hlist_head *) (prison + 1);
76 for (i = 0; i < nr_buckets; i++)
77 INIT_HLIST_HEAD(prison->cells + i);
78
79 return prison;
80}
81EXPORT_SYMBOL_GPL(dm_bio_prison_create);
82
83void dm_bio_prison_destroy(struct dm_bio_prison *prison)
84{
85 mempool_destroy(prison->cell_pool);
86 kfree(prison);
87}
88EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
89
90static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
91{
92 const unsigned long BIG_PRIME = 4294967291UL;
93 uint64_t hash = key->block * BIG_PRIME;
94
95 return (uint32_t) (hash & prison->hash_mask);
96}
97
98static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
99{
100 return (lhs->virtual == rhs->virtual) &&
101 (lhs->dev == rhs->dev) &&
102 (lhs->block == rhs->block);
103}
104
105static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
106 struct dm_cell_key *key)
107{
108 struct dm_bio_prison_cell *cell;
109 struct hlist_node *tmp;
110
111 hlist_for_each_entry(cell, tmp, bucket, list)
112 if (keys_equal(&cell->key, key))
113 return cell;
114
115 return NULL;
116}
117
118/*
119 * This may block if a new cell needs allocating. You must ensure that
120 * cells will be unlocked even if the calling thread is blocked.
121 *
122 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
123 */
124int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
125 struct bio *inmate, struct dm_bio_prison_cell **ref)
126{
127 int r = 1;
128 unsigned long flags;
129 uint32_t hash = hash_key(prison, key);
130 struct dm_bio_prison_cell *cell, *cell2;
131
132 BUG_ON(hash > prison->nr_buckets);
133
134 spin_lock_irqsave(&prison->lock, flags);
135
136 cell = __search_bucket(prison->cells + hash, key);
137 if (cell) {
138 bio_list_add(&cell->bios, inmate);
139 goto out;
140 }
141
142 /*
143 * Allocate a new cell
144 */
145 spin_unlock_irqrestore(&prison->lock, flags);
146 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
147 spin_lock_irqsave(&prison->lock, flags);
148
149 /*
150 * We've been unlocked, so we have to double check that
151 * nobody else has inserted this cell in the meantime.
152 */
153 cell = __search_bucket(prison->cells + hash, key);
154 if (cell) {
155 mempool_free(cell2, prison->cell_pool);
156 bio_list_add(&cell->bios, inmate);
157 goto out;
158 }
159
160 /*
161 * Use new cell.
162 */
163 cell = cell2;
164
165 cell->prison = prison;
166 memcpy(&cell->key, key, sizeof(cell->key));
167 cell->holder = inmate;
168 bio_list_init(&cell->bios);
169 hlist_add_head(&cell->list, prison->cells + hash);
170
171 r = 0;
172
173out:
174 spin_unlock_irqrestore(&prison->lock, flags);
175
176 *ref = cell;
177
178 return r;
179}
180EXPORT_SYMBOL_GPL(dm_bio_detain);
181
182/*
183 * @inmates must have been initialised prior to this call
184 */
185static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
186{
187 struct dm_bio_prison *prison = cell->prison;
188
189 hlist_del(&cell->list);
190
191 if (inmates) {
192 bio_list_add(inmates, cell->holder);
193 bio_list_merge(inmates, &cell->bios);
194 }
195
196 mempool_free(cell, prison->cell_pool);
197}
198
199void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
200{
201 unsigned long flags;
202 struct dm_bio_prison *prison = cell->prison;
203
204 spin_lock_irqsave(&prison->lock, flags);
205 __cell_release(cell, bios);
206 spin_unlock_irqrestore(&prison->lock, flags);
207}
208EXPORT_SYMBOL_GPL(dm_cell_release);
209
210/*
211 * Sometimes we don't want the holder, just the additional bios.
212 */
213static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
214{
215 struct dm_bio_prison *prison = cell->prison;
216
217 hlist_del(&cell->list);
218 bio_list_merge(inmates, &cell->bios);
219
220 mempool_free(cell, prison->cell_pool);
221}
222
223void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
224{
225 unsigned long flags;
226 struct dm_bio_prison *prison = cell->prison;
227
228 spin_lock_irqsave(&prison->lock, flags);
229 __cell_release_no_holder(cell, inmates);
230 spin_unlock_irqrestore(&prison->lock, flags);
231}
232EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
233
234void dm_cell_error(struct dm_bio_prison_cell *cell)
235{
236 struct dm_bio_prison *prison = cell->prison;
237 struct bio_list bios;
238 struct bio *bio;
239 unsigned long flags;
240
241 bio_list_init(&bios);
242
243 spin_lock_irqsave(&prison->lock, flags);
244 __cell_release(cell, &bios);
245 spin_unlock_irqrestore(&prison->lock, flags);
246
247 while ((bio = bio_list_pop(&bios)))
248 bio_io_error(bio);
249}
250EXPORT_SYMBOL_GPL(dm_cell_error);
251
252/*----------------------------------------------------------------*/
253
254#define DEFERRED_SET_SIZE 64
255
256struct dm_deferred_entry {
257 struct dm_deferred_set *ds;
258 unsigned count;
259 struct list_head work_items;
260};
261
262struct dm_deferred_set {
263 spinlock_t lock;
264 unsigned current_entry;
265 unsigned sweeper;
266 struct dm_deferred_entry entries[DEFERRED_SET_SIZE];
267};
268
269struct dm_deferred_set *dm_deferred_set_create(void)
270{
271 int i;
272 struct dm_deferred_set *ds;
273
274 ds = kmalloc(sizeof(*ds), GFP_KERNEL);
275 if (!ds)
276 return NULL;
277
278 spin_lock_init(&ds->lock);
279 ds->current_entry = 0;
280 ds->sweeper = 0;
281 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
282 ds->entries[i].ds = ds;
283 ds->entries[i].count = 0;
284 INIT_LIST_HEAD(&ds->entries[i].work_items);
285 }
286
287 return ds;
288}
289EXPORT_SYMBOL_GPL(dm_deferred_set_create);
290
291void dm_deferred_set_destroy(struct dm_deferred_set *ds)
292{
293 kfree(ds);
294}
295EXPORT_SYMBOL_GPL(dm_deferred_set_destroy);
296
297struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds)
298{
299 unsigned long flags;
300 struct dm_deferred_entry *entry;
301
302 spin_lock_irqsave(&ds->lock, flags);
303 entry = ds->entries + ds->current_entry;
304 entry->count++;
305 spin_unlock_irqrestore(&ds->lock, flags);
306
307 return entry;
308}
309EXPORT_SYMBOL_GPL(dm_deferred_entry_inc);
310
311static unsigned ds_next(unsigned index)
312{
313 return (index + 1) % DEFERRED_SET_SIZE;
314}
315
316static void __sweep(struct dm_deferred_set *ds, struct list_head *head)
317{
318 while ((ds->sweeper != ds->current_entry) &&
319 !ds->entries[ds->sweeper].count) {
320 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
321 ds->sweeper = ds_next(ds->sweeper);
322 }
323
324 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
325 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
326}
327
328void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&entry->ds->lock, flags);
333 BUG_ON(!entry->count);
334 --entry->count;
335 __sweep(entry->ds, head);
336 spin_unlock_irqrestore(&entry->ds->lock, flags);
337}
338EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
339
340/*
341 * Returns 1 if the work was deferred, or 0 if there were no pending items to delay the job.
342 */
343int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
344{
345 int r = 1;
346 unsigned long flags;
347 unsigned next_entry;
348
349 spin_lock_irqsave(&ds->lock, flags);
350 if ((ds->sweeper == ds->current_entry) &&
351 !ds->entries[ds->current_entry].count)
352 r = 0;
353 else {
354 list_add(work, &ds->entries[ds->current_entry].work_items);
355 next_entry = ds_next(ds->current_entry);
356 if (!ds->entries[next_entry].count)
357 ds->current_entry = next_entry;
358 }
359 spin_unlock_irqrestore(&ds->lock, flags);
360
361 return r;
362}
363EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
364
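/*
 * Illustrative sketch (added, not part of the original file): the usual
 * inc/dec/add_work pattern. The work item type and the example_run() handler
 * are hypothetical; only the dm_deferred_* calls are taken from above.
 */
#if 0	/* example only, never compiled */
struct example_work {
	struct list_head list;
	/* ... whatever the deferred job needs ... */
};

/* Issuer side: pin the current generation for the lifetime of a read. */
static struct dm_deferred_entry *example_begin_read(struct dm_deferred_set *ds)
{
	return dm_deferred_entry_inc(ds);
}

/* Completion side: drop the pin and run any work that is now unblocked. */
static void example_end_read(struct dm_deferred_entry *entry)
{
	LIST_HEAD(done);
	struct example_work *w, *tmp;

	dm_deferred_entry_dec(entry, &done);
	list_for_each_entry_safe(w, tmp, &done, list)
		example_run(w);			/* hypothetical handler */
}

/* Writer side: queue work until all earlier reads have drained. */
static void example_queue_or_run(struct dm_deferred_set *ds,
				 struct example_work *w)
{
	if (!dm_deferred_set_add_work(ds, &w->list))
		example_run(w);			/* nothing pending, run now */
}
#endif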
365/*----------------------------------------------------------------*/
366
367static int __init dm_bio_prison_init(void)
368{
369 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
370 if (!_cell_cache)
371 return -ENOMEM;
372
373 return 0;
374}
375
376static void __exit dm_bio_prison_exit(void)
377{
378 kmem_cache_destroy(_cell_cache);
379 _cell_cache = NULL;
380}
381
382/*
383 * module hooks
384 */
385module_init(dm_bio_prison_init);
386module_exit(dm_bio_prison_exit);
387
388MODULE_DESCRIPTION(DM_NAME " bio prison");
389MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
390MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
deleted file mode 100644
index 53d1a7a84e2..00000000000
--- a/drivers/md/dm-bio-prison.h
+++ /dev/null
@@ -1,71 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_PRISON_H
8#define DM_BIO_PRISON_H
9
10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
12
13#include <linux/list.h>
14#include <linux/bio.h>
15
16/*----------------------------------------------------------------*/
17
18/*
19 * Sometimes we can't deal with a bio straight away. We put them in prison
20 * where they can't cause any mischief. Bios are put in a cell identified
21 * by a key, multiple bios can be in the same cell. When the cell is
22 * subsequently unlocked the bios become available.
23 */
24struct dm_bio_prison;
25struct dm_bio_prison_cell;
26
27/* FIXME: this needs to be more abstract */
28struct dm_cell_key {
29 int virtual;
30 dm_thin_id dev;
31 dm_block_t block;
32};
33
34struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
35void dm_bio_prison_destroy(struct dm_bio_prison *prison);
36
37/*
38 * This may block if a new cell needs allocating. You must ensure that
39 * cells will be unlocked even if the calling thread is blocked.
40 *
41 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
42 */
43int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
44 struct bio *inmate, struct dm_bio_prison_cell **ref);
45
46void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
47void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
48void dm_cell_error(struct dm_bio_prison_cell *cell);
49
50/*----------------------------------------------------------------*/
51
52/*
53 * We use the deferred set to keep track of pending reads to shared blocks.
54 * We do this to ensure the new mapping caused by a write isn't performed
55 * until these prior reads have completed. Otherwise the insertion of the
56 * new mapping could free the old block that the read bios are mapped to.
57 */
58
59struct dm_deferred_set;
60struct dm_deferred_entry;
61
62struct dm_deferred_set *dm_deferred_set_create(void);
63void dm_deferred_set_destroy(struct dm_deferred_set *ds);
64
65struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds);
66void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head);
67int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work);
68
69/*----------------------------------------------------------------*/
70
71#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
deleted file mode 100644
index 651ca79881d..00000000000
--- a/drivers/md/dm-bufio.c
+++ /dev/null
@@ -1,1750 +0,0 @@
1/*
2 * Copyright (C) 2009-2011 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-bufio.h"
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15#include <linux/shrinker.h>
16#include <linux/module.h>
17
18#define DM_MSG_PREFIX "bufio"
19
20/*
21 * Memory management policy:
22 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
23 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
24 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
25 * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
26 * dirty buffers.
27 */
28#define DM_BUFIO_MIN_BUFFERS 8
29
30#define DM_BUFIO_MEMORY_PERCENT 2
31#define DM_BUFIO_VMALLOC_PERCENT 25
32#define DM_BUFIO_WRITEBACK_PERCENT 75
33
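/*
 * Worked example (added for illustration; the numbers assume a hypothetical
 * 32-bit machine with 2 GiB of directly mapped memory and a 128 MiB vmalloc
 * arena): 2% of 2 GiB is ~41 MiB and 25% of 128 MiB is 32 MiB, so the default
 * cache size is the lower value, 32 MiB, shared equally between all bufio
 * clients. Each client is still allowed at least DM_BUFIO_MIN_BUFFERS
 * buffers, and background writeback for a client starts once its dirty
 * buffers exceed 75% of its share.
 */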
34/*
35 * Check buffer ages in this interval (seconds)
36 */
37#define DM_BUFIO_WORK_TIMER_SECS 10
38
39/*
40 * Free buffers when they are older than this (seconds)
41 */
42#define DM_BUFIO_DEFAULT_AGE_SECS 60
43
44/*
45 * The number of bvec entries that are embedded directly in the buffer.
46 * If the chunk size is larger, dm-io is used to do the io.
47 */
48#define DM_BUFIO_INLINE_VECS 16
49
50/*
51 * Buffer hash
52 */
53#define DM_BUFIO_HASH_BITS 20
54#define DM_BUFIO_HASH(block) \
55 ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
56 ((1 << DM_BUFIO_HASH_BITS) - 1))
57
58/*
59 * Don't try to use kmem_cache_alloc for blocks larger than this.
60 * For explanation, see alloc_buffer_data below.
61 */
62#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
63#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
64
65/*
66 * dm_buffer->list_mode
67 */
68#define LIST_CLEAN 0
69#define LIST_DIRTY 1
70#define LIST_SIZE 2
71
72/*
73 * Linking of buffers:
74 * All buffers are linked to cache_hash with their hash_list field.
75 *
76 * Clean buffers that are not being written (B_WRITING not set)
77 * are linked to lru[LIST_CLEAN] with their lru_list field.
78 *
79 * Dirty and clean buffers that are being written are linked to
80 * lru[LIST_DIRTY] with their lru_list field. When the write
81 * finishes, the buffer cannot be relinked immediately (because we
82 * are in an interrupt context and relinking requires process
83 * context), so some clean-not-writing buffers can be held on
84 * dirty_lru too. They are later added to lru in the process
85 * context.
86 */
87struct dm_bufio_client {
88 struct mutex lock;
89
90 struct list_head lru[LIST_SIZE];
91 unsigned long n_buffers[LIST_SIZE];
92
93 struct block_device *bdev;
94 unsigned block_size;
95 unsigned char sectors_per_block_bits;
96 unsigned char pages_per_block_bits;
97 unsigned char blocks_per_page_bits;
98 unsigned aux_size;
99 void (*alloc_callback)(struct dm_buffer *);
100 void (*write_callback)(struct dm_buffer *);
101
102 struct dm_io_client *dm_io;
103
104 struct list_head reserved_buffers;
105 unsigned need_reserved_buffers;
106
107 struct hlist_head *cache_hash;
108 wait_queue_head_t free_buffer_wait;
109
110 int async_write_error;
111
112 struct list_head client_list;
113 struct shrinker shrinker;
114};
115
116/*
117 * Buffer state bits.
118 */
119#define B_READING 0
120#define B_WRITING 1
121#define B_DIRTY 2
122
123/*
124 * Describes how the block was allocated:
125 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
126 * See the comment at alloc_buffer_data.
127 */
128enum data_mode {
129 DATA_MODE_SLAB = 0,
130 DATA_MODE_GET_FREE_PAGES = 1,
131 DATA_MODE_VMALLOC = 2,
132 DATA_MODE_LIMIT = 3
133};
134
135struct dm_buffer {
136 struct hlist_node hash_list;
137 struct list_head lru_list;
138 sector_t block;
139 void *data;
140 enum data_mode data_mode;
141 unsigned char list_mode; /* LIST_* */
142 unsigned hold_count;
143 int read_error;
144 int write_error;
145 unsigned long state;
146 unsigned long last_accessed;
147 struct dm_bufio_client *c;
148 struct bio bio;
149 struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
150};
151
152/*----------------------------------------------------------------*/
153
154static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
155static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
156
157static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
158{
159 unsigned ret = c->blocks_per_page_bits - 1;
160
161 BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
162
163 return ret;
164}
165
166#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
167#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
168
169#define dm_bufio_in_request() (!!current->bio_list)
170
171static void dm_bufio_lock(struct dm_bufio_client *c)
172{
173 mutex_lock_nested(&c->lock, dm_bufio_in_request());
174}
175
176static int dm_bufio_trylock(struct dm_bufio_client *c)
177{
178 return mutex_trylock(&c->lock);
179}
180
181static void dm_bufio_unlock(struct dm_bufio_client *c)
182{
183 mutex_unlock(&c->lock);
184}
185
186/*
187 * FIXME Move to sched.h?
188 */
189#ifdef CONFIG_PREEMPT_VOLUNTARY
190# define dm_bufio_cond_resched() \
191do { \
192 if (unlikely(need_resched())) \
193 _cond_resched(); \
194} while (0)
195#else
196# define dm_bufio_cond_resched() do { } while (0)
197#endif
198
199/*----------------------------------------------------------------*/
200
201/*
202 * Default cache size: available memory divided by the ratio.
203 */
204static unsigned long dm_bufio_default_cache_size;
205
206/*
207 * Total cache size set by the user.
208 */
209static unsigned long dm_bufio_cache_size;
210
211/*
212 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
213 * at any time. If it disagrees, the user has changed cache size.
214 */
215static unsigned long dm_bufio_cache_size_latch;
216
217static DEFINE_SPINLOCK(param_spinlock);
218
219/*
220 * Buffers are freed after this timeout
221 */
222static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
223
224static unsigned long dm_bufio_peak_allocated;
225static unsigned long dm_bufio_allocated_kmem_cache;
226static unsigned long dm_bufio_allocated_get_free_pages;
227static unsigned long dm_bufio_allocated_vmalloc;
228static unsigned long dm_bufio_current_allocated;
229
230/*----------------------------------------------------------------*/
231
232/*
233 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
234 */
235static unsigned long dm_bufio_cache_size_per_client;
236
237/*
238 * The current number of clients.
239 */
240static int dm_bufio_client_count;
241
242/*
243 * The list of all clients.
244 */
245static LIST_HEAD(dm_bufio_all_clients);
246
247/*
248 * This mutex protects dm_bufio_cache_size_latch,
249 * dm_bufio_cache_size_per_client and dm_bufio_client_count
250 */
251static DEFINE_MUTEX(dm_bufio_clients_lock);
252
253/*----------------------------------------------------------------*/
254
255static void adjust_total_allocated(enum data_mode data_mode, long diff)
256{
257 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
258 &dm_bufio_allocated_kmem_cache,
259 &dm_bufio_allocated_get_free_pages,
260 &dm_bufio_allocated_vmalloc,
261 };
262
263 spin_lock(&param_spinlock);
264
265 *class_ptr[data_mode] += diff;
266
267 dm_bufio_current_allocated += diff;
268
269 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
270 dm_bufio_peak_allocated = dm_bufio_current_allocated;
271
272 spin_unlock(&param_spinlock);
273}
274
275/*
276 * Change the number of clients and recalculate per-client limit.
277 */
278static void __cache_size_refresh(void)
279{
280 BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
281 BUG_ON(dm_bufio_client_count < 0);
282
283 dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
284
285 /*
286 * Use default if set to 0 and report the actual cache size used.
287 */
288 if (!dm_bufio_cache_size_latch) {
289 (void)cmpxchg(&dm_bufio_cache_size, 0,
290 dm_bufio_default_cache_size);
291 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
292 }
293
294 dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
295 (dm_bufio_client_count ? : 1);
296}
297
298/*
299 * Allocating buffer data.
300 *
301 * Small buffers are allocated with kmem_cache, to use space optimally.
302 *
303 * For large buffers, we choose between get_free_pages and vmalloc.
304 * Each has advantages and disadvantages.
305 *
306 * __get_free_pages can randomly fail if the memory is fragmented.
307 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
308 * as low as 128M) so using it for caching is not appropriate.
309 *
310 * If the allocation may fail we use __get_free_pages. Memory fragmentation
311 * won't have a fatal effect here, but it just causes flushes of some other
312 * buffers and more I/O will be performed. Don't use __get_free_pages if it
313 * always fails (i.e. order >= MAX_ORDER).
314 *
315 * If the allocation shouldn't fail we use __vmalloc. This is only for the
316 * initial reserve allocation, so there's no risk of wasting all vmalloc
317 * space.
318 */
319static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
320 enum data_mode *data_mode)
321{
322 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
323 *data_mode = DATA_MODE_SLAB;
324 return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
325 }
326
327 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
328 gfp_mask & __GFP_NORETRY) {
329 *data_mode = DATA_MODE_GET_FREE_PAGES;
330 return (void *)__get_free_pages(gfp_mask,
331 c->pages_per_block_bits);
332 }
333
334 *data_mode = DATA_MODE_VMALLOC;
335 return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
336}
337
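/*
 * Worked example (added for illustration; assumes 4 KiB pages and
 * MAX_ORDER = 11): DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT is 2 KiB and
 * DM_BUFIO_BLOCK_SIZE_GFP_LIMIT is 4 MiB. A 512-byte block therefore comes
 * from a kmem_cache, a 4 KiB block uses __get_free_pages when the caller
 * passes __GFP_NORETRY, and the same 4 KiB block falls through to __vmalloc
 * for the initial reserve allocation, which is done with plain GFP_KERNEL.
 */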
338/*
339 * Free buffer's data.
340 */
341static void free_buffer_data(struct dm_bufio_client *c,
342 void *data, enum data_mode data_mode)
343{
344 switch (data_mode) {
345 case DATA_MODE_SLAB:
346 kmem_cache_free(DM_BUFIO_CACHE(c), data);
347 break;
348
349 case DATA_MODE_GET_FREE_PAGES:
350 free_pages((unsigned long)data, c->pages_per_block_bits);
351 break;
352
353 case DATA_MODE_VMALLOC:
354 vfree(data);
355 break;
356
357 default:
358 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
359 data_mode);
360 BUG();
361 }
362}
363
364/*
365 * Allocate buffer and its data.
366 */
367static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
368{
369 struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
370 gfp_mask);
371
372 if (!b)
373 return NULL;
374
375 b->c = c;
376
377 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
378 if (!b->data) {
379 kfree(b);
380 return NULL;
381 }
382
383 adjust_total_allocated(b->data_mode, (long)c->block_size);
384
385 return b;
386}
387
388/*
389 * Free buffer and its data.
390 */
391static void free_buffer(struct dm_buffer *b)
392{
393 struct dm_bufio_client *c = b->c;
394
395 adjust_total_allocated(b->data_mode, -(long)c->block_size);
396
397 free_buffer_data(c, b->data, b->data_mode);
398 kfree(b);
399}
400
401/*
402 * Link buffer to the hash list and clean or dirty queue.
403 */
404static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
405{
406 struct dm_bufio_client *c = b->c;
407
408 c->n_buffers[dirty]++;
409 b->block = block;
410 b->list_mode = dirty;
411 list_add(&b->lru_list, &c->lru[dirty]);
412 hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
413 b->last_accessed = jiffies;
414}
415
416/*
417 * Unlink buffer from the hash list and dirty or clean queue.
418 */
419static void __unlink_buffer(struct dm_buffer *b)
420{
421 struct dm_bufio_client *c = b->c;
422
423 BUG_ON(!c->n_buffers[b->list_mode]);
424
425 c->n_buffers[b->list_mode]--;
426 hlist_del(&b->hash_list);
427 list_del(&b->lru_list);
428}
429
430/*
431 * Place the buffer to the head of dirty or clean LRU queue.
432 */
433static void __relink_lru(struct dm_buffer *b, int dirty)
434{
435 struct dm_bufio_client *c = b->c;
436
437 BUG_ON(!c->n_buffers[b->list_mode]);
438
439 c->n_buffers[b->list_mode]--;
440 c->n_buffers[dirty]++;
441 b->list_mode = dirty;
442 list_move(&b->lru_list, &c->lru[dirty]);
443}
444
445/*----------------------------------------------------------------
446 * Submit I/O on the buffer.
447 *
448 * Bio interface is faster but it has some problems:
449 * the vector list is limited (increasing this limit increases
450 * memory-consumption per buffer, so it is not viable);
451 *
452 * the memory must be direct-mapped, not vmalloced;
453 *
454 * the I/O driver can reject requests spuriously if it thinks that
455 * the requests are too big for the device or if they cross a
456 * controller-defined memory boundary.
457 *
458 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
459 * it is not vmalloced, try using the bio interface.
460 *
461 * If the buffer is big, if it is vmalloced or if the underlying device
462 * rejects the bio because it is too large, use dm-io layer to do the I/O.
463 * The dm-io layer splits the I/O into multiple requests, avoiding the above
464 * shortcomings.
465 *--------------------------------------------------------------*/
466
467/*
468 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
469 * that the request was handled directly with bio interface.
470 */
471static void dmio_complete(unsigned long error, void *context)
472{
473 struct dm_buffer *b = context;
474
475 b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
476}
477
478static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
479 bio_end_io_t *end_io)
480{
481 int r;
482 struct dm_io_request io_req = {
483 .bi_rw = rw,
484 .notify.fn = dmio_complete,
485 .notify.context = b,
486 .client = b->c->dm_io,
487 };
488 struct dm_io_region region = {
489 .bdev = b->c->bdev,
490 .sector = block << b->c->sectors_per_block_bits,
491 .count = b->c->block_size >> SECTOR_SHIFT,
492 };
493
494 if (b->data_mode != DATA_MODE_VMALLOC) {
495 io_req.mem.type = DM_IO_KMEM;
496 io_req.mem.ptr.addr = b->data;
497 } else {
498 io_req.mem.type = DM_IO_VMA;
499 io_req.mem.ptr.vma = b->data;
500 }
501
502 b->bio.bi_end_io = end_io;
503
504 r = dm_io(&io_req, 1, &region, NULL);
505 if (r)
506 end_io(&b->bio, r);
507}
508
509static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
510 bio_end_io_t *end_io)
511{
512 char *ptr;
513 int len;
514
515 bio_init(&b->bio);
516 b->bio.bi_io_vec = b->bio_vec;
517 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
518 b->bio.bi_sector = block << b->c->sectors_per_block_bits;
519 b->bio.bi_bdev = b->c->bdev;
520 b->bio.bi_end_io = end_io;
521
522 /*
523 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
524 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
525 */
526 ptr = b->data;
527 len = b->c->block_size;
528
529 if (len >= PAGE_SIZE)
530 BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
531 else
532 BUG_ON((unsigned long)ptr & (len - 1));
533
534 do {
535 if (!bio_add_page(&b->bio, virt_to_page(ptr),
536 len < PAGE_SIZE ? len : PAGE_SIZE,
537 virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
538 BUG_ON(b->c->block_size <= PAGE_SIZE);
539 use_dmio(b, rw, block, end_io);
540 return;
541 }
542
543 len -= PAGE_SIZE;
544 ptr += PAGE_SIZE;
545 } while (len > 0);
546
547 submit_bio(rw, &b->bio);
548}
549
550static void submit_io(struct dm_buffer *b, int rw, sector_t block,
551 bio_end_io_t *end_io)
552{
553 if (rw == WRITE && b->c->write_callback)
554 b->c->write_callback(b);
555
556 if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
557 b->data_mode != DATA_MODE_VMALLOC)
558 use_inline_bio(b, rw, block, end_io);
559 else
560 use_dmio(b, rw, block, end_io);
561}
562
563/*----------------------------------------------------------------
564 * Writing dirty buffers
565 *--------------------------------------------------------------*/
566
567/*
568 * The endio routine for write.
569 *
570 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
571 * it.
572 */
573static void write_endio(struct bio *bio, int error)
574{
575 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
576
577 b->write_error = error;
578 if (unlikely(error)) {
579 struct dm_bufio_client *c = b->c;
580 (void)cmpxchg(&c->async_write_error, 0, error);
581 }
582
583 BUG_ON(!test_bit(B_WRITING, &b->state));
584
585 smp_mb__before_clear_bit();
586 clear_bit(B_WRITING, &b->state);
587 smp_mb__after_clear_bit();
588
589 wake_up_bit(&b->state, B_WRITING);
590}
591
592/*
593 * This function is called when wait_on_bit is actually waiting.
594 */
595static int do_io_schedule(void *word)
596{
597 io_schedule();
598
599 return 0;
600}
601
602/*
603 * Initiate a write on a dirty buffer, but don't wait for it.
604 *
605 * - If the buffer is not dirty, exit.
606 * - If there is some previous write going on, wait for it to finish (we can't
607 * have two writes on the same buffer simultaneously).
608 * - Submit our write and don't wait on it. We set B_WRITING indicating
609 * that there is a write in progress.
610 */
611static void __write_dirty_buffer(struct dm_buffer *b)
612{
613 if (!test_bit(B_DIRTY, &b->state))
614 return;
615
616 clear_bit(B_DIRTY, &b->state);
617 wait_on_bit_lock(&b->state, B_WRITING,
618 do_io_schedule, TASK_UNINTERRUPTIBLE);
619
620 submit_io(b, WRITE, b->block, write_endio);
621}
622
623/*
624 * Wait until any activity on the buffer finishes. Possibly write the
625 * buffer if it is dirty. When this function finishes, there is no I/O
626 * running on the buffer and the buffer is not dirty.
627 */
628static void __make_buffer_clean(struct dm_buffer *b)
629{
630 BUG_ON(b->hold_count);
631
632 if (!b->state) /* fast case */
633 return;
634
635 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
636 __write_dirty_buffer(b);
637 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
638}
639
640/*
641 * Find some buffer that is not held by anybody, clean it, unlink it and
642 * return it.
643 */
644static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
645{
646 struct dm_buffer *b;
647
648 list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
649 BUG_ON(test_bit(B_WRITING, &b->state));
650 BUG_ON(test_bit(B_DIRTY, &b->state));
651
652 if (!b->hold_count) {
653 __make_buffer_clean(b);
654 __unlink_buffer(b);
655 return b;
656 }
657 dm_bufio_cond_resched();
658 }
659
660 list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
661 BUG_ON(test_bit(B_READING, &b->state));
662
663 if (!b->hold_count) {
664 __make_buffer_clean(b);
665 __unlink_buffer(b);
666 return b;
667 }
668 dm_bufio_cond_resched();
669 }
670
671 return NULL;
672}
673
674/*
675 * Wait until some other threads free some buffer or release hold count on
676 * some buffer.
677 *
678 * This function is entered with c->lock held, drops it and regains it
679 * before exiting.
680 */
681static void __wait_for_free_buffer(struct dm_bufio_client *c)
682{
683 DECLARE_WAITQUEUE(wait, current);
684
685 add_wait_queue(&c->free_buffer_wait, &wait);
686 set_task_state(current, TASK_UNINTERRUPTIBLE);
687 dm_bufio_unlock(c);
688
689 io_schedule();
690
691 set_task_state(current, TASK_RUNNING);
692 remove_wait_queue(&c->free_buffer_wait, &wait);
693
694 dm_bufio_lock(c);
695}
696
697enum new_flag {
698 NF_FRESH = 0,
699 NF_READ = 1,
700 NF_GET = 2,
701 NF_PREFETCH = 3
702};
703
704/*
705 * Allocate a new buffer. If the allocation is not possible, wait until
706 * some other thread frees a buffer.
707 *
708 * May drop the lock and regain it.
709 */
710static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
711{
712 struct dm_buffer *b;
713
714 /*
715 * dm-bufio is resistant to allocation failures (it just keeps
716 * one buffer reserved in case all the allocations fail).
717 * So set flags to not try too hard:
718 * GFP_NOIO: don't recurse into the I/O layer
719 * __GFP_NORETRY: don't retry and rather return failure
720 * __GFP_NOMEMALLOC: don't use emergency reserves
721 * __GFP_NOWARN: don't print a warning in case of failure
722 *
723 * For debugging, if we set the cache size to 1, no new buffers will
724 * be allocated.
725 */
726 while (1) {
727 if (dm_bufio_cache_size_latch != 1) {
728 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
729 if (b)
730 return b;
731 }
732
733 if (nf == NF_PREFETCH)
734 return NULL;
735
736 if (!list_empty(&c->reserved_buffers)) {
737 b = list_entry(c->reserved_buffers.next,
738 struct dm_buffer, lru_list);
739 list_del(&b->lru_list);
740 c->need_reserved_buffers++;
741
742 return b;
743 }
744
745 b = __get_unclaimed_buffer(c);
746 if (b)
747 return b;
748
749 __wait_for_free_buffer(c);
750 }
751}
752
753static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
754{
755 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
756
757 if (!b)
758 return NULL;
759
760 if (c->alloc_callback)
761 c->alloc_callback(b);
762
763 return b;
764}
765
766/*
767 * Free a buffer and wake other threads waiting for free buffers.
768 */
769static void __free_buffer_wake(struct dm_buffer *b)
770{
771 struct dm_bufio_client *c = b->c;
772
773 if (!c->need_reserved_buffers)
774 free_buffer(b);
775 else {
776 list_add(&b->lru_list, &c->reserved_buffers);
777 c->need_reserved_buffers--;
778 }
779
780 wake_up(&c->free_buffer_wait);
781}
782
783static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
784{
785 struct dm_buffer *b, *tmp;
786
787 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
788 BUG_ON(test_bit(B_READING, &b->state));
789
790 if (!test_bit(B_DIRTY, &b->state) &&
791 !test_bit(B_WRITING, &b->state)) {
792 __relink_lru(b, LIST_CLEAN);
793 continue;
794 }
795
796 if (no_wait && test_bit(B_WRITING, &b->state))
797 return;
798
799 __write_dirty_buffer(b);
800 dm_bufio_cond_resched();
801 }
802}
803
804/*
805 * Get writeback threshold and buffer limit for a given client.
806 */
807static void __get_memory_limit(struct dm_bufio_client *c,
808 unsigned long *threshold_buffers,
809 unsigned long *limit_buffers)
810{
811 unsigned long buffers;
812
813 if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
814 mutex_lock(&dm_bufio_clients_lock);
815 __cache_size_refresh();
816 mutex_unlock(&dm_bufio_clients_lock);
817 }
818
819 buffers = dm_bufio_cache_size_per_client >>
820 (c->sectors_per_block_bits + SECTOR_SHIFT);
821
822 if (buffers < DM_BUFIO_MIN_BUFFERS)
823 buffers = DM_BUFIO_MIN_BUFFERS;
824
825 *limit_buffers = buffers;
826 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
827}
828
829/*
830 * Check if we're over watermark.
831 * If we are over threshold_buffers, start freeing buffers.
832 * If we're over "limit_buffers", block until we get under the limit.
833 */
834static void __check_watermark(struct dm_bufio_client *c)
835{
836 unsigned long threshold_buffers, limit_buffers;
837
838 __get_memory_limit(c, &threshold_buffers, &limit_buffers);
839
840 while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
841 limit_buffers) {
842
843 struct dm_buffer *b = __get_unclaimed_buffer(c);
844
845 if (!b)
846 return;
847
848 __free_buffer_wake(b);
849 dm_bufio_cond_resched();
850 }
851
852 if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
853 __write_dirty_buffers_async(c, 1);
854}
855
856/*
857 * Find a buffer in the hash.
858 */
859static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
860{
861 struct dm_buffer *b;
862 struct hlist_node *hn;
863
864 hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
865 hash_list) {
866 dm_bufio_cond_resched();
867 if (b->block == block)
868 return b;
869 }
870
871 return NULL;
872}
873
874/*----------------------------------------------------------------
875 * Getting a buffer
876 *--------------------------------------------------------------*/
877
878static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
879 enum new_flag nf, int *need_submit)
880{
881 struct dm_buffer *b, *new_b = NULL;
882
883 *need_submit = 0;
884
885 b = __find(c, block);
886 if (b)
887 goto found_buffer;
888
889 if (nf == NF_GET)
890 return NULL;
891
892 new_b = __alloc_buffer_wait(c, nf);
893 if (!new_b)
894 return NULL;
895
896 /*
897 * We've had a period where the mutex was unlocked, so need to
898 * recheck the hash table.
899 */
900 b = __find(c, block);
901 if (b) {
902 __free_buffer_wake(new_b);
903 goto found_buffer;
904 }
905
906 __check_watermark(c);
907
908 b = new_b;
909 b->hold_count = 1;
910 b->read_error = 0;
911 b->write_error = 0;
912 __link_buffer(b, block, LIST_CLEAN);
913
914 if (nf == NF_FRESH) {
915 b->state = 0;
916 return b;
917 }
918
919 b->state = 1 << B_READING;
920 *need_submit = 1;
921
922 return b;
923
924found_buffer:
925 if (nf == NF_PREFETCH)
926 return NULL;
927 /*
928 * Note: it is essential that we don't wait for the buffer to be
929 * read if dm_bufio_get function is used. Both dm_bufio_get and
930 * dm_bufio_prefetch can be used in the driver request routine.
931 * If the user called both dm_bufio_prefetch and dm_bufio_get on
932 * the same buffer, it would deadlock if we waited.
933 */
934 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
935 return NULL;
936
937 b->hold_count++;
938 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
939 test_bit(B_WRITING, &b->state));
940 return b;
941}
942
943/*
944 * The endio routine for reading: set the error, clear the bit and wake up
945 * anyone waiting on the buffer.
946 */
947static void read_endio(struct bio *bio, int error)
948{
949 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
950
951 b->read_error = error;
952
953 BUG_ON(!test_bit(B_READING, &b->state));
954
955 smp_mb__before_clear_bit();
956 clear_bit(B_READING, &b->state);
957 smp_mb__after_clear_bit();
958
959 wake_up_bit(&b->state, B_READING);
960}
961
962/*
963 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
964 * functions is similar except that dm_bufio_new doesn't read the
965 * buffer from the disk (assuming that the caller overwrites all the data
966 * and uses dm_bufio_mark_buffer_dirty to write new data back).
967 */
968static void *new_read(struct dm_bufio_client *c, sector_t block,
969 enum new_flag nf, struct dm_buffer **bp)
970{
971 int need_submit;
972 struct dm_buffer *b;
973
974 dm_bufio_lock(c);
975 b = __bufio_new(c, block, nf, &need_submit);
976 dm_bufio_unlock(c);
977
978 if (!b)
979 return b;
980
981 if (need_submit)
982 submit_io(b, READ, b->block, read_endio);
983
984 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
985
986 if (b->read_error) {
987 int error = b->read_error;
988
989 dm_bufio_release(b);
990
991 return ERR_PTR(error);
992 }
993
994 *bp = b;
995
996 return b->data;
997}
998
999void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1000 struct dm_buffer **bp)
1001{
1002 return new_read(c, block, NF_GET, bp);
1003}
1004EXPORT_SYMBOL_GPL(dm_bufio_get);
1005
1006void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1007 struct dm_buffer **bp)
1008{
1009 BUG_ON(dm_bufio_in_request());
1010
1011 return new_read(c, block, NF_READ, bp);
1012}
1013EXPORT_SYMBOL_GPL(dm_bufio_read);
1014
1015void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1016 struct dm_buffer **bp)
1017{
1018 BUG_ON(dm_bufio_in_request());
1019
1020 return new_read(c, block, NF_FRESH, bp);
1021}
1022EXPORT_SYMBOL_GPL(dm_bufio_new);
1023
1024void dm_bufio_prefetch(struct dm_bufio_client *c,
1025 sector_t block, unsigned n_blocks)
1026{
1027 struct blk_plug plug;
1028
1029 blk_start_plug(&plug);
1030 dm_bufio_lock(c);
1031
1032 for (; n_blocks--; block++) {
1033 int need_submit;
1034 struct dm_buffer *b;
1035 b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
1036 if (unlikely(b != NULL)) {
1037 dm_bufio_unlock(c);
1038
1039 if (need_submit)
1040 submit_io(b, READ, b->block, read_endio);
1041 dm_bufio_release(b);
1042
1043 dm_bufio_cond_resched();
1044
1045 if (!n_blocks)
1046 goto flush_plug;
1047 dm_bufio_lock(c);
1048 }
1049
1050 }
1051
1052 dm_bufio_unlock(c);
1053
1054flush_plug:
1055 blk_finish_plug(&plug);
1056}
1057EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1058
1059void dm_bufio_release(struct dm_buffer *b)
1060{
1061 struct dm_bufio_client *c = b->c;
1062
1063 dm_bufio_lock(c);
1064
1065 BUG_ON(!b->hold_count);
1066
1067 b->hold_count--;
1068 if (!b->hold_count) {
1069 wake_up(&c->free_buffer_wait);
1070
1071 /*
1072 * If there were errors on the buffer, and the buffer is not
1073 * to be written, free the buffer. There is no point in caching
1074 * invalid buffer.
1075 */
1076 if ((b->read_error || b->write_error) &&
1077 !test_bit(B_READING, &b->state) &&
1078 !test_bit(B_WRITING, &b->state) &&
1079 !test_bit(B_DIRTY, &b->state)) {
1080 __unlink_buffer(b);
1081 __free_buffer_wake(b);
1082 }
1083 }
1084
1085 dm_bufio_unlock(c);
1086}
1087EXPORT_SYMBOL_GPL(dm_bufio_release);
1088
1089void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1090{
1091 struct dm_bufio_client *c = b->c;
1092
1093 dm_bufio_lock(c);
1094
1095 BUG_ON(test_bit(B_READING, &b->state));
1096
1097 if (!test_and_set_bit(B_DIRTY, &b->state))
1098 __relink_lru(b, LIST_DIRTY);
1099
1100 dm_bufio_unlock(c);
1101}
1102EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1103
1104void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1105{
1106 BUG_ON(dm_bufio_in_request());
1107
1108 dm_bufio_lock(c);
1109 __write_dirty_buffers_async(c, 0);
1110 dm_bufio_unlock(c);
1111}
1112EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1113
1114/*
1115 * For performance, it is essential that the buffers are written asynchronously
1116 * and simultaneously (so that the block layer can merge the writes) and then
1117 * waited upon.
1118 *
1119 * Finally, we flush hardware disk cache.
1120 */
1121int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1122{
1123 int a, f;
1124 unsigned long buffers_processed = 0;
1125 struct dm_buffer *b, *tmp;
1126
1127 dm_bufio_lock(c);
1128 __write_dirty_buffers_async(c, 0);
1129
1130again:
1131 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1132 int dropped_lock = 0;
1133
1134 if (buffers_processed < c->n_buffers[LIST_DIRTY])
1135 buffers_processed++;
1136
1137 BUG_ON(test_bit(B_READING, &b->state));
1138
1139 if (test_bit(B_WRITING, &b->state)) {
1140 if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1141 dropped_lock = 1;
1142 b->hold_count++;
1143 dm_bufio_unlock(c);
1144 wait_on_bit(&b->state, B_WRITING,
1145 do_io_schedule,
1146 TASK_UNINTERRUPTIBLE);
1147 dm_bufio_lock(c);
1148 b->hold_count--;
1149 } else
1150 wait_on_bit(&b->state, B_WRITING,
1151 do_io_schedule,
1152 TASK_UNINTERRUPTIBLE);
1153 }
1154
1155 if (!test_bit(B_DIRTY, &b->state) &&
1156 !test_bit(B_WRITING, &b->state))
1157 __relink_lru(b, LIST_CLEAN);
1158
1159 dm_bufio_cond_resched();
1160
1161 /*
1162 * If we dropped the lock, the list is no longer consistent,
1163 * so we must restart the search.
1164 *
1165 * In the most common case, the buffer just processed is
1166 * relinked to the clean list, so we won't loop scanning the
1167 * same buffer again and again.
1168 *
1169 * This may livelock if there is another thread simultaneously
1170 * dirtying buffers, so we count the number of buffers walked
1171 * and if it exceeds the total number of buffers, it means that
1172 * someone is doing some writes simultaneously with us. In
1173 * this case, stop, dropping the lock.
1174 */
1175 if (dropped_lock)
1176 goto again;
1177 }
1178 wake_up(&c->free_buffer_wait);
1179 dm_bufio_unlock(c);
1180
1181 a = xchg(&c->async_write_error, 0);
1182 f = dm_bufio_issue_flush(c);
1183 if (a)
1184 return a;
1185
1186 return f;
1187}
1188EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1189
1190/*
1191 * Use dm-io to send an empty barrier to flush the device.
1192 */
1193int dm_bufio_issue_flush(struct dm_bufio_client *c)
1194{
1195 struct dm_io_request io_req = {
1196 .bi_rw = REQ_FLUSH,
1197 .mem.type = DM_IO_KMEM,
1198 .mem.ptr.addr = NULL,
1199 .client = c->dm_io,
1200 };
1201 struct dm_io_region io_reg = {
1202 .bdev = c->bdev,
1203 .sector = 0,
1204 .count = 0,
1205 };
1206
1207 BUG_ON(dm_bufio_in_request());
1208
1209 return dm_io(&io_req, 1, &io_reg, NULL);
1210}
1211EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1212
1213/*
1214 * We first delete any other buffer that may be at that new location.
1215 *
1216 * Then, we write the buffer to the original location if it was dirty.
1217 *
1218 * Then, if we are the only one who is holding the buffer, relink the buffer
1219 * in the hash queue for the new location.
1220 *
1221 * If there was someone else holding the buffer, we write it to the new
1222 * location but not relink it, because that other user needs to have the buffer
1223 * at the same place.
1224 */
1225void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1226{
1227 struct dm_bufio_client *c = b->c;
1228 struct dm_buffer *new;
1229
1230 BUG_ON(dm_bufio_in_request());
1231
1232 dm_bufio_lock(c);
1233
1234retry:
1235 new = __find(c, new_block);
1236 if (new) {
1237 if (new->hold_count) {
1238 __wait_for_free_buffer(c);
1239 goto retry;
1240 }
1241
1242 /*
1243 * FIXME: Is there any point waiting for a write that's going
1244 * to be overwritten in a bit?
1245 */
1246 __make_buffer_clean(new);
1247 __unlink_buffer(new);
1248 __free_buffer_wake(new);
1249 }
1250
1251 BUG_ON(!b->hold_count);
1252 BUG_ON(test_bit(B_READING, &b->state));
1253
1254 __write_dirty_buffer(b);
1255 if (b->hold_count == 1) {
1256 wait_on_bit(&b->state, B_WRITING,
1257 do_io_schedule, TASK_UNINTERRUPTIBLE);
1258 set_bit(B_DIRTY, &b->state);
1259 __unlink_buffer(b);
1260 __link_buffer(b, new_block, LIST_DIRTY);
1261 } else {
1262 sector_t old_block;
1263 wait_on_bit_lock(&b->state, B_WRITING,
1264 do_io_schedule, TASK_UNINTERRUPTIBLE);
1265 /*
1266 * Relink buffer to "new_block" so that write_callback
1267 * sees "new_block" as a block number.
1268 * After the write, link the buffer back to old_block.
1269 * All this must be done in bufio lock, so that block number
1270 * change isn't visible to other threads.
1271 */
1272 old_block = b->block;
1273 __unlink_buffer(b);
1274 __link_buffer(b, new_block, b->list_mode);
1275 submit_io(b, WRITE, new_block, write_endio);
1276 wait_on_bit(&b->state, B_WRITING,
1277 do_io_schedule, TASK_UNINTERRUPTIBLE);
1278 __unlink_buffer(b);
1279 __link_buffer(b, old_block, b->list_mode);
1280 }
1281
1282 dm_bufio_unlock(c);
1283 dm_bufio_release(b);
1284}
1285EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1286
1287unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1288{
1289 return c->block_size;
1290}
1291EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1292
1293sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1294{
1295 return i_size_read(c->bdev->bd_inode) >>
1296 (SECTOR_SHIFT + c->sectors_per_block_bits);
1297}
1298EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1299
1300sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1301{
1302 return b->block;
1303}
1304EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1305
1306void *dm_bufio_get_block_data(struct dm_buffer *b)
1307{
1308 return b->data;
1309}
1310EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1311
1312void *dm_bufio_get_aux_data(struct dm_buffer *b)
1313{
1314 return b + 1;
1315}
1316EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1317
1318struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1319{
1320 return b->c;
1321}
1322EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1323
1324static void drop_buffers(struct dm_bufio_client *c)
1325{
1326 struct dm_buffer *b;
1327 int i;
1328
1329 BUG_ON(dm_bufio_in_request());
1330
1331 /*
1332 * An optimization so that the buffers are not written one-by-one.
1333 */
1334 dm_bufio_write_dirty_buffers_async(c);
1335
1336 dm_bufio_lock(c);
1337
1338 while ((b = __get_unclaimed_buffer(c)))
1339 __free_buffer_wake(b);
1340
1341 for (i = 0; i < LIST_SIZE; i++)
1342 list_for_each_entry(b, &c->lru[i], lru_list)
1343 DMERR("leaked buffer %llx, hold count %u, list %d",
1344 (unsigned long long)b->block, b->hold_count, i);
1345
1346 for (i = 0; i < LIST_SIZE; i++)
1347 BUG_ON(!list_empty(&c->lru[i]));
1348
1349 dm_bufio_unlock(c);
1350}
1351
1352/*
1353 * Test if the buffer is unused and too old, and commit it.
1354 * If noio is set, we must not do any I/O because we hold
1355 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
1356 * a different bufio client.
1357 */
1358static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
1359 unsigned long max_jiffies)
1360{
1361 if (jiffies - b->last_accessed < max_jiffies)
1362 return 1;
1363
1364 if (!(gfp & __GFP_IO)) {
1365 if (test_bit(B_READING, &b->state) ||
1366 test_bit(B_WRITING, &b->state) ||
1367 test_bit(B_DIRTY, &b->state))
1368 return 1;
1369 }
1370
1371 if (b->hold_count)
1372 return 1;
1373
1374 __make_buffer_clean(b);
1375 __unlink_buffer(b);
1376 __free_buffer_wake(b);
1377
1378 return 0;
1379}
1380
1381static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1382 struct shrink_control *sc)
1383{
1384 int l;
1385 struct dm_buffer *b, *tmp;
1386
1387 for (l = 0; l < LIST_SIZE; l++) {
1388 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
1389 if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
1390 !--nr_to_scan)
1391 return;
1392 dm_bufio_cond_resched();
1393 }
1394}
1395
1396static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
1397{
1398 struct dm_bufio_client *c =
1399 container_of(shrinker, struct dm_bufio_client, shrinker);
1400 unsigned long r;
1401 unsigned long nr_to_scan = sc->nr_to_scan;
1402
1403 if (sc->gfp_mask & __GFP_IO)
1404 dm_bufio_lock(c);
1405 else if (!dm_bufio_trylock(c))
1406 return !nr_to_scan ? 0 : -1;
1407
1408 if (nr_to_scan)
1409 __scan(c, nr_to_scan, sc);
1410
1411 r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1412 if (r > INT_MAX)
1413 r = INT_MAX;
1414
1415 dm_bufio_unlock(c);
1416
1417 return r;
1418}
1419
1420/*
1421 * Create the buffering interface
1422 */
1423struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1424 unsigned reserved_buffers, unsigned aux_size,
1425 void (*alloc_callback)(struct dm_buffer *),
1426 void (*write_callback)(struct dm_buffer *))
1427{
1428 int r;
1429 struct dm_bufio_client *c;
1430 unsigned i;
1431
1432 BUG_ON(block_size < 1 << SECTOR_SHIFT ||
1433 (block_size & (block_size - 1)));
1434
1435 c = kmalloc(sizeof(*c), GFP_KERNEL);
1436 if (!c) {
1437 r = -ENOMEM;
1438 goto bad_client;
1439 }
1440 c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
1441 if (!c->cache_hash) {
1442 r = -ENOMEM;
1443 goto bad_hash;
1444 }
1445
1446 c->bdev = bdev;
1447 c->block_size = block_size;
1448 c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
1449 c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
1450 ffs(block_size) - 1 - PAGE_SHIFT : 0;
1451 c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
1452 PAGE_SHIFT - (ffs(block_size) - 1) : 0);
1453
1454 c->aux_size = aux_size;
1455 c->alloc_callback = alloc_callback;
1456 c->write_callback = write_callback;
1457
1458 for (i = 0; i < LIST_SIZE; i++) {
1459 INIT_LIST_HEAD(&c->lru[i]);
1460 c->n_buffers[i] = 0;
1461 }
1462
1463 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1464 INIT_HLIST_HEAD(&c->cache_hash[i]);
1465
1466 mutex_init(&c->lock);
1467 INIT_LIST_HEAD(&c->reserved_buffers);
1468 c->need_reserved_buffers = reserved_buffers;
1469
1470 init_waitqueue_head(&c->free_buffer_wait);
1471 c->async_write_error = 0;
1472
1473 c->dm_io = dm_io_client_create();
1474 if (IS_ERR(c->dm_io)) {
1475 r = PTR_ERR(c->dm_io);
1476 goto bad_dm_io;
1477 }
1478
1479 mutex_lock(&dm_bufio_clients_lock);
1480 if (c->blocks_per_page_bits) {
1481 if (!DM_BUFIO_CACHE_NAME(c)) {
1482 DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
1483 if (!DM_BUFIO_CACHE_NAME(c)) {
1484 r = -ENOMEM;
1485 mutex_unlock(&dm_bufio_clients_lock);
1486 goto bad_cache;
1487 }
1488 }
1489
1490 if (!DM_BUFIO_CACHE(c)) {
1491 DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
1492 c->block_size,
1493 c->block_size, 0, NULL);
1494 if (!DM_BUFIO_CACHE(c)) {
1495 r = -ENOMEM;
1496 mutex_unlock(&dm_bufio_clients_lock);
1497 goto bad_cache;
1498 }
1499 }
1500 }
1501 mutex_unlock(&dm_bufio_clients_lock);
1502
1503 while (c->need_reserved_buffers) {
1504 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1505
1506 if (!b) {
1507 r = -ENOMEM;
1508 goto bad_buffer;
1509 }
1510 __free_buffer_wake(b);
1511 }
1512
1513 mutex_lock(&dm_bufio_clients_lock);
1514 dm_bufio_client_count++;
1515 list_add(&c->client_list, &dm_bufio_all_clients);
1516 __cache_size_refresh();
1517 mutex_unlock(&dm_bufio_clients_lock);
1518
1519 c->shrinker.shrink = shrink;
1520 c->shrinker.seeks = 1;
1521 c->shrinker.batch = 0;
1522 register_shrinker(&c->shrinker);
1523
1524 return c;
1525
1526bad_buffer:
1527bad_cache:
1528 while (!list_empty(&c->reserved_buffers)) {
1529 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1530 struct dm_buffer, lru_list);
1531 list_del(&b->lru_list);
1532 free_buffer(b);
1533 }
1534 dm_io_client_destroy(c->dm_io);
1535bad_dm_io:
1536 vfree(c->cache_hash);
1537bad_hash:
1538 kfree(c);
1539bad_client:
1540 return ERR_PTR(r);
1541}
1542EXPORT_SYMBOL_GPL(dm_bufio_client_create);
1543
1544/*
1545 * Free the buffering interface.
1546 * It is required that there are no references on any buffers.
1547 */
1548void dm_bufio_client_destroy(struct dm_bufio_client *c)
1549{
1550 unsigned i;
1551
1552 drop_buffers(c);
1553
1554 unregister_shrinker(&c->shrinker);
1555
1556 mutex_lock(&dm_bufio_clients_lock);
1557
1558 list_del(&c->client_list);
1559 dm_bufio_client_count--;
1560 __cache_size_refresh();
1561
1562 mutex_unlock(&dm_bufio_clients_lock);
1563
1564 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1565 BUG_ON(!hlist_empty(&c->cache_hash[i]));
1566
1567 BUG_ON(c->need_reserved_buffers);
1568
1569 while (!list_empty(&c->reserved_buffers)) {
1570 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1571 struct dm_buffer, lru_list);
1572 list_del(&b->lru_list);
1573 free_buffer(b);
1574 }
1575
1576 for (i = 0; i < LIST_SIZE; i++)
1577 if (c->n_buffers[i])
1578 DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1579
1580 for (i = 0; i < LIST_SIZE; i++)
1581 BUG_ON(c->n_buffers[i]);
1582
1583 dm_io_client_destroy(c->dm_io);
1584 vfree(c->cache_hash);
1585 kfree(c);
1586}
1587EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1588
1589static void cleanup_old_buffers(void)
1590{
1591 unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
1592 struct dm_bufio_client *c;
1593
1594 if (max_age > ULONG_MAX / HZ)
1595 max_age = ULONG_MAX / HZ;
1596
1597 mutex_lock(&dm_bufio_clients_lock);
1598 list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
1599 if (!dm_bufio_trylock(c))
1600 continue;
1601
1602 while (!list_empty(&c->lru[LIST_CLEAN])) {
1603 struct dm_buffer *b;
1604 b = list_entry(c->lru[LIST_CLEAN].prev,
1605 struct dm_buffer, lru_list);
1606 if (__cleanup_old_buffer(b, 0, max_age * HZ))
1607 break;
1608 dm_bufio_cond_resched();
1609 }
1610
1611 dm_bufio_unlock(c);
1612 dm_bufio_cond_resched();
1613 }
1614 mutex_unlock(&dm_bufio_clients_lock);
1615}
1616
1617static struct workqueue_struct *dm_bufio_wq;
1618static struct delayed_work dm_bufio_work;
1619
1620static void work_fn(struct work_struct *w)
1621{
1622 cleanup_old_buffers();
1623
1624 queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1625 DM_BUFIO_WORK_TIMER_SECS * HZ);
1626}
1627
1628/*----------------------------------------------------------------
1629 * Module setup
1630 *--------------------------------------------------------------*/
1631
1632/*
1633 * This is called only once for the whole dm_bufio module.
1634 * It initializes memory limit.
1635 */
1636static int __init dm_bufio_init(void)
1637{
1638 __u64 mem;
1639
1640 memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
1641 memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
1642
1643 mem = (__u64)((totalram_pages - totalhigh_pages) *
1644 DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
1645
1646 if (mem > ULONG_MAX)
1647 mem = ULONG_MAX;
1648
1649#ifdef CONFIG_MMU
1650 /*
1651 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
1652 * in fs/proc/internal.h
1653 */
1654 if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
1655 mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
1656#endif
1657
1658 dm_bufio_default_cache_size = mem;
1659
1660 mutex_lock(&dm_bufio_clients_lock);
1661 __cache_size_refresh();
1662 mutex_unlock(&dm_bufio_clients_lock);
1663
1664 dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
1665 if (!dm_bufio_wq)
1666 return -ENOMEM;
1667
1668 INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1669 queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1670 DM_BUFIO_WORK_TIMER_SECS * HZ);
1671
1672 return 0;
1673}
1674
1675/*
1676 * This is called once when unloading the dm_bufio module.
1677 */
1678static void __exit dm_bufio_exit(void)
1679{
1680 int bug = 0;
1681 int i;
1682
1683 cancel_delayed_work_sync(&dm_bufio_work);
1684 destroy_workqueue(dm_bufio_wq);
1685
1686 for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
1687 struct kmem_cache *kc = dm_bufio_caches[i];
1688
1689 if (kc)
1690 kmem_cache_destroy(kc);
1691 }
1692
1693 for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
1694 kfree(dm_bufio_cache_names[i]);
1695
1696 if (dm_bufio_client_count) {
1697 DMCRIT("%s: dm_bufio_client_count leaked: %d",
1698 __func__, dm_bufio_client_count);
1699 bug = 1;
1700 }
1701
1702 if (dm_bufio_current_allocated) {
1703 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
1704 __func__, dm_bufio_current_allocated);
1705 bug = 1;
1706 }
1707
1708 if (dm_bufio_allocated_get_free_pages) {
1709 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
1710 __func__, dm_bufio_allocated_get_free_pages);
1711 bug = 1;
1712 }
1713
1714 if (dm_bufio_allocated_vmalloc) {
1715 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
1716 __func__, dm_bufio_allocated_vmalloc);
1717 bug = 1;
1718 }
1719
1720 if (bug)
1721 BUG();
1722}
1723
1724module_init(dm_bufio_init)
1725module_exit(dm_bufio_exit)
1726
1727module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
1728MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1729
1730module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1731MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1732
1733module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1734MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1735
1736module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
1737MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
1738
1739module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
1740MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
1741
1742module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
1743MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
1744
1745module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
1746MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
1747
1748MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
1749MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
1750MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
deleted file mode 100644
index b142946a9e3..00000000000
--- a/drivers/md/dm-bufio.h
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * Copyright (C) 2009-2011 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * This file is released under the GPL.
7 */
8
9#ifndef DM_BUFIO_H
10#define DM_BUFIO_H
11
12#include <linux/blkdev.h>
13#include <linux/types.h>
14
15/*----------------------------------------------------------------*/
16
17struct dm_bufio_client;
18struct dm_buffer;
19
20/*
21 * Create a buffered IO cache on a given device
22 */
23struct dm_bufio_client *
24dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
25 unsigned reserved_buffers, unsigned aux_size,
26 void (*alloc_callback)(struct dm_buffer *),
27 void (*write_callback)(struct dm_buffer *));
28
29/*
30 * Release a buffered IO cache.
31 */
32void dm_bufio_client_destroy(struct dm_bufio_client *c);
33
34/*
35 * WARNING: to avoid deadlocks, these conditions are observed:
36 *
37 * - At most one thread can hold at most "reserved_buffers" simultaneously.
38 * - Each other thread can hold at most one buffer.
39 * - Threads which call only dm_bufio_get can hold an unlimited number of
40 * buffers.
41 */
42
43/*
44 * Read a given block from disk. Returns pointer to data. Returns a
45 * pointer to dm_buffer that can be used to release the buffer or to make
46 * it dirty.
47 */
48void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
49 struct dm_buffer **bp);
50
51/*
52 * Like dm_bufio_read, but return buffer from cache, don't read
53 * it. If the buffer is not in the cache, return NULL.
54 */
55void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
56 struct dm_buffer **bp);
57
58/*
59 * Like dm_bufio_read, but don't read anything from the disk. It is
60 * expected that the caller initializes the buffer and marks it dirty.
61 */
62void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
63 struct dm_buffer **bp);
64
65/*
66 * Prefetch the specified blocks to the cache.
67 * The function starts to read the blocks and returns without waiting for
68 * I/O to finish.
69 */
70void dm_bufio_prefetch(struct dm_bufio_client *c,
71 sector_t block, unsigned n_blocks);
72
73/*
74 * Release a reference obtained with dm_bufio_{read,get,new}. The data
 75 * pointer and dm_buffer pointer are no longer valid after this call.
76 */
77void dm_bufio_release(struct dm_buffer *b);
78
79/*
80 * Mark a buffer dirty. It should be called after the buffer is modified.
81 *
82 * In case of memory pressure, the buffer may be written after
83 * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So
84 * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but
85 * the actual writing may occur earlier.
86 */
87void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
88
89/*
90 * Initiate writing of dirty buffers, without waiting for completion.
91 */
92void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c);
93
94/*
95 * Write all dirty buffers. Guarantees that all dirty buffers created prior
96 * to this call are on disk when this call exits.
97 */
98int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
99
100/*
 101 * Send an empty write barrier to the device to flush the hardware disk cache.
102 */
103int dm_bufio_issue_flush(struct dm_bufio_client *c);
104
105/*
106 * Like dm_bufio_release but also move the buffer to the new
107 * block. dm_bufio_write_dirty_buffers is needed to commit the new block.
108 */
109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
110
111unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
112sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
113sector_t dm_bufio_get_block_number(struct dm_buffer *b);
114void *dm_bufio_get_block_data(struct dm_buffer *b);
115void *dm_bufio_get_aux_data(struct dm_buffer *b);
116struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b);
117
118/*----------------------------------------------------------------*/
119
120#endif
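
To make the interface above concrete, a minimal usage sketch follows (illustrative only, not part of the patch; it assumes the usual ERR_PTR convention on the create/read error paths, an already-opened block device, and the hypothetical example_update_block() name):

#include <linux/err.h>
#include "dm-bufio.h"

/* Read one block, modify it, and push it to disk. */
static int example_update_block(struct block_device *bdev, sector_t block)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	u8 *data;
	int r;

	/* 4 KiB blocks, one reserved buffer, no aux data, no callbacks */
	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	data = dm_bufio_read(c, block, &b);	/* read the block and pin the buffer */
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}

	data[0] ^= 0xff;			/* modify the cached data */
	dm_bufio_mark_buffer_dirty(b);		/* must follow the modification */
	dm_bufio_release(b);			/* unpin; the data may stay cached */

	r = dm_bufio_write_dirty_buffers(c);	/* prior dirty buffers reach the disk */
out:
	dm_bufio_client_destroy(c);
	return r;
}

Per the rules above, this holds at most one buffer at a time, so it stays well inside the per-thread limits.
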
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f7369f9d859..1f1d3423d39 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,14 +18,10 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h> 21#include <asm/atomic.h>
22#include <linux/atomic.h>
23#include <linux/scatterlist.h> 22#include <linux/scatterlist.h>
24#include <asm/page.h> 23#include <asm/page.h>
25#include <asm/unaligned.h> 24#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
29 25
30#include <linux/device-mapper.h> 26#include <linux/device-mapper.h>
31 27
@@ -42,21 +38,21 @@ struct convert_context {
42 unsigned int offset_out; 38 unsigned int offset_out;
43 unsigned int idx_in; 39 unsigned int idx_in;
44 unsigned int idx_out; 40 unsigned int idx_out;
45 sector_t cc_sector; 41 sector_t sector;
46 atomic_t cc_pending; 42 atomic_t pending;
47}; 43};
48 44
49/* 45/*
50 * per bio private data 46 * per bio private data
51 */ 47 */
52struct dm_crypt_io { 48struct dm_crypt_io {
53 struct crypt_config *cc; 49 struct dm_target *target;
54 struct bio *base_bio; 50 struct bio *base_bio;
55 struct work_struct work; 51 struct work_struct work;
56 52
57 struct convert_context ctx; 53 struct convert_context ctx;
58 54
59 atomic_t io_pending; 55 atomic_t pending;
60 int error; 56 int error;
61 sector_t sector; 57 sector_t sector;
62 struct dm_crypt_io *base_io; 58 struct dm_crypt_io *base_io;
@@ -66,7 +62,6 @@ struct dm_crypt_request {
66 struct convert_context *ctx; 62 struct convert_context *ctx;
67 struct scatterlist sg_in; 63 struct scatterlist sg_in;
68 struct scatterlist sg_out; 64 struct scatterlist sg_out;
69 sector_t iv_sector;
70}; 65};
71 66
72struct crypt_config; 67struct crypt_config;
@@ -77,13 +72,11 @@ struct crypt_iv_operations {
77 void (*dtr)(struct crypt_config *cc); 72 void (*dtr)(struct crypt_config *cc);
78 int (*init)(struct crypt_config *cc); 73 int (*init)(struct crypt_config *cc);
79 int (*wipe)(struct crypt_config *cc); 74 int (*wipe)(struct crypt_config *cc);
80 int (*generator)(struct crypt_config *cc, u8 *iv, 75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
81 struct dm_crypt_request *dmreq);
82 int (*post)(struct crypt_config *cc, u8 *iv,
83 struct dm_crypt_request *dmreq);
84}; 76};
85 77
86struct iv_essiv_private { 78struct iv_essiv_private {
79 struct crypto_cipher *tfm;
87 struct crypto_hash *hash_tfm; 80 struct crypto_hash *hash_tfm;
88 u8 *salt; 81 u8 *salt;
89}; 82};
@@ -92,29 +85,11 @@ struct iv_benbi_private {
92 int shift; 85 int shift;
93}; 86};
94 87
95#define LMK_SEED_SIZE 64 /* hash + 0 */
96struct iv_lmk_private {
97 struct crypto_shash *hash_tfm;
98 u8 *seed;
99};
100
101/* 88/*
102 * Crypt: maps a linear range of a block device 89 * Crypt: maps a linear range of a block device
103 * and encrypts / decrypts at the same time. 90 * and encrypts / decrypts at the same time.
104 */ 91 */
105enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 92enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
106
107/*
108 * Duplicated per-CPU state for cipher.
109 */
110struct crypt_cpu {
111 struct ablkcipher_request *req;
112};
113
114/*
115 * The fields in here must be read only after initialization,
116 * changing state should be in crypt_cpu.
117 */
118struct crypt_config { 93struct crypt_config {
119 struct dm_dev *dev; 94 struct dm_dev *dev;
120 sector_t start; 95 sector_t start;
@@ -138,23 +113,11 @@ struct crypt_config {
138 union { 113 union {
139 struct iv_essiv_private essiv; 114 struct iv_essiv_private essiv;
140 struct iv_benbi_private benbi; 115 struct iv_benbi_private benbi;
141 struct iv_lmk_private lmk;
142 } iv_gen_private; 116 } iv_gen_private;
143 sector_t iv_offset; 117 sector_t iv_offset;
144 unsigned int iv_size; 118 unsigned int iv_size;
145 119
146 /* 120 /*
147 * Duplicated per cpu state. Access through
148 * per_cpu_ptr() only.
149 */
150 struct crypt_cpu __percpu *cpu;
151
152 /* ESSIV: struct crypto_cipher *essiv_tfm */
153 void *iv_private;
154 struct crypto_ablkcipher **tfms;
155 unsigned tfms_count;
156
157 /*
158 * Layout of each crypto request: 121 * Layout of each crypto request:
159 * 122 *
160 * struct ablkcipher_request 123 * struct ablkcipher_request
@@ -168,34 +131,22 @@ struct crypt_config {
168 * correctly aligned. 131 * correctly aligned.
169 */ 132 */
170 unsigned int dmreq_start; 133 unsigned int dmreq_start;
134 struct ablkcipher_request *req;
171 135
136 struct crypto_ablkcipher *tfm;
172 unsigned long flags; 137 unsigned long flags;
173 unsigned int key_size; 138 unsigned int key_size;
174 unsigned int key_parts;
175 u8 key[0]; 139 u8 key[0];
176}; 140};
177 141
178#define MIN_IOS 16 142#define MIN_IOS 16
179#define MIN_POOL_PAGES 32 143#define MIN_POOL_PAGES 32
144#define MIN_BIO_PAGES 8
180 145
181static struct kmem_cache *_crypt_io_pool; 146static struct kmem_cache *_crypt_io_pool;
182 147
183static void clone_init(struct dm_crypt_io *, struct bio *); 148static void clone_init(struct dm_crypt_io *, struct bio *);
184static void kcryptd_queue_crypt(struct dm_crypt_io *io); 149static void kcryptd_queue_crypt(struct dm_crypt_io *io);
185static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
186
187static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
188{
189 return this_cpu_ptr(cc->cpu);
190}
191
192/*
193 * Use this to access cipher attributes that are the same for each CPU.
194 */
195static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
196{
197 return cc->tfms[0];
198}
199 150
200/* 151/*
201 * Different IV generation algorithms: 152 * Different IV generation algorithms:
@@ -216,38 +167,23 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
216 * null: the initial vector is always zero. Provides compatibility with 167 * null: the initial vector is always zero. Provides compatibility with
217 * obsolete loop_fish2 devices. Do not use for new devices. 168 * obsolete loop_fish2 devices. Do not use for new devices.
218 * 169 *
219 * lmk: Compatible implementation of the block chaining mode used
220 * by the Loop-AES block device encryption system
221 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
222 * It operates on full 512 byte sectors and uses CBC
223 * with an IV derived from the sector number, the data and
 224 * optionally an extra IV seed.
 225 * This means that after decryption the first block
 226 * of the sector must be tweaked according to the decrypted data.
227 * Loop-AES can use three encryption schemes:
228 * version 1: is plain aes-cbc mode
229 * version 2: uses 64 multikey scheme with lmk IV generator
230 * version 3: the same as version 2 with additional IV seed
231 * (it uses 65 keys, last key is used as IV seed)
232 *
233 * plumb: unimplemented, see: 170 * plumb: unimplemented, see:
234 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 171 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
235 */ 172 */
236 173
237static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, 174static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
238 struct dm_crypt_request *dmreq)
239{ 175{
240 memset(iv, 0, cc->iv_size); 176 memset(iv, 0, cc->iv_size);
241 *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); 177 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
242 178
243 return 0; 179 return 0;
244} 180}
245 181
246static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 182static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
247 struct dm_crypt_request *dmreq) 183 sector_t sector)
248{ 184{
249 memset(iv, 0, cc->iv_size); 185 memset(iv, 0, cc->iv_size);
250 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 186 *(u64 *)iv = cpu_to_le64(sector);
251 187
252 return 0; 188 return 0;
253} 189}
@@ -258,7 +194,6 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
258 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 194 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
259 struct hash_desc desc; 195 struct hash_desc desc;
260 struct scatterlist sg; 196 struct scatterlist sg;
261 struct crypto_cipher *essiv_tfm;
262 int err; 197 int err;
263 198
264 sg_init_one(&sg, cc->key, cc->key_size); 199 sg_init_one(&sg, cc->key, cc->key_size);
@@ -269,14 +204,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
269 if (err) 204 if (err)
270 return err; 205 return err;
271 206
272 essiv_tfm = cc->iv_private; 207 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
273 208 crypto_hash_digestsize(essiv->hash_tfm));
274 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
275 crypto_hash_digestsize(essiv->hash_tfm));
276 if (err)
277 return err;
278
279 return 0;
280} 209}
281 210
282/* Wipe salt and reset key derived from volume key */ 211/* Wipe salt and reset key derived from volume key */
@@ -284,69 +213,24 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
284{ 213{
285 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 214 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
286 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 215 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
287 struct crypto_cipher *essiv_tfm;
288 int r, err = 0;
289 216
290 memset(essiv->salt, 0, salt_size); 217 memset(essiv->salt, 0, salt_size);
291 218
292 essiv_tfm = cc->iv_private; 219 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
293 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
294 if (r)
295 err = r;
296
297 return err;
298}
299
300/* Set up per cpu cipher state */
301static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
302 struct dm_target *ti,
303 u8 *salt, unsigned saltsize)
304{
305 struct crypto_cipher *essiv_tfm;
306 int err;
307
308 /* Setup the essiv_tfm with the given salt */
309 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
310 if (IS_ERR(essiv_tfm)) {
311 ti->error = "Error allocating crypto tfm for ESSIV";
312 return essiv_tfm;
313 }
314
315 if (crypto_cipher_blocksize(essiv_tfm) !=
316 crypto_ablkcipher_ivsize(any_tfm(cc))) {
317 ti->error = "Block size of ESSIV cipher does "
318 "not match IV size of block cipher";
319 crypto_free_cipher(essiv_tfm);
320 return ERR_PTR(-EINVAL);
321 }
322
323 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
324 if (err) {
325 ti->error = "Failed to set key for ESSIV cipher";
326 crypto_free_cipher(essiv_tfm);
327 return ERR_PTR(err);
328 }
329
330 return essiv_tfm;
331} 220}
332 221
333static void crypt_iv_essiv_dtr(struct crypt_config *cc) 222static void crypt_iv_essiv_dtr(struct crypt_config *cc)
334{ 223{
335 struct crypto_cipher *essiv_tfm;
336 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 224 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
337 225
226 crypto_free_cipher(essiv->tfm);
227 essiv->tfm = NULL;
228
338 crypto_free_hash(essiv->hash_tfm); 229 crypto_free_hash(essiv->hash_tfm);
339 essiv->hash_tfm = NULL; 230 essiv->hash_tfm = NULL;
340 231
341 kzfree(essiv->salt); 232 kzfree(essiv->salt);
342 essiv->salt = NULL; 233 essiv->salt = NULL;
343
344 essiv_tfm = cc->iv_private;
345
346 if (essiv_tfm)
347 crypto_free_cipher(essiv_tfm);
348
349 cc->iv_private = NULL;
350} 234}
351 235
352static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 236static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -377,42 +261,48 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
377 goto bad; 261 goto bad;
378 } 262 }
379 263
380 cc->iv_gen_private.essiv.salt = salt; 264 /* Allocate essiv_tfm */
381 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 265 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
382
383 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
384 crypto_hash_digestsize(hash_tfm));
385 if (IS_ERR(essiv_tfm)) { 266 if (IS_ERR(essiv_tfm)) {
386 crypt_iv_essiv_dtr(cc); 267 ti->error = "Error allocating crypto tfm for ESSIV";
387 return PTR_ERR(essiv_tfm); 268 err = PTR_ERR(essiv_tfm);
269 goto bad;
388 } 270 }
389 cc->iv_private = essiv_tfm; 271 if (crypto_cipher_blocksize(essiv_tfm) !=
272 crypto_ablkcipher_ivsize(cc->tfm)) {
273 ti->error = "Block size of ESSIV cipher does "
274 "not match IV size of block cipher";
275 err = -EINVAL;
276 goto bad;
277 }
278
279 cc->iv_gen_private.essiv.salt = salt;
280 cc->iv_gen_private.essiv.tfm = essiv_tfm;
281 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
390 282
391 return 0; 283 return 0;
392 284
393bad: 285bad:
286 if (essiv_tfm && !IS_ERR(essiv_tfm))
287 crypto_free_cipher(essiv_tfm);
394 if (hash_tfm && !IS_ERR(hash_tfm)) 288 if (hash_tfm && !IS_ERR(hash_tfm))
395 crypto_free_hash(hash_tfm); 289 crypto_free_hash(hash_tfm);
396 kfree(salt); 290 kfree(salt);
397 return err; 291 return err;
398} 292}
399 293
400static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, 294static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
401 struct dm_crypt_request *dmreq)
402{ 295{
403 struct crypto_cipher *essiv_tfm = cc->iv_private;
404
405 memset(iv, 0, cc->iv_size); 296 memset(iv, 0, cc->iv_size);
406 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 297 *(u64 *)iv = cpu_to_le64(sector);
407 crypto_cipher_encrypt_one(essiv_tfm, iv, iv); 298 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
408
409 return 0; 299 return 0;
410} 300}
411 301
412static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 302static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
413 const char *opts) 303 const char *opts)
414{ 304{
415 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); 305 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
416 int log = ilog2(bs); 306 int log = ilog2(bs);
417 307
418 /* we need to calculate how far we must shift the sector count 308 /* we need to calculate how far we must shift the sector count
@@ -437,177 +327,25 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
437{ 327{
438} 328}
439 329
440static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, 330static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
441 struct dm_crypt_request *dmreq)
442{ 331{
443 __be64 val; 332 __be64 val;
444 333
445 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 334 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
446 335
447 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); 336 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
448 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 337 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
449 338
450 return 0; 339 return 0;
451} 340}
452 341
453static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, 342static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
454 struct dm_crypt_request *dmreq)
455{ 343{
456 memset(iv, 0, cc->iv_size); 344 memset(iv, 0, cc->iv_size);
457 345
458 return 0; 346 return 0;
459} 347}
460 348
461static void crypt_iv_lmk_dtr(struct crypt_config *cc)
462{
463 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
464
465 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
466 crypto_free_shash(lmk->hash_tfm);
467 lmk->hash_tfm = NULL;
468
469 kzfree(lmk->seed);
470 lmk->seed = NULL;
471}
472
473static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
474 const char *opts)
475{
476 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
477
478 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
479 if (IS_ERR(lmk->hash_tfm)) {
480 ti->error = "Error initializing LMK hash";
481 return PTR_ERR(lmk->hash_tfm);
482 }
483
484 /* No seed in LMK version 2 */
485 if (cc->key_parts == cc->tfms_count) {
486 lmk->seed = NULL;
487 return 0;
488 }
489
490 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
491 if (!lmk->seed) {
492 crypt_iv_lmk_dtr(cc);
493 ti->error = "Error kmallocing seed storage in LMK";
494 return -ENOMEM;
495 }
496
497 return 0;
498}
499
500static int crypt_iv_lmk_init(struct crypt_config *cc)
501{
502 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
503 int subkey_size = cc->key_size / cc->key_parts;
504
505 /* LMK seed is on the position of LMK_KEYS + 1 key */
506 if (lmk->seed)
507 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
508 crypto_shash_digestsize(lmk->hash_tfm));
509
510 return 0;
511}
512
513static int crypt_iv_lmk_wipe(struct crypt_config *cc)
514{
515 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
516
517 if (lmk->seed)
518 memset(lmk->seed, 0, LMK_SEED_SIZE);
519
520 return 0;
521}
522
523static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
524 struct dm_crypt_request *dmreq,
525 u8 *data)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528 struct {
529 struct shash_desc desc;
530 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
531 } sdesc;
532 struct md5_state md5state;
533 u32 buf[4];
534 int i, r;
535
536 sdesc.desc.tfm = lmk->hash_tfm;
537 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
538
539 r = crypto_shash_init(&sdesc.desc);
540 if (r)
541 return r;
542
543 if (lmk->seed) {
544 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
545 if (r)
546 return r;
547 }
548
549 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
550 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
551 if (r)
552 return r;
553
554 /* Sector is cropped to 56 bits here */
555 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
556 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
557 buf[2] = cpu_to_le32(4024);
558 buf[3] = 0;
559 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
560 if (r)
561 return r;
562
563 /* No MD5 padding here */
564 r = crypto_shash_export(&sdesc.desc, &md5state);
565 if (r)
566 return r;
567
568 for (i = 0; i < MD5_HASH_WORDS; i++)
569 __cpu_to_le32s(&md5state.hash[i]);
570 memcpy(iv, &md5state.hash, cc->iv_size);
571
572 return 0;
573}
574
575static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
576 struct dm_crypt_request *dmreq)
577{
578 u8 *src;
579 int r = 0;
580
581 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
582 src = kmap_atomic(sg_page(&dmreq->sg_in));
583 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
584 kunmap_atomic(src);
585 } else
586 memset(iv, 0, cc->iv_size);
587
588 return r;
589}
590
591static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
592 struct dm_crypt_request *dmreq)
593{
594 u8 *dst;
595 int r;
596
597 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
598 return 0;
599
600 dst = kmap_atomic(sg_page(&dmreq->sg_out));
601 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
602
603 /* Tweak the first block of plaintext sector */
604 if (!r)
605 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
606
607 kunmap_atomic(dst);
608 return r;
609}
610
611static struct crypt_iv_operations crypt_iv_plain_ops = { 349static struct crypt_iv_operations crypt_iv_plain_ops = {
612 .generator = crypt_iv_plain_gen 350 .generator = crypt_iv_plain_gen
613}; 351};
@@ -634,15 +372,6 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
634 .generator = crypt_iv_null_gen 372 .generator = crypt_iv_null_gen
635}; 373};
636 374
637static struct crypt_iv_operations crypt_iv_lmk_ops = {
638 .ctr = crypt_iv_lmk_ctr,
639 .dtr = crypt_iv_lmk_dtr,
640 .init = crypt_iv_lmk_init,
641 .wipe = crypt_iv_lmk_wipe,
642 .generator = crypt_iv_lmk_gen,
643 .post = crypt_iv_lmk_post
644};
645
646static void crypt_convert_init(struct crypt_config *cc, 375static void crypt_convert_init(struct crypt_config *cc,
647 struct convert_context *ctx, 376 struct convert_context *ctx,
648 struct bio *bio_out, struct bio *bio_in, 377 struct bio *bio_out, struct bio *bio_in,
@@ -654,7 +383,7 @@ static void crypt_convert_init(struct crypt_config *cc,
654 ctx->offset_out = 0; 383 ctx->offset_out = 0;
655 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 384 ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
656 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 385 ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
657 ctx->cc_sector = sector + cc->iv_offset; 386 ctx->sector = sector + cc->iv_offset;
658 init_completion(&ctx->restart); 387 init_completion(&ctx->restart);
659} 388}
660 389
@@ -670,13 +399,6 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
670 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 399 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
671} 400}
672 401
673static u8 *iv_of_dmreq(struct crypt_config *cc,
674 struct dm_crypt_request *dmreq)
675{
676 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
677 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
678}
679
680static int crypt_convert_block(struct crypt_config *cc, 402static int crypt_convert_block(struct crypt_config *cc,
681 struct convert_context *ctx, 403 struct convert_context *ctx,
682 struct ablkcipher_request *req) 404 struct ablkcipher_request *req)
@@ -685,12 +407,12 @@ static int crypt_convert_block(struct crypt_config *cc,
685 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); 407 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
686 struct dm_crypt_request *dmreq; 408 struct dm_crypt_request *dmreq;
687 u8 *iv; 409 u8 *iv;
688 int r; 410 int r = 0;
689 411
690 dmreq = dmreq_of_req(cc, req); 412 dmreq = dmreq_of_req(cc, req);
691 iv = iv_of_dmreq(cc, dmreq); 413 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
414 crypto_ablkcipher_alignmask(cc->tfm) + 1);
692 415
693 dmreq->iv_sector = ctx->cc_sector;
694 dmreq->ctx = ctx; 416 dmreq->ctx = ctx;
695 sg_init_table(&dmreq->sg_in, 1); 417 sg_init_table(&dmreq->sg_in, 1);
696 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 418 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -713,7 +435,7 @@ static int crypt_convert_block(struct crypt_config *cc,
713 } 435 }
714 436
715 if (cc->iv_gen_ops) { 437 if (cc->iv_gen_ops) {
716 r = cc->iv_gen_ops->generator(cc, iv, dmreq); 438 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
717 if (r < 0) 439 if (r < 0)
718 return r; 440 return r;
719 } 441 }
@@ -726,28 +448,21 @@ static int crypt_convert_block(struct crypt_config *cc,
726 else 448 else
727 r = crypto_ablkcipher_decrypt(req); 449 r = crypto_ablkcipher_decrypt(req);
728 450
729 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
730 r = cc->iv_gen_ops->post(cc, iv, dmreq);
731
732 return r; 451 return r;
733} 452}
734 453
735static void kcryptd_async_done(struct crypto_async_request *async_req, 454static void kcryptd_async_done(struct crypto_async_request *async_req,
736 int error); 455 int error);
737
738static void crypt_alloc_req(struct crypt_config *cc, 456static void crypt_alloc_req(struct crypt_config *cc,
739 struct convert_context *ctx) 457 struct convert_context *ctx)
740{ 458{
741 struct crypt_cpu *this_cc = this_crypt_config(cc); 459 if (!cc->req)
742 unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); 460 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
743 461 ablkcipher_request_set_tfm(cc->req, cc->tfm);
744 if (!this_cc->req) 462 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
745 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 463 CRYPTO_TFM_REQ_MAY_SLEEP,
746 464 kcryptd_async_done,
747 ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); 465 dmreq_of_req(cc, cc->req));
748 ablkcipher_request_set_callback(this_cc->req,
749 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
750 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
751} 466}
752 467
753/* 468/*
@@ -756,19 +471,18 @@ static void crypt_alloc_req(struct crypt_config *cc,
756static int crypt_convert(struct crypt_config *cc, 471static int crypt_convert(struct crypt_config *cc,
757 struct convert_context *ctx) 472 struct convert_context *ctx)
758{ 473{
759 struct crypt_cpu *this_cc = this_crypt_config(cc);
760 int r; 474 int r;
761 475
762 atomic_set(&ctx->cc_pending, 1); 476 atomic_set(&ctx->pending, 1);
763 477
764 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 478 while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
765 ctx->idx_out < ctx->bio_out->bi_vcnt) { 479 ctx->idx_out < ctx->bio_out->bi_vcnt) {
766 480
767 crypt_alloc_req(cc, ctx); 481 crypt_alloc_req(cc, ctx);
768 482
769 atomic_inc(&ctx->cc_pending); 483 atomic_inc(&ctx->pending);
770 484
771 r = crypt_convert_block(cc, ctx, this_cc->req); 485 r = crypt_convert_block(cc, ctx, cc->req);
772 486
773 switch (r) { 487 switch (r) {
774 /* async */ 488 /* async */
@@ -777,20 +491,20 @@ static int crypt_convert(struct crypt_config *cc,
777 INIT_COMPLETION(ctx->restart); 491 INIT_COMPLETION(ctx->restart);
778 /* fall through*/ 492 /* fall through*/
779 case -EINPROGRESS: 493 case -EINPROGRESS:
780 this_cc->req = NULL; 494 cc->req = NULL;
781 ctx->cc_sector++; 495 ctx->sector++;
782 continue; 496 continue;
783 497
784 /* sync */ 498 /* sync */
785 case 0: 499 case 0:
786 atomic_dec(&ctx->cc_pending); 500 atomic_dec(&ctx->pending);
787 ctx->cc_sector++; 501 ctx->sector++;
788 cond_resched(); 502 cond_resched();
789 continue; 503 continue;
790 504
791 /* error */ 505 /* error */
792 default: 506 default:
793 atomic_dec(&ctx->cc_pending); 507 atomic_dec(&ctx->pending);
794 return r; 508 return r;
795 } 509 }
796 } 510 }
@@ -798,6 +512,14 @@ static int crypt_convert(struct crypt_config *cc,
798 return 0; 512 return 0;
799} 513}
800 514
515static void dm_crypt_bio_destructor(struct bio *bio)
516{
517 struct dm_crypt_io *io = bio->bi_private;
518 struct crypt_config *cc = io->target->private;
519
520 bio_free(bio, cc->bs);
521}
522
801/* 523/*
802 * Generate a new unfragmented bio with the given size 524 * Generate a new unfragmented bio with the given size
803 * This should never violate the device limitations 525 * This should never violate the device limitations
@@ -807,7 +529,7 @@ static int crypt_convert(struct crypt_config *cc,
807static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, 529static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
808 unsigned *out_of_pages) 530 unsigned *out_of_pages)
809{ 531{
810 struct crypt_config *cc = io->cc; 532 struct crypt_config *cc = io->target->private;
811 struct bio *clone; 533 struct bio *clone;
812 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 534 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
813 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; 535 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
@@ -829,11 +551,12 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
829 } 551 }
830 552
831 /* 553 /*
832 * If additional pages cannot be allocated without waiting, 554 * if additional pages cannot be allocated without waiting,
833 * return a partially-allocated bio. The caller will then try 555 * return a partially allocated bio, the caller will then try
834 * to allocate more bios while submitting this partial bio. 556 * to allocate additional bios while submitting this partial bio
835 */ 557 */
836 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; 558 if (i == (MIN_BIO_PAGES - 1))
559 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
837 560
838 len = (size > PAGE_SIZE) ? PAGE_SIZE : size; 561 len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
839 562
@@ -866,25 +589,26 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
866 } 589 }
867} 590}
868 591
869static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, 592static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
870 struct bio *bio, sector_t sector) 593 struct bio *bio, sector_t sector)
871{ 594{
595 struct crypt_config *cc = ti->private;
872 struct dm_crypt_io *io; 596 struct dm_crypt_io *io;
873 597
874 io = mempool_alloc(cc->io_pool, GFP_NOIO); 598 io = mempool_alloc(cc->io_pool, GFP_NOIO);
875 io->cc = cc; 599 io->target = ti;
876 io->base_bio = bio; 600 io->base_bio = bio;
877 io->sector = sector; 601 io->sector = sector;
878 io->error = 0; 602 io->error = 0;
879 io->base_io = NULL; 603 io->base_io = NULL;
880 atomic_set(&io->io_pending, 0); 604 atomic_set(&io->pending, 0);
881 605
882 return io; 606 return io;
883} 607}
884 608
885static void crypt_inc_pending(struct dm_crypt_io *io) 609static void crypt_inc_pending(struct dm_crypt_io *io)
886{ 610{
887 atomic_inc(&io->io_pending); 611 atomic_inc(&io->pending);
888} 612}
889 613
890/* 614/*
@@ -894,12 +618,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
894 */ 618 */
895static void crypt_dec_pending(struct dm_crypt_io *io) 619static void crypt_dec_pending(struct dm_crypt_io *io)
896{ 620{
897 struct crypt_config *cc = io->cc; 621 struct crypt_config *cc = io->target->private;
898 struct bio *base_bio = io->base_bio; 622 struct bio *base_bio = io->base_bio;
899 struct dm_crypt_io *base_io = io->base_io; 623 struct dm_crypt_io *base_io = io->base_io;
900 int error = io->error; 624 int error = io->error;
901 625
902 if (!atomic_dec_and_test(&io->io_pending)) 626 if (!atomic_dec_and_test(&io->pending))
903 return; 627 return;
904 628
905 mempool_free(io, cc->io_pool); 629 mempool_free(io, cc->io_pool);
@@ -926,14 +650,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
926 * They must be separated as otherwise the final stages could be 650 * They must be separated as otherwise the final stages could be
927 * starved by new requests which can block in the first stages due 651 * starved by new requests which can block in the first stages due
928 * to memory allocation. 652 * to memory allocation.
929 *
930 * The work is done per CPU global for all dm-crypt instances.
931 * They should not depend on each other and do not block.
932 */ 653 */
933static void crypt_endio(struct bio *clone, int error) 654static void crypt_endio(struct bio *clone, int error)
934{ 655{
935 struct dm_crypt_io *io = clone->bi_private; 656 struct dm_crypt_io *io = clone->bi_private;
936 struct crypt_config *cc = io->cc; 657 struct crypt_config *cc = io->target->private;
937 unsigned rw = bio_data_dir(clone); 658 unsigned rw = bio_data_dir(clone);
938 659
939 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) 660 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
@@ -960,36 +681,44 @@ static void crypt_endio(struct bio *clone, int error)
960 681
961static void clone_init(struct dm_crypt_io *io, struct bio *clone) 682static void clone_init(struct dm_crypt_io *io, struct bio *clone)
962{ 683{
963 struct crypt_config *cc = io->cc; 684 struct crypt_config *cc = io->target->private;
964 685
965 clone->bi_private = io; 686 clone->bi_private = io;
966 clone->bi_end_io = crypt_endio; 687 clone->bi_end_io = crypt_endio;
967 clone->bi_bdev = cc->dev->bdev; 688 clone->bi_bdev = cc->dev->bdev;
968 clone->bi_rw = io->base_bio->bi_rw; 689 clone->bi_rw = io->base_bio->bi_rw;
690 clone->bi_destructor = dm_crypt_bio_destructor;
969} 691}
970 692
971static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 693static void kcryptd_io_read(struct dm_crypt_io *io)
972{ 694{
973 struct crypt_config *cc = io->cc; 695 struct crypt_config *cc = io->target->private;
974 struct bio *base_bio = io->base_bio; 696 struct bio *base_bio = io->base_bio;
975 struct bio *clone; 697 struct bio *clone;
976 698
699 crypt_inc_pending(io);
700
977 /* 701 /*
978 * The block layer might modify the bvec array, so always 702 * The block layer might modify the bvec array, so always
979 * copy the required bvecs because we need the original 703 * copy the required bvecs because we need the original
980 * one in order to decrypt the whole bio data *afterwards*. 704 * one in order to decrypt the whole bio data *afterwards*.
981 */ 705 */
982 clone = bio_clone_bioset(base_bio, gfp, cc->bs); 706 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
983 if (!clone) 707 if (unlikely(!clone)) {
984 return 1; 708 io->error = -ENOMEM;
985 709 crypt_dec_pending(io);
986 crypt_inc_pending(io); 710 return;
711 }
987 712
988 clone_init(io, clone); 713 clone_init(io, clone);
714 clone->bi_idx = 0;
715 clone->bi_vcnt = bio_segments(base_bio);
716 clone->bi_size = base_bio->bi_size;
989 clone->bi_sector = cc->start + io->sector; 717 clone->bi_sector = cc->start + io->sector;
718 memcpy(clone->bi_io_vec, bio_iovec(base_bio),
719 sizeof(struct bio_vec) * clone->bi_vcnt);
990 720
991 generic_make_request(clone); 721 generic_make_request(clone);
992 return 0;
993} 722}
994 723
995static void kcryptd_io_write(struct dm_crypt_io *io) 724static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -1002,31 +731,30 @@ static void kcryptd_io(struct work_struct *work)
1002{ 731{
1003 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 732 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
1004 733
1005 if (bio_data_dir(io->base_bio) == READ) { 734 if (bio_data_dir(io->base_bio) == READ)
1006 crypt_inc_pending(io); 735 kcryptd_io_read(io);
1007 if (kcryptd_io_read(io, GFP_NOIO)) 736 else
1008 io->error = -ENOMEM;
1009 crypt_dec_pending(io);
1010 } else
1011 kcryptd_io_write(io); 737 kcryptd_io_write(io);
1012} 738}
1013 739
1014static void kcryptd_queue_io(struct dm_crypt_io *io) 740static void kcryptd_queue_io(struct dm_crypt_io *io)
1015{ 741{
1016 struct crypt_config *cc = io->cc; 742 struct crypt_config *cc = io->target->private;
1017 743
1018 INIT_WORK(&io->work, kcryptd_io); 744 INIT_WORK(&io->work, kcryptd_io);
1019 queue_work(cc->io_queue, &io->work); 745 queue_work(cc->io_queue, &io->work);
1020} 746}
1021 747
1022static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) 748static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
749 int error, int async)
1023{ 750{
1024 struct bio *clone = io->ctx.bio_out; 751 struct bio *clone = io->ctx.bio_out;
1025 struct crypt_config *cc = io->cc; 752 struct crypt_config *cc = io->target->private;
1026 753
1027 if (unlikely(io->error < 0)) { 754 if (unlikely(error < 0)) {
1028 crypt_free_buffer_pages(cc, clone); 755 crypt_free_buffer_pages(cc, clone);
1029 bio_put(clone); 756 bio_put(clone);
757 io->error = -EIO;
1030 crypt_dec_pending(io); 758 crypt_dec_pending(io);
1031 return; 759 return;
1032 } 760 }
@@ -1044,7 +772,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1044 772
1045static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) 773static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1046{ 774{
1047 struct crypt_config *cc = io->cc; 775 struct crypt_config *cc = io->target->private;
1048 struct bio *clone; 776 struct bio *clone;
1049 struct dm_crypt_io *new_io; 777 struct dm_crypt_io *new_io;
1050 int crypt_finished; 778 int crypt_finished;
@@ -1077,16 +805,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1077 sector += bio_sectors(clone); 805 sector += bio_sectors(clone);
1078 806
1079 crypt_inc_pending(io); 807 crypt_inc_pending(io);
1080
1081 r = crypt_convert(cc, &io->ctx); 808 r = crypt_convert(cc, &io->ctx);
1082 if (r < 0) 809 crypt_finished = atomic_dec_and_test(&io->ctx.pending);
1083 io->error = -EIO;
1084
1085 crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
1086 810
1087 /* Encryption was already finished, submit io now */ 811 /* Encryption was already finished, submit io now */
1088 if (crypt_finished) { 812 if (crypt_finished) {
1089 kcryptd_crypt_write_io_submit(io, 0); 813 kcryptd_crypt_write_io_submit(io, r, 0);
1090 814
1091 /* 815 /*
1092 * If there was an error, do not try next fragments. 816 * If there was an error, do not try next fragments.
@@ -1110,7 +834,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1110 * between fragments, so switch to a new dm_crypt_io structure. 834 * between fragments, so switch to a new dm_crypt_io structure.
1111 */ 835 */
1112 if (unlikely(!crypt_finished && remaining)) { 836 if (unlikely(!crypt_finished && remaining)) {
1113 new_io = crypt_io_alloc(io->cc, io->base_bio, 837 new_io = crypt_io_alloc(io->target, io->base_bio,
1114 sector); 838 sector);
1115 crypt_inc_pending(new_io); 839 crypt_inc_pending(new_io);
1116 crypt_convert_init(cc, &new_io->ctx, NULL, 840 crypt_convert_init(cc, &new_io->ctx, NULL,
@@ -1137,14 +861,17 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1137 crypt_dec_pending(io); 861 crypt_dec_pending(io);
1138} 862}
1139 863
1140static void kcryptd_crypt_read_done(struct dm_crypt_io *io) 864static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
1141{ 865{
866 if (unlikely(error < 0))
867 io->error = -EIO;
868
1142 crypt_dec_pending(io); 869 crypt_dec_pending(io);
1143} 870}
1144 871
1145static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) 872static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1146{ 873{
1147 struct crypt_config *cc = io->cc; 874 struct crypt_config *cc = io->target->private;
1148 int r = 0; 875 int r = 0;
1149 876
1150 crypt_inc_pending(io); 877 crypt_inc_pending(io);
@@ -1153,11 +880,9 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1153 io->sector); 880 io->sector);
1154 881
1155 r = crypt_convert(cc, &io->ctx); 882 r = crypt_convert(cc, &io->ctx);
1156 if (r < 0)
1157 io->error = -EIO;
1158 883
1159 if (atomic_dec_and_test(&io->ctx.cc_pending)) 884 if (atomic_dec_and_test(&io->ctx.pending))
1160 kcryptd_crypt_read_done(io); 885 kcryptd_crypt_read_done(io, r);
1161 886
1162 crypt_dec_pending(io); 887 crypt_dec_pending(io);
1163} 888}
@@ -1168,28 +893,22 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1168 struct dm_crypt_request *dmreq = async_req->data; 893 struct dm_crypt_request *dmreq = async_req->data;
1169 struct convert_context *ctx = dmreq->ctx; 894 struct convert_context *ctx = dmreq->ctx;
1170 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); 895 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1171 struct crypt_config *cc = io->cc; 896 struct crypt_config *cc = io->target->private;
1172 897
1173 if (error == -EINPROGRESS) { 898 if (error == -EINPROGRESS) {
1174 complete(&ctx->restart); 899 complete(&ctx->restart);
1175 return; 900 return;
1176 } 901 }
1177 902
1178 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1179 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1180
1181 if (error < 0)
1182 io->error = -EIO;
1183
1184 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 903 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1185 904
1186 if (!atomic_dec_and_test(&ctx->cc_pending)) 905 if (!atomic_dec_and_test(&ctx->pending))
1187 return; 906 return;
1188 907
1189 if (bio_data_dir(io->base_bio) == READ) 908 if (bio_data_dir(io->base_bio) == READ)
1190 kcryptd_crypt_read_done(io); 909 kcryptd_crypt_read_done(io, error);
1191 else 910 else
1192 kcryptd_crypt_write_io_submit(io, 1); 911 kcryptd_crypt_write_io_submit(io, error, 1);
1193} 912}
1194 913
1195static void kcryptd_crypt(struct work_struct *work) 914static void kcryptd_crypt(struct work_struct *work)
@@ -1204,7 +923,7 @@ static void kcryptd_crypt(struct work_struct *work)
1204 923
1205static void kcryptd_queue_crypt(struct dm_crypt_io *io) 924static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1206{ 925{
1207 struct crypt_config *cc = io->cc; 926 struct crypt_config *cc = io->target->private;
1208 927
1209 INIT_WORK(&io->work, kcryptd_crypt); 928 INIT_WORK(&io->work, kcryptd_crypt);
1210 queue_work(cc->crypt_queue, &io->work); 929 queue_work(cc->crypt_queue, &io->work);
@@ -1216,6 +935,7 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1216static int crypt_decode_key(u8 *key, char *hex, unsigned int size) 935static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1217{ 936{
1218 char buffer[3]; 937 char buffer[3];
938 char *endp;
1219 unsigned int i; 939 unsigned int i;
1220 940
1221 buffer[2] = '\0'; 941 buffer[2] = '\0';
@@ -1224,7 +944,9 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1224 buffer[0] = *hex++; 944 buffer[0] = *hex++;
1225 buffer[1] = *hex++; 945 buffer[1] = *hex++;
1226 946
1227 if (kstrtou8(buffer, 16, &key[i])) 947 key[i] = (u8)simple_strtoul(buffer, &endp, 16);
948
949 if (endp != &buffer[2])
1228 return -EINVAL; 950 return -EINVAL;
1229 } 951 }
1230 952
@@ -1248,101 +970,34 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
1248 } 970 }
1249} 971}
1250 972
1251static void crypt_free_tfms(struct crypt_config *cc)
1252{
1253 unsigned i;
1254
1255 if (!cc->tfms)
1256 return;
1257
1258 for (i = 0; i < cc->tfms_count; i++)
1259 if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
1260 crypto_free_ablkcipher(cc->tfms[i]);
1261 cc->tfms[i] = NULL;
1262 }
1263
1264 kfree(cc->tfms);
1265 cc->tfms = NULL;
1266}
1267
1268static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1269{
1270 unsigned i;
1271 int err;
1272
1273 cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
1274 GFP_KERNEL);
1275 if (!cc->tfms)
1276 return -ENOMEM;
1277
1278 for (i = 0; i < cc->tfms_count; i++) {
1279 cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1280 if (IS_ERR(cc->tfms[i])) {
1281 err = PTR_ERR(cc->tfms[i]);
1282 crypt_free_tfms(cc);
1283 return err;
1284 }
1285 }
1286
1287 return 0;
1288}
1289
1290static int crypt_setkey_allcpus(struct crypt_config *cc)
1291{
1292 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1293 int err = 0, i, r;
1294
1295 for (i = 0; i < cc->tfms_count; i++) {
1296 r = crypto_ablkcipher_setkey(cc->tfms[i],
1297 cc->key + (i * subkey_size),
1298 subkey_size);
1299 if (r)
1300 err = r;
1301 }
1302
1303 return err;
1304}
1305
1306static int crypt_set_key(struct crypt_config *cc, char *key) 973static int crypt_set_key(struct crypt_config *cc, char *key)
1307{ 974{
1308 int r = -EINVAL;
1309 int key_string_len = strlen(key);
1310
1311 /* The key size may not be changed. */ 975 /* The key size may not be changed. */
1312 if (cc->key_size != (key_string_len >> 1)) 976 if (cc->key_size != (strlen(key) >> 1))
1313 goto out; 977 return -EINVAL;
1314 978
1315 /* Hyphen (which gives a key_size of zero) means there is no key. */ 979 /* Hyphen (which gives a key_size of zero) means there is no key. */
1316 if (!cc->key_size && strcmp(key, "-")) 980 if (!cc->key_size && strcmp(key, "-"))
1317 goto out; 981 return -EINVAL;
1318 982
1319 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) 983 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
1320 goto out; 984 return -EINVAL;
1321 985
1322 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 986 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
1323 987
1324 r = crypt_setkey_allcpus(cc); 988 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
1325
1326out:
1327 /* Hex key string not needed after here, so wipe it. */
1328 memset(key, '0', key_string_len);
1329
1330 return r;
1331} 989}
1332 990
1333static int crypt_wipe_key(struct crypt_config *cc) 991static int crypt_wipe_key(struct crypt_config *cc)
1334{ 992{
1335 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 993 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
1336 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 994 memset(&cc->key, 0, cc->key_size * sizeof(u8));
1337 995 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
1338 return crypt_setkey_allcpus(cc);
1339} 996}
1340 997
1341static void crypt_dtr(struct dm_target *ti) 998static void crypt_dtr(struct dm_target *ti)
1342{ 999{
1343 struct crypt_config *cc = ti->private; 1000 struct crypt_config *cc = ti->private;
1344 struct crypt_cpu *cpu_cc;
1345 int cpu;
1346 1001
1347 ti->private = NULL; 1002 ti->private = NULL;
1348 1003
@@ -1354,15 +1009,6 @@ static void crypt_dtr(struct dm_target *ti)
1354 if (cc->crypt_queue) 1009 if (cc->crypt_queue)
1355 destroy_workqueue(cc->crypt_queue); 1010 destroy_workqueue(cc->crypt_queue);
1356 1011
1357 if (cc->cpu)
1358 for_each_possible_cpu(cpu) {
1359 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1360 if (cpu_cc->req)
1361 mempool_free(cpu_cc->req, cc->req_pool);
1362 }
1363
1364 crypt_free_tfms(cc);
1365
1366 if (cc->bs) 1012 if (cc->bs)
1367 bioset_free(cc->bs); 1013 bioset_free(cc->bs);
1368 1014
@@ -1376,12 +1022,12 @@ static void crypt_dtr(struct dm_target *ti)
1376 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1022 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1377 cc->iv_gen_ops->dtr(cc); 1023 cc->iv_gen_ops->dtr(cc);
1378 1024
1025 if (cc->tfm && !IS_ERR(cc->tfm))
1026 crypto_free_ablkcipher(cc->tfm);
1027
1379 if (cc->dev) 1028 if (cc->dev)
1380 dm_put_device(ti, cc->dev); 1029 dm_put_device(ti, cc->dev);
1381 1030
1382 if (cc->cpu)
1383 free_percpu(cc->cpu);
1384
1385 kzfree(cc->cipher); 1031 kzfree(cc->cipher);
1386 kzfree(cc->cipher_string); 1032 kzfree(cc->cipher_string);
1387 1033
@@ -1393,10 +1039,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1393 char *cipher_in, char *key) 1039 char *cipher_in, char *key)
1394{ 1040{
1395 struct crypt_config *cc = ti->private; 1041 struct crypt_config *cc = ti->private;
1396 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1042 char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
1397 char *cipher_api = NULL; 1043 char *cipher_api = NULL;
1398 int ret = -EINVAL; 1044 int ret = -EINVAL;
1399 char dummy;
1400 1045
1401 /* Convert to crypto api definition? */ 1046 /* Convert to crypto api definition? */
1402 if (strchr(cipher_in, '(')) { 1047 if (strchr(cipher_in, '(')) {
@@ -1410,20 +1055,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1410 1055
1411 /* 1056 /*
1412 * Legacy dm-crypt cipher specification 1057 * Legacy dm-crypt cipher specification
1413 * cipher[:keycount]-mode-iv:ivopts 1058 * cipher-mode-iv:ivopts
1414 */ 1059 */
1415 tmp = cipher_in; 1060 tmp = cipher_in;
1416 keycount = strsep(&tmp, "-"); 1061 cipher = strsep(&tmp, "-");
1417 cipher = strsep(&keycount, ":");
1418
1419 if (!keycount)
1420 cc->tfms_count = 1;
1421 else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
1422 !is_power_of_2(cc->tfms_count)) {
1423 ti->error = "Bad cipher key count specification";
1424 return -EINVAL;
1425 }
1426 cc->key_parts = cc->tfms_count;
1427 1062
1428 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1063 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1429 if (!cc->cipher) 1064 if (!cc->cipher)
@@ -1436,13 +1071,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1436 if (tmp) 1071 if (tmp)
1437 DMWARN("Ignoring unexpected additional cipher options"); 1072 DMWARN("Ignoring unexpected additional cipher options");
1438 1073
1439 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
1440 __alignof__(struct crypt_cpu));
1441 if (!cc->cpu) {
1442 ti->error = "Cannot allocate per cpu state";
1443 goto bad_mem;
1444 }
1445
1446 /* 1074 /*
1447 * For compatibility with the original dm-crypt mapping format, if 1075 * For compatibility with the original dm-crypt mapping format, if
1448 * only the cipher name is supplied, use cbc-plain. 1076 * only the cipher name is supplied, use cbc-plain.
@@ -1469,8 +1097,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1469 } 1097 }
1470 1098
1471 /* Allocate cipher */ 1099 /* Allocate cipher */
1472 ret = crypt_alloc_tfms(cc, cipher_api); 1100 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
1473 if (ret < 0) { 1101 if (IS_ERR(cc->tfm)) {
1102 ret = PTR_ERR(cc->tfm);
1474 ti->error = "Error allocating crypto tfm"; 1103 ti->error = "Error allocating crypto tfm";
1475 goto bad; 1104 goto bad;
1476 } 1105 }
@@ -1483,7 +1112,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1483 } 1112 }
1484 1113
1485 /* Initialize IV */ 1114 /* Initialize IV */
1486 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); 1115 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
1487 if (cc->iv_size) 1116 if (cc->iv_size)
1488 /* at least a 64 bit sector number should fit in our buffer */ 1117 /* at least a 64 bit sector number should fit in our buffer */
1489 cc->iv_size = max(cc->iv_size, 1118 cc->iv_size = max(cc->iv_size,
@@ -1506,15 +1135,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1506 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1135 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1507 else if (strcmp(ivmode, "null") == 0) 1136 else if (strcmp(ivmode, "null") == 0)
1508 cc->iv_gen_ops = &crypt_iv_null_ops; 1137 cc->iv_gen_ops = &crypt_iv_null_ops;
1509 else if (strcmp(ivmode, "lmk") == 0) { 1138 else {
1510 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1511 /* Version 2 and 3 is recognised according
1512 * to length of provided multi-key string.
1513 * If present (version 3), last key is used as IV seed.
1514 */
1515 if (cc->key_size % cc->key_parts)
1516 cc->key_parts++;
1517 } else {
1518 ret = -EINVAL; 1139 ret = -EINVAL;
1519 ti->error = "Invalid IV mode"; 1140 ti->error = "Invalid IV mode";
1520 goto bad; 1141 goto bad;
@@ -1560,7 +1181,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1560 int ret; 1181 int ret;
1561 struct dm_arg_set as; 1182 struct dm_arg_set as;
1562 const char *opt_string; 1183 const char *opt_string;
1563 char dummy;
1564 1184
1565 static struct dm_arg _args[] = { 1185 static struct dm_arg _args[] = {
1566 {0, 1, "Invalid number of feature args"}, 1186 {0, 1, "Invalid number of feature args"},
@@ -1593,9 +1213,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1593 } 1213 }
1594 1214
1595 cc->dmreq_start = sizeof(struct ablkcipher_request); 1215 cc->dmreq_start = sizeof(struct ablkcipher_request);
1596 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); 1216 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
1597 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1217 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1598 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & 1218 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
1599 ~(crypto_tfm_ctx_alignment() - 1); 1219 ~(crypto_tfm_ctx_alignment() - 1);
1600 1220
1601 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1221 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1604,6 +1224,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1604 ti->error = "Cannot allocate crypt request mempool"; 1224 ti->error = "Cannot allocate crypt request mempool";
1605 goto bad; 1225 goto bad;
1606 } 1226 }
1227 cc->req = NULL;
1607 1228
1608 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1229 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1609 if (!cc->page_pool) { 1230 if (!cc->page_pool) {
@@ -1618,7 +1239,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1618 } 1239 }
1619 1240
1620 ret = -EINVAL; 1241 ret = -EINVAL;
1621 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { 1242 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1622 ti->error = "Invalid iv_offset sector"; 1243 ti->error = "Invalid iv_offset sector";
1623 goto bad; 1244 goto bad;
1624 } 1245 }
@@ -1629,7 +1250,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1629 goto bad; 1250 goto bad;
1630 } 1251 }
1631 1252
1632 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { 1253 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
1633 ti->error = "Invalid device sector"; 1254 ti->error = "Invalid device sector";
1634 goto bad; 1255 goto bad;
1635 } 1256 }
@@ -1660,27 +1281,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1660 } 1281 }
1661 1282
1662 ret = -ENOMEM; 1283 ret = -ENOMEM;
1663 cc->io_queue = alloc_workqueue("kcryptd_io", 1284 cc->io_queue = create_singlethread_workqueue("kcryptd_io");
1664 WQ_NON_REENTRANT|
1665 WQ_MEM_RECLAIM,
1666 1);
1667 if (!cc->io_queue) { 1285 if (!cc->io_queue) {
1668 ti->error = "Couldn't create kcryptd io queue"; 1286 ti->error = "Couldn't create kcryptd io queue";
1669 goto bad; 1287 goto bad;
1670 } 1288 }
1671 1289
1672 cc->crypt_queue = alloc_workqueue("kcryptd", 1290 cc->crypt_queue = create_singlethread_workqueue("kcryptd");
1673 WQ_NON_REENTRANT|
1674 WQ_CPU_INTENSIVE|
1675 WQ_MEM_RECLAIM,
1676 1);
1677 if (!cc->crypt_queue) { 1291 if (!cc->crypt_queue) {
1678 ti->error = "Couldn't create kcryptd queue"; 1292 ti->error = "Couldn't create kcryptd queue";
1679 goto bad; 1293 goto bad;
1680 } 1294 }
1681 1295
1682 ti->num_flush_requests = 1; 1296 ti->num_flush_requests = 1;
1683 ti->discard_zeroes_data_unsupported = true; 1297 ti->discard_zeroes_data_unsupported = 1;
1684 1298
1685 return 0; 1299 return 0;
1686 1300
@@ -1689,10 +1303,11 @@ bad:
1689 return ret; 1303 return ret;
1690} 1304}
1691 1305
1692static int crypt_map(struct dm_target *ti, struct bio *bio) 1306static int crypt_map(struct dm_target *ti, struct bio *bio,
1307 union map_info *map_context)
1693{ 1308{
1694 struct dm_crypt_io *io; 1309 struct dm_crypt_io *io;
1695 struct crypt_config *cc = ti->private; 1310 struct crypt_config *cc;
1696 1311
1697 /* 1312 /*
1698 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. 1313 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
@@ -1700,25 +1315,25 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1700 * - for REQ_DISCARD caller must use flush if IO ordering matters 1315 * - for REQ_DISCARD caller must use flush if IO ordering matters
1701 */ 1316 */
1702 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { 1317 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
1318 cc = ti->private;
1703 bio->bi_bdev = cc->dev->bdev; 1319 bio->bi_bdev = cc->dev->bdev;
1704 if (bio_sectors(bio)) 1320 if (bio_sectors(bio))
1705 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); 1321 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
1706 return DM_MAPIO_REMAPPED; 1322 return DM_MAPIO_REMAPPED;
1707 } 1323 }
1708 1324
1709 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); 1325 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1710 1326
1711 if (bio_data_dir(io->base_bio) == READ) { 1327 if (bio_data_dir(io->base_bio) == READ)
1712 if (kcryptd_io_read(io, GFP_NOWAIT)) 1328 kcryptd_queue_io(io);
1713 kcryptd_queue_io(io); 1329 else
1714 } else
1715 kcryptd_queue_crypt(io); 1330 kcryptd_queue_crypt(io);
1716 1331
1717 return DM_MAPIO_SUBMITTED; 1332 return DM_MAPIO_SUBMITTED;
1718} 1333}
1719 1334
1720static int crypt_status(struct dm_target *ti, status_type_t type, 1335static int crypt_status(struct dm_target *ti, status_type_t type,
1721 unsigned status_flags, char *result, unsigned maxlen) 1336 char *result, unsigned int maxlen)
1722{ 1337{
1723 struct crypt_config *cc = ti->private; 1338 struct crypt_config *cc = ti->private;
1724 unsigned int sz = 0; 1339 unsigned int sz = 0;
@@ -1845,7 +1460,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1845 1460
1846static struct target_type crypt_target = { 1461static struct target_type crypt_target = {
1847 .name = "crypt", 1462 .name = "crypt",
1848 .version = {1, 12, 0}, 1463 .version = {1, 8, 0},
1849 .module = THIS_MODULE, 1464 .module = THIS_MODULE,
1850 .ctr = crypt_ctr, 1465 .ctr = crypt_ctr,
1851 .dtr = crypt_dtr, 1466 .dtr = crypt_dtr,
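
The constructor hunks above (and the matching ones in dm-delay, dm-flakey, dm-linear, dm-log, dm-mpath and dm-queue-length further down) differ only in how they parse numeric table arguments: one variant uses a bare "%llu" conversion, the other appends "%c" and a dummy char so that any trailing garbage makes sscanf() return 2 instead of 1 and the argument is rejected. A minimal userspace sketch of that idiom follows; it is a standalone illustration, not the kernel code, and parse_sector is an invented helper name.

#include <stdio.h>

/*
 * Return 0 and store the value on success, -1 if the string is not a
 * clean unsigned number (empty, non-numeric, or followed by junk).
 */
static int parse_sector(const char *arg, unsigned long long *out)
{
        unsigned long long tmpll;
        char dummy;

        /* A second successful conversion means trailing characters. */
        if (sscanf(arg, "%llu%c", &tmpll, &dummy) != 1)
                return -1;

        *out = tmpll;
        return 0;
}

int main(void)
{
        unsigned long long s;

        printf("\"1024\"  -> %d\n", parse_sector("1024", &s));  /* 0, accepted  */
        printf("\"1024x\" -> %d\n", parse_sector("1024x", &s)); /* -1, rejected */
        printf("\"\"      -> %d\n", parse_sector("", &s));      /* -1, rejected */
        return 0;
}
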
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc1bd048acb..f18375dcedd 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,7 +131,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
131{ 131{
132 struct delay_c *dc; 132 struct delay_c *dc;
133 unsigned long long tmpll; 133 unsigned long long tmpll;
134 char dummy;
135 134
136 if (argc != 3 && argc != 6) { 135 if (argc != 3 && argc != 6) {
137 ti->error = "requires exactly 3 or 6 arguments"; 136 ti->error = "requires exactly 3 or 6 arguments";
@@ -146,13 +145,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
146 145
147 dc->reads = dc->writes = 0; 146 dc->reads = dc->writes = 0;
148 147
149 if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { 148 if (sscanf(argv[1], "%llu", &tmpll) != 1) {
150 ti->error = "Invalid device sector"; 149 ti->error = "Invalid device sector";
151 goto bad; 150 goto bad;
152 } 151 }
153 dc->start_read = tmpll; 152 dc->start_read = tmpll;
154 153
155 if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { 154 if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
156 ti->error = "Invalid delay"; 155 ti->error = "Invalid delay";
157 goto bad; 156 goto bad;
158 } 157 }
@@ -167,13 +166,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 if (argc == 3) 166 if (argc == 3)
168 goto out; 167 goto out;
169 168
170 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { 169 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
171 ti->error = "Invalid write device sector"; 170 ti->error = "Invalid write device sector";
172 goto bad_dev_read; 171 goto bad_dev_read;
173 } 172 }
174 dc->start_write = tmpll; 173 dc->start_write = tmpll;
175 174
176 if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { 175 if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
177 ti->error = "Invalid write delay"; 176 ti->error = "Invalid write delay";
178 goto bad_dev_read; 177 goto bad_dev_read;
179 } 178 }
@@ -274,7 +273,8 @@ static void delay_resume(struct dm_target *ti)
274 atomic_set(&dc->may_delay, 1); 273 atomic_set(&dc->may_delay, 1);
275} 274}
276 275
277static int delay_map(struct dm_target *ti, struct bio *bio) 276static int delay_map(struct dm_target *ti, struct bio *bio,
277 union map_info *map_context)
278{ 278{
279 struct delay_c *dc = ti->private; 279 struct delay_c *dc = ti->private;
280 280
@@ -294,7 +294,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
294} 294}
295 295
296static int delay_status(struct dm_target *ti, status_type_t type, 296static int delay_status(struct dm_target *ti, status_type_t type,
297 unsigned status_flags, char *result, unsigned maxlen) 297 char *result, unsigned maxlen)
298{ 298{
299 struct delay_c *dc = ti->private; 299 struct delay_c *dc = ti->private;
300 int sz = 0; 300 int sz = 0;
@@ -337,7 +337,7 @@ out:
337 337
338static struct target_type delay_target = { 338static struct target_type delay_target = {
339 .name = "delay", 339 .name = "delay",
340 .version = {1, 2, 0}, 340 .version = {1, 1, 0},
341 .module = THIS_MODULE, 341 .module = THIS_MODULE,
342 .ctr = delay_ctr, 342 .ctr = delay_ctr,
343 .dtr = delay_dtr, 343 .dtr = delay_dtr,
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index ebaa4f803ee..0bdb201c2c2 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -11,7 +11,6 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/module.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
16 15
17#define DM_MSG_PREFIX "snapshot exception stores" 16#define DM_MSG_PREFIX "snapshot exception stores"
@@ -142,19 +141,24 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister);
142static int set_chunk_size(struct dm_exception_store *store, 141static int set_chunk_size(struct dm_exception_store *store,
143 const char *chunk_size_arg, char **error) 142 const char *chunk_size_arg, char **error)
144{ 143{
145 unsigned chunk_size; 144 unsigned long chunk_size_ulong;
145 char *value;
146 146
147 if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { 147 chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
148 if (*chunk_size_arg == '\0' || *value != '\0' ||
149 chunk_size_ulong > UINT_MAX) {
148 *error = "Invalid chunk size"; 150 *error = "Invalid chunk size";
149 return -EINVAL; 151 return -EINVAL;
150 } 152 }
151 153
152 if (!chunk_size) { 154 if (!chunk_size_ulong) {
153 store->chunk_size = store->chunk_mask = store->chunk_shift = 0; 155 store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
154 return 0; 156 return 0;
155 } 157 }
156 158
157 return dm_exception_store_set_chunk_size(store, chunk_size, error); 159 return dm_exception_store_set_chunk_size(store,
160 (unsigned) chunk_size_ulong,
161 error);
158} 162}
159 163
160int dm_exception_store_set_chunk_size(struct dm_exception_store *store, 164int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
@@ -278,7 +282,7 @@ int dm_exception_store_init(void)
278 return 0; 282 return 0;
279 283
280persistent_fail: 284persistent_fail:
281 dm_transient_snapshot_exit(); 285 dm_persistent_snapshot_exit();
282transient_fail: 286transient_fail:
283 return r; 287 return r;
284} 288}
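
The set_chunk_size() hunk shows two ways of turning the chunk-size string into an unsigned int: kstrtouint() on one side, and simple_strtoul() plus explicit checks for an empty string, trailing characters and overflow on the other. A rough userspace equivalent of the second pattern, built on strtoul(), is sketched below; the kernel helpers differ in detail (kstrtouint, for instance, rejects leading whitespace that strtoul accepts).

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Parse a decimal unsigned int, rejecting empty input, trailing junk and
 * values that do not fit. Returns 0 on success, -EINVAL otherwise.
 */
static int parse_chunk_size(const char *arg, unsigned int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(arg, &end, 10);

        if (arg[0] == '\0' || *end != '\0')     /* empty or trailing characters */
                return -EINVAL;
        if (errno == ERANGE || val > UINT_MAX)  /* out of range for unsigned int */
                return -EINVAL;

        *out = (unsigned int)val;
        return 0;
}

int main(void)
{
        unsigned int cs;

        printf("\"512\"  -> %d\n", parse_chunk_size("512", &cs));   /* 0 */
        printf("\"512k\" -> %d\n", parse_chunk_size("512k", &cs));  /* -EINVAL */
        return 0;
}
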
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9721f2ffb1a..f84c08029b2 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -39,10 +39,6 @@ enum feature_flag_bits {
39 DROP_WRITES 39 DROP_WRITES
40}; 40};
41 41
42struct per_bio_data {
43 bool bio_submitted;
44};
45
46static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, 42static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
47 struct dm_target *ti) 43 struct dm_target *ti)
48{ 44{
@@ -164,7 +160,6 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
164 unsigned long long tmpll; 160 unsigned long long tmpll;
165 struct dm_arg_set as; 161 struct dm_arg_set as;
166 const char *devname; 162 const char *devname;
167 char dummy;
168 163
169 as.argc = argc; 164 as.argc = argc;
170 as.argv = argv; 165 as.argv = argv;
@@ -183,7 +178,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
183 178
184 devname = dm_shift_arg(&as); 179 devname = dm_shift_arg(&as);
185 180
186 if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { 181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
187 ti->error = "Invalid device sector"; 182 ti->error = "Invalid device sector";
188 goto bad; 183 goto bad;
189 } 184 }
@@ -218,7 +213,6 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
218 213
219 ti->num_flush_requests = 1; 214 ti->num_flush_requests = 1;
220 ti->num_discard_requests = 1; 215 ti->num_discard_requests = 1;
221 ti->per_bio_data_size = sizeof(struct per_bio_data);
222 ti->private = fc; 216 ti->private = fc;
223 return 0; 217 return 0;
224 218
@@ -270,12 +264,11 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
270 } 264 }
271} 265}
272 266
273static int flakey_map(struct dm_target *ti, struct bio *bio) 267static int flakey_map(struct dm_target *ti, struct bio *bio,
268 union map_info *map_context)
274{ 269{
275 struct flakey_c *fc = ti->private; 270 struct flakey_c *fc = ti->private;
276 unsigned elapsed; 271 unsigned elapsed;
277 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
278 pb->bio_submitted = false;
279 272
280 /* Are we alive ? */ 273 /* Are we alive ? */
281 elapsed = (jiffies - fc->start_time) / HZ; 274 elapsed = (jiffies - fc->start_time) / HZ;
@@ -283,7 +276,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
283 /* 276 /*
284 * Flag this bio as submitted while down. 277 * Flag this bio as submitted while down.
285 */ 278 */
286 pb->bio_submitted = true; 279 map_context->ll = 1;
287 280
288 /* 281 /*
289 * Map reads as normal. 282 * Map reads as normal.
@@ -320,16 +313,17 @@ map_bio:
320 return DM_MAPIO_REMAPPED; 313 return DM_MAPIO_REMAPPED;
321} 314}
322 315
323static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) 316static int flakey_end_io(struct dm_target *ti, struct bio *bio,
317 int error, union map_info *map_context)
324{ 318{
325 struct flakey_c *fc = ti->private; 319 struct flakey_c *fc = ti->private;
326 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 320 unsigned bio_submitted_while_down = map_context->ll;
327 321
328 /* 322 /*
329 * Corrupt successful READs while in down state. 323 * Corrupt successful READs while in down state.
330 * If flags were specified, only corrupt those that match. 324 * If flags were specified, only corrupt those that match.
331 */ 325 */
332 if (fc->corrupt_bio_byte && !error && pb->bio_submitted && 326 if (!error && bio_submitted_while_down &&
333 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && 327 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
334 all_corrupt_bio_flags_match(bio, fc)) 328 all_corrupt_bio_flags_match(bio, fc))
335 corrupt_bio_data(bio, fc); 329 corrupt_bio_data(bio, fc);
@@ -338,7 +332,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
338} 332}
339 333
340static int flakey_status(struct dm_target *ti, status_type_t type, 334static int flakey_status(struct dm_target *ti, status_type_t type,
341 unsigned status_flags, char *result, unsigned maxlen) 335 char *result, unsigned int maxlen)
342{ 336{
343 unsigned sz = 0; 337 unsigned sz = 0;
344 struct flakey_c *fc = ti->private; 338 struct flakey_c *fc = ti->private;
@@ -374,17 +368,8 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
374static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) 368static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
375{ 369{
376 struct flakey_c *fc = ti->private; 370 struct flakey_c *fc = ti->private;
377 struct dm_dev *dev = fc->dev;
378 int r = 0;
379
380 /*
381 * Only pass ioctls through if the device sizes match exactly.
382 */
383 if (fc->start ||
384 ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
385 r = scsi_verify_blk_ioctl(NULL, cmd);
386 371
387 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); 372 return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
388} 373}
389 374
390static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 375static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -411,7 +396,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
411 396
412static struct target_type flakey_target = { 397static struct target_type flakey_target = {
413 .name = "flakey", 398 .name = "flakey",
414 .version = {1, 3, 0}, 399 .version = {1, 2, 0},
415 .module = THIS_MODULE, 400 .module = THIS_MODULE,
416 .ctr = flakey_ctr, 401 .ctr = flakey_ctr,
417 .dtr = flakey_dtr, 402 .dtr = flakey_dtr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834377c..ad2eba40e31 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -249,6 +249,16 @@ static void vm_dp_init(struct dpages *dp, void *data)
249 dp->context_ptr = data; 249 dp->context_ptr = data;
250} 250}
251 251
252static void dm_bio_destructor(struct bio *bio)
253{
254 unsigned region;
255 struct io *io;
256
257 retrieve_io_and_region_from_bio(bio, &io, &region);
258
259 bio_free(bio, io->client->bios);
260}
261
252/* 262/*
253 * Functions for getting the pages from kernel memory. 263 * Functions for getting the pages from kernel memory.
254 */ 264 */
@@ -286,9 +296,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned offset; 296 unsigned offset;
287 unsigned num_bvecs; 297 unsigned num_bvecs;
288 sector_t remaining = where->count; 298 sector_t remaining = where->count;
289 struct request_queue *q = bdev_get_queue(where->bdev);
290 unsigned short logical_block_size = queue_logical_block_size(q);
291 sector_t num_sectors;
292 299
293 /* 300 /*
294 * where->count may be zero if rw holds a flush and we need to 301 * where->count may be zero if rw holds a flush and we need to
@@ -298,38 +305,20 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
298 /* 305 /*
299 * Allocate a suitably sized-bio. 306 * Allocate a suitably sized-bio.
300 */ 307 */
301 if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME)) 308 num_bvecs = dm_sector_div_up(remaining,
302 num_bvecs = 1; 309 (PAGE_SIZE >> SECTOR_SHIFT));
303 else 310 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
304 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
305 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
306
307 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 311 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
308 bio->bi_sector = where->sector + (where->count - remaining); 312 bio->bi_sector = where->sector + (where->count - remaining);
309 bio->bi_bdev = where->bdev; 313 bio->bi_bdev = where->bdev;
310 bio->bi_end_io = endio; 314 bio->bi_end_io = endio;
315 bio->bi_destructor = dm_bio_destructor;
311 store_io_and_region_in_bio(bio, io, region); 316 store_io_and_region_in_bio(bio, io, region);
312 317
313 if (rw & REQ_DISCARD) { 318 /*
314 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); 319 * Try and add as many pages as possible.
315 bio->bi_size = num_sectors << SECTOR_SHIFT; 320 */
316 remaining -= num_sectors; 321 while (remaining) {
317 } else if (rw & REQ_WRITE_SAME) {
318 /*
319 * WRITE SAME only uses a single page.
320 */
321 dp->get_page(dp, &page, &len, &offset);
322 bio_add_page(bio, page, logical_block_size, offset);
323 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
324 bio->bi_size = num_sectors << SECTOR_SHIFT;
325
326 offset = 0;
327 remaining -= num_sectors;
328 dp->next_page(dp);
329 } else while (remaining) {
330 /*
331 * Try and add as many pages as possible.
332 */
333 dp->get_page(dp, &page, &len, &offset); 322 dp->get_page(dp, &page, &len, &offset);
334 len = min(len, to_bytes(remaining)); 323 len = min(len, to_bytes(remaining));
335 if (!bio_add_page(bio, page, len, offset)) 324 if (!bio_add_page(bio, page, len, offset))
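
The do_region() hunk sizes each bio by how many bio_vecs the remaining transfer needs: the outstanding sector count is divided, rounding up, by the number of sectors per page and then clamped to what bio_get_nr_vecs() reports, with one variant special-casing REQ_DISCARD and REQ_WRITE_SAME to a single vec. The arithmetic itself is easy to show standalone; the sketch below assumes 4 KiB pages and 512-byte sectors, and max_vecs stands in for the queue limit.

#include <stdio.h>

#define SECTOR_SHIFT     9                               /* 512-byte sectors */
#define PAGE_SIZE        4096UL
#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)     /* 8 */

/* Round-up division, like dm_sector_div_up(). */
static unsigned long div_up(unsigned long n, unsigned long d)
{
        return (n + d - 1) / d;
}

/* Vecs needed for 'remaining' sectors, clamped to the queue's limit. */
static unsigned num_bvecs(unsigned long remaining, unsigned max_vecs)
{
        unsigned long want = div_up(remaining, SECTORS_PER_PAGE);

        return want < max_vecs ? (unsigned)want : max_vecs;
}

int main(void)
{
        printf("%u\n", num_bvecs(2048, 128));   /* 1 MiB: 256 pages, clamped to 128 */
        printf("%u\n", num_bvecs(20, 128));     /* 20 sectors: 3 pages */
        return 0;
}
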
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0666b5d14b8..2e9a3ca37bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,7 +880,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
880 struct hd_geometry geometry; 880 struct hd_geometry geometry;
881 unsigned long indata[4]; 881 unsigned long indata[4];
882 char *geostr = (char *) param + param->data_start; 882 char *geostr = (char *) param + param->data_start;
883 char dummy;
884 883
885 md = find_device(param); 884 md = find_device(param);
886 if (!md) 885 if (!md)
@@ -892,8 +891,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
892 goto out; 891 goto out;
893 } 892 }
894 893
895 x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, 894 x = sscanf(geostr, "%lu %lu %lu %lu", indata,
896 indata + 1, indata + 2, indata + 3, &dummy); 895 indata + 1, indata + 2, indata + 3);
897 896
898 if (x != 4) { 897 if (x != 4) {
899 DMWARN("Unable to interpret geometry settings."); 898 DMWARN("Unable to interpret geometry settings.");
@@ -1054,7 +1053,6 @@ static void retrieve_status(struct dm_table *table,
1054 char *outbuf, *outptr; 1053 char *outbuf, *outptr;
1055 status_type_t type; 1054 status_type_t type;
1056 size_t remaining, len, used = 0; 1055 size_t remaining, len, used = 0;
1057 unsigned status_flags = 0;
1058 1056
1059 outptr = outbuf = get_result_buffer(param, param_size, &len); 1057 outptr = outbuf = get_result_buffer(param, param_size, &len);
1060 1058
@@ -1091,9 +1089,7 @@ static void retrieve_status(struct dm_table *table,
1091 1089
1092 /* Get the status/table string from the target driver */ 1090 /* Get the status/table string from the target driver */
1093 if (ti->type->status) { 1091 if (ti->type->status) {
1094 if (param->flags & DM_NOFLUSH_FLAG) 1092 if (ti->type->status(ti, type, outptr, remaining)) {
1095 status_flags |= DM_STATUS_NOFLUSH_FLAG;
1096 if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
1097 param->flags |= DM_BUFFER_FULL_FLAG; 1093 param->flags |= DM_BUFFER_FULL_FLAG;
1098 break; 1094 break;
1099 } 1095 }
@@ -1219,7 +1215,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1219 struct hash_cell *hc; 1215 struct hash_cell *hc;
1220 struct dm_table *t; 1216 struct dm_table *t;
1221 struct mapped_device *md; 1217 struct mapped_device *md;
1222 struct target_type *immutable_target_type;
1223 1218
1224 md = find_device(param); 1219 md = find_device(param);
1225 if (!md) 1220 if (!md)
@@ -1235,16 +1230,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1235 goto out; 1230 goto out;
1236 } 1231 }
1237 1232
1238 immutable_target_type = dm_get_immutable_target_type(md);
1239 if (immutable_target_type &&
1240 (immutable_target_type != dm_table_get_immutable_target_type(t))) {
1241 DMWARN("can't replace immutable target type %s",
1242 immutable_target_type->name);
1243 dm_table_destroy(t);
1244 r = -EINVAL;
1245 goto out;
1246 }
1247
1248 /* Protect md->type and md->queue against concurrent table loads. */ 1233 /* Protect md->type and md->queue against concurrent table loads. */
1249 dm_lock_md_type(md); 1234 dm_lock_md_type(md);
1250 if (dm_get_md_type(md) == DM_TYPE_NONE) 1235 if (dm_get_md_type(md) == DM_TYPE_NONE)
@@ -1441,7 +1426,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1441 1426
1442 if (!argc) { 1427 if (!argc) {
1443 DMWARN("Empty message received."); 1428 DMWARN("Empty message received.");
1444 goto out_argv; 1429 goto out;
1445 } 1430 }
1446 1431
1447 table = dm_get_live_table(md); 1432 table = dm_get_live_table(md);
@@ -1543,21 +1528,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
1543 return r; 1528 return r;
1544} 1529}
1545 1530
1546#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */ 1531static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
1547#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
1548
1549static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
1550{
1551 if (param_flags & DM_WIPE_BUFFER)
1552 memset(param, 0, param_size);
1553
1554 if (param_flags & DM_PARAMS_VMALLOC)
1555 vfree(param);
1556 else
1557 kfree(param);
1558}
1559
1560static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags)
1561{ 1532{
1562 struct dm_ioctl tmp, *dmi; 1533 struct dm_ioctl tmp, *dmi;
1563 int secure_data; 1534 int secure_data;
@@ -1570,21 +1541,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1570 1541
1571 secure_data = tmp.flags & DM_SECURE_DATA_FLAG; 1542 secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
1572 1543
1573 *param_flags = secure_data ? DM_WIPE_BUFFER : 0; 1544 dmi = vmalloc(tmp.data_size);
1574
1575 /*
1576 * Try to avoid low memory issues when a device is suspended.
1577 * Use kmalloc() rather than vmalloc() when we can.
1578 */
1579 dmi = NULL;
1580 if (tmp.data_size <= KMALLOC_MAX_SIZE)
1581 dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1582
1583 if (!dmi) {
1584 dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
1585 *param_flags |= DM_PARAMS_VMALLOC;
1586 }
1587
1588 if (!dmi) { 1545 if (!dmi) {
1589 if (secure_data && clear_user(user, tmp.data_size)) 1546 if (secure_data && clear_user(user, tmp.data_size))
1590 return -EFAULT; 1547 return -EFAULT;
@@ -1594,14 +1551,6 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1594 if (copy_from_user(dmi, user, tmp.data_size)) 1551 if (copy_from_user(dmi, user, tmp.data_size))
1595 goto bad; 1552 goto bad;
1596 1553
1597 /*
1598 * Abort if something changed the ioctl data while it was being copied.
1599 */
1600 if (dmi->data_size != tmp.data_size) {
1601 DMERR("rejecting ioctl: data size modified while processing parameters");
1602 goto bad;
1603 }
1604
1605 /* Wipe the user buffer so we do not return it to userspace */ 1554 /* Wipe the user buffer so we do not return it to userspace */
1606 if (secure_data && clear_user(user, tmp.data_size)) 1555 if (secure_data && clear_user(user, tmp.data_size))
1607 goto bad; 1556 goto bad;
@@ -1610,8 +1559,9 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, in
1610 return 0; 1559 return 0;
1611 1560
1612bad: 1561bad:
1613 free_params(dmi, tmp.data_size, *param_flags); 1562 if (secure_data)
1614 1563 memset(dmi, 0, tmp.data_size);
1564 vfree(dmi);
1615 return -EFAULT; 1565 return -EFAULT;
1616} 1566}
1617 1567
@@ -1648,7 +1598,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1648static int ctl_ioctl(uint command, struct dm_ioctl __user *user) 1598static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1649{ 1599{
1650 int r = 0; 1600 int r = 0;
1651 int param_flags; 1601 int wipe_buffer;
1652 unsigned int cmd; 1602 unsigned int cmd;
1653 struct dm_ioctl *uninitialized_var(param); 1603 struct dm_ioctl *uninitialized_var(param);
1654 ioctl_fn fn = NULL; 1604 ioctl_fn fn = NULL;
@@ -1684,14 +1634,24 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1684 } 1634 }
1685 1635
1686 /* 1636 /*
1637 * Trying to avoid low memory issues when a device is
1638 * suspended.
1639 */
1640 current->flags |= PF_MEMALLOC;
1641
1642 /*
1687 * Copy the parameters into kernel space. 1643 * Copy the parameters into kernel space.
1688 */ 1644 */
1689 r = copy_params(user, &param, &param_flags); 1645 r = copy_params(user, &param);
1646
1647 current->flags &= ~PF_MEMALLOC;
1690 1648
1691 if (r) 1649 if (r)
1692 return r; 1650 return r;
1693 1651
1694 input_param_size = param->data_size; 1652 input_param_size = param->data_size;
1653 wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
1654
1695 r = validate_params(cmd, param); 1655 r = validate_params(cmd, param);
1696 if (r) 1656 if (r)
1697 goto out; 1657 goto out;
@@ -1706,7 +1666,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1706 r = -EFAULT; 1666 r = -EFAULT;
1707 1667
1708out: 1668out:
1709 free_params(param, input_param_size, param_flags); 1669 if (wipe_buffer)
1670 memset(param, 0, input_param_size);
1671
1672 vfree(param);
1710 return r; 1673 return r;
1711} 1674}
1712 1675
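
Both versions of the ioctl path above honour DM_SECURE_DATA_FLAG by zeroing the parameter buffer before it is released, so that key material passed in a table line does not linger in freed memory; they differ mainly in where that logic lives (a free_params() helper keyed on flag bits versus open-coded memset() and vfree()). A tiny userspace illustration of the same wipe-before-free pattern follows; it is only a sketch, and production code would use a scrubbing call the compiler cannot elide, such as explicit_bzero().

#include <stdlib.h>
#include <string.h>

#define PARAMS_SECURE 0x1       /* caller asked for the buffer to be wiped */

static void free_params(void *buf, size_t size, int flags)
{
        if (flags & PARAMS_SECURE)
                memset(buf, 0, size);   /* scrub secrets before releasing */
        free(buf);
}

int main(void)
{
        size_t size = 256;
        char *params = malloc(size);

        if (!params)
                return 1;
        strcpy(params, "aes-cbc-essiv:sha256 0123...");        /* pretend key data */
        free_params(params, size, PARAMS_SECURE);
        return 0;
}
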
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 68c02673263..32ac70861d6 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -66,8 +66,6 @@ struct dm_kcopyd_client {
66 struct list_head pages_jobs; 66 struct list_head pages_jobs;
67}; 67};
68 68
69static struct page_list zero_page_list;
70
71static void wake(struct dm_kcopyd_client *kc) 69static void wake(struct dm_kcopyd_client *kc)
72{ 70{
73 queue_work(kc->kcopyd_wq, &kc->kcopyd_work); 71 queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -256,9 +254,6 @@ int __init dm_kcopyd_init(void)
256 if (!_job_cache) 254 if (!_job_cache)
257 return -ENOMEM; 255 return -ENOMEM;
258 256
259 zero_page_list.next = &zero_page_list;
260 zero_page_list.page = ZERO_PAGE(0);
261
262 return 0; 257 return 0;
263} 258}
264 259
@@ -327,7 +322,7 @@ static int run_complete_job(struct kcopyd_job *job)
327 dm_kcopyd_notify_fn fn = job->fn; 322 dm_kcopyd_notify_fn fn = job->fn;
328 struct dm_kcopyd_client *kc = job->kc; 323 struct dm_kcopyd_client *kc = job->kc;
329 324
330 if (job->pages && job->pages != &zero_page_list) 325 if (job->pages)
331 kcopyd_put_pages(kc, job->pages); 326 kcopyd_put_pages(kc, job->pages);
332 /* 327 /*
333 * If this is the master job, the sub jobs have already 328 * If this is the master job, the sub jobs have already
@@ -349,7 +344,7 @@ static void complete_io(unsigned long error, void *context)
349 struct dm_kcopyd_client *kc = job->kc; 344 struct dm_kcopyd_client *kc = job->kc;
350 345
351 if (error) { 346 if (error) {
352 if (job->rw & WRITE) 347 if (job->rw == WRITE)
353 job->write_err |= error; 348 job->write_err |= error;
354 else 349 else
355 job->read_err = 1; 350 job->read_err = 1;
@@ -361,7 +356,7 @@ static void complete_io(unsigned long error, void *context)
361 } 356 }
362 } 357 }
363 358
364 if (job->rw & WRITE) 359 if (job->rw == WRITE)
365 push(&kc->complete_jobs, job); 360 push(&kc->complete_jobs, job);
366 361
367 else { 362 else {
@@ -432,7 +427,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
432 427
433 if (r < 0) { 428 if (r < 0) {
434 /* error this rogue job */ 429 /* error this rogue job */
435 if (job->rw & WRITE) 430 if (job->rw == WRITE)
436 job->write_err = (unsigned long) -1L; 431 job->write_err = (unsigned long) -1L;
437 else 432 else
438 job->read_err = 1; 433 job->read_err = 1;
@@ -489,8 +484,6 @@ static void dispatch_job(struct kcopyd_job *job)
489 atomic_inc(&kc->nr_jobs); 484 atomic_inc(&kc->nr_jobs);
490 if (unlikely(!job->source.count)) 485 if (unlikely(!job->source.count))
491 push(&kc->complete_jobs, job); 486 push(&kc->complete_jobs, job);
492 else if (job->pages == &zero_page_list)
493 push(&kc->io_jobs, job);
494 else 487 else
495 push(&kc->pages_jobs, job); 488 push(&kc->pages_jobs, job);
496 wake(kc); 489 wake(kc);
@@ -585,7 +578,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
585 unsigned int flags, dm_kcopyd_notify_fn fn, void *context) 578 unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
586{ 579{
587 struct kcopyd_job *job; 580 struct kcopyd_job *job;
588 int i;
589 581
590 /* 582 /*
591 * Allocate an array of jobs consisting of one master job 583 * Allocate an array of jobs consisting of one master job
@@ -600,29 +592,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
600 job->flags = flags; 592 job->flags = flags;
601 job->read_err = 0; 593 job->read_err = 0;
602 job->write_err = 0; 594 job->write_err = 0;
595 job->rw = READ;
596
597 job->source = *from;
603 598
604 job->num_dests = num_dests; 599 job->num_dests = num_dests;
605 memcpy(&job->dests, dests, sizeof(*dests) * num_dests); 600 memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
606 601
607 if (from) { 602 job->pages = NULL;
608 job->source = *from;
609 job->pages = NULL;
610 job->rw = READ;
611 } else {
612 memset(&job->source, 0, sizeof job->source);
613 job->source.count = job->dests[0].count;
614 job->pages = &zero_page_list;
615
616 /*
617 * Use WRITE SAME to optimize zeroing if all dests support it.
618 */
619 job->rw = WRITE | REQ_WRITE_SAME;
620 for (i = 0; i < job->num_dests; i++)
621 if (!bdev_write_same(job->dests[i].bdev)) {
622 job->rw = WRITE;
623 break;
624 }
625 }
626 603
627 job->fn = fn; 604 job->fn = fn;
628 job->context = context; 605 job->context = context;
@@ -640,14 +617,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
640} 617}
641EXPORT_SYMBOL(dm_kcopyd_copy); 618EXPORT_SYMBOL(dm_kcopyd_copy);
642 619
643int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
644 unsigned num_dests, struct dm_io_region *dests,
645 unsigned flags, dm_kcopyd_notify_fn fn, void *context)
646{
647 return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
648}
649EXPORT_SYMBOL(dm_kcopyd_zero);
650
651void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, 620void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
652 dm_kcopyd_notify_fn fn, void *context) 621 dm_kcopyd_notify_fn fn, void *context)
653{ 622{
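
In the complete_io() and process_jobs() hunks one variant tests job->rw == WRITE while the other tests job->rw & WRITE; the bitwise form matters once extra flags such as REQ_WRITE_SAME can be OR'd into rw, because the equality test then no longer recognises writes. A minimal demonstration with made-up flag values (these are not the kernel's request-flag encoding):

#include <stdio.h>

/* Illustrative values only. */
#define READ            0x0
#define WRITE           0x1
#define REQ_WRITE_SAME  0x2

static const char *classify_eq(unsigned rw)
{
        return rw == WRITE ? "write" : "read";  /* breaks when flags are OR'd in */
}

static const char *classify_bit(unsigned rw)
{
        return rw & WRITE ? "write" : "read";   /* still correct with extra flags */
}

int main(void)
{
        unsigned rw = WRITE | REQ_WRITE_SAME;

        printf("== : %s\n", classify_eq(rw));   /* prints "read", which is wrong */
        printf("&  : %s\n", classify_bit(rw));  /* prints "write" */
        return 0;
}
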
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 328cad5617a..3921e3bb43c 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,7 +29,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29{ 29{
30 struct linear_c *lc; 30 struct linear_c *lc;
31 unsigned long long tmp; 31 unsigned long long tmp;
32 char dummy;
33 32
34 if (argc != 2) { 33 if (argc != 2) {
35 ti->error = "Invalid argument count"; 34 ti->error = "Invalid argument count";
@@ -42,7 +41,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
42 return -ENOMEM; 41 return -ENOMEM;
43 } 42 }
44 43
45 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { 44 if (sscanf(argv[1], "%llu", &tmp) != 1) {
46 ti->error = "dm-linear: Invalid device sector"; 45 ti->error = "dm-linear: Invalid device sector";
47 goto bad; 46 goto bad;
48 } 47 }
@@ -55,7 +54,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
55 54
56 ti->num_flush_requests = 1; 55 ti->num_flush_requests = 1;
57 ti->num_discard_requests = 1; 56 ti->num_discard_requests = 1;
58 ti->num_write_same_requests = 1;
59 ti->private = lc; 57 ti->private = lc;
60 return 0; 58 return 0;
61 59
@@ -88,7 +86,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
88 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 86 bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
89} 87}
90 88
91static int linear_map(struct dm_target *ti, struct bio *bio) 89static int linear_map(struct dm_target *ti, struct bio *bio,
90 union map_info *map_context)
92{ 91{
93 linear_map_bio(ti, bio); 92 linear_map_bio(ti, bio);
94 93
@@ -96,7 +95,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
96} 95}
97 96
98static int linear_status(struct dm_target *ti, status_type_t type, 97static int linear_status(struct dm_target *ti, status_type_t type,
99 unsigned status_flags, char *result, unsigned maxlen) 98 char *result, unsigned int maxlen)
100{ 99{
101 struct linear_c *lc = (struct linear_c *) ti->private; 100 struct linear_c *lc = (struct linear_c *) ti->private;
102 101
@@ -117,17 +116,7 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
117 unsigned long arg) 116 unsigned long arg)
118{ 117{
119 struct linear_c *lc = (struct linear_c *) ti->private; 118 struct linear_c *lc = (struct linear_c *) ti->private;
120 struct dm_dev *dev = lc->dev; 119 return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
121 int r = 0;
122
123 /*
124 * Only pass ioctls through if the device sizes match exactly.
125 */
126 if (lc->start ||
127 ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
128 r = scsi_verify_blk_ioctl(NULL, cmd);
129
130 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
131} 120}
132 121
133static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 122static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -155,7 +144,7 @@ static int linear_iterate_devices(struct dm_target *ti,
155 144
156static struct target_type linear_target = { 145static struct target_type linear_target = {
157 .name = "linear", 146 .name = "linear",
158 .version = {1, 2, 0}, 147 .version = {1, 1, 0},
159 .module = THIS_MODULE, 148 .module = THIS_MODULE,
160 .ctr = linear_ctr, 149 .ctr = linear_ctr,
161 .dtr = linear_dtr, 150 .dtr = linear_dtr,
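
The linear_ioctl() hunk differs in whether ioctls are forwarded to the underlying device unconditionally or whether the target first notices that it does not cover the whole device (a non-zero start, or a length different from the device size expressed in 512-byte sectors) and then restricts which commands may pass through. The size comparison is just a byte-to-sector conversion; a small sketch of that check follows, with covers_whole_device and device_size_bytes as invented names.

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9  /* 512-byte sectors */

/* True only when the target starts at sector 0 and its length matches
 * the size of the underlying device. */
static bool covers_whole_device(unsigned long long start_sector,
                                unsigned long long target_len_sectors,
                                unsigned long long device_size_bytes)
{
        return start_sector == 0 &&
               target_len_sectors == (device_size_bytes >> SECTOR_SHIFT);
}

int main(void)
{
        unsigned long long dev_bytes = 1ULL << 30;      /* 1 GiB device */
        unsigned long long dev_sectors = dev_bytes >> SECTOR_SHIFT;

        printf("%d\n", covers_whole_device(0, dev_sectors, dev_bytes));                 /* 1 */
        printf("%d\n", covers_whole_device(2048, dev_sectors - 2048, dev_bytes));       /* 0 */
        return 0;
}
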
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee..1021c898601 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -9,7 +9,6 @@
9#include <linux/dm-dirty-log.h> 9#include <linux/dm-dirty-log.h>
10#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
12#include <linux/module.h>
13 12
14#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
15 14
@@ -31,7 +30,6 @@ struct flush_entry {
31 30
32struct log_c { 31struct log_c {
33 struct dm_target *ti; 32 struct dm_target *ti;
34 struct dm_dev *log_dev;
35 uint32_t region_size; 33 uint32_t region_size;
36 region_t region_count; 34 region_t region_count;
37 uint64_t luid; 35 uint64_t luid;
@@ -148,7 +146,7 @@ static int build_constructor_string(struct dm_target *ti,
148 * <UUID> <other args> 146 * <UUID> <other args>
149 * Where 'other args' is the userspace implementation specific log 147 * Where 'other args' is the userspace implementation specific log
150 * arguments. An example might be: 148 * arguments. An example might be:
151 * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 149 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
152 * 150 *
153 * So, this module will strip off the <UUID> for identification purposes 151 * So, this module will strip off the <UUID> for identification purposes
154 * when communicating with userspace about a log; but will pass on everything 152 * when communicating with userspace about a log; but will pass on everything
@@ -163,15 +161,13 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
163 struct log_c *lc = NULL; 161 struct log_c *lc = NULL;
164 uint64_t rdata; 162 uint64_t rdata;
165 size_t rdata_size = sizeof(rdata); 163 size_t rdata_size = sizeof(rdata);
166 char *devices_rdata = NULL;
167 size_t devices_rdata_size = DM_NAME_LEN;
168 164
169 if (argc < 3) { 165 if (argc < 3) {
170 DMWARN("Too few arguments to userspace dirty log"); 166 DMWARN("Too few arguments to userspace dirty log");
171 return -EINVAL; 167 return -EINVAL;
172 } 168 }
173 169
174 lc = kzalloc(sizeof(*lc), GFP_KERNEL); 170 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
175 if (!lc) { 171 if (!lc) {
176 DMWARN("Unable to allocate userspace log context."); 172 DMWARN("Unable to allocate userspace log context.");
177 return -ENOMEM; 173 return -ENOMEM;
@@ -199,19 +195,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
199 return str_size; 195 return str_size;
200 } 196 }
201 197
202 devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); 198 /* Send table string */
203 if (!devices_rdata) {
204 DMERR("Failed to allocate memory for device information");
205 r = -ENOMEM;
206 goto out;
207 }
208
209 /*
210 * Send table string and get back any opened device.
211 */
212 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
213 ctr_str, str_size, 200 ctr_str, str_size, NULL, NULL);
214 devices_rdata, &devices_rdata_size);
215 201
216 if (r < 0) { 202 if (r < 0) {
217 if (r == -ESRCH) 203 if (r == -ESRCH)
@@ -234,20 +220,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
234 lc->region_size = (uint32_t)rdata; 220 lc->region_size = (uint32_t)rdata;
235 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 221 lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
236 222
237 if (devices_rdata_size) {
238 if (devices_rdata[devices_rdata_size - 1] != '\0') {
239 DMERR("DM_ULOG_CTR device return string not properly terminated");
240 r = -EINVAL;
241 goto out;
242 }
243 r = dm_get_device(ti, devices_rdata,
244 dm_table_get_mode(ti->table), &lc->log_dev);
245 if (r)
246 DMERR("Failed to register %s with device-mapper",
247 devices_rdata);
248 }
249out: 223out:
250 kfree(devices_rdata);
251 if (r) { 224 if (r) {
252 kfree(lc); 225 kfree(lc);
253 kfree(ctr_str); 226 kfree(ctr_str);
@@ -268,9 +241,6 @@ static void userspace_dtr(struct dm_dirty_log *log)
268 NULL, 0, 241 NULL, 0,
269 NULL, NULL); 242 NULL, NULL);
270 243
271 if (lc->log_dev)
272 dm_put_device(lc->ti, lc->log_dev);
273
274 kfree(lc->usr_argv_str); 244 kfree(lc->usr_argv_str);
275 kfree(lc); 245 kfree(lc);
276 246
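
One side of the userspace_ctr() hunk asks the userspace log daemon for the name of any device it opened and, before passing the reply to dm_get_device(), checks that the returned buffer really is NUL-terminated. Treating an unterminated buffer as a C string is a classic overread; the standalone sketch below shows the same defensive check, with as_string an invented helper.

#include <stdio.h>

/*
 * Accept 'buf' as a C string only if the reply is non-empty and the last
 * byte inside the reported length is the terminating NUL.
 */
static const char *as_string(const char *buf, size_t len)
{
        if (len == 0 || buf[len - 1] != '\0')
                return NULL;    /* not properly terminated: reject it */
        return buf;
}

int main(void)
{
        char good[8] = "sda";                   /* zero padded, ends in NUL */
        char bad[3]  = { 's', 'd', 'a' };       /* no terminator */

        printf("%s\n", as_string(good, sizeof(good)) ? "ok" : "rejected");
        printf("%s\n", as_string(bad, sizeof(bad))   ? "ok" : "rejected");
        return 0;
}
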
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259..1f23e048f07 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
134{ 134{
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); 135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136 136
137 if (!capable(CAP_SYS_ADMIN)) 137 if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
138 return; 138 return;
139 139
140 spin_lock(&receiving_list_lock); 140 spin_lock(&receiving_list_lock);
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 627d19186d5..3b52bb72bd1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,7 +369,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
369 unsigned int region_count; 369 unsigned int region_count;
370 size_t bitset_size, buf_size; 370 size_t bitset_size, buf_size;
371 int r; 371 int r;
372 char dummy;
373 372
374 if (argc < 1 || argc > 2) { 373 if (argc < 1 || argc > 2) {
375 DMWARN("wrong number of arguments to dirty region log"); 374 DMWARN("wrong number of arguments to dirty region log");
@@ -388,7 +387,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
388 } 387 }
389 } 388 }
390 389
391 if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 || 390 if (sscanf(argv[0], "%u", &region_size) != 1 ||
392 !_check_region_size(ti, region_size)) { 391 !_check_region_size(ti, region_size)) {
393 DMWARN("invalid region size %s", argv[0]); 392 DMWARN("invalid region size %s", argv[0]);
394 return -EINVAL; 393 return -EINVAL;
@@ -571,6 +570,16 @@ static void disk_dtr(struct dm_dirty_log *log)
571 destroy_log_context(lc); 570 destroy_log_context(lc);
572} 571}
573 572
573static int count_bits32(uint32_t *addr, unsigned size)
574{
575 int count = 0, i;
576
577 for (i = 0; i < size; i++) {
578 count += hweight32(*(addr+i));
579 }
580 return count;
581}
582
574static void fail_log_device(struct log_c *lc) 583static void fail_log_device(struct log_c *lc)
575{ 584{
576 if (lc->log_dev_failed) 585 if (lc->log_dev_failed)
@@ -619,8 +628,7 @@ static int disk_resume(struct dm_dirty_log *log)
619 628
620 /* copy clean across to sync */ 629 /* copy clean across to sync */
621 memcpy(lc->sync_bits, lc->clean_bits, size); 630 memcpy(lc->sync_bits, lc->clean_bits, size);
622 lc->sync_count = memweight(lc->clean_bits, 631 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
623 lc->bitset_uint32_count * sizeof(uint32_t));
624 lc->sync_search = 0; 632 lc->sync_search = 0;
625 633
626 /* set the correct number of regions in the header */ 634 /* set the correct number of regions in the header */
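
The disk_resume() hunk recomputes sync_count by counting the set bits in the clean bitmap, either with memweight() over the raw bytes or with a count_bits32() loop that sums hweight32() per 32-bit word. A standalone equivalent using the GCC/Clang popcount builtin is sketched below; the kernel helpers differ in signature and availability.

#include <stdint.h>
#include <stdio.h>

/* Sum the set bits across an array of 32-bit words, like a loop over
 * hweight32(). __builtin_popcount() is a GCC/Clang extension. */
static unsigned count_bits32(const uint32_t *addr, unsigned nwords)
{
        unsigned i, count = 0;

        for (i = 0; i < nwords; i++)
                count += (unsigned)__builtin_popcount(addr[i]);
        return count;
}

int main(void)
{
        uint32_t bitmap[] = { 0xFFFFFFFFu, 0x0000000Fu, 0x0 };

        printf("%u\n", count_bits32(bitmap, 3));        /* 32 + 4 + 0 = 36 */
        return 0;
}
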
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 573bd04591b..5e0090ef418 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -18,7 +18,6 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/workqueue.h> 20#include <linux/workqueue.h>
21#include <linux/delay.h>
22#include <scsi/scsi_dh.h> 21#include <scsi/scsi_dh.h>
23#include <linux/atomic.h> 22#include <linux/atomic.h>
24 23
@@ -62,11 +61,11 @@ struct multipath {
62 struct list_head list; 61 struct list_head list;
63 struct dm_target *ti; 62 struct dm_target *ti;
64 63
64 spinlock_t lock;
65
65 const char *hw_handler_name; 66 const char *hw_handler_name;
66 char *hw_handler_params; 67 char *hw_handler_params;
67 68
68 spinlock_t lock;
69
70 unsigned nr_priority_groups; 69 unsigned nr_priority_groups;
71 struct list_head priority_groups; 70 struct list_head priority_groups;
72 71
@@ -82,18 +81,16 @@ struct multipath {
82 struct priority_group *next_pg; /* Switch to this PG if set */ 81 struct priority_group *next_pg; /* Switch to this PG if set */
83 unsigned repeat_count; /* I/Os left before calling PS again */ 82 unsigned repeat_count; /* I/Os left before calling PS again */
84 83
85 unsigned queue_io:1; /* Must we queue all I/O? */ 84 unsigned queue_io; /* Must we queue all I/O? */
86 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ 85 unsigned queue_if_no_path; /* Queue I/O if last path fails? */
87 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ 86 unsigned saved_queue_if_no_path;/* Saved state during suspension */
88 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
89
90 unsigned pg_init_retries; /* Number of times to retry pg_init */ 87 unsigned pg_init_retries; /* Number of times to retry pg_init */
91 unsigned pg_init_count; /* Number of times pg_init called */ 88 unsigned pg_init_count; /* Number of times pg_init called */
92 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 89 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
93 90
94 unsigned queue_size;
95 struct work_struct process_queued_ios; 91 struct work_struct process_queued_ios;
96 struct list_head queued_ios; 92 struct list_head queued_ios;
93 unsigned queue_size;
97 94
98 struct work_struct trigger_event; 95 struct work_struct trigger_event;
99 96
@@ -229,27 +226,6 @@ static void free_multipath(struct multipath *m)
229 kfree(m); 226 kfree(m);
230} 227}
231 228
232static int set_mapinfo(struct multipath *m, union map_info *info)
233{
234 struct dm_mpath_io *mpio;
235
236 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
237 if (!mpio)
238 return -ENOMEM;
239
240 memset(mpio, 0, sizeof(*mpio));
241 info->ptr = mpio;
242
243 return 0;
244}
245
246static void clear_mapinfo(struct multipath *m, union map_info *info)
247{
248 struct dm_mpath_io *mpio = info->ptr;
249
250 info->ptr = NULL;
251 mempool_free(mpio, m->mpio_pool);
252}
253 229
254/*----------------------------------------------- 230/*-----------------------------------------------
255 * Path selection 231 * Path selection
@@ -331,18 +307,14 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
331 /* 307 /*
332 * Loop through priority groups until we find a valid path. 308 * Loop through priority groups until we find a valid path.
333 * First time we skip PGs marked 'bypassed'. 309 * First time we skip PGs marked 'bypassed'.
334 * Second time we only try the ones we skipped, but set 310 * Second time we only try the ones we skipped.
335 * pg_init_delay_retry so we do not hammer controllers.
336 */ 311 */
337 do { 312 do {
338 list_for_each_entry(pg, &m->priority_groups, list) { 313 list_for_each_entry(pg, &m->priority_groups, list) {
339 if (pg->bypassed == bypassed) 314 if (pg->bypassed == bypassed)
340 continue; 315 continue;
341 if (!__choose_path_in_pg(m, pg, nr_bytes)) { 316 if (!__choose_path_in_pg(m, pg, nr_bytes))
342 if (!bypassed)
343 m->pg_init_delay_retry = 1;
344 return; 317 return;
345 }
346 } 318 }
347 } while (bypassed--); 319 } while (bypassed--);
348 320
@@ -369,14 +341,13 @@ static int __must_push_back(struct multipath *m)
369} 341}
370 342
371static int map_io(struct multipath *m, struct request *clone, 343static int map_io(struct multipath *m, struct request *clone,
372 union map_info *map_context, unsigned was_queued) 344 struct dm_mpath_io *mpio, unsigned was_queued)
373{ 345{
374 int r = DM_MAPIO_REMAPPED; 346 int r = DM_MAPIO_REMAPPED;
375 size_t nr_bytes = blk_rq_bytes(clone); 347 size_t nr_bytes = blk_rq_bytes(clone);
376 unsigned long flags; 348 unsigned long flags;
377 struct pgpath *pgpath; 349 struct pgpath *pgpath;
378 struct block_device *bdev; 350 struct block_device *bdev;
379 struct dm_mpath_io *mpio = map_context->ptr;
380 351
381 spin_lock_irqsave(&m->lock, flags); 352 spin_lock_irqsave(&m->lock, flags);
382 353
@@ -452,6 +423,7 @@ static void dispatch_queued_ios(struct multipath *m)
452{ 423{
453 int r; 424 int r;
454 unsigned long flags; 425 unsigned long flags;
426 struct dm_mpath_io *mpio;
455 union map_info *info; 427 union map_info *info;
456 struct request *clone, *n; 428 struct request *clone, *n;
457 LIST_HEAD(cl); 429 LIST_HEAD(cl);
@@ -464,15 +436,16 @@ static void dispatch_queued_ios(struct multipath *m)
464 list_del_init(&clone->queuelist); 436 list_del_init(&clone->queuelist);
465 437
466 info = dm_get_rq_mapinfo(clone); 438 info = dm_get_rq_mapinfo(clone);
439 mpio = info->ptr;
467 440
468 r = map_io(m, clone, info, 1); 441 r = map_io(m, clone, mpio, 1);
469 if (r < 0) { 442 if (r < 0) {
470 clear_mapinfo(m, info); 443 mempool_free(mpio, m->mpio_pool);
471 dm_kill_unmapped_request(clone, r); 444 dm_kill_unmapped_request(clone, r);
472 } else if (r == DM_MAPIO_REMAPPED) 445 } else if (r == DM_MAPIO_REMAPPED)
473 dm_dispatch_request(clone); 446 dm_dispatch_request(clone);
474 else if (r == DM_MAPIO_REQUEUE) { 447 else if (r == DM_MAPIO_REQUEUE) {
475 clear_mapinfo(m, info); 448 mempool_free(mpio, m->mpio_pool);
476 dm_requeue_unmapped_request(clone); 449 dm_requeue_unmapped_request(clone);
477 } 450 }
478 } 451 }
@@ -488,6 +461,9 @@ static void process_queued_ios(struct work_struct *work)
488 461
489 spin_lock_irqsave(&m->lock, flags); 462 spin_lock_irqsave(&m->lock, flags);
490 463
464 if (!m->queue_size)
465 goto out;
466
491 if (!m->current_pgpath) 467 if (!m->current_pgpath)
492 __choose_pgpath(m, 0); 468 __choose_pgpath(m, 0);
493 469
@@ -500,6 +476,7 @@ static void process_queued_ios(struct work_struct *work)
500 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) 476 if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
501 __pg_init_all_paths(m); 477 __pg_init_all_paths(m);
502 478
479out:
503 spin_unlock_irqrestore(&m->lock, flags); 480 spin_unlock_irqrestore(&m->lock, flags);
504 if (!must_queue) 481 if (!must_queue)
505 dispatch_queued_ios(m); 482 dispatch_queued_ios(m);
@@ -569,8 +546,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
569 int r; 546 int r;
570 struct pgpath *p; 547 struct pgpath *p;
571 struct multipath *m = ti->private; 548 struct multipath *m = ti->private;
572 struct request_queue *q = NULL;
573 const char *attached_handler_name;
574 549
575 /* we need at least a path arg */ 550 /* we need at least a path arg */
576 if (as->argc < 1) { 551 if (as->argc < 1) {
@@ -589,37 +564,13 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
589 goto bad; 564 goto bad;
590 } 565 }
591 566
592 if (m->retain_attached_hw_handler || m->hw_handler_name)
593 q = bdev_get_queue(p->path.dev->bdev);
594
595 if (m->retain_attached_hw_handler) {
596 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
597 if (attached_handler_name) {
598 /*
599 * Reset hw_handler_name to match the attached handler
600 * and clear any hw_handler_params associated with the
601 * ignored handler.
602 *
603 * NB. This modifies the table line to show the actual
604 * handler instead of the original table passed in.
605 */
606 kfree(m->hw_handler_name);
607 m->hw_handler_name = attached_handler_name;
608
609 kfree(m->hw_handler_params);
610 m->hw_handler_params = NULL;
611 }
612 }
613
614 if (m->hw_handler_name) { 567 if (m->hw_handler_name) {
615 /* 568 struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
616 * Increments scsi_dh reference, even when using an 569
617 * already-attached handler.
618 */
619 r = scsi_dh_attach(q, m->hw_handler_name); 570 r = scsi_dh_attach(q, m->hw_handler_name);
620 if (r == -EBUSY) { 571 if (r == -EBUSY) {
621 /* 572 /*
622 * Already attached to different hw_handler: 573 * Already attached to different hw_handler,
623 * try to reattach with correct one. 574 * try to reattach with correct one.
624 */ 575 */
625 scsi_dh_detach(q); 576 scsi_dh_detach(q);
@@ -747,8 +698,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
747 return 0; 698 return 0;
748 699
749 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 700 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
750 if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), 701 request_module("scsi_dh_%s", m->hw_handler_name);
751 "scsi_dh_%s", m->hw_handler_name)) { 702 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
752 ti->error = "unknown hardware handler type"; 703 ti->error = "unknown hardware handler type";
753 ret = -EINVAL; 704 ret = -EINVAL;
754 goto fail; 705 goto fail;
@@ -787,7 +738,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
787 const char *arg_name; 738 const char *arg_name;
788 739
789 static struct dm_arg _args[] = { 740 static struct dm_arg _args[] = {
790 {0, 6, "invalid number of feature args"}, 741 {0, 5, "invalid number of feature args"},
791 {1, 50, "pg_init_retries must be between 1 and 50"}, 742 {1, 50, "pg_init_retries must be between 1 and 50"},
792 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 743 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
793 }; 744 };
@@ -808,11 +759,6 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
808 continue; 759 continue;
809 } 760 }
810 761
811 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
812 m->retain_attached_hw_handler = 1;
813 continue;
814 }
815
816 if (!strcasecmp(arg_name, "pg_init_retries") && 762 if (!strcasecmp(arg_name, "pg_init_retries") &&
817 (argc >= 1)) { 763 (argc >= 1)) {
818 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 764 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -944,7 +890,7 @@ static void flush_multipath_work(struct multipath *m)
944 flush_workqueue(kmpath_handlerd); 890 flush_workqueue(kmpath_handlerd);
945 multipath_wait_for_pg_init_completion(m); 891 multipath_wait_for_pg_init_completion(m);
946 flush_workqueue(kmultipathd); 892 flush_workqueue(kmultipathd);
947 flush_work(&m->trigger_event); 893 flush_work_sync(&m->trigger_event);
948} 894}
949 895
950static void multipath_dtr(struct dm_target *ti) 896static void multipath_dtr(struct dm_target *ti)
@@ -962,16 +908,20 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
962 union map_info *map_context) 908 union map_info *map_context)
963{ 909{
964 int r; 910 int r;
911 struct dm_mpath_io *mpio;
965 struct multipath *m = (struct multipath *) ti->private; 912 struct multipath *m = (struct multipath *) ti->private;
966 913
967 if (set_mapinfo(m, map_context) < 0) 914 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
915 if (!mpio)
968 /* ENOMEM, requeue */ 916 /* ENOMEM, requeue */
969 return DM_MAPIO_REQUEUE; 917 return DM_MAPIO_REQUEUE;
918 memset(mpio, 0, sizeof(*mpio));
970 919
920 map_context->ptr = mpio;
971 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 921 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
972 r = map_io(m, clone, map_context, 0); 922 r = map_io(m, clone, mpio, 0);
973 if (r < 0 || r == DM_MAPIO_REQUEUE) 923 if (r < 0 || r == DM_MAPIO_REQUEUE)
974 clear_mapinfo(m, map_context); 924 mempool_free(mpio, m->mpio_pool);
975 925
976 return r; 926 return r;
977} 927}
@@ -1104,9 +1054,8 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
1104 struct priority_group *pg; 1054 struct priority_group *pg;
1105 unsigned pgnum; 1055 unsigned pgnum;
1106 unsigned long flags; 1056 unsigned long flags;
1107 char dummy;
1108 1057
1109 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1058 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
1110 (pgnum > m->nr_priority_groups)) { 1059 (pgnum > m->nr_priority_groups)) {
1111 DMWARN("invalid PG number supplied to switch_pg_num"); 1060 DMWARN("invalid PG number supplied to switch_pg_num");
1112 return -EINVAL; 1061 return -EINVAL;
@@ -1136,9 +1085,8 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1136{ 1085{
1137 struct priority_group *pg; 1086 struct priority_group *pg;
1138 unsigned pgnum; 1087 unsigned pgnum;
1139 char dummy;
1140 1088
1141 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1089 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
1142 (pgnum > m->nr_priority_groups)) { 1090 (pgnum > m->nr_priority_groups)) {
1143 DMWARN("invalid PG number supplied to bypass_pg"); 1091 DMWARN("invalid PG number supplied to bypass_pg");
1144 return -EINVAL; 1092 return -EINVAL;
@@ -1309,20 +1257,17 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1309{ 1257{
1310 struct multipath *m = ti->private; 1258 struct multipath *m = ti->private;
1311 struct dm_mpath_io *mpio = map_context->ptr; 1259 struct dm_mpath_io *mpio = map_context->ptr;
1312 struct pgpath *pgpath; 1260 struct pgpath *pgpath = mpio->pgpath;
1313 struct path_selector *ps; 1261 struct path_selector *ps;
1314 int r; 1262 int r;
1315 1263
1316 BUG_ON(!mpio);
1317
1318 r = do_end_io(m, clone, error, mpio); 1264 r = do_end_io(m, clone, error, mpio);
1319 pgpath = mpio->pgpath;
1320 if (pgpath) { 1265 if (pgpath) {
1321 ps = &pgpath->pg->ps; 1266 ps = &pgpath->pg->ps;
1322 if (ps->type->end_io) 1267 if (ps->type->end_io)
1323 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1268 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1324 } 1269 }
1325 clear_mapinfo(m, map_context); 1270 mempool_free(mpio, m->mpio_pool);
1326 1271
1327 return r; 1272 return r;
1328} 1273}
@@ -1379,7 +1324,7 @@ static void multipath_resume(struct dm_target *ti)
1379 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1324 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1380 */ 1325 */
1381static int multipath_status(struct dm_target *ti, status_type_t type, 1326static int multipath_status(struct dm_target *ti, status_type_t type,
1382 unsigned status_flags, char *result, unsigned maxlen) 1327 char *result, unsigned int maxlen)
1383{ 1328{
1384 int sz = 0; 1329 int sz = 0;
1385 unsigned long flags; 1330 unsigned long flags;
@@ -1397,16 +1342,13 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 else { 1342 else {
1398 DMEMIT("%u ", m->queue_if_no_path + 1343 DMEMIT("%u ", m->queue_if_no_path +
1399 (m->pg_init_retries > 0) * 2 + 1344 (m->pg_init_retries > 0) * 2 +
1400 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1345 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1401 m->retain_attached_hw_handler);
1402 if (m->queue_if_no_path) 1346 if (m->queue_if_no_path)
1403 DMEMIT("queue_if_no_path "); 1347 DMEMIT("queue_if_no_path ");
1404 if (m->pg_init_retries) 1348 if (m->pg_init_retries)
1405 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1349 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1406 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1350 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1407 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1351 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1408 if (m->retain_attached_hw_handler)
1409 DMEMIT("retain_attached_hw_handler ");
1410 } 1352 }
1411 1353
1412 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1354 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1555,49 +1497,29 @@ out:
1555static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1497static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1556 unsigned long arg) 1498 unsigned long arg)
1557{ 1499{
1558 struct multipath *m = ti->private; 1500 struct multipath *m = (struct multipath *) ti->private;
1559 struct pgpath *pgpath; 1501 struct block_device *bdev = NULL;
1560 struct block_device *bdev; 1502 fmode_t mode = 0;
1561 fmode_t mode;
1562 unsigned long flags; 1503 unsigned long flags;
1563 int r; 1504 int r = 0;
1564
1565again:
1566 bdev = NULL;
1567 mode = 0;
1568 r = 0;
1569 1505
1570 spin_lock_irqsave(&m->lock, flags); 1506 spin_lock_irqsave(&m->lock, flags);
1571 1507
1572 if (!m->current_pgpath) 1508 if (!m->current_pgpath)
1573 __choose_pgpath(m, 0); 1509 __choose_pgpath(m, 0);
1574 1510
1575 pgpath = m->current_pgpath; 1511 if (m->current_pgpath) {
1576 1512 bdev = m->current_pgpath->path.dev->bdev;
1577 if (pgpath) { 1513 mode = m->current_pgpath->path.dev->mode;
1578 bdev = pgpath->path.dev->bdev;
1579 mode = pgpath->path.dev->mode;
1580 } 1514 }
1581 1515
1582 if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) 1516 if (m->queue_io)
1583 r = -EAGAIN; 1517 r = -EAGAIN;
1584 else if (!bdev) 1518 else if (!bdev)
1585 r = -EIO; 1519 r = -EIO;
1586 1520
1587 spin_unlock_irqrestore(&m->lock, flags); 1521 spin_unlock_irqrestore(&m->lock, flags);
1588 1522
1589 /*
1590 * Only pass ioctls through if the device sizes match exactly.
1591 */
1592 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
1593 r = scsi_verify_blk_ioctl(NULL, cmd);
1594
1595 if (r == -EAGAIN && !fatal_signal_pending(current)) {
1596 queue_work(kmultipathd, &m->process_queued_ios);
1597 msleep(10);
1598 goto again;
1599 }
1600
1601 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1523 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1602} 1524}
1603 1525
@@ -1695,7 +1617,7 @@ out:
1695 *---------------------------------------------------------------*/ 1617 *---------------------------------------------------------------*/
1696static struct target_type multipath_target = { 1618static struct target_type multipath_target = {
1697 .name = "multipath", 1619 .name = "multipath",
1698 .version = {1, 5, 0}, 1620 .version = {1, 3, 0},
1699 .module = THIS_MODULE, 1621 .module = THIS_MODULE,
1700 .ctr = multipath_ctr, 1622 .ctr = multipath_ctr,
1701 .dtr = multipath_dtr, 1623 .dtr = multipath_dtr,
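
Note on the multipath_status() hunk above: the feature list it emits is self-describing. The leading count is the number of words that follow, with each bare flag contributing one word and each key/value option two, which is why the left-hand (newer) code adds m->retain_attached_hw_handler to the sum when it also prints that flag. Below is a small user-space sketch of the same accounting; the struct and its field names only mirror the diff and are not the kernel's struct multipath.

/* Illustrative sketch of how the multipath status line sizes its feature
 * list: one word per bare flag, two per "key value" pair. User-space model,
 * not kernel code; the struct is hypothetical. */
#include <stdio.h>

#define PG_INIT_DELAY_DEFAULT (-1)

struct mp_features {
	int queue_if_no_path;           /* bare flag: 1 word  */
	unsigned pg_init_retries;       /* key+value: 2 words */
	int pg_init_delay_msecs;        /* key+value: 2 words */
	int retain_attached_hw_handler; /* bare flag: 1 word (newer code only) */
};

static void emit_features(const struct mp_features *m)
{
	unsigned count = m->queue_if_no_path +
			 (m->pg_init_retries > 0) * 2 +
			 (m->pg_init_delay_msecs != PG_INIT_DELAY_DEFAULT) * 2 +
			 m->retain_attached_hw_handler;

	printf("%u ", count);
	if (m->queue_if_no_path)
		printf("queue_if_no_path ");
	if (m->pg_init_retries)
		printf("pg_init_retries %u ", m->pg_init_retries);
	if (m->pg_init_delay_msecs != PG_INIT_DELAY_DEFAULT)
		printf("pg_init_delay_msecs %d ", m->pg_init_delay_msecs);
	if (m->retain_attached_hw_handler)
		printf("retain_attached_hw_handler ");
	printf("\n");
}

int main(void)
{
	struct mp_features m = { 1, 3, PG_INIT_DELAY_DEFAULT, 1 };
	emit_features(&m); /* "4 queue_if_no_path pg_init_retries 3 retain_attached_hw_handler " */
	return 0;
}
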
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index fa0ccc585cb..42c04f04a0c 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/device-mapper.h> 12#include <linux/device-mapper.h>
13#include <linux/module.h>
14 13
15#include "dm-path-selector.h" 14#include "dm-path-selector.h"
16 15
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 3941fae0de9..03a837aa5ce 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,7 +112,6 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
112 struct selector *s = ps->context; 112 struct selector *s = ps->context;
113 struct path_info *pi; 113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO; 114 unsigned repeat_count = QL_MIN_IO;
115 char dummy;
116 115
117 /* 116 /*
118 * Arguments: [<repeat_count>] 117 * Arguments: [<repeat_count>]
@@ -124,7 +123,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
124 return -EINVAL; 123 return -EINVAL;
125 } 124 }
126 125
127 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
128 *error = "queue-length ps: invalid repeat count"; 127 *error = "queue-length ps: invalid repeat count";
129 return -EINVAL; 128 return -EINVAL;
130 } 129 }
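
Several hunks in this patch (queue-length above, and raid1, round-robin and service-time further down) drop the extra %c conversion and dummy character from argument parsing. The dummy exists only to detect trailing junk: if sscanf returns exactly 1, the whole token was a clean number. A standalone sketch of the difference, illustrative only and not kernel code:

#include <stdio.h>

static int parse_strict(const char *arg, unsigned *out)
{
	char dummy;
	/* Exactly one conversion means nothing followed the number. */
	return sscanf(arg, "%u%c", out, &dummy) == 1 ? 0 : -1;
}

static int parse_loose(const char *arg, unsigned *out)
{
	return sscanf(arg, "%u", out) == 1 ? 0 : -1;
}

int main(void)
{
	unsigned v;
	printf("strict \"10\":   %d\n", parse_strict("10", &v));   /* 0: accepted */
	printf("strict \"10xy\": %d\n", parse_strict("10xy", &v)); /* -1: rejected */
	printf("loose  \"10xy\": %d\n", parse_loose("10xy", &v));  /* 0: accepted anyway */
	return 0;
}
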
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 3d8984edeff..86df8b2cf92 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -6,12 +6,10 @@
6 */ 6 */
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h>
10 9
11#include "md.h" 10#include "md.h"
12#include "raid1.h" 11#include "raid1.h"
13#include "raid5.h" 12#include "raid5.h"
14#include "raid10.h"
15#include "bitmap.h" 13#include "bitmap.h"
16 14
17#include <linux/device-mapper.h> 15#include <linux/device-mapper.h>
@@ -39,7 +37,7 @@ struct raid_dev {
39 */ 37 */
40 struct dm_dev *meta_dev; 38 struct dm_dev *meta_dev;
41 struct dm_dev *data_dev; 39 struct dm_dev *data_dev;
42 struct md_rdev rdev; 40 struct mdk_rdev_s rdev;
43}; 41};
44 42
45/* 43/*
@@ -53,17 +51,13 @@ struct raid_dev {
53#define DMPF_MAX_RECOVERY_RATE 0x20 51#define DMPF_MAX_RECOVERY_RATE 0x20
54#define DMPF_MAX_WRITE_BEHIND 0x40 52#define DMPF_MAX_WRITE_BEHIND 0x40
55#define DMPF_STRIPE_CACHE 0x80 53#define DMPF_STRIPE_CACHE 0x80
56#define DMPF_REGION_SIZE 0x100 54#define DMPF_REGION_SIZE 0X100
57#define DMPF_RAID10_COPIES 0x200
58#define DMPF_RAID10_FORMAT 0x400
59
60struct raid_set { 55struct raid_set {
61 struct dm_target *ti; 56 struct dm_target *ti;
62 57
63 uint32_t bitmap_loaded; 58 uint64_t print_flags;
64 uint32_t print_flags;
65 59
66 struct mddev md; 60 struct mddev_s md;
67 struct raid_type *raid_type; 61 struct raid_type *raid_type;
68 struct dm_target_callbacks callbacks; 62 struct dm_target_callbacks callbacks;
69 63
@@ -80,7 +74,6 @@ static struct raid_type {
80 const unsigned algorithm; /* RAID algorithm. */ 74 const unsigned algorithm; /* RAID algorithm. */
81} raid_types[] = { 75} raid_types[] = {
82 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 76 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
83 {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
84 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
85 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
86 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -91,17 +84,6 @@ static struct raid_type {
91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 84 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
92}; 85};
93 86
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
105static struct raid_type *get_raid_type(char *name) 87static struct raid_type *get_raid_type(char *name)
106{ 88{
107 int i; 89 int i;
@@ -117,12 +99,20 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
117{ 99{
118 unsigned i; 100 unsigned i;
119 struct raid_set *rs; 101 struct raid_set *rs;
102 sector_t sectors_per_dev;
120 103
121 if (raid_devs <= raid_type->parity_devs) { 104 if (raid_devs <= raid_type->parity_devs) {
122 ti->error = "Insufficient number of devices"; 105 ti->error = "Insufficient number of devices";
123 return ERR_PTR(-EINVAL); 106 return ERR_PTR(-EINVAL);
124 } 107 }
125 108
109 sectors_per_dev = ti->len;
110 if ((raid_type->level > 1) &&
111 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
112 ti->error = "Target length not divisible by number of data devices";
113 return ERR_PTR(-EINVAL);
114 }
115
126 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); 116 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
127 if (!rs) { 117 if (!rs) {
128 ti->error = "Cannot allocate raid context"; 118 ti->error = "Cannot allocate raid context";
@@ -136,6 +126,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
136 rs->md.raid_disks = raid_devs; 126 rs->md.raid_disks = raid_devs;
137 rs->md.level = raid_type->level; 127 rs->md.level = raid_type->level;
138 rs->md.new_level = rs->md.level; 128 rs->md.new_level = rs->md.level;
129 rs->md.dev_sectors = sectors_per_dev;
139 rs->md.layout = raid_type->algorithm; 130 rs->md.layout = raid_type->algorithm;
140 rs->md.new_layout = rs->md.layout; 131 rs->md.new_layout = rs->md.layout;
141 rs->md.delta_disks = 0; 132 rs->md.delta_disks = 0;
@@ -150,7 +141,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
150 * rs->md.external 141 * rs->md.external
151 * rs->md.chunk_sectors 142 * rs->md.chunk_sectors
152 * rs->md.new_chunk_sectors 143 * rs->md.new_chunk_sectors
153 * rs->md.dev_sectors
154 */ 144 */
155 145
156 return rs; 146 return rs;
@@ -163,7 +153,10 @@ static void context_free(struct raid_set *rs)
163 for (i = 0; i < rs->md.raid_disks; i++) { 153 for (i = 0; i < rs->md.raid_disks; i++) {
164 if (rs->dev[i].meta_dev) 154 if (rs->dev[i].meta_dev)
165 dm_put_device(rs->ti, rs->dev[i].meta_dev); 155 dm_put_device(rs->ti, rs->dev[i].meta_dev);
166 md_rdev_clear(&rs->dev[i].rdev); 156 if (rs->dev[i].rdev.sb_page)
157 put_page(rs->dev[i].rdev.sb_page);
158 rs->dev[i].rdev.sb_page = NULL;
159 rs->dev[i].rdev.sb_loaded = 0;
167 if (rs->dev[i].data_dev) 160 if (rs->dev[i].data_dev)
168 dm_put_device(rs->ti, rs->dev[i].data_dev); 161 dm_put_device(rs->ti, rs->dev[i].data_dev);
169 } 162 }
@@ -295,11 +288,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
295 * Choose a reasonable default. All figures in sectors. 288 * Choose a reasonable default. All figures in sectors.
296 */ 289 */
297 if (min_region_size > (1 << 13)) { 290 if (min_region_size > (1 << 13)) {
298 /* If not a power of 2, make it the next power of 2 */
299 if (min_region_size & (min_region_size - 1))
300 region_size = 1 << fls(region_size);
301 DMINFO("Choosing default region size of %lu sectors", 291 DMINFO("Choosing default region size of %lu sectors",
302 region_size); 292 region_size);
293 region_size = min_region_size;
303 } else { 294 } else {
304 DMINFO("Choosing default region size of 4MiB"); 295 DMINFO("Choosing default region size of 4MiB");
305 region_size = 1 << 13; /* sectors */ 296 region_size = 1 << 13; /* sectors */
@@ -340,84 +331,6 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
340} 331}
341 332
342/* 333/*
343 * validate_rebuild_devices
344 * @rs
345 *
346 * Determine if the devices specified for rebuild can result in a valid
347 * usable array that is capable of rebuilding the given devices.
348 *
349 * Returns: 0 on success, -EINVAL on failure.
350 */
351static int validate_rebuild_devices(struct raid_set *rs)
352{
353 unsigned i, rebuild_cnt = 0;
354 unsigned rebuilds_per_group, copies, d;
355
356 if (!(rs->print_flags & DMPF_REBUILD))
357 return 0;
358
359 for (i = 0; i < rs->md.raid_disks; i++)
360 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
361 rebuild_cnt++;
362
363 switch (rs->raid_type->level) {
364 case 1:
365 if (rebuild_cnt >= rs->md.raid_disks)
366 goto too_many;
367 break;
368 case 4:
369 case 5:
370 case 6:
371 if (rebuild_cnt > rs->raid_type->parity_devs)
372 goto too_many;
373 break;
374 case 10:
375 copies = raid10_md_layout_to_copies(rs->md.layout);
376 if (rebuild_cnt < copies)
377 break;
378
379 /*
380 * It is possible to have a higher rebuild count for RAID10,
381 * as long as the failed devices occur in different mirror
382 * groups (i.e. different stripes).
383 *
384 * Right now, we only allow for "near" copies. When other
385 * formats are added, we will have to check those too.
386 *
387 * When checking "near" format, make sure no adjacent devices
388 * have failed beyond what can be handled. In addition to the
389 * simple case where the number of devices is a multiple of the
390 * number of copies, we must also handle cases where the number
391 * of devices is not a multiple of the number of copies.
392 * E.g. dev1 dev2 dev3 dev4 dev5
393 * A A B B C
394 * C D D E E
395 */
396 rebuilds_per_group = 0;
397 for (i = 0; i < rs->md.raid_disks * copies; i++) {
398 d = i % rs->md.raid_disks;
399 if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
400 (++rebuilds_per_group >= copies))
401 goto too_many;
402 if (!((i + 1) % copies))
403 rebuilds_per_group = 0;
404 }
405 break;
406 default:
407 DMERR("The rebuild parameter is not supported for %s",
408 rs->raid_type->name);
409 rs->ti->error = "Rebuild not supported for this RAID type";
410 return -EINVAL;
411 }
412
413 return 0;
414
415too_many:
416 rs->ti->error = "Too many rebuild devices specified";
417 return -EINVAL;
418}
419
420/*
421 * Possible arguments are... 334 * Possible arguments are...
422 * <chunk_size> [optional_args] 335 * <chunk_size> [optional_args]
423 * 336 *
@@ -435,20 +348,12 @@ too_many:
435 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 348 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
436 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 349 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
437 * [region_size <sectors>] Defines granularity of bitmap 350 * [region_size <sectors>] Defines granularity of bitmap
438 *
439 * RAID10-only options:
440 * [raid10_copies <# copies>] Number of copies. (Default: 2)
441 * [raid10_format <near>] Layout algorithm. (Default: near)
442 */ 351 */
443static int parse_raid_params(struct raid_set *rs, char **argv, 352static int parse_raid_params(struct raid_set *rs, char **argv,
444 unsigned num_raid_params) 353 unsigned num_raid_params)
445{ 354{
446 char *raid10_format = "near"; 355 unsigned i, rebuild_cnt = 0;
447 unsigned raid10_copies = 2;
448 unsigned i;
449 unsigned long value, region_size = 0; 356 unsigned long value, region_size = 0;
450 sector_t sectors_per_dev = rs->ti->len;
451 sector_t max_io_len;
452 char *key; 357 char *key;
453 358
454 /* 359 /*
@@ -518,30 +423,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
518 } 423 }
519 424
520 key = argv[i++]; 425 key = argv[i++];
521
522 /* Parameters that take a string value are checked here. */
523 if (!strcasecmp(key, "raid10_format")) {
524 if (rs->raid_type->level != 10) {
525 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
526 return -EINVAL;
527 }
528 if (strcmp("near", argv[i])) {
529 rs->ti->error = "Invalid 'raid10_format' value given";
530 return -EINVAL;
531 }
532 raid10_format = argv[i];
533 rs->print_flags |= DMPF_RAID10_FORMAT;
534 continue;
535 }
536
537 if (strict_strtoul(argv[i], 10, &value) < 0) { 426 if (strict_strtoul(argv[i], 10, &value) < 0) {
538 rs->ti->error = "Bad numerical argument given in raid params"; 427 rs->ti->error = "Bad numerical argument given in raid params";
539 return -EINVAL; 428 return -EINVAL;
540 } 429 }
541 430
542 /* Parameters that take a numeric value are checked here */
543 if (!strcasecmp(key, "rebuild")) { 431 if (!strcasecmp(key, "rebuild")) {
544 if (value >= rs->md.raid_disks) { 432 rebuild_cnt++;
433 if (((rs->raid_type->level != 1) &&
434 (rebuild_cnt > rs->raid_type->parity_devs)) ||
435 ((rs->raid_type->level == 1) &&
436 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
437 rs->ti->error = "Too many rebuild devices specified for given RAID type";
438 return -EINVAL;
439 }
440 if (value > rs->md.raid_disks) {
545 rs->ti->error = "Invalid rebuild index given"; 441 rs->ti->error = "Invalid rebuild index given";
546 return -EINVAL; 442 return -EINVAL;
547 } 443 }
@@ -591,8 +487,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
591 */ 487 */
592 value /= 2; 488 value /= 2;
593 489
594 if ((rs->raid_type->level != 5) && 490 if (rs->raid_type->level < 5) {
595 (rs->raid_type->level != 6)) {
596 rs->ti->error = "Inappropriate argument: stripe_cache"; 491 rs->ti->error = "Inappropriate argument: stripe_cache";
597 return -EINVAL; 492 return -EINVAL;
598 } 493 }
@@ -617,14 +512,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
617 } else if (!strcasecmp(key, "region_size")) { 512 } else if (!strcasecmp(key, "region_size")) {
618 rs->print_flags |= DMPF_REGION_SIZE; 513 rs->print_flags |= DMPF_REGION_SIZE;
619 region_size = value; 514 region_size = value;
620 } else if (!strcasecmp(key, "raid10_copies") &&
621 (rs->raid_type->level == 10)) {
622 if ((value < 2) || (value > 0xFF)) {
623 rs->ti->error = "Bad value for 'raid10_copies'";
624 return -EINVAL;
625 }
626 rs->print_flags |= DMPF_RAID10_COPIES;
627 raid10_copies = value;
628 } else { 515 } else {
629 DMERR("Unable to parse RAID parameter: %s", key); 516 DMERR("Unable to parse RAID parameter: %s", key);
630 rs->ti->error = "Unable to parse RAID parameters"; 517 rs->ti->error = "Unable to parse RAID parameters";
@@ -636,36 +523,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
636 return -EINVAL; 523 return -EINVAL;
637 524
638 if (rs->md.chunk_sectors) 525 if (rs->md.chunk_sectors)
639 max_io_len = rs->md.chunk_sectors; 526 rs->ti->split_io = rs->md.chunk_sectors;
640 else 527 else
641 max_io_len = region_size; 528 rs->ti->split_io = region_size;
642
643 if (dm_set_target_max_io_len(rs->ti, max_io_len))
644 return -EINVAL;
645
646 if (rs->raid_type->level == 10) {
647 if (raid10_copies > rs->md.raid_disks) {
648 rs->ti->error = "Not enough devices to satisfy specification";
649 return -EINVAL;
650 }
651
652 /* (Len * #mirrors) / #devices */
653 sectors_per_dev = rs->ti->len * raid10_copies;
654 sector_div(sectors_per_dev, rs->md.raid_disks);
655
656 rs->md.layout = raid10_format_to_md_layout(raid10_format,
657 raid10_copies);
658 rs->md.new_layout = rs->md.layout;
659 } else if ((rs->raid_type->level > 1) &&
660 sector_div(sectors_per_dev,
661 (rs->md.raid_disks - rs->raid_type->parity_devs))) {
662 rs->ti->error = "Target length not divisible by number of data devices";
663 return -EINVAL;
664 }
665 rs->md.dev_sectors = sectors_per_dev;
666 529
667 if (validate_rebuild_devices(rs)) 530 if (rs->md.chunk_sectors)
668 return -EINVAL; 531 rs->ti->split_io = rs->md.chunk_sectors;
532 else
533 rs->ti->split_io = region_size;
669 534
670 /* Assume there are no metadata devices until the drives are parsed */ 535 /* Assume there are no metadata devices until the drives are parsed */
671 rs->md.persistent = 0; 536 rs->md.persistent = 0;
@@ -688,9 +553,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
688 if (rs->raid_type->level == 1) 553 if (rs->raid_type->level == 1)
689 return md_raid1_congested(&rs->md, bits); 554 return md_raid1_congested(&rs->md, bits);
690 555
691 if (rs->raid_type->level == 10)
692 return md_raid10_congested(&rs->md, bits);
693
694 return md_raid5_congested(&rs->md, bits); 556 return md_raid5_congested(&rs->md, bits);
695} 557}
696 558
@@ -732,7 +594,7 @@ struct dm_raid_superblock {
732 /* Always set to 0 when writing. */ 594 /* Always set to 0 when writing. */
733} __packed; 595} __packed;
734 596
735static int read_disk_sb(struct md_rdev *rdev, int size) 597static int read_disk_sb(mdk_rdev_t *rdev, int size)
736{ 598{
737 BUG_ON(!rdev->sb_page); 599 BUG_ON(!rdev->sb_page);
738 600
@@ -740,9 +602,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
740 return 0; 602 return 0;
741 603
742 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 604 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
743 DMERR("Failed to read superblock of device at position %d", 605 DMERR("Failed to read device superblock");
744 rdev->raid_disk);
745 md_error(rdev->mddev, rdev);
746 return -EINVAL; 606 return -EINVAL;
747 } 607 }
748 608
@@ -751,20 +611,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
751 return 0; 611 return 0;
752} 612}
753 613
754static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 614static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
755{ 615{
756 int i; 616 mdk_rdev_t *r, *t;
757 uint64_t failed_devices; 617 uint64_t failed_devices;
758 struct dm_raid_superblock *sb; 618 struct dm_raid_superblock *sb;
759 struct raid_set *rs = container_of(mddev, struct raid_set, md);
760 619
761 sb = page_address(rdev->sb_page); 620 sb = page_address(rdev->sb_page);
762 failed_devices = le64_to_cpu(sb->failed_devices); 621 failed_devices = le64_to_cpu(sb->failed_devices);
763 622
764 for (i = 0; i < mddev->raid_disks; i++) 623 rdev_for_each(r, t, mddev)
765 if (!rs->dev[i].data_dev || 624 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
766 test_bit(Faulty, &(rs->dev[i].rdev.flags))) 625 failed_devices |= (1ULL << r->raid_disk);
767 failed_devices |= (1ULL << i);
768 626
769 memset(sb, 0, sizeof(*sb)); 627 memset(sb, 0, sizeof(*sb));
770 628
@@ -793,7 +651,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
793 * 651 *
794 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise 652 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
795 */ 653 */
796static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) 654static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
797{ 655{
798 int ret; 656 int ret;
799 struct dm_raid_superblock *sb; 657 struct dm_raid_superblock *sb;
@@ -808,14 +666,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
808 return ret; 666 return ret;
809 667
810 sb = page_address(rdev->sb_page); 668 sb = page_address(rdev->sb_page);
811 669 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
812 /*
813 * Two cases that we want to write new superblocks and rebuild:
814 * 1) New device (no matching magic number)
815 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
816 */
817 if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
818 (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
819 super_sync(rdev->mddev, rdev); 670 super_sync(rdev->mddev, rdev);
820 671
821 set_bit(FirstUse, &rdev->flags); 672 set_bit(FirstUse, &rdev->flags);
@@ -838,7 +689,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
838 return (events_sb > events_refsb) ? 1 : 0; 689 return (events_sb > events_refsb) ? 1 : 0;
839} 690}
840 691
841static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) 692static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
842{ 693{
843 int role; 694 int role;
844 struct raid_set *rs = container_of(mddev, struct raid_set, md); 695 struct raid_set *rs = container_of(mddev, struct raid_set, md);
@@ -847,7 +698,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
847 struct dm_raid_superblock *sb; 698 struct dm_raid_superblock *sb;
848 uint32_t new_devs = 0; 699 uint32_t new_devs = 0;
849 uint32_t rebuilds = 0; 700 uint32_t rebuilds = 0;
850 struct md_rdev *r; 701 mdk_rdev_t *r, *t;
851 struct dm_raid_superblock *sb2; 702 struct dm_raid_superblock *sb2;
852 703
853 sb = page_address(rdev->sb_page); 704 sb = page_address(rdev->sb_page);
@@ -890,10 +741,13 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
890 * case the In_sync bit will /not/ be set and 741 * case the In_sync bit will /not/ be set and
891 * recovery_cp must be MaxSector. 742 * recovery_cp must be MaxSector.
892 */ 743 */
893 rdev_for_each(r, mddev) { 744 rdev_for_each(r, t, mddev) {
894 if (!test_bit(In_sync, &r->flags)) { 745 if (!test_bit(In_sync, &r->flags)) {
895 DMINFO("Device %d specified for rebuild: " 746 if (!test_bit(FirstUse, &r->flags))
896 "Clearing superblock", r->raid_disk); 747 DMERR("Superblock area of "
748 "rebuild device %d should have been "
749 "cleared.", r->raid_disk);
750 set_bit(FirstUse, &r->flags);
897 rebuilds++; 751 rebuilds++;
898 } else if (test_bit(FirstUse, &r->flags)) 752 } else if (test_bit(FirstUse, &r->flags))
899 new_devs++; 753 new_devs++;
@@ -922,7 +776,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
922 * Now we set the Faulty bit for those devices that are 776 * Now we set the Faulty bit for those devices that are
923 * recorded in the superblock as failed. 777 * recorded in the superblock as failed.
924 */ 778 */
925 rdev_for_each(r, mddev) { 779 rdev_for_each(r, t, mddev) {
926 if (!r->sb_page) 780 if (!r->sb_page)
927 continue; 781 continue;
928 sb2 = page_address(r->sb_page); 782 sb2 = page_address(r->sb_page);
@@ -955,7 +809,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
955 return 0; 809 return 0;
956} 810}
957 811
958static int super_validate(struct mddev *mddev, struct md_rdev *rdev) 812static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
959{ 813{
960 struct dm_raid_superblock *sb = page_address(rdev->sb_page); 814 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
961 815
@@ -995,43 +849,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
995static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 849static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
996{ 850{
997 int ret; 851 int ret;
998 unsigned redundancy = 0; 852 mdk_rdev_t *rdev, *freshest, *tmp;
999 struct raid_dev *dev; 853 mddev_t *mddev = &rs->md;
1000 struct md_rdev *rdev, *tmp, *freshest;
1001 struct mddev *mddev = &rs->md;
1002
1003 switch (rs->raid_type->level) {
1004 case 1:
1005 redundancy = rs->md.raid_disks - 1;
1006 break;
1007 case 4:
1008 case 5:
1009 case 6:
1010 redundancy = rs->raid_type->parity_devs;
1011 break;
1012 case 10:
1013 redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
1014 break;
1015 default:
1016 ti->error = "Unknown RAID type";
1017 return -EINVAL;
1018 }
1019 854
1020 freshest = NULL; 855 freshest = NULL;
1021 rdev_for_each_safe(rdev, tmp, mddev) { 856 rdev_for_each(rdev, tmp, mddev) {
1022 /*
1023 * Skipping super_load due to DMPF_SYNC will cause
1024 * the array to undergo initialization again as
1025 * though it were new. This is the intended effect
1026 * of the "sync" directive.
1027 *
1028 * When reshaping capability is added, we must ensure
1029 * that the "sync" directive is disallowed during the
1030 * reshape.
1031 */
1032 if (rs->print_flags & DMPF_SYNC)
1033 continue;
1034
1035 if (!rdev->meta_bdev) 857 if (!rdev->meta_bdev)
1036 continue; 858 continue;
1037 859
@@ -1044,37 +866,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
1044 case 0: 866 case 0:
1045 break; 867 break;
1046 default: 868 default:
1047 dev = container_of(rdev, struct raid_dev, rdev);
1048 if (redundancy--) {
1049 if (dev->meta_dev)
1050 dm_put_device(ti, dev->meta_dev);
1051
1052 dev->meta_dev = NULL;
1053 rdev->meta_bdev = NULL;
1054
1055 if (rdev->sb_page)
1056 put_page(rdev->sb_page);
1057
1058 rdev->sb_page = NULL;
1059
1060 rdev->sb_loaded = 0;
1061
1062 /*
1063 * We might be able to salvage the data device
1064 * even though the meta device has failed. For
1065 * now, we behave as though '- -' had been
1066 * set for this device in the table.
1067 */
1068 if (dev->data_dev)
1069 dm_put_device(ti, dev->data_dev);
1070
1071 dev->data_dev = NULL;
1072 rdev->bdev = NULL;
1073
1074 list_del(&rdev->same_set);
1075
1076 continue;
1077 }
1078 ti->error = "Failed to load superblock"; 869 ti->error = "Failed to load superblock";
1079 return ret; 870 return ret;
1080 } 871 }
@@ -1091,7 +882,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
1091 if (super_validate(mddev, freshest)) 882 if (super_validate(mddev, freshest))
1092 return -EINVAL; 883 return -EINVAL;
1093 884
1094 rdev_for_each(rdev, mddev) 885 rdev_for_each(rdev, tmp, mddev)
1095 if ((rdev != freshest) && super_validate(mddev, rdev)) 886 if ((rdev != freshest) && super_validate(mddev, rdev))
1096 return -EINVAL; 887 return -EINVAL;
1097 888
@@ -1178,7 +969,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1178 969
1179 INIT_WORK(&rs->md.event_work, do_table_event); 970 INIT_WORK(&rs->md.event_work, do_table_event);
1180 ti->private = rs; 971 ti->private = rs;
1181 ti->num_flush_requests = 1;
1182 972
1183 mutex_lock(&rs->md.reconfig_mutex); 973 mutex_lock(&rs->md.reconfig_mutex);
1184 ret = md_run(&rs->md); 974 ret = md_run(&rs->md);
@@ -1190,19 +980,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1190 goto bad; 980 goto bad;
1191 } 981 }
1192 982
1193 if (ti->len != rs->md.array_sectors) {
1194 ti->error = "Array size does not match requested target length";
1195 ret = -EINVAL;
1196 goto size_mismatch;
1197 }
1198 rs->callbacks.congested_fn = raid_is_congested; 983 rs->callbacks.congested_fn = raid_is_congested;
1199 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 984 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
1200 985
1201 mddev_suspend(&rs->md); 986 mddev_suspend(&rs->md);
1202 return 0; 987 return 0;
1203 988
1204size_mismatch:
1205 md_stop(&rs->md);
1206bad: 989bad:
1207 context_free(rs); 990 context_free(rs);
1208 991
@@ -1218,10 +1001,10 @@ static void raid_dtr(struct dm_target *ti)
1218 context_free(rs); 1001 context_free(rs);
1219} 1002}
1220 1003
1221static int raid_map(struct dm_target *ti, struct bio *bio) 1004static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
1222{ 1005{
1223 struct raid_set *rs = ti->private; 1006 struct raid_set *rs = ti->private;
1224 struct mddev *mddev = &rs->md; 1007 mddev_t *mddev = &rs->md;
1225 1008
1226 mddev->pers->make_request(mddev, bio); 1009 mddev->pers->make_request(mddev, bio);
1227 1010
@@ -1229,61 +1012,35 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
1229} 1012}
1230 1013
1231static int raid_status(struct dm_target *ti, status_type_t type, 1014static int raid_status(struct dm_target *ti, status_type_t type,
1232 unsigned status_flags, char *result, unsigned maxlen) 1015 char *result, unsigned maxlen)
1233{ 1016{
1234 struct raid_set *rs = ti->private; 1017 struct raid_set *rs = ti->private;
1235 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ 1018 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
1236 unsigned sz = 0; 1019 unsigned sz = 0;
1237 int i, array_in_sync = 0; 1020 int i;
1238 sector_t sync; 1021 sector_t sync;
1239 1022
1240 switch (type) { 1023 switch (type) {
1241 case STATUSTYPE_INFO: 1024 case STATUSTYPE_INFO:
1242 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); 1025 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
1243 1026
1027 for (i = 0; i < rs->md.raid_disks; i++) {
1028 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
1029 DMEMIT("D");
1030 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
1031 DMEMIT("A");
1032 else
1033 DMEMIT("a");
1034 }
1035
1244 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) 1036 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
1245 sync = rs->md.curr_resync_completed; 1037 sync = rs->md.curr_resync_completed;
1246 else 1038 else
1247 sync = rs->md.recovery_cp; 1039 sync = rs->md.recovery_cp;
1248 1040
1249 if (sync >= rs->md.resync_max_sectors) { 1041 if (sync > rs->md.resync_max_sectors)
1250 array_in_sync = 1;
1251 sync = rs->md.resync_max_sectors; 1042 sync = rs->md.resync_max_sectors;
1252 } else {
1253 /*
1254 * The array may be doing an initial sync, or it may
1255 * be rebuilding individual components. If all the
1256 * devices are In_sync, then it is the array that is
1257 * being initialized.
1258 */
1259 for (i = 0; i < rs->md.raid_disks; i++)
1260 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
1261 array_in_sync = 1;
1262 }
1263 /*
1264 * Status characters:
1265 * 'D' = Dead/Failed device
1266 * 'a' = Alive but not in-sync
1267 * 'A' = Alive and in-sync
1268 */
1269 for (i = 0; i < rs->md.raid_disks; i++) {
1270 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
1271 DMEMIT("D");
1272 else if (!array_in_sync ||
1273 !test_bit(In_sync, &rs->dev[i].rdev.flags))
1274 DMEMIT("a");
1275 else
1276 DMEMIT("A");
1277 }
1278 1043
1279 /*
1280 * In-sync ratio:
1281 * The in-sync ratio shows the progress of:
1282 * - Initializing the array
1283 * - Rebuilding a subset of devices of the array
1284 * The user can distinguish between the two by referring
1285 * to the status characters.
1286 */
1287 DMEMIT(" %llu/%llu", 1044 DMEMIT(" %llu/%llu",
1288 (unsigned long long) sync, 1045 (unsigned long long) sync,
1289 (unsigned long long) rs->md.resync_max_sectors); 1046 (unsigned long long) rs->md.resync_max_sectors);
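
In the newer (left-hand) STATUSTYPE_INFO code above, the per-device character depends on both the device flags and whether the array as a whole is in sync: 'D' for a failed device, 'a' for alive but not yet in sync, 'A' for alive and in sync. A rough user-space sketch of that selection logic follows; the struct and field names are illustrative, not the kernel's.

#include <stdio.h>

struct dev_state {
	int faulty;
	int in_sync;
};

static char status_char(const struct dev_state *d, int array_in_sync)
{
	if (d->faulty)
		return 'D';
	if (!array_in_sync || !d->in_sync)
		return 'a';
	return 'A';
}

int main(void)
{
	struct dev_state devs[] = { {0, 1}, {0, 0}, {1, 0} };
	int array_in_sync = 0;  /* e.g. resync still running */

	for (unsigned i = 0; i < sizeof(devs) / sizeof(devs[0]); i++)
		putchar(status_char(&devs[i], array_in_sync));
	putchar('\n');          /* "aaD" while syncing, "AaD" once array_in_sync */
	return 0;
}
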
@@ -1301,7 +1058,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1301 raid_param_cnt += 2; 1058 raid_param_cnt += 2;
1302 } 1059 }
1303 1060
1304 raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); 1061 raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
1305 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) 1062 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
1306 raid_param_cnt--; 1063 raid_param_cnt--;
1307 1064
@@ -1340,7 +1097,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1340 rs->md.bitmap_info.max_write_behind); 1097 rs->md.bitmap_info.max_write_behind);
1341 1098
1342 if (rs->print_flags & DMPF_STRIPE_CACHE) { 1099 if (rs->print_flags & DMPF_STRIPE_CACHE) {
1343 struct r5conf *conf = rs->md.private; 1100 raid5_conf_t *conf = rs->md.private;
1344 1101
1345 /* convert from kiB to sectors */ 1102 /* convert from kiB to sectors */
1346 DMEMIT(" stripe_cache %d", 1103 DMEMIT(" stripe_cache %d",
@@ -1351,13 +1108,6 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1351 DMEMIT(" region_size %lu", 1108 DMEMIT(" region_size %lu",
1352 rs->md.bitmap_info.chunksize >> 9); 1109 rs->md.bitmap_info.chunksize >> 9);
1353 1110
1354 if (rs->print_flags & DMPF_RAID10_COPIES)
1355 DMEMIT(" raid10_copies %u",
1356 raid10_md_layout_to_copies(rs->md.layout));
1357
1358 if (rs->print_flags & DMPF_RAID10_FORMAT)
1359 DMEMIT(" raid10_format near");
1360
1361 DMEMIT(" %d", rs->md.raid_disks); 1111 DMEMIT(" %d", rs->md.raid_disks);
1362 for (i = 0; i < rs->md.raid_disks; i++) { 1112 for (i = 0; i < rs->md.raid_disks; i++) {
1363 if (rs->dev[i].meta_dev) 1113 if (rs->dev[i].meta_dev)
@@ -1396,7 +1146,7 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
1396{ 1146{
1397 struct raid_set *rs = ti->private; 1147 struct raid_set *rs = ti->private;
1398 unsigned chunk_size = rs->md.chunk_sectors << 9; 1148 unsigned chunk_size = rs->md.chunk_sectors << 9;
1399 struct r5conf *conf = rs->md.private; 1149 raid5_conf_t *conf = rs->md.private;
1400 1150
1401 blk_limits_io_min(limits, chunk_size); 1151 blk_limits_io_min(limits, chunk_size);
1402 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); 1152 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
@@ -1420,19 +1170,13 @@ static void raid_resume(struct dm_target *ti)
1420{ 1170{
1421 struct raid_set *rs = ti->private; 1171 struct raid_set *rs = ti->private;
1422 1172
1423 set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1173 bitmap_load(&rs->md);
1424 if (!rs->bitmap_loaded) {
1425 bitmap_load(&rs->md);
1426 rs->bitmap_loaded = 1;
1427 }
1428
1429 clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
1430 mddev_resume(&rs->md); 1174 mddev_resume(&rs->md);
1431} 1175}
1432 1176
1433static struct target_type raid_target = { 1177static struct target_type raid_target = {
1434 .name = "raid", 1178 .name = "raid",
1435 .version = {1, 4, 0}, 1179 .version = {1, 1, 0},
1436 .module = THIS_MODULE, 1180 .module = THIS_MODULE,
1437 .ctr = raid_ctr, 1181 .ctr = raid_ctr,
1438 .dtr = raid_dtr, 1182 .dtr = raid_dtr,
@@ -1459,8 +1203,6 @@ module_init(dm_raid_init);
1459module_exit(dm_raid_exit); 1203module_exit(dm_raid_exit);
1460 1204
1461MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1205MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
1462MODULE_ALIAS("dm-raid1");
1463MODULE_ALIAS("dm-raid10");
1464MODULE_ALIAS("dm-raid4"); 1206MODULE_ALIAS("dm-raid4");
1465MODULE_ALIAS("dm-raid5"); 1207MODULE_ALIAS("dm-raid5");
1466MODULE_ALIAS("dm-raid6"); 1208MODULE_ALIAS("dm-raid6");
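
The right-hand (older) context_alloc() above moves the divisibility check up front: the target length must split evenly across the data devices (raid_devs minus parity_devs) before each member's dev_sectors can be set. A rough user-space model of that check; in the kernel sector_div() divides in place and returns the remainder, plain 64-bit division stands in for it here.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Returns 0 and writes the per-device size on success, -1 on failure. */
static int split_target(sector_t target_len, unsigned raid_devs,
			unsigned parity_devs, sector_t *sectors_per_dev)
{
	unsigned data_devs = raid_devs - parity_devs;

	if (data_devs == 0 || target_len % data_devs) {
		fprintf(stderr, "Target length not divisible by number of data devices\n");
		return -1;
	}
	*sectors_per_dev = target_len / data_devs;
	return 0;
}

int main(void)
{
	sector_t per_dev;

	/* 6-device raid6 (2 parity devices): 1 GiB target, 4 data devices. */
	if (!split_target(2097152, 6, 2, &per_dev))
		printf("each data device holds %llu sectors\n",
		       (unsigned long long)per_dev);

	/* A length that does not divide evenly is rejected, as in the diff. */
	split_target(2097153, 6, 2, &per_dev);
	return 0;
}
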
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa519185ebb..9bfd057be68 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -61,6 +61,7 @@ struct mirror_set {
61 struct dm_region_hash *rh; 61 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 62 struct dm_kcopyd_client *kcopyd_client;
63 struct dm_io_client *io_client; 63 struct dm_io_client *io_client;
64 mempool_t *read_record_pool;
64 65
65 /* recovery */ 66 /* recovery */
66 region_t nr_regions; 67 region_t nr_regions;
@@ -138,13 +139,14 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
138 queue_bio(ms, bio, WRITE); 139 queue_bio(ms, bio, WRITE);
139} 140}
140 141
141struct dm_raid1_bio_record { 142#define MIN_READ_RECORDS 20
143struct dm_raid1_read_record {
142 struct mirror *m; 144 struct mirror *m;
143 /* if details->bi_bdev == NULL, details were not saved */
144 struct dm_bio_details details; 145 struct dm_bio_details details;
145 region_t write_region;
146}; 146};
147 147
148static struct kmem_cache *_dm_raid1_read_record_cache;
149
148/* 150/*
149 * Every mirror should look like this one. 151 * Every mirror should look like this one.
150 */ 152 */
@@ -874,9 +876,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
874 atomic_set(&ms->suspend, 0); 876 atomic_set(&ms->suspend, 0);
875 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 877 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
876 878
879 ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
880 _dm_raid1_read_record_cache);
881
882 if (!ms->read_record_pool) {
883 ti->error = "Error creating mirror read_record_pool";
884 kfree(ms);
885 return NULL;
886 }
887
877 ms->io_client = dm_io_client_create(); 888 ms->io_client = dm_io_client_create();
878 if (IS_ERR(ms->io_client)) { 889 if (IS_ERR(ms->io_client)) {
879 ti->error = "Error creating dm_io client"; 890 ti->error = "Error creating dm_io client";
891 mempool_destroy(ms->read_record_pool);
880 kfree(ms); 892 kfree(ms);
881 return NULL; 893 return NULL;
882 } 894 }
@@ -888,6 +900,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
888 if (IS_ERR(ms->rh)) { 900 if (IS_ERR(ms->rh)) {
889 ti->error = "Error creating dirty region hash"; 901 ti->error = "Error creating dirty region hash";
890 dm_io_client_destroy(ms->io_client); 902 dm_io_client_destroy(ms->io_client);
903 mempool_destroy(ms->read_record_pool);
891 kfree(ms); 904 kfree(ms);
892 return NULL; 905 return NULL;
893 } 906 }
@@ -903,6 +916,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
903 916
904 dm_io_client_destroy(ms->io_client); 917 dm_io_client_destroy(ms->io_client);
905 dm_region_hash_destroy(ms->rh); 918 dm_region_hash_destroy(ms->rh);
919 mempool_destroy(ms->read_record_pool);
906 kfree(ms); 920 kfree(ms);
907} 921}
908 922
@@ -910,9 +924,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
910 unsigned int mirror, char **argv) 924 unsigned int mirror, char **argv)
911{ 925{
912 unsigned long long offset; 926 unsigned long long offset;
913 char dummy;
914 927
915 if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { 928 if (sscanf(argv[1], "%llu", &offset) != 1) {
916 ti->error = "Invalid offset"; 929 ti->error = "Invalid offset";
917 return -EINVAL; 930 return -EINVAL;
918 } 931 }
@@ -940,14 +953,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
940{ 953{
941 unsigned param_count; 954 unsigned param_count;
942 struct dm_dirty_log *dl; 955 struct dm_dirty_log *dl;
943 char dummy;
944 956
945 if (argc < 2) { 957 if (argc < 2) {
946 ti->error = "Insufficient mirror log arguments"; 958 ti->error = "Insufficient mirror log arguments";
947 return NULL; 959 return NULL;
948 } 960 }
949 961
950 if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) { 962 if (sscanf(argv[1], "%u", &param_count) != 1) {
951 ti->error = "Invalid mirror log argument count"; 963 ti->error = "Invalid mirror log argument count";
952 return NULL; 964 return NULL;
953 } 965 }
@@ -974,14 +986,13 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
974{ 986{
975 unsigned num_features; 987 unsigned num_features;
976 struct dm_target *ti = ms->ti; 988 struct dm_target *ti = ms->ti;
977 char dummy;
978 989
979 *args_used = 0; 990 *args_used = 0;
980 991
981 if (!argc) 992 if (!argc)
982 return 0; 993 return 0;
983 994
984 if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { 995 if (sscanf(argv[0], "%u", &num_features) != 1) {
985 ti->error = "Invalid number of features"; 996 ti->error = "Invalid number of features";
986 return -EINVAL; 997 return -EINVAL;
987 } 998 }
@@ -1025,7 +1036,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1025 unsigned int nr_mirrors, m, args_used; 1036 unsigned int nr_mirrors, m, args_used;
1026 struct mirror_set *ms; 1037 struct mirror_set *ms;
1027 struct dm_dirty_log *dl; 1038 struct dm_dirty_log *dl;
1028 char dummy;
1029 1039
1030 dl = create_dirty_log(ti, argc, argv, &args_used); 1040 dl = create_dirty_log(ti, argc, argv, &args_used);
1031 if (!dl) 1041 if (!dl)
@@ -1034,7 +1044,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1034 argv += args_used; 1044 argv += args_used;
1035 argc -= args_used; 1045 argc -= args_used;
1036 1046
1037 if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || 1047 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1038 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 1048 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1039 ti->error = "Invalid number of mirrors"; 1049 ti->error = "Invalid number of mirrors";
1040 dm_dirty_log_destroy(dl); 1050 dm_dirty_log_destroy(dl);
@@ -1067,15 +1077,9 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1067 } 1077 }
1068 1078
1069 ti->private = ms; 1079 ti->private = ms;
1070 1080 ti->split_io = dm_rh_get_region_size(ms->rh);
1071 r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
1072 if (r)
1073 goto err_free_context;
1074
1075 ti->num_flush_requests = 1; 1081 ti->num_flush_requests = 1;
1076 ti->num_discard_requests = 1; 1082 ti->num_discard_requests = 1;
1077 ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
1078 ti->discard_zeroes_data_unsupported = true;
1079 1083
1080 ms->kmirrord_wq = alloc_workqueue("kmirrord", 1084 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1081 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1085 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1133,7 +1137,7 @@ static void mirror_dtr(struct dm_target *ti)
1133 1137
1134 del_timer_sync(&ms->timer); 1138 del_timer_sync(&ms->timer);
1135 flush_workqueue(ms->kmirrord_wq); 1139 flush_workqueue(ms->kmirrord_wq);
1136 flush_work(&ms->trigger_event); 1140 flush_work_sync(&ms->trigger_event);
1137 dm_kcopyd_client_destroy(ms->kcopyd_client); 1141 dm_kcopyd_client_destroy(ms->kcopyd_client);
1138 destroy_workqueue(ms->kmirrord_wq); 1142 destroy_workqueue(ms->kmirrord_wq);
1139 free_context(ms, ti, ms->nr_mirrors); 1143 free_context(ms, ti, ms->nr_mirrors);
@@ -1142,20 +1146,18 @@ static void mirror_dtr(struct dm_target *ti)
1142/* 1146/*
1143 * Mirror mapping function 1147 * Mirror mapping function
1144 */ 1148 */
1145static int mirror_map(struct dm_target *ti, struct bio *bio) 1149static int mirror_map(struct dm_target *ti, struct bio *bio,
1150 union map_info *map_context)
1146{ 1151{
1147 int r, rw = bio_rw(bio); 1152 int r, rw = bio_rw(bio);
1148 struct mirror *m; 1153 struct mirror *m;
1149 struct mirror_set *ms = ti->private; 1154 struct mirror_set *ms = ti->private;
1155 struct dm_raid1_read_record *read_record = NULL;
1150 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1156 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1151 struct dm_raid1_bio_record *bio_record =
1152 dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
1153
1154 bio_record->details.bi_bdev = NULL;
1155 1157
1156 if (rw == WRITE) { 1158 if (rw == WRITE) {
1157 /* Save region for mirror_end_io() handler */ 1159 /* Save region for mirror_end_io() handler */
1158 bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio); 1160 map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
1159 queue_bio(ms, bio, rw); 1161 queue_bio(ms, bio, rw);
1160 return DM_MAPIO_SUBMITTED; 1162 return DM_MAPIO_SUBMITTED;
1161 } 1163 }
@@ -1183,29 +1185,33 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1183 if (unlikely(!m)) 1185 if (unlikely(!m))
1184 return -EIO; 1186 return -EIO;
1185 1187
1186 dm_bio_record(&bio_record->details, bio); 1188 read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
1187 bio_record->m = m; 1189 if (likely(read_record)) {
1190 dm_bio_record(&read_record->details, bio);
1191 map_context->ptr = read_record;
1192 read_record->m = m;
1193 }
1188 1194
1189 map_bio(m, bio); 1195 map_bio(m, bio);
1190 1196
1191 return DM_MAPIO_REMAPPED; 1197 return DM_MAPIO_REMAPPED;
1192} 1198}
1193 1199
1194static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) 1200static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1201 int error, union map_info *map_context)
1195{ 1202{
1196 int rw = bio_rw(bio); 1203 int rw = bio_rw(bio);
1197 struct mirror_set *ms = (struct mirror_set *) ti->private; 1204 struct mirror_set *ms = (struct mirror_set *) ti->private;
1198 struct mirror *m = NULL; 1205 struct mirror *m = NULL;
1199 struct dm_bio_details *bd = NULL; 1206 struct dm_bio_details *bd = NULL;
1200 struct dm_raid1_bio_record *bio_record = 1207 struct dm_raid1_read_record *read_record = map_context->ptr;
1201 dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
1202 1208
1203 /* 1209 /*
1204 * We need to dec pending if this was a write. 1210 * We need to dec pending if this was a write.
1205 */ 1211 */
1206 if (rw == WRITE) { 1212 if (rw == WRITE) {
1207 if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) 1213 if (!(bio->bi_rw & REQ_FLUSH))
1208 dm_rh_dec(ms->rh, bio_record->write_region); 1214 dm_rh_dec(ms->rh, map_context->ll);
1209 return error; 1215 return error;
1210 } 1216 }
1211 1217
@@ -1216,7 +1222,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1216 goto out; 1222 goto out;
1217 1223
1218 if (unlikely(error)) { 1224 if (unlikely(error)) {
1219 if (!bio_record->details.bi_bdev) { 1225 if (!read_record) {
1220 /* 1226 /*
1221 * There wasn't enough memory to record necessary 1227 * There wasn't enough memory to record necessary
1222 * information for a retry or there was no other 1228 * information for a retry or there was no other
@@ -1226,7 +1232,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1226 return -EIO; 1232 return -EIO;
1227 } 1233 }
1228 1234
1229 m = bio_record->m; 1235 m = read_record->m;
1230 1236
1231 DMERR("Mirror read failed from %s. Trying alternative device.", 1237 DMERR("Mirror read failed from %s. Trying alternative device.",
1232 m->dev->name); 1238 m->dev->name);
@@ -1238,18 +1244,22 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1238 * mirror. 1244 * mirror.
1239 */ 1245 */
1240 if (default_ok(m) || mirror_available(ms, bio)) { 1246 if (default_ok(m) || mirror_available(ms, bio)) {
1241 bd = &bio_record->details; 1247 bd = &read_record->details;
1242 1248
1243 dm_bio_restore(bd, bio); 1249 dm_bio_restore(bd, bio);
1244 bio_record->details.bi_bdev = NULL; 1250 mempool_free(read_record, ms->read_record_pool);
1251 map_context->ptr = NULL;
1245 queue_bio(ms, bio, rw); 1252 queue_bio(ms, bio, rw);
1246 return DM_ENDIO_INCOMPLETE; 1253 return 1;
1247 } 1254 }
1248 DMERR("All replicated volumes dead, failing I/O"); 1255 DMERR("All replicated volumes dead, failing I/O");
1249 } 1256 }
1250 1257
1251out: 1258out:
1252 bio_record->details.bi_bdev = NULL; 1259 if (read_record) {
1260 mempool_free(read_record, ms->read_record_pool);
1261 map_context->ptr = NULL;
1262 }
1253 1263
1254 return error; 1264 return error;
1255} 1265}
@@ -1348,7 +1358,7 @@ static char device_status_char(struct mirror *m)
1348 1358
1349 1359
1350static int mirror_status(struct dm_target *ti, status_type_t type, 1360static int mirror_status(struct dm_target *ti, status_type_t type,
1351 unsigned status_flags, char *result, unsigned maxlen) 1361 char *result, unsigned int maxlen)
1352{ 1362{
1353 unsigned int m, sz = 0; 1363 unsigned int m, sz = 0;
1354 struct mirror_set *ms = (struct mirror_set *) ti->private; 1364 struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1403,7 +1413,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1403 1413
1404static struct target_type mirror_target = { 1414static struct target_type mirror_target = {
1405 .name = "mirror", 1415 .name = "mirror",
1406 .version = {1, 13, 1}, 1416 .version = {1, 12, 1},
1407 .module = THIS_MODULE, 1417 .module = THIS_MODULE,
1408 .ctr = mirror_ctr, 1418 .ctr = mirror_ctr,
1409 .dtr = mirror_dtr, 1419 .dtr = mirror_dtr,
@@ -1420,6 +1430,13 @@ static int __init dm_mirror_init(void)
1420{ 1430{
1421 int r; 1431 int r;
1422 1432
1433 _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
1434 if (!_dm_raid1_read_record_cache) {
1435 DMERR("Can't allocate dm_raid1_read_record cache");
1436 r = -ENOMEM;
1437 goto bad_cache;
1438 }
1439
1423 r = dm_register_target(&mirror_target); 1440 r = dm_register_target(&mirror_target);
1424 if (r < 0) { 1441 if (r < 0) {
1425 DMERR("Failed to register mirror target"); 1442 DMERR("Failed to register mirror target");
@@ -1429,12 +1446,15 @@ static int __init dm_mirror_init(void)
1429 return 0; 1446 return 0;
1430 1447
1431bad_target: 1448bad_target:
1449 kmem_cache_destroy(_dm_raid1_read_record_cache);
1450bad_cache:
1432 return r; 1451 return r;
1433} 1452}
1434 1453
1435static void __exit dm_mirror_exit(void) 1454static void __exit dm_mirror_exit(void)
1436{ 1455{
1437 dm_unregister_target(&mirror_target); 1456 dm_unregister_target(&mirror_target);
1457 kmem_cache_destroy(_dm_raid1_read_record_cache);
1438} 1458}
1439 1459
1440/* Module hooks */ 1460/* Module hooks */
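
The older (right-hand) mirror code above keeps read retry state in a dedicated slab cache plus mempool (MIN_READ_RECORDS entries) rather than in per-bio data, so a bounded number of records stays available even under memory pressure. A minimal user-space sketch of the reserve-pool idea; this models only the guaranteed free list, not the kernel mempool API, and all names are illustrative.

#include <stdlib.h>

struct read_record {
	int mirror;               /* which mirror served the read */
	struct read_record *next; /* free-list linkage */
};

struct record_pool {
	struct read_record *free;
};

static int pool_init(struct record_pool *p, unsigned nr)
{
	p->free = NULL;
	while (nr--) {
		struct read_record *r = malloc(sizeof(*r));
		if (!r)
			return -1;
		r->next = p->free;
		p->free = r;
	}
	return 0;
}

static struct read_record *pool_alloc(struct record_pool *p)
{
	struct read_record *r = p->free;
	if (r)
		p->free = r->next;
	return r;                 /* NULL once the reserve is exhausted */
}

static void pool_free(struct record_pool *p, struct read_record *r)
{
	r->next = p->free;
	p->free = r;
}

int main(void)
{
	struct record_pool pool;
	struct read_record *r;

	if (pool_init(&pool, 20))     /* like MIN_READ_RECORDS */
		return 1;
	r = pool_alloc(&pool);
	if (r) {
		r->mirror = 0;
		pool_free(&pool, r);  /* returned to the reserve for reuse */
	}
	return 0;
}
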
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 69732e03eb3..7771ed21218 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,9 +404,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
404 return; 404 return;
405 } 405 }
406 406
407 if (bio->bi_rw & REQ_DISCARD)
408 return;
409
410 /* We must inform the log that the sync count has changed. */ 407 /* We must inform the log that the sync count has changed. */
411 log->type->set_region_sync(log, region, 0); 408 log->type->set_region_sync(log, region, 0);
412 409
@@ -527,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
527 struct bio *bio; 524 struct bio *bio;
528 525
529 for (bio = bios->head; bio; bio = bio->bi_next) { 526 for (bio = bios->head; bio; bio = bio->bi_next) {
530 if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) 527 if (bio->bi_rw & REQ_FLUSH)
531 continue; 528 continue;
532 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
533 } 530 }
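
dm_rh_inc_pending() walks the queued bios and bumps a per-region pending count, skipping requests that carry no data payload; the newer left-hand code skips discards as well as flushes. A toy user-space model of that loop, assuming made-up flag bits and region arithmetic:

#include <stdio.h>

#define REQ_FLUSH    (1u << 0)  /* toy values, not the kernel's */
#define REQ_DISCARD  (1u << 1)
#define REGION_SHIFT 3          /* 8 sectors per region in this toy */

struct bio { unsigned long rw; unsigned long sector; struct bio *next; };

static unsigned pending[16];

static void inc_pending(struct bio *head, unsigned skip_mask)
{
	for (struct bio *b = head; b; b = b->next) {
		if (b->rw & skip_mask)
			continue;
		pending[b->sector >> REGION_SHIFT]++;
	}
}

int main(void)
{
	struct bio flush = { REQ_FLUSH, 0, NULL };
	struct bio write = { 0, 17, &flush };

	/* Newer code skips flushes and discards; older code skips only flushes. */
	inc_pending(&write, REQ_FLUSH | REQ_DISCARD);
	printf("region 2 pending: %u\n", pending[17 >> REGION_SHIFT]); /* 1 */
	return 0;
}
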
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6ab1192cdd5..24752f449be 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,7 +14,6 @@
14#include "dm-path-selector.h" 14#include "dm-path-selector.h"
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h>
18 17
19#define DM_MSG_PREFIX "multipath round-robin" 18#define DM_MSG_PREFIX "multipath round-robin"
20 19
@@ -114,7 +113,6 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
114 struct selector *s = (struct selector *) ps->context; 113 struct selector *s = (struct selector *) ps->context;
115 struct path_info *pi; 114 struct path_info *pi;
116 unsigned repeat_count = RR_MIN_IO; 115 unsigned repeat_count = RR_MIN_IO;
117 char dummy;
118 116
119 if (argc > 1) { 117 if (argc > 1) {
120 *error = "round-robin ps: incorrect number of arguments"; 118 *error = "round-robin ps: incorrect number of arguments";
@@ -122,7 +120,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
122 } 120 }
123 121
124 /* First path argument is number of I/Os before switching path */ 122 /* First path argument is number of I/Os before switching path */
125 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 123 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
126 *error = "round-robin ps: invalid repeat count"; 124 *error = "round-robin ps: invalid repeat count";
127 return -EINVAL; 125 return -EINVAL;
128 } 126 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 9df8f6bd641..9c6c2e47ad6 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -12,7 +12,6 @@
12#include "dm-path-selector.h" 12#include "dm-path-selector.h"
13 13
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/module.h>
16 15
17#define DM_MSG_PREFIX "multipath service-time" 16#define DM_MSG_PREFIX "multipath service-time"
18#define ST_MIN_IO 1 17#define ST_MIN_IO 1
@@ -110,7 +109,6 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
110 struct path_info *pi; 109 struct path_info *pi;
111 unsigned repeat_count = ST_MIN_IO; 110 unsigned repeat_count = ST_MIN_IO;
112 unsigned relative_throughput = 1; 111 unsigned relative_throughput = 1;
113 char dummy;
114 112
115 /* 113 /*
116 * Arguments: [<repeat_count> [<relative_throughput>]] 114 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -129,13 +127,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
129 return -EINVAL; 127 return -EINVAL;
130 } 128 }
131 129
132 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 130 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
133 *error = "service-time ps: invalid repeat count"; 131 *error = "service-time ps: invalid repeat count";
134 return -EINVAL; 132 return -EINVAL;
135 } 133 }
136 134
137 if ((argc == 2) && 135 if ((argc == 2) &&
138 (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || 136 (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
139 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 137 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
140 *error = "service-time ps: invalid relative_throughput value"; 138 *error = "service-time ps: invalid relative_throughput value";
141 return -EINVAL; 139 return -EINVAL;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3ac415675b6..d1f1d701710 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -10,7 +10,6 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
13#include <linux/export.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/dm-io.h> 14#include <linux/dm-io.h>
16 15
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 1ce9a2586e4..a0898a66a2f 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -10,7 +10,6 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
13#include <linux/export.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/dm-io.h> 14#include <linux/dm-io.h>
16 15
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 59fc18ae52c..6f758870fc1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -79,6 +79,7 @@ struct dm_snapshot {
79 79
80 /* Chunks with outstanding reads */ 80 /* Chunks with outstanding reads */
81 spinlock_t tracked_chunk_lock; 81 spinlock_t tracked_chunk_lock;
82 mempool_t *tracked_chunk_pool;
82 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 83 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
83 84
84 /* The on disk metadata handler */ 85 /* The on disk metadata handler */
@@ -190,38 +191,35 @@ struct dm_snap_tracked_chunk {
190 chunk_t chunk; 191 chunk_t chunk;
191}; 192};
192 193
193static void init_tracked_chunk(struct bio *bio) 194static struct kmem_cache *tracked_chunk_cache;
194{
195 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
196 INIT_HLIST_NODE(&c->node);
197}
198
199static bool is_bio_tracked(struct bio *bio)
200{
201 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
202 return !hlist_unhashed(&c->node);
203}
204 195
205static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk) 196static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s,
197 chunk_t chunk)
206{ 198{
207 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 199 struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool,
200 GFP_NOIO);
201 unsigned long flags;
208 202
209 c->chunk = chunk; 203 c->chunk = chunk;
210 204
211 spin_lock_irq(&s->tracked_chunk_lock); 205 spin_lock_irqsave(&s->tracked_chunk_lock, flags);
212 hlist_add_head(&c->node, 206 hlist_add_head(&c->node,
213 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); 207 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
214 spin_unlock_irq(&s->tracked_chunk_lock); 208 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
209
210 return c;
215} 211}
216 212
217static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) 213static void stop_tracking_chunk(struct dm_snapshot *s,
214 struct dm_snap_tracked_chunk *c)
218{ 215{
219 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
220 unsigned long flags; 216 unsigned long flags;
221 217
222 spin_lock_irqsave(&s->tracked_chunk_lock, flags); 218 spin_lock_irqsave(&s->tracked_chunk_lock, flags);
223 hlist_del(&c->node); 219 hlist_del(&c->node);
224 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); 220 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
221
222 mempool_free(c, s->tracked_chunk_pool);
225} 223}
226 224
227static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) 225static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
@@ -693,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
693 * Return a minimum chunk size of all snapshots that have the specified origin. 691 * Return a minimum chunk size of all snapshots that have the specified origin.
694 * Return zero if the origin has no snapshots. 692 * Return zero if the origin has no snapshots.
695 */ 693 */
696static uint32_t __minimum_chunk_size(struct origin *o) 694static sector_t __minimum_chunk_size(struct origin *o)
697{ 695{
698 struct dm_snapshot *snap; 696 struct dm_snapshot *snap;
699 unsigned chunk_size = 0; 697 unsigned chunk_size = 0;
@@ -703,7 +701,7 @@ static uint32_t __minimum_chunk_size(struct origin *o)
703 chunk_size = min_not_zero(chunk_size, 701 chunk_size = min_not_zero(chunk_size,
704 snap->store->chunk_size); 702 snap->store->chunk_size);
705 703
706 return (uint32_t) chunk_size; 704 return chunk_size;
707} 705}
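__minimum_chunk_size() above folds the chunk sizes of all snapshots on an origin together with min_not_zero(), so an unset (zero) chunk size never wins and zero is returned only when the origin has no snapshots at all. A small standalone sketch of that helper's semantics (a local re-implementation for illustration, not the kernel macro):

#include <stdio.h>

/* Smaller of a and b, except that 0 means "no value" and is ignored. */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
    if (a == 0)
        return b;
    if (b == 0)
        return a;
    return a < b ? a : b;
}

int main(void)
{
    unsigned int chunk_sizes[] = { 0, 16, 0, 8, 64 };  /* sectors; 0 = unset */
    unsigned int min = 0;

    for (unsigned int i = 0; i < sizeof(chunk_sizes) / sizeof(chunk_sizes[0]); i++)
        min = min_not_zero(min, chunk_sizes[i]);

    printf("minimum chunk size: %u\n", min);  /* prints 8 */
    return 0;
}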
708 706
709/* 707/*
@@ -1122,6 +1120,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1122 goto bad_pending_pool; 1120 goto bad_pending_pool;
1123 } 1121 }
1124 1122
1123 s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
1124 tracked_chunk_cache);
1125 if (!s->tracked_chunk_pool) {
1126 ti->error = "Could not allocate tracked_chunk mempool for "
1127 "tracking reads";
1128 goto bad_tracked_chunk_pool;
1129 }
1130
1125 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1131 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1126 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); 1132 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
1127 1133
@@ -1129,7 +1135,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1129 1135
1130 ti->private = s; 1136 ti->private = s;
1131 ti->num_flush_requests = num_flush_requests; 1137 ti->num_flush_requests = num_flush_requests;
1132 ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
1133 1138
1134 /* Add snapshot to the list of snapshots for this origin */ 1139 /* Add snapshot to the list of snapshots for this origin */
1135 /* Exceptions aren't triggered till snapshot_resume() is called */ 1140 /* Exceptions aren't triggered till snapshot_resume() is called */
@@ -1167,10 +1172,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1167 ti->error = "Chunk size not set"; 1172 ti->error = "Chunk size not set";
1168 goto bad_read_metadata; 1173 goto bad_read_metadata;
1169 } 1174 }
1170 1175 ti->split_io = s->store->chunk_size;
1171 r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1172 if (r)
1173 goto bad_read_metadata;
1174 1176
1175 return 0; 1177 return 0;
1176 1178
@@ -1178,6 +1180,9 @@ bad_read_metadata:
1178 unregister_snapshot(s); 1180 unregister_snapshot(s);
1179 1181
1180bad_load_and_register: 1182bad_load_and_register:
1183 mempool_destroy(s->tracked_chunk_pool);
1184
1185bad_tracked_chunk_pool:
1181 mempool_destroy(s->pending_pool); 1186 mempool_destroy(s->pending_pool);
1182 1187
1183bad_pending_pool: 1188bad_pending_pool:
@@ -1234,7 +1239,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
1234 snap_dest->store->snap = snap_dest; 1239 snap_dest->store->snap = snap_dest;
1235 snap_src->store->snap = snap_src; 1240 snap_src->store->snap = snap_src;
1236 1241
1237 snap_dest->ti->max_io_len = snap_dest->store->chunk_size; 1242 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1238 snap_dest->valid = snap_src->valid; 1243 snap_dest->valid = snap_src->valid;
1239 1244
1240 /* 1245 /*
@@ -1282,6 +1287,8 @@ static void snapshot_dtr(struct dm_target *ti)
1282 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); 1287 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
1283#endif 1288#endif
1284 1289
1290 mempool_destroy(s->tracked_chunk_pool);
1291
1285 __free_exceptions(s); 1292 __free_exceptions(s);
1286 1293
1287 mempool_destroy(s->pending_pool); 1294 mempool_destroy(s->pending_pool);
@@ -1567,7 +1574,8 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1567 s->store->chunk_mask); 1574 s->store->chunk_mask);
1568} 1575}
1569 1576
1570static int snapshot_map(struct dm_target *ti, struct bio *bio) 1577static int snapshot_map(struct dm_target *ti, struct bio *bio,
1578 union map_info *map_context)
1571{ 1579{
1572 struct dm_exception *e; 1580 struct dm_exception *e;
1573 struct dm_snapshot *s = ti->private; 1581 struct dm_snapshot *s = ti->private;
@@ -1575,8 +1583,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1575 chunk_t chunk; 1583 chunk_t chunk;
1576 struct dm_snap_pending_exception *pe = NULL; 1584 struct dm_snap_pending_exception *pe = NULL;
1577 1585
1578 init_tracked_chunk(bio);
1579
1580 if (bio->bi_rw & REQ_FLUSH) { 1586 if (bio->bi_rw & REQ_FLUSH) {
1581 bio->bi_bdev = s->cow->bdev; 1587 bio->bi_bdev = s->cow->bdev;
1582 return DM_MAPIO_REMAPPED; 1588 return DM_MAPIO_REMAPPED;
@@ -1661,7 +1667,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1661 } 1667 }
1662 } else { 1668 } else {
1663 bio->bi_bdev = s->origin->bdev; 1669 bio->bi_bdev = s->origin->bdev;
1664 track_chunk(s, bio, chunk); 1670 map_context->ptr = track_chunk(s, chunk);
1665 } 1671 }
1666 1672
1667out_unlock: 1673out_unlock:
@@ -1682,20 +1688,20 @@ out:
1682 * If merging is currently taking place on the chunk in question, the 1688 * If merging is currently taking place on the chunk in question, the
1683 * I/O is deferred by adding it to s->bios_queued_during_merge. 1689 * I/O is deferred by adding it to s->bios_queued_during_merge.
1684 */ 1690 */
1685static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) 1691static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1692 union map_info *map_context)
1686{ 1693{
1687 struct dm_exception *e; 1694 struct dm_exception *e;
1688 struct dm_snapshot *s = ti->private; 1695 struct dm_snapshot *s = ti->private;
1689 int r = DM_MAPIO_REMAPPED; 1696 int r = DM_MAPIO_REMAPPED;
1690 chunk_t chunk; 1697 chunk_t chunk;
1691 1698
1692 init_tracked_chunk(bio);
1693
1694 if (bio->bi_rw & REQ_FLUSH) { 1699 if (bio->bi_rw & REQ_FLUSH) {
1695 if (!dm_bio_get_target_request_nr(bio)) 1700 if (!map_context->target_request_nr)
1696 bio->bi_bdev = s->origin->bdev; 1701 bio->bi_bdev = s->origin->bdev;
1697 else 1702 else
1698 bio->bi_bdev = s->cow->bdev; 1703 bio->bi_bdev = s->cow->bdev;
1704 map_context->ptr = NULL;
1699 return DM_MAPIO_REMAPPED; 1705 return DM_MAPIO_REMAPPED;
1700 } 1706 }
1701 1707
@@ -1724,7 +1730,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1724 remap_exception(s, e, bio, chunk); 1730 remap_exception(s, e, bio, chunk);
1725 1731
1726 if (bio_rw(bio) == WRITE) 1732 if (bio_rw(bio) == WRITE)
1727 track_chunk(s, bio, chunk); 1733 map_context->ptr = track_chunk(s, chunk);
1728 goto out_unlock; 1734 goto out_unlock;
1729 } 1735 }
1730 1736
@@ -1742,12 +1748,14 @@ out_unlock:
1742 return r; 1748 return r;
1743} 1749}
1744 1750
1745static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error) 1751static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1752 int error, union map_info *map_context)
1746{ 1753{
1747 struct dm_snapshot *s = ti->private; 1754 struct dm_snapshot *s = ti->private;
1755 struct dm_snap_tracked_chunk *c = map_context->ptr;
1748 1756
1749 if (is_bio_tracked(bio)) 1757 if (c)
1750 stop_tracking_chunk(s, bio); 1758 stop_tracking_chunk(s, c);
1751 1759
1752 return 0; 1760 return 0;
1753} 1761}
@@ -1809,9 +1817,9 @@ static void snapshot_resume(struct dm_target *ti)
1809 up_write(&s->lock); 1817 up_write(&s->lock);
1810} 1818}
1811 1819
1812static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) 1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1813{ 1821{
1814 uint32_t min_chunksize; 1822 sector_t min_chunksize;
1815 1823
1816 down_read(&_origins_lock); 1824 down_read(&_origins_lock);
1817 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
@@ -1830,15 +1838,15 @@ static void snapshot_merge_resume(struct dm_target *ti)
1830 snapshot_resume(ti); 1838 snapshot_resume(ti);
1831 1839
1832 /* 1840 /*
1833 * snapshot-merge acts as an origin, so set ti->max_io_len 1841 * snapshot-merge acts as an origin, so set ti->split_io
1834 */ 1842 */
1835 ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); 1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1836 1844
1837 start_merge(s); 1845 start_merge(s);
1838} 1846}
1839 1847
1840static int snapshot_status(struct dm_target *ti, status_type_t type, 1848static int snapshot_status(struct dm_target *ti, status_type_t type,
1841 unsigned status_flags, char *result, unsigned maxlen) 1849 char *result, unsigned int maxlen)
1842{ 1850{
1843 unsigned sz = 0; 1851 unsigned sz = 0;
1844 struct dm_snapshot *snap = ti->private; 1852 struct dm_snapshot *snap = ti->private;
@@ -2065,12 +2073,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
2065 struct origin *o; 2073 struct origin *o;
2066 2074
2067 /* 2075 /*
2068 * The origin's __minimum_chunk_size() got stored in max_io_len 2076 * The origin's __minimum_chunk_size() got stored in split_io
2069 * by snapshot_merge_resume(). 2077 * by snapshot_merge_resume().
2070 */ 2078 */
2071 down_read(&_origins_lock); 2079 down_read(&_origins_lock);
2072 o = __lookup_origin(merging_snap->origin->bdev); 2080 o = __lookup_origin(merging_snap->origin->bdev);
2073 for (n = 0; n < size; n += merging_snap->ti->max_io_len) 2081 for (n = 0; n < size; n += merging_snap->ti->split_io)
2074 if (__origin_write(&o->snapshots, sector + n, NULL) == 2082 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2075 DM_MAPIO_SUBMITTED) 2083 DM_MAPIO_SUBMITTED)
2076 must_wait = 1; 2084 must_wait = 1;
@@ -2116,7 +2124,8 @@ static void origin_dtr(struct dm_target *ti)
2116 dm_put_device(ti, dev); 2124 dm_put_device(ti, dev);
2117} 2125}
2118 2126
2119static int origin_map(struct dm_target *ti, struct bio *bio) 2127static int origin_map(struct dm_target *ti, struct bio *bio,
2128 union map_info *map_context)
2120{ 2129{
2121 struct dm_dev *dev = ti->private; 2130 struct dm_dev *dev = ti->private;
2122 bio->bi_bdev = dev->bdev; 2131 bio->bi_bdev = dev->bdev;
@@ -2129,18 +2138,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
2129} 2138}
2130 2139
2131/* 2140/*
2132 * Set the target "max_io_len" field to the minimum of all the snapshots' 2141 * Set the target "split_io" field to the minimum of all the snapshots'
2133 * chunk sizes. 2142 * chunk sizes.
2134 */ 2143 */
2135static void origin_resume(struct dm_target *ti) 2144static void origin_resume(struct dm_target *ti)
2136{ 2145{
2137 struct dm_dev *dev = ti->private; 2146 struct dm_dev *dev = ti->private;
2138 2147
2139 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); 2148 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
2140} 2149}
2141 2150
2142static int origin_status(struct dm_target *ti, status_type_t type, 2151static int origin_status(struct dm_target *ti, status_type_t type, char *result,
2143 unsigned status_flags, char *result, unsigned maxlen) 2152 unsigned int maxlen)
2144{ 2153{
2145 struct dm_dev *dev = ti->private; 2154 struct dm_dev *dev = ti->private;
2146 2155
@@ -2167,6 +2176,7 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2167 return max_size; 2176 return max_size;
2168 2177
2169 bvm->bi_bdev = dev->bdev; 2178 bvm->bi_bdev = dev->bdev;
2179 bvm->bi_sector = bvm->bi_sector;
2170 2180
2171 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2181 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2172} 2182}
@@ -2181,7 +2191,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2181 2191
2182static struct target_type origin_target = { 2192static struct target_type origin_target = {
2183 .name = "snapshot-origin", 2193 .name = "snapshot-origin",
2184 .version = {1, 8, 0}, 2194 .version = {1, 7, 1},
2185 .module = THIS_MODULE, 2195 .module = THIS_MODULE,
2186 .ctr = origin_ctr, 2196 .ctr = origin_ctr,
2187 .dtr = origin_dtr, 2197 .dtr = origin_dtr,
@@ -2194,7 +2204,7 @@ static struct target_type origin_target = {
2194 2204
2195static struct target_type snapshot_target = { 2205static struct target_type snapshot_target = {
2196 .name = "snapshot", 2206 .name = "snapshot",
2197 .version = {1, 11, 0}, 2207 .version = {1, 10, 0},
2198 .module = THIS_MODULE, 2208 .module = THIS_MODULE,
2199 .ctr = snapshot_ctr, 2209 .ctr = snapshot_ctr,
2200 .dtr = snapshot_dtr, 2210 .dtr = snapshot_dtr,
@@ -2208,7 +2218,7 @@ static struct target_type snapshot_target = {
2208 2218
2209static struct target_type merge_target = { 2219static struct target_type merge_target = {
2210 .name = dm_snapshot_merge_target_name, 2220 .name = dm_snapshot_merge_target_name,
2211 .version = {1, 2, 0}, 2221 .version = {1, 1, 0},
2212 .module = THIS_MODULE, 2222 .module = THIS_MODULE,
2213 .ctr = snapshot_ctr, 2223 .ctr = snapshot_ctr,
2214 .dtr = snapshot_dtr, 2224 .dtr = snapshot_dtr,
@@ -2269,8 +2279,17 @@ static int __init dm_snapshot_init(void)
2269 goto bad_pending_cache; 2279 goto bad_pending_cache;
2270 } 2280 }
2271 2281
2282 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
2283 if (!tracked_chunk_cache) {
2284 DMERR("Couldn't create cache to track chunks in use.");
2285 r = -ENOMEM;
2286 goto bad_tracked_chunk_cache;
2287 }
2288
2272 return 0; 2289 return 0;
2273 2290
2291bad_tracked_chunk_cache:
2292 kmem_cache_destroy(pending_cache);
2274bad_pending_cache: 2293bad_pending_cache:
2275 kmem_cache_destroy(exception_cache); 2294 kmem_cache_destroy(exception_cache);
2276bad_exception_cache: 2295bad_exception_cache:
@@ -2296,6 +2315,7 @@ static void __exit dm_snapshot_exit(void)
2296 exit_origin_hash(); 2315 exit_origin_hash();
2297 kmem_cache_destroy(pending_cache); 2316 kmem_cache_destroy(pending_cache);
2298 kmem_cache_destroy(exception_cache); 2317 kmem_cache_destroy(exception_cache);
2318 kmem_cache_destroy(tracked_chunk_cache);
2299 2319
2300 dm_exception_store_exit(); 2320 dm_exception_store_exit();
2301} 2321}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c89cde86d40..3d80cf0c152 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -26,12 +26,14 @@ struct stripe {
26struct stripe_c { 26struct stripe_c {
27 uint32_t stripes; 27 uint32_t stripes;
28 int stripes_shift; 28 int stripes_shift;
29 sector_t stripes_mask;
29 30
30 /* The size of this target / num. stripes */ 31 /* The size of this target / num. stripes */
31 sector_t stripe_width; 32 sector_t stripe_width;
32 33
33 uint32_t chunk_size; 34 /* stripe chunk size */
34 int chunk_size_shift; 35 uint32_t chunk_shift;
36 sector_t chunk_mask;
35 37
36 /* Needed for handling events */ 38 /* Needed for handling events */
37 struct dm_target *ti; 39 struct dm_target *ti;
@@ -73,9 +75,8 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
73 unsigned int stripe, char **argv) 75 unsigned int stripe, char **argv)
74{ 76{
75 unsigned long long start; 77 unsigned long long start;
76 char dummy;
77 78
78 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) 79 if (sscanf(argv[1], "%llu", &start) != 1)
79 return -EINVAL; 80 return -EINVAL;
80 81
81 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 82 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
@@ -89,7 +90,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
89 90
90/* 91/*
91 * Construct a striped mapping. 92 * Construct a striped mapping.
92 * <number of stripes> <chunk size> [<dev_path> <offset>]+ 93 * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
93 */ 94 */
94static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) 95static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
95{ 96{
@@ -97,6 +98,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
97 sector_t width; 98 sector_t width;
98 uint32_t stripes; 99 uint32_t stripes;
99 uint32_t chunk_size; 100 uint32_t chunk_size;
101 char *end;
100 int r; 102 int r;
101 unsigned int i; 103 unsigned int i;
102 104
@@ -105,23 +107,34 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
105 return -EINVAL; 107 return -EINVAL;
106 } 108 }
107 109
108 if (kstrtouint(argv[0], 10, &stripes) || !stripes) { 110 stripes = simple_strtoul(argv[0], &end, 10);
111 if (!stripes || *end) {
109 ti->error = "Invalid stripe count"; 112 ti->error = "Invalid stripe count";
110 return -EINVAL; 113 return -EINVAL;
111 } 114 }
112 115
113 if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { 116 chunk_size = simple_strtoul(argv[1], &end, 10);
117 if (*end) {
114 ti->error = "Invalid chunk_size"; 118 ti->error = "Invalid chunk_size";
115 return -EINVAL; 119 return -EINVAL;
116 } 120 }
117 121
118 width = ti->len; 122 /*
119 if (sector_div(width, chunk_size)) { 123 * chunk_size is a power of two
124 */
125 if (!is_power_of_2(chunk_size) ||
126 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
127 ti->error = "Invalid chunk size";
128 return -EINVAL;
129 }
130
131 if (ti->len & (chunk_size - 1)) {
120 ti->error = "Target length not divisible by " 132 ti->error = "Target length not divisible by "
121 "chunk size"; 133 "chunk size";
122 return -EINVAL; 134 return -EINVAL;
123 } 135 }
124 136
137 width = ti->len;
125 if (sector_div(width, stripes)) { 138 if (sector_div(width, stripes)) {
126 ti->error = "Target length not divisible by " 139 ti->error = "Target length not divisible by "
127 "number of stripes"; 140 "number of stripes";
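With this patch the constructor above only accepts power-of-two chunk sizes of at least one page, which is what lets the following hunk replace division with ffs()-derived shift and mask values. A standalone sketch of that validation and derivation, assuming PAGE_SIZE of 4096 and SECTOR_SHIFT of 9 (names and values are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <strings.h>        /* ffs() */

#define PAGE_SIZE    4096u
#define SECTOR_SHIFT 9

static int is_power_of_2(uint32_t n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
    uint32_t chunk_size = 64;    /* sectors, i.e. 32 KiB */

    if (!is_power_of_2(chunk_size) ||
        chunk_size < (PAGE_SIZE >> SECTOR_SHIFT)) {
        fprintf(stderr, "Invalid chunk size\n");
        return 1;
    }

    int      chunk_shift = ffs(chunk_size) - 1;       /* log2(chunk_size) = 6 */
    uint64_t chunk_mask  = (uint64_t)chunk_size - 1;  /* offset bits within a chunk */

    printf("chunk_shift=%d chunk_mask=%llu\n",
           chunk_shift, (unsigned long long)chunk_mask);
    return 0;
}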
@@ -153,22 +166,17 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
153 166
154 if (stripes & (stripes - 1)) 167 if (stripes & (stripes - 1))
155 sc->stripes_shift = -1; 168 sc->stripes_shift = -1;
156 else 169 else {
157 sc->stripes_shift = __ffs(stripes); 170 sc->stripes_shift = ffs(stripes) - 1;
158 171 sc->stripes_mask = ((sector_t) stripes) - 1;
159 r = dm_set_target_max_io_len(ti, chunk_size); 172 }
160 if (r)
161 return r;
162 173
174 ti->split_io = chunk_size;
163 ti->num_flush_requests = stripes; 175 ti->num_flush_requests = stripes;
164 ti->num_discard_requests = stripes; 176 ti->num_discard_requests = stripes;
165 ti->num_write_same_requests = stripes;
166 177
167 sc->chunk_size = chunk_size; 178 sc->chunk_shift = ffs(chunk_size) - 1;
168 if (chunk_size & (chunk_size - 1)) 179 sc->chunk_mask = ((sector_t) chunk_size) - 1;
169 sc->chunk_size_shift = -1;
170 else
171 sc->chunk_size_shift = __ffs(chunk_size);
172 180
173 /* 181 /*
174 * Get the stripe destinations. 182 * Get the stripe destinations.
@@ -200,36 +208,24 @@ static void stripe_dtr(struct dm_target *ti)
200 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
201 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
202 210
203 flush_work(&sc->trigger_event); 211 flush_work_sync(&sc->trigger_event);
204 kfree(sc); 212 kfree(sc);
205} 213}
206 214
207static void stripe_map_sector(struct stripe_c *sc, sector_t sector, 215static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
208 uint32_t *stripe, sector_t *result) 216 uint32_t *stripe, sector_t *result)
209{ 217{
210 sector_t chunk = dm_target_offset(sc->ti, sector); 218 sector_t offset = dm_target_offset(sc->ti, sector);
211 sector_t chunk_offset; 219 sector_t chunk = offset >> sc->chunk_shift;
212
213 if (sc->chunk_size_shift < 0)
214 chunk_offset = sector_div(chunk, sc->chunk_size);
215 else {
216 chunk_offset = chunk & (sc->chunk_size - 1);
217 chunk >>= sc->chunk_size_shift;
218 }
219 220
220 if (sc->stripes_shift < 0) 221 if (sc->stripes_shift < 0)
221 *stripe = sector_div(chunk, sc->stripes); 222 *stripe = sector_div(chunk, sc->stripes);
222 else { 223 else {
223 *stripe = chunk & (sc->stripes - 1); 224 *stripe = chunk & sc->stripes_mask;
224 chunk >>= sc->stripes_shift; 225 chunk >>= sc->stripes_shift;
225 } 226 }
226 227
227 if (sc->chunk_size_shift < 0) 228 *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask);
228 chunk *= sc->chunk_size;
229 else
230 chunk <<= sc->chunk_size_shift;
231
232 *result = chunk + chunk_offset;
233} 229}
234 230
235static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, 231static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
@@ -240,20 +236,13 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
240 stripe_map_sector(sc, sector, &stripe, result); 236 stripe_map_sector(sc, sector, &stripe, result);
241 if (stripe == target_stripe) 237 if (stripe == target_stripe)
242 return; 238 return;
243 239 *result &= ~sc->chunk_mask; /* round down */
244 /* round down */
245 sector = *result;
246 if (sc->chunk_size_shift < 0)
247 *result -= sector_div(sector, sc->chunk_size);
248 else
249 *result = sector & ~(sector_t)(sc->chunk_size - 1);
250
251 if (target_stripe < stripe) 240 if (target_stripe < stripe)
252 *result += sc->chunk_size; /* next chunk */ 241 *result += sc->chunk_mask + 1; /* next chunk */
253} 242}
254 243
255static int stripe_map_range(struct stripe_c *sc, struct bio *bio, 244static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
256 uint32_t target_stripe) 245 uint32_t target_stripe)
257{ 246{
258 sector_t begin, end; 247 sector_t begin, end;
259 248
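With both the chunk size and, optionally, the stripe count being powers of two, stripe_map_sector() above reduces to shifts and masks: the chunk index modulo the stripe count selects the device, and the quotient combined with the offset within the chunk gives the sector on that device. A worked standalone example of that arithmetic (the values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t chunk_size    = 64;  /* sectors per chunk (power of 2) */
    const int      chunk_shift   = 6;   /* log2(chunk_size) */
    const uint64_t chunk_mask    = chunk_size - 1;
    const uint64_t stripes       = 4;   /* number of stripes (power of 2) */
    const int      stripes_shift = 2;   /* log2(stripes) */
    const uint64_t stripes_mask  = stripes - 1;

    uint64_t offset = 1000;                  /* sector offset into the target */
    uint64_t chunk  = offset >> chunk_shift; /* chunk index: 15 */
    uint64_t stripe = chunk & stripes_mask;  /* device number: 15 % 4 = 3 */
    uint64_t result = ((chunk >> stripes_shift) << chunk_shift)
                    | (offset & chunk_mask); /* (3 << 6) | 40 = 232 */

    printf("offset %llu -> stripe %llu, device sector %llu\n",
           (unsigned long long)offset,
           (unsigned long long)stripe,
           (unsigned long long)result);
    return 0;
}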
@@ -272,23 +261,23 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
272 } 261 }
273} 262}
274 263
275static int stripe_map(struct dm_target *ti, struct bio *bio) 264static int stripe_map(struct dm_target *ti, struct bio *bio,
265 union map_info *map_context)
276{ 266{
277 struct stripe_c *sc = ti->private; 267 struct stripe_c *sc = ti->private;
278 uint32_t stripe; 268 uint32_t stripe;
279 unsigned target_request_nr; 269 unsigned target_request_nr;
280 270
281 if (bio->bi_rw & REQ_FLUSH) { 271 if (bio->bi_rw & REQ_FLUSH) {
282 target_request_nr = dm_bio_get_target_request_nr(bio); 272 target_request_nr = map_context->target_request_nr;
283 BUG_ON(target_request_nr >= sc->stripes); 273 BUG_ON(target_request_nr >= sc->stripes);
284 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; 274 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
285 return DM_MAPIO_REMAPPED; 275 return DM_MAPIO_REMAPPED;
286 } 276 }
287 if (unlikely(bio->bi_rw & REQ_DISCARD) || 277 if (unlikely(bio->bi_rw & REQ_DISCARD)) {
288 unlikely(bio->bi_rw & REQ_WRITE_SAME)) { 278 target_request_nr = map_context->target_request_nr;
289 target_request_nr = dm_bio_get_target_request_nr(bio);
290 BUG_ON(target_request_nr >= sc->stripes); 279 BUG_ON(target_request_nr >= sc->stripes);
291 return stripe_map_range(sc, bio, target_request_nr); 280 return stripe_map_discard(sc, bio, target_request_nr);
292 } 281 }
293 282
294 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); 283 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
@@ -312,8 +301,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
312 * 301 *
313 */ 302 */
314 303
315static int stripe_status(struct dm_target *ti, status_type_t type, 304static int stripe_status(struct dm_target *ti,
316 unsigned status_flags, char *result, unsigned maxlen) 305 status_type_t type, char *result, unsigned int maxlen)
317{ 306{
318 struct stripe_c *sc = (struct stripe_c *) ti->private; 307 struct stripe_c *sc = (struct stripe_c *) ti->private;
319 char buffer[sc->stripes + 1]; 308 char buffer[sc->stripes + 1];
@@ -334,7 +323,7 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
334 323
335 case STATUSTYPE_TABLE: 324 case STATUSTYPE_TABLE:
336 DMEMIT("%d %llu", sc->stripes, 325 DMEMIT("%d %llu", sc->stripes,
337 (unsigned long long)sc->chunk_size); 326 (unsigned long long)sc->chunk_mask + 1);
338 for (i = 0; i < sc->stripes; i++) 327 for (i = 0; i < sc->stripes; i++)
339 DMEMIT(" %s %llu", sc->stripe[i].dev->name, 328 DMEMIT(" %s %llu", sc->stripe[i].dev->name,
340 (unsigned long long)sc->stripe[i].physical_start); 329 (unsigned long long)sc->stripe[i].physical_start);
@@ -343,7 +332,8 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
343 return 0; 332 return 0;
344} 333}
345 334
346static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) 335static int stripe_end_io(struct dm_target *ti, struct bio *bio,
336 int error, union map_info *map_context)
347{ 337{
348 unsigned i; 338 unsigned i;
349 char major_minor[16]; 339 char major_minor[16];
@@ -400,7 +390,7 @@ static void stripe_io_hints(struct dm_target *ti,
400 struct queue_limits *limits) 390 struct queue_limits *limits)
401{ 391{
402 struct stripe_c *sc = ti->private; 392 struct stripe_c *sc = ti->private;
403 unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT; 393 unsigned chunk_size = (sc->chunk_mask + 1) << 9;
404 394
405 blk_limits_io_min(limits, chunk_size); 395 blk_limits_io_min(limits, chunk_size);
406 blk_limits_io_opt(limits, chunk_size * sc->stripes); 396 blk_limits_io_opt(limits, chunk_size * sc->stripes);
@@ -428,7 +418,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
428 418
429static struct target_type stripe_target = { 419static struct target_type stripe_target = {
430 .name = "striped", 420 .name = "striped",
431 .version = {1, 5, 0}, 421 .version = {1, 4, 0},
432 .module = THIS_MODULE, 422 .module = THIS_MODULE,
433 .ctr = stripe_ctr, 423 .ctr = stripe_ctr,
434 .dtr = stripe_dtr, 424 .dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index daf25d0890b..bc04518e9d8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,9 +54,7 @@ struct dm_table {
54 sector_t *highs; 54 sector_t *highs;
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 struct target_type *immutable_target_type;
58 unsigned integrity_supported:1; 57 unsigned integrity_supported:1;
59 unsigned singleton:1;
60 58
61 /* 59 /*
62 * Indicates the rw permissions for the new logical 60 * Indicates the rw permissions for the new logical
@@ -268,7 +266,8 @@ void dm_table_destroy(struct dm_table *t)
268 vfree(t->highs); 266 vfree(t->highs);
269 267
270 /* free the device list */ 268 /* free the device list */
271 free_devices(&t->devices); 269 if (t->devices.next != &t->devices)
270 free_devices(&t->devices);
272 271
273 dm_free_md_mempools(t->mempools); 272 dm_free_md_mempools(t->mempools);
274 273
@@ -463,11 +462,10 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
463 struct dm_dev_internal *dd; 462 struct dm_dev_internal *dd;
464 unsigned int major, minor; 463 unsigned int major, minor;
465 struct dm_table *t = ti->table; 464 struct dm_table *t = ti->table;
466 char dummy;
467 465
468 BUG_ON(!t); 466 BUG_ON(!t);
469 467
470 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 468 if (sscanf(path, "%u:%u", &major, &minor) == 2) {
471 /* Extract the major/minor numbers */ 469 /* Extract the major/minor numbers */
472 dev = MKDEV(major, minor); 470 dev = MKDEV(major, minor);
473 if (MAJOR(dev) != major || MINOR(dev) != minor) 471 if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -699,7 +697,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
699 while (i < dm_table_get_num_targets(table)) { 697 while (i < dm_table_get_num_targets(table)) {
700 ti = dm_table_get_target(table, i++); 698 ti = dm_table_get_target(table, i++);
701 699
702 blk_set_stacking_limits(&ti_limits); 700 blk_set_default_limits(&ti_limits);
703 701
704 /* combine all target devices' limits */ 702 /* combine all target devices' limits */
705 if (ti->type->iterate_devices) 703 if (ti->type->iterate_devices)
@@ -742,12 +740,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
742 char **argv; 740 char **argv;
743 struct dm_target *tgt; 741 struct dm_target *tgt;
744 742
745 if (t->singleton) {
746 DMERR("%s: target type %s must appear alone in table",
747 dm_device_name(t->md), t->targets->type->name);
748 return -EINVAL;
749 }
750
751 if ((r = check_space(t))) 743 if ((r = check_space(t)))
752 return r; 744 return r;
753 745
@@ -766,36 +758,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
766 return -EINVAL; 758 return -EINVAL;
767 } 759 }
768 760
769 if (dm_target_needs_singleton(tgt->type)) {
770 if (t->num_targets) {
771 DMERR("%s: target type %s must appear alone in table",
772 dm_device_name(t->md), type);
773 return -EINVAL;
774 }
775 t->singleton = 1;
776 }
777
778 if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
779 DMERR("%s: target type %s may not be included in read-only tables",
780 dm_device_name(t->md), type);
781 return -EINVAL;
782 }
783
784 if (t->immutable_target_type) {
785 if (t->immutable_target_type != tgt->type) {
786 DMERR("%s: immutable target type %s cannot be mixed with other target types",
787 dm_device_name(t->md), t->immutable_target_type->name);
788 return -EINVAL;
789 }
790 } else if (dm_target_is_immutable(tgt->type)) {
791 if (t->num_targets) {
792 DMERR("%s: immutable target type %s cannot be mixed with other target types",
793 dm_device_name(t->md), tgt->type->name);
794 return -EINVAL;
795 }
796 t->immutable_target_type = tgt->type;
797 }
798
799 tgt->table = t; 761 tgt->table = t;
800 tgt->begin = start; 762 tgt->begin = start;
801 tgt->len = len; 763 tgt->len = len;
@@ -842,10 +804,9 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
842 unsigned *value, char **error, unsigned grouped) 804 unsigned *value, char **error, unsigned grouped)
843{ 805{
844 const char *arg_str = dm_shift_arg(arg_set); 806 const char *arg_str = dm_shift_arg(arg_set);
845 char dummy;
846 807
847 if (!arg_str || 808 if (!arg_str ||
848 (sscanf(arg_str, "%u%c", value, &dummy) != 1) || 809 (sscanf(arg_str, "%u", value) != 1) ||
849 (*value < arg->min) || 810 (*value < arg->min) ||
850 (*value > arg->max) || 811 (*value > arg->max) ||
851 (grouped && arg_set->argc < *value)) { 812 (grouped && arg_set->argc < *value)) {
@@ -954,11 +915,6 @@ unsigned dm_table_get_type(struct dm_table *t)
954 return t->type; 915 return t->type;
955} 916}
956 917
957struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
958{
959 return t->immutable_target_type;
960}
961
962bool dm_table_request_based(struct dm_table *t) 918bool dm_table_request_based(struct dm_table *t)
963{ 919{
964 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; 920 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
@@ -967,22 +923,13 @@ bool dm_table_request_based(struct dm_table *t)
967int dm_table_alloc_md_mempools(struct dm_table *t) 923int dm_table_alloc_md_mempools(struct dm_table *t)
968{ 924{
969 unsigned type = dm_table_get_type(t); 925 unsigned type = dm_table_get_type(t);
970 unsigned per_bio_data_size = 0;
971 struct dm_target *tgt;
972 unsigned i;
973 926
974 if (unlikely(type == DM_TYPE_NONE)) { 927 if (unlikely(type == DM_TYPE_NONE)) {
975 DMWARN("no table type is set, can't allocate mempools"); 928 DMWARN("no table type is set, can't allocate mempools");
976 return -EINVAL; 929 return -EINVAL;
977 } 930 }
978 931
979 if (type == DM_TYPE_BIO_BASED) 932 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
980 for (i = 0; i < t->num_targets; i++) {
981 tgt = t->targets + i;
982 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
983 }
984
985 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
986 if (!t->mempools) 933 if (!t->mempools)
987 return -ENOMEM; 934 return -ENOMEM;
988 935
@@ -1221,41 +1168,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1221 return &t->targets[(KEYS_PER_NODE * n) + k]; 1168 return &t->targets[(KEYS_PER_NODE * n) + k];
1222} 1169}
1223 1170
1224static int count_device(struct dm_target *ti, struct dm_dev *dev,
1225 sector_t start, sector_t len, void *data)
1226{
1227 unsigned *num_devices = data;
1228
1229 (*num_devices)++;
1230
1231 return 0;
1232}
1233
1234/*
1235 * Check whether a table has no data devices attached using each
1236 * target's iterate_devices method.
1237 * Returns false if the result is unknown because a target doesn't
1238 * support iterate_devices.
1239 */
1240bool dm_table_has_no_data_devices(struct dm_table *table)
1241{
1242 struct dm_target *uninitialized_var(ti);
1243 unsigned i = 0, num_devices = 0;
1244
1245 while (i < dm_table_get_num_targets(table)) {
1246 ti = dm_table_get_target(table, i++);
1247
1248 if (!ti->type->iterate_devices)
1249 return false;
1250
1251 ti->type->iterate_devices(ti, count_device, &num_devices);
1252 if (num_devices)
1253 return false;
1254 }
1255
1256 return true;
1257}
1258
1259/* 1171/*
1260 * Establish the new table's queue_limits and validate them. 1172 * Establish the new table's queue_limits and validate them.
1261 */ 1173 */
@@ -1266,10 +1178,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
1266 struct queue_limits ti_limits; 1178 struct queue_limits ti_limits;
1267 unsigned i = 0; 1179 unsigned i = 0;
1268 1180
1269 blk_set_stacking_limits(limits); 1181 blk_set_default_limits(limits);
1270 1182
1271 while (i < dm_table_get_num_targets(table)) { 1183 while (i < dm_table_get_num_targets(table)) {
1272 blk_set_stacking_limits(&ti_limits); 1184 blk_set_default_limits(&ti_limits);
1273 1185
1274 ti = dm_table_get_target(table, i++); 1186 ti = dm_table_get_target(table, i++);
1275 1187
@@ -1363,9 +1275,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1363 if (!ti->num_flush_requests) 1275 if (!ti->num_flush_requests)
1364 continue; 1276 continue;
1365 1277
1366 if (ti->flush_supported)
1367 return 1;
1368
1369 if (ti->type->iterate_devices && 1278 if (ti->type->iterate_devices &&
1370 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1279 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1371 return 1; 1280 return 1;
@@ -1390,66 +1299,6 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
1390 return 1; 1299 return 1;
1391} 1300}
1392 1301
1393static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
1394 sector_t start, sector_t len, void *data)
1395{
1396 struct request_queue *q = bdev_get_queue(dev->bdev);
1397
1398 return q && blk_queue_nonrot(q);
1399}
1400
1401static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1402 sector_t start, sector_t len, void *data)
1403{
1404 struct request_queue *q = bdev_get_queue(dev->bdev);
1405
1406 return q && !blk_queue_add_random(q);
1407}
1408
1409static bool dm_table_all_devices_attribute(struct dm_table *t,
1410 iterate_devices_callout_fn func)
1411{
1412 struct dm_target *ti;
1413 unsigned i = 0;
1414
1415 while (i < dm_table_get_num_targets(t)) {
1416 ti = dm_table_get_target(t, i++);
1417
1418 if (!ti->type->iterate_devices ||
1419 !ti->type->iterate_devices(ti, func, NULL))
1420 return 0;
1421 }
1422
1423 return 1;
1424}
1425
1426static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
1427 sector_t start, sector_t len, void *data)
1428{
1429 struct request_queue *q = bdev_get_queue(dev->bdev);
1430
1431 return q && !q->limits.max_write_same_sectors;
1432}
1433
1434static bool dm_table_supports_write_same(struct dm_table *t)
1435{
1436 struct dm_target *ti;
1437 unsigned i = 0;
1438
1439 while (i < dm_table_get_num_targets(t)) {
1440 ti = dm_table_get_target(t, i++);
1441
1442 if (!ti->num_write_same_requests)
1443 return false;
1444
1445 if (!ti->type->iterate_devices ||
1446 !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
1447 return false;
1448 }
1449
1450 return true;
1451}
1452
1453void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1302void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1454 struct queue_limits *limits) 1303 struct queue_limits *limits)
1455{ 1304{
@@ -1475,27 +1324,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1475 if (!dm_table_discard_zeroes_data(t)) 1324 if (!dm_table_discard_zeroes_data(t))
1476 q->limits.discard_zeroes_data = 0; 1325 q->limits.discard_zeroes_data = 0;
1477 1326
1478 /* Ensure that all underlying devices are non-rotational. */
1479 if (dm_table_all_devices_attribute(t, device_is_nonrot))
1480 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
1481 else
1482 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
1483
1484 if (!dm_table_supports_write_same(t))
1485 q->limits.max_write_same_sectors = 0;
1486
1487 dm_table_set_integrity(t); 1327 dm_table_set_integrity(t);
1488 1328
1489 /* 1329 /*
1490 * Determine whether or not this queue's I/O timings contribute
1491 * to the entropy pool, Only request-based targets use this.
1492 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
1493 * have it set.
1494 */
1495 if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
1496 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
1497
1498 /*
1499 * QUEUE_FLAG_STACKABLE must be set after all queue settings are 1330 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1500 * visible to other CPUs because, once the flag is set, incoming bios 1331 * visible to other CPUs because, once the flag is set, incoming bios
1501 * are processed by request-based dm, which refers to the queue 1332 * are processed by request-based dm, which refers to the queue
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 617d21a7725..8da366cf381 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -126,14 +126,15 @@ static void io_err_dtr(struct dm_target *tt)
126 /* empty */ 126 /* empty */
127} 127}
128 128
129static int io_err_map(struct dm_target *tt, struct bio *bio) 129static int io_err_map(struct dm_target *tt, struct bio *bio,
130 union map_info *map_context)
130{ 131{
131 return -EIO; 132 return -EIO;
132} 133}
133 134
134static struct target_type error_target = { 135static struct target_type error_target = {
135 .name = "error", 136 .name = "error",
136 .version = {1, 1, 0}, 137 .version = {1, 0, 1},
137 .ctr = io_err_ctr, 138 .ctr = io_err_ctr,
138 .dtr = io_err_dtr, 139 .dtr = io_err_dtr,
139 .map = io_err_map, 140 .map = io_err_map,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
deleted file mode 100644
index 4d6e85367b8..00000000000
--- a/drivers/md/dm-thin-metadata.c
+++ /dev/null
@@ -1,1686 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "persistent-data/dm-btree.h"
9#include "persistent-data/dm-space-map.h"
10#include "persistent-data/dm-space-map-disk.h"
11#include "persistent-data/dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/device-mapper.h>
15#include <linux/workqueue.h>
16
17/*--------------------------------------------------------------------------
18 * As far as the metadata goes, there is:
19 *
20 * - A superblock in block zero, taking up fewer than 512 bytes for
21 * atomic writes.
22 *
23 * - A space map managing the metadata blocks.
24 *
25 * - A space map managing the data blocks.
26 *
27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28 *
29 * - A hierarchical btree, with 2 levels which effectively maps (thin
30 * dev id, virtual block) -> block_time. Block time is a 64-bit
31 * field holding the time in the low 24 bits, and block in the top 48
32 * bits.
33 *
34 * BTrees consist solely of btree_nodes, that fill a block. Some are
35 * internal nodes, as such their values are a __le64 pointing to other
36 * nodes. Leaf nodes can store data of any reasonable size (ie. much
37 * smaller than the block size). The nodes consist of the header,
38 * followed by an array of keys, followed by an array of values. We have
39 * to binary search on the keys so they're all held together to help the
40 * cpu cache.
41 *
42 * Space maps have 2 btrees:
43 *
44 * - One maps a uint64_t onto a struct index_entry. Which points to a
45 * bitmap block, and has some details about how many free entries there
46 * are etc.
47 *
48 * - The bitmap blocks have a header (for the checksum). Then the rest
49 * of the block is pairs of bits. With the meaning being:
50 *
51 * 0 - ref count is 0
52 * 1 - ref count is 1
53 * 2 - ref count is 2
54 * 3 - ref count is higher than 2
55 *
56 * - If the count is higher than 2 then the ref count is entered in a
57 * second btree that directly maps the block_address to a uint32_t ref
58 * count.
59 *
60 * The space map metadata variant doesn't have a bitmaps btree. Instead
61 * it has a single block's worth of index_entries. This avoids
62 * recursive issues with the bitmap btree needing to allocate space in
63 * order to insert. With a small data block size such as 64k the
64 * metadata supports data devices that are hundreds of terabytes.
65 *
66 * The space maps allocate space linearly from front to back. Space that
67 * is freed in a transaction is never recycled within that transaction.
68 * To try and avoid fragmenting _free_ space the allocator always goes
69 * back and fills in gaps.
70 *
71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72 * from the block manager.
73 *--------------------------------------------------------------------------*/
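As the comment above describes, each mapping entry in the two-level btree is a single 64-bit "block_time" with the timestamp in the low 24 bits and the data block number in the bits above; pack_block_time() and unpack_block_time() further down in this file implement exactly this. A standalone sketch of the encoding (the helper names mirror those functions, the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same packing as pack_block_time()/unpack_block_time() below:
 * time in bits 0..23 (so it must be < 2^24), block number above. */
static uint64_t pack_block_time(uint64_t block, uint32_t time)
{
    return (block << 24) | time;
}

static void unpack_block_time(uint64_t v, uint64_t *block, uint32_t *time)
{
    *block = v >> 24;
    *time  = v & ((1u << 24) - 1);
}

int main(void)
{
    uint64_t packed = pack_block_time(123456, 42);
    uint64_t block;
    uint32_t time;

    unpack_block_time(packed, &block, &time);
    printf("packed=0x%llx block=%llu time=%u\n",
           (unsigned long long)packed, (unsigned long long)block, time);
    return 0;
}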
74
75#define DM_MSG_PREFIX "thin metadata"
76
77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1
80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3
82
83/*
84 * 3 for btree insert +
85 * 2 for btree lookup used within space map
86 */
87#define THIN_MAX_CONCURRENT_LOCKS 5
88
89/* This should be plenty */
90#define SPACE_MAP_ROOT_SIZE 128
91
92/*
93 * Little endian on-disk superblock and device details.
94 */
95struct thin_disk_superblock {
96 __le32 csum; /* Checksum of superblock except for this field. */
97 __le32 flags;
98 __le64 blocknr; /* This block number, dm_block_t. */
99
100 __u8 uuid[16];
101 __le64 magic;
102 __le32 version;
103 __le32 time;
104
105 __le64 trans_id;
106
107 /*
108 * Root held by userspace transactions.
109 */
110 __le64 held_root;
111
112 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
113 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
114
115 /*
116 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
117 */
118 __le64 data_mapping_root;
119
120 /*
121 * Device detail root mapping dev_id -> device_details
122 */
123 __le64 device_details_root;
124
125 __le32 data_block_size; /* In 512-byte sectors. */
126
127 __le32 metadata_block_size; /* In 512-byte sectors. */
128 __le64 metadata_nr_blocks;
129
130 __le32 compat_flags;
131 __le32 compat_ro_flags;
132 __le32 incompat_flags;
133} __packed;
134
135struct disk_device_details {
136 __le64 mapped_blocks;
137 __le64 transaction_id; /* When created. */
138 __le32 creation_time;
139 __le32 snapshotted_time;
140} __packed;
141
142struct dm_pool_metadata {
143 struct hlist_node hash;
144
145 struct block_device *bdev;
146 struct dm_block_manager *bm;
147 struct dm_space_map *metadata_sm;
148 struct dm_space_map *data_sm;
149 struct dm_transaction_manager *tm;
150 struct dm_transaction_manager *nb_tm;
151
152 /*
153 * Two-level btree.
154 * First level holds thin_dev_t.
155 * Second level holds mappings.
156 */
157 struct dm_btree_info info;
158
159 /*
160 * Non-blocking version of the above.
161 */
162 struct dm_btree_info nb_info;
163
164 /*
165 * Just the top level for deleting whole devices.
166 */
167 struct dm_btree_info tl_info;
168
169 /*
170 * Just the bottom level for creating new devices.
171 */
172 struct dm_btree_info bl_info;
173
174 /*
175 * Describes the device details btree.
176 */
177 struct dm_btree_info details_info;
178
179 struct rw_semaphore root_lock;
180 uint32_t time;
181 dm_block_t root;
182 dm_block_t details_root;
183 struct list_head thin_devices;
184 uint64_t trans_id;
185 unsigned long flags;
186 sector_t data_block_size;
187 bool read_only:1;
188
189 /*
190 * Set if a transaction has to be aborted but the attempt to roll back
191 * to the previous (good) transaction failed. The only pool metadata
192 * operation possible in this state is the closing of the device.
193 */
194 bool fail_io:1;
195};
196
197struct dm_thin_device {
198 struct list_head list;
199 struct dm_pool_metadata *pmd;
200 dm_thin_id id;
201
202 int open_count;
203 bool changed:1;
204 bool aborted_with_changes:1;
205 uint64_t mapped_blocks;
206 uint64_t transaction_id;
207 uint32_t creation_time;
208 uint32_t snapshotted_time;
209};
210
211/*----------------------------------------------------------------
212 * superblock validator
213 *--------------------------------------------------------------*/
214
215#define SUPERBLOCK_CSUM_XOR 160774
216
217static void sb_prepare_for_write(struct dm_block_validator *v,
218 struct dm_block *b,
219 size_t block_size)
220{
221 struct thin_disk_superblock *disk_super = dm_block_data(b);
222
223 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
224 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
225 block_size - sizeof(__le32),
226 SUPERBLOCK_CSUM_XOR));
227}
228
229static int sb_check(struct dm_block_validator *v,
230 struct dm_block *b,
231 size_t block_size)
232{
233 struct thin_disk_superblock *disk_super = dm_block_data(b);
234 __le32 csum_le;
235
236 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
237 DMERR("sb_check failed: blocknr %llu: "
238 "wanted %llu", le64_to_cpu(disk_super->blocknr),
239 (unsigned long long)dm_block_location(b));
240 return -ENOTBLK;
241 }
242
243 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
244 DMERR("sb_check failed: magic %llu: "
245 "wanted %llu", le64_to_cpu(disk_super->magic),
246 (unsigned long long)THIN_SUPERBLOCK_MAGIC);
247 return -EILSEQ;
248 }
249
250 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
251 block_size - sizeof(__le32),
252 SUPERBLOCK_CSUM_XOR));
253 if (csum_le != disk_super->csum) {
254 DMERR("sb_check failed: csum %u: wanted %u",
255 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
256 return -EILSEQ;
257 }
258
259 return 0;
260}
261
262static struct dm_block_validator sb_validator = {
263 .name = "superblock",
264 .prepare_for_write = sb_prepare_for_write,
265 .check = sb_check
266};
267
268/*----------------------------------------------------------------
269 * Methods for the btree value types
270 *--------------------------------------------------------------*/
271
272static uint64_t pack_block_time(dm_block_t b, uint32_t t)
273{
274 return (b << 24) | t;
275}
276
277static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
278{
279 *b = v >> 24;
280 *t = v & ((1 << 24) - 1);
281}
282
283static void data_block_inc(void *context, void *value_le)
284{
285 struct dm_space_map *sm = context;
286 __le64 v_le;
287 uint64_t b;
288 uint32_t t;
289
290 memcpy(&v_le, value_le, sizeof(v_le));
291 unpack_block_time(le64_to_cpu(v_le), &b, &t);
292 dm_sm_inc_block(sm, b);
293}
294
295static void data_block_dec(void *context, void *value_le)
296{
297 struct dm_space_map *sm = context;
298 __le64 v_le;
299 uint64_t b;
300 uint32_t t;
301
302 memcpy(&v_le, value_le, sizeof(v_le));
303 unpack_block_time(le64_to_cpu(v_le), &b, &t);
304 dm_sm_dec_block(sm, b);
305}
306
307static int data_block_equal(void *context, void *value1_le, void *value2_le)
308{
309 __le64 v1_le, v2_le;
310 uint64_t b1, b2;
311 uint32_t t;
312
313 memcpy(&v1_le, value1_le, sizeof(v1_le));
314 memcpy(&v2_le, value2_le, sizeof(v2_le));
315 unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
316 unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
317
318 return b1 == b2;
319}
320
321static void subtree_inc(void *context, void *value)
322{
323 struct dm_btree_info *info = context;
324 __le64 root_le;
325 uint64_t root;
326
327 memcpy(&root_le, value, sizeof(root_le));
328 root = le64_to_cpu(root_le);
329 dm_tm_inc(info->tm, root);
330}
331
332static void subtree_dec(void *context, void *value)
333{
334 struct dm_btree_info *info = context;
335 __le64 root_le;
336 uint64_t root;
337
338 memcpy(&root_le, value, sizeof(root_le));
339 root = le64_to_cpu(root_le);
340 if (dm_btree_del(info, root))
341 DMERR("btree delete failed\n");
342}
343
344static int subtree_equal(void *context, void *value1_le, void *value2_le)
345{
346 __le64 v1_le, v2_le;
347 memcpy(&v1_le, value1_le, sizeof(v1_le));
348 memcpy(&v2_le, value2_le, sizeof(v2_le));
349
350 return v1_le == v2_le;
351}
352
353/*----------------------------------------------------------------*/
354
355static int superblock_lock_zero(struct dm_pool_metadata *pmd,
356 struct dm_block **sblock)
357{
358 return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
359 &sb_validator, sblock);
360}
361
362static int superblock_lock(struct dm_pool_metadata *pmd,
363 struct dm_block **sblock)
364{
365 return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
366 &sb_validator, sblock);
367}
368
369static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
370{
371 int r;
372 unsigned i;
373 struct dm_block *b;
374 __le64 *data_le, zero = cpu_to_le64(0);
375 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
376
377 /*
378 * We can't use a validator here - it may be all zeroes.
379 */
380 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
381 if (r)
382 return r;
383
384 data_le = dm_block_data(b);
385 *result = 1;
386 for (i = 0; i < block_size; i++) {
387 if (data_le[i] != zero) {
388 *result = 0;
389 break;
390 }
391 }
392
393 return dm_bm_unlock(b);
394}
395
396static void __setup_btree_details(struct dm_pool_metadata *pmd)
397{
398 pmd->info.tm = pmd->tm;
399 pmd->info.levels = 2;
400 pmd->info.value_type.context = pmd->data_sm;
401 pmd->info.value_type.size = sizeof(__le64);
402 pmd->info.value_type.inc = data_block_inc;
403 pmd->info.value_type.dec = data_block_dec;
404 pmd->info.value_type.equal = data_block_equal;
405
406 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
407 pmd->nb_info.tm = pmd->nb_tm;
408
409 pmd->tl_info.tm = pmd->tm;
410 pmd->tl_info.levels = 1;
411 pmd->tl_info.value_type.context = &pmd->bl_info;
412 pmd->tl_info.value_type.size = sizeof(__le64);
413 pmd->tl_info.value_type.inc = subtree_inc;
414 pmd->tl_info.value_type.dec = subtree_dec;
415 pmd->tl_info.value_type.equal = subtree_equal;
416
417 pmd->bl_info.tm = pmd->tm;
418 pmd->bl_info.levels = 1;
419 pmd->bl_info.value_type.context = pmd->data_sm;
420 pmd->bl_info.value_type.size = sizeof(__le64);
421 pmd->bl_info.value_type.inc = data_block_inc;
422 pmd->bl_info.value_type.dec = data_block_dec;
423 pmd->bl_info.value_type.equal = data_block_equal;
424
425 pmd->details_info.tm = pmd->tm;
426 pmd->details_info.levels = 1;
427 pmd->details_info.value_type.context = NULL;
428 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
429 pmd->details_info.value_type.inc = NULL;
430 pmd->details_info.value_type.dec = NULL;
431 pmd->details_info.value_type.equal = NULL;
432}
433
434static int __write_initial_superblock(struct dm_pool_metadata *pmd)
435{
436 int r;
437 struct dm_block *sblock;
438 size_t metadata_len, data_len;
439 struct thin_disk_superblock *disk_super;
440 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
441
442 if (bdev_size > THIN_METADATA_MAX_SECTORS)
443 bdev_size = THIN_METADATA_MAX_SECTORS;
444
445 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
446 if (r < 0)
447 return r;
448
449 r = dm_sm_root_size(pmd->data_sm, &data_len);
450 if (r < 0)
451 return r;
452
453 r = dm_sm_commit(pmd->data_sm);
454 if (r < 0)
455 return r;
456
457 r = dm_tm_pre_commit(pmd->tm);
458 if (r < 0)
459 return r;
460
461 r = superblock_lock_zero(pmd, &sblock);
462 if (r)
463 return r;
464
465 disk_super = dm_block_data(sblock);
466 disk_super->flags = 0;
467 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
468 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
469 disk_super->version = cpu_to_le32(THIN_VERSION);
470 disk_super->time = 0;
471 disk_super->trans_id = 0;
472 disk_super->held_root = 0;
473
474 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
475 metadata_len);
476 if (r < 0)
477 goto bad_locked;
478
479 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
480 data_len);
481 if (r < 0)
482 goto bad_locked;
483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489
490 return dm_tm_commit(pmd->tm, sblock);
491
492bad_locked:
493 dm_bm_unlock(sblock);
494 return r;
495}
496
497static int __format_metadata(struct dm_pool_metadata *pmd)
498{
499 int r;
500
501 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
502 &pmd->tm, &pmd->metadata_sm);
503 if (r < 0) {
504 DMERR("tm_create_with_sm failed");
505 return r;
506 }
507
508 pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
509 if (IS_ERR(pmd->data_sm)) {
510 DMERR("sm_disk_create failed");
511 r = PTR_ERR(pmd->data_sm);
512 goto bad_cleanup_tm;
513 }
514
515 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
516 if (!pmd->nb_tm) {
517 DMERR("could not create non-blocking clone tm");
518 r = -ENOMEM;
519 goto bad_cleanup_data_sm;
520 }
521
522 __setup_btree_details(pmd);
523
524 r = dm_btree_empty(&pmd->info, &pmd->root);
525 if (r < 0)
526 goto bad_cleanup_nb_tm;
527
528 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
529 if (r < 0) {
530 DMERR("couldn't create devices root");
531 goto bad_cleanup_nb_tm;
532 }
533
534 r = __write_initial_superblock(pmd);
535 if (r)
536 goto bad_cleanup_nb_tm;
537
538 return 0;
539
540bad_cleanup_nb_tm:
541 dm_tm_destroy(pmd->nb_tm);
542bad_cleanup_data_sm:
543 dm_sm_destroy(pmd->data_sm);
544bad_cleanup_tm:
545 dm_tm_destroy(pmd->tm);
546 dm_sm_destroy(pmd->metadata_sm);
547
548 return r;
549}
550
551static int __check_incompat_features(struct thin_disk_superblock *disk_super,
552 struct dm_pool_metadata *pmd)
553{
554 uint32_t features;
555
556 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
557 if (features) {
558 DMERR("could not access metadata due to unsupported optional features (%lx).",
559 (unsigned long)features);
560 return -EINVAL;
561 }
562
563 /*
564 * Check for read-only metadata to skip the following RDWR checks.
565 */
566 if (get_disk_ro(pmd->bdev->bd_disk))
567 return 0;
568
569 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
570 if (features) {
571 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
572 (unsigned long)features);
573 return -EINVAL;
574 }
575
576 return 0;
577}
578
579static int __open_metadata(struct dm_pool_metadata *pmd)
580{
581 int r;
582 struct dm_block *sblock;
583 struct thin_disk_superblock *disk_super;
584
585 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
586 &sb_validator, &sblock);
587 if (r < 0) {
588 DMERR("couldn't read superblock");
589 return r;
590 }
591
592 disk_super = dm_block_data(sblock);
593
594 r = __check_incompat_features(disk_super, pmd);
595 if (r < 0)
596 goto bad_unlock_sblock;
597
598 r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
599 disk_super->metadata_space_map_root,
600 sizeof(disk_super->metadata_space_map_root),
601 &pmd->tm, &pmd->metadata_sm);
602 if (r < 0) {
603 DMERR("tm_open_with_sm failed");
604 goto bad_unlock_sblock;
605 }
606
607 pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
608 sizeof(disk_super->data_space_map_root));
609 if (IS_ERR(pmd->data_sm)) {
610 DMERR("sm_disk_open failed");
611 r = PTR_ERR(pmd->data_sm);
612 goto bad_cleanup_tm;
613 }
614
615 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
616 if (!pmd->nb_tm) {
617 DMERR("could not create non-blocking clone tm");
618 r = -ENOMEM;
619 goto bad_cleanup_data_sm;
620 }
621
622 __setup_btree_details(pmd);
623 return dm_bm_unlock(sblock);
624
625bad_cleanup_data_sm:
626 dm_sm_destroy(pmd->data_sm);
627bad_cleanup_tm:
628 dm_tm_destroy(pmd->tm);
629 dm_sm_destroy(pmd->metadata_sm);
630bad_unlock_sblock:
631 dm_bm_unlock(sblock);
632
633 return r;
634}
635
636static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
637{
638 int r, unformatted;
639
640 r = __superblock_all_zeroes(pmd->bm, &unformatted);
641 if (r)
642 return r;
643
644 if (unformatted)
645 return format_device ? __format_metadata(pmd) : -EPERM;
646
647 return __open_metadata(pmd);
648}
649
650static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
651{
652 int r;
653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) {
658 DMERR("could not create block manager");
659 return PTR_ERR(pmd->bm);
660 }
661
662 r = __open_or_format_metadata(pmd, format_device);
663 if (r)
664 dm_block_manager_destroy(pmd->bm);
665
666 return r;
667}
668
669static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
670{
671 dm_sm_destroy(pmd->data_sm);
672 dm_sm_destroy(pmd->metadata_sm);
673 dm_tm_destroy(pmd->nb_tm);
674 dm_tm_destroy(pmd->tm);
675 dm_block_manager_destroy(pmd->bm);
676}
677
678static int __begin_transaction(struct dm_pool_metadata *pmd)
679{
680 int r;
681 struct thin_disk_superblock *disk_super;
682 struct dm_block *sblock;
683
684 /*
685 * We re-read the superblock every time. Shouldn't need to do this
686 * really.
687 */
688 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
689 &sb_validator, &sblock);
690 if (r)
691 return r;
692
693 disk_super = dm_block_data(sblock);
694 pmd->time = le32_to_cpu(disk_super->time);
695 pmd->root = le64_to_cpu(disk_super->data_mapping_root);
696 pmd->details_root = le64_to_cpu(disk_super->device_details_root);
697 pmd->trans_id = le64_to_cpu(disk_super->trans_id);
698 pmd->flags = le32_to_cpu(disk_super->flags);
699 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
700
701 dm_bm_unlock(sblock);
702 return 0;
703}
704
705static int __write_changed_details(struct dm_pool_metadata *pmd)
706{
707 int r;
708 struct dm_thin_device *td, *tmp;
709 struct disk_device_details details;
710 uint64_t key;
711
712 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
713 if (!td->changed)
714 continue;
715
716 key = td->id;
717
718 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
719 details.transaction_id = cpu_to_le64(td->transaction_id);
720 details.creation_time = cpu_to_le32(td->creation_time);
721 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
722 __dm_bless_for_disk(&details);
723
724 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
725 &key, &details, &pmd->details_root);
726 if (r)
727 return r;
728
729 if (td->open_count)
730 td->changed = 0;
731 else {
732 list_del(&td->list);
733 kfree(td);
734 }
735 }
736
737 return 0;
738}
739
740static int __commit_transaction(struct dm_pool_metadata *pmd)
741{
742 int r;
743 size_t metadata_len, data_len;
744 struct thin_disk_superblock *disk_super;
745 struct dm_block *sblock;
746
747 /*
748 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
749 */
750 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
751
752 r = __write_changed_details(pmd);
753 if (r < 0)
754 return r;
755
756 r = dm_sm_commit(pmd->data_sm);
757 if (r < 0)
758 return r;
759
760 r = dm_tm_pre_commit(pmd->tm);
761 if (r < 0)
762 return r;
763
764 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
765 if (r < 0)
766 return r;
767
768 r = dm_sm_root_size(pmd->data_sm, &data_len);
769 if (r < 0)
770 return r;
771
772 r = superblock_lock(pmd, &sblock);
773 if (r)
774 return r;
775
776 disk_super = dm_block_data(sblock);
777 disk_super->time = cpu_to_le32(pmd->time);
778 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
779 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
780 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
781 disk_super->flags = cpu_to_le32(pmd->flags);
782
783 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
784 metadata_len);
785 if (r < 0)
786 goto out_locked;
787
788 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
789 data_len);
790 if (r < 0)
791 goto out_locked;
792
793 return dm_tm_commit(pmd->tm, sblock);
794
795out_locked:
796 dm_bm_unlock(sblock);
797 return r;
798}
799
800struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
801 sector_t data_block_size,
802 bool format_device)
803{
804 int r;
805 struct dm_pool_metadata *pmd;
806
807 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
808 if (!pmd) {
809 DMERR("could not allocate metadata struct");
810 return ERR_PTR(-ENOMEM);
811 }
812
813 init_rwsem(&pmd->root_lock);
814 pmd->time = 0;
815 INIT_LIST_HEAD(&pmd->thin_devices);
816 pmd->read_only = false;
817 pmd->fail_io = false;
818 pmd->bdev = bdev;
819 pmd->data_block_size = data_block_size;
820
821 r = __create_persistent_data_objects(pmd, format_device);
822 if (r) {
823 kfree(pmd);
824 return ERR_PTR(r);
825 }
826
827 r = __begin_transaction(pmd);
828 if (r < 0) {
829 if (dm_pool_metadata_close(pmd) < 0)
830 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
831 return ERR_PTR(r);
832 }
833
834 return pmd;
835}
836
837int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
838{
839 int r;
840 unsigned open_devices = 0;
841 struct dm_thin_device *td, *tmp;
842
843 down_read(&pmd->root_lock);
844 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
845 if (td->open_count)
846 open_devices++;
847 else {
848 list_del(&td->list);
849 kfree(td);
850 }
851 }
852 up_read(&pmd->root_lock);
853
854 if (open_devices) {
855 DMERR("attempt to close pmd when %u device(s) are still open",
856 open_devices);
857 return -EBUSY;
858 }
859
860 if (!pmd->read_only && !pmd->fail_io) {
861 r = __commit_transaction(pmd);
862 if (r < 0)
863 DMWARN("%s: __commit_transaction() failed, error = %d",
864 __func__, r);
865 }
866
867 if (!pmd->fail_io)
868 __destroy_persistent_data_objects(pmd);
869
870 kfree(pmd);
871 return 0;
872}
873
874/*
875 * __open_device: Returns @td corresponding to device with id @dev,
876 * creating it if @create is set and incrementing @td->open_count.
877 * On failure, @td is undefined.
878 */
879static int __open_device(struct dm_pool_metadata *pmd,
880 dm_thin_id dev, int create,
881 struct dm_thin_device **td)
882{
883 int r, changed = 0;
884 struct dm_thin_device *td2;
885 uint64_t key = dev;
886 struct disk_device_details details_le;
887
888 /*
889 * If the device is already open, return it.
890 */
891 list_for_each_entry(td2, &pmd->thin_devices, list)
892 if (td2->id == dev) {
893 /*
894 * May not create an already-open device.
895 */
896 if (create)
897 return -EEXIST;
898
899 td2->open_count++;
900 *td = td2;
901 return 0;
902 }
903
904 /*
905 * Check the device exists.
906 */
907 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
908 &key, &details_le);
909 if (r) {
910 if (r != -ENODATA || !create)
911 return r;
912
913 /*
914 * Create new device.
915 */
916 changed = 1;
917 details_le.mapped_blocks = 0;
918 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
919 details_le.creation_time = cpu_to_le32(pmd->time);
920 details_le.snapshotted_time = cpu_to_le32(pmd->time);
921 }
922
923 *td = kmalloc(sizeof(**td), GFP_NOIO);
924 if (!*td)
925 return -ENOMEM;
926
927 (*td)->pmd = pmd;
928 (*td)->id = dev;
929 (*td)->open_count = 1;
930 (*td)->changed = changed;
931 (*td)->aborted_with_changes = false;
932 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
933 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
934 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
935 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
936
937 list_add(&(*td)->list, &pmd->thin_devices);
938
939 return 0;
940}
941
942static void __close_device(struct dm_thin_device *td)
943{
944 --td->open_count;
945}
946
947static int __create_thin(struct dm_pool_metadata *pmd,
948 dm_thin_id dev)
949{
950 int r;
951 dm_block_t dev_root;
952 uint64_t key = dev;
953 struct disk_device_details details_le;
954 struct dm_thin_device *td;
955 __le64 value;
956
957 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
958 &key, &details_le);
959 if (!r)
960 return -EEXIST;
961
962 /*
963 * Create an empty btree for the mappings.
964 */
965 r = dm_btree_empty(&pmd->bl_info, &dev_root);
966 if (r)
967 return r;
968
969 /*
970 * Insert it into the main mapping tree.
971 */
972 value = cpu_to_le64(dev_root);
973 __dm_bless_for_disk(&value);
974 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
975 if (r) {
976 dm_btree_del(&pmd->bl_info, dev_root);
977 return r;
978 }
979
980 r = __open_device(pmd, dev, 1, &td);
981 if (r) {
982 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
983 dm_btree_del(&pmd->bl_info, dev_root);
984 return r;
985 }
986 __close_device(td);
987
988 return r;
989}
990
991int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
992{
993 int r = -EINVAL;
994
995 down_write(&pmd->root_lock);
996 if (!pmd->fail_io)
997 r = __create_thin(pmd, dev);
998 up_write(&pmd->root_lock);
999
1000 return r;
1001}
1002
1003static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1004 struct dm_thin_device *snap,
1005 dm_thin_id origin, uint32_t time)
1006{
1007 int r;
1008 struct dm_thin_device *td;
1009
1010 r = __open_device(pmd, origin, 0, &td);
1011 if (r)
1012 return r;
1013
1014 td->changed = 1;
1015 td->snapshotted_time = time;
1016
1017 snap->mapped_blocks = td->mapped_blocks;
1018 snap->snapshotted_time = time;
1019 __close_device(td);
1020
1021 return 0;
1022}
1023
1024static int __create_snap(struct dm_pool_metadata *pmd,
1025 dm_thin_id dev, dm_thin_id origin)
1026{
1027 int r;
1028 dm_block_t origin_root;
1029 uint64_t key = origin, dev_key = dev;
1030 struct dm_thin_device *td;
1031 struct disk_device_details details_le;
1032 __le64 value;
1033
1034 /* check this device is unused */
1035 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1036 &dev_key, &details_le);
1037 if (!r)
1038 return -EEXIST;
1039
1040 /* find the mapping tree for the origin */
1041 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1042 if (r)
1043 return r;
1044 origin_root = le64_to_cpu(value);
1045
1046 /* clone the origin, an inc will do */
1047 dm_tm_inc(pmd->tm, origin_root);
1048
1049 /* insert into the main mapping tree */
1050 value = cpu_to_le64(origin_root);
1051 __dm_bless_for_disk(&value);
1052 key = dev;
1053 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1054 if (r) {
1055 dm_tm_dec(pmd->tm, origin_root);
1056 return r;
1057 }
1058
1059 pmd->time++;
1060
1061 r = __open_device(pmd, dev, 1, &td);
1062 if (r)
1063 goto bad;
1064
1065 r = __set_snapshot_details(pmd, td, origin, pmd->time);
1066 __close_device(td);
1067
1068 if (r)
1069 goto bad;
1070
1071 return 0;
1072
1073bad:
1074 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1075 dm_btree_remove(&pmd->details_info, pmd->details_root,
1076 &key, &pmd->details_root);
1077 return r;
1078}
1079
1080int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1081 dm_thin_id dev,
1082 dm_thin_id origin)
1083{
1084 int r = -EINVAL;
1085
1086 down_write(&pmd->root_lock);
1087 if (!pmd->fail_io)
1088 r = __create_snap(pmd, dev, origin);
1089 up_write(&pmd->root_lock);
1090
1091 return r;
1092}
1093
1094static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1095{
1096 int r;
1097 uint64_t key = dev;
1098 struct dm_thin_device *td;
1099
1100 /* TODO: failure should mark the transaction invalid */
1101 r = __open_device(pmd, dev, 0, &td);
1102 if (r)
1103 return r;
1104
1105 if (td->open_count > 1) {
1106 __close_device(td);
1107 return -EBUSY;
1108 }
1109
1110 list_del(&td->list);
1111 kfree(td);
1112 r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1113 &key, &pmd->details_root);
1114 if (r)
1115 return r;
1116
1117 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1118 if (r)
1119 return r;
1120
1121 return 0;
1122}
1123
1124int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1125 dm_thin_id dev)
1126{
1127 int r = -EINVAL;
1128
1129 down_write(&pmd->root_lock);
1130 if (!pmd->fail_io)
1131 r = __delete_device(pmd, dev);
1132 up_write(&pmd->root_lock);
1133
1134 return r;
1135}
1136
1137int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1138 uint64_t current_id,
1139 uint64_t new_id)
1140{
1141 int r = -EINVAL;
1142
1143 down_write(&pmd->root_lock);
1144
1145 if (pmd->fail_io)
1146 goto out;
1147
1148 if (pmd->trans_id != current_id) {
1149 DMERR("mismatched transaction id");
1150 goto out;
1151 }
1152
1153 pmd->trans_id = new_id;
1154 r = 0;
1155
1156out:
1157 up_write(&pmd->root_lock);
1158
1159 return r;
1160}
1161
1162int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1163 uint64_t *result)
1164{
1165 int r = -EINVAL;
1166
1167 down_read(&pmd->root_lock);
1168 if (!pmd->fail_io) {
1169 *result = pmd->trans_id;
1170 r = 0;
1171 }
1172 up_read(&pmd->root_lock);
1173
1174 return r;
1175}
1176
1177static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1178{
1179 int r, inc;
1180 struct thin_disk_superblock *disk_super;
1181 struct dm_block *copy, *sblock;
1182 dm_block_t held_root;
1183
1184 /*
1185 * Copy the superblock.
1186 */
1187 dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1188 r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1189 &sb_validator, &copy, &inc);
1190 if (r)
1191 return r;
1192
1193 BUG_ON(!inc);
1194
1195 held_root = dm_block_location(copy);
1196 disk_super = dm_block_data(copy);
1197
1198 if (le64_to_cpu(disk_super->held_root)) {
1199 DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1200
1201 dm_tm_dec(pmd->tm, held_root);
1202 dm_tm_unlock(pmd->tm, copy);
1203 return -EBUSY;
1204 }
1205
1206 /*
1207	 * Wipe the space maps since we're not publishing this.
1208 */
1209 memset(&disk_super->data_space_map_root, 0,
1210 sizeof(disk_super->data_space_map_root));
1211 memset(&disk_super->metadata_space_map_root, 0,
1212 sizeof(disk_super->metadata_space_map_root));
1213
1214 /*
1215 * Increment the data structures that need to be preserved.
1216 */
1217 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1218 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1219 dm_tm_unlock(pmd->tm, copy);
1220
1221 /*
1222 * Write the held root into the superblock.
1223 */
1224 r = superblock_lock(pmd, &sblock);
1225 if (r) {
1226 dm_tm_dec(pmd->tm, held_root);
1227 return r;
1228 }
1229
1230 disk_super = dm_block_data(sblock);
1231 disk_super->held_root = cpu_to_le64(held_root);
1232 dm_bm_unlock(sblock);
1233 return 0;
1234}
1235
1236int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1237{
1238 int r = -EINVAL;
1239
1240 down_write(&pmd->root_lock);
1241 if (!pmd->fail_io)
1242 r = __reserve_metadata_snap(pmd);
1243 up_write(&pmd->root_lock);
1244
1245 return r;
1246}
1247
1248static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1249{
1250 int r;
1251 struct thin_disk_superblock *disk_super;
1252 struct dm_block *sblock, *copy;
1253 dm_block_t held_root;
1254
1255 r = superblock_lock(pmd, &sblock);
1256 if (r)
1257 return r;
1258
1259 disk_super = dm_block_data(sblock);
1260 held_root = le64_to_cpu(disk_super->held_root);
1261 disk_super->held_root = cpu_to_le64(0);
1262
1263 dm_bm_unlock(sblock);
1264
1265 if (!held_root) {
1266 DMWARN("No pool metadata snapshot found: nothing to release.");
1267 return -EINVAL;
1268 }
1269
1270 r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1271 if (r)
1272 return r;
1273
1274 disk_super = dm_block_data(copy);
1275 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
1276 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
1277 dm_sm_dec_block(pmd->metadata_sm, held_root);
1278
1279 return dm_tm_unlock(pmd->tm, copy);
1280}
1281
1282int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1283{
1284 int r = -EINVAL;
1285
1286 down_write(&pmd->root_lock);
1287 if (!pmd->fail_io)
1288 r = __release_metadata_snap(pmd);
1289 up_write(&pmd->root_lock);
1290
1291 return r;
1292}
1293
1294static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1295 dm_block_t *result)
1296{
1297 int r;
1298 struct thin_disk_superblock *disk_super;
1299 struct dm_block *sblock;
1300
1301 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1302 &sb_validator, &sblock);
1303 if (r)
1304 return r;
1305
1306 disk_super = dm_block_data(sblock);
1307 *result = le64_to_cpu(disk_super->held_root);
1308
1309 return dm_bm_unlock(sblock);
1310}
1311
1312int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1313 dm_block_t *result)
1314{
1315 int r = -EINVAL;
1316
1317 down_read(&pmd->root_lock);
1318 if (!pmd->fail_io)
1319 r = __get_metadata_snap(pmd, result);
1320 up_read(&pmd->root_lock);
1321
1322 return r;
1323}
1324
1325int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1326 struct dm_thin_device **td)
1327{
1328 int r = -EINVAL;
1329
1330 down_write(&pmd->root_lock);
1331 if (!pmd->fail_io)
1332 r = __open_device(pmd, dev, 0, td);
1333 up_write(&pmd->root_lock);
1334
1335 return r;
1336}
1337
1338int dm_pool_close_thin_device(struct dm_thin_device *td)
1339{
1340 down_write(&td->pmd->root_lock);
1341 __close_device(td);
1342 up_write(&td->pmd->root_lock);
1343
1344 return 0;
1345}
1346
1347dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1348{
1349 return td->id;
1350}
1351
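/*
 * Sharing test used by dm_thin_find_block() below: each mapping packs both
 * the data block and the time the mapping was created.  A block is reported
 * as shared when that creation time predates the device's last snapshot,
 * i.e. when __snapshotted_since() returns true.
 */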
1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1353{
1354 return td->snapshotted_time > time;
1355}
1356
1357int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1358 int can_block, struct dm_thin_lookup_result *result)
1359{
1360 int r = -EINVAL;
1361 uint64_t block_time = 0;
1362 __le64 value;
1363 struct dm_pool_metadata *pmd = td->pmd;
1364 dm_block_t keys[2] = { td->id, block };
1365 struct dm_btree_info *info;
1366
1367 if (can_block) {
1368 down_read(&pmd->root_lock);
1369 info = &pmd->info;
1370 } else if (down_read_trylock(&pmd->root_lock))
1371 info = &pmd->nb_info;
1372 else
1373 return -EWOULDBLOCK;
1374
1375 if (pmd->fail_io)
1376 goto out;
1377
1378 r = dm_btree_lookup(info, pmd->root, keys, &value);
1379 if (!r)
1380 block_time = le64_to_cpu(value);
1381
1382out:
1383 up_read(&pmd->root_lock);
1384
1385 if (!r) {
1386 dm_block_t exception_block;
1387 uint32_t exception_time;
1388 unpack_block_time(block_time, &exception_block,
1389 &exception_time);
1390 result->block = exception_block;
1391 result->shared = __snapshotted_since(td, exception_time);
1392 }
1393
1394 return r;
1395}
1396
1397static int __insert(struct dm_thin_device *td, dm_block_t block,
1398 dm_block_t data_block)
1399{
1400 int r, inserted;
1401 __le64 value;
1402 struct dm_pool_metadata *pmd = td->pmd;
1403 dm_block_t keys[2] = { td->id, block };
1404
1405 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1406 __dm_bless_for_disk(&value);
1407
1408 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1409 &pmd->root, &inserted);
1410 if (r)
1411 return r;
1412
1413 td->changed = 1;
1414 if (inserted)
1415 td->mapped_blocks++;
1416
1417 return 0;
1418}
1419
1420int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1421 dm_block_t data_block)
1422{
1423 int r = -EINVAL;
1424
1425 down_write(&td->pmd->root_lock);
1426 if (!td->pmd->fail_io)
1427 r = __insert(td, block, data_block);
1428 up_write(&td->pmd->root_lock);
1429
1430 return r;
1431}
1432
1433static int __remove(struct dm_thin_device *td, dm_block_t block)
1434{
1435 int r;
1436 struct dm_pool_metadata *pmd = td->pmd;
1437 dm_block_t keys[2] = { td->id, block };
1438
1439 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1440 if (r)
1441 return r;
1442
1443 td->mapped_blocks--;
1444 td->changed = 1;
1445
1446 return 0;
1447}
1448
1449int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1450{
1451 int r = -EINVAL;
1452
1453 down_write(&td->pmd->root_lock);
1454 if (!td->pmd->fail_io)
1455 r = __remove(td, block);
1456 up_write(&td->pmd->root_lock);
1457
1458 return r;
1459}
1460
1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1462{
1463 int r;
1464
1465 down_read(&td->pmd->root_lock);
1466 r = td->changed;
1467 up_read(&td->pmd->root_lock);
1468
1469 return r;
1470}
1471
1472bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{
1474 bool r;
1475
1476 down_read(&td->pmd->root_lock);
1477 r = td->aborted_with_changes;
1478 up_read(&td->pmd->root_lock);
1479
1480 return r;
1481}
1482
1483int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1484{
1485 int r = -EINVAL;
1486
1487 down_write(&pmd->root_lock);
1488 if (!pmd->fail_io)
1489 r = dm_sm_new_block(pmd->data_sm, result);
1490 up_write(&pmd->root_lock);
1491
1492 return r;
1493}
1494
1495int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1496{
1497 int r = -EINVAL;
1498
1499 down_write(&pmd->root_lock);
1500 if (pmd->fail_io)
1501 goto out;
1502
1503 r = __commit_transaction(pmd);
1504 if (r <= 0)
1505 goto out;
1506
1507 /*
1508 * Open the next transaction.
1509 */
1510 r = __begin_transaction(pmd);
1511out:
1512 up_write(&pmd->root_lock);
1513 return r;
1514}
1515
1516static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1517{
1518 struct dm_thin_device *td;
1519
1520 list_for_each_entry(td, &pmd->thin_devices, list)
1521 td->aborted_with_changes = td->changed;
1522}
1523
1524int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1525{
1526 int r = -EINVAL;
1527
1528 down_write(&pmd->root_lock);
1529 if (pmd->fail_io)
1530 goto out;
1531
1532 __set_abort_with_changes_flags(pmd);
1533 __destroy_persistent_data_objects(pmd);
1534 r = __create_persistent_data_objects(pmd, false);
1535 if (r)
1536 pmd->fail_io = true;
1537
1538out:
1539 up_write(&pmd->root_lock);
1540
1541 return r;
1542}
1543
1544int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1545{
1546 int r = -EINVAL;
1547
1548 down_read(&pmd->root_lock);
1549 if (!pmd->fail_io)
1550 r = dm_sm_get_nr_free(pmd->data_sm, result);
1551 up_read(&pmd->root_lock);
1552
1553 return r;
1554}
1555
1556int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1557 dm_block_t *result)
1558{
1559 int r = -EINVAL;
1560
1561 down_read(&pmd->root_lock);
1562 if (!pmd->fail_io)
1563 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1564 up_read(&pmd->root_lock);
1565
1566 return r;
1567}
1568
1569int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1570 dm_block_t *result)
1571{
1572 int r = -EINVAL;
1573
1574 down_read(&pmd->root_lock);
1575 if (!pmd->fail_io)
1576 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1577 up_read(&pmd->root_lock);
1578
1579 return r;
1580}
1581
1582int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1583{
1584 down_read(&pmd->root_lock);
1585 *result = pmd->data_block_size;
1586 up_read(&pmd->root_lock);
1587
1588 return 0;
1589}
1590
1591int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1592{
1593 int r = -EINVAL;
1594
1595 down_read(&pmd->root_lock);
1596 if (!pmd->fail_io)
1597 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1598 up_read(&pmd->root_lock);
1599
1600 return r;
1601}
1602
1603int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1604{
1605 int r = -EINVAL;
1606 struct dm_pool_metadata *pmd = td->pmd;
1607
1608 down_read(&pmd->root_lock);
1609 if (!pmd->fail_io) {
1610 *result = td->mapped_blocks;
1611 r = 0;
1612 }
1613 up_read(&pmd->root_lock);
1614
1615 return r;
1616}
1617
1618static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1619{
1620 int r;
1621 __le64 value_le;
1622 dm_block_t thin_root;
1623 struct dm_pool_metadata *pmd = td->pmd;
1624
1625 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1626 if (r)
1627 return r;
1628
1629 thin_root = le64_to_cpu(value_le);
1630
1631 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1632}
1633
1634int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1635 dm_block_t *result)
1636{
1637 int r = -EINVAL;
1638 struct dm_pool_metadata *pmd = td->pmd;
1639
1640 down_read(&pmd->root_lock);
1641 if (!pmd->fail_io)
1642 r = __highest_block(td, result);
1643 up_read(&pmd->root_lock);
1644
1645 return r;
1646}
1647
1648static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1649{
1650 int r;
1651 dm_block_t old_count;
1652
1653 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
1654 if (r)
1655 return r;
1656
1657 if (new_count == old_count)
1658 return 0;
1659
1660 if (new_count < old_count) {
1661 DMERR("cannot reduce size of data device");
1662 return -EINVAL;
1663 }
1664
1665 return dm_sm_extend(pmd->data_sm, new_count - old_count);
1666}
1667
1668int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1669{
1670 int r = -EINVAL;
1671
1672 down_write(&pmd->root_lock);
1673 if (!pmd->fail_io)
1674 r = __resize_data_dev(pmd, new_count);
1675 up_write(&pmd->root_lock);
1676
1677 return r;
1678}
1679
1680void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1681{
1682 down_write(&pmd->root_lock);
1683 pmd->read_only = true;
1684 dm_bm_set_read_only(pmd->bm);
1685 up_write(&pmd->root_lock);
1686}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
deleted file mode 100644
index 0cecc370288..00000000000
--- a/drivers/md/dm-thin-metadata.h
+++ /dev/null
@@ -1,197 +0,0 @@
1/*
2 * Copyright (C) 2010-2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_THIN_METADATA_H
8#define DM_THIN_METADATA_H
9
10#include "persistent-data/dm-block-manager.h"
11
12#define THIN_METADATA_BLOCK_SIZE 4096
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about 16k metadata blocks.
19 */
20#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
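/*
 * That is 255 * 16384 * (4096 / 512) = 33423360 sectors, i.e. just under
 * 16GB of metadata.
 */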
21
22/*
23 * A metadata device larger than 16GB triggers a warning.
24 */
25#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
26
27/*----------------------------------------------------------------*/
28
29struct dm_pool_metadata;
30struct dm_thin_device;
31
32/*
33 * Device identifier
34 */
35typedef uint64_t dm_thin_id;
36
37/*
38 * Reopens or creates a new, empty metadata volume.
39 */
40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
41 sector_t data_block_size,
42 bool format_device);
43
44int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
45
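/*
 * Illustrative call sequence only (error handling omitted); dev_id,
 * virt_block and data_block are placeholder values:
 *
 *	pmd = dm_pool_metadata_open(bdev, data_block_size, true);
 *	dm_pool_create_thin(pmd, dev_id);
 *	dm_pool_open_thin_device(pmd, dev_id, &td);
 *	dm_thin_insert_block(td, virt_block, data_block);
 *	dm_pool_commit_metadata(pmd);
 *	dm_pool_close_thin_device(td);
 *	dm_pool_metadata_close(pmd);
 */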
46/*
47 * Compat feature flags. Any incompat flags beyond the ones
48 * specified below will prevent use of the thin metadata.
49 */
50#define THIN_FEATURE_COMPAT_SUPP 0UL
51#define THIN_FEATURE_COMPAT_RO_SUPP 0UL
52#define THIN_FEATURE_INCOMPAT_SUPP 0UL
53
54/*
55 * Device creation/deletion.
56 */
57int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev);
58
59/*
60 * An internal snapshot.
61 *
62 * You can only snapshot a quiesced origin, i.e. one that is either
63 * suspended or not instanced at all.
64 */
65int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev,
66 dm_thin_id origin);
67
68/*
69 * Deletes a virtual device from the metadata. It _is_ safe to call this
70 * when that device is open. Operations on that device will just start
71 * failing. You still need to call close() on the device.
72 */
73int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
74 dm_thin_id dev);
75
76/*
77 * Commits _all_ metadata changes: device creation, deletion, mapping
78 * updates.
79 */
80int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
81
82/*
83 * Discards all uncommitted changes. Rereads the superblock, rolling back
84 * to the last good transaction. Thin devices remain open.
85 * dm_thin_aborted_changes() tells you if they had uncommitted changes.
86 *
87 * If this call fails it's only useful to call dm_pool_metadata_close().
88 * All other methods will fail with -EINVAL.
89 */
90int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
91
92/*
93 * Set/get userspace transaction id.
94 */
95int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
96 uint64_t current_id,
97 uint64_t new_id);
98
99int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
100 uint64_t *result);
101
102/*
103 * Hold/get root for userspace transaction.
104 *
105 * The metadata snapshot is a copy of the current superblock (minus the
106 * space maps). Userland can access the data structures for READ
107 * operations only. A small performance hit is incurred by providing this
108 * copy of the metadata to userland due to extra copy-on-write operations
109 * on the metadata nodes. Release this as soon as you finish with it.
110 */
111int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd);
112int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd);
113
114int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
115 dm_block_t *result);
116
117/*
118 * Actions on a single virtual device.
119 */
120
121/*
122 * Opening the same device more than once will fail with -EBUSY.
123 */
124int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
125 struct dm_thin_device **td);
126
127int dm_pool_close_thin_device(struct dm_thin_device *td);
128
129dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
130
131struct dm_thin_lookup_result {
132 dm_block_t block;
133 unsigned shared:1;
134};
135
136/*
137 * Returns:
138 * -EWOULDBLOCK iff @can_block is set and would block.
139 * -ENODATA iff that mapping is not present.
140 * 0 success
141 */
142int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
143 int can_block, struct dm_thin_lookup_result *result);
144
145/*
146 * Obtain an unused block.
147 */
148int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
149
150/*
151 * Insert or remove block.
152 */
153int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
154 dm_block_t data_block);
155
156int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
157
158/*
159 * Queries.
160 */
161bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
162
163bool dm_thin_aborted_changes(struct dm_thin_device *td);
164
165int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
166 dm_block_t *highest_mapped);
167
168int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result);
169
170int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd,
171 dm_block_t *result);
172
173int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
174 dm_block_t *result);
175
176int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
177 dm_block_t *result);
178
179int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
180
181int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
182
183/*
184 * Returns -EINVAL if the new size is smaller than the current size,
185 * since already allocated blocks would be lost.
186 */
187int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
188
189/*
190 * Flicks the underlying block manager into read only mode, so you know
191 * that nothing is changing.
192 */
193void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
194
195/*----------------------------------------------------------------*/
196
197#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
deleted file mode 100644
index 675ae527401..00000000000
--- a/drivers/md/dm-thin.c
+++ /dev/null
@@ -1,2818 +0,0 @@
1/*
2 * Copyright (C) 2011-2012 Red Hat UK.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "dm-bio-prison.h"
9#include "dm.h"
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/list.h>
15#include <linux/init.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18
19#define DM_MSG_PREFIX "thin"
20
21/*
22 * Tunable constants
23 */
24#define ENDIO_HOOK_POOL_SIZE 1024
25#define MAPPING_POOL_SIZE 1024
26#define PRISON_CELLS 1024
27#define COMMIT_PERIOD HZ
28
29/*
30 * The block size of the device holding pool data must be
31 * between 64KB and 1GB.
32 */
33#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
34#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
35
36/*
37 * Device id is restricted to 24 bits.
38 */
39#define MAX_DEV_ID ((1 << 24) - 1)
40
41/*
42 * How do we handle breaking sharing of data blocks?
43 * =================================================
44 *
45 * We use a standard copy-on-write btree to store the mappings for the
46 * devices (note I'm talking about copy-on-write of the metadata here, not
47 * the data). When you take an internal snapshot you clone the root node
48 * of the origin btree. After this there is no concept of an origin or a
49 * snapshot. They are just two device trees that happen to point to the
50 * same data blocks.
51 *
52 * When we get a write in we decide if it's to a shared data block using
53 * some timestamp magic. If it is, we have to break sharing.
54 *
55 * Let's say we write to a shared block in what was the origin. The
56 * steps are:
57 *
58 * i) plug further io to this physical block. (see bio_prison code).
59 *
60 * ii) quiesce any read io to that shared data block. Obviously
61 * including all devices that share this block. (see dm_deferred_set code)
62 *
63 * iii) copy the data block to a newly allocated block. This step can be
64 * skipped if the io covers the block. (schedule_copy).
65 *
66 * iv) insert the new mapping into the origin's btree
67 * (process_prepared_mapping). This act of inserting breaks some
68 * sharing of btree nodes between the two devices. Breaking sharing only
69 * affects the btree of that specific device. Btrees for the other
70 * devices that share the block never change. The btree for the origin
71 * device as it was after the last commit is untouched, i.e. we're using
72 * persistent data structures in the functional programming sense.
73 *
74 * v) unplug io to this physical block, including the io that triggered
75 * the breaking of sharing.
76 *
77 * Steps (ii) and (iii) occur in parallel.
78 *
79 * The metadata _doesn't_ need to be committed before the io continues. We
80 * get away with this because the io is always written to a _new_ block.
81 * If there's a crash, then:
82 *
83 * - The origin mapping will point to the old origin block (the shared
84 * one). This will contain the data as it was before the io that triggered
85 * the breaking of sharing came in.
86 *
87 * - The snap mapping still points to the old block. As it would after
88 * the commit.
89 *
90 * The downside of this scheme is that the timestamp magic isn't perfect, and
91 * will continue to think that the data block in the snapshot device is shared
92 * even after the write to the origin has broken sharing. I suspect data
93 * blocks will typically be shared by many different devices, so we're
94 * breaking sharing n + 1 times, rather than n, where n is the number of
95 * devices that reference this data block. At the moment I think the
96 * benefits far, far outweigh the disadvantages.
97 */
98
99/*----------------------------------------------------------------*/
100
101/*
102 * Key building.
103 */
104static void build_data_key(struct dm_thin_device *td,
105 dm_block_t b, struct dm_cell_key *key)
106{
107 key->virtual = 0;
108 key->dev = dm_thin_dev_id(td);
109 key->block = b;
110}
111
112static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
113 struct dm_cell_key *key)
114{
115 key->virtual = 1;
116 key->dev = dm_thin_dev_id(td);
117 key->block = b;
118}
119
120/*----------------------------------------------------------------*/
121
122/*
123 * A pool device ties together a metadata device and a data device. It
124 * also provides the interface for creating and destroying internal
125 * devices.
126 */
127struct dm_thin_new_mapping;
128
129/*
130 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
131 */
132enum pool_mode {
133 PM_WRITE, /* metadata may be changed */
134 PM_READ_ONLY, /* metadata may not be changed */
135 PM_FAIL, /* all I/O fails */
136};
137
138struct pool_features {
139 enum pool_mode mode;
140
141 bool zero_new_blocks:1;
142 bool discard_enabled:1;
143 bool discard_passdown:1;
144};
145
146struct thin_c;
147typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
148typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
149
150struct pool {
151 struct list_head list;
152 struct dm_target *ti; /* Only set if a pool target is bound */
153
154 struct mapped_device *pool_md;
155 struct block_device *md_dev;
156 struct dm_pool_metadata *pmd;
157
158 dm_block_t low_water_blocks;
159 uint32_t sectors_per_block;
160 int sectors_per_block_shift;
161
162 struct pool_features pf;
163 unsigned low_water_triggered:1; /* A dm event has been sent */
164 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
165
166 struct dm_bio_prison *prison;
167 struct dm_kcopyd_client *copier;
168
169 struct workqueue_struct *wq;
170 struct work_struct worker;
171 struct delayed_work waker;
172
173 unsigned long last_commit_jiffies;
174 unsigned ref_count;
175
176 spinlock_t lock;
177 struct bio_list deferred_bios;
178 struct bio_list deferred_flush_bios;
179 struct list_head prepared_mappings;
180 struct list_head prepared_discards;
181
182 struct bio_list retry_on_resume_list;
183
184 struct dm_deferred_set *shared_read_ds;
185 struct dm_deferred_set *all_io_ds;
186
187 struct dm_thin_new_mapping *next_mapping;
188 mempool_t *mapping_pool;
189
190 process_bio_fn process_bio;
191 process_bio_fn process_discard;
192
193 process_mapping_fn process_prepared_mapping;
194 process_mapping_fn process_prepared_discard;
195};
196
197static enum pool_mode get_pool_mode(struct pool *pool);
198static void set_pool_mode(struct pool *pool, enum pool_mode mode);
199
200/*
201 * Target context for a pool.
202 */
203struct pool_c {
204 struct dm_target *ti;
205 struct pool *pool;
206 struct dm_dev *data_dev;
207 struct dm_dev *metadata_dev;
208 struct dm_target_callbacks callbacks;
209
210 dm_block_t low_water_blocks;
211 struct pool_features requested_pf; /* Features requested during table load */
212 struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
213};
214
215/*
216 * Target context for a thin.
217 */
218struct thin_c {
219 struct dm_dev *pool_dev;
220 struct dm_dev *origin_dev;
221 dm_thin_id dev_id;
222
223 struct pool *pool;
224 struct dm_thin_device *td;
225};
226
227/*----------------------------------------------------------------*/
228
229/*
230 * A global list of pools that uses a struct mapped_device as a key.
231 */
232static struct dm_thin_pool_table {
233 struct mutex mutex;
234 struct list_head pools;
235} dm_thin_pool_table;
236
237static void pool_table_init(void)
238{
239 mutex_init(&dm_thin_pool_table.mutex);
240 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
241}
242
243static void __pool_table_insert(struct pool *pool)
244{
245 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
246 list_add(&pool->list, &dm_thin_pool_table.pools);
247}
248
249static void __pool_table_remove(struct pool *pool)
250{
251 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
252 list_del(&pool->list);
253}
254
255static struct pool *__pool_table_lookup(struct mapped_device *md)
256{
257 struct pool *pool = NULL, *tmp;
258
259 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
260
261 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
262 if (tmp->pool_md == md) {
263 pool = tmp;
264 break;
265 }
266 }
267
268 return pool;
269}
270
271static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
272{
273 struct pool *pool = NULL, *tmp;
274
275 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
276
277 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
278 if (tmp->md_dev == md_dev) {
279 pool = tmp;
280 break;
281 }
282 }
283
284 return pool;
285}
286
287/*----------------------------------------------------------------*/
288
289struct dm_thin_endio_hook {
290 struct thin_c *tc;
291 struct dm_deferred_entry *shared_read_entry;
292 struct dm_deferred_entry *all_io_entry;
293 struct dm_thin_new_mapping *overwrite_mapping;
294};
295
296static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
297{
298 struct bio *bio;
299 struct bio_list bios;
300
301 bio_list_init(&bios);
302 bio_list_merge(&bios, master);
303 bio_list_init(master);
304
305 while ((bio = bio_list_pop(&bios))) {
306 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
307
308 if (h->tc == tc)
309 bio_endio(bio, DM_ENDIO_REQUEUE);
310 else
311 bio_list_add(master, bio);
312 }
313}
314
315static void requeue_io(struct thin_c *tc)
316{
317 struct pool *pool = tc->pool;
318 unsigned long flags;
319
320 spin_lock_irqsave(&pool->lock, flags);
321 __requeue_bio_list(tc, &pool->deferred_bios);
322 __requeue_bio_list(tc, &pool->retry_on_resume_list);
323 spin_unlock_irqrestore(&pool->lock, flags);
324}
325
326/*
327 * This section of code contains the logic for processing a thin device's IO.
328 * Much of the code depends on pool object resources (lists, workqueues, etc)
329 * but most is exclusively called from the thin target rather than the thin-pool
330 * target.
331 */
332
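/*
 * Note on the helpers below: sectors_per_block_shift is presumably set to a
 * negative value when the pool's data block size is not a power of two,
 * forcing the slower sector_div() path; otherwise a shift (plus a mask in
 * remap()) is sufficient.
 */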
333static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
334{
335 sector_t block_nr = bio->bi_sector;
336
337 if (tc->pool->sectors_per_block_shift < 0)
338 (void) sector_div(block_nr, tc->pool->sectors_per_block);
339 else
340 block_nr >>= tc->pool->sectors_per_block_shift;
341
342 return block_nr;
343}
344
345static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
346{
347 struct pool *pool = tc->pool;
348 sector_t bi_sector = bio->bi_sector;
349
350 bio->bi_bdev = tc->pool_dev->bdev;
351 if (tc->pool->sectors_per_block_shift < 0)
352 bio->bi_sector = (block * pool->sectors_per_block) +
353 sector_div(bi_sector, pool->sectors_per_block);
354 else
355 bio->bi_sector = (block << pool->sectors_per_block_shift) |
356 (bi_sector & (pool->sectors_per_block - 1));
357}
358
359static void remap_to_origin(struct thin_c *tc, struct bio *bio)
360{
361 bio->bi_bdev = tc->origin_dev->bdev;
362}
363
364static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
365{
366 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
367 dm_thin_changed_this_transaction(tc->td);
368}
369
370static void inc_all_io_entry(struct pool *pool, struct bio *bio)
371{
372 struct dm_thin_endio_hook *h;
373
374 if (bio->bi_rw & REQ_DISCARD)
375 return;
376
377 h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
378 h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
379}
380
381static void issue(struct thin_c *tc, struct bio *bio)
382{
383 struct pool *pool = tc->pool;
384 unsigned long flags;
385
386 if (!bio_triggers_commit(tc, bio)) {
387 generic_make_request(bio);
388 return;
389 }
390
391 /*
392 * Complete bio with an error if earlier I/O caused changes to
393 * the metadata that can't be committed, e.g. due to I/O errors
394 * on the metadata device.
395 */
396 if (dm_thin_aborted_changes(tc->td)) {
397 bio_io_error(bio);
398 return;
399 }
400
401 /*
402 * Batch together any bios that trigger commits and then issue a
403 * single commit for them in process_deferred_bios().
404 */
405 spin_lock_irqsave(&pool->lock, flags);
406 bio_list_add(&pool->deferred_flush_bios, bio);
407 spin_unlock_irqrestore(&pool->lock, flags);
408}
409
410static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
411{
412 remap_to_origin(tc, bio);
413 issue(tc, bio);
414}
415
416static void remap_and_issue(struct thin_c *tc, struct bio *bio,
417 dm_block_t block)
418{
419 remap(tc, bio, block);
420 issue(tc, bio);
421}
422
423/*
424 * wake_worker() is used when new work is queued and when pool_resume is
425 * ready to continue deferred IO processing.
426 */
427static void wake_worker(struct pool *pool)
428{
429 queue_work(pool->wq, &pool->worker);
430}
431
432/*----------------------------------------------------------------*/
433
434/*
435 * Bio endio functions.
436 */
437struct dm_thin_new_mapping {
438 struct list_head list;
439
440 unsigned quiesced:1;
441 unsigned prepared:1;
442 unsigned pass_discard:1;
443
444 struct thin_c *tc;
445 dm_block_t virt_block;
446 dm_block_t data_block;
447 struct dm_bio_prison_cell *cell, *cell2;
448 int err;
449
450 /*
451 * If the bio covers the whole area of a block then we can avoid
452 * zeroing or copying. Instead this bio is hooked. The bio will
453 * still be in the cell, so care has to be taken to avoid issuing
454 * the bio twice.
455 */
456 struct bio *bio;
457 bio_end_io_t *saved_bi_end_io;
458};
459
460static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
461{
462 struct pool *pool = m->tc->pool;
463
464 if (m->quiesced && m->prepared) {
465 list_add(&m->list, &pool->prepared_mappings);
466 wake_worker(pool);
467 }
468}
469
470static void copy_complete(int read_err, unsigned long write_err, void *context)
471{
472 unsigned long flags;
473 struct dm_thin_new_mapping *m = context;
474 struct pool *pool = m->tc->pool;
475
476 m->err = read_err || write_err ? -EIO : 0;
477
478 spin_lock_irqsave(&pool->lock, flags);
479 m->prepared = 1;
480 __maybe_add_mapping(m);
481 spin_unlock_irqrestore(&pool->lock, flags);
482}
483
484static void overwrite_endio(struct bio *bio, int err)
485{
486 unsigned long flags;
487 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
488 struct dm_thin_new_mapping *m = h->overwrite_mapping;
489 struct pool *pool = m->tc->pool;
490
491 m->err = err;
492
493 spin_lock_irqsave(&pool->lock, flags);
494 m->prepared = 1;
495 __maybe_add_mapping(m);
496 spin_unlock_irqrestore(&pool->lock, flags);
497}
498
499/*----------------------------------------------------------------*/
500
501/*
502 * Workqueue.
503 */
504
505/*
506 * Prepared mapping jobs.
507 */
508
509/*
510 * This sends the bios in the cell back to the deferred_bios list.
511 */
512static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
513{
514 struct pool *pool = tc->pool;
515 unsigned long flags;
516
517 spin_lock_irqsave(&pool->lock, flags);
518 dm_cell_release(cell, &pool->deferred_bios);
519 spin_unlock_irqrestore(&pool->lock, flags);
520
521 wake_worker(pool);
522}
523
524/*
525 * Same as cell_defer except it omits the original holder of the cell.
526 */
527static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
528{
529 struct pool *pool = tc->pool;
530 unsigned long flags;
531
532 spin_lock_irqsave(&pool->lock, flags);
533 dm_cell_release_no_holder(cell, &pool->deferred_bios);
534 spin_unlock_irqrestore(&pool->lock, flags);
535
536 wake_worker(pool);
537}
538
539static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
540{
541 if (m->bio)
542 m->bio->bi_end_io = m->saved_bi_end_io;
543 dm_cell_error(m->cell);
544 list_del(&m->list);
545 mempool_free(m, m->tc->pool->mapping_pool);
546}
547static void process_prepared_mapping(struct dm_thin_new_mapping *m)
548{
549 struct thin_c *tc = m->tc;
550 struct bio *bio;
551 int r;
552
553 bio = m->bio;
554 if (bio)
555 bio->bi_end_io = m->saved_bi_end_io;
556
557 if (m->err) {
558 dm_cell_error(m->cell);
559 goto out;
560 }
561
562 /*
563 * Commit the prepared block into the mapping btree.
564 * Any I/O for this block arriving after this point will get
565 * remapped to it directly.
566 */
567 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
568 if (r) {
569 DMERR_LIMIT("dm_thin_insert_block() failed");
570 dm_cell_error(m->cell);
571 goto out;
572 }
573
574 /*
575 * Release any bios held while the block was being provisioned.
576 * If we are processing a write bio that completely covers the block,
577 * we already processed it so can ignore it now when processing
578 * the bios in the cell.
579 */
580 if (bio) {
581 cell_defer_no_holder(tc, m->cell);
582 bio_endio(bio, 0);
583 } else
584 cell_defer(tc, m->cell);
585
586out:
587 list_del(&m->list);
588 mempool_free(m, tc->pool->mapping_pool);
589}
590
591static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
592{
593 struct thin_c *tc = m->tc;
594
595 bio_io_error(m->bio);
596 cell_defer_no_holder(tc, m->cell);
597 cell_defer_no_holder(tc, m->cell2);
598 mempool_free(m, tc->pool->mapping_pool);
599}
600
601static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
602{
603 struct thin_c *tc = m->tc;
604
605 inc_all_io_entry(tc->pool, m->bio);
606 cell_defer_no_holder(tc, m->cell);
607 cell_defer_no_holder(tc, m->cell2);
608
609 if (m->pass_discard)
610 remap_and_issue(tc, m->bio, m->data_block);
611 else
612 bio_endio(m->bio, 0);
613
614 mempool_free(m, tc->pool->mapping_pool);
615}
616
617static void process_prepared_discard(struct dm_thin_new_mapping *m)
618{
619 int r;
620 struct thin_c *tc = m->tc;
621
622 r = dm_thin_remove_block(tc->td, m->virt_block);
623 if (r)
624 DMERR_LIMIT("dm_thin_remove_block() failed");
625
626 process_prepared_discard_passdown(m);
627}
628
629static void process_prepared(struct pool *pool, struct list_head *head,
630 process_mapping_fn *fn)
631{
632 unsigned long flags;
633 struct list_head maps;
634 struct dm_thin_new_mapping *m, *tmp;
635
636 INIT_LIST_HEAD(&maps);
637 spin_lock_irqsave(&pool->lock, flags);
638 list_splice_init(head, &maps);
639 spin_unlock_irqrestore(&pool->lock, flags);
640
641 list_for_each_entry_safe(m, tmp, &maps, list)
642 (*fn)(m);
643}
644
645/*
646 * Deferred bio jobs.
647 */
648static int io_overlaps_block(struct pool *pool, struct bio *bio)
649{
650 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
651}
652
653static int io_overwrites_block(struct pool *pool, struct bio *bio)
654{
655 return (bio_data_dir(bio) == WRITE) &&
656 io_overlaps_block(pool, bio);
657}
658
659static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
660 bio_end_io_t *fn)
661{
662 *save = bio->bi_end_io;
663 bio->bi_end_io = fn;
664}
665
666static int ensure_next_mapping(struct pool *pool)
667{
668 if (pool->next_mapping)
669 return 0;
670
671 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
672
673 return pool->next_mapping ? 0 : -ENOMEM;
674}
675
676static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
677{
678 struct dm_thin_new_mapping *r = pool->next_mapping;
679
680 BUG_ON(!pool->next_mapping);
681
682 pool->next_mapping = NULL;
683
684 return r;
685}
686
687static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
688 struct dm_dev *origin, dm_block_t data_origin,
689 dm_block_t data_dest,
690 struct dm_bio_prison_cell *cell, struct bio *bio)
691{
692 int r;
693 struct pool *pool = tc->pool;
694 struct dm_thin_new_mapping *m = get_next_mapping(pool);
695
696 INIT_LIST_HEAD(&m->list);
697 m->quiesced = 0;
698 m->prepared = 0;
699 m->tc = tc;
700 m->virt_block = virt_block;
701 m->data_block = data_dest;
702 m->cell = cell;
703 m->err = 0;
704 m->bio = NULL;
705
706 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
707 m->quiesced = 1;
708
709 /*
710 * IO to pool_dev remaps to the pool target's data_dev.
711 *
712 * If the whole block of data is being overwritten, we can issue the
713 * bio immediately. Otherwise we use kcopyd to clone the data first.
714 */
715 if (io_overwrites_block(pool, bio)) {
716 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
717
718 h->overwrite_mapping = m;
719 m->bio = bio;
720 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
721 inc_all_io_entry(pool, bio);
722 remap_and_issue(tc, bio, data_dest);
723 } else {
724 struct dm_io_region from, to;
725
726 from.bdev = origin->bdev;
727 from.sector = data_origin * pool->sectors_per_block;
728 from.count = pool->sectors_per_block;
729
730 to.bdev = tc->pool_dev->bdev;
731 to.sector = data_dest * pool->sectors_per_block;
732 to.count = pool->sectors_per_block;
733
734 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
735 0, copy_complete, m);
736 if (r < 0) {
737 mempool_free(m, pool->mapping_pool);
738 DMERR_LIMIT("dm_kcopyd_copy() failed");
739 dm_cell_error(cell);
740 }
741 }
742}
743
744static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
745 dm_block_t data_origin, dm_block_t data_dest,
746 struct dm_bio_prison_cell *cell, struct bio *bio)
747{
748 schedule_copy(tc, virt_block, tc->pool_dev,
749 data_origin, data_dest, cell, bio);
750}
751
752static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
753 dm_block_t data_dest,
754 struct dm_bio_prison_cell *cell, struct bio *bio)
755{
756 schedule_copy(tc, virt_block, tc->origin_dev,
757 virt_block, data_dest, cell, bio);
758}
759
760static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
761 dm_block_t data_block, struct dm_bio_prison_cell *cell,
762 struct bio *bio)
763{
764 struct pool *pool = tc->pool;
765 struct dm_thin_new_mapping *m = get_next_mapping(pool);
766
767 INIT_LIST_HEAD(&m->list);
768 m->quiesced = 1;
769 m->prepared = 0;
770 m->tc = tc;
771 m->virt_block = virt_block;
772 m->data_block = data_block;
773 m->cell = cell;
774 m->err = 0;
775 m->bio = NULL;
776
777 /*
778 * If the whole block of data is being overwritten or we are not
779 * zeroing pre-existing data, we can issue the bio immediately.
780 * Otherwise we use kcopyd to zero the data first.
781 */
782 if (!pool->pf.zero_new_blocks)
783 process_prepared_mapping(m);
784
785 else if (io_overwrites_block(pool, bio)) {
786 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
787
788 h->overwrite_mapping = m;
789 m->bio = bio;
790 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
791 inc_all_io_entry(pool, bio);
792 remap_and_issue(tc, bio, data_block);
793 } else {
794 int r;
795 struct dm_io_region to;
796
797 to.bdev = tc->pool_dev->bdev;
798 to.sector = data_block * pool->sectors_per_block;
799 to.count = pool->sectors_per_block;
800
801 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
802 if (r < 0) {
803 mempool_free(m, pool->mapping_pool);
804 DMERR_LIMIT("dm_kcopyd_zero() failed");
805 dm_cell_error(cell);
806 }
807 }
808}
809
810static int commit(struct pool *pool)
811{
812 int r;
813
814 r = dm_pool_commit_metadata(pool->pmd);
815 if (r)
816 DMERR_LIMIT("commit failed: error = %d", r);
817
818 return r;
819}
820
821/*
822 * A non-zero return indicates read_only or fail_io mode.
823 * Many callers don't care about the return value.
824 */
825static int commit_or_fallback(struct pool *pool)
826{
827 int r;
828
829 if (get_pool_mode(pool) != PM_WRITE)
830 return -EINVAL;
831
832 r = commit(pool);
833 if (r)
834 set_pool_mode(pool, PM_READ_ONLY);
835
836 return r;
837}
838
839static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
840{
841 int r;
842 dm_block_t free_blocks;
843 unsigned long flags;
844 struct pool *pool = tc->pool;
845
846 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
847 if (r)
848 return r;
849
850 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
851 DMWARN("%s: reached low water mark, sending event.",
852 dm_device_name(pool->pool_md));
853 spin_lock_irqsave(&pool->lock, flags);
854 pool->low_water_triggered = 1;
855 spin_unlock_irqrestore(&pool->lock, flags);
856 dm_table_event(pool->ti->table);
857 }
858
859 if (!free_blocks) {
860 if (pool->no_free_space)
861 return -ENOSPC;
862 else {
863 /*
864 * Try to commit to see if that will free up some
865 * more space.
866 */
867 (void) commit_or_fallback(pool);
868
869 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
870 if (r)
871 return r;
872
873 /*
874 * If we still have no space we set a flag to avoid
875 * doing all this checking and return -ENOSPC.
876 */
877 if (!free_blocks) {
878 DMWARN("%s: no free space available.",
879 dm_device_name(pool->pool_md));
880 spin_lock_irqsave(&pool->lock, flags);
881 pool->no_free_space = 1;
882 spin_unlock_irqrestore(&pool->lock, flags);
883 return -ENOSPC;
884 }
885 }
886 }
887
888 r = dm_pool_alloc_data_block(pool->pmd, result);
889 if (r)
890 return r;
891
892 return 0;
893}
894
895/*
896 * If we have run out of space, queue bios until the device is
897 * resumed, presumably after having been reloaded with more space.
898 */
899static void retry_on_resume(struct bio *bio)
900{
901 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
902 struct thin_c *tc = h->tc;
903 struct pool *pool = tc->pool;
904 unsigned long flags;
905
906 spin_lock_irqsave(&pool->lock, flags);
907 bio_list_add(&pool->retry_on_resume_list, bio);
908 spin_unlock_irqrestore(&pool->lock, flags);
909}
910
911static void no_space(struct dm_bio_prison_cell *cell)
912{
913 struct bio *bio;
914 struct bio_list bios;
915
916 bio_list_init(&bios);
917 dm_cell_release(cell, &bios);
918
919 while ((bio = bio_list_pop(&bios)))
920 retry_on_resume(bio);
921}
922
923static void process_discard(struct thin_c *tc, struct bio *bio)
924{
925 int r;
926 unsigned long flags;
927 struct pool *pool = tc->pool;
928 struct dm_bio_prison_cell *cell, *cell2;
929 struct dm_cell_key key, key2;
930 dm_block_t block = get_bio_block(tc, bio);
931 struct dm_thin_lookup_result lookup_result;
932 struct dm_thin_new_mapping *m;
933
934 build_virtual_key(tc->td, block, &key);
935 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
936 return;
937
938 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
939 switch (r) {
940 case 0:
941 /*
942 * Check nobody is fiddling with this pool block. This can
943 * happen if someone's in the process of breaking sharing
944 * on this block.
945 */
946 build_data_key(tc->td, lookup_result.block, &key2);
947 if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
948 cell_defer_no_holder(tc, cell);
949 break;
950 }
951
952 if (io_overlaps_block(pool, bio)) {
953 /*
954 * IO may still be going to the destination block. We must
955 * quiesce before we can do the removal.
956 */
957 m = get_next_mapping(pool);
958 m->tc = tc;
959 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
960 m->virt_block = block;
961 m->data_block = lookup_result.block;
962 m->cell = cell;
963 m->cell2 = cell2;
964 m->err = 0;
965 m->bio = bio;
966
967 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
968 spin_lock_irqsave(&pool->lock, flags);
969 list_add(&m->list, &pool->prepared_discards);
970 spin_unlock_irqrestore(&pool->lock, flags);
971 wake_worker(pool);
972 }
973 } else {
974 inc_all_io_entry(pool, bio);
975 cell_defer_no_holder(tc, cell);
976 cell_defer_no_holder(tc, cell2);
977
978 /*
979 * The DM core makes sure that the discard doesn't span
980 * a block boundary. So we submit the discard of a
981 * partial block appropriately.
982 */
983 if ((!lookup_result.shared) && pool->pf.discard_passdown)
984 remap_and_issue(tc, bio, lookup_result.block);
985 else
986 bio_endio(bio, 0);
987 }
988 break;
989
990 case -ENODATA:
991 /*
992 * It isn't provisioned, just forget it.
993 */
994 cell_defer_no_holder(tc, cell);
995 bio_endio(bio, 0);
996 break;
997
998 default:
999 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1000 __func__, r);
1001 cell_defer_no_holder(tc, cell);
1002 bio_io_error(bio);
1003 break;
1004 }
1005}
1006
1007static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1008 struct dm_cell_key *key,
1009 struct dm_thin_lookup_result *lookup_result,
1010 struct dm_bio_prison_cell *cell)
1011{
1012 int r;
1013 dm_block_t data_block;
1014
1015 r = alloc_data_block(tc, &data_block);
1016 switch (r) {
1017 case 0:
1018 schedule_internal_copy(tc, block, lookup_result->block,
1019 data_block, cell, bio);
1020 break;
1021
1022 case -ENOSPC:
1023 no_space(cell);
1024 break;
1025
1026 default:
1027 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1028 __func__, r);
1029 dm_cell_error(cell);
1030 break;
1031 }
1032}
1033
1034static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1035 dm_block_t block,
1036 struct dm_thin_lookup_result *lookup_result)
1037{
1038 struct dm_bio_prison_cell *cell;
1039 struct pool *pool = tc->pool;
1040 struct dm_cell_key key;
1041
1042 /*
1043 * If cell is already occupied, then sharing is already in the process
1044 * of being broken so we have nothing further to do here.
1045 */
1046 build_data_key(tc->td, lookup_result->block, &key);
1047 if (dm_bio_detain(pool->prison, &key, bio, &cell))
1048 return;
1049
1050 if (bio_data_dir(bio) == WRITE && bio->bi_size)
1051 break_sharing(tc, bio, block, &key, lookup_result, cell);
1052 else {
1053 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1054
1055 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1056 inc_all_io_entry(pool, bio);
1057 cell_defer_no_holder(tc, cell);
1058
1059 remap_and_issue(tc, bio, lookup_result->block);
1060 }
1061}
1062
1063static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1064 struct dm_bio_prison_cell *cell)
1065{
1066 int r;
1067 dm_block_t data_block;
1068
1069 /*
1070 * Remap empty bios (flushes) immediately, without provisioning.
1071 */
1072 if (!bio->bi_size) {
1073 inc_all_io_entry(tc->pool, bio);
1074 cell_defer_no_holder(tc, cell);
1075
1076 remap_and_issue(tc, bio, 0);
1077 return;
1078 }
1079
1080 /*
1081 * Fill read bios with zeroes and complete them immediately.
1082 */
1083 if (bio_data_dir(bio) == READ) {
1084 zero_fill_bio(bio);
1085 cell_defer_no_holder(tc, cell);
1086 bio_endio(bio, 0);
1087 return;
1088 }
1089
1090 r = alloc_data_block(tc, &data_block);
1091 switch (r) {
1092 case 0:
1093 if (tc->origin_dev)
1094 schedule_external_copy(tc, block, data_block, cell, bio);
1095 else
1096 schedule_zero(tc, block, data_block, cell, bio);
1097 break;
1098
1099 case -ENOSPC:
1100 no_space(cell);
1101 break;
1102
1103 default:
1104 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1105 __func__, r);
1106 set_pool_mode(tc->pool, PM_READ_ONLY);
1107 dm_cell_error(cell);
1108 break;
1109 }
1110}
1111
1112static void process_bio(struct thin_c *tc, struct bio *bio)
1113{
1114 int r;
1115 dm_block_t block = get_bio_block(tc, bio);
1116 struct dm_bio_prison_cell *cell;
1117 struct dm_cell_key key;
1118 struct dm_thin_lookup_result lookup_result;
1119
1120 /*
1121 * If cell is already occupied, then the block is already
1122 * being provisioned so we have nothing further to do here.
1123 */
1124 build_virtual_key(tc->td, block, &key);
1125 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1126 return;
1127
1128 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1129 switch (r) {
1130 case 0:
1131 if (lookup_result.shared) {
1132 process_shared_bio(tc, bio, block, &lookup_result);
1133 cell_defer_no_holder(tc, cell);
1134 } else {
1135 inc_all_io_entry(tc->pool, bio);
1136 cell_defer_no_holder(tc, cell);
1137
1138 remap_and_issue(tc, bio, lookup_result.block);
1139 }
1140 break;
1141
1142 case -ENODATA:
1143 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1144 inc_all_io_entry(tc->pool, bio);
1145 cell_defer_no_holder(tc, cell);
1146
1147 remap_to_origin_and_issue(tc, bio);
1148 } else
1149 provision_block(tc, bio, block, cell);
1150 break;
1151
1152 default:
1153 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1154 __func__, r);
1155 cell_defer_no_holder(tc, cell);
1156 bio_io_error(bio);
1157 break;
1158 }
1159}
1160
1161static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1162{
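	/*
	 * In read-only mode reads can still be serviced (from mapped blocks,
	 * the external origin, or by zero-filling), but any write that would
	 * need to provision a block or break sharing must be failed.
	 */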
1163 int r;
1164 int rw = bio_data_dir(bio);
1165 dm_block_t block = get_bio_block(tc, bio);
1166 struct dm_thin_lookup_result lookup_result;
1167
1168 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1169 switch (r) {
1170 case 0:
1171 if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1172 bio_io_error(bio);
1173 else {
1174 inc_all_io_entry(tc->pool, bio);
1175 remap_and_issue(tc, bio, lookup_result.block);
1176 }
1177 break;
1178
1179 case -ENODATA:
1180 if (rw != READ) {
1181 bio_io_error(bio);
1182 break;
1183 }
1184
1185 if (tc->origin_dev) {
1186 inc_all_io_entry(tc->pool, bio);
1187 remap_to_origin_and_issue(tc, bio);
1188 break;
1189 }
1190
1191 zero_fill_bio(bio);
1192 bio_endio(bio, 0);
1193 break;
1194
1195 default:
1196 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1197 __func__, r);
1198 bio_io_error(bio);
1199 break;
1200 }
1201}
1202
1203static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1204{
1205 bio_io_error(bio);
1206}
1207
1208static int need_commit_due_to_time(struct pool *pool)
1209{
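	/*
	 * The first comparison catches jiffies wrapping past
	 * last_commit_jiffies; the second fires once COMMIT_PERIOD has
	 * elapsed since the last commit.
	 */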
1210 return jiffies < pool->last_commit_jiffies ||
1211 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1212}
1213
1214static void process_deferred_bios(struct pool *pool)
1215{
1216 unsigned long flags;
1217 struct bio *bio;
1218 struct bio_list bios;
1219
1220 bio_list_init(&bios);
1221
1222 spin_lock_irqsave(&pool->lock, flags);
1223 bio_list_merge(&bios, &pool->deferred_bios);
1224 bio_list_init(&pool->deferred_bios);
1225 spin_unlock_irqrestore(&pool->lock, flags);
1226
1227 while ((bio = bio_list_pop(&bios))) {
1228 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1229 struct thin_c *tc = h->tc;
1230
1231 /*
1232 * If we've got no free new_mapping structs, and processing
1233 * this bio might require one, we pause until there are some
1234 * prepared mappings to process.
1235 */
1236 if (ensure_next_mapping(pool)) {
1237 spin_lock_irqsave(&pool->lock, flags);
1238 bio_list_merge(&pool->deferred_bios, &bios);
1239 spin_unlock_irqrestore(&pool->lock, flags);
1240
1241 break;
1242 }
1243
1244 if (bio->bi_rw & REQ_DISCARD)
1245 pool->process_discard(tc, bio);
1246 else
1247 pool->process_bio(tc, bio);
1248 }
1249
1250 /*
1251 * If there are any deferred flush bios, we must commit
1252 * the metadata before issuing them.
1253 */
1254 bio_list_init(&bios);
1255 spin_lock_irqsave(&pool->lock, flags);
1256 bio_list_merge(&bios, &pool->deferred_flush_bios);
1257 bio_list_init(&pool->deferred_flush_bios);
1258 spin_unlock_irqrestore(&pool->lock, flags);
1259
1260 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1261 return;
1262
1263 if (commit_or_fallback(pool)) {
1264 while ((bio = bio_list_pop(&bios)))
1265 bio_io_error(bio);
1266 return;
1267 }
1268 pool->last_commit_jiffies = jiffies;
1269
1270 while ((bio = bio_list_pop(&bios)))
1271 generic_make_request(bio);
1272}
1273
1274static void do_worker(struct work_struct *ws)
1275{
1276 struct pool *pool = container_of(ws, struct pool, worker);
1277
1278 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1279 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1280 process_deferred_bios(pool);
1281}
1282
1283/*
1284 * We want to commit periodically so that not too much
1285 * unwritten data builds up.
1286 */
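/*
 * The waker only kicks the worker; the actual periodic commit is issued from
 * process_deferred_bios() once need_commit_due_to_time() fires.
 */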
1287static void do_waker(struct work_struct *ws)
1288{
1289 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1290 wake_worker(pool);
1291 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1292}
1293
1294/*----------------------------------------------------------------*/
1295
1296static enum pool_mode get_pool_mode(struct pool *pool)
1297{
1298 return pool->pf.mode;
1299}
1300
1301static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1302{
1303 int r;
1304
1305 pool->pf.mode = mode;
1306
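	/*
	 * The modes are ordered by increasing severity (write, read-only,
	 * fail); bind_control_target() relies on this ordering so that a
	 * degraded pool is never upgraded again by a table reload.
	 */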
1307 switch (mode) {
1308 case PM_FAIL:
1309 DMERR("switching pool to failure mode");
1310 pool->process_bio = process_bio_fail;
1311 pool->process_discard = process_bio_fail;
1312 pool->process_prepared_mapping = process_prepared_mapping_fail;
1313 pool->process_prepared_discard = process_prepared_discard_fail;
1314 break;
1315
1316 case PM_READ_ONLY:
1317 DMERR("switching pool to read-only mode");
1318 r = dm_pool_abort_metadata(pool->pmd);
1319 if (r) {
1320 DMERR("aborting transaction failed");
1321 set_pool_mode(pool, PM_FAIL);
1322 } else {
1323 dm_pool_metadata_read_only(pool->pmd);
1324 pool->process_bio = process_bio_read_only;
1325 pool->process_discard = process_discard;
1326 pool->process_prepared_mapping = process_prepared_mapping_fail;
1327 pool->process_prepared_discard = process_prepared_discard_passdown;
1328 }
1329 break;
1330
1331 case PM_WRITE:
1332 pool->process_bio = process_bio;
1333 pool->process_discard = process_discard;
1334 pool->process_prepared_mapping = process_prepared_mapping;
1335 pool->process_prepared_discard = process_prepared_discard;
1336 break;
1337 }
1338}
1339
1340/*----------------------------------------------------------------*/
1341
1342/*
1343 * Mapping functions.
1344 */
1345
1346/*
1347 * Called only while mapping a thin bio to hand it over to the workqueue.
1348 */
1349static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1350{
1351 unsigned long flags;
1352 struct pool *pool = tc->pool;
1353
1354 spin_lock_irqsave(&pool->lock, flags);
1355 bio_list_add(&pool->deferred_bios, bio);
1356 spin_unlock_irqrestore(&pool->lock, flags);
1357
1358 wake_worker(pool);
1359}
1360
1361static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1362{
1363 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1364
1365 h->tc = tc;
1366 h->shared_read_entry = NULL;
1367 h->all_io_entry = NULL;
1368 h->overwrite_mapping = NULL;
1369}
1370
1371/*
1372 * Non-blocking function called from the thin target's map function.
1373 */
1374static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1375{
1376 int r;
1377 struct thin_c *tc = ti->private;
1378 dm_block_t block = get_bio_block(tc, bio);
1379 struct dm_thin_device *td = tc->td;
1380 struct dm_thin_lookup_result result;
1381 struct dm_bio_prison_cell *cell1, *cell2;
1382 struct dm_cell_key key;
1383
1384 thin_hook_bio(tc, bio);
1385
1386 if (get_pool_mode(tc->pool) == PM_FAIL) {
1387 bio_io_error(bio);
1388 return DM_MAPIO_SUBMITTED;
1389 }
1390
1391 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1392 thin_defer_bio(tc, bio);
1393 return DM_MAPIO_SUBMITTED;
1394 }
1395
1396 r = dm_thin_find_block(td, block, 0, &result);
1397
1398 /*
1399 * Note that we defer readahead too.
1400 */
1401 switch (r) {
1402 case 0:
1403 if (unlikely(result.shared)) {
1404 /*
1405 * We have a race condition here between the
1406 * result.shared value returned by the lookup and
1407 * snapshot creation, which may cause new
1408 * sharing.
1409 *
1410 * To avoid this always quiesce the origin before
1411 * taking the snap. You want to do this anyway to
1412 * ensure a consistent application view
1413 * (i.e. lockfs).
1414 *
1415 * More distant ancestors are irrelevant. The
1416 * shared flag will be set in their case.
1417 */
1418 thin_defer_bio(tc, bio);
1419 return DM_MAPIO_SUBMITTED;
1420 }
1421
1422 build_virtual_key(tc->td, block, &key);
1423 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
1424 return DM_MAPIO_SUBMITTED;
1425
1426 build_data_key(tc->td, result.block, &key);
1427 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
1428 cell_defer_no_holder(tc, cell1);
1429 return DM_MAPIO_SUBMITTED;
1430 }
1431
1432 inc_all_io_entry(tc->pool, bio);
1433 cell_defer_no_holder(tc, cell2);
1434 cell_defer_no_holder(tc, cell1);
1435
1436 remap(tc, bio, result.block);
1437 return DM_MAPIO_REMAPPED;
1438
1439 case -ENODATA:
1440 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1441 /*
1442 * This block isn't provisioned, and we have no way
1443 * of doing so. Just error it.
1444 */
1445 bio_io_error(bio);
1446 return DM_MAPIO_SUBMITTED;
1447 }
1448 /* fall through */
1449
1450 case -EWOULDBLOCK:
1451 /*
1452 * In future, the failed dm_thin_find_block above could
1453 * provide the hint to load the metadata into cache.
1454 */
1455 thin_defer_bio(tc, bio);
1456 return DM_MAPIO_SUBMITTED;
1457
1458 default:
1459 /*
1460 * Must always call bio_io_error on failure.
1461 * dm_thin_find_block can fail with -EINVAL if the
1462 * pool is switched to fail-io mode.
1463 */
1464 bio_io_error(bio);
1465 return DM_MAPIO_SUBMITTED;
1466 }
1467}
1468
1469static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1470{
1471 int r;
1472 unsigned long flags;
1473 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1474
1475 spin_lock_irqsave(&pt->pool->lock, flags);
1476 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1477 spin_unlock_irqrestore(&pt->pool->lock, flags);
1478
1479 if (!r) {
1480 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1481 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1482 }
1483
1484 return r;
1485}
1486
1487static void __requeue_bios(struct pool *pool)
1488{
1489 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1490 bio_list_init(&pool->retry_on_resume_list);
1491}
1492
1493/*----------------------------------------------------------------
1494 * Binding of control targets to a pool object
1495 *--------------------------------------------------------------*/
1496static bool data_dev_supports_discard(struct pool_c *pt)
1497{
1498 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1499
1500 return q && blk_queue_discard(q);
1501}
1502
1503/*
1504 * If discard_passdown was enabled verify that the data device
1505 * supports discards. Disable discard_passdown if not.
1506 */
1507static void disable_passdown_if_not_supported(struct pool_c *pt)
1508{
1509 struct pool *pool = pt->pool;
1510 struct block_device *data_bdev = pt->data_dev->bdev;
1511 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1512 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1513 const char *reason = NULL;
1514 char buf[BDEVNAME_SIZE];
1515
1516 if (!pt->adjusted_pf.discard_passdown)
1517 return;
1518
1519 if (!data_dev_supports_discard(pt))
1520 reason = "discard unsupported";
1521
1522 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1523 reason = "max discard sectors smaller than a block";
1524
1525 else if (data_limits->discard_granularity > block_size)
1526 reason = "discard granularity larger than a block";
1527
1528 else if (block_size & (data_limits->discard_granularity - 1))
1529 reason = "discard granularity not a factor of block size";
1530
1531 if (reason) {
1532 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1533 pt->adjusted_pf.discard_passdown = false;
1534 }
1535}
1536
1537static int bind_control_target(struct pool *pool, struct dm_target *ti)
1538{
1539 struct pool_c *pt = ti->private;
1540
1541 /*
1542 * We want to make sure that degraded pools are never upgraded.
1543 */
1544 enum pool_mode old_mode = pool->pf.mode;
1545 enum pool_mode new_mode = pt->adjusted_pf.mode;
1546
1547 if (old_mode > new_mode)
1548 new_mode = old_mode;
1549
1550 pool->ti = ti;
1551 pool->low_water_blocks = pt->low_water_blocks;
1552 pool->pf = pt->adjusted_pf;
1553
1554 set_pool_mode(pool, new_mode);
1555
1556 return 0;
1557}
1558
1559static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1560{
1561 if (pool->ti == ti)
1562 pool->ti = NULL;
1563}
1564
1565/*----------------------------------------------------------------
1566 * Pool creation
1567 *--------------------------------------------------------------*/
1568/* Initialize pool features. */
1569static void pool_features_init(struct pool_features *pf)
1570{
1571 pf->mode = PM_WRITE;
1572 pf->zero_new_blocks = true;
1573 pf->discard_enabled = true;
1574 pf->discard_passdown = true;
1575}
1576
1577static void __pool_destroy(struct pool *pool)
1578{
1579 __pool_table_remove(pool);
1580
1581 if (dm_pool_metadata_close(pool->pmd) < 0)
1582 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1583
1584 dm_bio_prison_destroy(pool->prison);
1585 dm_kcopyd_client_destroy(pool->copier);
1586
1587 if (pool->wq)
1588 destroy_workqueue(pool->wq);
1589
1590 if (pool->next_mapping)
1591 mempool_free(pool->next_mapping, pool->mapping_pool);
1592 mempool_destroy(pool->mapping_pool);
1593 dm_deferred_set_destroy(pool->shared_read_ds);
1594 dm_deferred_set_destroy(pool->all_io_ds);
1595 kfree(pool);
1596}
1597
1598static struct kmem_cache *_new_mapping_cache;
1599
1600static struct pool *pool_create(struct mapped_device *pool_md,
1601 struct block_device *metadata_dev,
1602 unsigned long block_size,
1603 int read_only, char **error)
1604{
1605 int r;
1606 void *err_p;
1607 struct pool *pool;
1608 struct dm_pool_metadata *pmd;
1609 bool format_device = read_only ? false : true;
1610
1611 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1612 if (IS_ERR(pmd)) {
1613 *error = "Error creating metadata object";
1614 return (struct pool *)pmd;
1615 }
1616
1617 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1618 if (!pool) {
1619 *error = "Error allocating memory for pool";
1620 err_p = ERR_PTR(-ENOMEM);
1621 goto bad_pool;
1622 }
1623
1624 pool->pmd = pmd;
1625 pool->sectors_per_block = block_size;
1626 if (block_size & (block_size - 1))
1627 pool->sectors_per_block_shift = -1;
1628 else
1629 pool->sectors_per_block_shift = __ffs(block_size);
1630 pool->low_water_blocks = 0;
1631 pool_features_init(&pool->pf);
1632 pool->prison = dm_bio_prison_create(PRISON_CELLS);
1633 if (!pool->prison) {
1634 *error = "Error creating pool's bio prison";
1635 err_p = ERR_PTR(-ENOMEM);
1636 goto bad_prison;
1637 }
1638
1639 pool->copier = dm_kcopyd_client_create();
1640 if (IS_ERR(pool->copier)) {
1641 r = PTR_ERR(pool->copier);
1642 *error = "Error creating pool's kcopyd client";
1643 err_p = ERR_PTR(r);
1644 goto bad_kcopyd_client;
1645 }
1646
1647 /*
1648 * Create a single-threaded workqueue that will service all devices
1649 * that use this metadata.
1650 */
1651 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1652 if (!pool->wq) {
1653 *error = "Error creating pool's workqueue";
1654 err_p = ERR_PTR(-ENOMEM);
1655 goto bad_wq;
1656 }
1657
1658 INIT_WORK(&pool->worker, do_worker);
1659 INIT_DELAYED_WORK(&pool->waker, do_waker);
1660 spin_lock_init(&pool->lock);
1661 bio_list_init(&pool->deferred_bios);
1662 bio_list_init(&pool->deferred_flush_bios);
1663 INIT_LIST_HEAD(&pool->prepared_mappings);
1664 INIT_LIST_HEAD(&pool->prepared_discards);
1665 pool->low_water_triggered = 0;
1666 pool->no_free_space = 0;
1667 bio_list_init(&pool->retry_on_resume_list);
1668
1669 pool->shared_read_ds = dm_deferred_set_create();
1670 if (!pool->shared_read_ds) {
1671 *error = "Error creating pool's shared read deferred set";
1672 err_p = ERR_PTR(-ENOMEM);
1673 goto bad_shared_read_ds;
1674 }
1675
1676 pool->all_io_ds = dm_deferred_set_create();
1677 if (!pool->all_io_ds) {
1678 *error = "Error creating pool's all io deferred set";
1679 err_p = ERR_PTR(-ENOMEM);
1680 goto bad_all_io_ds;
1681 }
1682
1683 pool->next_mapping = NULL;
1684 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1685 _new_mapping_cache);
1686 if (!pool->mapping_pool) {
1687 *error = "Error creating pool's mapping mempool";
1688 err_p = ERR_PTR(-ENOMEM);
1689 goto bad_mapping_pool;
1690 }
1691
1692 pool->ref_count = 1;
1693 pool->last_commit_jiffies = jiffies;
1694 pool->pool_md = pool_md;
1695 pool->md_dev = metadata_dev;
1696 __pool_table_insert(pool);
1697
1698 return pool;
1699
1700bad_mapping_pool:
1701 dm_deferred_set_destroy(pool->all_io_ds);
1702bad_all_io_ds:
1703 dm_deferred_set_destroy(pool->shared_read_ds);
1704bad_shared_read_ds:
1705 destroy_workqueue(pool->wq);
1706bad_wq:
1707 dm_kcopyd_client_destroy(pool->copier);
1708bad_kcopyd_client:
1709 dm_bio_prison_destroy(pool->prison);
1710bad_prison:
1711 kfree(pool);
1712bad_pool:
1713 if (dm_pool_metadata_close(pmd))
1714 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1715
1716 return err_p;
1717}
1718
1719static void __pool_inc(struct pool *pool)
1720{
1721 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1722 pool->ref_count++;
1723}
1724
1725static void __pool_dec(struct pool *pool)
1726{
1727 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1728 BUG_ON(!pool->ref_count);
1729 if (!--pool->ref_count)
1730 __pool_destroy(pool);
1731}
1732
1733static struct pool *__pool_find(struct mapped_device *pool_md,
1734 struct block_device *metadata_dev,
1735 unsigned long block_size, int read_only,
1736 char **error, int *created)
1737{
1738 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1739
1740 if (pool) {
1741 if (pool->pool_md != pool_md) {
1742 *error = "metadata device already in use by a pool";
1743 return ERR_PTR(-EBUSY);
1744 }
1745 __pool_inc(pool);
1746
1747 } else {
1748 pool = __pool_table_lookup(pool_md);
1749 if (pool) {
1750 if (pool->md_dev != metadata_dev) {
1751 *error = "different pool cannot replace a pool";
1752 return ERR_PTR(-EINVAL);
1753 }
1754 __pool_inc(pool);
1755
1756 } else {
1757 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1758 *created = 1;
1759 }
1760 }
1761
1762 return pool;
1763}
1764
1765/*----------------------------------------------------------------
1766 * Pool target methods
1767 *--------------------------------------------------------------*/
1768static void pool_dtr(struct dm_target *ti)
1769{
1770 struct pool_c *pt = ti->private;
1771
1772 mutex_lock(&dm_thin_pool_table.mutex);
1773
1774 unbind_control_target(pt->pool, ti);
1775 __pool_dec(pt->pool);
1776 dm_put_device(ti, pt->metadata_dev);
1777 dm_put_device(ti, pt->data_dev);
1778 kfree(pt);
1779
1780 mutex_unlock(&dm_thin_pool_table.mutex);
1781}
1782
1783static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1784 struct dm_target *ti)
1785{
1786 int r;
1787 unsigned argc;
1788 const char *arg_name;
1789
1790 static struct dm_arg _args[] = {
1791 {0, 3, "Invalid number of pool feature arguments"},
1792 };
1793
1794 /*
1795 * No feature arguments supplied.
1796 */
1797 if (!as->argc)
1798 return 0;
1799
1800 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1801 if (r)
1802 return -EINVAL;
1803
1804 while (argc && !r) {
1805 arg_name = dm_shift_arg(as);
1806 argc--;
1807
1808 if (!strcasecmp(arg_name, "skip_block_zeroing"))
1809 pf->zero_new_blocks = false;
1810
1811 else if (!strcasecmp(arg_name, "ignore_discard"))
1812 pf->discard_enabled = false;
1813
1814 else if (!strcasecmp(arg_name, "no_discard_passdown"))
1815 pf->discard_passdown = false;
1816
1817 else if (!strcasecmp(arg_name, "read_only"))
1818 pf->mode = PM_READ_ONLY;
1819
1820 else {
1821 ti->error = "Unrecognised pool feature requested";
1822 r = -EINVAL;
1823 break;
1824 }
1825 }
1826
1827 return r;
1828}
1829
1830/*
1831 * thin-pool <metadata dev> <data dev>
1832 * <data block size (sectors)>
1833 * <low water mark (blocks)>
1834 * [<#feature args> [<arg>]*]
1835 *
1836 * Optional feature arguments are:
1837 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1838 * ignore_discard: disable discard
1839 * no_discard_passdown: don't pass discards down to the data device
1840 */
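/*
 * For example (device names and sizes below are purely illustrative), a pool
 * using 128-sector (64KiB) blocks, a low water mark of 32768 blocks and block
 * zeroing disabled could be loaded with a table line such as:
 *
 *   0 4194304 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing
 */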
1841static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1842{
1843 int r, pool_created = 0;
1844 struct pool_c *pt;
1845 struct pool *pool;
1846 struct pool_features pf;
1847 struct dm_arg_set as;
1848 struct dm_dev *data_dev;
1849 unsigned long block_size;
1850 dm_block_t low_water_blocks;
1851 struct dm_dev *metadata_dev;
1852 sector_t metadata_dev_size;
1853 char b[BDEVNAME_SIZE];
1854
1855 /*
1856 * FIXME Remove validation from scope of lock.
1857 */
1858 mutex_lock(&dm_thin_pool_table.mutex);
1859
1860 if (argc < 4) {
1861 ti->error = "Invalid argument count";
1862 r = -EINVAL;
1863 goto out_unlock;
1864 }
1865 as.argc = argc;
1866 as.argv = argv;
1867
1868 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1869 if (r) {
1870 ti->error = "Error opening metadata block device";
1871 goto out_unlock;
1872 }
1873
1874 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1875 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1876 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1877 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1878
1879 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1880 if (r) {
1881 ti->error = "Error getting data device";
1882 goto out_metadata;
1883 }
1884
1885 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1886 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1887 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1888 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1889 ti->error = "Invalid block size";
1890 r = -EINVAL;
1891 goto out;
1892 }
1893
1894 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1895 ti->error = "Invalid low water mark";
1896 r = -EINVAL;
1897 goto out;
1898 }
1899
1900 /*
1901 * Set default pool features.
1902 */
1903 pool_features_init(&pf);
1904
1905 dm_consume_args(&as, 4);
1906 r = parse_pool_features(&as, &pf, ti);
1907 if (r)
1908 goto out;
1909
1910 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1911 if (!pt) {
1912 r = -ENOMEM;
1913 goto out;
1914 }
1915
1916 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1917 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1918 if (IS_ERR(pool)) {
1919 r = PTR_ERR(pool);
1920 goto out_free_pt;
1921 }
1922
1923 /*
1924 * 'pool_created' reflects whether this is the first table load.
1925 * Top level discard support is not allowed to be changed after
1926 * initial load. This would require a pool reload to trigger thin
1927 * device changes.
1928 */
1929 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1930 ti->error = "Discard support cannot be disabled once enabled";
1931 r = -EINVAL;
1932 goto out_flags_changed;
1933 }
1934
1935 pt->pool = pool;
1936 pt->ti = ti;
1937 pt->metadata_dev = metadata_dev;
1938 pt->data_dev = data_dev;
1939 pt->low_water_blocks = low_water_blocks;
1940 pt->adjusted_pf = pt->requested_pf = pf;
1941 ti->num_flush_requests = 1;
1942
1943 /*
1944 * Only need to enable discards if the pool should pass
1945 * them down to the data device. The thin device's discard
1946 * processing will cause mappings to be removed from the btree.
1947 */
1948 if (pf.discard_enabled && pf.discard_passdown) {
1949 ti->num_discard_requests = 1;
1950
1951 /*
1952 * Setting 'discards_supported' circumvents the normal
1953 * stacking of discard limits (this keeps the pool and
1954 * thin devices' discard limits consistent).
1955 */
1956 ti->discards_supported = true;
1957 ti->discard_zeroes_data_unsupported = true;
1958 }
1959 ti->private = pt;
1960
1961 pt->callbacks.congested_fn = pool_is_congested;
1962 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1963
1964 mutex_unlock(&dm_thin_pool_table.mutex);
1965
1966 return 0;
1967
1968out_flags_changed:
1969 __pool_dec(pool);
1970out_free_pt:
1971 kfree(pt);
1972out:
1973 dm_put_device(ti, data_dev);
1974out_metadata:
1975 dm_put_device(ti, metadata_dev);
1976out_unlock:
1977 mutex_unlock(&dm_thin_pool_table.mutex);
1978
1979 return r;
1980}
1981
1982static int pool_map(struct dm_target *ti, struct bio *bio)
1983{
1984 int r;
1985 struct pool_c *pt = ti->private;
1986 struct pool *pool = pt->pool;
1987 unsigned long flags;
1988
1989 /*
1990 * As this is a singleton target, ti->begin is always zero.
1991 */
1992 spin_lock_irqsave(&pool->lock, flags);
1993 bio->bi_bdev = pt->data_dev->bdev;
1994 r = DM_MAPIO_REMAPPED;
1995 spin_unlock_irqrestore(&pool->lock, flags);
1996
1997 return r;
1998}
1999
2000/*
2001 * Retrieves the number of blocks of the data device from
2002 * the superblock and compares it to the actual device size,
2003 * thus resizing the data device in case it has grown.
2004 *
2005 * This copes both with opening preallocated data devices in the ctr,
2006 * followed by a resume,
2007 * -and-
2008 * with calling the resume method on its own after userspace has
2009 * grown the data device in reaction to a table event.
2010 */
2011static int pool_preresume(struct dm_target *ti)
2012{
2013 int r;
2014 struct pool_c *pt = ti->private;
2015 struct pool *pool = pt->pool;
2016 sector_t data_size = ti->len;
2017 dm_block_t sb_data_size;
2018
2019 /*
2020 * Take control of the pool object.
2021 */
2022 r = bind_control_target(pool, ti);
2023 if (r)
2024 return r;
2025
2026 (void) sector_div(data_size, pool->sectors_per_block);
2027
2028 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2029 if (r) {
2030 DMERR("failed to retrieve data device size");
2031 return r;
2032 }
2033
2034 if (data_size < sb_data_size) {
2035 DMERR("pool target too small, is %llu blocks (expected %llu)",
2036 (unsigned long long)data_size, sb_data_size);
2037 return -EINVAL;
2038
2039 } else if (data_size > sb_data_size) {
2040 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2041 if (r) {
2042 DMERR("failed to resize data device");
2043 /* FIXME Stricter than necessary: Rollback transaction instead here */
2044 set_pool_mode(pool, PM_READ_ONLY);
2045 return r;
2046 }
2047
2048 (void) commit_or_fallback(pool);
2049 }
2050
2051 return 0;
2052}
2053
2054static void pool_resume(struct dm_target *ti)
2055{
2056 struct pool_c *pt = ti->private;
2057 struct pool *pool = pt->pool;
2058 unsigned long flags;
2059
2060 spin_lock_irqsave(&pool->lock, flags);
2061 pool->low_water_triggered = 0;
2062 pool->no_free_space = 0;
2063 __requeue_bios(pool);
2064 spin_unlock_irqrestore(&pool->lock, flags);
2065
2066 do_waker(&pool->waker.work);
2067}
2068
2069static void pool_postsuspend(struct dm_target *ti)
2070{
2071 struct pool_c *pt = ti->private;
2072 struct pool *pool = pt->pool;
2073
2074 cancel_delayed_work(&pool->waker);
2075 flush_workqueue(pool->wq);
2076 (void) commit_or_fallback(pool);
2077}
2078
2079static int check_arg_count(unsigned argc, unsigned args_required)
2080{
2081 if (argc != args_required) {
2082 DMWARN("Message received with %u arguments instead of %u.",
2083 argc, args_required);
2084 return -EINVAL;
2085 }
2086
2087 return 0;
2088}
2089
2090static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2091{
2092 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2093 *dev_id <= MAX_DEV_ID)
2094 return 0;
2095
2096 if (warning)
2097 DMWARN("Message received with invalid device id: %s", arg);
2098
2099 return -EINVAL;
2100}
2101
2102static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2103{
2104 dm_thin_id dev_id;
2105 int r;
2106
2107 r = check_arg_count(argc, 2);
2108 if (r)
2109 return r;
2110
2111 r = read_dev_id(argv[1], &dev_id, 1);
2112 if (r)
2113 return r;
2114
2115 r = dm_pool_create_thin(pool->pmd, dev_id);
2116 if (r) {
2117 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2118 argv[1]);
2119 return r;
2120 }
2121
2122 return 0;
2123}
2124
2125static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2126{
2127 dm_thin_id dev_id;
2128 dm_thin_id origin_dev_id;
2129 int r;
2130
2131 r = check_arg_count(argc, 3);
2132 if (r)
2133 return r;
2134
2135 r = read_dev_id(argv[1], &dev_id, 1);
2136 if (r)
2137 return r;
2138
2139 r = read_dev_id(argv[2], &origin_dev_id, 1);
2140 if (r)
2141 return r;
2142
2143 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2144 if (r) {
2145 DMWARN("Creation of new snapshot %s of device %s failed.",
2146 argv[1], argv[2]);
2147 return r;
2148 }
2149
2150 return 0;
2151}
2152
2153static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2154{
2155 dm_thin_id dev_id;
2156 int r;
2157
2158 r = check_arg_count(argc, 2);
2159 if (r)
2160 return r;
2161
2162 r = read_dev_id(argv[1], &dev_id, 1);
2163 if (r)
2164 return r;
2165
2166 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2167 if (r)
2168 DMWARN("Deletion of thin device %s failed.", argv[1]);
2169
2170 return r;
2171}
2172
2173static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2174{
2175 dm_thin_id old_id, new_id;
2176 int r;
2177
2178 r = check_arg_count(argc, 3);
2179 if (r)
2180 return r;
2181
2182 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2183 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2184 return -EINVAL;
2185 }
2186
2187 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2188 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2189 return -EINVAL;
2190 }
2191
2192 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2193 if (r) {
2194 DMWARN("Failed to change transaction id from %s to %s.",
2195 argv[1], argv[2]);
2196 return r;
2197 }
2198
2199 return 0;
2200}
2201
2202static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2203{
2204 int r;
2205
2206 r = check_arg_count(argc, 1);
2207 if (r)
2208 return r;
2209
2210 (void) commit_or_fallback(pool);
2211
2212 r = dm_pool_reserve_metadata_snap(pool->pmd);
2213 if (r)
2214 DMWARN("reserve_metadata_snap message failed.");
2215
2216 return r;
2217}
2218
2219static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2220{
2221 int r;
2222
2223 r = check_arg_count(argc, 1);
2224 if (r)
2225 return r;
2226
2227 r = dm_pool_release_metadata_snap(pool->pmd);
2228 if (r)
2229 DMWARN("release_metadata_snap message failed.");
2230
2231 return r;
2232}
2233
2234/*
2235 * Messages supported:
2236 * create_thin <dev_id>
2237 * create_snap <dev_id> <origin_id>
2238 * delete <dev_id>
2240 * set_transaction_id <current_trans_id> <new_trans_id>
2241 * reserve_metadata_snap
2242 * release_metadata_snap
2243 */
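/*
 * Messages are sent with "dmsetup message" against the pool device, for
 * example (pool name and device ids are illustrative only):
 *
 *   dmsetup message /dev/mapper/my_pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/my_pool 0 "create_snap 1 0"
 */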
2244static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2245{
2246 int r = -EINVAL;
2247 struct pool_c *pt = ti->private;
2248 struct pool *pool = pt->pool;
2249
2250 if (!strcasecmp(argv[0], "create_thin"))
2251 r = process_create_thin_mesg(argc, argv, pool);
2252
2253 else if (!strcasecmp(argv[0], "create_snap"))
2254 r = process_create_snap_mesg(argc, argv, pool);
2255
2256 else if (!strcasecmp(argv[0], "delete"))
2257 r = process_delete_mesg(argc, argv, pool);
2258
2259 else if (!strcasecmp(argv[0], "set_transaction_id"))
2260 r = process_set_transaction_id_mesg(argc, argv, pool);
2261
2262 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2263 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2264
2265 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2266 r = process_release_metadata_snap_mesg(argc, argv, pool);
2267
2268 else
2269 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2270
2271 if (!r)
2272 (void) commit_or_fallback(pool);
2273
2274 return r;
2275}
2276
2277static void emit_flags(struct pool_features *pf, char *result,
2278 unsigned sz, unsigned maxlen)
2279{
2280 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2281 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2282 DMEMIT("%u ", count);
2283
2284 if (!pf->zero_new_blocks)
2285 DMEMIT("skip_block_zeroing ");
2286
2287 if (!pf->discard_enabled)
2288 DMEMIT("ignore_discard ");
2289
2290 if (!pf->discard_passdown)
2291 DMEMIT("no_discard_passdown ");
2292
2293 if (pf->mode == PM_READ_ONLY)
2294 DMEMIT("read_only ");
2295}
2296
2297/*
2298 * Status line is:
2299 * <transaction id> <used metadata blocks>/<total metadata blocks>
2300 * <used data blocks>/<total data blocks> <held metadata root> <ro|rw> <discard config>
2301 */
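/*
 * e.g. (all numbers illustrative only):
 *
 *   0 141/4096 234/32768 - rw discard_passdown
 */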
2302static int pool_status(struct dm_target *ti, status_type_t type,
2303 unsigned status_flags, char *result, unsigned maxlen)
2304{
2305 int r;
2306 unsigned sz = 0;
2307 uint64_t transaction_id;
2308 dm_block_t nr_free_blocks_data;
2309 dm_block_t nr_free_blocks_metadata;
2310 dm_block_t nr_blocks_data;
2311 dm_block_t nr_blocks_metadata;
2312 dm_block_t held_root;
2313 char buf[BDEVNAME_SIZE];
2314 char buf2[BDEVNAME_SIZE];
2315 struct pool_c *pt = ti->private;
2316 struct pool *pool = pt->pool;
2317
2318 switch (type) {
2319 case STATUSTYPE_INFO:
2320 if (get_pool_mode(pool) == PM_FAIL) {
2321 DMEMIT("Fail");
2322 break;
2323 }
2324
2325 /* Commit to ensure statistics aren't out-of-date */
2326 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2327 (void) commit_or_fallback(pool);
2328
2329 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2330 &transaction_id);
2331 if (r)
2332 return r;
2333
2334 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2335 &nr_free_blocks_metadata);
2336 if (r)
2337 return r;
2338
2339 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2340 if (r)
2341 return r;
2342
2343 r = dm_pool_get_free_block_count(pool->pmd,
2344 &nr_free_blocks_data);
2345 if (r)
2346 return r;
2347
2348 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2349 if (r)
2350 return r;
2351
2352 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2353 if (r)
2354 return r;
2355
2356 DMEMIT("%llu %llu/%llu %llu/%llu ",
2357 (unsigned long long)transaction_id,
2358 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2359 (unsigned long long)nr_blocks_metadata,
2360 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2361 (unsigned long long)nr_blocks_data);
2362
2363 if (held_root)
2364 DMEMIT("%llu ", held_root);
2365 else
2366 DMEMIT("- ");
2367
2368 if (pool->pf.mode == PM_READ_ONLY)
2369 DMEMIT("ro ");
2370 else
2371 DMEMIT("rw ");
2372
2373 if (!pool->pf.discard_enabled)
2374 DMEMIT("ignore_discard");
2375 else if (pool->pf.discard_passdown)
2376 DMEMIT("discard_passdown");
2377 else
2378 DMEMIT("no_discard_passdown");
2379
2380 break;
2381
2382 case STATUSTYPE_TABLE:
2383 DMEMIT("%s %s %lu %llu ",
2384 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2385 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2386 (unsigned long)pool->sectors_per_block,
2387 (unsigned long long)pt->low_water_blocks);
2388 emit_flags(&pt->requested_pf, result, sz, maxlen);
2389 break;
2390 }
2391
2392 return 0;
2393}
2394
2395static int pool_iterate_devices(struct dm_target *ti,
2396 iterate_devices_callout_fn fn, void *data)
2397{
2398 struct pool_c *pt = ti->private;
2399
2400 return fn(ti, pt->data_dev, 0, ti->len, data);
2401}
2402
2403static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2404 struct bio_vec *biovec, int max_size)
2405{
2406 struct pool_c *pt = ti->private;
2407 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2408
2409 if (!q->merge_bvec_fn)
2410 return max_size;
2411
2412 bvm->bi_bdev = pt->data_dev->bdev;
2413
2414 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2415}
2416
2417static bool block_size_is_power_of_two(struct pool *pool)
2418{
2419 return pool->sectors_per_block_shift >= 0;
2420}
2421
2422static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2423{
2424 struct pool *pool = pt->pool;
2425 struct queue_limits *data_limits;
2426
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * discard_granularity is just a hint, and not enforced.
2431 */
2432 if (pt->adjusted_pf.discard_passdown) {
2433 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2434 limits->discard_granularity = data_limits->discard_granularity;
2435 } else if (block_size_is_power_of_two(pool))
2436 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2437 else
2438 /*
2439 * Use largest power of 2 that is a factor of sectors_per_block
2440 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2441 */
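		/*
		 * e.g. a 384-sector (192KiB) block yields a granularity of
		 * 64KiB (its largest power-of-2 factor, 128 sectors).
		 */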
2442 limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2443 DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2444}
2445
2446static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2447{
2448 struct pool_c *pt = ti->private;
2449 struct pool *pool = pt->pool;
2450
2451 blk_limits_io_min(limits, 0);
2452 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2453
2454 /*
2455 * pt->adjusted_pf is a staging area for the actual features to use.
2456 * They get transferred to the live pool in bind_control_target()
2457 * called from pool_preresume().
2458 */
2459 if (!pt->adjusted_pf.discard_enabled)
2460 return;
2461
2462 disable_passdown_if_not_supported(pt);
2463
2464 set_discard_limits(pt, limits);
2465}
2466
2467static struct target_type pool_target = {
2468 .name = "thin-pool",
2469 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2470 DM_TARGET_IMMUTABLE,
2471 .version = {1, 6, 0},
2472 .module = THIS_MODULE,
2473 .ctr = pool_ctr,
2474 .dtr = pool_dtr,
2475 .map = pool_map,
2476 .postsuspend = pool_postsuspend,
2477 .preresume = pool_preresume,
2478 .resume = pool_resume,
2479 .message = pool_message,
2480 .status = pool_status,
2481 .merge = pool_merge,
2482 .iterate_devices = pool_iterate_devices,
2483 .io_hints = pool_io_hints,
2484};
2485
2486/*----------------------------------------------------------------
2487 * Thin target methods
2488 *--------------------------------------------------------------*/
2489static void thin_dtr(struct dm_target *ti)
2490{
2491 struct thin_c *tc = ti->private;
2492
2493 mutex_lock(&dm_thin_pool_table.mutex);
2494
2495 __pool_dec(tc->pool);
2496 dm_pool_close_thin_device(tc->td);
2497 dm_put_device(ti, tc->pool_dev);
2498 if (tc->origin_dev)
2499 dm_put_device(ti, tc->origin_dev);
2500 kfree(tc);
2501
2502 mutex_unlock(&dm_thin_pool_table.mutex);
2503}
2504
2505/*
2506 * Thin target parameters:
2507 *
2508 * <pool_dev> <dev_id> [origin_dev]
2509 *
2510 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2511 * dev_id: the internal device identifier
2512 * origin_dev: a device external to the pool that should act as the origin
2513 *
2514 * If the pool device has discards disabled, they get disabled for the thin
2515 * device as well.
2516 */
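/*
 * Example table lines (lengths and device names are illustrative only):
 *
 *   0 2097152 thin /dev/mapper/my_pool 0
 *   0 2097152 thin /dev/mapper/my_pool 1 /dev/mapper/external-origin
 */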
2517static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2518{
2519 int r;
2520 struct thin_c *tc;
2521 struct dm_dev *pool_dev, *origin_dev;
2522 struct mapped_device *pool_md;
2523
2524 mutex_lock(&dm_thin_pool_table.mutex);
2525
2526 if (argc != 2 && argc != 3) {
2527 ti->error = "Invalid argument count";
2528 r = -EINVAL;
2529 goto out_unlock;
2530 }
2531
2532 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2533 if (!tc) {
2534 ti->error = "Out of memory";
2535 r = -ENOMEM;
2536 goto out_unlock;
2537 }
2538
2539 if (argc == 3) {
2540 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2541 if (r) {
2542 ti->error = "Error opening origin device";
2543 goto bad_origin_dev;
2544 }
2545 tc->origin_dev = origin_dev;
2546 }
2547
2548 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2549 if (r) {
2550 ti->error = "Error opening pool device";
2551 goto bad_pool_dev;
2552 }
2553 tc->pool_dev = pool_dev;
2554
2555 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2556 ti->error = "Invalid device id";
2557 r = -EINVAL;
2558 goto bad_common;
2559 }
2560
2561 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2562 if (!pool_md) {
2563 ti->error = "Couldn't get pool mapped device";
2564 r = -EINVAL;
2565 goto bad_common;
2566 }
2567
2568 tc->pool = __pool_table_lookup(pool_md);
2569 if (!tc->pool) {
2570 ti->error = "Couldn't find pool object";
2571 r = -EINVAL;
2572 goto bad_pool_lookup;
2573 }
2574 __pool_inc(tc->pool);
2575
2576 if (get_pool_mode(tc->pool) == PM_FAIL) {
2577 ti->error = "Couldn't open thin device, pool is in fail mode";
	r = -EINVAL;
2578 goto bad_thin_open;
2579 }
2580
2581 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2582 if (r) {
2583 ti->error = "Couldn't open thin internal device";
2584 goto bad_thin_open;
2585 }
2586
2587 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2588 if (r)
2589 goto bad_thin_open;
2590
2591 ti->num_flush_requests = 1;
2592 ti->flush_supported = true;
2593 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
2594
2595 /* In case the pool supports discards, pass them on. */
2596 if (tc->pool->pf.discard_enabled) {
2597 ti->discards_supported = true;
2598 ti->num_discard_requests = 1;
2599 ti->discard_zeroes_data_unsupported = true;
2600 /* Discard requests must be split on a block boundary */
2601 ti->split_discard_requests = true;
2602 }
2603
2604 dm_put(pool_md);
2605
2606 mutex_unlock(&dm_thin_pool_table.mutex);
2607
2608 return 0;
2609
2610bad_thin_open:
2611 __pool_dec(tc->pool);
2612bad_pool_lookup:
2613 dm_put(pool_md);
2614bad_common:
2615 dm_put_device(ti, tc->pool_dev);
2616bad_pool_dev:
2617 if (tc->origin_dev)
2618 dm_put_device(ti, tc->origin_dev);
2619bad_origin_dev:
2620 kfree(tc);
2621out_unlock:
2622 mutex_unlock(&dm_thin_pool_table.mutex);
2623
2624 return r;
2625}
2626
2627static int thin_map(struct dm_target *ti, struct bio *bio)
2628{
2629 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2630
2631 return thin_bio_map(ti, bio);
2632}
2633
2634static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2635{
2636 unsigned long flags;
2637 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2638 struct list_head work;
2639 struct dm_thin_new_mapping *m, *tmp;
2640 struct pool *pool = h->tc->pool;
2641
2642 if (h->shared_read_entry) {
2643 INIT_LIST_HEAD(&work);
2644 dm_deferred_entry_dec(h->shared_read_entry, &work);
2645
2646 spin_lock_irqsave(&pool->lock, flags);
2647 list_for_each_entry_safe(m, tmp, &work, list) {
2648 list_del(&m->list);
2649 m->quiesced = 1;
2650 __maybe_add_mapping(m);
2651 }
2652 spin_unlock_irqrestore(&pool->lock, flags);
2653 }
2654
2655 if (h->all_io_entry) {
2656 INIT_LIST_HEAD(&work);
2657 dm_deferred_entry_dec(h->all_io_entry, &work);
2658 if (!list_empty(&work)) {
2659 spin_lock_irqsave(&pool->lock, flags);
2660 list_for_each_entry_safe(m, tmp, &work, list)
2661 list_add(&m->list, &pool->prepared_discards);
2662 spin_unlock_irqrestore(&pool->lock, flags);
2663 wake_worker(pool);
2664 }
2665 }
2666
2667 return 0;
2668}
2669
2670static void thin_postsuspend(struct dm_target *ti)
2671{
2672 if (dm_noflush_suspending(ti))
2673 requeue_io((struct thin_c *)ti->private);
2674}
2675
2676/*
2677 * <nr mapped sectors> <highest mapped sector>
2678 */
2679static int thin_status(struct dm_target *ti, status_type_t type,
2680 unsigned status_flags, char *result, unsigned maxlen)
2681{
2682 int r;
2683 ssize_t sz = 0;
2684 dm_block_t mapped, highest;
2685 char buf[BDEVNAME_SIZE];
2686 struct thin_c *tc = ti->private;
2687
2688 if (get_pool_mode(tc->pool) == PM_FAIL) {
2689 DMEMIT("Fail");
2690 return 0;
2691 }
2692
2693 if (!tc->td)
2694 DMEMIT("-");
2695 else {
2696 switch (type) {
2697 case STATUSTYPE_INFO:
2698 r = dm_thin_get_mapped_count(tc->td, &mapped);
2699 if (r)
2700 return r;
2701
2702 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2703 if (r < 0)
2704 return r;
2705
2706 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2707 if (r)
2708 DMEMIT("%llu", ((highest + 1) *
2709 tc->pool->sectors_per_block) - 1);
2710 else
2711 DMEMIT("-");
2712 break;
2713
2714 case STATUSTYPE_TABLE:
2715 DMEMIT("%s %lu",
2716 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2717 (unsigned long) tc->dev_id);
2718 if (tc->origin_dev)
2719 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2720 break;
2721 }
2722 }
2723
2724 return 0;
2725}
2726
2727static int thin_iterate_devices(struct dm_target *ti,
2728 iterate_devices_callout_fn fn, void *data)
2729{
2730 sector_t blocks;
2731 struct thin_c *tc = ti->private;
2732 struct pool *pool = tc->pool;
2733
2734 /*
2735 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2736 * we follow a more convoluted path through to the pool's target.
2737 */
2738 if (!pool->ti)
2739 return 0; /* nothing is bound */
2740
2741 blocks = pool->ti->len;
2742 (void) sector_div(blocks, pool->sectors_per_block);
2743 if (blocks)
2744 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2745
2746 return 0;
2747}
2748
2749/*
2750 * A thin device always inherits its queue limits from its pool.
2751 */
2752static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2753{
2754 struct thin_c *tc = ti->private;
2755
2756 *limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
2757}
2758
2759static struct target_type thin_target = {
2760 .name = "thin",
2761 .version = {1, 6, 0},
2762 .module = THIS_MODULE,
2763 .ctr = thin_ctr,
2764 .dtr = thin_dtr,
2765 .map = thin_map,
2766 .end_io = thin_endio,
2767 .postsuspend = thin_postsuspend,
2768 .status = thin_status,
2769 .iterate_devices = thin_iterate_devices,
2770 .io_hints = thin_io_hints,
2771};
2772
2773/*----------------------------------------------------------------*/
2774
2775static int __init dm_thin_init(void)
2776{
2777 int r;
2778
2779 pool_table_init();
2780
2781 r = dm_register_target(&thin_target);
2782 if (r)
2783 return r;
2784
2785 r = dm_register_target(&pool_target);
2786 if (r)
2787 goto bad_pool_target;
2788
2789 r = -ENOMEM;
2790
2791 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2792 if (!_new_mapping_cache)
2793 goto bad_new_mapping_cache;
2794
2795 return 0;
2796
2797bad_new_mapping_cache:
2798 dm_unregister_target(&pool_target);
2799bad_pool_target:
2800 dm_unregister_target(&thin_target);
2801
2802 return r;
2803}
2804
2805static void dm_thin_exit(void)
2806{
2807 dm_unregister_target(&thin_target);
2808 dm_unregister_target(&pool_target);
2809
2810 kmem_cache_destroy(_new_mapping_cache);
2811}
2812
2813module_init(dm_thin_init);
2814module_exit(dm_thin_exit);
2815
2816MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2817MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2818MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 8efe033bab5..6b1e3b61b25 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <linux/kobject.h>
 #include <linux/dm-ioctl.h>
-#include <linux/export.h>
 
 #include "dm.h"
 #include "dm-uevent.h"
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
deleted file mode 100644
index 52cde982164..00000000000
--- a/drivers/md/dm-verity.c
+++ /dev/null
@@ -1,898 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * the default prefetch value. Data is read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *vec_mempool; /* mempool of bio vector */
59
60 struct workqueue_struct *verify_wq;
61
62 /* starting blocks for each tree level. 0 is the lowest level. */
63 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
64};
65
66struct dm_verity_io {
67 struct dm_verity *v;
68
69 /* original values of bio->bi_end_io and bio->bi_private */
70 bio_end_io_t *orig_bi_end_io;
71 void *orig_bi_private;
72
73 sector_t block;
74 unsigned n_blocks;
75
76 /* saved bio vector */
77 struct bio_vec *io_vec;
78 unsigned io_vec_size;
79
80 struct work_struct work;
81
82 /* A space for short vectors; longer vectors are allocated separately. */
83 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
84
85 /*
86 * Three variably-sized fields follow this struct:
87 *
88 * u8 hash_desc[v->shash_descsize];
89 * u8 real_digest[v->digest_size];
90 * u8 want_digest[v->digest_size];
91 *
92 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
93 */
94};
95
96static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
97{
98 return (struct shash_desc *)(io + 1);
99}
100
101static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
102{
103 return (u8 *)(io + 1) + v->shash_descsize;
104}
105
106static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
107{
108 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
109}
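For illustration only, a small stand-alone sketch of how the per-io area described above is sized: the fixed structure followed by the shash descriptor and the two digests (the sizes passed in main() are hypothetical, roughly what sha256 would need):

#include <stddef.h>
#include <stdio.h>

struct example_io {			/* simplified stand-in for struct dm_verity_io */
	unsigned long long block;
	unsigned n_blocks;
};

/* hash_desc, real_digest and want_digest follow the struct back to back */
static size_t example_io_size(size_t shash_descsize, size_t digest_size)
{
	return sizeof(struct example_io) + shash_descsize + 2 * digest_size;
}

int main(void)
{
	printf("per-io area: %zu bytes\n", example_io_size(104, 32));
	return 0;
}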
110
111/*
 112 * Auxiliary structure appended to each dm-bufio buffer. If the value
 113 * hash_verified is nonzero, the hash of the block has been verified.
 114 *
 115 * The variable hash_verified is set to 0 when the buffer is allocated; it
 116 * may later be changed to 1 and is never reset to 0 again.
 117 *
 118 * There is no lock around this value. At worst, a race condition can cause
 119 * several processes to verify the hash of the same buffer concurrently and
 120 * all of them to write 1 to hash_verified.
 121 * This condition is harmless, so we don't need locking.
122 */
123struct buffer_aux {
124 int hash_verified;
125};
126
127/*
128 * Initialize struct buffer_aux for a freshly created buffer.
129 */
130static void dm_bufio_alloc_callback(struct dm_buffer *buf)
131{
132 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
133
134 aux->hash_verified = 0;
135}
136
137/*
138 * Translate input sector number to the sector number on the target device.
139 */
140static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
141{
142 return v->data_start + dm_target_offset(v->ti, bi_sector);
143}
144
145/*
146 * Return hash position of a specified block at a specified tree level
147 * (0 is the lowest level).
 149 * The lowest "hash_per_block_bits" bits of the result denote the hash position
 150 * inside a hash block. The remaining bits denote the location of the hash block.
150 */
151static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
152 int level)
153{
154 return block >> (level * v->hash_per_block_bits);
155}
156
157static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
158 sector_t *hash_block, unsigned *offset)
159{
160 sector_t position = verity_position_at_level(v, block, level);
161 unsigned idx;
162
163 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
164
165 if (!offset)
166 return;
167
168 idx = position & ((1 << v->hash_per_block_bits) - 1);
169 if (!v->version)
170 *offset = idx * v->digest_size;
171 else
172 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
173}
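A worked illustration (not from the original file) of the addressing arithmetic above, for assumed parameters of 4096-byte hash blocks holding 128 sha256 digests (hash_per_block_bits = 7); the printed hash block index is relative to hash_level_block[0]:

#include <stdio.h>

int main(void)
{
	unsigned hash_per_block_bits = 7;	/* assumed: 128 digests per hash block */
	unsigned digest_size = 32;		/* assumed: sha256 */
	unsigned long long block = 1000;	/* arbitrary data block index */

	/* level-0 position of this block's digest */
	unsigned long long position = block >> (0 * hash_per_block_bits);
	unsigned long long hash_block = position >> hash_per_block_bits;
	unsigned idx = (unsigned)(position & ((1u << hash_per_block_bits) - 1));

	printf("hash block %llu, offset %u (version-0 layout)\n",
	       hash_block, idx * digest_size);	/* prints: hash block 7, offset 3328 */
	return 0;
}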
174
175/*
176 * Verify hash of a metadata block pertaining to the specified data block
177 * ("block" argument) at a specified level ("level" argument).
178 *
179 * On successful return, io_want_digest(v, io) contains the hash value for
 180 * a lower tree level or for the data block (if we're at the lowest level).
 181 *
 182 * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned.
 183 * If "skip_unverified" is false, an unverified buffer is hashed and verified
 184 * against the current value of io_want_digest(v, io).
185 */
186static int verity_verify_level(struct dm_verity_io *io, sector_t block,
187 int level, bool skip_unverified)
188{
189 struct dm_verity *v = io->v;
190 struct dm_buffer *buf;
191 struct buffer_aux *aux;
192 u8 *data;
193 int r;
194 sector_t hash_block;
195 unsigned offset;
196
197 verity_hash_at_level(v, block, level, &hash_block, &offset);
198
199 data = dm_bufio_read(v->bufio, hash_block, &buf);
200 if (unlikely(IS_ERR(data)))
201 return PTR_ERR(data);
202
203 aux = dm_bufio_get_aux_data(buf);
204
205 if (!aux->hash_verified) {
206 struct shash_desc *desc;
207 u8 *result;
208
209 if (skip_unverified) {
210 r = 1;
211 goto release_ret_r;
212 }
213
214 desc = io_hash_desc(v, io);
215 desc->tfm = v->tfm;
216 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
217 r = crypto_shash_init(desc);
218 if (r < 0) {
219 DMERR("crypto_shash_init failed: %d", r);
220 goto release_ret_r;
221 }
222
223 if (likely(v->version >= 1)) {
224 r = crypto_shash_update(desc, v->salt, v->salt_size);
225 if (r < 0) {
226 DMERR("crypto_shash_update failed: %d", r);
227 goto release_ret_r;
228 }
229 }
230
231 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
232 if (r < 0) {
233 DMERR("crypto_shash_update failed: %d", r);
234 goto release_ret_r;
235 }
236
237 if (!v->version) {
238 r = crypto_shash_update(desc, v->salt, v->salt_size);
239 if (r < 0) {
240 DMERR("crypto_shash_update failed: %d", r);
241 goto release_ret_r;
242 }
243 }
244
245 result = io_real_digest(v, io);
246 r = crypto_shash_final(desc, result);
247 if (r < 0) {
248 DMERR("crypto_shash_final failed: %d", r);
249 goto release_ret_r;
250 }
251 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
252 DMERR_LIMIT("metadata block %llu is corrupted",
253 (unsigned long long)hash_block);
254 v->hash_failed = 1;
255 r = -EIO;
256 goto release_ret_r;
257 } else
258 aux->hash_verified = 1;
259 }
260
261 data += offset;
262
263 memcpy(io_want_digest(v, io), data, v->digest_size);
264
265 dm_bufio_release(buf);
266 return 0;
267
268release_ret_r:
269 dm_bufio_release(buf);
270
271 return r;
272}
273
274/*
275 * Verify one "dm_verity_io" structure.
276 */
277static int verity_verify_io(struct dm_verity_io *io)
278{
279 struct dm_verity *v = io->v;
280 unsigned b;
281 int i;
282 unsigned vector = 0, offset = 0;
283
284 for (b = 0; b < io->n_blocks; b++) {
285 struct shash_desc *desc;
286 u8 *result;
287 int r;
288 unsigned todo;
289
290 if (likely(v->levels)) {
291 /*
292 * First, we try to get the requested hash for
293 * the current block. If the hash block itself is
294 * verified, zero is returned. If it isn't, this
 295 * function returns 1 and we fall back to whole
296 * chain verification.
297 */
298 int r = verity_verify_level(io, io->block + b, 0, true);
299 if (likely(!r))
300 goto test_block_hash;
301 if (r < 0)
302 return r;
303 }
304
305 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
306
307 for (i = v->levels - 1; i >= 0; i--) {
308 int r = verity_verify_level(io, io->block + b, i, false);
309 if (unlikely(r))
310 return r;
311 }
312
313test_block_hash:
314 desc = io_hash_desc(v, io);
315 desc->tfm = v->tfm;
316 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
317 r = crypto_shash_init(desc);
318 if (r < 0) {
319 DMERR("crypto_shash_init failed: %d", r);
320 return r;
321 }
322
323 if (likely(v->version >= 1)) {
324 r = crypto_shash_update(desc, v->salt, v->salt_size);
325 if (r < 0) {
326 DMERR("crypto_shash_update failed: %d", r);
327 return r;
328 }
329 }
330
331 todo = 1 << v->data_dev_block_bits;
332 do {
333 struct bio_vec *bv;
334 u8 *page;
335 unsigned len;
336
337 BUG_ON(vector >= io->io_vec_size);
338 bv = &io->io_vec[vector];
339 page = kmap_atomic(bv->bv_page);
340 len = bv->bv_len - offset;
341 if (likely(len >= todo))
342 len = todo;
343 r = crypto_shash_update(desc,
344 page + bv->bv_offset + offset, len);
345 kunmap_atomic(page);
346 if (r < 0) {
347 DMERR("crypto_shash_update failed: %d", r);
348 return r;
349 }
350 offset += len;
351 if (likely(offset == bv->bv_len)) {
352 offset = 0;
353 vector++;
354 }
355 todo -= len;
356 } while (todo);
357
358 if (!v->version) {
359 r = crypto_shash_update(desc, v->salt, v->salt_size);
360 if (r < 0) {
361 DMERR("crypto_shash_update failed: %d", r);
362 return r;
363 }
364 }
365
366 result = io_real_digest(v, io);
367 r = crypto_shash_final(desc, result);
368 if (r < 0) {
369 DMERR("crypto_shash_final failed: %d", r);
370 return r;
371 }
372 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
373 DMERR_LIMIT("data block %llu is corrupted",
374 (unsigned long long)(io->block + b));
375 v->hash_failed = 1;
376 return -EIO;
377 }
378 }
379 BUG_ON(vector != io->io_vec_size);
380 BUG_ON(offset);
381
382 return 0;
383}
384
385/*
386 * End one "io" structure with a given error.
387 */
388static void verity_finish_io(struct dm_verity_io *io, int error)
389{
390 struct dm_verity *v = io->v;
391 struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
392
393 bio->bi_end_io = io->orig_bi_end_io;
394 bio->bi_private = io->orig_bi_private;
395
396 if (io->io_vec != io->io_vec_inline)
397 mempool_free(io->io_vec, v->vec_mempool);
398
399 bio_endio(bio, error);
400}
401
402static void verity_work(struct work_struct *w)
403{
404 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
405
406 verity_finish_io(io, verity_verify_io(io));
407}
408
409static void verity_end_io(struct bio *bio, int error)
410{
411 struct dm_verity_io *io = bio->bi_private;
412
413 if (error) {
414 verity_finish_io(io, error);
415 return;
416 }
417
418 INIT_WORK(&io->work, verity_work);
419 queue_work(io->v->verify_wq, &io->work);
420}
421
422/*
423 * Prefetch buffers for the specified io.
 424 * The root buffer is not prefetched; it is assumed to be cached
 425 * all the time.
426 */
427static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
428{
429 int i;
430
431 for (i = v->levels - 2; i >= 0; i--) {
432 sector_t hash_block_start;
433 sector_t hash_block_end;
434 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
435 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
436 if (!i) {
437 unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
438
439 cluster >>= v->data_dev_block_bits;
440 if (unlikely(!cluster))
441 goto no_prefetch_cluster;
442
443 if (unlikely(cluster & (cluster - 1)))
444 cluster = 1 << (fls(cluster) - 1);
445
446 hash_block_start &= ~(sector_t)(cluster - 1);
447 hash_block_end |= cluster - 1;
448 if (unlikely(hash_block_end >= v->hash_blocks))
449 hash_block_end = v->hash_blocks - 1;
450 }
451no_prefetch_cluster:
452 dm_bufio_prefetch(v->bufio, hash_block_start,
453 hash_block_end - hash_block_start + 1);
454 }
455}
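As a hedged aside, a tiny user-space sketch of the power-of-two rounding applied to the prefetch cluster above, using the gcc/clang builtin __builtin_clz() as a stand-in for the kernel's fls():

#include <stdio.h>

int main(void)
{
	unsigned cluster = 100;		/* hypothetical non-power-of-two block count */

	if (cluster & (cluster - 1))
		cluster = 1u << (31 - __builtin_clz(cluster));	/* rounds down to 64 */

	printf("prefetch cluster rounded to %u blocks\n", cluster);
	return 0;
}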
456
457/*
 458 * Bio map function. It allocates a dm_verity_io structure and a bio vector,
 459 * fills them, and then issues the prefetches and the I/O.
460 */
461static int verity_map(struct dm_target *ti, struct bio *bio)
462{
463 struct dm_verity *v = ti->private;
464 struct dm_verity_io *io;
465
466 bio->bi_bdev = v->data_dev->bdev;
467 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
468
469 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
470 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
471 DMERR_LIMIT("unaligned io");
472 return -EIO;
473 }
474
475 if ((bio->bi_sector + bio_sectors(bio)) >>
476 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
477 DMERR_LIMIT("io out of range");
478 return -EIO;
479 }
480
481 if (bio_data_dir(bio) == WRITE)
482 return -EIO;
483
484 io = dm_per_bio_data(bio, ti->per_bio_data_size);
485 io->v = v;
486 io->orig_bi_end_io = bio->bi_end_io;
487 io->orig_bi_private = bio->bi_private;
488 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
489 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
490
491 bio->bi_end_io = verity_end_io;
492 bio->bi_private = io;
493 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
494 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
495 io->io_vec = io->io_vec_inline;
496 else
497 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
498 memcpy(io->io_vec, bio_iovec(bio),
499 io->io_vec_size * sizeof(struct bio_vec));
500
501 verity_prefetch_io(v, io);
502
503 generic_make_request(bio);
504
505 return DM_MAPIO_SUBMITTED;
506}
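For illustration only, a stand-alone sketch of the alignment test performed at the top of verity_map(), with an assumed 4096-byte data block (eight 512-byte sectors); both the start sector and the length must be block aligned:

#include <stdio.h>

int main(void)
{
	unsigned data_dev_block_bits = 12;	/* assumed: 4096-byte data blocks */
	unsigned sector_shift = 9;		/* 512-byte sectors */
	unsigned long long bi_sector = 24;	/* example start sector */
	unsigned nr_sectors = 16;		/* example length in sectors */

	unsigned mask = (1u << (data_dev_block_bits - sector_shift)) - 1;

	if (((unsigned)bi_sector | nr_sectors) & mask)
		printf("unaligned io\n");
	else
		printf("aligned: starts at block %llu, %u blocks long\n",
		       bi_sector >> (data_dev_block_bits - sector_shift),
		       nr_sectors >> (data_dev_block_bits - sector_shift));
	return 0;
}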
507
508/*
509 * Status: V (valid) or C (corruption found)
510 */
511static int verity_status(struct dm_target *ti, status_type_t type,
512 unsigned status_flags, char *result, unsigned maxlen)
513{
514 struct dm_verity *v = ti->private;
515 unsigned sz = 0;
516 unsigned x;
517
518 switch (type) {
519 case STATUSTYPE_INFO:
520 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
521 break;
522 case STATUSTYPE_TABLE:
523 DMEMIT("%u %s %s %u %u %llu %llu %s ",
524 v->version,
525 v->data_dev->name,
526 v->hash_dev->name,
527 1 << v->data_dev_block_bits,
528 1 << v->hash_dev_block_bits,
529 (unsigned long long)v->data_blocks,
530 (unsigned long long)v->hash_start,
531 v->alg_name
532 );
533 for (x = 0; x < v->digest_size; x++)
534 DMEMIT("%02x", v->root_digest[x]);
535 DMEMIT(" ");
536 if (!v->salt_size)
537 DMEMIT("-");
538 else
539 for (x = 0; x < v->salt_size; x++)
540 DMEMIT("%02x", v->salt[x]);
541 break;
542 }
543
544 return 0;
545}
546
547static int verity_ioctl(struct dm_target *ti, unsigned cmd,
548 unsigned long arg)
549{
550 struct dm_verity *v = ti->private;
551 int r = 0;
552
553 if (v->data_start ||
554 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
555 r = scsi_verify_blk_ioctl(NULL, cmd);
556
557 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
558 cmd, arg);
559}
560
561static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
562 struct bio_vec *biovec, int max_size)
563{
564 struct dm_verity *v = ti->private;
565 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
566
567 if (!q->merge_bvec_fn)
568 return max_size;
569
570 bvm->bi_bdev = v->data_dev->bdev;
571 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
572
573 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
574}
575
576static int verity_iterate_devices(struct dm_target *ti,
577 iterate_devices_callout_fn fn, void *data)
578{
579 struct dm_verity *v = ti->private;
580
581 return fn(ti, v->data_dev, v->data_start, ti->len, data);
582}
583
584static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
585{
586 struct dm_verity *v = ti->private;
587
588 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
589 limits->logical_block_size = 1 << v->data_dev_block_bits;
590
591 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
592 limits->physical_block_size = 1 << v->data_dev_block_bits;
593
594 blk_limits_io_min(limits, limits->logical_block_size);
595}
596
597static void verity_dtr(struct dm_target *ti)
598{
599 struct dm_verity *v = ti->private;
600
601 if (v->verify_wq)
602 destroy_workqueue(v->verify_wq);
603
604 if (v->vec_mempool)
605 mempool_destroy(v->vec_mempool);
606
607 if (v->bufio)
608 dm_bufio_client_destroy(v->bufio);
609
610 kfree(v->salt);
611 kfree(v->root_digest);
612
613 if (v->tfm)
614 crypto_free_shash(v->tfm);
615
616 kfree(v->alg_name);
617
618 if (v->hash_dev)
619 dm_put_device(ti, v->hash_dev);
620
621 if (v->data_dev)
622 dm_put_device(ti, v->data_dev);
623
624 kfree(v);
625}
626
627/*
628 * Target parameters:
629 * <version> The current format is version 1.
 630 * Version 0 is compatible with the original Chromium OS releases.
631 * <data device>
632 * <hash device>
633 * <data block size>
634 * <hash block size>
635 * <the number of data blocks>
636 * <hash start block>
637 * <algorithm>
638 * <digest>
639 * <salt> Hex string or "-" if no salt.
640 */
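To make the parameter list concrete, a hypothetical ten-entry table line expressed as the argv[] this constructor would receive; the device paths, block counts and digest are made-up example values, not taken from any real setup:

/* hypothetical example values only; the digest entry is a placeholder */
static const char *example_verity_table[10] = {
	"1",				/* version */
	"/dev/mapper/data",		/* data device (made up) */
	"/dev/mapper/hash",		/* hash device (made up) */
	"4096",				/* data block size */
	"4096",				/* hash block size */
	"262144",			/* number of data blocks */
	"1",				/* hash start block */
	"sha256",			/* algorithm */
	"<64-hex-digit-root-digest>",	/* placeholder, not a real digest */
	"-",				/* no salt */
};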
641static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
642{
643 struct dm_verity *v;
644 unsigned num;
645 unsigned long long num_ll;
646 int r;
647 int i;
648 sector_t hash_position;
649 char dummy;
650
651 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
652 if (!v) {
653 ti->error = "Cannot allocate verity structure";
654 return -ENOMEM;
655 }
656 ti->private = v;
657 v->ti = ti;
658
659 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
660 ti->error = "Device must be readonly";
661 r = -EINVAL;
662 goto bad;
663 }
664
665 if (argc != 10) {
666 ti->error = "Invalid argument count: exactly 10 arguments required";
667 r = -EINVAL;
668 goto bad;
669 }
670
671 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
672 num < 0 || num > 1) {
673 ti->error = "Invalid version";
674 r = -EINVAL;
675 goto bad;
676 }
677 v->version = num;
678
679 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
680 if (r) {
681 ti->error = "Data device lookup failed";
682 goto bad;
683 }
684
685 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
686 if (r) {
 687 ti->error = "Hash device lookup failed";
688 goto bad;
689 }
690
691 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
692 !num || (num & (num - 1)) ||
693 num < bdev_logical_block_size(v->data_dev->bdev) ||
694 num > PAGE_SIZE) {
695 ti->error = "Invalid data device block size";
696 r = -EINVAL;
697 goto bad;
698 }
699 v->data_dev_block_bits = ffs(num) - 1;
700
701 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
702 !num || (num & (num - 1)) ||
703 num < bdev_logical_block_size(v->hash_dev->bdev) ||
704 num > INT_MAX) {
705 ti->error = "Invalid hash device block size";
706 r = -EINVAL;
707 goto bad;
708 }
709 v->hash_dev_block_bits = ffs(num) - 1;
710
711 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
712 (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
713 >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
714 ti->error = "Invalid data blocks";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->data_blocks = num_ll;
719
720 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
721 ti->error = "Data device is too small";
722 r = -EINVAL;
723 goto bad;
724 }
725
726 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
727 (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
728 >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
729 ti->error = "Invalid hash start";
730 r = -EINVAL;
731 goto bad;
732 }
733 v->hash_start = num_ll;
734
735 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
736 if (!v->alg_name) {
737 ti->error = "Cannot allocate algorithm name";
738 r = -ENOMEM;
739 goto bad;
740 }
741
742 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
743 if (IS_ERR(v->tfm)) {
744 ti->error = "Cannot initialize hash function";
745 r = PTR_ERR(v->tfm);
746 v->tfm = NULL;
747 goto bad;
748 }
749 v->digest_size = crypto_shash_digestsize(v->tfm);
750 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
751 ti->error = "Digest size too big";
752 r = -EINVAL;
753 goto bad;
754 }
755 v->shash_descsize =
756 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
757
758 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
759 if (!v->root_digest) {
760 ti->error = "Cannot allocate root digest";
761 r = -ENOMEM;
762 goto bad;
763 }
764 if (strlen(argv[8]) != v->digest_size * 2 ||
765 hex2bin(v->root_digest, argv[8], v->digest_size)) {
766 ti->error = "Invalid root digest";
767 r = -EINVAL;
768 goto bad;
769 }
770
771 if (strcmp(argv[9], "-")) {
772 v->salt_size = strlen(argv[9]) / 2;
773 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
774 if (!v->salt) {
775 ti->error = "Cannot allocate salt";
776 r = -ENOMEM;
777 goto bad;
778 }
779 if (strlen(argv[9]) != v->salt_size * 2 ||
780 hex2bin(v->salt, argv[9], v->salt_size)) {
781 ti->error = "Invalid salt";
782 r = -EINVAL;
783 goto bad;
784 }
785 }
786
787 v->hash_per_block_bits =
788 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
789
790 v->levels = 0;
791 if (v->data_blocks)
792 while (v->hash_per_block_bits * v->levels < 64 &&
793 (unsigned long long)(v->data_blocks - 1) >>
794 (v->hash_per_block_bits * v->levels))
795 v->levels++;
796
797 if (v->levels > DM_VERITY_MAX_LEVELS) {
798 ti->error = "Too many tree levels";
799 r = -E2BIG;
800 goto bad;
801 }
802
803 hash_position = v->hash_start;
804 for (i = v->levels - 1; i >= 0; i--) {
805 sector_t s;
806 v->hash_level_block[i] = hash_position;
807 s = verity_position_at_level(v, v->data_blocks, i);
808 s = (s >> v->hash_per_block_bits) +
809 !!(s & ((1 << v->hash_per_block_bits) - 1));
810 if (hash_position + s < hash_position) {
811 ti->error = "Hash device offset overflow";
812 r = -E2BIG;
813 goto bad;
814 }
815 hash_position += s;
816 }
817 v->hash_blocks = hash_position;
818
819 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
820 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
821 dm_bufio_alloc_callback, NULL);
822 if (IS_ERR(v->bufio)) {
823 ti->error = "Cannot initialize dm-bufio";
824 r = PTR_ERR(v->bufio);
825 v->bufio = NULL;
826 goto bad;
827 }
828
829 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
830 ti->error = "Hash device is too small";
831 r = -E2BIG;
832 goto bad;
833 }
834
835 ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
836
837 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
838 BIO_MAX_PAGES * sizeof(struct bio_vec));
839 if (!v->vec_mempool) {
840 ti->error = "Cannot allocate vector mempool";
841 r = -ENOMEM;
842 goto bad;
843 }
844
845 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
846 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
847 if (!v->verify_wq) {
848 ti->error = "Cannot allocate workqueue";
849 r = -ENOMEM;
850 goto bad;
851 }
852
853 return 0;
854
855bad:
856 verity_dtr(ti);
857
858 return r;
859}
860
861static struct target_type verity_target = {
862 .name = "verity",
863 .version = {1, 1, 0},
864 .module = THIS_MODULE,
865 .ctr = verity_ctr,
866 .dtr = verity_dtr,
867 .map = verity_map,
868 .status = verity_status,
869 .ioctl = verity_ioctl,
870 .merge = verity_merge,
871 .iterate_devices = verity_iterate_devices,
872 .io_hints = verity_io_hints,
873};
874
875static int __init dm_verity_init(void)
876{
877 int r;
878
879 r = dm_register_target(&verity_target);
880 if (r < 0)
881 DMERR("register failed %d", r);
882
883 return r;
884}
885
886static void __exit dm_verity_exit(void)
887{
888 dm_unregister_target(&verity_target);
889}
890
891module_init(dm_verity_init);
892module_exit(dm_verity_exit);
893
894MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
895MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
896MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
897MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
898MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 69a5c3b3b34..cc2b3cb8194 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -33,7 +33,8 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
33/* 33/*
34 * Return zeros only on reads 34 * Return zeros only on reads
35 */ 35 */
36static int zero_map(struct dm_target *ti, struct bio *bio) 36static int zero_map(struct dm_target *ti, struct bio *bio,
37 union map_info *map_context)
37{ 38{
38 switch(bio_rw(bio)) { 39 switch(bio_rw(bio)) {
39 case READ: 40 case READ:
@@ -55,7 +56,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
55 56
56static struct target_type zero_target = { 57static struct target_type zero_target = {
57 .name = "zero", 58 .name = "zero",
58 .version = {1, 1, 0}, 59 .version = {1, 0, 0},
59 .module = THIS_MODULE, 60 .module = THIS_MODULE,
60 .ctr = zero_ctr, 61 .ctr = zero_ctr,
61 .map = zero_map, 62 .map = zero_map,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c72e4d5a961..52b39f335bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -14,6 +14,7 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/blkpg.h> 15#include <linux/blkpg.h>
16#include <linux/bio.h> 16#include <linux/bio.h>
17#include <linux/buffer_head.h>
17#include <linux/mempool.h> 18#include <linux/mempool.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/idr.h> 20#include <linux/idr.h>
@@ -24,16 +25,6 @@
24 25
25#define DM_MSG_PREFIX "core" 26#define DM_MSG_PREFIX "core"
26 27
27#ifdef CONFIG_PRINTK
28/*
29 * ratelimit state to be used in DMXXX_LIMIT().
30 */
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32 DEFAULT_RATELIMIT_INTERVAL,
33 DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
37/* 28/*
38 * Cookies are numeric values sent with CHANGE and REMOVE 29 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device. 30 * uevents while resuming, removing or renaming the device.
@@ -63,6 +54,17 @@ struct dm_io {
63}; 54};
64 55
65/* 56/*
57 * For bio-based dm.
58 * One of these is allocated per target within a bio. Hopefully
59 * this will be simplified out one day.
60 */
61struct dm_target_io {
62 struct dm_io *io;
63 struct dm_target *ti;
64 union map_info info;
65};
66
67/*
66 * For request-based dm. 68 * For request-based dm.
67 * One of these is allocated per request. 69 * One of these is allocated per request.
68 */ 70 */
@@ -75,17 +77,12 @@ struct dm_rq_target_io {
75}; 77};
76 78
77/* 79/*
78 * For request-based dm - the bio clones we allocate are embedded in these 80 * For request-based dm.
79 * structs. 81 * One of these is allocated per bio.
80 *
81 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
82 * the bioset is created - this means the bio has to come at the end of the
83 * struct.
84 */ 82 */
85struct dm_rq_clone_bio_info { 83struct dm_rq_clone_bio_info {
86 struct bio *orig; 84 struct bio *orig;
87 struct dm_rq_target_io *tio; 85 struct dm_rq_target_io *tio;
88 struct bio clone;
89}; 86};
90 87
91union map_info *dm_get_mapinfo(struct bio *bio) 88union map_info *dm_get_mapinfo(struct bio *bio)
@@ -133,8 +130,6 @@ struct mapped_device {
133 /* Protect queue and type against concurrent access. */ 130 /* Protect queue and type against concurrent access. */
134 struct mutex type_lock; 131 struct mutex type_lock;
135 132
136 struct target_type *immutable_target_type;
137
138 struct gendisk *disk; 133 struct gendisk *disk;
139 char name[16]; 134 char name[16];
140 135
@@ -185,6 +180,9 @@ struct mapped_device {
185 /* forced geometry settings */ 180 /* forced geometry settings */
186 struct hd_geometry geometry; 181 struct hd_geometry geometry;
187 182
183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn;
185
188 /* sysfs handle */ 186 /* sysfs handle */
189 struct kobject kobj; 187 struct kobject kobj;
190 188
@@ -203,12 +201,8 @@ struct dm_md_mempools {
203 201
204#define MIN_IOS 256 202#define MIN_IOS 256
205static struct kmem_cache *_io_cache; 203static struct kmem_cache *_io_cache;
204static struct kmem_cache *_tio_cache;
206static struct kmem_cache *_rq_tio_cache; 205static struct kmem_cache *_rq_tio_cache;
207
208/*
209 * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
210 * still used for _io_cache, I'm leaving this for a later cleanup
211 */
212static struct kmem_cache *_rq_bio_info_cache; 206static struct kmem_cache *_rq_bio_info_cache;
213 207
214static int __init local_init(void) 208static int __init local_init(void)
@@ -220,9 +214,14 @@ static int __init local_init(void)
220 if (!_io_cache) 214 if (!_io_cache)
221 return r; 215 return r;
222 216
217 /* allocate a slab for the target ios */
218 _tio_cache = KMEM_CACHE(dm_target_io, 0);
219 if (!_tio_cache)
220 goto out_free_io_cache;
221
223 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
224 if (!_rq_tio_cache) 223 if (!_rq_tio_cache)
225 goto out_free_io_cache; 224 goto out_free_tio_cache;
226 225
227 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
228 if (!_rq_bio_info_cache) 227 if (!_rq_bio_info_cache)
@@ -248,6 +247,8 @@ out_free_rq_bio_info_cache:
248 kmem_cache_destroy(_rq_bio_info_cache); 247 kmem_cache_destroy(_rq_bio_info_cache);
249out_free_rq_tio_cache: 248out_free_rq_tio_cache:
250 kmem_cache_destroy(_rq_tio_cache); 249 kmem_cache_destroy(_rq_tio_cache);
250out_free_tio_cache:
251 kmem_cache_destroy(_tio_cache);
251out_free_io_cache: 252out_free_io_cache:
252 kmem_cache_destroy(_io_cache); 253 kmem_cache_destroy(_io_cache);
253 254
@@ -258,6 +259,7 @@ static void local_exit(void)
258{ 259{
259 kmem_cache_destroy(_rq_bio_info_cache); 260 kmem_cache_destroy(_rq_bio_info_cache);
260 kmem_cache_destroy(_rq_tio_cache); 261 kmem_cache_destroy(_rq_tio_cache);
262 kmem_cache_destroy(_tio_cache);
261 kmem_cache_destroy(_io_cache); 263 kmem_cache_destroy(_io_cache);
262 unregister_blkdev(_major, _name); 264 unregister_blkdev(_major, _name);
263 dm_uevent_exit(); 265 dm_uevent_exit();
@@ -443,7 +445,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
443 445
444static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 446static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
445{ 447{
446 bio_put(&tio->clone); 448 mempool_free(tio, md->tio_pool);
447} 449}
448 450
449static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 451static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
@@ -457,6 +459,16 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
457 mempool_free(tio, tio->md->tio_pool); 459 mempool_free(tio, tio->md->tio_pool);
458} 460}
459 461
462static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
463{
464 return mempool_alloc(md->io_pool, GFP_ATOMIC);
465}
466
467static void free_bio_info(struct dm_rq_clone_bio_info *info)
468{
469 mempool_free(info, info->tio->md->io_pool);
470}
471
460static int md_in_flight(struct mapped_device *md) 472static int md_in_flight(struct mapped_device *md)
461{ 473{
462 return atomic_read(&md->pending[READ]) + 474 return atomic_read(&md->pending[READ]) +
@@ -645,7 +657,7 @@ static void clone_endio(struct bio *bio, int error)
645 error = -EIO; 657 error = -EIO;
646 658
647 if (endio) { 659 if (endio) {
648 r = endio(tio->ti, bio, error); 660 r = endio(tio->ti, bio, error, &tio->info);
649 if (r < 0 || r == DM_ENDIO_REQUEUE) 661 if (r < 0 || r == DM_ENDIO_REQUEUE)
650 /* 662 /*
651 * error and requeue request are handled 663 * error and requeue request are handled
@@ -661,7 +673,13 @@ static void clone_endio(struct bio *bio, int error)
661 } 673 }
662 } 674 }
663 675
676 /*
677 * Store md for cleanup instead of tio which is about to get freed.
678 */
679 bio->bi_private = md->bs;
680
664 free_tio(md, tio); 681 free_tio(md, tio);
682 bio_put(bio);
665 dec_pending(io, error); 683 dec_pending(io, error);
666} 684}
667 685
@@ -728,14 +746,8 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)
728 if (!md_in_flight(md)) 746 if (!md_in_flight(md))
729 wake_up(&md->wait); 747 wake_up(&md->wait);
730 748
731 /*
732 * Run this off this callpath, as drivers could invoke end_io while
733 * inside their request_fn (and holding the queue lock). Calling
734 * back into ->request_fn() could deadlock attempting to grab the
735 * queue lock again.
736 */
737 if (run_queue) 749 if (run_queue)
738 blk_run_queue_async(md->queue); 750 blk_run_queue(md->queue);
739 751
740 /* 752 /*
741 * dm_put() must be at the end of this function. See the comment above 753 * dm_put() must be at the end of this function. See the comment above
@@ -845,14 +857,10 @@ static void dm_done(struct request *clone, int error, bool mapped)
845{ 857{
846 int r = error; 858 int r = error;
847 struct dm_rq_target_io *tio = clone->end_io_data; 859 struct dm_rq_target_io *tio = clone->end_io_data;
848 dm_request_endio_fn rq_end_io = NULL; 860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
849 861
850 if (tio->ti) { 862 if (mapped && rq_end_io)
851 rq_end_io = tio->ti->type->rq_end_io; 863 r = rq_end_io(tio->ti, clone, error, &tio->info);
852
853 if (mapped && rq_end_io)
854 r = rq_end_io(tio->ti, clone, error, &tio->info);
855 }
856 864
857 if (r <= 0) 865 if (r <= 0)
858 /* The target wants to complete the I/O */ 866 /* The target wants to complete the I/O */
@@ -952,47 +960,28 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti
952static sector_t max_io_len(sector_t sector, struct dm_target *ti) 960static sector_t max_io_len(sector_t sector, struct dm_target *ti)
953{ 961{
954 sector_t len = max_io_len_target_boundary(sector, ti); 962 sector_t len = max_io_len_target_boundary(sector, ti);
955 sector_t offset, max_len;
956 963
957 /* 964 /*
958 * Does the target need to split even further? 965 * Does the target need to split even further ?
959 */ 966 */
960 if (ti->max_io_len) { 967 if (ti->split_io) {
961 offset = dm_target_offset(ti, sector); 968 sector_t boundary;
962 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 969 sector_t offset = dm_target_offset(ti, sector);
963 max_len = sector_div(offset, ti->max_io_len); 970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
964 else 971 - offset;
965 max_len = offset & (ti->max_io_len - 1); 972 if (len > boundary)
966 max_len = ti->max_io_len - max_len; 973 len = boundary;
967
968 if (len > max_len)
969 len = max_len;
970 } 974 }
971 975
972 return len; 976 return len;
973} 977}
974 978
975int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 979static void __map_bio(struct dm_target *ti, struct bio *clone,
976{ 980 struct dm_target_io *tio)
977 if (len > UINT_MAX) {
978 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
979 (unsigned long long)len, UINT_MAX);
980 ti->error = "Maximum size of target IO is too large";
981 return -EINVAL;
982 }
983
984 ti->max_io_len = (uint32_t) len;
985
986 return 0;
987}
988EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
989
990static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
991{ 981{
992 int r; 982 int r;
993 sector_t sector; 983 sector_t sector;
994 struct mapped_device *md; 984 struct mapped_device *md;
995 struct bio *clone = &tio->clone;
996 985
997 clone->bi_end_io = clone_endio; 986 clone->bi_end_io = clone_endio;
998 clone->bi_private = tio; 987 clone->bi_private = tio;
@@ -1004,7 +993,7 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
1004 */ 993 */
1005 atomic_inc(&tio->io->io_count); 994 atomic_inc(&tio->io->io_count);
1006 sector = clone->bi_sector; 995 sector = clone->bi_sector;
1007 r = ti->type->map(ti, clone); 996 r = ti->type->map(ti, clone, &tio->info);
1008 if (r == DM_MAPIO_REMAPPED) { 997 if (r == DM_MAPIO_REMAPPED) {
1009 /* the bio has been remapped so dispatch it */ 998 /* the bio has been remapped so dispatch it */
1010 999
@@ -1016,6 +1005,11 @@ static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
1016 /* error the io and bail out, or requeue it if needed */ 1005 /* error the io and bail out, or requeue it if needed */
1017 md = tio->io->md; 1006 md = tio->io->md;
1018 dec_pending(tio->io, r); 1007 dec_pending(tio->io, r);
1008 /*
1009 * Store bio_set for cleanup.
1010 */
1011 clone->bi_private = md->bs;
1012 bio_put(clone);
1019 free_tio(md, tio); 1013 free_tio(md, tio);
1020 } else if (r) { 1014 } else if (r) {
1021 DMWARN("unimplemented target map return value: %d", r); 1015 DMWARN("unimplemented target map return value: %d", r);
@@ -1033,16 +1027,25 @@ struct clone_info {
1033 unsigned short idx; 1027 unsigned short idx;
1034}; 1028};
1035 1029
1030static void dm_bio_destructor(struct bio *bio)
1031{
1032 struct bio_set *bs = bio->bi_private;
1033
1034 bio_free(bio, bs);
1035}
1036
1036/* 1037/*
1037 * Creates a little bio that just does part of a bvec. 1038 * Creates a little bio that just does part of a bvec.
1038 */ 1039 */
1039static void split_bvec(struct dm_target_io *tio, struct bio *bio, 1040static struct bio *split_bvec(struct bio *bio, sector_t sector,
1040 sector_t sector, unsigned short idx, unsigned int offset, 1041 unsigned short idx, unsigned int offset,
1041 unsigned int len, struct bio_set *bs) 1042 unsigned int len, struct bio_set *bs)
1042{ 1043{
1043 struct bio *clone = &tio->clone; 1044 struct bio *clone;
1044 struct bio_vec *bv = bio->bi_io_vec + idx; 1045 struct bio_vec *bv = bio->bi_io_vec + idx;
1045 1046
1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1048 clone->bi_destructor = dm_bio_destructor;
1046 *clone->bi_io_vec = *bv; 1049 *clone->bi_io_vec = *bv;
1047 1050
1048 clone->bi_sector = sector; 1051 clone->bi_sector = sector;
@@ -1055,23 +1058,26 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio,
1055 clone->bi_flags |= 1 << BIO_CLONED; 1058 clone->bi_flags |= 1 << BIO_CLONED;
1056 1059
1057 if (bio_integrity(bio)) { 1060 if (bio_integrity(bio)) {
1058 bio_integrity_clone(clone, bio, GFP_NOIO); 1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1059 bio_integrity_trim(clone, 1062 bio_integrity_trim(clone,
1060 bio_sector_offset(bio, idx, offset), len); 1063 bio_sector_offset(bio, idx, offset), len);
1061 } 1064 }
1065
1066 return clone;
1062} 1067}
1063 1068
1064/* 1069/*
1065 * Creates a bio that consists of range of complete bvecs. 1070 * Creates a bio that consists of range of complete bvecs.
1066 */ 1071 */
1067static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1072static struct bio *clone_bio(struct bio *bio, sector_t sector,
1068 sector_t sector, unsigned short idx, 1073 unsigned short idx, unsigned short bv_count,
1069 unsigned short bv_count, unsigned int len, 1074 unsigned int len, struct bio_set *bs)
1070 struct bio_set *bs)
1071{ 1075{
1072 struct bio *clone = &tio->clone; 1076 struct bio *clone;
1073 1077
1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1074 __bio_clone(clone, bio); 1079 __bio_clone(clone, bio);
1080 clone->bi_destructor = dm_bio_destructor;
1075 clone->bi_sector = sector; 1081 clone->bi_sector = sector;
1076 clone->bi_idx = idx; 1082 clone->bi_idx = idx;
1077 clone->bi_vcnt = idx + bv_count; 1083 clone->bi_vcnt = idx + bv_count;
@@ -1079,27 +1085,24 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1079 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1080 1086
1081 if (bio_integrity(bio)) { 1087 if (bio_integrity(bio)) {
1082 bio_integrity_clone(clone, bio, GFP_NOIO); 1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1083 1089
1084 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1085 bio_integrity_trim(clone, 1091 bio_integrity_trim(clone,
1086 bio_sector_offset(bio, idx, 0), len); 1092 bio_sector_offset(bio, idx, 0), len);
1087 } 1093 }
1094
1095 return clone;
1088} 1096}
1089 1097
1090static struct dm_target_io *alloc_tio(struct clone_info *ci, 1098static struct dm_target_io *alloc_tio(struct clone_info *ci,
1091 struct dm_target *ti, int nr_iovecs) 1099 struct dm_target *ti)
1092{ 1100{
1093 struct dm_target_io *tio; 1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1094 struct bio *clone;
1095
1096 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1097 tio = container_of(clone, struct dm_target_io, clone);
1098 1102
1099 tio->io = ci->io; 1103 tio->io = ci->io;
1100 tio->ti = ti; 1104 tio->ti = ti;
1101 memset(&tio->info, 0, sizeof(tio->info)); 1105 memset(&tio->info, 0, sizeof(tio->info));
1102 tio->target_request_nr = 0;
1103 1106
1104 return tio; 1107 return tio;
1105} 1108}
@@ -1107,24 +1110,25 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
1107static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1110static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1108 unsigned request_nr, sector_t len) 1111 unsigned request_nr, sector_t len)
1109{ 1112{
1110 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); 1113 struct dm_target_io *tio = alloc_tio(ci, ti);
1111 struct bio *clone = &tio->clone; 1114 struct bio *clone;
1112 1115
1113 tio->target_request_nr = request_nr; 1116 tio->info.target_request_nr = request_nr;
1114 1117
1115 /* 1118 /*
1116 * Discard requests require the bio's inline iovecs be initialized. 1119 * Discard requests require the bio's inline iovecs be initialized.
1117 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1118 * and discard, so no need for concern about wasted bvec allocations. 1121 * and discard, so no need for concern about wasted bvec allocations.
1119 */ 1122 */
1120 1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1121 __bio_clone(clone, ci->bio); 1124 __bio_clone(clone, ci->bio);
1125 clone->bi_destructor = dm_bio_destructor;
1122 if (len) { 1126 if (len) {
1123 clone->bi_sector = ci->sector; 1127 clone->bi_sector = ci->sector;
1124 clone->bi_size = to_bytes(len); 1128 clone->bi_size = to_bytes(len);
1125 } 1129 }
1126 1130
1127 __map_bio(ti, tio); 1131 __map_bio(ti, clone, tio);
1128} 1132}
1129 1133
1130static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1134static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
@@ -1153,38 +1157,18 @@ static int __clone_and_map_empty_flush(struct clone_info *ci)
1153 */ 1157 */
1154static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1158static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1155{ 1159{
1156 struct bio *bio = ci->bio; 1160 struct bio *clone, *bio = ci->bio;
1157 struct dm_target_io *tio; 1161 struct dm_target_io *tio;
1158 1162
1159 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1163 tio = alloc_tio(ci, ti);
1160 clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, 1164 clone = clone_bio(bio, ci->sector, ci->idx,
1161 ci->sector_count, ci->md->bs); 1165 bio->bi_vcnt - ci->idx, ci->sector_count,
1162 __map_bio(ti, tio); 1166 ci->md->bs);
1167 __map_bio(ti, clone, tio);
1163 ci->sector_count = 0; 1168 ci->sector_count = 0;
1164} 1169}
1165 1170
1166typedef unsigned (*get_num_requests_fn)(struct dm_target *ti); 1171static int __clone_and_map_discard(struct clone_info *ci)
1167
1168static unsigned get_num_discard_requests(struct dm_target *ti)
1169{
1170 return ti->num_discard_requests;
1171}
1172
1173static unsigned get_num_write_same_requests(struct dm_target *ti)
1174{
1175 return ti->num_write_same_requests;
1176}
1177
1178typedef bool (*is_split_required_fn)(struct dm_target *ti);
1179
1180static bool is_split_required_for_discard(struct dm_target *ti)
1181{
1182 return ti->split_discard_requests;
1183}
1184
1185static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1186 get_num_requests_fn get_num_requests,
1187 is_split_required_fn is_split_required)
1188{ 1172{
1189 struct dm_target *ti; 1173 struct dm_target *ti;
1190 sector_t len; 1174 sector_t len;
@@ -1195,18 +1179,15 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1195 return -EIO; 1179 return -EIO;
1196 1180
1197 /* 1181 /*
1198 * Even though the device advertised support for this type of 1182 * Even though the device advertised discard support,
1199 * request, that does not mean every target supports it, and 1183 * that does not mean every target supports it, and
1200 * reconfiguration might also have changed that since the 1184 * reconfiguration might also have changed that since the
1201 * check was performed. 1185 * check was performed.
1202 */ 1186 */
1203 if (!get_num_requests || !get_num_requests(ti)) 1187 if (!ti->num_discard_requests)
1204 return -EOPNOTSUPP; 1188 return -EOPNOTSUPP;
1205 1189
1206 if (is_split_required && !is_split_required(ti)) 1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1207 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1208 else
1209 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1210 1191
1211 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1212 1193
@@ -1216,28 +1197,15 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1216 return 0; 1197 return 0;
1217} 1198}
1218 1199
1219static int __clone_and_map_discard(struct clone_info *ci)
1220{
1221 return __clone_and_map_changing_extent_only(ci, get_num_discard_requests,
1222 is_split_required_for_discard);
1223}
1224
1225static int __clone_and_map_write_same(struct clone_info *ci)
1226{
1227 return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL);
1228}
1229
1230static int __clone_and_map(struct clone_info *ci) 1200static int __clone_and_map(struct clone_info *ci)
1231{ 1201{
1232 struct bio *bio = ci->bio; 1202 struct bio *clone, *bio = ci->bio;
1233 struct dm_target *ti; 1203 struct dm_target *ti;
1234 sector_t len = 0, max; 1204 sector_t len = 0, max;
1235 struct dm_target_io *tio; 1205 struct dm_target_io *tio;
1236 1206
1237 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1207 if (unlikely(bio->bi_rw & REQ_DISCARD))
1238 return __clone_and_map_discard(ci); 1208 return __clone_and_map_discard(ci);
1239 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1240 return __clone_and_map_write_same(ci);
1241 1209
1242 ti = dm_table_find_target(ci->map, ci->sector); 1210 ti = dm_table_find_target(ci->map, ci->sector);
1243 if (!dm_target_is_valid(ti)) 1211 if (!dm_target_is_valid(ti))
@@ -1271,10 +1239,10 @@ static int __clone_and_map(struct clone_info *ci)
1271 len += bv_len; 1239 len += bv_len;
1272 } 1240 }
1273 1241
1274 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1242 tio = alloc_tio(ci, ti);
1275 clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, 1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1276 ci->md->bs); 1244 ci->md->bs);
1277 __map_bio(ti, tio); 1245 __map_bio(ti, clone, tio);
1278 1246
1279 ci->sector += len; 1247 ci->sector += len;
1280 ci->sector_count -= len; 1248 ci->sector_count -= len;
@@ -1299,11 +1267,12 @@ static int __clone_and_map(struct clone_info *ci)
1299 1267
1300 len = min(remaining, max); 1268 len = min(remaining, max);
1301 1269
1302 tio = alloc_tio(ci, ti, 1); 1270 tio = alloc_tio(ci, ti);
1303 split_bvec(tio, bio, ci->sector, ci->idx, 1271 clone = split_bvec(bio, ci->sector, ci->idx,
1304 bv->bv_offset + offset, len, ci->md->bs); 1272 bv->bv_offset + offset, len,
1273 ci->md->bs);
1305 1274
1306 __map_bio(ti, tio); 1275 __map_bio(ti, clone, tio);
1307 1276
1308 ci->sector += len; 1277 ci->sector += len;
1309 ci->sector_count -= len; 1278 ci->sector_count -= len;
@@ -1422,7 +1391,7 @@ out:
1422 * The request function that just remaps the bio built up by 1391 * The request function that just remaps the bio built up by
1423 * dm_merge_bvec. 1392 * dm_merge_bvec.
1424 */ 1393 */
1425static void _dm_request(struct request_queue *q, struct bio *bio) 1394static int _dm_request(struct request_queue *q, struct bio *bio)
1426{ 1395{
1427 int rw = bio_data_dir(bio); 1396 int rw = bio_data_dir(bio);
1428 struct mapped_device *md = q->queuedata; 1397 struct mapped_device *md = q->queuedata;
@@ -1443,12 +1412,19 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1443 queue_io(md, bio); 1412 queue_io(md, bio);
1444 else 1413 else
1445 bio_io_error(bio); 1414 bio_io_error(bio);
1446 return; 1415 return 0;
1447 } 1416 }
1448 1417
1449 __split_and_process_bio(md, bio); 1418 __split_and_process_bio(md, bio);
1450 up_read(&md->io_lock); 1419 up_read(&md->io_lock);
1451 return; 1420 return 0;
1421}
1422
1423static int dm_make_request(struct request_queue *q, struct bio *bio)
1424{
1425 struct mapped_device *md = q->queuedata;
1426
1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1452} 1428}
1453 1429
1454static int dm_request_based(struct mapped_device *md) 1430static int dm_request_based(struct mapped_device *md)
@@ -1456,14 +1432,14 @@ static int dm_request_based(struct mapped_device *md)
1456 return blk_queue_stackable(md->queue); 1432 return blk_queue_stackable(md->queue);
1457} 1433}
1458 1434
1459static void dm_request(struct request_queue *q, struct bio *bio) 1435static int dm_request(struct request_queue *q, struct bio *bio)
1460{ 1436{
1461 struct mapped_device *md = q->queuedata; 1437 struct mapped_device *md = q->queuedata;
1462 1438
1463 if (dm_request_based(md)) 1439 if (dm_request_based(md))
1464 blk_queue_bio(q, bio); 1440 return dm_make_request(q, bio);
1465 else 1441
1466 _dm_request(q, bio); 1442 return _dm_request(q, bio);
1467} 1443}
1468 1444
1469void dm_dispatch_request(struct request *rq) 1445void dm_dispatch_request(struct request *rq)
@@ -1480,17 +1456,30 @@ void dm_dispatch_request(struct request *rq)
1480} 1456}
1481EXPORT_SYMBOL_GPL(dm_dispatch_request); 1457EXPORT_SYMBOL_GPL(dm_dispatch_request);
1482 1458
1459static void dm_rq_bio_destructor(struct bio *bio)
1460{
1461 struct dm_rq_clone_bio_info *info = bio->bi_private;
1462 struct mapped_device *md = info->tio->md;
1463
1464 free_bio_info(info);
1465 bio_free(bio, md->bs);
1466}
1467
1483static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1468static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1484 void *data) 1469 void *data)
1485{ 1470{
1486 struct dm_rq_target_io *tio = data; 1471 struct dm_rq_target_io *tio = data;
1487 struct dm_rq_clone_bio_info *info = 1472 struct mapped_device *md = tio->md;
1488 container_of(bio, struct dm_rq_clone_bio_info, clone); 1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1474
1475 if (!info)
1476 return -ENOMEM;
1489 1477
1490 info->orig = bio_orig; 1478 info->orig = bio_orig;
1491 info->tio = tio; 1479 info->tio = tio;
1492 bio->bi_end_io = end_clone_bio; 1480 bio->bi_end_io = end_clone_bio;
1493 bio->bi_private = info; 1481 bio->bi_private = info;
1482 bio->bi_destructor = dm_rq_bio_destructor;
1494 1483
1495 return 0; 1484 return 0;
1496} 1485}
@@ -1575,6 +1564,15 @@ static int map_request(struct dm_target *ti, struct request *clone,
1575 int r, requeued = 0; 1564 int r, requeued = 0;
1576 struct dm_rq_target_io *tio = clone->end_io_data; 1565 struct dm_rq_target_io *tio = clone->end_io_data;
1577 1566
1567 /*
1568 * Hold the md reference here for the in-flight I/O.
1569 * We can't rely on the reference count by device opener,
1570 * because the device may be closed during the request completion
1571 * when all bios are completed.
1572 * See the comment in rq_completed() too.
1573 */
1574 dm_get(md);
1575
1578 tio->ti = ti; 1576 tio->ti = ti;
1579 r = ti->type->map_rq(ti, clone, &tio->info); 1577 r = ti->type->map_rq(ti, clone, &tio->info);
1580 switch (r) { 1578 switch (r) {
@@ -1606,26 +1604,6 @@ static int map_request(struct dm_target *ti, struct request *clone,
1606 return requeued; 1604 return requeued;
1607} 1605}
1608 1606
1609static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
1610{
1611 struct request *clone;
1612
1613 blk_start_request(orig);
1614 clone = orig->special;
1615 atomic_inc(&md->pending[rq_data_dir(clone)]);
1616
1617 /*
1618 * Hold the md reference here for the in-flight I/O.
1619 * We can't rely on the reference count by device opener,
1620 * because the device may be closed during the request completion
1621 * when all bios are completed.
1622 * See the comment in rq_completed() too.
1623 */
1624 dm_get(md);
1625
1626 return clone;
1627}
1628
1629/* 1607/*
1630 * q->request_fn for request-based dm. 1608 * q->request_fn for request-based dm.
1631 * Called with the queue lock held. 1609 * Called with the queue lock held.
@@ -1655,21 +1633,14 @@ static void dm_request_fn(struct request_queue *q)
1655 pos = blk_rq_pos(rq); 1633 pos = blk_rq_pos(rq);
1656 1634
1657 ti = dm_table_find_target(map, pos); 1635 ti = dm_table_find_target(map, pos);
1658 if (!dm_target_is_valid(ti)) { 1636 BUG_ON(!dm_target_is_valid(ti));
1659 /*
1660 * Must perform setup, that dm_done() requires,
1661 * before calling dm_kill_unmapped_request
1662 */
1663 DMERR_LIMIT("request attempted access beyond the end of device");
1664 clone = dm_start_request(md, rq);
1665 dm_kill_unmapped_request(clone, -EIO);
1666 continue;
1667 }
1668 1637
1669 if (ti->type->busy && ti->type->busy(ti)) 1638 if (ti->type->busy && ti->type->busy(ti))
1670 goto delay_and_out; 1639 goto delay_and_out;
1671 1640
1672 clone = dm_start_request(md, rq); 1641 blk_start_request(rq);
1642 clone = rq->special;
1643 atomic_inc(&md->pending[rq_data_dir(clone)]);
1673 1644
1674 spin_unlock(q->queue_lock); 1645 spin_unlock(q->queue_lock);
1675 if (map_request(ti, clone, md)) 1646 if (map_request(ti, clone, md))
@@ -1689,6 +1660,8 @@ delay_and_out:
1689 blk_delay_queue(q, HZ / 10); 1660 blk_delay_queue(q, HZ / 10);
1690out: 1661out:
1691 dm_table_put(map); 1662 dm_table_put(map);
1663
1664 return;
1692} 1665}
1693 1666
1694int dm_underlying_device_busy(struct request_queue *q) 1667int dm_underlying_device_busy(struct request_queue *q)
@@ -1969,20 +1942,13 @@ static void free_dev(struct mapped_device *md)
1969 1942
1970static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1943static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1971{ 1944{
1972 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1945 struct dm_md_mempools *p;
1973 1946
1974 if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) { 1947 if (md->io_pool && md->tio_pool && md->bs)
1975 /* 1948 /* the md already has necessary mempools */
1976 * The md already has necessary mempools. Reload just the
1977 * bioset because front_pad may have changed because
1978 * a different table was loaded.
1979 */
1980 bioset_free(md->bs);
1981 md->bs = p->bs;
1982 p->bs = NULL;
1983 goto out; 1949 goto out;
1984 }
1985 1950
1951 p = dm_table_get_md_mempools(t);
1986 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1952 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1987 1953
1988 md->io_pool = p->io_pool; 1954 md->io_pool = p->io_pool;
@@ -2120,8 +2086,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2120 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2121 old_map = md->map; 2087 old_map = md->map;
2122 md->map = t; 2088 md->map = t;
2123 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2124
2125 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2126 if (merge_is_optional) 2090 if (merge_is_optional)
2127 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
@@ -2192,11 +2156,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
2192 return md->type; 2156 return md->type;
2193} 2157}
2194 2158
2195struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2196{
2197 return md->immutable_target_type;
2198}
2199
2200/* 2159/*
2201 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2160 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2202 */ 2161 */
@@ -2213,6 +2172,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2213 return 0; 2172 return 0;
2214 2173
2215 md->queue = q; 2174 md->queue = q;
2175 md->saved_make_request_fn = md->queue->make_request_fn;
2216 dm_init_md_queue(md); 2176 dm_init_md_queue(md);
2217 blk_queue_softirq_done(md->queue, dm_softirq_done); 2177 blk_queue_softirq_done(md->queue, dm_softirq_done);
2218 blk_queue_prep_rq(md->queue, dm_prep_fn); 2178 blk_queue_prep_rq(md->queue, dm_prep_fn);
@@ -2271,7 +2231,6 @@ struct mapped_device *dm_get_md(dev_t dev)
2271 2231
2272 return md; 2232 return md;
2273} 2233}
2274EXPORT_SYMBOL_GPL(dm_get_md);
2275 2234
2276void *dm_get_mdptr(struct mapped_device *md) 2235void *dm_get_mdptr(struct mapped_device *md)
2277{ 2236{
@@ -2357,6 +2316,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2357 while (1) { 2316 while (1) {
2358 set_current_state(interruptible); 2317 set_current_state(interruptible);
2359 2318
2319 smp_mb();
2360 if (!md_in_flight(md)) 2320 if (!md_in_flight(md))
2361 break; 2321 break;
2362 2322
@@ -2419,7 +2379,7 @@ static void dm_queue_flush(struct mapped_device *md)
2419 */ 2379 */
2420struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2380struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2421{ 2381{
2422 struct dm_table *live_map, *map = ERR_PTR(-EINVAL); 2382 struct dm_table *map = ERR_PTR(-EINVAL);
2423 struct queue_limits limits; 2383 struct queue_limits limits;
2424 int r; 2384 int r;
2425 2385
@@ -2429,19 +2389,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2429 if (!dm_suspended_md(md)) 2389 if (!dm_suspended_md(md))
2430 goto out; 2390 goto out;
2431 2391
2432 /*
2433 * If the new table has no data devices, retain the existing limits.
2434 * This helps multipath with queue_if_no_path if all paths disappear,
2435 * then new I/O is queued based on these limits, and then some paths
2436 * reappear.
2437 */
2438 if (dm_table_has_no_data_devices(table)) {
2439 live_map = dm_get_live_table(md);
2440 if (live_map)
2441 limits = md->queue->limits;
2442 dm_table_put(live_map);
2443 }
2444
2445 r = dm_calculate_queue_limits(table, &limits); 2392 r = dm_calculate_queue_limits(table, &limits);
2446 if (r) { 2393 if (r) {
2447 map = ERR_PTR(r); 2394 map = ERR_PTR(r);
@@ -2741,7 +2688,7 @@ int dm_noflush_suspending(struct dm_target *ti)
2741} 2688}
2742EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2689EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2743 2690
2744struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 2691struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2745{ 2692{
2746 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2693 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2747 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; 2694 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
@@ -2749,26 +2696,19 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
2749 if (!pools) 2696 if (!pools)
2750 return NULL; 2697 return NULL;
2751 2698
2752 per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io));
2753
2754 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2699 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2755 mempool_create_slab_pool(MIN_IOS, _io_cache) : 2700 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2756 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); 2701 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2757 if (!pools->io_pool) 2702 if (!pools->io_pool)
2758 goto free_pools_and_out; 2703 goto free_pools_and_out;
2759 2704
2760 pools->tio_pool = NULL; 2705 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2761 if (type == DM_TYPE_REQUEST_BASED) { 2706 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2762 pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); 2707 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2763 if (!pools->tio_pool) 2708 if (!pools->tio_pool)
2764 goto free_io_pool_and_out; 2709 goto free_io_pool_and_out;
2765 }
2766 2710
2767 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2711 pools->bs = bioset_create(pool_size, 0);
2768 bioset_create(pool_size,
2769 per_bio_data_size + offsetof(struct dm_target_io, clone)) :
2770 bioset_create(pool_size,
2771 offsetof(struct dm_rq_clone_bio_info, clone));
2772 if (!pools->bs) 2712 if (!pools->bs)
2773 goto free_tio_pool_and_out; 2713 goto free_tio_pool_and_out;
2774 2714
@@ -2781,8 +2721,7 @@ free_bioset_and_out:
2781 bioset_free(pools->bs); 2721 bioset_free(pools->bs);
2782 2722
2783free_tio_pool_and_out: 2723free_tio_pool_and_out:
2784 if (pools->tio_pool) 2724 mempool_destroy(pools->tio_pool);
2785 mempool_destroy(pools->tio_pool);
2786 2725
2787free_io_pool_and_out: 2726free_io_pool_and_out:
2788 mempool_destroy(pools->io_pool); 2727 mempool_destroy(pools->io_pool);
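
The dm.c hunks above revert dm_alloc_md_mempools() to the older scheme: no per_bio_data_size argument, a tio_pool created for both bio-based and request-based devices, and a plain bioset. Either way, the point of preallocating mempools is that I/O bookkeeping structures can still be obtained under memory pressure. Below is a minimal user-space sketch of that reserve-then-fallback idea; it is only an analogy, not DM code, and the names (tiny_pool and friends) are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define POOL_MIN 16

struct tiny_pool {
        void *slots[POOL_MIN];          /* preallocated reserve objects */
        int nr_free;
        size_t obj_size;
};

static struct tiny_pool *tiny_pool_create(size_t obj_size)
{
        struct tiny_pool *p = calloc(1, sizeof(*p));

        if (!p)
                return NULL;
        p->obj_size = obj_size;
        /* Fill the reserve up front, roughly what mempool_create_slab_pool(MIN_IOS, ...) does. */
        for (p->nr_free = 0; p->nr_free < POOL_MIN; p->nr_free++) {
                p->slots[p->nr_free] = malloc(obj_size);
                if (!p->slots[p->nr_free])
                        break;
        }
        return p;
}

static void *tiny_pool_alloc(struct tiny_pool *p)
{
        /* Prefer the reserve; fall back to the general allocator. */
        if (p->nr_free > 0)
                return p->slots[--p->nr_free];
        return malloc(p->obj_size);
}

static void tiny_pool_free(struct tiny_pool *p, void *obj)
{
        /* Refill the reserve before giving memory back to the system. */
        if (p->nr_free < POOL_MIN)
                p->slots[p->nr_free++] = obj;
        else
                free(obj);
}

int main(void)
{
        struct tiny_pool *p = tiny_pool_create(64);
        void *io;

        if (!p)
                return 1;
        io = tiny_pool_alloc(p);
        tiny_pool_free(p, io);
        printf("reserve objects available: %d\n", p->nr_free);
        return 0;
}

The real mempool_alloc() goes further: when the reserve is empty and the allocation may block, it waits for an element to be returned rather than failing, which is what guarantees forward progress during writeback.
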
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 45b97da1bd0..6745dbd278a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,11 +23,6 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Status feature flags
27 */
28#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
29
30/*
31 * Type of table and mapped_device's mempool 26 * Type of table and mapped_device's mempool
32 */ 27 */
33#define DM_TYPE_NONE 0 28#define DM_TYPE_NONE 0
@@ -54,7 +49,6 @@ void dm_table_event_callback(struct dm_table *t,
54 void (*fn)(void *), void *context); 49 void (*fn)(void *), void *context);
55struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 50struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
56struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 51struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
57bool dm_table_has_no_data_devices(struct dm_table *table);
58int dm_calculate_queue_limits(struct dm_table *table, 52int dm_calculate_queue_limits(struct dm_table *table,
59 struct queue_limits *limits); 53 struct queue_limits *limits);
60void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 54void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -66,7 +60,6 @@ int dm_table_resume_targets(struct dm_table *t);
66int dm_table_any_congested(struct dm_table *t, int bdi_bits); 60int dm_table_any_congested(struct dm_table *t, int bdi_bits);
67int dm_table_any_busy_target(struct dm_table *t); 61int dm_table_any_busy_target(struct dm_table *t);
68unsigned dm_table_get_type(struct dm_table *t); 62unsigned dm_table_get_type(struct dm_table *t);
69struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
70bool dm_table_request_based(struct dm_table *t); 63bool dm_table_request_based(struct dm_table *t);
71bool dm_table_supports_discards(struct dm_table *t); 64bool dm_table_supports_discards(struct dm_table *t);
72int dm_table_alloc_md_mempools(struct dm_table *t); 65int dm_table_alloc_md_mempools(struct dm_table *t);
@@ -79,7 +72,6 @@ void dm_lock_md_type(struct mapped_device *md);
79void dm_unlock_md_type(struct mapped_device *md); 72void dm_unlock_md_type(struct mapped_device *md);
80void dm_set_md_type(struct mapped_device *md, unsigned type); 73void dm_set_md_type(struct mapped_device *md, unsigned type);
81unsigned dm_get_md_type(struct mapped_device *md); 74unsigned dm_get_md_type(struct mapped_device *md);
82struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
83 75
84int dm_setup_md_queue(struct mapped_device *md); 76int dm_setup_md_queue(struct mapped_device *md);
85 77
@@ -159,7 +151,7 @@ void dm_kcopyd_exit(void);
159/* 151/*
160 * Mempool operations 152 * Mempool operations
161 */ 153 */
162struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); 154struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
163void dm_free_md_mempools(struct dm_md_mempools *pools); 155void dm_free_md_mempools(struct dm_md_mempools *pools);
164 156
165#endif 157#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 5e7dc772f5d..23078dabb6d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -63,7 +63,6 @@
63 63
64#define MaxFault 50 64#define MaxFault 50
65#include <linux/blkdev.h> 65#include <linux/blkdev.h>
66#include <linux/module.h>
67#include <linux/raid/md_u.h> 66#include <linux/raid/md_u.h>
68#include <linux/slab.h> 67#include <linux/slab.h>
69#include "md.h" 68#include "md.h"
@@ -82,16 +81,16 @@ static void faulty_fail(struct bio *bio, int error)
82 bio_io_error(b); 81 bio_io_error(b);
83} 82}
84 83
85struct faulty_conf { 84typedef struct faulty_conf {
86 int period[Modes]; 85 int period[Modes];
87 atomic_t counters[Modes]; 86 atomic_t counters[Modes];
88 sector_t faults[MaxFault]; 87 sector_t faults[MaxFault];
89 int modes[MaxFault]; 88 int modes[MaxFault];
90 int nfaults; 89 int nfaults;
91 struct md_rdev *rdev; 90 mdk_rdev_t *rdev;
92}; 91} conf_t;
93 92
94static int check_mode(struct faulty_conf *conf, int mode) 93static int check_mode(conf_t *conf, int mode)
95{ 94{
96 if (conf->period[mode] == 0 && 95 if (conf->period[mode] == 0 &&
97 atomic_read(&conf->counters[mode]) <= 0) 96 atomic_read(&conf->counters[mode]) <= 0)
@@ -106,7 +105,7 @@ static int check_mode(struct faulty_conf *conf, int mode)
106 return 0; 105 return 0;
107} 106}
108 107
109static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) 108static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
110{ 109{
111 /* If we find a ReadFixable sector, we fix it ... */ 110 /* If we find a ReadFixable sector, we fix it ... */
112 int i; 111 int i;
@@ -130,7 +129,7 @@ static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end,
130 return 0; 129 return 0;
131} 130}
132 131
133static void add_sector(struct faulty_conf *conf, sector_t start, int mode) 132static void add_sector(conf_t *conf, sector_t start, int mode)
134{ 133{
135 int i; 134 int i;
136 int n = conf->nfaults; 135 int n = conf->nfaults;
@@ -170,9 +169,9 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
170 conf->nfaults = n+1; 169 conf->nfaults = n+1;
171} 170}
172 171
173static void make_request(struct mddev *mddev, struct bio *bio) 172static int make_request(mddev_t *mddev, struct bio *bio)
174{ 173{
175 struct faulty_conf *conf = mddev->private; 174 conf_t *conf = mddev->private;
176 int failit = 0; 175 int failit = 0;
177 176
178 if (bio_data_dir(bio) == WRITE) { 177 if (bio_data_dir(bio) == WRITE) {
@@ -182,7 +181,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
182 * just fail immediately 181 * just fail immediately
183 */ 182 */
184 bio_endio(bio, -EIO); 183 bio_endio(bio, -EIO);
185 return; 184 return 0;
186 } 185 }
187 186
188 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), 187 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
@@ -212,20 +211,20 @@ static void make_request(struct mddev *mddev, struct bio *bio)
212 } 211 }
213 if (failit) { 212 if (failit) {
214 struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); 213 struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
215
216 b->bi_bdev = conf->rdev->bdev; 214 b->bi_bdev = conf->rdev->bdev;
217 b->bi_private = bio; 215 b->bi_private = bio;
218 b->bi_end_io = faulty_fail; 216 b->bi_end_io = faulty_fail;
219 bio = b; 217 generic_make_request(b);
220 } else 218 return 0;
219 } else {
221 bio->bi_bdev = conf->rdev->bdev; 220 bio->bi_bdev = conf->rdev->bdev;
222 221 return 1;
223 generic_make_request(bio); 222 }
224} 223}
225 224
226static void status(struct seq_file *seq, struct mddev *mddev) 225static void status(struct seq_file *seq, mddev_t *mddev)
227{ 226{
228 struct faulty_conf *conf = mddev->private; 227 conf_t *conf = mddev->private;
229 int n; 228 int n;
230 229
231 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) 230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -256,11 +255,11 @@ static void status(struct seq_file *seq, struct mddev *mddev)
256} 255}
257 256
258 257
259static int reshape(struct mddev *mddev) 258static int reshape(mddev_t *mddev)
260{ 259{
261 int mode = mddev->new_layout & ModeMask; 260 int mode = mddev->new_layout & ModeMask;
262 int count = mddev->new_layout >> ModeShift; 261 int count = mddev->new_layout >> ModeShift;
263 struct faulty_conf *conf = mddev->private; 262 conf_t *conf = mddev->private;
264 263
265 if (mddev->new_layout < 0) 264 if (mddev->new_layout < 0)
266 return 0; 265 return 0;
@@ -285,7 +284,7 @@ static int reshape(struct mddev *mddev)
285 return 0; 284 return 0;
286} 285}
287 286
288static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) 287static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
289{ 288{
290 WARN_ONCE(raid_disks, 289 WARN_ONCE(raid_disks,
291 "%s does not support generic reshape\n", __func__); 290 "%s does not support generic reshape\n", __func__);
@@ -296,11 +295,11 @@ static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disk
296 return sectors; 295 return sectors;
297} 296}
298 297
299static int run(struct mddev *mddev) 298static int run(mddev_t *mddev)
300{ 299{
301 struct md_rdev *rdev; 300 mdk_rdev_t *rdev;
302 int i; 301 int i;
303 struct faulty_conf *conf; 302 conf_t *conf;
304 303
305 if (md_check_no_bitmap(mddev)) 304 if (md_check_no_bitmap(mddev))
306 return -EINVAL; 305 return -EINVAL;
@@ -315,11 +314,8 @@ static int run(struct mddev *mddev)
315 } 314 }
316 conf->nfaults = 0; 315 conf->nfaults = 0;
317 316
318 rdev_for_each(rdev, mddev) { 317 list_for_each_entry(rdev, &mddev->disks, same_set)
319 conf->rdev = rdev; 318 conf->rdev = rdev;
320 disk_stack_limits(mddev->gendisk, rdev->bdev,
321 rdev->data_offset << 9);
322 }
323 319
324 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 320 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
325 mddev->private = conf; 321 mddev->private = conf;
@@ -329,16 +325,16 @@ static int run(struct mddev *mddev)
329 return 0; 325 return 0;
330} 326}
331 327
332static int stop(struct mddev *mddev) 328static int stop(mddev_t *mddev)
333{ 329{
334 struct faulty_conf *conf = mddev->private; 330 conf_t *conf = mddev->private;
335 331
336 kfree(conf); 332 kfree(conf);
337 mddev->private = NULL; 333 mddev->private = NULL;
338 return 0; 334 return 0;
339} 335}
340 336
341static struct md_personality faulty_personality = 337static struct mdk_personality faulty_personality =
342{ 338{
343 .name = "faulty", 339 .name = "faulty",
344 .level = LEVEL_FAULTY, 340 .level = LEVEL_FAULTY,
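
The md/faulty personality above keeps a small table of injected fault sectors (add_sector()/check_sector()) and fails, fixes or persists requests that overlap one, depending on the mode. A rough self-contained sketch of just the sector-table bookkeeping, with invented names and without the mode/period handling, might look like this:

#include <stdio.h>

#define MAX_FAULT 50

struct fault_table {
        unsigned long long sectors[MAX_FAULT];
        int nfaults;
};

/* Remember a sector as faulty; silently drop it if the table is full. */
static void add_fault(struct fault_table *t, unsigned long long sector)
{
        if (t->nfaults < MAX_FAULT)
                t->sectors[t->nfaults++] = sector;
}

/* Does the request [start, end) touch any recorded fault? */
static int hits_fault(const struct fault_table *t,
                      unsigned long long start, unsigned long long end)
{
        int i;

        for (i = 0; i < t->nfaults; i++)
                if (t->sectors[i] >= start && t->sectors[i] < end)
                        return 1;
        return 0;
}

int main(void)
{
        struct fault_table t = { .nfaults = 0 };

        add_fault(&t, 128);
        printf("0..64   hits fault: %d\n", hits_fault(&t, 0, 64));
        printf("96..160 hits fault: %d\n", hits_fault(&t, 96, 160));
        return 0;
}
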
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 21014836bdb..6cd2c313e80 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -19,7 +19,6 @@
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/raid/md_u.h> 20#include <linux/raid/md_u.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/module.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
24#include "md.h" 23#include "md.h"
25#include "linear.h" 24#include "linear.h"
@@ -27,10 +26,10 @@
27/* 26/*
28 * find which device holds a particular offset 27 * find which device holds a particular offset
29 */ 28 */
30static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) 29static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
31{ 30{
32 int lo, mid, hi; 31 int lo, mid, hi;
33 struct linear_conf *conf; 32 linear_conf_t *conf;
34 33
35 lo = 0; 34 lo = 0;
36 hi = mddev->raid_disks - 1; 35 hi = mddev->raid_disks - 1;
@@ -64,23 +63,14 @@ static int linear_mergeable_bvec(struct request_queue *q,
64 struct bvec_merge_data *bvm, 63 struct bvec_merge_data *bvm,
65 struct bio_vec *biovec) 64 struct bio_vec *biovec)
66{ 65{
67 struct mddev *mddev = q->queuedata; 66 mddev_t *mddev = q->queuedata;
68 struct dev_info *dev0; 67 dev_info_t *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 68 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 69 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len;
72 struct request_queue *subq;
73 70
74 rcu_read_lock(); 71 rcu_read_lock();
75 dev0 = which_dev(mddev, sector); 72 dev0 = which_dev(mddev, sector);
76 maxsectors = dev0->end_sector - sector; 73 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev);
78 if (subq->merge_bvec_fn) {
79 bvm->bi_bdev = dev0->rdev->bdev;
80 bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec));
83 }
84 rcu_read_unlock(); 74 rcu_read_unlock();
85 75
86 if (maxsectors < bio_sectors) 76 if (maxsectors < bio_sectors)
@@ -89,18 +79,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
89 maxsectors -= bio_sectors; 79 maxsectors -= bio_sectors;
90 80
91 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) 81 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
92 return maxbytes; 82 return biovec->bv_len;
93 83 /* The bytes available at this offset could be really big,
94 if (maxsectors > (maxbytes >> 9)) 84 * so we cap at 2^31 to avoid overflow */
95 return maxbytes; 85 if (maxsectors > (1 << (31-9)))
96 else 86 return 1<<31;
97 return maxsectors << 9; 87 return maxsectors << 9;
98} 88}
99 89
100static int linear_congested(void *data, int bits) 90static int linear_congested(void *data, int bits)
101{ 91{
102 struct mddev *mddev = data; 92 mddev_t *mddev = data;
103 struct linear_conf *conf; 93 linear_conf_t *conf;
104 int i, ret = 0; 94 int i, ret = 0;
105 95
106 if (mddev_congested(mddev, bits)) 96 if (mddev_congested(mddev, bits))
@@ -118,9 +108,9 @@ static int linear_congested(void *data, int bits)
118 return ret; 108 return ret;
119} 109}
120 110
121static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) 111static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
122{ 112{
123 struct linear_conf *conf; 113 linear_conf_t *conf;
124 sector_t array_sectors; 114 sector_t array_sectors;
125 115
126 rcu_read_lock(); 116 rcu_read_lock();
@@ -133,14 +123,13 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
133 return array_sectors; 123 return array_sectors;
134} 124}
135 125
136static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) 126static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
137{ 127{
138 struct linear_conf *conf; 128 linear_conf_t *conf;
139 struct md_rdev *rdev; 129 mdk_rdev_t *rdev;
140 int i, cnt; 130 int i, cnt;
141 bool discard_supported = false;
142 131
143 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), 132 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
144 GFP_KERNEL); 133 GFP_KERNEL);
145 if (!conf) 134 if (!conf)
146 return NULL; 135 return NULL;
@@ -148,9 +137,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
148 cnt = 0; 137 cnt = 0;
149 conf->array_sectors = 0; 138 conf->array_sectors = 0;
150 139
151 rdev_for_each(rdev, mddev) { 140 list_for_each_entry(rdev, &mddev->disks, same_set) {
152 int j = rdev->raid_disk; 141 int j = rdev->raid_disk;
153 struct dev_info *disk = conf->disks + j; 142 dev_info_t *disk = conf->disks + j;
154 sector_t sectors; 143 sector_t sectors;
155 144
156 if (j < 0 || j >= raid_disks || disk->rdev) { 145 if (j < 0 || j >= raid_disks || disk->rdev) {
@@ -168,12 +157,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
168 157
169 disk_stack_limits(mddev->gendisk, rdev->bdev, 158 disk_stack_limits(mddev->gendisk, rdev->bdev,
170 rdev->data_offset << 9); 159 rdev->data_offset << 9);
160 /* as we don't honour merge_bvec_fn, we must never risk
161 * violating it, so limit max_segments to 1 lying within
162 * a single page.
163 */
164 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
165 blk_queue_max_segments(mddev->queue, 1);
166 blk_queue_segment_boundary(mddev->queue,
167 PAGE_CACHE_SIZE - 1);
168 }
171 169
172 conf->array_sectors += rdev->sectors; 170 conf->array_sectors += rdev->sectors;
173 cnt++; 171 cnt++;
174 172
175 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
176 discard_supported = true;
177 } 173 }
178 if (cnt != raid_disks) { 174 if (cnt != raid_disks) {
179 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", 175 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -181,11 +177,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
181 goto out; 177 goto out;
182 } 178 }
183 179
184 if (!discard_supported)
185 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
186 else
187 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
188
189 /* 180 /*
190 * Here we calculate the device offsets. 181 * Here we calculate the device offsets.
191 */ 182 */
@@ -203,10 +194,9 @@ out:
203 return NULL; 194 return NULL;
204} 195}
205 196
206static int linear_run (struct mddev *mddev) 197static int linear_run (mddev_t *mddev)
207{ 198{
208 struct linear_conf *conf; 199 linear_conf_t *conf;
209 int ret;
210 200
211 if (md_check_no_bitmap(mddev)) 201 if (md_check_no_bitmap(mddev))
212 return -EINVAL; 202 return -EINVAL;
@@ -220,16 +210,10 @@ static int linear_run (struct mddev *mddev)
220 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 210 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
221 mddev->queue->backing_dev_info.congested_fn = linear_congested; 211 mddev->queue->backing_dev_info.congested_fn = linear_congested;
222 mddev->queue->backing_dev_info.congested_data = mddev; 212 mddev->queue->backing_dev_info.congested_data = mddev;
223 213 return md_integrity_register(mddev);
224 ret = md_integrity_register(mddev);
225 if (ret) {
226 kfree(conf);
227 mddev->private = NULL;
228 }
229 return ret;
230} 214}
231 215
232static int linear_add(struct mddev *mddev, struct md_rdev *rdev) 216static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
233{ 217{
234 /* Adding a drive to a linear array allows the array to grow. 218 /* Adding a drive to a linear array allows the array to grow.
235 * It is permitted if the new drive has a matching superblock 219 * It is permitted if the new drive has a matching superblock
@@ -239,22 +223,19 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
239 * The current one is never freed until the array is stopped. 223 * The current one is never freed until the array is stopped.
240 * This avoids races. 224 * This avoids races.
241 */ 225 */
242 struct linear_conf *newconf, *oldconf; 226 linear_conf_t *newconf, *oldconf;
243 227
244 if (rdev->saved_raid_disk != mddev->raid_disks) 228 if (rdev->saved_raid_disk != mddev->raid_disks)
245 return -EINVAL; 229 return -EINVAL;
246 230
247 rdev->raid_disk = rdev->saved_raid_disk; 231 rdev->raid_disk = rdev->saved_raid_disk;
248 rdev->saved_raid_disk = -1;
249 232
250 newconf = linear_conf(mddev,mddev->raid_disks+1); 233 newconf = linear_conf(mddev,mddev->raid_disks+1);
251 234
252 if (!newconf) 235 if (!newconf)
253 return -ENOMEM; 236 return -ENOMEM;
254 237
255 oldconf = rcu_dereference_protected(mddev->private, 238 oldconf = rcu_dereference(mddev->private);
256 lockdep_is_held(
257 &mddev->reconfig_mutex));
258 mddev->raid_disks++; 239 mddev->raid_disks++;
259 rcu_assign_pointer(mddev->private, newconf); 240 rcu_assign_pointer(mddev->private, newconf);
260 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 241 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -264,12 +245,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
264 return 0; 245 return 0;
265} 246}
266 247
267static int linear_stop (struct mddev *mddev) 248static int linear_stop (mddev_t *mddev)
268{ 249{
269 struct linear_conf *conf = 250 linear_conf_t *conf = mddev->private;
270 rcu_dereference_protected(mddev->private,
271 lockdep_is_held(
272 &mddev->reconfig_mutex));
273 251
274 /* 252 /*
275 * We do not require rcu protection here since 253 * We do not require rcu protection here since
@@ -286,14 +264,14 @@ static int linear_stop (struct mddev *mddev)
286 return 0; 264 return 0;
287} 265}
288 266
289static void linear_make_request(struct mddev *mddev, struct bio *bio) 267static int linear_make_request (mddev_t *mddev, struct bio *bio)
290{ 268{
291 struct dev_info *tmp_dev; 269 dev_info_t *tmp_dev;
292 sector_t start_sector; 270 sector_t start_sector;
293 271
294 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 272 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
295 md_flush_request(mddev, bio); 273 md_flush_request(mddev, bio);
296 return; 274 return 0;
297 } 275 }
298 276
299 rcu_read_lock(); 277 rcu_read_lock();
@@ -315,7 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
315 (unsigned long long)start_sector); 293 (unsigned long long)start_sector);
316 rcu_read_unlock(); 294 rcu_read_unlock();
317 bio_io_error(bio); 295 bio_io_error(bio);
318 return; 296 return 0;
319 } 297 }
320 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > 298 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
321 tmp_dev->end_sector)) { 299 tmp_dev->end_sector)) {
@@ -329,10 +307,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
329 307
330 bp = bio_split(bio, end_sector - bio->bi_sector); 308 bp = bio_split(bio, end_sector - bio->bi_sector);
331 309
332 linear_make_request(mddev, &bp->bio1); 310 if (linear_make_request(mddev, &bp->bio1))
333 linear_make_request(mddev, &bp->bio2); 311 generic_make_request(&bp->bio1);
312 if (linear_make_request(mddev, &bp->bio2))
313 generic_make_request(&bp->bio2);
334 bio_pair_release(bp); 314 bio_pair_release(bp);
335 return; 315 return 0;
336 } 316 }
337 317
338 bio->bi_bdev = tmp_dev->rdev->bdev; 318 bio->bi_bdev = tmp_dev->rdev->bdev;
@@ -340,24 +320,17 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
340 + tmp_dev->rdev->data_offset; 320 + tmp_dev->rdev->data_offset;
341 rcu_read_unlock(); 321 rcu_read_unlock();
342 322
343 if (unlikely((bio->bi_rw & REQ_DISCARD) && 323 return 1;
344 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
345 /* Just ignore it */
346 bio_endio(bio, 0);
347 return;
348 }
349
350 generic_make_request(bio);
351} 324}
352 325
353static void linear_status (struct seq_file *seq, struct mddev *mddev) 326static void linear_status (struct seq_file *seq, mddev_t *mddev)
354{ 327{
355 328
356 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); 329 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
357} 330}
358 331
359 332
360static struct md_personality linear_personality = 333static struct mdk_personality linear_personality =
361{ 334{
362 .name = "linear", 335 .name = "linear",
363 .level = LEVEL_LINEAR, 336 .level = LEVEL_LINEAR,
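
which_dev() in the linear code above locates the member device holding a given sector by binary-searching the per-device end_sector values: device i covers every sector below its end_sector that device i-1 does not. A standalone sketch of that lookup, with made-up device sizes, could be:

#include <stdio.h>

/* Cumulative end sectors of three member devices: 0-99, 100-299, 300-349. */
static const unsigned long long end_sector[] = { 100, 300, 350 };
static const int nr_devs = 3;

/* Return the index of the device that holds 'sector' (binary search). */
static int which_dev(unsigned long long sector)
{
        int lo = 0, hi = nr_devs - 1;

        while (lo < hi) {
                int mid = (lo + hi) / 2;

                if (sector < end_sector[mid])
                        hi = mid;
                else
                        lo = mid + 1;
        }
        return lo;
}

int main(void)
{
        printf("sector 50  -> dev %d\n", which_dev(50));
        printf("sector 150 -> dev %d\n", which_dev(150));
        printf("sector 320 -> dev %d\n", which_dev(320));
        return 0;
}
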
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f..2f2da05b2ce 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -2,14 +2,20 @@
2#define _LINEAR_H 2#define _LINEAR_H
3 3
4struct dev_info { 4struct dev_info {
5 struct md_rdev *rdev; 5 mdk_rdev_t *rdev;
6 sector_t end_sector; 6 sector_t end_sector;
7}; 7};
8 8
9struct linear_conf 9typedef struct dev_info dev_info_t;
10
11struct linear_private_data
10{ 12{
11 struct rcu_head rcu; 13 struct rcu_head rcu;
12 sector_t array_sectors; 14 sector_t array_sectors;
13 struct dev_info disks[0]; 15 dev_info_t disks[0];
14}; 16};
17
18
19typedef struct linear_private_data linear_conf_t;
20
15#endif 21#endif
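
linear_private_data above ends in a zero-length disks[] array, and linear_conf() sizes the single allocation as sizeof(*conf) + raid_disks * sizeof(dev_info_t). A minimal illustration of that header-plus-array allocation pattern, written here with a C99 flexible array member as an analogy rather than the kernel's exact declaration:

#include <stdio.h>
#include <stdlib.h>

struct dev_info {
        unsigned long long end_sector;
};

struct linear_conf {
        unsigned long long array_sectors;
        int raid_disks;
        struct dev_info disks[];        /* flexible array member */
};

static struct linear_conf *linear_conf_alloc(int raid_disks)
{
        struct linear_conf *conf;

        /* One allocation covers the header and all per-device slots. */
        conf = calloc(1, sizeof(*conf) + raid_disks * sizeof(conf->disks[0]));
        if (!conf)
                return NULL;
        conf->raid_disks = raid_disks;
        return conf;
}

int main(void)
{
        struct linear_conf *conf = linear_conf_alloc(4);

        if (!conf)
                return 1;
        conf->disks[3].end_sector = 1000;
        printf("disks: %d, last end_sector: %llu\n",
               conf->raid_disks, conf->disks[3].end_sector);
        free(conf);
        return 0;
}
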
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f..5c95ccb5950 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,14 +36,14 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/sysctl.h> 37#include <linux/sysctl.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/fs.h> 39#include <linux/mutex.h>
40#include <linux/buffer_head.h> /* for invalidate_bdev */
40#include <linux/poll.h> 41#include <linux/poll.h>
41#include <linux/ctype.h> 42#include <linux/ctype.h>
42#include <linux/string.h> 43#include <linux/string.h>
43#include <linux/hdreg.h> 44#include <linux/hdreg.h>
44#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
45#include <linux/random.h> 46#include <linux/random.h>
46#include <linux/module.h>
47#include <linux/reboot.h> 47#include <linux/reboot.h>
48#include <linux/file.h> 48#include <linux/file.h>
49#include <linux/compat.h> 49#include <linux/compat.h>
@@ -54,6 +54,9 @@
54#include "md.h" 54#include "md.h"
55#include "bitmap.h" 55#include "bitmap.h"
56 56
57#define DEBUG 0
58#define dprintk(x...) ((void)(DEBUG && printk(x)))
59
57#ifndef MODULE 60#ifndef MODULE
58static void autostart_arrays(int part); 61static void autostart_arrays(int part);
59#endif 62#endif
@@ -95,13 +98,13 @@ static struct workqueue_struct *md_misc_wq;
95 98
96static int sysctl_speed_limit_min = 1000; 99static int sysctl_speed_limit_min = 1000;
97static int sysctl_speed_limit_max = 200000; 100static int sysctl_speed_limit_max = 200000;
98static inline int speed_min(struct mddev *mddev) 101static inline int speed_min(mddev_t *mddev)
99{ 102{
100 return mddev->sync_speed_min ? 103 return mddev->sync_speed_min ?
101 mddev->sync_speed_min : sysctl_speed_limit_min; 104 mddev->sync_speed_min : sysctl_speed_limit_min;
102} 105}
103 106
104static inline int speed_max(struct mddev *mddev) 107static inline int speed_max(mddev_t *mddev)
105{ 108{
106 return mddev->sync_speed_max ? 109 return mddev->sync_speed_max ?
107 mddev->sync_speed_max : sysctl_speed_limit_max; 110 mddev->sync_speed_max : sysctl_speed_limit_max;
@@ -155,28 +158,65 @@ static int start_readonly;
155 * like bio_clone, but with a local bio set 158 * like bio_clone, but with a local bio set
156 */ 159 */
157 160
161static void mddev_bio_destructor(struct bio *bio)
162{
163 mddev_t *mddev, **mddevp;
164
165 mddevp = (void*)bio;
166 mddev = mddevp[-1];
167
168 bio_free(bio, mddev->bio_set);
169}
170
158struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 171struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
159 struct mddev *mddev) 172 mddev_t *mddev)
160{ 173{
161 struct bio *b; 174 struct bio *b;
175 mddev_t **mddevp;
162 176
163 if (!mddev || !mddev->bio_set) 177 if (!mddev || !mddev->bio_set)
164 return bio_alloc(gfp_mask, nr_iovecs); 178 return bio_alloc(gfp_mask, nr_iovecs);
165 179
166 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); 180 b = bio_alloc_bioset(gfp_mask, nr_iovecs,
181 mddev->bio_set);
167 if (!b) 182 if (!b)
168 return NULL; 183 return NULL;
184 mddevp = (void*)b;
185 mddevp[-1] = mddev;
186 b->bi_destructor = mddev_bio_destructor;
169 return b; 187 return b;
170} 188}
171EXPORT_SYMBOL_GPL(bio_alloc_mddev); 189EXPORT_SYMBOL_GPL(bio_alloc_mddev);
172 190
173struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 191struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
174 struct mddev *mddev) 192 mddev_t *mddev)
175{ 193{
194 struct bio *b;
195 mddev_t **mddevp;
196
176 if (!mddev || !mddev->bio_set) 197 if (!mddev || !mddev->bio_set)
177 return bio_clone(bio, gfp_mask); 198 return bio_clone(bio, gfp_mask);
178 199
179 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); 200 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
201 mddev->bio_set);
202 if (!b)
203 return NULL;
204 mddevp = (void*)b;
205 mddevp[-1] = mddev;
206 b->bi_destructor = mddev_bio_destructor;
207 __bio_clone(b, bio);
208 if (bio_integrity(bio)) {
209 int ret;
210
211 ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
212
213 if (ret < 0) {
214 bio_put(b);
215 return NULL;
216 }
217 }
218
219 return b;
180} 220}
181EXPORT_SYMBOL_GPL(bio_clone_mddev); 221EXPORT_SYMBOL_GPL(bio_clone_mddev);
182 222
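
The restored bio_alloc_mddev()/bio_clone_mddev() above stash the owning mddev just in front of the returned bio (mddevp[-1] = mddev) so mddev_bio_destructor() can recover the right bio_set later; that trick only works if the bio_set was created with front padding for one pointer. A small user-space analogue of the "hide a back-pointer in front of the object" pattern, with invented names, is:

#include <stdio.h>
#include <stdlib.h>

struct owner {
        const char *name;
};

/* Allocate an object with room for a back-pointer just in front of it. */
static void *obj_alloc(struct owner *who, size_t size)
{
        struct owner **slot = malloc(sizeof(*slot) + size);

        if (!slot)
                return NULL;
        slot[0] = who;          /* hidden back-pointer */
        return slot + 1;        /* the caller sees only the object */
}

/* Recover the owner from the hidden slot, then free the whole block. */
static void obj_free(void *obj)
{
        struct owner **slot = (struct owner **)obj - 1;

        printf("freeing object owned by %s\n", slot[0]->name);
        free(slot);
}

int main(void)
{
        struct owner md0 = { "md0" };
        void *bio = obj_alloc(&md0, 64);

        if (!bio)
                return 1;
        obj_free(bio);
        return 0;
}
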
@@ -241,7 +281,7 @@ EXPORT_SYMBOL_GPL(md_trim_bio);
241 */ 281 */
242static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 282static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
243static atomic_t md_event_count; 283static atomic_t md_event_count;
244void md_new_event(struct mddev *mddev) 284void md_new_event(mddev_t *mddev)
245{ 285{
246 atomic_inc(&md_event_count); 286 atomic_inc(&md_event_count);
247 wake_up(&md_event_waiters); 287 wake_up(&md_event_waiters);
@@ -251,7 +291,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
251/* Alternate version that can be called from interrupts 291/* Alternate version that can be called from interrupts
252 * when calling sysfs_notify isn't needed. 292 * when calling sysfs_notify isn't needed.
253 */ 293 */
254static void md_new_event_inintr(struct mddev *mddev) 294static void md_new_event_inintr(mddev_t *mddev)
255{ 295{
256 atomic_inc(&md_event_count); 296 atomic_inc(&md_event_count);
257 wake_up(&md_event_waiters); 297 wake_up(&md_event_waiters);
@@ -272,19 +312,19 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
272 * Any code which breaks out of this loop while owning 312 * Any code which breaks out of this loop while owning
273 * a reference to the current mddev must mddev_put it. 313 * a reference to the current mddev must mddev_put it.
274 */ 314 */
275#define for_each_mddev(_mddev,_tmp) \ 315#define for_each_mddev(mddev,tmp) \
276 \ 316 \
277 for (({ spin_lock(&all_mddevs_lock); \ 317 for (({ spin_lock(&all_mddevs_lock); \
278 _tmp = all_mddevs.next; \ 318 tmp = all_mddevs.next; \
279 _mddev = NULL;}); \ 319 mddev = NULL;}); \
280 ({ if (_tmp != &all_mddevs) \ 320 ({ if (tmp != &all_mddevs) \
281 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 321 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
282 spin_unlock(&all_mddevs_lock); \ 322 spin_unlock(&all_mddevs_lock); \
283 if (_mddev) mddev_put(_mddev); \ 323 if (mddev) mddev_put(mddev); \
284 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 324 mddev = list_entry(tmp, mddev_t, all_mddevs); \
285 _tmp != &all_mddevs;}); \ 325 tmp != &all_mddevs;}); \
286 ({ spin_lock(&all_mddevs_lock); \ 326 ({ spin_lock(&all_mddevs_lock); \
287 _tmp = _tmp->next;}) \ 327 tmp = tmp->next;}) \
288 ) 328 )
289 329
290 330
@@ -295,17 +335,18 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
295 * call has finished, the bio has been linked into some internal structure 335 * call has finished, the bio has been linked into some internal structure
296 * and so is visible to ->quiesce(), so we don't need the refcount any more. 336 * and so is visible to ->quiesce(), so we don't need the refcount any more.
297 */ 337 */
298static void md_make_request(struct request_queue *q, struct bio *bio) 338static int md_make_request(struct request_queue *q, struct bio *bio)
299{ 339{
300 const int rw = bio_data_dir(bio); 340 const int rw = bio_data_dir(bio);
301 struct mddev *mddev = q->queuedata; 341 mddev_t *mddev = q->queuedata;
342 int rv;
302 int cpu; 343 int cpu;
303 unsigned int sectors; 344 unsigned int sectors;
304 345
305 if (mddev == NULL || mddev->pers == NULL 346 if (mddev == NULL || mddev->pers == NULL
306 || !mddev->ready) { 347 || !mddev->ready) {
307 bio_io_error(bio); 348 bio_io_error(bio);
308 return; 349 return 0;
309 } 350 }
310 smp_rmb(); /* Ensure implications of 'active' are visible */ 351 smp_rmb(); /* Ensure implications of 'active' are visible */
311 rcu_read_lock(); 352 rcu_read_lock();
@@ -330,7 +371,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
330 * go away inside make_request 371 * go away inside make_request
331 */ 372 */
332 sectors = bio_sectors(bio); 373 sectors = bio_sectors(bio);
333 mddev->pers->make_request(mddev, bio); 374 rv = mddev->pers->make_request(mddev, bio);
334 375
335 cpu = part_stat_lock(); 376 cpu = part_stat_lock();
336 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 377 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -339,6 +380,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
339 380
340 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 381 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
341 wake_up(&mddev->sb_wait); 382 wake_up(&mddev->sb_wait);
383
384 return rv;
342} 385}
343 386
344/* mddev_suspend makes sure no new requests are submitted 387/* mddev_suspend makes sure no new requests are submitted
@@ -347,31 +390,28 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
347 * Once ->stop is called and completes, the module will be completely 390 * Once ->stop is called and completes, the module will be completely
348 * unused. 391 * unused.
349 */ 392 */
350void mddev_suspend(struct mddev *mddev) 393void mddev_suspend(mddev_t *mddev)
351{ 394{
352 BUG_ON(mddev->suspended); 395 BUG_ON(mddev->suspended);
353 mddev->suspended = 1; 396 mddev->suspended = 1;
354 synchronize_rcu(); 397 synchronize_rcu();
355 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 398 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
356 mddev->pers->quiesce(mddev, 1); 399 mddev->pers->quiesce(mddev, 1);
357
358 del_timer_sync(&mddev->safemode_timer);
359} 400}
360EXPORT_SYMBOL_GPL(mddev_suspend); 401EXPORT_SYMBOL_GPL(mddev_suspend);
361 402
362void mddev_resume(struct mddev *mddev) 403void mddev_resume(mddev_t *mddev)
363{ 404{
364 mddev->suspended = 0; 405 mddev->suspended = 0;
365 wake_up(&mddev->sb_wait); 406 wake_up(&mddev->sb_wait);
366 mddev->pers->quiesce(mddev, 0); 407 mddev->pers->quiesce(mddev, 0);
367 408
368 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
369 md_wakeup_thread(mddev->thread); 409 md_wakeup_thread(mddev->thread);
370 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 410 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
371} 411}
372EXPORT_SYMBOL_GPL(mddev_resume); 412EXPORT_SYMBOL_GPL(mddev_resume);
373 413
374int mddev_congested(struct mddev *mddev, int bits) 414int mddev_congested(mddev_t *mddev, int bits)
375{ 415{
376 return mddev->suspended; 416 return mddev->suspended;
377} 417}
@@ -383,8 +423,8 @@ EXPORT_SYMBOL(mddev_congested);
383 423
384static void md_end_flush(struct bio *bio, int err) 424static void md_end_flush(struct bio *bio, int err)
385{ 425{
386 struct md_rdev *rdev = bio->bi_private; 426 mdk_rdev_t *rdev = bio->bi_private;
387 struct mddev *mddev = rdev->mddev; 427 mddev_t *mddev = rdev->mddev;
388 428
389 rdev_dec_pending(rdev, mddev); 429 rdev_dec_pending(rdev, mddev);
390 430
@@ -399,13 +439,13 @@ static void md_submit_flush_data(struct work_struct *ws);
399 439
400static void submit_flushes(struct work_struct *ws) 440static void submit_flushes(struct work_struct *ws)
401{ 441{
402 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 442 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
403 struct md_rdev *rdev; 443 mdk_rdev_t *rdev;
404 444
405 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 445 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
406 atomic_set(&mddev->flush_pending, 1); 446 atomic_set(&mddev->flush_pending, 1);
407 rcu_read_lock(); 447 rcu_read_lock();
408 rdev_for_each_rcu(rdev, mddev) 448 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
409 if (rdev->raid_disk >= 0 && 449 if (rdev->raid_disk >= 0 &&
410 !test_bit(Faulty, &rdev->flags)) { 450 !test_bit(Faulty, &rdev->flags)) {
411 /* Take two references, one is dropped 451 /* Take two references, one is dropped
@@ -416,7 +456,7 @@ static void submit_flushes(struct work_struct *ws)
416 atomic_inc(&rdev->nr_pending); 456 atomic_inc(&rdev->nr_pending);
417 atomic_inc(&rdev->nr_pending); 457 atomic_inc(&rdev->nr_pending);
418 rcu_read_unlock(); 458 rcu_read_unlock();
419 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 459 bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
420 bi->bi_end_io = md_end_flush; 460 bi->bi_end_io = md_end_flush;
421 bi->bi_private = rdev; 461 bi->bi_private = rdev;
422 bi->bi_bdev = rdev->bdev; 462 bi->bi_bdev = rdev->bdev;
@@ -432,7 +472,7 @@ static void submit_flushes(struct work_struct *ws)
432 472
433static void md_submit_flush_data(struct work_struct *ws) 473static void md_submit_flush_data(struct work_struct *ws)
434{ 474{
435 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 475 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
436 struct bio *bio = mddev->flush_bio; 476 struct bio *bio = mddev->flush_bio;
437 477
438 if (bio->bi_size == 0) 478 if (bio->bi_size == 0)
@@ -440,19 +480,20 @@ static void md_submit_flush_data(struct work_struct *ws)
440 bio_endio(bio, 0); 480 bio_endio(bio, 0);
441 else { 481 else {
442 bio->bi_rw &= ~REQ_FLUSH; 482 bio->bi_rw &= ~REQ_FLUSH;
443 mddev->pers->make_request(mddev, bio); 483 if (mddev->pers->make_request(mddev, bio))
484 generic_make_request(bio);
444 } 485 }
445 486
446 mddev->flush_bio = NULL; 487 mddev->flush_bio = NULL;
447 wake_up(&mddev->sb_wait); 488 wake_up(&mddev->sb_wait);
448} 489}
449 490
450void md_flush_request(struct mddev *mddev, struct bio *bio) 491void md_flush_request(mddev_t *mddev, struct bio *bio)
451{ 492{
452 spin_lock_irq(&mddev->write_lock); 493 spin_lock_irq(&mddev->write_lock);
453 wait_event_lock_irq(mddev->sb_wait, 494 wait_event_lock_irq(mddev->sb_wait,
454 !mddev->flush_bio, 495 !mddev->flush_bio,
455 mddev->write_lock); 496 mddev->write_lock, /*nothing*/);
456 mddev->flush_bio = bio; 497 mddev->flush_bio = bio;
457 spin_unlock_irq(&mddev->write_lock); 498 spin_unlock_irq(&mddev->write_lock);
458 499
@@ -461,15 +502,63 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
461} 502}
462EXPORT_SYMBOL(md_flush_request); 503EXPORT_SYMBOL(md_flush_request);
463 504
464void md_unplug(struct blk_plug_cb *cb, bool from_schedule) 505/* Support for plugging.
506 * This mirrors the plugging support in request_queue, but does not
507 * require having a whole queue or request structures.
508 * We allocate an md_plug_cb for each md device and each thread it gets
509 * plugged on. This links to the private plug_handle structure in the
510 * personality data where we keep a count of the number of outstanding
511 * plugs so other code can see if a plug is active.
512 */
513struct md_plug_cb {
514 struct blk_plug_cb cb;
515 mddev_t *mddev;
516};
517
518static void plugger_unplug(struct blk_plug_cb *cb)
465{ 519{
466 struct mddev *mddev = cb->data; 520 struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
467 md_wakeup_thread(mddev->thread); 521 if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
468 kfree(cb); 522 md_wakeup_thread(mdcb->mddev->thread);
523 kfree(mdcb);
469} 524}
470EXPORT_SYMBOL(md_unplug);
471 525
472static inline struct mddev *mddev_get(struct mddev *mddev) 526/* Check that an unplug wakeup will come shortly.
527 * If not, wakeup the md thread immediately
528 */
529int mddev_check_plugged(mddev_t *mddev)
530{
531 struct blk_plug *plug = current->plug;
532 struct md_plug_cb *mdcb;
533
534 if (!plug)
535 return 0;
536
537 list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
538 if (mdcb->cb.callback == plugger_unplug &&
539 mdcb->mddev == mddev) {
540 /* Already on the list, move to top */
541 if (mdcb != list_first_entry(&plug->cb_list,
542 struct md_plug_cb,
543 cb.list))
544 list_move(&mdcb->cb.list, &plug->cb_list);
545 return 1;
546 }
547 }
548 /* Not currently on the callback list */
549 mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
550 if (!mdcb)
551 return 0;
552
553 mdcb->mddev = mddev;
554 mdcb->cb.callback = plugger_unplug;
555 atomic_inc(&mddev->plug_cnt);
556 list_add(&mdcb->cb.list, &plug->cb_list);
557 return 1;
558}
559EXPORT_SYMBOL_GPL(mddev_check_plugged);
560
561static inline mddev_t *mddev_get(mddev_t *mddev)
473{ 562{
474 atomic_inc(&mddev->active); 563 atomic_inc(&mddev->active);
475 return mddev; 564 return mddev;
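
The plugging support added back above (md_plug_cb, mddev_check_plugged(), plugger_unplug()) hangs one callback per md device off the per-task blk_plug list and wakes the md thread when the plug is flushed. A rough standalone sketch of that "register a callback on the current plug, run it on unplug" idea, with invented names and without the locking, plug_cnt accounting, or move-to-top de-duplication of the real code, is:

#include <stdio.h>
#include <stdlib.h>

struct plug_cb {
        struct plug_cb *next;
        void (*callback)(struct plug_cb *cb);
};

/* Stands in for the per-task current->plug list. */
struct plug {
        struct plug_cb *cb_list;
};

struct md_plug_cb {
        struct plug_cb cb;              /* must stay the first member */
        const char *mddev_name;
};

static void plugger_unplug(struct plug_cb *cb)
{
        struct md_plug_cb *mdcb = (struct md_plug_cb *)cb;

        printf("unplug: waking %s thread\n", mdcb->mddev_name);
        free(mdcb);
}

/* Register one callback for this device on the current plug, if any. */
static int check_plugged(struct plug *plug, const char *mddev_name)
{
        struct md_plug_cb *mdcb;

        if (!plug)
                return 0;
        mdcb = malloc(sizeof(*mdcb));
        if (!mdcb)
                return 0;
        mdcb->mddev_name = mddev_name;
        mdcb->cb.callback = plugger_unplug;
        mdcb->cb.next = plug->cb_list;
        plug->cb_list = &mdcb->cb;
        return 1;
}

/* Run and drain every registered callback, as unplugging would. */
static void finish_plug(struct plug *plug)
{
        while (plug->cb_list) {
                struct plug_cb *cb = plug->cb_list;

                plug->cb_list = cb->next;
                cb->callback(cb);
        }
}

int main(void)
{
        struct plug plug = { .cb_list = NULL };

        check_plugged(&plug, "md0");
        check_plugged(&plug, "md1");
        finish_plug(&plug);
        return 0;
}

In the kernel the list lives on the submitting task's plug, so the wakeups are batched until that task unplugs instead of firing once per queued request.
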
@@ -477,7 +566,7 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
477 566
478static void mddev_delayed_delete(struct work_struct *ws); 567static void mddev_delayed_delete(struct work_struct *ws);
479 568
480static void mddev_put(struct mddev *mddev) 569static void mddev_put(mddev_t *mddev)
481{ 570{
482 struct bio_set *bs = NULL; 571 struct bio_set *bs = NULL;
483 572
@@ -487,7 +576,7 @@ static void mddev_put(struct mddev *mddev)
487 mddev->ctime == 0 && !mddev->hold_active) { 576 mddev->ctime == 0 && !mddev->hold_active) {
488 /* Array is not configured at all, and not held active, 577 /* Array is not configured at all, and not held active,
489 * so destroy it */ 578 * so destroy it */
490 list_del_init(&mddev->all_mddevs); 579 list_del(&mddev->all_mddevs);
491 bs = mddev->bio_set; 580 bs = mddev->bio_set;
492 mddev->bio_set = NULL; 581 mddev->bio_set = NULL;
493 if (mddev->gendisk) { 582 if (mddev->gendisk) {
@@ -506,7 +595,7 @@ static void mddev_put(struct mddev *mddev)
506 bioset_free(bs); 595 bioset_free(bs);
507} 596}
508 597
509void mddev_init(struct mddev *mddev) 598void mddev_init(mddev_t *mddev)
510{ 599{
511 mutex_init(&mddev->open_mutex); 600 mutex_init(&mddev->open_mutex);
512 mutex_init(&mddev->reconfig_mutex); 601 mutex_init(&mddev->reconfig_mutex);
@@ -517,21 +606,21 @@ void mddev_init(struct mddev *mddev)
517 atomic_set(&mddev->active, 1); 606 atomic_set(&mddev->active, 1);
518 atomic_set(&mddev->openers, 0); 607 atomic_set(&mddev->openers, 0);
519 atomic_set(&mddev->active_io, 0); 608 atomic_set(&mddev->active_io, 0);
609 atomic_set(&mddev->plug_cnt, 0);
520 spin_lock_init(&mddev->write_lock); 610 spin_lock_init(&mddev->write_lock);
521 atomic_set(&mddev->flush_pending, 0); 611 atomic_set(&mddev->flush_pending, 0);
522 init_waitqueue_head(&mddev->sb_wait); 612 init_waitqueue_head(&mddev->sb_wait);
523 init_waitqueue_head(&mddev->recovery_wait); 613 init_waitqueue_head(&mddev->recovery_wait);
524 mddev->reshape_position = MaxSector; 614 mddev->reshape_position = MaxSector;
525 mddev->reshape_backwards = 0;
526 mddev->resync_min = 0; 615 mddev->resync_min = 0;
527 mddev->resync_max = MaxSector; 616 mddev->resync_max = MaxSector;
528 mddev->level = LEVEL_NONE; 617 mddev->level = LEVEL_NONE;
529} 618}
530EXPORT_SYMBOL_GPL(mddev_init); 619EXPORT_SYMBOL_GPL(mddev_init);
531 620
532static struct mddev * mddev_find(dev_t unit) 621static mddev_t * mddev_find(dev_t unit)
533{ 622{
534 struct mddev *mddev, *new = NULL; 623 mddev_t *mddev, *new = NULL;
535 624
536 if (unit && MAJOR(unit) != MD_MAJOR) 625 if (unit && MAJOR(unit) != MD_MAJOR)
537 unit &= ~((1<<MdpMinorShift)-1); 626 unit &= ~((1<<MdpMinorShift)-1);
@@ -603,24 +692,24 @@ static struct mddev * mddev_find(dev_t unit)
603 goto retry; 692 goto retry;
604} 693}
605 694
606static inline int mddev_lock(struct mddev * mddev) 695static inline int mddev_lock(mddev_t * mddev)
607{ 696{
608 return mutex_lock_interruptible(&mddev->reconfig_mutex); 697 return mutex_lock_interruptible(&mddev->reconfig_mutex);
609} 698}
610 699
611static inline int mddev_is_locked(struct mddev *mddev) 700static inline int mddev_is_locked(mddev_t *mddev)
612{ 701{
613 return mutex_is_locked(&mddev->reconfig_mutex); 702 return mutex_is_locked(&mddev->reconfig_mutex);
614} 703}
615 704
616static inline int mddev_trylock(struct mddev * mddev) 705static inline int mddev_trylock(mddev_t * mddev)
617{ 706{
618 return mutex_trylock(&mddev->reconfig_mutex); 707 return mutex_trylock(&mddev->reconfig_mutex);
619} 708}
620 709
621static struct attribute_group md_redundancy_group; 710static struct attribute_group md_redundancy_group;
622 711
623static void mddev_unlock(struct mddev * mddev) 712static void mddev_unlock(mddev_t * mddev)
624{ 713{
625 if (mddev->to_remove) { 714 if (mddev->to_remove) {
626 /* These cannot be removed under reconfig_mutex as 715 /* These cannot be removed under reconfig_mutex as
@@ -655,61 +744,39 @@ static void mddev_unlock(struct mddev * mddev)
655 } else 744 } else
656 mutex_unlock(&mddev->reconfig_mutex); 745 mutex_unlock(&mddev->reconfig_mutex);
657 746
658 /* As we've dropped the mutex we need a spinlock to 747 /* was we've dropped the mutex we need a spinlock to
659 * make sure the thread doesn't disappear 748 * make sur the thread doesn't disappear
660 */ 749 */
661 spin_lock(&pers_lock); 750 spin_lock(&pers_lock);
662 md_wakeup_thread(mddev->thread); 751 md_wakeup_thread(mddev->thread);
663 spin_unlock(&pers_lock); 752 spin_unlock(&pers_lock);
664} 753}
665 754
666static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) 755static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
667{ 756{
668 struct md_rdev *rdev; 757 mdk_rdev_t *rdev;
669 758
670 rdev_for_each(rdev, mddev) 759 list_for_each_entry(rdev, &mddev->disks, same_set)
671 if (rdev->desc_nr == nr) 760 if (rdev->desc_nr == nr)
672 return rdev; 761 return rdev;
673 762
674 return NULL; 763 return NULL;
675} 764}
676 765
677static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 766static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
678{
679 struct md_rdev *rdev;
680
681 rdev_for_each_rcu(rdev, mddev)
682 if (rdev->desc_nr == nr)
683 return rdev;
684
685 return NULL;
686}
687
688static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
689{
690 struct md_rdev *rdev;
691
692 rdev_for_each(rdev, mddev)
693 if (rdev->bdev->bd_dev == dev)
694 return rdev;
695
696 return NULL;
697}
698
699static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
700{ 767{
701 struct md_rdev *rdev; 768 mdk_rdev_t *rdev;
702 769
703 rdev_for_each_rcu(rdev, mddev) 770 list_for_each_entry(rdev, &mddev->disks, same_set)
704 if (rdev->bdev->bd_dev == dev) 771 if (rdev->bdev->bd_dev == dev)
705 return rdev; 772 return rdev;
706 773
707 return NULL; 774 return NULL;
708} 775}
709 776
710static struct md_personality *find_pers(int level, char *clevel) 777static struct mdk_personality *find_pers(int level, char *clevel)
711{ 778{
712 struct md_personality *pers; 779 struct mdk_personality *pers;
713 list_for_each_entry(pers, &pers_list, list) { 780 list_for_each_entry(pers, &pers_list, list) {
714 if (level != LEVEL_NONE && pers->level == level) 781 if (level != LEVEL_NONE && pers->level == level)
715 return pers; 782 return pers;
@@ -720,13 +787,13 @@ static struct md_personality *find_pers(int level, char *clevel)
720} 787}
721 788
722/* return the offset of the super block in 512byte sectors */ 789/* return the offset of the super block in 512byte sectors */
723static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 790static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
724{ 791{
725 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 792 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
726 return MD_NEW_SIZE_SECTORS(num_sectors); 793 return MD_NEW_SIZE_SECTORS(num_sectors);
727} 794}
728 795
729static int alloc_disk_sb(struct md_rdev * rdev) 796static int alloc_disk_sb(mdk_rdev_t * rdev)
730{ 797{
731 if (rdev->sb_page) 798 if (rdev->sb_page)
732 MD_BUG(); 799 MD_BUG();
@@ -740,7 +807,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
740 return 0; 807 return 0;
741} 808}
742 809
743void md_rdev_clear(struct md_rdev *rdev) 810static void free_disk_sb(mdk_rdev_t * rdev)
744{ 811{
745 if (rdev->sb_page) { 812 if (rdev->sb_page) {
746 put_page(rdev->sb_page); 813 put_page(rdev->sb_page);
@@ -753,15 +820,13 @@ void md_rdev_clear(struct md_rdev *rdev)
753 put_page(rdev->bb_page); 820 put_page(rdev->bb_page);
754 rdev->bb_page = NULL; 821 rdev->bb_page = NULL;
755 } 822 }
756 kfree(rdev->badblocks.page);
757 rdev->badblocks.page = NULL;
758} 823}
759EXPORT_SYMBOL_GPL(md_rdev_clear); 824
760 825
761static void super_written(struct bio *bio, int error) 826static void super_written(struct bio *bio, int error)
762{ 827{
763 struct md_rdev *rdev = bio->bi_private; 828 mdk_rdev_t *rdev = bio->bi_private;
764 struct mddev *mddev = rdev->mddev; 829 mddev_t *mddev = rdev->mddev;
765 830
766 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 831 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
767 printk("md: super_written gets error=%d, uptodate=%d\n", 832 printk("md: super_written gets error=%d, uptodate=%d\n",
@@ -775,7 +840,7 @@ static void super_written(struct bio *bio, int error)
775 bio_put(bio); 840 bio_put(bio);
776} 841}
777 842
778void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 843void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
779 sector_t sector, int size, struct page *page) 844 sector_t sector, int size, struct page *page)
780{ 845{
781 /* write first size bytes of page to sector of rdev 846 /* write first size bytes of page to sector of rdev
@@ -796,7 +861,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
796 submit_bio(WRITE_FLUSH_FUA, bio); 861 submit_bio(WRITE_FLUSH_FUA, bio);
797} 862}
798 863
799void md_super_wait(struct mddev *mddev) 864void md_super_wait(mddev_t *mddev)
800{ 865{
801 /* wait for all superblock writes that were scheduled to complete */ 866 /* wait for all superblock writes that were scheduled to complete */
802 DEFINE_WAIT(wq); 867 DEFINE_WAIT(wq);
@@ -814,7 +879,7 @@ static void bi_complete(struct bio *bio, int error)
814 complete((struct completion*)bio->bi_private); 879 complete((struct completion*)bio->bi_private);
815} 880}
816 881
817int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 882int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
818 struct page *page, int rw, bool metadata_op) 883 struct page *page, int rw, bool metadata_op)
819{ 884{
820 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 885 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
@@ -827,10 +892,6 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
827 rdev->meta_bdev : rdev->bdev; 892 rdev->meta_bdev : rdev->bdev;
828 if (metadata_op) 893 if (metadata_op)
829 bio->bi_sector = sector + rdev->sb_start; 894 bio->bi_sector = sector + rdev->sb_start;
830 else if (rdev->mddev->reshape_position != MaxSector &&
831 (rdev->mddev->reshape_backwards ==
832 (sector >= rdev->mddev->reshape_position)))
833 bio->bi_sector = sector + rdev->new_data_offset;
834 else 895 else
835 bio->bi_sector = sector + rdev->data_offset; 896 bio->bi_sector = sector + rdev->data_offset;
836 bio_add_page(bio, page, size, 0); 897 bio_add_page(bio, page, size, 0);
@@ -846,7 +907,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
846} 907}
847EXPORT_SYMBOL_GPL(sync_page_io); 908EXPORT_SYMBOL_GPL(sync_page_io);
848 909
849static int read_disk_sb(struct md_rdev * rdev, int size) 910static int read_disk_sb(mdk_rdev_t * rdev, int size)
850{ 911{
851 char b[BDEVNAME_SIZE]; 912 char b[BDEVNAME_SIZE];
852 if (!rdev->sb_page) { 913 if (!rdev->sb_page) {
@@ -953,7 +1014,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
953 * We rely on user-space to write the initial superblock, and support 1014 * We rely on user-space to write the initial superblock, and support
954 * reading and updating of superblocks. 1015 * reading and updating of superblocks.
955 * Interface methods are: 1016 * Interface methods are:
956 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1017 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
957 * loads and validates a superblock on dev. 1018 * loads and validates a superblock on dev.
958 * if refdev != NULL, compare superblocks on both devices 1019 * if refdev != NULL, compare superblocks on both devices
959 * Return: 1020 * Return:
@@ -963,13 +1024,13 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
963 * -EINVAL superblock incompatible or invalid 1024 * -EINVAL superblock incompatible or invalid
964 * -othererror e.g. -EIO 1025 * -othererror e.g. -EIO
965 * 1026 *
966 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1027 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
967 * Verify that dev is acceptable into mddev. 1028 * Verify that dev is acceptable into mddev.
968 * The first time, mddev->raid_disks will be 0, and data from 1029 * The first time, mddev->raid_disks will be 0, and data from
969 * dev should be merged in. Subsequent calls check that dev 1030 * dev should be merged in. Subsequent calls check that dev
970 * is new enough. Return 0 or -EINVAL 1031 * is new enough. Return 0 or -EINVAL
971 * 1032 *
972 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1033 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
973 * Update the superblock for rdev with data in mddev 1034 * Update the superblock for rdev with data in mddev
974 * This does not write to disc. 1035 * This does not write to disc.
975 * 1036 *
@@ -978,17 +1039,12 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
978struct super_type { 1039struct super_type {
979 char *name; 1040 char *name;
980 struct module *owner; 1041 struct module *owner;
981 int (*load_super)(struct md_rdev *rdev, 1042 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
982 struct md_rdev *refdev,
983 int minor_version); 1043 int minor_version);
984 int (*validate_super)(struct mddev *mddev, 1044 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
985 struct md_rdev *rdev); 1045 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
986 void (*sync_super)(struct mddev *mddev, 1046 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
987 struct md_rdev *rdev);
988 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
989 sector_t num_sectors); 1047 sector_t num_sectors);
990 int (*allow_new_offset)(struct md_rdev *rdev,
991 unsigned long long new_offset);
992}; 1048};
993 1049
994/* 1050/*
@@ -999,7 +1055,7 @@ struct super_type {
999 * has a bitmap. Otherwise, it returns 0. 1055 * has a bitmap. Otherwise, it returns 0.
1000 * 1056 *
1001 */ 1057 */
1002int md_check_no_bitmap(struct mddev *mddev) 1058int md_check_no_bitmap(mddev_t *mddev)
1003{ 1059{
1004 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1060 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1005 return 0; 1061 return 0;
@@ -1012,7 +1068,7 @@ EXPORT_SYMBOL(md_check_no_bitmap);
1012/* 1068/*
1013 * load_super for 0.90.0 1069 * load_super for 0.90.0
1014 */ 1070 */
1015static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1071static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1016{ 1072{
1017 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1073 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1018 mdp_super_t *sb; 1074 mdp_super_t *sb;
@@ -1060,7 +1116,6 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1060 1116
1061 rdev->preferred_minor = sb->md_minor; 1117 rdev->preferred_minor = sb->md_minor;
1062 rdev->data_offset = 0; 1118 rdev->data_offset = 0;
1063 rdev->new_data_offset = 0;
1064 rdev->sb_size = MD_SB_BYTES; 1119 rdev->sb_size = MD_SB_BYTES;
1065 rdev->badblocks.shift = -1; 1120 rdev->badblocks.shift = -1;
1066 1121
@@ -1093,11 +1148,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1093 ret = 0; 1148 ret = 0;
1094 } 1149 }
1095 rdev->sectors = rdev->sb_start; 1150 rdev->sectors = rdev->sb_start;
1096 /* Limit to 4TB as metadata cannot record more than that. 1151 /* Limit to 4TB as metadata cannot record more than that */
1097 * (not needed for Linear and RAID0 as metadata doesn't 1152 if (rdev->sectors >= (2ULL << 32))
1098 * record this size)
1099 */
1100 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1101 rdev->sectors = (2ULL << 32) - 2; 1153 rdev->sectors = (2ULL << 32) - 2;
1102 1154
1103 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1155 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
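
For reference on the cap restored above: (2ULL << 32) is 2^33 sectors, and with 512-byte sectors that is 2^42 bytes, i.e. the 4TB the comment mentions. A one-line check:

#include <stdio.h>

int main(void)
{
        unsigned long long max_sectors = 2ULL << 32;    /* 2^33 sectors */

        /* 2^33 sectors * 512 bytes = 2^42 bytes = 4 TiB */
        printf("%llu bytes (%llu TiB)\n",
               max_sectors * 512, (max_sectors * 512) >> 40);
        return 0;
}
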
@@ -1111,7 +1163,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1111/* 1163/*
1112 * validate_super for 0.90.0 1164 * validate_super for 0.90.0
1113 */ 1165 */
1114static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1166static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1115{ 1167{
1116 mdp_disk_t *desc; 1168 mdp_disk_t *desc;
1117 mdp_super_t *sb = page_address(rdev->sb_page); 1169 mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1137,11 +1189,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1137 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1189 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1138 mddev->events = ev1; 1190 mddev->events = ev1;
1139 mddev->bitmap_info.offset = 0; 1191 mddev->bitmap_info.offset = 0;
1140 mddev->bitmap_info.space = 0;
1141 /* bitmap can use 60 K after the 4K superblocks */
1142 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1192 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1143 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1144 mddev->reshape_backwards = 0;
1145 1193
1146 if (mddev->minor_version >= 91) { 1194 if (mddev->minor_version >= 91) {
1147 mddev->reshape_position = sb->reshape_position; 1195 mddev->reshape_position = sb->reshape_position;
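
As the removed comment notes, the 0.90 layout reserves 64 KiB at the end of each device: a 4 KiB superblock followed by up to 60 KiB usable for an internal write-intent bitmap, which is where the default offset of MD_SB_BYTES >> 9 sectors and the 64*2 - 8 sector budget come from. A small check of that arithmetic (plain C, assuming MD_SB_BYTES is 4096):

#include <stdio.h>

#define MD_SB_BYTES   4096           /* size of a 0.90 superblock  */
#define SECTOR_SHIFT  9              /* 512-byte sectors           */

int main(void)
{
	unsigned default_offset = MD_SB_BYTES >> SECTOR_SHIFT;   /* 8 sectors   */
	unsigned default_space  = 64 * 2 - default_offset;       /* 120 sectors */

	/* 8 sectors = 4 KiB of superblock, 120 sectors = 60 KiB left for the bitmap */
	printf("bitmap offset: %u sectors (%u KiB)\n", default_offset, default_offset / 2);
	printf("bitmap space : %u sectors (%u KiB)\n", default_space,  default_space  / 2);
	return 0;
}
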
@@ -1149,8 +1197,6 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1149 mddev->new_level = sb->new_level; 1197 mddev->new_level = sb->new_level;
1150 mddev->new_layout = sb->new_layout; 1198 mddev->new_layout = sb->new_layout;
1151 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1199 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1152 if (mddev->delta_disks < 0)
1153 mddev->reshape_backwards = 1;
1154 } else { 1200 } else {
1155 mddev->reshape_position = MaxSector; 1201 mddev->reshape_position = MaxSector;
1156 mddev->delta_disks = 0; 1202 mddev->delta_disks = 0;
@@ -1177,12 +1223,9 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1177 mddev->max_disks = MD_SB_DISKS; 1223 mddev->max_disks = MD_SB_DISKS;
1178 1224
1179 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1225 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1180 mddev->bitmap_info.file == NULL) { 1226 mddev->bitmap_info.file == NULL)
1181 mddev->bitmap_info.offset = 1227 mddev->bitmap_info.offset =
1182 mddev->bitmap_info.default_offset; 1228 mddev->bitmap_info.default_offset;
1183 mddev->bitmap_info.space =
1184 mddev->bitmap_info.space;
1185 }
1186 1229
1187 } else if (mddev->pers == NULL) { 1230 } else if (mddev->pers == NULL) {
1188 /* Insist on good event counter while assembling, except 1231 /* Insist on good event counter while assembling, except
@@ -1232,10 +1275,10 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1232/* 1275/*
1233 * sync_super for 0.90.0 1276 * sync_super for 0.90.0
1234 */ 1277 */
1235static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1278static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1236{ 1279{
1237 mdp_super_t *sb; 1280 mdp_super_t *sb;
1238 struct md_rdev *rdev2; 1281 mdk_rdev_t *rdev2;
1239 int next_spare = mddev->raid_disks; 1282 int next_spare = mddev->raid_disks;
1240 1283
1241 1284
@@ -1306,7 +1349,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1306 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1349 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1307 1350
1308 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1351 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1309 rdev_for_each(rdev2, mddev) { 1352 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1310 mdp_disk_t *d; 1353 mdp_disk_t *d;
1311 int desc_nr; 1354 int desc_nr;
1312 int is_active = test_bit(In_sync, &rdev2->flags); 1355 int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1376,7 +1419,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1376 * rdev_size_change for 0.90.0 1419 * rdev_size_change for 0.90.0
1377 */ 1420 */
1378static unsigned long long 1421static unsigned long long
1379super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1422super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1380{ 1423{
1381 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1424 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1382 return 0; /* component must fit device */ 1425 return 0; /* component must fit device */
@@ -1388,7 +1431,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1388 /* Limit to 4TB as metadata cannot record more than that. 1431 /* Limit to 4TB as metadata cannot record more than that.
1389 * 4TB == 2^32 KB, or 2*2^32 sectors. 1432 * 4TB == 2^32 KB, or 2*2^32 sectors.
1390 */ 1433 */
1391 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1434 if (num_sectors >= (2ULL << 32))
1392 num_sectors = (2ULL << 32) - 2; 1435 num_sectors = (2ULL << 32) - 2;
1393 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1436 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1394 rdev->sb_page); 1437 rdev->sb_page);
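
Both copies of this clamp exist because the 0.90 superblock stores the component size as a 32-bit kibibyte count: anything at or above 2^32 KiB (4 TiB) cannot be recorded, and stopping two sectors short keeps the result a whole number of KiB. The arithmetic, as a self-contained check in plain C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t limit_sectors = 2ULL << 32;            /* 2^33 sectors          */
	uint64_t limit_bytes   = limit_sectors * 512;   /* 2^42 bytes            */
	uint64_t capped        = limit_sectors - 2;     /* largest value kept    */

	printf("limit  : %llu sectors = %llu KiB = %llu TiB\n",
	       (unsigned long long)limit_sectors,
	       (unsigned long long)(limit_bytes >> 10),
	       (unsigned long long)(limit_bytes >> 40));
	printf("clamped: %llu sectors (%llu KiB)\n",
	       (unsigned long long)capped,
	       (unsigned long long)(capped / 2));
	return 0;
}
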
@@ -1396,12 +1439,6 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1396 return num_sectors; 1439 return num_sectors;
1397} 1440}
1398 1441
1399static int
1400super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1401{
1402 /* non-zero offset changes not possible with v0.90 */
1403 return new_offset == 0;
1404}
1405 1442
1406/* 1443/*
1407 * version 1 superblock 1444 * version 1 superblock
@@ -1414,11 +1451,12 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1414 unsigned long long newcsum; 1451 unsigned long long newcsum;
1415 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1452 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1416 __le32 *isuper = (__le32*)sb; 1453 __le32 *isuper = (__le32*)sb;
1454 int i;
1417 1455
1418 disk_csum = sb->sb_csum; 1456 disk_csum = sb->sb_csum;
1419 sb->sb_csum = 0; 1457 sb->sb_csum = 0;
1420 newcsum = 0; 1458 newcsum = 0;
1421 for (; size >= 4; size -= 4) 1459 for (i=0; size>=4; size -= 4 )
1422 newcsum += le32_to_cpu(*isuper++); 1460 newcsum += le32_to_cpu(*isuper++);
1423 1461
1424 if (size == 2) 1462 if (size == 2)
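
calc_sb_1_csum() walks the superblock as little-endian 32-bit words (the `i` index reintroduced on the right-hand side is set but never read), adds a possible two-byte tail, and folds the 64-bit sum back into 32 bits. A simplified userspace sketch of the same idea; the real routine also zeroes sb_csum before summing and works on the actual struct layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sum a buffer as 32-bit words into a 64-bit accumulator, add a possible
 * 2-byte tail, then fold the high half back into the low half.  Assumes
 * little-endian data, and skips the sb_csum handling for brevity. */
static uint32_t csum_fold(const void *buf, size_t size)
{
	const unsigned char *p = buf;
	uint64_t sum = 0;

	for (; size >= 4; size -= 4, p += 4) {
		uint32_t w;
		memcpy(&w, p, 4);
		sum += w;
	}
	if (size == 2) {
		uint16_t h;
		memcpy(&h, p, 2);
		sum += h;
	}
	return (uint32_t)(sum & 0xffffffff) + (uint32_t)(sum >> 32);
}

int main(void)
{
	unsigned char sb[256] = { 1, 2, 3, 4 };
	printf("csum = 0x%08x\n", csum_fold(sb, sizeof(sb)));
	return 0;
}
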
@@ -1431,12 +1469,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1431 1469
1432static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 1470static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1433 int acknowledged); 1471 int acknowledged);
1434static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1472static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435{ 1473{
1436 struct mdp_superblock_1 *sb; 1474 struct mdp_superblock_1 *sb;
1437 int ret; 1475 int ret;
1438 sector_t sb_start; 1476 sector_t sb_start;
1439 sector_t sectors;
1440 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1477 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1441 int bmask; 1478 int bmask;
1442 1479
@@ -1491,18 +1528,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1491 bdevname(rdev->bdev,b)); 1528 bdevname(rdev->bdev,b));
1492 return -EINVAL; 1529 return -EINVAL;
1493 } 1530 }
1494 if (sb->pad0 ||
1495 sb->pad3[0] ||
1496 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1497 /* Some padding is non-zero, might be a new feature */
1498 return -EINVAL;
1499 1531
1500 rdev->preferred_minor = 0xffff; 1532 rdev->preferred_minor = 0xffff;
1501 rdev->data_offset = le64_to_cpu(sb->data_offset); 1533 rdev->data_offset = le64_to_cpu(sb->data_offset);
1502 rdev->new_data_offset = rdev->data_offset;
1503 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1504 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1505 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1506 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1534 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1507 1535
1508 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1536 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
@@ -1513,9 +1541,6 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1513 if (minor_version 1541 if (minor_version
1514 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1542 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1515 return -EINVAL; 1543 return -EINVAL;
1516 if (minor_version
1517 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1518 return -EINVAL;
1519 1544
1520 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1545 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1521 rdev->desc_nr = -1; 1546 rdev->desc_nr = -1;
@@ -1587,18 +1612,20 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1587 else 1612 else
1588 ret = 0; 1613 ret = 0;
1589 } 1614 }
1590 if (minor_version) { 1615 if (minor_version)
1591 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1616 rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
1592 sectors -= rdev->data_offset; 1617 le64_to_cpu(sb->data_offset);
1593 } else 1618 else
1594 sectors = rdev->sb_start; 1619 rdev->sectors = rdev->sb_start;
1595 if (sectors < le64_to_cpu(sb->data_size)) 1620 if (rdev->sectors < le64_to_cpu(sb->data_size))
1596 return -EINVAL; 1621 return -EINVAL;
1597 rdev->sectors = le64_to_cpu(sb->data_size); 1622 rdev->sectors = le64_to_cpu(sb->data_size);
1623 if (le64_to_cpu(sb->size) > rdev->sectors)
1624 return -EINVAL;
1598 return ret; 1625 return ret;
1599} 1626}
1600 1627
1601static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1628static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1602{ 1629{
1603 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1630 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1604 __u64 ev1 = le64_to_cpu(sb->events); 1631 __u64 ev1 = le64_to_cpu(sb->events);
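
The size logic in the hunk above distinguishes the 1.x sub-formats: minor versions 1 and 2 keep the superblock near the start of the device, so the data area is the device size minus data_offset, while 1.0 keeps it at the end, so the data area runs up to sb_start; either way it must cover the data_size recorded in the superblock. A rough sketch with hypothetical numbers, all in 512-byte sectors:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_size      = 7814037168ULL;  /* ~4 TB disk                 */
	uint64_t data_offset   = 2048;           /* v1.2: data starts at 1 MiB */
	uint64_t data_size     = 7814032384ULL;  /* size recorded in the sb    */
	uint64_t sb_start      = 8;              /* only relevant for 1.0      */
	int      minor_version = 2;

	uint64_t avail = minor_version ? dev_size - data_offset : sb_start;

	if (avail < data_size)
		printf("superblock claims more data than the device can hold\n");
	else
		printf("component size: %llu sectors\n", (unsigned long long)data_size);
	return 0;
}
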
@@ -1622,37 +1649,17 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1622 mddev->dev_sectors = le64_to_cpu(sb->size); 1649 mddev->dev_sectors = le64_to_cpu(sb->size);
1623 mddev->events = ev1; 1650 mddev->events = ev1;
1624 mddev->bitmap_info.offset = 0; 1651 mddev->bitmap_info.offset = 0;
1625 mddev->bitmap_info.space = 0;
1626 /* Default location for bitmap is 1K after superblock
1627 * using 3K - total of 4K
1628 */
1629 mddev->bitmap_info.default_offset = 1024 >> 9; 1652 mddev->bitmap_info.default_offset = 1024 >> 9;
1630 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1653
1631 mddev->reshape_backwards = 0;
1632
1633 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1654 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1634 memcpy(mddev->uuid, sb->set_uuid, 16); 1655 memcpy(mddev->uuid, sb->set_uuid, 16);
1635 1656
1636 mddev->max_disks = (4096-256)/2; 1657 mddev->max_disks = (4096-256)/2;
1637 1658
1638 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1659 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1639 mddev->bitmap_info.file == NULL) { 1660 mddev->bitmap_info.file == NULL )
1640 mddev->bitmap_info.offset = 1661 mddev->bitmap_info.offset =
1641 (__s32)le32_to_cpu(sb->bitmap_offset); 1662 (__s32)le32_to_cpu(sb->bitmap_offset);
1642 /* Metadata doesn't record how much space is available.
1643 * For 1.0, we assume we can use up to the superblock
1644 * if before, else to 4K beyond superblock.
1645 * For others, assume no change is possible.
1646 */
1647 if (mddev->minor_version > 0)
1648 mddev->bitmap_info.space = 0;
1649 else if (mddev->bitmap_info.offset > 0)
1650 mddev->bitmap_info.space =
1651 8 - mddev->bitmap_info.offset;
1652 else
1653 mddev->bitmap_info.space =
1654 -mddev->bitmap_info.offset;
1655 }
1656 1663
1657 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1664 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1658 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1665 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
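
The removed bitmap_info.space hunk encodes a conservative rule, since 1.x metadata never records how much room the bitmap may occupy: on 1.0 a bitmap placed before the superblock may grow up to the superblock and one placed after it may use the following 4 KiB, while 1.1/1.2 assume no spare room at all. A sketch of that rule (plain C, hypothetical offsets in sectors relative to the superblock):

#include <stdio.h>

static long bitmap_space(int minor_version, long offset)
{
	if (minor_version > 0)   /* 1.1/1.2: no safe assumption, leave 0     */
		return 0;
	if (offset > 0)          /* 1.0, bitmap after sb: up to 4 KiB past it */
		return 8 - offset;
	return -offset;          /* 1.0, bitmap before sb: up to the sb       */
}

int main(void)
{
	printf("1.0, offset +2 : %ld sectors\n", bitmap_space(0, 2));   /* 6  */
	printf("1.0, offset -16: %ld sectors\n", bitmap_space(0, -16)); /* 16 */
	printf("1.2, offset +8 : %ld sectors\n", bitmap_space(2, 8));   /* 0  */
	return 0;
}
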
@@ -1660,11 +1667,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1660 mddev->new_level = le32_to_cpu(sb->new_level); 1667 mddev->new_level = le32_to_cpu(sb->new_level);
1661 mddev->new_layout = le32_to_cpu(sb->new_layout); 1668 mddev->new_layout = le32_to_cpu(sb->new_layout);
1662 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1669 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1663 if (mddev->delta_disks < 0 ||
1664 (mddev->delta_disks == 0 &&
1665 (le32_to_cpu(sb->feature_map)
1666 & MD_FEATURE_RESHAPE_BACKWARDS)))
1667 mddev->reshape_backwards = 1;
1668 } else { 1670 } else {
1669 mddev->reshape_position = MaxSector; 1671 mddev->reshape_position = MaxSector;
1670 mddev->delta_disks = 0; 1672 mddev->delta_disks = 0;
@@ -1718,18 +1720,16 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1718 } 1720 }
1719 if (sb->devflags & WriteMostly1) 1721 if (sb->devflags & WriteMostly1)
1720 set_bit(WriteMostly, &rdev->flags); 1722 set_bit(WriteMostly, &rdev->flags);
1721 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1722 set_bit(Replacement, &rdev->flags);
1723 } else /* MULTIPATH are always insync */ 1723 } else /* MULTIPATH are always insync */
1724 set_bit(In_sync, &rdev->flags); 1724 set_bit(In_sync, &rdev->flags);
1725 1725
1726 return 0; 1726 return 0;
1727} 1727}
1728 1728
1729static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1729static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1730{ 1730{
1731 struct mdp_superblock_1 *sb; 1731 struct mdp_superblock_1 *sb;
1732 struct md_rdev *rdev2; 1732 mdk_rdev_t *rdev2;
1733 int max_dev, i; 1733 int max_dev, i;
1734 /* make rdev->sb match mddev and rdev data. */ 1734 /* make rdev->sb match mddev and rdev data. */
1735 1735
@@ -1738,6 +1738,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1738 sb->feature_map = 0; 1738 sb->feature_map = 0;
1739 sb->pad0 = 0; 1739 sb->pad0 = 0;
1740 sb->recovery_offset = cpu_to_le64(0); 1740 sb->recovery_offset = cpu_to_le64(0);
1741 memset(sb->pad1, 0, sizeof(sb->pad1));
1741 memset(sb->pad3, 0, sizeof(sb->pad3)); 1742 memset(sb->pad3, 0, sizeof(sb->pad3));
1742 1743
1743 sb->utime = cpu_to_le64((__u64)mddev->utime); 1744 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1759,8 +1760,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1759 sb->devflags |= WriteMostly1; 1760 sb->devflags |= WriteMostly1;
1760 else 1761 else
1761 sb->devflags &= ~WriteMostly1; 1762 sb->devflags &= ~WriteMostly1;
1762 sb->data_offset = cpu_to_le64(rdev->data_offset);
1763 sb->data_size = cpu_to_le64(rdev->sectors);
1764 1763
1765 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1764 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1766 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1765 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
@@ -1774,9 +1773,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1774 sb->recovery_offset = 1773 sb->recovery_offset =
1775 cpu_to_le64(rdev->recovery_offset); 1774 cpu_to_le64(rdev->recovery_offset);
1776 } 1775 }
1777 if (test_bit(Replacement, &rdev->flags))
1778 sb->feature_map |=
1779 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1780 1776
1781 if (mddev->reshape_position != MaxSector) { 1777 if (mddev->reshape_position != MaxSector) {
1782 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1778 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -1785,16 +1781,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1785 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1781 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1786 sb->new_level = cpu_to_le32(mddev->new_level); 1782 sb->new_level = cpu_to_le32(mddev->new_level);
1787 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1783 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1788 if (mddev->delta_disks == 0 &&
1789 mddev->reshape_backwards)
1790 sb->feature_map
1791 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1792 if (rdev->new_data_offset != rdev->data_offset) {
1793 sb->feature_map
1794 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1795 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1796 - rdev->data_offset));
1797 }
1798 } 1784 }
1799 1785
1800 if (rdev->badblocks.count == 0) 1786 if (rdev->badblocks.count == 0)
@@ -1816,23 +1802,23 @@ retry:
1816 memset(bbp, 0xff, PAGE_SIZE); 1802 memset(bbp, 0xff, PAGE_SIZE);
1817 1803
1818 for (i = 0 ; i < bb->count ; i++) { 1804 for (i = 0 ; i < bb->count ; i++) {
1819 u64 internal_bb = p[i]; 1805 u64 internal_bb = *p++;
1820 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1806 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1821 | BB_LEN(internal_bb)); 1807 | BB_LEN(internal_bb));
1822 bbp[i] = cpu_to_le64(store_bb); 1808 *bbp++ = cpu_to_le64(store_bb);
1823 } 1809 }
1824 bb->changed = 0;
1825 if (read_seqretry(&bb->lock, seq)) 1810 if (read_seqretry(&bb->lock, seq))
1826 goto retry; 1811 goto retry;
1827 1812
1828 bb->sector = (rdev->sb_start + 1813 bb->sector = (rdev->sb_start +
1829 (int)le32_to_cpu(sb->bblog_offset)); 1814 (int)le32_to_cpu(sb->bblog_offset));
1830 bb->size = le16_to_cpu(sb->bblog_size); 1815 bb->size = le16_to_cpu(sb->bblog_size);
1816 bb->changed = 0;
1831 } 1817 }
1832 } 1818 }
1833 1819
1834 max_dev = 0; 1820 max_dev = 0;
1835 rdev_for_each(rdev2, mddev) 1821 list_for_each_entry(rdev2, &mddev->disks, same_set)
1836 if (rdev2->desc_nr+1 > max_dev) 1822 if (rdev2->desc_nr+1 > max_dev)
1837 max_dev = rdev2->desc_nr+1; 1823 max_dev = rdev2->desc_nr+1;
1838 1824
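
Each record in the bad-block log written above is a single 64-bit word: the start sector shifted up by 10 bits, with the length (at most 512 sectors in this scheme, so it fits in the low 10 bits) packed beneath it, stored little-endian on disk. A sketch of the packing and unpacking:

#include <stdio.h>
#include <stdint.h>

static uint64_t bb_pack(uint64_t sector, unsigned len)
{
	return (sector << 10) | len;
}

static void bb_unpack(uint64_t entry, uint64_t *sector, unsigned *len)
{
	*sector = entry >> 10;
	*len    = (unsigned)(entry & 0x3ff);
}

int main(void)
{
	uint64_t sector;
	unsigned len;
	uint64_t entry = bb_pack(123456789ULL, 8);

	bb_unpack(entry, &sector, &len);
	printf("entry 0x%016llx -> sector %llu, %u sectors\n",
	       (unsigned long long)entry, (unsigned long long)sector, len);
	return 0;
}
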
@@ -1849,7 +1835,7 @@ retry:
1849 for (i=0; i<max_dev;i++) 1835 for (i=0; i<max_dev;i++)
1850 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1836 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1851 1837
1852 rdev_for_each(rdev2, mddev) { 1838 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1853 i = rdev2->desc_nr; 1839 i = rdev2->desc_nr;
1854 if (test_bit(Faulty, &rdev2->flags)) 1840 if (test_bit(Faulty, &rdev2->flags))
1855 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1841 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1865,14 +1851,12 @@ retry:
1865} 1851}
1866 1852
1867static unsigned long long 1853static unsigned long long
1868super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1854super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1869{ 1855{
1870 struct mdp_superblock_1 *sb; 1856 struct mdp_superblock_1 *sb;
1871 sector_t max_sectors; 1857 sector_t max_sectors;
1872 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1858 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1873 return 0; /* component must fit device */ 1859 return 0; /* component must fit device */
1874 if (rdev->data_offset != rdev->new_data_offset)
1875 return 0; /* too confusing */
1876 if (rdev->sb_start < rdev->data_offset) { 1860 if (rdev->sb_start < rdev->data_offset) {
1877 /* minor versions 1 and 2; superblock before data */ 1861 /* minor versions 1 and 2; superblock before data */
1878 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1862 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
@@ -1900,40 +1884,6 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1900 rdev->sb_page); 1884 rdev->sb_page);
1901 md_super_wait(rdev->mddev); 1885 md_super_wait(rdev->mddev);
1902 return num_sectors; 1886 return num_sectors;
1903
1904}
1905
1906static int
1907super_1_allow_new_offset(struct md_rdev *rdev,
1908 unsigned long long new_offset)
1909{
1910 /* All necessary checks on new >= old have been done */
1911 struct bitmap *bitmap;
1912 if (new_offset >= rdev->data_offset)
1913 return 1;
1914
1915 /* with 1.0 metadata, there is no metadata to tread on
1916 * so we can always move back */
1917 if (rdev->mddev->minor_version == 0)
1918 return 1;
1919
1920 /* otherwise we must be sure not to step on
1921 * any metadata, so stay:
1922 * 36K beyond start of superblock
1923 * beyond end of badblocks
1924 * beyond write-intent bitmap
1925 */
1926 if (rdev->sb_start + (32+4)*2 > new_offset)
1927 return 0;
1928 bitmap = rdev->mddev->bitmap;
1929 if (bitmap && !rdev->mddev->bitmap_info.file &&
1930 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1931 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1932 return 0;
1933 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1934 return 0;
1935
1936 return 1;
1937} 1887}
1938 1888
1939static struct super_type super_types[] = { 1889static struct super_type super_types[] = {
@@ -1944,7 +1894,6 @@ static struct super_type super_types[] = {
1944 .validate_super = super_90_validate, 1894 .validate_super = super_90_validate,
1945 .sync_super = super_90_sync, 1895 .sync_super = super_90_sync,
1946 .rdev_size_change = super_90_rdev_size_change, 1896 .rdev_size_change = super_90_rdev_size_change,
1947 .allow_new_offset = super_90_allow_new_offset,
1948 }, 1897 },
1949 [1] = { 1898 [1] = {
1950 .name = "md-1", 1899 .name = "md-1",
@@ -1953,11 +1902,10 @@ static struct super_type super_types[] = {
1953 .validate_super = super_1_validate, 1902 .validate_super = super_1_validate,
1954 .sync_super = super_1_sync, 1903 .sync_super = super_1_sync,
1955 .rdev_size_change = super_1_rdev_size_change, 1904 .rdev_size_change = super_1_rdev_size_change,
1956 .allow_new_offset = super_1_allow_new_offset,
1957 }, 1905 },
1958}; 1906};
1959 1907
1960static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1908static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
1961{ 1909{
1962 if (mddev->sync_super) { 1910 if (mddev->sync_super) {
1963 mddev->sync_super(mddev, rdev); 1911 mddev->sync_super(mddev, rdev);
@@ -1969,9 +1917,9 @@ static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1969 super_types[mddev->major_version].sync_super(mddev, rdev); 1917 super_types[mddev->major_version].sync_super(mddev, rdev);
1970} 1918}
1971 1919
1972static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1920static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1973{ 1921{
1974 struct md_rdev *rdev, *rdev2; 1922 mdk_rdev_t *rdev, *rdev2;
1975 1923
1976 rcu_read_lock(); 1924 rcu_read_lock();
1977 rdev_for_each_rcu(rdev, mddev1) 1925 rdev_for_each_rcu(rdev, mddev1)
@@ -1994,15 +1942,15 @@ static LIST_HEAD(pending_raid_disks);
1994 * from the array. It only succeeds if all working and active component devices 1942 * from the array. It only succeeds if all working and active component devices
1995 * are integrity capable with matching profiles. 1943 * are integrity capable with matching profiles.
1996 */ 1944 */
1997int md_integrity_register(struct mddev *mddev) 1945int md_integrity_register(mddev_t *mddev)
1998{ 1946{
1999 struct md_rdev *rdev, *reference = NULL; 1947 mdk_rdev_t *rdev, *reference = NULL;
2000 1948
2001 if (list_empty(&mddev->disks)) 1949 if (list_empty(&mddev->disks))
2002 return 0; /* nothing to do */ 1950 return 0; /* nothing to do */
2003 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1951 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2004 return 0; /* shouldn't register, or already is */ 1952 return 0; /* shouldn't register, or already is */
2005 rdev_for_each(rdev, mddev) { 1953 list_for_each_entry(rdev, &mddev->disks, same_set) {
2006 /* skip spares and non-functional disks */ 1954 /* skip spares and non-functional disks */
2007 if (test_bit(Faulty, &rdev->flags)) 1955 if (test_bit(Faulty, &rdev->flags))
2008 continue; 1956 continue;
@@ -2041,16 +1989,10 @@ int md_integrity_register(struct mddev *mddev)
2041EXPORT_SYMBOL(md_integrity_register); 1989EXPORT_SYMBOL(md_integrity_register);
2042 1990
2043/* Disable data integrity if non-capable/non-matching disk is being added */ 1991/* Disable data integrity if non-capable/non-matching disk is being added */
2044void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 1992void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
2045{ 1993{
2046 struct blk_integrity *bi_rdev; 1994 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
2047 struct blk_integrity *bi_mddev; 1995 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
2048
2049 if (!mddev->gendisk)
2050 return;
2051
2052 bi_rdev = bdev_get_integrity(rdev->bdev);
2053 bi_mddev = blk_get_integrity(mddev->gendisk);
2054 1996
2055 if (!bi_mddev) /* nothing to do */ 1997 if (!bi_mddev) /* nothing to do */
2056 return; 1998 return;
@@ -2064,7 +2006,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2064} 2006}
2065EXPORT_SYMBOL(md_integrity_add_rdev); 2007EXPORT_SYMBOL(md_integrity_add_rdev);
2066 2008
2067static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) 2009static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
2068{ 2010{
2069 char b[BDEVNAME_SIZE]; 2011 char b[BDEVNAME_SIZE];
2070 struct kobject *ko; 2012 struct kobject *ko;
@@ -2144,12 +2086,12 @@ static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
2144 2086
2145static void md_delayed_delete(struct work_struct *ws) 2087static void md_delayed_delete(struct work_struct *ws)
2146{ 2088{
2147 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2089 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
2148 kobject_del(&rdev->kobj); 2090 kobject_del(&rdev->kobj);
2149 kobject_put(&rdev->kobj); 2091 kobject_put(&rdev->kobj);
2150} 2092}
2151 2093
2152static void unbind_rdev_from_array(struct md_rdev * rdev) 2094static void unbind_rdev_from_array(mdk_rdev_t * rdev)
2153{ 2095{
2154 char b[BDEVNAME_SIZE]; 2096 char b[BDEVNAME_SIZE];
2155 if (!rdev->mddev) { 2097 if (!rdev->mddev) {
@@ -2163,7 +2105,9 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
2163 sysfs_remove_link(&rdev->kobj, "block"); 2105 sysfs_remove_link(&rdev->kobj, "block");
2164 sysfs_put(rdev->sysfs_state); 2106 sysfs_put(rdev->sysfs_state);
2165 rdev->sysfs_state = NULL; 2107 rdev->sysfs_state = NULL;
2108 kfree(rdev->badblocks.page);
2166 rdev->badblocks.count = 0; 2109 rdev->badblocks.count = 0;
2110 rdev->badblocks.page = NULL;
2167 /* We need to delay this, otherwise we can deadlock when 2111 /* We need to delay this, otherwise we can deadlock when
2168 * writing to 'remove' to "dev/state". We also need 2112 * writing to 'remove' to "dev/state". We also need
2169 * to delay it due to rcu usage. 2113 * to delay it due to rcu usage.
@@ -2179,14 +2123,14 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
2179 * otherwise reused by a RAID array (or any other kernel 2123 * otherwise reused by a RAID array (or any other kernel
2180 * subsystem), by bd_claiming the device. 2124 * subsystem), by bd_claiming the device.
2181 */ 2125 */
2182static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2126static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
2183{ 2127{
2184 int err = 0; 2128 int err = 0;
2185 struct block_device *bdev; 2129 struct block_device *bdev;
2186 char b[BDEVNAME_SIZE]; 2130 char b[BDEVNAME_SIZE];
2187 2131
2188 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2132 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2189 shared ? (struct md_rdev *)lock_rdev : rdev); 2133 shared ? (mdk_rdev_t *)lock_rdev : rdev);
2190 if (IS_ERR(bdev)) { 2134 if (IS_ERR(bdev)) {
2191 printk(KERN_ERR "md: could not open %s.\n", 2135 printk(KERN_ERR "md: could not open %s.\n",
2192 __bdevname(dev, b)); 2136 __bdevname(dev, b));
@@ -2196,7 +2140,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2196 return err; 2140 return err;
2197} 2141}
2198 2142
2199static void unlock_rdev(struct md_rdev *rdev) 2143static void unlock_rdev(mdk_rdev_t *rdev)
2200{ 2144{
2201 struct block_device *bdev = rdev->bdev; 2145 struct block_device *bdev = rdev->bdev;
2202 rdev->bdev = NULL; 2146 rdev->bdev = NULL;
@@ -2207,14 +2151,14 @@ static void unlock_rdev(struct md_rdev *rdev)
2207 2151
2208void md_autodetect_dev(dev_t dev); 2152void md_autodetect_dev(dev_t dev);
2209 2153
2210static void export_rdev(struct md_rdev * rdev) 2154static void export_rdev(mdk_rdev_t * rdev)
2211{ 2155{
2212 char b[BDEVNAME_SIZE]; 2156 char b[BDEVNAME_SIZE];
2213 printk(KERN_INFO "md: export_rdev(%s)\n", 2157 printk(KERN_INFO "md: export_rdev(%s)\n",
2214 bdevname(rdev->bdev,b)); 2158 bdevname(rdev->bdev,b));
2215 if (rdev->mddev) 2159 if (rdev->mddev)
2216 MD_BUG(); 2160 MD_BUG();
2217 md_rdev_clear(rdev); 2161 free_disk_sb(rdev);
2218#ifndef MODULE 2162#ifndef MODULE
2219 if (test_bit(AutoDetected, &rdev->flags)) 2163 if (test_bit(AutoDetected, &rdev->flags))
2220 md_autodetect_dev(rdev->bdev->bd_dev); 2164 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -2223,17 +2167,17 @@ static void export_rdev(struct md_rdev * rdev)
2223 kobject_put(&rdev->kobj); 2167 kobject_put(&rdev->kobj);
2224} 2168}
2225 2169
2226static void kick_rdev_from_array(struct md_rdev * rdev) 2170static void kick_rdev_from_array(mdk_rdev_t * rdev)
2227{ 2171{
2228 unbind_rdev_from_array(rdev); 2172 unbind_rdev_from_array(rdev);
2229 export_rdev(rdev); 2173 export_rdev(rdev);
2230} 2174}
2231 2175
2232static void export_array(struct mddev *mddev) 2176static void export_array(mddev_t *mddev)
2233{ 2177{
2234 struct md_rdev *rdev, *tmp; 2178 mdk_rdev_t *rdev, *tmp;
2235 2179
2236 rdev_for_each_safe(rdev, tmp, mddev) { 2180 rdev_for_each(rdev, tmp, mddev) {
2237 if (!rdev->mddev) { 2181 if (!rdev->mddev) {
2238 MD_BUG(); 2182 MD_BUG();
2239 continue; 2183 continue;
@@ -2327,7 +2271,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
2327 ); 2271 );
2328} 2272}
2329 2273
2330static void print_rdev(struct md_rdev *rdev, int major_version) 2274static void print_rdev(mdk_rdev_t *rdev, int major_version)
2331{ 2275{
2332 char b[BDEVNAME_SIZE]; 2276 char b[BDEVNAME_SIZE];
2333 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 2277 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
@@ -2351,8 +2295,8 @@ static void print_rdev(struct md_rdev *rdev, int major_version)
2351static void md_print_devices(void) 2295static void md_print_devices(void)
2352{ 2296{
2353 struct list_head *tmp; 2297 struct list_head *tmp;
2354 struct md_rdev *rdev; 2298 mdk_rdev_t *rdev;
2355 struct mddev *mddev; 2299 mddev_t *mddev;
2356 char b[BDEVNAME_SIZE]; 2300 char b[BDEVNAME_SIZE];
2357 2301
2358 printk("\n"); 2302 printk("\n");
@@ -2365,11 +2309,11 @@ static void md_print_devices(void)
2365 bitmap_print_sb(mddev->bitmap); 2309 bitmap_print_sb(mddev->bitmap);
2366 else 2310 else
2367 printk("%s: ", mdname(mddev)); 2311 printk("%s: ", mdname(mddev));
2368 rdev_for_each(rdev, mddev) 2312 list_for_each_entry(rdev, &mddev->disks, same_set)
2369 printk("<%s>", bdevname(rdev->bdev,b)); 2313 printk("<%s>", bdevname(rdev->bdev,b));
2370 printk("\n"); 2314 printk("\n");
2371 2315
2372 rdev_for_each(rdev, mddev) 2316 list_for_each_entry(rdev, &mddev->disks, same_set)
2373 print_rdev(rdev, mddev->major_version); 2317 print_rdev(rdev, mddev->major_version);
2374 } 2318 }
2375 printk("md: **********************************\n"); 2319 printk("md: **********************************\n");
@@ -2377,7 +2321,7 @@ static void md_print_devices(void)
2377} 2321}
2378 2322
2379 2323
2380static void sync_sbs(struct mddev * mddev, int nospares) 2324static void sync_sbs(mddev_t * mddev, int nospares)
2381{ 2325{
2382 /* Update each superblock (in-memory image), but 2326 /* Update each superblock (in-memory image), but
2383 * if we are allowed to, skip spares which already 2327 * if we are allowed to, skip spares which already
@@ -2385,8 +2329,8 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2385 * (which would mean they aren't being marked as dirty 2329 * (which would mean they aren't being marked as dirty
2386 * with the rest of the array) 2330 * with the rest of the array)
2387 */ 2331 */
2388 struct md_rdev *rdev; 2332 mdk_rdev_t *rdev;
2389 rdev_for_each(rdev, mddev) { 2333 list_for_each_entry(rdev, &mddev->disks, same_set) {
2390 if (rdev->sb_events == mddev->events || 2334 if (rdev->sb_events == mddev->events ||
2391 (nospares && 2335 (nospares &&
2392 rdev->raid_disk < 0 && 2336 rdev->raid_disk < 0 &&
@@ -2400,16 +2344,16 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2400 } 2344 }
2401} 2345}
2402 2346
2403static void md_update_sb(struct mddev * mddev, int force_change) 2347static void md_update_sb(mddev_t * mddev, int force_change)
2404{ 2348{
2405 struct md_rdev *rdev; 2349 mdk_rdev_t *rdev;
2406 int sync_req; 2350 int sync_req;
2407 int nospares = 0; 2351 int nospares = 0;
2408 int any_badblocks_changed = 0; 2352 int any_badblocks_changed = 0;
2409 2353
2410repeat: 2354repeat:
2411 /* First make sure individual recovery_offsets are correct */ 2355 /* First make sure individual recovery_offsets are correct */
2412 rdev_for_each(rdev, mddev) { 2356 list_for_each_entry(rdev, &mddev->disks, same_set) {
2413 if (rdev->raid_disk >= 0 && 2357 if (rdev->raid_disk >= 0 &&
2414 mddev->delta_disks >= 0 && 2358 mddev->delta_disks >= 0 &&
2415 !test_bit(In_sync, &rdev->flags) && 2359 !test_bit(In_sync, &rdev->flags) &&
@@ -2422,9 +2366,8 @@ repeat:
2422 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2366 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2423 if (!mddev->external) { 2367 if (!mddev->external) {
2424 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2368 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2425 rdev_for_each(rdev, mddev) { 2369 list_for_each_entry(rdev, &mddev->disks, same_set) {
2426 if (rdev->badblocks.changed) { 2370 if (rdev->badblocks.changed) {
2427 rdev->badblocks.changed = 0;
2428 md_ack_all_badblocks(&rdev->badblocks); 2371 md_ack_all_badblocks(&rdev->badblocks);
2429 md_error(mddev, rdev); 2372 md_error(mddev, rdev);
2430 } 2373 }
@@ -2489,7 +2432,7 @@ repeat:
2489 mddev->events --; 2432 mddev->events --;
2490 } 2433 }
2491 2434
2492 rdev_for_each(rdev, mddev) { 2435 list_for_each_entry(rdev, &mddev->disks, same_set) {
2493 if (rdev->badblocks.changed) 2436 if (rdev->badblocks.changed)
2494 any_badblocks_changed++; 2437 any_badblocks_changed++;
2495 if (test_bit(Faulty, &rdev->flags)) 2438 if (test_bit(Faulty, &rdev->flags))
@@ -2499,24 +2442,27 @@ repeat:
2499 sync_sbs(mddev, nospares); 2442 sync_sbs(mddev, nospares);
2500 spin_unlock_irq(&mddev->write_lock); 2443 spin_unlock_irq(&mddev->write_lock);
2501 2444
2502 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2445 dprintk(KERN_INFO
2503 mdname(mddev), mddev->in_sync); 2446 "md: updating %s RAID superblock on device (in sync %d)\n",
2447 mdname(mddev),mddev->in_sync);
2504 2448
2505 bitmap_update_sb(mddev->bitmap); 2449 bitmap_update_sb(mddev->bitmap);
2506 rdev_for_each(rdev, mddev) { 2450 list_for_each_entry(rdev, &mddev->disks, same_set) {
2507 char b[BDEVNAME_SIZE]; 2451 char b[BDEVNAME_SIZE];
2508 2452 dprintk(KERN_INFO "md: ");
2509 if (rdev->sb_loaded != 1) 2453 if (rdev->sb_loaded != 1)
2510 continue; /* no noise on spare devices */ 2454 continue; /* no noise on spare devices */
2455 if (test_bit(Faulty, &rdev->flags))
2456 dprintk("(skipping faulty ");
2511 2457
2512 if (!test_bit(Faulty, &rdev->flags) && 2458 dprintk("%s ", bdevname(rdev->bdev,b));
2513 rdev->saved_raid_disk == -1) { 2459 if (!test_bit(Faulty, &rdev->flags)) {
2514 md_super_write(mddev,rdev, 2460 md_super_write(mddev,rdev,
2515 rdev->sb_start, rdev->sb_size, 2461 rdev->sb_start, rdev->sb_size,
2516 rdev->sb_page); 2462 rdev->sb_page);
2517 pr_debug("md: (write) %s's sb offset: %llu\n", 2463 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2518 bdevname(rdev->bdev, b), 2464 bdevname(rdev->bdev,b),
2519 (unsigned long long)rdev->sb_start); 2465 (unsigned long long)rdev->sb_start);
2520 rdev->sb_events = mddev->events; 2466 rdev->sb_events = mddev->events;
2521 if (rdev->badblocks.size) { 2467 if (rdev->badblocks.size) {
2522 md_super_write(mddev, rdev, 2468 md_super_write(mddev, rdev,
@@ -2526,12 +2472,8 @@ repeat:
2526 rdev->badblocks.size = 0; 2472 rdev->badblocks.size = 0;
2527 } 2473 }
2528 2474
2529 } else if (test_bit(Faulty, &rdev->flags)) 2475 } else
2530 pr_debug("md: %s (skipping faulty)\n", 2476 dprintk(")\n");
2531 bdevname(rdev->bdev, b));
2532 else
2533 pr_debug("(skipping incremental s/r ");
2534
2535 if (mddev->level == LEVEL_MULTIPATH) 2477 if (mddev->level == LEVEL_MULTIPATH)
2536 /* only need to write one superblock... */ 2478 /* only need to write one superblock... */
2537 break; 2479 break;
@@ -2552,7 +2494,7 @@ repeat:
2552 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2553 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2554 2496
2555 rdev_for_each(rdev, mddev) { 2497 list_for_each_entry(rdev, &mddev->disks, same_set) {
2556 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2557 clear_bit(Blocked, &rdev->flags); 2499 clear_bit(Blocked, &rdev->flags);
2558 2500
@@ -2585,12 +2527,12 @@ static int cmd_match(const char *cmd, const char *str)
2585 2527
2586struct rdev_sysfs_entry { 2528struct rdev_sysfs_entry {
2587 struct attribute attr; 2529 struct attribute attr;
2588 ssize_t (*show)(struct md_rdev *, char *); 2530 ssize_t (*show)(mdk_rdev_t *, char *);
2589 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2531 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2590}; 2532};
2591 2533
2592static ssize_t 2534static ssize_t
2593state_show(struct md_rdev *rdev, char *page) 2535state_show(mdk_rdev_t *rdev, char *page)
2594{ 2536{
2595 char *sep = ""; 2537 char *sep = "";
2596 size_t len = 0; 2538 size_t len = 0;
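
Every rdev sysfs file is described by a struct rdev_sysfs_entry pairing an attribute with show/store callbacks, and the show side builds a comma-separated flag list into the page buffer it is handed. A userspace sketch of that formatting style, with hypothetical flag bits standing in for rdev->flags:

#include <stdio.h>

#define F_FAULTY       (1 << 0)
#define F_IN_SYNC      (1 << 1)
#define F_WRITE_MOSTLY (1 << 2)

static int state_show(unsigned flags, char *page, int cap)
{
	const char *sep = "";
	int len = 0;

	if (flags & F_FAULTY)       { len += snprintf(page + len, cap - len, "%sfaulty", sep);       sep = ","; }
	if (flags & F_IN_SYNC)      { len += snprintf(page + len, cap - len, "%sin_sync", sep);      sep = ","; }
	if (flags & F_WRITE_MOSTLY) { len += snprintf(page + len, cap - len, "%swrite_mostly", sep); sep = ","; }

	return len + snprintf(page + len, cap - len, "\n");
}

int main(void)
{
	char page[64];

	state_show(F_IN_SYNC | F_WRITE_MOSTLY, page, sizeof(page));
	fputs(page, stdout);    /* prints "in_sync,write_mostly" */
	return 0;
}
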
@@ -2609,8 +2551,7 @@ state_show(struct md_rdev *rdev, char *page)
2609 sep = ","; 2551 sep = ",";
2610 } 2552 }
2611 if (test_bit(Blocked, &rdev->flags) || 2553 if (test_bit(Blocked, &rdev->flags) ||
2612 (rdev->badblocks.unacked_exist 2554 rdev->badblocks.unacked_exist) {
2613 && !test_bit(Faulty, &rdev->flags))) {
2614 len += sprintf(page+len, "%sblocked", sep); 2555 len += sprintf(page+len, "%sblocked", sep);
2615 sep = ","; 2556 sep = ",";
2616 } 2557 }
@@ -2623,20 +2564,11 @@ state_show(struct md_rdev *rdev, char *page)
2623 len += sprintf(page+len, "%swrite_error", sep); 2564 len += sprintf(page+len, "%swrite_error", sep);
2624 sep = ","; 2565 sep = ",";
2625 } 2566 }
2626 if (test_bit(WantReplacement, &rdev->flags)) {
2627 len += sprintf(page+len, "%swant_replacement", sep);
2628 sep = ",";
2629 }
2630 if (test_bit(Replacement, &rdev->flags)) {
2631 len += sprintf(page+len, "%sreplacement", sep);
2632 sep = ",";
2633 }
2634
2635 return len+sprintf(page+len, "\n"); 2567 return len+sprintf(page+len, "\n");
2636} 2568}
2637 2569
2638static ssize_t 2570static ssize_t
2639state_store(struct md_rdev *rdev, const char *buf, size_t len) 2571state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2640{ 2572{
2641 /* can write 2573 /* can write
2642 * faulty - simulates an error 2574 * faulty - simulates an error
@@ -2660,7 +2592,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2660 if (rdev->raid_disk >= 0) 2592 if (rdev->raid_disk >= 0)
2661 err = -EBUSY; 2593 err = -EBUSY;
2662 else { 2594 else {
2663 struct mddev *mddev = rdev->mddev; 2595 mddev_t *mddev = rdev->mddev;
2664 kick_rdev_from_array(rdev); 2596 kick_rdev_from_array(rdev);
2665 if (mddev->pers) 2597 if (mddev->pers)
2666 md_update_sb(mddev, 1); 2598 md_update_sb(mddev, 1);
@@ -2700,42 +2632,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2700 } else if (cmd_match(buf, "-write_error")) { 2632 } else if (cmd_match(buf, "-write_error")) {
2701 clear_bit(WriteErrorSeen, &rdev->flags); 2633 clear_bit(WriteErrorSeen, &rdev->flags);
2702 err = 0; 2634 err = 0;
2703 } else if (cmd_match(buf, "want_replacement")) {
2704 /* Any non-spare device that is not a replacement can
2705 * become want_replacement at any time, but we then need to
2706 * check if recovery is needed.
2707 */
2708 if (rdev->raid_disk >= 0 &&
2709 !test_bit(Replacement, &rdev->flags))
2710 set_bit(WantReplacement, &rdev->flags);
2711 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2712 md_wakeup_thread(rdev->mddev->thread);
2713 err = 0;
2714 } else if (cmd_match(buf, "-want_replacement")) {
2715 /* Clearing 'want_replacement' is always allowed.
2716 * Once replacements starts it is too late though.
2717 */
2718 err = 0;
2719 clear_bit(WantReplacement, &rdev->flags);
2720 } else if (cmd_match(buf, "replacement")) {
2721 /* Can only set a device as a replacement when array has not
2722 * yet been started. Once running, replacement is automatic
2723 * from spares, or by assigning 'slot'.
2724 */
2725 if (rdev->mddev->pers)
2726 err = -EBUSY;
2727 else {
2728 set_bit(Replacement, &rdev->flags);
2729 err = 0;
2730 }
2731 } else if (cmd_match(buf, "-replacement")) {
2732 /* Similarly, can only clear Replacement before start */
2733 if (rdev->mddev->pers)
2734 err = -EBUSY;
2735 else {
2736 clear_bit(Replacement, &rdev->flags);
2737 err = 0;
2738 }
2739 } 2635 }
2740 if (!err) 2636 if (!err)
2741 sysfs_notify_dirent_safe(rdev->sysfs_state); 2637 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2745,13 +2641,13 @@ static struct rdev_sysfs_entry rdev_state =
2745__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2641__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2746 2642
2747static ssize_t 2643static ssize_t
2748errors_show(struct md_rdev *rdev, char *page) 2644errors_show(mdk_rdev_t *rdev, char *page)
2749{ 2645{
2750 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2646 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2751} 2647}
2752 2648
2753static ssize_t 2649static ssize_t
2754errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2650errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2755{ 2651{
2756 char *e; 2652 char *e;
2757 unsigned long n = simple_strtoul(buf, &e, 10); 2653 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -2765,7 +2661,7 @@ static struct rdev_sysfs_entry rdev_errors =
2765__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2661__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2766 2662
2767static ssize_t 2663static ssize_t
2768slot_show(struct md_rdev *rdev, char *page) 2664slot_show(mdk_rdev_t *rdev, char *page)
2769{ 2665{
2770 if (rdev->raid_disk < 0) 2666 if (rdev->raid_disk < 0)
2771 return sprintf(page, "none\n"); 2667 return sprintf(page, "none\n");
@@ -2774,7 +2670,7 @@ slot_show(struct md_rdev *rdev, char *page)
2774} 2670}
2775 2671
2776static ssize_t 2672static ssize_t
2777slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2673slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2778{ 2674{
2779 char *e; 2675 char *e;
2780 int err; 2676 int err;
@@ -2797,7 +2693,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2797 if (rdev->mddev->pers->hot_remove_disk == NULL) 2693 if (rdev->mddev->pers->hot_remove_disk == NULL)
2798 return -EINVAL; 2694 return -EINVAL;
2799 err = rdev->mddev->pers-> 2695 err = rdev->mddev->pers->
2800 hot_remove_disk(rdev->mddev, rdev); 2696 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2801 if (err) 2697 if (err)
2802 return err; 2698 return err;
2803 sysfs_unlink_rdev(rdev->mddev, rdev); 2699 sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2805,6 +2701,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2805 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2701 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2806 md_wakeup_thread(rdev->mddev->thread); 2702 md_wakeup_thread(rdev->mddev->thread);
2807 } else if (rdev->mddev->pers) { 2703 } else if (rdev->mddev->pers) {
2704 mdk_rdev_t *rdev2;
2808 /* Activating a spare .. or possibly reactivating 2705 /* Activating a spare .. or possibly reactivating
2809 * if we ever get bitmaps working here. 2706 * if we ever get bitmaps working here.
2810 */ 2707 */
@@ -2818,6 +2715,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2818 if (rdev->mddev->pers->hot_add_disk == NULL) 2715 if (rdev->mddev->pers->hot_add_disk == NULL)
2819 return -EINVAL; 2716 return -EINVAL;
2820 2717
2718 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2719 if (rdev2->raid_disk == slot)
2720 return -EEXIST;
2721
2821 if (slot >= rdev->mddev->raid_disks && 2722 if (slot >= rdev->mddev->raid_disks &&
2822 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2723 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2823 return -ENOSPC; 2724 return -ENOSPC;
@@ -2827,7 +2728,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2827 rdev->saved_raid_disk = slot; 2728 rdev->saved_raid_disk = slot;
2828 else 2729 else
2829 rdev->saved_raid_disk = -1; 2730 rdev->saved_raid_disk = -1;
2830 clear_bit(In_sync, &rdev->flags);
2831 err = rdev->mddev->pers-> 2731 err = rdev->mddev->pers->
2832 hot_add_disk(rdev->mddev, rdev); 2732 hot_add_disk(rdev->mddev, rdev);
2833 if (err) { 2733 if (err) {
@@ -2857,16 +2757,17 @@ static struct rdev_sysfs_entry rdev_slot =
2857__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2757__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2858 2758
2859static ssize_t 2759static ssize_t
2860offset_show(struct md_rdev *rdev, char *page) 2760offset_show(mdk_rdev_t *rdev, char *page)
2861{ 2761{
2862 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2762 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2863} 2763}
2864 2764
2865static ssize_t 2765static ssize_t
2866offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2766offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2867{ 2767{
2868 unsigned long long offset; 2768 char *e;
2869 if (strict_strtoull(buf, 10, &offset) < 0) 2769 unsigned long long offset = simple_strtoull(buf, &e, 10);
2770 if (e==buf || (*e && *e != '\n'))
2870 return -EINVAL; 2771 return -EINVAL;
2871 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2772 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2872 return -EBUSY; 2773 return -EBUSY;
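
offset_store() on the right-hand side accepts a plain decimal value optionally followed by a single newline (sysfs writes usually end with one) and rejects empty or trailing-garbage input. Roughly the same check with portable strtoull, slightly stricter about what may follow the newline:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Accept "1234" or "1234\n"; reject "", "12x", "0x10", trailing junk. */
static int parse_sectors(const char *buf, unsigned long long *out)
{
	char *end;

	errno = 0;
	*out = strtoull(buf, &end, 10);
	if (end == buf || errno == ERANGE)
		return -1;
	if (*end && !(*end == '\n' && end[1] == '\0'))
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("'2048\\n' -> %d (v=%llu)\n", parse_sectors("2048\n", &v), v);
	printf("'12x'    -> %d\n", parse_sectors("12x", &v));
	return 0;
}
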
@@ -2875,72 +2776,14 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2875 * can be sane */ 2776 * can be sane */
2876 return -EBUSY; 2777 return -EBUSY;
2877 rdev->data_offset = offset; 2778 rdev->data_offset = offset;
2878 rdev->new_data_offset = offset;
2879 return len; 2779 return len;
2880} 2780}
2881 2781
2882static struct rdev_sysfs_entry rdev_offset = 2782static struct rdev_sysfs_entry rdev_offset =
2883__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2783__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2884 2784
2885static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2886{
2887 return sprintf(page, "%llu\n",
2888 (unsigned long long)rdev->new_data_offset);
2889}
2890
2891static ssize_t new_offset_store(struct md_rdev *rdev,
2892 const char *buf, size_t len)
2893{
2894 unsigned long long new_offset;
2895 struct mddev *mddev = rdev->mddev;
2896
2897 if (strict_strtoull(buf, 10, &new_offset) < 0)
2898 return -EINVAL;
2899
2900 if (mddev->sync_thread)
2901 return -EBUSY;
2902 if (new_offset == rdev->data_offset)
2903 /* reset is always permitted */
2904 ;
2905 else if (new_offset > rdev->data_offset) {
2906 /* must not push array size beyond rdev_sectors */
2907 if (new_offset - rdev->data_offset
2908 + mddev->dev_sectors > rdev->sectors)
2909 return -E2BIG;
2910 }
2911 /* Metadata worries about other space details. */
2912
2913 /* decreasing the offset is inconsistent with a backwards
2914 * reshape.
2915 */
2916 if (new_offset < rdev->data_offset &&
2917 mddev->reshape_backwards)
2918 return -EINVAL;
2919 /* Increasing offset is inconsistent with forwards
2920 * reshape. reshape_direction should be set to
2921 * 'backwards' first.
2922 */
2923 if (new_offset > rdev->data_offset &&
2924 !mddev->reshape_backwards)
2925 return -EINVAL;
2926
2927 if (mddev->pers && mddev->persistent &&
2928 !super_types[mddev->major_version]
2929 .allow_new_offset(rdev, new_offset))
2930 return -E2BIG;
2931 rdev->new_data_offset = new_offset;
2932 if (new_offset > rdev->data_offset)
2933 mddev->reshape_backwards = 1;
2934 else if (new_offset < rdev->data_offset)
2935 mddev->reshape_backwards = 0;
2936
2937 return len;
2938}
2939static struct rdev_sysfs_entry rdev_new_offset =
2940__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2941
2942static ssize_t 2785static ssize_t
2943rdev_size_show(struct md_rdev *rdev, char *page) 2786rdev_size_show(mdk_rdev_t *rdev, char *page)
2944{ 2787{
2945 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2788 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2946} 2789}
@@ -2975,16 +2818,14 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2975} 2818}
2976 2819
2977static ssize_t 2820static ssize_t
2978rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2821rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2979{ 2822{
2980 struct mddev *my_mddev = rdev->mddev; 2823 mddev_t *my_mddev = rdev->mddev;
2981 sector_t oldsectors = rdev->sectors; 2824 sector_t oldsectors = rdev->sectors;
2982 sector_t sectors; 2825 sector_t sectors;
2983 2826
2984 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2827 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2985 return -EINVAL; 2828 return -EINVAL;
2986 if (rdev->data_offset != rdev->new_data_offset)
2987 return -EINVAL; /* too confusing */
2988 if (my_mddev->pers && rdev->raid_disk >= 0) { 2829 if (my_mddev->pers && rdev->raid_disk >= 0) {
2989 if (my_mddev->persistent) { 2830 if (my_mddev->persistent) {
2990 sectors = super_types[my_mddev->major_version]. 2831 sectors = super_types[my_mddev->major_version].
@@ -3005,16 +2846,16 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3005 * a deadlock. We have already changed rdev->sectors, and if 2846 * a deadlock. We have already changed rdev->sectors, and if
3006 * we have to change it back, we will have the lock again. 2847 * we have to change it back, we will have the lock again.
3007 */ 2848 */
3008 struct mddev *mddev; 2849 mddev_t *mddev;
3009 int overlap = 0; 2850 int overlap = 0;
3010 struct list_head *tmp; 2851 struct list_head *tmp;
3011 2852
3012 mddev_unlock(my_mddev); 2853 mddev_unlock(my_mddev);
3013 for_each_mddev(mddev, tmp) { 2854 for_each_mddev(mddev, tmp) {
3014 struct md_rdev *rdev2; 2855 mdk_rdev_t *rdev2;
3015 2856
3016 mddev_lock(mddev); 2857 mddev_lock(mddev);
3017 rdev_for_each(rdev2, mddev) 2858 list_for_each_entry(rdev2, &mddev->disks, same_set)
3018 if (rdev->bdev == rdev2->bdev && 2859 if (rdev->bdev == rdev2->bdev &&
3019 rdev != rdev2 && 2860 rdev != rdev2 &&
3020 overlaps(rdev->data_offset, rdev->sectors, 2861 overlaps(rdev->data_offset, rdev->sectors,
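
rdev_size_store() refuses a resize whose [data_offset, data_offset + sectors) range would collide with another md component sitting on the same block device. The overlaps() helper itself is not shown in this hunk; it amounts to the usual half-open interval test, sketched here standalone:

#include <stdio.h>
#include <stdint.h>

/* Two half-open sector ranges [s1, s1+l1) and [s2, s2+l2) overlap unless
 * one ends at or before the other begins. */
static int overlaps(uint64_t s1, uint64_t l1, uint64_t s2, uint64_t l2)
{
	if (s1 + l1 <= s2)
		return 0;
	if (s2 + l2 <= s1)
		return 0;
	return 1;
}

int main(void)
{
	/* two components carved out of the same underlying disk */
	printf("%d\n", overlaps(0, 1024, 1024, 2048));  /* 0: back to back          */
	printf("%d\n", overlaps(0, 2048, 1024, 2048));  /* 1: 1024 sectors shared   */
	return 0;
}
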
@@ -3048,7 +2889,7 @@ static struct rdev_sysfs_entry rdev_size =
3048__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2889__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3049 2890
3050 2891
3051static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 2892static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
3052{ 2893{
3053 unsigned long long recovery_start = rdev->recovery_offset; 2894 unsigned long long recovery_start = rdev->recovery_offset;
3054 2895
@@ -3059,7 +2900,7 @@ static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3059 return sprintf(page, "%llu\n", recovery_start); 2900 return sprintf(page, "%llu\n", recovery_start);
3060} 2901}
3061 2902
3062static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 2903static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
3063{ 2904{
3064 unsigned long long recovery_start; 2905 unsigned long long recovery_start;
3065 2906
@@ -3089,11 +2930,11 @@ badblocks_show(struct badblocks *bb, char *page, int unack);
3089static ssize_t 2930static ssize_t
3090badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 2931badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3091 2932
3092static ssize_t bb_show(struct md_rdev *rdev, char *page) 2933static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
3093{ 2934{
3094 return badblocks_show(&rdev->badblocks, page, 0); 2935 return badblocks_show(&rdev->badblocks, page, 0);
3095} 2936}
3096static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 2937static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
3097{ 2938{
3098 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 2939 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3099 /* Maybe that ack was all we needed */ 2940 /* Maybe that ack was all we needed */
@@ -3105,11 +2946,11 @@ static struct rdev_sysfs_entry rdev_bad_blocks =
3105__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 2946__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3106 2947
3107 2948
3108static ssize_t ubb_show(struct md_rdev *rdev, char *page) 2949static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
3109{ 2950{
3110 return badblocks_show(&rdev->badblocks, page, 1); 2951 return badblocks_show(&rdev->badblocks, page, 1);
3111} 2952}
3112static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 2953static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
3113{ 2954{
3114 return badblocks_store(&rdev->badblocks, page, len, 1); 2955 return badblocks_store(&rdev->badblocks, page, len, 1);
3115} 2956}
@@ -3121,7 +2962,6 @@ static struct attribute *rdev_default_attrs[] = {
3121 &rdev_errors.attr, 2962 &rdev_errors.attr,
3122 &rdev_slot.attr, 2963 &rdev_slot.attr,
3123 &rdev_offset.attr, 2964 &rdev_offset.attr,
3124 &rdev_new_offset.attr,
3125 &rdev_size.attr, 2965 &rdev_size.attr,
3126 &rdev_recovery_start.attr, 2966 &rdev_recovery_start.attr,
3127 &rdev_bad_blocks.attr, 2967 &rdev_bad_blocks.attr,
@@ -3132,8 +2972,8 @@ static ssize_t
3132rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2972rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3133{ 2973{
3134 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2974 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3135 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 2975 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
3136 struct mddev *mddev = rdev->mddev; 2976 mddev_t *mddev = rdev->mddev;
3137 ssize_t rv; 2977 ssize_t rv;
3138 2978
3139 if (!entry->show) 2979 if (!entry->show)
@@ -3155,9 +2995,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3155 const char *page, size_t length) 2995 const char *page, size_t length)
3156{ 2996{
3157 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2997 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3158 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 2998 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
3159 ssize_t rv; 2999 ssize_t rv;
3160 struct mddev *mddev = rdev->mddev; 3000 mddev_t *mddev = rdev->mddev;
3161 3001
3162 if (!entry->store) 3002 if (!entry->store)
3163 return -EIO; 3003 return -EIO;
@@ -3176,7 +3016,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3176 3016
3177static void rdev_free(struct kobject *ko) 3017static void rdev_free(struct kobject *ko)
3178{ 3018{
3179 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3019 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
3180 kfree(rdev); 3020 kfree(rdev);
3181} 3021}
3182static const struct sysfs_ops rdev_sysfs_ops = { 3022static const struct sysfs_ops rdev_sysfs_ops = {
@@ -3189,14 +3029,13 @@ static struct kobj_type rdev_ktype = {
3189 .default_attrs = rdev_default_attrs, 3029 .default_attrs = rdev_default_attrs,
3190}; 3030};
3191 3031
3192int md_rdev_init(struct md_rdev *rdev) 3032int md_rdev_init(mdk_rdev_t *rdev)
3193{ 3033{
3194 rdev->desc_nr = -1; 3034 rdev->desc_nr = -1;
3195 rdev->saved_raid_disk = -1; 3035 rdev->saved_raid_disk = -1;
3196 rdev->raid_disk = -1; 3036 rdev->raid_disk = -1;
3197 rdev->flags = 0; 3037 rdev->flags = 0;
3198 rdev->data_offset = 0; 3038 rdev->data_offset = 0;
3199 rdev->new_data_offset = 0;
3200 rdev->sb_events = 0; 3039 rdev->sb_events = 0;
3201 rdev->last_read_error.tv_sec = 0; 3040 rdev->last_read_error.tv_sec = 0;
3202 rdev->last_read_error.tv_nsec = 0; 3041 rdev->last_read_error.tv_nsec = 0;
@@ -3233,11 +3072,11 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
3233 * 3072 *
3234 * a faulty rdev _never_ has rdev->sb set. 3073 * a faulty rdev _never_ has rdev->sb set.
3235 */ 3074 */
3236static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3075static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
3237{ 3076{
3238 char b[BDEVNAME_SIZE]; 3077 char b[BDEVNAME_SIZE];
3239 int err; 3078 int err;
3240 struct md_rdev *rdev; 3079 mdk_rdev_t *rdev;
3241 sector_t size; 3080 sector_t size;
3242 3081
3243 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3082 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
@@ -3295,7 +3134,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3295abort_free: 3134abort_free:
3296 if (rdev->bdev) 3135 if (rdev->bdev)
3297 unlock_rdev(rdev); 3136 unlock_rdev(rdev);
3298 md_rdev_clear(rdev); 3137 free_disk_sb(rdev);
3138 kfree(rdev->badblocks.page);
3299 kfree(rdev); 3139 kfree(rdev);
3300 return ERR_PTR(err); 3140 return ERR_PTR(err);
3301} 3141}
@@ -3305,14 +3145,14 @@ abort_free:
3305 */ 3145 */
3306 3146
3307 3147
3308static void analyze_sbs(struct mddev * mddev) 3148static void analyze_sbs(mddev_t * mddev)
3309{ 3149{
3310 int i; 3150 int i;
3311 struct md_rdev *rdev, *freshest, *tmp; 3151 mdk_rdev_t *rdev, *freshest, *tmp;
3312 char b[BDEVNAME_SIZE]; 3152 char b[BDEVNAME_SIZE];
3313 3153
3314 freshest = NULL; 3154 freshest = NULL;
3315 rdev_for_each_safe(rdev, tmp, mddev) 3155 rdev_for_each(rdev, tmp, mddev)
3316 switch (super_types[mddev->major_version]. 3156 switch (super_types[mddev->major_version].
3317 load_super(rdev, freshest, mddev->minor_version)) { 3157 load_super(rdev, freshest, mddev->minor_version)) {
3318 case 1: 3158 case 1:
@@ -3333,7 +3173,7 @@ static void analyze_sbs(struct mddev * mddev)
3333 validate_super(mddev, freshest); 3173 validate_super(mddev, freshest);
3334 3174
3335 i = 0; 3175 i = 0;
3336 rdev_for_each_safe(rdev, tmp, mddev) { 3176 rdev_for_each(rdev, tmp, mddev) {
3337 if (mddev->max_disks && 3177 if (mddev->max_disks &&
3338 (rdev->desc_nr >= mddev->max_disks || 3178 (rdev->desc_nr >= mddev->max_disks ||
3339 i > mddev->max_disks)) { 3179 i > mddev->max_disks)) {
@@ -3408,13 +3248,13 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3408static void md_safemode_timeout(unsigned long data); 3248static void md_safemode_timeout(unsigned long data);
3409 3249
3410static ssize_t 3250static ssize_t
3411safe_delay_show(struct mddev *mddev, char *page) 3251safe_delay_show(mddev_t *mddev, char *page)
3412{ 3252{
3413 int msec = (mddev->safemode_delay*1000)/HZ; 3253 int msec = (mddev->safemode_delay*1000)/HZ;
3414 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3254 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3415} 3255}
3416static ssize_t 3256static ssize_t
3417safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3257safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
3418{ 3258{
3419 unsigned long msec; 3259 unsigned long msec;
3420 3260
@@ -3436,9 +3276,9 @@ static struct md_sysfs_entry md_safe_delay =
3436__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3276__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3437 3277
3438static ssize_t 3278static ssize_t
3439level_show(struct mddev *mddev, char *page) 3279level_show(mddev_t *mddev, char *page)
3440{ 3280{
3441 struct md_personality *p = mddev->pers; 3281 struct mdk_personality *p = mddev->pers;
3442 if (p) 3282 if (p)
3443 return sprintf(page, "%s\n", p->name); 3283 return sprintf(page, "%s\n", p->name);
3444 else if (mddev->clevel[0]) 3284 else if (mddev->clevel[0])
@@ -3450,14 +3290,14 @@ level_show(struct mddev *mddev, char *page)
3450} 3290}
3451 3291
3452static ssize_t 3292static ssize_t
3453level_store(struct mddev *mddev, const char *buf, size_t len) 3293level_store(mddev_t *mddev, const char *buf, size_t len)
3454{ 3294{
3455 char clevel[16]; 3295 char clevel[16];
3456 ssize_t rv = len; 3296 ssize_t rv = len;
3457 struct md_personality *pers; 3297 struct mdk_personality *pers;
3458 long level; 3298 long level;
3459 void *priv; 3299 void *priv;
3460 struct md_rdev *rdev; 3300 mdk_rdev_t *rdev;
3461 3301
3462 if (mddev->pers == NULL) { 3302 if (mddev->pers == NULL) {
3463 if (len == 0) 3303 if (len == 0)
@@ -3522,7 +3362,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3522 return -EINVAL; 3362 return -EINVAL;
3523 } 3363 }
3524 3364
3525 rdev_for_each(rdev, mddev) 3365 list_for_each_entry(rdev, &mddev->disks, same_set)
3526 rdev->new_raid_disk = rdev->raid_disk; 3366 rdev->new_raid_disk = rdev->raid_disk;
3527 3367
3528 /* ->takeover must set new_* and/or delta_disks 3368 /* ->takeover must set new_* and/or delta_disks
@@ -3535,7 +3375,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3535 mddev->new_chunk_sectors = mddev->chunk_sectors; 3375 mddev->new_chunk_sectors = mddev->chunk_sectors;
3536 mddev->raid_disks -= mddev->delta_disks; 3376 mddev->raid_disks -= mddev->delta_disks;
3537 mddev->delta_disks = 0; 3377 mddev->delta_disks = 0;
3538 mddev->reshape_backwards = 0;
3539 module_put(pers->owner); 3378 module_put(pers->owner);
3540 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3379 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3541 mdname(mddev), clevel); 3380 mdname(mddev), clevel);
@@ -3576,7 +3415,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3576 mddev->safemode = 0; 3415 mddev->safemode = 0;
3577 } 3416 }
3578 3417
3579 rdev_for_each(rdev, mddev) { 3418 list_for_each_entry(rdev, &mddev->disks, same_set) {
3580 if (rdev->raid_disk < 0) 3419 if (rdev->raid_disk < 0)
3581 continue; 3420 continue;
3582 if (rdev->new_raid_disk >= mddev->raid_disks) 3421 if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3585,7 +3424,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3585 continue; 3424 continue;
3586 sysfs_unlink_rdev(mddev, rdev); 3425 sysfs_unlink_rdev(mddev, rdev);
3587 } 3426 }
3588 rdev_for_each(rdev, mddev) { 3427 list_for_each_entry(rdev, &mddev->disks, same_set) {
3589 if (rdev->raid_disk < 0) 3428 if (rdev->raid_disk < 0)
3590 continue; 3429 continue;
3591 if (rdev->new_raid_disk == rdev->raid_disk) 3430 if (rdev->new_raid_disk == rdev->raid_disk)
@@ -3609,7 +3448,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3609 mddev->layout = mddev->new_layout; 3448 mddev->layout = mddev->new_layout;
3610 mddev->chunk_sectors = mddev->new_chunk_sectors; 3449 mddev->chunk_sectors = mddev->new_chunk_sectors;
3611 mddev->delta_disks = 0; 3450 mddev->delta_disks = 0;
3612 mddev->reshape_backwards = 0;
3613 mddev->degraded = 0; 3451 mddev->degraded = 0;
3614 if (mddev->pers->sync_request == NULL) { 3452 if (mddev->pers->sync_request == NULL) {
3615 /* this is now an array without redundancy, so 3453 /* this is now an array without redundancy, so
@@ -3619,8 +3457,10 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3619 del_timer_sync(&mddev->safemode_timer); 3457 del_timer_sync(&mddev->safemode_timer);
3620 } 3458 }
3621 pers->run(mddev); 3459 pers->run(mddev);
3622 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3623 mddev_resume(mddev); 3460 mddev_resume(mddev);
3461 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3462 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3463 md_wakeup_thread(mddev->thread);
3624 sysfs_notify(&mddev->kobj, NULL, "level"); 3464 sysfs_notify(&mddev->kobj, NULL, "level");
3625 md_new_event(mddev); 3465 md_new_event(mddev);
3626 return rv; 3466 return rv;
@@ -3631,7 +3471,7 @@ __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3631 3471
3632 3472
3633static ssize_t 3473static ssize_t
3634layout_show(struct mddev *mddev, char *page) 3474layout_show(mddev_t *mddev, char *page)
3635{ 3475{
3636 /* just a number, not meaningful for all levels */ 3476 /* just a number, not meaningful for all levels */
3637 if (mddev->reshape_position != MaxSector && 3477 if (mddev->reshape_position != MaxSector &&
@@ -3642,7 +3482,7 @@ layout_show(struct mddev *mddev, char *page)
3642} 3482}
3643 3483
3644static ssize_t 3484static ssize_t
3645layout_store(struct mddev *mddev, const char *buf, size_t len) 3485layout_store(mddev_t *mddev, const char *buf, size_t len)
3646{ 3486{
3647 char *e; 3487 char *e;
3648 unsigned long n = simple_strtoul(buf, &e, 10); 3488 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3672,7 +3512,7 @@ __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3672 3512
3673 3513
3674static ssize_t 3514static ssize_t
3675raid_disks_show(struct mddev *mddev, char *page) 3515raid_disks_show(mddev_t *mddev, char *page)
3676{ 3516{
3677 if (mddev->raid_disks == 0) 3517 if (mddev->raid_disks == 0)
3678 return 0; 3518 return 0;
@@ -3683,10 +3523,10 @@ raid_disks_show(struct mddev *mddev, char *page)
3683 return sprintf(page, "%d\n", mddev->raid_disks); 3523 return sprintf(page, "%d\n", mddev->raid_disks);
3684} 3524}
3685 3525
3686static int update_raid_disks(struct mddev *mddev, int raid_disks); 3526static int update_raid_disks(mddev_t *mddev, int raid_disks);
3687 3527
3688static ssize_t 3528static ssize_t
3689raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3529raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
3690{ 3530{
3691 char *e; 3531 char *e;
3692 int rv = 0; 3532 int rv = 0;
@@ -3698,20 +3538,9 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3698 if (mddev->pers) 3538 if (mddev->pers)
3699 rv = update_raid_disks(mddev, n); 3539 rv = update_raid_disks(mddev, n);
3700 else if (mddev->reshape_position != MaxSector) { 3540 else if (mddev->reshape_position != MaxSector) {
3701 struct md_rdev *rdev;
3702 int olddisks = mddev->raid_disks - mddev->delta_disks; 3541 int olddisks = mddev->raid_disks - mddev->delta_disks;
3703
3704 rdev_for_each(rdev, mddev) {
3705 if (olddisks < n &&
3706 rdev->data_offset < rdev->new_data_offset)
3707 return -EINVAL;
3708 if (olddisks > n &&
3709 rdev->data_offset > rdev->new_data_offset)
3710 return -EINVAL;
3711 }
3712 mddev->delta_disks = n - olddisks; 3542 mddev->delta_disks = n - olddisks;
3713 mddev->raid_disks = n; 3543 mddev->raid_disks = n;
3714 mddev->reshape_backwards = (mddev->delta_disks < 0);
3715 } else 3544 } else
3716 mddev->raid_disks = n; 3545 mddev->raid_disks = n;
3717 return rv ? rv : len; 3546 return rv ? rv : len;
@@ -3720,7 +3549,7 @@ static struct md_sysfs_entry md_raid_disks =
3720__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3549__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3721 3550
3722static ssize_t 3551static ssize_t
3723chunk_size_show(struct mddev *mddev, char *page) 3552chunk_size_show(mddev_t *mddev, char *page)
3724{ 3553{
3725 if (mddev->reshape_position != MaxSector && 3554 if (mddev->reshape_position != MaxSector &&
3726 mddev->chunk_sectors != mddev->new_chunk_sectors) 3555 mddev->chunk_sectors != mddev->new_chunk_sectors)
@@ -3731,7 +3560,7 @@ chunk_size_show(struct mddev *mddev, char *page)
3731} 3560}
3732 3561
3733static ssize_t 3562static ssize_t
3734chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3563chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
3735{ 3564{
3736 char *e; 3565 char *e;
3737 unsigned long n = simple_strtoul(buf, &e, 10); 3566 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3760,7 +3589,7 @@ static struct md_sysfs_entry md_chunk_size =
3760__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3589__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3761 3590
3762static ssize_t 3591static ssize_t
3763resync_start_show(struct mddev *mddev, char *page) 3592resync_start_show(mddev_t *mddev, char *page)
3764{ 3593{
3765 if (mddev->recovery_cp == MaxSector) 3594 if (mddev->recovery_cp == MaxSector)
3766 return sprintf(page, "none\n"); 3595 return sprintf(page, "none\n");
@@ -3768,7 +3597,7 @@ resync_start_show(struct mddev *mddev, char *page)
3768} 3597}
3769 3598
3770static ssize_t 3599static ssize_t
3771resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3600resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3772{ 3601{
3773 char *e; 3602 char *e;
3774 unsigned long long n = simple_strtoull(buf, &e, 10); 3603 unsigned long long n = simple_strtoull(buf, &e, 10);
@@ -3781,8 +3610,6 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3781 return -EINVAL; 3610 return -EINVAL;
3782 3611
3783 mddev->recovery_cp = n; 3612 mddev->recovery_cp = n;
3784 if (mddev->pers)
3785 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3786 return len; 3613 return len;
3787} 3614}
3788static struct md_sysfs_entry md_resync_start = 3615static struct md_sysfs_entry md_resync_start =
@@ -3840,7 +3667,7 @@ static int match_word(const char *word, char **list)
3840} 3667}
3841 3668
3842static ssize_t 3669static ssize_t
3843array_state_show(struct mddev *mddev, char *page) 3670array_state_show(mddev_t *mddev, char *page)
3844{ 3671{
3845 enum array_state st = inactive; 3672 enum array_state st = inactive;
3846 3673
@@ -3873,13 +3700,13 @@ array_state_show(struct mddev *mddev, char *page)
3873 return sprintf(page, "%s\n", array_states[st]); 3700 return sprintf(page, "%s\n", array_states[st]);
3874} 3701}
3875 3702
3876static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); 3703static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3877static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); 3704static int md_set_readonly(mddev_t * mddev, int is_open);
3878static int do_md_run(struct mddev * mddev); 3705static int do_md_run(mddev_t * mddev);
3879static int restart_array(struct mddev *mddev); 3706static int restart_array(mddev_t *mddev);
3880 3707
3881static ssize_t 3708static ssize_t
3882array_state_store(struct mddev *mddev, const char *buf, size_t len) 3709array_state_store(mddev_t *mddev, const char *buf, size_t len)
3883{ 3710{
3884 int err = -EINVAL; 3711 int err = -EINVAL;
3885 enum array_state st = match_word(buf, array_states); 3712 enum array_state st = match_word(buf, array_states);
@@ -3888,20 +3715,24 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3888 break; 3715 break;
3889 case clear: 3716 case clear:
3890 /* stopping an active array */ 3717 /* stopping an active array */
3891 err = do_md_stop(mddev, 0, NULL); 3718 if (atomic_read(&mddev->openers) > 0)
3719 return -EBUSY;
3720 err = do_md_stop(mddev, 0, 0);
3892 break; 3721 break;
3893 case inactive: 3722 case inactive:
3894 /* stopping an active array */ 3723 /* stopping an active array */
3895 if (mddev->pers) 3724 if (mddev->pers) {
3896 err = do_md_stop(mddev, 2, NULL); 3725 if (atomic_read(&mddev->openers) > 0)
3897 else 3726 return -EBUSY;
3727 err = do_md_stop(mddev, 2, 0);
3728 } else
3898 err = 0; /* already inactive */ 3729 err = 0; /* already inactive */
3899 break; 3730 break;
3900 case suspended: 3731 case suspended:
3901 break; /* not supported yet */ 3732 break; /* not supported yet */
3902 case readonly: 3733 case readonly:
3903 if (mddev->pers) 3734 if (mddev->pers)
3904 err = md_set_readonly(mddev, NULL); 3735 err = md_set_readonly(mddev, 0);
3905 else { 3736 else {
3906 mddev->ro = 1; 3737 mddev->ro = 1;
3907 set_disk_ro(mddev->gendisk, 1); 3738 set_disk_ro(mddev->gendisk, 1);
@@ -3911,7 +3742,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3911 case read_auto: 3742 case read_auto:
3912 if (mddev->pers) { 3743 if (mddev->pers) {
3913 if (mddev->ro == 0) 3744 if (mddev->ro == 0)
3914 err = md_set_readonly(mddev, NULL); 3745 err = md_set_readonly(mddev, 0);
3915 else if (mddev->ro == 1) 3746 else if (mddev->ro == 1)
3916 err = restart_array(mddev); 3747 err = restart_array(mddev);
3917 if (err == 0) { 3748 if (err == 0) {
@@ -3961,8 +3792,6 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3961 if (err) 3792 if (err)
3962 return err; 3793 return err;
3963 else { 3794 else {
3964 if (mddev->hold_active == UNTIL_IOCTL)
3965 mddev->hold_active = 0;
3966 sysfs_notify_dirent_safe(mddev->sysfs_state); 3795 sysfs_notify_dirent_safe(mddev->sysfs_state);
3967 return len; 3796 return len;
3968 } 3797 }
@@ -3971,13 +3800,13 @@ static struct md_sysfs_entry md_array_state =
3971__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3800__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3972 3801
3973static ssize_t 3802static ssize_t
3974max_corrected_read_errors_show(struct mddev *mddev, char *page) { 3803max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3975 return sprintf(page, "%d\n", 3804 return sprintf(page, "%d\n",
3976 atomic_read(&mddev->max_corr_read_errors)); 3805 atomic_read(&mddev->max_corr_read_errors));
3977} 3806}
3978 3807
3979static ssize_t 3808static ssize_t
3980max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 3809max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3981{ 3810{
3982 char *e; 3811 char *e;
3983 unsigned long n = simple_strtoul(buf, &e, 10); 3812 unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3994,13 +3823,13 @@ __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3994 max_corrected_read_errors_store); 3823 max_corrected_read_errors_store);
3995 3824
3996static ssize_t 3825static ssize_t
3997null_show(struct mddev *mddev, char *page) 3826null_show(mddev_t *mddev, char *page)
3998{ 3827{
3999 return -EINVAL; 3828 return -EINVAL;
4000} 3829}
4001 3830
4002static ssize_t 3831static ssize_t
4003new_dev_store(struct mddev *mddev, const char *buf, size_t len) 3832new_dev_store(mddev_t *mddev, const char *buf, size_t len)
4004{ 3833{
4005 /* buf must be %d:%d\n? giving major and minor numbers */ 3834 /* buf must be %d:%d\n? giving major and minor numbers */
4006 /* The new device is added to the array. 3835 /* The new device is added to the array.
@@ -4013,7 +3842,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4013 int major = simple_strtoul(buf, &e, 10); 3842 int major = simple_strtoul(buf, &e, 10);
4014 int minor; 3843 int minor;
4015 dev_t dev; 3844 dev_t dev;
4016 struct md_rdev *rdev; 3845 mdk_rdev_t *rdev;
4017 int err; 3846 int err;
4018 3847
4019 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3848 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
@@ -4031,9 +3860,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4031 rdev = md_import_device(dev, mddev->major_version, 3860 rdev = md_import_device(dev, mddev->major_version,
4032 mddev->minor_version); 3861 mddev->minor_version);
4033 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3862 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4034 struct md_rdev *rdev0 3863 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4035 = list_entry(mddev->disks.next, 3864 mdk_rdev_t, same_set);
4036 struct md_rdev, same_set);
4037 err = super_types[mddev->major_version] 3865 err = super_types[mddev->major_version]
4038 .load_super(rdev, rdev0, mddev->minor_version); 3866 .load_super(rdev, rdev0, mddev->minor_version);
4039 if (err < 0) 3867 if (err < 0)
@@ -4057,7 +3885,7 @@ static struct md_sysfs_entry md_new_device =
4057__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3885__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4058 3886
4059static ssize_t 3887static ssize_t
4060bitmap_store(struct mddev *mddev, const char *buf, size_t len) 3888bitmap_store(mddev_t *mddev, const char *buf, size_t len)
4061{ 3889{
4062 char *end; 3890 char *end;
4063 unsigned long chunk, end_chunk; 3891 unsigned long chunk, end_chunk;
@@ -4086,16 +3914,16 @@ static struct md_sysfs_entry md_bitmap =
4086__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3914__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4087 3915
4088static ssize_t 3916static ssize_t
4089size_show(struct mddev *mddev, char *page) 3917size_show(mddev_t *mddev, char *page)
4090{ 3918{
4091 return sprintf(page, "%llu\n", 3919 return sprintf(page, "%llu\n",
4092 (unsigned long long)mddev->dev_sectors / 2); 3920 (unsigned long long)mddev->dev_sectors / 2);
4093} 3921}
4094 3922
4095static int update_size(struct mddev *mddev, sector_t num_sectors); 3923static int update_size(mddev_t *mddev, sector_t num_sectors);
4096 3924
4097static ssize_t 3925static ssize_t
4098size_store(struct mddev *mddev, const char *buf, size_t len) 3926size_store(mddev_t *mddev, const char *buf, size_t len)
4099{ 3927{
4100 /* If array is inactive, we can reduce the component size, but 3928 /* If array is inactive, we can reduce the component size, but
4101 * not increase it (except from 0). 3929 * not increase it (except from 0).
@@ -4123,14 +3951,14 @@ static struct md_sysfs_entry md_size =
4123__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3951__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4124 3952
4125 3953
4126/* Metadata version. 3954/* Metdata version.
4127 * This is one of 3955 * This is one of
4128 * 'none' for arrays with no metadata (good luck...) 3956 * 'none' for arrays with no metadata (good luck...)
4129 * 'external' for arrays with externally managed metadata, 3957 * 'external' for arrays with externally managed metadata,
4130 * or N.M for internally known formats 3958 * or N.M for internally known formats
4131 */ 3959 */
4132static ssize_t 3960static ssize_t
4133metadata_show(struct mddev *mddev, char *page) 3961metadata_show(mddev_t *mddev, char *page)
4134{ 3962{
4135 if (mddev->persistent) 3963 if (mddev->persistent)
4136 return sprintf(page, "%d.%d\n", 3964 return sprintf(page, "%d.%d\n",
@@ -4142,7 +3970,7 @@ metadata_show(struct mddev *mddev, char *page)
4142} 3970}
4143 3971
4144static ssize_t 3972static ssize_t
4145metadata_store(struct mddev *mddev, const char *buf, size_t len) 3973metadata_store(mddev_t *mddev, const char *buf, size_t len)
4146{ 3974{
4147 int major, minor; 3975 int major, minor;
4148 char *e; 3976 char *e;
@@ -4196,7 +4024,7 @@ static struct md_sysfs_entry md_metadata =
4196__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4024__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4197 4025
4198static ssize_t 4026static ssize_t
4199action_show(struct mddev *mddev, char *page) 4027action_show(mddev_t *mddev, char *page)
4200{ 4028{
4201 char *type = "idle"; 4029 char *type = "idle";
4202 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4030 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
@@ -4218,10 +4046,10 @@ action_show(struct mddev *mddev, char *page)
4218 return sprintf(page, "%s\n", type); 4046 return sprintf(page, "%s\n", type);
4219} 4047}
4220 4048
4221static void reap_sync_thread(struct mddev *mddev); 4049static void reap_sync_thread(mddev_t *mddev);
4222 4050
4223static ssize_t 4051static ssize_t
4224action_store(struct mddev *mddev, const char *page, size_t len) 4052action_store(mddev_t *mddev, const char *page, size_t len)
4225{ 4053{
4226 if (!mddev->pers || !mddev->pers->sync_request) 4054 if (!mddev->pers || !mddev->pers->sync_request)
4227 return -EINVAL; 4055 return -EINVAL;
@@ -4260,13 +4088,6 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4260 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4088 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4261 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4089 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4262 } 4090 }
4263 if (mddev->ro == 2) {
4264 /* A write to sync_action is enough to justify
4265 * canceling read-auto mode
4266 */
4267 mddev->ro = 0;
4268 md_wakeup_thread(mddev->sync_thread);
4269 }
4270 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4091 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4271 md_wakeup_thread(mddev->thread); 4092 md_wakeup_thread(mddev->thread);
4272 sysfs_notify_dirent_safe(mddev->sysfs_action); 4093 sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4274,11 +4095,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4274} 4095}
4275 4096
4276static ssize_t 4097static ssize_t
4277mismatch_cnt_show(struct mddev *mddev, char *page) 4098mismatch_cnt_show(mddev_t *mddev, char *page)
4278{ 4099{
4279 return sprintf(page, "%llu\n", 4100 return sprintf(page, "%llu\n",
4280 (unsigned long long) 4101 (unsigned long long) mddev->resync_mismatches);
4281 atomic64_read(&mddev->resync_mismatches));
4282} 4102}
4283 4103
4284static struct md_sysfs_entry md_scan_mode = 4104static struct md_sysfs_entry md_scan_mode =
@@ -4288,14 +4108,14 @@ __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4288static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4108static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4289 4109
4290static ssize_t 4110static ssize_t
4291sync_min_show(struct mddev *mddev, char *page) 4111sync_min_show(mddev_t *mddev, char *page)
4292{ 4112{
4293 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4113 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4294 mddev->sync_speed_min ? "local": "system"); 4114 mddev->sync_speed_min ? "local": "system");
4295} 4115}
4296 4116
4297static ssize_t 4117static ssize_t
4298sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4118sync_min_store(mddev_t *mddev, const char *buf, size_t len)
4299{ 4119{
4300 int min; 4120 int min;
4301 char *e; 4121 char *e;
@@ -4314,14 +4134,14 @@ static struct md_sysfs_entry md_sync_min =
4314__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4134__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4315 4135
4316static ssize_t 4136static ssize_t
4317sync_max_show(struct mddev *mddev, char *page) 4137sync_max_show(mddev_t *mddev, char *page)
4318{ 4138{
4319 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4139 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4320 mddev->sync_speed_max ? "local": "system"); 4140 mddev->sync_speed_max ? "local": "system");
4321} 4141}
4322 4142
4323static ssize_t 4143static ssize_t
4324sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4144sync_max_store(mddev_t *mddev, const char *buf, size_t len)
4325{ 4145{
4326 int max; 4146 int max;
4327 char *e; 4147 char *e;
@@ -4340,20 +4160,20 @@ static struct md_sysfs_entry md_sync_max =
4340__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4160__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4341 4161
4342static ssize_t 4162static ssize_t
4343degraded_show(struct mddev *mddev, char *page) 4163degraded_show(mddev_t *mddev, char *page)
4344{ 4164{
4345 return sprintf(page, "%d\n", mddev->degraded); 4165 return sprintf(page, "%d\n", mddev->degraded);
4346} 4166}
4347static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4167static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4348 4168
4349static ssize_t 4169static ssize_t
4350sync_force_parallel_show(struct mddev *mddev, char *page) 4170sync_force_parallel_show(mddev_t *mddev, char *page)
4351{ 4171{
4352 return sprintf(page, "%d\n", mddev->parallel_resync); 4172 return sprintf(page, "%d\n", mddev->parallel_resync);
4353} 4173}
4354 4174
4355static ssize_t 4175static ssize_t
4356sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4176sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
4357{ 4177{
4358 long n; 4178 long n;
4359 4179
@@ -4377,7 +4197,7 @@ __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4377 sync_force_parallel_show, sync_force_parallel_store); 4197 sync_force_parallel_show, sync_force_parallel_store);
4378 4198
4379static ssize_t 4199static ssize_t
4380sync_speed_show(struct mddev *mddev, char *page) 4200sync_speed_show(mddev_t *mddev, char *page)
4381{ 4201{
4382 unsigned long resync, dt, db; 4202 unsigned long resync, dt, db;
4383 if (mddev->curr_resync == 0) 4203 if (mddev->curr_resync == 0)
@@ -4392,19 +4212,14 @@ sync_speed_show(struct mddev *mddev, char *page)
4392static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4212static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4393 4213
4394static ssize_t 4214static ssize_t
4395sync_completed_show(struct mddev *mddev, char *page) 4215sync_completed_show(mddev_t *mddev, char *page)
4396{ 4216{
4397 unsigned long long max_sectors, resync; 4217 unsigned long long max_sectors, resync;
4398 4218
4399 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4219 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4400 return sprintf(page, "none\n"); 4220 return sprintf(page, "none\n");
4401 4221
4402 if (mddev->curr_resync == 1 || 4222 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4403 mddev->curr_resync == 2)
4404 return sprintf(page, "delayed\n");
4405
4406 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4407 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4408 max_sectors = mddev->resync_max_sectors; 4223 max_sectors = mddev->resync_max_sectors;
4409 else 4224 else
4410 max_sectors = mddev->dev_sectors; 4225 max_sectors = mddev->dev_sectors;
@@ -4416,13 +4231,13 @@ sync_completed_show(struct mddev *mddev, char *page)
4416static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 4231static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4417 4232
4418static ssize_t 4233static ssize_t
4419min_sync_show(struct mddev *mddev, char *page) 4234min_sync_show(mddev_t *mddev, char *page)
4420{ 4235{
4421 return sprintf(page, "%llu\n", 4236 return sprintf(page, "%llu\n",
4422 (unsigned long long)mddev->resync_min); 4237 (unsigned long long)mddev->resync_min);
4423} 4238}
4424static ssize_t 4239static ssize_t
4425min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4240min_sync_store(mddev_t *mddev, const char *buf, size_t len)
4426{ 4241{
4427 unsigned long long min; 4242 unsigned long long min;
4428 if (strict_strtoull(buf, 10, &min)) 4243 if (strict_strtoull(buf, 10, &min))
@@ -4447,7 +4262,7 @@ static struct md_sysfs_entry md_min_sync =
4447__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4262__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4448 4263
4449static ssize_t 4264static ssize_t
4450max_sync_show(struct mddev *mddev, char *page) 4265max_sync_show(mddev_t *mddev, char *page)
4451{ 4266{
4452 if (mddev->resync_max == MaxSector) 4267 if (mddev->resync_max == MaxSector)
4453 return sprintf(page, "max\n"); 4268 return sprintf(page, "max\n");
@@ -4456,7 +4271,7 @@ max_sync_show(struct mddev *mddev, char *page)
4456 (unsigned long long)mddev->resync_max); 4271 (unsigned long long)mddev->resync_max);
4457} 4272}
4458static ssize_t 4273static ssize_t
4459max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4274max_sync_store(mddev_t *mddev, const char *buf, size_t len)
4460{ 4275{
4461 if (strncmp(buf, "max", 3) == 0) 4276 if (strncmp(buf, "max", 3) == 0)
4462 mddev->resync_max = MaxSector; 4277 mddev->resync_max = MaxSector;
@@ -4487,13 +4302,13 @@ static struct md_sysfs_entry md_max_sync =
4487__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4302__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4488 4303
4489static ssize_t 4304static ssize_t
4490suspend_lo_show(struct mddev *mddev, char *page) 4305suspend_lo_show(mddev_t *mddev, char *page)
4491{ 4306{
4492 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4307 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4493} 4308}
4494 4309
4495static ssize_t 4310static ssize_t
4496suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4311suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4497{ 4312{
4498 char *e; 4313 char *e;
4499 unsigned long long new = simple_strtoull(buf, &e, 10); 4314 unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4521,13 +4336,13 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4521 4336
4522 4337
4523static ssize_t 4338static ssize_t
4524suspend_hi_show(struct mddev *mddev, char *page) 4339suspend_hi_show(mddev_t *mddev, char *page)
4525{ 4340{
4526 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4341 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4527} 4342}
4528 4343
4529static ssize_t 4344static ssize_t
4530suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4345suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4531{ 4346{
4532 char *e; 4347 char *e;
4533 unsigned long long new = simple_strtoull(buf, &e, 10); 4348 unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4554,7 +4369,7 @@ static struct md_sysfs_entry md_suspend_hi =
4554__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4369__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4555 4370
4556static ssize_t 4371static ssize_t
4557reshape_position_show(struct mddev *mddev, char *page) 4372reshape_position_show(mddev_t *mddev, char *page)
4558{ 4373{
4559 if (mddev->reshape_position != MaxSector) 4374 if (mddev->reshape_position != MaxSector)
4560 return sprintf(page, "%llu\n", 4375 return sprintf(page, "%llu\n",
@@ -4564,9 +4379,8 @@ reshape_position_show(struct mddev *mddev, char *page)
4564} 4379}
4565 4380
4566static ssize_t 4381static ssize_t
4567reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4382reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
4568{ 4383{
4569 struct md_rdev *rdev;
4570 char *e; 4384 char *e;
4571 unsigned long long new = simple_strtoull(buf, &e, 10); 4385 unsigned long long new = simple_strtoull(buf, &e, 10);
4572 if (mddev->pers) 4386 if (mddev->pers)
@@ -4575,12 +4389,9 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4575 return -EINVAL; 4389 return -EINVAL;
4576 mddev->reshape_position = new; 4390 mddev->reshape_position = new;
4577 mddev->delta_disks = 0; 4391 mddev->delta_disks = 0;
4578 mddev->reshape_backwards = 0;
4579 mddev->new_level = mddev->level; 4392 mddev->new_level = mddev->level;
4580 mddev->new_layout = mddev->layout; 4393 mddev->new_layout = mddev->layout;
4581 mddev->new_chunk_sectors = mddev->chunk_sectors; 4394 mddev->new_chunk_sectors = mddev->chunk_sectors;
4582 rdev_for_each(rdev, mddev)
4583 rdev->new_data_offset = rdev->data_offset;
4584 return len; 4395 return len;
4585} 4396}
4586 4397
@@ -4589,43 +4400,7 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4589 reshape_position_store); 4400 reshape_position_store);
4590 4401
4591static ssize_t 4402static ssize_t
4592reshape_direction_show(struct mddev *mddev, char *page) 4403array_size_show(mddev_t *mddev, char *page)
4593{
4594 return sprintf(page, "%s\n",
4595 mddev->reshape_backwards ? "backwards" : "forwards");
4596}
4597
4598static ssize_t
4599reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4600{
4601 int backwards = 0;
4602 if (cmd_match(buf, "forwards"))
4603 backwards = 0;
4604 else if (cmd_match(buf, "backwards"))
4605 backwards = 1;
4606 else
4607 return -EINVAL;
4608 if (mddev->reshape_backwards == backwards)
4609 return len;
4610
4611 /* check if we are allowed to change */
4612 if (mddev->delta_disks)
4613 return -EBUSY;
4614
4615 if (mddev->persistent &&
4616 mddev->major_version == 0)
4617 return -EINVAL;
4618
4619 mddev->reshape_backwards = backwards;
4620 return len;
4621}
4622
4623static struct md_sysfs_entry md_reshape_direction =
4624__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4625 reshape_direction_store);
4626
4627static ssize_t
4628array_size_show(struct mddev *mddev, char *page)
4629{ 4404{
4630 if (mddev->external_size) 4405 if (mddev->external_size)
4631 return sprintf(page, "%llu\n", 4406 return sprintf(page, "%llu\n",
@@ -4635,7 +4410,7 @@ array_size_show(struct mddev *mddev, char *page)
4635} 4410}
4636 4411
4637static ssize_t 4412static ssize_t
4638array_size_store(struct mddev *mddev, const char *buf, size_t len) 4413array_size_store(mddev_t *mddev, const char *buf, size_t len)
4639{ 4414{
4640 sector_t sectors; 4415 sector_t sectors;
4641 4416
@@ -4679,7 +4454,6 @@ static struct attribute *md_default_attrs[] = {
4679 &md_safe_delay.attr, 4454 &md_safe_delay.attr,
4680 &md_array_state.attr, 4455 &md_array_state.attr,
4681 &md_reshape_position.attr, 4456 &md_reshape_position.attr,
4682 &md_reshape_direction.attr,
4683 &md_array_size.attr, 4457 &md_array_size.attr,
4684 &max_corr_read_errors.attr, 4458 &max_corr_read_errors.attr,
4685 NULL, 4459 NULL,
@@ -4711,25 +4485,16 @@ static ssize_t
4711md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4485md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4712{ 4486{
4713 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4487 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4714 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4488 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4715 ssize_t rv; 4489 ssize_t rv;
4716 4490
4717 if (!entry->show) 4491 if (!entry->show)
4718 return -EIO; 4492 return -EIO;
4719 spin_lock(&all_mddevs_lock);
4720 if (list_empty(&mddev->all_mddevs)) {
4721 spin_unlock(&all_mddevs_lock);
4722 return -EBUSY;
4723 }
4724 mddev_get(mddev);
4725 spin_unlock(&all_mddevs_lock);
4726
4727 rv = mddev_lock(mddev); 4493 rv = mddev_lock(mddev);
4728 if (!rv) { 4494 if (!rv) {
4729 rv = entry->show(mddev, page); 4495 rv = entry->show(mddev, page);
4730 mddev_unlock(mddev); 4496 mddev_unlock(mddev);
4731 } 4497 }
4732 mddev_put(mddev);
4733 return rv; 4498 return rv;
4734} 4499}
4735 4500
@@ -4738,34 +4503,26 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
4738 const char *page, size_t length) 4503 const char *page, size_t length)
4739{ 4504{
4740 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4505 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4741 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4506 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4742 ssize_t rv; 4507 ssize_t rv;
4743 4508
4744 if (!entry->store) 4509 if (!entry->store)
4745 return -EIO; 4510 return -EIO;
4746 if (!capable(CAP_SYS_ADMIN)) 4511 if (!capable(CAP_SYS_ADMIN))
4747 return -EACCES; 4512 return -EACCES;
4748 spin_lock(&all_mddevs_lock);
4749 if (list_empty(&mddev->all_mddevs)) {
4750 spin_unlock(&all_mddevs_lock);
4751 return -EBUSY;
4752 }
4753 mddev_get(mddev);
4754 spin_unlock(&all_mddevs_lock);
4755 if (entry->store == new_dev_store)
4756 flush_workqueue(md_misc_wq);
4757 rv = mddev_lock(mddev); 4513 rv = mddev_lock(mddev);
4514 if (mddev->hold_active == UNTIL_IOCTL)
4515 mddev->hold_active = 0;
4758 if (!rv) { 4516 if (!rv) {
4759 rv = entry->store(mddev, page, length); 4517 rv = entry->store(mddev, page, length);
4760 mddev_unlock(mddev); 4518 mddev_unlock(mddev);
4761 } 4519 }
4762 mddev_put(mddev);
4763 return rv; 4520 return rv;
4764} 4521}
4765 4522
4766static void md_free(struct kobject *ko) 4523static void md_free(struct kobject *ko)
4767{ 4524{
4768 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4525 mddev_t *mddev = container_of(ko, mddev_t, kobj);
4769 4526
4770 if (mddev->sysfs_state) 4527 if (mddev->sysfs_state)
4771 sysfs_put(mddev->sysfs_state); 4528 sysfs_put(mddev->sysfs_state);
@@ -4794,7 +4551,7 @@ int mdp_major = 0;
4794 4551
4795static void mddev_delayed_delete(struct work_struct *ws) 4552static void mddev_delayed_delete(struct work_struct *ws)
4796{ 4553{
4797 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4554 mddev_t *mddev = container_of(ws, mddev_t, del_work);
4798 4555
4799 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4556 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4800 kobject_del(&mddev->kobj); 4557 kobject_del(&mddev->kobj);
@@ -4804,7 +4561,7 @@ static void mddev_delayed_delete(struct work_struct *ws)
4804static int md_alloc(dev_t dev, char *name) 4561static int md_alloc(dev_t dev, char *name)
4805{ 4562{
4806 static DEFINE_MUTEX(disks_mutex); 4563 static DEFINE_MUTEX(disks_mutex);
4807 struct mddev *mddev = mddev_find(dev); 4564 mddev_t *mddev = mddev_find(dev);
4808 struct gendisk *disk; 4565 struct gendisk *disk;
4809 int partitioned; 4566 int partitioned;
4810 int shift; 4567 int shift;
@@ -4831,7 +4588,7 @@ static int md_alloc(dev_t dev, char *name)
4831 if (name) { 4588 if (name) {
4832 /* Need to ensure that 'name' is not a duplicate. 4589 /* Need to ensure that 'name' is not a duplicate.
4833 */ 4590 */
4834 struct mddev *mddev2; 4591 mddev_t *mddev2;
4835 spin_lock(&all_mddevs_lock); 4592 spin_lock(&all_mddevs_lock);
4836 4593
4837 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4594 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
@@ -4850,7 +4607,6 @@ static int md_alloc(dev_t dev, char *name)
4850 mddev->queue->queuedata = mddev; 4607 mddev->queue->queuedata = mddev;
4851 4608
4852 blk_queue_make_request(mddev->queue, md_make_request); 4609 blk_queue_make_request(mddev->queue, md_make_request);
4853 blk_set_stacking_limits(&mddev->queue->limits);
4854 4610
4855 disk = alloc_disk(1 << shift); 4611 disk = alloc_disk(1 << shift);
4856 if (!disk) { 4612 if (!disk) {
@@ -4933,7 +4689,7 @@ static int add_named_array(const char *val, struct kernel_param *kp)
4933 4689
4934static void md_safemode_timeout(unsigned long data) 4690static void md_safemode_timeout(unsigned long data)
4935{ 4691{
4936 struct mddev *mddev = (struct mddev *) data; 4692 mddev_t *mddev = (mddev_t *) data;
4937 4693
4938 if (!atomic_read(&mddev->writes_pending)) { 4694 if (!atomic_read(&mddev->writes_pending)) {
4939 mddev->safemode = 1; 4695 mddev->safemode = 1;
@@ -4945,11 +4701,11 @@ static void md_safemode_timeout(unsigned long data)
4945 4701
4946static int start_dirty_degraded; 4702static int start_dirty_degraded;
4947 4703
4948int md_run(struct mddev *mddev) 4704int md_run(mddev_t *mddev)
4949{ 4705{
4950 int err; 4706 int err;
4951 struct md_rdev *rdev; 4707 mdk_rdev_t *rdev;
4952 struct md_personality *pers; 4708 struct mdk_personality *pers;
4953 4709
4954 if (list_empty(&mddev->disks)) 4710 if (list_empty(&mddev->disks))
4955 /* cannot run an array with no devices.. */ 4711 /* cannot run an array with no devices.. */
@@ -4980,7 +4736,7 @@ int md_run(struct mddev *mddev)
4980 * the only valid external interface is through the md 4736 * the only valid external interface is through the md
4981 * device. 4737 * device.
4982 */ 4738 */
4983 rdev_for_each(rdev, mddev) { 4739 list_for_each_entry(rdev, &mddev->disks, same_set) {
4984 if (test_bit(Faulty, &rdev->flags)) 4740 if (test_bit(Faulty, &rdev->flags))
4985 continue; 4741 continue;
4986 sync_blockdev(rdev->bdev); 4742 sync_blockdev(rdev->bdev);
@@ -5012,7 +4768,8 @@ int md_run(struct mddev *mddev)
5012 } 4768 }
5013 4769
5014 if (mddev->bio_set == NULL) 4770 if (mddev->bio_set == NULL)
5015 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 4771 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4772 sizeof(mddev_t *));
5016 4773
5017 spin_lock(&pers_lock); 4774 spin_lock(&pers_lock);
5018 pers = find_pers(mddev->level, mddev->clevel); 4775 pers = find_pers(mddev->level, mddev->clevel);
@@ -5047,11 +4804,11 @@ int md_run(struct mddev *mddev)
5047 * configuration. 4804 * configuration.
5048 */ 4805 */
5049 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4806 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5050 struct md_rdev *rdev2; 4807 mdk_rdev_t *rdev2;
5051 int warned = 0; 4808 int warned = 0;
5052 4809
5053 rdev_for_each(rdev, mddev) 4810 list_for_each_entry(rdev, &mddev->disks, same_set)
5054 rdev_for_each(rdev2, mddev) { 4811 list_for_each_entry(rdev2, &mddev->disks, same_set) {
5055 if (rdev < rdev2 && 4812 if (rdev < rdev2 &&
5056 rdev->bdev->bd_contains == 4813 rdev->bdev->bd_contains ==
5057 rdev2->bdev->bd_contains) { 4814 rdev2->bdev->bd_contains) {
@@ -5094,8 +4851,7 @@ int md_run(struct mddev *mddev)
5094 err = -EINVAL; 4851 err = -EINVAL;
5095 mddev->pers->stop(mddev); 4852 mddev->pers->stop(mddev);
5096 } 4853 }
5097 if (err == 0 && mddev->pers->sync_request && 4854 if (err == 0 && mddev->pers->sync_request) {
5098 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5099 err = bitmap_create(mddev); 4855 err = bitmap_create(mddev);
5100 if (err) { 4856 if (err) {
5101 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4857 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -5129,7 +4885,7 @@ int md_run(struct mddev *mddev)
5129 mddev->in_sync = 1; 4885 mddev->in_sync = 1;
5130 smp_wmb(); 4886 smp_wmb();
5131 mddev->ready = 1; 4887 mddev->ready = 1;
5132 rdev_for_each(rdev, mddev) 4888 list_for_each_entry(rdev, &mddev->disks, same_set)
5133 if (rdev->raid_disk >= 0) 4889 if (rdev->raid_disk >= 0)
5134 if (sysfs_link_rdev(mddev, rdev)) 4890 if (sysfs_link_rdev(mddev, rdev))
5135 /* failure here is OK */; 4891 /* failure here is OK */;
@@ -5147,7 +4903,7 @@ int md_run(struct mddev *mddev)
5147} 4903}
5148EXPORT_SYMBOL_GPL(md_run); 4904EXPORT_SYMBOL_GPL(md_run);
5149 4905
5150static int do_md_run(struct mddev *mddev) 4906static int do_md_run(mddev_t *mddev)
5151{ 4907{
5152 int err; 4908 int err;
5153 4909
@@ -5171,7 +4927,7 @@ out:
5171 return err; 4927 return err;
5172} 4928}
5173 4929
5174static int restart_array(struct mddev *mddev) 4930static int restart_array(mddev_t *mddev)
5175{ 4931{
5176 struct gendisk *disk = mddev->gendisk; 4932 struct gendisk *disk = mddev->gendisk;
5177 4933
@@ -5221,7 +4977,7 @@ void restore_bitmap_write_access(struct file *file)
5221 spin_unlock(&inode->i_lock); 4977 spin_unlock(&inode->i_lock);
5222} 4978}
5223 4979
5224static void md_clean(struct mddev *mddev) 4980static void md_clean(mddev_t *mddev)
5225{ 4981{
5226 mddev->array_sectors = 0; 4982 mddev->array_sectors = 0;
5227 mddev->external_size = 0; 4983 mddev->external_size = 0;
@@ -5245,12 +5001,11 @@ static void md_clean(struct mddev *mddev)
5245 mddev->events = 0; 5001 mddev->events = 0;
5246 mddev->can_decrease_events = 0; 5002 mddev->can_decrease_events = 0;
5247 mddev->delta_disks = 0; 5003 mddev->delta_disks = 0;
5248 mddev->reshape_backwards = 0;
5249 mddev->new_level = LEVEL_NONE; 5004 mddev->new_level = LEVEL_NONE;
5250 mddev->new_layout = 0; 5005 mddev->new_layout = 0;
5251 mddev->new_chunk_sectors = 0; 5006 mddev->new_chunk_sectors = 0;
5252 mddev->curr_resync = 0; 5007 mddev->curr_resync = 0;
5253 atomic64_set(&mddev->resync_mismatches, 0); 5008 mddev->resync_mismatches = 0;
5254 mddev->suspend_lo = mddev->suspend_hi = 0; 5009 mddev->suspend_lo = mddev->suspend_hi = 0;
5255 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5010 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5256 mddev->recovery = 0; 5011 mddev->recovery = 0;
@@ -5258,16 +5013,14 @@ static void md_clean(struct mddev *mddev)
5258 mddev->changed = 0; 5013 mddev->changed = 0;
5259 mddev->degraded = 0; 5014 mddev->degraded = 0;
5260 mddev->safemode = 0; 5015 mddev->safemode = 0;
5261 mddev->merge_check_needed = 0;
5262 mddev->bitmap_info.offset = 0; 5016 mddev->bitmap_info.offset = 0;
5263 mddev->bitmap_info.default_offset = 0; 5017 mddev->bitmap_info.default_offset = 0;
5264 mddev->bitmap_info.default_space = 0;
5265 mddev->bitmap_info.chunksize = 0; 5018 mddev->bitmap_info.chunksize = 0;
5266 mddev->bitmap_info.daemon_sleep = 0; 5019 mddev->bitmap_info.daemon_sleep = 0;
5267 mddev->bitmap_info.max_write_behind = 0; 5020 mddev->bitmap_info.max_write_behind = 0;
5268} 5021}
5269 5022
5270static void __md_stop_writes(struct mddev *mddev) 5023static void __md_stop_writes(mddev_t *mddev)
5271{ 5024{
5272 if (mddev->sync_thread) { 5025 if (mddev->sync_thread) {
5273 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5026 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5287,7 +5040,7 @@ static void __md_stop_writes(struct mddev *mddev)
5287 } 5040 }
5288} 5041}
5289 5042
5290void md_stop_writes(struct mddev *mddev) 5043void md_stop_writes(mddev_t *mddev)
5291{ 5044{
5292 mddev_lock(mddev); 5045 mddev_lock(mddev);
5293 __md_stop_writes(mddev); 5046 __md_stop_writes(mddev);
@@ -5295,7 +5048,7 @@ void md_stop_writes(struct mddev *mddev)
5295} 5048}
5296EXPORT_SYMBOL_GPL(md_stop_writes); 5049EXPORT_SYMBOL_GPL(md_stop_writes);
5297 5050
5298static void __md_stop(struct mddev *mddev) 5051void md_stop(mddev_t *mddev)
5299{ 5052{
5300 mddev->ready = 0; 5053 mddev->ready = 0;
5301 mddev->pers->stop(mddev); 5054 mddev->pers->stop(mddev);
@@ -5305,31 +5058,17 @@ static void __md_stop(struct mddev *mddev)
5305 mddev->pers = NULL; 5058 mddev->pers = NULL;
5306 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5059 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5307} 5060}
5308
5309void md_stop(struct mddev *mddev)
5310{
5311 /* stop the array and free an attached data structures.
5312 * This is called from dm-raid
5313 */
5314 __md_stop(mddev);
5315 bitmap_destroy(mddev);
5316 if (mddev->bio_set)
5317 bioset_free(mddev->bio_set);
5318}
5319
5320EXPORT_SYMBOL_GPL(md_stop); 5061EXPORT_SYMBOL_GPL(md_stop);
5321 5062
5322static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5063static int md_set_readonly(mddev_t *mddev, int is_open)
5323{ 5064{
5324 int err = 0; 5065 int err = 0;
5325 mutex_lock(&mddev->open_mutex); 5066 mutex_lock(&mddev->open_mutex);
5326 if (atomic_read(&mddev->openers) > !!bdev) { 5067 if (atomic_read(&mddev->openers) > is_open) {
5327 printk("md: %s still in use.\n",mdname(mddev)); 5068 printk("md: %s still in use.\n",mdname(mddev));
5328 err = -EBUSY; 5069 err = -EBUSY;
5329 goto out; 5070 goto out;
5330 } 5071 }
5331 if (bdev)
5332 sync_blockdev(bdev);
5333 if (mddev->pers) { 5072 if (mddev->pers) {
5334 __md_stop_writes(mddev); 5073 __md_stop_writes(mddev);
5335 5074
@@ -5351,40 +5090,32 @@ out:
5351 * 0 - completely stop and dis-assemble array 5090 * 0 - completely stop and dis-assemble array
5352 * 2 - stop but do not disassemble array 5091 * 2 - stop but do not disassemble array
5353 */ 5092 */
5354static int do_md_stop(struct mddev * mddev, int mode, 5093static int do_md_stop(mddev_t * mddev, int mode, int is_open)
5355 struct block_device *bdev)
5356{ 5094{
5357 struct gendisk *disk = mddev->gendisk; 5095 struct gendisk *disk = mddev->gendisk;
5358 struct md_rdev *rdev; 5096 mdk_rdev_t *rdev;
5359 5097
5360 mutex_lock(&mddev->open_mutex); 5098 mutex_lock(&mddev->open_mutex);
5361 if (atomic_read(&mddev->openers) > !!bdev || 5099 if (atomic_read(&mddev->openers) > is_open ||
5362 mddev->sysfs_active) { 5100 mddev->sysfs_active) {
5363 printk("md: %s still in use.\n",mdname(mddev)); 5101 printk("md: %s still in use.\n",mdname(mddev));
5364 mutex_unlock(&mddev->open_mutex); 5102 mutex_unlock(&mddev->open_mutex);
5365 return -EBUSY; 5103 return -EBUSY;
5366 } 5104 }
5367 if (bdev)
5368 /* It is possible IO was issued on some other
5369 * open file which was closed before we took ->open_mutex.
5370 * As that was not the last close __blkdev_put will not
5371 * have called sync_blockdev, so we must.
5372 */
5373 sync_blockdev(bdev);
5374 5105
5375 if (mddev->pers) { 5106 if (mddev->pers) {
5376 if (mddev->ro) 5107 if (mddev->ro)
5377 set_disk_ro(disk, 0); 5108 set_disk_ro(disk, 0);
5378 5109
5379 __md_stop_writes(mddev); 5110 __md_stop_writes(mddev);
5380 __md_stop(mddev); 5111 md_stop(mddev);
5381 mddev->queue->merge_bvec_fn = NULL; 5112 mddev->queue->merge_bvec_fn = NULL;
5382 mddev->queue->backing_dev_info.congested_fn = NULL; 5113 mddev->queue->backing_dev_info.congested_fn = NULL;
5383 5114
5384 /* tell userspace to handle 'inactive' */ 5115 /* tell userspace to handle 'inactive' */
5385 sysfs_notify_dirent_safe(mddev->sysfs_state); 5116 sysfs_notify_dirent_safe(mddev->sysfs_state);
5386 5117
5387 rdev_for_each(rdev, mddev) 5118 list_for_each_entry(rdev, &mddev->disks, same_set)
5388 if (rdev->raid_disk >= 0) 5119 if (rdev->raid_disk >= 0)
5389 sysfs_unlink_rdev(mddev, rdev); 5120 sysfs_unlink_rdev(mddev, rdev);
5390 5121
@@ -5425,9 +5156,9 @@ static int do_md_stop(struct mddev * mddev, int mode,
5425} 5156}
5426 5157
5427#ifndef MODULE 5158#ifndef MODULE
5428static void autorun_array(struct mddev *mddev) 5159static void autorun_array(mddev_t *mddev)
5429{ 5160{
5430 struct md_rdev *rdev; 5161 mdk_rdev_t *rdev;
5431 int err; 5162 int err;
5432 5163
5433 if (list_empty(&mddev->disks)) 5164 if (list_empty(&mddev->disks))
@@ -5435,7 +5166,7 @@ static void autorun_array(struct mddev *mddev)
5435 5166
5436 printk(KERN_INFO "md: running: "); 5167 printk(KERN_INFO "md: running: ");
5437 5168
5438 rdev_for_each(rdev, mddev) { 5169 list_for_each_entry(rdev, &mddev->disks, same_set) {
5439 char b[BDEVNAME_SIZE]; 5170 char b[BDEVNAME_SIZE];
5440 printk("<%s>", bdevname(rdev->bdev,b)); 5171 printk("<%s>", bdevname(rdev->bdev,b));
5441 } 5172 }
@@ -5444,7 +5175,7 @@ static void autorun_array(struct mddev *mddev)
5444 err = do_md_run(mddev); 5175 err = do_md_run(mddev);
5445 if (err) { 5176 if (err) {
5446 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5177 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5447 do_md_stop(mddev, 0, NULL); 5178 do_md_stop(mddev, 0, 0);
5448 } 5179 }
5449} 5180}
5450 5181
@@ -5462,8 +5193,8 @@ static void autorun_array(struct mddev *mddev)
5462 */ 5193 */
5463static void autorun_devices(int part) 5194static void autorun_devices(int part)
5464{ 5195{
5465 struct md_rdev *rdev0, *rdev, *tmp; 5196 mdk_rdev_t *rdev0, *rdev, *tmp;
5466 struct mddev *mddev; 5197 mddev_t *mddev;
5467 char b[BDEVNAME_SIZE]; 5198 char b[BDEVNAME_SIZE];
5468 5199
5469 printk(KERN_INFO "md: autorun ...\n"); 5200 printk(KERN_INFO "md: autorun ...\n");
@@ -5472,7 +5203,7 @@ static void autorun_devices(int part)
5472 dev_t dev; 5203 dev_t dev;
5473 LIST_HEAD(candidates); 5204 LIST_HEAD(candidates);
5474 rdev0 = list_entry(pending_raid_disks.next, 5205 rdev0 = list_entry(pending_raid_disks.next,
5475 struct md_rdev, same_set); 5206 mdk_rdev_t, same_set);
5476 5207
5477 printk(KERN_INFO "md: considering %s ...\n", 5208 printk(KERN_INFO "md: considering %s ...\n",
5478 bdevname(rdev0->bdev,b)); 5209 bdevname(rdev0->bdev,b));
@@ -5558,15 +5289,14 @@ static int get_version(void __user * arg)
5558 return 0; 5289 return 0;
5559} 5290}
5560 5291
5561static int get_array_info(struct mddev * mddev, void __user * arg) 5292static int get_array_info(mddev_t * mddev, void __user * arg)
5562{ 5293{
5563 mdu_array_info_t info; 5294 mdu_array_info_t info;
5564 int nr,working,insync,failed,spare; 5295 int nr,working,insync,failed,spare;
5565 struct md_rdev *rdev; 5296 mdk_rdev_t *rdev;
5566 5297
5567 nr = working = insync = failed = spare = 0; 5298 nr=working=insync=failed=spare=0;
5568 rcu_read_lock(); 5299 list_for_each_entry(rdev, &mddev->disks, same_set) {
5569 rdev_for_each_rcu(rdev, mddev) {
5570 nr++; 5300 nr++;
5571 if (test_bit(Faulty, &rdev->flags)) 5301 if (test_bit(Faulty, &rdev->flags))
5572 failed++; 5302 failed++;
@@ -5578,7 +5308,6 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5578 spare++; 5308 spare++;
5579 } 5309 }
5580 } 5310 }
5581 rcu_read_unlock();
5582 5311
5583 info.major_version = mddev->major_version; 5312 info.major_version = mddev->major_version;
5584 info.minor_version = mddev->minor_version; 5313 info.minor_version = mddev->minor_version;
@@ -5613,7 +5342,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5613 return 0; 5342 return 0;
5614} 5343}
5615 5344
5616static int get_bitmap_file(struct mddev * mddev, void __user * arg) 5345static int get_bitmap_file(mddev_t * mddev, void __user * arg)
5617{ 5346{
5618 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5347 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5619 char *ptr, *buf = NULL; 5348 char *ptr, *buf = NULL;
@@ -5628,7 +5357,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5628 goto out; 5357 goto out;
5629 5358
5630 /* bitmap disabled, zero the first byte and copy out */ 5359 /* bitmap disabled, zero the first byte and copy out */
5631 if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5360 if (!mddev->bitmap || !mddev->bitmap->file) {
5632 file->pathname[0] = '\0'; 5361 file->pathname[0] = '\0';
5633 goto copy_out; 5362 goto copy_out;
5634 } 5363 }
@@ -5637,8 +5366,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5637 if (!buf) 5366 if (!buf)
5638 goto out; 5367 goto out;
5639 5368
5640 ptr = d_path(&mddev->bitmap->storage.file->f_path, 5369 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
5641 buf, sizeof(file->pathname));
5642 if (IS_ERR(ptr)) 5370 if (IS_ERR(ptr))
5643 goto out; 5371 goto out;
5644 5372
@@ -5654,16 +5382,15 @@ out:
5654 return err; 5382 return err;
5655} 5383}
5656 5384
5657static int get_disk_info(struct mddev * mddev, void __user * arg) 5385static int get_disk_info(mddev_t * mddev, void __user * arg)
5658{ 5386{
5659 mdu_disk_info_t info; 5387 mdu_disk_info_t info;
5660 struct md_rdev *rdev; 5388 mdk_rdev_t *rdev;
5661 5389
5662 if (copy_from_user(&info, arg, sizeof(info))) 5390 if (copy_from_user(&info, arg, sizeof(info)))
5663 return -EFAULT; 5391 return -EFAULT;
5664 5392
5665 rcu_read_lock(); 5393 rdev = find_rdev_nr(mddev, info.number);
5666 rdev = find_rdev_nr_rcu(mddev, info.number);
5667 if (rdev) { 5394 if (rdev) {
5668 info.major = MAJOR(rdev->bdev->bd_dev); 5395 info.major = MAJOR(rdev->bdev->bd_dev);
5669 info.minor = MINOR(rdev->bdev->bd_dev); 5396 info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5682,7 +5409,6 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
5682 info.raid_disk = -1; 5409 info.raid_disk = -1;
5683 info.state = (1<<MD_DISK_REMOVED); 5410 info.state = (1<<MD_DISK_REMOVED);
5684 } 5411 }
5685 rcu_read_unlock();
5686 5412
5687 if (copy_to_user(arg, &info, sizeof(info))) 5413 if (copy_to_user(arg, &info, sizeof(info)))
5688 return -EFAULT; 5414 return -EFAULT;
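
For context, get_array_info() and get_disk_info() above back the classic GET_ARRAY_INFO and GET_DISK_INFO ioctls. A minimal user-space sketch that exercises them could look like the following; the /dev/md0 path is an assumption, not part of this patch, and error handling is trimmed:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>        /* MD_MAJOR, used by the md ioctl numbers */
#include <linux/raid/md_u.h>    /* GET_ARRAY_INFO, GET_DISK_INFO, mdu_*_t */

int main(void)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk = { .number = 0 };      /* query slot 0 */
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/md0");
		return 1;
	}
	/* Kernel fills mdu_array_info_t and copies it back, as above. */
	if (ioctl(fd, GET_ARRAY_INFO, &array) == 0)
		printf("level %d, raid_disks %d, active %d, failed %d\n",
		       array.level, array.raid_disks,
		       array.active_disks, array.failed_disks);
	/* Caller sets .number; the kernel fills in major/minor/state. */
	if (ioctl(fd, GET_DISK_INFO, &disk) == 0)
		printf("disk 0: %d:%d state 0x%x\n",
		       disk.major, disk.minor, disk.state);
	close(fd);
	return 0;
}
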
@@ -5690,10 +5416,10 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
5690 return 0; 5416 return 0;
5691} 5417}
5692 5418
5693static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) 5419static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5694{ 5420{
5695 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5421 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5696 struct md_rdev *rdev; 5422 mdk_rdev_t *rdev;
5697 dev_t dev = MKDEV(info->major,info->minor); 5423 dev_t dev = MKDEV(info->major,info->minor);
5698 5424
5699 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5425 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
@@ -5710,9 +5436,8 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5710 return PTR_ERR(rdev); 5436 return PTR_ERR(rdev);
5711 } 5437 }
5712 if (!list_empty(&mddev->disks)) { 5438 if (!list_empty(&mddev->disks)) {
5713 struct md_rdev *rdev0 5439 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
5714 = list_entry(mddev->disks.next, 5440 mdk_rdev_t, same_set);
5715 struct md_rdev, same_set);
5716 err = super_types[mddev->major_version] 5441 err = super_types[mddev->major_version]
5717 .load_super(rdev, rdev0, mddev->minor_version); 5442 .load_super(rdev, rdev0, mddev->minor_version);
5718 if (err < 0) { 5443 if (err < 0) {
@@ -5766,7 +5491,8 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5766 super_types[mddev->major_version]. 5491 super_types[mddev->major_version].
5767 validate_super(mddev, rdev); 5492 validate_super(mddev, rdev);
5768 if ((info->state & (1<<MD_DISK_SYNC)) && 5493 if ((info->state & (1<<MD_DISK_SYNC)) &&
5769 rdev->raid_disk != info->raid_disk) { 5494 (!test_bit(In_sync, &rdev->flags) ||
5495 rdev->raid_disk != info->raid_disk)) {
5770 /* This was a hot-add request, but events doesn't 5496 /* This was a hot-add request, but events doesn't
5771 * match, so reject it. 5497 * match, so reject it.
5772 */ 5498 */
@@ -5861,10 +5587,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5861 return 0; 5587 return 0;
5862} 5588}
5863 5589
5864static int hot_remove_disk(struct mddev * mddev, dev_t dev) 5590static int hot_remove_disk(mddev_t * mddev, dev_t dev)
5865{ 5591{
5866 char b[BDEVNAME_SIZE]; 5592 char b[BDEVNAME_SIZE];
5867 struct md_rdev *rdev; 5593 mdk_rdev_t *rdev;
5868 5594
5869 rdev = find_rdev(mddev, dev); 5595 rdev = find_rdev(mddev, dev);
5870 if (!rdev) 5596 if (!rdev)
@@ -5884,11 +5610,11 @@ busy:
5884 return -EBUSY; 5610 return -EBUSY;
5885} 5611}
5886 5612
5887static int hot_add_disk(struct mddev * mddev, dev_t dev) 5613static int hot_add_disk(mddev_t * mddev, dev_t dev)
5888{ 5614{
5889 char b[BDEVNAME_SIZE]; 5615 char b[BDEVNAME_SIZE];
5890 int err; 5616 int err;
5891 struct md_rdev *rdev; 5617 mdk_rdev_t *rdev;
5892 5618
5893 if (!mddev->pers) 5619 if (!mddev->pers)
5894 return -ENODEV; 5620 return -ENODEV;
@@ -5958,7 +5684,7 @@ abort_export:
5958 return err; 5684 return err;
5959} 5685}
5960 5686
5961static int set_bitmap_file(struct mddev *mddev, int fd) 5687static int set_bitmap_file(mddev_t *mddev, int fd)
5962{ 5688{
5963 int err; 5689 int err;
5964 5690
@@ -6031,7 +5757,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
6031 * The minor and patch _version numbers are also kept incase the 5757 * The minor and patch _version numbers are also kept incase the
6032 * super_block handler wishes to interpret them. 5758 * super_block handler wishes to interpret them.
6033 */ 5759 */
6034static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) 5760static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
6035{ 5761{
6036 5762
6037 if (info->raid_disks == 0) { 5763 if (info->raid_disks == 0) {
@@ -6084,7 +5810,6 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6084 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5810 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6085 5811
6086 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 5812 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6087 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6088 mddev->bitmap_info.offset = 0; 5813 mddev->bitmap_info.offset = 0;
6089 5814
6090 mddev->reshape_position = MaxSector; 5815 mddev->reshape_position = MaxSector;
@@ -6098,12 +5823,11 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6098 mddev->new_chunk_sectors = mddev->chunk_sectors; 5823 mddev->new_chunk_sectors = mddev->chunk_sectors;
6099 mddev->new_layout = mddev->layout; 5824 mddev->new_layout = mddev->layout;
6100 mddev->delta_disks = 0; 5825 mddev->delta_disks = 0;
6101 mddev->reshape_backwards = 0;
6102 5826
6103 return 0; 5827 return 0;
6104} 5828}
6105 5829
6106void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 5830void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
6107{ 5831{
6108 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5832 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6109 5833
@@ -6114,9 +5838,9 @@ void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6114} 5838}
6115EXPORT_SYMBOL(md_set_array_sectors); 5839EXPORT_SYMBOL(md_set_array_sectors);
6116 5840
6117static int update_size(struct mddev *mddev, sector_t num_sectors) 5841static int update_size(mddev_t *mddev, sector_t num_sectors)
6118{ 5842{
6119 struct md_rdev *rdev; 5843 mdk_rdev_t *rdev;
6120 int rv; 5844 int rv;
6121 int fit = (num_sectors == 0); 5845 int fit = (num_sectors == 0);
6122 5846
@@ -6133,8 +5857,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
6133 */ 5857 */
6134 if (mddev->sync_thread) 5858 if (mddev->sync_thread)
6135 return -EBUSY; 5859 return -EBUSY;
6136 5860 if (mddev->bitmap)
6137 rdev_for_each(rdev, mddev) { 5861 /* Sorry, cannot grow a bitmap yet, just remove it,
5862 * grow, and re-add.
5863 */
5864 return -EBUSY;
5865 list_for_each_entry(rdev, &mddev->disks, same_set) {
6138 sector_t avail = rdev->sectors; 5866 sector_t avail = rdev->sectors;
6139 5867
6140 if (fit && (num_sectors == 0 || num_sectors > avail)) 5868 if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6148,10 +5876,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
6148 return rv; 5876 return rv;
6149} 5877}
6150 5878
6151static int update_raid_disks(struct mddev *mddev, int raid_disks) 5879static int update_raid_disks(mddev_t *mddev, int raid_disks)
6152{ 5880{
6153 int rv; 5881 int rv;
6154 struct md_rdev *rdev;
6155 /* change the number of raid disks */ 5882 /* change the number of raid disks */
6156 if (mddev->pers->check_reshape == NULL) 5883 if (mddev->pers->check_reshape == NULL)
6157 return -EINVAL; 5884 return -EINVAL;
@@ -6160,27 +5887,11 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
6160 return -EINVAL; 5887 return -EINVAL;
6161 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5888 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
6162 return -EBUSY; 5889 return -EBUSY;
6163
6164 rdev_for_each(rdev, mddev) {
6165 if (mddev->raid_disks < raid_disks &&
6166 rdev->data_offset < rdev->new_data_offset)
6167 return -EINVAL;
6168 if (mddev->raid_disks > raid_disks &&
6169 rdev->data_offset > rdev->new_data_offset)
6170 return -EINVAL;
6171 }
6172
6173 mddev->delta_disks = raid_disks - mddev->raid_disks; 5890 mddev->delta_disks = raid_disks - mddev->raid_disks;
6174 if (mddev->delta_disks < 0)
6175 mddev->reshape_backwards = 1;
6176 else if (mddev->delta_disks > 0)
6177 mddev->reshape_backwards = 0;
6178 5891
6179 rv = mddev->pers->check_reshape(mddev); 5892 rv = mddev->pers->check_reshape(mddev);
6180 if (rv < 0) { 5893 if (rv < 0)
6181 mddev->delta_disks = 0; 5894 mddev->delta_disks = 0;
6182 mddev->reshape_backwards = 0;
6183 }
6184 return rv; 5895 return rv;
6185} 5896}
6186 5897
@@ -6193,7 +5904,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
6193 * Any differences that cannot be handled will cause an error. 5904 * Any differences that cannot be handled will cause an error.
6194 * Normally, only one change can be managed at a time. 5905 * Normally, only one change can be managed at a time.
6195 */ 5906 */
6196static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 5907static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
6197{ 5908{
6198 int rv = 0; 5909 int rv = 0;
6199 int cnt = 0; 5910 int cnt = 0;
@@ -6263,8 +5974,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6263 return -EINVAL; 5974 return -EINVAL;
6264 mddev->bitmap_info.offset = 5975 mddev->bitmap_info.offset =
6265 mddev->bitmap_info.default_offset; 5976 mddev->bitmap_info.default_offset;
6266 mddev->bitmap_info.space =
6267 mddev->bitmap_info.default_space;
6268 mddev->pers->quiesce(mddev, 1); 5977 mddev->pers->quiesce(mddev, 1);
6269 rv = bitmap_create(mddev); 5978 rv = bitmap_create(mddev);
6270 if (!rv) 5979 if (!rv)
@@ -6276,7 +5985,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6276 /* remove the bitmap */ 5985 /* remove the bitmap */
6277 if (!mddev->bitmap) 5986 if (!mddev->bitmap)
6278 return -ENOENT; 5987 return -ENOENT;
6279 if (mddev->bitmap->storage.file) 5988 if (mddev->bitmap->file)
6280 return -EINVAL; 5989 return -EINVAL;
6281 mddev->pers->quiesce(mddev, 1); 5990 mddev->pers->quiesce(mddev, 1);
6282 bitmap_destroy(mddev); 5991 bitmap_destroy(mddev);
@@ -6288,25 +5997,21 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6288 return rv; 5997 return rv;
6289} 5998}
6290 5999
6291static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6000static int set_disk_faulty(mddev_t *mddev, dev_t dev)
6292{ 6001{
6293 struct md_rdev *rdev; 6002 mdk_rdev_t *rdev;
6294 int err = 0;
6295 6003
6296 if (mddev->pers == NULL) 6004 if (mddev->pers == NULL)
6297 return -ENODEV; 6005 return -ENODEV;
6298 6006
6299 rcu_read_lock(); 6007 rdev = find_rdev(mddev, dev);
6300 rdev = find_rdev_rcu(mddev, dev);
6301 if (!rdev) 6008 if (!rdev)
6302 err = -ENODEV; 6009 return -ENODEV;
6303 else { 6010
6304 md_error(mddev, rdev); 6011 md_error(mddev, rdev);
6305 if (!test_bit(Faulty, &rdev->flags)) 6012 if (!test_bit(Faulty, &rdev->flags))
6306 err = -EBUSY; 6013 return -EBUSY;
6307 } 6014 return 0;
6308 rcu_read_unlock();
6309 return err;
6310} 6015}
6311 6016
6312/* 6017/*
@@ -6317,7 +6022,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6317 */ 6022 */
6318static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6023static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6319{ 6024{
6320 struct mddev *mddev = bdev->bd_disk->private_data; 6025 mddev_t *mddev = bdev->bd_disk->private_data;
6321 6026
6322 geo->heads = 2; 6027 geo->heads = 2;
6323 geo->sectors = 4; 6028 geo->sectors = 4;
@@ -6330,40 +6035,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6330{ 6035{
6331 int err = 0; 6036 int err = 0;
6332 void __user *argp = (void __user *)arg; 6037 void __user *argp = (void __user *)arg;
6333 struct mddev *mddev = NULL; 6038 mddev_t *mddev = NULL;
6334 int ro; 6039 int ro;
6335 6040
6336 switch (cmd) { 6041 if (!capable(CAP_SYS_ADMIN))
6337 case RAID_VERSION: 6042 return -EACCES;
6338 case GET_ARRAY_INFO:
6339 case GET_DISK_INFO:
6340 break;
6341 default:
6342 if (!capable(CAP_SYS_ADMIN))
6343 return -EACCES;
6344 }
6345 6043
6346 /* 6044 /*
6347 * Commands dealing with the RAID driver but not any 6045 * Commands dealing with the RAID driver but not any
6348 * particular array: 6046 * particular array:
6349 */ 6047 */
6350 switch (cmd) { 6048 switch (cmd)
6351 case RAID_VERSION: 6049 {
6352 err = get_version(argp); 6050 case RAID_VERSION:
6353 goto done; 6051 err = get_version(argp);
6052 goto done;
6354 6053
6355 case PRINT_RAID_DEBUG: 6054 case PRINT_RAID_DEBUG:
6356 err = 0; 6055 err = 0;
6357 md_print_devices(); 6056 md_print_devices();
6358 goto done; 6057 goto done;
6359 6058
6360#ifndef MODULE 6059#ifndef MODULE
6361 case RAID_AUTORUN: 6060 case RAID_AUTORUN:
6362 err = 0; 6061 err = 0;
6363 autostart_arrays(arg); 6062 autostart_arrays(arg);
6364 goto done; 6063 goto done;
6365#endif 6064#endif
6366 default:; 6065 default:;
6367 } 6066 }
6368 6067
6369 /* 6068 /*
@@ -6377,31 +6076,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6377 goto abort; 6076 goto abort;
6378 } 6077 }
6379 6078
6380 /* Some actions do not requires the mutex */
6381 switch (cmd) {
6382 case GET_ARRAY_INFO:
6383 if (!mddev->raid_disks && !mddev->external)
6384 err = -ENODEV;
6385 else
6386 err = get_array_info(mddev, argp);
6387 goto abort;
6388
6389 case GET_DISK_INFO:
6390 if (!mddev->raid_disks && !mddev->external)
6391 err = -ENODEV;
6392 else
6393 err = get_disk_info(mddev, argp);
6394 goto abort;
6395
6396 case SET_DISK_FAULTY:
6397 err = set_disk_faulty(mddev, new_decode_dev(arg));
6398 goto abort;
6399 }
6400
6401 if (cmd == ADD_NEW_DISK)
6402 /* need to ensure md_delayed_delete() has completed */
6403 flush_workqueue(md_misc_wq);
6404
6405 err = mddev_lock(mddev); 6079 err = mddev_lock(mddev);
6406 if (err) { 6080 if (err) {
6407 printk(KERN_INFO 6081 printk(KERN_INFO
@@ -6410,44 +6084,50 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6410 goto abort; 6084 goto abort;
6411 } 6085 }
6412 6086
6413 if (cmd == SET_ARRAY_INFO) { 6087 switch (cmd)
6414 mdu_array_info_t info; 6088 {
6415 if (!arg) 6089 case SET_ARRAY_INFO:
6416 memset(&info, 0, sizeof(info)); 6090 {
6417 else if (copy_from_user(&info, argp, sizeof(info))) { 6091 mdu_array_info_t info;
6418 err = -EFAULT; 6092 if (!arg)
6419 goto abort_unlock; 6093 memset(&info, 0, sizeof(info));
6420 } 6094 else if (copy_from_user(&info, argp, sizeof(info))) {
6421 if (mddev->pers) { 6095 err = -EFAULT;
6422 err = update_array_info(mddev, &info); 6096 goto abort_unlock;
6423 if (err) { 6097 }
6424 printk(KERN_WARNING "md: couldn't update" 6098 if (mddev->pers) {
6425 " array info. %d\n", err); 6099 err = update_array_info(mddev, &info);
6426 goto abort_unlock; 6100 if (err) {
6101 printk(KERN_WARNING "md: couldn't update"
6102 " array info. %d\n", err);
6103 goto abort_unlock;
6104 }
6105 goto done_unlock;
6106 }
6107 if (!list_empty(&mddev->disks)) {
6108 printk(KERN_WARNING
6109 "md: array %s already has disks!\n",
6110 mdname(mddev));
6111 err = -EBUSY;
6112 goto abort_unlock;
6113 }
6114 if (mddev->raid_disks) {
6115 printk(KERN_WARNING
6116 "md: array %s already initialised!\n",
6117 mdname(mddev));
6118 err = -EBUSY;
6119 goto abort_unlock;
6120 }
6121 err = set_array_info(mddev, &info);
6122 if (err) {
6123 printk(KERN_WARNING "md: couldn't set"
6124 " array info. %d\n", err);
6125 goto abort_unlock;
6126 }
6427 } 6127 }
6428 goto done_unlock; 6128 goto done_unlock;
6429 } 6129
6430 if (!list_empty(&mddev->disks)) { 6130 default:;
6431 printk(KERN_WARNING
6432 "md: array %s already has disks!\n",
6433 mdname(mddev));
6434 err = -EBUSY;
6435 goto abort_unlock;
6436 }
6437 if (mddev->raid_disks) {
6438 printk(KERN_WARNING
6439 "md: array %s already initialised!\n",
6440 mdname(mddev));
6441 err = -EBUSY;
6442 goto abort_unlock;
6443 }
6444 err = set_array_info(mddev, &info);
6445 if (err) {
6446 printk(KERN_WARNING "md: couldn't set"
6447 " array info. %d\n", err);
6448 goto abort_unlock;
6449 }
6450 goto done_unlock;
6451 } 6131 }
6452 6132
6453 /* 6133 /*
@@ -6466,51 +6146,60 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6466 /* 6146 /*
6467 * Commands even a read-only array can execute: 6147 * Commands even a read-only array can execute:
6468 */ 6148 */
6469 switch (cmd) { 6149 switch (cmd)
6470 case GET_BITMAP_FILE: 6150 {
6471 err = get_bitmap_file(mddev, argp); 6151 case GET_ARRAY_INFO:
6472 goto done_unlock; 6152 err = get_array_info(mddev, argp);
6473 6153 goto done_unlock;
6474 case RESTART_ARRAY_RW:
6475 err = restart_array(mddev);
6476 goto done_unlock;
6477 6154
6478 case STOP_ARRAY: 6155 case GET_BITMAP_FILE:
6479 err = do_md_stop(mddev, 0, bdev); 6156 err = get_bitmap_file(mddev, argp);
6480 goto done_unlock; 6157 goto done_unlock;
6481 6158
6482 case STOP_ARRAY_RO: 6159 case GET_DISK_INFO:
6483 err = md_set_readonly(mddev, bdev); 6160 err = get_disk_info(mddev, argp);
6484 goto done_unlock; 6161 goto done_unlock;
6485 6162
6486 case BLKROSET: 6163 case RESTART_ARRAY_RW:
6487 if (get_user(ro, (int __user *)(arg))) { 6164 err = restart_array(mddev);
6488 err = -EFAULT;
6489 goto done_unlock; 6165 goto done_unlock;
6490 }
6491 err = -EINVAL;
6492 6166
6493 /* if the bdev is going readonly the value of mddev->ro 6167 case STOP_ARRAY:
6494 * does not matter, no writes are coming 6168 err = do_md_stop(mddev, 0, 1);
6495 */
6496 if (ro)
6497 goto done_unlock; 6169 goto done_unlock;
6498 6170
6499 /* are we are already prepared for writes? */ 6171 case STOP_ARRAY_RO:
6500 if (mddev->ro != 1) 6172 err = md_set_readonly(mddev, 1);
6501 goto done_unlock; 6173 goto done_unlock;
6502 6174
6503 /* transitioning to readauto need only happen for 6175 case BLKROSET:
6504 * arrays that call md_write_start 6176 if (get_user(ro, (int __user *)(arg))) {
6505 */ 6177 err = -EFAULT;
6506 if (mddev->pers) { 6178 goto done_unlock;
6507 err = restart_array(mddev);
6508 if (err == 0) {
6509 mddev->ro = 2;
6510 set_disk_ro(mddev->gendisk, 0);
6511 } 6179 }
6512 } 6180 err = -EINVAL;
6513 goto done_unlock; 6181
6182 /* if the bdev is going readonly the value of mddev->ro
6183 * does not matter, no writes are coming
6184 */
6185 if (ro)
6186 goto done_unlock;
6187
6188 /* are we are already prepared for writes? */
6189 if (mddev->ro != 1)
6190 goto done_unlock;
6191
6192 /* transitioning to readauto need only happen for
6193 * arrays that call md_write_start
6194 */
6195 if (mddev->pers) {
6196 err = restart_array(mddev);
6197 if (err == 0) {
6198 mddev->ro = 2;
6199 set_disk_ro(mddev->gendisk, 0);
6200 }
6201 }
6202 goto done_unlock;
6514 } 6203 }
6515 6204
6516 /* 6205 /*
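
The BLKROSET branch above is md's hook into the generic block-device read-only ioctl: clearing the flag on an array that is currently read-only (mddev->ro == 1) moves it to read-auto (ro set to 2 by the handler), so the array stays clean until the first write arrives. From user space this is just the standard ioctl pair from <linux/fs.h>; a rough sketch, with /dev/md0 assumed:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* BLKROGET, BLKROSET */

int main(void)
{
	int fd = open("/dev/md0", O_RDONLY);
	int ro;

	if (fd < 0) {
		perror("open /dev/md0");
		return 1;
	}
	if (ioctl(fd, BLKROGET, &ro) == 0)
		printf("read-only flag: %d\n", ro);

	ro = 0;                         /* ask for read-write again */
	if (ioctl(fd, BLKROSET, &ro) != 0)
		perror("BLKROSET");
	close(fd);
	return 0;
}
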
@@ -6532,36 +6221,41 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6532 } 6221 }
6533 } 6222 }
6534 6223
6535 switch (cmd) { 6224 switch (cmd)
6536 case ADD_NEW_DISK:
6537 { 6225 {
6538 mdu_disk_info_t info; 6226 case ADD_NEW_DISK:
6539 if (copy_from_user(&info, argp, sizeof(info))) 6227 {
6540 err = -EFAULT; 6228 mdu_disk_info_t info;
6541 else 6229 if (copy_from_user(&info, argp, sizeof(info)))
6542 err = add_new_disk(mddev, &info); 6230 err = -EFAULT;
6543 goto done_unlock; 6231 else
6544 } 6232 err = add_new_disk(mddev, &info);
6233 goto done_unlock;
6234 }
6545 6235
6546 case HOT_REMOVE_DISK: 6236 case HOT_REMOVE_DISK:
6547 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6237 err = hot_remove_disk(mddev, new_decode_dev(arg));
6548 goto done_unlock; 6238 goto done_unlock;
6549 6239
6550 case HOT_ADD_DISK: 6240 case HOT_ADD_DISK:
6551 err = hot_add_disk(mddev, new_decode_dev(arg)); 6241 err = hot_add_disk(mddev, new_decode_dev(arg));
6552 goto done_unlock; 6242 goto done_unlock;
6553 6243
6554 case RUN_ARRAY: 6244 case SET_DISK_FAULTY:
6555 err = do_md_run(mddev); 6245 err = set_disk_faulty(mddev, new_decode_dev(arg));
6556 goto done_unlock; 6246 goto done_unlock;
6557 6247
6558 case SET_BITMAP_FILE: 6248 case RUN_ARRAY:
6559 err = set_bitmap_file(mddev, (int)arg); 6249 err = do_md_run(mddev);
6560 goto done_unlock; 6250 goto done_unlock;
6561 6251
6562 default: 6252 case SET_BITMAP_FILE:
6563 err = -EINVAL; 6253 err = set_bitmap_file(mddev, (int)arg);
6564 goto abort_unlock; 6254 goto done_unlock;
6255
6256 default:
6257 err = -EINVAL;
6258 goto abort_unlock;
6565 } 6259 }
6566 6260
6567done_unlock: 6261done_unlock:
@@ -6604,12 +6298,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6604 * Succeed if we can lock the mddev, which confirms that 6298 * Succeed if we can lock the mddev, which confirms that
6605 * it isn't being stopped right now. 6299 * it isn't being stopped right now.
6606 */ 6300 */
6607 struct mddev *mddev = mddev_find(bdev->bd_dev); 6301 mddev_t *mddev = mddev_find(bdev->bd_dev);
6608 int err; 6302 int err;
6609 6303
6610 if (!mddev)
6611 return -ENODEV;
6612
6613 if (mddev->gendisk != bdev->bd_disk) { 6304 if (mddev->gendisk != bdev->bd_disk) {
6614 /* we are racing with mddev_put which is discarding this 6305 /* we are racing with mddev_put which is discarding this
6615 * bd_disk. 6306 * bd_disk.
@@ -6636,7 +6327,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6636 6327
6637static int md_release(struct gendisk *disk, fmode_t mode) 6328static int md_release(struct gendisk *disk, fmode_t mode)
6638{ 6329{
6639 struct mddev *mddev = disk->private_data; 6330 mddev_t *mddev = disk->private_data;
6640 6331
6641 BUG_ON(!mddev); 6332 BUG_ON(!mddev);
6642 atomic_dec(&mddev->openers); 6333 atomic_dec(&mddev->openers);
@@ -6647,14 +6338,14 @@ static int md_release(struct gendisk *disk, fmode_t mode)
6647 6338
6648static int md_media_changed(struct gendisk *disk) 6339static int md_media_changed(struct gendisk *disk)
6649{ 6340{
6650 struct mddev *mddev = disk->private_data; 6341 mddev_t *mddev = disk->private_data;
6651 6342
6652 return mddev->changed; 6343 return mddev->changed;
6653} 6344}
6654 6345
6655static int md_revalidate(struct gendisk *disk) 6346static int md_revalidate(struct gendisk *disk)
6656{ 6347{
6657 struct mddev *mddev = disk->private_data; 6348 mddev_t *mddev = disk->private_data;
6658 6349
6659 mddev->changed = 0; 6350 mddev->changed = 0;
6660 return 0; 6351 return 0;
@@ -6675,7 +6366,7 @@ static const struct block_device_operations md_fops =
6675 6366
6676static int md_thread(void * arg) 6367static int md_thread(void * arg)
6677{ 6368{
6678 struct md_thread *thread = arg; 6369 mdk_thread_t *thread = arg;
6679 6370
6680 /* 6371 /*
6681 * md_thread is a 'system-thread', it's priority should be very 6372 * md_thread is a 'system-thread', it's priority should be very
@@ -6708,27 +6399,27 @@ static int md_thread(void * arg)
6708 6399
6709 clear_bit(THREAD_WAKEUP, &thread->flags); 6400 clear_bit(THREAD_WAKEUP, &thread->flags);
6710 if (!kthread_should_stop()) 6401 if (!kthread_should_stop())
6711 thread->run(thread); 6402 thread->run(thread->mddev);
6712 } 6403 }
6713 6404
6714 return 0; 6405 return 0;
6715} 6406}
6716 6407
6717void md_wakeup_thread(struct md_thread *thread) 6408void md_wakeup_thread(mdk_thread_t *thread)
6718{ 6409{
6719 if (thread) { 6410 if (thread) {
6720 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 6411 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
6721 set_bit(THREAD_WAKEUP, &thread->flags); 6412 set_bit(THREAD_WAKEUP, &thread->flags);
6722 wake_up(&thread->wqueue); 6413 wake_up(&thread->wqueue);
6723 } 6414 }
6724} 6415}
6725 6416
6726struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6417mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
6727 struct mddev *mddev, const char *name) 6418 const char *name)
6728{ 6419{
6729 struct md_thread *thread; 6420 mdk_thread_t *thread;
6730 6421
6731 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 6422 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
6732 if (!thread) 6423 if (!thread)
6733 return NULL; 6424 return NULL;
6734 6425
@@ -6740,7 +6431,7 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6740 thread->tsk = kthread_run(md_thread, thread, 6431 thread->tsk = kthread_run(md_thread, thread,
6741 "%s_%s", 6432 "%s_%s",
6742 mdname(thread->mddev), 6433 mdname(thread->mddev),
6743 name); 6434 name ?: mddev->pers->name);
6744 if (IS_ERR(thread->tsk)) { 6435 if (IS_ERR(thread->tsk)) {
6745 kfree(thread); 6436 kfree(thread);
6746 return NULL; 6437 return NULL;
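
md_register_thread() and md_wakeup_thread() above implement a simple flag-plus-waitqueue protocol around a kthread. A stripped-down sketch of the same pattern follows; it is not the md code itself, the names are invented, and the waitqueue/kthread setup calls (init_waitqueue_head(), kthread_run()) are omitted:

/* Illustrative kthread wake-up pattern, modelled on md_thread() above. */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct worker {
	unsigned long flags;            /* bit 0: wake-up pending */
	wait_queue_head_t wqueue;
	void (*run)(struct worker *w);
	struct task_struct *tsk;
};

static int worker_fn(void *arg)
{
	struct worker *w = arg;

	while (!kthread_should_stop()) {
		/* Sleep until someone sets the flag or asks us to stop. */
		wait_event_interruptible(w->wqueue,
			test_bit(0, &w->flags) || kthread_should_stop());
		clear_bit(0, &w->flags);
		if (!kthread_should_stop())
			w->run(w);
	}
	return 0;
}

static void worker_wakeup(struct worker *w)
{
	set_bit(0, &w->flags);          /* mark work pending ... */
	wake_up(&w->wqueue);            /* ... then wake the sleeper */
}
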
@@ -6748,12 +6439,12 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6748 return thread; 6439 return thread;
6749} 6440}
6750 6441
6751void md_unregister_thread(struct md_thread **threadp) 6442void md_unregister_thread(mdk_thread_t **threadp)
6752{ 6443{
6753 struct md_thread *thread = *threadp; 6444 mdk_thread_t *thread = *threadp;
6754 if (!thread) 6445 if (!thread)
6755 return; 6446 return;
6756 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6447 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6757 /* Locking ensures that mddev_unlock does not wake_up a 6448 /* Locking ensures that mddev_unlock does not wake_up a
6758 * non-existent thread 6449 * non-existent thread
6759 */ 6450 */
@@ -6765,7 +6456,7 @@ void md_unregister_thread(struct md_thread **threadp)
6765 kfree(thread); 6456 kfree(thread);
6766} 6457}
6767 6458
6768void md_error(struct mddev *mddev, struct md_rdev *rdev) 6459void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6769{ 6460{
6770 if (!mddev) { 6461 if (!mddev) {
6771 MD_BUG(); 6462 MD_BUG();
@@ -6794,7 +6485,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
6794static void status_unused(struct seq_file *seq) 6485static void status_unused(struct seq_file *seq)
6795{ 6486{
6796 int i = 0; 6487 int i = 0;
6797 struct md_rdev *rdev; 6488 mdk_rdev_t *rdev;
6798 6489
6799 seq_printf(seq, "unused devices: "); 6490 seq_printf(seq, "unused devices: ");
6800 6491
@@ -6811,7 +6502,7 @@ static void status_unused(struct seq_file *seq)
6811} 6502}
6812 6503
6813 6504
6814static void status_resync(struct seq_file *seq, struct mddev * mddev) 6505static void status_resync(struct seq_file *seq, mddev_t * mddev)
6815{ 6506{
6816 sector_t max_sectors, resync, res; 6507 sector_t max_sectors, resync, res;
6817 unsigned long dt, db; 6508 unsigned long dt, db;
@@ -6819,14 +6510,9 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
6819 int scale; 6510 int scale;
6820 unsigned int per_milli; 6511 unsigned int per_milli;
6821 6512
6822 if (mddev->curr_resync <= 3) 6513 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
6823 resync = 0;
6824 else
6825 resync = mddev->curr_resync
6826 - atomic_read(&mddev->recovery_active);
6827 6514
6828 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6515 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6829 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6830 max_sectors = mddev->resync_max_sectors; 6516 max_sectors = mddev->resync_max_sectors;
6831 else 6517 else
6832 max_sectors = mddev->dev_sectors; 6518 max_sectors = mddev->dev_sectors;
@@ -6907,7 +6593,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6907{ 6593{
6908 struct list_head *tmp; 6594 struct list_head *tmp;
6909 loff_t l = *pos; 6595 loff_t l = *pos;
6910 struct mddev *mddev; 6596 mddev_t *mddev;
6911 6597
6912 if (l >= 0x10000) 6598 if (l >= 0x10000)
6913 return NULL; 6599 return NULL;
@@ -6918,7 +6604,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6918 spin_lock(&all_mddevs_lock); 6604 spin_lock(&all_mddevs_lock);
6919 list_for_each(tmp,&all_mddevs) 6605 list_for_each(tmp,&all_mddevs)
6920 if (!l--) { 6606 if (!l--) {
6921 mddev = list_entry(tmp, struct mddev, all_mddevs); 6607 mddev = list_entry(tmp, mddev_t, all_mddevs);
6922 mddev_get(mddev); 6608 mddev_get(mddev);
6923 spin_unlock(&all_mddevs_lock); 6609 spin_unlock(&all_mddevs_lock);
6924 return mddev; 6610 return mddev;
@@ -6932,7 +6618,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6932static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 6618static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6933{ 6619{
6934 struct list_head *tmp; 6620 struct list_head *tmp;
6935 struct mddev *next_mddev, *mddev = v; 6621 mddev_t *next_mddev, *mddev = v;
6936 6622
6937 ++*pos; 6623 ++*pos;
6938 if (v == (void*)2) 6624 if (v == (void*)2)
@@ -6944,7 +6630,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6944 else 6630 else
6945 tmp = mddev->all_mddevs.next; 6631 tmp = mddev->all_mddevs.next;
6946 if (tmp != &all_mddevs) 6632 if (tmp != &all_mddevs)
6947 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 6633 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
6948 else { 6634 else {
6949 next_mddev = (void*)2; 6635 next_mddev = (void*)2;
6950 *pos = 0x10000; 6636 *pos = 0x10000;
@@ -6959,7 +6645,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6959 6645
6960static void md_seq_stop(struct seq_file *seq, void *v) 6646static void md_seq_stop(struct seq_file *seq, void *v)
6961{ 6647{
6962 struct mddev *mddev = v; 6648 mddev_t *mddev = v;
6963 6649
6964 if (mddev && v != (void*)1 && v != (void*)2) 6650 if (mddev && v != (void*)1 && v != (void*)2)
6965 mddev_put(mddev); 6651 mddev_put(mddev);
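
md_seq_start/next/stop/show above follow the standard seq_file iterator contract: start returns the element for *pos, next advances the position, stop releases any state, show prints one record. A minimal sketch of that contract over fictitious data, using the same proc_create() + file_operations style that md_geninit() uses further down for /proc/mdstat:

/* Minimal seq_file iterator sketch; data and names are illustrative only. */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_vals[] = { 1, 2, 3 };

static void *demo_start(struct seq_file *seq, loff_t *pos)
{
	return *pos < ARRAY_SIZE(demo_vals) ? &demo_vals[*pos] : NULL;
}

static void *demo_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return *pos < ARRAY_SIZE(demo_vals) ? &demo_vals[*pos] : NULL;
}

static void demo_stop(struct seq_file *seq, void *v)
{
	/* nothing to release in this sketch */
}

static int demo_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "value: %d\n", *(int *)v);
	return 0;
}

static const struct seq_operations demo_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_ops);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
/* registered with proc_create("demo", S_IRUGO, NULL, &demo_fops); */
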
@@ -6967,12 +6653,13 @@ static void md_seq_stop(struct seq_file *seq, void *v)
6967 6653
6968static int md_seq_show(struct seq_file *seq, void *v) 6654static int md_seq_show(struct seq_file *seq, void *v)
6969{ 6655{
6970 struct mddev *mddev = v; 6656 mddev_t *mddev = v;
6971 sector_t sectors; 6657 sector_t sectors;
6972 struct md_rdev *rdev; 6658 mdk_rdev_t *rdev;
6659 struct bitmap *bitmap;
6973 6660
6974 if (v == (void*)1) { 6661 if (v == (void*)1) {
6975 struct md_personality *pers; 6662 struct mdk_personality *pers;
6976 seq_printf(seq, "Personalities : "); 6663 seq_printf(seq, "Personalities : ");
6977 spin_lock(&pers_lock); 6664 spin_lock(&pers_lock);
6978 list_for_each_entry(pers, &pers_list, list) 6665 list_for_each_entry(pers, &pers_list, list)
@@ -7003,7 +6690,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
7003 } 6690 }
7004 6691
7005 sectors = 0; 6692 sectors = 0;
7006 rdev_for_each(rdev, mddev) { 6693 list_for_each_entry(rdev, &mddev->disks, same_set) {
7007 char b[BDEVNAME_SIZE]; 6694 char b[BDEVNAME_SIZE];
7008 seq_printf(seq, " %s[%d]", 6695 seq_printf(seq, " %s[%d]",
7009 bdevname(rdev->bdev,b), rdev->desc_nr); 6696 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -7012,11 +6699,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
7012 if (test_bit(Faulty, &rdev->flags)) { 6699 if (test_bit(Faulty, &rdev->flags)) {
7013 seq_printf(seq, "(F)"); 6700 seq_printf(seq, "(F)");
7014 continue; 6701 continue;
7015 } 6702 } else if (rdev->raid_disk < 0)
7016 if (rdev->raid_disk < 0)
7017 seq_printf(seq, "(S)"); /* spare */ 6703 seq_printf(seq, "(S)"); /* spare */
7018 if (test_bit(Replacement, &rdev->flags))
7019 seq_printf(seq, "(R)");
7020 sectors += rdev->sectors; 6704 sectors += rdev->sectors;
7021 } 6705 }
7022 6706
@@ -7049,7 +6733,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
7049 if (mddev->curr_resync > 2) { 6733 if (mddev->curr_resync > 2) {
7050 status_resync(seq, mddev); 6734 status_resync(seq, mddev);
7051 seq_printf(seq, "\n "); 6735 seq_printf(seq, "\n ");
7052 } else if (mddev->curr_resync >= 1) 6736 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
7053 seq_printf(seq, "\tresync=DELAYED\n "); 6737 seq_printf(seq, "\tresync=DELAYED\n ");
7054 else if (mddev->recovery_cp < MaxSector) 6738 else if (mddev->recovery_cp < MaxSector)
7055 seq_printf(seq, "\tresync=PENDING\n "); 6739 seq_printf(seq, "\tresync=PENDING\n ");
@@ -7057,7 +6741,27 @@ static int md_seq_show(struct seq_file *seq, void *v)
7057 } else 6741 } else
7058 seq_printf(seq, "\n "); 6742 seq_printf(seq, "\n ");
7059 6743
7060 bitmap_status(seq, mddev->bitmap); 6744 if ((bitmap = mddev->bitmap)) {
6745 unsigned long chunk_kb;
6746 unsigned long flags;
6747 spin_lock_irqsave(&bitmap->lock, flags);
6748 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6749 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6750 "%lu%s chunk",
6751 bitmap->pages - bitmap->missing_pages,
6752 bitmap->pages,
6753 (bitmap->pages - bitmap->missing_pages)
6754 << (PAGE_SHIFT - 10),
6755 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6756 chunk_kb ? "KB" : "B");
6757 if (bitmap->file) {
6758 seq_printf(seq, ", file: ");
6759 seq_path(seq, &bitmap->file->f_path, " \t\n");
6760 }
6761
6762 seq_printf(seq, "\n");
6763 spin_unlock_irqrestore(&bitmap->lock, flags);
6764 }
7061 6765
7062 seq_printf(seq, "\n"); 6766 seq_printf(seq, "\n");
7063 } 6767 }
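
Rendered through the seq_printf() calls above, a /proc/mdstat entry with a file-backed bitmap looks roughly like the excerpt below. The device names and counts are invented for illustration, and the blocks/[UU] status line comes from the personality's status callback, which is outside this file:

    md0 : active raid1 sdb1[1] sda1[0]
          1048512 blocks [2/2] [UU]
          bitmap: 3/8 pages [12KB], 65536KB chunk, file: /var/md0-bitmap
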
@@ -7111,7 +6815,7 @@ static const struct file_operations md_seq_fops = {
7111 .poll = mdstat_poll, 6815 .poll = mdstat_poll,
7112}; 6816};
7113 6817
7114int register_md_personality(struct md_personality *p) 6818int register_md_personality(struct mdk_personality *p)
7115{ 6819{
7116 spin_lock(&pers_lock); 6820 spin_lock(&pers_lock);
7117 list_add_tail(&p->list, &pers_list); 6821 list_add_tail(&p->list, &pers_list);
@@ -7120,7 +6824,7 @@ int register_md_personality(struct md_personality *p)
7120 return 0; 6824 return 0;
7121} 6825}
7122 6826
7123int unregister_md_personality(struct md_personality *p) 6827int unregister_md_personality(struct mdk_personality *p)
7124{ 6828{
7125 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6829 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7126 spin_lock(&pers_lock); 6830 spin_lock(&pers_lock);
@@ -7129,9 +6833,9 @@ int unregister_md_personality(struct md_personality *p)
7129 return 0; 6833 return 0;
7130} 6834}
7131 6835
7132static int is_mddev_idle(struct mddev *mddev, int init) 6836static int is_mddev_idle(mddev_t *mddev, int init)
7133{ 6837{
7134 struct md_rdev * rdev; 6838 mdk_rdev_t * rdev;
7135 int idle; 6839 int idle;
7136 int curr_events; 6840 int curr_events;
7137 6841
@@ -7173,14 +6877,13 @@ static int is_mddev_idle(struct mddev *mddev, int init)
7173 return idle; 6877 return idle;
7174} 6878}
7175 6879
7176void md_done_sync(struct mddev *mddev, int blocks, int ok) 6880void md_done_sync(mddev_t *mddev, int blocks, int ok)
7177{ 6881{
7178 /* another "blocks" (512byte) blocks have been synced */ 6882 /* another "blocks" (512byte) blocks have been synced */
7179 atomic_sub(blocks, &mddev->recovery_active); 6883 atomic_sub(blocks, &mddev->recovery_active);
7180 wake_up(&mddev->recovery_wait); 6884 wake_up(&mddev->recovery_wait);
7181 if (!ok) { 6885 if (!ok) {
7182 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6886 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7183 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7184 md_wakeup_thread(mddev->thread); 6887 md_wakeup_thread(mddev->thread);
7185 // stop recovery, signal do_sync .... 6888 // stop recovery, signal do_sync ....
7186 } 6889 }
@@ -7192,7 +6895,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok)
7192 * in superblock) before writing, schedule a superblock update 6895 * in superblock) before writing, schedule a superblock update
7193 * and wait for it to complete. 6896 * and wait for it to complete.
7194 */ 6897 */
7195void md_write_start(struct mddev *mddev, struct bio *bi) 6898void md_write_start(mddev_t *mddev, struct bio *bi)
7196{ 6899{
7197 int did_change = 0; 6900 int did_change = 0;
7198 if (bio_data_dir(bi) != WRITE) 6901 if (bio_data_dir(bi) != WRITE)
@@ -7227,7 +6930,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7227 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6930 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7228} 6931}
7229 6932
7230void md_write_end(struct mddev *mddev) 6933void md_write_end(mddev_t *mddev)
7231{ 6934{
7232 if (atomic_dec_and_test(&mddev->writes_pending)) { 6935 if (atomic_dec_and_test(&mddev->writes_pending)) {
7233 if (mddev->safemode == 2) 6936 if (mddev->safemode == 2)
@@ -7246,7 +6949,7 @@ void md_write_end(struct mddev *mddev)
7246 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6949 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7247 * is dropped, so return -EAGAIN after notifying userspace. 6950 * is dropped, so return -EAGAIN after notifying userspace.
7248 */ 6951 */
7249int md_allow_write(struct mddev *mddev) 6952int md_allow_write(mddev_t *mddev)
7250{ 6953{
7251 if (!mddev->pers) 6954 if (!mddev->pers)
7252 return 0; 6955 return 0;
@@ -7278,24 +6981,20 @@ EXPORT_SYMBOL_GPL(md_allow_write);
7278 6981
7279#define SYNC_MARKS 10 6982#define SYNC_MARKS 10
7280#define SYNC_MARK_STEP (3*HZ) 6983#define SYNC_MARK_STEP (3*HZ)
7281#define UPDATE_FREQUENCY (5*60*HZ) 6984void md_do_sync(mddev_t *mddev)
7282void md_do_sync(struct md_thread *thread)
7283{ 6985{
7284 struct mddev *mddev = thread->mddev; 6986 mddev_t *mddev2;
7285 struct mddev *mddev2;
7286 unsigned int currspeed = 0, 6987 unsigned int currspeed = 0,
7287 window; 6988 window;
7288 sector_t max_sectors,j, io_sectors; 6989 sector_t max_sectors,j, io_sectors;
7289 unsigned long mark[SYNC_MARKS]; 6990 unsigned long mark[SYNC_MARKS];
7290 unsigned long update_time;
7291 sector_t mark_cnt[SYNC_MARKS]; 6991 sector_t mark_cnt[SYNC_MARKS];
7292 int last_mark,m; 6992 int last_mark,m;
7293 struct list_head *tmp; 6993 struct list_head *tmp;
7294 sector_t last_check; 6994 sector_t last_check;
7295 int skipped = 0; 6995 int skipped = 0;
7296 struct md_rdev *rdev; 6996 mdk_rdev_t *rdev;
7297 char *desc; 6997 char *desc;
7298 struct blk_plug plug;
7299 6998
7300 /* just incase thread restarts... */ 6999 /* just incase thread restarts... */
7301 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7000 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7386,7 +7085,7 @@ void md_do_sync(struct md_thread *thread)
7386 * which defaults to physical size, but can be virtual size 7085 * which defaults to physical size, but can be virtual size
7387 */ 7086 */
7388 max_sectors = mddev->resync_max_sectors; 7087 max_sectors = mddev->resync_max_sectors;
7389 atomic64_set(&mddev->resync_mismatches, 0); 7088 mddev->resync_mismatches = 0;
7390 /* we don't use the checkpoint if there's a bitmap */ 7089 /* we don't use the checkpoint if there's a bitmap */
7391 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7090 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7392 j = mddev->resync_min; 7091 j = mddev->resync_min;
@@ -7394,13 +7093,13 @@ void md_do_sync(struct md_thread *thread)
7394 j = mddev->recovery_cp; 7093 j = mddev->recovery_cp;
7395 7094
7396 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7095 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7397 max_sectors = mddev->resync_max_sectors; 7096 max_sectors = mddev->dev_sectors;
7398 else { 7097 else {
7399 /* recovery follows the physical size of devices */ 7098 /* recovery follows the physical size of devices */
7400 max_sectors = mddev->dev_sectors; 7099 max_sectors = mddev->dev_sectors;
7401 j = MaxSector; 7100 j = MaxSector;
7402 rcu_read_lock(); 7101 rcu_read_lock();
7403 rdev_for_each_rcu(rdev, mddev) 7102 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7404 if (rdev->raid_disk >= 0 && 7103 if (rdev->raid_disk >= 0 &&
7405 !test_bit(Faulty, &rdev->flags) && 7104 !test_bit(Faulty, &rdev->flags) &&
7406 !test_bit(In_sync, &rdev->flags) && 7105 !test_bit(In_sync, &rdev->flags) &&
@@ -7442,14 +7141,9 @@ void md_do_sync(struct md_thread *thread)
7442 "md: resuming %s of %s from checkpoint.\n", 7141 "md: resuming %s of %s from checkpoint.\n",
7443 desc, mdname(mddev)); 7142 desc, mdname(mddev));
7444 mddev->curr_resync = j; 7143 mddev->curr_resync = j;
7445 } else 7144 }
7446 mddev->curr_resync = 3; /* no longer delayed */
7447 mddev->curr_resync_completed = j; 7145 mddev->curr_resync_completed = j;
7448 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7449 md_new_event(mddev);
7450 update_time = jiffies;
7451 7146
7452 blk_start_plug(&plug);
7453 while (j < max_sectors) { 7147 while (j < max_sectors) {
7454 sector_t sectors; 7148 sector_t sectors;
7455 7149
@@ -7459,7 +7153,6 @@ void md_do_sync(struct md_thread *thread)
7459 ((mddev->curr_resync > mddev->curr_resync_completed && 7153 ((mddev->curr_resync > mddev->curr_resync_completed &&
7460 (mddev->curr_resync - mddev->curr_resync_completed) 7154 (mddev->curr_resync - mddev->curr_resync_completed)
7461 > (max_sectors >> 4)) || 7155 > (max_sectors >> 4)) ||
7462 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7463 (j - mddev->curr_resync_completed)*2 7156 (j - mddev->curr_resync_completed)*2
7464 >= mddev->resync_max - mddev->curr_resync_completed 7157 >= mddev->resync_max - mddev->curr_resync_completed
7465 )) { 7158 )) {
@@ -7467,10 +7160,6 @@ void md_do_sync(struct md_thread *thread)
7467 wait_event(mddev->recovery_wait, 7160 wait_event(mddev->recovery_wait,
7468 atomic_read(&mddev->recovery_active) == 0); 7161 atomic_read(&mddev->recovery_active) == 0);
7469 mddev->curr_resync_completed = j; 7162 mddev->curr_resync_completed = j;
7470 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7471 j > mddev->recovery_cp)
7472 mddev->recovery_cp = j;
7473 update_time = jiffies;
7474 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7163 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7475 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7164 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7476 } 7165 }
@@ -7505,8 +7194,7 @@ void md_do_sync(struct md_thread *thread)
7505 break; 7194 break;
7506 7195
7507 j += sectors; 7196 j += sectors;
7508 if (j > 2) 7197 if (j>1) mddev->curr_resync = j;
7509 mddev->curr_resync = j;
7510 mddev->curr_mark_cnt = io_sectors; 7198 mddev->curr_mark_cnt = io_sectors;
7511 if (last_check == 0) 7199 if (last_check == 0)
7512 /* this is the earliest that rebuild will be 7200 /* this is the earliest that rebuild will be
@@ -7561,7 +7249,6 @@ void md_do_sync(struct md_thread *thread)
7561 * this also signals 'finished resyncing' to md_stop 7249 * this also signals 'finished resyncing' to md_stop
7562 */ 7250 */
7563 out: 7251 out:
7564 blk_finish_plug(&plug);
7565 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7252 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7566 7253
7567 /* tell personality that we are finished */ 7254 /* tell personality that we are finished */
@@ -7575,13 +7262,7 @@ void md_do_sync(struct md_thread *thread)
7575 printk(KERN_INFO 7262 printk(KERN_INFO
7576 "md: checkpointing %s of %s.\n", 7263 "md: checkpointing %s of %s.\n",
7577 desc, mdname(mddev)); 7264 desc, mdname(mddev));
7578 if (test_bit(MD_RECOVERY_ERROR, 7265 mddev->recovery_cp = mddev->curr_resync;
7579 &mddev->recovery))
7580 mddev->recovery_cp =
7581 mddev->curr_resync_completed;
7582 else
7583 mddev->recovery_cp =
7584 mddev->curr_resync;
7585 } 7266 }
7586 } else 7267 } else
7587 mddev->recovery_cp = MaxSector; 7268 mddev->recovery_cp = MaxSector;
@@ -7589,7 +7270,7 @@ void md_do_sync(struct md_thread *thread)
7589 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7270 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7590 mddev->curr_resync = MaxSector; 7271 mddev->curr_resync = MaxSector;
7591 rcu_read_lock(); 7272 rcu_read_lock();
7592 rdev_for_each_rcu(rdev, mddev) 7273 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7593 if (rdev->raid_disk >= 0 && 7274 if (rdev->raid_disk >= 0 &&
7594 mddev->delta_disks >= 0 && 7275 mddev->delta_disks >= 0 &&
7595 !test_bit(Faulty, &rdev->flags) && 7276 !test_bit(Faulty, &rdev->flags) &&
@@ -7599,9 +7280,9 @@ void md_do_sync(struct md_thread *thread)
7599 rcu_read_unlock(); 7280 rcu_read_unlock();
7600 } 7281 }
7601 } 7282 }
7602 skip:
7603 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7283 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7604 7284
7285 skip:
7605 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7286 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7606 /* We completed so min/max setting can be forgotten if used. */ 7287 /* We completed so min/max setting can be forgotten if used. */
7607 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7288 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7627,56 +7308,53 @@ void md_do_sync(struct md_thread *thread)
7627} 7308}
7628EXPORT_SYMBOL_GPL(md_do_sync); 7309EXPORT_SYMBOL_GPL(md_do_sync);
7629 7310
7630static int remove_and_add_spares(struct mddev *mddev) 7311static int remove_and_add_spares(mddev_t *mddev)
7631{ 7312{
7632 struct md_rdev *rdev; 7313 mdk_rdev_t *rdev;
7633 int spares = 0; 7314 int spares = 0;
7634 int removed = 0;
7635 7315
7636 rdev_for_each(rdev, mddev) 7316 mddev->curr_resync_completed = 0;
7317
7318 list_for_each_entry(rdev, &mddev->disks, same_set)
7637 if (rdev->raid_disk >= 0 && 7319 if (rdev->raid_disk >= 0 &&
7638 !test_bit(Blocked, &rdev->flags) && 7320 !test_bit(Blocked, &rdev->flags) &&
7639 (test_bit(Faulty, &rdev->flags) || 7321 (test_bit(Faulty, &rdev->flags) ||
7640 ! test_bit(In_sync, &rdev->flags)) && 7322 ! test_bit(In_sync, &rdev->flags)) &&
7641 atomic_read(&rdev->nr_pending)==0) { 7323 atomic_read(&rdev->nr_pending)==0) {
7642 if (mddev->pers->hot_remove_disk( 7324 if (mddev->pers->hot_remove_disk(
7643 mddev, rdev) == 0) { 7325 mddev, rdev->raid_disk)==0) {
7644 sysfs_unlink_rdev(mddev, rdev); 7326 sysfs_unlink_rdev(mddev, rdev);
7645 rdev->raid_disk = -1; 7327 rdev->raid_disk = -1;
7646 removed++;
7647 } 7328 }
7648 } 7329 }
7649 if (removed)
7650 sysfs_notify(&mddev->kobj, NULL,
7651 "degraded");
7652
7653 7330
7654 rdev_for_each(rdev, mddev) { 7331 if (mddev->degraded) {
7655 if (rdev->raid_disk >= 0 && 7332 list_for_each_entry(rdev, &mddev->disks, same_set) {
7656 !test_bit(In_sync, &rdev->flags) && 7333 if (rdev->raid_disk >= 0 &&
7657 !test_bit(Faulty, &rdev->flags)) 7334 !test_bit(In_sync, &rdev->flags) &&
7658 spares++; 7335 !test_bit(Faulty, &rdev->flags))
7659 if (rdev->raid_disk < 0
7660 && !test_bit(Faulty, &rdev->flags)) {
7661 rdev->recovery_offset = 0;
7662 if (mddev->pers->
7663 hot_add_disk(mddev, rdev) == 0) {
7664 if (sysfs_link_rdev(mddev, rdev))
7665 /* failure here is OK */;
7666 spares++; 7336 spares++;
7667 md_new_event(mddev); 7337 if (rdev->raid_disk < 0
7668 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7338 && !test_bit(Faulty, &rdev->flags)) {
7339 rdev->recovery_offset = 0;
7340 if (mddev->pers->
7341 hot_add_disk(mddev, rdev) == 0) {
7342 if (sysfs_link_rdev(mddev, rdev))
7343 /* failure here is OK */;
7344 spares++;
7345 md_new_event(mddev);
7346 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7347 } else
7348 break;
7669 } 7349 }
7670 } 7350 }
7671 } 7351 }
7672 if (removed)
7673 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7674 return spares; 7352 return spares;
7675} 7353}
7676 7354
7677static void reap_sync_thread(struct mddev *mddev) 7355static void reap_sync_thread(mddev_t *mddev)
7678{ 7356{
7679 struct md_rdev *rdev; 7357 mdk_rdev_t *rdev;
7680 7358
7681 /* resync has finished, collect result */ 7359 /* resync has finished, collect result */
7682 md_unregister_thread(&mddev->sync_thread); 7360 md_unregister_thread(&mddev->sync_thread);
@@ -7684,28 +7362,22 @@ static void reap_sync_thread(struct mddev *mddev)
7684 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7362 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7685 /* success...*/ 7363 /* success...*/
7686 /* activate any spares */ 7364 /* activate any spares */
7687 if (mddev->pers->spare_active(mddev)) { 7365 if (mddev->pers->spare_active(mddev))
7688 sysfs_notify(&mddev->kobj, NULL, 7366 sysfs_notify(&mddev->kobj, NULL,
7689 "degraded"); 7367 "degraded");
7690 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7691 }
7692 } 7368 }
7693 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7369 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7694 mddev->pers->finish_reshape) 7370 mddev->pers->finish_reshape)
7695 mddev->pers->finish_reshape(mddev); 7371 mddev->pers->finish_reshape(mddev);
7372 md_update_sb(mddev, 1);
7696 7373
7697 /* If array is no-longer degraded, then any saved_raid_disk 7374 /* if array is no-longer degraded, then any saved_raid_disk
7698 * information must be scrapped. Also if any device is now 7375 * information must be scrapped
7699 * In_sync we must scrape the saved_raid_disk for that device
7700 * do the superblock for an incrementally recovered device
7701 * written out.
7702 */ 7376 */
7703 rdev_for_each(rdev, mddev) 7377 if (!mddev->degraded)
7704 if (!mddev->degraded || 7378 list_for_each_entry(rdev, &mddev->disks, same_set)
7705 test_bit(In_sync, &rdev->flags))
7706 rdev->saved_raid_disk = -1; 7379 rdev->saved_raid_disk = -1;
7707 7380
7708 md_update_sb(mddev, 1);
7709 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7381 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7710 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7382 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7711 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7383 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -7741,7 +7413,7 @@ static void reap_sync_thread(struct mddev *mddev)
7741 * 5/ If array is degraded, try to add spares devices 7413 * 5/ If array is degraded, try to add spares devices
7742 * 6/ If array has spares or is not in-sync, start a resync thread. 7414 * 6/ If array has spares or is not in-sync, start a resync thread.
7743 */ 7415 */
7744void md_check_recovery(struct mddev *mddev) 7416void md_check_recovery(mddev_t *mddev)
7745{ 7417{
7746 if (mddev->suspended) 7418 if (mddev->suspended)
7747 return; 7419 return;
@@ -7777,14 +7449,14 @@ void md_check_recovery(struct mddev *mddev)
7777 /* Only thing we do on a ro array is remove 7449 /* Only thing we do on a ro array is remove
7778 * failed devices. 7450 * failed devices.
7779 */ 7451 */
7780 struct md_rdev *rdev; 7452 mdk_rdev_t *rdev;
7781 rdev_for_each(rdev, mddev) 7453 list_for_each_entry(rdev, &mddev->disks, same_set)
7782 if (rdev->raid_disk >= 0 && 7454 if (rdev->raid_disk >= 0 &&
7783 !test_bit(Blocked, &rdev->flags) && 7455 !test_bit(Blocked, &rdev->flags) &&
7784 test_bit(Faulty, &rdev->flags) && 7456 test_bit(Faulty, &rdev->flags) &&
7785 atomic_read(&rdev->nr_pending)==0) { 7457 atomic_read(&rdev->nr_pending)==0) {
7786 if (mddev->pers->hot_remove_disk( 7458 if (mddev->pers->hot_remove_disk(
7787 mddev, rdev) == 0) { 7459 mddev, rdev->raid_disk)==0) {
7788 sysfs_unlink_rdev(mddev, rdev); 7460 sysfs_unlink_rdev(mddev, rdev);
7789 rdev->raid_disk = -1; 7461 rdev->raid_disk = -1;
7790 } 7462 }
@@ -7827,21 +7499,20 @@ void md_check_recovery(struct mddev *mddev)
7827 /* Set RUNNING before clearing NEEDED to avoid 7499 /* Set RUNNING before clearing NEEDED to avoid
7828 * any transients in the value of "sync_action". 7500 * any transients in the value of "sync_action".
7829 */ 7501 */
7830 mddev->curr_resync_completed = 0;
7831 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7502 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7503 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7832 /* Clear some bits that don't mean anything, but 7504 /* Clear some bits that don't mean anything, but
7833 * might be left set 7505 * might be left set
7834 */ 7506 */
7835 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 7507 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7836 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7508 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7837 7509
7838 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7510 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7839 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7840 goto unlock; 7511 goto unlock;
7841 /* no recovery is running. 7512 /* no recovery is running.
7842 * remove any failed drives, then 7513 * remove any failed drives, then
7843 * add spares if possible. 7514 * add spares if possible.
7844 * Spares are also removed and re-added, to allow 7515 * Spare are also removed and re-added, to allow
7845 * the personality to fail the re-add. 7516 * the personality to fail the re-add.
7846 */ 7517 */
7847 7518
@@ -7865,7 +7536,7 @@ void md_check_recovery(struct mddev *mddev)
7865 goto unlock; 7536 goto unlock;
7866 7537
7867 if (mddev->pers->sync_request) { 7538 if (mddev->pers->sync_request) {
7868 if (spares) { 7539 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
7869 /* We are adding a device or devices to an array 7540 /* We are adding a device or devices to an array
7870 * which has the bitmap stored on all devices. 7541 * which has the bitmap stored on all devices.
7871 * So make sure all bitmap pages get written 7542 * So make sure all bitmap pages get written
@@ -7902,7 +7573,7 @@ void md_check_recovery(struct mddev *mddev)
7902 } 7573 }
7903} 7574}
7904 7575
7905void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 7576void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7906{ 7577{
7907 sysfs_notify_dirent_safe(rdev->sysfs_state); 7578 sysfs_notify_dirent_safe(rdev->sysfs_state);
7908 wait_event_timeout(rdev->blocked_wait, 7579 wait_event_timeout(rdev->blocked_wait,
@@ -7913,20 +7584,6 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7913} 7584}
7914EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7585EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7915 7586
7916void md_finish_reshape(struct mddev *mddev)
7917{
7918 /* called be personality module when reshape completes. */
7919 struct md_rdev *rdev;
7920
7921 rdev_for_each(rdev, mddev) {
7922 if (rdev->data_offset > rdev->new_data_offset)
7923 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7924 else
7925 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7926 rdev->data_offset = rdev->new_data_offset;
7927 }
7928}
7929EXPORT_SYMBOL(md_finish_reshape);
7930 7587
7931/* Bad block management. 7588/* Bad block management.
7932 * We can record which blocks on each device are 'bad' and so just 7589 * We can record which blocks on each device are 'bad' and so just
@@ -7958,9 +7615,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7958 sector_t *first_bad, int *bad_sectors) 7615 sector_t *first_bad, int *bad_sectors)
7959{ 7616{
7960 int hi; 7617 int hi;
7961 int lo; 7618 int lo = 0;
7962 u64 *p = bb->page; 7619 u64 *p = bb->page;
7963 int rv; 7620 int rv = 0;
7964 sector_t target = s + sectors; 7621 sector_t target = s + sectors;
7965 unsigned seq; 7622 unsigned seq;
7966 7623
@@ -7975,8 +7632,7 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7975 7632
7976retry: 7633retry:
7977 seq = read_seqbegin(&bb->lock); 7634 seq = read_seqbegin(&bb->lock);
7978 lo = 0; 7635
7979 rv = 0;
7980 hi = bb->count; 7636 hi = bb->count;
7981 7637
7982 /* Binary search between lo and hi for 'target' 7638 /* Binary search between lo and hi for 'target'
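
As the comments above describe, the bad-block table is a single sorted page of 64-bit entries, each packing a start sector, a length and an acknowledged bit, and md_is_badblock() binary-searches it under a seqlock. The user-space model below mirrors that scheme; the field layout (ack in bit 63, length-minus-one in the low 9 bits, start sector in between) is my reading of the BB_* macros and should be treated as an assumption, and the seqlock retry loop is omitted:

#include <stdio.h>
#include <stdint.h>

#define BB_LEN_BITS   9
#define BB_ACK_BIT    63

static uint64_t bb_make(uint64_t start, unsigned len, int ack)
{
	return (start << BB_LEN_BITS) | (len - 1) |
	       ((uint64_t)(ack != 0) << BB_ACK_BIT);
}

/* Drop the ack bit, then shift away the length field. */
static uint64_t bb_start(uint64_t e) { return (e << 1) >> (BB_LEN_BITS + 1); }
static unsigned bb_len(uint64_t e)   { return (e & ((1u << BB_LEN_BITS) - 1)) + 1; }

/* Return 1 if [s, s+sectors) overlaps any entry; table is sorted by start. */
static int is_badblock(const uint64_t *tab, int count, uint64_t s, int sectors)
{
	uint64_t target = s + sectors;
	int lo = 0, hi = count;

	if (count == 0)
		return 0;
	/* Binary search for the last entry that starts before 'target'. */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		if (bb_start(tab[mid]) < target)
			lo = mid;
		else
			hi = mid;
	}
	/* Walk back over entries that still extend past 's'. */
	while (lo >= 0 && bb_start(tab[lo]) + bb_len(tab[lo]) > s) {
		if (bb_start(tab[lo]) < target)
			return 1;
		lo--;
	}
	return 0;
}

int main(void)
{
	uint64_t tab[] = { bb_make(100, 8, 1), bb_make(500, 16, 0) };

	printf("%d %d\n", is_badblock(tab, 2, 104, 4),   /* 1: hits 100..107 */
			  is_badblock(tab, 2, 200, 4));  /* 0: clean range   */
	return 0;
}
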
@@ -8175,19 +7831,13 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8175 return rv; 7831 return rv;
8176} 7832}
8177 7833
8178int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 7834int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
8179 int is_new) 7835 int acknowledged)
8180{ 7836{
8181 int rv; 7837 int rv = md_set_badblocks(&rdev->badblocks,
8182 if (is_new) 7838 s + rdev->data_offset, sectors, acknowledged);
8183 s += rdev->new_data_offset;
8184 else
8185 s += rdev->data_offset;
8186 rv = md_set_badblocks(&rdev->badblocks,
8187 s, sectors, 0);
8188 if (rv) { 7839 if (rv) {
8189 /* Make sure they get written out promptly */ 7840 /* Make sure they get written out promptly */
8190 sysfs_notify_dirent_safe(rdev->sysfs_state);
8191 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 7841 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8192 md_wakeup_thread(rdev->mddev->thread); 7842 md_wakeup_thread(rdev->mddev->thread);
8193 } 7843 }
@@ -8290,15 +7940,11 @@ out:
8290 return rv; 7940 return rv;
8291} 7941}
8292 7942
8293int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 7943int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
8294 int is_new)
8295{ 7944{
8296 if (is_new)
8297 s += rdev->new_data_offset;
8298 else
8299 s += rdev->data_offset;
8300 return md_clear_badblocks(&rdev->badblocks, 7945 return md_clear_badblocks(&rdev->badblocks,
8301 s, sectors); 7946 s + rdev->data_offset,
7947 sectors);
8302} 7948}
8303EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 7949EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8304 7950
@@ -8314,7 +7960,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
8314 return; 7960 return;
8315 write_seqlock_irq(&bb->lock); 7961 write_seqlock_irq(&bb->lock);
8316 7962
8317 if (bb->changed == 0 && bb->unacked_exist) { 7963 if (bb->changed == 0) {
8318 u64 *p = bb->page; 7964 u64 *p = bb->page;
8319 int i; 7965 int i;
8320 for (i = 0; i < bb->count ; i++) { 7966 for (i = 0; i < bb->count ; i++) {
@@ -8428,27 +8074,29 @@ static int md_notify_reboot(struct notifier_block *this,
8428 unsigned long code, void *x) 8074 unsigned long code, void *x)
8429{ 8075{
8430 struct list_head *tmp; 8076 struct list_head *tmp;
8431 struct mddev *mddev; 8077 mddev_t *mddev;
8432 int need_delay = 0;
8433 8078
8434 for_each_mddev(mddev, tmp) { 8079 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
8435 if (mddev_trylock(mddev)) { 8080
8436 if (mddev->pers) 8081 printk(KERN_INFO "md: stopping all md devices.\n");
8437 __md_stop_writes(mddev);
8438 mddev->safemode = 2;
8439 mddev_unlock(mddev);
8440 }
8441 need_delay = 1;
8442 }
8443 /*
8444 * certain more exotic SCSI devices are known to be
8445 * volatile wrt too early system reboots. While the
8446 * right place to handle this issue is the given
8447 * driver, we do want to have a safe RAID driver ...
8448 */
8449 if (need_delay)
8450 mdelay(1000*1);
8451 8082
8083 for_each_mddev(mddev, tmp)
8084 if (mddev_trylock(mddev)) {
 8085			/* Force a switch to readonly even if the
 8086			 * array appears to still be in use. Hence
8087 * the '100'.
8088 */
8089 md_set_readonly(mddev, 100);
8090 mddev_unlock(mddev);
8091 }
8092 /*
8093 * certain more exotic SCSI devices are known to be
8094 * volatile wrt too early system reboots. While the
8095 * right place to handle this issue is the given
8096 * driver, we do want to have a safe RAID driver ...
8097 */
8098 mdelay(1000*1);
8099 }
8452 return NOTIFY_DONE; 8100 return NOTIFY_DONE;
8453} 8101}
8454 8102
@@ -8460,7 +8108,7 @@ static struct notifier_block md_notifier = {
8460 8108
8461static void md_geninit(void) 8109static void md_geninit(void)
8462{ 8110{
8463 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8111 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8464 8112
8465 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8113 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8466} 8114}
@@ -8535,7 +8183,7 @@ void md_autodetect_dev(dev_t dev)
8535 8183
8536static void autostart_arrays(int part) 8184static void autostart_arrays(int part)
8537{ 8185{
8538 struct md_rdev *rdev; 8186 mdk_rdev_t *rdev;
8539 struct detected_devices_node *node_detected_dev; 8187 struct detected_devices_node *node_detected_dev;
8540 dev_t dev; 8188 dev_t dev;
8541 int i_scanned, i_passed; 8189 int i_scanned, i_passed;
@@ -8575,7 +8223,7 @@ static void autostart_arrays(int part)
8575 8223
8576static __exit void md_exit(void) 8224static __exit void md_exit(void)
8577{ 8225{
8578 struct mddev *mddev; 8226 mddev_t *mddev;
8579 struct list_head *tmp; 8227 struct list_head *tmp;
8580 8228
8581 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 8229 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074e..0a309dc29b4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1,5 +1,5 @@
1/* 1/*
2 md.h : kernel internal structure of the Linux MD driver 2 md_k.h : kernel internal structure of the Linux MD driver
3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman 3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
@@ -26,6 +26,9 @@
26 26
27#define MaxSector (~(sector_t)0) 27#define MaxSector (~(sector_t)0)
28 28
29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t;
31
29/* Bad block numbers are stored sorted in a single page. 32/* Bad block numbers are stored sorted in a single page.
30 * 64bits is used for each block or extent. 33 * 64bits is used for each block or extent.
31 * 54 bits are sector number, 9 bits are extent size, 34 * 54 bits are sector number, 9 bits are extent size,
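
A hypothetical round trip of that packing in plain C, using invented SK_* names instead of the kernel's BB_* masks defined further down, and assuming the remaining top bit carries the 'acknowledged' state:

#include <assert.h>
#include <stdint.h>

#define SK_LEN_MASK	0x00000000000001FFULL	/* low 9 bits: length - 1 */
#define SK_OFFSET_MASK	0x7FFFFFFFFFFFFE00ULL	/* next 54 bits: start sector */
#define SK_ACK_MASK	0x8000000000000000ULL	/* top bit: acknowledged */

static uint64_t sk_make(uint64_t sector, unsigned len, int ack)
{
	return (sector << 9) | (uint64_t)(len - 1) | (ack ? SK_ACK_MASK : 0);
}

int main(void)
{
	uint64_t e = sk_make(123456, 8, 1);	/* 8 bad sectors at 123456, acked */

	assert(((e & SK_OFFSET_MASK) >> 9) == 123456);	/* start sector */
	assert((e & SK_LEN_MASK) + 1 == 8);		/* extent length */
	assert(e & SK_ACK_MASK);			/* acknowledged */
	return 0;
}
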
@@ -36,11 +39,12 @@
36/* 39/*
37 * MD's 'extended' device 40 * MD's 'extended' device
38 */ 41 */
39struct md_rdev { 42struct mdk_rdev_s
43{
40 struct list_head same_set; /* RAID devices within the same set */ 44 struct list_head same_set; /* RAID devices within the same set */
41 45
42 sector_t sectors; /* Device size (in 512bytes sectors) */ 46 sector_t sectors; /* Device size (in 512bytes sectors) */
43 struct mddev *mddev; /* RAID array if running */ 47 mddev_t *mddev; /* RAID array if running */
44 int last_events; /* IO event timestamp */ 48 int last_events; /* IO event timestamp */
45 49
46 /* 50 /*
@@ -55,7 +59,6 @@ struct md_rdev {
55 int sb_loaded; 59 int sb_loaded;
56 __u64 sb_events; 60 __u64 sb_events;
57 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
58 sector_t new_data_offset;/* only relevant while reshaping */
59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */ 62 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
60 int sb_size; /* bytes in the superblock */ 63 int sb_size; /* bytes in the superblock */
61 int preferred_minor; /* autorun support */ 64 int preferred_minor; /* autorun support */
@@ -73,7 +76,34 @@ struct md_rdev {
73 * This reduces the burden of testing multiple flags in many cases 76 * This reduces the burden of testing multiple flags in many cases
74 */ 77 */
75 78
76 unsigned long flags; /* bit set of 'enum flag_bits' bits. */ 79 unsigned long flags;
80#define Faulty 1 /* device is known to have a fault */
81#define In_sync 2 /* device is in_sync with rest of array */
82#define WriteMostly 4 /* Avoid reading if at all possible */
83#define AutoDetected 7 /* added by auto-detect */
84#define Blocked 8 /* An error occurred but has not yet
85 * been acknowledged by the metadata
86 * handler, so don't allow writes
87 * until it is cleared */
88#define WriteErrorSeen 9 /* A write error has been seen on this
89 * device
90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
77 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
78 108
79 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -126,48 +156,6 @@ struct md_rdev {
126 sector_t size; /* in sectors */ 156 sector_t size; /* in sectors */
127 } badblocks; 157 } badblocks;
128}; 158};
129enum flag_bits {
130 Faulty, /* device is known to have a fault */
131 In_sync, /* device is in_sync with rest of array */
132 Unmerged, /* device is being added to array and should
133				 * be considered for bvec_merge_fn but not
134 * yet for actual IO
135 */
136 WriteMostly, /* Avoid reading if at all possible */
137 AutoDetected, /* added by auto-detect */
138 Blocked, /* An error occurred but has not yet
139 * been acknowledged by the metadata
140 * handler, so don't allow writes
141 * until it is cleared */
142 WriteErrorSeen, /* A write error has been seen on this
143 * device
144 */
145 FaultRecorded, /* Intermediate state for clearing
146 * Blocked. The Fault is/will-be
147 * recorded in the metadata, but that
148 * metadata hasn't been stored safely
149 * on disk yet.
150 */
151 BlockedBadBlocks, /* A writer is blocked because they
152 * found an unacknowledged bad-block.
153 * This can safely be cleared at any
154 * time, and the writer will re-check.
155 * It may be set at any time, and at
156 * worst the writer will timeout and
157 * re-check. So setting it as
158 * accurately as possible is good, but
159 * not absolutely critical.
160 */
161 WantReplacement, /* This device is a candidate to be
162 * hot-replaced, either because it has
163 * reported some faults, or because
164 * of explicit request.
165 */
166 Replacement, /* This device is a replacement for
167 * a want_replacement device with same
168 * raid_disk number.
169 */
170};
171 159
172#define BB_LEN_MASK (0x00000000000001FFULL) 160#define BB_LEN_MASK (0x00000000000001FFULL)
173#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) 161#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -180,7 +168,7 @@ enum flag_bits {
180 168
181extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 169extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
182 sector_t *first_bad, int *bad_sectors); 170 sector_t *first_bad, int *bad_sectors);
183static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, 171static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
184 sector_t *first_bad, int *bad_sectors) 172 sector_t *first_bad, int *bad_sectors)
185{ 173{
186 if (unlikely(rdev->badblocks.count)) { 174 if (unlikely(rdev->badblocks.count)) {
@@ -193,15 +181,15 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
193 } 181 }
194 return 0; 182 return 0;
195} 183}
196extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 184extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
197 int is_new); 185 int acknowledged);
198extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 186extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
199 int is_new);
200extern void md_ack_all_badblocks(struct badblocks *bb); 187extern void md_ack_all_badblocks(struct badblocks *bb);
201 188
202struct mddev { 189struct mddev_s
190{
203 void *private; 191 void *private;
204 struct md_personality *pers; 192 struct mdk_personality *pers;
205 dev_t unit; 193 dev_t unit;
206 int md_minor; 194 int md_minor;
207 struct list_head disks; 195 struct list_head disks;
@@ -264,10 +252,12 @@ struct mddev {
264 sector_t reshape_position; 252 sector_t reshape_position;
265 int delta_disks, new_level, new_layout; 253 int delta_disks, new_level, new_layout;
266 int new_chunk_sectors; 254 int new_chunk_sectors;
267 int reshape_backwards;
268 255
269 struct md_thread *thread; /* management thread */ 256 atomic_t plug_cnt; /* If device is expecting
270 struct md_thread *sync_thread; /* doing resync or reconstruct */ 257 * more bios soon.
258 */
259 struct mdk_thread_s *thread; /* management thread */
260 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
271 sector_t curr_resync; /* last block scheduled */ 261 sector_t curr_resync; /* last block scheduled */
272 /* As resync requests can complete out of order, we cannot easily track 262 /* As resync requests can complete out of order, we cannot easily track
273 * how much resync has been completed. So we occasionally pause until 263 * how much resync has been completed. So we occasionally pause until
@@ -282,7 +272,7 @@ struct mddev {
282 272
283 sector_t resync_max_sectors; /* may be set by personality */ 273 sector_t resync_max_sectors; /* may be set by personality */
284 274
285 atomic64_t resync_mismatches; /* count of sectors where 275 sector_t resync_mismatches; /* count of sectors where
286 * parity/replica mismatch found 276 * parity/replica mismatch found
287 */ 277 */
288 278
@@ -307,7 +297,6 @@ struct mddev {
307 * REQUEST: user-space has requested a sync (used with SYNC) 297 * REQUEST: user-space has requested a sync (used with SYNC)
308 * CHECK: user-space request for check-only, no repair 298 * CHECK: user-space request for check-only, no repair
309 * RESHAPE: A reshape is happening 299 * RESHAPE: A reshape is happening
310 * ERROR: sync-action interrupted because io-error
311 * 300 *
312 * If neither SYNC or RESHAPE are set, then it is a recovery. 301 * If neither SYNC or RESHAPE are set, then it is a recovery.
313 */ 302 */
@@ -321,7 +310,6 @@ struct mddev {
321#define MD_RECOVERY_CHECK 7 310#define MD_RECOVERY_CHECK 7
322#define MD_RECOVERY_RESHAPE 8 311#define MD_RECOVERY_RESHAPE 8
323#define MD_RECOVERY_FROZEN 9 312#define MD_RECOVERY_FROZEN 9
324#define MD_RECOVERY_ERROR 10
325 313
326 unsigned long recovery; 314 unsigned long recovery;
327 /* If a RAID personality determines that recovery (of a particular 315 /* If a RAID personality determines that recovery (of a particular
@@ -351,10 +339,6 @@ struct mddev {
351 int degraded; /* whether md should consider 339 int degraded; /* whether md should consider
352 * adding a spare 340 * adding a spare
353 */ 341 */
354 int merge_check_needed; /* at least one
355 * member device
356 * has a
357 * merge_bvec_fn */
358 342
359 atomic_t recovery_active; /* blocks scheduled, but not written */ 343 atomic_t recovery_active; /* blocks scheduled, but not written */
360 wait_queue_head_t recovery_wait; 344 wait_queue_head_t recovery_wait;
@@ -392,13 +376,10 @@ struct mddev {
392 * For external metadata, offset 376 * For external metadata, offset
393 * from start of device. 377 * from start of device.
394 */ 378 */
395 unsigned long space; /* space available at this offset */
396 loff_t default_offset; /* this is the offset to use when 379 loff_t default_offset; /* this is the offset to use when
397 * hot-adding a bitmap. It should 380 * hot-adding a bitmap. It should
398 * eventually be settable by sysfs. 381 * eventually be settable by sysfs.
399 */ 382 */
400 unsigned long default_space; /* space available at
401 * default offset */
402 struct mutex mutex; 383 struct mutex mutex;
403 unsigned long chunksize; 384 unsigned long chunksize;
404 unsigned long daemon_sleep; /* how many jiffies between updates? */ 385 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -421,11 +402,11 @@ struct mddev {
421 atomic_t flush_pending; 402 atomic_t flush_pending;
422 struct work_struct flush_work; 403 struct work_struct flush_work;
423 struct work_struct event_work; /* used by dm to report failure event */ 404 struct work_struct event_work; /* used by dm to report failure event */
424 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 405 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
425}; 406};
426 407
427 408
428static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) 409static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
429{ 410{
430 int faulty = test_bit(Faulty, &rdev->flags); 411 int faulty = test_bit(Faulty, &rdev->flags);
431 if (atomic_dec_and_test(&rdev->nr_pending) && faulty) 412 if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
@@ -437,35 +418,35 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
437 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); 418 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
438} 419}
439 420
440struct md_personality 421struct mdk_personality
441{ 422{
442 char *name; 423 char *name;
443 int level; 424 int level;
444 struct list_head list; 425 struct list_head list;
445 struct module *owner; 426 struct module *owner;
446 void (*make_request)(struct mddev *mddev, struct bio *bio); 427 int (*make_request)(mddev_t *mddev, struct bio *bio);
447 int (*run)(struct mddev *mddev); 428 int (*run)(mddev_t *mddev);
448 int (*stop)(struct mddev *mddev); 429 int (*stop)(mddev_t *mddev);
449 void (*status)(struct seq_file *seq, struct mddev *mddev); 430 void (*status)(struct seq_file *seq, mddev_t *mddev);
450 /* error_handler must set ->faulty and clear ->in_sync 431 /* error_handler must set ->faulty and clear ->in_sync
451 * if appropriate, and should abort recovery if needed 432 * if appropriate, and should abort recovery if needed
452 */ 433 */
453 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); 434 void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
454 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); 435 int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
455 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); 436 int (*hot_remove_disk) (mddev_t *mddev, int number);
456 int (*spare_active) (struct mddev *mddev); 437 int (*spare_active) (mddev_t *mddev);
457 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); 438 sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
458 int (*resize) (struct mddev *mddev, sector_t sectors); 439 int (*resize) (mddev_t *mddev, sector_t sectors);
459 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); 440 sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
460 int (*check_reshape) (struct mddev *mddev); 441 int (*check_reshape) (mddev_t *mddev);
461 int (*start_reshape) (struct mddev *mddev); 442 int (*start_reshape) (mddev_t *mddev);
462 void (*finish_reshape) (struct mddev *mddev); 443 void (*finish_reshape) (mddev_t *mddev);
463 /* quiesce moves between quiescence states 444 /* quiesce moves between quiescence states
464 * 0 - fully active 445 * 0 - fully active
465 * 1 - no new requests allowed 446 * 1 - no new requests allowed
466 * others - reserved 447 * others - reserved
467 */ 448 */
468 void (*quiesce) (struct mddev *mddev, int state); 449 void (*quiesce) (mddev_t *mddev, int state);
469 /* takeover is used to transition an array from one 450 /* takeover is used to transition an array from one
470 * personality to another. The new personality must be able 451 * personality to another. The new personality must be able
471 * to handle the data in the current layout. 452 * to handle the data in the current layout.
@@ -475,14 +456,14 @@ struct md_personality
475 * This needs to be installed and then ->run used to activate the 456 * This needs to be installed and then ->run used to activate the
476 * array. 457 * array.
477 */ 458 */
478 void *(*takeover) (struct mddev *mddev); 459 void *(*takeover) (mddev_t *mddev);
479}; 460};
480 461
481 462
482struct md_sysfs_entry { 463struct md_sysfs_entry {
483 struct attribute attr; 464 struct attribute attr;
484 ssize_t (*show)(struct mddev *, char *); 465 ssize_t (*show)(mddev_t *, char *);
485 ssize_t (*store)(struct mddev *, const char *, size_t); 466 ssize_t (*store)(mddev_t *, const char *, size_t);
486}; 467};
487extern struct attribute_group md_bitmap_group; 468extern struct attribute_group md_bitmap_group;
488 469
@@ -498,28 +479,23 @@ static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
498 sysfs_notify_dirent(sd); 479 sysfs_notify_dirent(sd);
499} 480}
500 481
501static inline char * mdname (struct mddev * mddev) 482static inline char * mdname (mddev_t * mddev)
502{ 483{
503 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 484 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
504} 485}
505 486
506static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 487static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
507{ 488{
508 char nm[20]; 489 char nm[20];
509 if (!test_bit(Replacement, &rdev->flags)) { 490 sprintf(nm, "rd%d", rdev->raid_disk);
510 sprintf(nm, "rd%d", rdev->raid_disk); 491 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
512 } else
513 return 0;
514} 492}
515 493
516static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 494static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
517{ 495{
518 char nm[20]; 496 char nm[20];
519 if (!test_bit(Replacement, &rdev->flags)) { 497 sprintf(nm, "rd%d", rdev->raid_disk);
520 sprintf(nm, "rd%d", rdev->raid_disk); 498 sysfs_remove_link(&mddev->kobj, nm);
521 sysfs_remove_link(&mddev->kobj, nm);
522 }
523} 499}
524 500
525/* 501/*
@@ -532,84 +508,96 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
532/* 508/*
533 * iterates through the 'same array disks' ringlist 509 * iterates through the 'same array disks' ringlist
534 */ 510 */
535#define rdev_for_each(rdev, mddev) \ 511#define rdev_for_each(rdev, tmp, mddev) \
536 list_for_each_entry(rdev, &((mddev)->disks), same_set)
537
538#define rdev_for_each_safe(rdev, tmp, mddev) \
539 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 512 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
540 513
541#define rdev_for_each_rcu(rdev, mddev) \ 514#define rdev_for_each_rcu(rdev, mddev) \
542 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) 515 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
543 516
544struct md_thread { 517typedef struct mdk_thread_s {
545 void (*run) (struct md_thread *thread); 518 void (*run) (mddev_t *mddev);
546 struct mddev *mddev; 519 mddev_t *mddev;
547 wait_queue_head_t wqueue; 520 wait_queue_head_t wqueue;
548 unsigned long flags; 521 unsigned long flags;
549 struct task_struct *tsk; 522 struct task_struct *tsk;
550 unsigned long timeout; 523 unsigned long timeout;
551 void *private; 524} mdk_thread_t;
552};
553 525
554#define THREAD_WAKEUP 0 526#define THREAD_WAKEUP 0
555 527
528#define __wait_event_lock_irq(wq, condition, lock, cmd) \
529do { \
530 wait_queue_t __wait; \
531 init_waitqueue_entry(&__wait, current); \
532 \
533 add_wait_queue(&wq, &__wait); \
534 for (;;) { \
535 set_current_state(TASK_UNINTERRUPTIBLE); \
536 if (condition) \
537 break; \
538 spin_unlock_irq(&lock); \
539 cmd; \
540 schedule(); \
541 spin_lock_irq(&lock); \
542 } \
543 current->state = TASK_RUNNING; \
544 remove_wait_queue(&wq, &__wait); \
545} while (0)
546
547#define wait_event_lock_irq(wq, condition, lock, cmd) \
548do { \
549 if (condition) \
550 break; \
551 __wait_event_lock_irq(wq, condition, lock, cmd); \
552} while (0)
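
This md-private wait_event_lock_irq() expects the caller to already hold 'lock' via spin_lock_irq(); the macro drops the lock around each schedule(), runs 'cmd', and re-acquires it before re-testing the condition. A rough caller sketch, where 'conf' and its fields are invented names:

/* sketch only: conf, device_lock, wait_queue and pending are invented */
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(conf->wait_queue,
		    conf->pending == 0,		/* sleep until nothing is pending */
		    conf->device_lock,
		    md_wakeup_thread(conf->mddev->thread)); /* kick the daemon before sleeping */
/* the condition is now true and device_lock is held again */
spin_unlock_irq(&conf->device_lock);
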
553
556static inline void safe_put_page(struct page *p) 554static inline void safe_put_page(struct page *p)
557{ 555{
558 if (p) put_page(p); 556 if (p) put_page(p);
559} 557}
560 558
561extern int register_md_personality(struct md_personality *p); 559extern int register_md_personality(struct mdk_personality *p);
562extern int unregister_md_personality(struct md_personality *p); 560extern int unregister_md_personality(struct mdk_personality *p);
563extern struct md_thread *md_register_thread( 561extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
564 void (*run)(struct md_thread *thread), 562 mddev_t *mddev, const char *name);
565 struct mddev *mddev, 563extern void md_unregister_thread(mdk_thread_t **threadp);
566 const char *name); 564extern void md_wakeup_thread(mdk_thread_t *thread);
567extern void md_unregister_thread(struct md_thread **threadp); 565extern void md_check_recovery(mddev_t *mddev);
568extern void md_wakeup_thread(struct md_thread *thread); 566extern void md_write_start(mddev_t *mddev, struct bio *bi);
569extern void md_check_recovery(struct mddev *mddev); 567extern void md_write_end(mddev_t *mddev);
570extern void md_write_start(struct mddev *mddev, struct bio *bi); 568extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
571extern void md_write_end(struct mddev *mddev); 569extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
572extern void md_done_sync(struct mddev *mddev, int blocks, int ok); 570
573extern void md_error(struct mddev *mddev, struct md_rdev *rdev); 571extern int mddev_congested(mddev_t *mddev, int bits);
574extern void md_finish_reshape(struct mddev *mddev); 572extern void md_flush_request(mddev_t *mddev, struct bio *bio);
575 573extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
576extern int mddev_congested(struct mddev *mddev, int bits);
577extern void md_flush_request(struct mddev *mddev, struct bio *bio);
578extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
579 sector_t sector, int size, struct page *page); 574 sector_t sector, int size, struct page *page);
580extern void md_super_wait(struct mddev *mddev); 575extern void md_super_wait(mddev_t *mddev);
581extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 576extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
582 struct page *page, int rw, bool metadata_op); 577 struct page *page, int rw, bool metadata_op);
583extern void md_do_sync(struct md_thread *thread); 578extern void md_do_sync(mddev_t *mddev);
584extern void md_new_event(struct mddev *mddev); 579extern void md_new_event(mddev_t *mddev);
585extern int md_allow_write(struct mddev *mddev); 580extern int md_allow_write(mddev_t *mddev);
586extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); 581extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
587extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); 582extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
588extern int md_check_no_bitmap(struct mddev *mddev); 583extern int md_check_no_bitmap(mddev_t *mddev);
589extern int md_integrity_register(struct mddev *mddev); 584extern int md_integrity_register(mddev_t *mddev);
590extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); 585extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
591extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 586extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
592extern void restore_bitmap_write_access(struct file *file); 587extern void restore_bitmap_write_access(struct file *file);
593 588
594extern void mddev_init(struct mddev *mddev); 589extern void mddev_init(mddev_t *mddev);
595extern int md_run(struct mddev *mddev); 590extern int md_run(mddev_t *mddev);
596extern void md_stop(struct mddev *mddev); 591extern void md_stop(mddev_t *mddev);
597extern void md_stop_writes(struct mddev *mddev); 592extern void md_stop_writes(mddev_t *mddev);
598extern int md_rdev_init(struct md_rdev *rdev); 593extern int md_rdev_init(mdk_rdev_t *rdev);
599extern void md_rdev_clear(struct md_rdev *rdev);
600 594
601extern void mddev_suspend(struct mddev *mddev); 595extern void mddev_suspend(mddev_t *mddev);
602extern void mddev_resume(struct mddev *mddev); 596extern void mddev_resume(mddev_t *mddev);
603extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 597extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
604 struct mddev *mddev); 598 mddev_t *mddev);
605extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 599extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
606 struct mddev *mddev); 600 mddev_t *mddev);
601extern int mddev_check_plugged(mddev_t *mddev);
607extern void md_trim_bio(struct bio *bio, int offset, int size); 602extern void md_trim_bio(struct bio *bio, int offset, int size);
608
609extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
610static inline int mddev_check_plugged(struct mddev *mddev)
611{
612 return !!blk_check_plugged(md_unplug, mddev,
613 sizeof(struct blk_plug_cb));
614}
615#endif /* _MD_MD_H */ 603#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a3..d5b5fb30017 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -20,7 +20,6 @@
20 */ 20 */
21 21
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/module.h>
24#include <linux/raid/md_u.h> 23#include <linux/raid/md_u.h>
25#include <linux/seq_file.h> 24#include <linux/seq_file.h>
26#include <linux/slab.h> 25#include <linux/slab.h>
@@ -32,7 +31,7 @@
32#define NR_RESERVED_BUFS 32 31#define NR_RESERVED_BUFS 32
33 32
34 33
35static int multipath_map (struct mpconf *conf) 34static int multipath_map (multipath_conf_t *conf)
36{ 35{
37 int i, disks = conf->raid_disks; 36 int i, disks = conf->raid_disks;
38 37
@@ -43,7 +42,7 @@ static int multipath_map (struct mpconf *conf)
43 42
44 rcu_read_lock(); 43 rcu_read_lock();
45 for (i = 0; i < disks; i++) { 44 for (i = 0; i < disks; i++) {
46 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); 45 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
47 if (rdev && test_bit(In_sync, &rdev->flags)) { 46 if (rdev && test_bit(In_sync, &rdev->flags)) {
48 atomic_inc(&rdev->nr_pending); 47 atomic_inc(&rdev->nr_pending);
49 rcu_read_unlock(); 48 rcu_read_unlock();
@@ -59,8 +58,8 @@ static int multipath_map (struct mpconf *conf)
59static void multipath_reschedule_retry (struct multipath_bh *mp_bh) 58static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
60{ 59{
61 unsigned long flags; 60 unsigned long flags;
62 struct mddev *mddev = mp_bh->mddev; 61 mddev_t *mddev = mp_bh->mddev;
63 struct mpconf *conf = mddev->private; 62 multipath_conf_t *conf = mddev->private;
64 63
65 spin_lock_irqsave(&conf->device_lock, flags); 64 spin_lock_irqsave(&conf->device_lock, flags);
66 list_add(&mp_bh->retry_list, &conf->retry_list); 65 list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -77,7 +76,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
77static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) 76static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
78{ 77{
79 struct bio *bio = mp_bh->master_bio; 78 struct bio *bio = mp_bh->master_bio;
80 struct mpconf *conf = mp_bh->mddev->private; 79 multipath_conf_t *conf = mp_bh->mddev->private;
81 80
82 bio_endio(bio, err); 81 bio_endio(bio, err);
83 mempool_free(mp_bh, conf->pool); 82 mempool_free(mp_bh, conf->pool);
@@ -87,8 +86,8 @@ static void multipath_end_request(struct bio *bio, int error)
87{ 86{
88 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
89 struct multipath_bh *mp_bh = bio->bi_private; 88 struct multipath_bh *mp_bh = bio->bi_private;
90 struct mpconf *conf = mp_bh->mddev->private; 89 multipath_conf_t *conf = mp_bh->mddev->private;
91 struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
92 91
93 if (uptodate) 92 if (uptodate)
94 multipath_end_bh_io(mp_bh, 0); 93 multipath_end_bh_io(mp_bh, 0);
@@ -107,15 +106,15 @@ static void multipath_end_request(struct bio *bio, int error)
107 rdev_dec_pending(rdev, conf->mddev); 106 rdev_dec_pending(rdev, conf->mddev);
108} 107}
109 108
110static void multipath_make_request(struct mddev *mddev, struct bio * bio) 109static int multipath_make_request(mddev_t *mddev, struct bio * bio)
111{ 110{
112 struct mpconf *conf = mddev->private; 111 multipath_conf_t *conf = mddev->private;
113 struct multipath_bh * mp_bh; 112 struct multipath_bh * mp_bh;
114 struct multipath_info *multipath; 113 struct multipath_info *multipath;
115 114
116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 115 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
117 md_flush_request(mddev, bio); 116 md_flush_request(mddev, bio);
118 return; 117 return 0;
119 } 118 }
120 119
121 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 120 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -127,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
127 if (mp_bh->path < 0) { 126 if (mp_bh->path < 0) {
128 bio_endio(bio, -EIO); 127 bio_endio(bio, -EIO);
129 mempool_free(mp_bh, conf->pool); 128 mempool_free(mp_bh, conf->pool);
130 return; 129 return 0;
131 } 130 }
132 multipath = conf->multipaths + mp_bh->path; 131 multipath = conf->multipaths + mp_bh->path;
133 132
@@ -138,12 +137,12 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
138 mp_bh->bio.bi_end_io = multipath_end_request; 137 mp_bh->bio.bi_end_io = multipath_end_request;
139 mp_bh->bio.bi_private = mp_bh; 138 mp_bh->bio.bi_private = mp_bh;
140 generic_make_request(&mp_bh->bio); 139 generic_make_request(&mp_bh->bio);
141 return; 140 return 0;
142} 141}
143 142
144static void multipath_status (struct seq_file *seq, struct mddev *mddev) 143static void multipath_status (struct seq_file *seq, mddev_t *mddev)
145{ 144{
146 struct mpconf *conf = mddev->private; 145 multipath_conf_t *conf = mddev->private;
147 int i; 146 int i;
148 147
149 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 148 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -157,8 +156,8 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
157 156
158static int multipath_congested(void *data, int bits) 157static int multipath_congested(void *data, int bits)
159{ 158{
160 struct mddev *mddev = data; 159 mddev_t *mddev = data;
161 struct mpconf *conf = mddev->private; 160 multipath_conf_t *conf = mddev->private;
162 int i, ret = 0; 161 int i, ret = 0;
163 162
164 if (mddev_congested(mddev, bits)) 163 if (mddev_congested(mddev, bits))
@@ -166,7 +165,7 @@ static int multipath_congested(void *data, int bits)
166 165
167 rcu_read_lock(); 166 rcu_read_lock();
168 for (i = 0; i < mddev->raid_disks ; i++) { 167 for (i = 0; i < mddev->raid_disks ; i++) {
169 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); 168 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
170 if (rdev && !test_bit(Faulty, &rdev->flags)) { 169 if (rdev && !test_bit(Faulty, &rdev->flags)) {
171 struct request_queue *q = bdev_get_queue(rdev->bdev); 170 struct request_queue *q = bdev_get_queue(rdev->bdev);
172 171
@@ -184,9 +183,9 @@ static int multipath_congested(void *data, int bits)
184/* 183/*
185 * Careful, this can execute in IRQ contexts as well! 184 * Careful, this can execute in IRQ contexts as well!
186 */ 185 */
187static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) 186static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
188{ 187{
189 struct mpconf *conf = mddev->private; 188 multipath_conf_t *conf = mddev->private;
190 char b[BDEVNAME_SIZE]; 189 char b[BDEVNAME_SIZE];
191 190
192 if (conf->raid_disks - mddev->degraded <= 1) { 191 if (conf->raid_disks - mddev->degraded <= 1) {
@@ -219,7 +218,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
219 conf->raid_disks - mddev->degraded); 218 conf->raid_disks - mddev->degraded);
220} 219}
221 220
222static void print_multipath_conf (struct mpconf *conf) 221static void print_multipath_conf (multipath_conf_t *conf)
223{ 222{
224 int i; 223 int i;
225 struct multipath_info *tmp; 224 struct multipath_info *tmp;
@@ -243,9 +242,9 @@ static void print_multipath_conf (struct mpconf *conf)
243} 242}
244 243
245 244
246static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) 245static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
247{ 246{
248 struct mpconf *conf = mddev->private; 247 multipath_conf_t *conf = mddev->private;
249 struct request_queue *q; 248 struct request_queue *q;
250 int err = -EEXIST; 249 int err = -EEXIST;
251 int path; 250 int path;
@@ -292,16 +291,17 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
292 return err; 291 return err;
293} 292}
294 293
295static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 294static int multipath_remove_disk(mddev_t *mddev, int number)
296{ 295{
297 struct mpconf *conf = mddev->private; 296 multipath_conf_t *conf = mddev->private;
298 int err = 0; 297 int err = 0;
299 int number = rdev->raid_disk; 298 mdk_rdev_t *rdev;
300 struct multipath_info *p = conf->multipaths + number; 299 struct multipath_info *p = conf->multipaths + number;
301 300
302 print_multipath_conf(conf); 301 print_multipath_conf(conf);
303 302
304 if (rdev == p->rdev) { 303 rdev = p->rdev;
304 if (rdev) {
305 if (test_bit(In_sync, &rdev->flags) || 305 if (test_bit(In_sync, &rdev->flags) ||
306 atomic_read(&rdev->nr_pending)) { 306 atomic_read(&rdev->nr_pending)) {
307 printk(KERN_ERR "hot-remove-disk, slot %d is identified" 307 printk(KERN_ERR "hot-remove-disk, slot %d is identified"
@@ -335,13 +335,12 @@ abort:
335 * 3. Performs writes following reads for array synchronising. 335 */
336 */ 336 */
337 337
338static void multipathd(struct md_thread *thread) 338static void multipathd (mddev_t *mddev)
339{ 339{
340 struct mddev *mddev = thread->mddev;
341 struct multipath_bh *mp_bh; 340 struct multipath_bh *mp_bh;
342 struct bio *bio; 341 struct bio *bio;
343 unsigned long flags; 342 unsigned long flags;
344 struct mpconf *conf = mddev->private; 343 multipath_conf_t *conf = mddev->private;
345 struct list_head *head = &conf->retry_list; 344 struct list_head *head = &conf->retry_list;
346 345
347 md_check_recovery(mddev); 346 md_check_recovery(mddev);
@@ -380,7 +379,7 @@ static void multipathd(struct md_thread *thread)
380 spin_unlock_irqrestore(&conf->device_lock, flags); 379 spin_unlock_irqrestore(&conf->device_lock, flags);
381} 380}
382 381
383static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) 382static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
384{ 383{
385 WARN_ONCE(sectors || raid_disks, 384 WARN_ONCE(sectors || raid_disks,
386 "%s does not support generic reshape\n", __func__); 385 "%s does not support generic reshape\n", __func__);
@@ -388,12 +387,12 @@ static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_d
388 return mddev->dev_sectors; 387 return mddev->dev_sectors;
389} 388}
390 389
391static int multipath_run (struct mddev *mddev) 390static int multipath_run (mddev_t *mddev)
392{ 391{
393 struct mpconf *conf; 392 multipath_conf_t *conf;
394 int disk_idx; 393 int disk_idx;
395 struct multipath_info *disk; 394 struct multipath_info *disk;
396 struct md_rdev *rdev; 395 mdk_rdev_t *rdev;
397 int working_disks; 396 int working_disks;
398 397
399 if (md_check_no_bitmap(mddev)) 398 if (md_check_no_bitmap(mddev))
@@ -410,7 +409,7 @@ static int multipath_run (struct mddev *mddev)
410 * should be freed in multipath_stop()] 409 * should be freed in multipath_stop()]
411 */ 410 */
412 411
413 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); 412 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
414 mddev->private = conf; 413 mddev->private = conf;
415 if (!conf) { 414 if (!conf) {
416 printk(KERN_ERR 415 printk(KERN_ERR
@@ -429,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
429 } 428 }
430 429
431 working_disks = 0; 430 working_disks = 0;
432 rdev_for_each(rdev, mddev) { 431 list_for_each_entry(rdev, &mddev->disks, same_set) {
433 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
434 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
435 disk_idx >= mddev->raid_disks) 434 disk_idx >= mddev->raid_disks)
@@ -475,8 +474,7 @@ static int multipath_run (struct mddev *mddev)
475 } 474 }
476 475
477 { 476 {
478 mddev->thread = md_register_thread(multipathd, mddev, 477 mddev->thread = md_register_thread(multipathd, mddev, NULL);
479 "multipath");
480 if (!mddev->thread) { 478 if (!mddev->thread) {
481 printk(KERN_ERR "multipath: couldn't allocate thread" 479 printk(KERN_ERR "multipath: couldn't allocate thread"
482 " for %s\n", mdname(mddev)); 480 " for %s\n", mdname(mddev));
@@ -512,9 +510,9 @@ out:
512} 510}
513 511
514 512
515static int multipath_stop (struct mddev *mddev) 513static int multipath_stop (mddev_t *mddev)
516{ 514{
517 struct mpconf *conf = mddev->private; 515 multipath_conf_t *conf = mddev->private;
518 516
519 md_unregister_thread(&mddev->thread); 517 md_unregister_thread(&mddev->thread);
520 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 518 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -525,7 +523,7 @@ static int multipath_stop (struct mddev *mddev)
525 return 0; 523 return 0;
526} 524}
527 525
528static struct md_personality multipath_personality = 526static struct mdk_personality multipath_personality =
529{ 527{
530 .name = "multipath", 528 .name = "multipath",
531 .level = LEVEL_MULTIPATH, 529 .level = LEVEL_MULTIPATH,
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 717c60f6289..3c5a45eb5f8 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -2,11 +2,11 @@
2#define _MULTIPATH_H 2#define _MULTIPATH_H
3 3
4struct multipath_info { 4struct multipath_info {
5 struct md_rdev *rdev; 5 mdk_rdev_t *rdev;
6}; 6};
7 7
8struct mpconf { 8struct multipath_private_data {
9 struct mddev *mddev; 9 mddev_t *mddev;
10 struct multipath_info *multipaths; 10 struct multipath_info *multipaths;
11 int raid_disks; 11 int raid_disks;
12 spinlock_t device_lock; 12 spinlock_t device_lock;
@@ -15,6 +15,8 @@ struct mpconf {
15 mempool_t *pool; 15 mempool_t *pool;
16}; 16};
17 17
18typedef struct multipath_private_data multipath_conf_t;
19
18/* 20/*
19 * this is our 'private' 'collective' MULTIPATH buffer head. 21 * this is our 'private' 'collective' MULTIPATH buffer head.
20 * it contains information about what kind of IO operations were started 22 * it contains information about what kind of IO operations were started
@@ -22,7 +24,7 @@ struct mpconf {
22 */ 24 */
23 25
24struct multipath_bh { 26struct multipath_bh {
25 struct mddev *mddev; 27 mddev_t *mddev;
26 struct bio *master_bio; 28 struct bio *master_bio;
27 struct bio bio; 29 struct bio bio;
28 int path; 30 int path;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
deleted file mode 100644
index ceb359050a5..00000000000
--- a/drivers/md/persistent-data/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
1config DM_PERSISTENT_DATA
2 tristate
3 depends on BLK_DEV_DM && EXPERIMENTAL
4 select LIBCRC32C
5 select DM_BUFIO
6 ---help---
7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
deleted file mode 100644
index d8e7cb767c1..00000000000
--- a/drivers/md/persistent-data/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \
3 dm-block-manager.o \
4 dm-space-map-common.o \
5 dm-space-map-disk.o \
6 dm-space-map-metadata.o \
7 dm-transaction-manager.o \
8 dm-btree.o \
9 dm-btree-remove.o \
10 dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
deleted file mode 100644
index 28c3ed072a7..00000000000
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ /dev/null
@@ -1,635 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-block-manager.h"
7#include "dm-persistent-data-internal.h"
8#include "../dm-bufio.h"
9
10#include <linux/crc32c.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/rwsem.h>
14#include <linux/device-mapper.h>
15#include <linux/stacktrace.h>
16
17#define DM_MSG_PREFIX "block manager"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * This is a read/write semaphore with a couple of differences.
23 *
24 * i) There is a restriction on the number of concurrent read locks that
25 * may be held at once. This is just an implementation detail.
26 *
27 * ii) Recursive locking attempts are detected and return EINVAL. A stack
28 * trace is also emitted for the previous lock acquisition.
29 *
30 * iii) Priority is given to write locks.
31 */
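
In the code below, lock->count carries the whole state: 0 means free, a positive value counts the active readers (capped at MAX_HOLDERS), and -1 marks a single writer. A toy userspace model of just that counting rule, with invented toy_* names and no waiter queue or spinlock:

#include <stdbool.h>

struct toy_lock { int count; };		/* 0 free, >0 readers, -1 writer */

#define TOY_MAX_HOLDERS 4

static bool toy_try_read(struct toy_lock *l)
{
	if (l->count < 0 || l->count >= TOY_MAX_HOLDERS)
		return false;		/* write-locked, or reader limit reached */
	l->count++;
	return true;
}

static bool toy_try_write(struct toy_lock *l)
{
	if (l->count != 0)
		return false;		/* any holder at all blocks a writer */
	l->count = -1;
	return true;
}
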
32#define MAX_HOLDERS 4
33#define MAX_STACK 10
34
35typedef unsigned long stack_entries[MAX_STACK];
36
37struct block_lock {
38 spinlock_t lock;
39 __s32 count;
40 struct list_head waiters;
41 struct task_struct *holders[MAX_HOLDERS];
42
43#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
44 struct stack_trace traces[MAX_HOLDERS];
45 stack_entries entries[MAX_HOLDERS];
46#endif
47};
48
49struct waiter {
50 struct list_head list;
51 struct task_struct *task;
52 int wants_write;
53};
54
55static unsigned __find_holder(struct block_lock *lock,
56 struct task_struct *task)
57{
58 unsigned i;
59
60 for (i = 0; i < MAX_HOLDERS; i++)
61 if (lock->holders[i] == task)
62 break;
63
64 BUG_ON(i == MAX_HOLDERS);
65 return i;
66}
67
68/* call this *after* you increment lock->count */
69static void __add_holder(struct block_lock *lock, struct task_struct *task)
70{
71 unsigned h = __find_holder(lock, NULL);
72#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
73 struct stack_trace *t;
74#endif
75
76 get_task_struct(task);
77 lock->holders[h] = task;
78
79#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
80 t = lock->traces + h;
81 t->nr_entries = 0;
82 t->max_entries = MAX_STACK;
83 t->entries = lock->entries[h];
84 t->skip = 2;
85 save_stack_trace(t);
86#endif
87}
88
89/* call this *before* you decrement lock->count */
90static void __del_holder(struct block_lock *lock, struct task_struct *task)
91{
92 unsigned h = __find_holder(lock, task);
93 lock->holders[h] = NULL;
94 put_task_struct(task);
95}
96
97static int __check_holder(struct block_lock *lock)
98{
99 unsigned i;
100#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
101 static struct stack_trace t;
102 static stack_entries entries;
103#endif
104
105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4);
111
112 DMERR("subsequent acquisition attempted here:");
113 t.nr_entries = 0;
114 t.max_entries = MAX_STACK;
115 t.entries = entries;
116 t.skip = 3;
117 save_stack_trace(&t);
118 print_stack_trace(&t, 4);
119#endif
120 return -EINVAL;
121 }
122 }
123
124 return 0;
125}
126
127static void __wait(struct waiter *w)
128{
129 for (;;) {
130 set_task_state(current, TASK_UNINTERRUPTIBLE);
131
132 if (!w->task)
133 break;
134
135 schedule();
136 }
137
138 set_task_state(current, TASK_RUNNING);
139}
140
141static void __wake_waiter(struct waiter *w)
142{
143 struct task_struct *task;
144
145 list_del(&w->list);
146 task = w->task;
147 smp_mb();
148 w->task = NULL;
149 wake_up_process(task);
150}
151
152/*
153 * We either wake a few readers or a single writer.
154 */
155static void __wake_many(struct block_lock *lock)
156{
157 struct waiter *w, *tmp;
158
159 BUG_ON(lock->count < 0);
160 list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
161 if (lock->count >= MAX_HOLDERS)
162 return;
163
164 if (w->wants_write) {
165 if (lock->count > 0)
166 return; /* still read locked */
167
168 lock->count = -1;
169 __add_holder(lock, w->task);
170 __wake_waiter(w);
171 return;
172 }
173
174 lock->count++;
175 __add_holder(lock, w->task);
176 __wake_waiter(w);
177 }
178}
179
180static void bl_init(struct block_lock *lock)
181{
182 int i;
183
184 spin_lock_init(&lock->lock);
185 lock->count = 0;
186 INIT_LIST_HEAD(&lock->waiters);
187 for (i = 0; i < MAX_HOLDERS; i++)
188 lock->holders[i] = NULL;
189}
190
191static int __available_for_read(struct block_lock *lock)
192{
193 return lock->count >= 0 &&
194 lock->count < MAX_HOLDERS &&
195 list_empty(&lock->waiters);
196}
197
198static int bl_down_read(struct block_lock *lock)
199{
200 int r;
201 struct waiter w;
202
203 spin_lock(&lock->lock);
204 r = __check_holder(lock);
205 if (r) {
206 spin_unlock(&lock->lock);
207 return r;
208 }
209
210 if (__available_for_read(lock)) {
211 lock->count++;
212 __add_holder(lock, current);
213 spin_unlock(&lock->lock);
214 return 0;
215 }
216
217 get_task_struct(current);
218
219 w.task = current;
220 w.wants_write = 0;
221 list_add_tail(&w.list, &lock->waiters);
222 spin_unlock(&lock->lock);
223
224 __wait(&w);
225 put_task_struct(current);
226 return 0;
227}
228
229static int bl_down_read_nonblock(struct block_lock *lock)
230{
231 int r;
232
233 spin_lock(&lock->lock);
234 r = __check_holder(lock);
235 if (r)
236 goto out;
237
238 if (__available_for_read(lock)) {
239 lock->count++;
240 __add_holder(lock, current);
241 r = 0;
242 } else
243 r = -EWOULDBLOCK;
244
245out:
246 spin_unlock(&lock->lock);
247 return r;
248}
249
250static void bl_up_read(struct block_lock *lock)
251{
252 spin_lock(&lock->lock);
253 BUG_ON(lock->count <= 0);
254 __del_holder(lock, current);
255 --lock->count;
256 if (!list_empty(&lock->waiters))
257 __wake_many(lock);
258 spin_unlock(&lock->lock);
259}
260
261static int bl_down_write(struct block_lock *lock)
262{
263 int r;
264 struct waiter w;
265
266 spin_lock(&lock->lock);
267 r = __check_holder(lock);
268 if (r) {
269 spin_unlock(&lock->lock);
270 return r;
271 }
272
273 if (lock->count == 0 && list_empty(&lock->waiters)) {
274 lock->count = -1;
275 __add_holder(lock, current);
276 spin_unlock(&lock->lock);
277 return 0;
278 }
279
280 get_task_struct(current);
281 w.task = current;
282 w.wants_write = 1;
283
284 /*
285	 * Writers are given priority. We know there's only one mutator in the
286	 * system, so we ignore the ordering reversal.
287 */
288 list_add(&w.list, &lock->waiters);
289 spin_unlock(&lock->lock);
290
291 __wait(&w);
292 put_task_struct(current);
293
294 return 0;
295}
296
297static void bl_up_write(struct block_lock *lock)
298{
299 spin_lock(&lock->lock);
300 __del_holder(lock, current);
301 lock->count = 0;
302 if (!list_empty(&lock->waiters))
303 __wake_many(lock);
304 spin_unlock(&lock->lock);
305}
306
307static void report_recursive_bug(dm_block_t b, int r)
308{
309 if (r == -EINVAL)
310 DMERR("recursive acquisition of block %llu requested.",
311 (unsigned long long) b);
312}
313
314/*----------------------------------------------------------------*/
315
316/*
317 * Block manager is currently implemented using dm-bufio. struct
318 * dm_block_manager and struct dm_block map directly onto a couple of
319 * structs in the bufio interface. I want to retain the freedom to move
320 * away from bufio in the future. So these structs are just cast within
321 * this .c file, rather than making it through to the public interface.
322 */
323static struct dm_buffer *to_buffer(struct dm_block *b)
324{
325 return (struct dm_buffer *) b;
326}
327
328dm_block_t dm_block_location(struct dm_block *b)
329{
330 return dm_bufio_get_block_number(to_buffer(b));
331}
332EXPORT_SYMBOL_GPL(dm_block_location);
333
334void *dm_block_data(struct dm_block *b)
335{
336 return dm_bufio_get_block_data(to_buffer(b));
337}
338EXPORT_SYMBOL_GPL(dm_block_data);
339
340struct buffer_aux {
341 struct dm_block_validator *validator;
342 struct block_lock lock;
343 int write_locked;
344};
345
346static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
347{
348 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
349 aux->validator = NULL;
350 bl_init(&aux->lock);
351}
352
353static void dm_block_manager_write_callback(struct dm_buffer *buf)
354{
355 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
356 if (aux->validator) {
357 aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
358 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
359 }
360}
361
362/*----------------------------------------------------------------
363 * Public interface
364 *--------------------------------------------------------------*/
365struct dm_block_manager {
366 struct dm_bufio_client *bufio;
367 bool read_only:1;
368};
369
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size,
372 unsigned cache_size,
373 unsigned max_held_per_thread)
374{
375 int r;
376 struct dm_block_manager *bm;
377
378 bm = kmalloc(sizeof(*bm), GFP_KERNEL);
379 if (!bm) {
380 r = -ENOMEM;
381 goto bad;
382 }
383
384 bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
385 sizeof(struct buffer_aux),
386 dm_block_manager_alloc_callback,
387 dm_block_manager_write_callback);
388 if (IS_ERR(bm->bufio)) {
389 r = PTR_ERR(bm->bufio);
390 kfree(bm);
391 goto bad;
392 }
393
394 bm->read_only = false;
395
396 return bm;
397
398bad:
399 return ERR_PTR(r);
400}
401EXPORT_SYMBOL_GPL(dm_block_manager_create);
402
403void dm_block_manager_destroy(struct dm_block_manager *bm)
404{
405 dm_bufio_client_destroy(bm->bufio);
406 kfree(bm);
407}
408EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
409
410unsigned dm_bm_block_size(struct dm_block_manager *bm)
411{
412 return dm_bufio_get_block_size(bm->bufio);
413}
414EXPORT_SYMBOL_GPL(dm_bm_block_size);
415
416dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
417{
418 return dm_bufio_get_device_size(bm->bufio);
419}
420
421static int dm_bm_validate_buffer(struct dm_block_manager *bm,
422 struct dm_buffer *buf,
423 struct buffer_aux *aux,
424 struct dm_block_validator *v)
425{
426 if (unlikely(!aux->validator)) {
427 int r;
428 if (!v)
429 return 0;
430 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
431 if (unlikely(r)) {
432 DMERR_LIMIT("%s validator check failed for block %llu", v->name,
433 (unsigned long long) dm_bufio_get_block_number(buf));
434 return r;
435 }
436 aux->validator = v;
437 } else {
438 if (unlikely(aux->validator != v)) {
439 DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
440 aux->validator->name, v ? v->name : "NULL",
441 (unsigned long long) dm_bufio_get_block_number(buf));
442 return -EINVAL;
443 }
444 }
445
446 return 0;
447}
448int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
449 struct dm_block_validator *v,
450 struct dm_block **result)
451{
452 struct buffer_aux *aux;
453 void *p;
454 int r;
455
456 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
457 if (unlikely(IS_ERR(p)))
458 return PTR_ERR(p);
459
460 aux = dm_bufio_get_aux_data(to_buffer(*result));
461 r = bl_down_read(&aux->lock);
462 if (unlikely(r)) {
463 dm_bufio_release(to_buffer(*result));
464 report_recursive_bug(b, r);
465 return r;
466 }
467
468 aux->write_locked = 0;
469
470 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
471 if (unlikely(r)) {
472 bl_up_read(&aux->lock);
473 dm_bufio_release(to_buffer(*result));
474 return r;
475 }
476
477 return 0;
478}
479EXPORT_SYMBOL_GPL(dm_bm_read_lock);
480
481int dm_bm_write_lock(struct dm_block_manager *bm,
482 dm_block_t b, struct dm_block_validator *v,
483 struct dm_block **result)
484{
485 struct buffer_aux *aux;
486 void *p;
487 int r;
488
489 if (bm->read_only)
490 return -EPERM;
491
492 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
493 if (unlikely(IS_ERR(p)))
494 return PTR_ERR(p);
495
496 aux = dm_bufio_get_aux_data(to_buffer(*result));
497 r = bl_down_write(&aux->lock);
498 if (r) {
499 dm_bufio_release(to_buffer(*result));
500 report_recursive_bug(b, r);
501 return r;
502 }
503
504 aux->write_locked = 1;
505
506 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
507 if (unlikely(r)) {
508 bl_up_write(&aux->lock);
509 dm_bufio_release(to_buffer(*result));
510 return r;
511 }
512
513 return 0;
514}
515EXPORT_SYMBOL_GPL(dm_bm_write_lock);
516
517int dm_bm_read_try_lock(struct dm_block_manager *bm,
518 dm_block_t b, struct dm_block_validator *v,
519 struct dm_block **result)
520{
521 struct buffer_aux *aux;
522 void *p;
523 int r;
524
525 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
526 if (unlikely(IS_ERR(p)))
527 return PTR_ERR(p);
528 if (unlikely(!p))
529 return -EWOULDBLOCK;
530
531 aux = dm_bufio_get_aux_data(to_buffer(*result));
532 r = bl_down_read_nonblock(&aux->lock);
533 if (r < 0) {
534 dm_bufio_release(to_buffer(*result));
535 report_recursive_bug(b, r);
536 return r;
537 }
538 aux->write_locked = 0;
539
540 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
541 if (unlikely(r)) {
542 bl_up_read(&aux->lock);
543 dm_bufio_release(to_buffer(*result));
544 return r;
545 }
546
547 return 0;
548}
549
550int dm_bm_write_lock_zero(struct dm_block_manager *bm,
551 dm_block_t b, struct dm_block_validator *v,
552 struct dm_block **result)
553{
554 int r;
555 struct buffer_aux *aux;
556 void *p;
557
558 if (bm->read_only)
559 return -EPERM;
560
561 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
562 if (unlikely(IS_ERR(p)))
563 return PTR_ERR(p);
564
565 memset(p, 0, dm_bm_block_size(bm));
566
567 aux = dm_bufio_get_aux_data(to_buffer(*result));
568 r = bl_down_write(&aux->lock);
569 if (r) {
570 dm_bufio_release(to_buffer(*result));
571 return r;
572 }
573
574 aux->write_locked = 1;
575 aux->validator = v;
576
577 return 0;
578}
579EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
580
581int dm_bm_unlock(struct dm_block *b)
582{
583 struct buffer_aux *aux;
584 aux = dm_bufio_get_aux_data(to_buffer(b));
585
586 if (aux->write_locked) {
587 dm_bufio_mark_buffer_dirty(to_buffer(b));
588 bl_up_write(&aux->lock);
589 } else
590 bl_up_read(&aux->lock);
591
592 dm_bufio_release(to_buffer(b));
593
594 return 0;
595}
596EXPORT_SYMBOL_GPL(dm_bm_unlock);
597
598int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
599 struct dm_block *superblock)
600{
601 int r;
602
603 if (bm->read_only)
604 return -EPERM;
605
606 r = dm_bufio_write_dirty_buffers(bm->bufio);
607 if (unlikely(r)) {
608 dm_bm_unlock(superblock);
609 return r;
610 }
611
612 dm_bm_unlock(superblock);
613
614 return dm_bufio_write_dirty_buffers(bm->bufio);
615}
616
617void dm_bm_set_read_only(struct dm_block_manager *bm)
618{
619 bm->read_only = true;
620}
621EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
622
623u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
624{
625 return crc32c(~(u32) 0, data, len) ^ init_xor;
626}
627EXPORT_SYMBOL_GPL(dm_bm_checksum);
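/*
 * Illustrative sketch only (not part of the original file): a standalone,
 * bit-at-a-time CRC32C (Castagnoli, reflected polynomial 0x82F63B78) plus the
 * same seed/xor convention used by dm_bm_checksum() above. This assumes the
 * kernel's crc32c() is the plain Castagnoli CRC with no extra final inversion;
 * it is a slow reference, not the kernel implementation.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t example_crc32c(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
	}
	return crc;
}

static uint32_t example_bm_checksum(const void *data, size_t len, uint32_t init_xor)
{
	return example_crc32c(~(uint32_t) 0, data, len) ^ init_xor;
}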
628
629/*----------------------------------------------------------------*/
630
631MODULE_LICENSE("GPL");
632MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
633MODULE_DESCRIPTION("Immutable metadata library for dm");
634
635/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
deleted file mode 100644
index be5bff61be2..00000000000
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ /dev/null
@@ -1,128 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_BLOCK_MANAGER_H
8#define _LINUX_DM_BLOCK_MANAGER_H
9
10#include <linux/types.h>
11#include <linux/blkdev.h>
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Block number.
17 */
18typedef uint64_t dm_block_t;
19struct dm_block;
20
21dm_block_t dm_block_location(struct dm_block *b);
22void *dm_block_data(struct dm_block *b);
23
24/*----------------------------------------------------------------*/
25
26/*
27 * @name should be a unique identifier for the block manager, no longer
28 * than 32 chars.
29 *
30 * @max_held_per_thread should be the maximum number of locks, read or
31 * write, that an individual thread holds at any one time.
32 */
33struct dm_block_manager;
34struct dm_block_manager *dm_block_manager_create(
35 struct block_device *bdev, unsigned block_size,
36 unsigned cache_size, unsigned max_held_per_thread);
37void dm_block_manager_destroy(struct dm_block_manager *bm);
38
39unsigned dm_bm_block_size(struct dm_block_manager *bm);
40dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
41
42/*----------------------------------------------------------------*/
43
44/*
45 * The validator allows the caller to verify newly-read data and modify
46 * the data just before writing, e.g. to calculate checksums. It's
47 * important to be consistent with your use of validators. The only time
48 * you can change validators is if you call dm_bm_write_lock_zero.
49 */
50struct dm_block_validator {
51 const char *name;
52 void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
53
54 /*
55 * Return 0 if the checksum is valid or < 0 on error.
56 */
57 int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
58};
59
60/*----------------------------------------------------------------*/
61
62/*
63 * You can have multiple concurrent readers or a single writer holding a
64 * block lock.
65 */
66
67/*
 68 * dm_bm_read_lock() and dm_bm_write_lock() lock a block and return through
 69 * @result a pointer to memory holding a copy of that block. If you have
 70 * write-locked the block, any changes you make to that memory will be
 71 * written back to the disk sometime after dm_bm_unlock() is called.
72 */
73int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
74 struct dm_block_validator *v,
75 struct dm_block **result);
76
77int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
78 struct dm_block_validator *v,
79 struct dm_block **result);
80
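/*
 * Illustrative sketch only (not part of the original header): a typical
 * read-lock / inspect / unlock sequence against the API declared above.
 * The caller and the validator ("my_validator") are hypothetical.
 */
static int example_read_block(struct dm_block_manager *bm, dm_block_t where,
			      struct dm_block_validator *my_validator)
{
	struct dm_block *blk;
	int r;

	r = dm_bm_read_lock(bm, where, my_validator, &blk);
	if (r)
		return r;

	/* dm_block_data(blk) gives a pointer to an in-core copy of the block. */
	/* ... inspect the data here ... */

	return dm_bm_unlock(blk);
}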
81/*
82 * The *_try_lock variants return -EWOULDBLOCK if the block isn't
83 * available immediately.
84 */
85int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
86 struct dm_block_validator *v,
87 struct dm_block **result);
88
89/*
90 * Use dm_bm_write_lock_zero() when you know you're going to
91 * overwrite the block completely. It saves a disk read.
92 */
93int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
94 struct dm_block_validator *v,
95 struct dm_block **result);
96
97int dm_bm_unlock(struct dm_block *b);
98
99/*
100 * It's a common idiom to have a superblock that should be committed last.
101 *
102 * @superblock should be write-locked on entry. It will be unlocked during
103 * this function. All dirty blocks are guaranteed to be written and flushed
104 * before the superblock.
105 *
106 * This method always blocks.
107 */
108int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
109 struct dm_block *superblock);
110
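/*
 * Illustrative sketch only: the superblock commit idiom described above.
 * The superblock location, validator and update step are hypothetical;
 * error handling is abbreviated.
 */
static int example_commit(struct dm_block_manager *bm, dm_block_t sb_location,
			  struct dm_block_validator *sb_validator)
{
	struct dm_block *sblock;
	int r;

	r = dm_bm_write_lock(bm, sb_location, sb_validator, &sblock);
	if (r)
		return r;

	/* ... update the in-core superblock via dm_block_data(sblock) ... */

	/* Flushes all other dirty blocks, then writes the superblock last. */
	return dm_bm_flush_and_unlock(bm, sblock);
}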
111/*
 112 * Switches the bm to read-only mode. Once read-only mode
 113 * has been entered, the following functions will return -EPERM.
114 *
115 * dm_bm_write_lock
116 * dm_bm_write_lock_zero
117 * dm_bm_flush_and_unlock
118 *
 119 * Additionally, you should not use dm_bm_unlock_move(); however, no error
 120 * will be returned if you do.
121 */
122void dm_bm_set_read_only(struct dm_block_manager *bm);
123
124u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
125
126/*----------------------------------------------------------------*/
127
128#endif /* _LINUX_DM_BLOCK_MANAGER_H */
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
deleted file mode 100644
index accbb05f17b..00000000000
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BTREE_INTERNAL_H
8#define DM_BTREE_INTERNAL_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
 15 * We'll need two accessor functions for n->csum and n->blocknr
 16 * to support dm-btree-spine.c.
17 */
18
19enum node_flags {
20 INTERNAL_NODE = 1,
21 LEAF_NODE = 1 << 1
22};
23
24/*
25 * Every btree node begins with this structure. Make sure it's a multiple
 26 * of 8 bytes in size; otherwise the 64-bit keys will be misaligned.
27 */
28struct node_header {
29 __le32 csum;
30 __le32 flags;
31 __le64 blocknr; /* Block this node is supposed to live in. */
32
33 __le32 nr_entries;
34 __le32 max_entries;
35 __le32 value_size;
36 __le32 padding;
37} __packed;
38
39struct btree_node {
40 struct node_header header;
41 __le64 keys[0];
42} __packed;
43
44
45void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
46 struct dm_btree_value_type *vt);
47
48int new_block(struct dm_btree_info *info, struct dm_block **result);
49int unlock_block(struct dm_btree_info *info, struct dm_block *b);
50
51/*
 52 * Spines keep track of the rolling locks. There are two variants: read-only
 53 * and one that uses shadowing. These are separate structs to allow the
54 * type checker to spot misuse, for example accidentally calling read_lock
55 * on a shadow spine.
56 */
57struct ro_spine {
58 struct dm_btree_info *info;
59
60 int count;
61 struct dm_block *nodes[2];
62};
63
64void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
65int exit_ro_spine(struct ro_spine *s);
66int ro_step(struct ro_spine *s, dm_block_t new_child);
67struct btree_node *ro_node(struct ro_spine *s);
68
69struct shadow_spine {
70 struct dm_btree_info *info;
71
72 int count;
73 struct dm_block *nodes[2];
74
75 dm_block_t root;
76};
77
78void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
79int exit_shadow_spine(struct shadow_spine *s);
80
81int shadow_step(struct shadow_spine *s, dm_block_t b,
82 struct dm_btree_value_type *vt);
83
84/*
85 * The spine must have at least one entry before calling this.
86 */
87struct dm_block *shadow_current(struct shadow_spine *s);
88
89/*
90 * The spine must have at least two entries before calling this.
91 */
92struct dm_block *shadow_parent(struct shadow_spine *s);
93
94int shadow_has_parent(struct shadow_spine *s);
95
96int shadow_root(struct shadow_spine *s);
97
98/*
99 * Some inlines.
100 */
101static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
102{
103 return n->keys + index;
104}
105
106static inline void *value_base(struct btree_node *n)
107{
108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109}
110
111static inline void *value_ptr(struct btree_node *n, uint32_t index)
112{
113 uint32_t value_size = le32_to_cpu(n->header.value_size);
114 return value_base(n) + (value_size * index);
115}
116
117/*
118 * Assumes the values are suitably-aligned and converts to core format.
119 */
120static inline uint64_t value64(struct btree_node *n, uint32_t index)
121{
122 __le64 *values_le = value_base(n);
123
124 return le64_to_cpu(values_le[index]);
125}
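/*
 * Illustrative sketch only: the on-disk node layout implied by the inlines
 * above. After the 32-byte header come max_entries 8-byte keys, then the
 * values packed back to back, value_size bytes each. The numbers below are
 * just an example (max_entries = 252, value_size = 8).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t header_size = 32;	/* sizeof(struct node_header) */
	const uint32_t max_entries = 252;
	const uint32_t value_size = 8;
	uint32_t index = 10;

	uint32_t key_off = header_size + index * 8;
	uint32_t value_base_off = header_size + max_entries * 8;
	uint32_t value_off = value_base_off + index * value_size;

	printf("key[%u] at byte %u, value[%u] at byte %u\n",
	       index, key_off, index, value_off);
	return 0;
}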
126
127/*
128 * Searching for a key within a single node.
129 */
130int lower_bound(struct btree_node *n, uint64_t key);
131
132extern struct dm_block_validator btree_node_validator;
133
134#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
deleted file mode 100644
index c4f28133ef8..00000000000
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree.h"
8#include "dm-btree-internal.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12
13/*
14 * Removing an entry from a btree
15 * ==============================
16 *
17 * A very important constraint for our btree is that no node, except the
18 * root, may have fewer than a certain number of entries.
19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
20 *
21 * Ensuring this is complicated by the way we want to only ever hold the
22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
23 * fashion.
24 *
 25 * Each node may have a left or right sibling. When descending the spine,
26 * if a node contains only MIN_ENTRIES then we try and increase this to at
27 * least MIN_ENTRIES + 1. We do this in the following ways:
28 *
29 * [A] No siblings => this can only happen if the node is the root, in which
 30 * case we copy the child's contents over the root.
31 *
32 * [B] No left sibling
33 * ==> rebalance(node, right sibling)
34 *
35 * [C] No right sibling
36 * ==> rebalance(left sibling, node)
37 *
38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 39 * ==> delete the node, adding its contents to left and right
40 *
41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
42 * ==> rebalance(left, node, right)
43 *
44 * After these operations it's possible that the our original node no
45 * longer contains the desired sub tree. For this reason this rebalancing
46 * is performed on the children of the current node. This also avoids
47 * having a special case for the root.
48 *
49 * Once this rebalancing has occurred we can then step into the child node
50 * for internal nodes. Or delete the entry for leaf nodes.
51 */
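/*
 * Illustrative sketch only: how the thresholds further down in this file
 * decide between merging and redistributing. merge_threshold() is
 * max_entries / 3; two siblings merge when their combined entry count is
 * below 2 * merge_threshold + 1 (see __rebalance2), and the middle of three
 * siblings is deleted when the combined count is below 4 * merge_threshold + 1
 * (see __rebalance3). The helper names here are hypothetical.
 */
#include <stdint.h>

static unsigned example_merge_threshold(uint32_t max_entries)
{
	return max_entries / 3;
}

/* Returns 1 if two adjacent siblings should be merged rather than rebalanced. */
static int example_should_merge2(uint32_t nr_left, uint32_t nr_right,
				 uint32_t max_entries)
{
	return nr_left + nr_right < 2 * example_merge_threshold(max_entries) + 1;
}

/* Returns 1 if the middle of three siblings should be deleted outright. */
static int example_should_merge3(uint32_t nr_left, uint32_t nr_center,
				 uint32_t nr_right, uint32_t max_entries)
{
	return nr_left + nr_center + nr_right <
	       4 * example_merge_threshold(max_entries) + 1;
}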
52
53/*
54 * Some little utilities for moving node data around.
55 */
56static void node_shift(struct btree_node *n, int shift)
57{
58 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
59 uint32_t value_size = le32_to_cpu(n->header.value_size);
60
61 if (shift < 0) {
62 shift = -shift;
63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size);
71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0),
78 nr_entries * value_size);
79 }
80}
81
82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
83{
84 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
85 uint32_t value_size = le32_to_cpu(left->header.value_size);
86 BUG_ON(value_size != le32_to_cpu(right->header.value_size));
87
88 if (shift < 0) {
89 shift = -shift;
90 BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0),
93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0),
96 shift * value_size);
97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift),
104 shift * value_size);
105 }
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct btree_node *n, unsigned index)
112{
113 unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114 unsigned nr_to_copy = nr_entries - (index + 1);
115 uint32_t value_size = le32_to_cpu(n->header.value_size);
116 BUG_ON(index >= nr_entries);
117
118 if (nr_to_copy) {
119 memmove(key_ptr(n, index),
120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64));
122
123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1),
125 nr_to_copy * value_size);
126 }
127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned merge_threshold(struct btree_node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136struct child {
137 unsigned index;
138 struct dm_block *block;
139 struct btree_node *n;
140};
141
142static struct dm_btree_value_type le64_type = {
143 .context = NULL,
144 .size = sizeof(__le64),
145 .inc = NULL,
146 .dec = NULL,
147 .equal = NULL
148};
149
150static int init_child(struct dm_btree_info *info, struct btree_node *parent,
151 unsigned index, struct child *result)
152{
153 int r, inc;
154 dm_block_t root;
155
156 result->index = index;
157 root = value64(parent, index);
158
159 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
160 &result->block, &inc);
161 if (r)
162 return r;
163
164 result->n = dm_block_data(result->block);
165
166 if (inc)
167 inc_children(info->tm, result->n, &le64_type);
168
169 *((__le64 *) value_ptr(parent, index)) =
170 cpu_to_le64(dm_block_location(result->block));
171
172 return 0;
173}
174
175static int exit_child(struct dm_btree_info *info, struct child *c)
176{
177 return dm_tm_unlock(info->tm, c->block);
178}
179
180static void shift(struct btree_node *left, struct btree_node *right, int count)
181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count)
192 return;
193
194 if (count > 0) {
195 node_shift(right, count);
196 node_copy(left, right, count);
197 } else {
198 node_copy(left, right, count);
199 node_shift(right, count);
200 }
201
202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204}
205
206static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
207 struct child *l, struct child *r)
208{
209 struct btree_node *left = l->n;
210 struct btree_node *right = r->n;
211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
214
215 if (nr_left + nr_right < threshold) {
216 /*
217 * Merge
218 */
219 node_copy(left, right, -nr_right);
220 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
221 delete_at(parent, r->index);
222
223 /*
 224 * We need to decrement the right block, but not its
225 * children, since they're still referenced by left.
226 */
227 dm_tm_dec(info->tm, dm_block_location(r->block));
228 } else {
229 /*
230 * Rebalance.
231 */
232 unsigned target_left = (nr_left + nr_right) / 2;
233 shift(left, right, nr_left - target_left);
234 *key_ptr(parent, r->index) = right->keys[0];
235 }
236}
237
238static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
239 unsigned left_index)
240{
241 int r;
242 struct btree_node *parent;
243 struct child left, right;
244
245 parent = dm_block_data(shadow_current(s));
246
247 r = init_child(info, parent, left_index, &left);
248 if (r)
249 return r;
250
251 r = init_child(info, parent, left_index + 1, &right);
252 if (r) {
253 exit_child(info, &left);
254 return r;
255 }
256
257 __rebalance2(info, parent, &left, &right);
258
259 r = exit_child(info, &left);
260 if (r) {
261 exit_child(info, &right);
262 return r;
263 }
264
265 return exit_child(info, &right);
266}
267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct btree_node *left, struct btree_node *center, struct btree_node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct btree_node *left, struct btree_node *center, struct btree_node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
346static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
347 struct child *l, struct child *c, struct child *r)
348{
349 struct btree_node *left = l->n;
350 struct btree_node *center = c->n;
351 struct btree_node *right = r->n;
352
353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
356
357 unsigned threshold = merge_threshold(left) * 4 + 1;
358
359 BUG_ON(left->header.max_entries != center->header.max_entries);
360 BUG_ON(center->header.max_entries != right->header.max_entries);
361
362 if ((nr_left + nr_center + nr_right) < threshold)
363 delete_center_node(info, parent, l, c, r, left, center, right,
364 nr_left, nr_center, nr_right);
365 else
366 redistribute3(info, parent, l, c, r, left, center, right,
367 nr_left, nr_center, nr_right);
368}
369
370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
371 unsigned left_index)
372{
373 int r;
374 struct btree_node *parent = dm_block_data(shadow_current(s));
375 struct child left, center, right;
376
377 /*
378 * FIXME: fill out an array?
379 */
380 r = init_child(info, parent, left_index, &left);
381 if (r)
382 return r;
383
384 r = init_child(info, parent, left_index + 1, &center);
385 if (r) {
386 exit_child(info, &left);
387 return r;
388 }
389
390 r = init_child(info, parent, left_index + 2, &right);
391 if (r) {
392 exit_child(info, &left);
393 exit_child(info, &center);
394 return r;
395 }
396
397 __rebalance3(info, parent, &left, &center, &right);
398
399 r = exit_child(info, &left);
400 if (r) {
401 exit_child(info, &center);
402 exit_child(info, &right);
403 return r;
404 }
405
406 r = exit_child(info, &center);
407 if (r) {
408 exit_child(info, &right);
409 return r;
410 }
411
412 r = exit_child(info, &right);
413 if (r)
414 return r;
415
416 return 0;
417}
418
419static int get_nr_entries(struct dm_transaction_manager *tm,
420 dm_block_t b, uint32_t *result)
421{
422 int r;
423 struct dm_block *block;
424 struct btree_node *n;
425
426 r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
427 if (r)
428 return r;
429
430 n = dm_block_data(block);
431 *result = le32_to_cpu(n->header.nr_entries);
432
433 return dm_tm_unlock(tm, block);
434}
435
436static int rebalance_children(struct shadow_spine *s,
437 struct dm_btree_info *info, uint64_t key)
438{
439 int i, r, has_left_sibling, has_right_sibling;
440 uint32_t child_entries;
441 struct btree_node *n;
442
443 n = dm_block_data(shadow_current(s));
444
445 if (le32_to_cpu(n->header.nr_entries) == 1) {
446 struct dm_block *child;
447 dm_block_t b = value64(n, 0);
448
449 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
450 if (r)
451 return r;
452
453 memcpy(n, dm_block_data(child),
454 dm_bm_block_size(dm_tm_get_bm(info->tm)));
455 r = dm_tm_unlock(info->tm, child);
456 if (r)
457 return r;
458
459 dm_tm_dec(info->tm, dm_block_location(child));
460 return 0;
461 }
462
463 i = lower_bound(n, key);
464 if (i < 0)
465 return -ENODATA;
466
467 r = get_nr_entries(info->tm, value64(n, i), &child_entries);
468 if (r)
469 return r;
470
471 has_left_sibling = i > 0;
472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
473
474 if (!has_left_sibling)
475 r = rebalance2(s, info, i);
476
477 else if (!has_right_sibling)
478 r = rebalance2(s, info, i - 1);
479
480 else
481 r = rebalance3(s, info, i - 1);
482
483 return r;
484}
485
486static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
487{
488 int i = lower_bound(n, key);
489
490 if ((i < 0) ||
491 (i >= le32_to_cpu(n->header.nr_entries)) ||
492 (le64_to_cpu(n->keys[i]) != key))
493 return -ENODATA;
494
495 *index = i;
496
497 return 0;
498}
499
500/*
501 * Prepares for removal from one level of the hierarchy. The caller must
502 * call delete_at() to remove the entry at index.
503 */
504static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
505 struct dm_btree_value_type *vt, dm_block_t root,
506 uint64_t key, unsigned *index)
507{
508 int i = *index, r;
509 struct btree_node *n;
510
511 for (;;) {
512 r = shadow_step(s, root, vt);
513 if (r < 0)
514 break;
515
516 /*
 517 * We have to patch up the parent node. This is ugly, but I don't
 518 * see a way to do it automatically as part of the spine
 519 * op.
520 */
521 if (shadow_has_parent(s)) {
522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
524 &location, sizeof(__le64));
525 }
526
527 n = dm_block_data(shadow_current(s));
528
529 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
530 return do_leaf(n, key, index);
531
532 r = rebalance_children(s, info, key);
533 if (r)
534 break;
535
536 n = dm_block_data(shadow_current(s));
537 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
538 return do_leaf(n, key, index);
539
540 i = lower_bound(n, key);
541
542 /*
 543 * We know the key is present; otherwise
 544 * rebalance_children would have returned
 545 * -ENODATA.
546 */
547 root = value64(n, i);
548 }
549
550 return r;
551}
552
553int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
554 uint64_t *keys, dm_block_t *new_root)
555{
556 unsigned level, last_level = info->levels - 1;
557 int index = 0, r = 0;
558 struct shadow_spine spine;
559 struct btree_node *n;
560
561 init_shadow_spine(&spine, info);
562 for (level = 0; level < info->levels; level++) {
563 r = remove_raw(&spine, info,
564 (level == last_level ?
565 &info->value_type : &le64_type),
566 root, keys[level], (unsigned *)&index);
567 if (r < 0)
568 break;
569
570 n = dm_block_data(shadow_current(&spine));
571 if (level != last_level) {
572 root = value64(n, index);
573 continue;
574 }
575
576 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
577
578 if (info->value_type.dec)
579 info->value_type.dec(info->value_type.context,
580 value_ptr(n, index));
581
582 delete_at(n, index);
583 }
584
585 *new_root = shadow_root(&spine);
586 exit_shadow_spine(&spine);
587
588 return r;
589}
590EXPORT_SYMBOL_GPL(dm_btree_remove);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
deleted file mode 100644
index f199a0c4ed0..00000000000
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ /dev/null
@@ -1,244 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/device-mapper.h>
11
12#define DM_MSG_PREFIX "btree spine"
13
14/*----------------------------------------------------------------*/
15
16#define BTREE_CSUM_XOR 121107
17
18static int node_check(struct dm_block_validator *v,
19 struct dm_block *b,
20 size_t block_size);
21
22static void node_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct btree_node *n = dm_block_data(b);
27 struct node_header *h = &n->header;
28
29 h->blocknr = cpu_to_le64(dm_block_location(b));
30 h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
31 block_size - sizeof(__le32),
32 BTREE_CSUM_XOR));
33
34 BUG_ON(node_check(v, b, 4096));
35}
36
37static int node_check(struct dm_block_validator *v,
38 struct dm_block *b,
39 size_t block_size)
40{
41 struct btree_node *n = dm_block_data(b);
42 struct node_header *h = &n->header;
43 size_t value_size;
44 __le32 csum_disk;
45 uint32_t flags;
46
47 if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
48 DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
49 le64_to_cpu(h->blocknr), dm_block_location(b));
50 return -ENOTBLK;
51 }
52
53 csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
54 block_size - sizeof(__le32),
55 BTREE_CSUM_XOR));
56 if (csum_disk != h->csum) {
57 DMERR_LIMIT("node_check failed: csum %u != wanted %u",
58 le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
59 return -EILSEQ;
60 }
61
62 value_size = le32_to_cpu(h->value_size);
63
64 if (sizeof(struct node_header) +
65 (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
66 DMERR_LIMIT("node_check failed: max_entries too large");
67 return -EILSEQ;
68 }
69
70 if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
71 DMERR_LIMIT("node_check failed: too many entries");
72 return -EILSEQ;
73 }
74
75 /*
76 * The node must be either INTERNAL or LEAF.
77 */
78 flags = le32_to_cpu(h->flags);
79 if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
80 DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
81 return -EILSEQ;
82 }
83
84 return 0;
85}
86
87struct dm_block_validator btree_node_validator = {
88 .name = "btree_node",
89 .prepare_for_write = node_prepare_for_write,
90 .check = node_check
91};
92
93/*----------------------------------------------------------------*/
94
95static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
96 struct dm_block **result)
97{
98 return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
99}
100
101static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
102 struct dm_btree_value_type *vt,
103 struct dm_block **result)
104{
105 int r, inc;
106
107 r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
108 result, &inc);
109 if (!r && inc)
110 inc_children(info->tm, dm_block_data(*result), vt);
111
112 return r;
113}
114
115int new_block(struct dm_btree_info *info, struct dm_block **result)
116{
117 return dm_tm_new_block(info->tm, &btree_node_validator, result);
118}
119
120int unlock_block(struct dm_btree_info *info, struct dm_block *b)
121{
122 return dm_tm_unlock(info->tm, b);
123}
124
125/*----------------------------------------------------------------*/
126
127void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
128{
129 s->info = info;
130 s->count = 0;
131 s->nodes[0] = NULL;
132 s->nodes[1] = NULL;
133}
134
135int exit_ro_spine(struct ro_spine *s)
136{
137 int r = 0, i;
138
139 for (i = 0; i < s->count; i++) {
140 int r2 = unlock_block(s->info, s->nodes[i]);
141 if (r2 < 0)
142 r = r2;
143 }
144
145 return r;
146}
147
148int ro_step(struct ro_spine *s, dm_block_t new_child)
149{
150 int r;
151
152 if (s->count == 2) {
153 r = unlock_block(s->info, s->nodes[0]);
154 if (r < 0)
155 return r;
156 s->nodes[0] = s->nodes[1];
157 s->count--;
158 }
159
160 r = bn_read_lock(s->info, new_child, s->nodes + s->count);
161 if (!r)
162 s->count++;
163
164 return r;
165}
166
167struct btree_node *ro_node(struct ro_spine *s)
168{
169 struct dm_block *block;
170
171 BUG_ON(!s->count);
172 block = s->nodes[s->count - 1];
173
174 return dm_block_data(block);
175}
176
177/*----------------------------------------------------------------*/
178
179void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
180{
181 s->info = info;
182 s->count = 0;
183}
184
185int exit_shadow_spine(struct shadow_spine *s)
186{
187 int r = 0, i;
188
189 for (i = 0; i < s->count; i++) {
190 int r2 = unlock_block(s->info, s->nodes[i]);
191 if (r2 < 0)
192 r = r2;
193 }
194
195 return r;
196}
197
198int shadow_step(struct shadow_spine *s, dm_block_t b,
199 struct dm_btree_value_type *vt)
200{
201 int r;
202
203 if (s->count == 2) {
204 r = unlock_block(s->info, s->nodes[0]);
205 if (r < 0)
206 return r;
207 s->nodes[0] = s->nodes[1];
208 s->count--;
209 }
210
211 r = bn_shadow(s->info, b, vt, s->nodes + s->count);
212 if (!r) {
213 if (!s->count)
214 s->root = dm_block_location(s->nodes[0]);
215
216 s->count++;
217 }
218
219 return r;
220}
221
222struct dm_block *shadow_current(struct shadow_spine *s)
223{
224 BUG_ON(!s->count);
225
226 return s->nodes[s->count - 1];
227}
228
229struct dm_block *shadow_parent(struct shadow_spine *s)
230{
231 BUG_ON(s->count != 2);
232
233 return s->count == 2 ? s->nodes[0] : NULL;
234}
235
236int shadow_has_parent(struct shadow_spine *s)
237{
238 return s->count >= 2;
239}
240
241int shadow_root(struct shadow_spine *s)
242{
243 return s->root;
244}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
deleted file mode 100644
index 4caf66918cd..00000000000
--- a/drivers/md/persistent-data/dm-btree.c
+++ /dev/null
@@ -1,809 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16/*----------------------------------------------------------------
17 * Array manipulation
18 *--------------------------------------------------------------*/
19static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
21{
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
24}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38/*----------------------------------------------------------------*/
39
40/* makes the assumption that no two keys are the same. */
41static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
61int lower_bound(struct btree_node *n, uint64_t key)
62{
63 return bsearch(n, key, 0);
64}
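/*
 * Illustrative sketch only: the semantics of lower_bound()/bsearch() above,
 * shown on a plain uint64_t array. It returns the index of the greatest key
 * that is <= the search key, or -1 if every key is greater.
 */
#include <stdint.h>
#include <stdio.h>

static int example_lower_bound(const uint64_t *keys, int nr, uint64_t key)
{
	int lo = -1, hi = nr;

	while (hi - lo > 1) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] == key)
			return mid;
		if (keys[mid] < key)
			lo = mid;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	uint64_t keys[] = { 10, 20, 30 };

	printf("%d %d %d\n",
	       example_lower_bound(keys, 3, 5),		/* -1: below all keys    */
	       example_lower_bound(keys, 3, 20),	/*  1: exact match       */
	       example_lower_bound(keys, 3, 25));	/*  1: greatest key <= 25 */
	return 0;
}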
65
66void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, value_ptr(n, i));
78}
79
80static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
81 uint64_t key, void *value)
82 __dm_written_to_disk(value)
83{
84 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
85 __le64 key_le = cpu_to_le64(key);
86
87 if (index > nr_entries ||
88 index >= le32_to_cpu(node->header.max_entries)) {
89 DMERR("too many entries in btree node for insert");
90 __dm_unbless_for_disk(value);
91 return -ENOMEM;
92 }
93
94 __dm_bless_for_disk(&key_le);
95
96 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
97 array_insert(value_base(node), value_size, nr_entries, index, value);
98 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
99
100 return 0;
101}
102
103/*----------------------------------------------------------------*/
104
105/*
106 * We want 3n entries (for some n). This works more nicely for repeated
 107 * insert/remove loops than (2n + 1).
108 */
109static uint32_t calc_max_entries(size_t value_size, size_t block_size)
110{
111 uint32_t total, n;
112 size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
113
114 block_size -= sizeof(struct node_header);
115 total = block_size / elt_size;
116 n = total / 3; /* rounds down */
117
118 return 3 * n;
119}
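/*
 * Illustrative sketch only: the arithmetic performed by calc_max_entries()
 * for a 4096-byte block and 8-byte values. sizeof(struct node_header) is 32,
 * and each entry costs an 8-byte key plus the value.
 */
#include <stdio.h>

int main(void)
{
	unsigned block_size = 4096;
	unsigned value_size = 8;
	unsigned elt_size = 8 + value_size;		/* key + value = 16   */
	unsigned total = (block_size - 32) / elt_size;	/* 4064 / 16 = 254    */
	unsigned max_entries = 3 * (total / 3);		/* rounds down to 252 */

	printf("max_entries = %u\n", max_entries);
	return 0;
}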
120
121int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
122{
123 int r;
124 struct dm_block *b;
125 struct btree_node *n;
126 size_t block_size;
127 uint32_t max_entries;
128
129 r = new_block(info, &b);
130 if (r < 0)
131 return r;
132
133 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
134 max_entries = calc_max_entries(info->value_type.size, block_size);
135
136 n = dm_block_data(b);
137 memset(n, 0, block_size);
138 n->header.flags = cpu_to_le32(LEAF_NODE);
139 n->header.nr_entries = cpu_to_le32(0);
140 n->header.max_entries = cpu_to_le32(max_entries);
141 n->header.value_size = cpu_to_le32(info->value_type.size);
142
143 *root = dm_block_location(b);
144 return unlock_block(info, b);
145}
146EXPORT_SYMBOL_GPL(dm_btree_empty);
147
148/*----------------------------------------------------------------*/
149
150/*
 151 * Deletion uses a recursive algorithm. Since we have limited stack space,
 152 * we explicitly manage our own stack on the heap.
153 */
154#define MAX_SPINE_DEPTH 64
155struct frame {
156 struct dm_block *b;
157 struct btree_node *n;
158 unsigned level;
159 unsigned nr_children;
160 unsigned current_child;
161};
162
163struct del_stack {
164 struct dm_transaction_manager *tm;
165 int top;
166 struct frame spine[MAX_SPINE_DEPTH];
167};
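/*
 * Illustrative sketch only: the general pattern used below, i.e. replacing
 * recursion with an explicit, bounded stack of frames on the heap. This toy
 * version walks a binary tree of ints; the dm code does the same with btree
 * nodes, reference counts and read locks. All names here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_node {
	int value;
	struct toy_node *left, *right;
};

#define TOY_MAX_DEPTH 64

int toy_visit_all(struct toy_node *root)
{
	struct toy_node **stack;
	int top = -1;

	stack = malloc(TOY_MAX_DEPTH * sizeof(*stack));
	if (!stack)
		return -1;

	if (root)
		stack[++top] = root;

	while (top >= 0) {
		struct toy_node *n = stack[top--];	/* pop the current frame */

		printf("%d\n", n->value);

		if (top + 2 >= TOY_MAX_DEPTH) {		/* bounded, like MAX_SPINE_DEPTH */
			free(stack);
			return -1;
		}
		if (n->left)
			stack[++top] = n->left;
		if (n->right)
			stack[++top] = n->right;
	}

	free(stack);
	return 0;
}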
168
169static int top_frame(struct del_stack *s, struct frame **f)
170{
171 if (s->top < 0) {
172 DMERR("btree deletion stack empty");
173 return -EINVAL;
174 }
175
176 *f = s->spine + s->top;
177
178 return 0;
179}
180
181static int unprocessed_frames(struct del_stack *s)
182{
183 return s->top >= 0;
184}
185
186static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
187{
188 int r;
189 uint32_t ref_count;
190
191 if (s->top >= MAX_SPINE_DEPTH - 1) {
192 DMERR("btree deletion stack out of memory");
193 return -ENOMEM;
194 }
195
196 r = dm_tm_ref(s->tm, b, &ref_count);
197 if (r)
198 return r;
199
200 if (ref_count > 1)
201 /*
 202		 * This is a shared node, so we can just decrement its
203 * reference counter and leave the children.
204 */
205 dm_tm_dec(s->tm, b);
206
207 else {
208 struct frame *f = s->spine + ++s->top;
209
210 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
211 if (r) {
212 s->top--;
213 return r;
214 }
215
216 f->n = dm_block_data(f->b);
217 f->level = level;
218 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
219 f->current_child = 0;
220 }
221
222 return 0;
223}
224
225static void pop_frame(struct del_stack *s)
226{
227 struct frame *f = s->spine + s->top--;
228
229 dm_tm_dec(s->tm, dm_block_location(f->b));
230 dm_tm_unlock(s->tm, f->b);
231}
232
233static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
234{
235 return f->level < (info->levels - 1);
236}
237
238int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
239{
240 int r;
241 struct del_stack *s;
242
243 s = kmalloc(sizeof(*s), GFP_KERNEL);
244 if (!s)
245 return -ENOMEM;
246 s->tm = info->tm;
247 s->top = -1;
248
249 r = push_frame(s, root, 0);
250 if (r)
251 goto out;
252
253 while (unprocessed_frames(s)) {
254 uint32_t flags;
255 struct frame *f;
256 dm_block_t b;
257
258 r = top_frame(s, &f);
259 if (r)
260 goto out;
261
262 if (f->current_child >= f->nr_children) {
263 pop_frame(s);
264 continue;
265 }
266
267 flags = le32_to_cpu(f->n->header.flags);
268 if (flags & INTERNAL_NODE) {
269 b = value64(f->n, f->current_child);
270 f->current_child++;
271 r = push_frame(s, b, f->level);
272 if (r)
273 goto out;
274
275 } else if (is_internal_level(info, f)) {
276 b = value64(f->n, f->current_child);
277 f->current_child++;
278 r = push_frame(s, b, f->level + 1);
279 if (r)
280 goto out;
281
282 } else {
283 if (info->value_type.dec) {
284 unsigned i;
285
286 for (i = 0; i < f->nr_children; i++)
287 info->value_type.dec(info->value_type.context,
288 value_ptr(f->n, i));
289 }
290 f->current_child = f->nr_children;
291 }
292 }
293
294out:
295 kfree(s);
296 return r;
297}
298EXPORT_SYMBOL_GPL(dm_btree_del);
299
300/*----------------------------------------------------------------*/
301
302static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
303 int (*search_fn)(struct btree_node *, uint64_t),
304 uint64_t *result_key, void *v, size_t value_size)
305{
306 int i, r;
307 uint32_t flags, nr_entries;
308
309 do {
310 r = ro_step(s, block);
311 if (r < 0)
312 return r;
313
314 i = search_fn(ro_node(s), key);
315
316 flags = le32_to_cpu(ro_node(s)->header.flags);
317 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
318 if (i < 0 || i >= nr_entries)
319 return -ENODATA;
320
321 if (flags & INTERNAL_NODE)
322 block = value64(ro_node(s), i);
323
324 } while (!(flags & LEAF_NODE));
325
326 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
327 memcpy(v, value_ptr(ro_node(s), i), value_size);
328
329 return 0;
330}
331
332int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
333 uint64_t *keys, void *value_le)
334{
335 unsigned level, last_level = info->levels - 1;
336 int r = -ENODATA;
337 uint64_t rkey;
338 __le64 internal_value_le;
339 struct ro_spine spine;
340
341 init_ro_spine(&spine, info);
342 for (level = 0; level < info->levels; level++) {
343 size_t size;
344 void *value_p;
345
346 if (level == last_level) {
347 value_p = value_le;
348 size = info->value_type.size;
349
350 } else {
351 value_p = &internal_value_le;
352 size = sizeof(uint64_t);
353 }
354
355 r = btree_lookup_raw(&spine, root, keys[level],
356 lower_bound, &rkey,
357 value_p, size);
358
359 if (!r) {
360 if (rkey != keys[level]) {
361 exit_ro_spine(&spine);
362 return -ENODATA;
363 }
364 } else {
365 exit_ro_spine(&spine);
366 return r;
367 }
368
369 root = le64_to_cpu(internal_value_le);
370 }
371 exit_ro_spine(&spine);
372
373 return r;
374}
375EXPORT_SYMBOL_GPL(dm_btree_lookup);
376
377/*
 378 * Splits a node by creating a sibling node and shifting half the node's
 379 * contents across. Assumes there is a parent node, and that it has room for
380 * another child.
381 *
382 * Before:
383 * +--------+
384 * | Parent |
385 * +--------+
386 * |
387 * v
388 * +----------+
389 * | A ++++++ |
390 * +----------+
391 *
392 *
393 * After:
394 * +--------+
395 * | Parent |
396 * +--------+
397 * | |
398 * v +------+
399 * +---------+ |
400 * | A* +++ | v
401 * +---------+ +-------+
402 * | B +++ |
403 * +-------+
404 *
405 * Where A* is a shadow of A.
406 */
407static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
408 unsigned parent_index, uint64_t key)
409{
410 int r;
411 size_t size;
412 unsigned nr_left, nr_right;
413 struct dm_block *left, *right, *parent;
414 struct btree_node *ln, *rn, *pn;
415 __le64 location;
416
417 left = shadow_current(s);
418
419 r = new_block(s->info, &right);
420 if (r < 0)
421 return r;
422
423 ln = dm_block_data(left);
424 rn = dm_block_data(right);
425
426 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
427 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
428
429 ln->header.nr_entries = cpu_to_le32(nr_left);
430
431 rn->header.flags = ln->header.flags;
432 rn->header.nr_entries = cpu_to_le32(nr_right);
433 rn->header.max_entries = ln->header.max_entries;
434 rn->header.value_size = ln->header.value_size;
435 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
436
437 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
438 sizeof(uint64_t) : s->info->value_type.size;
439 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
440 size * nr_right);
441
442 /*
443 * Patch up the parent
444 */
445 parent = shadow_parent(s);
446
447 pn = dm_block_data(parent);
448 location = cpu_to_le64(dm_block_location(left));
449 __dm_bless_for_disk(&location);
450 memcpy_disk(value_ptr(pn, parent_index),
451 &location, sizeof(__le64));
452
453 location = cpu_to_le64(dm_block_location(right));
454 __dm_bless_for_disk(&location);
455
456 r = insert_at(sizeof(__le64), pn, parent_index + 1,
457 le64_to_cpu(rn->keys[0]), &location);
458 if (r)
459 return r;
460
461 if (key < le64_to_cpu(rn->keys[0])) {
462 unlock_block(s->info, right);
463 s->nodes[1] = left;
464 } else {
465 unlock_block(s->info, left);
466 s->nodes[1] = right;
467 }
468
469 return 0;
470}
471
472/*
473 * Splits a node by creating two new children beneath the given node.
474 *
475 * Before:
476 * +----------+
477 * | A ++++++ |
478 * +----------+
479 *
480 *
481 * After:
482 * +------------+
483 * | A (shadow) |
484 * +------------+
485 * | |
486 * +------+ +----+
487 * | |
488 * v v
489 * +-------+ +-------+
490 * | B +++ | | C +++ |
491 * +-------+ +-------+
492 */
493static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
494{
495 int r;
496 size_t size;
497 unsigned nr_left, nr_right;
498 struct dm_block *left, *right, *new_parent;
499 struct btree_node *pn, *ln, *rn;
500 __le64 val;
501
502 new_parent = shadow_current(s);
503
504 r = new_block(s->info, &left);
505 if (r < 0)
506 return r;
507
508 r = new_block(s->info, &right);
509 if (r < 0) {
510 /* FIXME: put left */
511 return r;
512 }
513
514 pn = dm_block_data(new_parent);
515 ln = dm_block_data(left);
516 rn = dm_block_data(right);
517
518 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
519 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
520
521 ln->header.flags = pn->header.flags;
522 ln->header.nr_entries = cpu_to_le32(nr_left);
523 ln->header.max_entries = pn->header.max_entries;
524 ln->header.value_size = pn->header.value_size;
525
526 rn->header.flags = pn->header.flags;
527 rn->header.nr_entries = cpu_to_le32(nr_right);
528 rn->header.max_entries = pn->header.max_entries;
529 rn->header.value_size = pn->header.value_size;
530
531 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
532 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
533
534 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
535 sizeof(__le64) : s->info->value_type.size;
536 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
537 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
538 nr_right * size);
539
540 /* new_parent should just point to l and r now */
541 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
542 pn->header.nr_entries = cpu_to_le32(2);
543 pn->header.max_entries = cpu_to_le32(
544 calc_max_entries(sizeof(__le64),
545 dm_bm_block_size(
546 dm_tm_get_bm(s->info->tm))));
547 pn->header.value_size = cpu_to_le32(sizeof(__le64));
548
549 val = cpu_to_le64(dm_block_location(left));
550 __dm_bless_for_disk(&val);
551 pn->keys[0] = ln->keys[0];
552 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
553
554 val = cpu_to_le64(dm_block_location(right));
555 __dm_bless_for_disk(&val);
556 pn->keys[1] = rn->keys[0];
557 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
558
559 /*
 560	 * Rejig the spine. This is ugly, since it knows too
 561	 * much about the spine.
562 */
563 if (s->nodes[0] != new_parent) {
564 unlock_block(s->info, s->nodes[0]);
565 s->nodes[0] = new_parent;
566 }
567 if (key < le64_to_cpu(rn->keys[0])) {
568 unlock_block(s->info, right);
569 s->nodes[1] = left;
570 } else {
571 unlock_block(s->info, left);
572 s->nodes[1] = right;
573 }
574 s->count = 2;
575
576 return 0;
577}
578
579static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
580 struct dm_btree_value_type *vt,
581 uint64_t key, unsigned *index)
582{
583 int r, i = *index, top = 1;
584 struct btree_node *node;
585
586 for (;;) {
587 r = shadow_step(s, root, vt);
588 if (r < 0)
589 return r;
590
591 node = dm_block_data(shadow_current(s));
592
593 /*
 594		 * We have to patch up the parent node. This is ugly, but I don't
 595		 * see a way to do it automatically as part of the spine
 596		 * op.
597 */
 598		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause is unnecessary. */
599 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
600
601 __dm_bless_for_disk(&location);
602 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
603 &location, sizeof(__le64));
604 }
605
606 node = dm_block_data(shadow_current(s));
607
608 if (node->header.nr_entries == node->header.max_entries) {
609 if (top)
610 r = btree_split_beneath(s, key);
611 else
612 r = btree_split_sibling(s, root, i, key);
613
614 if (r < 0)
615 return r;
616 }
617
618 node = dm_block_data(shadow_current(s));
619
620 i = lower_bound(node, key);
621
622 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
623 break;
624
625 if (i < 0) {
626 /* change the bounds on the lowest key */
627 node->keys[0] = cpu_to_le64(key);
628 i = 0;
629 }
630
631 root = value64(node, i);
632 top = 0;
633 }
634
635 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
636 i++;
637
638 *index = i;
639 return 0;
640}
641
642static int insert(struct dm_btree_info *info, dm_block_t root,
643 uint64_t *keys, void *value, dm_block_t *new_root,
644 int *inserted)
645 __dm_written_to_disk(value)
646{
647 int r, need_insert;
648 unsigned level, index = -1, last_level = info->levels - 1;
649 dm_block_t block = root;
650 struct shadow_spine spine;
651 struct btree_node *n;
652 struct dm_btree_value_type le64_type;
653
654 le64_type.context = NULL;
655 le64_type.size = sizeof(__le64);
656 le64_type.inc = NULL;
657 le64_type.dec = NULL;
658 le64_type.equal = NULL;
659
660 init_shadow_spine(&spine, info);
661
662 for (level = 0; level < (info->levels - 1); level++) {
663 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
664 if (r < 0)
665 goto bad;
666
667 n = dm_block_data(shadow_current(&spine));
668 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
669 (le64_to_cpu(n->keys[index]) != keys[level]));
670
671 if (need_insert) {
672 dm_block_t new_tree;
673 __le64 new_le;
674
675 r = dm_btree_empty(info, &new_tree);
676 if (r < 0)
677 goto bad;
678
679 new_le = cpu_to_le64(new_tree);
680 __dm_bless_for_disk(&new_le);
681
682 r = insert_at(sizeof(uint64_t), n, index,
683 keys[level], &new_le);
684 if (r)
685 goto bad;
686 }
687
688 if (level < last_level)
689 block = value64(n, index);
690 }
691
692 r = btree_insert_raw(&spine, block, &info->value_type,
693 keys[level], &index);
694 if (r < 0)
695 goto bad;
696
697 n = dm_block_data(shadow_current(&spine));
698 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
699 (le64_to_cpu(n->keys[index]) != keys[level]));
700
701 if (need_insert) {
702 if (inserted)
703 *inserted = 1;
704
705 r = insert_at(info->value_type.size, n, index,
706 keys[level], value);
707 if (r)
708 goto bad_unblessed;
709 } else {
710 if (inserted)
711 *inserted = 0;
712
713 if (info->value_type.dec &&
714 (!info->value_type.equal ||
715 !info->value_type.equal(
716 info->value_type.context,
717 value_ptr(n, index),
718 value))) {
719 info->value_type.dec(info->value_type.context,
720 value_ptr(n, index));
721 }
722 memcpy_disk(value_ptr(n, index),
723 value, info->value_type.size);
724 }
725
726 *new_root = shadow_root(&spine);
727 exit_shadow_spine(&spine);
728
729 return 0;
730
731bad:
732 __dm_unbless_for_disk(value);
733bad_unblessed:
734 exit_shadow_spine(&spine);
735 return r;
736}
737
738int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
739 uint64_t *keys, void *value, dm_block_t *new_root)
740 __dm_written_to_disk(value)
741{
742 return insert(info, root, keys, value, new_root, NULL);
743}
744EXPORT_SYMBOL_GPL(dm_btree_insert);
745
746int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
747 uint64_t *keys, void *value, dm_block_t *new_root,
748 int *inserted)
749 __dm_written_to_disk(value)
750{
751 return insert(info, root, keys, value, new_root, inserted);
752}
753EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
754
755/*----------------------------------------------------------------*/
756
757static int find_highest_key(struct ro_spine *s, dm_block_t block,
758 uint64_t *result_key, dm_block_t *next_block)
759{
760 int i, r;
761 uint32_t flags;
762
763 do {
764 r = ro_step(s, block);
765 if (r < 0)
766 return r;
767
768 flags = le32_to_cpu(ro_node(s)->header.flags);
769 i = le32_to_cpu(ro_node(s)->header.nr_entries);
770 if (!i)
771 return -ENODATA;
772 else
773 i--;
774
775 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
776 if (next_block || flags & INTERNAL_NODE)
777 block = value64(ro_node(s), i);
778
779 } while (flags & INTERNAL_NODE);
780
781 if (next_block)
782 *next_block = block;
783 return 0;
784}
785
786int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
787 uint64_t *result_keys)
788{
789 int r = 0, count = 0, level;
790 struct ro_spine spine;
791
792 init_ro_spine(&spine, info);
793 for (level = 0; level < info->levels; level++) {
794 r = find_highest_key(&spine, root, result_keys + level,
795 level == info->levels - 1 ? NULL : &root);
796 if (r == -ENODATA) {
797 r = 0;
798 break;
799
800 } else if (r)
801 break;
802
803 count++;
804 }
805 exit_ro_spine(&spine);
806
807 return r ? r : count;
808}
809EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
deleted file mode 100644
index a2cd50441ca..00000000000
--- a/drivers/md/persistent-data/dm-btree.h
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_BTREE_H
7#define _LINUX_DM_BTREE_H
8
9#include "dm-block-manager.h"
10
11struct dm_transaction_manager;
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Annotations used to check on-disk metadata is handled as little-endian.
17 */
18#ifdef __CHECKER__
19# define __dm_written_to_disk(x) __releases(x)
20# define __dm_reads_from_disk(x) __acquires(x)
21# define __dm_bless_for_disk(x) __acquire(x)
22# define __dm_unbless_for_disk(x) __release(x)
23#else
24# define __dm_written_to_disk(x)
25# define __dm_reads_from_disk(x)
26# define __dm_bless_for_disk(x)
27# define __dm_unbless_for_disk(x)
28#endif
29
30/*----------------------------------------------------------------*/
31
32/*
33 * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized
34 * values.
35 */
36
37/*
38 * Information about the values stored within the btree.
39 */
40struct dm_btree_value_type {
41 void *context;
42
43 /*
44 * The size in bytes of each value.
45 */
46 uint32_t size;
47
48 /*
49 * Any of these methods can be safely set to NULL if you do not
50 * need the corresponding feature.
51 */
52
53 /*
54 * The btree is making a duplicate of the value, for instance
55 * because previously-shared btree nodes have now diverged.
 56	 * The @value argument is the new copy, which this method may modify.
57 * (Probably it just wants to increment a reference count
58 * somewhere.) This method is _not_ called for insertion of a new
59 * value: It is assumed the ref count is already 1.
60 */
61 void (*inc)(void *context, void *value);
62
63 /*
64 * This value is being deleted. The btree takes care of freeing
 65	 * the memory pointed to by @value. Often the dec method just
66 * needs to decrement a reference count somewhere.
67 */
68 void (*dec)(void *context, void *value);
69
70 /*
71 * A test for equality between two values. When a value is
72 * overwritten with a new one, the old one has the dec method
73 * called _unless_ the new and old value are deemed equal.
74 */
75 int (*equal)(void *context, void *value1, void *value2);
76};
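/*
 * Illustrative sketch only (not part of the original header): a value type
 * for 8-byte little-endian block pointers whose reference counts live in a
 * transaction manager. The callback names are hypothetical; dm_tm_inc() and
 * dm_tm_dec() are the transaction-manager helpers used elsewhere in this
 * patch. Values may be unaligned, hence the memcpy.
 */
static void example_block_inc(void *context, void *value_le)
{
	struct dm_transaction_manager *tm = context;
	__le64 v;

	memcpy(&v, value_le, sizeof(v));
	dm_tm_inc(tm, le64_to_cpu(v));
}

static void example_block_dec(void *context, void *value_le)
{
	struct dm_transaction_manager *tm = context;
	__le64 v;

	memcpy(&v, value_le, sizeof(v));
	dm_tm_dec(tm, le64_to_cpu(v));
}

/*
 * struct dm_btree_value_type example_vt = {
 *	.context = tm,
 *	.size    = sizeof(__le64),
 *	.inc     = example_block_inc,
 *	.dec     = example_block_dec,
 *	.equal   = NULL,
 * };
 */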
77
78/*
79 * The shape and contents of a btree.
80 */
81struct dm_btree_info {
82 struct dm_transaction_manager *tm;
83
84 /*
85 * Number of nested btrees. (Not the depth of a single tree.)
86 */
87 unsigned levels;
88 struct dm_btree_value_type value_type;
89};
90
91/*
92 * Set up an empty tree. O(1).
93 */
94int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
95
96/*
97 * Delete a tree. O(n) - this is the slow one! It can also block, so
98 * please don't call it on an IO path.
99 */
100int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
101
102/*
103 * All the lookup functions return -ENODATA if the key cannot be found.
104 */
105
106/*
107 * Tries to find a key that matches exactly. O(ln(n))
108 */
109int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
110 uint64_t *keys, void *value_le);
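/*
 * Illustrative sketch only: looking up a value in a two-level btree
 * (info->levels == 2), one key per level. The caller, key names and the
 * assumption of an 8-byte value are hypothetical.
 */
static int example_lookup(struct dm_btree_info *info, dm_block_t root,
			  uint64_t dev_id, uint64_t block, __le64 *result_le)
{
	uint64_t keys[2];

	keys[0] = dev_id;
	keys[1] = block;

	/* Returns -ENODATA if either level lacks the key. */
	return dm_btree_lookup(info, root, keys, result_le);
}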
111
112/*
113 * Insertion (or overwrite an existing value). O(ln(n))
114 */
115int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
116 uint64_t *keys, void *value, dm_block_t *new_root)
117 __dm_written_to_disk(value);
118
119/*
120 * A variant of insert that indicates whether it actually inserted or just
121 * overwrote. Useful if you're keeping track of the number of entries in a
122 * tree.
123 */
124int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
125 uint64_t *keys, void *value, dm_block_t *new_root,
126 int *inserted)
127 __dm_written_to_disk(value);
128
129/*
 130 * Remove a key if present. This doesn't remove empty subtrees. Normally
131 * subtrees represent a separate entity, like a snapshot map, so this is
132 * correct behaviour. O(ln(n)).
133 */
134int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
135 uint64_t *keys, dm_block_t *new_root);
136
137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key.
141 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h
deleted file mode 100644
index c49e26fff36..00000000000
--- a/drivers/md/persistent-data/dm-persistent-data-internal.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _DM_PERSISTENT_DATA_INTERNAL_H
8#define _DM_PERSISTENT_DATA_INTERNAL_H
9
10#include "dm-block-manager.h"
11
12static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask)
13{
14 const unsigned BIG_PRIME = 4294967291UL;
15
16 return (((unsigned) b) * BIG_PRIME) & hash_mask;
17}
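/*
 * Illustrative sketch only: using the multiplicative hash above with a
 * power-of-two table. With 256 buckets the mask is 255, so the bucket index
 * is always 0..255. The bucket count is a hypothetical example.
 */
static inline unsigned example_bucket_for(dm_block_t b)
{
	const unsigned nr_buckets = 256;	/* must be a power of two */

	return dm_hash_block(b, nr_buckets - 1);
}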
18
 19#endif /* _DM_PERSISTENT_DATA_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
deleted file mode 100644
index 3e7a88d99eb..00000000000
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ /dev/null
@@ -1,712 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/bitops.h>
11#include <linux/device-mapper.h>
12
13#define DM_MSG_PREFIX "space map common"
14
15/*----------------------------------------------------------------*/
16
17/*
18 * Index validator.
19 */
20#define INDEX_CSUM_XOR 160478
21
22static void index_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct disk_metadata_index *mi_le = dm_block_data(b);
27
28 mi_le->blocknr = cpu_to_le64(dm_block_location(b));
29 mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
30 block_size - sizeof(__le32),
31 INDEX_CSUM_XOR));
32}
33
34static int index_check(struct dm_block_validator *v,
35 struct dm_block *b,
36 size_t block_size)
37{
38 struct disk_metadata_index *mi_le = dm_block_data(b);
39 __le32 csum_disk;
40
41 if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
42 DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
43 le64_to_cpu(mi_le->blocknr), dm_block_location(b));
44 return -ENOTBLK;
45 }
46
47 csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
48 block_size - sizeof(__le32),
49 INDEX_CSUM_XOR));
50 if (csum_disk != mi_le->csum) {
51 DMERR_LIMIT("index_check failed: csum %u != wanted %u",
52 le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
53 return -EILSEQ;
54 }
55
56 return 0;
57}
58
59static struct dm_block_validator index_validator = {
60 .name = "index",
61 .prepare_for_write = index_prepare_for_write,
62 .check = index_check
63};
64
65/*----------------------------------------------------------------*/
66
67/*
68 * Bitmap validator
69 */
70#define BITMAP_CSUM_XOR 240779
71
72static void bitmap_prepare_for_write(struct dm_block_validator *v,
73 struct dm_block *b,
74 size_t block_size)
75{
76 struct disk_bitmap_header *disk_header = dm_block_data(b);
77
78 disk_header->blocknr = cpu_to_le64(dm_block_location(b));
79 disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
80 block_size - sizeof(__le32),
81 BITMAP_CSUM_XOR));
82}
83
84static int bitmap_check(struct dm_block_validator *v,
85 struct dm_block *b,
86 size_t block_size)
87{
88 struct disk_bitmap_header *disk_header = dm_block_data(b);
89 __le32 csum_disk;
90
91 if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
92 DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
93 le64_to_cpu(disk_header->blocknr), dm_block_location(b));
94 return -ENOTBLK;
95 }
96
97 csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
98 block_size - sizeof(__le32),
99 BITMAP_CSUM_XOR));
100 if (csum_disk != disk_header->csum) {
101 DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
102 le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
103 return -EILSEQ;
104 }
105
106 return 0;
107}
108
109static struct dm_block_validator dm_sm_bitmap_validator = {
110 .name = "sm_bitmap",
111 .prepare_for_write = bitmap_prepare_for_write,
112 .check = bitmap_check
113};
114
115/*----------------------------------------------------------------*/
116
117#define ENTRIES_PER_WORD 32
118#define ENTRIES_SHIFT 5
119
120static void *dm_bitmap_data(struct dm_block *b)
121{
122 return dm_block_data(b) + sizeof(struct disk_bitmap_header);
123}
124
125#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
126
127static unsigned bitmap_word_used(void *addr, unsigned b)
128{
129 __le64 *words_le = addr;
130 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
131
132 uint64_t bits = le64_to_cpu(*w_le);
133 uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
134
135 return !(~bits & mask);
136}
137
138static unsigned sm_lookup_bitmap(void *addr, unsigned b)
139{
140 __le64 *words_le = addr;
141 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
142 unsigned hi, lo;
143
144 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
145 hi = !!test_bit_le(b, (void *) w_le);
146 lo = !!test_bit_le(b + 1, (void *) w_le);
147 return (hi << 1) | lo;
148}
149
150static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
151{
152 __le64 *words_le = addr;
153 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
154
155 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
156
157 if (val & 2)
158 __set_bit_le(b, (void *) w_le);
159 else
160 __clear_bit_le(b, (void *) w_le);
161
162 if (val & 1)
163 __set_bit_le(b + 1, (void *) w_le);
164 else
165 __clear_bit_le(b + 1, (void *) w_le);
166}
167
168static int sm_find_free(void *addr, unsigned begin, unsigned end,
169 unsigned *result)
170{
171 while (begin < end) {
172 if (!(begin & (ENTRIES_PER_WORD - 1)) &&
173 bitmap_word_used(addr, begin)) {
174 begin += ENTRIES_PER_WORD;
175 continue;
176 }
177
178 if (!sm_lookup_bitmap(addr, begin)) {
179 *result = begin;
180 return 0;
181 }
182
183 begin++;
184 }
185
186 return -ENOSPC;
187}
188
189/*----------------------------------------------------------------*/
190
191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
192{
193 ll->tm = tm;
194
195 ll->bitmap_info.tm = tm;
196 ll->bitmap_info.levels = 1;
197
198 /*
199 * Because the new bitmap blocks are created via a shadow
200 * operation, the old entry has already had its reference count
201 * decremented and we don't need the btree to do any bookkeeping.
202 */
203 ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
204 ll->bitmap_info.value_type.inc = NULL;
205 ll->bitmap_info.value_type.dec = NULL;
206 ll->bitmap_info.value_type.equal = NULL;
207
208 ll->ref_count_info.tm = tm;
209 ll->ref_count_info.levels = 1;
210 ll->ref_count_info.value_type.size = sizeof(uint32_t);
211 ll->ref_count_info.value_type.inc = NULL;
212 ll->ref_count_info.value_type.dec = NULL;
213 ll->ref_count_info.value_type.equal = NULL;
214
215 ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
216
217 if (ll->block_size > (1 << 30)) {
218 DMERR("block size too big to hold bitmaps");
219 return -EINVAL;
220 }
221
222 ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
223 ENTRIES_PER_BYTE;
224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0;
227 ll->bitmap_index_changed = false;
228
229 return 0;
230}
231
232int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
233{
234 int r;
235 dm_block_t i, nr_blocks, nr_indexes;
236 unsigned old_blocks, blocks;
237
238 nr_blocks = ll->nr_blocks + extra_blocks;
239 old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
240 blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
241
242 nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block);
243 if (nr_indexes > ll->max_entries(ll)) {
244 DMERR("space map too large");
245 return -EINVAL;
246 }
247
248 for (i = old_blocks; i < blocks; i++) {
249 struct dm_block *b;
250 struct disk_index_entry idx;
251
252 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
253 if (r < 0)
254 return r;
255 idx.blocknr = cpu_to_le64(dm_block_location(b));
256
257 r = dm_tm_unlock(ll->tm, b);
258 if (r < 0)
259 return r;
260
261 idx.nr_free = cpu_to_le32(ll->entries_per_block);
262 idx.none_free_before = 0;
263
264 r = ll->save_ie(ll, i, &idx);
265 if (r < 0)
266 return r;
267 }
268
269 ll->nr_blocks = nr_blocks;
270 return 0;
271}
272
273int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
274{
275 int r;
276 dm_block_t index = b;
277 struct disk_index_entry ie_disk;
278 struct dm_block *blk;
279
280 b = do_div(index, ll->entries_per_block);
281 r = ll->load_ie(ll, index, &ie_disk);
282 if (r < 0)
283 return r;
284
285 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
286 &dm_sm_bitmap_validator, &blk);
287 if (r < 0)
288 return r;
289
290 *result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
291
292 return dm_tm_unlock(ll->tm, blk);
293}
294
295int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
296{
297 __le32 le_rc;
298 int r = sm_ll_lookup_bitmap(ll, b, result);
299
300 if (r)
301 return r;
302
303 if (*result != 3)
304 return r;
305
306 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
307 if (r < 0)
308 return r;
309
310 *result = le32_to_cpu(le_rc);
311
312 return r;
313}
314
315int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
316 dm_block_t end, dm_block_t *result)
317{
318 int r;
319 struct disk_index_entry ie_disk;
320 dm_block_t i, index_begin = begin;
321 dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
322
323 /*
324 * FIXME: Use shifts
325 */
326 begin = do_div(index_begin, ll->entries_per_block);
327 end = do_div(end, ll->entries_per_block);
328
329 for (i = index_begin; i < index_end; i++, begin = 0) {
330 struct dm_block *blk;
331 unsigned position;
332 uint32_t bit_end;
333
334 r = ll->load_ie(ll, i, &ie_disk);
335 if (r < 0)
336 return r;
337
338 if (le32_to_cpu(ie_disk.nr_free) == 0)
339 continue;
340
341 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
342 &dm_sm_bitmap_validator, &blk);
343 if (r < 0)
344 return r;
345
346 bit_end = (i == index_end - 1) ? end : ll->entries_per_block;
347
348 r = sm_find_free(dm_bitmap_data(blk),
349 max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
350 bit_end, &position);
351 if (r == -ENOSPC) {
352 /*
353 * This might happen because we started searching
354 * part way through the bitmap.
355 */
356 dm_tm_unlock(ll->tm, blk);
357 continue;
358
359 } else if (r < 0) {
360 dm_tm_unlock(ll->tm, blk);
361 return r;
362 }
363
364 r = dm_tm_unlock(ll->tm, blk);
365 if (r < 0)
366 return r;
367
368 *result = i * ll->entries_per_block + (dm_block_t) position;
369 return 0;
370 }
371
372 return -ENOSPC;
373}
374
375int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
376 uint32_t ref_count, enum allocation_event *ev)
377{
378 int r;
379 uint32_t bit, old;
380 struct dm_block *nb;
381 dm_block_t index = b;
382 struct disk_index_entry ie_disk;
383 void *bm_le;
384 int inc;
385
386 bit = do_div(index, ll->entries_per_block);
387 r = ll->load_ie(ll, index, &ie_disk);
388 if (r < 0)
389 return r;
390
391 r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
392 &dm_sm_bitmap_validator, &nb, &inc);
393 if (r < 0) {
394 DMERR("dm_tm_shadow_block() failed");
395 return r;
396 }
397 ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
398
399 bm_le = dm_bitmap_data(nb);
400 old = sm_lookup_bitmap(bm_le, bit);
401
402 if (ref_count <= 2) {
403 sm_set_bitmap(bm_le, bit, ref_count);
404
405 r = dm_tm_unlock(ll->tm, nb);
406 if (r < 0)
407 return r;
408
409 if (old > 2) {
410 r = dm_btree_remove(&ll->ref_count_info,
411 ll->ref_count_root,
412 &b, &ll->ref_count_root);
413 if (r)
414 return r;
415 }
416
417 } else {
418 __le32 le_rc = cpu_to_le32(ref_count);
419
420 sm_set_bitmap(bm_le, bit, 3);
421 r = dm_tm_unlock(ll->tm, nb);
422 if (r < 0)
423 return r;
424
425 __dm_bless_for_disk(&le_rc);
426 r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
427 &b, &le_rc, &ll->ref_count_root);
428 if (r < 0) {
429 DMERR("ref count insert failed");
430 return r;
431 }
432 }
433
434 if (ref_count && !old) {
435 *ev = SM_ALLOC;
436 ll->nr_allocated++;
437 le32_add_cpu(&ie_disk.nr_free, -1);
438 if (le32_to_cpu(ie_disk.none_free_before) == bit)
439 ie_disk.none_free_before = cpu_to_le32(bit + 1);
440
441 } else if (old && !ref_count) {
442 *ev = SM_FREE;
443 ll->nr_allocated--;
444 le32_add_cpu(&ie_disk.nr_free, 1);
445 ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
446 }
447
448 return ll->save_ie(ll, index, &ie_disk);
449}
450
451int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
452{
453 int r;
454 uint32_t rc;
455
456 r = sm_ll_lookup(ll, b, &rc);
457 if (r)
458 return r;
459
460 return sm_ll_insert(ll, b, rc + 1, ev);
461}
462
463int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
464{
465 int r;
466 uint32_t rc;
467
468 r = sm_ll_lookup(ll, b, &rc);
469 if (r)
470 return r;
471
472 if (!rc)
473 return -EINVAL;
474
475 return sm_ll_insert(ll, b, rc - 1, ev);
476}
477
478int sm_ll_commit(struct ll_disk *ll)
479{
480 int r = 0;
481
482 if (ll->bitmap_index_changed) {
483 r = ll->commit(ll);
484 if (!r)
485 ll->bitmap_index_changed = false;
486 }
487
488 return r;
489}
490
491/*----------------------------------------------------------------*/
492
493static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
494 struct disk_index_entry *ie)
495{
496 memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
497 return 0;
498}
499
500static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
501 struct disk_index_entry *ie)
502{
503 ll->bitmap_index_changed = true;
504 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
505 return 0;
506}
507
508static int metadata_ll_init_index(struct ll_disk *ll)
509{
510 int r;
511 struct dm_block *b;
512
513 r = dm_tm_new_block(ll->tm, &index_validator, &b);
514 if (r < 0)
515 return r;
516
517 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
518 ll->bitmap_root = dm_block_location(b);
519
520 return dm_tm_unlock(ll->tm, b);
521}
522
523static int metadata_ll_open(struct ll_disk *ll)
524{
525 int r;
526 struct dm_block *block;
527
528 r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
529 &index_validator, &block);
530 if (r)
531 return r;
532
533 memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
534 return dm_tm_unlock(ll->tm, block);
535}
536
537static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
538{
539 return MAX_METADATA_BITMAPS;
540}
541
542static int metadata_ll_commit(struct ll_disk *ll)
543{
544 int r, inc;
545 struct dm_block *b;
546
547 r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
548 if (r)
549 return r;
550
551 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
552 ll->bitmap_root = dm_block_location(b);
553
554 return dm_tm_unlock(ll->tm, b);
555}
556
557int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
558{
559 int r;
560
561 r = sm_ll_init(ll, tm);
562 if (r < 0)
563 return r;
564
565 ll->load_ie = metadata_ll_load_ie;
566 ll->save_ie = metadata_ll_save_ie;
567 ll->init_index = metadata_ll_init_index;
568 ll->open_index = metadata_ll_open;
569 ll->max_entries = metadata_ll_max_entries;
570 ll->commit = metadata_ll_commit;
571
572 ll->nr_blocks = 0;
573 ll->nr_allocated = 0;
574
575 r = ll->init_index(ll);
576 if (r < 0)
577 return r;
578
579 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
580 if (r < 0)
581 return r;
582
583 return 0;
584}
585
586int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
587 void *root_le, size_t len)
588{
589 int r;
590 struct disk_sm_root *smr = root_le;
591
592 if (len < sizeof(struct disk_sm_root)) {
593 DMERR("sm_metadata root too small");
594 return -ENOMEM;
595 }
596
597 r = sm_ll_init(ll, tm);
598 if (r < 0)
599 return r;
600
601 ll->load_ie = metadata_ll_load_ie;
602 ll->save_ie = metadata_ll_save_ie;
603 ll->init_index = metadata_ll_init_index;
604 ll->open_index = metadata_ll_open;
605 ll->max_entries = metadata_ll_max_entries;
606 ll->commit = metadata_ll_commit;
607
608 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
609 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
610 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
611 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
612
613 return ll->open_index(ll);
614}
615
616/*----------------------------------------------------------------*/
617
618static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
619 struct disk_index_entry *ie)
620{
621 return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
622}
623
624static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
625 struct disk_index_entry *ie)
626{
627 __dm_bless_for_disk(ie);
628 return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
629 &index, ie, &ll->bitmap_root);
630}
631
632static int disk_ll_init_index(struct ll_disk *ll)
633{
634 return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
635}
636
637static int disk_ll_open(struct ll_disk *ll)
638{
639 /* nothing to do */
640 return 0;
641}
642
643static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
644{
645 return -1ULL;
646}
647
648static int disk_ll_commit(struct ll_disk *ll)
649{
650 return 0;
651}
652
653int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
654{
655 int r;
656
657 r = sm_ll_init(ll, tm);
658 if (r < 0)
659 return r;
660
661 ll->load_ie = disk_ll_load_ie;
662 ll->save_ie = disk_ll_save_ie;
663 ll->init_index = disk_ll_init_index;
664 ll->open_index = disk_ll_open;
665 ll->max_entries = disk_ll_max_entries;
666 ll->commit = disk_ll_commit;
667
668 ll->nr_blocks = 0;
669 ll->nr_allocated = 0;
670
671 r = ll->init_index(ll);
672 if (r < 0)
673 return r;
674
675 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
676 if (r < 0)
677 return r;
678
679 return 0;
680}
681
682int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
683 void *root_le, size_t len)
684{
685 int r;
686 struct disk_sm_root *smr = root_le;
687
688 if (len < sizeof(struct disk_sm_root)) {
689		DMERR("sm_disk root too small");
690 return -ENOMEM;
691 }
692
693 r = sm_ll_init(ll, tm);
694 if (r < 0)
695 return r;
696
697 ll->load_ie = disk_ll_load_ie;
698 ll->save_ie = disk_ll_save_ie;
699 ll->init_index = disk_ll_init_index;
700 ll->open_index = disk_ll_open;
701 ll->max_entries = disk_ll_max_entries;
702 ll->commit = disk_ll_commit;
703
704 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
705 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
706 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
707 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
708
709 return ll->open_index(ll);
710}
711
712/*----------------------------------------------------------------*/
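
The heart of the file above is the two-bit packing handled by sm_lookup_bitmap() and sm_set_bitmap(): counts 0, 1 and 2 live directly in the bitmap, and the value 3 means "look the real count up in the ref-count btree". The sketch below restates that packing on a plain uint64_t word; it is an illustration only and deliberately ignores the little-endian bit ordering that the kernel helpers take care of.

/* Userspace-style illustration of 32 two-bit entries per 64-bit word. */
#include <stdint.h>

static unsigned lookup2(uint64_t word, unsigned entry)
{
	unsigned shift = (entry & 31) * 2;

	return (word >> shift) & 3;		/* 3 == overflow to ref-count btree */
}

static uint64_t set2(uint64_t word, unsigned entry, unsigned count)
{
	unsigned shift = (entry & 31) * 2;

	word &= ~((uint64_t)3 << shift);	/* clear the old two bits */
	return word | ((uint64_t)(count & 3) << shift);
}
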
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
deleted file mode 100644
index b3078d5eda0..00000000000
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_COMMON_H
8#define DM_SPACE_MAP_COMMON_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Low level disk format
16 *
17 * Bitmap btree
18 * ------------
19 *
20 * Each value stored in the btree is an index_entry. This points to a
21 * block that is used as a bitmap. Within the bitmap we hold 2 bits per
22 * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and
23 * REF_COUNT = many.
24 *
25 * Refcount btree
26 * --------------
27 *
28 * Any entry that has a ref count higher than 2 gets entered in the ref
29 * count tree. The leaf value for this tree is the 32-bit ref count.
30 */
31
32struct disk_index_entry {
33 __le64 blocknr;
34 __le32 nr_free;
35 __le32 none_free_before;
36} __packed;
37
38
39#define MAX_METADATA_BITMAPS 255
40struct disk_metadata_index {
41 __le32 csum;
42 __le32 padding;
43 __le64 blocknr;
44
45 struct disk_index_entry index[MAX_METADATA_BITMAPS];
46} __packed;
47
48struct ll_disk;
49
50typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
51typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
52typedef int (*init_index_fn)(struct ll_disk *ll);
53typedef int (*open_index_fn)(struct ll_disk *ll);
54typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
55typedef int (*commit_fn)(struct ll_disk *ll);
56
57struct ll_disk {
58 struct dm_transaction_manager *tm;
59 struct dm_btree_info bitmap_info;
60 struct dm_btree_info ref_count_info;
61
62 uint32_t block_size;
63 uint32_t entries_per_block;
64 dm_block_t nr_blocks;
65 dm_block_t nr_allocated;
66
67 /*
68 * bitmap_root may be a btree root or a simple index.
69 */
70 dm_block_t bitmap_root;
71
72 dm_block_t ref_count_root;
73
74 struct disk_metadata_index mi_le;
75 load_ie_fn load_ie;
76 save_ie_fn save_ie;
77 init_index_fn init_index;
78 open_index_fn open_index;
79 max_index_entries_fn max_entries;
80 commit_fn commit;
81 bool bitmap_index_changed:1;
82};
83
84struct disk_sm_root {
85 __le64 nr_blocks;
86 __le64 nr_allocated;
87 __le64 bitmap_root;
88 __le64 ref_count_root;
89} __packed;
90
91#define ENTRIES_PER_BYTE 4
92
93struct disk_bitmap_header {
94 __le32 csum;
95 __le32 not_used;
96 __le64 blocknr;
97} __packed;
98
99enum allocation_event {
100 SM_NONE,
101 SM_ALLOC,
102 SM_FREE,
103};
104
105/*----------------------------------------------------------------*/
106
107int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
108int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
109int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
110int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
111 dm_block_t end, dm_block_t *result);
112int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
113int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
114int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
115int sm_ll_commit(struct ll_disk *ll);
116
117int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
118int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
119 void *root_le, size_t len);
120
121int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
122int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
123 void *root_le, size_t len);
124
125/*----------------------------------------------------------------*/
126
127#endif /* DM_SPACE_MAP_COMMON_H */
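
With ENTRIES_PER_BYTE fixed at 4, the number of reference counts a single bitmap block can hold follows directly from the block size minus the disk_bitmap_header, which is exactly the calculation sm_ll_init() performs. A worked example, with the 4096-byte block size as an assumption:

/* Assumed 4096-byte blocks: the 16-byte header leaves 4080 bytes,
 * i.e. 4080 * ENTRIES_PER_BYTE = 16320 counts per bitmap block. */
uint32_t block_size = 4096;
uint32_t entries_per_block =
	(block_size - sizeof(struct disk_bitmap_header)) * ENTRIES_PER_BYTE;
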
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
deleted file mode 100644
index f6d29e614ab..00000000000
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ /dev/null
@@ -1,318 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-space-map-disk.h"
9#include "dm-space-map.h"
10#include "dm-transaction-manager.h"
11
12#include <linux/list.h>
13#include <linux/slab.h>
14#include <linux/export.h>
15#include <linux/device-mapper.h>
16
17#define DM_MSG_PREFIX "space map disk"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * Space map interface.
23 */
24struct sm_disk {
25 struct dm_space_map sm;
26
27 struct ll_disk ll;
28 struct ll_disk old_ll;
29
30 dm_block_t begin;
31 dm_block_t nr_allocated_this_transaction;
32};
33
34static void sm_disk_destroy(struct dm_space_map *sm)
35{
36 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
37
38 kfree(smd);
39}
40
41static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
42{
43 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
44
45 return sm_ll_extend(&smd->ll, extra_blocks);
46}
47
48static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
49{
50 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
51 *count = smd->old_ll.nr_blocks;
52
53 return 0;
54}
55
56static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
57{
58 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
59 *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
60
61 return 0;
62}
63
64static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
65 uint32_t *result)
66{
67 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
68 return sm_ll_lookup(&smd->ll, b, result);
69}
70
71static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
72 int *result)
73{
74 int r;
75 uint32_t count;
76
77 r = sm_disk_get_count(sm, b, &count);
78 if (r)
79 return r;
80
81 return count > 1;
82}
83
84static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
85 uint32_t count)
86{
87 int r;
88 uint32_t old_count;
89 enum allocation_event ev;
90 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
91
92 r = sm_ll_insert(&smd->ll, b, count, &ev);
93 if (!r) {
94 switch (ev) {
95 case SM_NONE:
96 break;
97
98 case SM_ALLOC:
99 /*
100 * This _must_ be free in the prior transaction
101 * otherwise we've lost atomicity.
102 */
103 smd->nr_allocated_this_transaction++;
104 break;
105
106 case SM_FREE:
107 /*
108 * It's only free if it's also free in the last
109 * transaction.
110 */
111 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
112 if (r)
113 return r;
114
115 if (!old_count)
116 smd->nr_allocated_this_transaction--;
117 break;
118 }
119 }
120
121 return r;
122}
123
124static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
125{
126 int r;
127 enum allocation_event ev;
128 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
129
130 r = sm_ll_inc(&smd->ll, b, &ev);
131 if (!r && (ev == SM_ALLOC))
132 /*
133 * This _must_ be free in the prior transaction
134 * otherwise we've lost atomicity.
135 */
136 smd->nr_allocated_this_transaction++;
137
138 return r;
139}
140
141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
142{
143 int r;
144 uint32_t old_count;
145 enum allocation_event ev;
146 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
147
148 r = sm_ll_dec(&smd->ll, b, &ev);
149 if (!r && (ev == SM_FREE)) {
150 /*
151 * It's only free if it's also free in the last
152 * transaction.
153 */
154 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
155 if (r)
156 return r;
157
158 if (!old_count)
159 smd->nr_allocated_this_transaction--;
160 }
161
162 return r;
163}
164
165static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
166{
167 int r;
168 enum allocation_event ev;
169 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
170
171 /* FIXME: we should loop round a couple of times */
172 r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
173 if (r)
174 return r;
175
176 smd->begin = *b + 1;
177 r = sm_ll_inc(&smd->ll, *b, &ev);
178 if (!r) {
179 BUG_ON(ev != SM_ALLOC);
180 smd->nr_allocated_this_transaction++;
181 }
182
183 return r;
184}
185
186static int sm_disk_commit(struct dm_space_map *sm)
187{
188 int r;
189 dm_block_t nr_free;
190 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
191
192 r = sm_disk_get_nr_free(sm, &nr_free);
193 if (r)
194 return r;
195
196 r = sm_ll_commit(&smd->ll);
197 if (r)
198 return r;
199
200 memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
201 smd->begin = 0;
202 smd->nr_allocated_this_transaction = 0;
203
204 r = sm_disk_get_nr_free(sm, &nr_free);
205 if (r)
206 return r;
207
208 return 0;
209}
210
211static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
212{
213 *result = sizeof(struct disk_sm_root);
214
215 return 0;
216}
217
218static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
219{
220 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
221 struct disk_sm_root root_le;
222
223 root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
224 root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
225 root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
226 root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
227
228 if (max < sizeof(root_le))
229 return -ENOSPC;
230
231 memcpy(where_le, &root_le, sizeof(root_le));
232
233 return 0;
234}
235
236/*----------------------------------------------------------------*/
237
238static struct dm_space_map ops = {
239 .destroy = sm_disk_destroy,
240 .extend = sm_disk_extend,
241 .get_nr_blocks = sm_disk_get_nr_blocks,
242 .get_nr_free = sm_disk_get_nr_free,
243 .get_count = sm_disk_get_count,
244 .count_is_more_than_one = sm_disk_count_is_more_than_one,
245 .set_count = sm_disk_set_count,
246 .inc_block = sm_disk_inc_block,
247 .dec_block = sm_disk_dec_block,
248 .new_block = sm_disk_new_block,
249 .commit = sm_disk_commit,
250 .root_size = sm_disk_root_size,
251 .copy_root = sm_disk_copy_root
252};
253
254struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
255 dm_block_t nr_blocks)
256{
257 int r;
258 struct sm_disk *smd;
259
260 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
261 if (!smd)
262 return ERR_PTR(-ENOMEM);
263
264 smd->begin = 0;
265 smd->nr_allocated_this_transaction = 0;
266 memcpy(&smd->sm, &ops, sizeof(smd->sm));
267
268 r = sm_ll_new_disk(&smd->ll, tm);
269 if (r)
270 goto bad;
271
272 r = sm_ll_extend(&smd->ll, nr_blocks);
273 if (r)
274 goto bad;
275
276 r = sm_disk_commit(&smd->sm);
277 if (r)
278 goto bad;
279
280 return &smd->sm;
281
282bad:
283 kfree(smd);
284 return ERR_PTR(r);
285}
286EXPORT_SYMBOL_GPL(dm_sm_disk_create);
287
288struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
289 void *root_le, size_t len)
290{
291 int r;
292 struct sm_disk *smd;
293
294 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
295 if (!smd)
296 return ERR_PTR(-ENOMEM);
297
298 smd->begin = 0;
299 smd->nr_allocated_this_transaction = 0;
300 memcpy(&smd->sm, &ops, sizeof(smd->sm));
301
302 r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
303 if (r)
304 goto bad;
305
306 r = sm_disk_commit(&smd->sm);
307 if (r)
308 goto bad;
309
310 return &smd->sm;
311
312bad:
313 kfree(smd);
314 return ERR_PTR(r);
315}
316EXPORT_SYMBOL_GPL(dm_sm_disk_open);
317
318/*----------------------------------------------------------------*/
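
A hedged sketch of the lifecycle the disk space map above expects: create it against an existing transaction manager, allocate, then commit so the allocation is reflected in the next transaction's free count. The function name and the 128-block size are invented for the example, and error handling is trimmed.

static int disk_sm_example(struct dm_transaction_manager *tm)
{
	struct dm_space_map *sm;
	dm_block_t b;
	int r;

	sm = dm_sm_disk_create(tm, 128);	/* arbitrary example size */
	if (IS_ERR(sm))
		return PTR_ERR(sm);

	r = dm_sm_new_block(sm, &b);		/* allocates and takes the first ref */
	if (!r)
		r = dm_sm_commit(sm);		/* allocations only settle at commit */

	dm_sm_destroy(sm);
	return r;
}
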
diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h
deleted file mode 100644
index 447a0a9a2d9..00000000000
--- a/drivers/md/persistent-data/dm-space-map-disk.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_DISK_H
8#define _LINUX_DM_SPACE_MAP_DISK_H
9
10#include "dm-block-manager.h"
11
12struct dm_space_map;
13struct dm_transaction_manager;
14
15/*
16 * Unfortunately we have to use two-phase construction due to the cycle
17 * between the tm and sm.
18 */
19struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
20 dm_block_t nr_blocks);
21
22struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
23 void *root, size_t len);
24
25#endif /* _LINUX_DM_SPACE_MAP_DISK_H */
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
deleted file mode 100644
index 906cf3df71a..00000000000
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ /dev/null
@@ -1,596 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-metadata.h"
10
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/device-mapper.h>
14
15#define DM_MSG_PREFIX "space map metadata"
16
17/*----------------------------------------------------------------*/
18
19/*
20 * Space map interface.
21 *
22 * The low level disk format is written using the standard btree and
23 * transaction manager. This means that performing disk operations may
24 * cause us to recurse into the space map in order to allocate new blocks.
25 * For this reason we have a pool of pre-allocated blocks large enough to
26 * service any metadata_ll_disk operation.
27 */
28
29/*
30 * FIXME: we should calculate this based on the size of the device.
31 * Only the metadata space map needs this functionality.
32 */
33#define MAX_RECURSIVE_ALLOCATIONS 1024
34
35enum block_op_type {
36 BOP_INC,
37 BOP_DEC
38};
39
40struct block_op {
41 enum block_op_type type;
42 dm_block_t block;
43};
44
45struct sm_metadata {
46 struct dm_space_map sm;
47
48 struct ll_disk ll;
49 struct ll_disk old_ll;
50
51 dm_block_t begin;
52
53 unsigned recursion_count;
54 unsigned allocated_this_transaction;
55 unsigned nr_uncommitted;
56 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
57};
58
59static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
60{
61 struct block_op *op;
62
63 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
64 DMERR("too many recursive allocations");
65 return -ENOMEM;
66 }
67
68 op = smm->uncommitted + smm->nr_uncommitted++;
69 op->type = type;
70 op->block = b;
71
72 return 0;
73}
74
75static int commit_bop(struct sm_metadata *smm, struct block_op *op)
76{
77 int r = 0;
78 enum allocation_event ev;
79
80 switch (op->type) {
81 case BOP_INC:
82 r = sm_ll_inc(&smm->ll, op->block, &ev);
83 break;
84
85 case BOP_DEC:
86 r = sm_ll_dec(&smm->ll, op->block, &ev);
87 break;
88 }
89
90 return r;
91}
92
93static void in(struct sm_metadata *smm)
94{
95 smm->recursion_count++;
96}
97
98static int out(struct sm_metadata *smm)
99{
100 int r = 0;
101
102 /*
103 * If we're not recursing then very bad things are happening.
104 */
105 if (!smm->recursion_count) {
106 DMERR("lost track of recursion depth");
107 return -ENOMEM;
108 }
109
110 if (smm->recursion_count == 1 && smm->nr_uncommitted) {
111 while (smm->nr_uncommitted && !r) {
112 smm->nr_uncommitted--;
113 r = commit_bop(smm, smm->uncommitted +
114 smm->nr_uncommitted);
115 if (r)
116 break;
117 }
118 }
119
120 smm->recursion_count--;
121
122 return r;
123}
124
125/*
126 * When using the out() function above, we often want to combine an error
127 * code for the operation run in the recursive context with that from
128 * out().
129 */
130static int combine_errors(int r1, int r2)
131{
132 return r1 ? r1 : r2;
133}
134
135static int recursing(struct sm_metadata *smm)
136{
137 return smm->recursion_count;
138}
139
140static void sm_metadata_destroy(struct dm_space_map *sm)
141{
142 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
143
144 kfree(smm);
145}
146
147static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
148{
149 DMERR("doesn't support extend");
150 return -EINVAL;
151}
152
153static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
154{
155 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
156
157 *count = smm->ll.nr_blocks;
158
159 return 0;
160}
161
162static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
163{
164 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
165
166 *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated -
167 smm->allocated_this_transaction;
168
169 return 0;
170}
171
172static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
173 uint32_t *result)
174{
175 int r, i;
176 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
177 unsigned adjustment = 0;
178
179 /*
180 * We may have some uncommitted adjustments to add. This list
181 * should always be really short.
182 */
183 for (i = 0; i < smm->nr_uncommitted; i++) {
184 struct block_op *op = smm->uncommitted + i;
185
186 if (op->block != b)
187 continue;
188
189 switch (op->type) {
190 case BOP_INC:
191 adjustment++;
192 break;
193
194 case BOP_DEC:
195 adjustment--;
196 break;
197 }
198 }
199
200 r = sm_ll_lookup(&smm->ll, b, result);
201 if (r)
202 return r;
203
204 *result += adjustment;
205
206 return 0;
207}
208
209static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
210 dm_block_t b, int *result)
211{
212 int r, i, adjustment = 0;
213 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
214 uint32_t rc;
215
216 /*
217 * We may have some uncommitted adjustments to add. This list
218 * should always be really short.
219 */
220 for (i = 0; i < smm->nr_uncommitted; i++) {
221 struct block_op *op = smm->uncommitted + i;
222
223 if (op->block != b)
224 continue;
225
226 switch (op->type) {
227 case BOP_INC:
228 adjustment++;
229 break;
230
231 case BOP_DEC:
232 adjustment--;
233 break;
234 }
235 }
236
237 if (adjustment > 1) {
238 *result = 1;
239 return 0;
240 }
241
242 r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
243 if (r)
244 return r;
245
246 if (rc == 3)
247 /*
248 * We err on the side of caution, and always return true.
249 */
250 *result = 1;
251 else
252 *result = rc + adjustment > 1;
253
254 return 0;
255}
256
257static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
258 uint32_t count)
259{
260 int r, r2;
261 enum allocation_event ev;
262 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
263
264 if (smm->recursion_count) {
265 DMERR("cannot recurse set_count()");
266 return -EINVAL;
267 }
268
269 in(smm);
270 r = sm_ll_insert(&smm->ll, b, count, &ev);
271 r2 = out(smm);
272
273 return combine_errors(r, r2);
274}
275
276static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
277{
278 int r, r2 = 0;
279 enum allocation_event ev;
280 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
281
282 if (recursing(smm))
283 r = add_bop(smm, BOP_INC, b);
284 else {
285 in(smm);
286 r = sm_ll_inc(&smm->ll, b, &ev);
287 r2 = out(smm);
288 }
289
290 return combine_errors(r, r2);
291}
292
293static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
294{
295 int r, r2 = 0;
296 enum allocation_event ev;
297 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
298
299 if (recursing(smm))
300 r = add_bop(smm, BOP_DEC, b);
301 else {
302 in(smm);
303 r = sm_ll_dec(&smm->ll, b, &ev);
304 r2 = out(smm);
305 }
306
307 return combine_errors(r, r2);
308}
309
310static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
311{
312 int r, r2 = 0;
313 enum allocation_event ev;
314 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
315
316 r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
317 if (r)
318 return r;
319
320 smm->begin = *b + 1;
321
322 if (recursing(smm))
323 r = add_bop(smm, BOP_INC, *b);
324 else {
325 in(smm);
326 r = sm_ll_inc(&smm->ll, *b, &ev);
327 r2 = out(smm);
328 }
329
330 if (!r)
331 smm->allocated_this_transaction++;
332
333 return combine_errors(r, r2);
334}
335
336static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
337{
338 int r = sm_metadata_new_block_(sm, b);
339 if (r)
340 DMERR("unable to allocate new metadata block");
341 return r;
342}
343
344static int sm_metadata_commit(struct dm_space_map *sm)
345{
346 int r;
347 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
348
349 r = sm_ll_commit(&smm->ll);
350 if (r)
351 return r;
352
353 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
354 smm->begin = 0;
355 smm->allocated_this_transaction = 0;
356
357 return 0;
358}
359
360static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
361{
362 *result = sizeof(struct disk_sm_root);
363
364 return 0;
365}
366
367static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
368{
369 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
370 struct disk_sm_root root_le;
371
372 root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
373 root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
374 root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
375 root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
376
377 if (max < sizeof(root_le))
378 return -ENOSPC;
379
380 memcpy(where_le, &root_le, sizeof(root_le));
381
382 return 0;
383}
384
385static struct dm_space_map ops = {
386 .destroy = sm_metadata_destroy,
387 .extend = sm_metadata_extend,
388 .get_nr_blocks = sm_metadata_get_nr_blocks,
389 .get_nr_free = sm_metadata_get_nr_free,
390 .get_count = sm_metadata_get_count,
391 .count_is_more_than_one = sm_metadata_count_is_more_than_one,
392 .set_count = sm_metadata_set_count,
393 .inc_block = sm_metadata_inc_block,
394 .dec_block = sm_metadata_dec_block,
395 .new_block = sm_metadata_new_block,
396 .commit = sm_metadata_commit,
397 .root_size = sm_metadata_root_size,
398 .copy_root = sm_metadata_copy_root
399};
400
401/*----------------------------------------------------------------*/
402
403/*
404 * When a new space map is created that manages its own space, we use
405 * this tiny bootstrap allocator.
406 */
407static void sm_bootstrap_destroy(struct dm_space_map *sm)
408{
409}
410
411static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
412{
413	DMERR("bootstrap doesn't support extend");
414
415 return -EINVAL;
416}
417
418static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
419{
420 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
421
422 return smm->ll.nr_blocks;
423}
424
425static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
426{
427 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
428
429 *count = smm->ll.nr_blocks - smm->begin;
430
431 return 0;
432}
433
434static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
435 uint32_t *result)
436{
437 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
438
439 return b < smm->begin ? 1 : 0;
440}
441
442static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
443 dm_block_t b, int *result)
444{
445 *result = 0;
446
447 return 0;
448}
449
450static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
451 uint32_t count)
452{
453	DMERR("bootstrap doesn't support set_count");
454
455 return -EINVAL;
456}
457
458static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
459{
460 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
461
462 /*
463 * We know the entire device is unused.
464 */
465 if (smm->begin == smm->ll.nr_blocks)
466 return -ENOSPC;
467
468 *b = smm->begin++;
469
470 return 0;
471}
472
473static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
474{
475 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
476
477 return add_bop(smm, BOP_INC, b);
478}
479
480static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
481{
482 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
483
484 return add_bop(smm, BOP_DEC, b);
485}
486
487static int sm_bootstrap_commit(struct dm_space_map *sm)
488{
489 return 0;
490}
491
492static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
493{
494	DMERR("bootstrap doesn't support root_size");
495
496 return -EINVAL;
497}
498
499static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
500 size_t max)
501{
502	DMERR("bootstrap doesn't support copy_root");
503
504 return -EINVAL;
505}
506
507static struct dm_space_map bootstrap_ops = {
508 .destroy = sm_bootstrap_destroy,
509 .extend = sm_bootstrap_extend,
510 .get_nr_blocks = sm_bootstrap_get_nr_blocks,
511 .get_nr_free = sm_bootstrap_get_nr_free,
512 .get_count = sm_bootstrap_get_count,
513 .count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
514 .set_count = sm_bootstrap_set_count,
515 .inc_block = sm_bootstrap_inc_block,
516 .dec_block = sm_bootstrap_dec_block,
517 .new_block = sm_bootstrap_new_block,
518 .commit = sm_bootstrap_commit,
519 .root_size = sm_bootstrap_root_size,
520 .copy_root = sm_bootstrap_copy_root
521};
522
523/*----------------------------------------------------------------*/
524
525struct dm_space_map *dm_sm_metadata_init(void)
526{
527 struct sm_metadata *smm;
528
529 smm = kmalloc(sizeof(*smm), GFP_KERNEL);
530 if (!smm)
531 return ERR_PTR(-ENOMEM);
532
533 memcpy(&smm->sm, &ops, sizeof(smm->sm));
534
535 return &smm->sm;
536}
537
538int dm_sm_metadata_create(struct dm_space_map *sm,
539 struct dm_transaction_manager *tm,
540 dm_block_t nr_blocks,
541 dm_block_t superblock)
542{
543 int r;
544 dm_block_t i;
545 enum allocation_event ev;
546 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
547
548 smm->begin = superblock + 1;
549 smm->recursion_count = 0;
550 smm->allocated_this_transaction = 0;
551 smm->nr_uncommitted = 0;
552
553 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
554
555 r = sm_ll_new_metadata(&smm->ll, tm);
556 if (r)
557 return r;
558
559 r = sm_ll_extend(&smm->ll, nr_blocks);
560 if (r)
561 return r;
562
563 memcpy(&smm->sm, &ops, sizeof(smm->sm));
564
565 /*
566 * Now we need to update the newly created data structures with the
567 * allocated blocks that they were built from.
568 */
569 for (i = superblock; !r && i < smm->begin; i++)
570 r = sm_ll_inc(&smm->ll, i, &ev);
571
572 if (r)
573 return r;
574
575 return sm_metadata_commit(sm);
576}
577
578int dm_sm_metadata_open(struct dm_space_map *sm,
579 struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
584
585 r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
586 if (r)
587 return r;
588
589 smm->begin = 0;
590 smm->recursion_count = 0;
591 smm->allocated_this_transaction = 0;
592 smm->nr_uncommitted = 0;
593
594 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
595 return 0;
596}
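
The mechanism worth calling out above is the recursion guard: in() and out() bracket every top-level operation, and while the space map is recursing into itself it only queues BOP_INC/BOP_DEC entries, which the outermost out() then replays. Below is a distilled restatement of that pattern; every name in it is invented for the illustration, and the fixed-size queue stands in for MAX_RECURSIVE_ALLOCATIONS.

/* Defer-while-recursing, in miniature. */
struct deferred {
	int depth;
	int nr_ops;
	int ops[16];			/* the real code bounds this and errors out */
};

static void run_op(struct deferred *d, int op);

static void apply_op(struct deferred *d, int op)
{
	/* stand-in for sm_ll_inc()/sm_ll_dec(); may call run_op() again */
}

static void run_op(struct deferred *d, int op)
{
	if (d->depth) {			/* recursing: just remember the op */
		d->ops[d->nr_ops++] = op;
		return;
	}

	d->depth++;			/* in() */
	apply_op(d, op);		/* may recurse and queue more ops */
	while (d->nr_ops)		/* out(): replay what recursion queued */
		apply_op(d, d->ops[--d->nr_ops]);
	d->depth--;
}
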
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
deleted file mode 100644
index 39bba0801cf..00000000000
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_METADATA_H
8#define DM_SPACE_MAP_METADATA_H
9
10#include "dm-transaction-manager.h"
11
12/*
13 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm.
15 */
16struct dm_space_map *dm_sm_metadata_init(void);
17
18/*
19 * Create a fresh space map.
20 */
21int dm_sm_metadata_create(struct dm_space_map *sm,
22 struct dm_transaction_manager *tm,
23 dm_block_t nr_blocks,
24 dm_block_t superblock);
25
26/*
27 * Open from a previously-recorded root.
28 */
29int dm_sm_metadata_open(struct dm_space_map *sm,
30 struct dm_transaction_manager *tm,
31 void *root_le, size_t len);
32
33#endif /* DM_SPACE_MAP_METADATA_H */
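
The split between dm_sm_metadata_init() and dm_sm_metadata_create() exists because of the cycle mentioned above: the transaction manager needs a space map to be constructed, and the metadata space map needs a transaction manager before it can allocate anything. The fragment below is a simplified restatement of what dm_tm_create_internal() (later in this patch) does; dm_tm_create() is file-local there, so real callers use dm_tm_create_with_sm() instead, and 'bm' and 'sb_location' are assumed to exist.

/* Simplified two-phase wiring, error handling trimmed. */
struct dm_space_map *sm;
struct dm_transaction_manager *tm;
int r;

sm = dm_sm_metadata_init();		/* phase 1: a shell with no tm yet */
tm = dm_tm_create(bm, sm);		/* static helper in dm-transaction-manager.c */

/* phase 2: the sm can now allocate through tm and build its btrees */
r = dm_sm_metadata_create(sm, tm, dm_bm_nr_blocks(bm), sb_location);
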
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
deleted file mode 100644
index 1cbfc6b1638..00000000000
--- a/drivers/md/persistent-data/dm-space-map.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_H
8#define _LINUX_DM_SPACE_MAP_H
9
10#include "dm-block-manager.h"
11
12/*
13 * struct dm_space_map keeps a record of how many times each block in a device
14 * is referenced. It needs to be fixed on disk as part of the transaction.
15 */
16struct dm_space_map {
17 void (*destroy)(struct dm_space_map *sm);
18
19 /*
20 * You must commit before allocating the newly added space.
21 */
22 int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
23
24 /*
25 * Extensions do not appear in this count until after commit has
26 * been called.
27 */
28 int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
29
30 /*
31 * Space maps must never allocate a block from the previous
32 * transaction, in case we need to roll back. This complicates the
33 * semantics of get_nr_free(); it should return the number of blocks
34 * that are available for allocation _now_. For instance you may
35 * have blocks with a zero reference count that will not be
36 * available for allocation until after the next commit.
37 */
38 int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
39
40 int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
41 int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b,
42 int *result);
43 int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count);
44
45 int (*commit)(struct dm_space_map *sm);
46
47 int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
48 int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
49
50 /*
51 * new_block will increment the returned block.
52 */
53 int (*new_block)(struct dm_space_map *sm, dm_block_t *b);
54
55 /*
56 * The root contains all the information needed to fix the space map.
57 * Generally this info is small, so squirrel it away in a disk block
58 * along with other info.
59 */
60 int (*root_size)(struct dm_space_map *sm, size_t *result);
61 int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
62};
63
64/*----------------------------------------------------------------*/
65
66static inline void dm_sm_destroy(struct dm_space_map *sm)
67{
68 sm->destroy(sm);
69}
70
71static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
72{
73 return sm->extend(sm, extra_blocks);
74}
75
76static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
77{
78 return sm->get_nr_blocks(sm, count);
79}
80
81static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
82{
83 return sm->get_nr_free(sm, count);
84}
85
86static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b,
87 uint32_t *result)
88{
89 return sm->get_count(sm, b, result);
90}
91
92static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm,
93 dm_block_t b, int *result)
94{
95 return sm->count_is_more_than_one(sm, b, result);
96}
97
98static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b,
99 uint32_t count)
100{
101 return sm->set_count(sm, b, count);
102}
103
104static inline int dm_sm_commit(struct dm_space_map *sm)
105{
106 return sm->commit(sm);
107}
108
109static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
110{
111 return sm->inc_block(sm, b);
112}
113
114static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
115{
116 return sm->dec_block(sm, b);
117}
118
119static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
120{
121 return sm->new_block(sm, b);
122}
123
124static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result)
125{
126 return sm->root_size(sm, result);
127}
128
129static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
130{
131 return sm->copy_root(sm, copy_to_here_le, len);
132}
133
134#endif /* _LINUX_DM_SPACE_MAP_H */
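
root_size() and copy_root() are how a space map's state gets squirrelled into a superblock, as the comment above suggests. Below is a hedged sketch of that serialisation step; the destination buffer and the size reserved for it are assumptions of the example, not part of this interface.

static int save_sm_root(struct dm_space_map *sm, void *sb_area, size_t reserved)
{
	int r;
	size_t len;

	r = dm_sm_root_size(sm, &len);
	if (r < 0)
		return r;

	if (len > reserved)		/* space set aside in the superblock */
		return -ENOSPC;

	return dm_sm_copy_root(sm, sb_area, len);
}
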
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
deleted file mode 100644
index d247a35da3c..00000000000
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ /dev/null
@@ -1,382 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-transaction-manager.h"
7#include "dm-space-map.h"
8#include "dm-space-map-disk.h"
9#include "dm-space-map-metadata.h"
10#include "dm-persistent-data-internal.h"
11
12#include <linux/export.h>
13#include <linux/slab.h>
14#include <linux/device-mapper.h>
15
16#define DM_MSG_PREFIX "transaction manager"
17
18/*----------------------------------------------------------------*/
19
20struct shadow_info {
21 struct hlist_node hlist;
22 dm_block_t where;
23};
24
25/*
26 * It would be nice if we scaled with the size of the transaction.
27 */
28#define HASH_SIZE 256
29#define HASH_MASK (HASH_SIZE - 1)
30
31struct dm_transaction_manager {
32 int is_clone;
33 struct dm_transaction_manager *real;
34
35 struct dm_block_manager *bm;
36 struct dm_space_map *sm;
37
38 spinlock_t lock;
39 struct hlist_head buckets[HASH_SIZE];
40};
41
42/*----------------------------------------------------------------*/
43
44static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
45{
46 int r = 0;
47 unsigned bucket = dm_hash_block(b, HASH_MASK);
48 struct shadow_info *si;
49 struct hlist_node *n;
50
51 spin_lock(&tm->lock);
52 hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
53 if (si->where == b) {
54 r = 1;
55 break;
56 }
57 spin_unlock(&tm->lock);
58
59 return r;
60}
61
62/*
63 * This can silently fail if there's no memory. We're ok with this since
64 * creating redundant shadows causes no harm.
65 */
66static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
67{
68 unsigned bucket;
69 struct shadow_info *si;
70
71 si = kmalloc(sizeof(*si), GFP_NOIO);
72 if (si) {
73 si->where = b;
74 bucket = dm_hash_block(b, HASH_MASK);
75 spin_lock(&tm->lock);
76 hlist_add_head(&si->hlist, tm->buckets + bucket);
77 spin_unlock(&tm->lock);
78 }
79}
80
81static void wipe_shadow_table(struct dm_transaction_manager *tm)
82{
83 struct shadow_info *si;
84 struct hlist_node *n, *tmp;
85 struct hlist_head *bucket;
86 int i;
87
88 spin_lock(&tm->lock);
89 for (i = 0; i < HASH_SIZE; i++) {
90 bucket = tm->buckets + i;
91 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
92 kfree(si);
93
94 INIT_HLIST_HEAD(bucket);
95 }
96
97 spin_unlock(&tm->lock);
98}
99
100/*----------------------------------------------------------------*/
101
102static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
103 struct dm_space_map *sm)
104{
105 int i;
106 struct dm_transaction_manager *tm;
107
108 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
109 if (!tm)
110 return ERR_PTR(-ENOMEM);
111
112 tm->is_clone = 0;
113 tm->real = NULL;
114 tm->bm = bm;
115 tm->sm = sm;
116
117 spin_lock_init(&tm->lock);
118 for (i = 0; i < HASH_SIZE; i++)
119 INIT_HLIST_HEAD(tm->buckets + i);
120
121 return tm;
122}
123
124struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
125{
126 struct dm_transaction_manager *tm;
127
128 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
129 if (tm) {
130 tm->is_clone = 1;
131 tm->real = real;
132 }
133
134 return tm;
135}
136EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
137
138void dm_tm_destroy(struct dm_transaction_manager *tm)
139{
140 if (!tm->is_clone)
141 wipe_shadow_table(tm);
142
143 kfree(tm);
144}
145EXPORT_SYMBOL_GPL(dm_tm_destroy);
146
147int dm_tm_pre_commit(struct dm_transaction_manager *tm)
148{
149 int r;
150
151 if (tm->is_clone)
152 return -EWOULDBLOCK;
153
154 r = dm_sm_commit(tm->sm);
155 if (r < 0)
156 return r;
157
158 return 0;
159}
160EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
161
162int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
163{
164 if (tm->is_clone)
165 return -EWOULDBLOCK;
166
167 wipe_shadow_table(tm);
168
169 return dm_bm_flush_and_unlock(tm->bm, root);
170}
171EXPORT_SYMBOL_GPL(dm_tm_commit);
172
173int dm_tm_new_block(struct dm_transaction_manager *tm,
174 struct dm_block_validator *v,
175 struct dm_block **result)
176{
177 int r;
178 dm_block_t new_block;
179
180 if (tm->is_clone)
181 return -EWOULDBLOCK;
182
183 r = dm_sm_new_block(tm->sm, &new_block);
184 if (r < 0)
185 return r;
186
187 r = dm_bm_write_lock_zero(tm->bm, new_block, v, result);
188 if (r < 0) {
189 dm_sm_dec_block(tm->sm, new_block);
190 return r;
191 }
192
193 /*
194 * New blocks count as shadows in that they don't need to be
195 * shadowed again.
196 */
197 insert_shadow(tm, new_block);
198
199 return 0;
200}
201
202static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
203 struct dm_block_validator *v,
204 struct dm_block **result)
205{
206 int r;
207 dm_block_t new;
208 struct dm_block *orig_block;
209
210 r = dm_sm_new_block(tm->sm, &new);
211 if (r < 0)
212 return r;
213
214 r = dm_sm_dec_block(tm->sm, orig);
215 if (r < 0)
216 return r;
217
218 r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
219 if (r < 0)
220 return r;
221
222 /*
223 * It would be tempting to use dm_bm_unlock_move here, but some
224 * code, such as the space maps, keeps using the old data structures
225 * secure in the knowledge they won't be changed until the next
226 * transaction. Using unlock_move would force a synchronous read
227 * since the old block would no longer be in the cache.
228 */
229 r = dm_bm_write_lock_zero(tm->bm, new, v, result);
230 if (r) {
231 dm_bm_unlock(orig_block);
232 return r;
233 }
234
235 memcpy(dm_block_data(*result), dm_block_data(orig_block),
236 dm_bm_block_size(tm->bm));
237
238 dm_bm_unlock(orig_block);
239 return r;
240}
241
242int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
243 struct dm_block_validator *v, struct dm_block **result,
244 int *inc_children)
245{
246 int r;
247
248 if (tm->is_clone)
249 return -EWOULDBLOCK;
250
251 r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
252 if (r < 0)
253 return r;
254
255 if (is_shadow(tm, orig) && !*inc_children)
256 return dm_bm_write_lock(tm->bm, orig, v, result);
257
258 r = __shadow_block(tm, orig, v, result);
259 if (r < 0)
260 return r;
261 insert_shadow(tm, dm_block_location(*result));
262
263 return r;
264}
265EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
266
267int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
268 struct dm_block_validator *v,
269 struct dm_block **blk)
270{
271 if (tm->is_clone)
272 return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
273
274 return dm_bm_read_lock(tm->bm, b, v, blk);
275}
276EXPORT_SYMBOL_GPL(dm_tm_read_lock);
277
278int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
279{
280 return dm_bm_unlock(b);
281}
282EXPORT_SYMBOL_GPL(dm_tm_unlock);
283
284void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
285{
286 /*
287 * The non-blocking clone doesn't support this.
288 */
289 BUG_ON(tm->is_clone);
290
291 dm_sm_inc_block(tm->sm, b);
292}
293EXPORT_SYMBOL_GPL(dm_tm_inc);
294
295void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
296{
297 /*
298 * The non-blocking clone doesn't support this.
299 */
300 BUG_ON(tm->is_clone);
301
302 dm_sm_dec_block(tm->sm, b);
303}
304EXPORT_SYMBOL_GPL(dm_tm_dec);
305
306int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
307 uint32_t *result)
308{
309 if (tm->is_clone)
310 return -EWOULDBLOCK;
311
312 return dm_sm_get_count(tm->sm, b, result);
313}
314
315struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
316{
317 return tm->bm;
318}
319
320/*----------------------------------------------------------------*/
321
322static int dm_tm_create_internal(struct dm_block_manager *bm,
323 dm_block_t sb_location,
324 struct dm_transaction_manager **tm,
325 struct dm_space_map **sm,
326 int create,
327 void *sm_root, size_t sm_len)
328{
329 int r;
330
331 *sm = dm_sm_metadata_init();
332 if (IS_ERR(*sm))
333 return PTR_ERR(*sm);
334
335 *tm = dm_tm_create(bm, *sm);
336 if (IS_ERR(*tm)) {
337 dm_sm_destroy(*sm);
338 return PTR_ERR(*tm);
339 }
340
341 if (create) {
342 r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
343 sb_location);
344 if (r) {
345 DMERR("couldn't create metadata space map");
346 goto bad;
347 }
348
349 } else {
350 r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
351 if (r) {
352 DMERR("couldn't open metadata space map");
353 goto bad;
354 }
355 }
356
357 return 0;
358
359bad:
360 dm_tm_destroy(*tm);
361 dm_sm_destroy(*sm);
362 return r;
363}
364
365int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
366 struct dm_transaction_manager **tm,
367 struct dm_space_map **sm)
368{
369 return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
370}
371EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
372
373int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
374 void *sm_root, size_t root_len,
375 struct dm_transaction_manager **tm,
376 struct dm_space_map **sm)
377{
378 return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
379}
380EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
381
382/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
deleted file mode 100644
index b5b139076ca..00000000000
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ /dev/null
@@ -1,131 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_TRANSACTION_MANAGER_H
8#define _LINUX_DM_TRANSACTION_MANAGER_H
9
10#include "dm-block-manager.h"
11
12struct dm_transaction_manager;
13struct dm_space_map;
14
15/*----------------------------------------------------------------*/
16
17/*
18 * This manages the scope of a transaction. It also enforces immutability
19 * of the on-disk data structures by limiting access to writeable blocks.
20 *
21 * Clients should not fiddle with the block manager directly.
22 */
23
24void dm_tm_destroy(struct dm_transaction_manager *tm);
25
26/*
27 * The non-blocking version of a transaction manager is intended for use in
28 * fast path code that needs to do lookups e.g. a dm mapping function.
29 * You create the non-blocking variant from a normal tm. The interface is
30 * the same, except that most functions will just return -EWOULDBLOCK.
31 * Methods that return void yet may block, viz. dm_tm_inc and dm_tm_dec,
32 * must not be called on a clone. Call dm_tm_destroy() as you would with a
33 * normal tm when you've finished with it. The original must not be
34 * destroyed before its clones.
35 */
36struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real);
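
As a rough illustration of this contract (a hedged sketch, not code from this patch; the validator, block number and helper name are invented), a fast-path lookup through an already-created clone might look like:

#include "dm-transaction-manager.h"

/*
 * Hypothetical fast-path helper: @nb_tm is a clone made once, up front,
 * with dm_tm_create_non_blocking_clone() (check IS_ERR() on creation,
 * as with dm_tm_create()).
 */
static int fast_path_read(struct dm_transaction_manager *nb_tm,
			  struct dm_block_validator *v, dm_block_t block)
{
	struct dm_block *b;
	int r;

	r = dm_tm_read_lock(nb_tm, block, v, &b);
	if (r == -EWOULDBLOCK)
		/* Not in the cache: defer to a worker that may sleep. */
		return r;
	if (r < 0)
		return r;

	/* ... consume dm_block_data(b) without blocking ... */

	return dm_tm_unlock(nb_tm, b);
}

When the fast path is torn down, the clone is destroyed with dm_tm_destroy() before the real tm, per the rule above.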
37
38/*
39 * We use a 2-phase commit here.
40 *
41 * i) In the first phase the block manager is told to start flushing, and
42 * the changes to the space map are written to disk. You should interrogate
43 * your particular space map to get detail of its root node etc. to be
44 * included in your superblock.
45 *
46 * ii) @root will be committed last. You shouldn't use more than the
47 * first 512 bytes of @root if you wish the transaction to survive a power
48 * failure. You *must* have a write lock held on @root for both stages (i)
49 * and (ii). The commit will drop the write lock.
50 */
51int dm_tm_pre_commit(struct dm_transaction_manager *tm);
52int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
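
To make the two phases concrete, here is a hedged sketch of a commit sequence as a metadata client might issue it. The superblock layout, validator and SUPERBLOCK_LOCATION are invented for illustration; only the dm_tm_*/dm_sm_*/dm_bm_* calls are the real interface.

#include "dm-transaction-manager.h"
#include "dm-space-map.h"

#define SUPERBLOCK_LOCATION 0	/* hypothetical: block 0 holds the superblock */

/* Hypothetical on-disk superblock; only the space map root matters here. */
struct sb_disk {
	__le64 magic;
	__u8 sm_root[128];
};

static int commit_transaction(struct dm_transaction_manager *tm,
			      struct dm_space_map *sm,
			      struct dm_block_validator *sb_validator)
{
	struct dm_block *sb;
	struct sb_disk *disk;
	size_t root_len;
	int r;

	/* The write lock on @root (the superblock) spans both phases. */
	r = dm_bm_write_lock(dm_tm_get_bm(tm), SUPERBLOCK_LOCATION,
			     sb_validator, &sb);
	if (r < 0)
		return r;

	/* Phase (i): flush dirty blocks and commit the space map. */
	r = dm_tm_pre_commit(tm);
	if (r < 0)
		goto bad;

	/* Record the sm root inside the superblock (first 512 bytes). */
	disk = dm_block_data(sb);
	r = dm_sm_root_size(sm, &root_len);
	if (r < 0)
		goto bad;
	if (root_len > sizeof(disk->sm_root)) {
		r = -ENOSPC;
		goto bad;
	}
	r = dm_sm_copy_root(sm, disk->sm_root, root_len);
	if (r < 0)
		goto bad;

	/* Phase (ii): the superblock is written last; this drops its lock. */
	return dm_tm_commit(tm, sb);

bad:
	dm_bm_unlock(sb);
	return r;
}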
53
54/*
55 * These methods are the only way to get hold of a writeable block.
56 */
57
58/*
59 * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually
60 * write to the whole of @data before you unlock, otherwise you could get
61 * a data leak. (The other option is for tm_new_block() to zero new blocks
62 * before handing them out, which will be redundant in most, if not all,
63 * cases).
64 * Zeroes the new block and returns with write lock held.
65 */
66int dm_tm_new_block(struct dm_transaction_manager *tm,
67 struct dm_block_validator *v,
68 struct dm_block **result);
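
A hedged sketch of the allocation pattern this implies; the validator and the 0xff fill stand in for whatever the caller's on-disk format actually needs:

#include <linux/string.h>
#include "dm-transaction-manager.h"

/* Sketch: allocate a block, initialise all of it, then unlock. */
static int alloc_and_init_block(struct dm_transaction_manager *tm,
				struct dm_block_validator *v,
				dm_block_t *loc)
{
	struct dm_block *b;
	int r;

	r = dm_tm_new_block(tm, v, &b);	/* returns zeroed + write locked */
	if (r < 0)
		return r;

	/* Write the whole block, as the comment above asks of callers. */
	memset(dm_block_data(b), 0xff, dm_bm_block_size(dm_tm_get_bm(tm)));

	*loc = dm_block_location(b);
	return dm_tm_unlock(tm, b);
}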
69
70/*
71 * dm_tm_shadow_block() allocates a new block and copies the data from @orig
72 * to it. It then decrements the reference count on the original block. Use
73 * this to update the contents of a block in a data structure; don't
74 * confuse it with a clone - you shouldn't access the orig block after
75 * this operation. Because the tm knows the scope of the transaction it
76 * can optimise requests for a shadow of a shadow to a no-op. Don't forget
77 * to unlock when you've finished with the shadow.
78 *
79 * The @inc_children flag is used to tell the caller whether it needs to
80 * adjust reference counts for children. (Data in the block may refer to
81 * other blocks.)
82 *
83 * Shadowing implicitly drops a reference on @orig so you must not have
84 * it locked when you call this.
85 */
86int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
87 struct dm_block_validator *v,
88 struct dm_block **result, int *inc_children);
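
A hedged caller-side sketch of the inc_children handshake. The node layout and field names are invented for illustration; the real users are the btree and space map code.

#include "dm-transaction-manager.h"

/* Hypothetical on-disk node holding references to child blocks. */
struct demo_node {
	__le32 nr_children;
	__le64 children[64];
};

static int shadow_and_update(struct dm_transaction_manager *tm,
			     struct dm_block_validator *v,
			     dm_block_t *node_loc)
{
	struct dm_block *b;
	struct demo_node *n;
	uint32_t i;
	int r, inc;

	/* @*node_loc must not be locked here; shadowing drops a ref on it. */
	r = dm_tm_shadow_block(tm, *node_loc, v, &b, &inc);
	if (r < 0)
		return r;

	n = dm_block_data(b);
	if (inc) {
		/* The block was shared, so the copy re-references the children. */
		for (i = 0; i < le32_to_cpu(n->nr_children); i++)
			dm_tm_inc(tm, le64_to_cpu(n->children[i]));
	}

	/* ... modify the shadow through 'n' ... */

	*node_loc = dm_block_location(b);	/* the caller now points at the shadow */
	return dm_tm_unlock(tm, b);
}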
89
90/*
91 * Read access. You can lock any block you want. If there's a write lock
92 * on it outstanding then it'll block.
93 */
94int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
95 struct dm_block_validator *v,
96 struct dm_block **result);
97
98int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
99
100/*
101 * Functions for altering the reference count of a block directly.
102 */
103void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
104
105void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
106
107int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
108 uint32_t *result);
109
110struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
111
112/*
113 * A little utility that ties the knot by producing a transaction manager
114 * that has a space map managed by the transaction manager...
115 *
116 * Returns a tm that has an open transaction to write the new disk sm.
117 * Caller should store the new sm root and commit.
118 *
119 * The superblock location is passed so the metadata space map knows it
120 * shouldn't be used.
121 */
122int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
123 struct dm_transaction_manager **tm,
124 struct dm_space_map **sm);
125
126int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
127 void *sm_root, size_t root_len,
128 struct dm_transaction_manager **tm,
129 struct dm_space_map **sm);
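
A hedged sketch of how a metadata client ties these together at format time versus reopen time; SUPERBLOCK_LOCATION and the saved-root arguments are illustrative:

#include "dm-transaction-manager.h"

#define SUPERBLOCK_LOCATION 0	/* hypothetical */

static int tm_open_or_format(struct dm_block_manager *bm, int format,
			     void *saved_sm_root, size_t saved_root_len,
			     struct dm_transaction_manager **tm,
			     struct dm_space_map **sm)
{
	if (format)
		/* Fresh device: build a new metadata sm covering all of bm. */
		return dm_tm_create_with_sm(bm, SUPERBLOCK_LOCATION, tm, sm);

	/* Existing device: reopen from the sm root saved in the superblock. */
	return dm_tm_open_with_sm(bm, SUPERBLOCK_LOCATION,
				  saved_sm_root, saved_root_len, tm, sm);
}

Either way the returned tm has an open transaction, so the caller's next step is normally to fill in its superblock (including the new sm root) and commit, as sketched after the two-phase commit comment above.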
130
131#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7..e86bf3682e1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/module.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25#include "md.h" 24#include "md.h"
26#include "raid0.h" 25#include "raid0.h"
@@ -28,9 +27,9 @@
28 27
29static int raid0_congested(void *data, int bits) 28static int raid0_congested(void *data, int bits)
30{ 29{
31 struct mddev *mddev = data; 30 mddev_t *mddev = data;
32 struct r0conf *conf = mddev->private; 31 raid0_conf_t *conf = mddev->private;
33 struct md_rdev **devlist = conf->devlist; 32 mdk_rdev_t **devlist = conf->devlist;
34 int raid_disks = conf->strip_zone[0].nb_dev; 33 int raid_disks = conf->strip_zone[0].nb_dev;
35 int i, ret = 0; 34 int i, ret = 0;
36 35
@@ -48,54 +47,52 @@ static int raid0_congested(void *data, int bits)
48/* 47/*
49 * inform the user of the raid configuration 48 * inform the user of the raid configuration
50*/ 49*/
51static void dump_zones(struct mddev *mddev) 50static void dump_zones(mddev_t *mddev)
52{ 51{
53 int j, k; 52 int j, k, h;
54 sector_t zone_size = 0; 53 sector_t zone_size = 0;
55 sector_t zone_start = 0; 54 sector_t zone_start = 0;
56 char b[BDEVNAME_SIZE]; 55 char b[BDEVNAME_SIZE];
57 struct r0conf *conf = mddev->private; 56 raid0_conf_t *conf = mddev->private;
58 int raid_disks = conf->strip_zone[0].nb_dev; 57 int raid_disks = conf->strip_zone[0].nb_dev;
59 printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", 58 printk(KERN_INFO "******* %s configuration *********\n",
60 mdname(mddev), 59 mdname(mddev));
61 conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); 60 h = 0;
62 for (j = 0; j < conf->nr_strip_zones; j++) { 61 for (j = 0; j < conf->nr_strip_zones; j++) {
63 printk(KERN_INFO "md: zone%d=[", j); 62 printk(KERN_INFO "zone%d=[", j);
64 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 63 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
65 printk(KERN_CONT "%s%s", k?"/":"", 64 printk(KERN_CONT "%s/",
66 bdevname(conf->devlist[j*raid_disks 65 bdevname(conf->devlist[j*raid_disks
67 + k]->bdev, b)); 66 + k]->bdev, b));
68 printk(KERN_CONT "]\n"); 67 printk(KERN_CONT "]\n");
69 68
70 zone_size = conf->strip_zone[j].zone_end - zone_start; 69 zone_size = conf->strip_zone[j].zone_end - zone_start;
71 printk(KERN_INFO " zone-offset=%10lluKB, " 70 printk(KERN_INFO " zone offset=%llukb "
72 "device-offset=%10lluKB, size=%10lluKB\n", 71 "device offset=%llukb size=%llukb\n",
73 (unsigned long long)zone_start>>1, 72 (unsigned long long)zone_start>>1,
74 (unsigned long long)conf->strip_zone[j].dev_start>>1, 73 (unsigned long long)conf->strip_zone[j].dev_start>>1,
75 (unsigned long long)zone_size>>1); 74 (unsigned long long)zone_size>>1);
76 zone_start = conf->strip_zone[j].zone_end; 75 zone_start = conf->strip_zone[j].zone_end;
77 } 76 }
78 printk(KERN_INFO "\n"); 77 printk(KERN_INFO "**********************************\n\n");
79} 78}
80 79
81static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) 80static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
82{ 81{
83 int i, c, err; 82 int i, c, err;
84 sector_t curr_zone_end, sectors; 83 sector_t curr_zone_end, sectors;
85 struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; 84 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
86 struct strip_zone *zone; 85 struct strip_zone *zone;
87 int cnt; 86 int cnt;
88 char b[BDEVNAME_SIZE]; 87 char b[BDEVNAME_SIZE];
89 char b2[BDEVNAME_SIZE]; 88 raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
90 struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
91 bool discard_supported = false;
92 89
93 if (!conf) 90 if (!conf)
94 return -ENOMEM; 91 return -ENOMEM;
95 rdev_for_each(rdev1, mddev) { 92 list_for_each_entry(rdev1, &mddev->disks, same_set) {
96 pr_debug("md/raid0:%s: looking at %s\n", 93 printk(KERN_INFO "md/raid0:%s: looking at %s\n",
97 mdname(mddev), 94 mdname(mddev),
98 bdevname(rdev1->bdev, b)); 95 bdevname(rdev1->bdev, b));
99 c = 0; 96 c = 0;
100 97
101 /* round size to chunk_size */ 98 /* round size to chunk_size */
@@ -103,17 +100,17 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
103 sector_div(sectors, mddev->chunk_sectors); 100 sector_div(sectors, mddev->chunk_sectors);
104 rdev1->sectors = sectors * mddev->chunk_sectors; 101 rdev1->sectors = sectors * mddev->chunk_sectors;
105 102
106 rdev_for_each(rdev2, mddev) { 103 list_for_each_entry(rdev2, &mddev->disks, same_set) {
107 pr_debug("md/raid0:%s: comparing %s(%llu)" 104 printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)",
108 " with %s(%llu)\n", 105 mdname(mddev),
109 mdname(mddev), 106 bdevname(rdev1->bdev,b),
110 bdevname(rdev1->bdev,b), 107 (unsigned long long)rdev1->sectors);
111 (unsigned long long)rdev1->sectors, 108 printk(KERN_CONT " with %s(%llu)\n",
112 bdevname(rdev2->bdev,b2), 109 bdevname(rdev2->bdev,b),
113 (unsigned long long)rdev2->sectors); 110 (unsigned long long)rdev2->sectors);
114 if (rdev2 == rdev1) { 111 if (rdev2 == rdev1) {
115 pr_debug("md/raid0:%s: END\n", 112 printk(KERN_INFO "md/raid0:%s: END\n",
116 mdname(mddev)); 113 mdname(mddev));
117 break; 114 break;
118 } 115 }
119 if (rdev2->sectors == rdev1->sectors) { 116 if (rdev2->sectors == rdev1->sectors) {
@@ -121,30 +118,30 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
121 * Not unique, don't count it as a new 118 * Not unique, don't count it as a new
122 * group 119 * group
123 */ 120 */
124 pr_debug("md/raid0:%s: EQUAL\n", 121 printk(KERN_INFO "md/raid0:%s: EQUAL\n",
125 mdname(mddev)); 122 mdname(mddev));
126 c = 1; 123 c = 1;
127 break; 124 break;
128 } 125 }
129 pr_debug("md/raid0:%s: NOT EQUAL\n", 126 printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n",
130 mdname(mddev)); 127 mdname(mddev));
131 } 128 }
132 if (!c) { 129 if (!c) {
133 pr_debug("md/raid0:%s: ==> UNIQUE\n", 130 printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n",
134 mdname(mddev)); 131 mdname(mddev));
135 conf->nr_strip_zones++; 132 conf->nr_strip_zones++;
136 pr_debug("md/raid0:%s: %d zones\n", 133 printk(KERN_INFO "md/raid0:%s: %d zones\n",
137 mdname(mddev), conf->nr_strip_zones); 134 mdname(mddev), conf->nr_strip_zones);
138 } 135 }
139 } 136 }
140 pr_debug("md/raid0:%s: FINAL %d zones\n", 137 printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
141 mdname(mddev), conf->nr_strip_zones); 138 mdname(mddev), conf->nr_strip_zones);
142 err = -ENOMEM; 139 err = -ENOMEM;
143 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 140 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
144 conf->nr_strip_zones, GFP_KERNEL); 141 conf->nr_strip_zones, GFP_KERNEL);
145 if (!conf->strip_zone) 142 if (!conf->strip_zone)
146 goto abort; 143 goto abort;
147 conf->devlist = kzalloc(sizeof(struct md_rdev*)* 144 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
148 conf->nr_strip_zones*mddev->raid_disks, 145 conf->nr_strip_zones*mddev->raid_disks,
149 GFP_KERNEL); 146 GFP_KERNEL);
150 if (!conf->devlist) 147 if (!conf->devlist)
@@ -158,7 +155,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
158 smallest = NULL; 155 smallest = NULL;
159 dev = conf->devlist; 156 dev = conf->devlist;
160 err = -EINVAL; 157 err = -EINVAL;
161 rdev_for_each(rdev1, mddev) { 158 list_for_each_entry(rdev1, &mddev->disks, same_set) {
162 int j = rdev1->raid_disk; 159 int j = rdev1->raid_disk;
163 160
164 if (mddev->level == 10) { 161 if (mddev->level == 10) {
@@ -189,16 +186,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
189 186
190 disk_stack_limits(mddev->gendisk, rdev1->bdev, 187 disk_stack_limits(mddev->gendisk, rdev1->bdev,
191 rdev1->data_offset << 9); 188 rdev1->data_offset << 9);
189 /* as we don't honour merge_bvec_fn, we must never risk
190 * violating it, so limit ->max_segments to 1, lying within
191 * a single page.
192 */
192 193
193 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) 194 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
194 conf->has_merge_bvec = 1; 195 blk_queue_max_segments(mddev->queue, 1);
195 196 blk_queue_segment_boundary(mddev->queue,
197 PAGE_CACHE_SIZE - 1);
198 }
196 if (!smallest || (rdev1->sectors < smallest->sectors)) 199 if (!smallest || (rdev1->sectors < smallest->sectors))
197 smallest = rdev1; 200 smallest = rdev1;
198 cnt++; 201 cnt++;
199
200 if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
201 discard_supported = true;
202 } 202 }
203 if (cnt != mddev->raid_disks) { 203 if (cnt != mddev->raid_disks) {
204 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " 204 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -218,45 +218,44 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
218 zone = conf->strip_zone + i; 218 zone = conf->strip_zone + i;
219 dev = conf->devlist + i * mddev->raid_disks; 219 dev = conf->devlist + i * mddev->raid_disks;
220 220
221 pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); 221 printk(KERN_INFO "md/raid0:%s: zone %d\n",
222 mdname(mddev), i);
222 zone->dev_start = smallest->sectors; 223 zone->dev_start = smallest->sectors;
223 smallest = NULL; 224 smallest = NULL;
224 c = 0; 225 c = 0;
225 226
226 for (j=0; j<cnt; j++) { 227 for (j=0; j<cnt; j++) {
227 rdev = conf->devlist[j]; 228 rdev = conf->devlist[j];
229 printk(KERN_INFO "md/raid0:%s: checking %s ...",
230 mdname(mddev),
231 bdevname(rdev->bdev, b));
228 if (rdev->sectors <= zone->dev_start) { 232 if (rdev->sectors <= zone->dev_start) {
229 pr_debug("md/raid0:%s: checking %s ... nope\n", 233 printk(KERN_CONT " nope.\n");
230 mdname(mddev),
231 bdevname(rdev->bdev, b));
232 continue; 234 continue;
233 } 235 }
234 pr_debug("md/raid0:%s: checking %s ..." 236 printk(KERN_CONT " contained as device %d\n", c);
235 " contained as device %d\n",
236 mdname(mddev),
237 bdevname(rdev->bdev, b), c);
238 dev[c] = rdev; 237 dev[c] = rdev;
239 c++; 238 c++;
240 if (!smallest || rdev->sectors < smallest->sectors) { 239 if (!smallest || rdev->sectors < smallest->sectors) {
241 smallest = rdev; 240 smallest = rdev;
242 pr_debug("md/raid0:%s: (%llu) is smallest!.\n", 241 printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n",
243 mdname(mddev), 242 mdname(mddev),
244 (unsigned long long)rdev->sectors); 243 (unsigned long long)rdev->sectors);
245 } 244 }
246 } 245 }
247 246
248 zone->nb_dev = c; 247 zone->nb_dev = c;
249 sectors = (smallest->sectors - zone->dev_start) * c; 248 sectors = (smallest->sectors - zone->dev_start) * c;
250 pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", 249 printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
251 mdname(mddev), 250 mdname(mddev),
252 zone->nb_dev, (unsigned long long)sectors); 251 zone->nb_dev, (unsigned long long)sectors);
253 252
254 curr_zone_end += sectors; 253 curr_zone_end += sectors;
255 zone->zone_end = curr_zone_end; 254 zone->zone_end = curr_zone_end;
256 255
257 pr_debug("md/raid0:%s: current zone start: %llu\n", 256 printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
258 mdname(mddev), 257 mdname(mddev),
259 (unsigned long long)smallest->sectors); 258 (unsigned long long)smallest->sectors);
260 } 259 }
261 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 260 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
262 mddev->queue->backing_dev_info.congested_data = mddev; 261 mddev->queue->backing_dev_info.congested_data = mddev;
@@ -276,12 +275,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
276 blk_queue_io_opt(mddev->queue, 275 blk_queue_io_opt(mddev->queue,
277 (mddev->chunk_sectors << 9) * mddev->raid_disks); 276 (mddev->chunk_sectors << 9) * mddev->raid_disks);
278 277
279 if (!discard_supported) 278 printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
280 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
281 else
282 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
283
284 pr_debug("md/raid0:%s: done.\n", mdname(mddev));
285 *private_conf = conf; 279 *private_conf = conf;
286 280
287 return 0; 281 return 0;
@@ -293,64 +287,8 @@ abort:
293 return err; 287 return err;
294} 288}
295 289
296/* Find the zone which holds a particular offset
297 * Update *sectorp to be an offset in that zone
298 */
299static struct strip_zone *find_zone(struct r0conf *conf,
300 sector_t *sectorp)
301{
302 int i;
303 struct strip_zone *z = conf->strip_zone;
304 sector_t sector = *sectorp;
305
306 for (i = 0; i < conf->nr_strip_zones; i++)
307 if (sector < z[i].zone_end) {
308 if (i)
309 *sectorp = sector - z[i-1].zone_end;
310 return z + i;
311 }
312 BUG();
313}
314
315/*
316 * remaps the bio to the target device. We separate two flows:
317 * a power-of-2 flow and a general flow, for the sake of performance.
318*/
319static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
320 sector_t sector, sector_t *sector_offset)
321{
322 unsigned int sect_in_chunk;
323 sector_t chunk;
324 struct r0conf *conf = mddev->private;
325 int raid_disks = conf->strip_zone[0].nb_dev;
326 unsigned int chunk_sects = mddev->chunk_sectors;
327
328 if (is_power_of_2(chunk_sects)) {
329 int chunksect_bits = ffz(~chunk_sects);
330 /* find the sector offset inside the chunk */
331 sect_in_chunk = sector & (chunk_sects - 1);
332 sector >>= chunksect_bits;
333 /* chunk in zone */
334 chunk = *sector_offset;
335 /* quotient is the chunk in real device*/
336 sector_div(chunk, zone->nb_dev << chunksect_bits);
337 } else{
338 sect_in_chunk = sector_div(sector, chunk_sects);
339 chunk = *sector_offset;
340 sector_div(chunk, chunk_sects * zone->nb_dev);
341 }
342 /*
343 * position the bio over the real device
344 * real sector = chunk in device + starting of zone
345 * + the position in the chunk
346 */
347 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
348 return conf->devlist[(zone - conf->strip_zone)*raid_disks
349 + sector_div(sector, zone->nb_dev)];
350}
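
The arithmetic in find_zone() and map_sector() above (the same helpers reappear lower in this hunk on the other side of the diff) is easier to follow with numbers. A hedged, userspace-only worked example for the power-of-2 branch, assuming a single zone of 2 disks, 128-sector (64 KiB) chunks and dev_start = 0:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1000;	/* absolute bio sector */
	unsigned long long zone_offset = 1000;	/* offset within the zone */
	unsigned chunk_sects = 128, nb_dev = 2;

	unsigned sect_in_chunk = sector & (chunk_sects - 1);	/* 1000 % 128 = 104 */
	unsigned long long stripe = zone_offset / (chunk_sects * nb_dev); /* 1000 / 256 = 3 */
	unsigned dev = (sector / chunk_sects) % nb_dev;		/* chunk 7 -> disk 1 */
	unsigned long long dev_sector = stripe * chunk_sects + sect_in_chunk;

	assert(dev == 1 && dev_sector == 488);
	printf("disk %u, sector %llu relative to zone->dev_start\n", dev, dev_sector);
	return 0;
}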
351
352/** 290/**
353 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged 291 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
354 * @q: request queue 292 * @q: request queue
355 * @bvm: properties of new bio 293 * @bvm: properties of new bio
356 * @biovec: the request that could be merged to it. 294 * @biovec: the request that could be merged to it.
@@ -361,16 +299,11 @@ static int raid0_mergeable_bvec(struct request_queue *q,
361 struct bvec_merge_data *bvm, 299 struct bvec_merge_data *bvm,
362 struct bio_vec *biovec) 300 struct bio_vec *biovec)
363{ 301{
364 struct mddev *mddev = q->queuedata; 302 mddev_t *mddev = q->queuedata;
365 struct r0conf *conf = mddev->private;
366 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 303 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
367 sector_t sector_offset = sector;
368 int max; 304 int max;
369 unsigned int chunk_sectors = mddev->chunk_sectors; 305 unsigned int chunk_sectors = mddev->chunk_sectors;
370 unsigned int bio_sectors = bvm->bi_size >> 9; 306 unsigned int bio_sectors = bvm->bi_size >> 9;
371 struct strip_zone *zone;
372 struct md_rdev *rdev;
373 struct request_queue *subq;
374 307
375 if (is_power_of_2(chunk_sectors)) 308 if (is_power_of_2(chunk_sectors))
376 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 309 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -378,49 +311,30 @@ static int raid0_mergeable_bvec(struct request_queue *q,
378 else 311 else
379 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 312 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
380 + bio_sectors)) << 9; 313 + bio_sectors)) << 9;
381 if (max < 0) 314 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
382 max = 0; /* bio_add cannot handle a negative return */
383 if (max <= biovec->bv_len && bio_sectors == 0) 315 if (max <= biovec->bv_len && bio_sectors == 0)
384 return biovec->bv_len; 316 return biovec->bv_len;
385 if (max < biovec->bv_len) 317 else
386 /* too small already, no need to check further */
387 return max;
388 if (!conf->has_merge_bvec)
389 return max;
390
391 /* May need to check subordinate device */
392 sector = sector_offset;
393 zone = find_zone(mddev->private, &sector_offset);
394 rdev = map_sector(mddev, zone, sector, &sector_offset);
395 subq = bdev_get_queue(rdev->bdev);
396 if (subq->merge_bvec_fn) {
397 bvm->bi_bdev = rdev->bdev;
398 bvm->bi_sector = sector_offset + zone->dev_start +
399 rdev->data_offset;
400 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
401 } else
402 return max; 318 return max;
403} 319}
404 320
405static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) 321static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
406{ 322{
407 sector_t array_sectors = 0; 323 sector_t array_sectors = 0;
408 struct md_rdev *rdev; 324 mdk_rdev_t *rdev;
409 325
410 WARN_ONCE(sectors || raid_disks, 326 WARN_ONCE(sectors || raid_disks,
411 "%s does not support generic reshape\n", __func__); 327 "%s does not support generic reshape\n", __func__);
412 328
413 rdev_for_each(rdev, mddev) 329 list_for_each_entry(rdev, &mddev->disks, same_set)
414 array_sectors += rdev->sectors; 330 array_sectors += rdev->sectors;
415 331
416 return array_sectors; 332 return array_sectors;
417} 333}
418 334
419static int raid0_stop(struct mddev *mddev); 335static int raid0_run(mddev_t *mddev)
420
421static int raid0_run(struct mddev *mddev)
422{ 336{
423 struct r0conf *conf; 337 raid0_conf_t *conf;
424 int ret; 338 int ret;
425 339
426 if (mddev->chunk_sectors == 0) { 340 if (mddev->chunk_sectors == 0) {
@@ -431,8 +345,6 @@ static int raid0_run(struct mddev *mddev)
431 if (md_check_no_bitmap(mddev)) 345 if (md_check_no_bitmap(mddev))
432 return -EINVAL; 346 return -EINVAL;
433 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 347 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
434 blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
435 blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
436 348
437 /* if private is not null, we are here after takeover */ 349 /* if private is not null, we are here after takeover */
438 if (mddev->private == NULL) { 350 if (mddev->private == NULL) {
@@ -467,17 +379,12 @@ static int raid0_run(struct mddev *mddev)
467 379
468 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 380 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
469 dump_zones(mddev); 381 dump_zones(mddev);
470 382 return md_integrity_register(mddev);
471 ret = md_integrity_register(mddev);
472 if (ret)
473 raid0_stop(mddev);
474
475 return ret;
476} 383}
477 384
478static int raid0_stop(struct mddev *mddev) 385static int raid0_stop(mddev_t *mddev)
479{ 386{
480 struct r0conf *conf = mddev->private; 387 raid0_conf_t *conf = mddev->private;
481 388
482 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 389 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
483 kfree(conf->strip_zone); 390 kfree(conf->strip_zone);
@@ -487,10 +394,66 @@ static int raid0_stop(struct mddev *mddev)
487 return 0; 394 return 0;
488} 395}
489 396
397/* Find the zone which holds a particular offset
398 * Update *sectorp to be an offset in that zone
399 */
400static struct strip_zone *find_zone(struct raid0_private_data *conf,
401 sector_t *sectorp)
402{
403 int i;
404 struct strip_zone *z = conf->strip_zone;
405 sector_t sector = *sectorp;
406
407 for (i = 0; i < conf->nr_strip_zones; i++)
408 if (sector < z[i].zone_end) {
409 if (i)
410 *sectorp = sector - z[i-1].zone_end;
411 return z + i;
412 }
413 BUG();
414}
415
416/*
417 * remaps the bio to the target device. We separate two flows:
418 * a power-of-2 flow and a general flow, for the sake of performance.
419*/
420static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
421 sector_t sector, sector_t *sector_offset)
422{
423 unsigned int sect_in_chunk;
424 sector_t chunk;
425 raid0_conf_t *conf = mddev->private;
426 int raid_disks = conf->strip_zone[0].nb_dev;
427 unsigned int chunk_sects = mddev->chunk_sectors;
428
429 if (is_power_of_2(chunk_sects)) {
430 int chunksect_bits = ffz(~chunk_sects);
431 /* find the sector offset inside the chunk */
432 sect_in_chunk = sector & (chunk_sects - 1);
433 sector >>= chunksect_bits;
434 /* chunk in zone */
435 chunk = *sector_offset;
436 /* quotient is the chunk in real device*/
437 sector_div(chunk, zone->nb_dev << chunksect_bits);
438 } else{
439 sect_in_chunk = sector_div(sector, chunk_sects);
440 chunk = *sector_offset;
441 sector_div(chunk, chunk_sects * zone->nb_dev);
442 }
443 /*
444 * position the bio over the real device
445 * real sector = chunk in device + starting of zone
446 * + the position in the chunk
447 */
448 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
449 return conf->devlist[(zone - conf->strip_zone)*raid_disks
450 + sector_div(sector, zone->nb_dev)];
451}
452
490/* 453/*
491 * Is io distribute over 1 or more chunks ? 454 * Is io distribute over 1 or more chunks ?
492*/ 455*/
493static inline int is_io_in_chunk_boundary(struct mddev *mddev, 456static inline int is_io_in_chunk_boundary(mddev_t *mddev,
494 unsigned int chunk_sects, struct bio *bio) 457 unsigned int chunk_sects, struct bio *bio)
495{ 458{
496 if (likely(is_power_of_2(chunk_sects))) { 459 if (likely(is_power_of_2(chunk_sects))) {
@@ -503,16 +466,16 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
503 } 466 }
504} 467}
505 468
506static void raid0_make_request(struct mddev *mddev, struct bio *bio) 469static int raid0_make_request(mddev_t *mddev, struct bio *bio)
507{ 470{
508 unsigned int chunk_sects; 471 unsigned int chunk_sects;
509 sector_t sector_offset; 472 sector_t sector_offset;
510 struct strip_zone *zone; 473 struct strip_zone *zone;
511 struct md_rdev *tmp_dev; 474 mdk_rdev_t *tmp_dev;
512 475
513 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 476 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
514 md_flush_request(mddev, bio); 477 md_flush_request(mddev, bio);
515 return; 478 return 0;
516 } 479 }
517 480
518 chunk_sects = mddev->chunk_sectors; 481 chunk_sects = mddev->chunk_sectors;
@@ -520,7 +483,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
520 sector_t sector = bio->bi_sector; 483 sector_t sector = bio->bi_sector;
521 struct bio_pair *bp; 484 struct bio_pair *bp;
522 /* Sanity check -- queue functions should prevent this happening */ 485 /* Sanity check -- queue functions should prevent this happening */
523 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 486 if (bio->bi_vcnt != 1 ||
524 bio->bi_idx != 0) 487 bio->bi_idx != 0)
525 goto bad_map; 488 goto bad_map;
526 /* This is a one page bio that upper layers 489 /* This is a one page bio that upper layers
@@ -532,29 +495,26 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
532 else 495 else
533 bp = bio_split(bio, chunk_sects - 496 bp = bio_split(bio, chunk_sects -
534 sector_div(sector, chunk_sects)); 497 sector_div(sector, chunk_sects));
535 raid0_make_request(mddev, &bp->bio1); 498 if (raid0_make_request(mddev, &bp->bio1))
536 raid0_make_request(mddev, &bp->bio2); 499 generic_make_request(&bp->bio1);
500 if (raid0_make_request(mddev, &bp->bio2))
501 generic_make_request(&bp->bio2);
502
537 bio_pair_release(bp); 503 bio_pair_release(bp);
538 return; 504 return 0;
539 } 505 }
540 506
541 sector_offset = bio->bi_sector; 507 sector_offset = bio->bi_sector;
542 zone = find_zone(mddev->private, &sector_offset); 508 zone = find_zone(mddev->private, &sector_offset);
543 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 509 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
544 &sector_offset); 510 &sector_offset);
545 bio->bi_bdev = tmp_dev->bdev; 511 bio->bi_bdev = tmp_dev->bdev;
546 bio->bi_sector = sector_offset + zone->dev_start + 512 bio->bi_sector = sector_offset + zone->dev_start +
547 tmp_dev->data_offset; 513 tmp_dev->data_offset;
548 514 /*
549 if (unlikely((bio->bi_rw & REQ_DISCARD) && 515 * Let the main block layer submit the IO and resolve recursion:
550 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 516 */
551 /* Just ignore it */ 517 return 1;
552 bio_endio(bio, 0);
553 return;
554 }
555
556 generic_make_request(bio);
557 return;
558 518
559bad_map: 519bad_map:
560 printk("md/raid0:%s: make_request bug: can't convert block across chunks" 520 printk("md/raid0:%s: make_request bug: can't convert block across chunks"
@@ -563,19 +523,46 @@ bad_map:
563 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 523 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
564 524
565 bio_io_error(bio); 525 bio_io_error(bio);
566 return; 526 return 0;
567} 527}
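
Both versions above split a bio that straddles a chunk boundary at "distance to the next boundary". A hedged, userspace illustration of that split arithmetic for the power-of-2 case:

#include <assert.h>

/* Size in sectors of the first half of the split (bp->bio1). */
static unsigned split_sectors(unsigned long long sector, unsigned chunk_sects)
{
	return chunk_sects - (sector & (chunk_sects - 1));
}

int main(void)
{
	/* A 16-sector bio at sector 120 with 128-sector chunks: split 8 + 8. */
	assert(split_sectors(120, 128) == 8);
	return 0;
}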
568 528
569static void raid0_status(struct seq_file *seq, struct mddev *mddev) 529static void raid0_status(struct seq_file *seq, mddev_t *mddev)
570{ 530{
531#undef MD_DEBUG
532#ifdef MD_DEBUG
533 int j, k, h;
534 char b[BDEVNAME_SIZE];
535 raid0_conf_t *conf = mddev->private;
536 int raid_disks = conf->strip_zone[0].nb_dev;
537
538 sector_t zone_size;
539 sector_t zone_start = 0;
540 h = 0;
541
542 for (j = 0; j < conf->nr_strip_zones; j++) {
543 seq_printf(seq, " z%d", j);
544 seq_printf(seq, "=[");
545 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
546 seq_printf(seq, "%s/", bdevname(
547 conf->devlist[j*raid_disks + k]
548 ->bdev, b));
549
550 zone_size = conf->strip_zone[j].zone_end - zone_start;
551 seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
552 (unsigned long long)zone_start>>1,
553 (unsigned long long)conf->strip_zone[j].dev_start>>1,
554 (unsigned long long)zone_size>>1);
555 zone_start = conf->strip_zone[j].zone_end;
556 }
557#endif
571 seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); 558 seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
572 return; 559 return;
573} 560}
574 561
575static void *raid0_takeover_raid45(struct mddev *mddev) 562static void *raid0_takeover_raid45(mddev_t *mddev)
576{ 563{
577 struct md_rdev *rdev; 564 mdk_rdev_t *rdev;
578 struct r0conf *priv_conf; 565 raid0_conf_t *priv_conf;
579 566
580 if (mddev->degraded != 1) { 567 if (mddev->degraded != 1) {
581 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", 568 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
@@ -584,7 +571,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
584 return ERR_PTR(-EINVAL); 571 return ERR_PTR(-EINVAL);
585 } 572 }
586 573
587 rdev_for_each(rdev, mddev) { 574 list_for_each_entry(rdev, &mddev->disks, same_set) {
588 /* check slot number for a disk */ 575 /* check slot number for a disk */
589 if (rdev->raid_disk == mddev->raid_disks-1) { 576 if (rdev->raid_disk == mddev->raid_disks-1) {
590 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 577 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
@@ -606,9 +593,9 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
606 return priv_conf; 593 return priv_conf;
607} 594}
608 595
609static void *raid0_takeover_raid10(struct mddev *mddev) 596static void *raid0_takeover_raid10(mddev_t *mddev)
610{ 597{
611 struct r0conf *priv_conf; 598 raid0_conf_t *priv_conf;
612 599
613 /* Check layout: 600 /* Check layout:
614 * - far_copies must be 1 601 * - far_copies must be 1
@@ -647,10 +634,9 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
647 return priv_conf; 634 return priv_conf;
648} 635}
649 636
650static void *raid0_takeover_raid1(struct mddev *mddev) 637static void *raid0_takeover_raid1(mddev_t *mddev)
651{ 638{
652 struct r0conf *priv_conf; 639 raid0_conf_t *priv_conf;
653 int chunksect;
654 640
655 /* Check layout: 641 /* Check layout:
656 * - (N - 1) mirror drives must be already faulty 642 * - (N - 1) mirror drives must be already faulty
@@ -661,25 +647,10 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
661 return ERR_PTR(-EINVAL); 647 return ERR_PTR(-EINVAL);
662 } 648 }
663 649
664 /*
665 * a raid1 doesn't have the notion of chunk size, so
666 * figure out the largest suitable size we can use.
667 */
668 chunksect = 64 * 2; /* 64K by default */
669
670 /* The array must be an exact multiple of chunksize */
671 while (chunksect && (mddev->array_sectors & (chunksect - 1)))
672 chunksect >>= 1;
673
674 if ((chunksect << 9) < PAGE_SIZE)
675 /* array size does not allow a suitable chunk size */
676 return ERR_PTR(-EINVAL);
677
678 /* Set new parameters */ 650 /* Set new parameters */
679 mddev->new_level = 0; 651 mddev->new_level = 0;
680 mddev->new_layout = 0; 652 mddev->new_layout = 0;
681 mddev->new_chunk_sectors = chunksect; 653 mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
682 mddev->chunk_sectors = chunksect;
683 mddev->delta_disks = 1 - mddev->raid_disks; 654 mddev->delta_disks = 1 - mddev->raid_disks;
684 mddev->raid_disks = 1; 655 mddev->raid_disks = 1;
685 /* make sure it will be not marked as dirty */ 656 /* make sure it will be not marked as dirty */
@@ -689,7 +660,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
689 return priv_conf; 660 return priv_conf;
690} 661}
691 662
692static void *raid0_takeover(struct mddev *mddev) 663static void *raid0_takeover(mddev_t *mddev)
693{ 664{
694 /* raid0 can take over: 665 /* raid0 can take over:
695 * raid4 - if all data disks are active. 666 * raid4 - if all data disks are active.
@@ -720,11 +691,11 @@ static void *raid0_takeover(struct mddev *mddev)
720 return ERR_PTR(-EINVAL); 691 return ERR_PTR(-EINVAL);
721} 692}
722 693
723static void raid0_quiesce(struct mddev *mddev, int state) 694static void raid0_quiesce(mddev_t *mddev, int state)
724{ 695{
725} 696}
726 697
727static struct md_personality raid0_personality= 698static struct mdk_personality raid0_personality=
728{ 699{
729 .name = "raid0", 700 .name = "raid0",
730 .level = 0, 701 .level = 0,
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 05539d9c97f..91f8e876ee6 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -1,19 +1,20 @@
1#ifndef _RAID0_H 1#ifndef _RAID0_H
2#define _RAID0_H 2#define _RAID0_H
3 3
4struct strip_zone { 4struct strip_zone
5{
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 6 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 7 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 8 int nb_dev; /* # of devices attached to the zone */
8}; 9};
9 10
10struct r0conf { 11struct raid0_private_data
11 struct strip_zone *strip_zone; 12{
12 struct md_rdev **devlist; /* lists of rdevs, pointed to 13 struct strip_zone *strip_zone;
13 * by strip_zone->dev */ 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
14 int nr_strip_zones; 15 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
17}; 16};
18 17
18typedef struct raid0_private_data raid0_conf_t;
19
19#endif 20#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010..606fc04fd76 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -34,45 +34,28 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/module.h>
38#include <linux/seq_file.h> 37#include <linux/seq_file.h>
39#include <linux/ratelimit.h> 38#include <linux/ratelimit.h>
40#include "md.h" 39#include "md.h"
41#include "raid1.h" 40#include "raid1.h"
42#include "bitmap.h" 41#include "bitmap.h"
43 42
43#define DEBUG 0
44#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
45
44/* 46/*
45 * Number of guaranteed r1bios in case of extreme VM load: 47 * Number of guaranteed r1bios in case of extreme VM load:
46 */ 48 */
47#define NR_RAID1_BIOS 256 49#define NR_RAID1_BIOS 256
48 50
49/* when we get a read error on a read-only array, we redirect to another
50 * device without failing the first device, or trying to over-write to
51 * correct the read error. To keep track of bad blocks on a per-bio
52 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
53 */
54#define IO_BLOCKED ((struct bio *)1)
55/* When we successfully write to a known bad-block, we need to remove the
56 * bad-block marking which must be done from process context. So we record
57 * the success by setting devs[n].bio to IO_MADE_GOOD
58 */
59#define IO_MADE_GOOD ((struct bio *)2)
60
61#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
62
63/* When there are this many requests queue to be written by
64 * the raid1 thread, we become 'congested' to provide back-pressure
65 * for writeback.
66 */
67static int max_queued_requests = 1024;
68 51
69static void allow_barrier(struct r1conf *conf); 52static void allow_barrier(conf_t *conf);
70static void lower_barrier(struct r1conf *conf); 53static void lower_barrier(conf_t *conf);
71 54
72static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 55static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
73{ 56{
74 struct pool_info *pi = data; 57 struct pool_info *pi = data;
75 int size = offsetof(struct r1bio, bios[pi->raid_disks]); 58 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
76 59
77 /* allocate a r1bio with room for raid_disks entries in the bios array */ 60 /* allocate a r1bio with room for raid_disks entries in the bios array */
78 return kzalloc(size, gfp_flags); 61 return kzalloc(size, gfp_flags);
@@ -93,7 +76,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
93{ 76{
94 struct pool_info *pi = data; 77 struct pool_info *pi = data;
95 struct page *page; 78 struct page *page;
96 struct r1bio *r1_bio; 79 r1bio_t *r1_bio;
97 struct bio *bio; 80 struct bio *bio;
98 int i, j; 81 int i, j;
99 82
@@ -149,7 +132,7 @@ out_free_pages:
149 put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); 132 put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
150 j = -1; 133 j = -1;
151out_free_bio: 134out_free_bio:
152 while (++j < pi->raid_disks) 135 while ( ++j < pi->raid_disks )
153 bio_put(r1_bio->bios[j]); 136 bio_put(r1_bio->bios[j]);
154 r1bio_pool_free(r1_bio, data); 137 r1bio_pool_free(r1_bio, data);
155 return NULL; 138 return NULL;
@@ -159,7 +142,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
159{ 142{
160 struct pool_info *pi = data; 143 struct pool_info *pi = data;
161 int i,j; 144 int i,j;
162 struct r1bio *r1bio = __r1_bio; 145 r1bio_t *r1bio = __r1_bio;
163 146
164 for (i = 0; i < RESYNC_PAGES; i++) 147 for (i = 0; i < RESYNC_PAGES; i++)
165 for (j = pi->raid_disks; j-- ;) { 148 for (j = pi->raid_disks; j-- ;) {
@@ -174,11 +157,11 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
174 r1bio_pool_free(r1bio, data); 157 r1bio_pool_free(r1bio, data);
175} 158}
176 159
177static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) 160static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
178{ 161{
179 int i; 162 int i;
180 163
181 for (i = 0; i < conf->raid_disks * 2; i++) { 164 for (i = 0; i < conf->raid_disks; i++) {
182 struct bio **bio = r1_bio->bios + i; 165 struct bio **bio = r1_bio->bios + i;
183 if (!BIO_SPECIAL(*bio)) 166 if (!BIO_SPECIAL(*bio))
184 bio_put(*bio); 167 bio_put(*bio);
@@ -186,20 +169,20 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
186 } 169 }
187} 170}
188 171
189static void free_r1bio(struct r1bio *r1_bio) 172static void free_r1bio(r1bio_t *r1_bio)
190{ 173{
191 struct r1conf *conf = r1_bio->mddev->private; 174 conf_t *conf = r1_bio->mddev->private;
192 175
193 put_all_bios(conf, r1_bio); 176 put_all_bios(conf, r1_bio);
194 mempool_free(r1_bio, conf->r1bio_pool); 177 mempool_free(r1_bio, conf->r1bio_pool);
195} 178}
196 179
197static void put_buf(struct r1bio *r1_bio) 180static void put_buf(r1bio_t *r1_bio)
198{ 181{
199 struct r1conf *conf = r1_bio->mddev->private; 182 conf_t *conf = r1_bio->mddev->private;
200 int i; 183 int i;
201 184
202 for (i = 0; i < conf->raid_disks * 2; i++) { 185 for (i=0; i<conf->raid_disks; i++) {
203 struct bio *bio = r1_bio->bios[i]; 186 struct bio *bio = r1_bio->bios[i];
204 if (bio->bi_end_io) 187 if (bio->bi_end_io)
205 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 188 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -210,11 +193,11 @@ static void put_buf(struct r1bio *r1_bio)
210 lower_barrier(conf); 193 lower_barrier(conf);
211} 194}
212 195
213static void reschedule_retry(struct r1bio *r1_bio) 196static void reschedule_retry(r1bio_t *r1_bio)
214{ 197{
215 unsigned long flags; 198 unsigned long flags;
216 struct mddev *mddev = r1_bio->mddev; 199 mddev_t *mddev = r1_bio->mddev;
217 struct r1conf *conf = mddev->private; 200 conf_t *conf = mddev->private;
218 201
219 spin_lock_irqsave(&conf->device_lock, flags); 202 spin_lock_irqsave(&conf->device_lock, flags);
220 list_add(&r1_bio->retry_list, &conf->retry_list); 203 list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -230,11 +213,11 @@ static void reschedule_retry(struct r1bio *r1_bio)
230 * operation and are ready to return a success/failure code to the buffer 213 * operation and are ready to return a success/failure code to the buffer
231 * cache layer. 214 * cache layer.
232 */ 215 */
233static void call_bio_endio(struct r1bio *r1_bio) 216static void call_bio_endio(r1bio_t *r1_bio)
234{ 217{
235 struct bio *bio = r1_bio->master_bio; 218 struct bio *bio = r1_bio->master_bio;
236 int done; 219 int done;
237 struct r1conf *conf = r1_bio->mddev->private; 220 conf_t *conf = r1_bio->mddev->private;
238 221
239 if (bio->bi_phys_segments) { 222 if (bio->bi_phys_segments) {
240 unsigned long flags; 223 unsigned long flags;
@@ -257,17 +240,17 @@ static void call_bio_endio(struct r1bio *r1_bio)
257 } 240 }
258} 241}
259 242
260static void raid_end_bio_io(struct r1bio *r1_bio) 243static void raid_end_bio_io(r1bio_t *r1_bio)
261{ 244{
262 struct bio *bio = r1_bio->master_bio; 245 struct bio *bio = r1_bio->master_bio;
263 246
264 /* if nobody has done the final endio yet, do it now */ 247 /* if nobody has done the final endio yet, do it now */
265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 248 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 249 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
267 (bio_data_dir(bio) == WRITE) ? "write" : "read", 250 (bio_data_dir(bio) == WRITE) ? "write" : "read",
268 (unsigned long long) bio->bi_sector, 251 (unsigned long long) bio->bi_sector,
269 (unsigned long long) bio->bi_sector + 252 (unsigned long long) bio->bi_sector +
270 (bio->bi_size >> 9) - 1); 253 (bio->bi_size >> 9) - 1);
271 254
272 call_bio_endio(r1_bio); 255 call_bio_endio(r1_bio);
273 } 256 }
@@ -277,39 +260,20 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
277/* 260/*
278 * Update disk head position estimator based on IRQ completion info. 261 * Update disk head position estimator based on IRQ completion info.
279 */ 262 */
280static inline void update_head_pos(int disk, struct r1bio *r1_bio) 263static inline void update_head_pos(int disk, r1bio_t *r1_bio)
281{ 264{
282 struct r1conf *conf = r1_bio->mddev->private; 265 conf_t *conf = r1_bio->mddev->private;
283 266
284 conf->mirrors[disk].head_position = 267 conf->mirrors[disk].head_position =
285 r1_bio->sector + (r1_bio->sectors); 268 r1_bio->sector + (r1_bio->sectors);
286} 269}
287 270
288/*
289 * Find the disk number which triggered given bio
290 */
291static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
292{
293 int mirror;
294 struct r1conf *conf = r1_bio->mddev->private;
295 int raid_disks = conf->raid_disks;
296
297 for (mirror = 0; mirror < raid_disks * 2; mirror++)
298 if (r1_bio->bios[mirror] == bio)
299 break;
300
301 BUG_ON(mirror == raid_disks * 2);
302 update_head_pos(mirror, r1_bio);
303
304 return mirror;
305}
306
307static void raid1_end_read_request(struct bio *bio, int error) 271static void raid1_end_read_request(struct bio *bio, int error)
308{ 272{
309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 273 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 struct r1bio *r1_bio = bio->bi_private; 274 r1bio_t *r1_bio = bio->bi_private;
311 int mirror; 275 int mirror;
312 struct r1conf *conf = r1_bio->mddev->private; 276 conf_t *conf = r1_bio->mddev->private;
313 277
314 mirror = r1_bio->read_disk; 278 mirror = r1_bio->read_disk;
315 /* 279 /*
@@ -333,10 +297,9 @@ static void raid1_end_read_request(struct bio *bio, int error)
333 spin_unlock_irqrestore(&conf->device_lock, flags); 297 spin_unlock_irqrestore(&conf->device_lock, flags);
334 } 298 }
335 299
336 if (uptodate) { 300 if (uptodate)
337 raid_end_bio_io(r1_bio); 301 raid_end_bio_io(r1_bio);
338 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 302 else {
339 } else {
340 /* 303 /*
341 * oops, read error: 304 * oops, read error:
342 */ 305 */
@@ -350,11 +313,12 @@ static void raid1_end_read_request(struct bio *bio, int error)
350 (unsigned long long)r1_bio->sector); 313 (unsigned long long)r1_bio->sector);
351 set_bit(R1BIO_ReadError, &r1_bio->state); 314 set_bit(R1BIO_ReadError, &r1_bio->state);
352 reschedule_retry(r1_bio); 315 reschedule_retry(r1_bio);
353 /* don't drop the reference on read_disk yet */
354 } 316 }
317
318 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
355} 319}
356 320
357static void close_write(struct r1bio *r1_bio) 321static void close_write(r1bio_t *r1_bio)
358{ 322{
359 /* it really is the end of this request */ 323 /* it really is the end of this request */
360 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 324 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
@@ -373,7 +337,7 @@ static void close_write(struct r1bio *r1_bio)
373 md_write_end(r1_bio->mddev); 337 md_write_end(r1_bio->mddev);
374} 338}
375 339
376static void r1_bio_write_done(struct r1bio *r1_bio) 340static void r1_bio_write_done(r1bio_t *r1_bio)
377{ 341{
378 if (!atomic_dec_and_test(&r1_bio->remaining)) 342 if (!atomic_dec_and_test(&r1_bio->remaining))
379 return; 343 return;
@@ -392,12 +356,15 @@ static void r1_bio_write_done(struct r1bio *r1_bio)
392static void raid1_end_write_request(struct bio *bio, int error) 356static void raid1_end_write_request(struct bio *bio, int error)
393{ 357{
394 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 358 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
395 struct r1bio *r1_bio = bio->bi_private; 359 r1bio_t *r1_bio = bio->bi_private;
396 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 360 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
397 struct r1conf *conf = r1_bio->mddev->private; 361 conf_t *conf = r1_bio->mddev->private;
398 struct bio *to_put = NULL; 362 struct bio *to_put = NULL;
399 363
400 mirror = find_bio_disk(r1_bio, bio); 364
365 for (mirror = 0; mirror < conf->raid_disks; mirror++)
366 if (r1_bio->bios[mirror] == bio)
367 break;
401 368
402 /* 369 /*
403 * 'one mirror IO has finished' event handler: 370 * 'one mirror IO has finished' event handler:
@@ -405,11 +372,6 @@ static void raid1_end_write_request(struct bio *bio, int error)
405 if (!uptodate) { 372 if (!uptodate) {
406 set_bit(WriteErrorSeen, 373 set_bit(WriteErrorSeen,
407 &conf->mirrors[mirror].rdev->flags); 374 &conf->mirrors[mirror].rdev->flags);
408 if (!test_and_set_bit(WantReplacement,
409 &conf->mirrors[mirror].rdev->flags))
410 set_bit(MD_RECOVERY_NEEDED, &
411 conf->mddev->recovery);
412
413 set_bit(R1BIO_WriteError, &r1_bio->state); 375 set_bit(R1BIO_WriteError, &r1_bio->state);
414 } else { 376 } else {
415 /* 377 /*
@@ -438,6 +400,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
438 } 400 }
439 } 401 }
440 402
403 update_head_pos(mirror, r1_bio);
404
441 if (behind) { 405 if (behind) {
442 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 406 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
443 atomic_dec(&r1_bio->behind_remaining); 407 atomic_dec(&r1_bio->behind_remaining);
@@ -454,11 +418,10 @@ static void raid1_end_write_request(struct bio *bio, int error)
454 /* Maybe we can return now */ 418 /* Maybe we can return now */
455 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 419 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
456 struct bio *mbio = r1_bio->master_bio; 420 struct bio *mbio = r1_bio->master_bio;
457 pr_debug("raid1: behind end write sectors" 421 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
458 " %llu-%llu\n", 422 (unsigned long long) mbio->bi_sector,
459 (unsigned long long) mbio->bi_sector, 423 (unsigned long long) mbio->bi_sector +
460 (unsigned long long) mbio->bi_sector + 424 (mbio->bi_size >> 9) - 1);
461 (mbio->bi_size >> 9) - 1);
462 call_bio_endio(r1_bio); 425 call_bio_endio(r1_bio);
463 } 426 }
464 } 427 }
@@ -492,19 +455,17 @@ static void raid1_end_write_request(struct bio *bio, int error)
492 * 455 *
493 * The rdev for the device selected will have nr_pending incremented. 456 * The rdev for the device selected will have nr_pending incremented.
494 */ 457 */
495static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) 458static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
496{ 459{
497 const sector_t this_sector = r1_bio->sector; 460 const sector_t this_sector = r1_bio->sector;
498 int sectors; 461 int sectors;
499 int best_good_sectors; 462 int best_good_sectors;
500 int best_disk, best_dist_disk, best_pending_disk; 463 int start_disk;
501 int has_nonrot_disk; 464 int best_disk;
502 int disk; 465 int i;
503 sector_t best_dist; 466 sector_t best_dist;
504 unsigned int min_pending; 467 mdk_rdev_t *rdev;
505 struct md_rdev *rdev;
506 int choose_first; 468 int choose_first;
507 int choose_next_idle;
508 469
509 rcu_read_lock(); 470 rcu_read_lock();
510 /* 471 /*
@@ -515,31 +476,30 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
515 retry: 476 retry:
516 sectors = r1_bio->sectors; 477 sectors = r1_bio->sectors;
517 best_disk = -1; 478 best_disk = -1;
518 best_dist_disk = -1;
519 best_dist = MaxSector; 479 best_dist = MaxSector;
520 best_pending_disk = -1;
521 min_pending = UINT_MAX;
522 best_good_sectors = 0; 480 best_good_sectors = 0;
523 has_nonrot_disk = 0;
524 choose_next_idle = 0;
525 481
526 if (conf->mddev->recovery_cp < MaxSector && 482 if (conf->mddev->recovery_cp < MaxSector &&
527 (this_sector + sectors >= conf->next_resync)) 483 (this_sector + sectors >= conf->next_resync)) {
528 choose_first = 1; 484 choose_first = 1;
529 else 485 start_disk = 0;
486 } else {
530 choose_first = 0; 487 choose_first = 0;
488 start_disk = conf->last_used;
489 }
531 490
532 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 491 for (i = 0 ; i < conf->raid_disks ; i++) {
533 sector_t dist; 492 sector_t dist;
534 sector_t first_bad; 493 sector_t first_bad;
535 int bad_sectors; 494 int bad_sectors;
536 unsigned int pending; 495
537 bool nonrot; 496 int disk = start_disk + i;
497 if (disk >= conf->raid_disks)
498 disk -= conf->raid_disks;
538 499
539 rdev = rcu_dereference(conf->mirrors[disk].rdev); 500 rdev = rcu_dereference(conf->mirrors[disk].rdev);
540 if (r1_bio->bios[disk] == IO_BLOCKED 501 if (r1_bio->bios[disk] == IO_BLOCKED
541 || rdev == NULL 502 || rdev == NULL
542 || test_bit(Unmerged, &rdev->flags)
543 || test_bit(Faulty, &rdev->flags)) 503 || test_bit(Faulty, &rdev->flags))
544 continue; 504 continue;
545 if (!test_bit(In_sync, &rdev->flags) && 505 if (!test_bit(In_sync, &rdev->flags) &&
@@ -593,77 +553,22 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
593 } else 553 } else
594 best_good_sectors = sectors; 554 best_good_sectors = sectors;
595 555
596 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
597 has_nonrot_disk |= nonrot;
598 pending = atomic_read(&rdev->nr_pending);
599 dist = abs(this_sector - conf->mirrors[disk].head_position); 556 dist = abs(this_sector - conf->mirrors[disk].head_position);
600 if (choose_first) { 557 if (choose_first
601 best_disk = disk; 558 /* Don't change to another disk for sequential reads */
602 break; 559 || conf->next_seq_sect == this_sector
603 } 560 || dist == 0
604 /* Don't change to another disk for sequential reads */ 561 /* If device is idle, use it */
605 if (conf->mirrors[disk].next_seq_sect == this_sector 562 || atomic_read(&rdev->nr_pending) == 0) {
606 || dist == 0) {
607 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
608 struct raid1_info *mirror = &conf->mirrors[disk];
609
610 best_disk = disk; 563 best_disk = disk;
611 /*
612 * If the buffered sequential IO size exceeds the optimal
613 * iosize, check whether there is an idle disk and, if so,
614 * choose it. read_balance could already have chosen an
615 * idle disk before noticing that this is sequential IO on
616 * this disk. That doesn't matter: this disk will go idle
617 * and be used again once the first disk's IO size exceeds
618 * the optimal iosize. This way the first disk's iosize is
619 * at least the optimal iosize. The second disk's iosize
620 * might be small, but that's not a big deal since, by the
621 * time the second disk starts IO, the first disk is
622 * likely still busy.
623 */
624 if (nonrot && opt_iosize > 0 &&
625 mirror->seq_start != MaxSector &&
626 mirror->next_seq_sect > opt_iosize &&
627 mirror->next_seq_sect - opt_iosize >=
628 mirror->seq_start) {
629 choose_next_idle = 1;
630 continue;
631 }
632 break; 564 break;
633 } 565 }
634 /* If device is idle, use it */
635 if (pending == 0) {
636 best_disk = disk;
637 break;
638 }
639
640 if (choose_next_idle)
641 continue;
642
643 if (min_pending > pending) {
644 min_pending = pending;
645 best_pending_disk = disk;
646 }
647
648 if (dist < best_dist) { 566 if (dist < best_dist) {
649 best_dist = dist; 567 best_dist = dist;
650 best_dist_disk = disk; 568 best_disk = disk;
651 } 569 }
652 } 570 }
653 571
654 /*
655 * If all disks are rotational, choose the closest disk. If any disk is
656 * non-rotational, choose the disk with the fewest pending requests even if
657 * that disk is rotational, which may or may not be optimal for arrays with
658 * mixed rotational/non-rotational disks depending on workload.
659 */
660 if (best_disk == -1) {
661 if (has_nonrot_disk)
662 best_disk = best_pending_disk;
663 else
664 best_disk = best_dist_disk;
665 }
666
667 if (best_disk >= 0) { 572 if (best_disk >= 0) {
668 rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 573 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
669 if (!rdev) 574 if (!rdev)
@@ -677,11 +582,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
677 goto retry; 582 goto retry;
678 } 583 }
679 sectors = best_good_sectors; 584 sectors = best_good_sectors;
680 585 conf->next_seq_sect = this_sector + sectors;
681 if (conf->mirrors[best_disk].next_seq_sect != this_sector) 586 conf->last_used = best_disk;
682 conf->mirrors[best_disk].seq_start = this_sector;
683
684 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
685 } 587 }
686 rcu_read_unlock(); 588 rcu_read_unlock();
687 *max_sectors = sectors; 589 *max_sectors = sectors;
@@ -689,51 +591,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
689 return best_disk; 591 return best_disk;
690} 592}
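
For reference, the selection loop above reduces to a few rules: stay on the same disk for a strictly sequential read, grab an idle disk if one exists, otherwise take the shortest seek distance, and (on the newer side of this hunk) prefer the least-loaded member once any non-rotational device is present. The following is a minimal userspace sketch of that heuristic only; struct mirror_state, pick_read_mirror() and every field are invented for illustration and are not kernel API. The bad-block handling and the choose_next_idle/optimal-iosize refinement removed here are deliberately left out.

    #include <limits.h>
    #include <stdint.h>

    struct mirror_state {
        int      present;        /* device exists and is usable          */
        int      nonrot;         /* SSD-like, no seek penalty            */
        uint64_t head_position;  /* sector where the last request ended  */
        uint64_t next_seq_sect;  /* expected start of a sequential read  */
        unsigned pending;        /* requests currently in flight         */
    };

    /* Pick a mirror for a read at 'sector'. Returns an index or -1. */
    static int pick_read_mirror(const struct mirror_state *m, int nmirrors,
                                uint64_t sector)
    {
        int best_dist_disk = -1, best_pending_disk = -1;
        uint64_t best_dist = UINT64_MAX;
        unsigned min_pending = UINT_MAX;
        int has_nonrot = 0;

        for (int i = 0; i < nmirrors; i++) {
            uint64_t dist;

            if (!m[i].present)
                continue;
            /* sequential continuation or a completely idle disk wins */
            if (m[i].next_seq_sect == sector || m[i].pending == 0)
                return i;

            has_nonrot |= m[i].nonrot;
            dist = sector > m[i].head_position ?
                   sector - m[i].head_position :
                   m[i].head_position - sector;
            if (dist < best_dist) {
                best_dist = dist;
                best_dist_disk = i;
            }
            if (m[i].pending < min_pending) {
                min_pending = m[i].pending;
                best_pending_disk = i;
            }
        }
        /* any SSD present: balance by load; all rotational: by distance */
        return has_nonrot ? best_pending_disk : best_dist_disk;
    }
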
691 593
692static int raid1_mergeable_bvec(struct request_queue *q, 594int md_raid1_congested(mddev_t *mddev, int bits)
693 struct bvec_merge_data *bvm,
694 struct bio_vec *biovec)
695{
696 struct mddev *mddev = q->queuedata;
697 struct r1conf *conf = mddev->private;
698 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
699 int max = biovec->bv_len;
700
701 if (mddev->merge_check_needed) {
702 int disk;
703 rcu_read_lock();
704 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
705 struct md_rdev *rdev = rcu_dereference(
706 conf->mirrors[disk].rdev);
707 if (rdev && !test_bit(Faulty, &rdev->flags)) {
708 struct request_queue *q =
709 bdev_get_queue(rdev->bdev);
710 if (q->merge_bvec_fn) {
711 bvm->bi_sector = sector +
712 rdev->data_offset;
713 bvm->bi_bdev = rdev->bdev;
714 max = min(max, q->merge_bvec_fn(
715 q, bvm, biovec));
716 }
717 }
718 }
719 rcu_read_unlock();
720 }
721 return max;
722
723}
724
725int md_raid1_congested(struct mddev *mddev, int bits)
726{ 595{
727 struct r1conf *conf = mddev->private; 596 conf_t *conf = mddev->private;
728 int i, ret = 0; 597 int i, ret = 0;
729 598
730 if ((bits & (1 << BDI_async_congested)) &&
731 conf->pending_count >= max_queued_requests)
732 return 1;
733
734 rcu_read_lock(); 599 rcu_read_lock();
735 for (i = 0; i < conf->raid_disks * 2; i++) { 600 for (i = 0; i < mddev->raid_disks; i++) {
736 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 601 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
737 if (rdev && !test_bit(Faulty, &rdev->flags)) { 602 if (rdev && !test_bit(Faulty, &rdev->flags)) {
738 struct request_queue *q = bdev_get_queue(rdev->bdev); 603 struct request_queue *q = bdev_get_queue(rdev->bdev);
739 604
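
md_raid1_congested() above polls each member's backing queue and folds the results into one answer. One way to think about congestion for a mirror set: writes go to every member, so any congested member counts; a read can be served by any single member, so only an all-congested set should block reads. A small sketch of that aggregation with invented names; the kernel function is more conservative and may treat reads like writes.

    #include <stdbool.h>

    /* One flag per member, as a lower-level congestion query would report. */
    struct member { bool congested; bool usable; };

    static bool raid1_set_congested(const struct member *m, int n, bool for_write)
    {
        bool ret = for_write ? false : true;

        for (int i = 0; i < n; i++) {
            if (!m[i].usable)
                continue;
            if (for_write)
                ret |= m[i].congested;   /* any congested member blocks writes */
            else
                ret &= m[i].congested;   /* one free member is enough for reads */
        }
        return ret;
    }
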
@@ -755,13 +620,13 @@ EXPORT_SYMBOL_GPL(md_raid1_congested);
755 620
756static int raid1_congested(void *data, int bits) 621static int raid1_congested(void *data, int bits)
757{ 622{
758 struct mddev *mddev = data; 623 mddev_t *mddev = data;
759 624
760 return mddev_congested(mddev, bits) || 625 return mddev_congested(mddev, bits) ||
761 md_raid1_congested(mddev, bits); 626 md_raid1_congested(mddev, bits);
762} 627}
763 628
764static void flush_pending_writes(struct r1conf *conf) 629static void flush_pending_writes(conf_t *conf)
765{ 630{
766 /* Any writes that have been queued but are awaiting 631 /* Any writes that have been queued but are awaiting
767 * bitmap updates get flushed here. 632 * bitmap updates get flushed here.
@@ -771,22 +636,15 @@ static void flush_pending_writes(struct r1conf *conf)
771 if (conf->pending_bio_list.head) { 636 if (conf->pending_bio_list.head) {
772 struct bio *bio; 637 struct bio *bio;
773 bio = bio_list_get(&conf->pending_bio_list); 638 bio = bio_list_get(&conf->pending_bio_list);
774 conf->pending_count = 0;
775 spin_unlock_irq(&conf->device_lock); 639 spin_unlock_irq(&conf->device_lock);
776 /* flush any pending bitmap writes to 640 /* flush any pending bitmap writes to
777 * disk before proceeding w/ I/O */ 641 * disk before proceeding w/ I/O */
778 bitmap_unplug(conf->mddev->bitmap); 642 bitmap_unplug(conf->mddev->bitmap);
779 wake_up(&conf->wait_barrier);
780 643
781 while (bio) { /* submit pending writes */ 644 while (bio) { /* submit pending writes */
782 struct bio *next = bio->bi_next; 645 struct bio *next = bio->bi_next;
783 bio->bi_next = NULL; 646 bio->bi_next = NULL;
784 if (unlikely((bio->bi_rw & REQ_DISCARD) && 647 generic_make_request(bio);
785 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
786 /* Just ignore it */
787 bio_endio(bio, 0);
788 else
789 generic_make_request(bio);
790 bio = next; 648 bio = next;
791 } 649 }
792 } else 650 } else
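
flush_pending_writes() is an instance of a common pattern: detach the entire queued list while holding the lock, release the lock, then walk the detached list and submit each entry. A small pthread-based sketch of the same pattern, with all names invented:

    #include <pthread.h>
    #include <stddef.h>

    struct work { struct work *next; void (*fn)(struct work *); };

    static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct work *pending_head;   /* singly linked list of queued work */

    static void flush_pending(void)
    {
        struct work *w;

        pthread_mutex_lock(&pending_lock);
        w = pending_head;                /* steal the whole list...          */
        pending_head = NULL;
        pthread_mutex_unlock(&pending_lock);

        while (w) {                      /* ...and run it with the lock free */
            struct work *next = w->next;
            w->next = NULL;
            w->fn(w);
            w = next;
        }
    }

Detaching the whole list keeps the critical section short and lets the submissions run without the lock held, which is why the kernel code drops device_lock before its while loop.
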
@@ -816,13 +674,13 @@ static void flush_pending_writes(struct r1conf *conf)
816 */ 674 */
817#define RESYNC_DEPTH 32 675#define RESYNC_DEPTH 32
818 676
819static void raise_barrier(struct r1conf *conf) 677static void raise_barrier(conf_t *conf)
820{ 678{
821 spin_lock_irq(&conf->resync_lock); 679 spin_lock_irq(&conf->resync_lock);
822 680
823 /* Wait until no block IO is waiting */ 681 /* Wait until no block IO is waiting */
824 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 682 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
825 conf->resync_lock); 683 conf->resync_lock, );
826 684
827 /* block any new IO from starting */ 685 /* block any new IO from starting */
828 conf->barrier++; 686 conf->barrier++;
@@ -830,12 +688,12 @@ static void raise_barrier(struct r1conf *conf)
830 /* Now wait for all pending IO to complete */ 688 /* Now wait for all pending IO to complete */
831 wait_event_lock_irq(conf->wait_barrier, 689 wait_event_lock_irq(conf->wait_barrier,
832 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 690 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
833 conf->resync_lock); 691 conf->resync_lock, );
834 692
835 spin_unlock_irq(&conf->resync_lock); 693 spin_unlock_irq(&conf->resync_lock);
836} 694}
837 695
838static void lower_barrier(struct r1conf *conf) 696static void lower_barrier(conf_t *conf)
839{ 697{
840 unsigned long flags; 698 unsigned long flags;
841 BUG_ON(conf->barrier <= 0); 699 BUG_ON(conf->barrier <= 0);
@@ -845,33 +703,21 @@ static void lower_barrier(struct r1conf *conf)
845 wake_up(&conf->wait_barrier); 703 wake_up(&conf->wait_barrier);
846} 704}
847 705
848static void wait_barrier(struct r1conf *conf) 706static void wait_barrier(conf_t *conf)
849{ 707{
850 spin_lock_irq(&conf->resync_lock); 708 spin_lock_irq(&conf->resync_lock);
851 if (conf->barrier) { 709 if (conf->barrier) {
852 conf->nr_waiting++; 710 conf->nr_waiting++;
853 /* Wait for the barrier to drop. 711 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
854 * However if there are already pending 712 conf->resync_lock,
855 * requests (preventing the barrier from 713 );
856 * rising completely), and the
857 * pre-process bio queue isn't empty,
858 * then don't wait, as we need to empty
859 * that queue to get the nr_pending
860 * count down.
861 */
862 wait_event_lock_irq(conf->wait_barrier,
863 !conf->barrier ||
864 (conf->nr_pending &&
865 current->bio_list &&
866 !bio_list_empty(current->bio_list)),
867 conf->resync_lock);
868 conf->nr_waiting--; 714 conf->nr_waiting--;
869 } 715 }
870 conf->nr_pending++; 716 conf->nr_pending++;
871 spin_unlock_irq(&conf->resync_lock); 717 spin_unlock_irq(&conf->resync_lock);
872} 718}
873 719
874static void allow_barrier(struct r1conf *conf) 720static void allow_barrier(conf_t *conf)
875{ 721{
876 unsigned long flags; 722 unsigned long flags;
877 spin_lock_irqsave(&conf->resync_lock, flags); 723 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -880,7 +726,7 @@ static void allow_barrier(struct r1conf *conf)
880 wake_up(&conf->wait_barrier); 726 wake_up(&conf->wait_barrier);
881} 727}
882 728
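
The raise_barrier()/wait_barrier()/allow_barrier() trio is a gate between resync and normal I/O: resync raises the barrier and waits for in-flight requests to drain, while normal I/O waits whenever the barrier is up and otherwise counts itself in and out. A condensed pthread sketch of that protocol follows; names are invented, and the nr_waiting bookkeeping and RESYNC_DEPTH throttling of the real code are omitted.

    #include <pthread.h>

    static pthread_mutex_t resync_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wait_barrier_cv = PTHREAD_COND_INITIALIZER;
    static int barrier;      /* number of active resync barriers     */
    static int nr_pending;   /* normal I/O requests currently active */

    static void raise_barrier(void)
    {
        pthread_mutex_lock(&resync_lock);
        barrier++;                            /* block new normal I/O */
        while (nr_pending > 0)                /* wait for it to drain */
            pthread_cond_wait(&wait_barrier_cv, &resync_lock);
        pthread_mutex_unlock(&resync_lock);
    }

    static void lower_barrier(void)
    {
        pthread_mutex_lock(&resync_lock);
        barrier--;
        pthread_cond_broadcast(&wait_barrier_cv);
        pthread_mutex_unlock(&resync_lock);
    }

    static void wait_barrier(void)            /* called before normal I/O */
    {
        pthread_mutex_lock(&resync_lock);
        while (barrier > 0)
            pthread_cond_wait(&wait_barrier_cv, &resync_lock);
        nr_pending++;
        pthread_mutex_unlock(&resync_lock);
    }

    static void allow_barrier(void)           /* called when the I/O ends */
    {
        pthread_mutex_lock(&resync_lock);
        nr_pending--;
        pthread_cond_broadcast(&wait_barrier_cv);
        pthread_mutex_unlock(&resync_lock);
    }
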
883static void freeze_array(struct r1conf *conf) 729static void freeze_array(conf_t *conf)
884{ 730{
885 /* stop syncio and normal IO and wait for everything to 731 /* stop syncio and normal IO and wait for everything to
886 * go quiet. 732 * go quiet.
@@ -897,13 +743,13 @@ static void freeze_array(struct r1conf *conf)
897 spin_lock_irq(&conf->resync_lock); 743 spin_lock_irq(&conf->resync_lock);
898 conf->barrier++; 744 conf->barrier++;
899 conf->nr_waiting++; 745 conf->nr_waiting++;
900 wait_event_lock_irq_cmd(conf->wait_barrier, 746 wait_event_lock_irq(conf->wait_barrier,
901 conf->nr_pending == conf->nr_queued+1, 747 conf->nr_pending == conf->nr_queued+1,
902 conf->resync_lock, 748 conf->resync_lock,
903 flush_pending_writes(conf)); 749 flush_pending_writes(conf));
904 spin_unlock_irq(&conf->resync_lock); 750 spin_unlock_irq(&conf->resync_lock);
905} 751}
906static void unfreeze_array(struct r1conf *conf) 752static void unfreeze_array(conf_t *conf)
907{ 753{
908 /* reverse the effect of the freeze */ 754 /* reverse the effect of the freeze */
909 spin_lock_irq(&conf->resync_lock); 755 spin_lock_irq(&conf->resync_lock);
@@ -916,7 +762,7 @@ static void unfreeze_array(struct r1conf *conf)
916 762
917/* duplicate the data pages for behind I/O 763/* duplicate the data pages for behind I/O
918 */ 764 */
919static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) 765static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
920{ 766{
921 int i; 767 int i;
922 struct bio_vec *bvec; 768 struct bio_vec *bvec;
@@ -945,52 +791,14 @@ do_sync_io:
945 if (bvecs[i].bv_page) 791 if (bvecs[i].bv_page)
946 put_page(bvecs[i].bv_page); 792 put_page(bvecs[i].bv_page);
947 kfree(bvecs); 793 kfree(bvecs);
948 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 794 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
949}
950
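
alloc_behind_pages() copies the write payload into private pages so the original bio can complete while the write-behind copy drains to write-mostly members; if any allocation fails, everything is freed and the caller falls back to ordinary synchronous writes (the do_sync_io path above). A rough userspace analogue, with invented names:

    #include <stdlib.h>
    #include <string.h>

    /* Duplicate 'nvecs' payload buffers of len[i] bytes each. Returns an
     * array of copies, or NULL if any allocation fails (the caller then
     * writes synchronously from the original buffers instead). */
    static void **dup_behind_pages(void *const *vecs, const size_t *len, int nvecs)
    {
        void **copy = calloc(nvecs, sizeof(*copy));
        if (!copy)
            return NULL;

        for (int i = 0; i < nvecs; i++) {
            copy[i] = malloc(len[i]);
            if (!copy[i])
                goto do_sync_io;          /* mirror the fallback above */
            memcpy(copy[i], vecs[i], len[i]);
        }
        return copy;

    do_sync_io:
        for (int i = 0; i < nvecs; i++)
            free(copy[i]);                /* free(NULL) is harmless */
        free(copy);
        return NULL;
    }
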
951struct raid1_plug_cb {
952 struct blk_plug_cb cb;
953 struct bio_list pending;
954 int pending_cnt;
955};
956
957static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
958{
959 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
960 cb);
961 struct mddev *mddev = plug->cb.data;
962 struct r1conf *conf = mddev->private;
963 struct bio *bio;
964
965 if (from_schedule || current->bio_list) {
966 spin_lock_irq(&conf->device_lock);
967 bio_list_merge(&conf->pending_bio_list, &plug->pending);
968 conf->pending_count += plug->pending_cnt;
969 spin_unlock_irq(&conf->device_lock);
970 md_wakeup_thread(mddev->thread);
971 kfree(plug);
972 return;
973 }
974
975 /* we aren't scheduling, so we can do the write-out directly. */
976 bio = bio_list_get(&plug->pending);
977 bitmap_unplug(mddev->bitmap);
978 wake_up(&conf->wait_barrier);
979
980 while (bio) { /* submit pending writes */
981 struct bio *next = bio->bi_next;
982 bio->bi_next = NULL;
983 generic_make_request(bio);
984 bio = next;
985 }
986 kfree(plug);
987} 795}
988 796
989static void make_request(struct mddev *mddev, struct bio * bio) 797static int make_request(mddev_t *mddev, struct bio * bio)
990{ 798{
991 struct r1conf *conf = mddev->private; 799 conf_t *conf = mddev->private;
992 struct raid1_info *mirror; 800 mirror_info_t *mirror;
993 struct r1bio *r1_bio; 801 r1bio_t *r1_bio;
994 struct bio *read_bio; 802 struct bio *read_bio;
995 int i, disks; 803 int i, disks;
996 struct bitmap *bitmap; 804 struct bitmap *bitmap;
@@ -998,11 +806,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
998 const int rw = bio_data_dir(bio); 806 const int rw = bio_data_dir(bio);
999 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 807 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1000 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 808 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
1001 const unsigned long do_discard = (bio->bi_rw 809 mdk_rdev_t *blocked_rdev;
1002 & (REQ_DISCARD | REQ_SECURE)); 810 int plugged;
1003 struct md_rdev *blocked_rdev;
1004 struct blk_plug_cb *cb;
1005 struct raid1_plug_cb *plug = NULL;
1006 int first_clone; 811 int first_clone;
1007 int sectors_handled; 812 int sectors_handled;
1008 int max_sectors; 813 int max_sectors;
@@ -1074,7 +879,7 @@ read_again:
1074 if (rdisk < 0) { 879 if (rdisk < 0) {
1075 /* couldn't find anywhere to read from */ 880 /* couldn't find anywhere to read from */
1076 raid_end_bio_io(r1_bio); 881 raid_end_bio_io(r1_bio);
1077 return; 882 return 0;
1078 } 883 }
1079 mirror = conf->mirrors + rdisk; 884 mirror = conf->mirrors + rdisk;
1080 885
@@ -1132,17 +937,12 @@ read_again:
1132 goto read_again; 937 goto read_again;
1133 } else 938 } else
1134 generic_make_request(read_bio); 939 generic_make_request(read_bio);
1135 return; 940 return 0;
1136 } 941 }
1137 942
1138 /* 943 /*
1139 * WRITE: 944 * WRITE:
1140 */ 945 */
1141 if (conf->pending_count >= max_queued_requests) {
1142 md_wakeup_thread(mddev->thread);
1143 wait_event(conf->wait_barrier,
1144 conf->pending_count < max_queued_requests);
1145 }
1146 /* first select target devices under rcu_lock and 946 /* first select target devices under rcu_lock and
1147 * inc refcount on their rdev. Record them by setting 947 * inc refcount on their rdev. Record them by setting
1148 * bios[x] to bio 948 * bios[x] to bio
@@ -1153,24 +953,23 @@ read_again:
1153 * the bad blocks. Each set of writes gets it's own r1bio 953 * the bad blocks. Each set of writes gets it's own r1bio
1154 * with a set of bios attached. 954 * with a set of bios attached.
1155 */ 955 */
956 plugged = mddev_check_plugged(mddev);
1156 957
1157 disks = conf->raid_disks * 2; 958 disks = conf->raid_disks;
1158 retry_write: 959 retry_write:
1159 blocked_rdev = NULL; 960 blocked_rdev = NULL;
1160 rcu_read_lock(); 961 rcu_read_lock();
1161 max_sectors = r1_bio->sectors; 962 max_sectors = r1_bio->sectors;
1162 for (i = 0; i < disks; i++) { 963 for (i = 0; i < disks; i++) {
1163 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 964 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1164 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 965 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1165 atomic_inc(&rdev->nr_pending); 966 atomic_inc(&rdev->nr_pending);
1166 blocked_rdev = rdev; 967 blocked_rdev = rdev;
1167 break; 968 break;
1168 } 969 }
1169 r1_bio->bios[i] = NULL; 970 r1_bio->bios[i] = NULL;
1170 if (!rdev || test_bit(Faulty, &rdev->flags) 971 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1171 || test_bit(Unmerged, &rdev->flags)) { 972 set_bit(R1BIO_Degraded, &r1_bio->state);
1172 if (i < conf->raid_disks)
1173 set_bit(R1BIO_Degraded, &r1_bio->state);
1174 continue; 973 continue;
1175 } 974 }
1176 975
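
In outline, the write path walks every mirror slot under the RCU read lock, records the usable targets, marks the r1bio degraded for any missing or faulty slot, and, if it meets a device flagged Blocked, drops what it has and redoes the selection once the device is unblocked. A plain sketch of that control flow; the enum and function are invented for illustration.

    #include <stdbool.h>

    enum slot_state { SLOT_EMPTY, SLOT_FAULTY, SLOT_BLOCKED, SLOT_OK };

    /* Fill use[i] with the slots a write must go to. Returns false if a
     * blocked device was hit, in which case the caller waits for it to
     * clear and retries the selection from scratch. */
    static bool select_write_targets(const enum slot_state *slot, int nslots,
                                     bool *use, bool *degraded)
    {
        *degraded = false;
        for (int i = 0; i < nslots; i++) {
            use[i] = false;
            if (slot[i] == SLOT_BLOCKED)
                return false;             /* drop everything and retry */
            if (slot[i] == SLOT_EMPTY || slot[i] == SLOT_FAULTY) {
                *degraded = true;         /* the write proceeds without it */
                continue;
            }
            use[i] = true;
        }
        return true;
    }

A caller would loop on select_write_targets() until it returns true, which roughly mirrors the retry_write label above.
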
@@ -1301,27 +1100,13 @@ read_again:
1301 conf->mirrors[i].rdev->data_offset); 1100 conf->mirrors[i].rdev->data_offset);
1302 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1101 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1303 mbio->bi_end_io = raid1_end_write_request; 1102 mbio->bi_end_io = raid1_end_write_request;
1304 mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; 1103 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1305 mbio->bi_private = r1_bio; 1104 mbio->bi_private = r1_bio;
1306 1105
1307 atomic_inc(&r1_bio->remaining); 1106 atomic_inc(&r1_bio->remaining);
1308
1309 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
1310 if (cb)
1311 plug = container_of(cb, struct raid1_plug_cb, cb);
1312 else
1313 plug = NULL;
1314 spin_lock_irqsave(&conf->device_lock, flags); 1107 spin_lock_irqsave(&conf->device_lock, flags);
1315 if (plug) { 1108 bio_list_add(&conf->pending_bio_list, mbio);
1316 bio_list_add(&plug->pending, mbio);
1317 plug->pending_cnt++;
1318 } else {
1319 bio_list_add(&conf->pending_bio_list, mbio);
1320 conf->pending_count++;
1321 }
1322 spin_unlock_irqrestore(&conf->device_lock, flags); 1109 spin_unlock_irqrestore(&conf->device_lock, flags);
1323 if (!plug)
1324 md_wakeup_thread(mddev->thread);
1325 } 1110 }
1326 /* Mustn't call r1_bio_write_done before this next test, 1111 /* Mustn't call r1_bio_write_done before this next test,
1327 * as it could result in the bio being freed. 1112 * as it could result in the bio being freed.
@@ -1344,18 +1129,23 @@ read_again:
1344 1129
1345 /* In case raid1d snuck in to freeze_array */ 1130 /* In case raid1d snuck in to freeze_array */
1346 wake_up(&conf->wait_barrier); 1131 wake_up(&conf->wait_barrier);
1132
1133 if (do_sync || !bitmap || !plugged)
1134 md_wakeup_thread(mddev->thread);
1135
1136 return 0;
1347} 1137}
1348 1138
1349static void status(struct seq_file *seq, struct mddev *mddev) 1139static void status(struct seq_file *seq, mddev_t *mddev)
1350{ 1140{
1351 struct r1conf *conf = mddev->private; 1141 conf_t *conf = mddev->private;
1352 int i; 1142 int i;
1353 1143
1354 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 1144 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1355 conf->raid_disks - mddev->degraded); 1145 conf->raid_disks - mddev->degraded);
1356 rcu_read_lock(); 1146 rcu_read_lock();
1357 for (i = 0; i < conf->raid_disks; i++) { 1147 for (i = 0; i < conf->raid_disks; i++) {
1358 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1148 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1359 seq_printf(seq, "%s", 1149 seq_printf(seq, "%s",
1360 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1150 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1361 } 1151 }
@@ -1364,10 +1154,10 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1364} 1154}
1365 1155
1366 1156
1367static void error(struct mddev *mddev, struct md_rdev *rdev) 1157static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1368{ 1158{
1369 char b[BDEVNAME_SIZE]; 1159 char b[BDEVNAME_SIZE];
1370 struct r1conf *conf = mddev->private; 1160 conf_t *conf = mddev->private;
1371 1161
1372 /* 1162 /*
1373 * If it is not operational, then we have already marked it as dead 1163 * If it is not operational, then we have already marked it as dead
@@ -1407,7 +1197,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1407 mdname(mddev), conf->raid_disks - mddev->degraded); 1197 mdname(mddev), conf->raid_disks - mddev->degraded);
1408} 1198}
1409 1199
1410static void print_conf(struct r1conf *conf) 1200static void print_conf(conf_t *conf)
1411{ 1201{
1412 int i; 1202 int i;
1413 1203
@@ -1422,7 +1212,7 @@ static void print_conf(struct r1conf *conf)
1422 rcu_read_lock(); 1212 rcu_read_lock();
1423 for (i = 0; i < conf->raid_disks; i++) { 1213 for (i = 0; i < conf->raid_disks; i++) {
1424 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
1425 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1215 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1426 if (rdev) 1216 if (rdev)
1427 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1217 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1428 i, !test_bit(In_sync, &rdev->flags), 1218 i, !test_bit(In_sync, &rdev->flags),
@@ -1432,7 +1222,7 @@ static void print_conf(struct r1conf *conf)
1432 rcu_read_unlock(); 1222 rcu_read_unlock();
1433} 1223}
1434 1224
1435static void close_sync(struct r1conf *conf) 1225static void close_sync(conf_t *conf)
1436{ 1226{
1437 wait_barrier(conf); 1227 wait_barrier(conf);
1438 allow_barrier(conf); 1228 allow_barrier(conf);
@@ -1441,10 +1231,10 @@ static void close_sync(struct r1conf *conf)
1441 conf->r1buf_pool = NULL; 1231 conf->r1buf_pool = NULL;
1442} 1232}
1443 1233
1444static int raid1_spare_active(struct mddev *mddev) 1234static int raid1_spare_active(mddev_t *mddev)
1445{ 1235{
1446 int i; 1236 int i;
1447 struct r1conf *conf = mddev->private; 1237 conf_t *conf = mddev->private;
1448 int count = 0; 1238 int count = 0;
1449 unsigned long flags; 1239 unsigned long flags;
1450 1240
@@ -1454,26 +1244,7 @@ static int raid1_spare_active(struct mddev *mddev)
1454 * Called under mddev lock, so rcu protection not needed. 1244 * Called under mddev lock, so rcu protection not needed.
1455 */ 1245 */
1456 for (i = 0; i < conf->raid_disks; i++) { 1246 for (i = 0; i < conf->raid_disks; i++) {
1457 struct md_rdev *rdev = conf->mirrors[i].rdev; 1247 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1458 struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1459 if (repl
1460 && repl->recovery_offset == MaxSector
1461 && !test_bit(Faulty, &repl->flags)
1462 && !test_and_set_bit(In_sync, &repl->flags)) {
1463 /* replacement has just become active */
1464 if (!rdev ||
1465 !test_and_clear_bit(In_sync, &rdev->flags))
1466 count++;
1467 if (rdev) {
1468 /* Replaced device not technically
1469 * faulty, but we need to be sure
1470 * it gets removed and never re-added
1471 */
1472 set_bit(Faulty, &rdev->flags);
1473 sysfs_notify_dirent_safe(
1474 rdev->sysfs_state);
1475 }
1476 }
1477 if (rdev 1248 if (rdev
1478 && !test_bit(Faulty, &rdev->flags) 1249 && !test_bit(Faulty, &rdev->flags)
1479 && !test_and_set_bit(In_sync, &rdev->flags)) { 1250 && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1490,15 +1261,14 @@ static int raid1_spare_active(struct mddev *mddev)
1490} 1261}
1491 1262
1492 1263
1493static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1264static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1494{ 1265{
1495 struct r1conf *conf = mddev->private; 1266 conf_t *conf = mddev->private;
1496 int err = -EEXIST; 1267 int err = -EEXIST;
1497 int mirror = 0; 1268 int mirror = 0;
1498 struct raid1_info *p; 1269 mirror_info_t *p;
1499 int first = 0; 1270 int first = 0;
1500 int last = conf->raid_disks - 1; 1271 int last = mddev->raid_disks - 1;
1501 struct request_queue *q = bdev_get_queue(rdev->bdev);
1502 1272
1503 if (mddev->recovery_disabled == conf->recovery_disabled) 1273 if (mddev->recovery_disabled == conf->recovery_disabled)
1504 return -EBUSY; 1274 return -EBUSY;
@@ -1506,17 +1276,22 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1506 if (rdev->raid_disk >= 0) 1276 if (rdev->raid_disk >= 0)
1507 first = last = rdev->raid_disk; 1277 first = last = rdev->raid_disk;
1508 1278
1509 if (q->merge_bvec_fn) { 1279 for (mirror = first; mirror <= last; mirror++)
1510 set_bit(Unmerged, &rdev->flags); 1280 if ( !(p=conf->mirrors+mirror)->rdev) {
1511 mddev->merge_check_needed = 1;
1512 }
1513
1514 for (mirror = first; mirror <= last; mirror++) {
1515 p = conf->mirrors+mirror;
1516 if (!p->rdev) {
1517 1281
1518 disk_stack_limits(mddev->gendisk, rdev->bdev, 1282 disk_stack_limits(mddev->gendisk, rdev->bdev,
1519 rdev->data_offset << 9); 1283 rdev->data_offset << 9);
1284 /* as we don't honour merge_bvec_fn, we must
1285 * never risk violating it, so limit
1286 * ->max_segments to one lying with a single
1287 * page, as a one page request is never in
1288 * violation.
1289 */
1290 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1291 blk_queue_max_segments(mddev->queue, 1);
1292 blk_queue_segment_boundary(mddev->queue,
1293 PAGE_CACHE_SIZE - 1);
1294 }
1520 1295
1521 p->head_position = 0; 1296 p->head_position = 0;
1522 rdev->raid_disk = mirror; 1297 rdev->raid_disk = mirror;
@@ -1529,50 +1304,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1529 rcu_assign_pointer(p->rdev, rdev); 1304 rcu_assign_pointer(p->rdev, rdev);
1530 break; 1305 break;
1531 } 1306 }
1532 if (test_bit(WantReplacement, &p->rdev->flags) &&
1533 p[conf->raid_disks].rdev == NULL) {
1534 /* Add this device as a replacement */
1535 clear_bit(In_sync, &rdev->flags);
1536 set_bit(Replacement, &rdev->flags);
1537 rdev->raid_disk = mirror;
1538 err = 0;
1539 conf->fullsync = 1;
1540 rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1541 break;
1542 }
1543 }
1544 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1545 /* Some requests might not have seen this new
1546 * merge_bvec_fn. We must wait for them to complete
1547 * before merging the device fully.
1548 * First we make sure any code which has tested
1549 * our function has submitted the request, then
1550 * we wait for all outstanding requests to complete.
1551 */
1552 synchronize_sched();
1553 raise_barrier(conf);
1554 lower_barrier(conf);
1555 clear_bit(Unmerged, &rdev->flags);
1556 }
1557 md_integrity_add_rdev(rdev, mddev); 1307 md_integrity_add_rdev(rdev, mddev);
1558 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
1559 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1560 print_conf(conf); 1308 print_conf(conf);
1561 return err; 1309 return err;
1562} 1310}
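
Stripped of the queue-limit and integrity details, raid1_add_disk() is a slot search: walk mirrors[first..last] (a single slot when a device is re-added at its previous raid_disk) and install the new device in the first empty position. A tiny sketch with invented types:

    struct slot { void *rdev; };   /* NULL means the mirror position is free */

    /* Returns the index where 'rdev' was installed, or -1 if no slot in
     * [first, last] is free. */
    static int add_disk_slot(struct slot *mirrors, int first, int last, void *rdev)
    {
        for (int mirror = first; mirror <= last; mirror++) {
            if (!mirrors[mirror].rdev) {
                mirrors[mirror].rdev = rdev;
                return mirror;
            }
        }
        return -1;
    }
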
1563 1311
1564static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1312static int raid1_remove_disk(mddev_t *mddev, int number)
1565{ 1313{
1566 struct r1conf *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1567 int err = 0; 1315 int err = 0;
1568 int number = rdev->raid_disk; 1316 mdk_rdev_t *rdev;
1569 struct raid1_info *p = conf->mirrors + number; 1317 mirror_info_t *p = conf->mirrors+ number;
1570
1571 if (rdev != p->rdev)
1572 p = conf->mirrors + conf->raid_disks + number;
1573 1318
1574 print_conf(conf); 1319 print_conf(conf);
1575 if (rdev == p->rdev) { 1320 rdev = p->rdev;
1321 if (rdev) {
1576 if (test_bit(In_sync, &rdev->flags) || 1322 if (test_bit(In_sync, &rdev->flags) ||
1577 atomic_read(&rdev->nr_pending)) { 1323 atomic_read(&rdev->nr_pending)) {
1578 err = -EBUSY; 1324 err = -EBUSY;
@@ -1594,21 +1340,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1594 err = -EBUSY; 1340 err = -EBUSY;
1595 p->rdev = rdev; 1341 p->rdev = rdev;
1596 goto abort; 1342 goto abort;
1597 } else if (conf->mirrors[conf->raid_disks + number].rdev) { 1343 }
1598 /* We just removed a device that is being replaced.
1599 * Move down the replacement. We drain all IO before
1600 * doing this to avoid confusion.
1601 */
1602 struct md_rdev *repl =
1603 conf->mirrors[conf->raid_disks + number].rdev;
1604 raise_barrier(conf);
1605 clear_bit(Replacement, &repl->flags);
1606 p->rdev = repl;
1607 conf->mirrors[conf->raid_disks + number].rdev = NULL;
1608 lower_barrier(conf);
1609 clear_bit(WantReplacement, &rdev->flags);
1610 } else
1611 clear_bit(WantReplacement, &rdev->flags);
1612 err = md_integrity_register(mddev); 1344 err = md_integrity_register(mddev);
1613 } 1345 }
1614abort: 1346abort:
@@ -1620,10 +1352,14 @@ abort:
1620 1352
1621static void end_sync_read(struct bio *bio, int error) 1353static void end_sync_read(struct bio *bio, int error)
1622{ 1354{
1623 struct r1bio *r1_bio = bio->bi_private; 1355 r1bio_t *r1_bio = bio->bi_private;
1624 1356 int i;
1625 update_head_pos(r1_bio->read_disk, r1_bio);
1626 1357
1358 for (i=r1_bio->mddev->raid_disks; i--; )
1359 if (r1_bio->bios[i] == bio)
1360 break;
1361 BUG_ON(i < 0);
1362 update_head_pos(i, r1_bio);
1627 /* 1363 /*
1628 * we have read a block, now it needs to be re-written, 1364 * we have read a block, now it needs to be re-written,
1629 * or re-read if the read failed. 1365 * or re-read if the read failed.
@@ -1639,15 +1375,19 @@ static void end_sync_read(struct bio *bio, int error)
1639static void end_sync_write(struct bio *bio, int error) 1375static void end_sync_write(struct bio *bio, int error)
1640{ 1376{
1641 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1377 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1642 struct r1bio *r1_bio = bio->bi_private; 1378 r1bio_t *r1_bio = bio->bi_private;
1643 struct mddev *mddev = r1_bio->mddev; 1379 mddev_t *mddev = r1_bio->mddev;
1644 struct r1conf *conf = mddev->private; 1380 conf_t *conf = mddev->private;
1381 int i;
1645 int mirror=0; 1382 int mirror=0;
1646 sector_t first_bad; 1383 sector_t first_bad;
1647 int bad_sectors; 1384 int bad_sectors;
1648 1385
1649 mirror = find_bio_disk(r1_bio, bio); 1386 for (i = 0; i < conf->raid_disks; i++)
1650 1387 if (r1_bio->bios[i] == bio) {
1388 mirror = i;
1389 break;
1390 }
1651 if (!uptodate) { 1391 if (!uptodate) {
1652 sector_t sync_blocks = 0; 1392 sector_t sync_blocks = 0;
1653 sector_t s = r1_bio->sector; 1393 sector_t s = r1_bio->sector;
@@ -1661,10 +1401,6 @@ static void end_sync_write(struct bio *bio, int error)
1661 } while (sectors_to_go > 0); 1401 } while (sectors_to_go > 0);
1662 set_bit(WriteErrorSeen, 1402 set_bit(WriteErrorSeen,
1663 &conf->mirrors[mirror].rdev->flags); 1403 &conf->mirrors[mirror].rdev->flags);
1664 if (!test_and_set_bit(WantReplacement,
1665 &conf->mirrors[mirror].rdev->flags))
1666 set_bit(MD_RECOVERY_NEEDED, &
1667 mddev->recovery);
1668 set_bit(R1BIO_WriteError, &r1_bio->state); 1404 set_bit(R1BIO_WriteError, &r1_bio->state);
1669 } else if (is_badblock(conf->mirrors[mirror].rdev, 1405 } else if (is_badblock(conf->mirrors[mirror].rdev,
1670 r1_bio->sector, 1406 r1_bio->sector,
@@ -1677,6 +1413,8 @@ static void end_sync_write(struct bio *bio, int error)
1677 ) 1413 )
1678 set_bit(R1BIO_MadeGood, &r1_bio->state); 1414 set_bit(R1BIO_MadeGood, &r1_bio->state);
1679 1415
1416 update_head_pos(mirror, r1_bio);
1417
1680 if (atomic_dec_and_test(&r1_bio->remaining)) { 1418 if (atomic_dec_and_test(&r1_bio->remaining)) {
1681 int s = r1_bio->sectors; 1419 int s = r1_bio->sectors;
1682 if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 1420 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
@@ -1689,26 +1427,21 @@ static void end_sync_write(struct bio *bio, int error)
1689 } 1427 }
1690} 1428}
1691 1429
1692static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, 1430static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1693 int sectors, struct page *page, int rw) 1431 int sectors, struct page *page, int rw)
1694{ 1432{
1695 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1433 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1696 /* success */ 1434 /* success */
1697 return 1; 1435 return 1;
1698 if (rw == WRITE) { 1436 if (rw == WRITE)
1699 set_bit(WriteErrorSeen, &rdev->flags); 1437 set_bit(WriteErrorSeen, &rdev->flags);
1700 if (!test_and_set_bit(WantReplacement,
1701 &rdev->flags))
1702 set_bit(MD_RECOVERY_NEEDED, &
1703 rdev->mddev->recovery);
1704 }
1705 /* need to record an error - either for the block or the device */ 1438 /* need to record an error - either for the block or the device */
1706 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 1439 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1707 md_error(rdev->mddev, rdev); 1440 md_error(rdev->mddev, rdev);
1708 return 0; 1441 return 0;
1709} 1442}
1710 1443
1711static int fix_sync_read_error(struct r1bio *r1_bio) 1444static int fix_sync_read_error(r1bio_t *r1_bio)
1712{ 1445{
1713 /* Try some synchronous reads of other devices to get 1446 /* Try some synchronous reads of other devices to get
1714 * good data, much like with normal read errors. Only 1447 * good data, much like with normal read errors. Only
@@ -1721,8 +1454,8 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1721 * made sure that anything with a bad block in range 1454 * made sure that anything with a bad block in range
1722 * will have bi_end_io clear. 1455 * will have bi_end_io clear.
1723 */ 1456 */
1724 struct mddev *mddev = r1_bio->mddev; 1457 mddev_t *mddev = r1_bio->mddev;
1725 struct r1conf *conf = mddev->private; 1458 conf_t *conf = mddev->private;
1726 struct bio *bio = r1_bio->bios[r1_bio->read_disk]; 1459 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1727 sector_t sect = r1_bio->sector; 1460 sector_t sect = r1_bio->sector;
1728 int sectors = r1_bio->sectors; 1461 int sectors = r1_bio->sectors;
@@ -1732,7 +1465,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1732 int s = sectors; 1465 int s = sectors;
1733 int d = r1_bio->read_disk; 1466 int d = r1_bio->read_disk;
1734 int success = 0; 1467 int success = 0;
1735 struct md_rdev *rdev; 1468 mdk_rdev_t *rdev;
1736 int start; 1469 int start;
1737 1470
1738 if (s > (PAGE_SIZE>>9)) 1471 if (s > (PAGE_SIZE>>9))
@@ -1752,7 +1485,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1752 } 1485 }
1753 } 1486 }
1754 d++; 1487 d++;
1755 if (d == conf->raid_disks * 2) 1488 if (d == conf->raid_disks)
1756 d = 0; 1489 d = 0;
1757 } while (!success && d != r1_bio->read_disk); 1490 } while (!success && d != r1_bio->read_disk);
1758 1491
@@ -1769,7 +1502,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1769 mdname(mddev), 1502 mdname(mddev),
1770 bdevname(bio->bi_bdev, b), 1503 bdevname(bio->bi_bdev, b),
1771 (unsigned long long)r1_bio->sector); 1504 (unsigned long long)r1_bio->sector);
1772 for (d = 0; d < conf->raid_disks * 2; d++) { 1505 for (d = 0; d < conf->raid_disks; d++) {
1773 rdev = conf->mirrors[d].rdev; 1506 rdev = conf->mirrors[d].rdev;
1774 if (!rdev || test_bit(Faulty, &rdev->flags)) 1507 if (!rdev || test_bit(Faulty, &rdev->flags))
1775 continue; 1508 continue;
@@ -1777,8 +1510,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1777 abort = 1; 1510 abort = 1;
1778 } 1511 }
1779 if (abort) { 1512 if (abort) {
1780 conf->recovery_disabled = 1513 mddev->recovery_disabled = 1;
1781 mddev->recovery_disabled;
1782 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1514 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1783 md_done_sync(mddev, r1_bio->sectors, 0); 1515 md_done_sync(mddev, r1_bio->sectors, 0);
1784 put_buf(r1_bio); 1516 put_buf(r1_bio);
@@ -1795,7 +1527,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1795 /* write it back and re-read */ 1527 /* write it back and re-read */
1796 while (d != r1_bio->read_disk) { 1528 while (d != r1_bio->read_disk) {
1797 if (d == 0) 1529 if (d == 0)
1798 d = conf->raid_disks * 2; 1530 d = conf->raid_disks;
1799 d--; 1531 d--;
1800 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1532 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1801 continue; 1533 continue;
@@ -1810,7 +1542,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1810 d = start; 1542 d = start;
1811 while (d != r1_bio->read_disk) { 1543 while (d != r1_bio->read_disk) {
1812 if (d == 0) 1544 if (d == 0)
1813 d = conf->raid_disks * 2; 1545 d = conf->raid_disks;
1814 d--; 1546 d--;
1815 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1547 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1816 continue; 1548 continue;
@@ -1829,7 +1561,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1829 return 1; 1561 return 1;
1830} 1562}
1831 1563
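
The heart of fix_sync_read_error() is a wrap-around walk: try the block on successive mirrors, wrapping past the end of the array, and give up only when the walk comes back to the disk the failed read came from. A minimal sketch of just that walk; read_fn and retry_other_mirrors() are invented, and unlike the kernel code the sketch does not retry the original disk first.

    #include <stdbool.h>

    typedef bool (*read_fn)(int disk, unsigned long long sector, void *buf);

    /* Try the other mirrors in order, wrapping around, until one returns
     * the data. Returns the successful disk index, or -1 if none did. */
    static int retry_other_mirrors(read_fn try_read, int read_disk, int ndisks,
                                   unsigned long long sector, void *buf)
    {
        int d = read_disk;

        do {
            d++;
            if (d == ndisks)
                d = 0;
            if (d == read_disk)
                return -1;                /* wrapped all the way: give up */
        } while (!try_read(d, sector, buf));

        return d;
    }

On success the kernel code then writes the recovered data back over the bad copies and re-reads it, walking the same ring in the opposite direction, as the following hunks show.
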
1832static int process_checks(struct r1bio *r1_bio) 1564static int process_checks(r1bio_t *r1_bio)
1833{ 1565{
1834 /* We have read all readable devices. If we haven't 1566 /* We have read all readable devices. If we haven't
1835 * got the block, then there is no hope left. 1567 * got the block, then there is no hope left.
@@ -1838,13 +1570,12 @@ static int process_checks(struct r1bio *r1_bio)
1838 * If any blocks failed to read, then we need to 1570 * If any blocks failed to read, then we need to
1839 * attempt an over-write 1571 * attempt an over-write
1840 */ 1572 */
1841 struct mddev *mddev = r1_bio->mddev; 1573 mddev_t *mddev = r1_bio->mddev;
1842 struct r1conf *conf = mddev->private; 1574 conf_t *conf = mddev->private;
1843 int primary; 1575 int primary;
1844 int i; 1576 int i;
1845 int vcnt;
1846 1577
1847 for (primary = 0; primary < conf->raid_disks * 2; primary++) 1578 for (primary = 0; primary < conf->raid_disks; primary++)
1848 if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 1579 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1849 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { 1580 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1850 r1_bio->bios[primary]->bi_end_io = NULL; 1581 r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1852,9 +1583,9 @@ static int process_checks(struct r1bio *r1_bio)
1852 break; 1583 break;
1853 } 1584 }
1854 r1_bio->read_disk = primary; 1585 r1_bio->read_disk = primary;
1855 vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); 1586 for (i = 0; i < conf->raid_disks; i++) {
1856 for (i = 0; i < conf->raid_disks * 2; i++) {
1857 int j; 1587 int j;
1588 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1858 struct bio *pbio = r1_bio->bios[primary]; 1589 struct bio *pbio = r1_bio->bios[primary];
1859 struct bio *sbio = r1_bio->bios[i]; 1590 struct bio *sbio = r1_bio->bios[i];
1860 int size; 1591 int size;
@@ -1869,13 +1600,13 @@ static int process_checks(struct r1bio *r1_bio)
1869 s = sbio->bi_io_vec[j].bv_page; 1600 s = sbio->bi_io_vec[j].bv_page;
1870 if (memcmp(page_address(p), 1601 if (memcmp(page_address(p),
1871 page_address(s), 1602 page_address(s),
1872 sbio->bi_io_vec[j].bv_len)) 1603 PAGE_SIZE))
1873 break; 1604 break;
1874 } 1605 }
1875 } else 1606 } else
1876 j = 0; 1607 j = 0;
1877 if (j >= 0) 1608 if (j >= 0)
1878 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 1609 mddev->resync_mismatches += r1_bio->sectors;
1879 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 1610 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1880 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 1611 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1881 /* No need to write to this device. */ 1612 /* No need to write to this device. */
@@ -1912,11 +1643,11 @@ static int process_checks(struct r1bio *r1_bio)
1912 return 0; 1643 return 0;
1913} 1644}
1914 1645
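
During a check or repair pass, process_checks() picks the first successful read as the primary and compares every other successful read against it page by page; any difference bumps the mismatch counter and marks that copy for over-write. The comparison itself is a chunked memcmp, sketched here with invented names and a fixed 4 KiB stand-in for PAGE_SIZE:

    #include <stddef.h>
    #include <string.h>

    #define CHUNK 4096   /* stand-in for PAGE_SIZE */

    /* Compare two equally sized buffers chunk by chunk. Returns the index
     * of the first differing chunk, or -1 if the buffers match. */
    static int first_mismatched_chunk(const void *primary, const void *secondary,
                                      size_t bytes)
    {
        size_t nchunks = bytes / CHUNK;

        for (size_t i = 0; i < nchunks; i++) {
            const char *p = (const char *)primary + i * CHUNK;
            const char *s = (const char *)secondary + i * CHUNK;
            if (memcmp(p, s, CHUNK))
                return (int)i;   /* caller rewrites this copy and counts
                                  * the whole range as mismatched        */
        }
        return -1;
    }
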
1915static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) 1646static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1916{ 1647{
1917 struct r1conf *conf = mddev->private; 1648 conf_t *conf = mddev->private;
1918 int i; 1649 int i;
1919 int disks = conf->raid_disks * 2; 1650 int disks = conf->raid_disks;
1920 struct bio *bio, *wbio; 1651 struct bio *bio, *wbio;
1921 1652
1922 bio = r1_bio->bios[r1_bio->read_disk]; 1653 bio = r1_bio->bios[r1_bio->read_disk];
@@ -1951,14 +1682,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1951 1682
1952 if (atomic_dec_and_test(&r1_bio->remaining)) { 1683 if (atomic_dec_and_test(&r1_bio->remaining)) {
1953 /* if we're here, all write(s) have completed, so clean up */ 1684 /* if we're here, all write(s) have completed, so clean up */
1954 int s = r1_bio->sectors; 1685 md_done_sync(mddev, r1_bio->sectors, 1);
1955 if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 1686 put_buf(r1_bio);
1956 test_bit(R1BIO_WriteError, &r1_bio->state))
1957 reschedule_retry(r1_bio);
1958 else {
1959 put_buf(r1_bio);
1960 md_done_sync(mddev, s, 1);
1961 }
1962 } 1687 }
1963} 1688}
1964 1689
@@ -1970,16 +1695,16 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1970 * 3. Performs writes following reads for array synchronising. 1695 * 3. Performs writes following reads for array synchronising.
1971 */ 1696 */
1972 1697
1973static void fix_read_error(struct r1conf *conf, int read_disk, 1698static void fix_read_error(conf_t *conf, int read_disk,
1974 sector_t sect, int sectors) 1699 sector_t sect, int sectors)
1975{ 1700{
1976 struct mddev *mddev = conf->mddev; 1701 mddev_t *mddev = conf->mddev;
1977 while(sectors) { 1702 while(sectors) {
1978 int s = sectors; 1703 int s = sectors;
1979 int d = read_disk; 1704 int d = read_disk;
1980 int success = 0; 1705 int success = 0;
1981 int start; 1706 int start;
1982 struct md_rdev *rdev; 1707 mdk_rdev_t *rdev;
1983 1708
1984 if (s > (PAGE_SIZE>>9)) 1709 if (s > (PAGE_SIZE>>9))
1985 s = PAGE_SIZE >> 9; 1710 s = PAGE_SIZE >> 9;
@@ -1995,9 +1720,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
1995 1720
1996 rdev = conf->mirrors[d].rdev; 1721 rdev = conf->mirrors[d].rdev;
1997 if (rdev && 1722 if (rdev &&
1998 (test_bit(In_sync, &rdev->flags) || 1723 test_bit(In_sync, &rdev->flags) &&
1999 (!test_bit(Faulty, &rdev->flags) &&
2000 rdev->recovery_offset >= sect + s)) &&
2001 is_badblock(rdev, sect, s, 1724 is_badblock(rdev, sect, s,
2002 &first_bad, &bad_sectors) == 0 && 1725 &first_bad, &bad_sectors) == 0 &&
2003 sync_page_io(rdev, sect, s<<9, 1726 sync_page_io(rdev, sect, s<<9,
@@ -2005,14 +1728,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2005 success = 1; 1728 success = 1;
2006 else { 1729 else {
2007 d++; 1730 d++;
2008 if (d == conf->raid_disks * 2) 1731 if (d == conf->raid_disks)
2009 d = 0; 1732 d = 0;
2010 } 1733 }
2011 } while (!success && d != read_disk); 1734 } while (!success && d != read_disk);
2012 1735
2013 if (!success) { 1736 if (!success) {
2014 /* Cannot read from anywhere - mark it bad */ 1737 /* Cannot read from anywhere - mark it bad */
2015 struct md_rdev *rdev = conf->mirrors[read_disk].rdev; 1738 mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
2016 if (!rdev_set_badblocks(rdev, sect, s, 0)) 1739 if (!rdev_set_badblocks(rdev, sect, s, 0))
2017 md_error(mddev, rdev); 1740 md_error(mddev, rdev);
2018 break; 1741 break;
@@ -2021,7 +1744,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2021 start = d; 1744 start = d;
2022 while (d != read_disk) { 1745 while (d != read_disk) {
2023 if (d==0) 1746 if (d==0)
2024 d = conf->raid_disks * 2; 1747 d = conf->raid_disks;
2025 d--; 1748 d--;
2026 rdev = conf->mirrors[d].rdev; 1749 rdev = conf->mirrors[d].rdev;
2027 if (rdev && 1750 if (rdev &&
@@ -2033,7 +1756,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2033 while (d != read_disk) { 1756 while (d != read_disk) {
2034 char b[BDEVNAME_SIZE]; 1757 char b[BDEVNAME_SIZE];
2035 if (d==0) 1758 if (d==0)
2036 d = conf->raid_disks * 2; 1759 d = conf->raid_disks;
2037 d--; 1760 d--;
2038 rdev = conf->mirrors[d].rdev; 1761 rdev = conf->mirrors[d].rdev;
2039 if (rdev && 1762 if (rdev &&
@@ -2075,11 +1798,11 @@ static int submit_bio_wait(int rw, struct bio *bio)
2075 return test_bit(BIO_UPTODATE, &bio->bi_flags); 1798 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2076} 1799}
2077 1800
2078static int narrow_write_error(struct r1bio *r1_bio, int i) 1801static int narrow_write_error(r1bio_t *r1_bio, int i)
2079{ 1802{
2080 struct mddev *mddev = r1_bio->mddev; 1803 mddev_t *mddev = r1_bio->mddev;
2081 struct r1conf *conf = mddev->private; 1804 conf_t *conf = mddev->private;
2082 struct md_rdev *rdev = conf->mirrors[i].rdev; 1805 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
2083 int vcnt, idx; 1806 int vcnt, idx;
2084 struct bio_vec *vec; 1807 struct bio_vec *vec;
2085 1808
@@ -2151,18 +1874,18 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2151 return ok; 1874 return ok;
2152} 1875}
2153 1876
2154static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 1877static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
2155{ 1878{
2156 int m; 1879 int m;
2157 int s = r1_bio->sectors; 1880 int s = r1_bio->sectors;
2158 for (m = 0; m < conf->raid_disks * 2 ; m++) { 1881 for (m = 0; m < conf->raid_disks ; m++) {
2159 struct md_rdev *rdev = conf->mirrors[m].rdev; 1882 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
2160 struct bio *bio = r1_bio->bios[m]; 1883 struct bio *bio = r1_bio->bios[m];
2161 if (bio->bi_end_io == NULL) 1884 if (bio->bi_end_io == NULL)
2162 continue; 1885 continue;
2163 if (test_bit(BIO_UPTODATE, &bio->bi_flags) && 1886 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2164 test_bit(R1BIO_MadeGood, &r1_bio->state)) { 1887 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
2165 rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); 1888 rdev_clear_badblocks(rdev, r1_bio->sector, s);
2166 } 1889 }
2167 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 1890 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2168 test_bit(R1BIO_WriteError, &r1_bio->state)) { 1891 test_bit(R1BIO_WriteError, &r1_bio->state)) {
@@ -2174,15 +1897,15 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
2174 md_done_sync(conf->mddev, s, 1); 1897 md_done_sync(conf->mddev, s, 1);
2175} 1898}
2176 1899
2177static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 1900static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
2178{ 1901{
2179 int m; 1902 int m;
2180 for (m = 0; m < conf->raid_disks * 2 ; m++) 1903 for (m = 0; m < conf->raid_disks ; m++)
2181 if (r1_bio->bios[m] == IO_MADE_GOOD) { 1904 if (r1_bio->bios[m] == IO_MADE_GOOD) {
2182 struct md_rdev *rdev = conf->mirrors[m].rdev; 1905 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
2183 rdev_clear_badblocks(rdev, 1906 rdev_clear_badblocks(rdev,
2184 r1_bio->sector, 1907 r1_bio->sector,
2185 r1_bio->sectors, 0); 1908 r1_bio->sectors);
2186 rdev_dec_pending(rdev, conf->mddev); 1909 rdev_dec_pending(rdev, conf->mddev);
2187 } else if (r1_bio->bios[m] != NULL) { 1910 } else if (r1_bio->bios[m] != NULL) {
2188 /* This drive got a write error. We need to 1911 /* This drive got a write error. We need to
@@ -2203,14 +1926,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2203 raid_end_bio_io(r1_bio); 1926 raid_end_bio_io(r1_bio);
2204} 1927}
2205 1928
2206static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) 1929static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
2207{ 1930{
2208 int disk; 1931 int disk;
2209 int max_sectors; 1932 int max_sectors;
2210 struct mddev *mddev = conf->mddev; 1933 mddev_t *mddev = conf->mddev;
2211 struct bio *bio; 1934 struct bio *bio;
2212 char b[BDEVNAME_SIZE]; 1935 char b[BDEVNAME_SIZE];
2213 struct md_rdev *rdev; 1936 mdk_rdev_t *rdev;
2214 1937
2215 clear_bit(R1BIO_ReadError, &r1_bio->state); 1938 clear_bit(R1BIO_ReadError, &r1_bio->state);
2216 /* we got a read error. Maybe the drive is bad. Maybe just 1939 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -2228,7 +1951,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2228 unfreeze_array(conf); 1951 unfreeze_array(conf);
2229 } else 1952 } else
2230 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1953 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
2231 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
2232 1954
2233 bio = r1_bio->bios[r1_bio->read_disk]; 1955 bio = r1_bio->bios[r1_bio->read_disk];
2234 bdevname(bio->bi_bdev, b); 1956 bdevname(bio->bi_bdev, b);
@@ -2294,12 +2016,11 @@ read_more:
2294 } 2016 }
2295} 2017}
2296 2018
2297static void raid1d(struct md_thread *thread) 2019static void raid1d(mddev_t *mddev)
2298{ 2020{
2299 struct mddev *mddev = thread->mddev; 2021 r1bio_t *r1_bio;
2300 struct r1bio *r1_bio;
2301 unsigned long flags; 2022 unsigned long flags;
2302 struct r1conf *conf = mddev->private; 2023 conf_t *conf = mddev->private;
2303 struct list_head *head = &conf->retry_list; 2024 struct list_head *head = &conf->retry_list;
2304 struct blk_plug plug; 2025 struct blk_plug plug;
2305 2026
@@ -2308,14 +2029,15 @@ static void raid1d(struct md_thread *thread)
2308 blk_start_plug(&plug); 2029 blk_start_plug(&plug);
2309 for (;;) { 2030 for (;;) {
2310 2031
2311 flush_pending_writes(conf); 2032 if (atomic_read(&mddev->plug_cnt) == 0)
2033 flush_pending_writes(conf);
2312 2034
2313 spin_lock_irqsave(&conf->device_lock, flags); 2035 spin_lock_irqsave(&conf->device_lock, flags);
2314 if (list_empty(head)) { 2036 if (list_empty(head)) {
2315 spin_unlock_irqrestore(&conf->device_lock, flags); 2037 spin_unlock_irqrestore(&conf->device_lock, flags);
2316 break; 2038 break;
2317 } 2039 }
2318 r1_bio = list_entry(head->prev, struct r1bio, retry_list); 2040 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
2319 list_del(head->prev); 2041 list_del(head->prev);
2320 conf->nr_queued--; 2042 conf->nr_queued--;
2321 spin_unlock_irqrestore(&conf->device_lock, flags); 2043 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2347,7 +2069,7 @@ static void raid1d(struct md_thread *thread)
2347} 2069}
2348 2070
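
raid1d() is a standard retry daemon: flush any pending writes, then repeatedly take the oldest r1bio off retry_list under device_lock, drop the lock, and handle it according to its state bits. A stripped-down pthread sketch of that loop, with invented types and a caller-supplied handler:

    #include <pthread.h>
    #include <stddef.h>

    struct retry_item { struct retry_item *next; int state; };

    static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct retry_item *retry_list;   /* newest entries at the head */
    static int nr_queued;

    static void retry_daemon_once(void (*handle)(struct retry_item *))
    {
        for (;;) {
            struct retry_item *item, **pp;

            pthread_mutex_lock(&device_lock);
            if (!retry_list) {
                pthread_mutex_unlock(&device_lock);
                break;
            }
            /* detach the oldest entry, i.e. the tail of the list */
            for (pp = &retry_list; (*pp)->next; pp = &(*pp)->next)
                ;
            item = *pp;
            *pp = NULL;
            nr_queued--;
            pthread_mutex_unlock(&device_lock);

            handle(item);                /* runs without the lock held */
        }
    }
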
2349 2071
2350static int init_resync(struct r1conf *conf) 2072static int init_resync(conf_t *conf)
2351{ 2073{
2352 int buffs; 2074 int buffs;
2353 2075
@@ -2371,10 +2093,10 @@ static int init_resync(struct r1conf *conf)
2371 * that can be installed to exclude normal IO requests. 2093 * that can be installed to exclude normal IO requests.
2372 */ 2094 */
2373 2095
2374static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 2096static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2375{ 2097{
2376 struct r1conf *conf = mddev->private; 2098 conf_t *conf = mddev->private;
2377 struct r1bio *r1_bio; 2099 r1bio_t *r1_bio;
2378 struct bio *bio; 2100 struct bio *bio;
2379 sector_t max_sector, nr_sectors; 2101 sector_t max_sector, nr_sectors;
2380 int disk = -1; 2102 int disk = -1;
@@ -2453,14 +2175,15 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2453 r1_bio->state = 0; 2175 r1_bio->state = 0;
2454 set_bit(R1BIO_IsSync, &r1_bio->state); 2176 set_bit(R1BIO_IsSync, &r1_bio->state);
2455 2177
2456 for (i = 0; i < conf->raid_disks * 2; i++) { 2178 for (i=0; i < conf->raid_disks; i++) {
2457 struct md_rdev *rdev; 2179 mdk_rdev_t *rdev;
2458 bio = r1_bio->bios[i]; 2180 bio = r1_bio->bios[i];
2459 2181
2460 /* take from bio_init */ 2182 /* take from bio_init */
2461 bio->bi_next = NULL; 2183 bio->bi_next = NULL;
2462 bio->bi_flags &= ~(BIO_POOL_MASK-1); 2184 bio->bi_flags &= ~(BIO_POOL_MASK-1);
2463 bio->bi_flags |= 1 << BIO_UPTODATE; 2185 bio->bi_flags |= 1 << BIO_UPTODATE;
2186 bio->bi_comp_cpu = -1;
2464 bio->bi_rw = READ; 2187 bio->bi_rw = READ;
2465 bio->bi_vcnt = 0; 2188 bio->bi_vcnt = 0;
2466 bio->bi_idx = 0; 2189 bio->bi_idx = 0;
@@ -2472,8 +2195,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2472 rdev = rcu_dereference(conf->mirrors[i].rdev); 2195 rdev = rcu_dereference(conf->mirrors[i].rdev);
2473 if (rdev == NULL || 2196 if (rdev == NULL ||
2474 test_bit(Faulty, &rdev->flags)) { 2197 test_bit(Faulty, &rdev->flags)) {
2475 if (i < conf->raid_disks) 2198 still_degraded = 1;
2476 still_degraded = 1;
2477 } else if (!test_bit(In_sync, &rdev->flags)) { 2199 } else if (!test_bit(In_sync, &rdev->flags)) {
2478 bio->bi_rw = WRITE; 2200 bio->bi_rw = WRITE;
2479 bio->bi_end_io = end_sync_write; 2201 bio->bi_end_io = end_sync_write;
@@ -2505,18 +2227,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2505 bio->bi_rw = READ; 2227 bio->bi_rw = READ;
2506 bio->bi_end_io = end_sync_read; 2228 bio->bi_end_io = end_sync_read;
2507 read_targets++; 2229 read_targets++;
2508 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2509 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2510 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2511 /*
2512 * The device is suitable for reading (InSync),
2513 * but has bad block(s) here. Let's try to correct them,
2514 * if we are doing resync or repair. Otherwise, leave
2515 * this device alone for this sync request.
2516 */
2517 bio->bi_rw = WRITE;
2518 bio->bi_end_io = end_sync_write;
2519 write_targets++;
2520 } 2230 }
2521 } 2231 }
2522 if (bio->bi_end_io) { 2232 if (bio->bi_end_io) {
@@ -2536,9 +2246,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2536 * need to mark them bad on all write targets 2246 * need to mark them bad on all write targets
2537 */ 2247 */
2538 int ok = 1; 2248 int ok = 1;
2539 for (i = 0 ; i < conf->raid_disks * 2 ; i++) 2249 for (i = 0 ; i < conf->raid_disks ; i++)
2540 if (r1_bio->bios[i]->bi_end_io == end_sync_write) { 2250 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2541 struct md_rdev *rdev = conf->mirrors[i].rdev; 2251 mdk_rdev_t *rdev =
2252 rcu_dereference(conf->mirrors[i].rdev);
2542 ok = rdev_set_badblocks(rdev, sector_nr, 2253 ok = rdev_set_badblocks(rdev, sector_nr,
2543 min_bad, 0 2254 min_bad, 0
2544 ) && ok; 2255 ) && ok;
@@ -2574,10 +2285,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2574 /* There is nowhere to write, so all non-sync 2285 /* There is nowhere to write, so all non-sync
2575 * drives must be failed - so we are finished 2286 * drives must be failed - so we are finished
2576 */ 2287 */
2577 sector_t rv; 2288 sector_t rv = max_sector - sector_nr;
2578 if (min_bad > 0)
2579 max_sector = sector_nr + min_bad;
2580 rv = max_sector - sector_nr;
2581 *skipped = 1; 2289 *skipped = 1;
2582 put_buf(r1_bio); 2290 put_buf(r1_bio);
2583 return rv; 2291 return rv;
@@ -2607,7 +2315,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2607 len = sync_blocks<<9; 2315 len = sync_blocks<<9;
2608 } 2316 }
2609 2317
2610 for (i = 0 ; i < conf->raid_disks * 2; i++) { 2318 for (i=0 ; i < conf->raid_disks; i++) {
2611 bio = r1_bio->bios[i]; 2319 bio = r1_bio->bios[i];
2612 if (bio->bi_end_io) { 2320 if (bio->bi_end_io) {
2613 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 2321 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2640,10 +2348,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2640 */ 2348 */
2641 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 2349 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2642 atomic_set(&r1_bio->remaining, read_targets); 2350 atomic_set(&r1_bio->remaining, read_targets);
2643 for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { 2351 for (i=0; i<conf->raid_disks; i++) {
2644 bio = r1_bio->bios[i]; 2352 bio = r1_bio->bios[i];
2645 if (bio->bi_end_io == end_sync_read) { 2353 if (bio->bi_end_io == end_sync_read) {
2646 read_targets--;
2647 md_sync_acct(bio->bi_bdev, nr_sectors); 2354 md_sync_acct(bio->bi_bdev, nr_sectors);
2648 generic_make_request(bio); 2355 generic_make_request(bio);
2649 } 2356 }
@@ -2658,7 +2365,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2658 return nr_sectors; 2365 return nr_sectors;
2659} 2366}
2660 2367
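
For each resync window, sync_request() classifies every mirror before building the bios: a missing or faulty device only leaves still_degraded set, a device that is present but not In_sync becomes a write target, and an In_sync device becomes a read target, with the first such device normally supplying the data. That classification, reduced to a sketch with an invented enum:

    enum mirror_cond { M_ABSENT, M_FAULTY, M_OUT_OF_SYNC, M_IN_SYNC };

    struct sync_plan {
        int read_targets;
        int write_targets;
        int still_degraded;
        int read_source;    /* first in-sync mirror, -1 if none */
    };

    static struct sync_plan plan_sync(const enum mirror_cond *m, int n)
    {
        struct sync_plan p = { 0, 0, 0, -1 };

        for (int i = 0; i < n; i++) {
            switch (m[i]) {
            case M_ABSENT:
            case M_FAULTY:
                p.still_degraded = 1;        /* cannot make the array whole */
                break;
            case M_OUT_OF_SYNC:
                p.write_targets++;           /* gets the data written to it */
                break;
            case M_IN_SYNC:
                if (p.read_source < 0)
                    p.read_source = i;       /* read the data from here */
                p.read_targets++;
                break;
            }
        }
        return p;
    }
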
2661static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) 2368static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2662{ 2369{
2663 if (sectors) 2370 if (sectors)
2664 return sectors; 2371 return sectors;
@@ -2666,20 +2373,19 @@ static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks
2666 return mddev->dev_sectors; 2373 return mddev->dev_sectors;
2667} 2374}
2668 2375
2669static struct r1conf *setup_conf(struct mddev *mddev) 2376static conf_t *setup_conf(mddev_t *mddev)
2670{ 2377{
2671 struct r1conf *conf; 2378 conf_t *conf;
2672 int i; 2379 int i;
2673 struct raid1_info *disk; 2380 mirror_info_t *disk;
2674 struct md_rdev *rdev; 2381 mdk_rdev_t *rdev;
2675 int err = -ENOMEM; 2382 int err = -ENOMEM;
2676 2383
2677 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); 2384 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2678 if (!conf) 2385 if (!conf)
2679 goto abort; 2386 goto abort;
2680 2387
2681 conf->mirrors = kzalloc(sizeof(struct raid1_info) 2388 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2682 * mddev->raid_disks * 2,
2683 GFP_KERNEL); 2389 GFP_KERNEL);
2684 if (!conf->mirrors) 2390 if (!conf->mirrors)
2685 goto abort; 2391 goto abort;
@@ -2691,7 +2397,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2691 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 2397 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2692 if (!conf->poolinfo) 2398 if (!conf->poolinfo)
2693 goto abort; 2399 goto abort;
2694 conf->poolinfo->raid_disks = mddev->raid_disks * 2; 2400 conf->poolinfo->raid_disks = mddev->raid_disks;
2695 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 2401 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2696 r1bio_pool_free, 2402 r1bio_pool_free,
2697 conf->poolinfo); 2403 conf->poolinfo);
@@ -2700,28 +2406,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2700 2406
2701 conf->poolinfo->mddev = mddev; 2407 conf->poolinfo->mddev = mddev;
2702 2408
2703 err = -EINVAL;
2704 spin_lock_init(&conf->device_lock); 2409 spin_lock_init(&conf->device_lock);
2705 rdev_for_each(rdev, mddev) { 2410 list_for_each_entry(rdev, &mddev->disks, same_set) {
2706 struct request_queue *q;
2707 int disk_idx = rdev->raid_disk; 2411 int disk_idx = rdev->raid_disk;
2708 if (disk_idx >= mddev->raid_disks 2412 if (disk_idx >= mddev->raid_disks
2709 || disk_idx < 0) 2413 || disk_idx < 0)
2710 continue; 2414 continue;
2711 if (test_bit(Replacement, &rdev->flags)) 2415 disk = conf->mirrors + disk_idx;
2712 disk = conf->mirrors + mddev->raid_disks + disk_idx;
2713 else
2714 disk = conf->mirrors + disk_idx;
2715 2416
2716 if (disk->rdev)
2717 goto abort;
2718 disk->rdev = rdev; 2417 disk->rdev = rdev;
2719 q = bdev_get_queue(rdev->bdev);
2720 if (q->merge_bvec_fn)
2721 mddev->merge_check_needed = 1;
2722 2418
2723 disk->head_position = 0; 2419 disk->head_position = 0;
2724 disk->seq_start = MaxSector;
2725 } 2420 }
2726 conf->raid_disks = mddev->raid_disks; 2421 conf->raid_disks = mddev->raid_disks;
2727 conf->mddev = mddev; 2422 conf->mddev = mddev;
@@ -2731,40 +2426,33 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2731 init_waitqueue_head(&conf->wait_barrier); 2426 init_waitqueue_head(&conf->wait_barrier);
2732 2427
2733 bio_list_init(&conf->pending_bio_list); 2428 bio_list_init(&conf->pending_bio_list);
2734 conf->pending_count = 0;
2735 conf->recovery_disabled = mddev->recovery_disabled - 1;
2736 2429
2737 err = -EIO; 2430 conf->last_used = -1;
2738 for (i = 0; i < conf->raid_disks * 2; i++) { 2431 for (i = 0; i < conf->raid_disks; i++) {
2739 2432
2740 disk = conf->mirrors + i; 2433 disk = conf->mirrors + i;
2741 2434
2742 if (i < conf->raid_disks &&
2743 disk[conf->raid_disks].rdev) {
2744 /* This slot has a replacement. */
2745 if (!disk->rdev) {
2746 /* No original, just make the replacement
2747 * a recovering spare
2748 */
2749 disk->rdev =
2750 disk[conf->raid_disks].rdev;
2751 disk[conf->raid_disks].rdev = NULL;
2752 } else if (!test_bit(In_sync, &disk->rdev->flags))
2753 /* Original is not in_sync - bad */
2754 goto abort;
2755 }
2756
2757 if (!disk->rdev || 2435 if (!disk->rdev ||
2758 !test_bit(In_sync, &disk->rdev->flags)) { 2436 !test_bit(In_sync, &disk->rdev->flags)) {
2759 disk->head_position = 0; 2437 disk->head_position = 0;
2760 if (disk->rdev && 2438 if (disk->rdev)
2761 (disk->rdev->saved_raid_disk < 0))
2762 conf->fullsync = 1; 2439 conf->fullsync = 1;
2763 } 2440 } else if (conf->last_used < 0)
2441 /*
2442 * The first working device is used as a
2443 * starting point to read balancing.
2444 */
2445 conf->last_used = i;
2764 } 2446 }
2765 2447
2448 err = -EIO;
2449 if (conf->last_used < 0) {
2450 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2451 mdname(mddev));
2452 goto abort;
2453 }
2766 err = -ENOMEM; 2454 err = -ENOMEM;
2767 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2455 conf->thread = md_register_thread(raid1d, mddev, NULL);
2768 if (!conf->thread) { 2456 if (!conf->thread) {
2769 printk(KERN_ERR 2457 printk(KERN_ERR
2770 "md/raid1:%s: couldn't allocate thread\n", 2458 "md/raid1:%s: couldn't allocate thread\n",
@@ -2786,14 +2474,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2786 return ERR_PTR(err); 2474 return ERR_PTR(err);
2787} 2475}
2788 2476
2789static int stop(struct mddev *mddev); 2477static int run(mddev_t *mddev)
2790static int run(struct mddev *mddev)
2791{ 2478{
2792 struct r1conf *conf; 2479 conf_t *conf;
2793 int i; 2480 int i;
2794 struct md_rdev *rdev; 2481 mdk_rdev_t *rdev;
2795 int ret;
2796 bool discard_supported = false;
2797 2482
2798 if (mddev->level != 1) { 2483 if (mddev->level != 1) {
2799 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2484 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2818,13 +2503,20 @@ static int run(struct mddev *mddev)
2818 if (IS_ERR(conf)) 2503 if (IS_ERR(conf))
2819 return PTR_ERR(conf); 2504 return PTR_ERR(conf);
2820 2505
2821 rdev_for_each(rdev, mddev) { 2506 list_for_each_entry(rdev, &mddev->disks, same_set) {
2822 if (!mddev->gendisk) 2507 if (!mddev->gendisk)
2823 continue; 2508 continue;
2824 disk_stack_limits(mddev->gendisk, rdev->bdev, 2509 disk_stack_limits(mddev->gendisk, rdev->bdev,
2825 rdev->data_offset << 9); 2510 rdev->data_offset << 9);
2826 if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 2511 /* as we don't honour merge_bvec_fn, we must never risk
2827 discard_supported = true; 2512 * violating it, so limit ->max_segments to 1 lying within
2513 * a single page, as a one page request is never in violation.
2514 */
2515 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2516 blk_queue_max_segments(mddev->queue, 1);
2517 blk_queue_segment_boundary(mddev->queue,
2518 PAGE_CACHE_SIZE - 1);
2519 }
2828 } 2520 }
2829 2521
2830 mddev->degraded = 0; 2522 mddev->degraded = 0;
@@ -2858,25 +2550,13 @@ static int run(struct mddev *mddev)
2858 if (mddev->queue) { 2550 if (mddev->queue) {
2859 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2551 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2860 mddev->queue->backing_dev_info.congested_data = mddev; 2552 mddev->queue->backing_dev_info.congested_data = mddev;
2861 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2862
2863 if (discard_supported)
2864 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
2865 mddev->queue);
2866 else
2867 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
2868 mddev->queue);
2869 } 2553 }
2870 2554 return md_integrity_register(mddev);
2871 ret = md_integrity_register(mddev);
2872 if (ret)
2873 stop(mddev);
2874 return ret;
2875} 2555}
2876 2556
2877static int stop(struct mddev *mddev) 2557static int stop(mddev_t *mddev)
2878{ 2558{
2879 struct r1conf *conf = mddev->private; 2559 conf_t *conf = mddev->private;
2880 struct bitmap *bitmap = mddev->bitmap; 2560 struct bitmap *bitmap = mddev->bitmap;
2881 2561
2882 /* wait for behind writes to complete */ 2562 /* wait for behind writes to complete */
@@ -2901,7 +2581,7 @@ static int stop(struct mddev *mddev)
2901 return 0; 2581 return 0;
2902} 2582}
2903 2583
2904static int raid1_resize(struct mddev *mddev, sector_t sectors) 2584static int raid1_resize(mddev_t *mddev, sector_t sectors)
2905{ 2585{
2906 /* no resync is happening, and there is enough space 2586 /* no resync is happening, and there is enough space
2907 * on all devices, so we can resize. 2587 * on all devices, so we can resize.
@@ -2910,16 +2590,9 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
2910 * any io in the removed space completes, but it hardly seems 2590 * any io in the removed space completes, but it hardly seems
2911 * worth it. 2591 * worth it.
2912 */ 2592 */
2913 sector_t newsize = raid1_size(mddev, sectors, 0); 2593 md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2914 if (mddev->external_size && 2594 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2915 mddev->array_sectors > newsize)
2916 return -EINVAL; 2595 return -EINVAL;
2917 if (mddev->bitmap) {
2918 int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
2919 if (ret)
2920 return ret;
2921 }
2922 md_set_array_sectors(mddev, newsize);
2923 set_capacity(mddev->gendisk, mddev->array_sectors); 2596 set_capacity(mddev->gendisk, mddev->array_sectors);
2924 revalidate_disk(mddev->gendisk); 2597 revalidate_disk(mddev->gendisk);
2925 if (sectors > mddev->dev_sectors && 2598 if (sectors > mddev->dev_sectors &&
@@ -2932,7 +2605,7 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
2932 return 0; 2605 return 0;
2933} 2606}
2934 2607
2935static int raid1_reshape(struct mddev *mddev) 2608static int raid1_reshape(mddev_t *mddev)
2936{ 2609{
2937 /* We need to: 2610 /* We need to:
2938 * 1/ resize the r1bio_pool 2611 * 1/ resize the r1bio_pool
@@ -2947,8 +2620,8 @@ static int raid1_reshape(struct mddev *mddev)
2947 */ 2620 */
2948 mempool_t *newpool, *oldpool; 2621 mempool_t *newpool, *oldpool;
2949 struct pool_info *newpoolinfo; 2622 struct pool_info *newpoolinfo;
2950 struct raid1_info *newmirrors; 2623 mirror_info_t *newmirrors;
2951 struct r1conf *conf = mddev->private; 2624 conf_t *conf = mddev->private;
2952 int cnt, raid_disks; 2625 int cnt, raid_disks;
2953 unsigned long flags; 2626 unsigned long flags;
2954 int d, d2, err; 2627 int d, d2, err;
@@ -2982,7 +2655,7 @@ static int raid1_reshape(struct mddev *mddev)
2982 if (!newpoolinfo) 2655 if (!newpoolinfo)
2983 return -ENOMEM; 2656 return -ENOMEM;
2984 newpoolinfo->mddev = mddev; 2657 newpoolinfo->mddev = mddev;
2985 newpoolinfo->raid_disks = raid_disks * 2; 2658 newpoolinfo->raid_disks = raid_disks;
2986 2659
2987 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 2660 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2988 r1bio_pool_free, newpoolinfo); 2661 r1bio_pool_free, newpoolinfo);
@@ -2990,8 +2663,7 @@ static int raid1_reshape(struct mddev *mddev)
2990 kfree(newpoolinfo); 2663 kfree(newpoolinfo);
2991 return -ENOMEM; 2664 return -ENOMEM;
2992 } 2665 }
2993 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, 2666 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2994 GFP_KERNEL);
2995 if (!newmirrors) { 2667 if (!newmirrors) {
2996 kfree(newpoolinfo); 2668 kfree(newpoolinfo);
2997 mempool_destroy(newpool); 2669 mempool_destroy(newpool);
@@ -3005,7 +2677,7 @@ static int raid1_reshape(struct mddev *mddev)
3005 conf->r1bio_pool = newpool; 2677 conf->r1bio_pool = newpool;
3006 2678
3007 for (d = d2 = 0; d < conf->raid_disks; d++) { 2679 for (d = d2 = 0; d < conf->raid_disks; d++) {
3008 struct md_rdev *rdev = conf->mirrors[d].rdev; 2680 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
3009 if (rdev && rdev->raid_disk != d2) { 2681 if (rdev && rdev->raid_disk != d2) {
3010 sysfs_unlink_rdev(mddev, rdev); 2682 sysfs_unlink_rdev(mddev, rdev);
3011 rdev->raid_disk = d2; 2683 rdev->raid_disk = d2;
@@ -3029,6 +2701,7 @@ static int raid1_reshape(struct mddev *mddev)
3029 conf->raid_disks = mddev->raid_disks = raid_disks; 2701 conf->raid_disks = mddev->raid_disks = raid_disks;
3030 mddev->delta_disks = 0; 2702 mddev->delta_disks = 0;
3031 2703
2704 conf->last_used = 0; /* just make sure it is in-range */
3032 lower_barrier(conf); 2705 lower_barrier(conf);
3033 2706
3034 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2707 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3038,9 +2711,9 @@ static int raid1_reshape(struct mddev *mddev)
3038 return 0; 2711 return 0;
3039} 2712}
3040 2713
3041static void raid1_quiesce(struct mddev *mddev, int state) 2714static void raid1_quiesce(mddev_t *mddev, int state)
3042{ 2715{
3043 struct r1conf *conf = mddev->private; 2716 conf_t *conf = mddev->private;
3044 2717
3045 switch(state) { 2718 switch(state) {
3046 case 2: /* wake for suspend */ 2719 case 2: /* wake for suspend */
@@ -3055,13 +2728,13 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3055 } 2728 }
3056} 2729}
3057 2730
3058static void *raid1_takeover(struct mddev *mddev) 2731static void *raid1_takeover(mddev_t *mddev)
3059{ 2732{
3060 /* raid1 can take over: 2733 /* raid1 can take over:
3061 * raid5 with 2 devices, any layout or chunk size 2734 * raid5 with 2 devices, any layout or chunk size
3062 */ 2735 */
3063 if (mddev->level == 5 && mddev->raid_disks == 2) { 2736 if (mddev->level == 5 && mddev->raid_disks == 2) {
3064 struct r1conf *conf; 2737 conf_t *conf;
3065 mddev->new_level = 1; 2738 mddev->new_level = 1;
3066 mddev->new_layout = 0; 2739 mddev->new_layout = 0;
3067 mddev->new_chunk_sectors = 0; 2740 mddev->new_chunk_sectors = 0;
@@ -3073,7 +2746,7 @@ static void *raid1_takeover(struct mddev *mddev)
3073 return ERR_PTR(-EINVAL); 2746 return ERR_PTR(-EINVAL);
3074} 2747}
3075 2748
3076static struct md_personality raid1_personality = 2749static struct mdk_personality raid1_personality =
3077{ 2750{
3078 .name = "raid1", 2751 .name = "raid1",
3079 .level = 1, 2752 .level = 1,
@@ -3111,5 +2784,3 @@ MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
3111MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2784MODULE_ALIAS("md-personality-3"); /* RAID1 */
3112MODULE_ALIAS("md-raid1"); 2785MODULE_ALIAS("md-raid1");
3113MODULE_ALIAS("md-level-1"); 2786MODULE_ALIAS("md-level-1");
3114
3115module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
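
Note on the raid1.c hunks above: on the removed ("-") side, conf->mirrors is allocated with 2 * raid_disks entries, and setup_conf() stores a device flagged Replacement at conf->mirrors + raid_disks + disk_idx, i.e. originals sit in the first half of the array and their replacements in the second half. The following is a minimal user-space sketch of that slot indexing only; the demo_* types and the values in main() are simplified stand-ins invented for illustration, not the kernel definitions.

/* Sketch of the doubled mirrors[] layout used by the removed raid1 code:
 * originals occupy slots 0..raid_disks-1, replacements occupy
 * slots raid_disks..2*raid_disks-1.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct demo_rdev {
	int raid_disk;     /* array slot this device serves */
	bool replacement;  /* stands in for the kernel's Replacement flag */
};

struct demo_mirror {
	struct demo_rdev *rdev;
};

/* Map a device to its index in mirrors[], the way the removed setup_conf()
 * does: a replacement lands raid_disks entries after its original slot. */
static int mirror_index(const struct demo_rdev *rdev, int raid_disks)
{
	int idx = rdev->raid_disk;

	if (rdev->replacement)
		idx += raid_disks;
	return idx;
}

int main(void)
{
	const int raid_disks = 2;
	struct demo_mirror *mirrors =
		calloc(2 * (size_t)raid_disks, sizeof(*mirrors));
	struct demo_rdev devs[] = {
		{ .raid_disk = 0, .replacement = false },
		{ .raid_disk = 1, .replacement = false },
		{ .raid_disk = 1, .replacement = true  }, /* hot replacement for slot 1 */
	};

	for (size_t i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		int idx = mirror_index(&devs[i], raid_disks);

		mirrors[idx].rdev = &devs[i];
		printf("device for slot %d%s -> mirrors[%d]\n",
		       devs[i].raid_disk,
		       devs[i].replacement ? " (replacement)" : "", idx);
	}
	free(mirrors);
	return 0;
}

Running this places the replacement for slot 1 at mirrors[3], which is the layout the removed setup_conf() and raid1_reshape() code sizes the array for (hence the "* 2" in the kzalloc and pool_info hunks above).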
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7e..e0d676b4897 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,15 +1,11 @@
1#ifndef _RAID1_H 1#ifndef _RAID1_H
2#define _RAID1_H 2#define _RAID1_H
3 3
4struct raid1_info { 4typedef struct mirror_info mirror_info_t;
5 struct md_rdev *rdev;
6 sector_t head_position;
7 5
8 /* When choose the best device for a read (read_balance()) 6struct mirror_info {
9 * we try to keep sequential reads one the same device 7 mdk_rdev_t *rdev;
10 */ 8 sector_t head_position;
11 sector_t next_seq_sect;
12 sector_t seq_start;
13}; 9};
14 10
15/* 11/*
@@ -18,84 +14,64 @@ struct raid1_info {
18 * pool was allocated for, so they know how much to allocate and free. 14 * pool was allocated for, so they know how much to allocate and free.
19 * mddev->raid_disks cannot be used, as it can change while a pool is active 15 * mddev->raid_disks cannot be used, as it can change while a pool is active
20 * These two datums are stored in a kmalloced struct. 16 * These two datums are stored in a kmalloced struct.
21 * The 'raid_disks' here is twice the raid_disks in r1conf.
22 * This allows space for each 'real' device can have a replacement in the
23 * second half of the array.
24 */ 17 */
25 18
26struct pool_info { 19struct pool_info {
27 struct mddev *mddev; 20 mddev_t *mddev;
28 int raid_disks; 21 int raid_disks;
29}; 22};
30 23
31struct r1conf {
32 struct mddev *mddev;
33 struct raid1_info *mirrors; /* twice 'raid_disks' to
34 * allow for replacements.
35 */
36 int raid_disks;
37 24
38 /* During resync, read_balancing is only allowed on the part 25typedef struct r1bio_s r1bio_t;
39 * of the array that has been resynced. 'next_resync' tells us
40 * where that is.
41 */
42 sector_t next_resync;
43 26
27struct r1_private_data_s {
28 mddev_t *mddev;
29 mirror_info_t *mirrors;
30 int raid_disks;
31 int last_used;
32 sector_t next_seq_sect;
44 spinlock_t device_lock; 33 spinlock_t device_lock;
45 34
46 /* list of 'struct r1bio' that need to be processed by raid1d,
47 * whether to retry a read, writeout a resync or recovery
48 * block, or anything else.
49 */
50 struct list_head retry_list; 35 struct list_head retry_list;
51 36 /* queue pending writes and submit them on unplug */
52 /* queue pending writes to be submitted on unplug */
53 struct bio_list pending_bio_list; 37 struct bio_list pending_bio_list;
54 int pending_count;
55 38
56 /* for use when syncing mirrors: 39 /* for use when syncing mirrors: */
57 * We don't allow both normal IO and resync/recovery IO at 40
58 * the same time - resync/recovery can only happen when there
59 * is no other IO. So when either is active, the other has to wait.
60 * See more details description in raid1.c near raise_barrier().
61 */
62 wait_queue_head_t wait_barrier;
63 spinlock_t resync_lock; 41 spinlock_t resync_lock;
64 int nr_pending; 42 int nr_pending;
65 int nr_waiting; 43 int nr_waiting;
66 int nr_queued; 44 int nr_queued;
67 int barrier; 45 int barrier;
46 sector_t next_resync;
47 int fullsync; /* set to 1 if a full sync is needed,
48 * (fresh device added).
49 * Cleared when a sync completes.
50 */
51 int recovery_disabled; /* when the same as
52 * mddev->recovery_disabled
53 * we don't allow recovery
54 * to be attempted as we
55 * expect a read error
56 */
68 57
69 /* Set to 1 if a full sync is needed, (fresh device added). 58 wait_queue_head_t wait_barrier;
70 * Cleared when a sync completes.
71 */
72 int fullsync;
73
74 /* When the same as mddev->recovery_disabled we don't allow
75 * recovery to be attempted as we expect a read error.
76 */
77 int recovery_disabled;
78
79 59
80 /* poolinfo contains information about the content of the
81 * mempools - it changes when the array grows or shrinks
82 */
83 struct pool_info *poolinfo; 60 struct pool_info *poolinfo;
84 mempool_t *r1bio_pool;
85 mempool_t *r1buf_pool;
86 61
87 /* temporary buffer to synchronous IO when attempting to repair
88 * a read error.
89 */
90 struct page *tmppage; 62 struct page *tmppage;
91 63
64 mempool_t *r1bio_pool;
65 mempool_t *r1buf_pool;
92 66
93 /* When taking over an array from a different personality, we store 67 /* When taking over an array from a different personality, we store
94 * the new thread here until we fully activate the array. 68 * the new thread here until we fully activate the array.
95 */ 69 */
96 struct md_thread *thread; 70 struct mdk_thread_s *thread;
97}; 71};
98 72
73typedef struct r1_private_data_s conf_t;
74
99/* 75/*
100 * this is our 'private' RAID1 bio. 76 * this is our 'private' RAID1 bio.
101 * 77 *
@@ -103,7 +79,7 @@ struct r1conf {
103 * for this RAID1 operation, and about their status: 79 * for this RAID1 operation, and about their status:
104 */ 80 */
105 81
106struct r1bio { 82struct r1bio_s {
107 atomic_t remaining; /* 'have we finished' count, 83 atomic_t remaining; /* 'have we finished' count,
108 * used from IRQ handlers 84 * used from IRQ handlers
109 */ 85 */
@@ -113,7 +89,7 @@ struct r1bio {
113 sector_t sector; 89 sector_t sector;
114 int sectors; 90 int sectors;
115 unsigned long state; 91 unsigned long state;
116 struct mddev *mddev; 92 mddev_t *mddev;
117 /* 93 /*
118 * original bio going to /dev/mdx 94 * original bio going to /dev/mdx
119 */ 95 */
@@ -135,6 +111,20 @@ struct r1bio {
135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 111 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
136}; 112};
137 113
114/* when we get a read error on a read-only array, we redirect to another
115 * device without failing the first device, or trying to over-write to
116 * correct the read error. To keep track of bad blocks on a per-bio
117 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
118 */
119#define IO_BLOCKED ((struct bio *)1)
120/* When we successfully write to a known bad-block, we need to remove the
121 * bad-block marking which must be done from process context. So we record
122 * the success by setting bios[n] to IO_MADE_GOOD
123 */
124#define IO_MADE_GOOD ((struct bio *)2)
125
126#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
127
138/* bits for r1bio.state */ 128/* bits for r1bio.state */
139#define R1BIO_Uptodate 0 129#define R1BIO_Uptodate 0
140#define R1BIO_IsSync 1 130#define R1BIO_IsSync 1
@@ -158,6 +148,6 @@ struct r1bio {
158#define R1BIO_MadeGood 7 148#define R1BIO_MadeGood 7
159#define R1BIO_WriteError 8 149#define R1BIO_WriteError 8
160 150
161extern int md_raid1_congested(struct mddev *mddev, int bits); 151extern int md_raid1_congested(mddev_t *mddev, int bits);
162 152
163#endif 153#endif
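
Note on the raid1.h hunk above: the added ("+") side defines IO_BLOCKED ((struct bio *)1), IO_MADE_GOOD ((struct bio *)2) and BIO_SPECIAL() in the header, so that an entry of the bios[] array can encode per-device status without pointing at a real bio; any cleanup path must therefore test BIO_SPECIAL() before calling bio_put(), as put_all_bios() does in the raid10.c hunks below. The sketch that follows shows the sentinel-pointer idea in user space; struct bio, bio_put() and put_all() here are simplified stand-ins, not the kernel versions.

/* Sentinel-pointer sketch: slots may hold a real bio, NULL, or one of the
 * two small-integer sentinels, and only real bios are released. */
#include <stdio.h>
#include <stdlib.h>

struct bio { int refcount; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void bio_put(struct bio *bio)
{
	printf("bio_put(%p): dropping reference\n", (void *)bio);
	if (--bio->refcount == 0)
		free(bio);
}

/* Release every slot, skipping sentinels; note that NULL also satisfies
 * BIO_SPECIAL() since it casts to 0. */
static void put_all(struct bio **bios, int n)
{
	for (int i = 0; i < n; i++) {
		if (!BIO_SPECIAL(bios[i]))
			bio_put(bios[i]);
		bios[i] = NULL;
	}
}

int main(void)
{
	struct bio *real = malloc(sizeof(*real));

	real->refcount = 1;
	struct bio *bios[3] = { real, IO_BLOCKED, IO_MADE_GOOD };

	put_all(bios, 3);          /* only the real bio is put */
	printf("all slots cleared\n");
	return 0;
}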
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03..1d44228530a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -21,10 +21,8 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h> 24#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 25#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h" 26#include "md.h"
29#include "raid10.h" 27#include "raid10.h"
30#include "raid0.h" 28#include "raid0.h"
@@ -60,42 +58,15 @@
60 */ 58 */
61#define NR_RAID10_BIOS 256 59#define NR_RAID10_BIOS 256
62 60
63/* when we get a read error on a read-only array, we redirect to another 61static void allow_barrier(conf_t *conf);
64 * device without failing the first device, or trying to over-write to 62static void lower_barrier(conf_t *conf);
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
77/* When there are this many requests queued to be written by
78 * the raid10 thread, we become 'congested' to provide back-pressure
79 * for writeback.
80 */
81static int max_queued_requests = 1024;
82
83static void allow_barrier(struct r10conf *conf);
84static void lower_barrier(struct r10conf *conf);
85static int enough(struct r10conf *conf, int ignore);
86static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped);
88static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89static void end_reshape_write(struct bio *bio, int error);
90static void end_reshape(struct r10conf *conf);
91 63
92static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 64static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93{ 65{
94 struct r10conf *conf = data; 66 conf_t *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]); 67 int size = offsetof(struct r10bio_s, devs[conf->copies]);
96 68
97 /* allocate a r10bio with room for raid_disks entries in the 69 /* allocate a r10bio with room for raid_disks entries in the bios array */
98 * bios array */
99 return kzalloc(size, gfp_flags); 70 return kzalloc(size, gfp_flags);
100} 71}
101 72
@@ -121,9 +92,9 @@ static void r10bio_pool_free(void *r10_bio, void *data)
121 */ 92 */
122static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 93static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123{ 94{
124 struct r10conf *conf = data; 95 conf_t *conf = data;
125 struct page *page; 96 struct page *page;
126 struct r10bio *r10_bio; 97 r10bio_t *r10_bio;
127 struct bio *bio; 98 struct bio *bio;
128 int i, j; 99 int i, j;
129 int nalloc; 100 int nalloc;
@@ -132,8 +103,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
132 if (!r10_bio) 103 if (!r10_bio)
133 return NULL; 104 return NULL;
134 105
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 106 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies; /* resync */ 107 nalloc = conf->copies; /* resync */
138 else 108 else
139 nalloc = 2; /* recovery */ 109 nalloc = 2; /* recovery */
@@ -146,25 +116,17 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
146 if (!bio) 116 if (!bio)
147 goto out_free_bio; 117 goto out_free_bio;
148 r10_bio->devs[j].bio = bio; 118 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement)
150 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio)
153 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio;
155 } 119 }
156 /* 120 /*
157 * Allocate RESYNC_PAGES data pages and attach them 121 * Allocate RESYNC_PAGES data pages and attach them
158 * where needed. 122 * where needed.
159 */ 123 */
160 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC, 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) { 128 &conf->mddev->recovery)) {
166 /* we can share bv_page's during recovery 129 /* we can share bv_page's during recovery */
167 * and reshape */
168 struct bio *rbio = r10_bio->devs[0].bio; 130 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page; 131 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page); 132 get_page(page);
@@ -174,8 +136,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
174 goto out_free_pages; 136 goto out_free_pages;
175 137
176 bio->bi_io_vec[i].bv_page = page; 138 bio->bi_io_vec[i].bv_page = page;
177 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page;
179 } 139 }
180 } 140 }
181 141
@@ -187,14 +147,10 @@ out_free_pages:
187 while (j--) 147 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++) 148 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 149 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0; 150 j = -1;
191out_free_bio: 151out_free_bio:
192 for ( ; j < nalloc; j++) { 152 while ( ++j < nalloc )
193 if (r10_bio->devs[j].bio) 153 bio_put(r10_bio->devs[j].bio);
194 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio);
197 }
198 r10bio_pool_free(r10_bio, conf); 154 r10bio_pool_free(r10_bio, conf);
199 return NULL; 155 return NULL;
200} 156}
@@ -202,8 +158,8 @@ out_free_bio:
202static void r10buf_pool_free(void *__r10_bio, void *data) 158static void r10buf_pool_free(void *__r10_bio, void *data)
203{ 159{
204 int i; 160 int i;
205 struct r10conf *conf = data; 161 conf_t *conf = data;
206 struct r10bio *r10bio = __r10_bio; 162 r10bio_t *r10bio = __r10_bio;
207 int j; 163 int j;
208 164
209 for (j=0; j < conf->copies; j++) { 165 for (j=0; j < conf->copies; j++) {
@@ -215,14 +171,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
215 } 171 }
216 bio_put(bio); 172 bio_put(bio);
217 } 173 }
218 bio = r10bio->devs[j].repl_bio;
219 if (bio)
220 bio_put(bio);
221 } 174 }
222 r10bio_pool_free(r10bio, conf); 175 r10bio_pool_free(r10bio, conf);
223} 176}
224 177
225static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 178static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
226{ 179{
227 int i; 180 int i;
228 181
@@ -231,35 +184,31 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
231 if (!BIO_SPECIAL(*bio)) 184 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio); 185 bio_put(*bio);
233 *bio = NULL; 186 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio);
237 *bio = NULL;
238 } 187 }
239} 188}
240 189
241static void free_r10bio(struct r10bio *r10_bio) 190static void free_r10bio(r10bio_t *r10_bio)
242{ 191{
243 struct r10conf *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
244 193
245 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
247} 196}
248 197
249static void put_buf(struct r10bio *r10_bio) 198static void put_buf(r10bio_t *r10_bio)
250{ 199{
251 struct r10conf *conf = r10_bio->mddev->private; 200 conf_t *conf = r10_bio->mddev->private;
252 201
253 mempool_free(r10_bio, conf->r10buf_pool); 202 mempool_free(r10_bio, conf->r10buf_pool);
254 203
255 lower_barrier(conf); 204 lower_barrier(conf);
256} 205}
257 206
258static void reschedule_retry(struct r10bio *r10_bio) 207static void reschedule_retry(r10bio_t *r10_bio)
259{ 208{
260 unsigned long flags; 209 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev; 210 mddev_t *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private; 211 conf_t *conf = mddev->private;
263 212
264 spin_lock_irqsave(&conf->device_lock, flags); 213 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list); 214 list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -277,11 +226,11 @@ static void reschedule_retry(struct r10bio *r10_bio)
277 * operation and are ready to return a success/failure code to the buffer 226 * operation and are ready to return a success/failure code to the buffer
278 * cache layer. 227 * cache layer.
279 */ 228 */
280static void raid_end_bio_io(struct r10bio *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
281{ 230{
282 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
283 int done; 232 int done;
284 struct r10conf *conf = r10_bio->mddev->private; 233 conf_t *conf = r10_bio->mddev->private;
285 234
286 if (bio->bi_phys_segments) { 235 if (bio->bi_phys_segments) {
287 unsigned long flags; 236 unsigned long flags;
@@ -307,9 +256,9 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
307/* 256/*
308 * Update disk head position estimator based on IRQ completion info. 257 * Update disk head position estimator based on IRQ completion info.
309 */ 258 */
310static inline void update_head_pos(int slot, struct r10bio *r10_bio) 259static inline void update_head_pos(int slot, r10bio_t *r10_bio)
311{ 260{
312 struct r10conf *conf = r10_bio->mddev->private; 261 conf_t *conf = r10_bio->mddev->private;
313 262
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 263 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -318,43 +267,33 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
318/* 267/*
319 * Find the disk number which triggered given bio 268 * Find the disk number which triggered given bio
320 */ 269 */
321static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
322 struct bio *bio, int *slotp, int *replp) 271 struct bio *bio, int *slotp)
323{ 272{
324 int slot; 273 int slot;
325 int repl = 0;
326 274
327 for (slot = 0; slot < conf->copies; slot++) { 275 for (slot = 0; slot < conf->copies; slot++)
328 if (r10_bio->devs[slot].bio == bio) 276 if (r10_bio->devs[slot].bio == bio)
329 break; 277 break;
330 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1;
332 break;
333 }
334 }
335 278
336 BUG_ON(slot == conf->copies); 279 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio); 280 update_head_pos(slot, r10_bio);
338 281
339 if (slotp) 282 if (slotp)
340 *slotp = slot; 283 *slotp = slot;
341 if (replp)
342 *replp = repl;
343 return r10_bio->devs[slot].devnum; 284 return r10_bio->devs[slot].devnum;
344} 285}
345 286
346static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
347{ 288{
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private; 290 r10bio_t *r10_bio = bio->bi_private;
350 int slot, dev; 291 int slot, dev;
351 struct md_rdev *rdev; 292 conf_t *conf = r10_bio->mddev->private;
352 struct r10conf *conf = r10_bio->mddev->private;
353 293
354 294
355 slot = r10_bio->read_slot; 295 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum; 296 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev;
358 /* 297 /*
359 * this branch is our 'one mirror IO has finished' event handler: 298 * this branch is our 'one mirror IO has finished' event handler:
360 */ 299 */
@@ -371,21 +310,8 @@ static void raid10_end_read_request(struct bio *bio, int error)
371 * wait for the 'master' bio. 310 * wait for the 'master' bio.
372 */ 311 */
373 set_bit(R10BIO_Uptodate, &r10_bio->state); 312 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else {
375 /* If all other devices that store this block have
376 * failed, we want to return the error upwards rather
377 * than fail the last device. Here we redefine
378 * "uptodate" to mean "Don't want to retry"
379 */
380 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio); 313 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev); 314 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
389 } else { 315 } else {
390 /* 316 /*
391 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
@@ -394,14 +320,14 @@ static void raid10_end_read_request(struct bio *bio, int error)
394 printk_ratelimited(KERN_ERR 320 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev), 322 mdname(conf->mddev),
397 bdevname(rdev->bdev, b), 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
398 (unsigned long long)r10_bio->sector); 324 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state); 325 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
401 } 327 }
402} 328}
403 329
404static void close_write(struct r10bio *r10_bio) 330static void close_write(r10bio_t *r10_bio)
405{ 331{
406 /* clear the bitmap if all writes complete successfully */ 332 /* clear the bitmap if all writes complete successfully */
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
@@ -411,7 +337,7 @@ static void close_write(struct r10bio *r10_bio)
411 md_write_end(r10_bio->mddev); 337 md_write_end(r10_bio->mddev);
412} 338}
413 339
414static void one_write_done(struct r10bio *r10_bio) 340static void one_write_done(r10bio_t *r10_bio)
415{ 341{
416 if (atomic_dec_and_test(&r10_bio->remaining)) { 342 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 343 if (test_bit(R10BIO_WriteError, &r10_bio->state))
@@ -429,39 +355,21 @@ static void one_write_done(struct r10bio *r10_bio)
429static void raid10_end_write_request(struct bio *bio, int error) 355static void raid10_end_write_request(struct bio *bio, int error)
430{ 356{
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 357 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private; 358 r10bio_t *r10_bio = bio->bi_private;
433 int dev; 359 int dev;
434 int dec_rdev = 1; 360 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private; 361 conf_t *conf = r10_bio->mddev->private;
436 int slot, repl; 362 int slot;
437 struct md_rdev *rdev = NULL; 363
438 364 dev = find_bio_disk(conf, r10_bio, bio, &slot);
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 365
440
441 if (repl)
442 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) {
444 smp_rmb();
445 repl = 0;
446 rdev = conf->mirrors[dev].rdev;
447 }
448 /* 366 /*
449 * this branch is our 'one mirror IO has finished' event handler: 367 * this branch is our 'one mirror IO has finished' event handler:
450 */ 368 */
451 if (!uptodate) { 369 if (!uptodate) {
452 if (repl) 370 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
453 /* Never record new bad blocks to replacement, 371 set_bit(R10BIO_WriteError, &r10_bio->state);
454 * just fail it. 372 dec_rdev = 0;
455 */
456 md_error(rdev->mddev, rdev);
457 else {
458 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0;
464 }
465 } else { 373 } else {
466 /* 374 /*
467 * Set R10BIO_Uptodate in our master bio, so that 375 * Set R10BIO_Uptodate in our master bio, so that
@@ -478,15 +386,12 @@ static void raid10_end_write_request(struct bio *bio, int error)
478 set_bit(R10BIO_Uptodate, &r10_bio->state); 386 set_bit(R10BIO_Uptodate, &r10_bio->state);
479 387
480 /* Maybe we can clear some bad blocks. */ 388 /* Maybe we can clear some bad blocks. */
481 if (is_badblock(rdev, 389 if (is_badblock(conf->mirrors[dev].rdev,
482 r10_bio->devs[slot].addr, 390 r10_bio->devs[slot].addr,
483 r10_bio->sectors, 391 r10_bio->sectors,
484 &first_bad, &bad_sectors)) { 392 &first_bad, &bad_sectors)) {
485 bio_put(bio); 393 bio_put(bio);
486 if (repl) 394 r10_bio->devs[slot].bio = IO_MADE_GOOD;
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0; 395 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state); 396 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 } 397 }
@@ -499,9 +404,10 @@ static void raid10_end_write_request(struct bio *bio, int error)
499 */ 404 */
500 one_write_done(r10_bio); 405 one_write_done(r10_bio);
501 if (dec_rdev) 406 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev); 407 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
503} 408}
504 409
410
505/* 411/*
506 * RAID10 layout manager 412 * RAID10 layout manager
507 * As well as the chunksize and raid_disks count, there are two 413 * As well as the chunksize and raid_disks count, there are two
@@ -527,96 +433,79 @@ static void raid10_end_write_request(struct bio *bio, int error)
527 * sector offset to a virtual address 433 * sector offset to a virtual address
528 */ 434 */
529 435
530static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 436static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
531{ 437{
532 int n,f; 438 int n,f;
533 sector_t sector; 439 sector_t sector;
534 sector_t chunk; 440 sector_t chunk;
535 sector_t stripe; 441 sector_t stripe;
536 int dev; 442 int dev;
443
537 int slot = 0; 444 int slot = 0;
538 445
539 /* now calculate first sector/dev */ 446 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 447 chunk = r10bio->sector >> conf->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask; 448 sector = r10bio->sector & conf->chunk_mask;
542 449
543 chunk *= geo->near_copies; 450 chunk *= conf->near_copies;
544 stripe = chunk; 451 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks); 452 dev = sector_div(stripe, conf->raid_disks);
546 if (geo->far_offset) 453 if (conf->far_offset)
547 stripe *= geo->far_copies; 454 stripe *= conf->far_copies;
548 455
549 sector += stripe << geo->chunk_shift; 456 sector += stripe << conf->chunk_shift;
550 457
551 /* and calculate all the others */ 458 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 459 for (n=0; n < conf->near_copies; n++) {
553 int d = dev; 460 int d = dev;
554 sector_t s = sector; 461 sector_t s = sector;
555 r10bio->devs[slot].addr = sector; 462 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d; 463 r10bio->devs[slot].devnum = d;
557 slot++; 464 slot++;
558 465
559 for (f = 1; f < geo->far_copies; f++) { 466 for (f = 1; f < conf->far_copies; f++) {
560 d += geo->near_copies; 467 d += conf->near_copies;
561 if (d >= geo->raid_disks) 468 if (d >= conf->raid_disks)
562 d -= geo->raid_disks; 469 d -= conf->raid_disks;
563 s += geo->stride; 470 s += conf->stride;
564 r10bio->devs[slot].devnum = d; 471 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s; 472 r10bio->devs[slot].addr = s;
566 slot++; 473 slot++;
567 } 474 }
568 dev++; 475 dev++;
569 if (dev >= geo->raid_disks) { 476 if (dev >= conf->raid_disks) {
570 dev = 0; 477 dev = 0;
571 sector += (geo->chunk_mask + 1); 478 sector += (conf->chunk_mask + 1);
572 } 479 }
573 } 480 }
481 BUG_ON(slot != conf->copies);
574} 482}
575 483
576static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 484static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
577{
578 struct geom *geo = &conf->geo;
579
580 if (conf->reshape_progress != MaxSector &&
581 ((r10bio->sector >= conf->reshape_progress) !=
582 conf->mddev->reshape_backwards)) {
583 set_bit(R10BIO_Previous, &r10bio->state);
584 geo = &conf->prev;
585 } else
586 clear_bit(R10BIO_Previous, &r10bio->state);
587
588 __raid10_find_phys(geo, r10bio);
589}
590
591static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
592{ 485{
593 sector_t offset, chunk, vchunk; 486 sector_t offset, chunk, vchunk;
594 /* Never use conf->prev as this is only called during resync
595 * or recovery, so reshape isn't happening
596 */
597 struct geom *geo = &conf->geo;
598 487
599 offset = sector & geo->chunk_mask; 488 offset = sector & conf->chunk_mask;
600 if (geo->far_offset) { 489 if (conf->far_offset) {
601 int fc; 490 int fc;
602 chunk = sector >> geo->chunk_shift; 491 chunk = sector >> conf->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies); 492 fc = sector_div(chunk, conf->far_copies);
604 dev -= fc * geo->near_copies; 493 dev -= fc * conf->near_copies;
605 if (dev < 0) 494 if (dev < 0)
606 dev += geo->raid_disks; 495 dev += conf->raid_disks;
607 } else { 496 } else {
608 while (sector >= geo->stride) { 497 while (sector >= conf->stride) {
609 sector -= geo->stride; 498 sector -= conf->stride;
610 if (dev < geo->near_copies) 499 if (dev < conf->near_copies)
611 dev += geo->raid_disks - geo->near_copies; 500 dev += conf->raid_disks - conf->near_copies;
612 else 501 else
613 dev -= geo->near_copies; 502 dev -= conf->near_copies;
614 } 503 }
615 chunk = sector >> geo->chunk_shift; 504 chunk = sector >> conf->chunk_shift;
616 } 505 }
617 vchunk = chunk * geo->raid_disks + dev; 506 vchunk = chunk * conf->raid_disks + dev;
618 sector_div(vchunk, geo->near_copies); 507 sector_div(vchunk, conf->near_copies);
619 return (vchunk << geo->chunk_shift) + offset; 508 return (vchunk << conf->chunk_shift) + offset;
620} 509}
621 510
622/** 511/**
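
Note on the hunk ending here: it replaces the geo->near_copies / geo->far_copies form of raid10_find_phys() and raid10_find_virt() with the conf-> form; the address arithmetic itself is unchanged. Below is a hedged user-space sketch of that arithmetic which prints where each copy of a virtual sector lands; sector_div() is a simplified stand-in for the kernel helper (divide in place, return the remainder) and the geometry in main() is invented for illustration.

/* Sketch of the raid10 virtual-to-physical mapping in the hunk above. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct demo_geo {
	int raid_disks;
	int near_copies;
	int far_copies;
	int far_offset;
	int chunk_shift;        /* chunk size is 1 << chunk_shift sectors */
	sector_t chunk_mask;    /* (1 << chunk_shift) - 1 */
	sector_t stride;        /* sector gap between far copies on a device */
};

static sector_t sector_div(sector_t *n, unsigned int base)
{
	sector_t rem = *n % base;

	*n /= base;
	return rem;
}

/* Print the (device, device-sector) pair for every copy of one array sector. */
static void find_phys(const struct demo_geo *g, sector_t virt)
{
	sector_t chunk = virt >> g->chunk_shift;
	sector_t sector = virt & g->chunk_mask;
	sector_t stripe;
	int dev, n, f;

	chunk *= g->near_copies;
	stripe = chunk;
	dev = (int)sector_div(&stripe, g->raid_disks);
	if (g->far_offset)
		stripe *= g->far_copies;
	sector += stripe << g->chunk_shift;

	for (n = 0; n < g->near_copies; n++) {
		int d = dev;
		sector_t s = sector;

		printf("virt %llu -> dev %d sector %llu\n",
		       (unsigned long long)virt, d, (unsigned long long)sector);
		for (f = 1; f < g->far_copies; f++) {
			d += g->near_copies;
			if (d >= g->raid_disks)
				d -= g->raid_disks;
			s += g->stride;
			printf("virt %llu -> dev %d sector %llu (far copy)\n",
			       (unsigned long long)virt, d, (unsigned long long)s);
		}
		dev++;
		if (dev >= g->raid_disks) {
			dev = 0;
			sector += g->chunk_mask + 1;
		}
	}
}

int main(void)
{
	/* Illustrative only: 4 devices, 2 near copies, no far copies,
	 * 64 KiB chunks (128 sectors). */
	struct demo_geo g = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 1,
		.far_offset = 0, .chunk_shift = 7, .chunk_mask = 127,
		.stride = 0,
	};

	find_phys(&g, 0);    /* devs 0 and 1, sector 0 */
	find_phys(&g, 128);  /* devs 2 and 3, sector 0 */
	find_phys(&g, 300);  /* devs 0 and 1, sector 172 */
	return 0;
}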
@@ -626,85 +515,25 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
626 * @biovec: the request that could be merged to it. 515 * @biovec: the request that could be merged to it.
627 * 516 *
628 * Return amount of bytes we can accept at this offset 517 * Return amount of bytes we can accept at this offset
629 * This requires checking for end-of-chunk if near_copies != raid_disks, 518 * If near_copies == raid_disk, there are no striping issues,
630 * and for subordinate merge_bvec_fns if merge_check_needed. 519 * but in that case, the function isn't called at all.
631 */ 520 */
632static int raid10_mergeable_bvec(struct request_queue *q, 521static int raid10_mergeable_bvec(struct request_queue *q,
633 struct bvec_merge_data *bvm, 522 struct bvec_merge_data *bvm,
634 struct bio_vec *biovec) 523 struct bio_vec *biovec)
635{ 524{
636 struct mddev *mddev = q->queuedata; 525 mddev_t *mddev = q->queuedata;
637 struct r10conf *conf = mddev->private;
638 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 526 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
639 int max; 527 int max;
640 unsigned int chunk_sectors; 528 unsigned int chunk_sectors = mddev->chunk_sectors;
641 unsigned int bio_sectors = bvm->bi_size >> 9; 529 unsigned int bio_sectors = bvm->bi_size >> 9;
642 struct geom *geo = &conf->geo; 530
643 531 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
644 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; 532 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
645 if (conf->reshape_progress != MaxSector && 533 if (max <= biovec->bv_len && bio_sectors == 0)
646 ((sector >= conf->reshape_progress) != 534 return biovec->bv_len;
647 conf->mddev->reshape_backwards)) 535 else
648 geo = &conf->prev; 536 return max;
649
650 if (geo->near_copies < geo->raid_disks) {
651 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
652 + bio_sectors)) << 9;
653 if (max < 0)
654 /* bio_add cannot handle a negative return */
655 max = 0;
656 if (max <= biovec->bv_len && bio_sectors == 0)
657 return biovec->bv_len;
658 } else
659 max = biovec->bv_len;
660
661 if (mddev->merge_check_needed) {
662 struct {
663 struct r10bio r10_bio;
664 struct r10dev devs[conf->copies];
665 } on_stack;
666 struct r10bio *r10_bio = &on_stack.r10_bio;
667 int s;
668 if (conf->reshape_progress != MaxSector) {
669 /* Cannot give any guidance during reshape */
670 if (max <= biovec->bv_len && bio_sectors == 0)
671 return biovec->bv_len;
672 return 0;
673 }
674 r10_bio->sector = sector;
675 raid10_find_phys(conf, r10_bio);
676 rcu_read_lock();
677 for (s = 0; s < conf->copies; s++) {
678 int disk = r10_bio->devs[s].devnum;
679 struct md_rdev *rdev = rcu_dereference(
680 conf->mirrors[disk].rdev);
681 if (rdev && !test_bit(Faulty, &rdev->flags)) {
682 struct request_queue *q =
683 bdev_get_queue(rdev->bdev);
684 if (q->merge_bvec_fn) {
685 bvm->bi_sector = r10_bio->devs[s].addr
686 + rdev->data_offset;
687 bvm->bi_bdev = rdev->bdev;
688 max = min(max, q->merge_bvec_fn(
689 q, bvm, biovec));
690 }
691 }
692 rdev = rcu_dereference(conf->mirrors[disk].replacement);
693 if (rdev && !test_bit(Faulty, &rdev->flags)) {
694 struct request_queue *q =
695 bdev_get_queue(rdev->bdev);
696 if (q->merge_bvec_fn) {
697 bvm->bi_sector = r10_bio->devs[s].addr
698 + rdev->data_offset;
699 bvm->bi_bdev = rdev->bdev;
700 max = min(max, q->merge_bvec_fn(
701 q, bvm, biovec));
702 }
703 }
704 }
705 rcu_read_unlock();
706 }
707 return max;
708} 537}
709 538
710/* 539/*
@@ -726,26 +555,22 @@ static int raid10_mergeable_bvec(struct request_queue *q,
726 * FIXME: possibly should rethink readbalancing and do it differently 555 * FIXME: possibly should rethink readbalancing and do it differently
727 * depending on near_copies / far_copies geometry. 556 * depending on near_copies / far_copies geometry.
728 */ 557 */
729static struct md_rdev *read_balance(struct r10conf *conf, 558static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
730 struct r10bio *r10_bio,
731 int *max_sectors)
732{ 559{
733 const sector_t this_sector = r10_bio->sector; 560 const sector_t this_sector = r10_bio->sector;
734 int disk, slot; 561 int disk, slot;
735 int sectors = r10_bio->sectors; 562 int sectors = r10_bio->sectors;
736 int best_good_sectors; 563 int best_good_sectors;
737 sector_t new_distance, best_dist; 564 sector_t new_distance, best_dist;
738 struct md_rdev *best_rdev, *rdev = NULL; 565 mdk_rdev_t *rdev;
739 int do_balance; 566 int do_balance;
740 int best_slot; 567 int best_slot;
741 struct geom *geo = &conf->geo;
742 568
743 raid10_find_phys(conf, r10_bio); 569 raid10_find_phys(conf, r10_bio);
744 rcu_read_lock(); 570 rcu_read_lock();
745retry: 571retry:
746 sectors = r10_bio->sectors; 572 sectors = r10_bio->sectors;
747 best_slot = -1; 573 best_slot = -1;
748 best_rdev = NULL;
749 best_dist = MaxSector; 574 best_dist = MaxSector;
750 best_good_sectors = 0; 575 best_good_sectors = 0;
751 do_balance = 1; 576 do_balance = 1;
@@ -767,17 +592,10 @@ retry:
767 if (r10_bio->devs[slot].bio == IO_BLOCKED) 592 if (r10_bio->devs[slot].bio == IO_BLOCKED)
768 continue; 593 continue;
769 disk = r10_bio->devs[slot].devnum; 594 disk = r10_bio->devs[slot].devnum;
770 rdev = rcu_dereference(conf->mirrors[disk].replacement); 595 rdev = rcu_dereference(conf->mirrors[disk].rdev);
771 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 596 if (rdev == NULL)
772 test_bit(Unmerged, &rdev->flags) ||
773 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
774 rdev = rcu_dereference(conf->mirrors[disk].rdev);
775 if (rdev == NULL ||
776 test_bit(Faulty, &rdev->flags) ||
777 test_bit(Unmerged, &rdev->flags))
778 continue; 597 continue;
779 if (!test_bit(In_sync, &rdev->flags) && 598 if (!test_bit(In_sync, &rdev->flags))
780 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
781 continue; 599 continue;
782 600
783 dev_sector = r10_bio->devs[slot].addr; 601 dev_sector = r10_bio->devs[slot].addr;
@@ -802,7 +620,6 @@ retry:
802 if (good_sectors > best_good_sectors) { 620 if (good_sectors > best_good_sectors) {
803 best_good_sectors = good_sectors; 621 best_good_sectors = good_sectors;
804 best_slot = slot; 622 best_slot = slot;
805 best_rdev = rdev;
806 } 623 }
807 if (!do_balance) 624 if (!do_balance)
808 /* Must read from here */ 625 /* Must read from here */
@@ -819,11 +636,11 @@ retry:
819 * sequential read speed for 'far copies' arrays. So only 636 * sequential read speed for 'far copies' arrays. So only
820 * keep it for 'near' arrays, and review those later. 637 * keep it for 'near' arrays, and review those later.
821 */ 638 */
822 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 639 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
823 break; 640 break;
824 641
825 /* for far > 1 always use the lowest address */ 642 /* for far > 1 always use the lowest address */
826 if (geo->far_copies > 1) 643 if (conf->far_copies > 1)
827 new_distance = r10_bio->devs[slot].addr; 644 new_distance = r10_bio->devs[slot].addr;
828 else 645 else
829 new_distance = abs(r10_bio->devs[slot].addr - 646 new_distance = abs(r10_bio->devs[slot].addr -
@@ -831,15 +648,16 @@ retry:
831 if (new_distance < best_dist) { 648 if (new_distance < best_dist) {
832 best_dist = new_distance; 649 best_dist = new_distance;
833 best_slot = slot; 650 best_slot = slot;
834 best_rdev = rdev;
835 } 651 }
836 } 652 }
837 if (slot >= conf->copies) { 653 if (slot == conf->copies)
838 slot = best_slot; 654 slot = best_slot;
839 rdev = best_rdev;
840 }
841 655
842 if (slot >= 0) { 656 if (slot >= 0) {
657 disk = r10_bio->devs[slot].devnum;
658 rdev = rcu_dereference(conf->mirrors[disk].rdev);
659 if (!rdev)
660 goto retry;
843 atomic_inc(&rdev->nr_pending); 661 atomic_inc(&rdev->nr_pending);
844 if (test_bit(Faulty, &rdev->flags)) { 662 if (test_bit(Faulty, &rdev->flags)) {
845 /* Cannot risk returning a device that failed 663 /* Cannot risk returning a device that failed
@@ -850,28 +668,24 @@ retry:
850 } 668 }
851 r10_bio->read_slot = slot; 669 r10_bio->read_slot = slot;
852 } else 670 } else
853 rdev = NULL; 671 disk = -1;
854 rcu_read_unlock(); 672 rcu_read_unlock();
855 *max_sectors = best_good_sectors; 673 *max_sectors = best_good_sectors;
856 674
857 return rdev; 675 return disk;
858} 676}
859 677
860int md_raid10_congested(struct mddev *mddev, int bits) 678static int raid10_congested(void *data, int bits)
861{ 679{
862 struct r10conf *conf = mddev->private; 680 mddev_t *mddev = data;
681 conf_t *conf = mddev->private;
863 int i, ret = 0; 682 int i, ret = 0;
864 683
865 if ((bits & (1 << BDI_async_congested)) && 684 if (mddev_congested(mddev, bits))
866 conf->pending_count >= max_queued_requests)
867 return 1; 685 return 1;
868
869 rcu_read_lock(); 686 rcu_read_lock();
870 for (i = 0; 687 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 688 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) { 689 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev); 690 struct request_queue *q = bdev_get_queue(rdev->bdev);
877 691
@@ -881,17 +695,8 @@ int md_raid10_congested(struct mddev *mddev, int bits)
881 rcu_read_unlock(); 695 rcu_read_unlock();
882 return ret; 696 return ret;
883} 697}
884EXPORT_SYMBOL_GPL(md_raid10_congested);
885
886static int raid10_congested(void *data, int bits)
887{
888 struct mddev *mddev = data;
889
890 return mddev_congested(mddev, bits) ||
891 md_raid10_congested(mddev, bits);
892}
893 698
894static void flush_pending_writes(struct r10conf *conf) 699static void flush_pending_writes(conf_t *conf)
895{ 700{
896 /* Any writes that have been queued but are awaiting 701 /* Any writes that have been queued but are awaiting
897 * bitmap updates get flushed here. 702 * bitmap updates get flushed here.
@@ -901,22 +706,15 @@ static void flush_pending_writes(struct r10conf *conf)
901 if (conf->pending_bio_list.head) { 706 if (conf->pending_bio_list.head) {
902 struct bio *bio; 707 struct bio *bio;
903 bio = bio_list_get(&conf->pending_bio_list); 708 bio = bio_list_get(&conf->pending_bio_list);
904 conf->pending_count = 0;
905 spin_unlock_irq(&conf->device_lock); 709 spin_unlock_irq(&conf->device_lock);
906 /* flush any pending bitmap writes to disk 710 /* flush any pending bitmap writes to disk
907 * before proceeding w/ I/O */ 711 * before proceeding w/ I/O */
908 bitmap_unplug(conf->mddev->bitmap); 712 bitmap_unplug(conf->mddev->bitmap);
909 wake_up(&conf->wait_barrier);
910 713
911 while (bio) { /* submit pending writes */ 714 while (bio) { /* submit pending writes */
912 struct bio *next = bio->bi_next; 715 struct bio *next = bio->bi_next;
913 bio->bi_next = NULL; 716 bio->bi_next = NULL;
914 if (unlikely((bio->bi_rw & REQ_DISCARD) && 717 generic_make_request(bio);
915 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
916 /* Just ignore it */
917 bio_endio(bio, 0);
918 else
919 generic_make_request(bio);
920 bio = next; 718 bio = next;
921 } 719 }
922 } else 720 } else
@@ -945,14 +743,14 @@ static void flush_pending_writes(struct r10conf *conf)
945 * lower_barrier when the particular background IO completes. 743 * lower_barrier when the particular background IO completes.
946 */ 744 */
947 745
948static void raise_barrier(struct r10conf *conf, int force) 746static void raise_barrier(conf_t *conf, int force)
949{ 747{
950 BUG_ON(force && !conf->barrier); 748 BUG_ON(force && !conf->barrier);
951 spin_lock_irq(&conf->resync_lock); 749 spin_lock_irq(&conf->resync_lock);
952 750
953 /* Wait until no block IO is waiting (unless 'force') */ 751 /* Wait until no block IO is waiting (unless 'force') */
954 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 752 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
955 conf->resync_lock); 753 conf->resync_lock, );
956 754
957 /* block any new IO from starting */ 755 /* block any new IO from starting */
958 conf->barrier++; 756 conf->barrier++;
@@ -960,12 +758,12 @@ static void raise_barrier(struct r10conf *conf, int force)
960 /* Now wait for all pending IO to complete */ 758 /* Now wait for all pending IO to complete */
961 wait_event_lock_irq(conf->wait_barrier, 759 wait_event_lock_irq(conf->wait_barrier,
962 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 760 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
963 conf->resync_lock); 761 conf->resync_lock, );
964 762
965 spin_unlock_irq(&conf->resync_lock); 763 spin_unlock_irq(&conf->resync_lock);
966} 764}
967 765
968static void lower_barrier(struct r10conf *conf) 766static void lower_barrier(conf_t *conf)
969{ 767{
970 unsigned long flags; 768 unsigned long flags;
971 spin_lock_irqsave(&conf->resync_lock, flags); 769 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -974,33 +772,21 @@ static void lower_barrier(struct r10conf *conf)
974 wake_up(&conf->wait_barrier); 772 wake_up(&conf->wait_barrier);
975} 773}
976 774
977static void wait_barrier(struct r10conf *conf) 775static void wait_barrier(conf_t *conf)
978{ 776{
979 spin_lock_irq(&conf->resync_lock); 777 spin_lock_irq(&conf->resync_lock);
980 if (conf->barrier) { 778 if (conf->barrier) {
981 conf->nr_waiting++; 779 conf->nr_waiting++;
982 /* Wait for the barrier to drop. 780 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
983 * However if there are already pending 781 conf->resync_lock,
984 * requests (preventing the barrier from 782 );
985 * rising completely), and the
986 * pre-process bio queue isn't empty,
987 * then don't wait, as we need to empty
988 * that queue to get the nr_pending
989 * count down.
990 */
991 wait_event_lock_irq(conf->wait_barrier,
992 !conf->barrier ||
993 (conf->nr_pending &&
994 current->bio_list &&
995 !bio_list_empty(current->bio_list)),
996 conf->resync_lock);
997 conf->nr_waiting--; 783 conf->nr_waiting--;
998 } 784 }
999 conf->nr_pending++; 785 conf->nr_pending++;
1000 spin_unlock_irq(&conf->resync_lock); 786 spin_unlock_irq(&conf->resync_lock);
1001} 787}
1002 788
1003static void allow_barrier(struct r10conf *conf) 789static void allow_barrier(conf_t *conf)
1004{ 790{
1005 unsigned long flags; 791 unsigned long flags;
1006 spin_lock_irqsave(&conf->resync_lock, flags); 792 spin_lock_irqsave(&conf->resync_lock, flags);
@@ -1009,7 +795,7 @@ static void allow_barrier(struct r10conf *conf)
1009 wake_up(&conf->wait_barrier); 795 wake_up(&conf->wait_barrier);
1010} 796}
1011 797
1012static void freeze_array(struct r10conf *conf) 798static void freeze_array(conf_t *conf)
1013{ 799{
1014 /* stop syncio and normal IO and wait for everything to 800 /* stop syncio and normal IO and wait for everything to
1015 * go quiet. 801 * go quiet.
@@ -1026,15 +812,15 @@ static void freeze_array(struct r10conf *conf)
1026 spin_lock_irq(&conf->resync_lock); 812 spin_lock_irq(&conf->resync_lock);
1027 conf->barrier++; 813 conf->barrier++;
1028 conf->nr_waiting++; 814 conf->nr_waiting++;
1029 wait_event_lock_irq_cmd(conf->wait_barrier, 815 wait_event_lock_irq(conf->wait_barrier,
1030 conf->nr_pending == conf->nr_queued+1, 816 conf->nr_pending == conf->nr_queued+1,
1031 conf->resync_lock, 817 conf->resync_lock,
1032 flush_pending_writes(conf)); 818 flush_pending_writes(conf));
1033 819
1034 spin_unlock_irq(&conf->resync_lock); 820 spin_unlock_irq(&conf->resync_lock);
1035} 821}
1036 822
1037static void unfreeze_array(struct r10conf *conf) 823static void unfreeze_array(conf_t *conf)
1038{ 824{
1039 /* reverse the effect of the freeze */ 825 /* reverse the effect of the freeze */
1040 spin_lock_irq(&conf->resync_lock); 826 spin_lock_irq(&conf->resync_lock);
@@ -1044,90 +830,37 @@ static void unfreeze_array(struct r10conf *conf)
1044 spin_unlock_irq(&conf->resync_lock); 830 spin_unlock_irq(&conf->resync_lock);
1045} 831}
1046 832
1047static sector_t choose_data_offset(struct r10bio *r10_bio, 833static int make_request(mddev_t *mddev, struct bio * bio)
1048 struct md_rdev *rdev)
1049{
1050 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1051 test_bit(R10BIO_Previous, &r10_bio->state))
1052 return rdev->data_offset;
1053 else
1054 return rdev->new_data_offset;
1055}
1056
1057struct raid10_plug_cb {
1058 struct blk_plug_cb cb;
1059 struct bio_list pending;
1060 int pending_cnt;
1061};
1062
1063static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1064{
1065 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1066 cb);
1067 struct mddev *mddev = plug->cb.data;
1068 struct r10conf *conf = mddev->private;
1069 struct bio *bio;
1070
1071 if (from_schedule || current->bio_list) {
1072 spin_lock_irq(&conf->device_lock);
1073 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1074 conf->pending_count += plug->pending_cnt;
1075 spin_unlock_irq(&conf->device_lock);
1076 md_wakeup_thread(mddev->thread);
1077 kfree(plug);
1078 return;
1079 }
1080
1081 /* we aren't scheduling, so we can do the write-out directly. */
1082 bio = bio_list_get(&plug->pending);
1083 bitmap_unplug(mddev->bitmap);
1084 wake_up(&conf->wait_barrier);
1085
1086 while (bio) { /* submit pending writes */
1087 struct bio *next = bio->bi_next;
1088 bio->bi_next = NULL;
1089 generic_make_request(bio);
1090 bio = next;
1091 }
1092 kfree(plug);
1093}
1094
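
The raid10_plug_cb/raid10_unplug code removed in this hunk batches writes on a per-task plug list and only pushes them to conf->pending_bio_list (or submits them directly) when the plug is released. Below is a loose userspace analogy of that batching pattern; struct plug, struct bio_item and the submit_write()/unplug()/dispatch() names are made up for the example and are not block-layer APIs.

/* Userspace analogy of the plugging removed above: while a task holds a
 * "plug", writes are collected on a private list and only dispatched in one
 * batch when the plug is released. */
#include <stdio.h>
#include <stdlib.h>

struct bio_item {
	int sector;
	struct bio_item *next;
};

struct plug {
	struct bio_item *pending;   /* collected but not yet submitted */
	int pending_cnt;
};

static void dispatch(struct bio_item *b)
{
	printf("submitting write at sector %d\n", b->sector);
	free(b);
}

/* Queue a write: batch it if a plug is active, else send it right away. */
static void submit_write(struct plug *plug, int sector)
{
	struct bio_item *b = malloc(sizeof(*b));

	b->sector = sector;
	b->next = NULL;
	if (plug) {
		b->next = plug->pending;
		plug->pending = b;
		plug->pending_cnt++;
	} else {
		dispatch(b);
	}
}

/* Releasing the plug flushes everything collected so far in one go, which
 * is what raid10_unplug() did for the pending bio list. */
static void unplug(struct plug *plug)
{
	while (plug->pending) {
		struct bio_item *b = plug->pending;

		plug->pending = b->next;
		dispatch(b);
	}
	plug->pending_cnt = 0;
}

int main(void)
{
	struct plug plug = { NULL, 0 };

	submit_write(&plug, 8);
	submit_write(&plug, 16);
	submit_write(&plug, 24);
	unplug(&plug);          /* all three go out as one batch */
	submit_write(NULL, 32); /* no plug: dispatched immediately */
	return 0;
}
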
1095static void make_request(struct mddev *mddev, struct bio * bio)
1096{ 834{
1097 struct r10conf *conf = mddev->private; 835 conf_t *conf = mddev->private;
1098 struct r10bio *r10_bio; 836 mirror_info_t *mirror;
837 r10bio_t *r10_bio;
1099 struct bio *read_bio; 838 struct bio *read_bio;
1100 int i; 839 int i;
1101 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 840 int chunk_sects = conf->chunk_mask + 1;
1102 int chunk_sects = chunk_mask + 1;
1103 const int rw = bio_data_dir(bio); 841 const int rw = bio_data_dir(bio);
1104 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 842 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1105 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 843 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1106 const unsigned long do_discard = (bio->bi_rw
1107 & (REQ_DISCARD | REQ_SECURE));
1108 unsigned long flags; 844 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 845 mdk_rdev_t *blocked_rdev;
1110 struct blk_plug_cb *cb; 846 int plugged;
1111 struct raid10_plug_cb *plug = NULL;
1112 int sectors_handled; 847 int sectors_handled;
1113 int max_sectors; 848 int max_sectors;
1114 int sectors;
1115 849
1116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 850 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1117 md_flush_request(mddev, bio); 851 md_flush_request(mddev, bio);
1118 return; 852 return 0;
1119 } 853 }
1120 854
1121 /* If this request crosses a chunk boundary, we need to 855 /* If this request crosses a chunk boundary, we need to
1122 * split it. This will only happen for 1 PAGE (or less) requests. 856 * split it. This will only happen for 1 PAGE (or less) requests.
1123 */ 857 */
1124 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 858 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
1125 > chunk_sects 859 > chunk_sects &&
1126 && (conf->geo.near_copies < conf->geo.raid_disks 860 conf->near_copies < conf->raid_disks)) {
1127 || conf->prev.near_copies < conf->prev.raid_disks))) {
1128 struct bio_pair *bp; 861 struct bio_pair *bp;
1129 /* Sanity check -- queue functions should prevent this happening */ 862 /* Sanity check -- queue functions should prevent this happening */
1130 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 863 if (bio->bi_vcnt != 1 ||
1131 bio->bi_idx != 0) 864 bio->bi_idx != 0)
1132 goto bad_map; 865 goto bad_map;
1133 /* This is a one page bio that upper layers 866 /* This is a one page bio that upper layers
@@ -1148,8 +881,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1148 conf->nr_waiting++; 881 conf->nr_waiting++;
1149 spin_unlock_irq(&conf->resync_lock); 882 spin_unlock_irq(&conf->resync_lock);
1150 883
1151 make_request(mddev, &bp->bio1); 884 if (make_request(mddev, &bp->bio1))
1152 make_request(mddev, &bp->bio2); 885 generic_make_request(&bp->bio1);
886 if (make_request(mddev, &bp->bio2))
887 generic_make_request(&bp->bio2);
1153 888
1154 spin_lock_irq(&conf->resync_lock); 889 spin_lock_irq(&conf->resync_lock);
1155 conf->nr_waiting--; 890 conf->nr_waiting--;
@@ -1157,14 +892,14 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1157 spin_unlock_irq(&conf->resync_lock); 892 spin_unlock_irq(&conf->resync_lock);
1158 893
1159 bio_pair_release(bp); 894 bio_pair_release(bp);
1160 return; 895 return 0;
1161 bad_map: 896 bad_map:
1162 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 897 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1163 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 898 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1164 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 899 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1165 900
1166 bio_io_error(bio); 901 bio_io_error(bio);
1167 return; 902 return 0;
1168 } 903 }
1169 904
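
The test above flags a request that straddles a chunk boundary: the offset of bi_sector within its chunk plus the request length must not exceed chunk_sects, otherwise the bio is split into a bio_pair and each half is resubmitted. A standalone sketch of that arithmetic, assuming 512-byte sectors and a power-of-two chunk size (crosses_chunk() is an ad hoc name):

/* A request crosses a chunk if its offset within the chunk plus its length
 * exceeds the chunk size.  chunk_sects must be a power of two, as MD chunk
 * sizes are. */
#include <stdint.h>
#include <stdio.h>

static int crosses_chunk(uint64_t sector, unsigned int sectors,
			 unsigned int chunk_sects)
{
	uint64_t chunk_mask = chunk_sects - 1;    /* power of two */

	return (sector & chunk_mask) + sectors > chunk_sects;
}

int main(void)
{
	/* 64K chunk = 128 sectors of 512 bytes. */
	unsigned int chunk_sects = 128;

	/* 4K request (8 sectors) entirely inside the first chunk. */
	printf("sector 0,   8 sectors: %s\n",
	       crosses_chunk(0, 8, chunk_sects) ? "crosses" : "fits");

	/* 4K request starting 2 sectors before the chunk boundary. */
	printf("sector 126, 8 sectors: %s\n",
	       crosses_chunk(126, 8, chunk_sects) ? "crosses" : "fits");
	return 0;
}
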
1170 md_write_start(mddev, bio); 905 md_write_start(mddev, bio);
@@ -1176,41 +911,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1176 */ 911 */
1177 wait_barrier(conf); 912 wait_barrier(conf);
1178 913
1179 sectors = bio->bi_size >> 9;
1180 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1181 bio->bi_sector < conf->reshape_progress &&
1182 bio->bi_sector + sectors > conf->reshape_progress) {
1183 /* IO spans the reshape position. Need to wait for
1184 * reshape to pass
1185 */
1186 allow_barrier(conf);
1187 wait_event(conf->wait_barrier,
1188 conf->reshape_progress <= bio->bi_sector ||
1189 conf->reshape_progress >= bio->bi_sector + sectors);
1190 wait_barrier(conf);
1191 }
1192 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1193 bio_data_dir(bio) == WRITE &&
1194 (mddev->reshape_backwards
1195 ? (bio->bi_sector < conf->reshape_safe &&
1196 bio->bi_sector + sectors > conf->reshape_progress)
1197 : (bio->bi_sector + sectors > conf->reshape_safe &&
1198 bio->bi_sector < conf->reshape_progress))) {
1199 /* Need to update reshape_position in metadata */
1200 mddev->reshape_position = conf->reshape_progress;
1201 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1202 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1203 md_wakeup_thread(mddev->thread);
1204 wait_event(mddev->sb_wait,
1205 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1206
1207 conf->reshape_safe = mddev->reshape_position;
1208 }
1209
1210 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 914 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1211 915
1212 r10_bio->master_bio = bio; 916 r10_bio->master_bio = bio;
1213 r10_bio->sectors = sectors; 917 r10_bio->sectors = bio->bi_size >> 9;
1214 918
1215 r10_bio->mddev = mddev; 919 r10_bio->mddev = mddev;
1216 r10_bio->sector = bio->bi_sector; 920 r10_bio->sector = bio->bi_sector;
@@ -1230,27 +934,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1230 /* 934 /*
1231 * read balancing logic: 935 * read balancing logic:
1232 */ 936 */
1233 struct md_rdev *rdev; 937 int disk;
1234 int slot; 938 int slot;
1235 939
1236read_again: 940read_again:
1237 rdev = read_balance(conf, r10_bio, &max_sectors); 941 disk = read_balance(conf, r10_bio, &max_sectors);
1238 if (!rdev) { 942 slot = r10_bio->read_slot;
943 if (disk < 0) {
1239 raid_end_bio_io(r10_bio); 944 raid_end_bio_io(r10_bio);
1240 return; 945 return 0;
1241 } 946 }
1242 slot = r10_bio->read_slot; 947 mirror = conf->mirrors + disk;
1243 948
1244 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 949 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1245 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 950 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1246 max_sectors); 951 max_sectors);
1247 952
1248 r10_bio->devs[slot].bio = read_bio; 953 r10_bio->devs[slot].bio = read_bio;
1249 r10_bio->devs[slot].rdev = rdev;
1250 954
1251 read_bio->bi_sector = r10_bio->devs[slot].addr + 955 read_bio->bi_sector = r10_bio->devs[slot].addr +
1252 choose_data_offset(r10_bio, rdev); 956 mirror->rdev->data_offset;
1253 read_bio->bi_bdev = rdev->bdev; 957 read_bio->bi_bdev = mirror->rdev->bdev;
1254 read_bio->bi_end_io = raid10_end_read_request; 958 read_bio->bi_end_io = raid10_end_read_request;
1255 read_bio->bi_rw = READ | do_sync; 959 read_bio->bi_rw = READ | do_sync;
1256 read_bio->bi_private = r10_bio; 960 read_bio->bi_private = r10_bio;
@@ -1286,17 +990,12 @@ read_again:
1286 goto read_again; 990 goto read_again;
1287 } else 991 } else
1288 generic_make_request(read_bio); 992 generic_make_request(read_bio);
1289 return; 993 return 0;
1290 } 994 }
1291 995
1292 /* 996 /*
1293 * WRITE: 997 * WRITE:
1294 */ 998 */
1295 if (conf->pending_count >= max_queued_requests) {
1296 md_wakeup_thread(mddev->thread);
1297 wait_event(conf->wait_barrier,
1298 conf->pending_count < max_queued_requests);
1299 }
1300 /* first select target devices under rcu_lock and 999 /* first select target devices under rcu_lock and
1301 * inc refcount on their rdev. Record them by setting 1000 * inc refcount on their rdev. Record them by setting
1302 * bios[x] to bio 1001 * bios[x] to bio
@@ -1308,8 +1007,8 @@ read_again:
1308 * of r10_bios is recored in bio->bi_phys_segments just as with 1007 * of r10_bios is recored in bio->bi_phys_segments just as with
1309 * the read case. 1008 * the read case.
1310 */ 1009 */
1010 plugged = mddev_check_plugged(mddev);
1311 1011
1312 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1313 raid10_find_phys(conf, r10_bio); 1012 raid10_find_phys(conf, r10_bio);
1314retry_write: 1013retry_write:
1315 blocked_rdev = NULL; 1014 blocked_rdev = NULL;
@@ -1318,36 +1017,18 @@ retry_write:
1318 1017
1319 for (i = 0; i < conf->copies; i++) { 1018 for (i = 0; i < conf->copies; i++) {
1320 int d = r10_bio->devs[i].devnum; 1019 int d = r10_bio->devs[i].devnum;
1321 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1020 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
1322 struct md_rdev *rrdev = rcu_dereference(
1323 conf->mirrors[d].replacement);
1324 if (rdev == rrdev)
1325 rrdev = NULL;
1326 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1021 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1327 atomic_inc(&rdev->nr_pending); 1022 atomic_inc(&rdev->nr_pending);
1328 blocked_rdev = rdev; 1023 blocked_rdev = rdev;
1329 break; 1024 break;
1330 } 1025 }
1331 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1332 atomic_inc(&rrdev->nr_pending);
1333 blocked_rdev = rrdev;
1334 break;
1335 }
1336 if (rdev && (test_bit(Faulty, &rdev->flags)
1337 || test_bit(Unmerged, &rdev->flags)))
1338 rdev = NULL;
1339 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1340 || test_bit(Unmerged, &rrdev->flags)))
1341 rrdev = NULL;
1342
1343 r10_bio->devs[i].bio = NULL; 1026 r10_bio->devs[i].bio = NULL;
1344 r10_bio->devs[i].repl_bio = NULL; 1027 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1345
1346 if (!rdev && !rrdev) {
1347 set_bit(R10BIO_Degraded, &r10_bio->state); 1028 set_bit(R10BIO_Degraded, &r10_bio->state);
1348 continue; 1029 continue;
1349 } 1030 }
1350 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1031 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1351 sector_t first_bad; 1032 sector_t first_bad;
1352 sector_t dev_sector = r10_bio->devs[i].addr; 1033 sector_t dev_sector = r10_bio->devs[i].addr;
1353 int bad_sectors; 1034 int bad_sectors;
@@ -1389,14 +1070,8 @@ retry_write:
1389 max_sectors = good_sectors; 1070 max_sectors = good_sectors;
1390 } 1071 }
1391 } 1072 }
1392 if (rdev) { 1073 r10_bio->devs[i].bio = bio;
1393 r10_bio->devs[i].bio = bio; 1074 atomic_inc(&rdev->nr_pending);
1394 atomic_inc(&rdev->nr_pending);
1395 }
1396 if (rrdev) {
1397 r10_bio->devs[i].repl_bio = bio;
1398 atomic_inc(&rrdev->nr_pending);
1399 }
1400 } 1075 }
1401 rcu_read_unlock(); 1076 rcu_read_unlock();
1402 1077
@@ -1405,23 +1080,11 @@ retry_write:
1405 int j; 1080 int j;
1406 int d; 1081 int d;
1407 1082
1408 for (j = 0; j < i; j++) { 1083 for (j = 0; j < i; j++)
1409 if (r10_bio->devs[j].bio) { 1084 if (r10_bio->devs[j].bio) {
1410 d = r10_bio->devs[j].devnum; 1085 d = r10_bio->devs[j].devnum;
1411 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1086 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1412 } 1087 }
1413 if (r10_bio->devs[j].repl_bio) {
1414 struct md_rdev *rdev;
1415 d = r10_bio->devs[j].devnum;
1416 rdev = conf->mirrors[d].replacement;
1417 if (!rdev) {
1418 /* Race with remove_disk */
1419 smp_mb();
1420 rdev = conf->mirrors[d].rdev;
1421 }
1422 rdev_dec_pending(rdev, mddev);
1423 }
1424 }
1425 allow_barrier(conf); 1088 allow_barrier(conf);
1426 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1089 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1427 wait_barrier(conf); 1090 wait_barrier(conf);
@@ -1448,71 +1111,25 @@ retry_write:
1448 for (i = 0; i < conf->copies; i++) { 1111 for (i = 0; i < conf->copies; i++) {
1449 struct bio *mbio; 1112 struct bio *mbio;
1450 int d = r10_bio->devs[i].devnum; 1113 int d = r10_bio->devs[i].devnum;
1451 if (r10_bio->devs[i].bio) { 1114 if (!r10_bio->devs[i].bio)
1452 struct md_rdev *rdev = conf->mirrors[d].rdev; 1115 continue;
1453 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1454 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1455 max_sectors);
1456 r10_bio->devs[i].bio = mbio;
1457
1458 mbio->bi_sector = (r10_bio->devs[i].addr+
1459 choose_data_offset(r10_bio,
1460 rdev));
1461 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1464 mbio->bi_private = r10_bio;
1465
1466 atomic_inc(&r10_bio->remaining);
1467 1116
1468 cb = blk_check_plugged(raid10_unplug, mddev, 1117 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1469 sizeof(*plug)); 1118 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1470 if (cb) 1119 max_sectors);
1471 plug = container_of(cb, struct raid10_plug_cb, 1120 r10_bio->devs[i].bio = mbio;
1472 cb);
1473 else
1474 plug = NULL;
1475 spin_lock_irqsave(&conf->device_lock, flags);
1476 if (plug) {
1477 bio_list_add(&plug->pending, mbio);
1478 plug->pending_cnt++;
1479 } else {
1480 bio_list_add(&conf->pending_bio_list, mbio);
1481 conf->pending_count++;
1482 }
1483 spin_unlock_irqrestore(&conf->device_lock, flags);
1484 if (!plug)
1485 md_wakeup_thread(mddev->thread);
1486 }
1487 1121
1488 if (r10_bio->devs[i].repl_bio) { 1122 mbio->bi_sector = (r10_bio->devs[i].addr+
1489 struct md_rdev *rdev = conf->mirrors[d].replacement; 1123 conf->mirrors[d].rdev->data_offset);
1490 if (rdev == NULL) { 1124 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1491 /* Replacement just got moved to main 'rdev' */ 1125 mbio->bi_end_io = raid10_end_write_request;
1492 smp_mb(); 1126 mbio->bi_rw = WRITE | do_sync | do_fua;
1493 rdev = conf->mirrors[d].rdev; 1127 mbio->bi_private = r10_bio;
1494 }
1495 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1496 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1497 max_sectors);
1498 r10_bio->devs[i].repl_bio = mbio;
1499
1500 mbio->bi_sector = (r10_bio->devs[i].addr +
1501 choose_data_offset(
1502 r10_bio, rdev));
1503 mbio->bi_bdev = rdev->bdev;
1504 mbio->bi_end_io = raid10_end_write_request;
1505 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1506 mbio->bi_private = r10_bio;
1507 1128
1508 atomic_inc(&r10_bio->remaining); 1129 atomic_inc(&r10_bio->remaining);
1509 spin_lock_irqsave(&conf->device_lock, flags); 1130 spin_lock_irqsave(&conf->device_lock, flags);
1510 bio_list_add(&conf->pending_bio_list, mbio); 1131 bio_list_add(&conf->pending_bio_list, mbio);
1511 conf->pending_count++; 1132 spin_unlock_irqrestore(&conf->device_lock, flags);
1512 spin_unlock_irqrestore(&conf->device_lock, flags);
1513 if (!mddev_check_plugged(mddev))
1514 md_wakeup_thread(mddev->thread);
1515 }
1516 } 1133 }
1517 1134
1518 /* Don't remove the bias on 'remaining' (one_write_done) until 1135 /* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1538,26 +1155,30 @@ retry_write:
1538 1155
1539 /* In case raid10d snuck in to freeze_array */ 1156 /* In case raid10d snuck in to freeze_array */
1540 wake_up(&conf->wait_barrier); 1157 wake_up(&conf->wait_barrier);
1158
1159 if (do_sync || !mddev->bitmap || !plugged)
1160 md_wakeup_thread(mddev->thread);
1161 return 0;
1541} 1162}
1542 1163
1543static void status(struct seq_file *seq, struct mddev *mddev) 1164static void status(struct seq_file *seq, mddev_t *mddev)
1544{ 1165{
1545 struct r10conf *conf = mddev->private; 1166 conf_t *conf = mddev->private;
1546 int i; 1167 int i;
1547 1168
1548 if (conf->geo.near_copies < conf->geo.raid_disks) 1169 if (conf->near_copies < conf->raid_disks)
1549 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1170 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1550 if (conf->geo.near_copies > 1) 1171 if (conf->near_copies > 1)
1551 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1172 seq_printf(seq, " %d near-copies", conf->near_copies);
1552 if (conf->geo.far_copies > 1) { 1173 if (conf->far_copies > 1) {
1553 if (conf->geo.far_offset) 1174 if (conf->far_offset)
1554 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1175 seq_printf(seq, " %d offset-copies", conf->far_copies);
1555 else 1176 else
1556 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1177 seq_printf(seq, " %d far-copies", conf->far_copies);
1557 } 1178 }
1558 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1179 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1559 conf->geo.raid_disks - mddev->degraded); 1180 conf->raid_disks - mddev->degraded);
1560 for (i = 0; i < conf->geo.raid_disks; i++) 1181 for (i = 0; i < conf->raid_disks; i++)
1561 seq_printf(seq, "%s", 1182 seq_printf(seq, "%s",
1562 conf->mirrors[i].rdev && 1183 conf->mirrors[i].rdev &&
1563 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1184 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
@@ -1569,37 +1190,29 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1569 * Don't consider the device numbered 'ignore' 1190 * Don't consider the device numbered 'ignore'
1570 * as we might be about to remove it. 1191 * as we might be about to remove it.
1571 */ 1192 */
1572static int _enough(struct r10conf *conf, struct geom *geo, int ignore) 1193static int enough(conf_t *conf, int ignore)
1573{ 1194{
1574 int first = 0; 1195 int first = 0;
1575 1196
1576 do { 1197 do {
1577 int n = conf->copies; 1198 int n = conf->copies;
1578 int cnt = 0; 1199 int cnt = 0;
1579 int this = first;
1580 while (n--) { 1200 while (n--) {
1581 if (conf->mirrors[this].rdev && 1201 if (conf->mirrors[first].rdev &&
1582 this != ignore) 1202 first != ignore)
1583 cnt++; 1203 cnt++;
1584 this = (this+1) % geo->raid_disks; 1204 first = (first+1) % conf->raid_disks;
1585 } 1205 }
1586 if (cnt == 0) 1206 if (cnt == 0)
1587 return 0; 1207 return 0;
1588 first = (first + geo->near_copies) % geo->raid_disks;
1589 } while (first != 0); 1208 } while (first != 0);
1590 return 1; 1209 return 1;
1591} 1210}
1592 1211
1593static int enough(struct r10conf *conf, int ignore) 1212static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1594{
1595 return _enough(conf, &conf->geo, ignore) &&
1596 _enough(conf, &conf->prev, ignore);
1597}
1598
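
enough()/_enough() above walk the mirrors in groups determined by the layout and succeed only if every group of copies still has at least one working device, optionally pretending that one device ('ignore') is already gone. A small userspace sketch of that check for a plain near layout follows; layout_has_enough() and the working[] array are invented for the example.

/* Sketch of the survivability check: with a "near" layout each block and
 * its copies live on `copies` consecutive devices, so the array only
 * survives if every such group keeps at least one working member. */
#include <stdio.h>

static int layout_has_enough(const int *working, int raid_disks,
			     int near_copies, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		while (n--) {
			if (working[this] && this != ignore)
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;           /* a whole copy-set is gone */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);
	return 1;
}

int main(void)
{
	/* 4 disks, 2 near copies: disks {0,1} mirror each other, as do {2,3}. */
	int working[4] = { 1, 0, 1, 1 };    /* disk 1 has failed */

	printf("degraded array ok:              %d\n",
	       layout_has_enough(working, 4, 2, 2, -1));
	printf("ok after also removing disk 0:  %d\n",
	       layout_has_enough(working, 4, 2, 2, 0));
	return 0;
}
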
1599static void error(struct mddev *mddev, struct md_rdev *rdev)
1600{ 1213{
1601 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
1602 struct r10conf *conf = mddev->private; 1215 conf_t *conf = mddev->private;
1603 1216
1604 /* 1217 /*
1605 * If it is not operational, then we have already marked it as dead 1218 * If it is not operational, then we have already marked it as dead
@@ -1630,23 +1243,23 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1630 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1243 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1631 "md/raid10:%s: Operation continuing on %d devices.\n", 1244 "md/raid10:%s: Operation continuing on %d devices.\n",
1632 mdname(mddev), bdevname(rdev->bdev, b), 1245 mdname(mddev), bdevname(rdev->bdev, b),
1633 mdname(mddev), conf->geo.raid_disks - mddev->degraded); 1246 mdname(mddev), conf->raid_disks - mddev->degraded);
1634} 1247}
1635 1248
1636static void print_conf(struct r10conf *conf) 1249static void print_conf(conf_t *conf)
1637{ 1250{
1638 int i; 1251 int i;
1639 struct raid10_info *tmp; 1252 mirror_info_t *tmp;
1640 1253
1641 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1254 printk(KERN_DEBUG "RAID10 conf printout:\n");
1642 if (!conf) { 1255 if (!conf) {
1643 printk(KERN_DEBUG "(!conf)\n"); 1256 printk(KERN_DEBUG "(!conf)\n");
1644 return; 1257 return;
1645 } 1258 }
1646 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1259 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1647 conf->geo.raid_disks); 1260 conf->raid_disks);
1648 1261
1649 for (i = 0; i < conf->geo.raid_disks; i++) { 1262 for (i = 0; i < conf->raid_disks; i++) {
1650 char b[BDEVNAME_SIZE]; 1263 char b[BDEVNAME_SIZE];
1651 tmp = conf->mirrors + i; 1264 tmp = conf->mirrors + i;
1652 if (tmp->rdev) 1265 if (tmp->rdev)
@@ -1657,7 +1270,7 @@ static void print_conf(struct r10conf *conf)
1657 } 1270 }
1658} 1271}
1659 1272
1660static void close_sync(struct r10conf *conf) 1273static void close_sync(conf_t *conf)
1661{ 1274{
1662 wait_barrier(conf); 1275 wait_barrier(conf);
1663 allow_barrier(conf); 1276 allow_barrier(conf);
@@ -1666,11 +1279,11 @@ static void close_sync(struct r10conf *conf)
1666 conf->r10buf_pool = NULL; 1279 conf->r10buf_pool = NULL;
1667} 1280}
1668 1281
1669static int raid10_spare_active(struct mddev *mddev) 1282static int raid10_spare_active(mddev_t *mddev)
1670{ 1283{
1671 int i; 1284 int i;
1672 struct r10conf *conf = mddev->private; 1285 conf_t *conf = mddev->private;
1673 struct raid10_info *tmp; 1286 mirror_info_t *tmp;
1674 int count = 0; 1287 int count = 0;
1675 unsigned long flags; 1288 unsigned long flags;
1676 1289
@@ -1678,31 +1291,13 @@ static int raid10_spare_active(struct mddev *mddev)
1678 * Find all non-in_sync disks within the RAID10 configuration 1291 * Find all non-in_sync disks within the RAID10 configuration
1679 * and mark them in_sync 1292 * and mark them in_sync
1680 */ 1293 */
1681 for (i = 0; i < conf->geo.raid_disks; i++) { 1294 for (i = 0; i < conf->raid_disks; i++) {
1682 tmp = conf->mirrors + i; 1295 tmp = conf->mirrors + i;
1683 if (tmp->replacement 1296 if (tmp->rdev
1684 && tmp->replacement->recovery_offset == MaxSector 1297 && !test_bit(Faulty, &tmp->rdev->flags)
1685 && !test_bit(Faulty, &tmp->replacement->flags) 1298 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1686 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1687 /* Replacement has just become active */
1688 if (!tmp->rdev
1689 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1690 count++;
1691 if (tmp->rdev) {
1692 /* Replaced device not technically faulty,
1693 * but we need to be sure it gets removed
1694 * and never re-added.
1695 */
1696 set_bit(Faulty, &tmp->rdev->flags);
1697 sysfs_notify_dirent_safe(
1698 tmp->rdev->sysfs_state);
1699 }
1700 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1701 } else if (tmp->rdev
1702 && !test_bit(Faulty, &tmp->rdev->flags)
1703 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1704 count++; 1299 count++;
1705 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 1300 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1706 } 1301 }
1707 } 1302 }
1708 spin_lock_irqsave(&conf->device_lock, flags); 1303 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1714,60 +1309,52 @@ static int raid10_spare_active(struct mddev *mddev)
1714} 1309}
1715 1310
1716 1311
1717static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1312static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1718{ 1313{
1719 struct r10conf *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1720 int err = -EEXIST; 1315 int err = -EEXIST;
1721 int mirror; 1316 int mirror;
1722 int first = 0; 1317 int first = 0;
1723 int last = conf->geo.raid_disks - 1; 1318 int last = conf->raid_disks - 1;
1724 struct request_queue *q = bdev_get_queue(rdev->bdev);
1725 1319
1726 if (mddev->recovery_cp < MaxSector) 1320 if (mddev->recovery_cp < MaxSector)
1727 /* only hot-add to in-sync arrays, as recovery is 1321 /* only hot-add to in-sync arrays, as recovery is
1728 * very different from resync 1322 * very different from resync
1729 */ 1323 */
1730 return -EBUSY; 1324 return -EBUSY;
1731 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) 1325 if (!enough(conf, -1))
1732 return -EINVAL; 1326 return -EINVAL;
1733 1327
1734 if (rdev->raid_disk >= 0) 1328 if (rdev->raid_disk >= 0)
1735 first = last = rdev->raid_disk; 1329 first = last = rdev->raid_disk;
1736 1330
1737 if (q->merge_bvec_fn) {
1738 set_bit(Unmerged, &rdev->flags);
1739 mddev->merge_check_needed = 1;
1740 }
1741
1742 if (rdev->saved_raid_disk >= first && 1331 if (rdev->saved_raid_disk >= first &&
1743 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1332 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1744 mirror = rdev->saved_raid_disk; 1333 mirror = rdev->saved_raid_disk;
1745 else 1334 else
1746 mirror = first; 1335 mirror = first;
1747 for ( ; mirror <= last ; mirror++) { 1336 for ( ; mirror <= last ; mirror++) {
1748 struct raid10_info *p = &conf->mirrors[mirror]; 1337 mirror_info_t *p = &conf->mirrors[mirror];
1749 if (p->recovery_disabled == mddev->recovery_disabled) 1338 if (p->recovery_disabled == mddev->recovery_disabled)
1750 continue; 1339 continue;
1751 if (p->rdev) { 1340 if (p->rdev)
1752 if (!test_bit(WantReplacement, &p->rdev->flags) || 1341 continue;
1753 p->replacement != NULL)
1754 continue;
1755 clear_bit(In_sync, &rdev->flags);
1756 set_bit(Replacement, &rdev->flags);
1757 rdev->raid_disk = mirror;
1758 err = 0;
1759 disk_stack_limits(mddev->gendisk, rdev->bdev,
1760 rdev->data_offset << 9);
1761 conf->fullsync = 1;
1762 rcu_assign_pointer(p->replacement, rdev);
1763 break;
1764 }
1765 1342
1766 disk_stack_limits(mddev->gendisk, rdev->bdev, 1343 disk_stack_limits(mddev->gendisk, rdev->bdev,
1767 rdev->data_offset << 9); 1344 rdev->data_offset << 9);
1345 /* as we don't honour merge_bvec_fn, we must
1346 * never risk violating it, so limit
1347 * ->max_segments to one lying with a single
1348 * page, as a one page request is never in
1349 * violation.
1350 */
1351 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1352 blk_queue_max_segments(mddev->queue, 1);
1353 blk_queue_segment_boundary(mddev->queue,
1354 PAGE_CACHE_SIZE - 1);
1355 }
1768 1356
1769 p->head_position = 0; 1357 p->head_position = 0;
1770 p->recovery_disabled = mddev->recovery_disabled - 1;
1771 rdev->raid_disk = mirror; 1358 rdev->raid_disk = mirror;
1772 err = 0; 1359 err = 0;
1773 if (rdev->saved_raid_disk != mirror) 1360 if (rdev->saved_raid_disk != mirror)
@@ -1775,83 +1362,46 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1775 rcu_assign_pointer(p->rdev, rdev); 1362 rcu_assign_pointer(p->rdev, rdev);
1776 break; 1363 break;
1777 } 1364 }
1778 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1779 /* Some requests might not have seen this new
1780 * merge_bvec_fn. We must wait for them to complete
1781 * before merging the device fully.
1782 * First we make sure any code which has tested
1783 * our function has submitted the request, then
1784 * we wait for all outstanding requests to complete.
1785 */
1786 synchronize_sched();
1787 raise_barrier(conf, 0);
1788 lower_barrier(conf);
1789 clear_bit(Unmerged, &rdev->flags);
1790 }
1791 md_integrity_add_rdev(rdev, mddev);
1792 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1793 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1794 1365
1366 md_integrity_add_rdev(rdev, mddev);
1795 print_conf(conf); 1367 print_conf(conf);
1796 return err; 1368 return err;
1797} 1369}
1798 1370
1799static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1371static int raid10_remove_disk(mddev_t *mddev, int number)
1800{ 1372{
1801 struct r10conf *conf = mddev->private; 1373 conf_t *conf = mddev->private;
1802 int err = 0; 1374 int err = 0;
1803 int number = rdev->raid_disk; 1375 mdk_rdev_t *rdev;
1804 struct md_rdev **rdevp; 1376 mirror_info_t *p = conf->mirrors+ number;
1805 struct raid10_info *p = conf->mirrors + number;
1806 1377
1807 print_conf(conf); 1378 print_conf(conf);
1808 if (rdev == p->rdev) 1379 rdev = p->rdev;
1809 rdevp = &p->rdev; 1380 if (rdev) {
1810 else if (rdev == p->replacement) 1381 if (test_bit(In_sync, &rdev->flags) ||
1811 rdevp = &p->replacement; 1382 atomic_read(&rdev->nr_pending)) {
1812 else 1383 err = -EBUSY;
1813 return 0; 1384 goto abort;
1814 1385 }
1815 if (test_bit(In_sync, &rdev->flags) || 1386 /* Only remove faulty devices in recovery
1816 atomic_read(&rdev->nr_pending)) { 1387 * is not possible.
1817 err = -EBUSY;
1818 goto abort;
1819 }
1820 /* Only remove faulty devices if recovery
1821 * is not possible.
1822 */
1823 if (!test_bit(Faulty, &rdev->flags) &&
1824 mddev->recovery_disabled != p->recovery_disabled &&
1825 (!p->replacement || p->replacement == rdev) &&
1826 number < conf->geo.raid_disks &&
1827 enough(conf, -1)) {
1828 err = -EBUSY;
1829 goto abort;
1830 }
1831 *rdevp = NULL;
1832 synchronize_rcu();
1833 if (atomic_read(&rdev->nr_pending)) {
1834 /* lost the race, try later */
1835 err = -EBUSY;
1836 *rdevp = rdev;
1837 goto abort;
1838 } else if (p->replacement) {
1839 /* We must have just cleared 'rdev' */
1840 p->rdev = p->replacement;
1841 clear_bit(Replacement, &p->replacement->flags);
1842 smp_mb(); /* Make sure other CPUs may see both as identical
1843 * but will never see neither -- if they are careful.
1844 */
1845 p->replacement = NULL;
1846 clear_bit(WantReplacement, &rdev->flags);
1847 } else
1848 /* We might have just remove the Replacement as faulty
1849 * Clear the flag just in case
1850 */ 1388 */
1851 clear_bit(WantReplacement, &rdev->flags); 1389 if (!test_bit(Faulty, &rdev->flags) &&
1852 1390 mddev->recovery_disabled != p->recovery_disabled &&
1853 err = md_integrity_register(mddev); 1391 enough(conf, -1)) {
1854 1392 err = -EBUSY;
1393 goto abort;
1394 }
1395 p->rdev = NULL;
1396 synchronize_rcu();
1397 if (atomic_read(&rdev->nr_pending)) {
1398 /* lost the race, try later */
1399 err = -EBUSY;
1400 p->rdev = rdev;
1401 goto abort;
1402 }
1403 err = md_integrity_register(mddev);
1404 }
1855abort: 1405abort:
1856 1406
1857 print_conf(conf); 1407 print_conf(conf);
@@ -1861,15 +1411,11 @@ abort:
1861 1411
1862static void end_sync_read(struct bio *bio, int error) 1412static void end_sync_read(struct bio *bio, int error)
1863{ 1413{
1864 struct r10bio *r10_bio = bio->bi_private; 1414 r10bio_t *r10_bio = bio->bi_private;
1865 struct r10conf *conf = r10_bio->mddev->private; 1415 conf_t *conf = r10_bio->mddev->private;
1866 int d; 1416 int d;
1867 1417
1868 if (bio == r10_bio->master_bio) { 1418 d = find_bio_disk(conf, r10_bio, bio, NULL);
1869 /* this is a reshape read */
1870 d = r10_bio->read_slot; /* really the read dev */
1871 } else
1872 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1873 1419
1874 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1420 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1875 set_bit(R10BIO_Uptodate, &r10_bio->state); 1421 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1893,9 +1439,9 @@ static void end_sync_read(struct bio *bio, int error)
1893 } 1439 }
1894} 1440}
1895 1441
1896static void end_sync_request(struct r10bio *r10_bio) 1442static void end_sync_request(r10bio_t *r10_bio)
1897{ 1443{
1898 struct mddev *mddev = r10_bio->mddev; 1444 mddev_t *mddev = r10_bio->mddev;
1899 1445
1900 while (atomic_dec_and_test(&r10_bio->remaining)) { 1446 while (atomic_dec_and_test(&r10_bio->remaining)) {
1901 if (r10_bio->master_bio == NULL) { 1447 if (r10_bio->master_bio == NULL) {
@@ -1909,7 +1455,7 @@ static void end_sync_request(struct r10bio *r10_bio)
1909 md_done_sync(mddev, s, 1); 1455 md_done_sync(mddev, s, 1);
1910 break; 1456 break;
1911 } else { 1457 } else {
1912 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 1458 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1913 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1459 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1914 test_bit(R10BIO_WriteError, &r10_bio->state)) 1460 test_bit(R10BIO_WriteError, &r10_bio->state))
1915 reschedule_retry(r10_bio); 1461 reschedule_retry(r10_bio);
@@ -1923,39 +1469,26 @@ static void end_sync_request(struct r10bio *r10_bio)
1923static void end_sync_write(struct bio *bio, int error) 1469static void end_sync_write(struct bio *bio, int error)
1924{ 1470{
1925 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1471 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1926 struct r10bio *r10_bio = bio->bi_private; 1472 r10bio_t *r10_bio = bio->bi_private;
1927 struct mddev *mddev = r10_bio->mddev; 1473 mddev_t *mddev = r10_bio->mddev;
1928 struct r10conf *conf = mddev->private; 1474 conf_t *conf = mddev->private;
1929 int d; 1475 int d;
1930 sector_t first_bad; 1476 sector_t first_bad;
1931 int bad_sectors; 1477 int bad_sectors;
1932 int slot; 1478 int slot;
1933 int repl;
1934 struct md_rdev *rdev = NULL;
1935 1479
1936 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1480 d = find_bio_disk(conf, r10_bio, bio, &slot);
1937 if (repl)
1938 rdev = conf->mirrors[d].replacement;
1939 else
1940 rdev = conf->mirrors[d].rdev;
1941 1481
1942 if (!uptodate) { 1482 if (!uptodate) {
1943 if (repl) 1483 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1944 md_error(mddev, rdev); 1484 set_bit(R10BIO_WriteError, &r10_bio->state);
1945 else { 1485 } else if (is_badblock(conf->mirrors[d].rdev,
1946 set_bit(WriteErrorSeen, &rdev->flags);
1947 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1948 set_bit(MD_RECOVERY_NEEDED,
1949 &rdev->mddev->recovery);
1950 set_bit(R10BIO_WriteError, &r10_bio->state);
1951 }
1952 } else if (is_badblock(rdev,
1953 r10_bio->devs[slot].addr, 1486 r10_bio->devs[slot].addr,
1954 r10_bio->sectors, 1487 r10_bio->sectors,
1955 &first_bad, &bad_sectors)) 1488 &first_bad, &bad_sectors))
1956 set_bit(R10BIO_MadeGood, &r10_bio->state); 1489 set_bit(R10BIO_MadeGood, &r10_bio->state);
1957 1490
1958 rdev_dec_pending(rdev, mddev); 1491 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1959 1492
1960 end_sync_request(r10_bio); 1493 end_sync_request(r10_bio);
1961} 1494}
@@ -1976,12 +1509,11 @@ static void end_sync_write(struct bio *bio, int error)
1976 * We check if all blocks are in-sync and only write to blocks that 1509 * We check if all blocks are in-sync and only write to blocks that
1977 * aren't in sync 1510 * aren't in sync
1978 */ 1511 */
1979static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 1512static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1980{ 1513{
1981 struct r10conf *conf = mddev->private; 1514 conf_t *conf = mddev->private;
1982 int i, first; 1515 int i, first;
1983 struct bio *tbio, *fbio; 1516 struct bio *tbio, *fbio;
1984 int vcnt;
1985 1517
1986 atomic_set(&r10_bio->remaining, 1); 1518 atomic_set(&r10_bio->remaining, 1);
1987 1519
@@ -1996,10 +1528,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1996 first = i; 1528 first = i;
1997 fbio = r10_bio->devs[i].bio; 1529 fbio = r10_bio->devs[i].bio;
1998 1530
1999 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2000 /* now find blocks with errors */ 1531 /* now find blocks with errors */
2001 for (i=0 ; i < conf->copies ; i++) { 1532 for (i=0 ; i < conf->copies ; i++) {
2002 int j, d; 1533 int j, d;
1534 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
2003 1535
2004 tbio = r10_bio->devs[i].bio; 1536 tbio = r10_bio->devs[i].bio;
2005 1537
@@ -2015,11 +1547,11 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2015 for (j = 0; j < vcnt; j++) 1547 for (j = 0; j < vcnt; j++)
2016 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 1548 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2017 page_address(tbio->bi_io_vec[j].bv_page), 1549 page_address(tbio->bi_io_vec[j].bv_page),
2018 fbio->bi_io_vec[j].bv_len)) 1550 PAGE_SIZE))
2019 break; 1551 break;
2020 if (j == vcnt) 1552 if (j == vcnt)
2021 continue; 1553 continue;
2022 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 1554 mddev->resync_mismatches += r10_bio->sectors;
2023 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1555 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2024 /* Don't fix anything. */ 1556 /* Don't fix anything. */
2025 continue; 1557 continue;
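
The vcnt change in the hunks above rounds the resync window up to whole pages instead of truncating, so a window that is not page-aligned still gets its final, partially filled page compared (and the memcmp is bounded by bv_len rather than a full PAGE_SIZE). A small standalone illustration of the two roundings, assuming 512-byte sectors and 4 KiB pages (macro and function names here are ad hoc):

#include <stdio.h>

#define SECTOR_SHIFT    9
#define DEMO_PAGE_SHIFT 12                                    /* 4096-byte pages */
#define PAGE_SECTORS    (1 << (DEMO_PAGE_SHIFT - SECTOR_SHIFT))  /* 8 */

static int pages_truncated(int sectors)
{
	return sectors >> (DEMO_PAGE_SHIFT - SECTOR_SHIFT);
}

static int pages_rounded_up(int sectors)
{
	return (sectors + PAGE_SECTORS - 1) >> (DEMO_PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
	int sectors = 20;    /* 10K of data: 2.5 pages */

	printf("%d sectors -> %d page(s) truncated, %d page(s) rounded up\n",
	       sectors, pages_truncated(sectors), pages_rounded_up(sectors));
	/* prints: 20 sectors -> 2 page(s) truncated, 3 page(s) rounded up */
	return 0;
}
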
@@ -2060,28 +1592,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2060 generic_make_request(tbio); 1592 generic_make_request(tbio);
2061 } 1593 }
2062 1594
2063 /* Now write out to any replacement devices
2064 * that are active
2065 */
2066 for (i = 0; i < conf->copies; i++) {
2067 int j, d;
2068
2069 tbio = r10_bio->devs[i].repl_bio;
2070 if (!tbio || !tbio->bi_end_io)
2071 continue;
2072 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2073 && r10_bio->devs[i].bio != fbio)
2074 for (j = 0; j < vcnt; j++)
2075 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2076 page_address(fbio->bi_io_vec[j].bv_page),
2077 PAGE_SIZE);
2078 d = r10_bio->devs[i].devnum;
2079 atomic_inc(&r10_bio->remaining);
2080 md_sync_acct(conf->mirrors[d].replacement->bdev,
2081 tbio->bi_size >> 9);
2082 generic_make_request(tbio);
2083 }
2084
2085done: 1595done:
2086 if (atomic_dec_and_test(&r10_bio->remaining)) { 1596 if (atomic_dec_and_test(&r10_bio->remaining)) {
2087 md_done_sync(mddev, r10_bio->sectors, 1); 1597 md_done_sync(mddev, r10_bio->sectors, 1);
@@ -2099,7 +1609,7 @@ done:
2099 * The second for writing. 1609 * The second for writing.
2100 * 1610 *
2101 */ 1611 */
2102static void fix_recovery_read_error(struct r10bio *r10_bio) 1612static void fix_recovery_read_error(r10bio_t *r10_bio)
2103{ 1613{
2104 /* We got a read error during recovery. 1614 /* We got a read error during recovery.
2105 * We repeat the read in smaller page-sized sections. 1615 * We repeat the read in smaller page-sized sections.
@@ -2108,8 +1618,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2108 * If a read fails, record a bad block on both old and 1618 * If a read fails, record a bad block on both old and
2109 * new devices. 1619 * new devices.
2110 */ 1620 */
2111 struct mddev *mddev = r10_bio->mddev; 1621 mddev_t *mddev = r10_bio->mddev;
2112 struct r10conf *conf = mddev->private; 1622 conf_t *conf = mddev->private;
2113 struct bio *bio = r10_bio->devs[0].bio; 1623 struct bio *bio = r10_bio->devs[0].bio;
2114 sector_t sect = 0; 1624 sector_t sect = 0;
2115 int sectors = r10_bio->sectors; 1625 int sectors = r10_bio->sectors;
@@ -2119,7 +1629,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2119 1629
2120 while (sectors) { 1630 while (sectors) {
2121 int s = sectors; 1631 int s = sectors;
2122 struct md_rdev *rdev; 1632 mdk_rdev_t *rdev;
2123 sector_t addr; 1633 sector_t addr;
2124 int ok; 1634 int ok;
2125 1635
@@ -2141,13 +1651,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2141 s << 9, 1651 s << 9,
2142 bio->bi_io_vec[idx].bv_page, 1652 bio->bi_io_vec[idx].bv_page,
2143 WRITE, false); 1653 WRITE, false);
2144 if (!ok) { 1654 if (!ok)
2145 set_bit(WriteErrorSeen, &rdev->flags); 1655 set_bit(WriteErrorSeen, &rdev->flags);
2146 if (!test_and_set_bit(WantReplacement,
2147 &rdev->flags))
2148 set_bit(MD_RECOVERY_NEEDED,
2149 &rdev->mddev->recovery);
2150 }
2151 } 1656 }
2152 if (!ok) { 1657 if (!ok) {
2153 /* We don't worry if we cannot set a bad block - 1658 /* We don't worry if we cannot set a bad block -
@@ -2158,7 +1663,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2158 1663
2159 if (rdev != conf->mirrors[dw].rdev) { 1664 if (rdev != conf->mirrors[dw].rdev) {
2160 /* need bad block on destination too */ 1665 /* need bad block on destination too */
2161 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 1666 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
2162 addr = r10_bio->devs[1].addr + sect; 1667 addr = r10_bio->devs[1].addr + sect;
2163 ok = rdev_set_badblocks(rdev2, addr, s, 0); 1668 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2164 if (!ok) { 1669 if (!ok) {
@@ -2183,11 +1688,11 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2183 } 1688 }
2184} 1689}
2185 1690
2186static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 1691static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
2187{ 1692{
2188 struct r10conf *conf = mddev->private; 1693 conf_t *conf = mddev->private;
2189 int d; 1694 int d;
2190 struct bio *wbio, *wbio2; 1695 struct bio *wbio;
2191 1696
2192 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 1697 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2193 fix_recovery_read_error(r10_bio); 1698 fix_recovery_read_error(r10_bio);
@@ -2199,20 +1704,12 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2199 * share the pages with the first bio 1704 * share the pages with the first bio
2200 * and submit the write request 1705 * and submit the write request
2201 */ 1706 */
2202 d = r10_bio->devs[1].devnum;
2203 wbio = r10_bio->devs[1].bio; 1707 wbio = r10_bio->devs[1].bio;
2204 wbio2 = r10_bio->devs[1].repl_bio; 1708 d = r10_bio->devs[1].devnum;
2205 if (wbio->bi_end_io) { 1709
2206 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1710 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2207 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1711 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2208 generic_make_request(wbio); 1712 generic_make_request(wbio);
2209 }
2210 if (wbio2 && wbio2->bi_end_io) {
2211 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2212 md_sync_acct(conf->mirrors[d].replacement->bdev,
2213 wbio2->bi_size >> 9);
2214 generic_make_request(wbio2);
2215 }
2216} 1713}
2217 1714
2218 1715
@@ -2222,7 +1719,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2222 * since the last recorded read error. 1719 * since the last recorded read error.
2223 * 1720 *
2224 */ 1721 */
2225static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 1722static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
2226{ 1723{
2227 struct timespec cur_time_mon; 1724 struct timespec cur_time_mon;
2228 unsigned long hours_since_last; 1725 unsigned long hours_since_last;
@@ -2253,7 +1750,7 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2253 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1750 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2254} 1751}
2255 1752
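
check_decay_read_errors() above ages the per-device read error count by shifting it right once for each hour since the last error, i.e. the count halves every hour. A tiny standalone demo of that decay (decay_read_errors() is an ad hoc name):

#include <stdio.h>

static unsigned int decay_read_errors(unsigned int errors,
				      unsigned int hours_since_last)
{
	if (hours_since_last >= 31)   /* shifting by >= the width is undefined */
		return 0;
	return errors >> hours_since_last;
}

int main(void)
{
	printf("%u\n", decay_read_errors(40, 0));   /* 40 */
	printf("%u\n", decay_read_errors(40, 1));   /* 20 */
	printf("%u\n", decay_read_errors(40, 3));   /* 5  */
	printf("%u\n", decay_read_errors(40, 40));  /* 0  */
	return 0;
}
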
2256static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 1753static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
2257 int sectors, struct page *page, int rw) 1754 int sectors, struct page *page, int rw)
2258{ 1755{
2259 sector_t first_bad; 1756 sector_t first_bad;
@@ -2265,12 +1762,8 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2265 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 1762 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2266 /* success */ 1763 /* success */
2267 return 1; 1764 return 1;
2268 if (rw == WRITE) { 1765 if (rw == WRITE)
2269 set_bit(WriteErrorSeen, &rdev->flags); 1766 set_bit(WriteErrorSeen, &rdev->flags);
2270 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2271 set_bit(MD_RECOVERY_NEEDED,
2272 &rdev->mddev->recovery);
2273 }
2274 /* need to record an error - either for the block or the device */ 1767 /* need to record an error - either for the block or the device */
2275 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 1768 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2276 md_error(rdev->mddev, rdev); 1769 md_error(rdev->mddev, rdev);
@@ -2285,11 +1778,11 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2285 * 3. Performs writes following reads for array synchronising. 1778 * 3. Performs writes following reads for array synchronising.
2286 */ 1779 */
2287 1780
2288static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 1781static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
2289{ 1782{
2290 int sect = 0; /* Offset from r10_bio->sector */ 1783 int sect = 0; /* Offset from r10_bio->sector */
2291 int sectors = r10_bio->sectors; 1784 int sectors = r10_bio->sectors;
2292 struct md_rdev*rdev; 1785 mdk_rdev_t*rdev;
2293 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 1786 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2294 int d = r10_bio->devs[r10_bio->read_slot].devnum; 1787 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2295 1788
@@ -2318,7 +1811,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2318 "md/raid10:%s: %s: Failing raid device\n", 1811 "md/raid10:%s: %s: Failing raid device\n",
2319 mdname(mddev), b); 1812 mdname(mddev), b);
2320 md_error(mddev, conf->mirrors[d].rdev); 1813 md_error(mddev, conf->mirrors[d].rdev);
2321 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2322 return; 1814 return;
2323 } 1815 }
2324 1816
@@ -2339,7 +1831,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2339 d = r10_bio->devs[sl].devnum; 1831 d = r10_bio->devs[sl].devnum;
2340 rdev = rcu_dereference(conf->mirrors[d].rdev); 1832 rdev = rcu_dereference(conf->mirrors[d].rdev);
2341 if (rdev && 1833 if (rdev &&
2342 !test_bit(Unmerged, &rdev->flags) &&
2343 test_bit(In_sync, &rdev->flags) && 1834 test_bit(In_sync, &rdev->flags) &&
2344 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 1835 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2345 &first_bad, &bad_sectors) == 0) { 1836 &first_bad, &bad_sectors) == 0) {
@@ -2373,11 +1864,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2373 rdev, 1864 rdev,
2374 r10_bio->devs[r10_bio->read_slot].addr 1865 r10_bio->devs[r10_bio->read_slot].addr
2375 + sect, 1866 + sect,
2376 s, 0)) { 1867 s, 0))
2377 md_error(mddev, rdev); 1868 md_error(mddev, rdev);
2378 r10_bio->devs[r10_bio->read_slot].bio
2379 = IO_BLOCKED;
2380 }
2381 break; 1869 break;
2382 } 1870 }
2383 1871
@@ -2393,7 +1881,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2393 d = r10_bio->devs[sl].devnum; 1881 d = r10_bio->devs[sl].devnum;
2394 rdev = rcu_dereference(conf->mirrors[d].rdev); 1882 rdev = rcu_dereference(conf->mirrors[d].rdev);
2395 if (!rdev || 1883 if (!rdev ||
2396 test_bit(Unmerged, &rdev->flags) ||
2397 !test_bit(In_sync, &rdev->flags)) 1884 !test_bit(In_sync, &rdev->flags))
2398 continue; 1885 continue;
2399 1886
@@ -2402,7 +1889,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2402 if (r10_sync_page_io(rdev, 1889 if (r10_sync_page_io(rdev,
2403 r10_bio->devs[sl].addr + 1890 r10_bio->devs[sl].addr +
2404 sect, 1891 sect,
2405 s, conf->tmppage, WRITE) 1892 s<<9, conf->tmppage, WRITE)
2406 == 0) { 1893 == 0) {
2407 /* Well, this device is dead */ 1894 /* Well, this device is dead */
2408 printk(KERN_NOTICE 1895 printk(KERN_NOTICE
@@ -2411,9 +1898,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2411 " (%d sectors at %llu on %s)\n", 1898 " (%d sectors at %llu on %s)\n",
2412 mdname(mddev), s, 1899 mdname(mddev), s,
2413 (unsigned long long)( 1900 (unsigned long long)(
2414 sect + 1901 sect + rdev->data_offset),
2415 choose_data_offset(r10_bio,
2416 rdev)),
2417 bdevname(rdev->bdev, b)); 1902 bdevname(rdev->bdev, b));
2418 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1903 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2419 "drive\n", 1904 "drive\n",
@@ -2441,7 +1926,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2441 switch (r10_sync_page_io(rdev, 1926 switch (r10_sync_page_io(rdev,
2442 r10_bio->devs[sl].addr + 1927 r10_bio->devs[sl].addr +
2443 sect, 1928 sect,
2444 s, conf->tmppage, 1929 s<<9, conf->tmppage,
2445 READ)) { 1930 READ)) {
2446 case 0: 1931 case 0:
2447 /* Well, this device is dead */ 1932 /* Well, this device is dead */
@@ -2451,8 +1936,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2451 " (%d sectors at %llu on %s)\n", 1936 " (%d sectors at %llu on %s)\n",
2452 mdname(mddev), s, 1937 mdname(mddev), s,
2453 (unsigned long long)( 1938 (unsigned long long)(
2454 sect + 1939 sect + rdev->data_offset),
2455 choose_data_offset(r10_bio, rdev)),
2456 bdevname(rdev->bdev, b)); 1940 bdevname(rdev->bdev, b));
2457 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1941 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2458 "drive\n", 1942 "drive\n",
@@ -2465,8 +1949,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2465 " (%d sectors at %llu on %s)\n", 1949 " (%d sectors at %llu on %s)\n",
2466 mdname(mddev), s, 1950 mdname(mddev), s,
2467 (unsigned long long)( 1951 (unsigned long long)(
2468 sect + 1952 sect + rdev->data_offset),
2469 choose_data_offset(r10_bio, rdev)),
2470 bdevname(rdev->bdev, b)); 1953 bdevname(rdev->bdev, b));
2471 atomic_add(s, &rdev->corrected_errors); 1954 atomic_add(s, &rdev->corrected_errors);
2472 } 1955 }
@@ -2500,12 +1983,12 @@ static int submit_bio_wait(int rw, struct bio *bio)
2500 return test_bit(BIO_UPTODATE, &bio->bi_flags); 1983 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2501} 1984}
2502 1985
2503static int narrow_write_error(struct r10bio *r10_bio, int i) 1986static int narrow_write_error(r10bio_t *r10_bio, int i)
2504{ 1987{
2505 struct bio *bio = r10_bio->master_bio; 1988 struct bio *bio = r10_bio->master_bio;
2506 struct mddev *mddev = r10_bio->mddev; 1989 mddev_t *mddev = r10_bio->mddev;
2507 struct r10conf *conf = mddev->private; 1990 conf_t *conf = mddev->private;
2508 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 1991 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2509 /* bio has the data to be written to slot 'i' where 1992 /* bio has the data to be written to slot 'i' where
2510 * we just recently had a write error. 1993 * we just recently had a write error.
2511 * We repeatedly clone the bio and trim down to one block, 1994 * We repeatedly clone the bio and trim down to one block,
@@ -2540,7 +2023,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2540 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2023 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2541 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2024 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2542 wbio->bi_sector = (r10_bio->devs[i].addr+ 2025 wbio->bi_sector = (r10_bio->devs[i].addr+
2543 choose_data_offset(r10_bio, rdev) + 2026 rdev->data_offset+
2544 (sector - r10_bio->sector)); 2027 (sector - r10_bio->sector));
2545 wbio->bi_bdev = rdev->bdev; 2028 wbio->bi_bdev = rdev->bdev;
2546 if (submit_bio_wait(WRITE, wbio) == 0) 2029 if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2557,12 +2040,13 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2557 return ok; 2040 return ok;
2558} 2041}
2559 2042
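
narrow_write_error() above retries a failed write in pieces that never cross a bad-block boundary: each piece runs from the current sector to the next multiple of block_sectors, or to the end of the request, whichever comes first. The sketch below reproduces just that splitting arithmetic for a power-of-two block_sectors; split_by_blocks() is an invented name.

#include <stdio.h>

static void split_by_blocks(unsigned long long sector, int sectors,
			    int block_sectors)
{
	while (sectors) {
		/* distance to the next block boundary */
		int piece = block_sectors - (int)(sector & (block_sectors - 1));

		if (piece > sectors)
			piece = sectors;
		printf("write %d sector(s) at %llu\n", piece, sector);
		sector += piece;
		sectors -= piece;
	}
}

int main(void)
{
	/* 8-sector bad-block granularity, a 20-sector request at sector 5. */
	split_by_blocks(5, 20, 8);
	/* pieces: 3 at 5, 8 at 8, 8 at 16, 1 at 24 */
	return 0;
}
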
2560static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2043static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2561{ 2044{
2562 int slot = r10_bio->read_slot; 2045 int slot = r10_bio->read_slot;
2046 int mirror = r10_bio->devs[slot].devnum;
2563 struct bio *bio; 2047 struct bio *bio;
2564 struct r10conf *conf = mddev->private; 2048 conf_t *conf = mddev->private;
2565 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2049 mdk_rdev_t *rdev;
2566 char b[BDEVNAME_SIZE]; 2050 char b[BDEVNAME_SIZE];
2567 unsigned long do_sync; 2051 unsigned long do_sync;
2568 int max_sectors; 2052 int max_sectors;
@@ -2575,36 +2059,37 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2575 * This is all done synchronously while the array is 2059 * This is all done synchronously while the array is
2576 * frozen. 2060 * frozen.
2577 */ 2061 */
2578 bio = r10_bio->devs[slot].bio;
2579 bdevname(bio->bi_bdev, b);
2580 bio_put(bio);
2581 r10_bio->devs[slot].bio = NULL;
2582
2583 if (mddev->ro == 0) { 2062 if (mddev->ro == 0) {
2584 freeze_array(conf); 2063 freeze_array(conf);
2585 fix_read_error(conf, mddev, r10_bio); 2064 fix_read_error(conf, mddev, r10_bio);
2586 unfreeze_array(conf); 2065 unfreeze_array(conf);
2587 } else 2066 }
2588 r10_bio->devs[slot].bio = IO_BLOCKED; 2067 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2589
2590 rdev_dec_pending(rdev, mddev);
2591 2068
2069 bio = r10_bio->devs[slot].bio;
2070 bdevname(bio->bi_bdev, b);
2071 r10_bio->devs[slot].bio =
2072 mddev->ro ? IO_BLOCKED : NULL;
2592read_more: 2073read_more:
2593 rdev = read_balance(conf, r10_bio, &max_sectors); 2074 mirror = read_balance(conf, r10_bio, &max_sectors);
2594 if (rdev == NULL) { 2075 if (mirror == -1) {
2595 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2076 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2596 " read error for block %llu\n", 2077 " read error for block %llu\n",
2597 mdname(mddev), b, 2078 mdname(mddev), b,
2598 (unsigned long long)r10_bio->sector); 2079 (unsigned long long)r10_bio->sector);
2599 raid_end_bio_io(r10_bio); 2080 raid_end_bio_io(r10_bio);
2081 bio_put(bio);
2600 return; 2082 return;
2601 } 2083 }
2602 2084
2603 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2085 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2086 if (bio)
2087 bio_put(bio);
2604 slot = r10_bio->read_slot; 2088 slot = r10_bio->read_slot;
2089 rdev = conf->mirrors[mirror].rdev;
2605 printk_ratelimited( 2090 printk_ratelimited(
2606 KERN_ERR 2091 KERN_ERR
2607 "md/raid10:%s: %s: redirecting " 2092 "md/raid10:%s: %s: redirecting"
2608 "sector %llu to another mirror\n", 2093 "sector %llu to another mirror\n",
2609 mdname(mddev), 2094 mdname(mddev),
2610 bdevname(rdev->bdev, b), 2095 bdevname(rdev->bdev, b),
@@ -2615,9 +2100,8 @@ read_more:
2615 r10_bio->sector - bio->bi_sector, 2100 r10_bio->sector - bio->bi_sector,
2616 max_sectors); 2101 max_sectors);
2617 r10_bio->devs[slot].bio = bio; 2102 r10_bio->devs[slot].bio = bio;
2618 r10_bio->devs[slot].rdev = rdev;
2619 bio->bi_sector = r10_bio->devs[slot].addr 2103 bio->bi_sector = r10_bio->devs[slot].addr
2620 + choose_data_offset(r10_bio, rdev); 2104 + rdev->data_offset;
2621 bio->bi_bdev = rdev->bdev; 2105 bio->bi_bdev = rdev->bdev;
2622 bio->bi_rw = READ | do_sync; 2106 bio->bi_rw = READ | do_sync;
2623 bio->bi_private = r10_bio; 2107 bio->bi_private = r10_bio;
@@ -2636,6 +2120,7 @@ read_more:
2636 mbio->bi_phys_segments++; 2120 mbio->bi_phys_segments++;
2637 spin_unlock_irq(&conf->device_lock); 2121 spin_unlock_irq(&conf->device_lock);
2638 generic_make_request(bio); 2122 generic_make_request(bio);
2123 bio = NULL;
2639 2124
2640 r10_bio = mempool_alloc(conf->r10bio_pool, 2125 r10_bio = mempool_alloc(conf->r10bio_pool,
2641 GFP_NOIO); 2126 GFP_NOIO);
@@ -2654,7 +2139,7 @@ read_more:
2654 generic_make_request(bio); 2139 generic_make_request(bio);
2655} 2140}
2656 2141
2657static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2142static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2658{ 2143{
2659 /* Some sort of write request has finished and it 2144 /* Some sort of write request has finished and it
2660 * succeeded in writing where we thought there was a 2145 * succeeded in writing where we thought there was a
@@ -2663,7 +2148,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2663 * a bad block. 2148 * a bad block.
2664 */ 2149 */
2665 int m; 2150 int m;
2666 struct md_rdev *rdev; 2151 mdk_rdev_t *rdev;
2667 2152
2668 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2153 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2669 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2154 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
@@ -2677,23 +2162,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2677 rdev_clear_badblocks( 2162 rdev_clear_badblocks(
2678 rdev, 2163 rdev,
2679 r10_bio->devs[m].addr, 2164 r10_bio->devs[m].addr,
2680 r10_bio->sectors, 0); 2165 r10_bio->sectors);
2681 } else {
2682 if (!rdev_set_badblocks(
2683 rdev,
2684 r10_bio->devs[m].addr,
2685 r10_bio->sectors, 0))
2686 md_error(conf->mddev, rdev);
2687 }
2688 rdev = conf->mirrors[dev].replacement;
2689 if (r10_bio->devs[m].repl_bio == NULL)
2690 continue;
2691 if (test_bit(BIO_UPTODATE,
2692 &r10_bio->devs[m].repl_bio->bi_flags)) {
2693 rdev_clear_badblocks(
2694 rdev,
2695 r10_bio->devs[m].addr,
2696 r10_bio->sectors, 0);
2697 } else { 2166 } else {
2698 if (!rdev_set_badblocks( 2167 if (!rdev_set_badblocks(
2699 rdev, 2168 rdev,
@@ -2712,7 +2181,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2712 rdev_clear_badblocks( 2181 rdev_clear_badblocks(
2713 rdev, 2182 rdev,
2714 r10_bio->devs[m].addr, 2183 r10_bio->devs[m].addr,
2715 r10_bio->sectors, 0); 2184 r10_bio->sectors);
2716 rdev_dec_pending(rdev, conf->mddev); 2185 rdev_dec_pending(rdev, conf->mddev);
2717 } else if (bio != NULL && 2186 } else if (bio != NULL &&
2718 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2187 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2723,15 +2192,6 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2723 } 2192 }
2724 rdev_dec_pending(rdev, conf->mddev); 2193 rdev_dec_pending(rdev, conf->mddev);
2725 } 2194 }
2726 bio = r10_bio->devs[m].repl_bio;
2727 rdev = conf->mirrors[dev].replacement;
2728 if (rdev && bio == IO_MADE_GOOD) {
2729 rdev_clear_badblocks(
2730 rdev,
2731 r10_bio->devs[m].addr,
2732 r10_bio->sectors, 0);
2733 rdev_dec_pending(rdev, conf->mddev);
2734 }
2735 } 2195 }
2736 if (test_bit(R10BIO_WriteError, 2196 if (test_bit(R10BIO_WriteError,
2737 &r10_bio->state)) 2197 &r10_bio->state))
@@ -2740,12 +2200,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2740 } 2200 }
2741} 2201}
2742 2202
2743static void raid10d(struct md_thread *thread) 2203static void raid10d(mddev_t *mddev)
2744{ 2204{
2745 struct mddev *mddev = thread->mddev; 2205 r10bio_t *r10_bio;
2746 struct r10bio *r10_bio;
2747 unsigned long flags; 2206 unsigned long flags;
2748 struct r10conf *conf = mddev->private; 2207 conf_t *conf = mddev->private;
2749 struct list_head *head = &conf->retry_list; 2208 struct list_head *head = &conf->retry_list;
2750 struct blk_plug plug; 2209 struct blk_plug plug;
2751 2210
@@ -2761,7 +2220,7 @@ static void raid10d(struct md_thread *thread)
2761 spin_unlock_irqrestore(&conf->device_lock, flags); 2220 spin_unlock_irqrestore(&conf->device_lock, flags);
2762 break; 2221 break;
2763 } 2222 }
2764 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 2223 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
2765 list_del(head->prev); 2224 list_del(head->prev);
2766 conf->nr_queued--; 2225 conf->nr_queued--;
2767 spin_unlock_irqrestore(&conf->device_lock, flags); 2226 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2771,8 +2230,6 @@ static void raid10d(struct md_thread *thread)
2771 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2230 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2772 test_bit(R10BIO_WriteError, &r10_bio->state)) 2231 test_bit(R10BIO_WriteError, &r10_bio->state))
2773 handle_write_completed(conf, r10_bio); 2232 handle_write_completed(conf, r10_bio);
2774 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2775 reshape_request_write(mddev, r10_bio);
2776 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2233 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2777 sync_request_write(mddev, r10_bio); 2234 sync_request_write(mddev, r10_bio);
2778 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2235 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2795,17 +2252,12 @@ static void raid10d(struct md_thread *thread)
2795} 2252}
2796 2253
2797 2254
2798static int init_resync(struct r10conf *conf) 2255static int init_resync(conf_t *conf)
2799{ 2256{
2800 int buffs; 2257 int buffs;
2801 int i;
2802 2258
2803 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2259 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2804 BUG_ON(conf->r10buf_pool); 2260 BUG_ON(conf->r10buf_pool);
2805 conf->have_replacement = 0;
2806 for (i = 0; i < conf->geo.raid_disks; i++)
2807 if (conf->mirrors[i].replacement)
2808 conf->have_replacement = 1;
2809 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2261 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2810 if (!conf->r10buf_pool) 2262 if (!conf->r10buf_pool)
2811 return -ENOMEM; 2263 return -ENOMEM;
@@ -2845,11 +2297,11 @@ static int init_resync(struct r10conf *conf)
2845 * 2297 *
2846 */ 2298 */
2847 2299
2848static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, 2300static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2849 int *skipped, int go_faster) 2301 int *skipped, int go_faster)
2850{ 2302{
2851 struct r10conf *conf = mddev->private; 2303 conf_t *conf = mddev->private;
2852 struct r10bio *r10_bio; 2304 r10bio_t *r10_bio;
2853 struct bio *biolist = NULL, *bio; 2305 struct bio *biolist = NULL, *bio;
2854 sector_t max_sector, nr_sectors; 2306 sector_t max_sector, nr_sectors;
2855 int i; 2307 int i;
@@ -2857,7 +2309,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2857 sector_t sync_blocks; 2309 sector_t sync_blocks;
2858 sector_t sectors_skipped = 0; 2310 sector_t sectors_skipped = 0;
2859 int chunks_skipped = 0; 2311 int chunks_skipped = 0;
2860 sector_t chunk_mask = conf->geo.chunk_mask;
2861 2312
2862 if (!conf->r10buf_pool) 2313 if (!conf->r10buf_pool)
2863 if (init_resync(conf)) 2314 if (init_resync(conf))
@@ -2865,8 +2316,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2865 2316
2866 skipped: 2317 skipped:
2867 max_sector = mddev->dev_sectors; 2318 max_sector = mddev->dev_sectors;
2868 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 2319 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2869 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2870 max_sector = mddev->resync_max_sectors; 2320 max_sector = mddev->resync_max_sectors;
2871 if (sector_nr >= max_sector) { 2321 if (sector_nr >= max_sector) {
2872 /* If we aborted, we need to abort the 2322 /* If we aborted, we need to abort the
@@ -2878,47 +2328,25 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2878 * we need to convert that to several 2328 * we need to convert that to several
2879 * virtual addresses. 2329 * virtual addresses.
2880 */ 2330 */
2881 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2882 end_reshape(conf);
2883 return 0;
2884 }
2885
2886 if (mddev->curr_resync < max_sector) { /* aborted */ 2331 if (mddev->curr_resync < max_sector) { /* aborted */
2887 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2332 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2888 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2333 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2889 &sync_blocks, 1); 2334 &sync_blocks, 1);
2890 else for (i = 0; i < conf->geo.raid_disks; i++) { 2335 else for (i=0; i<conf->raid_disks; i++) {
2891 sector_t sect = 2336 sector_t sect =
2892 raid10_find_virt(conf, mddev->curr_resync, i); 2337 raid10_find_virt(conf, mddev->curr_resync, i);
2893 bitmap_end_sync(mddev->bitmap, sect, 2338 bitmap_end_sync(mddev->bitmap, sect,
2894 &sync_blocks, 1); 2339 &sync_blocks, 1);
2895 } 2340 }
2896 } else { 2341 } else /* completed sync */
2897 /* completed sync */
2898 if ((!mddev->bitmap || conf->fullsync)
2899 && conf->have_replacement
2900 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2901 /* Completed a full sync so the replacements
2902 * are now fully recovered.
2903 */
2904 for (i = 0; i < conf->geo.raid_disks; i++)
2905 if (conf->mirrors[i].replacement)
2906 conf->mirrors[i].replacement
2907 ->recovery_offset
2908 = MaxSector;
2909 }
2910 conf->fullsync = 0; 2342 conf->fullsync = 0;
2911 } 2343
2912 bitmap_close_sync(mddev->bitmap); 2344 bitmap_close_sync(mddev->bitmap);
2913 close_sync(conf); 2345 close_sync(conf);
2914 *skipped = 1; 2346 *skipped = 1;
2915 return sectors_skipped; 2347 return sectors_skipped;
2916 } 2348 }
2917 2349 if (chunks_skipped >= conf->raid_disks) {
2918 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2919 return reshape_request(mddev, sector_nr, skipped);
2920
2921 if (chunks_skipped >= conf->geo.raid_disks) {
2922 /* if there has been nothing to do on any drive, 2350 /* if there has been nothing to do on any drive,
2923 * then there is nothing to do at all.. 2351 * then there is nothing to do at all..
2924 */ 2352 */
@@ -2932,9 +2360,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2932 /* make sure whole request will fit in a chunk - if chunks 2360 /* make sure whole request will fit in a chunk - if chunks
2933 * are meaningful 2361 * are meaningful
2934 */ 2362 */
2935 if (conf->geo.near_copies < conf->geo.raid_disks && 2363 if (conf->near_copies < conf->raid_disks &&
2936 max_sector > (sector_nr | chunk_mask)) 2364 max_sector > (sector_nr | conf->chunk_mask))
2937 max_sector = (sector_nr | chunk_mask) + 1; 2365 max_sector = (sector_nr | conf->chunk_mask) + 1;
2938 /* 2366 /*
2939 * If there is non-resync activity waiting for us then 2367 * If there is non-resync activity waiting for us then
2940 * put in a delay to throttle resync. 2368 * put in a delay to throttle resync.
@@ -2963,42 +2391,29 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2963 int j; 2391 int j;
2964 r10_bio = NULL; 2392 r10_bio = NULL;
2965 2393
2966 for (i = 0 ; i < conf->geo.raid_disks; i++) { 2394 for (i=0 ; i<conf->raid_disks; i++) {
2967 int still_degraded; 2395 int still_degraded;
2968 struct r10bio *rb2; 2396 r10bio_t *rb2;
2969 sector_t sect; 2397 sector_t sect;
2970 int must_sync; 2398 int must_sync;
2971 int any_working; 2399 int any_working;
2972 struct raid10_info *mirror = &conf->mirrors[i]; 2400
2973 2401 if (conf->mirrors[i].rdev == NULL ||
2974 if ((mirror->rdev == NULL || 2402 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
2975 test_bit(In_sync, &mirror->rdev->flags))
2976 &&
2977 (mirror->replacement == NULL ||
2978 test_bit(Faulty,
2979 &mirror->replacement->flags)))
2980 continue; 2403 continue;
2981 2404
2982 still_degraded = 0; 2405 still_degraded = 0;
2983 /* want to reconstruct this device */ 2406 /* want to reconstruct this device */
2984 rb2 = r10_bio; 2407 rb2 = r10_bio;
2985 sect = raid10_find_virt(conf, sector_nr, i); 2408 sect = raid10_find_virt(conf, sector_nr, i);
2986 if (sect >= mddev->resync_max_sectors) { 2409 /* Unless we are doing a full sync, we only need
2987 /* last stripe is not complete - don't 2410 * to recover the block if it is set in the bitmap
2988 * try to recover this sector.
2989 */
2990 continue;
2991 }
2992 /* Unless we are doing a full sync, or a replacement
2993 * we only need to recover the block if it is set in
2994 * the bitmap
2995 */ 2411 */
2996 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2412 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2997 &sync_blocks, 1); 2413 &sync_blocks, 1);
2998 if (sync_blocks < max_sync) 2414 if (sync_blocks < max_sync)
2999 max_sync = sync_blocks; 2415 max_sync = sync_blocks;
3000 if (!must_sync && 2416 if (!must_sync &&
3001 mirror->replacement == NULL &&
3002 !conf->fullsync) { 2417 !conf->fullsync) {
3003 /* yep, skip the sync_blocks here, but don't assume 2418 /* yep, skip the sync_blocks here, but don't assume
3004 * that there will never be anything to do here 2419 * that there will never be anything to do here
@@ -3023,7 +2438,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3023 /* Need to check if the array will still be 2438 /* Need to check if the array will still be
3024 * degraded 2439 * degraded
3025 */ 2440 */
3026 for (j = 0; j < conf->geo.raid_disks; j++) 2441 for (j=0; j<conf->raid_disks; j++)
3027 if (conf->mirrors[j].rdev == NULL || 2442 if (conf->mirrors[j].rdev == NULL ||
3028 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 2443 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3029 still_degraded = 1; 2444 still_degraded = 1;
@@ -3038,7 +2453,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3038 int k; 2453 int k;
3039 int d = r10_bio->devs[j].devnum; 2454 int d = r10_bio->devs[j].devnum;
3040 sector_t from_addr, to_addr; 2455 sector_t from_addr, to_addr;
3041 struct md_rdev *rdev; 2456 mdk_rdev_t *rdev;
3042 sector_t sector, first_bad; 2457 sector_t sector, first_bad;
3043 int bad_sectors; 2458 int bad_sectors;
3044 if (!conf->mirrors[d].rdev || 2459 if (!conf->mirrors[d].rdev ||
@@ -3068,60 +2483,33 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3068 bio->bi_end_io = end_sync_read; 2483 bio->bi_end_io = end_sync_read;
3069 bio->bi_rw = READ; 2484 bio->bi_rw = READ;
3070 from_addr = r10_bio->devs[j].addr; 2485 from_addr = r10_bio->devs[j].addr;
3071 bio->bi_sector = from_addr + rdev->data_offset; 2486 bio->bi_sector = from_addr +
3072 bio->bi_bdev = rdev->bdev; 2487 conf->mirrors[d].rdev->data_offset;
3073 atomic_inc(&rdev->nr_pending); 2488 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3074 /* and we write to 'i' (if not in_sync) */ 2489 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2490 atomic_inc(&r10_bio->remaining);
2491 /* and we write to 'i' */
3075 2492
3076 for (k=0; k<conf->copies; k++) 2493 for (k=0; k<conf->copies; k++)
3077 if (r10_bio->devs[k].devnum == i) 2494 if (r10_bio->devs[k].devnum == i)
3078 break; 2495 break;
3079 BUG_ON(k == conf->copies); 2496 BUG_ON(k == conf->copies);
2497 bio = r10_bio->devs[1].bio;
2498 bio->bi_next = biolist;
2499 biolist = bio;
2500 bio->bi_private = r10_bio;
2501 bio->bi_end_io = end_sync_write;
2502 bio->bi_rw = WRITE;
3080 to_addr = r10_bio->devs[k].addr; 2503 to_addr = r10_bio->devs[k].addr;
2504 bio->bi_sector = to_addr +
2505 conf->mirrors[i].rdev->data_offset;
2506 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
2507
3081 r10_bio->devs[0].devnum = d; 2508 r10_bio->devs[0].devnum = d;
3082 r10_bio->devs[0].addr = from_addr; 2509 r10_bio->devs[0].addr = from_addr;
3083 r10_bio->devs[1].devnum = i; 2510 r10_bio->devs[1].devnum = i;
3084 r10_bio->devs[1].addr = to_addr; 2511 r10_bio->devs[1].addr = to_addr;
3085 2512
3086 rdev = mirror->rdev;
3087 if (!test_bit(In_sync, &rdev->flags)) {
3088 bio = r10_bio->devs[1].bio;
3089 bio->bi_next = biolist;
3090 biolist = bio;
3091 bio->bi_private = r10_bio;
3092 bio->bi_end_io = end_sync_write;
3093 bio->bi_rw = WRITE;
3094 bio->bi_sector = to_addr
3095 + rdev->data_offset;
3096 bio->bi_bdev = rdev->bdev;
3097 atomic_inc(&r10_bio->remaining);
3098 } else
3099 r10_bio->devs[1].bio->bi_end_io = NULL;
3100
3101 /* and maybe write to replacement */
3102 bio = r10_bio->devs[1].repl_bio;
3103 if (bio)
3104 bio->bi_end_io = NULL;
3105 rdev = mirror->replacement;
3106 /* Note: if rdev != NULL, then bio
3107 * cannot be NULL as r10buf_pool_alloc will
3108 * have allocated it.
3109 * So the second test here is pointless.
3110 * But it keeps semantic-checkers happy, and
3111 * this comment keeps human reviewers
3112 * happy.
3113 */
3114 if (rdev == NULL || bio == NULL ||
3115 test_bit(Faulty, &rdev->flags))
3116 break;
3117 bio->bi_next = biolist;
3118 biolist = bio;
3119 bio->bi_private = r10_bio;
3120 bio->bi_end_io = end_sync_write;
3121 bio->bi_rw = WRITE;
3122 bio->bi_sector = to_addr + rdev->data_offset;
3123 bio->bi_bdev = rdev->bdev;
3124 atomic_inc(&r10_bio->remaining);
3125 break; 2513 break;
3126 } 2514 }
3127 if (j == conf->copies) { 2515 if (j == conf->copies) {
@@ -3139,16 +2527,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3139 for (k = 0; k < conf->copies; k++) 2527 for (k = 0; k < conf->copies; k++)
3140 if (r10_bio->devs[k].devnum == i) 2528 if (r10_bio->devs[k].devnum == i)
3141 break; 2529 break;
3142 if (!test_bit(In_sync, 2530 if (!rdev_set_badblocks(
3143 &mirror->rdev->flags) 2531 conf->mirrors[i].rdev,
3144 && !rdev_set_badblocks(
3145 mirror->rdev,
3146 r10_bio->devs[k].addr,
3147 max_sync, 0))
3148 any_working = 0;
3149 if (mirror->replacement &&
3150 !rdev_set_badblocks(
3151 mirror->replacement,
3152 r10_bio->devs[k].addr, 2532 r10_bio->devs[k].addr,
3153 max_sync, 0)) 2533 max_sync, 0))
3154 any_working = 0; 2534 any_working = 0;
@@ -3159,7 +2539,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3159 printk(KERN_INFO "md/raid10:%s: insufficient " 2539 printk(KERN_INFO "md/raid10:%s: insufficient "
3160 "working devices for recovery.\n", 2540 "working devices for recovery.\n",
3161 mdname(mddev)); 2541 mdname(mddev));
3162 mirror->recovery_disabled 2542 conf->mirrors[i].recovery_disabled
3163 = mddev->recovery_disabled; 2543 = mddev->recovery_disabled;
3164 } 2544 }
3165 break; 2545 break;
@@ -3167,8 +2547,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3167 } 2547 }
3168 if (biolist == NULL) { 2548 if (biolist == NULL) {
3169 while (r10_bio) { 2549 while (r10_bio) {
3170 struct r10bio *rb2 = r10_bio; 2550 r10bio_t *rb2 = r10_bio;
3171 r10_bio = (struct r10bio*) rb2->master_bio; 2551 r10_bio = (r10bio_t*) rb2->master_bio;
3172 rb2->master_bio = NULL; 2552 rb2->master_bio = NULL;
3173 put_buf(rb2); 2553 put_buf(rb2);
3174 } 2554 }
@@ -3201,16 +2581,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3201 r10_bio->sector = sector_nr; 2581 r10_bio->sector = sector_nr;
3202 set_bit(R10BIO_IsSync, &r10_bio->state); 2582 set_bit(R10BIO_IsSync, &r10_bio->state);
3203 raid10_find_phys(conf, r10_bio); 2583 raid10_find_phys(conf, r10_bio);
3204 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 2584 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
3205 2585
3206 for (i = 0; i < conf->copies; i++) { 2586 for (i=0; i<conf->copies; i++) {
3207 int d = r10_bio->devs[i].devnum; 2587 int d = r10_bio->devs[i].devnum;
3208 sector_t first_bad, sector; 2588 sector_t first_bad, sector;
3209 int bad_sectors; 2589 int bad_sectors;
3210 2590
3211 if (r10_bio->devs[i].repl_bio)
3212 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3213
3214 bio = r10_bio->devs[i].bio; 2591 bio = r10_bio->devs[i].bio;
3215 bio->bi_end_io = NULL; 2592 bio->bi_end_io = NULL;
3216 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2593 clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -3226,7 +2603,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3226 else { 2603 else {
3227 bad_sectors -= (sector - first_bad); 2604 bad_sectors -= (sector - first_bad);
3228 if (max_sync > bad_sectors) 2605 if (max_sync > bad_sectors)
3229 max_sync = bad_sectors; 2606 max_sync = max_sync;
3230 continue; 2607 continue;
3231 } 2608 }
3232 } 2609 }
@@ -3241,27 +2618,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3241 conf->mirrors[d].rdev->data_offset; 2618 conf->mirrors[d].rdev->data_offset;
3242 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2619 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3243 count++; 2620 count++;
3244
3245 if (conf->mirrors[d].replacement == NULL ||
3246 test_bit(Faulty,
3247 &conf->mirrors[d].replacement->flags))
3248 continue;
3249
3250 /* Need to set up for writing to the replacement */
3251 bio = r10_bio->devs[i].repl_bio;
3252 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3253
3254 sector = r10_bio->devs[i].addr;
3255 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3256 bio->bi_next = biolist;
3257 biolist = bio;
3258 bio->bi_private = r10_bio;
3259 bio->bi_end_io = end_sync_write;
3260 bio->bi_rw = WRITE;
3261 bio->bi_sector = sector +
3262 conf->mirrors[d].replacement->data_offset;
3263 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3264 count++;
3265 } 2621 }
3266 2622
3267 if (count < 2) { 2623 if (count < 2) {
@@ -3270,11 +2626,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3270 if (r10_bio->devs[i].bio->bi_end_io) 2626 if (r10_bio->devs[i].bio->bi_end_io)
3271 rdev_dec_pending(conf->mirrors[d].rdev, 2627 rdev_dec_pending(conf->mirrors[d].rdev,
3272 mddev); 2628 mddev);
3273 if (r10_bio->devs[i].repl_bio &&
3274 r10_bio->devs[i].repl_bio->bi_end_io)
3275 rdev_dec_pending(
3276 conf->mirrors[d].replacement,
3277 mddev);
3278 } 2629 }
3279 put_buf(r10_bio); 2630 put_buf(r10_bio);
3280 biolist = NULL; 2631 biolist = NULL;
@@ -3363,126 +2714,57 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3363} 2714}
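
The clamp near the top of sync_request() keeps each pass inside one chunk: with chunk_mask equal to chunk_sectors - 1, the expression (sector_nr | chunk_mask) + 1 is the first sector of the next chunk, so max_sector is capped there whenever near_copies < raid_disks. A minimal userspace sketch of just that arithmetic, using a hypothetical 128-sector (64 KiB) chunk and no md internals:

#include <stdio.h>
#include <stdint.h>

/* Round sector_nr up to the start of the next chunk, as sync_request() does
 * with "max_sector = (sector_nr | chunk_mask) + 1".  chunk_sectors must be a
 * power of two, so chunk_mask = chunk_sectors - 1 has all the low bits set.
 */
static uint64_t next_chunk_boundary(uint64_t sector_nr, uint64_t chunk_sectors)
{
        uint64_t chunk_mask = chunk_sectors - 1;
        return (sector_nr | chunk_mask) + 1;
}

int main(void)
{
        uint64_t chunk_sectors = 128;   /* hypothetical 64 KiB chunk */
        uint64_t sectors[] = { 0, 127, 128, 300 };

        for (int i = 0; i < 4; i++)
                printf("sector %llu -> next chunk boundary %llu\n",
                       (unsigned long long)sectors[i],
                       (unsigned long long)next_chunk_boundary(sectors[i],
                                                               chunk_sectors));
        return 0;
}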
3364 2715
3365static sector_t 2716static sector_t
3366raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 2717raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
3367{ 2718{
3368 sector_t size; 2719 sector_t size;
3369 struct r10conf *conf = mddev->private; 2720 conf_t *conf = mddev->private;
3370 2721
3371 if (!raid_disks) 2722 if (!raid_disks)
3372 raid_disks = min(conf->geo.raid_disks, 2723 raid_disks = conf->raid_disks;
3373 conf->prev.raid_disks);
3374 if (!sectors) 2724 if (!sectors)
3375 sectors = conf->dev_sectors; 2725 sectors = conf->dev_sectors;
3376 2726
3377 size = sectors >> conf->geo.chunk_shift; 2727 size = sectors >> conf->chunk_shift;
3378 sector_div(size, conf->geo.far_copies); 2728 sector_div(size, conf->far_copies);
3379 size = size * raid_disks; 2729 size = size * raid_disks;
3380 sector_div(size, conf->geo.near_copies); 2730 sector_div(size, conf->near_copies);
3381 2731
3382 return size << conf->geo.chunk_shift; 2732 return size << conf->chunk_shift;
3383} 2733}
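
raid10_size() converts per-device capacity into exported array capacity: chunks per device are divided among the far copies, multiplied by the number of disks, then divided by the near copies, and the result is converted back to sectors. A standalone sketch of the same arithmetic, with plain truncating division standing in for sector_div() and a purely illustrative geometry (4 disks, default n2 layout, 64 KiB chunks):

#include <stdio.h>
#include <stdint.h>

/* Model of the raid10_size() arithmetic: sectors the array exports for a
 * given per-device size and geometry.  chunk_shift is log2(chunk sectors).
 */
static uint64_t raid10_array_sectors(uint64_t dev_sectors, int raid_disks,
                                     int near_copies, int far_copies,
                                     int chunk_shift)
{
        uint64_t size = dev_sectors >> chunk_shift;     /* chunks per device */

        size /= far_copies;
        size *= raid_disks;
        size /= near_copies;
        return size << chunk_shift;
}

int main(void)
{
        /* Example only: 4 disks, near=2, far=1, 128-sector chunks,
         * 1 GiB (2097152 sectors) per disk -> 2 GiB exported.
         */
        printf("array sectors: %llu\n",
               (unsigned long long)raid10_array_sectors(2097152, 4, 2, 1, 7));
        return 0;
}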
3384 2734
3385static void calc_sectors(struct r10conf *conf, sector_t size)
3386{
3387 /* Calculate the number of sectors-per-device that will
3388 * actually be used, and set conf->dev_sectors and
3389 * conf->stride
3390 */
3391
3392 size = size >> conf->geo.chunk_shift;
3393 sector_div(size, conf->geo.far_copies);
3394 size = size * conf->geo.raid_disks;
3395 sector_div(size, conf->geo.near_copies);
3396 /* 'size' is now the number of chunks in the array */
3397 /* calculate "used chunks per device" */
3398 size = size * conf->copies;
3399
3400 /* We need to round up when dividing by raid_disks to
3401 * get the stride size.
3402 */
3403 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3404
3405 conf->dev_sectors = size << conf->geo.chunk_shift;
3406
3407 if (conf->geo.far_offset)
3408 conf->geo.stride = 1 << conf->geo.chunk_shift;
3409 else {
3410 sector_div(size, conf->geo.far_copies);
3411 conf->geo.stride = size << conf->geo.chunk_shift;
3412 }
3413}
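
calc_sectors() runs the same geometry arithmetic in the other direction: from the device size it derives the number of chunks actually used per device (rounded up, so nothing is lost to truncation) and the stride between far copies, which is one chunk for far_offset layouts and dev_sectors / far_copies otherwise. A rough userspace model of that calculation, assuming copies = near_copies * far_copies as setup_geo() returns, with an illustrative f2 geometry:

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP_U64(n, d)  (((n) + (d) - 1) / (d))

struct geo_sketch {
        int raid_disks, near_copies, far_copies, far_offset, chunk_shift;
};

/* Derive used sectors per device and the far-copy stride, mirroring the
 * steps in calc_sectors(); plain division stands in for sector_div().
 */
static void calc_sectors_sketch(const struct geo_sketch *g, uint64_t size,
                                uint64_t *dev_sectors, uint64_t *stride)
{
        size >>= g->chunk_shift;                      /* chunks per device   */
        size /= g->far_copies;
        size *= g->raid_disks;
        size /= g->near_copies;                       /* chunks in the array */
        size *= g->near_copies * g->far_copies;       /* total used chunks   */
        size = DIV_ROUND_UP_U64(size, g->raid_disks); /* used chunks/device  */
        *dev_sectors = size << g->chunk_shift;

        if (g->far_offset)
                *stride = 1ULL << g->chunk_shift;
        else
                *stride = (size / g->far_copies) << g->chunk_shift;
}

int main(void)
{
        struct geo_sketch g = { 4, 1, 2, 0, 7 };      /* example f2 layout */
        uint64_t dev_sectors, stride;

        calc_sectors_sketch(&g, 2097152, &dev_sectors, &stride);
        printf("dev_sectors=%llu stride=%llu\n",
               (unsigned long long)dev_sectors, (unsigned long long)stride);
        return 0;
}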
3414 2735
3415enum geo_type {geo_new, geo_old, geo_start}; 2736static conf_t *setup_conf(mddev_t *mddev)
3416static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3417{ 2737{
2738 conf_t *conf = NULL;
3418 int nc, fc, fo; 2739 int nc, fc, fo;
3419 int layout, chunk, disks; 2740 sector_t stride, size;
3420 switch (new) {
3421 case geo_old:
3422 layout = mddev->layout;
3423 chunk = mddev->chunk_sectors;
3424 disks = mddev->raid_disks - mddev->delta_disks;
3425 break;
3426 case geo_new:
3427 layout = mddev->new_layout;
3428 chunk = mddev->new_chunk_sectors;
3429 disks = mddev->raid_disks;
3430 break;
3431 default: /* avoid 'may be unused' warnings */
3432 case geo_start: /* new when starting reshape - raid_disks not
3433 * updated yet. */
3434 layout = mddev->new_layout;
3435 chunk = mddev->new_chunk_sectors;
3436 disks = mddev->raid_disks + mddev->delta_disks;
3437 break;
3438 }
3439 if (layout >> 17)
3440 return -1;
3441 if (chunk < (PAGE_SIZE >> 9) ||
3442 !is_power_of_2(chunk))
3443 return -2;
3444 nc = layout & 255;
3445 fc = (layout >> 8) & 255;
3446 fo = layout & (1<<16);
3447 geo->raid_disks = disks;
3448 geo->near_copies = nc;
3449 geo->far_copies = fc;
3450 geo->far_offset = fo;
3451 geo->chunk_mask = chunk - 1;
3452 geo->chunk_shift = ffz(~chunk);
3453 return nc*fc;
3454}
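
setup_geo() is where the raid10 layout word gets decoded: the low byte is the near-copy count, the next byte the far-copy count, and bit 16 selects the "offset" arrangement of far copies; the function returns nc*fc, the total number of copies. A small decoder for the same bit layout (the sample values, 0x102 for n2, 0x201 for f2 and 0x10201 for o2, are just examples):

#include <stdio.h>

/* Decode a raid10 layout word the way setup_geo() does:
 *   bits 0-7   near copies
 *   bits 8-15  far copies
 *   bit 16     far copies use the "offset" arrangement
 * Returns nc * fc, the total copy count.
 */
static int decode_raid10_layout(int layout, int *nc, int *fc, int *fo)
{
        *nc = layout & 255;
        *fc = (layout >> 8) & 255;
        *fo = layout & (1 << 16);
        return (*nc) * (*fc);
}

int main(void)
{
        int layouts[] = { 0x102, 0x201, 0x10201 };

        for (int i = 0; i < 3; i++) {
                int nc, fc, fo;
                int copies = decode_raid10_layout(layouts[i], &nc, &fc, &fo);
                printf("layout 0x%x: near=%d far=%d offset=%s copies=%d\n",
                       layouts[i], nc, fc, fo ? "yes" : "no", copies);
        }
        return 0;
}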
3455
3456static struct r10conf *setup_conf(struct mddev *mddev)
3457{
3458 struct r10conf *conf = NULL;
3459 int err = -EINVAL; 2741 int err = -EINVAL;
3460 struct geom geo;
3461 int copies;
3462
3463 copies = setup_geo(&geo, mddev, geo_new);
3464 2742
3465 if (copies == -2) { 2743 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2744 !is_power_of_2(mddev->new_chunk_sectors)) {
3466 printk(KERN_ERR "md/raid10:%s: chunk size must be " 2745 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3467 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 2746 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3468 mdname(mddev), PAGE_SIZE); 2747 mdname(mddev), PAGE_SIZE);
3469 goto out; 2748 goto out;
3470 } 2749 }
3471 2750
3472 if (copies < 2 || copies > mddev->raid_disks) { 2751 nc = mddev->new_layout & 255;
2752 fc = (mddev->new_layout >> 8) & 255;
2753 fo = mddev->new_layout & (1<<16);
2754
2755 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2756 (mddev->new_layout >> 17)) {
3473 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 2757 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3474 mdname(mddev), mddev->new_layout); 2758 mdname(mddev), mddev->new_layout);
3475 goto out; 2759 goto out;
3476 } 2760 }
3477 2761
3478 err = -ENOMEM; 2762 err = -ENOMEM;
3479 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 2763 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
3480 if (!conf) 2764 if (!conf)
3481 goto out; 2765 goto out;
3482 2766
3483 /* FIXME calc properly */ 2767 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
3484 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3485 max(0,mddev->delta_disks)),
3486 GFP_KERNEL); 2768 GFP_KERNEL);
3487 if (!conf->mirrors) 2769 if (!conf->mirrors)
3488 goto out; 2770 goto out;
@@ -3491,36 +2773,50 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3491 if (!conf->tmppage) 2773 if (!conf->tmppage)
3492 goto out; 2774 goto out;
3493 2775
3494 conf->geo = geo; 2776
3495 conf->copies = copies; 2777 conf->raid_disks = mddev->raid_disks;
2778 conf->near_copies = nc;
2779 conf->far_copies = fc;
2780 conf->copies = nc*fc;
2781 conf->far_offset = fo;
2782 conf->chunk_mask = mddev->new_chunk_sectors - 1;
2783 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2784
3496 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2785 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3497 r10bio_pool_free, conf); 2786 r10bio_pool_free, conf);
3498 if (!conf->r10bio_pool) 2787 if (!conf->r10bio_pool)
3499 goto out; 2788 goto out;
3500 2789
3501 calc_sectors(conf, mddev->dev_sectors); 2790 size = mddev->dev_sectors >> conf->chunk_shift;
3502 if (mddev->reshape_position == MaxSector) { 2791 sector_div(size, fc);
3503 conf->prev = conf->geo; 2792 size = size * conf->raid_disks;
3504 conf->reshape_progress = MaxSector; 2793 sector_div(size, nc);
3505 } else { 2794 /* 'size' is now the number of chunks in the array */
3506 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 2795 /* calculate "used chunks per device" in 'stride' */
3507 err = -EINVAL; 2796 stride = size * conf->copies;
3508 goto out; 2797
3509 } 2798 /* We need to round up when dividing by raid_disks to
3510 conf->reshape_progress = mddev->reshape_position; 2799 * get the stride size.
3511 if (conf->prev.far_offset) 2800 */
3512 conf->prev.stride = 1 << conf->prev.chunk_shift; 2801 stride += conf->raid_disks - 1;
3513 else 2802 sector_div(stride, conf->raid_disks);
3514 /* far_copies must be 1 */ 2803
3515 conf->prev.stride = conf->dev_sectors; 2804 conf->dev_sectors = stride << conf->chunk_shift;
3516 } 2805
2806 if (fo)
2807 stride = 1;
2808 else
2809 sector_div(stride, fc);
2810 conf->stride = stride << conf->chunk_shift;
2811
2812
3517 spin_lock_init(&conf->device_lock); 2813 spin_lock_init(&conf->device_lock);
3518 INIT_LIST_HEAD(&conf->retry_list); 2814 INIT_LIST_HEAD(&conf->retry_list);
3519 2815
3520 spin_lock_init(&conf->resync_lock); 2816 spin_lock_init(&conf->resync_lock);
3521 init_waitqueue_head(&conf->wait_barrier); 2817 init_waitqueue_head(&conf->wait_barrier);
3522 2818
3523 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 2819 conf->thread = md_register_thread(raid10d, mddev, NULL);
3524 if (!conf->thread) 2820 if (!conf->thread)
3525 goto out; 2821 goto out;
3526 2822
@@ -3528,9 +2824,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3528 return conf; 2824 return conf;
3529 2825
3530 out: 2826 out:
3531 if (err == -ENOMEM) 2827 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3532 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 2828 mdname(mddev));
3533 mdname(mddev));
3534 if (conf) { 2829 if (conf) {
3535 if (conf->r10bio_pool) 2830 if (conf->r10bio_pool)
3536 mempool_destroy(conf->r10bio_pool); 2831 mempool_destroy(conf->r10bio_pool);
@@ -3541,16 +2836,19 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3541 return ERR_PTR(err); 2836 return ERR_PTR(err);
3542} 2837}
3543 2838
3544static int run(struct mddev *mddev) 2839static int run(mddev_t *mddev)
3545{ 2840{
3546 struct r10conf *conf; 2841 conf_t *conf;
3547 int i, disk_idx, chunk_size; 2842 int i, disk_idx, chunk_size;
3548 struct raid10_info *disk; 2843 mirror_info_t *disk;
3549 struct md_rdev *rdev; 2844 mdk_rdev_t *rdev;
3550 sector_t size; 2845 sector_t size;
3551 sector_t min_offset_diff = 0; 2846
3552 int first = 1; 2847 /*
3553 bool discard_supported = false; 2848 * copy the already verified devices into our private RAID10
2849 * bookkeeping area. [whatever we allocate in run(),
2850 * should be freed in stop()]
2851 */
3554 2852
3555 if (mddev->private == NULL) { 2853 if (mddev->private == NULL) {
3556 conf = setup_conf(mddev); 2854 conf = setup_conf(mddev);
@@ -3566,66 +2864,35 @@ static int run(struct mddev *mddev)
3566 conf->thread = NULL; 2864 conf->thread = NULL;
3567 2865
3568 chunk_size = mddev->chunk_sectors << 9; 2866 chunk_size = mddev->chunk_sectors << 9;
3569 if (mddev->queue) { 2867 blk_queue_io_min(mddev->queue, chunk_size);
3570 blk_queue_max_discard_sectors(mddev->queue, 2868 if (conf->raid_disks % conf->near_copies)
3571 mddev->chunk_sectors); 2869 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
3572 blk_queue_io_min(mddev->queue, chunk_size); 2870 else
3573 if (conf->geo.raid_disks % conf->geo.near_copies) 2871 blk_queue_io_opt(mddev->queue, chunk_size *
3574 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 2872 (conf->raid_disks / conf->near_copies));
3575 else
3576 blk_queue_io_opt(mddev->queue, chunk_size *
3577 (conf->geo.raid_disks / conf->geo.near_copies));
3578 }
3579 2873
3580 rdev_for_each(rdev, mddev) { 2874 list_for_each_entry(rdev, &mddev->disks, same_set) {
3581 long long diff;
3582 struct request_queue *q;
3583 2875
3584 disk_idx = rdev->raid_disk; 2876 disk_idx = rdev->raid_disk;
3585 if (disk_idx < 0) 2877 if (disk_idx >= conf->raid_disks
3586 continue; 2878 || disk_idx < 0)
3587 if (disk_idx >= conf->geo.raid_disks &&
3588 disk_idx >= conf->prev.raid_disks)
3589 continue; 2879 continue;
3590 disk = conf->mirrors + disk_idx; 2880 disk = conf->mirrors + disk_idx;
3591 2881
3592 if (test_bit(Replacement, &rdev->flags)) { 2882 disk->rdev = rdev;
3593 if (disk->replacement) 2883 disk_stack_limits(mddev->gendisk, rdev->bdev,
3594 goto out_free_conf; 2884 rdev->data_offset << 9);
3595 disk->replacement = rdev; 2885 /* as we don't honour merge_bvec_fn, we must never risk
3596 } else { 2886 * violating it, so limit max_segments to 1 lying
3597 if (disk->rdev) 2887 * within a single page.
3598 goto out_free_conf; 2888 */
3599 disk->rdev = rdev; 2889 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2890 blk_queue_max_segments(mddev->queue, 1);
2891 blk_queue_segment_boundary(mddev->queue,
2892 PAGE_CACHE_SIZE - 1);
3600 } 2893 }
3601 q = bdev_get_queue(rdev->bdev);
3602 if (q->merge_bvec_fn)
3603 mddev->merge_check_needed = 1;
3604 diff = (rdev->new_data_offset - rdev->data_offset);
3605 if (!mddev->reshape_backwards)
3606 diff = -diff;
3607 if (diff < 0)
3608 diff = 0;
3609 if (first || diff < min_offset_diff)
3610 min_offset_diff = diff;
3611
3612 if (mddev->gendisk)
3613 disk_stack_limits(mddev->gendisk, rdev->bdev,
3614 rdev->data_offset << 9);
3615 2894
3616 disk->head_position = 0; 2895 disk->head_position = 0;
3617
3618 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3619 discard_supported = true;
3620 }
3621
3622 if (mddev->queue) {
3623 if (discard_supported)
3624 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3625 mddev->queue);
3626 else
3627 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3628 mddev->queue);
3629 } 2896 }
3630 /* need to check that every block has at least one working mirror */ 2897 /* need to check that every block has at least one working mirror */
3631 if (!enough(conf, -1)) { 2898 if (!enough(conf, -1)) {
@@ -3634,31 +2901,11 @@ static int run(struct mddev *mddev)
3634 goto out_free_conf; 2901 goto out_free_conf;
3635 } 2902 }
3636 2903
3637 if (conf->reshape_progress != MaxSector) {
3638 /* must ensure that shape change is supported */
3639 if (conf->geo.far_copies != 1 &&
3640 conf->geo.far_offset == 0)
3641 goto out_free_conf;
3642 if (conf->prev.far_copies != 1 &&
3643 conf->geo.far_offset == 0)
3644 goto out_free_conf;
3645 }
3646
3647 mddev->degraded = 0; 2904 mddev->degraded = 0;
3648 for (i = 0; 2905 for (i = 0; i < conf->raid_disks; i++) {
3649 i < conf->geo.raid_disks
3650 || i < conf->prev.raid_disks;
3651 i++) {
3652 2906
3653 disk = conf->mirrors + i; 2907 disk = conf->mirrors + i;
3654 2908
3655 if (!disk->rdev && disk->replacement) {
3656 /* The replacement is all we have - use it */
3657 disk->rdev = disk->replacement;
3658 disk->replacement = NULL;
3659 clear_bit(Replacement, &disk->rdev->flags);
3660 }
3661
3662 if (!disk->rdev || 2909 if (!disk->rdev ||
3663 !test_bit(In_sync, &disk->rdev->flags)) { 2910 !test_bit(In_sync, &disk->rdev->flags)) {
3664 disk->head_position = 0; 2911 disk->head_position = 0;
@@ -3666,7 +2913,6 @@ static int run(struct mddev *mddev)
3666 if (disk->rdev) 2913 if (disk->rdev)
3667 conf->fullsync = 1; 2914 conf->fullsync = 1;
3668 } 2915 }
3669 disk->recovery_disabled = mddev->recovery_disabled - 1;
3670 } 2916 }
3671 2917
3672 if (mddev->recovery_cp != MaxSector) 2918 if (mddev->recovery_cp != MaxSector)
@@ -3675,8 +2921,8 @@ static int run(struct mddev *mddev)
3675 mdname(mddev)); 2921 mdname(mddev));
3676 printk(KERN_INFO 2922 printk(KERN_INFO
3677 "md/raid10:%s: active with %d out of %d devices\n", 2923 "md/raid10:%s: active with %d out of %d devices\n",
3678 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 2924 mdname(mddev), conf->raid_disks - mddev->degraded,
3679 conf->geo.raid_disks); 2925 conf->raid_disks);
3680 /* 2926 /*
3681 * Ok, everything is just fine now 2927 * Ok, everything is just fine now
3682 */ 2928 */
@@ -3685,50 +2931,27 @@ static int run(struct mddev *mddev)
3685 md_set_array_sectors(mddev, size); 2931 md_set_array_sectors(mddev, size);
3686 mddev->resync_max_sectors = size; 2932 mddev->resync_max_sectors = size;
3687 2933
3688 if (mddev->queue) { 2934 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3689 int stripe = conf->geo.raid_disks * 2935 mddev->queue->backing_dev_info.congested_data = mddev;
3690 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3691 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3692 mddev->queue->backing_dev_info.congested_data = mddev;
3693 2936
3694 /* Calculate max read-ahead size. 2937 /* Calculate max read-ahead size.
3695 * We need to readahead at least twice a whole stripe.... 2938 * We need to readahead at least twice a whole stripe....
3696 * maybe... 2939 * maybe...
3697 */ 2940 */
3698 stripe /= conf->geo.near_copies; 2941 {
3699 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 2942 int stripe = conf->raid_disks *
3700 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 2943 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3701 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2944 stripe /= conf->near_copies;
2945 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2946 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3702 } 2947 }
3703 2948
2949 if (conf->near_copies < conf->raid_disks)
2950 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3704 2951
3705 if (md_integrity_register(mddev)) 2952 if (md_integrity_register(mddev))
3706 goto out_free_conf; 2953 goto out_free_conf;
3707 2954
3708 if (conf->reshape_progress != MaxSector) {
3709 unsigned long before_length, after_length;
3710
3711 before_length = ((1 << conf->prev.chunk_shift) *
3712 conf->prev.far_copies);
3713 after_length = ((1 << conf->geo.chunk_shift) *
3714 conf->geo.far_copies);
3715
3716 if (max(before_length, after_length) > min_offset_diff) {
3717 /* This cannot work */
3718 printk("md/raid10: offset difference not enough to continue reshape\n");
3719 goto out_free_conf;
3720 }
3721 conf->offset_diff = min_offset_diff;
3722
3723 conf->reshape_safe = conf->reshape_progress;
3724 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3725 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3726 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3727 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3728 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3729 "reshape");
3730 }
3731
3732 return 0; 2955 return 0;
3733 2956
3734out_free_conf: 2957out_free_conf:
@@ -3743,18 +2966,15 @@ out:
3743 return -EIO; 2966 return -EIO;
3744} 2967}
3745 2968
3746static int stop(struct mddev *mddev) 2969static int stop(mddev_t *mddev)
3747{ 2970{
3748 struct r10conf *conf = mddev->private; 2971 conf_t *conf = mddev->private;
3749 2972
3750 raise_barrier(conf, 0); 2973 raise_barrier(conf, 0);
3751 lower_barrier(conf); 2974 lower_barrier(conf);
3752 2975
3753 md_unregister_thread(&mddev->thread); 2976 md_unregister_thread(&mddev->thread);
3754 if (mddev->queue) 2977 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
3755 /* the unplug fn references 'conf'*/
3756 blk_sync_queue(mddev->queue);
3757
3758 if (conf->r10bio_pool) 2978 if (conf->r10bio_pool)
3759 mempool_destroy(conf->r10bio_pool); 2979 mempool_destroy(conf->r10bio_pool);
3760 kfree(conf->mirrors); 2980 kfree(conf->mirrors);
@@ -3763,9 +2983,9 @@ static int stop(struct mddev *mddev)
3763 return 0; 2983 return 0;
3764} 2984}
3765 2985
3766static void raid10_quiesce(struct mddev *mddev, int state) 2986static void raid10_quiesce(mddev_t *mddev, int state)
3767{ 2987{
3768 struct r10conf *conf = mddev->private; 2988 conf_t *conf = mddev->private;
3769 2989
3770 switch(state) { 2990 switch(state) {
3771 case 1: 2991 case 1:
@@ -3777,57 +2997,10 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3777 } 2997 }
3778} 2998}
3779 2999
3780static int raid10_resize(struct mddev *mddev, sector_t sectors) 3000static void *raid10_takeover_raid0(mddev_t *mddev)
3781{ 3001{
3782 /* Resize of 'far' arrays is not supported. 3002 mdk_rdev_t *rdev;
3783 * For 'near' and 'offset' arrays we can set the 3003 conf_t *conf;
3784 * number of sectors used to be an appropriate multiple
3785 * of the chunk size.
3786 * For 'offset', this is far_copies*chunksize.
3787 * For 'near' the multiplier is the LCM of
3788 * near_copies and raid_disks.
3789 * So if far_copies > 1 && !far_offset, fail.
3790 * Else find LCM(raid_disks, near_copy)*far_copies and
3791 * multiply by chunk_size. Then round to this number.
3792 * This is mostly done by raid10_size()
3793 */
3794 struct r10conf *conf = mddev->private;
3795 sector_t oldsize, size;
3796
3797 if (mddev->reshape_position != MaxSector)
3798 return -EBUSY;
3799
3800 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3801 return -EINVAL;
3802
3803 oldsize = raid10_size(mddev, 0, 0);
3804 size = raid10_size(mddev, sectors, 0);
3805 if (mddev->external_size &&
3806 mddev->array_sectors > size)
3807 return -EINVAL;
3808 if (mddev->bitmap) {
3809 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3810 if (ret)
3811 return ret;
3812 }
3813 md_set_array_sectors(mddev, size);
3814 set_capacity(mddev->gendisk, mddev->array_sectors);
3815 revalidate_disk(mddev->gendisk);
3816 if (sectors > mddev->dev_sectors &&
3817 mddev->recovery_cp > oldsize) {
3818 mddev->recovery_cp = oldsize;
3819 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3820 }
3821 calc_sectors(conf, sectors);
3822 mddev->dev_sectors = conf->dev_sectors;
3823 mddev->resync_max_sectors = size;
3824 return 0;
3825}
3826
3827static void *raid10_takeover_raid0(struct mddev *mddev)
3828{
3829 struct md_rdev *rdev;
3830 struct r10conf *conf;
3831 3004
3832 if (mddev->degraded > 0) { 3005 if (mddev->degraded > 0) {
3833 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 3006 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
@@ -3847,7 +3020,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3847 3020
3848 conf = setup_conf(mddev); 3021 conf = setup_conf(mddev);
3849 if (!IS_ERR(conf)) { 3022 if (!IS_ERR(conf)) {
3850 rdev_for_each(rdev, mddev) 3023 list_for_each_entry(rdev, &mddev->disks, same_set)
3851 if (rdev->raid_disk >= 0) 3024 if (rdev->raid_disk >= 0)
3852 rdev->new_raid_disk = rdev->raid_disk * 2; 3025 rdev->new_raid_disk = rdev->raid_disk * 2;
3853 conf->barrier = 1; 3026 conf->barrier = 1;
@@ -3856,17 +3029,17 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3856 return conf; 3029 return conf;
3857} 3030}
3858 3031
3859static void *raid10_takeover(struct mddev *mddev) 3032static void *raid10_takeover(mddev_t *mddev)
3860{ 3033{
3861 struct r0conf *raid0_conf; 3034 struct raid0_private_data *raid0_priv;
3862 3035
3863 /* raid10 can take over: 3036 /* raid10 can take over:
3864 * raid0 - providing it has only two drives 3037 * raid0 - providing it has only two drives
3865 */ 3038 */
3866 if (mddev->level == 0) { 3039 if (mddev->level == 0) {
3867 /* for raid0 takeover only one zone is supported */ 3040 /* for raid0 takeover only one zone is supported */
3868 raid0_conf = mddev->private; 3041 raid0_priv = mddev->private;
3869 if (raid0_conf->nr_strip_zones > 1) { 3042 if (raid0_priv->nr_strip_zones > 1) {
3870 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 3043 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3871 " with more than one zone.\n", 3044 " with more than one zone.\n",
3872 mdname(mddev)); 3045 mdname(mddev));
@@ -3877,763 +3050,7 @@ static void *raid10_takeover(struct mddev *mddev)
3877 return ERR_PTR(-EINVAL); 3050 return ERR_PTR(-EINVAL);
3878} 3051}
3879 3052
3880static int raid10_check_reshape(struct mddev *mddev) 3053static struct mdk_personality raid10_personality =
3881{
3882 /* Called when there is a request to change
3883 * - layout (to ->new_layout)
3884 * - chunk size (to ->new_chunk_sectors)
3885 * - raid_disks (by delta_disks)
3886 * or when trying to restart a reshape that was ongoing.
3887 *
3888 * We need to validate the request and possibly allocate
3889 * space if that might be an issue later.
3890 *
3891 * Currently we reject any reshape of a 'far' mode array,
3892 * allow chunk size to change if new is generally acceptable,
3893 * allow raid_disks to increase, and allow
3894 * a switch between 'near' mode and 'offset' mode.
3895 */
3896 struct r10conf *conf = mddev->private;
3897 struct geom geo;
3898
3899 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3900 return -EINVAL;
3901
3902 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3903 /* mustn't change number of copies */
3904 return -EINVAL;
3905 if (geo.far_copies > 1 && !geo.far_offset)
3906 /* Cannot switch to 'far' mode */
3907 return -EINVAL;
3908
3909 if (mddev->array_sectors & geo.chunk_mask)
3910 /* not factor of array size */
3911 return -EINVAL;
3912
3913 if (!enough(conf, -1))
3914 return -EINVAL;
3915
3916 kfree(conf->mirrors_new);
3917 conf->mirrors_new = NULL;
3918 if (mddev->delta_disks > 0) {
3919 /* allocate new 'mirrors' list */
3920 conf->mirrors_new = kzalloc(
3921 sizeof(struct raid10_info)
3922 *(mddev->raid_disks +
3923 mddev->delta_disks),
3924 GFP_KERNEL);
3925 if (!conf->mirrors_new)
3926 return -ENOMEM;
3927 }
3928 return 0;
3929}
3930
3931/*
3932 * Need to check if array has failed when deciding whether to:
3933 * - start an array
3934 * - remove non-faulty devices
3935 * - add a spare
3936 * - allow a reshape
3937 * This determination is simple when no reshape is happening.
3938 * However if there is a reshape, we need to carefully check
3939 * both the before and after sections.
3940 * This is because some failed devices may only affect one
3941 * of the two sections, and some non-in_sync devices may
3942 * be insync in the section most affected by failed devices.
3943 */
3944static int calc_degraded(struct r10conf *conf)
3945{
3946 int degraded, degraded2;
3947 int i;
3948
3949 rcu_read_lock();
3950 degraded = 0;
3951 /* 'prev' section first */
3952 for (i = 0; i < conf->prev.raid_disks; i++) {
3953 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3954 if (!rdev || test_bit(Faulty, &rdev->flags))
3955 degraded++;
3956 else if (!test_bit(In_sync, &rdev->flags))
3957 /* When we can reduce the number of devices in
3958 * an array, this might not contribute to
3959 * 'degraded'. It does now.
3960 */
3961 degraded++;
3962 }
3963 rcu_read_unlock();
3964 if (conf->geo.raid_disks == conf->prev.raid_disks)
3965 return degraded;
3966 rcu_read_lock();
3967 degraded2 = 0;
3968 for (i = 0; i < conf->geo.raid_disks; i++) {
3969 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3970 if (!rdev || test_bit(Faulty, &rdev->flags))
3971 degraded2++;
3972 else if (!test_bit(In_sync, &rdev->flags)) {
3973 /* If reshape is increasing the number of devices,
3974 * this section has already been recovered, so
3975 * it doesn't contribute to degraded.
3976 * else it does.
3977 */
3978 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3979 degraded2++;
3980 }
3981 }
3982 rcu_read_unlock();
3983 if (degraded2 > degraded)
3984 return degraded2;
3985 return degraded;
3986}
3987
3988static int raid10_start_reshape(struct mddev *mddev)
3989{
3990 /* A 'reshape' has been requested. This commits
3991 * the various 'new' fields and sets MD_RECOVER_RESHAPE
3992 * This also checks if there are enough spares and adds them
3993 * to the array.
3994 * We currently require enough spares to make the final
3995 * array non-degraded. We also require that the difference
3996 * between old and new data_offset - on each device - is
3997 * enough that we never risk over-writing.
3998 */
3999
4000 unsigned long before_length, after_length;
4001 sector_t min_offset_diff = 0;
4002 int first = 1;
4003 struct geom new;
4004 struct r10conf *conf = mddev->private;
4005 struct md_rdev *rdev;
4006 int spares = 0;
4007 int ret;
4008
4009 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4010 return -EBUSY;
4011
4012 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4013 return -EINVAL;
4014
4015 before_length = ((1 << conf->prev.chunk_shift) *
4016 conf->prev.far_copies);
4017 after_length = ((1 << conf->geo.chunk_shift) *
4018 conf->geo.far_copies);
4019
4020 rdev_for_each(rdev, mddev) {
4021 if (!test_bit(In_sync, &rdev->flags)
4022 && !test_bit(Faulty, &rdev->flags))
4023 spares++;
4024 if (rdev->raid_disk >= 0) {
4025 long long diff = (rdev->new_data_offset
4026 - rdev->data_offset);
4027 if (!mddev->reshape_backwards)
4028 diff = -diff;
4029 if (diff < 0)
4030 diff = 0;
4031 if (first || diff < min_offset_diff)
4032 min_offset_diff = diff;
4033 }
4034 }
4035
4036 if (max(before_length, after_length) > min_offset_diff)
4037 return -EINVAL;
4038
4039 if (spares < mddev->delta_disks)
4040 return -EINVAL;
4041
4042 conf->offset_diff = min_offset_diff;
4043 spin_lock_irq(&conf->device_lock);
4044 if (conf->mirrors_new) {
4045 memcpy(conf->mirrors_new, conf->mirrors,
4046 sizeof(struct raid10_info)*conf->prev.raid_disks);
4047 smp_mb();
4048 kfree(conf->mirrors_old); /* FIXME and elsewhere */
4049 conf->mirrors_old = conf->mirrors;
4050 conf->mirrors = conf->mirrors_new;
4051 conf->mirrors_new = NULL;
4052 }
4053 setup_geo(&conf->geo, mddev, geo_start);
4054 smp_mb();
4055 if (mddev->reshape_backwards) {
4056 sector_t size = raid10_size(mddev, 0, 0);
4057 if (size < mddev->array_sectors) {
4058 spin_unlock_irq(&conf->device_lock);
4059 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
4060 mdname(mddev));
4061 return -EINVAL;
4062 }
4063 mddev->resync_max_sectors = size;
4064 conf->reshape_progress = size;
4065 } else
4066 conf->reshape_progress = 0;
4067 spin_unlock_irq(&conf->device_lock);
4068
4069 if (mddev->delta_disks && mddev->bitmap) {
4070 ret = bitmap_resize(mddev->bitmap,
4071 raid10_size(mddev, 0,
4072 conf->geo.raid_disks),
4073 0, 0);
4074 if (ret)
4075 goto abort;
4076 }
4077 if (mddev->delta_disks > 0) {
4078 rdev_for_each(rdev, mddev)
4079 if (rdev->raid_disk < 0 &&
4080 !test_bit(Faulty, &rdev->flags)) {
4081 if (raid10_add_disk(mddev, rdev) == 0) {
4082 if (rdev->raid_disk >=
4083 conf->prev.raid_disks)
4084 set_bit(In_sync, &rdev->flags);
4085 else
4086 rdev->recovery_offset = 0;
4087
4088 if (sysfs_link_rdev(mddev, rdev))
4089 /* Failure here is OK */;
4090 }
4091 } else if (rdev->raid_disk >= conf->prev.raid_disks
4092 && !test_bit(Faulty, &rdev->flags)) {
4093 /* This is a spare that was manually added */
4094 set_bit(In_sync, &rdev->flags);
4095 }
4096 }
4097 /* When a reshape changes the number of devices,
4098 * ->degraded is measured against the larger of the
4099 * pre and post numbers.
4100 */
4101 spin_lock_irq(&conf->device_lock);
4102 mddev->degraded = calc_degraded(conf);
4103 spin_unlock_irq(&conf->device_lock);
4104 mddev->raid_disks = conf->geo.raid_disks;
4105 mddev->reshape_position = conf->reshape_progress;
4106 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4107
4108 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4109 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4110 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4111 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4112
4113 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4114 "reshape");
4115 if (!mddev->sync_thread) {
4116 ret = -EAGAIN;
4117 goto abort;
4118 }
4119 conf->reshape_checkpoint = jiffies;
4120 md_wakeup_thread(mddev->sync_thread);
4121 md_new_event(mddev);
4122 return 0;
4123
4124abort:
4125 mddev->recovery = 0;
4126 spin_lock_irq(&conf->device_lock);
4127 conf->geo = conf->prev;
4128 mddev->raid_disks = conf->geo.raid_disks;
4129 rdev_for_each(rdev, mddev)
4130 rdev->new_data_offset = rdev->data_offset;
4131 smp_wmb();
4132 conf->reshape_progress = MaxSector;
4133 mddev->reshape_position = MaxSector;
4134 spin_unlock_irq(&conf->device_lock);
4135 return ret;
4136}
4137
4138/* Calculate the last device-address that could contain
4139 * any block from the chunk that includes the array-address 's'
4140 * and report the next address.
4141 * i.e. the address returned will be chunk-aligned and after
4142 * any data that is in the chunk containing 's'.
4143 */
4144static sector_t last_dev_address(sector_t s, struct geom *geo)
4145{
4146 s = (s | geo->chunk_mask) + 1;
4147 s >>= geo->chunk_shift;
4148 s *= geo->near_copies;
4149 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4150 s *= geo->far_copies;
4151 s <<= geo->chunk_shift;
4152 return s;
4153}
4154
4155/* Calculate the first device-address that could contain
4156 * any block from the chunk that includes the array-address 's'.
4157 * This too will be the start of a chunk
4158 */
4159static sector_t first_dev_address(sector_t s, struct geom *geo)
4160{
4161 s >>= geo->chunk_shift;
4162 s *= geo->near_copies;
4163 sector_div(s, geo->raid_disks);
4164 s *= geo->far_copies;
4165 s <<= geo->chunk_shift;
4166 return s;
4167}
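
Together these two helpers bound, in device addresses, everything that the chunk containing array address 's' can touch: first_dev_address() rounds down and last_dev_address() rounds up to the next chunk boundary, and reshape_request() compares the resulting windows for the old and new geometry to decide when the reshape_position recorded in the metadata has to be updated. A standalone copy of the arithmetic with an example geometry (4 disks, n2, 64 KiB chunks) and DIV_ROUND_UP spelled out by hand:

#include <stdio.h>
#include <stdint.h>

struct geom_sketch {
        int raid_disks, near_copies, far_copies;
        uint64_t chunk_mask;
        int chunk_shift;
};

/* Chunk-aligned device address just past any data from the chunk holding
 * array address s (mirrors last_dev_address()).
 */
static uint64_t last_dev_addr(uint64_t s, const struct geom_sketch *geo)
{
        s = (s | geo->chunk_mask) + 1;
        s >>= geo->chunk_shift;
        s *= geo->near_copies;
        s = (s + geo->raid_disks - 1) / geo->raid_disks;   /* round up */
        s *= geo->far_copies;
        return s << geo->chunk_shift;
}

/* Chunk-aligned device address of the first data from that chunk
 * (mirrors first_dev_address()).
 */
static uint64_t first_dev_addr(uint64_t s, const struct geom_sketch *geo)
{
        s >>= geo->chunk_shift;
        s *= geo->near_copies;
        s /= geo->raid_disks;                              /* round down */
        s *= geo->far_copies;
        return s << geo->chunk_shift;
}

int main(void)
{
        struct geom_sketch geo = { 4, 2, 1, 127, 7 };
        uint64_t s = 1000;

        printf("array sector %llu maps into device window [%llu, %llu)\n",
               (unsigned long long)s,
               (unsigned long long)first_dev_addr(s, &geo),
               (unsigned long long)last_dev_addr(s, &geo));
        return 0;
}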
4168
4169static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4170 int *skipped)
4171{
4172 /* We simply copy at most one chunk (smallest of old and new)
4173 * at a time, possibly less if that exceeds RESYNC_PAGES,
4174 * or we hit a bad block or something.
4175 * This might mean we pause for normal IO in the middle of
4176 * a chunk, but that is not a problem was mddev->reshape_position
4177 * can record any location.
4178 *
4179 * If we will want to write to a location that isn't
4180 * yet recorded as 'safe' (i.e. in metadata on disk) then
4181 * we need to flush all reshape requests and update the metadata.
4182 *
4183 * When reshaping forwards (e.g. to more devices), we interpret
4184 * 'safe' as the earliest block which might not have been copied
4185 * down yet. We divide this by previous stripe size and multiply
4186 * by previous stripe length to get lowest device offset that we
4187 * cannot write to yet.
4188 * We interpret 'sector_nr' as an address that we want to write to.
4189 * From this we use last_device_address() to find where we might
4190 * write to, and first_device_address on the 'safe' position.
4191 * If this 'next' write position is after the 'safe' position,
4192 * we must update the metadata to increase the 'safe' position.
4193 *
4194 * When reshaping backwards, we round in the opposite direction
4195 * and perform the reverse test: next write position must not be
4196 * less than current safe position.
4197 *
4198 * In all this the minimum difference in data offsets
4199 * (conf->offset_diff - always positive) allows a bit of slack,
4200 * so next can be after 'safe', but not by more than offset_disk
4201 *
4202 * We need to prepare all the bios here before we start any IO
4203 * to ensure the size we choose is acceptable to all devices.
4204 * The means one for each copy for write-out and an extra one for
4205 * read-in.
4206 * We store the read-in bio in ->master_bio and the others in
4207 * ->devs[x].bio and ->devs[x].repl_bio.
4208 */
4209 struct r10conf *conf = mddev->private;
4210 struct r10bio *r10_bio;
4211 sector_t next, safe, last;
4212 int max_sectors;
4213 int nr_sectors;
4214 int s;
4215 struct md_rdev *rdev;
4216 int need_flush = 0;
4217 struct bio *blist;
4218 struct bio *bio, *read_bio;
4219 int sectors_done = 0;
4220
4221 if (sector_nr == 0) {
4222 /* If restarting in the middle, skip the initial sectors */
4223 if (mddev->reshape_backwards &&
4224 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4225 sector_nr = (raid10_size(mddev, 0, 0)
4226 - conf->reshape_progress);
4227 } else if (!mddev->reshape_backwards &&
4228 conf->reshape_progress > 0)
4229 sector_nr = conf->reshape_progress;
4230 if (sector_nr) {
4231 mddev->curr_resync_completed = sector_nr;
4232 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4233 *skipped = 1;
4234 return sector_nr;
4235 }
4236 }
4237
4238 /* We don't use sector_nr to track where we are up to
4239 * as that doesn't work well for ->reshape_backwards.
4240 * So just use ->reshape_progress.
4241 */
4242 if (mddev->reshape_backwards) {
4243 /* 'next' is the earliest device address that we might
4244 * write to for this chunk in the new layout
4245 */
4246 next = first_dev_address(conf->reshape_progress - 1,
4247 &conf->geo);
4248
4249 /* 'safe' is the last device address that we might read from
4250 * in the old layout after a restart
4251 */
4252 safe = last_dev_address(conf->reshape_safe - 1,
4253 &conf->prev);
4254
4255 if (next + conf->offset_diff < safe)
4256 need_flush = 1;
4257
4258 last = conf->reshape_progress - 1;
4259 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4260 & conf->prev.chunk_mask);
4261 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4262 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4263 } else {
4264 /* 'next' is after the last device address that we
4265 * might write to for this chunk in the new layout
4266 */
4267 next = last_dev_address(conf->reshape_progress, &conf->geo);
4268
4269 /* 'safe' is the earliest device address that we might
4270 * read from in the old layout after a restart
4271 */
4272 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4273
4274 /* Need to update metadata if 'next' might be beyond 'safe'
4275 * as that would possibly corrupt data
4276 */
4277 if (next > safe + conf->offset_diff)
4278 need_flush = 1;
4279
4280 sector_nr = conf->reshape_progress;
4281 last = sector_nr | (conf->geo.chunk_mask
4282 & conf->prev.chunk_mask);
4283
4284 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4285 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4286 }
4287
4288 if (need_flush ||
4289 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4290 /* Need to update reshape_position in metadata */
4291 wait_barrier(conf);
4292 mddev->reshape_position = conf->reshape_progress;
4293 if (mddev->reshape_backwards)
4294 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4295 - conf->reshape_progress;
4296 else
4297 mddev->curr_resync_completed = conf->reshape_progress;
4298 conf->reshape_checkpoint = jiffies;
4299 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4300 md_wakeup_thread(mddev->thread);
4301 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4302 kthread_should_stop());
4303 conf->reshape_safe = mddev->reshape_position;
4304 allow_barrier(conf);
4305 }
4306
4307read_more:
4308 /* Now schedule reads for blocks from sector_nr to last */
4309 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4310 raise_barrier(conf, sectors_done != 0);
4311 atomic_set(&r10_bio->remaining, 0);
4312 r10_bio->mddev = mddev;
4313 r10_bio->sector = sector_nr;
4314 set_bit(R10BIO_IsReshape, &r10_bio->state);
4315 r10_bio->sectors = last - sector_nr + 1;
4316 rdev = read_balance(conf, r10_bio, &max_sectors);
4317 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4318
4319 if (!rdev) {
4320 /* Cannot read from here, so need to record bad blocks
4321 * on all the target devices.
4322 */
4323 // FIXME
4324 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4325 return sectors_done;
4326 }
4327
4328 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4329
4330 read_bio->bi_bdev = rdev->bdev;
4331 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4332 + rdev->data_offset);
4333 read_bio->bi_private = r10_bio;
4334 read_bio->bi_end_io = end_sync_read;
4335 read_bio->bi_rw = READ;
4336 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4337 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4338 read_bio->bi_vcnt = 0;
4339 read_bio->bi_idx = 0;
4340 read_bio->bi_size = 0;
4341 r10_bio->master_bio = read_bio;
4342 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4343
4344 /* Now find the locations in the new layout */
4345 __raid10_find_phys(&conf->geo, r10_bio);
4346
4347 blist = read_bio;
4348 read_bio->bi_next = NULL;
4349
4350 for (s = 0; s < conf->copies*2; s++) {
4351 struct bio *b;
4352 int d = r10_bio->devs[s/2].devnum;
4353 struct md_rdev *rdev2;
4354 if (s&1) {
4355 rdev2 = conf->mirrors[d].replacement;
4356 b = r10_bio->devs[s/2].repl_bio;
4357 } else {
4358 rdev2 = conf->mirrors[d].rdev;
4359 b = r10_bio->devs[s/2].bio;
4360 }
4361 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4362 continue;
4363 b->bi_bdev = rdev2->bdev;
4364 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4365 b->bi_private = r10_bio;
4366 b->bi_end_io = end_reshape_write;
4367 b->bi_rw = WRITE;
4368 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4369 b->bi_flags |= 1 << BIO_UPTODATE;
4370 b->bi_next = blist;
4371 b->bi_vcnt = 0;
4372 b->bi_idx = 0;
4373 b->bi_size = 0;
4374 blist = b;
4375 }
4376
4377 /* Now add as many pages as possible to all of these bios. */
4378
4379 nr_sectors = 0;
4380 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4381 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4382 int len = (max_sectors - s) << 9;
4383 if (len > PAGE_SIZE)
4384 len = PAGE_SIZE;
4385 for (bio = blist; bio ; bio = bio->bi_next) {
4386 struct bio *bio2;
4387 if (bio_add_page(bio, page, len, 0))
4388 continue;
4389
4390 /* Didn't fit, must stop */
4391 for (bio2 = blist;
4392 bio2 && bio2 != bio;
4393 bio2 = bio2->bi_next) {
4394 /* Remove last page from this bio */
4395 bio2->bi_vcnt--;
4396 bio2->bi_size -= len;
4397 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4398 }
4399 goto bio_full;
4400 }
4401 sector_nr += len >> 9;
4402 nr_sectors += len >> 9;
4403 }
4404bio_full:
4405 r10_bio->sectors = nr_sectors;
4406
4407 /* Now submit the read */
4408 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4409 atomic_inc(&r10_bio->remaining);
4410 read_bio->bi_next = NULL;
4411 generic_make_request(read_bio);
4412 sector_nr += nr_sectors;
4413 sectors_done += nr_sectors;
4414 if (sector_nr <= last)
4415 goto read_more;
4416
4417 /* Now that we have done the whole section we can
4418 * update reshape_progress
4419 */
4420 if (mddev->reshape_backwards)
4421 conf->reshape_progress -= sectors_done;
4422 else
4423 conf->reshape_progress += sectors_done;
4424
4425 return sectors_done;
4426}
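The page-filling loop inside reshape_request() relies on a small rollback idiom: try to add the page to every bio in the chain, and if any bio refuses it, strip the page back off the bios that already accepted it before bailing out to bio_full. A stripped-down sketch of that idiom, with a toy add_page_len() standing in for bio_add_page():

struct sbio {
        struct sbio *next;
        int vcnt;
        unsigned int size;
};

/* Pretend bio_add_page(): refuse once the bio already holds 16 pages. */
static int add_page_len(struct sbio *b, unsigned int len)
{
        if (b->vcnt >= 16)
                return 0;
        b->vcnt++;
        b->size += len;
        return len;
}

/* Returns 1 if the page was added to every bio in the chain. */
static int add_page_to_all(struct sbio *blist, unsigned int len)
{
        struct sbio *b, *b2;

        for (b = blist; b; b = b->next) {
                if (add_page_len(b, len))
                        continue;
                /* Did not fit: undo the adds already made and stop. */
                for (b2 = blist; b2 && b2 != b; b2 = b2->next) {
                        b2->vcnt--;
                        b2->size -= len;
                }
                return 0;
        }
        return 1;
}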
4427
4428static void end_reshape_request(struct r10bio *r10_bio);
4429static int handle_reshape_read_error(struct mddev *mddev,
4430 struct r10bio *r10_bio);
4431static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4432{
4433 /* Reshape read completed. Hopefully we have a block
4434 * to write out.
4435 * If we got a read error then we do sync 1-page reads from
4436 * elsewhere until we find the data - or give up.
4437 */
4438 struct r10conf *conf = mddev->private;
4439 int s;
4440
4441 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4442 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4443 /* Reshape has been aborted */
4444 md_done_sync(mddev, r10_bio->sectors, 0);
4445 return;
4446 }
4447
4448 /* We definitely have the data in the pages, schedule the
4449 * writes.
4450 */
4451 atomic_set(&r10_bio->remaining, 1);
4452 for (s = 0; s < conf->copies*2; s++) {
4453 struct bio *b;
4454 int d = r10_bio->devs[s/2].devnum;
4455 struct md_rdev *rdev;
4456 if (s&1) {
4457 rdev = conf->mirrors[d].replacement;
4458 b = r10_bio->devs[s/2].repl_bio;
4459 } else {
4460 rdev = conf->mirrors[d].rdev;
4461 b = r10_bio->devs[s/2].bio;
4462 }
4463 if (!rdev || test_bit(Faulty, &rdev->flags))
4464 continue;
4465 atomic_inc(&rdev->nr_pending);
4466 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4467 atomic_inc(&r10_bio->remaining);
4468 b->bi_next = NULL;
4469 generic_make_request(b);
4470 }
4471 end_reshape_request(r10_bio);
4472}
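reshape_request_write() and end_reshape_request() together implement the usual md completion-count pattern: 'remaining' starts at 1 so the submitter keeps the request alive while it is still queueing, each submitted write adds a reference, each completion (and the submitter's final call) drops one, and whichever decrement reaches zero finishes the request. A minimal sketch using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>

struct reshape_req {
        atomic_int remaining;
        int done;
};

/* Called once per completed write, and once by the submitter itself. */
static void complete_one(struct reshape_req *r)
{
        if (atomic_fetch_sub(&r->remaining, 1) == 1)
                r->done = 1;            /* last reference gone: finish up */
}

static void submit_all(struct reshape_req *r, int nr_writes)
{
        int i;

        atomic_store(&r->remaining, 1); /* submitter's own reference */
        for (i = 0; i < nr_writes; i++) {
                atomic_fetch_add(&r->remaining, 1);
                /* the real code issues generic_make_request(b) here;
                 * each completion later calls complete_one() */
        }
        complete_one(r);                /* drop the submitter's reference */
}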
4473
4474static void end_reshape(struct r10conf *conf)
4475{
4476 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4477 return;
4478
4479 spin_lock_irq(&conf->device_lock);
4480 conf->prev = conf->geo;
4481 md_finish_reshape(conf->mddev);
4482 smp_wmb();
4483 conf->reshape_progress = MaxSector;
4484 spin_unlock_irq(&conf->device_lock);
4485
4486	/* read-ahead size must cover two whole stripes, which is
4487	 * 2 * (data disks) * chunksize, where data disks = raid_disks / near_copies
4488	 */
4489 if (conf->mddev->queue) {
4490 int stripe = conf->geo.raid_disks *
4491 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4492 stripe /= conf->geo.near_copies;
4493 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4494 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4495 }
4496 conf->fullsync = 0;
4497}
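The read-ahead sizing at the end of end_reshape() is plain arithmetic: one stripe of data is raid_disks chunks divided by near_copies, and the queue is asked to read ahead at least two stripes. A runnable sketch with assumed example values (4K pages, 512K chunks, 8 devices, 2 near copies), none of which come from the patch itself:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed 4K pages */

/* Read-ahead, in pages, needed to cover two whole stripes of data. */
static unsigned long ra_pages_for(int raid_disks,
                                  unsigned long chunk_sectors,
                                  int near_copies)
{
        unsigned long stripe = raid_disks * ((chunk_sectors << 9) / PAGE_SIZE);

        stripe /= near_copies;
        return 2 * stripe;
}

int main(void)
{
        /* 8 devices, 512K chunks (1024 sectors), near_copies == 2 */
        printf("%lu pages\n", ra_pages_for(8, 1024, 2)); /* 1024 pages == 4 MiB */
        return 0;
}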
4498
4499
4500static int handle_reshape_read_error(struct mddev *mddev,
4501 struct r10bio *r10_bio)
4502{
4503 /* Use sync reads to get the blocks from somewhere else */
4504 int sectors = r10_bio->sectors;
4505 struct r10conf *conf = mddev->private;
4506 struct {
4507 struct r10bio r10_bio;
4508 struct r10dev devs[conf->copies];
4509 } on_stack;
4510 struct r10bio *r10b = &on_stack.r10_bio;
4511 int slot = 0;
4512 int idx = 0;
4513 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4514
4515 r10b->sector = r10_bio->sector;
4516 __raid10_find_phys(&conf->prev, r10b);
4517
4518 while (sectors) {
4519 int s = sectors;
4520 int success = 0;
4521 int first_slot = slot;
4522
4523 if (s > (PAGE_SIZE >> 9))
4524 s = PAGE_SIZE >> 9;
4525
4526 while (!success) {
4527 int d = r10b->devs[slot].devnum;
4528 struct md_rdev *rdev = conf->mirrors[d].rdev;
4529 sector_t addr;
4530 if (rdev == NULL ||
4531 test_bit(Faulty, &rdev->flags) ||
4532 !test_bit(In_sync, &rdev->flags))
4533 goto failed;
4534
4535 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4536 success = sync_page_io(rdev,
4537 addr,
4538 s << 9,
4539 bvec[idx].bv_page,
4540 READ, false);
4541 if (success)
4542 break;
4543 failed:
4544 slot++;
4545 if (slot >= conf->copies)
4546 slot = 0;
4547 if (slot == first_slot)
4548 break;
4549 }
4550 if (!success) {
4551 /* couldn't read this block, must give up */
4552 set_bit(MD_RECOVERY_INTR,
4553 &mddev->recovery);
4554 return -EIO;
4555 }
4556 sectors -= s;
4557 idx++;
4558 }
4559 return 0;
4560}
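handle_reshape_read_error() falls back to trying each copy in turn for every page-sized block, remembering which slot worked so the next block starts there, and giving up only once a full cycle of copies has failed. A compact sketch of that retry policy, with try_read() as a hypothetical stand-in for the rdev checks plus sync_page_io():

/* Try each copy in turn, starting from wherever the previous block
 * succeeded, and give up once every copy has been tried for this block.
 */
static int read_one_block(int copies, int *slot, int (*try_read)(int slot))
{
        int first_slot = *slot;

        do {
                if (try_read(*slot))
                        return 0;                       /* success */
                *slot = (*slot + 1) % copies;           /* next copy */
        } while (*slot != first_slot);

        return -1;                                      /* every copy failed */
}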
4561
4562static void end_reshape_write(struct bio *bio, int error)
4563{
4564 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4565 struct r10bio *r10_bio = bio->bi_private;
4566 struct mddev *mddev = r10_bio->mddev;
4567 struct r10conf *conf = mddev->private;
4568 int d;
4569 int slot;
4570 int repl;
4571 struct md_rdev *rdev = NULL;
4572
4573 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4574 if (repl)
4575 rdev = conf->mirrors[d].replacement;
4576 if (!rdev) {
4577 smp_mb();
4578 rdev = conf->mirrors[d].rdev;
4579 }
4580
4581 if (!uptodate) {
4582 /* FIXME should record badblock */
4583 md_error(mddev, rdev);
4584 }
4585
4586 rdev_dec_pending(rdev, mddev);
4587 end_reshape_request(r10_bio);
4588}
4589
4590static void end_reshape_request(struct r10bio *r10_bio)
4591{
4592 if (!atomic_dec_and_test(&r10_bio->remaining))
4593 return;
4594 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4595 bio_put(r10_bio->master_bio);
4596 put_buf(r10_bio);
4597}
4598
4599static void raid10_finish_reshape(struct mddev *mddev)
4600{
4601 struct r10conf *conf = mddev->private;
4602
4603 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4604 return;
4605
4606 if (mddev->delta_disks > 0) {
4607 sector_t size = raid10_size(mddev, 0, 0);
4608 md_set_array_sectors(mddev, size);
4609 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4610 mddev->recovery_cp = mddev->resync_max_sectors;
4611 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4612 }
4613 mddev->resync_max_sectors = size;
4614 set_capacity(mddev->gendisk, mddev->array_sectors);
4615 revalidate_disk(mddev->gendisk);
4616 } else {
4617 int d;
4618 for (d = conf->geo.raid_disks ;
4619 d < conf->geo.raid_disks - mddev->delta_disks;
4620 d++) {
4621 struct md_rdev *rdev = conf->mirrors[d].rdev;
4622 if (rdev)
4623 clear_bit(In_sync, &rdev->flags);
4624 rdev = conf->mirrors[d].replacement;
4625 if (rdev)
4626 clear_bit(In_sync, &rdev->flags);
4627 }
4628 }
4629 mddev->layout = mddev->new_layout;
4630 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4631 mddev->reshape_position = MaxSector;
4632 mddev->delta_disks = 0;
4633 mddev->reshape_backwards = 0;
4634}
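One detail in the shrink branch of raid10_finish_reshape() that is easy to misread: by the time it runs, conf->geo.raid_disks already holds the new, smaller count and delta_disks is negative, so the loop walks exactly the slots being dropped. A tiny sketch of that index range, with an example in the comment:

/* Example: new geometry has 4 disks and delta_disks == -2, so the loop
 * above visits slots 4 and 5 -- i.e. [raid_disks, raid_disks - delta_disks).
 */
static void removed_slot_range(int raid_disks, int delta_disks,
                               int *first, int *past_last)
{
        *first = raid_disks;                    /* first slot being dropped */
        *past_last = raid_disks - delta_disks;  /* exclusive upper bound */
}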
4635
4636static struct md_personality raid10_personality =
4637{ 3054{
4638 .name = "raid10", 3055 .name = "raid10",
4639 .level = 10, 3056 .level = 10,
@@ -4649,11 +3066,7 @@ static struct md_personality raid10_personality =
4649 .sync_request = sync_request, 3066 .sync_request = sync_request,
4650 .quiesce = raid10_quiesce, 3067 .quiesce = raid10_quiesce,
4651 .size = raid10_size, 3068 .size = raid10_size,
4652 .resize = raid10_resize,
4653 .takeover = raid10_takeover, 3069 .takeover = raid10_takeover,
4654 .check_reshape = raid10_check_reshape,
4655 .start_reshape = raid10_start_reshape,
4656 .finish_reshape = raid10_finish_reshape,
4657}; 3070};
4658 3071
4659static int __init raid_init(void) 3072static int __init raid_init(void)
@@ -4673,5 +3086,3 @@ MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4673MODULE_ALIAS("md-personality-9"); /* RAID10 */ 3086MODULE_ALIAS("md-personality-9"); /* RAID10 */
4674MODULE_ALIAS("md-raid10"); 3087MODULE_ALIAS("md-raid10");
4675MODULE_ALIAS("md-level-10"); 3088MODULE_ALIAS("md-level-10");
4676
4677module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf60234..79cb52a0d4a 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,8 +1,10 @@
1#ifndef _RAID10_H 1#ifndef _RAID10_H
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct raid10_info { 4typedef struct mirror_info mirror_info_t;
5 struct md_rdev *rdev, *replacement; 5
6struct mirror_info {
7 mdk_rdev_t *rdev;
6 sector_t head_position; 8 sector_t head_position;
7 int recovery_disabled; /* matches 9 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 10 * mddev->recovery_disabled
@@ -11,72 +13,66 @@ struct raid10_info {
11 */ 13 */
12}; 14};
13 15
14struct r10conf { 16typedef struct r10bio_s r10bio_t;
15 struct mddev *mddev; 17
16 struct raid10_info *mirrors; 18struct r10_private_data_s {
17 struct raid10_info *mirrors_new, *mirrors_old; 19 mddev_t *mddev;
20 mirror_info_t *mirrors;
21 int raid_disks;
18 spinlock_t device_lock; 22 spinlock_t device_lock;
19 23
20 /* geometry */ 24 /* geometry */
21 struct geom { 25 int near_copies; /* number of copies laid out raid0 style */
22 int raid_disks; 26 int far_copies; /* number of copies laid out
23 int near_copies; /* number of copies laid out
24 * raid0 style */
25 int far_copies; /* number of copies laid out
26 * at large strides across drives 27 * at large strides across drives
27 */ 28 */
28 int far_offset; /* far_copies are offset by 1 29 int far_offset; /* far_copies are offset by 1 stripe
29 * stripe instead of many 30 * instead of many
31 */
32 int copies; /* near_copies * far_copies.
33 * must be <= raid_disks
30 */ 34 */
31 sector_t stride; /* distance between far copies. 35 sector_t stride; /* distance between far copies.
32 * This is size / far_copies unless 36 * This is size / far_copies unless
33 * far_offset, in which case it is 37 * far_offset, in which case it is
34 * 1 stripe. 38 * 1 stripe.
35 */ 39 */
36 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask;
38 } prev, geo;
39 int copies; /* near_copies * far_copies.
40 * must be <= raid_disks
41 */
42 40
43 sector_t dev_sectors; /* temp copy of 41 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
44 * mddev->dev_sectors */ 42
45 sector_t reshape_progress; 43 int chunk_shift; /* shift from chunks to sectors */
46 sector_t reshape_safe; 44 sector_t chunk_mask;
47 unsigned long reshape_checkpoint;
48 sector_t offset_diff;
49 45
50 struct list_head retry_list; 46 struct list_head retry_list;
51 /* queue pending writes and submit them on unplug */ 47 /* queue pending writes and submit them on unplug */
52 struct bio_list pending_bio_list; 48 struct bio_list pending_bio_list;
53 int pending_count; 49
54 50
55 spinlock_t resync_lock; 51 spinlock_t resync_lock;
56 int nr_pending; 52 int nr_pending;
57 int nr_waiting; 53 int nr_waiting;
58 int nr_queued; 54 int nr_queued;
59 int barrier; 55 int barrier;
60 sector_t next_resync; 56 sector_t next_resync;
61 int fullsync; /* set to 1 if a full sync is needed, 57 int fullsync; /* set to 1 if a full sync is needed,
62 * (fresh device added). 58 * (fresh device added).
63 * Cleared when a sync completes. 59 * Cleared when a sync completes.
64 */ 60 */
65 int have_replacement; /* There is at least one 61
66 * replacement device.
67 */
68 wait_queue_head_t wait_barrier; 62 wait_queue_head_t wait_barrier;
69 63
70 mempool_t *r10bio_pool; 64 mempool_t *r10bio_pool;
71 mempool_t *r10buf_pool; 65 mempool_t *r10buf_pool;
72 struct page *tmppage; 66 struct page *tmppage;
73 67
74 /* When taking over an array from a different personality, we store 68 /* When taking over an array from a different personality, we store
75 * the new thread here until we fully activate the array. 69 * the new thread here until we fully activate the array.
76 */ 70 */
77 struct md_thread *thread; 71 struct mdk_thread_s *thread;
78}; 72};
79 73
74typedef struct r10_private_data_s conf_t;
75
80/* 76/*
81 * this is our 'private' RAID10 bio. 77 * this is our 'private' RAID10 bio.
82 * 78 *
@@ -84,14 +80,14 @@ struct r10conf {
84 * for this RAID10 operation, and about their status: 80 * for this RAID10 operation, and about their status:
85 */ 81 */
86 82
87struct r10bio { 83struct r10bio_s {
88 atomic_t remaining; /* 'have we finished' count, 84 atomic_t remaining; /* 'have we finished' count,
89 * used from IRQ handlers 85 * used from IRQ handlers
90 */ 86 */
91 sector_t sector; /* virtual sector number */ 87 sector_t sector; /* virtual sector number */
92 int sectors; 88 int sectors;
93 unsigned long state; 89 unsigned long state;
94 struct mddev *mddev; 90 mddev_t *mddev;
95 /* 91 /*
96 * original bio going to /dev/mdx 92 * original bio going to /dev/mdx
97 */ 93 */
@@ -108,44 +104,40 @@ struct r10bio {
108 * When resyncing we also use one for each copy. 104 * When resyncing we also use one for each copy.
109 * When reconstructing, we use 2 bios, one for read, one for write. 105 * When reconstructing, we use 2 bios, one for read, one for write.
110 * We choose the number when they are allocated. 106 * We choose the number when they are allocated.
111 * We sometimes need an extra bio to write to the replacement.
112 */ 107 */
113 struct r10dev { 108 struct {
114 struct bio *bio; 109 struct bio *bio;
115 union { 110 sector_t addr;
116 struct bio *repl_bio; /* used for resync and 111 int devnum;
117 * writes */
118 struct md_rdev *rdev; /* used for reads
119 * (read_slot >= 0) */
120 };
121 sector_t addr;
122 int devnum;
123 } devs[0]; 112 } devs[0];
124}; 113};
125 114
115/* when we get a read error on a read-only array, we redirect to another
116 * device without failing the first device, or trying to over-write to
117 * correct the read error. To keep track of bad blocks on a per-bio
118 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
119 */
120#define IO_BLOCKED ((struct bio*)1)
121/* When we successfully write to a known bad-block, we need to remove the
122 * bad-block marking which must be done from process context. So we record
123 * the success by setting devs[n].bio to IO_MADE_GOOD
124 */
125#define IO_MADE_GOOD ((struct bio *)2)
126
127#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
128
126/* bits for r10bio.state */ 129/* bits for r10bio.state */
127enum r10bio_state { 130#define R10BIO_Uptodate 0
128 R10BIO_Uptodate, 131#define R10BIO_IsSync 1
129 R10BIO_IsSync, 132#define R10BIO_IsRecover 2
130 R10BIO_IsRecover, 133#define R10BIO_Degraded 3
131 R10BIO_IsReshape,
132 R10BIO_Degraded,
133/* Set ReadError on bios that experience a read error 134/* Set ReadError on bios that experience a read error
134 * so that raid10d knows what to do with them. 135 * so that raid10d knows what to do with them.
135 */ 136 */
136 R10BIO_ReadError, 137#define R10BIO_ReadError 4
137/* If a write for this request means we can clear some 138/* If a write for this request means we can clear some
138 * known-bad-block records, we set this flag. 139 * known-bad-block records, we set this flag.
139 */ 140 */
140 R10BIO_MadeGood, 141#define R10BIO_MadeGood 5
141 R10BIO_WriteError, 142#define R10BIO_WriteError 6
142/* During a reshape we might be performing IO on the
143 * 'previous' part of the array, in which case this
144 * flag is set
145 */
146 R10BIO_Previous,
147};
148
149extern int md_raid10_congested(struct mddev *mddev, int bits);
150
151#endif 143#endif
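Both sides of this header keep the per-copy slots in a trailing devs[0] array at the end of the r10bio structure (and handle_reshape_read_error() builds the same shape on the stack), so a single allocation holds the header plus 'copies' slots. A minimal sketch of that layout using the C99 flexible-array spelling; the field names are pared down for illustration:

#include <stdlib.h>

struct dev_slot {
        void *bio;
        unsigned long long addr;
        int devnum;
};

struct r10bio_like {
        unsigned long long sector;
        int sectors;
        struct dev_slot devs[];         /* C99 flexible array member;
                                         * the kernel spells it devs[0] */
};

/* One allocation carries the header plus one slot per copy. */
static struct r10bio_like *alloc_r10bio_like(int copies)
{
        return malloc(sizeof(struct r10bio_like) +
                      copies * sizeof(struct dev_slot));
}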
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19d77a02663..b6200c3935c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,14 +47,11 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h> 50#include <linux/async.h>
52#include <linux/seq_file.h> 51#include <linux/seq_file.h>
53#include <linux/cpu.h> 52#include <linux/cpu.h>
54#include <linux/slab.h> 53#include <linux/slab.h>
55#include <linux/ratelimit.h> 54#include <linux/ratelimit.h>
56#include <trace/events/block.h>
57
58#include "md.h" 55#include "md.h"
59#include "raid5.h" 56#include "raid5.h"
60#include "raid0.h" 57#include "raid0.h"
@@ -73,11 +70,7 @@
73#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 70#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
74#define HASH_MASK (NR_HASH - 1) 71#define HASH_MASK (NR_HASH - 1)
75 72
76static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 73#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
77{
78 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
79 return &conf->stripe_hashtbl[hash];
80}
81 74
82/* bio's attached to a stripe+device for I/O are linked together in bi_sector 75/* bio's attached to a stripe+device for I/O are linked together in bi_sector
83 * order without overlap. There may be several bio's per stripe+device, and 76 * order without overlap. There may be several bio's per stripe+device, and
@@ -85,56 +78,57 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 * When walking this list for a particular stripe+device, we must never proceed 78 * When walking this list for a particular stripe+device, we must never proceed
86 * beyond a bio that extends past this device, as the next bio might no longer 79 * beyond a bio that extends past this device, as the next bio might no longer
87 * be valid. 80 * be valid.
88 * This function is used to determine the 'next' bio in the list, given the sector 81 * This macro is used to determine the 'next' bio in the list, given the sector
89 * of the current stripe+device 82 * of the current stripe+device
90 */ 83 */
91static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 84#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
92{ 85/*
93 int sectors = bio->bi_size >> 9; 86 * The following can be used to debug the driver
94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 87 */
95 return bio->bi_next; 88#define RAID5_PARANOIA 1
96 else 89#if RAID5_PARANOIA && defined(CONFIG_SMP)
97 return NULL; 90# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
98} 91#else
92# define CHECK_DEVLOCK()
93#endif
94
95#ifdef DEBUG
96#define inline
97#define __inline__
98#endif
99 99
100/* 100/*
101 * We maintain a biased count of active stripes in the bottom 16 bits of 101 * We maintain a biased count of active stripes in the bottom 16 bits of
102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
103 */ 103 */
104static inline int raid5_bi_processed_stripes(struct bio *bio) 104static inline int raid5_bi_phys_segments(struct bio *bio)
105{ 105{
106 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 106 return bio->bi_phys_segments & 0xffff;
107 return (atomic_read(segments) >> 16) & 0xffff;
108} 107}
109 108
110static inline int raid5_dec_bi_active_stripes(struct bio *bio) 109static inline int raid5_bi_hw_segments(struct bio *bio)
111{ 110{
112 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 111 return (bio->bi_phys_segments >> 16) & 0xffff;
113 return atomic_sub_return(1, segments) & 0xffff;
114} 112}
115 113
116static inline void raid5_inc_bi_active_stripes(struct bio *bio) 114static inline int raid5_dec_bi_phys_segments(struct bio *bio)
117{ 115{
118 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 116 --bio->bi_phys_segments;
119 atomic_inc(segments); 117 return raid5_bi_phys_segments(bio);
120} 118}
121 119
122static inline void raid5_set_bi_processed_stripes(struct bio *bio, 120static inline int raid5_dec_bi_hw_segments(struct bio *bio)
123 unsigned int cnt)
124{ 121{
125 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 122 unsigned short val = raid5_bi_hw_segments(bio);
126 int old, new;
127 123
128 do { 124 --val;
129 old = atomic_read(segments); 125 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
130 new = (old & 0xffff) | (cnt << 16); 126 return val;
131 } while (atomic_cmpxchg(segments, old, new) != old);
132} 127}
133 128
134static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 129static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
135{ 130{
136 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 131 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
137 atomic_set(segments, cnt);
138} 132}
139 133
140/* Find first data disk in a raid6 stripe */ 134/* Find first data disk in a raid6 stripe */
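The comment in this hunk ("a biased count of active stripes in the bottom 16 bits ... and a count of processed stripes in the upper 16 bits") describes a simple bit-packing of two counters into one 32-bit field; the pre-patch helpers just manipulate it through atomics. A plain, runnable sketch of the packing itself:

#include <stdio.h>

static unsigned int low16(unsigned int v)  { return v & 0xffff; }
static unsigned int high16(unsigned int v) { return (v >> 16) & 0xffff; }

static unsigned int pack(unsigned int active, unsigned int processed)
{
        return (processed << 16) | (active & 0xffff);
}

int main(void)
{
        unsigned int seg = pack(3, 7);

        printf("active=%u processed=%u\n", low16(seg), high16(seg));
        return 0;
}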
@@ -184,14 +178,12 @@ static void return_io(struct bio *return_bi)
184 return_bi = bi->bi_next; 178 return_bi = bi->bi_next;
185 bi->bi_next = NULL; 179 bi->bi_next = NULL;
186 bi->bi_size = 0; 180 bi->bi_size = 0;
187 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
188 bi, 0);
189 bio_endio(bi, 0); 181 bio_endio(bi, 0);
190 bi = return_bi; 182 bi = return_bi;
191 } 183 }
192} 184}
193 185
194static void print_raid5_conf (struct r5conf *conf); 186static void print_raid5_conf (raid5_conf_t *conf);
195 187
196static int stripe_operations_active(struct stripe_head *sh) 188static int stripe_operations_active(struct stripe_head *sh)
197{ 189{
@@ -200,56 +192,48 @@ static int stripe_operations_active(struct stripe_head *sh)
200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 192 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
201} 193}
202 194
203static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 195static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
204{ 196{
205 BUG_ON(!list_empty(&sh->lru)); 197 if (atomic_dec_and_test(&sh->count)) {
206 BUG_ON(atomic_read(&conf->active_stripes)==0); 198 BUG_ON(!list_empty(&sh->lru));
207 if (test_bit(STRIPE_HANDLE, &sh->state)) { 199 BUG_ON(atomic_read(&conf->active_stripes)==0);
208 if (test_bit(STRIPE_DELAYED, &sh->state) && 200 if (test_bit(STRIPE_HANDLE, &sh->state)) {
209 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 201 if (test_bit(STRIPE_DELAYED, &sh->state))
210 list_add_tail(&sh->lru, &conf->delayed_list); 202 list_add_tail(&sh->lru, &conf->delayed_list);
211 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 203 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
212 sh->bm_seq - conf->seq_write > 0) 204 sh->bm_seq - conf->seq_write > 0)
213 list_add_tail(&sh->lru, &conf->bitmap_list); 205 list_add_tail(&sh->lru, &conf->bitmap_list);
214 else { 206 else {
215 clear_bit(STRIPE_DELAYED, &sh->state); 207 clear_bit(STRIPE_BIT_DELAY, &sh->state);
216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 208 list_add_tail(&sh->lru, &conf->handle_list);
217 list_add_tail(&sh->lru, &conf->handle_list); 209 }
218 } 210 md_wakeup_thread(conf->mddev->thread);
219 md_wakeup_thread(conf->mddev->thread); 211 } else {
220 } else { 212 BUG_ON(stripe_operations_active(sh));
221 BUG_ON(stripe_operations_active(sh)); 213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
222 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 214 atomic_dec(&conf->preread_active_stripes);
223 if (atomic_dec_return(&conf->preread_active_stripes) 215 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
224 < IO_THRESHOLD) 216 md_wakeup_thread(conf->mddev->thread);
225 md_wakeup_thread(conf->mddev->thread); 217 }
226 atomic_dec(&conf->active_stripes); 218 atomic_dec(&conf->active_stripes);
227 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 219 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
228 list_add_tail(&sh->lru, &conf->inactive_list); 220 list_add_tail(&sh->lru, &conf->inactive_list);
229 wake_up(&conf->wait_for_stripe); 221 wake_up(&conf->wait_for_stripe);
230 if (conf->retry_read_aligned) 222 if (conf->retry_read_aligned)
231 md_wakeup_thread(conf->mddev->thread); 223 md_wakeup_thread(conf->mddev->thread);
224 }
232 } 225 }
233 } 226 }
234} 227}
235 228
236static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
237{
238 if (atomic_dec_and_test(&sh->count))
239 do_release_stripe(conf, sh);
240}
241
242static void release_stripe(struct stripe_head *sh) 229static void release_stripe(struct stripe_head *sh)
243{ 230{
244 struct r5conf *conf = sh->raid_conf; 231 raid5_conf_t *conf = sh->raid_conf;
245 unsigned long flags; 232 unsigned long flags;
246 233
247 local_irq_save(flags); 234 spin_lock_irqsave(&conf->device_lock, flags);
248 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 235 __release_stripe(conf, sh);
249 do_release_stripe(conf, sh); 236 spin_unlock_irqrestore(&conf->device_lock, flags);
250 spin_unlock(&conf->device_lock);
251 }
252 local_irq_restore(flags);
253} 237}
254 238
255static inline void remove_hash(struct stripe_head *sh) 239static inline void remove_hash(struct stripe_head *sh)
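The pre-patch release_stripe() in this hunk uses atomic_dec_and_lock() so that device_lock is only taken when the stripe's reference count actually reaches zero, while the post-patch code takes the spinlock on every release. A user-space sketch of that fast-path/slow-path split, with pthread and C11 primitives standing in for the kernel's spinlock_t and atomic_t:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Returns true with 'lock' held if the count just dropped to zero;
 * otherwise decrements without ever touching the lock. */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);

        while (old > 1) {                       /* fast path: no lock needed */
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;
        }
        pthread_mutex_lock(lock);               /* slow path: might hit zero */
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;                    /* caller releases the lock */
        pthread_mutex_unlock(lock);
        return false;
}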
@@ -260,23 +244,25 @@ static inline void remove_hash(struct stripe_head *sh)
260 hlist_del_init(&sh->hash); 244 hlist_del_init(&sh->hash);
261} 245}
262 246
263static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 247static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
264{ 248{
265 struct hlist_head *hp = stripe_hash(conf, sh->sector); 249 struct hlist_head *hp = stripe_hash(conf, sh->sector);
266 250
267 pr_debug("insert_hash(), stripe %llu\n", 251 pr_debug("insert_hash(), stripe %llu\n",
268 (unsigned long long)sh->sector); 252 (unsigned long long)sh->sector);
269 253
254 CHECK_DEVLOCK();
270 hlist_add_head(&sh->hash, hp); 255 hlist_add_head(&sh->hash, hp);
271} 256}
272 257
273 258
274/* find an idle stripe, make sure it is unhashed, and return it. */ 259/* find an idle stripe, make sure it is unhashed, and return it. */
275static struct stripe_head *get_free_stripe(struct r5conf *conf) 260static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
276{ 261{
277 struct stripe_head *sh = NULL; 262 struct stripe_head *sh = NULL;
278 struct list_head *first; 263 struct list_head *first;
279 264
265 CHECK_DEVLOCK();
280 if (list_empty(&conf->inactive_list)) 266 if (list_empty(&conf->inactive_list))
281 goto out; 267 goto out;
282 first = conf->inactive_list.next; 268 first = conf->inactive_list.next;
@@ -320,18 +306,19 @@ static int grow_buffers(struct stripe_head *sh)
320} 306}
321 307
322static void raid5_build_block(struct stripe_head *sh, int i, int previous); 308static void raid5_build_block(struct stripe_head *sh, int i, int previous);
323static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 309static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
324 struct stripe_head *sh); 310 struct stripe_head *sh);
325 311
326static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 312static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
327{ 313{
328 struct r5conf *conf = sh->raid_conf; 314 raid5_conf_t *conf = sh->raid_conf;
329 int i; 315 int i;
330 316
331 BUG_ON(atomic_read(&sh->count) != 0); 317 BUG_ON(atomic_read(&sh->count) != 0);
332 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 318 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
333 BUG_ON(stripe_operations_active(sh)); 319 BUG_ON(stripe_operations_active(sh));
334 320
321 CHECK_DEVLOCK();
335 pr_debug("init_stripe called, stripe %llu\n", 322 pr_debug("init_stripe called, stripe %llu\n",
336 (unsigned long long)sh->sector); 323 (unsigned long long)sh->sector);
337 324
@@ -361,12 +348,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
361 insert_hash(conf, sh); 348 insert_hash(conf, sh);
362} 349}
363 350
364static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 351static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
365 short generation) 352 short generation)
366{ 353{
367 struct stripe_head *sh; 354 struct stripe_head *sh;
368 struct hlist_node *hn; 355 struct hlist_node *hn;
369 356
357 CHECK_DEVLOCK();
370 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 358 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
371 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 359 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
372 if (sh->sector == sector && sh->generation == generation) 360 if (sh->sector == sector && sh->generation == generation)
@@ -388,17 +376,17 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
388 * of the two sections, and some non-in_sync devices may 376 * of the two sections, and some non-in_sync devices may
389 * be insync in the section most affected by failed devices. 377 * be insync in the section most affected by failed devices.
390 */ 378 */
391static int calc_degraded(struct r5conf *conf) 379static int has_failed(raid5_conf_t *conf)
392{ 380{
393 int degraded, degraded2; 381 int degraded;
394 int i; 382 int i;
383 if (conf->mddev->reshape_position == MaxSector)
384 return conf->mddev->degraded > conf->max_degraded;
395 385
396 rcu_read_lock(); 386 rcu_read_lock();
397 degraded = 0; 387 degraded = 0;
398 for (i = 0; i < conf->previous_raid_disks; i++) { 388 for (i = 0; i < conf->previous_raid_disks; i++) {
399 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 389 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
400 if (rdev && test_bit(Faulty, &rdev->flags))
401 rdev = rcu_dereference(conf->disks[i].replacement);
402 if (!rdev || test_bit(Faulty, &rdev->flags)) 390 if (!rdev || test_bit(Faulty, &rdev->flags))
403 degraded++; 391 degraded++;
404 else if (test_bit(In_sync, &rdev->flags)) 392 else if (test_bit(In_sync, &rdev->flags))
@@ -417,16 +405,14 @@ static int calc_degraded(struct r5conf *conf)
417 degraded++; 405 degraded++;
418 } 406 }
419 rcu_read_unlock(); 407 rcu_read_unlock();
420 if (conf->raid_disks == conf->previous_raid_disks) 408 if (degraded > conf->max_degraded)
421 return degraded; 409 return 1;
422 rcu_read_lock(); 410 rcu_read_lock();
423 degraded2 = 0; 411 degraded = 0;
424 for (i = 0; i < conf->raid_disks; i++) { 412 for (i = 0; i < conf->raid_disks; i++) {
425 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 413 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
426 if (rdev && test_bit(Faulty, &rdev->flags))
427 rdev = rcu_dereference(conf->disks[i].replacement);
428 if (!rdev || test_bit(Faulty, &rdev->flags)) 414 if (!rdev || test_bit(Faulty, &rdev->flags))
429 degraded2++; 415 degraded++;
430 else if (test_bit(In_sync, &rdev->flags)) 416 else if (test_bit(In_sync, &rdev->flags))
431 ; 417 ;
432 else 418 else
@@ -436,29 +422,16 @@ static int calc_degraded(struct r5conf *conf)
436 * almost certainly hasn't. 422 * almost certainly hasn't.
437 */ 423 */
438 if (conf->raid_disks <= conf->previous_raid_disks) 424 if (conf->raid_disks <= conf->previous_raid_disks)
439 degraded2++; 425 degraded++;
440 } 426 }
441 rcu_read_unlock(); 427 rcu_read_unlock();
442 if (degraded2 > degraded)
443 return degraded2;
444 return degraded;
445}
446
447static int has_failed(struct r5conf *conf)
448{
449 int degraded;
450
451 if (conf->mddev->reshape_position == MaxSector)
452 return conf->mddev->degraded > conf->max_degraded;
453
454 degraded = calc_degraded(conf);
455 if (degraded > conf->max_degraded) 428 if (degraded > conf->max_degraded)
456 return 1; 429 return 1;
457 return 0; 430 return 0;
458} 431}
459 432
460static struct stripe_head * 433static struct stripe_head *
461get_active_stripe(struct r5conf *conf, sector_t sector, 434get_active_stripe(raid5_conf_t *conf, sector_t sector,
462 int previous, int noblock, int noquiesce) 435 int previous, int noblock, int noquiesce)
463{ 436{
464 struct stripe_head *sh; 437 struct stripe_head *sh;
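The pre-patch calc_degraded() on the left of this hunk counts unusable members against both the old and the new disk count during a reshape and returns the worse of the two; has_failed() then compares that with max_degraded. The sketch below keeps only that shape: it deliberately ignores the in-kernel special case for devices that are present but not yet in sync, and the in_sync[] flag array is a stand-in for the rdev checks.

/* in_sync[i] != 0 means slot i has a usable, in-sync member. */
static int count_missing(const int *in_sync, int disks)
{
        int i, missing = 0;

        for (i = 0; i < disks; i++)
                if (!in_sync[i])
                        missing++;
        return missing;
}

static int calc_degraded_like(const int *in_sync,
                              int previous_raid_disks, int raid_disks)
{
        int d_old = count_missing(in_sync, previous_raid_disks);
        int d_new = count_missing(in_sync, raid_disks);

        return d_new > d_old ? d_new : d_old;
}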
@@ -470,7 +443,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
470 do { 443 do {
471 wait_event_lock_irq(conf->wait_for_stripe, 444 wait_event_lock_irq(conf->wait_for_stripe,
472 conf->quiesce == 0 || noquiesce, 445 conf->quiesce == 0 || noquiesce,
473 conf->device_lock); 446 conf->device_lock, /* nothing */);
474 sh = __find_stripe(conf, sector, conf->generation - previous); 447 sh = __find_stripe(conf, sector, conf->generation - previous);
475 if (!sh) { 448 if (!sh) {
476 if (!conf->inactive_blocked) 449 if (!conf->inactive_blocked)
@@ -484,15 +457,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
484 (atomic_read(&conf->active_stripes) 457 (atomic_read(&conf->active_stripes)
485 < (conf->max_nr_stripes *3/4) 458 < (conf->max_nr_stripes *3/4)
486 || !conf->inactive_blocked), 459 || !conf->inactive_blocked),
487 conf->device_lock); 460 conf->device_lock,
461 );
488 conf->inactive_blocked = 0; 462 conf->inactive_blocked = 0;
489 } else 463 } else
490 init_stripe(sh, sector, previous); 464 init_stripe(sh, sector, previous);
491 } else { 465 } else {
492 if (atomic_read(&sh->count)) { 466 if (atomic_read(&sh->count)) {
493 BUG_ON(!list_empty(&sh->lru) 467 BUG_ON(!list_empty(&sh->lru)
494 && !test_bit(STRIPE_EXPANDING, &sh->state) 468 && !test_bit(STRIPE_EXPANDING, &sh->state));
495 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
496 } else { 469 } else {
497 if (!test_bit(STRIPE_HANDLE, &sh->state)) 470 if (!test_bit(STRIPE_HANDLE, &sh->state))
498 atomic_inc(&conf->active_stripes); 471 atomic_inc(&conf->active_stripes);
@@ -511,27 +484,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
511 return sh; 484 return sh;
512} 485}
513 486
514/* Determine if 'data_offset' or 'new_data_offset' should be used
515 * in this stripe_head.
516 */
517static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
518{
519 sector_t progress = conf->reshape_progress;
520 /* Need a memory barrier to make sure we see the value
521 * of conf->generation, or ->data_offset that was set before
522 * reshape_progress was updated.
523 */
524 smp_rmb();
525 if (progress == MaxSector)
526 return 0;
527 if (sh->generation == conf->generation - 1)
528 return 0;
529 /* We are in a reshape, and this is a new-generation stripe,
530 * so use new_data_offset.
531 */
532 return 1;
533}
534
535static void 487static void
536raid5_end_read_request(struct bio *bi, int error); 488raid5_end_read_request(struct bio *bi, int error);
537static void 489static void
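The removed use_new_offset() helper (left column of this hunk) encodes one decision: during a reshape that moves data_offset, only stripes belonging to the current generation use new_data_offset; with no reshape running, or for previous-generation stripes, the old offset still applies. A small sketch of that test, with MAX_SECTOR standing in for the kernel's MaxSector:

#include <stdbool.h>
#include <stdint.h>

#define MAX_SECTOR UINT64_MAX   /* stand-in for the kernel's MaxSector */

static bool use_new_offset_like(uint64_t reshape_progress,
                                int stripe_generation, int conf_generation)
{
        if (reshape_progress == MAX_SECTOR)
                return false;           /* no reshape in progress */
        if (stripe_generation == conf_generation - 1)
                return false;           /* previous-generation stripe */
        return true;                    /* new layout: use new_data_offset */
}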
@@ -539,78 +491,43 @@ raid5_end_write_request(struct bio *bi, int error);
539 491
540static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 492static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
541{ 493{
542 struct r5conf *conf = sh->raid_conf; 494 raid5_conf_t *conf = sh->raid_conf;
543 int i, disks = sh->disks; 495 int i, disks = sh->disks;
544 496
545 might_sleep(); 497 might_sleep();
546 498
547 for (i = disks; i--; ) { 499 for (i = disks; i--; ) {
548 int rw; 500 int rw;
549 int replace_only = 0; 501 struct bio *bi;
550 struct bio *bi, *rbi; 502 mdk_rdev_t *rdev;
551 struct md_rdev *rdev, *rrdev = NULL;
552 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 503 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
553 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 504 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
554 rw = WRITE_FUA; 505 rw = WRITE_FUA;
555 else 506 else
556 rw = WRITE; 507 rw = WRITE;
557 if (test_bit(R5_Discard, &sh->dev[i].flags))
558 rw |= REQ_DISCARD;
559 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 508 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
560 rw = READ; 509 rw = READ;
561 else if (test_and_clear_bit(R5_WantReplace, 510 else
562 &sh->dev[i].flags)) {
563 rw = WRITE;
564 replace_only = 1;
565 } else
566 continue; 511 continue;
567 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
568 rw |= REQ_SYNC;
569 512
570 bi = &sh->dev[i].req; 513 bi = &sh->dev[i].req;
571 rbi = &sh->dev[i].rreq; /* For writing to replacement */
572 514
573 bi->bi_rw = rw; 515 bi->bi_rw = rw;
574 rbi->bi_rw = rw; 516 if (rw & WRITE)
575 if (rw & WRITE) {
576 bi->bi_end_io = raid5_end_write_request; 517 bi->bi_end_io = raid5_end_write_request;
577 rbi->bi_end_io = raid5_end_write_request; 518 else
578 } else
579 bi->bi_end_io = raid5_end_read_request; 519 bi->bi_end_io = raid5_end_read_request;
580 520
581 rcu_read_lock(); 521 rcu_read_lock();
582 rrdev = rcu_dereference(conf->disks[i].replacement);
583 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
584 rdev = rcu_dereference(conf->disks[i].rdev); 522 rdev = rcu_dereference(conf->disks[i].rdev);
585 if (!rdev) {
586 rdev = rrdev;
587 rrdev = NULL;
588 }
589 if (rw & WRITE) {
590 if (replace_only)
591 rdev = NULL;
592 if (rdev == rrdev)
593 /* We raced and saw duplicates */
594 rrdev = NULL;
595 } else {
596 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
597 rdev = rrdev;
598 rrdev = NULL;
599 }
600
601 if (rdev && test_bit(Faulty, &rdev->flags)) 523 if (rdev && test_bit(Faulty, &rdev->flags))
602 rdev = NULL; 524 rdev = NULL;
603 if (rdev) 525 if (rdev)
604 atomic_inc(&rdev->nr_pending); 526 atomic_inc(&rdev->nr_pending);
605 if (rrdev && test_bit(Faulty, &rrdev->flags))
606 rrdev = NULL;
607 if (rrdev)
608 atomic_inc(&rrdev->nr_pending);
609 rcu_read_unlock(); 527 rcu_read_unlock();
610 528
611 /* We have already checked bad blocks for reads. Now 529 /* We have already checked bad blocks for reads. Now
612 * need to check for writes. We never accept write errors 530 * need to check for writes.
613	 * on the replacement, so we don't need to check rrdev. 531	 */
614 */ 531 */
615 while ((rw & WRITE) && rdev && 532 while ((rw & WRITE) && rdev &&
616 test_bit(WriteErrorSeen, &rdev->flags)) { 533 test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -631,12 +548,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
631 * a chance*/ 548 * a chance*/
632 md_check_recovery(conf->mddev); 549 md_check_recovery(conf->mddev);
633 } 550 }
634 /*
635 * Because md_wait_for_blocked_rdev
636 * will dec nr_pending, we must
637 * increment it first.
638 */
639 atomic_inc(&rdev->nr_pending);
640 md_wait_for_blocked_rdev(rdev, conf->mddev); 551 md_wait_for_blocked_rdev(rdev, conf->mddev);
641 } else { 552 } else {
642 /* Acknowledged bad block - skip the write */ 553 /* Acknowledged bad block - skip the write */
@@ -646,8 +557,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
646 } 557 }
647 558
648 if (rdev) { 559 if (rdev) {
649 if (s->syncing || s->expanding || s->expanded 560 if (s->syncing || s->expanding || s->expanded)
650 || s->replacing)
651 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
652 562
653 set_bit(STRIPE_IO_STARTED, &sh->state); 563 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -657,59 +567,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
657 __func__, (unsigned long long)sh->sector, 567 __func__, (unsigned long long)sh->sector,
658 bi->bi_rw, i); 568 bi->bi_rw, i);
659 atomic_inc(&sh->count); 569 atomic_inc(&sh->count);
660 if (use_new_offset(conf, sh)) 570 bi->bi_sector = sh->sector + rdev->data_offset;
661 bi->bi_sector = (sh->sector
662 + rdev->new_data_offset);
663 else
664 bi->bi_sector = (sh->sector
665 + rdev->data_offset);
666 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
667 bi->bi_rw |= REQ_FLUSH;
668
669 bi->bi_flags = 1 << BIO_UPTODATE; 571 bi->bi_flags = 1 << BIO_UPTODATE;
572 bi->bi_vcnt = 1;
573 bi->bi_max_vecs = 1;
670 bi->bi_idx = 0; 574 bi->bi_idx = 0;
575 bi->bi_io_vec = &sh->dev[i].vec;
671 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 576 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
672 bi->bi_io_vec[0].bv_offset = 0; 577 bi->bi_io_vec[0].bv_offset = 0;
673 bi->bi_size = STRIPE_SIZE; 578 bi->bi_size = STRIPE_SIZE;
674 bi->bi_next = NULL; 579 bi->bi_next = NULL;
675 if (rrdev)
676 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
677 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
678 bi, disk_devt(conf->mddev->gendisk),
679 sh->dev[i].sector);
680 generic_make_request(bi); 580 generic_make_request(bi);
681 } 581 } else {
682 if (rrdev) {
683 if (s->syncing || s->expanding || s->expanded
684 || s->replacing)
685 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
686
687 set_bit(STRIPE_IO_STARTED, &sh->state);
688
689 rbi->bi_bdev = rrdev->bdev;
690 pr_debug("%s: for %llu schedule op %ld on "
691 "replacement disc %d\n",
692 __func__, (unsigned long long)sh->sector,
693 rbi->bi_rw, i);
694 atomic_inc(&sh->count);
695 if (use_new_offset(conf, sh))
696 rbi->bi_sector = (sh->sector
697 + rrdev->new_data_offset);
698 else
699 rbi->bi_sector = (sh->sector
700 + rrdev->data_offset);
701 rbi->bi_flags = 1 << BIO_UPTODATE;
702 rbi->bi_idx = 0;
703 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
704 rbi->bi_io_vec[0].bv_offset = 0;
705 rbi->bi_size = STRIPE_SIZE;
706 rbi->bi_next = NULL;
707 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
708 rbi, disk_devt(conf->mddev->gendisk),
709 sh->dev[i].sector);
710 generic_make_request(rbi);
711 }
712 if (!rdev && !rrdev) {
713 if (rw & WRITE) 582 if (rw & WRITE)
714 set_bit(STRIPE_DEGRADED, &sh->state); 583 set_bit(STRIPE_DEGRADED, &sh->state);
715 pr_debug("skip op %ld on disc %d for sector %llu\n", 584 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -781,12 +650,14 @@ static void ops_complete_biofill(void *stripe_head_ref)
781{ 650{
782 struct stripe_head *sh = stripe_head_ref; 651 struct stripe_head *sh = stripe_head_ref;
783 struct bio *return_bi = NULL; 652 struct bio *return_bi = NULL;
653 raid5_conf_t *conf = sh->raid_conf;
784 int i; 654 int i;
785 655
786 pr_debug("%s: stripe %llu\n", __func__, 656 pr_debug("%s: stripe %llu\n", __func__,
787 (unsigned long long)sh->sector); 657 (unsigned long long)sh->sector);
788 658
789 /* clear completed biofills */ 659 /* clear completed biofills */
660 spin_lock_irq(&conf->device_lock);
790 for (i = sh->disks; i--; ) { 661 for (i = sh->disks; i--; ) {
791 struct r5dev *dev = &sh->dev[i]; 662 struct r5dev *dev = &sh->dev[i];
792 663
@@ -804,7 +675,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
804 while (rbi && rbi->bi_sector < 675 while (rbi && rbi->bi_sector <
805 dev->sector + STRIPE_SECTORS) { 676 dev->sector + STRIPE_SECTORS) {
806 rbi2 = r5_next_bio(rbi, dev->sector); 677 rbi2 = r5_next_bio(rbi, dev->sector);
807 if (!raid5_dec_bi_active_stripes(rbi)) { 678 if (!raid5_dec_bi_phys_segments(rbi)) {
808 rbi->bi_next = return_bi; 679 rbi->bi_next = return_bi;
809 return_bi = rbi; 680 return_bi = rbi;
810 } 681 }
@@ -812,6 +683,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
812 } 683 }
813 } 684 }
814 } 685 }
686 spin_unlock_irq(&conf->device_lock);
815 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 687 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
816 688
817 return_io(return_bi); 689 return_io(return_bi);
@@ -823,6 +695,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
823static void ops_run_biofill(struct stripe_head *sh) 695static void ops_run_biofill(struct stripe_head *sh)
824{ 696{
825 struct dma_async_tx_descriptor *tx = NULL; 697 struct dma_async_tx_descriptor *tx = NULL;
698 raid5_conf_t *conf = sh->raid_conf;
826 struct async_submit_ctl submit; 699 struct async_submit_ctl submit;
827 int i; 700 int i;
828 701
@@ -833,10 +706,10 @@ static void ops_run_biofill(struct stripe_head *sh)
833 struct r5dev *dev = &sh->dev[i]; 706 struct r5dev *dev = &sh->dev[i];
834 if (test_bit(R5_Wantfill, &dev->flags)) { 707 if (test_bit(R5_Wantfill, &dev->flags)) {
835 struct bio *rbi; 708 struct bio *rbi;
836 spin_lock_irq(&sh->stripe_lock); 709 spin_lock_irq(&conf->device_lock);
837 dev->read = rbi = dev->toread; 710 dev->read = rbi = dev->toread;
838 dev->toread = NULL; 711 dev->toread = NULL;
839 spin_unlock_irq(&sh->stripe_lock); 712 spin_unlock_irq(&conf->device_lock);
840 while (rbi && rbi->bi_sector < 713 while (rbi && rbi->bi_sector <
841 dev->sector + STRIPE_SECTORS) { 714 dev->sector + STRIPE_SECTORS) {
842 tx = async_copy_data(0, rbi, dev->page, 715 tx = async_copy_data(0, rbi, dev->page,
@@ -1172,24 +1045,19 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1172 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1173 struct bio *wbi; 1046 struct bio *wbi;
1174 1047
1175 spin_lock_irq(&sh->stripe_lock); 1048 spin_lock_irq(&sh->raid_conf->device_lock);
1176 chosen = dev->towrite; 1049 chosen = dev->towrite;
1177 dev->towrite = NULL; 1050 dev->towrite = NULL;
1178 BUG_ON(dev->written); 1051 BUG_ON(dev->written);
1179 wbi = dev->written = chosen; 1052 wbi = dev->written = chosen;
1180 spin_unlock_irq(&sh->stripe_lock); 1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1181 1054
1182 while (wbi && wbi->bi_sector < 1055 while (wbi && wbi->bi_sector <
1183 dev->sector + STRIPE_SECTORS) { 1056 dev->sector + STRIPE_SECTORS) {
1184 if (wbi->bi_rw & REQ_FUA) 1057 if (wbi->bi_rw & REQ_FUA)
1185 set_bit(R5_WantFUA, &dev->flags); 1058 set_bit(R5_WantFUA, &dev->flags);
1186 if (wbi->bi_rw & REQ_SYNC) 1059 tx = async_copy_data(1, wbi, dev->page,
1187 set_bit(R5_SyncIO, &dev->flags); 1060 dev->sector, tx);
1188 if (wbi->bi_rw & REQ_DISCARD)
1189 set_bit(R5_Discard, &dev->flags);
1190 else
1191 tx = async_copy_data(1, wbi, dev->page,
1192 dev->sector, tx);
1193 wbi = r5_next_bio(wbi, dev->sector); 1061 wbi = r5_next_bio(wbi, dev->sector);
1194 } 1062 }
1195 } 1063 }
@@ -1205,27 +1073,21 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1205 int pd_idx = sh->pd_idx; 1073 int pd_idx = sh->pd_idx;
1206 int qd_idx = sh->qd_idx; 1074 int qd_idx = sh->qd_idx;
1207 int i; 1075 int i;
1208 bool fua = false, sync = false, discard = false; 1076 bool fua = false;
1209 1077
1210 pr_debug("%s: stripe %llu\n", __func__, 1078 pr_debug("%s: stripe %llu\n", __func__,
1211 (unsigned long long)sh->sector); 1079 (unsigned long long)sh->sector);
1212 1080
1213 for (i = disks; i--; ) { 1081 for (i = disks; i--; )
1214 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1082 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1215 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1216 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1217 }
1218 1083
1219 for (i = disks; i--; ) { 1084 for (i = disks; i--; ) {
1220 struct r5dev *dev = &sh->dev[i]; 1085 struct r5dev *dev = &sh->dev[i];
1221 1086
1222 if (dev->written || i == pd_idx || i == qd_idx) { 1087 if (dev->written || i == pd_idx || i == qd_idx) {
1223 if (!discard) 1088 set_bit(R5_UPTODATE, &dev->flags);
1224 set_bit(R5_UPTODATE, &dev->flags);
1225 if (fua) 1089 if (fua)
1226 set_bit(R5_WantFUA, &dev->flags); 1090 set_bit(R5_WantFUA, &dev->flags);
1227 if (sync)
1228 set_bit(R5_SyncIO, &dev->flags);
1229 } 1091 }
1230 } 1092 }
1231 1093
@@ -1257,18 +1119,6 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1257 pr_debug("%s: stripe %llu\n", __func__, 1119 pr_debug("%s: stripe %llu\n", __func__,
1258 (unsigned long long)sh->sector); 1120 (unsigned long long)sh->sector);
1259 1121
1260 for (i = 0; i < sh->disks; i++) {
1261 if (pd_idx == i)
1262 continue;
1263 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1264 break;
1265 }
1266 if (i >= sh->disks) {
1267 atomic_inc(&sh->count);
1268 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1269 ops_complete_reconstruct(sh);
1270 return;
1271 }
1272 /* check if prexor is active which means only process blocks 1122 /* check if prexor is active which means only process blocks
1273 * that are part of a read-modify-write (written) 1123 * that are part of a read-modify-write (written)
1274 */ 1124 */
@@ -1313,24 +1163,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1313{ 1163{
1314 struct async_submit_ctl submit; 1164 struct async_submit_ctl submit;
1315 struct page **blocks = percpu->scribble; 1165 struct page **blocks = percpu->scribble;
1316 int count, i; 1166 int count;
1317 1167
1318 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1168 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1319 1169
1320 for (i = 0; i < sh->disks; i++) {
1321 if (sh->pd_idx == i || sh->qd_idx == i)
1322 continue;
1323 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1324 break;
1325 }
1326 if (i >= sh->disks) {
1327 atomic_inc(&sh->count);
1328 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1329 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1330 ops_complete_reconstruct(sh);
1331 return;
1332 }
1333
1334 count = set_syndrome_sources(blocks, sh); 1170 count = set_syndrome_sources(blocks, sh);
1335 1171
1336 atomic_inc(&sh->count); 1172 atomic_inc(&sh->count);
@@ -1410,7 +1246,7 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1410{ 1246{
1411 int overlap_clear = 0, i, disks = sh->disks; 1247 int overlap_clear = 0, i, disks = sh->disks;
1412 struct dma_async_tx_descriptor *tx = NULL; 1248 struct dma_async_tx_descriptor *tx = NULL;
1413 struct r5conf *conf = sh->raid_conf; 1249 raid5_conf_t *conf = sh->raid_conf;
1414 int level = conf->level; 1250 int level = conf->level;
1415 struct raid5_percpu *percpu; 1251 struct raid5_percpu *percpu;
1416 unsigned long cpu; 1252 unsigned long cpu;
@@ -1501,7 +1337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1501#define raid_run_ops __raid_run_ops 1337#define raid_run_ops __raid_run_ops
1502#endif 1338#endif
1503 1339
1504static int grow_one_stripe(struct r5conf *conf) 1340static int grow_one_stripe(raid5_conf_t *conf)
1505{ 1341{
1506 struct stripe_head *sh; 1342 struct stripe_head *sh;
1507 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1513,8 +1349,6 @@ static int grow_one_stripe(struct r5conf *conf)
1513 init_waitqueue_head(&sh->ops.wait_for_ops); 1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1514 #endif 1350 #endif
1515 1351
1516 spin_lock_init(&sh->stripe_lock);
1517
1518 if (grow_buffers(sh)) { 1352 if (grow_buffers(sh)) {
1519 shrink_buffers(sh); 1353 shrink_buffers(sh);
1520 kmem_cache_free(conf->slab_cache, sh); 1354 kmem_cache_free(conf->slab_cache, sh);
@@ -1528,7 +1362,7 @@ static int grow_one_stripe(struct r5conf *conf)
1528 return 1; 1362 return 1;
1529} 1363}
1530 1364
1531static int grow_stripes(struct r5conf *conf, int num) 1365static int grow_stripes(raid5_conf_t *conf, int num)
1532{ 1366{
1533 struct kmem_cache *sc; 1367 struct kmem_cache *sc;
1534 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1368 int devs = max(conf->raid_disks, conf->previous_raid_disks);
@@ -1577,7 +1411,7 @@ static size_t scribble_len(int num)
1577 return len; 1411 return len;
1578} 1412}
1579 1413
1580static int resize_stripes(struct r5conf *conf, int newsize) 1414static int resize_stripes(raid5_conf_t *conf, int newsize)
1581{ 1415{
1582 /* Make all the stripes able to hold 'newsize' devices. 1416 /* Make all the stripes able to hold 'newsize' devices.
1583 * New slots in each stripe get 'page' set to a new page. 1417 * New slots in each stripe get 'page' set to a new page.
@@ -1585,7 +1419,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1585 * This happens in stages: 1419 * This happens in stages:
1586 * 1/ create a new kmem_cache and allocate the required number of 1420 * 1/ create a new kmem_cache and allocate the required number of
1587 * stripe_heads. 1421 * stripe_heads.
1588	 * 2/ gather all the old stripe_heads and transfer the pages across 1422	 * 2/ gather all the old stripe_heads and transfer the pages across
1589 * to the new stripe_heads. This will have the side effect of 1423 * to the new stripe_heads. This will have the side effect of
1590 * freezing the array as once all stripe_heads have been collected, 1424 * freezing the array as once all stripe_heads have been collected,
1591 * no IO will be possible. Old stripe heads are freed once their 1425 * no IO will be possible. Old stripe heads are freed once their
@@ -1633,7 +1467,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1633 #ifdef CONFIG_MULTICORE_RAID456 1467 #ifdef CONFIG_MULTICORE_RAID456
1634 init_waitqueue_head(&nsh->ops.wait_for_ops); 1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1635 #endif 1469 #endif
1636 spin_lock_init(&nsh->stripe_lock);
1637 1470
1638 list_add(&nsh->lru, &newstripes); 1471 list_add(&nsh->lru, &newstripes);
1639 } 1472 }
@@ -1655,7 +1488,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1655 spin_lock_irq(&conf->device_lock); 1488 spin_lock_irq(&conf->device_lock);
1656 wait_event_lock_irq(conf->wait_for_stripe, 1489 wait_event_lock_irq(conf->wait_for_stripe,
1657 !list_empty(&conf->inactive_list), 1490 !list_empty(&conf->inactive_list),
1658 conf->device_lock); 1491 conf->device_lock,
1492 );
1659 osh = get_free_stripe(conf); 1493 osh = get_free_stripe(conf);
1660 spin_unlock_irq(&conf->device_lock); 1494 spin_unlock_irq(&conf->device_lock);
1661 atomic_set(&nsh->count, 1); 1495 atomic_set(&nsh->count, 1);
@@ -1722,7 +1556,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1722 return err; 1556 return err;
1723} 1557}
1724 1558
1725static int drop_one_stripe(struct r5conf *conf) 1559static int drop_one_stripe(raid5_conf_t *conf)
1726{ 1560{
1727 struct stripe_head *sh; 1561 struct stripe_head *sh;
1728 1562
@@ -1738,7 +1572,7 @@ static int drop_one_stripe(struct r5conf *conf)
1738 return 1; 1572 return 1;
1739} 1573}
1740 1574
1741static void shrink_stripes(struct r5conf *conf) 1575static void shrink_stripes(raid5_conf_t *conf)
1742{ 1576{
1743 while (drop_one_stripe(conf)) 1577 while (drop_one_stripe(conf))
1744 ; 1578 ;
@@ -1751,12 +1585,12 @@ static void shrink_stripes(struct r5conf *conf)
1751static void raid5_end_read_request(struct bio * bi, int error) 1585static void raid5_end_read_request(struct bio * bi, int error)
1752{ 1586{
1753 struct stripe_head *sh = bi->bi_private; 1587 struct stripe_head *sh = bi->bi_private;
1754 struct r5conf *conf = sh->raid_conf; 1588 raid5_conf_t *conf = sh->raid_conf;
1755 int disks = sh->disks, i; 1589 int disks = sh->disks, i;
1756 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1590 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1757 char b[BDEVNAME_SIZE]; 1591 char b[BDEVNAME_SIZE];
1758 struct md_rdev *rdev = NULL; 1592 mdk_rdev_t *rdev;
1759 sector_t s; 1593
1760 1594
1761 for (i=0 ; i<disks; i++) 1595 for (i=0 ; i<disks; i++)
1762 if (bi == &sh->dev[i].req) 1596 if (bi == &sh->dev[i].req)
@@ -1769,77 +1603,52 @@ static void raid5_end_read_request(struct bio * bi, int error)
1769 BUG(); 1603 BUG();
1770 return; 1604 return;
1771 } 1605 }
1772 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1773 /* If replacement finished while this request was outstanding,
1774 * 'replacement' might be NULL already.
1775 * In that case it moved down to 'rdev'.
1776 * rdev is not removed until all requests are finished.
1777 */
1778 rdev = conf->disks[i].replacement;
1779 if (!rdev)
1780 rdev = conf->disks[i].rdev;
1781 1606
1782 if (use_new_offset(conf, sh))
1783 s = sh->sector + rdev->new_data_offset;
1784 else
1785 s = sh->sector + rdev->data_offset;
1786 if (uptodate) { 1607 if (uptodate) {
1787 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1788 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1789 /* Note that this cannot happen on a 1610 rdev = conf->disks[i].rdev;
1790 * replacement device. We just fail those on
1791 * any error
1792 */
1793 printk_ratelimited( 1611 printk_ratelimited(
1794 KERN_INFO 1612 KERN_INFO
1795 "md/raid:%s: read error corrected" 1613 "md/raid:%s: read error corrected"
1796 " (%lu sectors at %llu on %s)\n", 1614 " (%lu sectors at %llu on %s)\n",
1797 mdname(conf->mddev), STRIPE_SECTORS, 1615 mdname(conf->mddev), STRIPE_SECTORS,
1798 (unsigned long long)s, 1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1799 bdevname(rdev->bdev, b)); 1618 bdevname(rdev->bdev, b));
1800 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1801 clear_bit(R5_ReadError, &sh->dev[i].flags); 1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1802 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1803 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1622 }
1804 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1623 if (atomic_read(&conf->disks[i].rdev->read_errors))
1805 1624 atomic_set(&conf->disks[i].rdev->read_errors, 0);
1806 if (atomic_read(&rdev->read_errors))
1807 atomic_set(&rdev->read_errors, 0);
1808 } else { 1625 } else {
1809 const char *bdn = bdevname(rdev->bdev, b); 1626 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1810 int retry = 0; 1627 int retry = 0;
1811 int set_bad = 0; 1628 rdev = conf->disks[i].rdev;
1812 1629
1813 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1814 atomic_inc(&rdev->read_errors); 1631 atomic_inc(&rdev->read_errors);
1815 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1632 if (conf->mddev->degraded >= conf->max_degraded)
1816 printk_ratelimited(
1817 KERN_WARNING
1818 "md/raid:%s: read error on replacement device "
1819 "(sector %llu on %s).\n",
1820 mdname(conf->mddev),
1821 (unsigned long long)s,
1822 bdn);
1823 else if (conf->mddev->degraded >= conf->max_degraded) {
1824 set_bad = 1;
1825 printk_ratelimited( 1633 printk_ratelimited(
1826 KERN_WARNING 1634 KERN_WARNING
1827 "md/raid:%s: read error not correctable " 1635 "md/raid:%s: read error not correctable "
1828 "(sector %llu on %s).\n", 1636 "(sector %llu on %s).\n",
1829 mdname(conf->mddev), 1637 mdname(conf->mddev),
1830 (unsigned long long)s, 1638 (unsigned long long)(sh->sector
1639 + rdev->data_offset),
1831 bdn); 1640 bdn);
1832 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1833 /* Oh, no!!! */ 1642 /* Oh, no!!! */
1834 set_bad = 1;
1835 printk_ratelimited( 1643 printk_ratelimited(
1836 KERN_WARNING 1644 KERN_WARNING
1837 "md/raid:%s: read error NOT corrected!! " 1645 "md/raid:%s: read error NOT corrected!! "
1838 "(sector %llu on %s).\n", 1646 "(sector %llu on %s).\n",
1839 mdname(conf->mddev), 1647 mdname(conf->mddev),
1840 (unsigned long long)s, 1648 (unsigned long long)(sh->sector
1649 + rdev->data_offset),
1841 bdn); 1650 bdn);
1842 } else if (atomic_read(&rdev->read_errors) 1651 else if (atomic_read(&rdev->read_errors)
1843 > conf->max_nr_stripes) 1652 > conf->max_nr_stripes)
1844 printk(KERN_WARNING 1653 printk(KERN_WARNING
1845 "md/raid:%s: Too many read errors, failing device %s.\n", 1654 "md/raid:%s: Too many read errors, failing device %s.\n",
@@ -1847,22 +1656,14 @@ static void raid5_end_read_request(struct bio * bi, int error)
1847 else 1656 else
1848 retry = 1; 1657 retry = 1;
1849 if (retry) 1658 if (retry)
1850 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1659 set_bit(R5_ReadError, &sh->dev[i].flags);
1851 set_bit(R5_ReadError, &sh->dev[i].flags);
1852 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1853 } else
1854 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1855 else { 1660 else {
1856 clear_bit(R5_ReadError, &sh->dev[i].flags); 1661 clear_bit(R5_ReadError, &sh->dev[i].flags);
1857 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1662 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1858 if (!(set_bad 1663 md_error(conf->mddev, rdev);
1859 && test_bit(In_sync, &rdev->flags)
1860 && rdev_set_badblocks(
1861 rdev, sh->sector, STRIPE_SECTORS, 0)))
1862 md_error(conf->mddev, rdev);
1863 } 1664 }
1864 } 1665 }
1865 rdev_dec_pending(rdev, conf->mddev); 1666 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1866 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1667 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1867 set_bit(STRIPE_HANDLE, &sh->state); 1668 set_bit(STRIPE_HANDLE, &sh->state);
1868 release_stripe(sh); 1669 release_stripe(sh);
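The read-completion hunk above, in both columns, does the same bookkeeping: a successful read that follows an earlier read error counts the sectors as corrected and resets the per-device error counter, while a failed read bumps the counter and either retries or fails the device once a threshold is exceeded. A minimal standalone sketch of that decision, with toy types that are assumptions of this illustration rather than kernel structures, might look like:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_rdev {
        int read_errors;        /* uncorrected read errors so far */
        int corrected_errors;   /* sectors fixed by a later good read */
    };

    enum action { RETRY_READ, FAIL_DEVICE, DONE };

    static enum action on_read_complete(struct toy_rdev *rdev, bool uptodate,
                                        bool had_read_error, int max_errors,
                                        int sectors)
    {
        if (uptodate) {
            if (had_read_error)
                rdev->corrected_errors += sectors;
            rdev->read_errors = 0;  /* a clean read resets the counter */
            return DONE;
        }
        if (++rdev->read_errors > max_errors)
            return FAIL_DEVICE;     /* too many errors: give up on the device */
        return RETRY_READ;          /* otherwise schedule a re-read/rewrite */
    }

    int main(void)
    {
        struct toy_rdev rdev = { 0, 0 };
        enum action a = DONE;
        int i;

        for (i = 0; i < 5; i++)
            a = on_read_complete(&rdev, false, false, 3, 8);
        printf("after 5 failures: %s\n",
               a == FAIL_DEVICE ? "fail device" : "retry");
        return 0;
    }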
@@ -1871,32 +1672,16 @@ static void raid5_end_read_request(struct bio * bi, int error)
1871static void raid5_end_write_request(struct bio *bi, int error) 1672static void raid5_end_write_request(struct bio *bi, int error)
1872{ 1673{
1873 struct stripe_head *sh = bi->bi_private; 1674 struct stripe_head *sh = bi->bi_private;
1874 struct r5conf *conf = sh->raid_conf; 1675 raid5_conf_t *conf = sh->raid_conf;
1875 int disks = sh->disks, i; 1676 int disks = sh->disks, i;
1876 struct md_rdev *uninitialized_var(rdev);
1877 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1878 sector_t first_bad; 1678 sector_t first_bad;
1879 int bad_sectors; 1679 int bad_sectors;
1880 int replacement = 0;
1881 1680
1882 for (i = 0 ; i < disks; i++) { 1681 for (i=0 ; i<disks; i++)
1883 if (bi == &sh->dev[i].req) { 1682 if (bi == &sh->dev[i].req)
1884 rdev = conf->disks[i].rdev;
1885 break;
1886 }
1887 if (bi == &sh->dev[i].rreq) {
1888 rdev = conf->disks[i].replacement;
1889 if (rdev)
1890 replacement = 1;
1891 else
1892 /* rdev was removed and 'replacement'
1893 * replaced it. rdev is not removed
1894 * until all requests are finished.
1895 */
1896 rdev = conf->disks[i].rdev;
1897 break; 1683 break;
1898 } 1684
1899 }
1900 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1685 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1901 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1686 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1902 uptodate); 1687 uptodate);
@@ -1905,33 +1690,21 @@ static void raid5_end_write_request(struct bio *bi, int error)
1905 return; 1690 return;
1906 } 1691 }
1907 1692
1908 if (replacement) { 1693 if (!uptodate) {
1909 if (!uptodate) 1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1910 md_error(conf->mddev, rdev); 1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1911 else if (is_badblock(rdev, sh->sector, 1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1912 STRIPE_SECTORS, 1697 &first_bad, &bad_sectors))
1913 &first_bad, &bad_sectors)) 1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1914 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1915 } else {
1916 if (!uptodate) {
1917 set_bit(WriteErrorSeen, &rdev->flags);
1918 set_bit(R5_WriteError, &sh->dev[i].flags);
1919 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1920 set_bit(MD_RECOVERY_NEEDED,
1921 &rdev->mddev->recovery);
1922 } else if (is_badblock(rdev, sh->sector,
1923 STRIPE_SECTORS,
1924 &first_bad, &bad_sectors))
1925 set_bit(R5_MadeGood, &sh->dev[i].flags);
1926 }
1927 rdev_dec_pending(rdev, conf->mddev);
1928 1699
1929 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1930 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1701
1702 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1931 set_bit(STRIPE_HANDLE, &sh->state); 1703 set_bit(STRIPE_HANDLE, &sh->state);
1932 release_stripe(sh); 1704 release_stripe(sh);
1933} 1705}
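On the write side, both versions of the hunk follow the same two-way rule: a failed write flags the device with a write error, and a successful write that lands on a previously recorded bad range marks it "made good" so the bad block can later be cleared. A small sketch of that rule, using an invented single-range bad-block check in place of is_badblock(), is:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical result flags, not the kernel's bit layout. */
    enum { F_WRITE_ERROR = 1, F_MADE_GOOD = 2 };

    /* Does [sector, sector+len) overlap the single recorded bad range? */
    static bool hits_badblock(long long bad_start, int bad_len,
                              long long sector, int len)
    {
        return sector < bad_start + bad_len && bad_start < sector + len;
    }

    static int on_write_complete(bool uptodate, long long sector, int len,
                                 long long bad_start, int bad_len)
    {
        if (!uptodate)
            return F_WRITE_ERROR;   /* remember the failure for later handling */
        if (hits_badblock(bad_start, bad_len, sector, len))
            return F_MADE_GOOD;     /* fresh data now covers the bad range */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", on_write_complete(true, 100, 8, 104, 4));  /* 2 */
        printf("%d\n", on_write_complete(false, 100, 8, 0, 0));   /* 1 */
        return 0;
    }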
1934 1706
1707
1935static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1708static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1936 1709
1937static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1710static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1942,33 +1715,33 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1942 dev->req.bi_io_vec = &dev->vec; 1715 dev->req.bi_io_vec = &dev->vec;
1943 dev->req.bi_vcnt++; 1716 dev->req.bi_vcnt++;
1944 dev->req.bi_max_vecs++; 1717 dev->req.bi_max_vecs++;
1945 dev->req.bi_private = sh;
1946 dev->vec.bv_page = dev->page; 1718 dev->vec.bv_page = dev->page;
1719 dev->vec.bv_len = STRIPE_SIZE;
1720 dev->vec.bv_offset = 0;
1947 1721
1948 bio_init(&dev->rreq); 1722 dev->req.bi_sector = sh->sector;
1949 dev->rreq.bi_io_vec = &dev->rvec; 1723 dev->req.bi_private = sh;
1950 dev->rreq.bi_vcnt++;
1951 dev->rreq.bi_max_vecs++;
1952 dev->rreq.bi_private = sh;
1953 dev->rvec.bv_page = dev->page;
1954 1724
1955 dev->flags = 0; 1725 dev->flags = 0;
1956 dev->sector = compute_blocknr(sh, i, previous); 1726 dev->sector = compute_blocknr(sh, i, previous);
1957} 1727}
1958 1728
1959static void error(struct mddev *mddev, struct md_rdev *rdev) 1729static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1960{ 1730{
1961 char b[BDEVNAME_SIZE]; 1731 char b[BDEVNAME_SIZE];
1962 struct r5conf *conf = mddev->private; 1732 raid5_conf_t *conf = mddev->private;
1963 unsigned long flags;
1964 pr_debug("raid456: error called\n"); 1733 pr_debug("raid456: error called\n");
1965 1734
1966 spin_lock_irqsave(&conf->device_lock, flags); 1735 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1967 clear_bit(In_sync, &rdev->flags); 1736 unsigned long flags;
1968 mddev->degraded = calc_degraded(conf); 1737 spin_lock_irqsave(&conf->device_lock, flags);
1969 spin_unlock_irqrestore(&conf->device_lock, flags); 1738 mddev->degraded++;
1970 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1739 spin_unlock_irqrestore(&conf->device_lock, flags);
1971 1740 /*
1741 * if recovery was running, make sure it aborts.
1742 */
1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1744 }
1972 set_bit(Blocked, &rdev->flags); 1745 set_bit(Blocked, &rdev->flags);
1973 set_bit(Faulty, &rdev->flags); 1746 set_bit(Faulty, &rdev->flags);
1974 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
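The error() hunk differs mainly in how the degraded count is maintained: the right column increments mddev->degraded when an in-sync device fails, while the left column recomputes it via calc_degraded(). A toy recomputation in the spirit of the left column — the struct below is illustrative, not the md disk_info layout — could be:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_disk {
        bool present;
        bool in_sync;
    };

    /* Recompute the degraded count from scratch rather than adjusting it. */
    static int calc_degraded_toy(const struct toy_disk *d, int raid_disks)
    {
        int i, degraded = 0;

        for (i = 0; i < raid_disks; i++)
            if (!d[i].present || !d[i].in_sync)
                degraded++;
        return degraded;
    }

    int main(void)
    {
        struct toy_disk disks[4] = {
            { true, true }, { true, false }, { false, false }, { true, true }
        };

        printf("degraded = %d\n", calc_degraded_toy(disks, 4));   /* 2 */
        return 0;
    }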
@@ -1985,7 +1758,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1985 * Input: a 'big' sector number, 1758 * Input: a 'big' sector number,
1986 * Output: index of the data and parity disk, and the sector # in them. 1759 * Output: index of the data and parity disk, and the sector # in them.
1987 */ 1760 */
1988static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1761static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1989 int previous, int *dd_idx, 1762 int previous, int *dd_idx,
1990 struct stripe_head *sh) 1763 struct stripe_head *sh)
1991{ 1764{
@@ -2190,7 +1963,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2190 1963
2191static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 1964static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2192{ 1965{
2193 struct r5conf *conf = sh->raid_conf; 1966 raid5_conf_t *conf = sh->raid_conf;
2194 int raid_disks = sh->disks; 1967 int raid_disks = sh->disks;
2195 int data_disks = raid_disks - conf->max_degraded; 1968 int data_disks = raid_disks - conf->max_degraded;
2196 sector_t new_sector = sh->sector, check; 1969 sector_t new_sector = sh->sector, check;
@@ -2315,7 +2088,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2315 int rcw, int expand) 2088 int rcw, int expand)
2316{ 2089{
2317 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2090 int i, pd_idx = sh->pd_idx, disks = sh->disks;
2318 struct r5conf *conf = sh->raid_conf; 2091 raid5_conf_t *conf = sh->raid_conf;
2319 int level = conf->level; 2092 int level = conf->level;
2320 2093
2321 if (rcw) { 2094 if (rcw) {
@@ -2400,25 +2173,18 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2400static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2173static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2401{ 2174{
2402 struct bio **bip; 2175 struct bio **bip;
2403 struct r5conf *conf = sh->raid_conf; 2176 raid5_conf_t *conf = sh->raid_conf;
2404 int firstwrite=0; 2177 int firstwrite=0;
2405 2178
2406 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2407 (unsigned long long)bi->bi_sector, 2180 (unsigned long long)bi->bi_sector,
2408 (unsigned long long)sh->sector); 2181 (unsigned long long)sh->sector);
2409 2182
2410 /* 2183
2411 * If several bio share a stripe. The bio bi_phys_segments acts as a 2184 spin_lock_irq(&conf->device_lock);
2412 * reference count to avoid race. The reference count should already be
2413 * increased before this function is called (for example, in
2414 * make_request()), so other bio sharing this stripe will not free the
2415 * stripe. If a stripe is owned by one stripe, the stripe lock will
2416 * protect it.
2417 */
2418 spin_lock_irq(&sh->stripe_lock);
2419 if (forwrite) { 2185 if (forwrite) {
2420 bip = &sh->dev[dd_idx].towrite; 2186 bip = &sh->dev[dd_idx].towrite;
2421 if (*bip == NULL) 2187 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2422 firstwrite = 1; 2188 firstwrite = 1;
2423 } else 2189 } else
2424 bip = &sh->dev[dd_idx].toread; 2190 bip = &sh->dev[dd_idx].toread;
@@ -2434,7 +2200,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2434 if (*bip) 2200 if (*bip)
2435 bi->bi_next = *bip; 2201 bi->bi_next = *bip;
2436 *bip = bi; 2202 *bip = bi;
2437 raid5_inc_bi_active_stripes(bi); 2203 bi->bi_phys_segments++;
2438 2204
2439 if (forwrite) { 2205 if (forwrite) {
2440 /* check if page is covered */ 2206 /* check if page is covered */
@@ -2449,11 +2215,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2449 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2450 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2451 } 2217 }
2218 spin_unlock_irq(&conf->device_lock);
2452 2219
2453 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2454 (unsigned long long)(*bip)->bi_sector, 2221 (unsigned long long)(*bip)->bi_sector,
2455 (unsigned long long)sh->sector, dd_idx); 2222 (unsigned long long)sh->sector, dd_idx);
2456 spin_unlock_irq(&sh->stripe_lock);
2457 2223
2458 if (conf->mddev->bitmap && firstwrite) { 2224 if (conf->mddev->bitmap && firstwrite) {
2459 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2465,13 +2231,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2465 2231
2466 overlap: 2232 overlap:
2467 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2468 spin_unlock_irq(&sh->stripe_lock); 2234 spin_unlock_irq(&conf->device_lock);
2469 return 0; 2235 return 0;
2470} 2236}
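Independently of whether the lock is the per-stripe stripe_lock (left) or conf->device_lock (right), add_stripe_bio() keeps each device's bio chain sorted by start sector and takes a per-bio reference so the stripe holds the bio alive. A self-contained sketch of that insertion-plus-refcount idea, with a toy bio type that is an assumption of this example, is:

    #include <stdio.h>

    /* Toy bio: only the fields the insertion logic needs. */
    struct toy_bio {
        long long sector;
        int active_stripes;     /* stands in for the per-bio refcount */
        struct toy_bio *next;
    };

    /* Insert 'bi' into the list kept sorted by start sector, then take a
     * reference so the stripe keeps the bio alive until it completes. */
    static void add_to_stripe(struct toy_bio **head, struct toy_bio *bi)
    {
        struct toy_bio **bip = head;

        while (*bip && (*bip)->sector < bi->sector)
            bip = &(*bip)->next;
        bi->next = *bip;
        *bip = bi;
        bi->active_stripes++;
    }

    int main(void)
    {
        struct toy_bio a = { 16, 0, NULL }, b = { 8, 0, NULL };
        struct toy_bio *head = NULL, *p;

        add_to_stripe(&head, &a);
        add_to_stripe(&head, &b);
        for (p = head; p; p = p->next)
            printf("sector %lld (refs %d)\n", p->sector, p->active_stripes);
        return 0;
    }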
2471 2237
2472static void end_reshape(struct r5conf *conf); 2238static void end_reshape(raid5_conf_t *conf);
2473 2239
2474static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2240static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2475 struct stripe_head *sh) 2241 struct stripe_head *sh)
2476{ 2242{
2477 int sectors_per_chunk = 2243 int sectors_per_chunk =
@@ -2488,7 +2254,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2488} 2254}
2489 2255
2490static void 2256static void
2491handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2257handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2492 struct stripe_head_state *s, int disks, 2258 struct stripe_head_state *s, int disks,
2493 struct bio **return_bi) 2259 struct bio **return_bi)
2494{ 2260{
@@ -2498,7 +2264,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2498 int bitmap_end = 0; 2264 int bitmap_end = 0;
2499 2265
2500 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2266 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2501 struct md_rdev *rdev; 2267 mdk_rdev_t *rdev;
2502 rcu_read_lock(); 2268 rcu_read_lock();
2503 rdev = rcu_dereference(conf->disks[i].rdev); 2269 rdev = rcu_dereference(conf->disks[i].rdev);
2504 if (rdev && test_bit(In_sync, &rdev->flags)) 2270 if (rdev && test_bit(In_sync, &rdev->flags))
@@ -2515,13 +2281,14 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2515 rdev_dec_pending(rdev, conf->mddev); 2281 rdev_dec_pending(rdev, conf->mddev);
2516 } 2282 }
2517 } 2283 }
2518 spin_lock_irq(&sh->stripe_lock); 2284 spin_lock_irq(&conf->device_lock);
2519 /* fail all writes first */ 2285 /* fail all writes first */
2520 bi = sh->dev[i].towrite; 2286 bi = sh->dev[i].towrite;
2521 sh->dev[i].towrite = NULL; 2287 sh->dev[i].towrite = NULL;
2522 spin_unlock_irq(&sh->stripe_lock); 2288 if (bi) {
2523 if (bi) 2289 s->to_write--;
2524 bitmap_end = 1; 2290 bitmap_end = 1;
2291 }
2525 2292
2526 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2293 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2527 wake_up(&conf->wait_for_overlap); 2294 wake_up(&conf->wait_for_overlap);
@@ -2530,17 +2297,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2530 sh->dev[i].sector + STRIPE_SECTORS) { 2297 sh->dev[i].sector + STRIPE_SECTORS) {
2531 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2298 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2532 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2299 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2533 if (!raid5_dec_bi_active_stripes(bi)) { 2300 if (!raid5_dec_bi_phys_segments(bi)) {
2534 md_write_end(conf->mddev); 2301 md_write_end(conf->mddev);
2535 bi->bi_next = *return_bi; 2302 bi->bi_next = *return_bi;
2536 *return_bi = bi; 2303 *return_bi = bi;
2537 } 2304 }
2538 bi = nextbi; 2305 bi = nextbi;
2539 } 2306 }
2540 if (bitmap_end)
2541 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2542 STRIPE_SECTORS, 0, 0);
2543 bitmap_end = 0;
2544 /* and fail all 'written' */ 2307 /* and fail all 'written' */
2545 bi = sh->dev[i].written; 2308 bi = sh->dev[i].written;
2546 sh->dev[i].written = NULL; 2309 sh->dev[i].written = NULL;
@@ -2549,7 +2312,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2549 sh->dev[i].sector + STRIPE_SECTORS) { 2312 sh->dev[i].sector + STRIPE_SECTORS) {
2550 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2313 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2551 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2314 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2552 if (!raid5_dec_bi_active_stripes(bi)) { 2315 if (!raid5_dec_bi_phys_segments(bi)) {
2553 md_write_end(conf->mddev); 2316 md_write_end(conf->mddev);
2554 bi->bi_next = *return_bi; 2317 bi->bi_next = *return_bi;
2555 *return_bi = bi; 2318 *return_bi = bi;
@@ -2563,24 +2326,24 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2563 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2326 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2564 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2327 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2565 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2328 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2566 spin_lock_irq(&sh->stripe_lock);
2567 bi = sh->dev[i].toread; 2329 bi = sh->dev[i].toread;
2568 sh->dev[i].toread = NULL; 2330 sh->dev[i].toread = NULL;
2569 spin_unlock_irq(&sh->stripe_lock);
2570 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2331 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2571 wake_up(&conf->wait_for_overlap); 2332 wake_up(&conf->wait_for_overlap);
2333 if (bi) s->to_read--;
2572 while (bi && bi->bi_sector < 2334 while (bi && bi->bi_sector <
2573 sh->dev[i].sector + STRIPE_SECTORS) { 2335 sh->dev[i].sector + STRIPE_SECTORS) {
2574 struct bio *nextbi = 2336 struct bio *nextbi =
2575 r5_next_bio(bi, sh->dev[i].sector); 2337 r5_next_bio(bi, sh->dev[i].sector);
2576 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2338 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2577 if (!raid5_dec_bi_active_stripes(bi)) { 2339 if (!raid5_dec_bi_phys_segments(bi)) {
2578 bi->bi_next = *return_bi; 2340 bi->bi_next = *return_bi;
2579 *return_bi = bi; 2341 *return_bi = bi;
2580 } 2342 }
2581 bi = nextbi; 2343 bi = nextbi;
2582 } 2344 }
2583 } 2345 }
2346 spin_unlock_irq(&conf->device_lock);
2584 if (bitmap_end) 2347 if (bitmap_end)
2585 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2586 STRIPE_SECTORS, 0, 0); 2349 STRIPE_SECTORS, 0, 0);
@@ -2596,63 +2359,38 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2596} 2359}
2597 2360
2598static void 2361static void
2599handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2362handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2600 struct stripe_head_state *s) 2363 struct stripe_head_state *s)
2601{ 2364{
2602 int abort = 0; 2365 int abort = 0;
2603 int i; 2366 int i;
2604 2367
2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2605 clear_bit(STRIPE_SYNCING, &sh->state); 2369 clear_bit(STRIPE_SYNCING, &sh->state);
2606 s->syncing = 0; 2370 s->syncing = 0;
2607 s->replacing = 0;
2608 /* There is nothing more to do for sync/check/repair. 2371 /* There is nothing more to do for sync/check/repair.
2609 * Don't even need to abort as that is handled elsewhere 2372 * For recover we need to record a bad block on all
2610 * if needed, and not always wanted e.g. if there is a known
2611 * bad block here.
2612 * For recover/replace we need to record a bad block on all
2613 * non-sync devices, or abort the recovery 2373 * non-sync devices, or abort the recovery
2614 */ 2374 */
2615 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2616 /* During recovery devices cannot be removed, so 2376 return;
2617 * locking and refcounting of rdevs is not needed 2377 /* During recovery devices cannot be removed, so locking and
2618 */ 2378 * refcounting of rdevs is not needed
2619 for (i = 0; i < conf->raid_disks; i++) { 2379 */
2620 struct md_rdev *rdev = conf->disks[i].rdev; 2380 for (i = 0; i < conf->raid_disks; i++) {
2621 if (rdev 2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2622 && !test_bit(Faulty, &rdev->flags) 2382 if (!rdev
2623 && !test_bit(In_sync, &rdev->flags) 2383 || test_bit(Faulty, &rdev->flags)
2624 && !rdev_set_badblocks(rdev, sh->sector, 2384 || test_bit(In_sync, &rdev->flags))
2625 STRIPE_SECTORS, 0)) 2385 continue;
2626 abort = 1; 2386 if (!rdev_set_badblocks(rdev, sh->sector,
2627 rdev = conf->disks[i].replacement; 2387 STRIPE_SECTORS, 0))
2628 if (rdev 2388 abort = 1;
2629 && !test_bit(Faulty, &rdev->flags) 2389 }
2630 && !test_bit(In_sync, &rdev->flags) 2390 if (abort) {
2631 && !rdev_set_badblocks(rdev, sh->sector, 2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2632 STRIPE_SECTORS, 0)) 2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2633 abort = 1;
2634 }
2635 if (abort)
2636 conf->recovery_disabled =
2637 conf->mddev->recovery_disabled;
2638 } 2393 }
2639 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2640}
2641
2642static int want_replace(struct stripe_head *sh, int disk_idx)
2643{
2644 struct md_rdev *rdev;
2645 int rv = 0;
2646 /* Doing recovery so rcu locking not required */
2647 rdev = sh->raid_conf->disks[disk_idx].replacement;
2648 if (rdev
2649 && !test_bit(Faulty, &rdev->flags)
2650 && !test_bit(In_sync, &rdev->flags)
2651 && (rdev->recovery_offset <= sh->sector
2652 || rdev->mddev->recovery_cp <= sh->sector))
2653 rv = 1;
2654
2655 return rv;
2656} 2394}
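Both versions of handle_failed_sync() share one idea for the recovery case: every device that is neither faulty nor in-sync must get a bad block recorded for this stripe, and if recording fails anywhere, recovery has to be aborted. A minimal sketch of that walk — the badblock_table_full field below is an invented stand-in for rdev_set_badblocks() failing — is:

    #include <stdbool.h>
    #include <stdio.h>

    struct sync_disk {
        bool faulty;
        bool in_sync;
        bool badblock_table_full;   /* stand-in for a failed badblock record */
    };

    /* Returns true if recovery must be aborted. */
    static bool record_failed_sync(struct sync_disk *d, int n)
    {
        bool abort = false;
        int i;

        for (i = 0; i < n; i++) {
            if (d[i].faulty || d[i].in_sync)
                continue;           /* only devices still being recovered matter */
            if (d[i].badblock_table_full)
                abort = true;       /* cannot remember the bad range */
        }
        return abort;
    }

    int main(void)
    {
        struct sync_disk d[3] = {
            { false, true,  false },
            { false, false, false },
            { false, false, true  },
        };

        printf("abort recovery: %s\n", record_failed_sync(d, 3) ? "yes" : "no");
        return 0;
    }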
2657 2395
2658/* fetch_block - checks the given member device to see if its data needs 2396/* fetch_block - checks the given member device to see if its data needs
@@ -2674,7 +2412,6 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2674 (dev->toread || 2412 (dev->toread ||
2675 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2676 s->syncing || s->expanding || 2414 s->syncing || s->expanding ||
2677 (s->replacing && want_replace(sh, disk_idx)) ||
2678 (s->failed >= 1 && fdev[0]->toread) || 2415 (s->failed >= 1 && fdev[0]->toread) ||
2679 (s->failed >= 2 && fdev[1]->toread) || 2416 (s->failed >= 2 && fdev[1]->toread) ||
2680 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2771,7 +2508,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
2771 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2508 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2772 * never LOCKED, so we don't need to test 'failed' directly. 2509 * never LOCKED, so we don't need to test 'failed' directly.
2773 */ 2510 */
2774static void handle_stripe_clean_event(struct r5conf *conf, 2511static void handle_stripe_clean_event(raid5_conf_t *conf,
2775 struct stripe_head *sh, int disks, struct bio **return_bi) 2512 struct stripe_head *sh, int disks, struct bio **return_bi)
2776{ 2513{
2777 int i; 2514 int i;
@@ -2781,63 +2518,53 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2781 if (sh->dev[i].written) { 2518 if (sh->dev[i].written) {
2782 dev = &sh->dev[i]; 2519 dev = &sh->dev[i];
2783 if (!test_bit(R5_LOCKED, &dev->flags) && 2520 if (!test_bit(R5_LOCKED, &dev->flags) &&
2784 (test_bit(R5_UPTODATE, &dev->flags) || 2521 test_bit(R5_UPTODATE, &dev->flags)) {
2785 test_bit(R5_Discard, &dev->flags))) {
2786 /* We can return any write requests */ 2522 /* We can return any write requests */
2787 struct bio *wbi, *wbi2; 2523 struct bio *wbi, *wbi2;
2524 int bitmap_end = 0;
2788 pr_debug("Return write for disc %d\n", i); 2525 pr_debug("Return write for disc %d\n", i);
2789 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2526 spin_lock_irq(&conf->device_lock);
2790 clear_bit(R5_UPTODATE, &dev->flags);
2791 wbi = dev->written; 2527 wbi = dev->written;
2792 dev->written = NULL; 2528 dev->written = NULL;
2793 while (wbi && wbi->bi_sector < 2529 while (wbi && wbi->bi_sector <
2794 dev->sector + STRIPE_SECTORS) { 2530 dev->sector + STRIPE_SECTORS) {
2795 wbi2 = r5_next_bio(wbi, dev->sector); 2531 wbi2 = r5_next_bio(wbi, dev->sector);
2796 if (!raid5_dec_bi_active_stripes(wbi)) { 2532 if (!raid5_dec_bi_phys_segments(wbi)) {
2797 md_write_end(conf->mddev); 2533 md_write_end(conf->mddev);
2798 wbi->bi_next = *return_bi; 2534 wbi->bi_next = *return_bi;
2799 *return_bi = wbi; 2535 *return_bi = wbi;
2800 } 2536 }
2801 wbi = wbi2; 2537 wbi = wbi2;
2802 } 2538 }
2803 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2539 if (dev->towrite == NULL)
2804 STRIPE_SECTORS, 2540 bitmap_end = 1;
2541 spin_unlock_irq(&conf->device_lock);
2542 if (bitmap_end)
2543 bitmap_endwrite(conf->mddev->bitmap,
2544 sh->sector,
2545 STRIPE_SECTORS,
2805 !test_bit(STRIPE_DEGRADED, &sh->state), 2546 !test_bit(STRIPE_DEGRADED, &sh->state),
2806 0); 2547 0);
2807 } 2548 }
2808 } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2549 }
2809 clear_bit(R5_Discard, &sh->dev[i].flags);
2810 2550
2811 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2551 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2812 if (atomic_dec_and_test(&conf->pending_full_writes)) 2552 if (atomic_dec_and_test(&conf->pending_full_writes))
2813 md_wakeup_thread(conf->mddev->thread); 2553 md_wakeup_thread(conf->mddev->thread);
2814} 2554}
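Whichever refcount helper is used (raid5_dec_bi_active_stripes on the left, raid5_dec_bi_phys_segments on the right), the clean-event loop walks the 'written' chain and returns a bio to its owner only when the last stripe reference drops. A compact sketch of that completion walk, with illustrative toy types, is:

    #include <stdio.h>

    struct done_bio {
        long long sector;
        int refs;               /* active-stripe references */
        struct done_bio *next;
    };

    /* Drop one reference per completed stripe write; a bio is handed back
     * (here: printed) only when the last reference goes away. */
    static void complete_written(struct done_bio *written)
    {
        struct done_bio *bi = written;

        while (bi) {
            struct done_bio *next = bi->next;

            if (--bi->refs == 0)
                printf("return bio at sector %lld\n", bi->sector);
            bi = next;
        }
    }

    int main(void)
    {
        struct done_bio b2 = { 24, 2, NULL };
        struct done_bio b1 = { 16, 1, &b2 };

        complete_written(&b1);  /* only the sector-16 bio is returned */
        return 0;
    }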
2815 2555
2816static void handle_stripe_dirtying(struct r5conf *conf, 2556static void handle_stripe_dirtying(raid5_conf_t *conf,
2817 struct stripe_head *sh, 2557 struct stripe_head *sh,
2818 struct stripe_head_state *s, 2558 struct stripe_head_state *s,
2819 int disks) 2559 int disks)
2820{ 2560{
2821 int rmw = 0, rcw = 0, i; 2561 int rmw = 0, rcw = 0, i;
2822 sector_t recovery_cp = conf->mddev->recovery_cp; 2562 if (conf->max_degraded == 2) {
2823 2563 /* RAID6 requires 'rcw' in current implementation
2824 /* RAID6 requires 'rcw' in current implementation. 2564 * Calculate the real rcw later - for now fake it
2825 * Otherwise, check whether resync is now happening or should start.
2826 * If yes, then the array is dirty (after unclean shutdown or
2827 * initial creation), so parity in some stripes might be inconsistent.
2828 * In this case, we need to always do reconstruct-write, to ensure
2829 * that in case of drive failure or read-error correction, we
2830 * generate correct data from the parity.
2831 */
2832 if (conf->max_degraded == 2 ||
2833 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
2834 /* Calculate the real rcw later - for now make it
2835 * look like rcw is cheaper 2565 * look like rcw is cheaper
2836 */ 2566 */
2837 rcw = 1; rmw = 2; 2567 rcw = 1; rmw = 2;
2838 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
2839 conf->max_degraded, (unsigned long long)recovery_cp,
2840 (unsigned long long)sh->sector);
2841 } else for (i = disks; i--; ) { 2568 } else for (i = disks; i--; ) {
2842 /* would I have to read this buffer for read_modify_write */ 2569 /* would I have to read this buffer for read_modify_write */
2843 struct r5dev *dev = &sh->dev[i]; 2570 struct r5dev *dev = &sh->dev[i];
@@ -2863,10 +2590,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2863 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2590 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2864 (unsigned long long)sh->sector, rmw, rcw); 2591 (unsigned long long)sh->sector, rmw, rcw);
2865 set_bit(STRIPE_HANDLE, &sh->state); 2592 set_bit(STRIPE_HANDLE, &sh->state);
2866 if (rmw < rcw && rmw > 0) { 2593 if (rmw < rcw && rmw > 0)
2867 /* prefer read-modify-write, but need to get some data */ 2594 /* prefer read-modify-write, but need to get some data */
2868 blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
2869 (unsigned long long)sh->sector, rmw);
2870 for (i = disks; i--; ) { 2595 for (i = disks; i--; ) {
2871 struct r5dev *dev = &sh->dev[i]; 2596 struct r5dev *dev = &sh->dev[i];
2872 if ((dev->towrite || i == sh->pd_idx) && 2597 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2877,7 +2602,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2877 if ( 2602 if (
2878 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2603 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2879 pr_debug("Read_old block " 2604 pr_debug("Read_old block "
2880 "%d for r-m-w\n", i); 2605 "%d for r-m-w\n", i);
2881 set_bit(R5_LOCKED, &dev->flags); 2606 set_bit(R5_LOCKED, &dev->flags);
2882 set_bit(R5_Wantread, &dev->flags); 2607 set_bit(R5_Wantread, &dev->flags);
2883 s->locked++; 2608 s->locked++;
@@ -2887,10 +2612,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2887 } 2612 }
2888 } 2613 }
2889 } 2614 }
2890 }
2891 if (rcw <= rmw && rcw > 0) { 2615 if (rcw <= rmw && rcw > 0) {
2892 /* want reconstruct write, but need to get some data */ 2616 /* want reconstruct write, but need to get some data */
2893 int qread =0;
2894 rcw = 0; 2617 rcw = 0;
2895 for (i = disks; i--; ) { 2618 for (i = disks; i--; ) {
2896 struct r5dev *dev = &sh->dev[i]; 2619 struct r5dev *dev = &sh->dev[i];
@@ -2909,17 +2632,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2909 set_bit(R5_LOCKED, &dev->flags); 2632 set_bit(R5_LOCKED, &dev->flags);
2910 set_bit(R5_Wantread, &dev->flags); 2633 set_bit(R5_Wantread, &dev->flags);
2911 s->locked++; 2634 s->locked++;
2912 qread++;
2913 } else { 2635 } else {
2914 set_bit(STRIPE_DELAYED, &sh->state); 2636 set_bit(STRIPE_DELAYED, &sh->state);
2915 set_bit(STRIPE_HANDLE, &sh->state); 2637 set_bit(STRIPE_HANDLE, &sh->state);
2916 } 2638 }
2917 } 2639 }
2918 } 2640 }
2919 if (rcw)
2920 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
2921 (unsigned long long)sh->sector,
2922 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
2923 } 2641 }
2924 /* now if nothing is locked, and if we have enough data, 2642 /* now if nothing is locked, and if we have enough data,
2925 * we can start a write request 2643 * we can start a write request
@@ -2937,7 +2655,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2937 schedule_reconstruction(sh, s, rcw == 0, 0); 2655 schedule_reconstruction(sh, s, rcw == 0, 0);
2938} 2656}
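The core of handle_stripe_dirtying(), unchanged across the hunk, is a cost comparison: count how many blocks a read-modify-write would have to read versus a reconstruct-write, then lock and read for the cheaper strategy (RAID6 always forces rcw). The sketch below is a simplified, standalone version of that counting — the per-device flags are reduced to three booleans, so it is an approximation of the kernel's conditions, not a copy of them:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_dev {
        bool towrite;       /* new data queued for this block */
        bool uptodate;      /* block already in the stripe cache */
        bool is_parity;
    };

    /* Count the extra reads each strategy would need. */
    static void count_reads(const struct toy_dev *dev, int disks,
                            int *rmw, int *rcw)
    {
        int i;

        *rmw = *rcw = 0;
        for (i = 0; i < disks; i++) {
            /* rmw must read old data for rewritten blocks plus old parity */
            if ((dev[i].towrite || dev[i].is_parity) && !dev[i].uptodate)
                (*rmw)++;
            /* rcw must read every data block that is not being overwritten */
            if (!dev[i].is_parity && !dev[i].towrite && !dev[i].uptodate)
                (*rcw)++;
        }
    }

    int main(void)
    {
        struct toy_dev dev[6] = {
            { true,  false, false },    /* data 0: being rewritten */
            { false, false, false },
            { false, false, false },
            { false, false, false },
            { false, false, false },
            { false, false, true  },    /* parity */
        };
        int rmw, rcw;

        count_reads(dev, 6, &rmw, &rcw);
        printf("rmw needs %d reads, rcw needs %d: choose %s\n",
               rmw, rcw, rmw < rcw ? "rmw" : "rcw");
        return 0;
    }

With one block rewritten out of five data blocks, rmw needs two reads and rcw needs four, so read-modify-write wins — which is why the narrow-write case prefers rmw.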
2939 2657
2940static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2658static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2941 struct stripe_head_state *s, int disks) 2659 struct stripe_head_state *s, int disks)
2942{ 2660{
2943 struct r5dev *dev = NULL; 2661 struct r5dev *dev = NULL;
@@ -2998,7 +2716,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2998 */ 2716 */
2999 set_bit(STRIPE_INSYNC, &sh->state); 2717 set_bit(STRIPE_INSYNC, &sh->state);
3000 else { 2718 else {
3001 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2719 conf->mddev->resync_mismatches += STRIPE_SECTORS;
3002 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2720 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3003 /* don't try to repair!! */ 2721 /* don't try to repair!! */
3004 set_bit(STRIPE_INSYNC, &sh->state); 2722 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3025,7 +2743,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3025} 2743}
3026 2744
3027 2745
3028static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2746static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
3029 struct stripe_head_state *s, 2747 struct stripe_head_state *s,
3030 int disks) 2748 int disks)
3031{ 2749{
@@ -3150,7 +2868,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3150 */ 2868 */
3151 } 2869 }
3152 } else { 2870 } else {
3153 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2871 conf->mddev->resync_mismatches += STRIPE_SECTORS;
3154 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2872 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3155 /* don't try to repair!! */ 2873 /* don't try to repair!! */
3156 set_bit(STRIPE_INSYNC, &sh->state); 2874 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3188,7 +2906,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3188 } 2906 }
3189} 2907}
3190 2908
3191static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 2909static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
3192{ 2910{
3193 int i; 2911 int i;
3194 2912
@@ -3241,33 +2959,40 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3241 2959
3242 } 2960 }
3243 /* done submitting copies, wait for them to complete */ 2961 /* done submitting copies, wait for them to complete */
3244 async_tx_quiesce(&tx); 2962 if (tx) {
2963 async_tx_ack(tx);
2964 dma_wait_for_async_tx(tx);
2965 }
3245} 2966}
3246 2967
2968
3247/* 2969/*
3248 * handle_stripe - do things to a stripe. 2970 * handle_stripe - do things to a stripe.
3249 * 2971 *
3250 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 2972 * We lock the stripe and then examine the state of various bits
3251 * state of various bits to see what needs to be done. 2973 * to see what needs to be done.
3252 * Possible results: 2974 * Possible results:
3253 * return some read requests which now have data 2975 * return some read request which now have data
3254 * return some write requests which are safely on storage 2976 * return some write requests which are safely on disc
3255 * schedule a read on some buffers 2977 * schedule a read on some buffers
3256 * schedule a write of some buffers 2978 * schedule a write of some buffers
3257 * return confirmation of parity correctness 2979 * return confirmation of parity correctness
3258 * 2980 *
2981 * buffers are taken off read_list or write_list, and bh_cache buffers
2982 * get BH_Lock set before the stripe lock is released.
2983 *
3259 */ 2984 */
3260 2985
3261static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 2986static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3262{ 2987{
3263 struct r5conf *conf = sh->raid_conf; 2988 raid5_conf_t *conf = sh->raid_conf;
3264 int disks = sh->disks; 2989 int disks = sh->disks;
3265 struct r5dev *dev; 2990 struct r5dev *dev;
3266 int i; 2991 int i;
3267 int do_recovery = 0;
3268 2992
3269 memset(s, 0, sizeof(*s)); 2993 memset(s, 0, sizeof(*s));
3270 2994
2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3271 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3272 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3273 s->failed_num[0] = -1; 2998 s->failed_num[0] = -1;
@@ -3275,8 +3000,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3275 3000
3276 /* Now to look around and see what can be done */ 3001 /* Now to look around and see what can be done */
3277 rcu_read_lock(); 3002 rcu_read_lock();
3003 spin_lock_irq(&conf->device_lock);
3278 for (i=disks; i--; ) { 3004 for (i=disks; i--; ) {
3279 struct md_rdev *rdev; 3005 mdk_rdev_t *rdev;
3280 sector_t first_bad; 3006 sector_t first_bad;
3281 int bad_sectors; 3007 int bad_sectors;
3282 int is_bad = 0; 3008 int is_bad = 0;
@@ -3284,8 +3010,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3284 dev = &sh->dev[i]; 3010 dev = &sh->dev[i];
3285 3011
3286 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3287 i, dev->flags, 3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3288 dev->toread, dev->towrite, dev->written);
3289 /* maybe we can reply to a read 3014 /* maybe we can reply to a read
3290 * 3015 *
3291 * new wantfill requests are only permitted while 3016 * new wantfill requests are only permitted while
@@ -3316,23 +3041,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3316 } 3041 }
3317 if (dev->written) 3042 if (dev->written)
3318 s->written++; 3043 s->written++;
3319 /* Prefer to use the replacement for reads, but only 3044 rdev = rcu_dereference(conf->disks[i].rdev);
3320 * if it is recovered enough and has no bad blocks.
3321 */
3322 rdev = rcu_dereference(conf->disks[i].replacement);
3323 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3324 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3325 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3326 &first_bad, &bad_sectors))
3327 set_bit(R5_ReadRepl, &dev->flags);
3328 else {
3329 if (rdev)
3330 set_bit(R5_NeedReplace, &dev->flags);
3331 rdev = rcu_dereference(conf->disks[i].rdev);
3332 clear_bit(R5_ReadRepl, &dev->flags);
3333 }
3334 if (rdev && test_bit(Faulty, &rdev->flags))
3335 rdev = NULL;
3336 if (rdev) { 3045 if (rdev) {
3337 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3338 &first_bad, &bad_sectors); 3047 &first_bad, &bad_sectors);
@@ -3351,8 +3060,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3351 /* Not in-sync */; 3060 /* Not in-sync */;
3352 else if (is_bad) { 3061 else if (is_bad) {
3353 /* also not in-sync */ 3062 /* also not in-sync */
3354 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3355 test_bit(R5_UPTODATE, &dev->flags)) {
3356 /* treat as in-sync, but with a read error 3064 /* treat as in-sync, but with a read error
3357 * which we can now try to correct 3065 * which we can now try to correct
3358 */ 3066 */
@@ -3361,50 +3069,26 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3361 } 3069 }
3362 } else if (test_bit(In_sync, &rdev->flags)) 3070 } else if (test_bit(In_sync, &rdev->flags))
3363 set_bit(R5_Insync, &dev->flags); 3071 set_bit(R5_Insync, &dev->flags);
3364 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3072 else if (!test_bit(Faulty, &rdev->flags)) {
3365 /* in sync if before recovery_offset */ 3073 /* in sync if before recovery_offset */
3366 set_bit(R5_Insync, &dev->flags); 3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3367 else if (test_bit(R5_UPTODATE, &dev->flags) && 3075 set_bit(R5_Insync, &dev->flags);
3368 test_bit(R5_Expanded, &dev->flags)) 3076 }
3369 /* If we've reshaped into here, we assume it is Insync. 3077 if (test_bit(R5_WriteError, &dev->flags)) {
3370 * We will shortly update recovery_offset to make 3078 clear_bit(R5_Insync, &dev->flags);
3371 * it official. 3079 if (!test_bit(Faulty, &rdev->flags)) {
3372 */
3373 set_bit(R5_Insync, &dev->flags);
3374
3375 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3376 /* This flag does not apply to '.replacement'
3377 * only to .rdev, so make sure to check that*/
3378 struct md_rdev *rdev2 = rcu_dereference(
3379 conf->disks[i].rdev);
3380 if (rdev2 == rdev)
3381 clear_bit(R5_Insync, &dev->flags);
3382 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3383 s->handle_bad_blocks = 1; 3080 s->handle_bad_blocks = 1;
3384 atomic_inc(&rdev2->nr_pending); 3081 atomic_inc(&rdev->nr_pending);
3385 } else 3082 } else
3386 clear_bit(R5_WriteError, &dev->flags); 3083 clear_bit(R5_WriteError, &dev->flags);
3387 } 3084 }
3388 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3085 if (test_bit(R5_MadeGood, &dev->flags)) {
3389 /* This flag does not apply to '.replacement' 3086 if (!test_bit(Faulty, &rdev->flags)) {
3390 * only to .rdev, so make sure to check that*/
3391 struct md_rdev *rdev2 = rcu_dereference(
3392 conf->disks[i].rdev);
3393 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3394 s->handle_bad_blocks = 1; 3087 s->handle_bad_blocks = 1;
3395 atomic_inc(&rdev2->nr_pending); 3088 atomic_inc(&rdev->nr_pending);
3396 } else 3089 } else
3397 clear_bit(R5_MadeGood, &dev->flags); 3090 clear_bit(R5_MadeGood, &dev->flags);
3398 } 3091 }
3399 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3400 struct md_rdev *rdev2 = rcu_dereference(
3401 conf->disks[i].replacement);
3402 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3403 s->handle_bad_blocks = 1;
3404 atomic_inc(&rdev2->nr_pending);
3405 } else
3406 clear_bit(R5_MadeGoodRepl, &dev->flags);
3407 }
3408 if (!test_bit(R5_Insync, &dev->flags)) { 3092 if (!test_bit(R5_Insync, &dev->flags)) {
3409 /* The ReadError flag will just be confusing now */ 3093 /* The ReadError flag will just be confusing now */
3410 clear_bit(R5_ReadError, &dev->flags); 3094 clear_bit(R5_ReadError, &dev->flags);
@@ -3416,33 +3100,16 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3416 if (s->failed < 2) 3100 if (s->failed < 2)
3417 s->failed_num[s->failed] = i; 3101 s->failed_num[s->failed] = i;
3418 s->failed++; 3102 s->failed++;
3419 if (rdev && !test_bit(Faulty, &rdev->flags))
3420 do_recovery = 1;
3421 } 3103 }
3422 } 3104 }
3423 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3105 spin_unlock_irq(&conf->device_lock);
3424 /* If there is a failed device being replaced,
3425 * we must be recovering.
3426 * else if we are after recovery_cp, we must be syncing
3427 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
3428 * else we can only be replacing
3429 * sync and recovery both need to read all devices, and so
3430 * use the same flag.
3431 */
3432 if (do_recovery ||
3433 sh->sector >= conf->mddev->recovery_cp ||
3434 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
3435 s->syncing = 1;
3436 else
3437 s->replacing = 1;
3438 }
3439 rcu_read_unlock(); 3106 rcu_read_unlock();
3440} 3107}
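The largest behavioural difference in analyse_stripe() is on the left column: reads prefer the replacement device, but only when it has been rebuilt past this stripe and has no bad block covering it; otherwise the original rdev is used. A hedged sketch of that selection rule, with an invented candidate-device struct standing in for the rdev fields, is:

    #include <stdbool.h>
    #include <stdio.h>

    struct cand_dev {
        bool present;
        bool faulty;
        long long recovery_offset;  /* rebuilt up to this sector (exclusive) */
        bool has_badblock_here;
    };

    /* Pick the device to read a stripe from, preferring a replacement that
     * has already been rebuilt past this stripe. */
    static const char *pick_read_source(const struct cand_dev *repl,
                                        const struct cand_dev *rdev,
                                        long long sector, int sectors)
    {
        if (repl->present && !repl->faulty &&
            repl->recovery_offset >= sector + sectors &&
            !repl->has_badblock_here)
            return "replacement";
        if (rdev->present && !rdev->faulty)
            return "rdev";
        return "none";
    }

    int main(void)
    {
        struct cand_dev repl = { true, false, 1024, false };
        struct cand_dev rdev = { true, false, -1,   false };

        printf("read %s\n", pick_read_source(&repl, &rdev, 512, 8));
        printf("read %s\n", pick_read_source(&repl, &rdev, 2048, 8));
        return 0;
    }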
3441 3108
3442static void handle_stripe(struct stripe_head *sh) 3109static void handle_stripe(struct stripe_head *sh)
3443{ 3110{
3444 struct stripe_head_state s; 3111 struct stripe_head_state s;
3445 struct r5conf *conf = sh->raid_conf; 3112 raid5_conf_t *conf = sh->raid_conf;
3446 int i; 3113 int i;
3447 int prexor; 3114 int prexor;
3448 int disks = sh->disks; 3115 int disks = sh->disks;
@@ -3477,7 +3144,7 @@ static void handle_stripe(struct stripe_head *sh)
3477 3144
3478 if (unlikely(s.blocked_rdev)) { 3145 if (unlikely(s.blocked_rdev)) {
3479 if (s.syncing || s.expanding || s.expanded || 3146 if (s.syncing || s.expanding || s.expanded ||
3480 s.replacing || s.to_write || s.written) { 3147 s.to_write || s.written) {
3481 set_bit(STRIPE_HANDLE, &sh->state); 3148 set_bit(STRIPE_HANDLE, &sh->state);
3482 goto finish; 3149 goto finish;
3483 } 3150 }
@@ -3503,10 +3170,40 @@ static void handle_stripe(struct stripe_head *sh)
3503 sh->reconstruct_state = 0; 3170 sh->reconstruct_state = 0;
3504 if (s.to_read+s.to_write+s.written) 3171 if (s.to_read+s.to_write+s.written)
3505 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3172 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3506 if (s.syncing + s.replacing) 3173 if (s.syncing)
3507 handle_failed_sync(conf, sh, &s); 3174 handle_failed_sync(conf, sh, &s);
3508 } 3175 }
3509 3176
3177 /*
3178 * might be able to return some write requests if the parity blocks
3179 * are safe, or on a failed drive
3180 */
3181 pdev = &sh->dev[sh->pd_idx];
3182 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3183 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3184 qdev = &sh->dev[sh->qd_idx];
3185 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3186 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3187 || conf->level < 6;
3188
3189 if (s.written &&
3190 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3191 && !test_bit(R5_LOCKED, &pdev->flags)
3192 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3193 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3194 && !test_bit(R5_LOCKED, &qdev->flags)
3195 && test_bit(R5_UPTODATE, &qdev->flags)))))
3196 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3197
3198 /* Now we might consider reading some blocks, either to check/generate
3199 * parity, or to satisfy requests
3200 * or to load a block that is being partially written.
3201 */
3202 if (s.to_read || s.non_overwrite
3203 || (conf->level == 6 && s.to_write && s.failed)
3204 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3205 handle_stripe_fill(sh, &s, disks);
3206
3510 /* Now we check to see if any write operations have recently 3207 /* Now we check to see if any write operations have recently
3511 * completed 3208 * completed
3512 */ 3209 */
@@ -3520,11 +3217,9 @@ static void handle_stripe(struct stripe_head *sh)
3520 /* All the 'written' buffers and the parity block are ready to 3217 /* All the 'written' buffers and the parity block are ready to
3521 * be written back to disk 3218 * be written back to disk
3522 */ 3219 */
3523 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3220 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3524 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
3525 BUG_ON(sh->qd_idx >= 0 && 3221 BUG_ON(sh->qd_idx >= 0 &&
3526 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3222 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3527 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
3528 for (i = disks; i--; ) { 3223 for (i = disks; i--; ) {
3529 struct r5dev *dev = &sh->dev[i]; 3224 struct r5dev *dev = &sh->dev[i];
3530 if (test_bit(R5_LOCKED, &dev->flags) && 3225 if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3544,40 +3239,6 @@ static void handle_stripe(struct stripe_head *sh)
3544 s.dec_preread_active = 1; 3239 s.dec_preread_active = 1;
3545 } 3240 }
3546 3241
3547 /*
3548 * might be able to return some write requests if the parity blocks
3549 * are safe, or on a failed drive
3550 */
3551 pdev = &sh->dev[sh->pd_idx];
3552 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3553 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3554 qdev = &sh->dev[sh->qd_idx];
3555 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3556 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3557 || conf->level < 6;
3558
3559 if (s.written &&
3560 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3561 && !test_bit(R5_LOCKED, &pdev->flags)
3562 && (test_bit(R5_UPTODATE, &pdev->flags) ||
3563 test_bit(R5_Discard, &pdev->flags))))) &&
3564 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3565 && !test_bit(R5_LOCKED, &qdev->flags)
3566 && (test_bit(R5_UPTODATE, &qdev->flags) ||
3567 test_bit(R5_Discard, &qdev->flags))))))
3568 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3569
3570 /* Now we might consider reading some blocks, either to check/generate
3571 * parity, or to satisfy requests
3572 * or to load a block that is being partially written.
3573 */
3574 if (s.to_read || s.non_overwrite
3575 || (conf->level == 6 && s.to_write && s.failed)
3576 || (s.syncing && (s.uptodate + s.compute < disks))
3577 || s.replacing
3578 || s.expanding)
3579 handle_stripe_fill(sh, &s, disks);
3580
3581 /* Now to consider new write requests and what else, if anything 3242 /* Now to consider new write requests and what else, if anything
3582 * should be read. We do not handle new writes when: 3243 * should be read. We do not handle new writes when:
3583 * 1/ A 'write' operation (copy+xor) is already in flight. 3244 * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -3602,20 +3263,7 @@ static void handle_stripe(struct stripe_head *sh)
3602 handle_parity_checks5(conf, sh, &s, disks); 3263 handle_parity_checks5(conf, sh, &s, disks);
3603 } 3264 }
3604 3265
3605 if (s.replacing && s.locked == 0 3266 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3606 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3607 /* Write out to replacement devices where possible */
3608 for (i = 0; i < conf->raid_disks; i++)
3609 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3610 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3611 set_bit(R5_WantReplace, &sh->dev[i].flags);
3612 set_bit(R5_LOCKED, &sh->dev[i].flags);
3613 s.locked++;
3614 }
3615 set_bit(STRIPE_INSYNC, &sh->state);
3616 }
3617 if ((s.syncing || s.replacing) && s.locked == 0 &&
3618 test_bit(STRIPE_INSYNC, &sh->state)) {
3619 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3267 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3620 clear_bit(STRIPE_SYNCING, &sh->state); 3268 clear_bit(STRIPE_SYNCING, &sh->state);
3621 } 3269 }
@@ -3692,22 +3340,12 @@ static void handle_stripe(struct stripe_head *sh)
3692 3340
3693finish: 3341finish:
3694 /* wait for this device to become unblocked */ 3342 /* wait for this device to become unblocked */
3695 if (unlikely(s.blocked_rdev)) { 3343 if (conf->mddev->external && unlikely(s.blocked_rdev))
3696 if (conf->mddev->external) 3344 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3697 md_wait_for_blocked_rdev(s.blocked_rdev,
3698 conf->mddev);
3699 else
3700 /* Internal metadata will immediately
3701 * be written by raid5d, so we don't
3702 * need to wait here.
3703 */
3704 rdev_dec_pending(s.blocked_rdev,
3705 conf->mddev);
3706 }
3707 3345
3708 if (s.handle_bad_blocks) 3346 if (s.handle_bad_blocks)
3709 for (i = disks; i--; ) { 3347 for (i = disks; i--; ) {
3710 struct md_rdev *rdev; 3348 mdk_rdev_t *rdev;
3711 struct r5dev *dev = &sh->dev[i]; 3349 struct r5dev *dev = &sh->dev[i];
3712 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3350 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3713 /* We own a safe reference to the rdev */ 3351 /* We own a safe reference to the rdev */
@@ -3720,16 +3358,7 @@ finish:
3720 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3358 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3721 rdev = conf->disks[i].rdev; 3359 rdev = conf->disks[i].rdev;
3722 rdev_clear_badblocks(rdev, sh->sector, 3360 rdev_clear_badblocks(rdev, sh->sector,
3723 STRIPE_SECTORS, 0); 3361 STRIPE_SECTORS);
3724 rdev_dec_pending(rdev, conf->mddev);
3725 }
3726 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3727 rdev = conf->disks[i].replacement;
3728 if (!rdev)
3729 /* rdev have been moved down */
3730 rdev = conf->disks[i].rdev;
3731 rdev_clear_badblocks(rdev, sh->sector,
3732 STRIPE_SECTORS, 0);
3733 rdev_dec_pending(rdev, conf->mddev); 3362 rdev_dec_pending(rdev, conf->mddev);
3734 } 3363 }
3735 } 3364 }
@@ -3755,7 +3384,7 @@ finish:
3755 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3384 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3756} 3385}
3757 3386
3758static void raid5_activate_delayed(struct r5conf *conf) 3387static void raid5_activate_delayed(raid5_conf_t *conf)
3759{ 3388{
3760 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3389 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3761 while (!list_empty(&conf->delayed_list)) { 3390 while (!list_empty(&conf->delayed_list)) {
@@ -3771,7 +3400,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
3771 } 3400 }
3772} 3401}
3773 3402
3774static void activate_bit_delay(struct r5conf *conf) 3403static void activate_bit_delay(raid5_conf_t *conf)
3775{ 3404{
3776 /* device_lock is held */ 3405 /* device_lock is held */
3777 struct list_head head; 3406 struct list_head head;
@@ -3785,9 +3414,9 @@ static void activate_bit_delay(struct r5conf *conf)
3785 } 3414 }
3786} 3415}
3787 3416
3788int md_raid5_congested(struct mddev *mddev, int bits) 3417int md_raid5_congested(mddev_t *mddev, int bits)
3789{ 3418{
3790 struct r5conf *conf = mddev->private; 3419 raid5_conf_t *conf = mddev->private;
3791 3420
3792 /* No difference between reads and writes. Just check 3421 /* No difference between reads and writes. Just check
3793 * how busy the stripe_cache is 3422 * how busy the stripe_cache is
@@ -3806,7 +3435,7 @@ EXPORT_SYMBOL_GPL(md_raid5_congested);
3806 3435
3807static int raid5_congested(void *data, int bits) 3436static int raid5_congested(void *data, int bits)
3808{ 3437{
3809 struct mddev *mddev = data; 3438 mddev_t *mddev = data;
3810 3439
3811 return mddev_congested(mddev, bits) || 3440 return mddev_congested(mddev, bits) ||
3812 md_raid5_congested(mddev, bits); 3441 md_raid5_congested(mddev, bits);
@@ -3819,7 +3448,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3819 struct bvec_merge_data *bvm, 3448 struct bvec_merge_data *bvm,
3820 struct bio_vec *biovec) 3449 struct bio_vec *biovec)
3821{ 3450{
3822 struct mddev *mddev = q->queuedata; 3451 mddev_t *mddev = q->queuedata;
3823 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3452 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3824 int max; 3453 int max;
3825 unsigned int chunk_sectors = mddev->chunk_sectors; 3454 unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3839,7 +3468,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3839} 3468}
3840 3469
3841 3470
3842static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3471static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3843{ 3472{
3844 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3473 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3845 unsigned int chunk_sectors = mddev->chunk_sectors; 3474 unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3855,7 +3484,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3855 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3484 * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
3856 * later sampled by raid5d. 3485 * later sampled by raid5d.
3857 */ 3486 */
3858static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3487static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3859{ 3488{
3860 unsigned long flags; 3489 unsigned long flags;
3861 3490
@@ -3869,7 +3498,7 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3869} 3498}
3870 3499
3871 3500
3872static struct bio *remove_bio_from_retry(struct r5conf *conf) 3501static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3873{ 3502{
3874 struct bio *bi; 3503 struct bio *bi;
3875 3504
@@ -3886,7 +3515,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3886 * this sets the active stripe count to 1 and the processed 3515 * this sets the active stripe count to 1 and the processed
3887 * stripe count to zero (upper 8 bits) 3516 * stripe count to zero (upper 8 bits)
3888 */ 3517 */
3889 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3518 bi->bi_phys_segments = 1; /* biased count of active stripes */
3890 } 3519 }
3891 3520
3892 return bi; 3521 return bi;
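[Editor's note] Both sides of this hunk overload a bio field as a biased count of active stripes: the field starts at 1 so the submitting thread keeps the bio alive while stripes are attached, and the request completes once the count (including that bias) returns to zero. A small stand-alone sketch of the pattern; the helper names are hypothetical, not the raid5 accessors:

#include <stdatomic.h>
#include <stdbool.h>

struct request_ctx {
        atomic_int active_stripes;      /* plays the role of bi_phys_segments */
};

static void request_start(struct request_ctx *ctx)
{
        atomic_store(&ctx->active_stripes, 1);      /* the bias */
}

static void request_get(struct request_ctx *ctx)
{
        atomic_fetch_add(&ctx->active_stripes, 1);  /* one more stripe holds it */
}

static bool request_put(struct request_ctx *ctx)
{
        /* true when the last reference, including the bias, is gone */
        return atomic_fetch_sub(&ctx->active_stripes, 1) == 1;
}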
@@ -3902,10 +3531,10 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3902static void raid5_align_endio(struct bio *bi, int error) 3531static void raid5_align_endio(struct bio *bi, int error)
3903{ 3532{
3904 struct bio* raid_bi = bi->bi_private; 3533 struct bio* raid_bi = bi->bi_private;
3905 struct mddev *mddev; 3534 mddev_t *mddev;
3906 struct r5conf *conf; 3535 raid5_conf_t *conf;
3907 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3536 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3908 struct md_rdev *rdev; 3537 mdk_rdev_t *rdev;
3909 3538
3910 bio_put(bi); 3539 bio_put(bi);
3911 3540
@@ -3917,8 +3546,6 @@ static void raid5_align_endio(struct bio *bi, int error)
3917 rdev_dec_pending(rdev, conf->mddev); 3546 rdev_dec_pending(rdev, conf->mddev);
3918 3547
3919 if (!error && uptodate) { 3548 if (!error && uptodate) {
3920 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
3921 raid_bi, 0);
3922 bio_endio(raid_bi, 0); 3549 bio_endio(raid_bi, 0);
3923 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3550 if (atomic_dec_and_test(&conf->active_aligned_reads))
3924 wake_up(&conf->wait_for_stripe); 3551 wake_up(&conf->wait_for_stripe);
@@ -3951,13 +3578,12 @@ static int bio_fits_rdev(struct bio *bi)
3951} 3578}
3952 3579
3953 3580
3954static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3581static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3955{ 3582{
3956 struct r5conf *conf = mddev->private; 3583 raid5_conf_t *conf = mddev->private;
3957 int dd_idx; 3584 int dd_idx;
3958 struct bio* align_bi; 3585 struct bio* align_bi;
3959 struct md_rdev *rdev; 3586 mdk_rdev_t *rdev;
3960 sector_t end_sector;
3961 3587
3962 if (!in_chunk_boundary(mddev, raid_bio)) { 3588 if (!in_chunk_boundary(mddev, raid_bio)) {
3963 pr_debug("chunk_aligned_read : non aligned\n"); 3589 pr_debug("chunk_aligned_read : non aligned\n");
@@ -3982,19 +3608,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3982 0, 3608 0,
3983 &dd_idx, NULL); 3609 &dd_idx, NULL);
3984 3610
3985 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3986 rcu_read_lock(); 3611 rcu_read_lock();
3987 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3612 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3988 if (!rdev || test_bit(Faulty, &rdev->flags) || 3613 if (rdev && test_bit(In_sync, &rdev->flags)) {
3989 rdev->recovery_offset < end_sector) {
3990 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3991 if (rdev &&
3992 (test_bit(Faulty, &rdev->flags) ||
3993 !(test_bit(In_sync, &rdev->flags) ||
3994 rdev->recovery_offset >= end_sector)))
3995 rdev = NULL;
3996 }
3997 if (rdev) {
3998 sector_t first_bad; 3614 sector_t first_bad;
3999 int bad_sectors; 3615 int bad_sectors;
4000 3616
@@ -4003,6 +3619,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4003 raid_bio->bi_next = (void*)rdev; 3619 raid_bio->bi_next = (void*)rdev;
4004 align_bi->bi_bdev = rdev->bdev; 3620 align_bi->bi_bdev = rdev->bdev;
4005 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3621 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3622 align_bi->bi_sector += rdev->data_offset;
4006 3623
4007 if (!bio_fits_rdev(align_bi) || 3624 if (!bio_fits_rdev(align_bi) ||
4008 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3625 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
@@ -4013,19 +3630,13 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4013 return 0; 3630 return 0;
4014 } 3631 }
4015 3632
4016 /* No reshape active, so we can trust rdev->data_offset */
4017 align_bi->bi_sector += rdev->data_offset;
4018
4019 spin_lock_irq(&conf->device_lock); 3633 spin_lock_irq(&conf->device_lock);
4020 wait_event_lock_irq(conf->wait_for_stripe, 3634 wait_event_lock_irq(conf->wait_for_stripe,
4021 conf->quiesce == 0, 3635 conf->quiesce == 0,
4022 conf->device_lock); 3636 conf->device_lock, /* nothing */);
4023 atomic_inc(&conf->active_aligned_reads); 3637 atomic_inc(&conf->active_aligned_reads);
4024 spin_unlock_irq(&conf->device_lock); 3638 spin_unlock_irq(&conf->device_lock);
4025 3639
4026 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4027 align_bi, disk_devt(mddev->gendisk),
4028 raid_bio->bi_sector);
4029 generic_make_request(align_bi); 3640 generic_make_request(align_bi);
4030 return 1; 3641 return 1;
4031 } else { 3642 } else {
@@ -4045,7 +3656,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4045 * head of the hold_list has changed, i.e. the head was promoted to the 3656 * head of the hold_list has changed, i.e. the head was promoted to the
4046 * handle_list. 3657 * handle_list.
4047 */ 3658 */
4048static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3659static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
4049{ 3660{
4050 struct stripe_head *sh; 3661 struct stripe_head *sh;
4051 3662
@@ -4088,160 +3699,20 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
4088 return sh; 3699 return sh;
4089} 3700}
4090 3701
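[Editor's note] __get_priority_stripe() above implements a two-queue policy: urgent stripes sit on handle_list, stripes that would need a read-modify-write preread wait on hold_list and may only be bypassed a limited number of times. A generic sketch of that flavour of scheduling, not the md data structures; names are illustrative:

#include <stddef.h>

struct work { struct work *next; };

struct stripe_sched {
        struct work *handle_list;   /* urgent: full-stripe writes, reads */
        struct work *hold_list;     /* delayed: would need preread */
        int bypass_count;           /* times the hold list was passed over */
        int bypass_threshold;
};

static struct work *pick_next(struct stripe_sched *s)
{
        struct work *w = NULL;

        if (s->handle_list &&
            !(s->hold_list && s->bypass_count > s->bypass_threshold)) {
                /* service urgent work, but count how long the hold list waits */
                w = s->handle_list;
                s->handle_list = w->next;
                if (s->hold_list)
                        s->bypass_count++;
        } else if (s->hold_list) {
                /* promote the hold list: nothing urgent, or bypassed too often */
                w = s->hold_list;
                s->hold_list = w->next;
                s->bypass_count = 0;
        }
        return w;
}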
4091struct raid5_plug_cb { 3702static int make_request(mddev_t *mddev, struct bio * bi)
4092 struct blk_plug_cb cb;
4093 struct list_head list;
4094};
4095
4096static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4097{
4098 struct raid5_plug_cb *cb = container_of(
4099 blk_cb, struct raid5_plug_cb, cb);
4100 struct stripe_head *sh;
4101 struct mddev *mddev = cb->cb.data;
4102 struct r5conf *conf = mddev->private;
4103 int cnt = 0;
4104
4105 if (cb->list.next && !list_empty(&cb->list)) {
4106 spin_lock_irq(&conf->device_lock);
4107 while (!list_empty(&cb->list)) {
4108 sh = list_first_entry(&cb->list, struct stripe_head, lru);
4109 list_del_init(&sh->lru);
4110 /*
4111 * avoid race release_stripe_plug() sees
4112 * STRIPE_ON_UNPLUG_LIST clear but the stripe
4113 * is still in our list
4114 */
4115 smp_mb__before_clear_bit();
4116 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4117 __release_stripe(conf, sh);
4118 cnt++;
4119 }
4120 spin_unlock_irq(&conf->device_lock);
4121 }
4122 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4123 kfree(cb);
4124}
4125
4126static void release_stripe_plug(struct mddev *mddev,
4127 struct stripe_head *sh)
4128{
4129 struct blk_plug_cb *blk_cb = blk_check_plugged(
4130 raid5_unplug, mddev,
4131 sizeof(struct raid5_plug_cb));
4132 struct raid5_plug_cb *cb;
4133
4134 if (!blk_cb) {
4135 release_stripe(sh);
4136 return;
4137 }
4138
4139 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4140
4141 if (cb->list.next == NULL)
4142 INIT_LIST_HEAD(&cb->list);
4143
4144 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4145 list_add_tail(&sh->lru, &cb->list);
4146 else
4147 release_stripe(sh);
4148}
4149
4150static void make_discard_request(struct mddev *mddev, struct bio *bi)
4151{
4152 struct r5conf *conf = mddev->private;
4153 sector_t logical_sector, last_sector;
4154 struct stripe_head *sh;
4155 int remaining;
4156 int stripe_sectors;
4157
4158 if (mddev->reshape_position != MaxSector)
4159 /* Skip discard while reshape is happening */
4160 return;
4161
4162 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4163 last_sector = bi->bi_sector + (bi->bi_size>>9);
4164
4165 bi->bi_next = NULL;
4166 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4167
4168 stripe_sectors = conf->chunk_sectors *
4169 (conf->raid_disks - conf->max_degraded);
4170 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4171 stripe_sectors);
4172 sector_div(last_sector, stripe_sectors);
4173
4174 logical_sector *= conf->chunk_sectors;
4175 last_sector *= conf->chunk_sectors;
4176
4177 for (; logical_sector < last_sector;
4178 logical_sector += STRIPE_SECTORS) {
4179 DEFINE_WAIT(w);
4180 int d;
4181 again:
4182 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
4183 prepare_to_wait(&conf->wait_for_overlap, &w,
4184 TASK_UNINTERRUPTIBLE);
4185 spin_lock_irq(&sh->stripe_lock);
4186 for (d = 0; d < conf->raid_disks; d++) {
4187 if (d == sh->pd_idx || d == sh->qd_idx)
4188 continue;
4189 if (sh->dev[d].towrite || sh->dev[d].toread) {
4190 set_bit(R5_Overlap, &sh->dev[d].flags);
4191 spin_unlock_irq(&sh->stripe_lock);
4192 release_stripe(sh);
4193 schedule();
4194 goto again;
4195 }
4196 }
4197 finish_wait(&conf->wait_for_overlap, &w);
4198 for (d = 0; d < conf->raid_disks; d++) {
4199 if (d == sh->pd_idx || d == sh->qd_idx)
4200 continue;
4201 sh->dev[d].towrite = bi;
4202 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4203 raid5_inc_bi_active_stripes(bi);
4204 }
4205 spin_unlock_irq(&sh->stripe_lock);
4206 if (conf->mddev->bitmap) {
4207 for (d = 0;
4208 d < conf->raid_disks - conf->max_degraded;
4209 d++)
4210 bitmap_startwrite(mddev->bitmap,
4211 sh->sector,
4212 STRIPE_SECTORS,
4213 0);
4214 sh->bm_seq = conf->seq_flush + 1;
4215 set_bit(STRIPE_BIT_DELAY, &sh->state);
4216 }
4217
4218 set_bit(STRIPE_HANDLE, &sh->state);
4219 clear_bit(STRIPE_DELAYED, &sh->state);
4220 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4221 atomic_inc(&conf->preread_active_stripes);
4222 release_stripe_plug(mddev, sh);
4223 }
4224
4225 remaining = raid5_dec_bi_active_stripes(bi);
4226 if (remaining == 0) {
4227 md_write_end(mddev);
4228 bio_endio(bi, 0);
4229 }
4230}
4231
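[Editor's note] make_discard_request() above shrinks the discard range inward so only stripes that are fully covered get discarded: the start is rounded up and the end rounded down to a multiple of the full-stripe size. A minimal sketch of that rounding in array-sector space (illustrative only, not a transcription of the kernel arithmetic):

#include <stdint.h>

struct range { uint64_t first; uint64_t last; };   /* [first, last) in sectors */

static struct range clamp_to_full_stripes(uint64_t start, uint64_t end,
                                          unsigned int chunk_sectors,
                                          unsigned int data_disks)
{
        uint64_t stripe_sectors = (uint64_t)chunk_sectors * data_disks;
        struct range r;

        /* round the start up, the end down, to whole stripes */
        r.first = ((start + stripe_sectors - 1) / stripe_sectors) * stripe_sectors;
        r.last  = (end / stripe_sectors) * stripe_sectors;
        if (r.last < r.first)
                r.last = r.first;       /* no stripe is fully covered */
        return r;
}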
4232static void make_request(struct mddev *mddev, struct bio * bi)
4233{ 3703{
4234 struct r5conf *conf = mddev->private; 3704 raid5_conf_t *conf = mddev->private;
4235 int dd_idx; 3705 int dd_idx;
4236 sector_t new_sector; 3706 sector_t new_sector;
4237 sector_t logical_sector, last_sector; 3707 sector_t logical_sector, last_sector;
4238 struct stripe_head *sh; 3708 struct stripe_head *sh;
4239 const int rw = bio_data_dir(bi); 3709 const int rw = bio_data_dir(bi);
4240 int remaining; 3710 int remaining;
3711 int plugged;
4241 3712
4242 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3713 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
4243 md_flush_request(mddev, bi); 3714 md_flush_request(mddev, bi);
4244 return; 3715 return 0;
4245 } 3716 }
4246 3717
4247 md_write_start(mddev, bi); 3718 md_write_start(mddev, bi);
@@ -4249,24 +3720,22 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4249 if (rw == READ && 3720 if (rw == READ &&
4250 mddev->reshape_position == MaxSector && 3721 mddev->reshape_position == MaxSector &&
4251 chunk_aligned_read(mddev,bi)) 3722 chunk_aligned_read(mddev,bi))
4252 return; 3723 return 0;
4253
4254 if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4255 make_discard_request(mddev, bi);
4256 return;
4257 }
4258 3724
4259 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3725 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4260 last_sector = bi->bi_sector + (bi->bi_size>>9); 3726 last_sector = bi->bi_sector + (bi->bi_size>>9);
4261 bi->bi_next = NULL; 3727 bi->bi_next = NULL;
4262 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3728 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4263 3729
3730 plugged = mddev_check_plugged(mddev);
4264 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3731 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4265 DEFINE_WAIT(w); 3732 DEFINE_WAIT(w);
3733 int disks, data_disks;
4266 int previous; 3734 int previous;
4267 3735
4268 retry: 3736 retry:
4269 previous = 0; 3737 previous = 0;
3738 disks = conf->raid_disks;
4270 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3739 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
4271 if (unlikely(conf->reshape_progress != MaxSector)) { 3740 if (unlikely(conf->reshape_progress != MaxSector)) {
4272 /* spinlock is needed as reshape_progress may be 3741 /* spinlock is needed as reshape_progress may be
@@ -4278,12 +3747,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4278 * to check again. 3747 * to check again.
4279 */ 3748 */
4280 spin_lock_irq(&conf->device_lock); 3749 spin_lock_irq(&conf->device_lock);
4281 if (mddev->reshape_backwards 3750 if (mddev->delta_disks < 0
4282 ? logical_sector < conf->reshape_progress 3751 ? logical_sector < conf->reshape_progress
4283 : logical_sector >= conf->reshape_progress) { 3752 : logical_sector >= conf->reshape_progress) {
3753 disks = conf->previous_raid_disks;
4284 previous = 1; 3754 previous = 1;
4285 } else { 3755 } else {
4286 if (mddev->reshape_backwards 3756 if (mddev->delta_disks < 0
4287 ? logical_sector < conf->reshape_safe 3757 ? logical_sector < conf->reshape_safe
4288 : logical_sector >= conf->reshape_safe) { 3758 : logical_sector >= conf->reshape_safe) {
4289 spin_unlock_irq(&conf->device_lock); 3759 spin_unlock_irq(&conf->device_lock);
@@ -4293,6 +3763,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4293 } 3763 }
4294 spin_unlock_irq(&conf->device_lock); 3764 spin_unlock_irq(&conf->device_lock);
4295 } 3765 }
3766 data_disks = disks - conf->max_degraded;
4296 3767
4297 new_sector = raid5_compute_sector(conf, logical_sector, 3768 new_sector = raid5_compute_sector(conf, logical_sector,
4298 previous, 3769 previous,
@@ -4315,7 +3786,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4315 */ 3786 */
4316 int must_retry = 0; 3787 int must_retry = 0;
4317 spin_lock_irq(&conf->device_lock); 3788 spin_lock_irq(&conf->device_lock);
4318 if (mddev->reshape_backwards 3789 if (mddev->delta_disks < 0
4319 ? logical_sector >= conf->reshape_progress 3790 ? logical_sector >= conf->reshape_progress
4320 : logical_sector < conf->reshape_progress) 3791 : logical_sector < conf->reshape_progress)
4321 /* mismatch, need to try again */ 3792 /* mismatch, need to try again */
@@ -4362,30 +3833,35 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4362 if ((bi->bi_rw & REQ_SYNC) && 3833 if ((bi->bi_rw & REQ_SYNC) &&
4363 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3834 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4364 atomic_inc(&conf->preread_active_stripes); 3835 atomic_inc(&conf->preread_active_stripes);
4365 release_stripe_plug(mddev, sh); 3836 release_stripe(sh);
4366 } else { 3837 } else {
4367 /* cannot get stripe for read-ahead, just give up */ 3838 /* cannot get stripe for read-ahead, just give up */
4368 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3839 clear_bit(BIO_UPTODATE, &bi->bi_flags);
4369 finish_wait(&conf->wait_for_overlap, &w); 3840 finish_wait(&conf->wait_for_overlap, &w);
4370 break; 3841 break;
4371 } 3842 }
3843
4372 } 3844 }
3845 if (!plugged)
3846 md_wakeup_thread(mddev->thread);
4373 3847
4374 remaining = raid5_dec_bi_active_stripes(bi); 3848 spin_lock_irq(&conf->device_lock);
3849 remaining = raid5_dec_bi_phys_segments(bi);
3850 spin_unlock_irq(&conf->device_lock);
4375 if (remaining == 0) { 3851 if (remaining == 0) {
4376 3852
4377 if ( rw == WRITE ) 3853 if ( rw == WRITE )
4378 md_write_end(mddev); 3854 md_write_end(mddev);
4379 3855
4380 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
4381 bi, 0);
4382 bio_endio(bi, 0); 3856 bio_endio(bi, 0);
4383 } 3857 }
3858
3859 return 0;
4384} 3860}
4385 3861
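[Editor's note] In both versions of make_request() above, the bio is walked one stripe at a time: the start sector is rounded down to a STRIPE_SECTORS boundary and the range is stepped through in STRIPE_SECTORS increments, one stripe_head per step. A tiny runnable sketch of that loop (printf stands in for get_active_stripe/add_stripe_bio):

#include <stdint.h>
#include <stdio.h>

#define STRIPE_SECTORS 8u   /* one 4K page per device, in 512-byte sectors */

static void walk_stripes(uint64_t bi_sector, uint32_t bi_size_bytes)
{
        uint64_t logical = bi_sector & ~((uint64_t)STRIPE_SECTORS - 1);
        uint64_t last = bi_sector + (bi_size_bytes >> 9);

        for (; logical < last; logical += STRIPE_SECTORS)
                printf("stripe at sector %llu\n", (unsigned long long)logical);
}

int main(void)
{
        walk_stripes(1003, 16384);      /* 32 sectors starting mid-stripe: 5 stripes */
        return 0;
}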
4386static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 3862static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
4387 3863
4388static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 3864static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
4389{ 3865{
4390 /* reshaping is quite different to recovery/resync so it is 3866 /* reshaping is quite different to recovery/resync so it is
4391 * handled quite separately ... here. 3867 * handled quite separately ... here.
@@ -4396,7 +3872,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4396 * As the reads complete, handle_stripe will copy the data 3872 * As the reads complete, handle_stripe will copy the data
4397 * into the destination stripe and release that stripe. 3873 * into the destination stripe and release that stripe.
4398 */ 3874 */
4399 struct r5conf *conf = mddev->private; 3875 raid5_conf_t *conf = mddev->private;
4400 struct stripe_head *sh; 3876 struct stripe_head *sh;
4401 sector_t first_sector, last_sector; 3877 sector_t first_sector, last_sector;
4402 int raid_disks = conf->previous_raid_disks; 3878 int raid_disks = conf->previous_raid_disks;
@@ -4411,11 +3887,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4411 3887
4412 if (sector_nr == 0) { 3888 if (sector_nr == 0) {
4413 /* If restarting in the middle, skip the initial sectors */ 3889 /* If restarting in the middle, skip the initial sectors */
4414 if (mddev->reshape_backwards && 3890 if (mddev->delta_disks < 0 &&
4415 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 3891 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4416 sector_nr = raid5_size(mddev, 0, 0) 3892 sector_nr = raid5_size(mddev, 0, 0)
4417 - conf->reshape_progress; 3893 - conf->reshape_progress;
4418 } else if (!mddev->reshape_backwards && 3894 } else if (mddev->delta_disks >= 0 &&
4419 conf->reshape_progress > 0) 3895 conf->reshape_progress > 0)
4420 sector_nr = conf->reshape_progress; 3896 sector_nr = conf->reshape_progress;
4421 sector_div(sector_nr, new_data_disks); 3897 sector_div(sector_nr, new_data_disks);
@@ -4436,11 +3912,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4436 else 3912 else
4437 reshape_sectors = mddev->chunk_sectors; 3913 reshape_sectors = mddev->chunk_sectors;
4438 3914
4439 /* We update the metadata at least every 10 seconds, or when 3915 /* we update the metadata when there is more than 3Meg
4440 * the data about to be copied would over-write the source of 3916 * in the block range (that is rather arbitrary, should
4441 * the data at the front of the range. i.e. one new_stripe 3917 * probably be time based) or when the data about to be
4442 * along from reshape_progress new_maps to after where 3918 * copied would over-write the source of the data at
4443 * reshape_safe old_maps to 3919 * the front of the range.
3920 * i.e. one new_stripe along from reshape_progress new_maps
3921 * to after where reshape_safe old_maps to
4444 */ 3922 */
4445 writepos = conf->reshape_progress; 3923 writepos = conf->reshape_progress;
4446 sector_div(writepos, new_data_disks); 3924 sector_div(writepos, new_data_disks);
@@ -4448,7 +3926,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4448 sector_div(readpos, data_disks); 3926 sector_div(readpos, data_disks);
4449 safepos = conf->reshape_safe; 3927 safepos = conf->reshape_safe;
4450 sector_div(safepos, data_disks); 3928 sector_div(safepos, data_disks);
4451 if (mddev->reshape_backwards) { 3929 if (mddev->delta_disks < 0) {
4452 writepos -= min_t(sector_t, reshape_sectors, writepos); 3930 writepos -= min_t(sector_t, reshape_sectors, writepos);
4453 readpos += reshape_sectors; 3931 readpos += reshape_sectors;
4454 safepos += reshape_sectors; 3932 safepos += reshape_sectors;
@@ -4458,29 +3936,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4458 safepos -= min_t(sector_t, reshape_sectors, safepos); 3936 safepos -= min_t(sector_t, reshape_sectors, safepos);
4459 } 3937 }
4460 3938
4461 /* Having calculated the 'writepos' possibly use it
4462 * to set 'stripe_addr' which is where we will write to.
4463 */
4464 if (mddev->reshape_backwards) {
4465 BUG_ON(conf->reshape_progress == 0);
4466 stripe_addr = writepos;
4467 BUG_ON((mddev->dev_sectors &
4468 ~((sector_t)reshape_sectors - 1))
4469 - reshape_sectors - stripe_addr
4470 != sector_nr);
4471 } else {
4472 BUG_ON(writepos != sector_nr + reshape_sectors);
4473 stripe_addr = sector_nr;
4474 }
4475
4476 /* 'writepos' is the most advanced device address we might write. 3939 /* 'writepos' is the most advanced device address we might write.
4477 * 'readpos' is the least advanced device address we might read. 3940 * 'readpos' is the least advanced device address we might read.
4478 * 'safepos' is the least address recorded in the metadata as having 3941 * 'safepos' is the least address recorded in the metadata as having
4479 * been reshaped. 3942 * been reshaped.
4480 * If there is a min_offset_diff, these are adjusted either by 3943 * If 'readpos' is behind 'writepos', then there is no way that we can
4481 * increasing the safepos/readpos if diff is negative, or
4482 * increasing writepos if diff is positive.
4483 * If 'readpos' is then behind 'writepos', there is no way that we can
4484 * ensure safety in the face of a crash - that must be done by userspace 3944 * ensure safety in the face of a crash - that must be done by userspace
4485 * making a backup of the data. So in that case there is no particular 3945 * making a backup of the data. So in that case there is no particular
4486 * rush to update metadata. 3946 * rush to update metadata.
@@ -4493,13 +3953,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4493 * Maybe that number should be configurable, but I'm not sure it is 3953 * Maybe that number should be configurable, but I'm not sure it is
4494 * worth it.... maybe it could be a multiple of safemode_delay??? 3954 * worth it.... maybe it could be a multiple of safemode_delay???
4495 */ 3955 */
4496 if (conf->min_offset_diff < 0) { 3956 if ((mddev->delta_disks < 0
4497 safepos += -conf->min_offset_diff;
4498 readpos += -conf->min_offset_diff;
4499 } else
4500 writepos += conf->min_offset_diff;
4501
4502 if ((mddev->reshape_backwards
4503 ? (safepos > writepos && readpos < writepos) 3957 ? (safepos > writepos && readpos < writepos)
4504 : (safepos < writepos && readpos > writepos)) || 3958 : (safepos < writepos && readpos > writepos)) ||
4505 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 3959 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
@@ -4520,6 +3974,17 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4520 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 3974 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4521 } 3975 }
4522 3976
3977 if (mddev->delta_disks < 0) {
3978 BUG_ON(conf->reshape_progress == 0);
3979 stripe_addr = writepos;
3980 BUG_ON((mddev->dev_sectors &
3981 ~((sector_t)reshape_sectors - 1))
3982 - reshape_sectors - stripe_addr
3983 != sector_nr);
3984 } else {
3985 BUG_ON(writepos != sector_nr + reshape_sectors);
3986 stripe_addr = sector_nr;
3987 }
4523 INIT_LIST_HEAD(&stripes); 3988 INIT_LIST_HEAD(&stripes);
4524 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 3989 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4525 int j; 3990 int j;
@@ -4553,7 +4018,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4553 list_add(&sh->lru, &stripes); 4018 list_add(&sh->lru, &stripes);
4554 } 4019 }
4555 spin_lock_irq(&conf->device_lock); 4020 spin_lock_irq(&conf->device_lock);
4556 if (mddev->reshape_backwards) 4021 if (mddev->delta_disks < 0)
4557 conf->reshape_progress -= reshape_sectors * new_data_disks; 4022 conf->reshape_progress -= reshape_sectors * new_data_disks;
4558 else 4023 else
4559 conf->reshape_progress += reshape_sectors * new_data_disks; 4024 conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4614,9 +4079,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4614} 4079}
4615 4080
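[Editor's note] The checkpoint test earlier in reshape_request() converts array positions to per-device addresses and updates the metadata either periodically or when the region about to be written would overlap data the metadata still points at. A rough stand-alone sketch of that decision under simplified assumptions (the time-based trigger and min_offset_diff adjustment are omitted; field names are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct reshape_state {
        uint64_t progress;          /* array sectors already reshaped */
        uint64_t safe;              /* array sectors recorded as done in metadata */
        unsigned int old_data_disks;
        unsigned int new_data_disks;
        bool backwards;             /* reshape running from the end of the array */
};

static bool need_metadata_update(const struct reshape_state *rs,
                                 uint64_t reshape_sectors)
{
        uint64_t writepos = rs->progress / rs->new_data_disks;
        uint64_t readpos  = rs->progress / rs->old_data_disks;
        uint64_t safepos  = rs->safe / rs->old_data_disks;

        if (rs->backwards) {
                writepos -= (reshape_sectors < writepos) ? reshape_sectors : writepos;
                readpos  += reshape_sectors;
                safepos  += reshape_sectors;
        } else {
                writepos += reshape_sectors;
                readpos  -= (reshape_sectors < readpos) ? reshape_sectors : readpos;
                safepos  -= (reshape_sectors < safepos) ? reshape_sectors : safepos;
        }

        /* about to overwrite data the on-disk metadata still relies on? */
        return rs->backwards ? (safepos > writepos && readpos < writepos)
                             : (safepos < writepos && readpos > writepos);
}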
4616/* FIXME go_faster isn't used */ 4081/* FIXME go_faster isn't used */
4617static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4082static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4618{ 4083{
4619 struct r5conf *conf = mddev->private; 4084 raid5_conf_t *conf = mddev->private;
4620 struct stripe_head *sh; 4085 struct stripe_head *sh;
4621 sector_t max_sector = mddev->dev_sectors; 4086 sector_t max_sector = mddev->dev_sectors;
4622 sector_t sync_blocks; 4087 sector_t sync_blocks;
@@ -4672,6 +4137,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4672 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4137 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4673 } 4138 }
4674 4139
4140
4675 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4141 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4676 4142
4677 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4143 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4700,7 +4166,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4700 return STRIPE_SECTORS; 4166 return STRIPE_SECTORS;
4701} 4167}
4702 4168
4703static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4169static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4704{ 4170{
4705 /* We may not be able to submit a whole bio at once as there 4171 /* We may not be able to submit a whole bio at once as there
4706 * may not be enough stripe_heads available. 4172 * may not be enough stripe_heads available.
@@ -4729,7 +4195,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4729 sector += STRIPE_SECTORS, 4195 sector += STRIPE_SECTORS,
4730 scnt++) { 4196 scnt++) {
4731 4197
4732 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4198 if (scnt < raid5_bi_hw_segments(raid_bio))
4733 /* already done this stripe */ 4199 /* already done this stripe */
4734 continue; 4200 continue;
4735 4201
@@ -4737,58 +4203,33 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4737 4203
4738 if (!sh) { 4204 if (!sh) {
4739 /* failed to get a stripe - must wait */ 4205 /* failed to get a stripe - must wait */
4740 raid5_set_bi_processed_stripes(raid_bio, scnt); 4206 raid5_set_bi_hw_segments(raid_bio, scnt);
4741 conf->retry_read_aligned = raid_bio; 4207 conf->retry_read_aligned = raid_bio;
4742 return handled; 4208 return handled;
4743 } 4209 }
4744 4210
4211 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4745 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4212 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4746 release_stripe(sh); 4213 release_stripe(sh);
4747 raid5_set_bi_processed_stripes(raid_bio, scnt); 4214 raid5_set_bi_hw_segments(raid_bio, scnt);
4748 conf->retry_read_aligned = raid_bio; 4215 conf->retry_read_aligned = raid_bio;
4749 return handled; 4216 return handled;
4750 } 4217 }
4751 4218
4752 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4753 handle_stripe(sh); 4219 handle_stripe(sh);
4754 release_stripe(sh); 4220 release_stripe(sh);
4755 handled++; 4221 handled++;
4756 } 4222 }
4757 remaining = raid5_dec_bi_active_stripes(raid_bio); 4223 spin_lock_irq(&conf->device_lock);
4758 if (remaining == 0) { 4224 remaining = raid5_dec_bi_phys_segments(raid_bio);
4759 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4225 spin_unlock_irq(&conf->device_lock);
4760 raid_bio, 0); 4226 if (remaining == 0)
4761 bio_endio(raid_bio, 0); 4227 bio_endio(raid_bio, 0);
4762 }
4763 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4228 if (atomic_dec_and_test(&conf->active_aligned_reads))
4764 wake_up(&conf->wait_for_stripe); 4229 wake_up(&conf->wait_for_stripe);
4765 return handled; 4230 return handled;
4766} 4231}
4767 4232
4768#define MAX_STRIPE_BATCH 8
4769static int handle_active_stripes(struct r5conf *conf)
4770{
4771 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4772 int i, batch_size = 0;
4773
4774 while (batch_size < MAX_STRIPE_BATCH &&
4775 (sh = __get_priority_stripe(conf)) != NULL)
4776 batch[batch_size++] = sh;
4777
4778 if (batch_size == 0)
4779 return batch_size;
4780 spin_unlock_irq(&conf->device_lock);
4781
4782 for (i = 0; i < batch_size; i++)
4783 handle_stripe(batch[i]);
4784
4785 cond_resched();
4786
4787 spin_lock_irq(&conf->device_lock);
4788 for (i = 0; i < batch_size; i++)
4789 __release_stripe(conf, batch[i]);
4790 return batch_size;
4791}
4792 4233
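[Editor's note] handle_active_stripes() above (added on the left-hand side) uses a classic batching pattern: pull up to MAX_STRIPE_BATCH entries off the queue while holding the lock, do the expensive work with the lock dropped, then re-acquire it to release the batch. A generic pthreads sketch of the same shape, not the kernel locking rules:

#include <pthread.h>

#define BATCH 8

struct item { struct item *next; };

struct queue {
        pthread_mutex_t lock;
        struct item *head;
};

static void process(struct item *it) { (void)it; /* expensive work goes here */ }

static void put_back(struct queue *q, struct item *it)
{
        it->next = q->head;             /* caller holds q->lock */
        q->head = it;
}

static int handle_batch(struct queue *q)
{
        struct item *batch[BATCH];
        int n = 0, i;

        pthread_mutex_lock(&q->lock);
        while (n < BATCH && q->head) {
                batch[n++] = q->head;
                q->head = q->head->next;
        }
        pthread_mutex_unlock(&q->lock);

        for (i = 0; i < n; i++)
                process(batch[i]);      /* work done without the lock held */

        pthread_mutex_lock(&q->lock);
        for (i = 0; i < n; i++)
                put_back(q, batch[i]);
        pthread_mutex_unlock(&q->lock);
        return n;
}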
4793/* 4234/*
4794 * This is our raid5 kernel thread. 4235 * This is our raid5 kernel thread.
@@ -4797,10 +4238,10 @@ static int handle_active_stripes(struct r5conf *conf)
4797 * During the scan, completed stripes are saved for us by the interrupt 4238 * During the scan, completed stripes are saved for us by the interrupt
4798 * handler, so that they will not have to wait for our next wakeup. 4239 * handler, so that they will not have to wait for our next wakeup.
4799 */ 4240 */
4800static void raid5d(struct md_thread *thread) 4241static void raid5d(mddev_t *mddev)
4801{ 4242{
4802 struct mddev *mddev = thread->mddev; 4243 struct stripe_head *sh;
4803 struct r5conf *conf = mddev->private; 4244 raid5_conf_t *conf = mddev->private;
4804 int handled; 4245 int handled;
4805 struct blk_plug plug; 4246 struct blk_plug plug;
4806 4247
@@ -4813,9 +4254,8 @@ static void raid5d(struct md_thread *thread)
4813 spin_lock_irq(&conf->device_lock); 4254 spin_lock_irq(&conf->device_lock);
4814 while (1) { 4255 while (1) {
4815 struct bio *bio; 4256 struct bio *bio;
4816 int batch_size;
4817 4257
4818 if ( 4258 if (atomic_read(&mddev->plug_cnt) == 0 &&
4819 !list_empty(&conf->bitmap_list)) { 4259 !list_empty(&conf->bitmap_list)) {
4820 /* Now is a good time to flush some bitmap updates */ 4260 /* Now is a good time to flush some bitmap updates */
4821 conf->seq_flush++; 4261 conf->seq_flush++;
@@ -4825,7 +4265,8 @@ static void raid5d(struct md_thread *thread)
4825 conf->seq_write = conf->seq_flush; 4265 conf->seq_write = conf->seq_flush;
4826 activate_bit_delay(conf); 4266 activate_bit_delay(conf);
4827 } 4267 }
4828 raid5_activate_delayed(conf); 4268 if (atomic_read(&mddev->plug_cnt) == 0)
4269 raid5_activate_delayed(conf);
4829 4270
4830 while ((bio = remove_bio_from_retry(conf))) { 4271 while ((bio = remove_bio_from_retry(conf))) {
4831 int ok; 4272 int ok;
@@ -4837,16 +4278,21 @@ static void raid5d(struct md_thread *thread)
4837 handled++; 4278 handled++;
4838 } 4279 }
4839 4280
4840 batch_size = handle_active_stripes(conf); 4281 sh = __get_priority_stripe(conf);
4841 if (!batch_size) 4282
4283 if (!sh)
4842 break; 4284 break;
4843 handled += batch_size; 4285 spin_unlock_irq(&conf->device_lock);
4286
4287 handled++;
4288 handle_stripe(sh);
4289 release_stripe(sh);
4290 cond_resched();
4844 4291
4845 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4292 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4846 spin_unlock_irq(&conf->device_lock);
4847 md_check_recovery(mddev); 4293 md_check_recovery(mddev);
4848 spin_lock_irq(&conf->device_lock); 4294
4849 } 4295 spin_lock_irq(&conf->device_lock);
4850 } 4296 }
4851 pr_debug("%d stripes handled\n", handled); 4297 pr_debug("%d stripes handled\n", handled);
4852 4298
@@ -4859,9 +4305,9 @@ static void raid5d(struct md_thread *thread)
4859} 4305}
4860 4306
4861static ssize_t 4307static ssize_t
4862raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4308raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4863{ 4309{
4864 struct r5conf *conf = mddev->private; 4310 raid5_conf_t *conf = mddev->private;
4865 if (conf) 4311 if (conf)
4866 return sprintf(page, "%d\n", conf->max_nr_stripes); 4312 return sprintf(page, "%d\n", conf->max_nr_stripes);
4867 else 4313 else
@@ -4869,9 +4315,9 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
4869} 4315}
4870 4316
4871int 4317int
4872raid5_set_cache_size(struct mddev *mddev, int size) 4318raid5_set_cache_size(mddev_t *mddev, int size)
4873{ 4319{
4874 struct r5conf *conf = mddev->private; 4320 raid5_conf_t *conf = mddev->private;
4875 int err; 4321 int err;
4876 4322
4877 if (size <= 16 || size > 32768) 4323 if (size <= 16 || size > 32768)
@@ -4895,9 +4341,9 @@ raid5_set_cache_size(struct mddev *mddev, int size)
4895EXPORT_SYMBOL(raid5_set_cache_size); 4341EXPORT_SYMBOL(raid5_set_cache_size);
4896 4342
4897static ssize_t 4343static ssize_t
4898raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4344raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4899{ 4345{
4900 struct r5conf *conf = mddev->private; 4346 raid5_conf_t *conf = mddev->private;
4901 unsigned long new; 4347 unsigned long new;
4902 int err; 4348 int err;
4903 4349
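[Editor's note] The stripe_cache_size handlers around this hunk only accept values in the range (16, 32768]. A plain C stand-in for the parse-and-validate step of the store path (hypothetical helper, not the sysfs code itself):

#include <errno.h>
#include <stdlib.h>

static int parse_cache_size(const char *page, int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(page, &end, 10);
        if (errno || end == page)
                return -EINVAL;
        if (val <= 16 || val > 32768)
                return -EINVAL;         /* same bounds raid5_set_cache_size enforces */
        *out = (int)val;
        return 0;
}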
@@ -4920,9 +4366,9 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4920 raid5_store_stripe_cache_size); 4366 raid5_store_stripe_cache_size);
4921 4367
4922static ssize_t 4368static ssize_t
4923raid5_show_preread_threshold(struct mddev *mddev, char *page) 4369raid5_show_preread_threshold(mddev_t *mddev, char *page)
4924{ 4370{
4925 struct r5conf *conf = mddev->private; 4371 raid5_conf_t *conf = mddev->private;
4926 if (conf) 4372 if (conf)
4927 return sprintf(page, "%d\n", conf->bypass_threshold); 4373 return sprintf(page, "%d\n", conf->bypass_threshold);
4928 else 4374 else
@@ -4930,9 +4376,9 @@ raid5_show_preread_threshold(struct mddev *mddev, char *page)
4930} 4376}
4931 4377
4932static ssize_t 4378static ssize_t
4933raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4379raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4934{ 4380{
4935 struct r5conf *conf = mddev->private; 4381 raid5_conf_t *conf = mddev->private;
4936 unsigned long new; 4382 unsigned long new;
4937 if (len >= PAGE_SIZE) 4383 if (len >= PAGE_SIZE)
4938 return -EINVAL; 4384 return -EINVAL;
@@ -4954,9 +4400,9 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4954 raid5_store_preread_threshold); 4400 raid5_store_preread_threshold);
4955 4401
4956static ssize_t 4402static ssize_t
4957stripe_cache_active_show(struct mddev *mddev, char *page) 4403stripe_cache_active_show(mddev_t *mddev, char *page)
4958{ 4404{
4959 struct r5conf *conf = mddev->private; 4405 raid5_conf_t *conf = mddev->private;
4960 if (conf) 4406 if (conf)
4961 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4407 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4962 else 4408 else
@@ -4978,9 +4424,9 @@ static struct attribute_group raid5_attrs_group = {
4978}; 4424};
4979 4425
4980static sector_t 4426static sector_t
4981raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4427raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4982{ 4428{
4983 struct r5conf *conf = mddev->private; 4429 raid5_conf_t *conf = mddev->private;
4984 4430
4985 if (!sectors) 4431 if (!sectors)
4986 sectors = mddev->dev_sectors; 4432 sectors = mddev->dev_sectors;
@@ -4993,7 +4439,7 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
4993 return sectors * (raid_disks - conf->max_degraded); 4439 return sectors * (raid_disks - conf->max_degraded);
4994} 4440}
4995 4441
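[Editor's note] raid5_size() just above exposes the capacity rule: usable size is the per-device size times the number of data disks, i.e. total disks minus the parity count (max_degraded). A tiny worked example of that multiplication (chunk rounding omitted; the device size is an arbitrary illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t dev_sectors = 3907029168ULL;   /* one ~2TB member device */
        int raid_disks = 6, max_degraded = 2;   /* RAID6: two parity devices */

        uint64_t usable = dev_sectors * (uint64_t)(raid_disks - max_degraded);
        printf("usable sectors: %llu\n", (unsigned long long)usable);
        return 0;
}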
4996static void raid5_free_percpu(struct r5conf *conf) 4442static void raid5_free_percpu(raid5_conf_t *conf)
4997{ 4443{
4998 struct raid5_percpu *percpu; 4444 struct raid5_percpu *percpu;
4999 unsigned long cpu; 4445 unsigned long cpu;
@@ -5015,7 +4461,7 @@ static void raid5_free_percpu(struct r5conf *conf)
5015 free_percpu(conf->percpu); 4461 free_percpu(conf->percpu);
5016} 4462}
5017 4463
5018static void free_conf(struct r5conf *conf) 4464static void free_conf(raid5_conf_t *conf)
5019{ 4465{
5020 shrink_stripes(conf); 4466 shrink_stripes(conf);
5021 raid5_free_percpu(conf); 4467 raid5_free_percpu(conf);
@@ -5028,7 +4474,7 @@ static void free_conf(struct r5conf *conf)
5028static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4474static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5029 void *hcpu) 4475 void *hcpu)
5030{ 4476{
5031 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4477 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
5032 long cpu = (long)hcpu; 4478 long cpu = (long)hcpu;
5033 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4479 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
5034 4480
@@ -5063,7 +4509,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5063} 4509}
5064#endif 4510#endif
5065 4511
5066static int raid5_alloc_percpu(struct r5conf *conf) 4512static int raid5_alloc_percpu(raid5_conf_t *conf)
5067{ 4513{
5068 unsigned long cpu; 4514 unsigned long cpu;
5069 struct page *spare_page; 4515 struct page *spare_page;
@@ -5105,13 +4551,12 @@ static int raid5_alloc_percpu(struct r5conf *conf)
5105 return err; 4551 return err;
5106} 4552}
5107 4553
5108static struct r5conf *setup_conf(struct mddev *mddev) 4554static raid5_conf_t *setup_conf(mddev_t *mddev)
5109{ 4555{
5110 struct r5conf *conf; 4556 raid5_conf_t *conf;
5111 int raid_disk, memory, max_disks; 4557 int raid_disk, memory, max_disks;
5112 struct md_rdev *rdev; 4558 mdk_rdev_t *rdev;
5113 struct disk_info *disk; 4559 struct disk_info *disk;
5114 char pers_name[6];
5115 4560
5116 if (mddev->new_level != 5 4561 if (mddev->new_level != 5
5117 && mddev->new_level != 4 4562 && mddev->new_level != 4
@@ -5142,7 +4587,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5142 return ERR_PTR(-EINVAL); 4587 return ERR_PTR(-EINVAL);
5143 } 4588 }
5144 4589
5145 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4590 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
5146 if (conf == NULL) 4591 if (conf == NULL)
5147 goto abort; 4592 goto abort;
5148 spin_lock_init(&conf->device_lock); 4593 spin_lock_init(&conf->device_lock);
@@ -5157,7 +4602,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5157 atomic_set(&conf->preread_active_stripes, 0); 4602 atomic_set(&conf->preread_active_stripes, 0);
5158 atomic_set(&conf->active_aligned_reads, 0); 4603 atomic_set(&conf->active_aligned_reads, 0);
5159 conf->bypass_threshold = BYPASS_THRESHOLD; 4604 conf->bypass_threshold = BYPASS_THRESHOLD;
5160 conf->recovery_disabled = mddev->recovery_disabled - 1;
5161 4605
5162 conf->raid_disks = mddev->raid_disks; 4606 conf->raid_disks = mddev->raid_disks;
5163 if (mddev->reshape_position == MaxSector) 4607 if (mddev->reshape_position == MaxSector)
@@ -5183,22 +4627,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5183 4627
5184 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4628 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
5185 4629
5186 rdev_for_each(rdev, mddev) { 4630 list_for_each_entry(rdev, &mddev->disks, same_set) {
5187 raid_disk = rdev->raid_disk; 4631 raid_disk = rdev->raid_disk;
5188 if (raid_disk >= max_disks 4632 if (raid_disk >= max_disks
5189 || raid_disk < 0) 4633 || raid_disk < 0)
5190 continue; 4634 continue;
5191 disk = conf->disks + raid_disk; 4635 disk = conf->disks + raid_disk;
5192 4636
5193 if (test_bit(Replacement, &rdev->flags)) { 4637 disk->rdev = rdev;
5194 if (disk->replacement)
5195 goto abort;
5196 disk->replacement = rdev;
5197 } else {
5198 if (disk->rdev)
5199 goto abort;
5200 disk->rdev = rdev;
5201 }
5202 4638
5203 if (test_bit(In_sync, &rdev->flags)) { 4639 if (test_bit(In_sync, &rdev->flags)) {
5204 char b[BDEVNAME_SIZE]; 4640 char b[BDEVNAME_SIZE];
@@ -5235,8 +4671,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5235 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4671 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
5236 mdname(mddev), memory); 4672 mdname(mddev), memory);
5237 4673
5238 sprintf(pers_name, "raid%d", mddev->new_level); 4674 conf->thread = md_register_thread(raid5d, mddev, NULL);
5239 conf->thread = md_register_thread(raid5d, mddev, pers_name);
5240 if (!conf->thread) { 4675 if (!conf->thread) {
5241 printk(KERN_ERR 4676 printk(KERN_ERR
5242 "md/raid:%s: couldn't allocate thread.\n", 4677 "md/raid:%s: couldn't allocate thread.\n",
@@ -5281,50 +4716,23 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
5281 return 0; 4716 return 0;
5282} 4717}
5283 4718
5284static int run(struct mddev *mddev) 4719static int run(mddev_t *mddev)
5285{ 4720{
5286 struct r5conf *conf; 4721 raid5_conf_t *conf;
5287 int working_disks = 0; 4722 int working_disks = 0;
5288 int dirty_parity_disks = 0; 4723 int dirty_parity_disks = 0;
5289 struct md_rdev *rdev; 4724 mdk_rdev_t *rdev;
5290 sector_t reshape_offset = 0; 4725 sector_t reshape_offset = 0;
5291 int i;
5292 long long min_offset_diff = 0;
5293 int first = 1;
5294 4726
5295 if (mddev->recovery_cp != MaxSector) 4727 if (mddev->recovery_cp != MaxSector)
5296 printk(KERN_NOTICE "md/raid:%s: not clean" 4728 printk(KERN_NOTICE "md/raid:%s: not clean"
5297 " -- starting background reconstruction\n", 4729 " -- starting background reconstruction\n",
5298 mdname(mddev)); 4730 mdname(mddev));
5299
5300 rdev_for_each(rdev, mddev) {
5301 long long diff;
5302 if (rdev->raid_disk < 0)
5303 continue;
5304 diff = (rdev->new_data_offset - rdev->data_offset);
5305 if (first) {
5306 min_offset_diff = diff;
5307 first = 0;
5308 } else if (mddev->reshape_backwards &&
5309 diff < min_offset_diff)
5310 min_offset_diff = diff;
5311 else if (!mddev->reshape_backwards &&
5312 diff > min_offset_diff)
5313 min_offset_diff = diff;
5314 }
5315
5316 if (mddev->reshape_position != MaxSector) { 4731 if (mddev->reshape_position != MaxSector) {
5317 /* Check that we can continue the reshape. 4732 /* Check that we can continue the reshape.
5318 * Difficulties arise if the stripe we would write to 4733 * Currently only disks can change, it must
5319 * next is at or after the stripe we would read from next. 4734 * increase, and we must be past the point where
5320 * For a reshape that changes the number of devices, this 4735 * a stripe over-writes itself
5321 * is only possible for a very short time, and mdadm makes
5322 * sure that time appears to have past before assembling
5323 * the array. So we fail if that time hasn't passed.
5324 * For a reshape that keeps the number of devices the same
5325 * mdadm must be monitoring the reshape can keeping the
5326 * critical areas read-only and backed up. It will start
5327 * the array in read-only mode, so we check for that.
5328 */ 4736 */
5329 sector_t here_new, here_old; 4737 sector_t here_new, here_old;
5330 int old_disks; 4738 int old_disks;
@@ -5356,34 +4764,26 @@ static int run(struct mddev *mddev)
5356 /* here_old is the first stripe that we might need to read 4764 /* here_old is the first stripe that we might need to read
5357 * from */ 4765 * from */
5358 if (mddev->delta_disks == 0) { 4766 if (mddev->delta_disks == 0) {
5359 if ((here_new * mddev->new_chunk_sectors !=
5360 here_old * mddev->chunk_sectors)) {
5361 printk(KERN_ERR "md/raid:%s: reshape position is"
5362 " confused - aborting\n", mdname(mddev));
5363 return -EINVAL;
5364 }
5365 /* We cannot be sure it is safe to start an in-place 4767 /* We cannot be sure it is safe to start an in-place
5366 * reshape. It is only safe if user-space is monitoring 4768 * reshape. It is only safe if user-space is monitoring
5367 * and taking constant backups. 4769 * and taking constant backups.
5368 * mdadm always starts a situation like this in 4770 * mdadm always starts a situation like this in
5369 * readonly mode so it can take control before 4771 * readonly mode so it can take control before
5370 * allowing any writes. So just check for that. 4772 * allowing any writes. So just check for that.
5371 */ 4773 */
5372 if (abs(min_offset_diff) >= mddev->chunk_sectors && 4774 if ((here_new * mddev->new_chunk_sectors !=
5373 abs(min_offset_diff) >= mddev->new_chunk_sectors) 4775 here_old * mddev->chunk_sectors) ||
5374 /* not really in-place - so OK */; 4776 mddev->ro == 0) {
5375 else if (mddev->ro == 0) { 4777 printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
5376 printk(KERN_ERR "md/raid:%s: in-place reshape " 4778 " in read-only mode - aborting\n",
5377 "must be started in read-only mode "
5378 "- aborting\n",
5379 mdname(mddev)); 4779 mdname(mddev));
5380 return -EINVAL; 4780 return -EINVAL;
5381 } 4781 }
5382 } else if (mddev->reshape_backwards 4782 } else if (mddev->delta_disks < 0
5383 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 4783 ? (here_new * mddev->new_chunk_sectors <=
5384 here_old * mddev->chunk_sectors) 4784 here_old * mddev->chunk_sectors)
5385 : (here_new * mddev->new_chunk_sectors >= 4785 : (here_new * mddev->new_chunk_sectors >=
5386 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 4786 here_old * mddev->chunk_sectors)) {
5387 /* Reading from the same stripe as writing to - bad */ 4787 /* Reading from the same stripe as writing to - bad */
5388 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 4788 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5389 "auto-recovery - aborting.\n", 4789 "auto-recovery - aborting.\n",
@@ -5408,30 +4808,16 @@ static int run(struct mddev *mddev)
5408 if (IS_ERR(conf)) 4808 if (IS_ERR(conf))
5409 return PTR_ERR(conf); 4809 return PTR_ERR(conf);
5410 4810
5411 conf->min_offset_diff = min_offset_diff;
5412 mddev->thread = conf->thread; 4811 mddev->thread = conf->thread;
5413 conf->thread = NULL; 4812 conf->thread = NULL;
5414 mddev->private = conf; 4813 mddev->private = conf;
5415 4814
5416 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 4815 /*
5417 i++) { 4816 * 0 for a fully functional array, 1 or 2 for a degraded array.
5418 rdev = conf->disks[i].rdev; 4817 */
5419 if (!rdev && conf->disks[i].replacement) { 4818 list_for_each_entry(rdev, &mddev->disks, same_set) {
5420 /* The replacement is all we have yet */ 4819 if (rdev->raid_disk < 0)
5421 rdev = conf->disks[i].replacement;
5422 conf->disks[i].replacement = NULL;
5423 clear_bit(Replacement, &rdev->flags);
5424 conf->disks[i].rdev = rdev;
5425 }
5426 if (!rdev)
5427 continue; 4820 continue;
5428 if (conf->disks[i].replacement &&
5429 conf->reshape_progress != MaxSector) {
5430 /* replacements and reshape simply do not mix. */
5431 printk(KERN_ERR "md: cannot handle concurrent "
5432 "replacement and reshape.\n");
5433 goto abort;
5434 }
5435 if (test_bit(In_sync, &rdev->flags)) { 4821 if (test_bit(In_sync, &rdev->flags)) {
5436 working_disks++; 4822 working_disks++;
5437 continue; 4823 continue;
@@ -5465,10 +4851,8 @@ static int run(struct mddev *mddev)
5465 dirty_parity_disks++; 4851 dirty_parity_disks++;
5466 } 4852 }
5467 4853
5468 /* 4854 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
5469 * 0 for a fully functional array, 1 or 2 for a degraded array. 4855 - working_disks);
5470 */
5471 mddev->degraded = calc_degraded(conf);
5472 4856
5473 if (has_failed(conf)) { 4857 if (has_failed(conf)) {
5474 printk(KERN_ERR "md/raid:%s: not enough operational devices" 4858 printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5534,7 +4918,6 @@ static int run(struct mddev *mddev)
5534 4918
5535 if (mddev->queue) { 4919 if (mddev->queue) {
5536 int chunk_size; 4920 int chunk_size;
5537 bool discard_supported = true;
5538 /* read-ahead size must cover two whole stripes, which 4921 /* read-ahead size must cover two whole stripes, which
5539 * is 2 * (datadisks) * chunksize where 'n' is the 4922 * is 2 * (datadisks) * chunksize where 'n' is the
5540 * number of raid devices 4923 * number of raid devices
@@ -5554,67 +4937,27 @@ static int run(struct mddev *mddev)
5554 blk_queue_io_min(mddev->queue, chunk_size); 4937 blk_queue_io_min(mddev->queue, chunk_size);
5555 blk_queue_io_opt(mddev->queue, chunk_size * 4938 blk_queue_io_opt(mddev->queue, chunk_size *
5556 (conf->raid_disks - conf->max_degraded)); 4939 (conf->raid_disks - conf->max_degraded));
5557 /*
5558 * We can only discard a whole stripe. It doesn't make sense to
5559 * discard data disk but write parity disk
5560 */
5561 stripe = stripe * PAGE_SIZE;
5562 /* Round up to power of 2, as discard handling
5563 * currently assumes that */
5564 while ((stripe-1) & stripe)
5565 stripe = (stripe | (stripe-1)) + 1;
5566 mddev->queue->limits.discard_alignment = stripe;
5567 mddev->queue->limits.discard_granularity = stripe;
5568 /*
5569 * unaligned part of discard request will be ignored, so can't
5570 * guarantee discard_zeroes_data
5571 */
5572 mddev->queue->limits.discard_zeroes_data = 0;
5573 4940
5574 rdev_for_each(rdev, mddev) { 4941 list_for_each_entry(rdev, &mddev->disks, same_set)
5575 disk_stack_limits(mddev->gendisk, rdev->bdev, 4942 disk_stack_limits(mddev->gendisk, rdev->bdev,
5576 rdev->data_offset << 9); 4943 rdev->data_offset << 9);
5577 disk_stack_limits(mddev->gendisk, rdev->bdev,
5578 rdev->new_data_offset << 9);
5579 /*
5580 * discard_zeroes_data is required, otherwise data
5581 * could be lost. Consider a scenario: discard a stripe
5582 * (the stripe could be inconsistent if
5583 * discard_zeroes_data is 0); write one disk of the
5584 * stripe (the stripe could be inconsistent again
5585 * depending on which disks are used to calculate
5586 * parity); the disk is broken; The stripe data of this
5587 * disk is lost.
5588 */
5589 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
5590 !bdev_get_queue(rdev->bdev)->
5591 limits.discard_zeroes_data)
5592 discard_supported = false;
5593 }
5594
5595 if (discard_supported &&
5596 mddev->queue->limits.max_discard_sectors >= stripe &&
5597 mddev->queue->limits.discard_granularity >= stripe)
5598 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
5599 mddev->queue);
5600 else
5601 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
5602 mddev->queue);
5603 } 4944 }
5604 4945
5605 return 0; 4946 return 0;
5606abort: 4947abort:
5607 md_unregister_thread(&mddev->thread); 4948 md_unregister_thread(&mddev->thread);
5608 print_raid5_conf(conf); 4949 if (conf) {
5609 free_conf(conf); 4950 print_raid5_conf(conf);
4951 free_conf(conf);
4952 }
5610 mddev->private = NULL; 4953 mddev->private = NULL;
5611 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 4954 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
5612 return -EIO; 4955 return -EIO;
5613} 4956}
5614 4957
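[Editor's note] The discard setup removed above rounds the stripe size up to a power of two with "while ((stripe-1) & stripe) stripe = (stripe | (stripe-1)) + 1;". ORing x with x-1 fills every bit below the highest set bit, so adding 1 carries up to the next power of two. A self-contained sketch of the same trick:

#include <assert.h>
#include <stdint.h>

static uint64_t roundup_pow2(uint64_t x)
{
        while ((x - 1) & x)             /* more than one bit set? */
                x = (x | (x - 1)) + 1;  /* fill the low bits, then carry up */
        return x;
}

int main(void)
{
        assert(roundup_pow2(1) == 1);
        assert(roundup_pow2(3) == 4);
        assert(roundup_pow2(4096) == 4096);
        assert(roundup_pow2(192 * 4096) == 1024 * 1024);
        return 0;
}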
5615static int stop(struct mddev *mddev) 4958static int stop(mddev_t *mddev)
5616{ 4959{
5617 struct r5conf *conf = mddev->private; 4960 raid5_conf_t *conf = mddev->private;
5618 4961
5619 md_unregister_thread(&mddev->thread); 4962 md_unregister_thread(&mddev->thread);
5620 if (mddev->queue) 4963 if (mddev->queue)
@@ -5625,9 +4968,44 @@ static int stop(struct mddev *mddev)
5625 return 0; 4968 return 0;
5626} 4969}
5627 4970
5628static void status(struct seq_file *seq, struct mddev *mddev) 4971#ifdef DEBUG
4972static void print_sh(struct seq_file *seq, struct stripe_head *sh)
5629{ 4973{
5630 struct r5conf *conf = mddev->private; 4974 int i;
4975
4976 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
4977 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
4978 seq_printf(seq, "sh %llu, count %d.\n",
4979 (unsigned long long)sh->sector, atomic_read(&sh->count));
4980 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
4981 for (i = 0; i < sh->disks; i++) {
4982 seq_printf(seq, "(cache%d: %p %ld) ",
4983 i, sh->dev[i].page, sh->dev[i].flags);
4984 }
4985 seq_printf(seq, "\n");
4986}
4987
4988static void printall(struct seq_file *seq, raid5_conf_t *conf)
4989{
4990 struct stripe_head *sh;
4991 struct hlist_node *hn;
4992 int i;
4993
4994 spin_lock_irq(&conf->device_lock);
4995 for (i = 0; i < NR_HASH; i++) {
4996 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
4997 if (sh->raid_conf != conf)
4998 continue;
4999 print_sh(seq, sh);
5000 }
5001 }
5002 spin_unlock_irq(&conf->device_lock);
5003}
5004#endif
5005
5006static void status(struct seq_file *seq, mddev_t *mddev)
5007{
5008 raid5_conf_t *conf = mddev->private;
5631 int i; 5009 int i;
5632 5010
5633 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5011 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5638,9 +5016,13 @@ static void status(struct seq_file *seq, struct mddev *mddev)
5638 conf->disks[i].rdev && 5016 conf->disks[i].rdev &&
5639 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5017 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5640 seq_printf (seq, "]"); 5018 seq_printf (seq, "]");
5019#ifdef DEBUG
5020 seq_printf (seq, "\n");
5021 printall(seq, conf);
5022#endif
5641} 5023}
5642 5024
5643static void print_raid5_conf (struct r5conf *conf) 5025static void print_raid5_conf (raid5_conf_t *conf)
5644{ 5026{
5645 int i; 5027 int i;
5646 struct disk_info *tmp; 5028 struct disk_info *tmp;
@@ -5664,35 +5046,17 @@ static void print_raid5_conf (struct r5conf *conf)
5664 } 5046 }
5665} 5047}
5666 5048
5667static int raid5_spare_active(struct mddev *mddev) 5049static int raid5_spare_active(mddev_t *mddev)
5668{ 5050{
5669 int i; 5051 int i;
5670 struct r5conf *conf = mddev->private; 5052 raid5_conf_t *conf = mddev->private;
5671 struct disk_info *tmp; 5053 struct disk_info *tmp;
5672 int count = 0; 5054 int count = 0;
5673 unsigned long flags; 5055 unsigned long flags;
5674 5056
5675 for (i = 0; i < conf->raid_disks; i++) { 5057 for (i = 0; i < conf->raid_disks; i++) {
5676 tmp = conf->disks + i; 5058 tmp = conf->disks + i;
5677 if (tmp->replacement 5059 if (tmp->rdev
5678 && tmp->replacement->recovery_offset == MaxSector
5679 && !test_bit(Faulty, &tmp->replacement->flags)
5680 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5681 /* Replacement has just become active. */
5682 if (!tmp->rdev
5683 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5684 count++;
5685 if (tmp->rdev) {
5686 /* Replaced device not technically faulty,
5687 * but we need to be sure it gets removed
5688 * and never re-added.
5689 */
5690 set_bit(Faulty, &tmp->rdev->flags);
5691 sysfs_notify_dirent_safe(
5692 tmp->rdev->sysfs_state);
5693 }
5694 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5695 } else if (tmp->rdev
5696 && tmp->rdev->recovery_offset == MaxSector 5060 && tmp->rdev->recovery_offset == MaxSector
5697 && !test_bit(Faulty, &tmp->rdev->flags) 5061 && !test_bit(Faulty, &tmp->rdev->flags)
5698 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5062 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5701,77 +5065,58 @@ static int raid5_spare_active(struct mddev *mddev)
5701 } 5065 }
5702 } 5066 }
5703 spin_lock_irqsave(&conf->device_lock, flags); 5067 spin_lock_irqsave(&conf->device_lock, flags);
5704 mddev->degraded = calc_degraded(conf); 5068 mddev->degraded -= count;
5705 spin_unlock_irqrestore(&conf->device_lock, flags); 5069 spin_unlock_irqrestore(&conf->device_lock, flags);
5706 print_raid5_conf(conf); 5070 print_raid5_conf(conf);
5707 return count; 5071 return count;
5708} 5072}
5709 5073
5710static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5074static int raid5_remove_disk(mddev_t *mddev, int number)
5711{ 5075{
5712 struct r5conf *conf = mddev->private; 5076 raid5_conf_t *conf = mddev->private;
5713 int err = 0; 5077 int err = 0;
5714 int number = rdev->raid_disk; 5078 mdk_rdev_t *rdev;
5715 struct md_rdev **rdevp;
5716 struct disk_info *p = conf->disks + number; 5079 struct disk_info *p = conf->disks + number;
5717 5080
5718 print_raid5_conf(conf); 5081 print_raid5_conf(conf);
5719 if (rdev == p->rdev) 5082 rdev = p->rdev;
5720 rdevp = &p->rdev; 5083 if (rdev) {
5721 else if (rdev == p->replacement) 5084 if (number >= conf->raid_disks &&
5722 rdevp = &p->replacement; 5085 conf->reshape_progress == MaxSector)
5723 else 5086 clear_bit(In_sync, &rdev->flags);
5724 return 0;
5725
5726 if (number >= conf->raid_disks &&
5727 conf->reshape_progress == MaxSector)
5728 clear_bit(In_sync, &rdev->flags);
5729 5087
5730 if (test_bit(In_sync, &rdev->flags) || 5088 if (test_bit(In_sync, &rdev->flags) ||
5731 atomic_read(&rdev->nr_pending)) { 5089 atomic_read(&rdev->nr_pending)) {
5732 err = -EBUSY; 5090 err = -EBUSY;
5733 goto abort; 5091 goto abort;
5734 } 5092 }
5735 /* Only remove non-faulty devices if recovery 5093 /* Only remove non-faulty devices if recovery
5736 * isn't possible. 5094 * isn't possible.
5737 */
5738 if (!test_bit(Faulty, &rdev->flags) &&
5739 mddev->recovery_disabled != conf->recovery_disabled &&
5740 !has_failed(conf) &&
5741 (!p->replacement || p->replacement == rdev) &&
5742 number < conf->raid_disks) {
5743 err = -EBUSY;
5744 goto abort;
5745 }
5746 *rdevp = NULL;
5747 synchronize_rcu();
5748 if (atomic_read(&rdev->nr_pending)) {
5749 /* lost the race, try later */
5750 err = -EBUSY;
5751 *rdevp = rdev;
5752 } else if (p->replacement) {
5753 /* We must have just cleared 'rdev' */
5754 p->rdev = p->replacement;
5755 clear_bit(Replacement, &p->replacement->flags);
5756 smp_mb(); /* Make sure other CPUs may see both as identical
5757 * but will never see neither - if they are careful
5758 */
5759 p->replacement = NULL;
5760 clear_bit(WantReplacement, &rdev->flags);
5761 } else
5762 /* We might have just removed the Replacement as faulty-
5763 * clear the bit just in case
5764 */
5765 clear_bit(WantReplacement, &rdev->flags);
5096 if (!test_bit(Faulty, &rdev->flags) &&
5097 mddev->recovery_disabled != conf->recovery_disabled &&
5098 !has_failed(conf) &&
5099 number < conf->raid_disks) {
5100 err = -EBUSY;
5101 goto abort;
5102 }
5103 p->rdev = NULL;
5104 synchronize_rcu();
5105 if (atomic_read(&rdev->nr_pending)) {
5106 /* lost the race, try later */
5107 err = -EBUSY;
5108 p->rdev = rdev;
5109 }
5110 }
5766 abort:
5767
5768 print_raid5_conf(conf);
5769 return err;
5770 }
5771
5772 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5117 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5773 {
5774 struct r5conf *conf = mddev->private;
5119 raid5_conf_t *conf = mddev->private;
5775 int err = -EEXIST;
5776 int disk;
5777 struct disk_info *p;
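The raid5_remove_disk() body above (removed side) follows a publish/withdraw pattern: clear the slot pointer, wait out concurrent readers, then re-check nr_pending and put the pointer back if I/O slipped in. A condensed userspace sketch of that shape; the stub wait_for_readers() stands in for the kernel's synchronize_rcu(), and all names here are illustrative rather than the driver's:

    #include <stdio.h>
    #include <stddef.h>

    struct rdev { int nr_pending; };        /* in-flight I/O count (atomic in the kernel) */
    struct slot { struct rdev *rdev; };     /* RCU-protected pointer in the kernel */

    /* Stand-in for synchronize_rcu(): wait until concurrent readers are done. */
    static void wait_for_readers(void) { }

    static int remove_disk(struct slot *p)
    {
            struct rdev *rdev = p->rdev;

            if (!rdev || rdev->nr_pending)
                    return -1;              /* busy, caller retries later */

            p->rdev = NULL;                 /* unpublish the device */
            wait_for_readers();             /* readers that started earlier finish here */
            if (rdev->nr_pending) {
                    p->rdev = rdev;         /* lost the race, restore and retry later */
                    return -1;
            }
            return 0;                       /* now safe to tear rdev down */
    }

    int main(void)
    {
            struct rdev r = { .nr_pending = 0 };
            struct slot s = { .rdev = &r };
            printf("remove_disk -> %d\n", remove_disk(&s));
            return 0;
    }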
@@ -5781,7 +5126,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5781 if (mddev->recovery_disabled == conf->recovery_disabled)
5782 return -EBUSY;
5783
5784 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5129 if (has_failed(conf))
5785 /* no point adding a device */
5786 return -EINVAL;
5787
@@ -5795,39 +5140,24 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5795 if (rdev->saved_raid_disk >= 0 &&
5796 rdev->saved_raid_disk >= first &&
5797 conf->disks[rdev->saved_raid_disk].rdev == NULL)
5798 first = rdev->saved_raid_disk;
5799
5800 for (disk = first; disk <= last; disk++) {
5801 p = conf->disks + disk;
5802 if (p->rdev == NULL) {
5143 disk = rdev->saved_raid_disk;
5144 else
5145 disk = first;
5146 for ( ; disk <= last ; disk++)
5147 if ((p=conf->disks + disk)->rdev == NULL) {
5803 clear_bit(In_sync, &rdev->flags);
5804 rdev->raid_disk = disk;
5805 err = 0;
5806 if (rdev->saved_raid_disk != disk)
5807 conf->fullsync = 1;
5808 rcu_assign_pointer(p->rdev, rdev);
5809 goto out;
5810 }
5811 }
5812 for (disk = first; disk <= last; disk++) {
5813 p = conf->disks + disk;
5814 if (test_bit(WantReplacement, &p->rdev->flags) &&
5815 p->replacement == NULL) {
5816 clear_bit(In_sync, &rdev->flags);
5817 set_bit(Replacement, &rdev->flags);
5818 rdev->raid_disk = disk;
5819 err = 0;
5820 conf->fullsync = 1;
5821 rcu_assign_pointer(p->replacement, rdev);
5822 break;
5823 }
5824 }
5825out:
5826 print_raid5_conf(conf);
5827 return err;
5828 }
5829
5830 static int raid5_resize(struct mddev *mddev, sector_t sectors)
5160 static int raid5_resize(mddev_t *mddev, sector_t sectors)
5831 {
5832 /* no resync is happening, and there is enough space
5833 * on all devices, so we can resize.
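Both versions of raid5_add_disk() above prefer the slot a re-added device previously occupied (saved_raid_disk) and otherwise take the first free slot. A tiny standalone model of that choice; the helper is hypothetical, not the driver code:

    #include <stdio.h>

    static int pick_slot(const int slot_used[], int nslots, int saved_slot)
    {
            if (saved_slot >= 0 && saved_slot < nslots && !slot_used[saved_slot])
                    return saved_slot;          /* re-add where it used to live */
            for (int d = 0; d < nslots; d++)
                    if (!slot_used[d])
                            return d;           /* first free slot */
            return -1;                          /* array is full */
    }

    int main(void)
    {
            int used[5] = { 1, 1, 0, 0, 1 };
            printf("saved=3 -> %d\n", pick_slot(used, 5, 3));   /* 3 */
            printf("saved=4 -> %d\n", pick_slot(used, 5, 4));   /* 2: slot 4 is taken */
            return 0;
    }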
@@ -5836,18 +5166,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
5836 * any io in the removed space completes, but it hardly seems
5837 * worth it.
5838 */
5839 sector_t newsize;
5840 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5841 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
5842 if (mddev->external_size &&
5843 mddev->array_sectors > newsize)
5170 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5171 mddev->raid_disks));
5172 if (mddev->array_sectors >
5173 raid5_size(mddev, sectors, mddev->raid_disks))
5844 return -EINVAL;
5845 if (mddev->bitmap) {
5846 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
5847 if (ret)
5848 return ret;
5849 }
5850 md_set_array_sectors(mddev, newsize);
5851 set_capacity(mddev->gendisk, mddev->array_sectors);
5852 revalidate_disk(mddev->gendisk);
5853 if (sectors > mddev->dev_sectors &&
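raid5_resize() above rounds the requested size down to a whole number of chunks with sectors &= ~((sector_t)mddev->chunk_sectors - 1), which equals plain truncating division because the chunk size is a power of two. A quick standalone check with made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t chunk_sectors = 128;       /* 64 KiB chunks, a power of two */
            uint64_t sectors = 1000003;         /* requested device size in sectors */

            uint64_t masked  = sectors & ~(chunk_sectors - 1);
            uint64_t divided = (sectors / chunk_sectors) * chunk_sectors;

            printf("masked=%llu divided=%llu\n",
                   (unsigned long long)masked, (unsigned long long)divided);
            return 0;
    }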
@@ -5860,7 +5184,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
5860 return 0;
5861 }
5862
5863 static int check_stripe_cache(struct mddev *mddev)
5187 static int check_stripe_cache(mddev_t *mddev)
5864 {
5865 /* Can only proceed if there are plenty of stripe_heads.
5866 * We need a minimum of one full stripe,, and for sensible progress
@@ -5870,7 +5194,7 @@ static int check_stripe_cache(struct mddev *mddev)
5870 * If the chunk size is greater, user-space should request more
5871 * stripe_heads first.
5872 */
5873 struct r5conf *conf = mddev->private;
5197 raid5_conf_t *conf = mddev->private;
5874 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5875 > conf->max_nr_stripes ||
5876 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
@@ -5884,14 +5208,17 @@ static int check_stripe_cache(struct mddev *mddev)
5884 return 1;
5885 }
5886
5887 static int check_reshape(struct mddev *mddev)
5211 static int check_reshape(mddev_t *mddev)
5888 {
5889 struct r5conf *conf = mddev->private;
5213 raid5_conf_t *conf = mddev->private;
5890
5891 if (mddev->delta_disks == 0 &&
5892 mddev->new_layout == mddev->layout &&
5893 mddev->new_chunk_sectors == mddev->chunk_sectors)
5894 return 0; /* nothing to do */
5219 if (mddev->bitmap)
5220 /* Cannot grow a bitmap yet */
5221 return -EBUSY;
5895 if (has_failed(conf))
5896 return -EINVAL;
5897 if (mddev->delta_disks < 0) {
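check_stripe_cache() above demands room for four full chunks' worth of stripe_heads at both the current and the new chunk size before a reshape may proceed. A standalone rendering of that arithmetic; STRIPE_SIZE is the 4 KiB raid5 stripe page, the other figures are invented for the demo:

    #include <stdio.h>

    #define STRIPE_SIZE 4096

    static int cache_big_enough(int chunk_sectors, int new_chunk_sectors, int max_nr_stripes)
    {
            /* sectors -> bytes (<< 9) -> stripe pages, then demand four chunks of them */
            int need_cur = ((chunk_sectors << 9) / STRIPE_SIZE) * 4;
            int need_new = ((new_chunk_sectors << 9) / STRIPE_SIZE) * 4;
            return need_cur <= max_nr_stripes && need_new <= max_nr_stripes;
    }

    int main(void)
    {
            /* 512 KiB chunks need 512 stripe_heads; a cache of 256 is too small. */
            printf("%d\n", cache_big_enough(1024, 1024, 256));    /* 0 */
            printf("%d\n", cache_big_enough(1024, 1024, 1024));   /* 1 */
            return 0;
    }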
@@ -5910,14 +5237,13 @@ static int check_reshape(struct mddev *mddev)
5910 if (!check_stripe_cache(mddev))
5911 return -ENOSPC;
5912
5913 return resize_stripes(conf, (conf->previous_raid_disks
5914 + mddev->delta_disks));
5240 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5915 }
5916
5917 static int raid5_start_reshape(struct mddev *mddev)
5243 static int raid5_start_reshape(mddev_t *mddev)
5918 {
5919 struct r5conf *conf = mddev->private;
5245 raid5_conf_t *conf = mddev->private;
5920 struct md_rdev *rdev;
5246 mdk_rdev_t *rdev;
5921 int spares = 0;
5922 unsigned long flags;
5923
@@ -5927,14 +5253,10 @@ static int raid5_start_reshape(struct mddev *mddev)
5927 if (!check_stripe_cache(mddev))
5928 return -ENOSPC;
5929
5930 if (has_failed(conf))
5931 return -EINVAL;
5932
5933 rdev_for_each(rdev, mddev) {
5256 list_for_each_entry(rdev, &mddev->disks, same_set)
5934 if (!test_bit(In_sync, &rdev->flags)
5935 && !test_bit(Faulty, &rdev->flags))
5936 spares++;
5937 }
5938
5939 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5940 /* Not enough devices even to make a degraded array
@@ -5961,16 +5283,12 @@ static int raid5_start_reshape(struct mddev *mddev)
5961 conf->chunk_sectors = mddev->new_chunk_sectors;
5962 conf->prev_algo = conf->algorithm;
5963 conf->algorithm = mddev->new_layout;
5964 conf->generation++;
5965 /* Code that selects data_offset needs to see the generation update
5966 * if reshape_progress has been set - so a memory barrier needed.
5967 */
5968 smp_mb();
5969 if (mddev->reshape_backwards)
5286 if (mddev->delta_disks < 0)
5970 conf->reshape_progress = raid5_size(mddev, 0, 0);
5971 else
5972 conf->reshape_progress = 0;
5973 conf->reshape_safe = conf->reshape_progress;
5291 conf->generation++;
5974 spin_unlock_irq(&conf->device_lock);
5975
5976 /* Add some new drives, as many as will fit.
@@ -5981,14 +5299,16 @@ static int raid5_start_reshape(struct mddev *mddev)
5981 * such devices during the reshape and confusion could result.
5982 */
5983 if (mddev->delta_disks >= 0) {
5984 rdev_for_each(rdev, mddev)
5302 int added_devices = 0;
5303 list_for_each_entry(rdev, &mddev->disks, same_set)
5985 if (rdev->raid_disk < 0 &&
5986 !test_bit(Faulty, &rdev->flags)) {
5987 if (raid5_add_disk(mddev, rdev) == 0) {
5988 if (rdev->raid_disk
5989 >= conf->previous_raid_disks)
5308 >= conf->previous_raid_disks) {
5990 set_bit(In_sync, &rdev->flags);
5991 else
5310 added_devices++;
5311 } else
5992 rdev->recovery_offset = 0;
5993
5994 if (sysfs_link_rdev(mddev, rdev))
@@ -5998,6 +5318,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5998 && !test_bit(Faulty, &rdev->flags)) {
5999 /* This is a spare that was manually added */
6000 set_bit(In_sync, &rdev->flags);
5321 added_devices++;
6001 }
6002
6003 /* When a reshape changes the number of devices,
@@ -6005,7 +5326,8 @@ static int raid5_start_reshape(struct mddev *mddev)
6005 * pre and post number of devices.
6006 */
6007 spin_lock_irqsave(&conf->device_lock, flags);
6008 mddev->degraded = calc_degraded(conf);
5329 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5330 - added_devices;
6009 spin_unlock_irqrestore(&conf->device_lock, flags);
6010 }
6011 mddev->raid_disks = conf->raid_disks;
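The degraded-count adjustment above (restored side) charges one degraded slot for every newly created slot that did not receive a device when the reshape started. A toy model of that bookkeeping with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
            int previous_raid_disks = 4;
            int raid_disks = 6;          /* growing by two slots */
            int added_devices = 1;       /* only one spare was available */
            int degraded = 0;            /* array was clean before the reshape */

            degraded += (raid_disks - previous_raid_disks) - added_devices;
            printf("degraded after starting the reshape: %d\n", degraded);   /* 1 */
            return 0;
    }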
@@ -6022,11 +5344,7 @@ static int raid5_start_reshape(struct mddev *mddev)
6022 mddev->recovery = 0;
6023 spin_lock_irq(&conf->device_lock);
6024 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6025 rdev_for_each(rdev, mddev)
6026 rdev->new_data_offset = rdev->data_offset;
6027 smp_wmb();
6028 conf->reshape_progress = MaxSector;
6029 mddev->reshape_position = MaxSector;
6030 spin_unlock_irq(&conf->device_lock);
6031 return -EAGAIN;
6032 }
@@ -6039,17 +5357,13 @@ static int raid5_start_reshape(struct mddev *mddev)
6039 /* This is called from the reshape thread and should make any
6040 * changes needed in 'conf'
6041 */
6042 static void end_reshape(struct r5conf *conf)
5360 static void end_reshape(raid5_conf_t *conf)
6043 {
6044
6045 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
6046 struct md_rdev *rdev;
6047
6048 spin_lock_irq(&conf->device_lock);
6049 conf->previous_raid_disks = conf->raid_disks;
6050 rdev_for_each(rdev, conf->mddev)
6051 rdev->data_offset = rdev->new_data_offset;
6052 smp_wmb();
6053 conf->reshape_progress = MaxSector;
6054 spin_unlock_irq(&conf->device_lock);
6055 wake_up(&conf->wait_for_overlap);
@@ -6070,9 +5384,9 @@ static void end_reshape(struct r5conf *conf)
6070 /* This is called from the raid5d thread with mddev_lock held.
6071 * It makes config changes to the device.
6072 */
6073 static void raid5_finish_reshape(struct mddev *mddev)
5387 static void raid5_finish_reshape(mddev_t *mddev)
6074 {
6075 struct r5conf *conf = mddev->private;
5389 raid5_conf_t *conf = mddev->private;
6076
6077 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6078
@@ -6082,31 +5396,32 @@ static void raid5_finish_reshape(struct mddev *mddev)
6082 revalidate_disk(mddev->gendisk);
6083 } else {
6084 int d;
6085 spin_lock_irq(&conf->device_lock);
6086 mddev->degraded = calc_degraded(conf);
6087 spin_unlock_irq(&conf->device_lock);
5399 mddev->degraded = conf->raid_disks;
5400 for (d = 0; d < conf->raid_disks ; d++)
5401 if (conf->disks[d].rdev &&
5402 test_bit(In_sync,
5403 &conf->disks[d].rdev->flags))
5404 mddev->degraded--;
6088 for (d = conf->raid_disks ;
6089 d < conf->raid_disks - mddev->delta_disks;
6090 d++) {
6091 struct md_rdev *rdev = conf->disks[d].rdev;
6092 if (rdev)
6093 clear_bit(In_sync, &rdev->flags);
6094 rdev = conf->disks[d].replacement;
6095 if (rdev)
6096 clear_bit(In_sync, &rdev->flags);
5408 mdk_rdev_t *rdev = conf->disks[d].rdev;
5409 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5410 sysfs_unlink_rdev(mddev, rdev);
5411 rdev->raid_disk = -1;
5412 }
6097 }
6098 }
6099 mddev->layout = conf->algorithm;
6100 mddev->chunk_sectors = conf->chunk_sectors;
6101 mddev->reshape_position = MaxSector;
6102 mddev->delta_disks = 0;
6103 mddev->reshape_backwards = 0;
6104 }
6105 }
6106
6107 static void raid5_quiesce(struct mddev *mddev, int state)
5422 static void raid5_quiesce(mddev_t *mddev, int state)
6108 {
6109 struct r5conf *conf = mddev->private;
5424 raid5_conf_t *conf = mddev->private;
6110
6111 switch(state) {
6112 case 2: /* resume for a suspend */
@@ -6122,7 +5437,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6122 wait_event_lock_irq(conf->wait_for_stripe,
6123 atomic_read(&conf->active_stripes) == 0 &&
6124 atomic_read(&conf->active_aligned_reads) == 0,
6125 conf->device_lock);
5440 conf->device_lock, /* nothing */);
6126 conf->quiesce = 1;
6127 spin_unlock_irq(&conf->device_lock);
6128 /* allow reshape to continue */
@@ -6140,20 +5455,20 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6140 }
6141
6142
6143 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
5458 static void *raid45_takeover_raid0(mddev_t *mddev, int level)
6144 {
6145 struct r0conf *raid0_conf = mddev->private;
5460 struct raid0_private_data *raid0_priv = mddev->private;
6146 sector_t sectors;
6147
6148 /* for raid0 takeover only one zone is supported */
6149 if (raid0_conf->nr_strip_zones > 1) {
5464 if (raid0_priv->nr_strip_zones > 1) {
6150 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
6151 mdname(mddev));
6152 return ERR_PTR(-EINVAL);
6153 }
6154
6155 sectors = raid0_conf->strip_zone[0].zone_end;
6156 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
5470 sectors = raid0_priv->strip_zone[0].zone_end;
5471 sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
6157 mddev->dev_sectors = sectors;
6158 mddev->new_level = level;
6159 mddev->new_layout = ALGORITHM_PARITY_N;
@@ -6167,7 +5482,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
6167 }
6168
6169
6170 static void *raid5_takeover_raid1(struct mddev *mddev)
5485 static void *raid5_takeover_raid1(mddev_t *mddev)
6171 {
6172 int chunksect;
6173
@@ -6194,7 +5509,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
6194 return setup_conf(mddev);
6195 }
6196
6197 static void *raid5_takeover_raid6(struct mddev *mddev)
5512 static void *raid5_takeover_raid6(mddev_t *mddev)
6198 {
6199 int new_layout;
6200
@@ -6228,14 +5543,14 @@ static void *raid5_takeover_raid6(struct mddev *mddev)
6228 }
6229
6230
6231 static int raid5_check_reshape(struct mddev *mddev)
5546 static int raid5_check_reshape(mddev_t *mddev)
6232 {
6233 /* For a 2-drive array, the layout and chunk size can be changed
6234 * immediately as not restriping is needed.
6235 * For larger arrays we record the new value - after validation
6236 * to be used by a reshape pass.
6237 */
6238 struct r5conf *conf = mddev->private;
5553 raid5_conf_t *conf = mddev->private;
6239 int new_chunk = mddev->new_chunk_sectors;
6240
6241 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
@@ -6268,7 +5583,7 @@ static int raid5_check_reshape(struct mddev *mddev)
6268 return check_reshape(mddev);
6269 }
6270
6271 static int raid6_check_reshape(struct mddev *mddev)
5586 static int raid6_check_reshape(mddev_t *mddev)
6272 {
6273 int new_chunk = mddev->new_chunk_sectors;
6274
@@ -6288,7 +5603,7 @@ static int raid6_check_reshape(struct mddev *mddev)
6288 return check_reshape(mddev);
6289 }
6290
6291 static void *raid5_takeover(struct mddev *mddev)
5606 static void *raid5_takeover(mddev_t *mddev)
6292 {
6293 /* raid5 can take over:
6294 * raid0 - if there is only one strip zone - make it a raid4 layout
@@ -6311,7 +5626,7 @@ static void *raid5_takeover(struct mddev *mddev)
6311 return ERR_PTR(-EINVAL);
6312 }
6313
6314 static void *raid4_takeover(struct mddev *mddev)
5629 static void *raid4_takeover(mddev_t *mddev)
6315 {
6316 /* raid4 can take over:
6317 * raid0 - if there is only one strip zone
@@ -6328,9 +5643,9 @@ static void *raid4_takeover(struct mddev *mddev)
6328 return ERR_PTR(-EINVAL);
6329 }
6330
6331 static struct md_personality raid5_personality;
5646 static struct mdk_personality raid5_personality;
6332
6333 static void *raid6_takeover(struct mddev *mddev)
5648 static void *raid6_takeover(mddev_t *mddev)
6334 {
6335 /* Currently can only take over a raid5. We map the
6336 * personality to an equivalent raid6 personality
@@ -6377,7 +5692,7 @@ static void *raid6_takeover(struct mddev *mddev)
6377 }
6378
6379
6380 static struct md_personality raid6_personality =
5695 static struct mdk_personality raid6_personality =
6381 {
6382 .name = "raid6",
6383 .level = 6,
@@ -6399,7 +5714,7 @@ static struct md_personality raid6_personality =
6399 .quiesce = raid5_quiesce,
6400 .takeover = raid6_takeover,
6401 };
6402 static struct md_personality raid5_personality =
5717 static struct mdk_personality raid5_personality =
6403 {
6404 .name = "raid5",
6405 .level = 5,
@@ -6422,7 +5737,7 @@ static struct md_personality raid5_personality =
6422 .takeover = raid5_takeover,
6423 };
6424
6425 static struct md_personality raid4_personality =
5740 static struct mdk_personality raid4_personality =
6426 {
6427 .name = "raid4",
6428 .level = 4,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1f..11b9566184b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -27,7 +27,7 @@
27 * The possible state transitions are:
28 *
29 * Empty -> Want - on read or write to get old data for parity calc
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.
30 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
31 * Empty -> Clean - on compute_block when computing a block for failed drive
32 * Want -> Empty - on failed read
33 * Want -> Clean - on successful completion of read request
@@ -197,7 +197,7 @@ enum reconstruct_states {
197 struct stripe_head {
198 struct hlist_node hash;
199 struct list_head lru; /* inactive_list or handle_list */
200 struct r5conf *raid_conf;
200 struct raid5_private_data *raid_conf;
201 short generation; /* increments with every
202 * reshape */
203 sector_t sector; /* sector of this row */
@@ -210,7 +210,6 @@ struct stripe_head {
210 int disks; /* disks in stripe */
211 enum check_states check_state;
212 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock;
214 /**
215 * struct stripe_operations
216 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -227,11 +226,8 @@ struct stripe_head {
227 #endif
228 } ops;
229 struct r5dev {
230 /* rreq and rvec are used for the replacement device when
231 * writing data to both devices.
232 */
233 struct bio req, rreq;
234 struct bio_vec vec, rvec;
229 struct bio req;
230 struct bio_vec vec;
235 struct page *page;
236 struct bio *toread, *read, *towrite, *written;
237 sector_t sector; /* sector of this page */
@@ -243,13 +239,7 @@ struct stripe_head {
243 * for handle_stripe.
244 */
245 struct stripe_head_state {
246 /* 'syncing' means that we need to read all devices, either
247 * to check/correct parity, or to reconstruct a missing device.
248 * 'replacing' means we are replacing one or more drives and
249 * the source is valid at this point so we don't need to
250 * read all devices, just the replacement targets.
251 */
252 int syncing, expanding, expanded, replacing;
242 int syncing, expanding, expanded;
253 int locked, uptodate, to_read, to_write, failed, written;
254 int to_fill, compute, req_compute, non_overwrite;
255 int failed_num[2];
@@ -258,48 +248,42 @@ struct stripe_head_state {
258 unsigned long ops_request;
259
260 struct bio *return_bi;
261 struct md_rdev *blocked_rdev;
251 mdk_rdev_t *blocked_rdev;
262 int handle_bad_blocks;
263 };
264
265 /* Flags for struct r5dev.flags */
266 enum r5dev_flags {
267 R5_UPTODATE, /* page contains current data */
268 R5_LOCKED, /* IO has been submitted on "req" */
269 R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
270 R5_OVERWRITE, /* towrite covers whole page */
255 /* Flags */
256 #define R5_UPTODATE 0 /* page contains current data */
257 #define R5_LOCKED 1 /* IO has been submitted on "req" */
258 #define R5_OVERWRITE 2 /* towrite covers whole page */
271 /* and some that are internal to handle_stripe */
272 R5_Insync, /* rdev && rdev->in_sync at start */
273 R5_Wantread, /* want to schedule a read */
274 R5_Wantwrite,
275 R5_Overlap, /* There is a pending overlapping request
276 * on this block */
277 R5_ReadNoMerge, /* prevent bio from merging in block-layer */
278 R5_ReadError, /* seen a read error here recently */
279 R5_ReWrite, /* have tried to over-write the readerror */
280
281 R5_Expanded, /* This block now has post-expand data */
282 R5_Wantcompute, /* compute_block in progress treat as
283 * uptodate
284 */
285 R5_Wantfill, /* dev->toread contains a bio that needs
286 * filling
287 */
288 R5_Wantdrain, /* dev->towrite needs to be drained */
289 R5_WantFUA, /* Write should be FUA */
290 R5_SyncIO, /* The IO is sync */
291 R5_WriteError, /* got a write error - need to record it */
292 R5_MadeGood, /* A bad block has been fixed by writing to it */
293 R5_ReadRepl, /* Will/did read from replacement rather than orig */
294 R5_MadeGoodRepl,/* A bad block on the replacement device has been
295 * fixed by writing to it */
296 R5_NeedReplace, /* This device has a replacement which is not
297 * up-to-date at this stripe. */
298 R5_WantReplace, /* We need to update the replacement, we have read
299 * data in, and now is a good time to write it out.
300 */
301 R5_Discard, /* Discard the stripe */
302 };
260 #define R5_Insync 3 /* rdev && rdev->in_sync at start */
261 #define R5_Wantread 4 /* want to schedule a read */
262 #define R5_Wantwrite 5
263 #define R5_Overlap 7 /* There is a pending overlapping request on this block */
264 #define R5_ReadError 8 /* seen a read error here recently */
265 #define R5_ReWrite 9 /* have tried to over-write the readerror */
266
267 #define R5_Expanded 10 /* This block now has post-expand data */
268 #define R5_Wantcompute 11 /* compute_block in progress treat as
269 * uptodate
270 */
271 #define R5_Wantfill 12 /* dev->toread contains a bio that needs
272 * filling
273 */
274 #define R5_Wantdrain 13 /* dev->towrite needs to be drained */
275 #define R5_WantFUA 14 /* Write should be FUA */
276 #define R5_WriteError 15 /* got a write error - need to record it */
277 #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
278 /*
279 * Write method
280 */
281 #define RECONSTRUCT_WRITE 1
282 #define READ_MODIFY_WRITE 2
283 /* not a write method, but a compute_parity mode */
284 #define CHECK_PARITY 3
285 /* Additional compute_parity mode -- updates the parity w/o LOCKING */
286 #define UPDATE_PARITY 4
303
304 /*
305 * Stripe state
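Whether the R5_* values above are spelled as an enum (removed side) or as #defines (restored side), they are bit numbers used against an unsigned long flags word with set_bit()/test_bit(). A minimal userspace sketch of that usage, with portable helpers standing in for the kernel bit ops:

    #include <stdio.h>

    enum r5dev_flags {                  /* bit numbers, not masks */
            R5_UPTODATE,
            R5_LOCKED,
            R5_OVERWRITE,
            R5_Insync,
    };

    static void flag_set(unsigned long *flags, int bit) { *flags |= 1UL << bit; }
    static int  flag_test(unsigned long flags, int bit) { return (flags >> bit) & 1UL; }

    int main(void)
    {
            unsigned long flags = 0;

            flag_set(&flags, R5_UPTODATE);
            flag_set(&flags, R5_Insync);

            printf("uptodate=%d locked=%d insync=%d\n",
                   flag_test(flags, R5_UPTODATE),
                   flag_test(flags, R5_LOCKED),
                   flag_test(flags, R5_Insync));
            return 0;
    }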
@@ -322,20 +306,18 @@ enum {
322 STRIPE_BIOFILL_RUN,
323 STRIPE_COMPUTE_RUN,
324 STRIPE_OPS_REQ_PENDING,
325 STRIPE_ON_UNPLUG_LIST,
326 };
327
328 /*
329 * Operation request flags
330 */
331 enum {
332 STRIPE_OP_BIOFILL,
333 STRIPE_OP_COMPUTE_BLK,
334 STRIPE_OP_PREXOR,
335 STRIPE_OP_BIODRAIN,
336 STRIPE_OP_RECONSTRUCT,
337 STRIPE_OP_CHECK,
338 };
314 #define STRIPE_OP_BIOFILL 0
315 #define STRIPE_OP_COMPUTE_BLK 1
316 #define STRIPE_OP_PREXOR 2
317 #define STRIPE_OP_BIODRAIN 3
318 #define STRIPE_OP_RECONSTRUCT 4
319 #define STRIPE_OP_CHECK 5
320
339 /*
340 * Plugging:
341 *
@@ -362,12 +344,13 @@ enum {
362
363
364 struct disk_info {
365 struct md_rdev *rdev, *replacement;
347 mdk_rdev_t *rdev;
366 };
367
368 struct r5conf {
350 struct raid5_private_data {
369 struct hlist_head *stripe_hashtbl;
370 struct mddev *mddev;
352 mddev_t *mddev;
353 struct disk_info *spare;
371 int chunk_sectors;
372 int level, algorithm;
373 int max_degraded;
@@ -390,12 +373,6 @@ struct r5conf {
390 short generation; /* increments with every reshape */
391 unsigned long reshape_checkpoint; /* Time we last updated
392 * metadata */
393 long long min_offset_diff; /* minimum difference between
394 * data_offset and
395 * new_data_offset across all
396 * devices. May be negative,
397 * but is closest to zero.
398 */
399
400 struct list_head handle_list; /* stripes needing handling */
401 struct list_head hold_list; /* preread ready stripes */
@@ -459,9 +436,11 @@ struct r5conf {
459 /* When taking over an array from a different personality, we store
460 * the new thread here until we fully activate the array.
461 */
462 struct md_thread *thread;
439 struct mdk_thread_s *thread;
463 };
464
442 typedef struct raid5_private_data raid5_conf_t;
443
465 /*
466 * Our supported algorithms
467 */
@@ -524,7 +503,7 @@ static inline int algorithm_is_DDF(int layout)
524 return layout >= 8 && layout <= 10;
525 }
526
527 extern int md_raid5_congested(struct mddev *mddev, int bits);
506 extern int md_raid5_congested(mddev_t *mddev, int bits);
528 extern void md_raid5_kick_device(struct r5conf *conf);
507 extern void md_raid5_kick_device(raid5_conf_t *conf);
529 extern int raid5_set_cache_size(struct mddev *mddev, int size);
508 extern int raid5_set_cache_size(mddev_t *mddev, int size);
530 #endif