Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig               |   46
-rw-r--r--  drivers/md/Makefile              |    5
-rw-r--r--  drivers/md/bitmap.c              |  483
-rw-r--r--  drivers/md/dm-crypt.c            |   56
-rw-r--r--  drivers/md/dm-emc.c              |   40
-rw-r--r--  drivers/md/dm-exception-store.c  |   67
-rw-r--r--  drivers/md/dm-ioctl.c            |  109
-rw-r--r--  drivers/md/dm-linear.c           |    8
-rw-r--r--  drivers/md/dm-log.c              |  157
-rw-r--r--  drivers/md/dm-mpath.c            |   43
-rw-r--r--  drivers/md/dm-raid1.c            |   97
-rw-r--r--  drivers/md/dm-round-robin.c      |    6
-rw-r--r--  drivers/md/dm-snap.c             |   16
-rw-r--r--  drivers/md/dm-stripe.c           |   25
-rw-r--r--  drivers/md/dm-table.c            |   57
-rw-r--r--  drivers/md/dm-target.c           |    2
-rw-r--r--  drivers/md/dm-zero.c             |    8
-rw-r--r--  drivers/md/dm.c                  |  184
-rw-r--r--  drivers/md/dm.h                  |   81
-rw-r--r--  drivers/md/kcopyd.c              |    4
-rw-r--r--  drivers/md/linear.c              |   74
-rw-r--r--  drivers/md/md.c                  |  634
-rw-r--r--  drivers/md/raid1.c               |   43
-rw-r--r--  drivers/md/raid10.c              |   77
-rw-r--r--  drivers/md/raid5.c               | 1308
-rw-r--r--  drivers/md/raid6main.c           | 2427
26 files changed, 2528 insertions, 3529 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac25a48362ac..bf869ed03eed 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -90,7 +90,7 @@ config MD_RAID10
         depends on BLK_DEV_MD && EXPERIMENTAL
         ---help---
           RAID-10 provides a combination of striping (RAID-0) and
-          mirroring (RAID-1) with easier configuration and more flexable
+          mirroring (RAID-1) with easier configuration and more flexible
           layout.
           Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
           be the same size (or at least, only as much as the smallest device
@@ -104,8 +104,8 @@ config MD_RAID10
 
           If unsure, say Y.
 
-config MD_RAID5
-        tristate "RAID-4/RAID-5 mode"
+config MD_RAID456
+        tristate "RAID-4/RAID-5/RAID-6 mode"
         depends on BLK_DEV_MD
         ---help---
           A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -116,20 +116,28 @@ config MD_RAID5
           while a RAID-5 set distributes the parity across the drives in one
           of the available parity distribution methods.
 
+          A RAID-6 set of N drives with a capacity of C MB per drive
+          provides the capacity of C * (N - 2) MB, and protects
+          against a failure of any two drives. For a given sector
+          (row) number, (N - 2) drives contain data sectors, and two
+          drives contain two independent redundancy syndromes. Like
+          RAID-5, RAID-6 distributes the syndromes across the drives
+          in one of the available parity distribution methods.
+
           Information about Software RAID on Linux is contained in the
           Software-RAID mini-HOWTO, available from
           <http://www.tldp.org/docs.html#howto>. There you will also
           learn where to get the supporting user space utilities raidtools.
 
-          If you want to use such a RAID-4/RAID-5 set, say Y. To
+          If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
           compile this code as a module, choose M here: the module
-          will be called raid5.
+          will be called raid456.
 
           If unsure, say Y.
 
 config MD_RAID5_RESHAPE
         bool "Support adding drives to a raid-5 array (experimental)"
-        depends on MD_RAID5 && EXPERIMENTAL
+        depends on MD_RAID456 && EXPERIMENTAL
         ---help---
           A RAID-5 set can be expanded by adding extra drives. This
           requires "restriping" the array which means (almost) every
@@ -139,7 +147,7 @@ config MD_RAID5_RESHAPE
           is online. However it is still EXPERIMENTAL code. It should
           work, but please be sure that you have backups.
 
-          You will need mdadm verion 2.4.1 or later to use this
+          You will need mdadm version 2.4.1 or later to use this
           feature safely. During the early stage of reshape there is
           a critical section where live data is being over-written. A
           crash during this time needs extra care for recovery. The
@@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE
           There should be enough spares already present to make the new
           array workable.
 
-config MD_RAID6
-        tristate "RAID-6 mode"
-        depends on BLK_DEV_MD
-        ---help---
-          A RAID-6 set of N drives with a capacity of C MB per drive
-          provides the capacity of C * (N - 2) MB, and protects
-          against a failure of any two drives. For a given sector
-          (row) number, (N - 2) drives contain data sectors, and two
-          drives contains two independent redundancy syndromes. Like
-          RAID-5, RAID-6 distributes the syndromes across the drives
-          in one of the available parity distribution methods.
-
-          RAID-6 requires mdadm-1.5.0 or later, available at:
-
-          ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
-
-          If you want to use such a RAID-6 set, say Y. To compile
-          this code as a module, choose M here: the module will be
-          called raid6.
-
-          If unsure, say Y.
-
 config MD_MULTIPATH
         tristate "Multipath I/O support"
         depends on BLK_DEV_MD
@@ -235,7 +221,7 @@ config DM_SNAPSHOT
         tristate "Snapshot target (EXPERIMENTAL)"
         depends on BLK_DEV_DM && EXPERIMENTAL
         ---help---
-          Allow volume managers to take writeable snapshots of a device.
+          Allow volume managers to take writable snapshots of a device.
 
 config DM_MIRROR
         tristate "Mirror target (EXPERIMENTAL)"
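
[To make the capacity arithmetic in the merged help text concrete: both modes use the same model, one drive's worth of redundancy for RAID-4/5 and two drives' worth for RAID-6. A stand-alone C sketch of that calculation, illustrative only and not part of the patch:

    #include <stdio.h>

    /* Usable capacity for n drives of c MB each. RAID-5 spends one
     * drive's worth of space on parity; RAID-6 spends two drives'
     * worth on its two independent syndromes. */
    static unsigned long raid5_mb(unsigned long n, unsigned long c)
    {
        return (n - 1) * c;     /* typically needs n >= 3 */
    }

    static unsigned long raid6_mb(unsigned long n, unsigned long c)
    {
        return (n - 2) * c;     /* typically needs n >= 4 */
    }

    int main(void)
    {
        /* six 250000 MB drives: 1250000 MB vs 1000000 MB usable */
        printf("raid5 %lu MB, raid6 %lu MB\n",
               raid5_mb(6, 250000), raid6_mb(6, 250000));
        return 0;
    }
]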
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d3efedf6a6ad..34957a68d921 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs   := dm-log.o dm-raid1.o
 md-mod-objs      := md.o bitmap.o
-raid6-objs   := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
+raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
                   raid6int1.o raid6int2.o raid6int4.o \
                   raid6int8.o raid6int16.o raid6int32.o \
                   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
@@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0)      += raid0.o
 obj-$(CONFIG_MD_RAID1)      += raid1.o
 obj-$(CONFIG_MD_RAID10)     += raid10.o
-obj-$(CONFIG_MD_RAID5)      += raid5.o xor.o
-obj-$(CONFIG_MD_RAID6)      += raid6.o xor.o
+obj-$(CONFIG_MD_RAID456)    += raid456.o xor.o
 obj-$(CONFIG_MD_MULTIPATH)  += multipath.o
 obj-$(CONFIG_MD_FAULTY)     += faulty.o
 obj-$(CONFIG_BLK_DEV_MD)    += md-mod.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8ffaee20ff8..ebbd2d856256 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -7,7 +7,6 @@
  * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
  * - added disk storage for bitmap
  * - changes to allow various bitmap chunk sizes
- * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
  */
 
 /*
@@ -15,9 +14,6 @@
  *
  * flush after percent set rather than just time based. (maybe both).
  * wait if count gets too high, wake when it drops to half.
- * allow bitmap to be mirrored with superblock (before or after...)
- * allow hot-add to re-instate a current device.
- * allow hot-add of bitmap after quiessing device
  */
 
 #include <linux/module.h>
@@ -73,24 +69,6 @@ static inline char * bmname(struct bitmap *bitmap)
 
 
 /*
- * test if the bitmap is active
- */
-int bitmap_active(struct bitmap *bitmap)
-{
-        unsigned long flags;
-        int res = 0;
-
-        if (!bitmap)
-                return res;
-        spin_lock_irqsave(&bitmap->lock, flags);
-        res = bitmap->flags & BITMAP_ACTIVE;
-        spin_unlock_irqrestore(&bitmap->lock, flags);
-        return res;
-}
-
-#define WRITE_POOL_SIZE 256
-
-/*
  * just a placeholder - calls kmalloc for bitmap pages
  */
 static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
@@ -269,6 +247,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index)
 
                 if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
                         page->index = index;
+                        attach_page_buffers(page, NULL); /* so that free_buffer will
+                                                          * quietly no-op */
                         return page;
                 }
         }
@@ -300,77 +280,132 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
  */
 static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-        int ret = -ENOMEM;
+        struct buffer_head *bh;
 
         if (bitmap->file == NULL)
                 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
 
-        flush_dcache_page(page); /* make sure visible to anyone reading the file */
+        bh = page_buffers(page);
 
-        if (wait)
-                lock_page(page);
-        else {
-                if (TestSetPageLocked(page))
-                        return -EAGAIN; /* already locked */
-                if (PageWriteback(page)) {
-                        unlock_page(page);
-                        return -EAGAIN;
-                }
+        while (bh && bh->b_blocknr) {
+                atomic_inc(&bitmap->pending_writes);
+                set_buffer_locked(bh);
+                set_buffer_mapped(bh);
+                submit_bh(WRITE, bh);
+                bh = bh->b_this_page;
         }
 
-        ret = page->mapping->a_ops->prepare_write(bitmap->file, page, 0, PAGE_SIZE);
-        if (!ret)
-                ret = page->mapping->a_ops->commit_write(bitmap->file, page, 0,
-                        PAGE_SIZE);
-        if (ret) {
-                unlock_page(page);
-                return ret;
+        if (wait) {
+                wait_event(bitmap->write_wait,
+                           atomic_read(&bitmap->pending_writes)==0);
+                return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
         }
+        return 0;
+}
 
-        set_page_dirty(page); /* force it to be written out */
-
-        if (!wait) {
-                /* add to list to be waited for by daemon */
-                struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
-                item->page = page;
-                get_page(page);
-                spin_lock(&bitmap->write_lock);
-                list_add(&item->list, &bitmap->complete_pages);
-                spin_unlock(&bitmap->write_lock);
-                md_wakeup_thread(bitmap->writeback_daemon);
+static void end_bitmap_write(struct buffer_head *bh, int uptodate)
+{
+        struct bitmap *bitmap = bh->b_private;
+        unsigned long flags;
+
+        if (!uptodate) {
+                spin_lock_irqsave(&bitmap->lock, flags);
+                bitmap->flags |= BITMAP_WRITE_ERROR;
+                spin_unlock_irqrestore(&bitmap->lock, flags);
+        }
+        if (atomic_dec_and_test(&bitmap->pending_writes))
+                wake_up(&bitmap->write_wait);
+}
+
+/* copied from buffer.c */
+static void
+__clear_page_buffers(struct page *page)
+{
+        ClearPagePrivate(page);
+        set_page_private(page, 0);
+        page_cache_release(page);
+}
+static void free_buffers(struct page *page)
+{
+        struct buffer_head *bh = page_buffers(page);
+
+        while (bh) {
+                struct buffer_head *next = bh->b_this_page;
+                free_buffer_head(bh);
+                bh = next;
         }
-        return write_one_page(page, wait);
+        __clear_page_buffers(page);
+        put_page(page);
 }
 
-/* read a page from a file, pinning it into cache, and return bytes_read */
+/* read a page from a file.
+ * We both read the page, and attach buffers to the page to record the
+ * address of each block (using bmap). These addresses will be used
+ * to write the block later, completely bypassing the filesystem.
+ * This usage is similar to how swap files are handled, and allows us
+ * to write to a file with no concerns of memory allocation failing.
+ */
 static struct page *read_page(struct file *file, unsigned long index,
-                        unsigned long *bytes_read)
+                              struct bitmap *bitmap,
+                              unsigned long count)
 {
-        struct inode *inode = file->f_mapping->host;
         struct page *page = NULL;
-        loff_t isize = i_size_read(inode);
-        unsigned long end_index = isize >> PAGE_SHIFT;
+        struct inode *inode = file->f_dentry->d_inode;
+        struct buffer_head *bh;
+        sector_t block;
 
         PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
                 (unsigned long long)index << PAGE_SHIFT);
 
-        page = read_cache_page(inode->i_mapping, index,
-                (filler_t *)inode->i_mapping->a_ops->readpage, file);
+        page = alloc_page(GFP_KERNEL);
+        if (!page)
+                page = ERR_PTR(-ENOMEM);
         if (IS_ERR(page))
                 goto out;
-        wait_on_page_locked(page);
-        if (!PageUptodate(page) || PageError(page)) {
+
+        bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+        if (!bh) {
                 put_page(page);
-                page = ERR_PTR(-EIO);
+                page = ERR_PTR(-ENOMEM);
                 goto out;
         }
+        attach_page_buffers(page, bh);
+        block = index << (PAGE_SHIFT - inode->i_blkbits);
+        while (bh) {
+                if (count == 0)
+                        bh->b_blocknr = 0;
+                else {
+                        bh->b_blocknr = bmap(inode, block);
+                        if (bh->b_blocknr == 0) {
+                                /* Cannot use this file! */
+                                free_buffers(page);
+                                page = ERR_PTR(-EINVAL);
+                                goto out;
+                        }
+                        bh->b_bdev = inode->i_sb->s_bdev;
+                        if (count < (1<<inode->i_blkbits))
+                                count = 0;
+                        else
+                                count -= (1<<inode->i_blkbits);
+
+                        bh->b_end_io = end_bitmap_write;
+                        bh->b_private = bitmap;
+                        atomic_inc(&bitmap->pending_writes);
+                        set_buffer_locked(bh);
+                        set_buffer_mapped(bh);
+                        submit_bh(READ, bh);
+                }
+                block++;
+                bh = bh->b_this_page;
+        }
+        page->index = index;
 
-        if (index > end_index) /* we have read beyond EOF */
-                *bytes_read = 0;
-        else if (index == end_index) /* possible short read */
-                *bytes_read = isize & ~PAGE_MASK;
-        else
-                *bytes_read = PAGE_SIZE; /* got a full page */
+        wait_event(bitmap->write_wait,
+                   atomic_read(&bitmap->pending_writes)==0);
+        if (bitmap->flags & BITMAP_WRITE_ERROR) {
+                free_buffers(page);
+                page = ERR_PTR(-EIO);
+        }
 out:
         if (IS_ERR(page))
                 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
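
[The comment block introduced in read_page() above is the heart of this patch: each block of the bitmap file is resolved to its physical address once, via bmap(), and recorded in a buffer_head; later reads and writes then go straight to the block device, with bitmap->pending_writes counting in-flight buffers and bitmap->write_wait woken as end_bitmap_write() retires them. The same logical-block-to-physical-block lookup can be observed from user space through the FIBMAP ioctl. A rough stand-alone sketch, not part of the patch (FIBMAP normally requires root):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FIBMAP, FIGETBSZ */

    int main(int argc, char **argv)
    {
        int fd, blksz, i;

        if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
            return 1;
        if (ioctl(fd, FIGETBSZ, &blksz) < 0)    /* filesystem block size */
            return 1;
        for (i = 0; i < 4; i++) {
            int blk = i;                        /* in: logical block */
            if (ioctl(fd, FIBMAP, &blk) < 0)    /* out: physical block */
                return 1;
            /* 0 means "no block mapped"; the patch treats that as
             * "cannot use this file" (-EINVAL) */
            printf("logical %d -> physical %d (%d-byte blocks)\n",
                   i, blk, blksz);
        }
        close(fd);
        return 0;
    }
]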
@@ -441,16 +476,14 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         char *reason = NULL;
         bitmap_super_t *sb;
         unsigned long chunksize, daemon_sleep, write_behind;
-        unsigned long bytes_read;
         unsigned long long events;
         int err = -EINVAL;
 
         /* page 0 is the superblock, read it... */
         if (bitmap->file)
-                bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+                bitmap->sb_page = read_page(bitmap->file, 0, bitmap, PAGE_SIZE);
         else {
                 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
-                bytes_read = PAGE_SIZE;
         }
         if (IS_ERR(bitmap->sb_page)) {
                 err = PTR_ERR(bitmap->sb_page);
@@ -460,13 +493,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
         sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
-        if (bytes_read < sizeof(*sb)) { /* short read */
-                printk(KERN_INFO "%s: bitmap file superblock truncated\n",
-                        bmname(bitmap));
-                err = -ENOSPC;
-                goto out;
-        }
-
         chunksize = le32_to_cpu(sb->chunksize);
         daemon_sleep = le32_to_cpu(sb->daemon_sleep);
         write_behind = le32_to_cpu(sb->write_behind);
@@ -550,7 +576,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
                 spin_unlock_irqrestore(&bitmap->lock, flags);
                 return;
         }
-        get_page(bitmap->sb_page);
         spin_unlock_irqrestore(&bitmap->lock, flags);
         sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
         switch (op) {
@@ -561,7 +586,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
         default: BUG();
         }
         kunmap_atomic(sb, KM_USER0);
-        put_page(bitmap->sb_page);
 }
 
 /*
@@ -614,48 +638,17 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 
         while (pages--)
                 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
-                        put_page(map[pages]);
+                        free_buffers(map[pages]);
         kfree(map);
         kfree(attr);
 
-        safe_put_page(sb_page);
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap);
-
-/* dequeue the next item in a page list -- don't call from irq context */
-static struct page_list *dequeue_page(struct bitmap *bitmap)
-{
-        struct page_list *item = NULL;
-        struct list_head *head = &bitmap->complete_pages;
-
-        spin_lock(&bitmap->write_lock);
-        if (list_empty(head))
-                goto out;
-        item = list_entry(head->prev, struct page_list, list);
-        list_del(head->prev);
-out:
-        spin_unlock(&bitmap->write_lock);
-        return item;
-}
-
-static void drain_write_queues(struct bitmap *bitmap)
-{
-        struct page_list *item;
-
-        while ((item = dequeue_page(bitmap))) {
-                /* don't bother to wait */
-                put_page(item->page);
-                mempool_free(item, bitmap->write_pool);
-        }
-
-        wake_up(&bitmap->write_wait);
+        if (sb_page)
+                free_buffers(sb_page);
 }
 
 static void bitmap_file_put(struct bitmap *bitmap)
 {
         struct file *file;
-        struct inode *inode;
         unsigned long flags;
 
         spin_lock_irqsave(&bitmap->lock, flags);
@@ -663,17 +656,14 @@ static void bitmap_file_put(struct bitmap *bitmap)
         bitmap->file = NULL;
         spin_unlock_irqrestore(&bitmap->lock, flags);
 
-        bitmap_stop_daemon(bitmap);
-
-        drain_write_queues(bitmap);
-
+        if (file)
+                wait_event(bitmap->write_wait,
+                           atomic_read(&bitmap->pending_writes)==0);
         bitmap_file_unmap(bitmap);
 
         if (file) {
-                inode = file->f_mapping->host;
-                spin_lock(&inode->i_lock);
-                atomic_set(&inode->i_writecount, 1); /* allow writes again */
-                spin_unlock(&inode->i_lock);
+                struct inode *inode = file->f_dentry->d_inode;
+                invalidate_inode_pages(inode->i_mapping);
                 fput(file);
         }
 }
@@ -708,26 +698,27 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 }
 
 enum bitmap_page_attr {
-        BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
-        BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
-        BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
+        BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced
+        BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared
+        BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced
 };
 
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
                                 enum bitmap_page_attr attr)
 {
-        bitmap->filemap_attr[page->index] |= attr;
+        __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
                                 enum bitmap_page_attr attr)
 {
-        bitmap->filemap_attr[page->index] &= ~attr;
+        __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
+static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+                                           enum bitmap_page_attr attr)
 {
-        return bitmap->filemap_attr[page->index];
+        return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
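
[The page attributes above change from one flag word per page to a packed bit array: four bits are reserved per page, and attribute a of page i lives at bit (i << 2) + a, accessed with __set_bit()/__clear_bit()/test_bit(). A small user-space model of the same indexing, with illustrative names not taken from the patch:

    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    /* attribute a (0..3) of page i sits at bit (i << 2) + a */
    static void set_attr(unsigned long *map, unsigned long i, int a)
    {
        unsigned long bit = (i << 2) + a;
        map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static int test_attr(const unsigned long *map, unsigned long i, int a)
    {
        unsigned long bit = (i << 2) + a;
        return (map[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
        unsigned long map[4] = { 0 };

        set_attr(map, 10, 2);   /* e.g. BITMAP_PAGE_NEEDWRITE (=2) on page 10 */
        printf("%d %d\n", test_attr(map, 10, 2), test_attr(map, 10, 0));
        return 0;
    }
]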
@@ -751,11 +742,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
         page = filemap_get_page(bitmap, chunk);
         bit = file_page_offset(chunk);
 
-
-        /* make sure the page stays cached until it gets written out */
-        if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
-                get_page(page);
-
         /* set the bit */
         kaddr = kmap_atomic(page, KM_USER0);
         if (bitmap->flags & BITMAP_HOSTENDIAN)
@@ -775,7 +761,8 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 int bitmap_unplug(struct bitmap *bitmap)
 {
-        unsigned long i, attr, flags;
+        unsigned long i, flags;
+        int dirty, need_write;
         struct page *page;
         int wait = 0;
         int err;
@@ -792,35 +779,26 @@ int bitmap_unplug(struct bitmap *bitmap)
                         return 0;
                 }
                 page = bitmap->filemap[i];
-                attr = get_page_attr(bitmap, page);
+                dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+                need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
                 clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
                 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-                if ((attr & BITMAP_PAGE_DIRTY))
+                if (dirty)
                         wait = 1;
                 spin_unlock_irqrestore(&bitmap->lock, flags);
 
-                if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
+                if (dirty | need_write)
                         err = write_page(bitmap, page, 0);
-                        if (err == -EAGAIN) {
-                                if (attr & BITMAP_PAGE_DIRTY)
-                                        err = write_page(bitmap, page, 1);
-                                else
-                                        err = 0;
-                        }
-                        if (err)
-                                return 1;
-                }
         }
         if (wait) { /* if any writes were performed, we need to wait on them */
-                if (bitmap->file) {
-                        spin_lock_irq(&bitmap->write_lock);
-                        wait_event_lock_irq(bitmap->write_wait,
-                                list_empty(&bitmap->complete_pages), bitmap->write_lock,
-                                wake_up_process(bitmap->writeback_daemon->tsk));
-                        spin_unlock_irq(&bitmap->write_lock);
-                } else
+                if (bitmap->file)
+                        wait_event(bitmap->write_wait,
+                                   atomic_read(&bitmap->pending_writes)==0);
+                else
                         md_super_wait(bitmap->mddev);
         }
+        if (bitmap->flags & BITMAP_WRITE_ERROR)
+                bitmap_file_kick(bitmap);
         return 0;
 }
 
@@ -842,7 +820,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
         struct page *page = NULL, *oldpage = NULL;
         unsigned long num_pages, bit_cnt = 0;
         struct file *file;
-        unsigned long bytes, offset, dummy;
+        unsigned long bytes, offset;
         int outofdate;
         int ret = -ENOSPC;
         void *paddr;
@@ -879,7 +857,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
         if (!bitmap->filemap)
                 goto out;
 
-        bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
+        /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
+        bitmap->filemap_attr = kzalloc(
+                (((num_pages*4/8)+sizeof(unsigned long)-1)
+                 /sizeof(unsigned long))
+                *sizeof(unsigned long),
+                GFP_KERNEL);
         if (!bitmap->filemap_attr)
                 goto out;
 
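
[The kzalloc() size above follows from the new layout: num_pages * 4 bits of attributes, expressed in bytes and rounded up to a whole number of unsigned longs, since the bit helpers address the array word by word. Worked through under the assumption of 8-byte longs (illustration only, not patch code):

    #include <stdio.h>

    static unsigned long attr_bytes(unsigned long num_pages)
    {
        /* 4 bits per page -> num_pages*4/8 bytes, rounded up to
         * a multiple of sizeof(unsigned long) */
        unsigned long raw = num_pages * 4 / 8;
        return (raw + sizeof(unsigned long) - 1)
                / sizeof(unsigned long) * sizeof(unsigned long);
    }

    int main(void)
    {
        /* e.g. 25 pages: 100 bits -> 12 bytes -> 16 bytes (2 longs) */
        printf("%lu\n", attr_bytes(25));
        return 0;
    }
]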
@@ -890,7 +873,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
                 index = file_page_index(i);
                 bit = file_page_offset(i);
                 if (index != oldindex) { /* this is a new page, read it in */
+                        int count;
                         /* unmap the old page, we're done with it */
+                        if (index == num_pages-1)
+                                count = bytes - index * PAGE_SIZE;
+                        else
+                                count = PAGE_SIZE;
                         if (index == 0) {
                                 /*
                                  * if we're here then the superblock page
@@ -900,7 +888,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
                                 page = bitmap->sb_page;
                                 offset = sizeof(bitmap_super_t);
                         } else if (file) {
-                                page = read_page(file, index, &dummy);
+                                page = read_page(file, index, bitmap, count);
                                 offset = 0;
                         } else {
                                 page = read_sb_page(bitmap->mddev, bitmap->offset, index);
@@ -971,12 +959,11 @@ void bitmap_write_all(struct bitmap *bitmap)
         /* We don't actually write all bitmap blocks here,
          * just flag them as needing to be written
          */
+        int i;
 
-        unsigned long chunks = bitmap->chunks;
-        unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t);
-        unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE;
-        while (num_pages--)
-                bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE;
+        for (i=0; i < bitmap->file_pages; i++)
+                set_page_attr(bitmap, bitmap->filemap[i],
+                              BITMAP_PAGE_NEEDWRITE);
 }
 
 
@@ -1007,7 +994,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
         struct page *page = NULL, *lastpage = NULL;
         int err = 0;
         int blocks;
-        int attr;
         void *paddr;
 
         if (bitmap == NULL)
@@ -1029,43 +1015,34 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 
                 if (page != lastpage) {
                         /* skip this page unless it's marked as needing cleaning */
-                        if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
-                                if (attr & BITMAP_PAGE_NEEDWRITE) {
-                                        get_page(page);
+                        if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
+                                int need_write = test_page_attr(bitmap, page,
+                                                                BITMAP_PAGE_NEEDWRITE);
+                                if (need_write)
                                         clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-                                }
+
                                 spin_unlock_irqrestore(&bitmap->lock, flags);
-                                if (attr & BITMAP_PAGE_NEEDWRITE) {
+                                if (need_write) {
                                         switch (write_page(bitmap, page, 0)) {
-                                        case -EAGAIN:
-                                                set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-                                                break;
                                         case 0:
                                                 break;
                                         default:
                                                 bitmap_file_kick(bitmap);
                                         }
-                                        put_page(page);
                                 }
                                 continue;
                         }
 
                         /* grab the new page, sync and release the old */
-                        get_page(page);
                         if (lastpage != NULL) {
-                                if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
+                                if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
                                         clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
                                         spin_unlock_irqrestore(&bitmap->lock, flags);
                                         err = write_page(bitmap, lastpage, 0);
-                                        if (err == -EAGAIN) {
-                                                err = 0;
-                                                set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-                                        }
                                 } else {
                                         set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
                                         spin_unlock_irqrestore(&bitmap->lock, flags);
                                 }
-                                put_page(lastpage);
                                 if (err)
                                         bitmap_file_kick(bitmap);
                         } else
@@ -1107,131 +1084,19 @@ int bitmap_daemon_work(struct bitmap *bitmap)
         /* now sync the final page */
         if (lastpage != NULL) {
                 spin_lock_irqsave(&bitmap->lock, flags);
-                if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
+                if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
                         clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
                         spin_unlock_irqrestore(&bitmap->lock, flags);
                         err = write_page(bitmap, lastpage, 0);
-                        if (err == -EAGAIN) {
-                                set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-                                err = 0;
-                        }
                 } else {
                         set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
                         spin_unlock_irqrestore(&bitmap->lock, flags);
                 }
-
-                put_page(lastpage);
         }
 
         return err;
 }
 
-static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
-{
-        mdk_thread_t *dmn;
-        unsigned long flags;
-
-        /* if no one is waiting on us, we'll free the md thread struct
-         * and exit, otherwise we let the waiter clean things up */
-        spin_lock_irqsave(&bitmap->lock, flags);
-        if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
-                *daemon = NULL;
-                spin_unlock_irqrestore(&bitmap->lock, flags);
-                kfree(dmn);
-                complete_and_exit(NULL, 0); /* do_exit not exported */
-        }
-        spin_unlock_irqrestore(&bitmap->lock, flags);
-}
-
-static void bitmap_writeback_daemon(mddev_t *mddev)
-{
-        struct bitmap *bitmap = mddev->bitmap;
-        struct page *page;
-        struct page_list *item;
-        int err = 0;
-
-        if (signal_pending(current)) {
-                printk(KERN_INFO
-                       "%s: bitmap writeback daemon got signal, exiting...\n",
-                       bmname(bitmap));
-                err = -EINTR;
-                goto out;
-        }
-        if (bitmap == NULL)
-                /* about to be stopped. */
-                return;
-
-        PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
-        /* wait on bitmap page writebacks */
-        while ((item = dequeue_page(bitmap))) {
-                page = item->page;
-                mempool_free(item, bitmap->write_pool);
-                PRINTK("wait on page writeback: %p\n", page);
-                wait_on_page_writeback(page);
-                PRINTK("finished page writeback: %p\n", page);
-
-                err = PageError(page);
-                put_page(page);
-                if (err) {
-                        printk(KERN_WARNING "%s: bitmap file writeback "
-                               "failed (page %lu): %d\n",
-                               bmname(bitmap), page->index, err);
-                        bitmap_file_kick(bitmap);
-                        goto out;
-                }
-        }
- out:
-        wake_up(&bitmap->write_wait);
-        if (err) {
-                printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
-                       bmname(bitmap), err);
-                daemon_exit(bitmap, &bitmap->writeback_daemon);
-        }
-}
-
-static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
-                        void (*func)(mddev_t *), char *name)
-{
-        mdk_thread_t *daemon;
-        char namebuf[32];
-
-#ifdef INJECT_FATAL_FAULT_2
-        daemon = NULL;
-#else
-        sprintf(namebuf, "%%s_%s", name);
-        daemon = md_register_thread(func, bitmap->mddev, namebuf);
-#endif
-        if (!daemon) {
-                printk(KERN_ERR "%s: failed to start bitmap daemon\n",
-                       bmname(bitmap));
-                return ERR_PTR(-ECHILD);
-        }
-
-        md_wakeup_thread(daemon); /* start it running */
-
-        PRINTK("%s: %s daemon (pid %d) started...\n",
-                bmname(bitmap), name, daemon->tsk->pid);
-
-        return daemon;
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap)
-{
-        /* the daemon can't stop itself... it'll just exit instead... */
-        if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
-            current->pid != bitmap->writeback_daemon->tsk->pid) {
-                mdk_thread_t *daemon;
-                unsigned long flags;
-
-                spin_lock_irqsave(&bitmap->lock, flags);
-                daemon = bitmap->writeback_daemon;
-                bitmap->writeback_daemon = NULL;
-                spin_unlock_irqrestore(&bitmap->lock, flags);
-                if (daemon && ! IS_ERR(daemon))
-                        md_unregister_thread(daemon); /* destroy the thread */
-        }
-}
-
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
                                             sector_t offset, int *blocks,
                                             int create)
@@ -1500,8 +1365,6 @@ static void bitmap_free(struct bitmap *bitmap)
 
         /* free all allocated memory */
 
-        mempool_destroy(bitmap->write_pool);
-
         if (bp) /* deallocate the page memory */
                 for (k = 0; k < pages; k++)
                         if (bp[k].map && !bp[k].hijacked)
@@ -1549,20 +1412,20 @@ int bitmap_create(mddev_t *mddev)
                 return -ENOMEM;
 
         spin_lock_init(&bitmap->lock);
-        bitmap->mddev = mddev;
-
-        spin_lock_init(&bitmap->write_lock);
-        INIT_LIST_HEAD(&bitmap->complete_pages);
+        atomic_set(&bitmap->pending_writes, 0);
         init_waitqueue_head(&bitmap->write_wait);
-        bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
-                                sizeof(struct page_list));
-        err = -ENOMEM;
-        if (!bitmap->write_pool)
-                goto error;
+
+        bitmap->mddev = mddev;
 
         bitmap->file = file;
         bitmap->offset = mddev->bitmap_offset;
-        if (file) get_file(file);
+        if (file) {
+                get_file(file);
+                do_sync_file_range(file, 0, LLONG_MAX,
+                                   SYNC_FILE_RANGE_WAIT_BEFORE |
+                                   SYNC_FILE_RANGE_WRITE |
+                                   SYNC_FILE_RANGE_WAIT_AFTER);
+        }
         /* read superblock from bitmap file (this sets bitmap->chunksize) */
         err = bitmap_read_sb(bitmap);
         if (err)
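
[One consequence of writing the file's blocks directly is visible in bitmap_create() above: any pages of the bitmap file still dirty in the pagecache must be flushed and waited on before the kernel starts bypassing the filesystem, which is what the do_sync_file_range() call does. User space reaches the same machinery through sync_file_range(); a minimal sketch, not part of the patch:

    #define _GNU_SOURCE
    #include <fcntl.h>

    /* write out and wait on every dirty page of fd (nbytes == 0
     * means "through end of file"), roughly the flag combination
     * used by the kernel-side call above */
    int flush_whole_file(int fd)
    {
        return sync_file_range(fd, 0, 0,
                               SYNC_FILE_RANGE_WAIT_BEFORE |
                               SYNC_FILE_RANGE_WRITE |
                               SYNC_FILE_RANGE_WAIT_AFTER);
    }
]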
@@ -1594,8 +1457,6 @@ int bitmap_create(mddev_t *mddev)
         if (!bitmap->bp)
                 goto error;
 
-        bitmap->flags |= BITMAP_ACTIVE;
-
         /* now that we have some pages available, initialize the in-memory
          * bitmap from the on-disk bitmap */
         start = 0;
@@ -1613,15 +1474,6 @@ int bitmap_create(mddev_t *mddev)
 
         mddev->bitmap = bitmap;
 
-        if (file)
-                /* kick off the bitmap writeback daemon */
-                bitmap->writeback_daemon =
-                        bitmap_start_daemon(bitmap,
-                                            bitmap_writeback_daemon,
-                                            "bitmap_wb");
-
-        if (IS_ERR(bitmap->writeback_daemon))
-                return PTR_ERR(bitmap->writeback_daemon);
         mddev->thread->timeout = bitmap->daemon_sleep * HZ;
 
         return bitmap_update_sb(bitmap);
@@ -1638,4 +1490,3 @@ EXPORT_SYMBOL(bitmap_start_sync);
 EXPORT_SYMBOL(bitmap_end_sync);
 EXPORT_SYMBOL(bitmap_unplug);
 EXPORT_SYMBOL(bitmap_close_sync);
-EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 61a590bb6241..6022ed12a795 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -20,7 +20,7 @@
 
 #include "dm.h"
 
-#define PFX "crypt: "
+#define DM_MSG_PREFIX "crypt"
 
 /*
  * per bio private data
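
[The PFX concatenations disappear from every message in this file because, once DM_MSG_PREFIX is defined before including dm.h, the device-mapper logging macros prepend the prefix themselves. Roughly the following pattern; the exact macros live in drivers/md/dm.h and this is only a sketch:

    /* simplified model of the dm.h logging convention */
    #define DM_NAME "device-mapper"

    #define DMERR(f, arg...) \
        printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)

    /* DMERR("register failed %d", r) then logs
     * "device-mapper: crypt: register failed -22"
     * with no per-call-site prefix string */
]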
@@ -125,19 +125,19 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
         u8 *salt;
 
         if (opts == NULL) {
-                ti->error = PFX "Digest algorithm missing for ESSIV mode";
+                ti->error = "Digest algorithm missing for ESSIV mode";
                 return -EINVAL;
         }
 
         /* Hash the cipher key with the given hash algorithm */
         hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP);
         if (hash_tfm == NULL) {
-                ti->error = PFX "Error initializing ESSIV hash";
+                ti->error = "Error initializing ESSIV hash";
                 return -EINVAL;
         }
 
         if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) {
-                ti->error = PFX "Expected digest algorithm for ESSIV hash";
+                ti->error = "Expected digest algorithm for ESSIV hash";
                 crypto_free_tfm(hash_tfm);
                 return -EINVAL;
         }
@@ -145,7 +145,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
         saltsize = crypto_tfm_alg_digestsize(hash_tfm);
         salt = kmalloc(saltsize, GFP_KERNEL);
         if (salt == NULL) {
-                ti->error = PFX "Error kmallocing salt storage in ESSIV";
+                ti->error = "Error kmallocing salt storage in ESSIV";
                 crypto_free_tfm(hash_tfm);
                 return -ENOMEM;
         }
@@ -159,20 +159,20 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
                             CRYPTO_TFM_MODE_ECB |
                             CRYPTO_TFM_REQ_MAY_SLEEP);
         if (essiv_tfm == NULL) {
-                ti->error = PFX "Error allocating crypto tfm for ESSIV";
+                ti->error = "Error allocating crypto tfm for ESSIV";
                 kfree(salt);
                 return -EINVAL;
         }
         if (crypto_tfm_alg_blocksize(essiv_tfm)
             != crypto_tfm_alg_ivsize(cc->tfm)) {
-                ti->error = PFX "Block size of ESSIV cipher does "
+                ti->error = "Block size of ESSIV cipher does "
                             "not match IV size of block cipher";
                 crypto_free_tfm(essiv_tfm);
                 kfree(salt);
                 return -EINVAL;
         }
         if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) {
-                ti->error = PFX "Failed to set key for ESSIV cipher";
+                ti->error = "Failed to set key for ESSIV cipher";
                 crypto_free_tfm(essiv_tfm);
                 kfree(salt);
                 return -EINVAL;
@@ -521,7 +521,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         unsigned long long tmpll;
 
         if (argc != 5) {
-                ti->error = PFX "Not enough arguments";
+                ti->error = "Not enough arguments";
                 return -EINVAL;
         }
 
@@ -532,21 +532,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         ivmode = strsep(&ivopts, ":");
 
         if (tmp)
-                DMWARN(PFX "Unexpected additional cipher options");
+                DMWARN("Unexpected additional cipher options");
 
         key_size = strlen(argv[1]) >> 1;
 
         cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
         if (cc == NULL) {
                 ti->error =
-                        PFX "Cannot allocate transparent encryption context";
+                        "Cannot allocate transparent encryption context";
                 return -ENOMEM;
         }
 
         cc->key_size = key_size;
         if ((!key_size && strcmp(argv[1], "-") != 0) ||
             (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
-                ti->error = PFX "Error decoding key";
+                ti->error = "Error decoding key";
                 goto bad1;
         }
 
@@ -562,22 +562,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         else if (strcmp(chainmode, "ecb") == 0)
                 crypto_flags = CRYPTO_TFM_MODE_ECB;
         else {
-                ti->error = PFX "Unknown chaining mode";
+                ti->error = "Unknown chaining mode";
                 goto bad1;
         }
 
         if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) {
-                ti->error = PFX "This chaining mode requires an IV mechanism";
+                ti->error = "This chaining mode requires an IV mechanism";
                 goto bad1;
         }
 
         tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP);
         if (!tfm) {
-                ti->error = PFX "Error allocating crypto tfm";
+                ti->error = "Error allocating crypto tfm";
                 goto bad1;
         }
         if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
-                ti->error = PFX "Expected cipher algorithm";
+                ti->error = "Expected cipher algorithm";
                 goto bad2;
         }
 
@@ -595,7 +595,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         else if (strcmp(ivmode, "essiv") == 0)
                 cc->iv_gen_ops = &crypt_iv_essiv_ops;
         else {
-                ti->error = PFX "Invalid IV mode";
+                ti->error = "Invalid IV mode";
                 goto bad2;
         }
 
@@ -610,7 +610,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         else {
                 cc->iv_size = 0;
                 if (cc->iv_gen_ops) {
-                        DMWARN(PFX "Selected cipher does not support IVs");
+                        DMWARN("Selected cipher does not support IVs");
                         if (cc->iv_gen_ops->dtr)
                                 cc->iv_gen_ops->dtr(cc);
                         cc->iv_gen_ops = NULL;
@@ -619,36 +619,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
         cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
         if (!cc->io_pool) {
-                ti->error = PFX "Cannot allocate crypt io mempool";
+                ti->error = "Cannot allocate crypt io mempool";
                 goto bad3;
         }
 
         cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
         if (!cc->page_pool) {
-                ti->error = PFX "Cannot allocate page mempool";
+                ti->error = "Cannot allocate page mempool";
                 goto bad4;
         }
 
         if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) {
-                ti->error = PFX "Error setting key";
+                ti->error = "Error setting key";
                 goto bad5;
         }
 
         if (sscanf(argv[2], "%llu", &tmpll) != 1) {
-                ti->error = PFX "Invalid iv_offset sector";
+                ti->error = "Invalid iv_offset sector";
                 goto bad5;
         }
         cc->iv_offset = tmpll;
 
         if (sscanf(argv[4], "%llu", &tmpll) != 1) {
-                ti->error = PFX "Invalid device sector";
+                ti->error = "Invalid device sector";
                 goto bad5;
         }
         cc->start = tmpll;
 
         if (dm_get_device(ti, argv[3], cc->start, ti->len,
                           dm_table_get_mode(ti->table), &cc->dev)) {
-                ti->error = PFX "Device lookup failed";
+                ti->error = "Device lookup failed";
                 goto bad5;
         }
 
@@ -657,7 +657,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                 *(ivopts - 1) = ':';
                 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
                 if (!cc->iv_mode) {
-                        ti->error = PFX "Error kmallocing iv_mode string";
+                        ti->error = "Error kmallocing iv_mode string";
                         goto bad5;
                 }
                 strcpy(cc->iv_mode, ivmode);
@@ -918,13 +918,13 @@ static int __init dm_crypt_init(void)
         _kcryptd_workqueue = create_workqueue("kcryptd");
         if (!_kcryptd_workqueue) {
                 r = -ENOMEM;
-                DMERR(PFX "couldn't create kcryptd");
+                DMERR("couldn't create kcryptd");
                 goto bad1;
         }
 
         r = dm_register_target(&crypt_target);
         if (r < 0) {
-                DMERR(PFX "register failed %d", r);
+                DMERR("register failed %d", r);
                 goto bad2;
         }
 
@@ -942,7 +942,7 @@ static void __exit dm_crypt_exit(void)
         int r = dm_unregister_target(&crypt_target);
 
         if (r < 0)
-                DMERR(PFX "unregister failed %d", r);
+                DMERR("unregister failed %d", r);
 
         destroy_workqueue(_kcryptd_workqueue);
         kmem_cache_destroy(_crypt_io_pool);
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index c7067674dcb7..2a374ccb30dd 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -12,6 +12,8 @@
12#include <scsi/scsi.h> 12#include <scsi/scsi.h>
13#include <scsi/scsi_cmnd.h> 13#include <scsi/scsi_cmnd.h>
14 14
15#define DM_MSG_PREFIX "multipath emc"
16
15struct emc_handler { 17struct emc_handler {
16 spinlock_t lock; 18 spinlock_t lock;
17 19
@@ -66,7 +68,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
66 68
67 bio = bio_alloc(GFP_ATOMIC, 1); 69 bio = bio_alloc(GFP_ATOMIC, 1);
68 if (!bio) { 70 if (!bio) {
69 DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); 71 DMERR("get_failover_bio: bio_alloc() failed.");
70 return NULL; 72 return NULL;
71 } 73 }
72 74
@@ -78,13 +80,13 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
78 80
79 page = alloc_page(GFP_ATOMIC); 81 page = alloc_page(GFP_ATOMIC);
80 if (!page) { 82 if (!page) {
81 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 83 DMERR("get_failover_bio: alloc_page() failed.");
82 bio_put(bio); 84 bio_put(bio);
83 return NULL; 85 return NULL;
84 } 86 }
85 87
86 if (bio_add_page(bio, page, data_size, 0) != data_size) { 88 if (bio_add_page(bio, page, data_size, 0) != data_size) {
87 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 89 DMERR("get_failover_bio: alloc_page() failed.");
88 __free_page(page); 90 __free_page(page);
89 bio_put(bio); 91 bio_put(bio);
90 return NULL; 92 return NULL;
@@ -103,7 +105,7 @@ static struct request *get_failover_req(struct emc_handler *h,
103 /* FIXME: Figure out why it fails with GFP_ATOMIC. */ 105 /* FIXME: Figure out why it fails with GFP_ATOMIC. */
104 rq = blk_get_request(q, WRITE, __GFP_WAIT); 106 rq = blk_get_request(q, WRITE, __GFP_WAIT);
105 if (!rq) { 107 if (!rq) {
106 DMERR("dm-emc: get_failover_req: blk_get_request failed"); 108 DMERR("get_failover_req: blk_get_request failed");
107 return NULL; 109 return NULL;
108 } 110 }
109 111
@@ -160,7 +162,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
160 162
161 bio = get_failover_bio(path, data_size); 163 bio = get_failover_bio(path, data_size);
162 if (!bio) { 164 if (!bio) {
163 DMERR("dm-emc: emc_trespass_get: no bio"); 165 DMERR("emc_trespass_get: no bio");
164 return NULL; 166 return NULL;
165 } 167 }
166 168
@@ -173,7 +175,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
173 /* get request for block layer packet command */ 175 /* get request for block layer packet command */
174 rq = get_failover_req(h, bio, path); 176 rq = get_failover_req(h, bio, path);
175 if (!rq) { 177 if (!rq) {
176 DMERR("dm-emc: emc_trespass_get: no rq"); 178 DMERR("emc_trespass_get: no rq");
177 free_bio(bio); 179 free_bio(bio);
178 return NULL; 180 return NULL;
179 } 181 }
@@ -200,18 +202,18 @@ static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
200 * initial state passed into us and then get an update here. 202 * initial state passed into us and then get an update here.
201 */ 203 */
202 if (!q) { 204 if (!q) {
203 DMINFO("dm-emc: emc_pg_init: no queue"); 205 DMINFO("emc_pg_init: no queue");
204 goto fail_path; 206 goto fail_path;
205 } 207 }
206 208
207 /* FIXME: The request should be pre-allocated. */ 209 /* FIXME: The request should be pre-allocated. */
208 rq = emc_trespass_get(hwh->context, path); 210 rq = emc_trespass_get(hwh->context, path);
209 if (!rq) { 211 if (!rq) {
210 DMERR("dm-emc: emc_pg_init: no rq"); 212 DMERR("emc_pg_init: no rq");
211 goto fail_path; 213 goto fail_path;
212 } 214 }
213 215
214 DMINFO("dm-emc: emc_pg_init: sending switch-over command"); 216 DMINFO("emc_pg_init: sending switch-over command");
215 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); 217 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
216 return; 218 return;
217 219
@@ -241,18 +243,18 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
241 hr = 0; 243 hr = 0;
242 short_trespass = 0; 244 short_trespass = 0;
243 } else if (argc != 2) { 245 } else if (argc != 2) {
244 DMWARN("dm-emc hwhandler: incorrect number of arguments"); 246 DMWARN("incorrect number of arguments");
245 return -EINVAL; 247 return -EINVAL;
246 } else { 248 } else {
247 if ((sscanf(argv[0], "%u", &short_trespass) != 1) 249 if ((sscanf(argv[0], "%u", &short_trespass) != 1)
248 || (short_trespass > 1)) { 250 || (short_trespass > 1)) {
249 DMWARN("dm-emc: invalid trespass mode selected"); 251 DMWARN("invalid trespass mode selected");
250 return -EINVAL; 252 return -EINVAL;
251 } 253 }
252 254
253 if ((sscanf(argv[1], "%u", &hr) != 1) 255 if ((sscanf(argv[1], "%u", &hr) != 1)
254 || (hr > 1)) { 256 || (hr > 1)) {
255 DMWARN("dm-emc: invalid honor reservation flag selected"); 257 DMWARN("invalid honor reservation flag selected");
256 return -EINVAL; 258 return -EINVAL;
257 } 259 }
258 } 260 }
@@ -264,14 +266,14 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
264 hwh->context = h; 266 hwh->context = h;
265 267
266 if ((h->short_trespass = short_trespass)) 268 if ((h->short_trespass = short_trespass))
267 DMWARN("dm-emc: short trespass command will be send"); 269 DMWARN("short trespass command will be send");
268 else 270 else
269 DMWARN("dm-emc: long trespass command will be send"); 271 DMWARN("long trespass command will be send");
270 272
271 if ((h->hr = hr)) 273 if ((h->hr = hr))
272 DMWARN("dm-emc: honor reservation bit will be set"); 274 DMWARN("honor reservation bit will be set");
273 else 275 else
274 DMWARN("dm-emc: honor reservation bit will not be set (default)"); 276 DMWARN("honor reservation bit will not be set (default)");
275 277
276 return 0; 278 return 0;
277} 279}
@@ -336,9 +338,9 @@ static int __init dm_emc_init(void)
336 int r = dm_register_hw_handler(&emc_hwh); 338 int r = dm_register_hw_handler(&emc_hwh);
337 339
338 if (r < 0) 340 if (r < 0)
339 DMERR("emc: register failed %d", r); 341 DMERR("register failed %d", r);
340 342
341 DMINFO("dm-emc version 0.0.3 loaded"); 343 DMINFO("version 0.0.3 loaded");
342 344
343 return r; 345 return r;
344} 346}
@@ -348,7 +350,7 @@ static void __exit dm_emc_exit(void)
348 int r = dm_unregister_hw_handler(&emc_hwh); 350 int r = dm_unregister_hw_handler(&emc_hwh);
349 351
350 if (r < 0) 352 if (r < 0)
351 DMERR("emc: unregister failed %d", r); 353 DMERR("unregister failed %d", r);
352} 354}
353 355
354module_init(dm_emc_init); 356module_init(dm_emc_init);
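The common thread in the dm-emc hunks above is the new DM_MSG_PREFIX convention: each target defines its prefix once and drops the hand-written "dm-emc: "-style prefixes from every message. A minimal userspace sketch of how such macros could expand follows; the real definitions live in drivers/md/dm.h and may differ.

#include <stdio.h>

#define DM_NAME "device-mapper"
#define DM_MSG_PREFIX "emc"	/* each target defines its own prefix */

/* Assumed expansion -- illustration only, not the kernel's macro. */
#define DMERR(fmt, ...) \
	fprintf(stderr, DM_NAME ": " DM_MSG_PREFIX ": " fmt "\n", ##__VA_ARGS__)

int main(void)
{
	/* prints "device-mapper: emc: register failed -22" */
	DMERR("register failed %d", -22);
	return 0;
}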
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index cc07bbebbb16..d12379b5cdb5 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -16,6 +16,8 @@
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18 18
19#define DM_MSG_PREFIX "snapshots"
20
19/*----------------------------------------------------------------- 21/*-----------------------------------------------------------------
20 * Persistent snapshots, by persistent we mean that the snapshot 22 * Persistent snapshots, by persistent we mean that the snapshot
21 * will survive a reboot. 23 * will survive a reboot.
@@ -91,7 +93,6 @@ struct pstore {
91 struct dm_snapshot *snap; /* up pointer to my snapshot */ 93 struct dm_snapshot *snap; /* up pointer to my snapshot */
92 int version; 94 int version;
93 int valid; 95 int valid;
94 uint32_t chunk_size;
95 uint32_t exceptions_per_area; 96 uint32_t exceptions_per_area;
96 97
97 /* 98 /*
@@ -133,7 +134,7 @@ static int alloc_area(struct pstore *ps)
133 int r = -ENOMEM; 134 int r = -ENOMEM;
134 size_t len; 135 size_t len;
135 136
136 len = ps->chunk_size << SECTOR_SHIFT; 137 len = ps->snap->chunk_size << SECTOR_SHIFT;
137 138
138 /* 139 /*
139 * Allocate the chunk_size block of memory that will hold 140 * Allocate the chunk_size block of memory that will hold
@@ -160,8 +161,8 @@ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
160 unsigned long bits; 161 unsigned long bits;
161 162
162 where.bdev = ps->snap->cow->bdev; 163 where.bdev = ps->snap->cow->bdev;
163 where.sector = ps->chunk_size * chunk; 164 where.sector = ps->snap->chunk_size * chunk;
164 where.count = ps->chunk_size; 165 where.count = ps->snap->chunk_size;
165 166
166 return dm_io_sync_vm(1, &where, rw, ps->area, &bits); 167 return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
167} 168}
@@ -188,7 +189,7 @@ static int area_io(struct pstore *ps, uint32_t area, int rw)
188 189
189static int zero_area(struct pstore *ps, uint32_t area) 190static int zero_area(struct pstore *ps, uint32_t area)
190{ 191{
191 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 192 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
192 return area_io(ps, area, WRITE); 193 return area_io(ps, area, WRITE);
193} 194}
194 195
@@ -196,6 +197,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
196{ 197{
197 int r; 198 int r;
198 struct disk_header *dh; 199 struct disk_header *dh;
200 chunk_t chunk_size;
199 201
200 r = chunk_io(ps, 0, READ); 202 r = chunk_io(ps, 0, READ);
201 if (r) 203 if (r)
@@ -210,8 +212,29 @@ static int read_header(struct pstore *ps, int *new_snapshot)
210 *new_snapshot = 0; 212 *new_snapshot = 0;
211 ps->valid = le32_to_cpu(dh->valid); 213 ps->valid = le32_to_cpu(dh->valid);
212 ps->version = le32_to_cpu(dh->version); 214 ps->version = le32_to_cpu(dh->version);
213 ps->chunk_size = le32_to_cpu(dh->chunk_size); 215 chunk_size = le32_to_cpu(dh->chunk_size);
214 216 if (ps->snap->chunk_size != chunk_size) {
217 DMWARN("chunk size %llu in device metadata overrides "
218 "table chunk size of %llu.",
219 (unsigned long long)chunk_size,
220 (unsigned long long)ps->snap->chunk_size);
221
222 /* We had a bogus chunk_size. Fix stuff up. */
223 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
224 free_area(ps);
225
226 ps->snap->chunk_size = chunk_size;
227 ps->snap->chunk_mask = chunk_size - 1;
228 ps->snap->chunk_shift = ffs(chunk_size) - 1;
229
230 r = alloc_area(ps);
231 if (r)
232 return r;
233
234 r = dm_io_get(sectors_to_pages(chunk_size));
235 if (r)
236 return r;
237 }
215 } else { 238 } else {
216 DMWARN("Invalid/corrupt snapshot"); 239 DMWARN("Invalid/corrupt snapshot");
217 r = -ENXIO; 240 r = -ENXIO;
@@ -224,13 +247,13 @@ static int write_header(struct pstore *ps)
224{ 247{
225 struct disk_header *dh; 248 struct disk_header *dh;
226 249
227 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 250 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
228 251
229 dh = (struct disk_header *) ps->area; 252 dh = (struct disk_header *) ps->area;
230 dh->magic = cpu_to_le32(SNAP_MAGIC); 253 dh->magic = cpu_to_le32(SNAP_MAGIC);
231 dh->valid = cpu_to_le32(ps->valid); 254 dh->valid = cpu_to_le32(ps->valid);
232 dh->version = cpu_to_le32(ps->version); 255 dh->version = cpu_to_le32(ps->version);
233 dh->chunk_size = cpu_to_le32(ps->chunk_size); 256 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
234 257
235 return chunk_io(ps, 0, WRITE); 258 return chunk_io(ps, 0, WRITE);
236} 259}
@@ -365,7 +388,7 @@ static void persistent_destroy(struct exception_store *store)
365{ 388{
366 struct pstore *ps = get_info(store); 389 struct pstore *ps = get_info(store);
367 390
368 dm_io_put(sectors_to_pages(ps->chunk_size)); 391 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
369 vfree(ps->callbacks); 392 vfree(ps->callbacks);
370 free_area(ps); 393 free_area(ps);
371 kfree(ps); 394 kfree(ps);
@@ -384,6 +407,16 @@ static int persistent_read_metadata(struct exception_store *store)
384 return r; 407 return r;
385 408
386 /* 409 /*
410 * Now we know the correct chunk_size, complete the initialisation.
411 */
412 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
413 sizeof(struct disk_exception);
414 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
415 sizeof(*ps->callbacks));
416 if (!ps->callbacks)
417 return -ENOMEM;
418
419 /*
387 * Do we need to setup a new snapshot ? 420 * Do we need to setup a new snapshot ?
388 */ 421 */
389 if (new_snapshot) { 422 if (new_snapshot) {
@@ -533,9 +566,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
533 ps->snap = store->snap; 566 ps->snap = store->snap;
534 ps->valid = 1; 567 ps->valid = 1;
535 ps->version = SNAPSHOT_DISK_VERSION; 568 ps->version = SNAPSHOT_DISK_VERSION;
536 ps->chunk_size = chunk_size;
537 ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
538 sizeof(struct disk_exception);
539 ps->next_free = 2; /* skipping the header and first area */ 569 ps->next_free = 2; /* skipping the header and first area */
540 ps->current_committed = 0; 570 ps->current_committed = 0;
541 571
@@ -543,18 +573,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
543 if (r) 573 if (r)
544 goto bad; 574 goto bad;
545 575
546 /*
547 * Allocate space for all the callbacks.
548 */
549 ps->callback_count = 0; 576 ps->callback_count = 0;
550 atomic_set(&ps->pending_count, 0); 577 atomic_set(&ps->pending_count, 0);
551 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 578 ps->callbacks = NULL;
552 sizeof(*ps->callbacks));
553
554 if (!ps->callbacks) {
555 r = -ENOMEM;
556 goto bad;
557 }
558 579
559 store->destroy = persistent_destroy; 580 store->destroy = persistent_destroy;
560 store->read_metadata = persistent_read_metadata; 581 store->read_metadata = persistent_read_metadata;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8edd6435414d..3edb3477f987 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -19,6 +19,7 @@
19 19
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21 21
22#define DM_MSG_PREFIX "ioctl"
22#define DM_DRIVER_EMAIL "dm-devel@redhat.com" 23#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
23 24
24/*----------------------------------------------------------------- 25/*-----------------------------------------------------------------
@@ -48,7 +49,7 @@ struct vers_iter {
48static struct list_head _name_buckets[NUM_BUCKETS]; 49static struct list_head _name_buckets[NUM_BUCKETS];
49static struct list_head _uuid_buckets[NUM_BUCKETS]; 50static struct list_head _uuid_buckets[NUM_BUCKETS];
50 51
51static void dm_hash_remove_all(void); 52static void dm_hash_remove_all(int keep_open_devices);
52 53
53/* 54/*
54 * Guards access to both hash tables. 55 * Guards access to both hash tables.
@@ -73,7 +74,7 @@ static int dm_hash_init(void)
73 74
74static void dm_hash_exit(void) 75static void dm_hash_exit(void)
75{ 76{
76 dm_hash_remove_all(); 77 dm_hash_remove_all(0);
77 devfs_remove(DM_DIR); 78 devfs_remove(DM_DIR);
78} 79}
79 80
@@ -102,8 +103,10 @@ static struct hash_cell *__get_name_cell(const char *str)
102 unsigned int h = hash_str(str); 103 unsigned int h = hash_str(str);
103 104
104 list_for_each_entry (hc, _name_buckets + h, name_list) 105 list_for_each_entry (hc, _name_buckets + h, name_list)
105 if (!strcmp(hc->name, str)) 106 if (!strcmp(hc->name, str)) {
107 dm_get(hc->md);
106 return hc; 108 return hc;
109 }
107 110
108 return NULL; 111 return NULL;
109} 112}
@@ -114,8 +117,10 @@ static struct hash_cell *__get_uuid_cell(const char *str)
114 unsigned int h = hash_str(str); 117 unsigned int h = hash_str(str);
115 118
116 list_for_each_entry (hc, _uuid_buckets + h, uuid_list) 119 list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
117 if (!strcmp(hc->uuid, str)) 120 if (!strcmp(hc->uuid, str)) {
121 dm_get(hc->md);
118 return hc; 122 return hc;
123 }
119 124
120 return NULL; 125 return NULL;
121} 126}
@@ -191,7 +196,7 @@ static int unregister_with_devfs(struct hash_cell *hc)
191 */ 196 */
192static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) 197static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
193{ 198{
194 struct hash_cell *cell; 199 struct hash_cell *cell, *hc;
195 200
196 /* 201 /*
197 * Allocate the new cells. 202 * Allocate the new cells.
@@ -204,14 +209,19 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
204 * Insert the cell into both hash tables. 209 * Insert the cell into both hash tables.
205 */ 210 */
206 down_write(&_hash_lock); 211 down_write(&_hash_lock);
207 if (__get_name_cell(name)) 212 hc = __get_name_cell(name);
213 if (hc) {
214 dm_put(hc->md);
208 goto bad; 215 goto bad;
216 }
209 217
210 list_add(&cell->name_list, _name_buckets + hash_str(name)); 218 list_add(&cell->name_list, _name_buckets + hash_str(name));
211 219
212 if (uuid) { 220 if (uuid) {
213 if (__get_uuid_cell(uuid)) { 221 hc = __get_uuid_cell(uuid);
222 if (hc) {
214 list_del(&cell->name_list); 223 list_del(&cell->name_list);
224 dm_put(hc->md);
215 goto bad; 225 goto bad;
216 } 226 }
217 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 227 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
@@ -251,19 +261,41 @@ static void __hash_remove(struct hash_cell *hc)
251 free_cell(hc); 261 free_cell(hc);
252} 262}
253 263
254static void dm_hash_remove_all(void) 264static void dm_hash_remove_all(int keep_open_devices)
255{ 265{
256 int i; 266 int i, dev_skipped, dev_removed;
257 struct hash_cell *hc; 267 struct hash_cell *hc;
258 struct list_head *tmp, *n; 268 struct list_head *tmp, *n;
259 269
260 down_write(&_hash_lock); 270 down_write(&_hash_lock);
271
272retry:
273 dev_skipped = dev_removed = 0;
261 for (i = 0; i < NUM_BUCKETS; i++) { 274 for (i = 0; i < NUM_BUCKETS; i++) {
262 list_for_each_safe (tmp, n, _name_buckets + i) { 275 list_for_each_safe (tmp, n, _name_buckets + i) {
263 hc = list_entry(tmp, struct hash_cell, name_list); 276 hc = list_entry(tmp, struct hash_cell, name_list);
277
278 if (keep_open_devices &&
279 dm_lock_for_deletion(hc->md)) {
280 dev_skipped++;
281 continue;
282 }
264 __hash_remove(hc); 283 __hash_remove(hc);
284 dev_removed = 1;
265 } 285 }
266 } 286 }
287
288 /*
289 * Some mapped devices may be using other mapped devices, so if any
290 * still exist, repeat until we make no further progress.
291 */
292 if (dev_skipped) {
293 if (dev_removed)
294 goto retry;
295
296 DMWARN("remove_all left %d open device(s)", dev_skipped);
297 }
298
267 up_write(&_hash_lock); 299 up_write(&_hash_lock);
268} 300}
269 301
@@ -289,6 +321,7 @@ static int dm_hash_rename(const char *old, const char *new)
289 if (hc) { 321 if (hc) {
290 DMWARN("asked to rename to an already existing name %s -> %s", 322 DMWARN("asked to rename to an already existing name %s -> %s",
291 old, new); 323 old, new);
324 dm_put(hc->md);
292 up_write(&_hash_lock); 325 up_write(&_hash_lock);
293 kfree(new_name); 326 kfree(new_name);
294 return -EBUSY; 327 return -EBUSY;
@@ -328,6 +361,7 @@ static int dm_hash_rename(const char *old, const char *new)
328 dm_table_put(table); 361 dm_table_put(table);
329 } 362 }
330 363
364 dm_put(hc->md);
331 up_write(&_hash_lock); 365 up_write(&_hash_lock);
332 kfree(old_name); 366 kfree(old_name);
333 return 0; 367 return 0;
@@ -344,7 +378,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
344 378
345static int remove_all(struct dm_ioctl *param, size_t param_size) 379static int remove_all(struct dm_ioctl *param, size_t param_size)
346{ 380{
347 dm_hash_remove_all(); 381 dm_hash_remove_all(1);
348 param->data_size = 0; 382 param->data_size = 0;
349 return 0; 383 return 0;
350} 384}
@@ -524,7 +558,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
524{ 558{
525 struct gendisk *disk = dm_disk(md); 559 struct gendisk *disk = dm_disk(md);
526 struct dm_table *table; 560 struct dm_table *table;
527 struct block_device *bdev;
528 561
529 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 562 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
530 DM_ACTIVE_PRESENT_FLAG); 563 DM_ACTIVE_PRESENT_FLAG);
@@ -534,20 +567,12 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
534 567
535 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); 568 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
536 569
537 if (!(param->flags & DM_SKIP_BDGET_FLAG)) { 570 /*
538 bdev = bdget_disk(disk, 0); 571 * Yes, this will be out of date by the time it gets back
539 if (!bdev) 572 * to userland, but it is still very useful for
540 return -ENXIO; 573 * debugging.
541 574 */
542 /* 575 param->open_count = dm_open_count(md);
543 * Yes, this will be out of date by the time it gets back
544 * to userland, but it is still very useful for
545 * debugging.
546 */
547 param->open_count = bdev->bd_openers;
548 bdput(bdev);
549 } else
550 param->open_count = -1;
551 576
552 if (disk->policy) 577 if (disk->policy)
553 param->flags |= DM_READONLY_FLAG; 578 param->flags |= DM_READONLY_FLAG;
@@ -567,7 +592,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
567 592
568static int dev_create(struct dm_ioctl *param, size_t param_size) 593static int dev_create(struct dm_ioctl *param, size_t param_size)
569{ 594{
570 int r; 595 int r, m = DM_ANY_MINOR;
571 struct mapped_device *md; 596 struct mapped_device *md;
572 597
573 r = check_name(param->name); 598 r = check_name(param->name);
@@ -575,10 +600,9 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
575 return r; 600 return r;
576 601
577 if (param->flags & DM_PERSISTENT_DEV_FLAG) 602 if (param->flags & DM_PERSISTENT_DEV_FLAG)
578 r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); 603 m = MINOR(huge_decode_dev(param->dev));
579 else
580 r = dm_create(&md);
581 604
605 r = dm_create(m, &md);
582 if (r) 606 if (r)
583 return r; 607 return r;
584 608
@@ -611,10 +635,8 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
611 return __get_name_cell(param->name); 635 return __get_name_cell(param->name);
612 636
613 md = dm_get_md(huge_decode_dev(param->dev)); 637 md = dm_get_md(huge_decode_dev(param->dev));
614 if (md) { 638 if (md)
615 mdptr = dm_get_mdptr(md); 639 mdptr = dm_get_mdptr(md);
616 dm_put(md);
617 }
618 640
619 return mdptr; 641 return mdptr;
620} 642}
@@ -628,7 +650,6 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
628 hc = __find_device_hash_cell(param); 650 hc = __find_device_hash_cell(param);
629 if (hc) { 651 if (hc) {
630 md = hc->md; 652 md = hc->md;
631 dm_get(md);
632 653
633 /* 654 /*
634 * Sneakily write in both the name and the uuid 655 * Sneakily write in both the name and the uuid
@@ -653,6 +674,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
653static int dev_remove(struct dm_ioctl *param, size_t param_size) 674static int dev_remove(struct dm_ioctl *param, size_t param_size)
654{ 675{
655 struct hash_cell *hc; 676 struct hash_cell *hc;
677 struct mapped_device *md;
678 int r;
656 679
657 down_write(&_hash_lock); 680 down_write(&_hash_lock);
658 hc = __find_device_hash_cell(param); 681 hc = __find_device_hash_cell(param);
@@ -663,8 +686,22 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
663 return -ENXIO; 686 return -ENXIO;
664 } 687 }
665 688
689 md = hc->md;
690
691 /*
692 * Ensure the device is not open and nothing further can open it.
693 */
694 r = dm_lock_for_deletion(md);
695 if (r) {
696 DMWARN("unable to remove open device %s", hc->name);
697 up_write(&_hash_lock);
698 dm_put(md);
699 return r;
700 }
701
666 __hash_remove(hc); 702 __hash_remove(hc);
667 up_write(&_hash_lock); 703 up_write(&_hash_lock);
704 dm_put(md);
668 param->data_size = 0; 705 param->data_size = 0;
669 return 0; 706 return 0;
670} 707}
@@ -790,7 +827,6 @@ static int do_resume(struct dm_ioctl *param)
790 } 827 }
791 828
792 md = hc->md; 829 md = hc->md;
793 dm_get(md);
794 830
795 new_map = hc->new_map; 831 new_map = hc->new_map;
796 hc->new_map = NULL; 832 hc->new_map = NULL;
@@ -1078,6 +1114,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1078{ 1114{
1079 int r; 1115 int r;
1080 struct hash_cell *hc; 1116 struct hash_cell *hc;
1117 struct mapped_device *md;
1081 1118
1082 down_write(&_hash_lock); 1119 down_write(&_hash_lock);
1083 1120
@@ -1096,7 +1133,9 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1096 param->flags &= ~DM_INACTIVE_PRESENT_FLAG; 1133 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
1097 1134
1098 r = __dev_status(hc->md, param); 1135 r = __dev_status(hc->md, param);
1136 md = hc->md;
1099 up_write(&_hash_lock); 1137 up_write(&_hash_lock);
1138 dm_put(md);
1100 return r; 1139 return r;
1101} 1140}
1102 1141
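dm_hash_remove_all() now sweeps the hash buckets repeatedly: a device skipped because something holds it open may become removable once its user is gone, so the loop retries for as long as at least one device went away. A toy model of that retry-until-no-progress idiom, as a hypothetical standalone program rather than kernel code:

#include <stdio.h>

#define NDEV 4

int main(void)
{
	/* deps[i] == j means device i holds device j open (-1: none). */
	int deps[NDEV] = { -1, 0, 1, 2 };
	int present[NDEV] = { 1, 1, 1, 1 };
	int dev_removed, dev_skipped;

retry:
	dev_removed = dev_skipped = 0;
	for (int i = 0; i < NDEV; i++) {
		int busy = 0;

		if (!present[i])
			continue;
		/* "open" if some other remaining device depends on it */
		for (int j = 0; j < NDEV; j++)
			if (present[j] && deps[j] == i)
				busy = 1;
		if (busy) {
			dev_skipped++;
			continue;
		}
		present[i] = 0;
		dev_removed = 1;
		printf("removed device %d\n", i);
	}
	/* Removing one device may have released another: retry while
	 * we are still making progress. */
	if (dev_skipped && dev_removed)
		goto retry;
	if (dev_skipped)
		printf("remove_all left %d open device(s)\n", dev_skipped);
	return 0;
}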
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index daf586c0898d..47b3c62bbdb8 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "linear"
16
15/* 17/*
16 * Linear: maps a linear range of a device. 18 * Linear: maps a linear range of a device.
17 */ 19 */
@@ -29,7 +31,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29 unsigned long long tmp; 31 unsigned long long tmp;
30 32
31 if (argc != 2) { 33 if (argc != 2) {
32 ti->error = "dm-linear: Invalid argument count"; 34 ti->error = "Invalid argument count";
33 return -EINVAL; 35 return -EINVAL;
34 } 36 }
35 37
@@ -111,7 +113,7 @@ int __init dm_linear_init(void)
111 int r = dm_register_target(&linear_target); 113 int r = dm_register_target(&linear_target);
112 114
113 if (r < 0) 115 if (r < 0)
114 DMERR("linear: register failed %d", r); 116 DMERR("register failed %d", r);
115 117
116 return r; 118 return r;
117} 119}
@@ -121,5 +123,5 @@ void dm_linear_exit(void)
121 int r = dm_unregister_target(&linear_target); 123 int r = dm_unregister_target(&linear_target);
122 124
123 if (r < 0) 125 if (r < 0)
124 DMERR("linear: unregister failed %d", r); 126 DMERR("unregister failed %d", r);
125} 127}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index d73779a42417..64b764bd02cc 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -12,6 +12,8 @@
12#include "dm-log.h" 12#include "dm-log.h"
13#include "dm-io.h" 13#include "dm-io.h"
14 14
15#define DM_MSG_PREFIX "mirror log"
16
15static LIST_HEAD(_log_types); 17static LIST_HEAD(_log_types);
16static DEFINE_SPINLOCK(_lock); 18static DEFINE_SPINLOCK(_lock);
17 19
@@ -155,8 +157,6 @@ struct log_c {
155 157
156 struct io_region header_location; 158 struct io_region header_location;
157 struct log_header *disk_header; 159 struct log_header *disk_header;
158
159 struct io_region bits_location;
160}; 160};
161 161
162/* 162/*
@@ -241,43 +241,21 @@ static inline int write_header(struct log_c *log)
241} 241}
242 242
243/*---------------------------------------------------------------- 243/*----------------------------------------------------------------
244 * Bits IO
245 *--------------------------------------------------------------*/
246static int read_bits(struct log_c *log)
247{
248 int r;
249 unsigned long ebits;
250
251 r = dm_io_sync_vm(1, &log->bits_location, READ,
252 log->clean_bits, &ebits);
253 if (r)
254 return r;
255
256 return 0;
257}
258
259static int write_bits(struct log_c *log)
260{
261 unsigned long ebits;
262 return dm_io_sync_vm(1, &log->bits_location, WRITE,
263 log->clean_bits, &ebits);
264}
265
266/*----------------------------------------------------------------
267 * core log constructor/destructor 244 * core log constructor/destructor
268 * 245 *
269 * argv contains region_size followed optionally by [no]sync 246 * argv contains region_size followed optionally by [no]sync
270 *--------------------------------------------------------------*/ 247 *--------------------------------------------------------------*/
271#define BYTE_SHIFT 3 248#define BYTE_SHIFT 3
272static int core_ctr(struct dirty_log *log, struct dm_target *ti, 249static int create_log_context(struct dirty_log *log, struct dm_target *ti,
273 unsigned int argc, char **argv) 250 unsigned int argc, char **argv,
251 struct dm_dev *dev)
274{ 252{
275 enum sync sync = DEFAULTSYNC; 253 enum sync sync = DEFAULTSYNC;
276 254
277 struct log_c *lc; 255 struct log_c *lc;
278 uint32_t region_size; 256 uint32_t region_size;
279 unsigned int region_count; 257 unsigned int region_count;
280 size_t bitset_size; 258 size_t bitset_size, buf_size;
281 259
282 if (argc < 1 || argc > 2) { 260 if (argc < 1 || argc > 2) {
283 DMWARN("wrong number of arguments to mirror log"); 261 DMWARN("wrong number of arguments to mirror log");
@@ -319,22 +297,53 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
319 * Work out how many "unsigned long"s we need to hold the bitset. 297 * Work out how many "unsigned long"s we need to hold the bitset.
320 */ 298 */
321 bitset_size = dm_round_up(region_count, 299 bitset_size = dm_round_up(region_count,
322 sizeof(unsigned long) << BYTE_SHIFT); 300 sizeof(*lc->clean_bits) << BYTE_SHIFT);
323 bitset_size >>= BYTE_SHIFT; 301 bitset_size >>= BYTE_SHIFT;
324 302
325 lc->bitset_uint32_count = bitset_size / 4; 303 lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
326 lc->clean_bits = vmalloc(bitset_size); 304
327 if (!lc->clean_bits) { 305 /*
328 DMWARN("couldn't allocate clean bitset"); 306 * Disk log?
329 kfree(lc); 307 */
330 return -ENOMEM; 308 if (!dev) {
309 lc->clean_bits = vmalloc(bitset_size);
310 if (!lc->clean_bits) {
311 DMWARN("couldn't allocate clean bitset");
312 kfree(lc);
313 return -ENOMEM;
314 }
315 lc->disk_header = NULL;
316 } else {
317 lc->log_dev = dev;
318 lc->header_location.bdev = lc->log_dev->bdev;
319 lc->header_location.sector = 0;
320
321 /*
322 * Buffer holds both header and bitset.
323 */
324 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
325 bitset_size, ti->limits.hardsect_size);
326 lc->header_location.count = buf_size >> SECTOR_SHIFT;
327
328 lc->disk_header = vmalloc(buf_size);
329 if (!lc->disk_header) {
330 DMWARN("couldn't allocate disk log buffer");
331 kfree(lc);
332 return -ENOMEM;
333 }
334
335 lc->clean_bits = (void *)lc->disk_header +
336 (LOG_OFFSET << SECTOR_SHIFT);
331 } 337 }
338
332 memset(lc->clean_bits, -1, bitset_size); 339 memset(lc->clean_bits, -1, bitset_size);
333 340
334 lc->sync_bits = vmalloc(bitset_size); 341 lc->sync_bits = vmalloc(bitset_size);
335 if (!lc->sync_bits) { 342 if (!lc->sync_bits) {
336 DMWARN("couldn't allocate sync bitset"); 343 DMWARN("couldn't allocate sync bitset");
337 vfree(lc->clean_bits); 344 if (!dev)
345 vfree(lc->clean_bits);
346 vfree(lc->disk_header);
338 kfree(lc); 347 kfree(lc);
339 return -ENOMEM; 348 return -ENOMEM;
340 } 349 }
@@ -345,25 +354,40 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
345 if (!lc->recovering_bits) { 354 if (!lc->recovering_bits) {
346 DMWARN("couldn't allocate sync bitset"); 355 DMWARN("couldn't allocate sync bitset");
347 vfree(lc->sync_bits); 356 vfree(lc->sync_bits);
348 vfree(lc->clean_bits); 357 if (!dev)
358 vfree(lc->clean_bits);
359 vfree(lc->disk_header);
349 kfree(lc); 360 kfree(lc);
350 return -ENOMEM; 361 return -ENOMEM;
351 } 362 }
352 memset(lc->recovering_bits, 0, bitset_size); 363 memset(lc->recovering_bits, 0, bitset_size);
353 lc->sync_search = 0; 364 lc->sync_search = 0;
354 log->context = lc; 365 log->context = lc;
366
355 return 0; 367 return 0;
356} 368}
357 369
358static void core_dtr(struct dirty_log *log) 370static int core_ctr(struct dirty_log *log, struct dm_target *ti,
371 unsigned int argc, char **argv)
372{
373 return create_log_context(log, ti, argc, argv, NULL);
374}
375
376static void destroy_log_context(struct log_c *lc)
359{ 377{
360 struct log_c *lc = (struct log_c *) log->context;
361 vfree(lc->clean_bits);
362 vfree(lc->sync_bits); 378 vfree(lc->sync_bits);
363 vfree(lc->recovering_bits); 379 vfree(lc->recovering_bits);
364 kfree(lc); 380 kfree(lc);
365} 381}
366 382
383static void core_dtr(struct dirty_log *log)
384{
385 struct log_c *lc = (struct log_c *) log->context;
386
387 vfree(lc->clean_bits);
388 destroy_log_context(lc);
389}
390
367/*---------------------------------------------------------------- 391/*----------------------------------------------------------------
368 * disk log constructor/destructor 392 * disk log constructor/destructor
369 * 393 *
@@ -373,8 +397,6 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
373 unsigned int argc, char **argv) 397 unsigned int argc, char **argv)
374{ 398{
375 int r; 399 int r;
376 size_t size;
377 struct log_c *lc;
378 struct dm_dev *dev; 400 struct dm_dev *dev;
379 401
380 if (argc < 2 || argc > 3) { 402 if (argc < 2 || argc > 3) {
@@ -387,49 +409,22 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
387 if (r) 409 if (r)
388 return r; 410 return r;
389 411
390 r = core_ctr(log, ti, argc - 1, argv + 1); 412 r = create_log_context(log, ti, argc - 1, argv + 1, dev);
391 if (r) { 413 if (r) {
392 dm_put_device(ti, dev); 414 dm_put_device(ti, dev);
393 return r; 415 return r;
394 } 416 }
395 417
396 lc = (struct log_c *) log->context;
397 lc->log_dev = dev;
398
399 /* setup the disk header fields */
400 lc->header_location.bdev = lc->log_dev->bdev;
401 lc->header_location.sector = 0;
402 lc->header_location.count = 1;
403
404 /*
405 * We can't read less than this amount, even though we'll
406 * not be using most of this space.
407 */
408 lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
409 if (!lc->disk_header)
410 goto bad;
411
412 /* setup the disk bitset fields */
413 lc->bits_location.bdev = lc->log_dev->bdev;
414 lc->bits_location.sector = LOG_OFFSET;
415
416 size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
417 1 << SECTOR_SHIFT);
418 lc->bits_location.count = size >> SECTOR_SHIFT;
419 return 0; 418 return 0;
420
421 bad:
422 dm_put_device(ti, lc->log_dev);
423 core_dtr(log);
424 return -ENOMEM;
425} 419}
426 420
427static void disk_dtr(struct dirty_log *log) 421static void disk_dtr(struct dirty_log *log)
428{ 422{
429 struct log_c *lc = (struct log_c *) log->context; 423 struct log_c *lc = (struct log_c *) log->context;
424
430 dm_put_device(lc->ti, lc->log_dev); 425 dm_put_device(lc->ti, lc->log_dev);
431 vfree(lc->disk_header); 426 vfree(lc->disk_header);
432 core_dtr(log); 427 destroy_log_context(lc);
433} 428}
434 429
435static int count_bits32(uint32_t *addr, unsigned size) 430static int count_bits32(uint32_t *addr, unsigned size)
@@ -454,12 +449,7 @@ static int disk_resume(struct dirty_log *log)
454 if (r) 449 if (r)
455 return r; 450 return r;
456 451
457 /* read the bits */ 452 /* set or clear any new bits -- device has grown */
458 r = read_bits(lc);
459 if (r)
460 return r;
461
462 /* set or clear any new bits */
463 if (lc->sync == NOSYNC) 453 if (lc->sync == NOSYNC)
464 for (i = lc->header.nr_regions; i < lc->region_count; i++) 454 for (i = lc->header.nr_regions; i < lc->region_count; i++)
465 /* FIXME: amazingly inefficient */ 455 /* FIXME: amazingly inefficient */
@@ -469,15 +459,14 @@ static int disk_resume(struct dirty_log *log)
469 /* FIXME: amazingly inefficient */ 459 /* FIXME: amazingly inefficient */
470 log_clear_bit(lc, lc->clean_bits, i); 460 log_clear_bit(lc, lc->clean_bits, i);
471 461
462 /* clear any old bits -- device has shrunk */
463 for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
464 log_clear_bit(lc, lc->clean_bits, i);
465
472 /* copy clean across to sync */ 466 /* copy clean across to sync */
473 memcpy(lc->sync_bits, lc->clean_bits, size); 467 memcpy(lc->sync_bits, lc->clean_bits, size);
474 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); 468 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
475 469
476 /* write the bits */
477 r = write_bits(lc);
478 if (r)
479 return r;
480
481 /* set the correct number of regions in the header */ 470 /* set the correct number of regions in the header */
482 lc->header.nr_regions = lc->region_count; 471 lc->header.nr_regions = lc->region_count;
483 472
@@ -518,7 +507,7 @@ static int disk_flush(struct dirty_log *log)
518 if (!lc->touched) 507 if (!lc->touched)
519 return 0; 508 return 0;
520 509
521 r = write_bits(lc); 510 r = write_header(lc);
522 if (!r) 511 if (!r)
523 lc->touched = 0; 512 lc->touched = 0;
524 513
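create_log_context() above sizes the clean bitset by rounding the region count up to a whole number of unsigned longs and then converting bits to bytes. A sketch of that computation, with round_up_to() standing in for the kernel's dm_round_up():

#include <stdio.h>

#define BYTE_SHIFT 3

/* stand-in for the kernel's dm_round_up() */
static unsigned long round_up_to(unsigned long n, unsigned long sz)
{
	return ((n + sz - 1) / sz) * sz;
}

int main(void)
{
	unsigned long region_count = 1000;
	unsigned long bits_per_word = sizeof(unsigned long) << BYTE_SHIFT;

	/* round up to a whole number of unsigned longs, in bits ... */
	unsigned long bitset_size = round_up_to(region_count, bits_per_word);

	/* ... then convert bits to bytes */
	bitset_size >>= BYTE_SHIFT;

	/* 1000 regions -> 1024 bits -> 128 bytes on a 64-bit machine */
	printf("%lu regions -> %lu bytes of bitset\n",
	       region_count, bitset_size);
	return 0;
}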
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1816f30678ed..217615b33223 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -21,6 +21,7 @@
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <asm/atomic.h> 22#include <asm/atomic.h>
23 23
24#define DM_MSG_PREFIX "multipath"
24#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
25 26
26/* Path properties */ 27/* Path properties */
@@ -446,8 +447,6 @@ struct param {
446 char *error; 447 char *error;
447}; 448};
448 449
449#define ESTR(s) ("dm-multipath: " s)
450
451static int read_param(struct param *param, char *str, unsigned *v, char **error) 450static int read_param(struct param *param, char *str, unsigned *v, char **error)
452{ 451{
453 if (!str || 452 if (!str ||
@@ -495,12 +494,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
495 unsigned ps_argc; 494 unsigned ps_argc;
496 495
497 static struct param _params[] = { 496 static struct param _params[] = {
498 {0, 1024, ESTR("invalid number of path selector args")}, 497 {0, 1024, "invalid number of path selector args"},
499 }; 498 };
500 499
501 pst = dm_get_path_selector(shift(as)); 500 pst = dm_get_path_selector(shift(as));
502 if (!pst) { 501 if (!pst) {
503 ti->error = ESTR("unknown path selector type"); 502 ti->error = "unknown path selector type";
504 return -EINVAL; 503 return -EINVAL;
505 } 504 }
506 505
@@ -511,7 +510,7 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
511 r = pst->create(&pg->ps, ps_argc, as->argv); 510 r = pst->create(&pg->ps, ps_argc, as->argv);
512 if (r) { 511 if (r) {
513 dm_put_path_selector(pst); 512 dm_put_path_selector(pst);
514 ti->error = ESTR("path selector constructor failed"); 513 ti->error = "path selector constructor failed";
515 return r; 514 return r;
516 } 515 }
517 516
@@ -529,7 +528,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
529 528
530 /* we need at least a path arg */ 529 /* we need at least a path arg */
531 if (as->argc < 1) { 530 if (as->argc < 1) {
532 ti->error = ESTR("no device given"); 531 ti->error = "no device given";
533 return NULL; 532 return NULL;
534 } 533 }
535 534
@@ -540,7 +539,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
540 r = dm_get_device(ti, shift(as), ti->begin, ti->len, 539 r = dm_get_device(ti, shift(as), ti->begin, ti->len,
541 dm_table_get_mode(ti->table), &p->path.dev); 540 dm_table_get_mode(ti->table), &p->path.dev);
542 if (r) { 541 if (r) {
543 ti->error = ESTR("error getting device"); 542 ti->error = "error getting device";
544 goto bad; 543 goto bad;
545 } 544 }
546 545
@@ -562,8 +561,8 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
562 struct dm_target *ti) 561 struct dm_target *ti)
563{ 562{
564 static struct param _params[] = { 563 static struct param _params[] = {
565 {1, 1024, ESTR("invalid number of paths")}, 564 {1, 1024, "invalid number of paths"},
566 {0, 1024, ESTR("invalid number of selector args")} 565 {0, 1024, "invalid number of selector args"}
567 }; 566 };
568 567
569 int r; 568 int r;
@@ -572,13 +571,13 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
572 571
573 if (as->argc < 2) { 572 if (as->argc < 2) {
574 as->argc = 0; 573 as->argc = 0;
575 ti->error = ESTR("not enough priority group aruments"); 574 ti->error = "not enough priority group aruments";
576 return NULL; 575 return NULL;
577 } 576 }
578 577
579 pg = alloc_priority_group(); 578 pg = alloc_priority_group();
580 if (!pg) { 579 if (!pg) {
581 ti->error = ESTR("couldn't allocate priority group"); 580 ti->error = "couldn't allocate priority group";
582 return NULL; 581 return NULL;
583 } 582 }
584 pg->m = m; 583 pg->m = m;
@@ -633,7 +632,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
633 unsigned hw_argc; 632 unsigned hw_argc;
634 633
635 static struct param _params[] = { 634 static struct param _params[] = {
636 {0, 1024, ESTR("invalid number of hardware handler args")}, 635 {0, 1024, "invalid number of hardware handler args"},
637 }; 636 };
638 637
639 r = read_param(_params, shift(as), &hw_argc, &ti->error); 638 r = read_param(_params, shift(as), &hw_argc, &ti->error);
@@ -645,14 +644,14 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
645 644
646 hwht = dm_get_hw_handler(shift(as)); 645 hwht = dm_get_hw_handler(shift(as));
647 if (!hwht) { 646 if (!hwht) {
648 ti->error = ESTR("unknown hardware handler type"); 647 ti->error = "unknown hardware handler type";
649 return -EINVAL; 648 return -EINVAL;
650 } 649 }
651 650
652 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); 651 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
653 if (r) { 652 if (r) {
654 dm_put_hw_handler(hwht); 653 dm_put_hw_handler(hwht);
655 ti->error = ESTR("hardware handler constructor failed"); 654 ti->error = "hardware handler constructor failed";
656 return r; 655 return r;
657 } 656 }
658 657
@@ -669,7 +668,7 @@ static int parse_features(struct arg_set *as, struct multipath *m,
669 unsigned argc; 668 unsigned argc;
670 669
671 static struct param _params[] = { 670 static struct param _params[] = {
672 {0, 1, ESTR("invalid number of feature args")}, 671 {0, 1, "invalid number of feature args"},
673 }; 672 };
674 673
675 r = read_param(_params, shift(as), &argc, &ti->error); 674 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -692,8 +691,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
692{ 691{
693 /* target parameters */ 692 /* target parameters */
694 static struct param _params[] = { 693 static struct param _params[] = {
695 {1, 1024, ESTR("invalid number of priority groups")}, 694 {1, 1024, "invalid number of priority groups"},
696 {1, 1024, ESTR("invalid initial priority group number")}, 695 {1, 1024, "invalid initial priority group number"},
697 }; 696 };
698 697
699 int r; 698 int r;
@@ -707,7 +706,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
707 706
708 m = alloc_multipath(); 707 m = alloc_multipath();
709 if (!m) { 708 if (!m) {
710 ti->error = ESTR("can't allocate multipath"); 709 ti->error = "can't allocate multipath";
711 return -EINVAL; 710 return -EINVAL;
712 } 711 }
713 712
@@ -746,7 +745,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
746 } 745 }
747 746
748 if (pg_count != m->nr_priority_groups) { 747 if (pg_count != m->nr_priority_groups) {
749 ti->error = ESTR("priority group count mismatch"); 748 ti->error = "priority group count mismatch";
750 r = -EINVAL; 749 r = -EINVAL;
751 goto bad; 750 goto bad;
752 } 751 }
@@ -807,7 +806,7 @@ static int fail_path(struct pgpath *pgpath)
807 if (!pgpath->path.is_active) 806 if (!pgpath->path.is_active)
808 goto out; 807 goto out;
809 808
810 DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); 809 DMWARN("Failing path %s.", pgpath->path.dev->name);
811 810
812 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); 811 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
813 pgpath->path.is_active = 0; 812 pgpath->path.is_active = 0;
@@ -1250,7 +1249,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1250 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1249 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1251 dm_table_get_mode(ti->table), &dev); 1250 dm_table_get_mode(ti->table), &dev);
1252 if (r) { 1251 if (r) {
1253 DMWARN("dm-multipath message: error getting device %s", 1252 DMWARN("message: error getting device %s",
1254 argv[1]); 1253 argv[1]);
1255 return -EINVAL; 1254 return -EINVAL;
1256 } 1255 }
@@ -1309,7 +1308,7 @@ static int __init dm_multipath_init(void)
1309 return -ENOMEM; 1308 return -ENOMEM;
1310 } 1309 }
1311 1310
1312 DMINFO("dm-multipath version %u.%u.%u loaded", 1311 DMINFO("version %u.%u.%u loaded",
1313 multipath_target.version[0], multipath_target.version[1], 1312 multipath_target.version[0], multipath_target.version[1],
1314 multipath_target.version[2]); 1313 multipath_target.version[2]);
1315 1314
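The multipath constructor validates each count argument against a small {min, max, error} table via read_param(), which the hunks above strip of their ESTR() prefixes. A hypothetical standalone sketch of the same validation-table idiom:

#include <stdio.h>
#include <stdlib.h>

struct param {
	unsigned min;
	unsigned max;
	const char *error;
};

static int read_param(const struct param *p, const char *str,
		      unsigned *v, const char **error)
{
	char *end;

	*v = (unsigned)strtoul(str, &end, 10);
	if (*end || *v < p->min || *v > p->max) {
		*error = p->error;
		return -1;
	}
	return 0;
}

int main(void)
{
	static const struct param nr_paths = {1, 1024, "invalid number of paths"};
	const char *error = NULL;
	unsigned v;

	if (read_param(&nr_paths, "0", &v, &error))
		printf("rejected: %s\n", error);	/* min is 1 */
	return 0;
}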
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d12cf3e5e076..be48cedf986b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,8 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22 22
23#define DM_MSG_PREFIX "raid1"
24
23static struct workqueue_struct *_kmirrord_wq; 25static struct workqueue_struct *_kmirrord_wq;
24static struct work_struct _kmirrord_work; 26static struct work_struct _kmirrord_work;
25 27
@@ -106,12 +108,42 @@ struct region {
106 struct bio_list delayed_bios; 108 struct bio_list delayed_bios;
107}; 109};
108 110
111
112/*-----------------------------------------------------------------
113 * Mirror set structures.
114 *---------------------------------------------------------------*/
115struct mirror {
116 atomic_t error_count;
117 struct dm_dev *dev;
118 sector_t offset;
119};
120
121struct mirror_set {
122 struct dm_target *ti;
123 struct list_head list;
124 struct region_hash rh;
125 struct kcopyd_client *kcopyd_client;
126
127 spinlock_t lock; /* protects the next two lists */
128 struct bio_list reads;
129 struct bio_list writes;
130
131 /* recovery */
132 region_t nr_regions;
133 int in_sync;
134
135 struct mirror *default_mirror; /* Default mirror */
136
137 unsigned int nr_mirrors;
138 struct mirror mirror[0];
139};
140
109/* 141/*
110 * Conversion fns 142 * Conversion fns
111 */ 143 */
112static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) 144static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
113{ 145{
114 return bio->bi_sector >> rh->region_shift; 146 return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
115} 147}
116 148
117static inline sector_t region_to_sector(struct region_hash *rh, region_t region) 149static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
@@ -458,11 +490,9 @@ static int __rh_recovery_prepare(struct region_hash *rh)
458 /* Already quiesced ? */ 490 /* Already quiesced ? */
459 if (atomic_read(&reg->pending)) 491 if (atomic_read(&reg->pending))
460 list_del_init(&reg->list); 492 list_del_init(&reg->list);
493 else
494 list_move(&reg->list, &rh->quiesced_regions);
461 495
462 else {
463 list_del_init(&reg->list);
464 list_add(&reg->list, &rh->quiesced_regions);
465 }
466 spin_unlock_irq(&rh->region_lock); 496 spin_unlock_irq(&rh->region_lock);
467 497
468 return 1; 498 return 1;
@@ -541,35 +571,6 @@ static void rh_start_recovery(struct region_hash *rh)
541 wake(); 571 wake();
542} 572}
543 573
544/*-----------------------------------------------------------------
545 * Mirror set structures.
546 *---------------------------------------------------------------*/
547struct mirror {
548 atomic_t error_count;
549 struct dm_dev *dev;
550 sector_t offset;
551};
552
553struct mirror_set {
554 struct dm_target *ti;
555 struct list_head list;
556 struct region_hash rh;
557 struct kcopyd_client *kcopyd_client;
558
559 spinlock_t lock; /* protects the next two lists */
560 struct bio_list reads;
561 struct bio_list writes;
562
563 /* recovery */
564 region_t nr_regions;
565 int in_sync;
566
567 struct mirror *default_mirror; /* Default mirror */
568
569 unsigned int nr_mirrors;
570 struct mirror mirror[0];
571};
572
573/* 574/*
574 * Every mirror should look like this one. 575 * Every mirror should look like this one.
575 */ 576 */
@@ -603,7 +604,7 @@ static void recovery_complete(int read_err, unsigned int write_err,
603 struct region *reg = (struct region *) context; 604 struct region *reg = (struct region *) context;
604 605
605 /* FIXME: better error handling */ 606 /* FIXME: better error handling */
606 rh_recovery_end(reg, read_err || write_err); 607 rh_recovery_end(reg, !(read_err || write_err));
607} 608}
608 609
609static int recover(struct mirror_set *ms, struct region *reg) 610static int recover(struct mirror_set *ms, struct region *reg)
@@ -893,7 +894,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
893 894
894 ms = kmalloc(len, GFP_KERNEL); 895 ms = kmalloc(len, GFP_KERNEL);
895 if (!ms) { 896 if (!ms) {
896 ti->error = "dm-mirror: Cannot allocate mirror context"; 897 ti->error = "Cannot allocate mirror context";
897 return NULL; 898 return NULL;
898 } 899 }
899 900
@@ -907,7 +908,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
907 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; 908 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
908 909
909 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 910 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
910 ti->error = "dm-mirror: Error creating dirty region hash"; 911 ti->error = "Error creating dirty region hash";
911 kfree(ms); 912 kfree(ms);
912 return NULL; 913 return NULL;
913 } 914 }
@@ -937,14 +938,14 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
937 unsigned long long offset; 938 unsigned long long offset;
938 939
939 if (sscanf(argv[1], "%llu", &offset) != 1) { 940 if (sscanf(argv[1], "%llu", &offset) != 1) {
940 ti->error = "dm-mirror: Invalid offset"; 941 ti->error = "Invalid offset";
941 return -EINVAL; 942 return -EINVAL;
942 } 943 }
943 944
944 if (dm_get_device(ti, argv[0], offset, ti->len, 945 if (dm_get_device(ti, argv[0], offset, ti->len,
945 dm_table_get_mode(ti->table), 946 dm_table_get_mode(ti->table),
946 &ms->mirror[mirror].dev)) { 947 &ms->mirror[mirror].dev)) {
947 ti->error = "dm-mirror: Device lookup failure"; 948 ti->error = "Device lookup failure";
948 return -ENXIO; 949 return -ENXIO;
949 } 950 }
950 951
@@ -981,30 +982,30 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
981 struct dirty_log *dl; 982 struct dirty_log *dl;
982 983
983 if (argc < 2) { 984 if (argc < 2) {
984 ti->error = "dm-mirror: Insufficient mirror log arguments"; 985 ti->error = "Insufficient mirror log arguments";
985 return NULL; 986 return NULL;
986 } 987 }
987 988
988 if (sscanf(argv[1], "%u", &param_count) != 1) { 989 if (sscanf(argv[1], "%u", &param_count) != 1) {
989 ti->error = "dm-mirror: Invalid mirror log argument count"; 990 ti->error = "Invalid mirror log argument count";
990 return NULL; 991 return NULL;
991 } 992 }
992 993
993 *args_used = 2 + param_count; 994 *args_used = 2 + param_count;
994 995
995 if (argc < *args_used) { 996 if (argc < *args_used) {
996 ti->error = "dm-mirror: Insufficient mirror log arguments"; 997 ti->error = "Insufficient mirror log arguments";
997 return NULL; 998 return NULL;
998 } 999 }
999 1000
1000 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); 1001 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
1001 if (!dl) { 1002 if (!dl) {
1002 ti->error = "dm-mirror: Error creating mirror dirty log"; 1003 ti->error = "Error creating mirror dirty log";
1003 return NULL; 1004 return NULL;
1004 } 1005 }
1005 1006
1006 if (!_check_region_size(ti, dl->type->get_region_size(dl))) { 1007 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
1007 ti->error = "dm-mirror: Invalid region size"; 1008 ti->error = "Invalid region size";
1008 dm_destroy_dirty_log(dl); 1009 dm_destroy_dirty_log(dl);
1009 return NULL; 1010 return NULL;
1010 } 1011 }
@@ -1038,7 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1038 1039
1039 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1040 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1040 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { 1041 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
1041 ti->error = "dm-mirror: Invalid number of mirrors"; 1042 ti->error = "Invalid number of mirrors";
1042 dm_destroy_dirty_log(dl); 1043 dm_destroy_dirty_log(dl);
1043 return -EINVAL; 1044 return -EINVAL;
1044 } 1045 }
@@ -1046,7 +1047,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1046 argv++, argc--; 1047 argv++, argc--;
1047 1048
1048 if (argc != nr_mirrors * 2) { 1049 if (argc != nr_mirrors * 2) {
1049 ti->error = "dm-mirror: Wrong number of mirror arguments"; 1050 ti->error = "Wrong number of mirror arguments";
1050 dm_destroy_dirty_log(dl); 1051 dm_destroy_dirty_log(dl);
1051 return -EINVAL; 1052 return -EINVAL;
1052 } 1053 }
@@ -1115,7 +1116,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1115 struct mirror *m; 1116 struct mirror *m;
1116 struct mirror_set *ms = ti->private; 1117 struct mirror_set *ms = ti->private;
1117 1118
1118 map_context->ll = bio->bi_sector >> ms->rh.region_shift; 1119 map_context->ll = bio_to_region(&ms->rh, bio);
1119 1120
1120 if (rw == WRITE) { 1121 if (rw == WRITE) {
1121 queue_bio(ms, bio, rw); 1122 queue_bio(ms, bio, rw);
@@ -1221,7 +1222,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1221 1222
1222static struct target_type mirror_target = { 1223static struct target_type mirror_target = {
1223 .name = "mirror", 1224 .name = "mirror",
1224 .version = {1, 0, 1}, 1225 .version = {1, 0, 2},
1225 .module = THIS_MODULE, 1226 .module = THIS_MODULE,
1226 .ctr = mirror_ctr, 1227 .ctr = mirror_ctr,
1227 .dtr = mirror_dtr, 1228 .dtr = mirror_dtr,
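The behavioural fix in this file is in bio_to_region(): region numbers must be computed from the offset into the target, not the absolute device sector, so ti->begin is subtracted first (mirror_map now goes through the same helper). A toy illustration with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ti_begin = 2048;		/* target starts at this device sector */
	unsigned region_shift = 10;		/* 1024-sector regions */
	uint64_t bi_sector = ti_begin + 5;	/* 5 sectors into the target */

	/* old: absolute sector -> wrongly lands in region 2 */
	printf("old: region %llu\n",
	       (unsigned long long)(bi_sector >> region_shift));

	/* new: target-relative sector -> correctly region 0 */
	printf("new: region %llu\n",
	       (unsigned long long)((bi_sector - ti_begin) >> region_shift));
	return 0;
}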
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index d0024865a789..c5a16c550122 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,6 +14,8 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#define DM_MSG_PREFIX "multipath round-robin"
18
17/*----------------------------------------------------------------- 19/*-----------------------------------------------------------------
18 * Path-handling code, paths are held in lists 20 * Path-handling code, paths are held in lists
19 *---------------------------------------------------------------*/ 21 *---------------------------------------------------------------*/
@@ -191,9 +193,9 @@ static int __init dm_rr_init(void)
191 int r = dm_register_path_selector(&rr_ps); 193 int r = dm_register_path_selector(&rr_ps);
192 194
193 if (r < 0) 195 if (r < 0)
194 DMERR("round-robin: register failed %d", r); 196 DMERR("register failed %d", r);
195 197
196 DMINFO("dm-round-robin version 1.0.0 loaded"); 198 DMINFO("version 1.0.0 loaded");
197 199
198 return r; 200 return r;
199} 201}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 08312b46463a..8eea0ddbf5ec 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -23,6 +23,8 @@
23#include "dm-bio-list.h" 23#include "dm-bio-list.h"
24#include "kcopyd.h" 24#include "kcopyd.h"
25 25
26#define DM_MSG_PREFIX "snapshots"
27
26/* 28/*
27 * The percentage increment we will wake up users at 29 * The percentage increment we will wake up users at
28 */ 30 */
@@ -117,7 +119,7 @@ static int init_origin_hash(void)
117 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), 119 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
118 GFP_KERNEL); 120 GFP_KERNEL);
119 if (!_origins) { 121 if (!_origins) {
120 DMERR("Device mapper: Snapshot: unable to allocate memory"); 122 DMERR("unable to allocate memory");
121 return -ENOMEM; 123 return -ENOMEM;
122 } 124 }
123 125
@@ -412,7 +414,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
412 int blocksize; 414 int blocksize;
413 415
414 if (argc < 4) { 416 if (argc < 4) {
415 ti->error = "dm-snapshot: requires exactly 4 arguments"; 417 ti->error = "requires exactly 4 arguments";
416 r = -EINVAL; 418 r = -EINVAL;
417 goto bad1; 419 goto bad1;
418 } 420 }
@@ -530,7 +532,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
530 } 532 }
531 533
532 ti->private = s; 534 ti->private = s;
533 ti->split_io = chunk_size; 535 ti->split_io = s->chunk_size;
534 536
535 return 0; 537 return 0;
536 538
@@ -1127,7 +1129,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1127 struct dm_dev *dev; 1129 struct dm_dev *dev;
1128 1130
1129 if (argc != 1) { 1131 if (argc != 1) {
1130 ti->error = "dm-origin: incorrect number of arguments"; 1132 ti->error = "origin: incorrect number of arguments";
1131 return -EINVAL; 1133 return -EINVAL;
1132 } 1134 }
1133 1135
@@ -1204,7 +1206,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1204 1206
1205static struct target_type origin_target = { 1207static struct target_type origin_target = {
1206 .name = "snapshot-origin", 1208 .name = "snapshot-origin",
1207 .version = {1, 1, 0}, 1209 .version = {1, 4, 0},
1208 .module = THIS_MODULE, 1210 .module = THIS_MODULE,
1209 .ctr = origin_ctr, 1211 .ctr = origin_ctr,
1210 .dtr = origin_dtr, 1212 .dtr = origin_dtr,
@@ -1215,7 +1217,7 @@ static struct target_type origin_target = {
1215 1217
1216static struct target_type snapshot_target = { 1218static struct target_type snapshot_target = {
1217 .name = "snapshot", 1219 .name = "snapshot",
1218 .version = {1, 1, 0}, 1220 .version = {1, 4, 0},
1219 .module = THIS_MODULE, 1221 .module = THIS_MODULE,
1220 .ctr = snapshot_ctr, 1222 .ctr = snapshot_ctr,
1221 .dtr = snapshot_dtr, 1223 .dtr = snapshot_dtr,
@@ -1236,7 +1238,7 @@ static int __init dm_snapshot_init(void)
1236 1238
1237 r = dm_register_target(&origin_target); 1239 r = dm_register_target(&origin_target);
1238 if (r < 0) { 1240 if (r < 0) {
1239 DMERR("Device mapper: Origin: register failed %d\n", r); 1241 DMERR("Origin target register failed %d", r);
1240 goto bad1; 1242 goto bad1;
1241 } 1243 }
1242 1244
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 08328a8f5a3c..6c29fcecd892 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "striped"
16
15struct stripe { 17struct stripe {
16 struct dm_dev *dev; 18 struct dm_dev *dev;
17 sector_t physical_start; 19 sector_t physical_start;
@@ -78,19 +80,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
78 unsigned int i; 80 unsigned int i;
79 81
80 if (argc < 2) { 82 if (argc < 2) {
81 ti->error = "dm-stripe: Not enough arguments"; 83 ti->error = "Not enough arguments";
82 return -EINVAL; 84 return -EINVAL;
83 } 85 }
84 86
85 stripes = simple_strtoul(argv[0], &end, 10); 87 stripes = simple_strtoul(argv[0], &end, 10);
86 if (*end) { 88 if (*end) {
87 ti->error = "dm-stripe: Invalid stripe count"; 89 ti->error = "Invalid stripe count";
88 return -EINVAL; 90 return -EINVAL;
89 } 91 }
90 92
91 chunk_size = simple_strtoul(argv[1], &end, 10); 93 chunk_size = simple_strtoul(argv[1], &end, 10);
92 if (*end) { 94 if (*end) {
93 ti->error = "dm-stripe: Invalid chunk_size"; 95 ti->error = "Invalid chunk_size";
94 return -EINVAL; 96 return -EINVAL;
95 } 97 }
96 98
@@ -99,19 +101,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
99 */ 101 */
100 if (!chunk_size || (chunk_size & (chunk_size - 1)) || 102 if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
101 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { 103 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
102 ti->error = "dm-stripe: Invalid chunk size"; 104 ti->error = "Invalid chunk size";
103 return -EINVAL; 105 return -EINVAL;
104 } 106 }
105 107
106 if (ti->len & (chunk_size - 1)) { 108 if (ti->len & (chunk_size - 1)) {
107 ti->error = "dm-stripe: Target length not divisible by " 109 ti->error = "Target length not divisible by "
108 "chunk size"; 110 "chunk size";
109 return -EINVAL; 111 return -EINVAL;
110 } 112 }
111 113
112 width = ti->len; 114 width = ti->len;
113 if (sector_div(width, stripes)) { 115 if (sector_div(width, stripes)) {
114 ti->error = "dm-stripe: Target length not divisible by " 116 ti->error = "Target length not divisible by "
115 "number of stripes"; 117 "number of stripes";
116 return -EINVAL; 118 return -EINVAL;
117 } 119 }
@@ -120,14 +122,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
120 * Do we have enough arguments for that many stripes ? 122 * Do we have enough arguments for that many stripes ?
121 */ 123 */
122 if (argc != (2 + 2 * stripes)) { 124 if (argc != (2 + 2 * stripes)) {
123 ti->error = "dm-stripe: Not enough destinations " 125 ti->error = "Not enough destinations "
124 "specified"; 126 "specified";
125 return -EINVAL; 127 return -EINVAL;
126 } 128 }
127 129
128 sc = alloc_context(stripes); 130 sc = alloc_context(stripes);
129 if (!sc) { 131 if (!sc) {
130 ti->error = "dm-stripe: Memory allocation for striped context " 132 ti->error = "Memory allocation for striped context "
131 "failed"; 133 "failed";
132 return -ENOMEM; 134 return -ENOMEM;
133 } 135 }
@@ -149,8 +151,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
149 151
150 r = get_stripe(ti, sc, i, argv); 152 r = get_stripe(ti, sc, i, argv);
151 if (r < 0) { 153 if (r < 0) {
152 ti->error = "dm-stripe: Couldn't parse stripe " 154 ti->error = "Couldn't parse stripe destination";
153 "destination";
154 while (i--) 155 while (i--)
155 dm_put_device(ti, sc->stripe[i].dev); 156 dm_put_device(ti, sc->stripe[i].dev);
156 kfree(sc); 157 kfree(sc);
@@ -227,7 +228,7 @@ int __init dm_stripe_init(void)
227 228
228 r = dm_register_target(&stripe_target); 229 r = dm_register_target(&stripe_target);
229 if (r < 0) 230 if (r < 0)
230 DMWARN("striped target registration failed"); 231 DMWARN("target registration failed");
231 232
232 return r; 233 return r;
233} 234}
@@ -235,7 +236,7 @@ int __init dm_stripe_init(void)
235void dm_stripe_exit(void) 236void dm_stripe_exit(void)
236{ 237{
237 if (dm_unregister_target(&stripe_target)) 238 if (dm_unregister_target(&stripe_target))
238 DMWARN("striped target unregistration failed"); 239 DMWARN("target unregistration failed");
239 240
240 return; 241 return;
241} 242}
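stripe_ctr() keeps the usual bit trick for validating the chunk size: a nonzero x is a power of two iff (x & (x - 1)) == 0. A minimal sketch:

#include <stdio.h>

/* a nonzero x is a power of two iff clearing its lowest set bit
 * leaves nothing behind */
static int is_power_of_two(unsigned long x)
{
	return x && !(x & (x - 1));
}

int main(void)
{
	printf("%d %d %d\n",
	       is_power_of_two(8),	/* 1 */
	       is_power_of_two(12),	/* 0 */
	       is_power_of_two(0));	/* 0 */
	return 0;
}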
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f56a54cf0ce..75fe9493e6af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,6 +17,8 @@
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <asm/atomic.h> 18#include <asm/atomic.h>
19 19
20#define DM_MSG_PREFIX "table"
21
20#define MAX_DEPTH 16 22#define MAX_DEPTH 16
21#define NODE_SIZE L1_CACHE_BYTES 23#define NODE_SIZE L1_CACHE_BYTES
22#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
@@ -237,6 +239,44 @@ int dm_table_create(struct dm_table **result, int mode,
237 return 0; 239 return 0;
238} 240}
239 241
242int dm_create_error_table(struct dm_table **result, struct mapped_device *md)
243{
244 struct dm_table *t;
245 sector_t dev_size = 1;
246 int r;
247
248 /*
249 * Find current size of device.
250 * Default to 1 sector if inactive.
251 */
252 t = dm_get_table(md);
253 if (t) {
254 dev_size = dm_table_get_size(t);
255 dm_table_put(t);
256 }
257
258 r = dm_table_create(&t, FMODE_READ, 1, md);
259 if (r)
260 return r;
261
262 r = dm_table_add_target(t, "error", 0, dev_size, NULL);
263 if (r)
264 goto out;
265
266 r = dm_table_complete(t);
267 if (r)
268 goto out;
269
270 *result = t;
271
272out:
273 if (r)
274 dm_table_put(t);
275
276 return r;
277}
278EXPORT_SYMBOL_GPL(dm_create_error_table);
279
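A sketch of how a caller might drive the new dm_create_error_table() helper, using only interfaces visible elsewhere in this patch (dm_suspend(), dm_swap_table(), dm_resume(), dm_table_put()). The function name is hypothetical, error handling is trimmed, and the ownership comment mirrors the ioctl resume path rather than anything guaranteed here:

/* Illustrative only: replace a device's map with an all-error table. */
static int wedge_device(struct mapped_device *md)
{
	struct dm_table *t;
	int r;

	r = dm_create_error_table(&t, md);	/* error target sized to the dev */
	if (r)
		return r;

	dm_suspend(md, 0);			/* dm_swap_table() requires this */
	r = dm_swap_table(md, t);		/* on success md owns the table (assumed) */
	if (r)
		dm_table_put(t);		/* still our reference on failure */
	dm_resume(md);

	return r;
}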
240static void free_devices(struct list_head *devices) 280static void free_devices(struct list_head *devices)
241{ 281{
242 struct list_head *tmp, *next; 282 struct list_head *tmp, *next;
@@ -590,6 +630,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
590 unsigned array_size = 0; 630 unsigned array_size = 0;
591 631
592 *argc = 0; 632 *argc = 0;
633
634 if (!input) {
635 *argvp = NULL;
636 return 0;
637 }
638
593 argv = realloc_argv(&array_size, argv); 639 argv = realloc_argv(&array_size, argv);
594 if (!argv) 640 if (!argv)
595 return -ENOMEM; 641 return -ENOMEM;
@@ -671,15 +717,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
671 memset(tgt, 0, sizeof(*tgt)); 717 memset(tgt, 0, sizeof(*tgt));
672 718
673 if (!len) { 719 if (!len) {
674 tgt->error = "zero-length target"; 720 DMERR("%s: zero-length target", dm_device_name(t->md));
675 DMERR("%s", tgt->error);
676 return -EINVAL; 721 return -EINVAL;
677 } 722 }
678 723
679 tgt->type = dm_get_target_type(type); 724 tgt->type = dm_get_target_type(type);
680 if (!tgt->type) { 725 if (!tgt->type) {
681 tgt->error = "unknown target type"; 726 DMERR("%s: %s: unknown target type", dm_device_name(t->md),
682 DMERR("%s", tgt->error); 727 type);
683 return -EINVAL; 728 return -EINVAL;
684 } 729 }
685 730
@@ -716,7 +761,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
716 return 0; 761 return 0;
717 762
718 bad: 763 bad:
719 DMERR("%s", tgt->error); 764 DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
720 dm_put_target_type(tgt->type); 765 dm_put_target_type(tgt->type);
721 return r; 766 return r;
722} 767}
@@ -802,7 +847,7 @@ sector_t dm_table_get_size(struct dm_table *t)
802 847
803struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 848struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
804{ 849{
805 if (index > t->num_targets) 850 if (index >= t->num_targets)
806 return NULL; 851 return NULL;
807 852
808 return t->targets + index; 853 return t->targets + index;
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 64fd8e79ea4c..477a041a41cf 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "target"
16
15struct tt_internal { 17struct tt_internal {
16 struct target_type tt; 18 struct target_type tt;
17 19
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 51c0639b2487..ea569f7348d2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -10,13 +10,15 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/bio.h> 11#include <linux/bio.h>
12 12
13#define DM_MSG_PREFIX "zero"
14
13/* 15/*
14 * Construct a dummy mapping that only returns zeros 16 * Construct a dummy mapping that only returns zeros
15 */ 17 */
16static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) 18static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
17{ 19{
18 if (argc != 0) { 20 if (argc != 0) {
19 ti->error = "dm-zero: No arguments required"; 21 ti->error = "No arguments required";
20 return -EINVAL; 22 return -EINVAL;
21 } 23 }
22 24
@@ -60,7 +62,7 @@ static int __init dm_zero_init(void)
60 int r = dm_register_target(&zero_target); 62 int r = dm_register_target(&zero_target);
61 63
62 if (r < 0) 64 if (r < 0)
63 DMERR("zero: register failed %d", r); 65 DMERR("register failed %d", r);
64 66
65 return r; 67 return r;
66} 68}
@@ -70,7 +72,7 @@ static void __exit dm_zero_exit(void)
70 int r = dm_unregister_target(&zero_target); 72 int r = dm_unregister_target(&zero_target);
71 73
72 if (r < 0) 74 if (r < 0)
73 DMERR("zero: unregister failed %d", r); 75 DMERR("unregister failed %d", r);
74} 76}
75 77
76module_init(dm_zero_init) 78module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d710b7a133b..3ed2e53b9eb6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -21,11 +21,14 @@
21#include <linux/hdreg.h> 21#include <linux/hdreg.h>
22#include <linux/blktrace_api.h> 22#include <linux/blktrace_api.h>
23 23
24#define DM_MSG_PREFIX "core"
25
24static const char *_name = DM_NAME; 26static const char *_name = DM_NAME;
25 27
26static unsigned int major = 0; 28static unsigned int major = 0;
27static unsigned int _major = 0; 29static unsigned int _major = 0;
28 30
31static DEFINE_SPINLOCK(_minor_lock);
29/* 32/*
30 * One of these is allocated per bio. 33 * One of these is allocated per bio.
31 */ 34 */
@@ -49,23 +52,28 @@ struct target_io {
49 52
50union map_info *dm_get_mapinfo(struct bio *bio) 53union map_info *dm_get_mapinfo(struct bio *bio)
51{ 54{
52 if (bio && bio->bi_private) 55 if (bio && bio->bi_private)
53 return &((struct target_io *)bio->bi_private)->info; 56 return &((struct target_io *)bio->bi_private)->info;
54 return NULL; 57 return NULL;
55} 58}
56 59
60#define MINOR_ALLOCED ((void *)-1)
61
57/* 62/*
58 * Bits for the md->flags field. 63 * Bits for the md->flags field.
59 */ 64 */
60#define DMF_BLOCK_IO 0 65#define DMF_BLOCK_IO 0
61#define DMF_SUSPENDED 1 66#define DMF_SUSPENDED 1
62#define DMF_FROZEN 2 67#define DMF_FROZEN 2
68#define DMF_FREEING 3
69#define DMF_DELETING 4
63 70
64struct mapped_device { 71struct mapped_device {
65 struct rw_semaphore io_lock; 72 struct rw_semaphore io_lock;
66 struct semaphore suspend_lock; 73 struct semaphore suspend_lock;
67 rwlock_t map_lock; 74 rwlock_t map_lock;
68 atomic_t holders; 75 atomic_t holders;
76 atomic_t open_count;
69 77
70 unsigned long flags; 78 unsigned long flags;
71 79
@@ -218,9 +226,25 @@ static int dm_blk_open(struct inode *inode, struct file *file)
218{ 226{
219 struct mapped_device *md; 227 struct mapped_device *md;
220 228
229 spin_lock(&_minor_lock);
230
221 md = inode->i_bdev->bd_disk->private_data; 231 md = inode->i_bdev->bd_disk->private_data;
232 if (!md)
233 goto out;
234
235 if (test_bit(DMF_FREEING, &md->flags) ||
236 test_bit(DMF_DELETING, &md->flags)) {
237 md = NULL;
238 goto out;
239 }
240
222 dm_get(md); 241 dm_get(md);
223 return 0; 242 atomic_inc(&md->open_count);
243
244out:
245 spin_unlock(&_minor_lock);
246
247 return md ? 0 : -ENXIO;
224} 248}
225 249
226static int dm_blk_close(struct inode *inode, struct file *file) 250static int dm_blk_close(struct inode *inode, struct file *file)
@@ -228,10 +252,35 @@ static int dm_blk_close(struct inode *inode, struct file *file)
228 struct mapped_device *md; 252 struct mapped_device *md;
229 253
230 md = inode->i_bdev->bd_disk->private_data; 254 md = inode->i_bdev->bd_disk->private_data;
255 atomic_dec(&md->open_count);
231 dm_put(md); 256 dm_put(md);
232 return 0; 257 return 0;
233} 258}
234 259
260int dm_open_count(struct mapped_device *md)
261{
262 return atomic_read(&md->open_count);
263}
264
265/*
266 * Guarantees nothing is using the device before it's deleted.
267 */
268int dm_lock_for_deletion(struct mapped_device *md)
269{
270 int r = 0;
271
272 spin_lock(&_minor_lock);
273
274 if (dm_open_count(md))
275 r = -EBUSY;
276 else
277 set_bit(DMF_DELETING, &md->flags);
278
279 spin_unlock(&_minor_lock);
280
281 return r;
282}
283
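Why the pairing above is race-free (reasoning from this patch, not new behaviour): dm_blk_open() tests DMF_DELETING under the same _minor_lock that dm_lock_for_deletion() takes, so either an open sees the flag and fails with -ENXIO, or the delete path sees open_count > 0 and backs off with -EBUSY. A hypothetical caller (the ioctl remove path is the intended user) would look roughly like:

/* Hypothetical sketch; the name and simplifications are ours. */
static int try_remove(struct mapped_device *md)
{
	int r = dm_lock_for_deletion(md);	/* -EBUSY while anyone has it open */
	if (r)
		return r;

	dm_put(md);	/* drop this reference; teardown happens once the
			   last holder is gone */
	return 0;
}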
235static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 284static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
236{ 285{
237 struct mapped_device *md = bdev->bd_disk->private_data; 286 struct mapped_device *md = bdev->bd_disk->private_data;
@@ -456,8 +505,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
456 if (r > 0) { 505 if (r > 0) {
457 /* the bio has been remapped so dispatch it */ 506 /* the bio has been remapped so dispatch it */
458 507
459 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 508 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
460 tio->io->bio->bi_bdev->bd_dev, sector, 509 tio->io->bio->bi_bdev->bd_dev, sector,
461 clone->bi_sector); 510 clone->bi_sector);
462 511
463 generic_make_request(clone); 512 generic_make_request(clone);
@@ -744,43 +793,39 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
744/*----------------------------------------------------------------- 793/*-----------------------------------------------------------------
745 * An IDR is used to keep track of allocated minor numbers. 794 * An IDR is used to keep track of allocated minor numbers.
746 *---------------------------------------------------------------*/ 795 *---------------------------------------------------------------*/
747static DEFINE_MUTEX(_minor_lock);
748static DEFINE_IDR(_minor_idr); 796static DEFINE_IDR(_minor_idr);
749 797
750static void free_minor(unsigned int minor) 798static void free_minor(int minor)
751{ 799{
752 mutex_lock(&_minor_lock); 800 spin_lock(&_minor_lock);
753 idr_remove(&_minor_idr, minor); 801 idr_remove(&_minor_idr, minor);
754 mutex_unlock(&_minor_lock); 802 spin_unlock(&_minor_lock);
755} 803}
756 804
757/* 805/*
758 * See if the device with a specific minor # is free. 806 * See if the device with a specific minor # is free.
759 */ 807 */
760static int specific_minor(struct mapped_device *md, unsigned int minor) 808static int specific_minor(struct mapped_device *md, int minor)
761{ 809{
762 int r, m; 810 int r, m;
763 811
764 if (minor >= (1 << MINORBITS)) 812 if (minor >= (1 << MINORBITS))
765 return -EINVAL; 813 return -EINVAL;
766 814
767 mutex_lock(&_minor_lock); 815 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
816 if (!r)
817 return -ENOMEM;
818
819 spin_lock(&_minor_lock);
768 820
769 if (idr_find(&_minor_idr, minor)) { 821 if (idr_find(&_minor_idr, minor)) {
770 r = -EBUSY; 822 r = -EBUSY;
771 goto out; 823 goto out;
772 } 824 }
773 825
774 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 826 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
775 if (!r) { 827 if (r)
776 r = -ENOMEM;
777 goto out;
778 }
779
780 r = idr_get_new_above(&_minor_idr, md, minor, &m);
781 if (r) {
782 goto out; 828 goto out;
783 }
784 829
785 if (m != minor) { 830 if (m != minor) {
786 idr_remove(&_minor_idr, m); 831 idr_remove(&_minor_idr, m);
@@ -789,24 +834,21 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
789 } 834 }
790 835
791out: 836out:
792 mutex_unlock(&_minor_lock); 837 spin_unlock(&_minor_lock);
793 return r; 838 return r;
794} 839}
795 840
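A note on the locking rework above (standard kernel rules, not speculation): idr_pre_get(..., GFP_KERNEL) may sleep, which was fine under the old mutex but is illegal under the new _minor_lock spinlock, so the preallocation now happens before the lock is taken. The MINOR_ALLOCED ((void *)-1) placeholder reserves the minor while the mapped_device is still under construction:

/* Minor-number handshake introduced by this patch (summary):
 *
 *   specific_minor()/next_free_minor():  idr slot <- MINOR_ALLOCED
 *   alloc_dev():   idr_replace(slot, md), publishing the real pointer
 *   dm_find_md():  MINOR_ALLOCED or DMF_FREEING => treated as absent
 *   dm_put():      idr_replace(slot, MINOR_ALLOCED) before teardown
 *   free_minor():  idr_remove(slot)
 */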
796static int next_free_minor(struct mapped_device *md, unsigned int *minor) 841static int next_free_minor(struct mapped_device *md, int *minor)
797{ 842{
798 int r; 843 int r, m;
799 unsigned int m;
800
801 mutex_lock(&_minor_lock);
802 844
803 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 845 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
804 if (!r) { 846 if (!r)
805 r = -ENOMEM; 847 return -ENOMEM;
806 goto out; 848
807 } 849 spin_lock(&_minor_lock);
808 850
809 r = idr_get_new(&_minor_idr, md, &m); 851 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
810 if (r) { 852 if (r) {
811 goto out; 853 goto out;
812 } 854 }
@@ -820,7 +862,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
820 *minor = m; 862 *minor = m;
821 863
822out: 864out:
823 mutex_unlock(&_minor_lock); 865 spin_unlock(&_minor_lock);
824 return r; 866 return r;
825} 867}
826 868
@@ -829,18 +871,25 @@ static struct block_device_operations dm_blk_dops;
829/* 871/*
830 * Allocate and initialise a blank device with a given minor. 872 * Allocate and initialise a blank device with a given minor.
831 */ 873 */
832static struct mapped_device *alloc_dev(unsigned int minor, int persistent) 874static struct mapped_device *alloc_dev(int minor)
833{ 875{
834 int r; 876 int r;
835 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 877 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
878 void *old_md;
836 879
837 if (!md) { 880 if (!md) {
838 DMWARN("unable to allocate device, out of memory."); 881 DMWARN("unable to allocate device, out of memory.");
839 return NULL; 882 return NULL;
840 } 883 }
841 884
885 if (!try_module_get(THIS_MODULE))
886 goto bad0;
887
842 /* get a minor number for the dev */ 888 /* get a minor number for the dev */
843 r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); 889 if (minor == DM_ANY_MINOR)
890 r = next_free_minor(md, &minor);
891 else
892 r = specific_minor(md, minor);
844 if (r < 0) 893 if (r < 0)
845 goto bad1; 894 goto bad1;
846 895
@@ -849,6 +898,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
849 init_MUTEX(&md->suspend_lock); 898 init_MUTEX(&md->suspend_lock);
850 rwlock_init(&md->map_lock); 899 rwlock_init(&md->map_lock);
851 atomic_set(&md->holders, 1); 900 atomic_set(&md->holders, 1);
901 atomic_set(&md->open_count, 0);
852 atomic_set(&md->event_nr, 0); 902 atomic_set(&md->event_nr, 0);
853 903
854 md->queue = blk_alloc_queue(GFP_KERNEL); 904 md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -875,6 +925,10 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
875 if (!md->disk) 925 if (!md->disk)
876 goto bad4; 926 goto bad4;
877 927
928 atomic_set(&md->pending, 0);
929 init_waitqueue_head(&md->wait);
930 init_waitqueue_head(&md->eventq);
931
878 md->disk->major = _major; 932 md->disk->major = _major;
879 md->disk->first_minor = minor; 933 md->disk->first_minor = minor;
880 md->disk->fops = &dm_blk_dops; 934 md->disk->fops = &dm_blk_dops;
@@ -884,9 +938,12 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
884 add_disk(md->disk); 938 add_disk(md->disk);
885 format_dev_t(md->name, MKDEV(_major, minor)); 939 format_dev_t(md->name, MKDEV(_major, minor));
886 940
887 atomic_set(&md->pending, 0); 941 /* Populate the mapping, nobody knows we exist yet */
888 init_waitqueue_head(&md->wait); 942 spin_lock(&_minor_lock);
889 init_waitqueue_head(&md->eventq); 943 old_md = idr_replace(&_minor_idr, md, minor);
944 spin_unlock(&_minor_lock);
945
946 BUG_ON(old_md != MINOR_ALLOCED);
890 947
891 return md; 948 return md;
892 949
@@ -898,13 +955,15 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
898 blk_cleanup_queue(md->queue); 955 blk_cleanup_queue(md->queue);
899 free_minor(minor); 956 free_minor(minor);
900 bad1: 957 bad1:
958 module_put(THIS_MODULE);
959 bad0:
901 kfree(md); 960 kfree(md);
902 return NULL; 961 return NULL;
903} 962}
904 963
905static void free_dev(struct mapped_device *md) 964static void free_dev(struct mapped_device *md)
906{ 965{
907 unsigned int minor = md->disk->first_minor; 966 int minor = md->disk->first_minor;
908 967
909 if (md->suspended_bdev) { 968 if (md->suspended_bdev) {
910 thaw_bdev(md->suspended_bdev, NULL); 969 thaw_bdev(md->suspended_bdev, NULL);
@@ -914,8 +973,14 @@ static void free_dev(struct mapped_device *md)
914 mempool_destroy(md->io_pool); 973 mempool_destroy(md->io_pool);
915 del_gendisk(md->disk); 974 del_gendisk(md->disk);
916 free_minor(minor); 975 free_minor(minor);
976
977 spin_lock(&_minor_lock);
978 md->disk->private_data = NULL;
979 spin_unlock(&_minor_lock);
980
917 put_disk(md->disk); 981 put_disk(md->disk);
918 blk_cleanup_queue(md->queue); 982 blk_cleanup_queue(md->queue);
983 module_put(THIS_MODULE);
919 kfree(md); 984 kfree(md);
920} 985}
921 986
@@ -984,12 +1049,11 @@ static void __unbind(struct mapped_device *md)
984/* 1049/*
985 * Constructor for a new device. 1050 * Constructor for a new device.
986 */ 1051 */
987static int create_aux(unsigned int minor, int persistent, 1052int dm_create(int minor, struct mapped_device **result)
988 struct mapped_device **result)
989{ 1053{
990 struct mapped_device *md; 1054 struct mapped_device *md;
991 1055
992 md = alloc_dev(minor, persistent); 1056 md = alloc_dev(minor);
993 if (!md) 1057 if (!md)
994 return -ENXIO; 1058 return -ENXIO;
995 1059
@@ -997,16 +1061,6 @@ static int create_aux(unsigned int minor, int persistent,
997 return 0; 1061 return 0;
998} 1062}
999 1063
1000int dm_create(struct mapped_device **result)
1001{
1002 return create_aux(0, 0, result);
1003}
1004
1005int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
1006{
1007 return create_aux(minor, 1, result);
1008}
1009
1010static struct mapped_device *dm_find_md(dev_t dev) 1064static struct mapped_device *dm_find_md(dev_t dev)
1011{ 1065{
1012 struct mapped_device *md; 1066 struct mapped_device *md;
@@ -1015,13 +1069,18 @@ static struct mapped_device *dm_find_md(dev_t dev)
1015 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1069 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1016 return NULL; 1070 return NULL;
1017 1071
1018 mutex_lock(&_minor_lock); 1072 spin_lock(&_minor_lock);
1019 1073
1020 md = idr_find(&_minor_idr, minor); 1074 md = idr_find(&_minor_idr, minor);
1021 if (!md || (dm_disk(md)->first_minor != minor)) 1075 if (md && (md == MINOR_ALLOCED ||
1076 (dm_disk(md)->first_minor != minor) ||
1077 test_bit(DMF_FREEING, &md->flags))) {
1022 md = NULL; 1078 md = NULL;
1079 goto out;
1080 }
1023 1081
1024 mutex_unlock(&_minor_lock); 1082out:
1083 spin_unlock(&_minor_lock);
1025 1084
1026 return md; 1085 return md;
1027} 1086}
@@ -1051,12 +1110,23 @@ void dm_get(struct mapped_device *md)
1051 atomic_inc(&md->holders); 1110 atomic_inc(&md->holders);
1052} 1111}
1053 1112
1113const char *dm_device_name(struct mapped_device *md)
1114{
1115 return md->name;
1116}
1117EXPORT_SYMBOL_GPL(dm_device_name);
1118
1054void dm_put(struct mapped_device *md) 1119void dm_put(struct mapped_device *md)
1055{ 1120{
1056 struct dm_table *map; 1121 struct dm_table *map;
1057 1122
1058 if (atomic_dec_and_test(&md->holders)) { 1123 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1124
1125 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1059 map = dm_get_table(md); 1126 map = dm_get_table(md);
1127 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1128 set_bit(DMF_FREEING, &md->flags);
1129 spin_unlock(&_minor_lock);
1060 if (!dm_suspended(md)) { 1130 if (!dm_suspended(md)) {
1061 dm_table_presuspend_targets(map); 1131 dm_table_presuspend_targets(map);
1062 dm_table_postsuspend_targets(map); 1132 dm_table_postsuspend_targets(map);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index fd90bc8f9e45..3c03c0ecab7e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -2,7 +2,7 @@
2 * Internal header file for device mapper 2 * Internal header file for device mapper
3 * 3 *
4 * Copyright (C) 2001, 2002 Sistina Software 4 * Copyright (C) 2001, 2002 Sistina Software
5 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 5 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
6 * 6 *
7 * This file is released under the LGPL. 7 * This file is released under the LGPL.
8 */ 8 */
@@ -17,9 +17,10 @@
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18 18
19#define DM_NAME "device-mapper" 19#define DM_NAME "device-mapper"
20#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) 20
21#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) 21#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
22#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) 22#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23 24
24#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 25#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
25 0 : scnprintf(result + sz, maxlen - sz, x)) 26 0 : scnprintf(result + sz, maxlen - sz, x))
@@ -39,83 +40,16 @@ struct dm_dev {
39}; 40};
40 41
41struct dm_table; 42struct dm_table;
42struct mapped_device;
43
44/*-----------------------------------------------------------------
45 * Functions for manipulating a struct mapped_device.
46 * Drop the reference with dm_put when you finish with the object.
47 *---------------------------------------------------------------*/
48int dm_create(struct mapped_device **md);
49int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
50void dm_set_mdptr(struct mapped_device *md, void *ptr);
51void *dm_get_mdptr(struct mapped_device *md);
52struct mapped_device *dm_get_md(dev_t dev);
53
54/*
55 * Reference counting for md.
56 */
57void dm_get(struct mapped_device *md);
58void dm_put(struct mapped_device *md);
59
60/*
61 * A device can still be used while suspended, but I/O is deferred.
62 */
63int dm_suspend(struct mapped_device *md, int with_lockfs);
64int dm_resume(struct mapped_device *md);
65
66/*
67 * The device must be suspended before calling this method.
68 */
69int dm_swap_table(struct mapped_device *md, struct dm_table *t);
70
71/*
72 * Drop a reference on the table when you've finished with the
73 * result.
74 */
75struct dm_table *dm_get_table(struct mapped_device *md);
76
77/*
78 * Event functions.
79 */
80uint32_t dm_get_event_nr(struct mapped_device *md);
81int dm_wait_event(struct mapped_device *md, int event_nr);
82
83/*
84 * Info functions.
85 */
86struct gendisk *dm_disk(struct mapped_device *md);
87int dm_suspended(struct mapped_device *md);
88
89/*
90 * Geometry functions.
91 */
92int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
93int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
94 43
95/*----------------------------------------------------------------- 44/*-----------------------------------------------------------------
96 * Functions for manipulating a table. Tables are also reference 45 * Internal table functions.
97 * counted.
98 *---------------------------------------------------------------*/ 46 *---------------------------------------------------------------*/
99int dm_table_create(struct dm_table **result, int mode,
100 unsigned num_targets, struct mapped_device *md);
101
102void dm_table_get(struct dm_table *t);
103void dm_table_put(struct dm_table *t);
104
105int dm_table_add_target(struct dm_table *t, const char *type,
106 sector_t start, sector_t len, char *params);
107int dm_table_complete(struct dm_table *t);
108void dm_table_event_callback(struct dm_table *t, 47void dm_table_event_callback(struct dm_table *t,
109 void (*fn)(void *), void *context); 48 void (*fn)(void *), void *context);
110void dm_table_event(struct dm_table *t);
111sector_t dm_table_get_size(struct dm_table *t);
112struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 49struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
113struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 50struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
114void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); 51void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
115unsigned int dm_table_get_num_targets(struct dm_table *t);
116struct list_head *dm_table_get_devices(struct dm_table *t); 52struct list_head *dm_table_get_devices(struct dm_table *t);
117int dm_table_get_mode(struct dm_table *t);
118struct mapped_device *dm_table_get_md(struct dm_table *t);
119void dm_table_presuspend_targets(struct dm_table *t); 53void dm_table_presuspend_targets(struct dm_table *t);
120void dm_table_postsuspend_targets(struct dm_table *t); 54void dm_table_postsuspend_targets(struct dm_table *t);
121void dm_table_resume_targets(struct dm_table *t); 55void dm_table_resume_targets(struct dm_table *t);
@@ -133,7 +67,6 @@ void dm_put_target_type(struct target_type *t);
133int dm_target_iterate(void (*iter_func)(struct target_type *tt, 67int dm_target_iterate(void (*iter_func)(struct target_type *tt,
134 void *param), void *param); 68 void *param), void *param);
135 69
136
137/*----------------------------------------------------------------- 70/*-----------------------------------------------------------------
138 * Useful inlines. 71 * Useful inlines.
139 *---------------------------------------------------------------*/ 72 *---------------------------------------------------------------*/
@@ -191,5 +124,7 @@ void dm_stripe_exit(void);
191 124
192void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); 125void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
193union map_info *dm_get_mapinfo(struct bio *bio); 126union map_info *dm_get_mapinfo(struct bio *bio);
127int dm_open_count(struct mapped_device *md);
128int dm_lock_for_deletion(struct mapped_device *md);
194 129
195#endif 130#endif
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 72480a48d88b..73ab875fb158 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -314,7 +314,7 @@ static void complete_io(unsigned long error, void *context)
314 314
315 if (error) { 315 if (error) {
316 if (job->rw == WRITE) 316 if (job->rw == WRITE)
317 job->write_err &= error; 317 job->write_err |= error;
318 else 318 else
319 job->read_err = 1; 319 job->read_err = 1;
320 320
@@ -460,7 +460,7 @@ static void segment_complete(int read_err,
460 job->read_err = 1; 460 job->read_err = 1;
461 461
462 if (write_err) 462 if (write_err)
463 job->write_err &= write_err; 463 job->write_err |= write_err;
464 464
465 /* 465 /*
466 * Only dispatch more work if there hasn't been an error. 466 * Only dispatch more work if there hasn't been an error.
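Both kcopyd hunks fix the same latent bug: job->write_err starts at zero, so accumulating error bits with &= always left it zero and write errors were silently dropped; |= ORs each I/O's error bits in as intended. In miniature:

/* Why the old accumulation lost errors (illustrative values): */
unsigned long write_err = 0;	/* initial job state              */
write_err &= 0x4;		/* 0 & 0x4 == 0,   error masked   */
write_err |= 0x4;		/* 0 | 0x4 == 0x4, error recorded */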
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 777585458c85..ff83c9b5979e 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,7 +111,7 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
111 return ret; 111 return ret;
112} 112}
113 113
114static int linear_run (mddev_t *mddev) 114static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115{ 115{
116 linear_conf_t *conf; 116 linear_conf_t *conf;
117 dev_info_t **table; 117 dev_info_t **table;
@@ -121,20 +121,21 @@ static int linear_run (mddev_t *mddev)
121 sector_t curr_offset; 121 sector_t curr_offset;
122 struct list_head *tmp; 122 struct list_head *tmp;
123 123
124 conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), 124 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
125 GFP_KERNEL); 125 GFP_KERNEL);
126 if (!conf) 126 if (!conf)
127 goto out; 127 return NULL;
128
128 mddev->private = conf; 129 mddev->private = conf;
129 130
130 cnt = 0; 131 cnt = 0;
131 mddev->array_size = 0; 132 conf->array_size = 0;
132 133
133 ITERATE_RDEV(mddev,rdev,tmp) { 134 ITERATE_RDEV(mddev,rdev,tmp) {
134 int j = rdev->raid_disk; 135 int j = rdev->raid_disk;
135 dev_info_t *disk = conf->disks + j; 136 dev_info_t *disk = conf->disks + j;
136 137
137 if (j < 0 || j > mddev->raid_disks || disk->rdev) { 138 if (j < 0 || j > raid_disks || disk->rdev) {
138 printk("linear: disk numbering problem. Aborting!\n"); 139 printk("linear: disk numbering problem. Aborting!\n");
139 goto out; 140 goto out;
140 } 141 }
@@ -152,11 +153,11 @@ static int linear_run (mddev_t *mddev)
152 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 153 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
153 154
154 disk->size = rdev->size; 155 disk->size = rdev->size;
155 mddev->array_size += rdev->size; 156 conf->array_size += rdev->size;
156 157
157 cnt++; 158 cnt++;
158 } 159 }
159 if (cnt != mddev->raid_disks) { 160 if (cnt != raid_disks) {
160 printk("linear: not enough drives present. Aborting!\n"); 161 printk("linear: not enough drives present. Aborting!\n");
161 goto out; 162 goto out;
162 } 163 }
@@ -200,7 +201,7 @@ static int linear_run (mddev_t *mddev)
200 unsigned round; 201 unsigned round;
201 unsigned long base; 202 unsigned long base;
202 203
203 sz = mddev->array_size >> conf->preshift; 204 sz = conf->array_size >> conf->preshift;
204 sz += 1; /* force round-up */ 205 sz += 1; /* force round-up */
205 base = conf->hash_spacing >> conf->preshift; 206 base = conf->hash_spacing >> conf->preshift;
206 round = sector_div(sz, base); 207 round = sector_div(sz, base);
@@ -247,14 +248,56 @@ static int linear_run (mddev_t *mddev)
247 248
248 BUG_ON(table - conf->hash_table > nb_zone); 249 BUG_ON(table - conf->hash_table > nb_zone);
249 250
251 return conf;
252
253out:
254 kfree(conf);
255 return NULL;
256}
257
258static int linear_run (mddev_t *mddev)
259{
260 linear_conf_t *conf;
261
262 conf = linear_conf(mddev, mddev->raid_disks);
263
264 if (!conf)
265 return 1;
266 mddev->private = conf;
267 mddev->array_size = conf->array_size;
268
250 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 269 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
251 mddev->queue->unplug_fn = linear_unplug; 270 mddev->queue->unplug_fn = linear_unplug;
252 mddev->queue->issue_flush_fn = linear_issue_flush; 271 mddev->queue->issue_flush_fn = linear_issue_flush;
253 return 0; 272 return 0;
273}
254 274
255out: 275static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
256 kfree(conf); 276{
257 return 1; 277 /* Adding a drive to a linear array allows the array to grow.
278 * It is permitted if the new drive has a matching superblock
279 * already on it, with raid_disk equal to raid_disks.
280 * It is achieved by creating a new linear_private_data structure
 281 * and swapping it in, in place of the current one.

282 * The current one is never freed until the array is stopped.
283 * This avoids races.
284 */
285 linear_conf_t *newconf;
286
287 if (rdev->raid_disk != mddev->raid_disks)
288 return -EINVAL;
289
290 newconf = linear_conf(mddev,mddev->raid_disks+1);
291
292 if (!newconf)
293 return -ENOMEM;
294
295 newconf->prev = mddev_to_conf(mddev);
296 mddev->private = newconf;
297 mddev->raid_disks++;
298 mddev->array_size = newconf->array_size;
299 set_capacity(mddev->gendisk, mddev->array_size << 1);
300 return 0;
258} 301}
259 302
260static int linear_stop (mddev_t *mddev) 303static int linear_stop (mddev_t *mddev)
@@ -262,8 +305,12 @@ static int linear_stop (mddev_t *mddev)
262 linear_conf_t *conf = mddev_to_conf(mddev); 305 linear_conf_t *conf = mddev_to_conf(mddev);
263 306
264 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 307 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
265 kfree(conf->hash_table); 308 do {
266 kfree(conf); 309 linear_conf_t *t = conf->prev;
310 kfree(conf->hash_table);
311 kfree(conf);
312 conf = t;
313 } while (conf);
267 314
268 return 0; 315 return 0;
269} 316}
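A note on the design choice (following the comment in linear_add() above): a request already inside the mapping code may still be reading the old conf's hash table, so the old generation cannot be freed at hot-add time. Chaining it on newconf->prev and deferring every kfree() to linear_stop() keeps the reader path lock-free at the cost of a little memory:

/* Lifetime of linear_conf_t generations after N hot-adds (sketch):
 *
 *   mddev->private -> conf_N -> conf_(N-1) -> ... -> conf_0 -> NULL
 *                        (chained via ->prev)
 *
 * Nothing is freed until linear_stop() walks the whole chain.
 */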
@@ -360,6 +407,7 @@ static struct mdk_personality linear_personality =
360 .run = linear_run, 407 .run = linear_run,
361 .stop = linear_stop, 408 .stop = linear_stop,
362 .status = linear_status, 409 .status = linear_status,
410 .hot_add_disk = linear_add,
363}; 411};
364 412
365static int __init linear_init (void) 413static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f19b874753a9..306268ec99ff 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
44#include <linux/suspend.h> 44#include <linux/suspend.h>
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/ctype.h>
47 48
48#include <linux/init.h> 49#include <linux/init.h>
49 50
@@ -72,6 +73,10 @@ static void autostart_arrays (int part);
72static LIST_HEAD(pers_list); 73static LIST_HEAD(pers_list);
73static DEFINE_SPINLOCK(pers_lock); 74static DEFINE_SPINLOCK(pers_lock);
74 75
76static void md_print_devices(void);
77
78#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
79
75/* 80/*
76 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 81 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
77 * is 1000 KB/sec, so the extra system load does not show up that much. 82 * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -170,7 +175,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
170/* Alternate version that can be called from interrupts 175/* Alternate version that can be called from interrupts
171 * when calling sysfs_notify isn't needed. 176 * when calling sysfs_notify isn't needed.
172 */ 177 */
173void md_new_event_inintr(mddev_t *mddev) 178static void md_new_event_inintr(mddev_t *mddev)
174{ 179{
175 atomic_inc(&md_event_count); 180 atomic_inc(&md_event_count);
176 wake_up(&md_event_waiters); 181 wake_up(&md_event_waiters);
@@ -732,6 +737,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
732{ 737{
733 mdp_disk_t *desc; 738 mdp_disk_t *desc;
734 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 739 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
740 __u64 ev1 = md_event(sb);
735 741
736 rdev->raid_disk = -1; 742 rdev->raid_disk = -1;
737 rdev->flags = 0; 743 rdev->flags = 0;
@@ -748,7 +754,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
748 mddev->layout = sb->layout; 754 mddev->layout = sb->layout;
749 mddev->raid_disks = sb->raid_disks; 755 mddev->raid_disks = sb->raid_disks;
750 mddev->size = sb->size; 756 mddev->size = sb->size;
751 mddev->events = md_event(sb); 757 mddev->events = ev1;
752 mddev->bitmap_offset = 0; 758 mddev->bitmap_offset = 0;
753 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 759 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
754 760
@@ -797,7 +803,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
797 803
798 } else if (mddev->pers == NULL) { 804 } else if (mddev->pers == NULL) {
799 /* Insist on good event counter while assembling */ 805 /* Insist on good event counter while assembling */
800 __u64 ev1 = md_event(sb);
801 ++ev1; 806 ++ev1;
802 if (ev1 < mddev->events) 807 if (ev1 < mddev->events)
803 return -EINVAL; 808 return -EINVAL;
@@ -805,19 +810,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
805 /* if adding to array with a bitmap, then we can accept an 810 /* if adding to array with a bitmap, then we can accept an
806 * older device ... but not too old. 811 * older device ... but not too old.
807 */ 812 */
808 __u64 ev1 = md_event(sb);
809 if (ev1 < mddev->bitmap->events_cleared) 813 if (ev1 < mddev->bitmap->events_cleared)
810 return 0; 814 return 0;
811 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 815 } else {
812 return 0; 816 if (ev1 < mddev->events)
817 /* just a hot-add of a new device, leave raid_disk at -1 */
818 return 0;
819 }
813 820
814 if (mddev->level != LEVEL_MULTIPATH) { 821 if (mddev->level != LEVEL_MULTIPATH) {
815 desc = sb->disks + rdev->desc_nr; 822 desc = sb->disks + rdev->desc_nr;
816 823
817 if (desc->state & (1<<MD_DISK_FAULTY)) 824 if (desc->state & (1<<MD_DISK_FAULTY))
818 set_bit(Faulty, &rdev->flags); 825 set_bit(Faulty, &rdev->flags);
819 else if (desc->state & (1<<MD_DISK_SYNC) && 826 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
820 desc->raid_disk < mddev->raid_disks) { 827 desc->raid_disk < mddev->raid_disks */) {
821 set_bit(In_sync, &rdev->flags); 828 set_bit(In_sync, &rdev->flags);
822 rdev->raid_disk = desc->raid_disk; 829 rdev->raid_disk = desc->raid_disk;
823 } 830 }
@@ -1100,6 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1100static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1107static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1108{
1102 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1109 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1110 __u64 ev1 = le64_to_cpu(sb->events);
1103 1111
1104 rdev->raid_disk = -1; 1112 rdev->raid_disk = -1;
1105 rdev->flags = 0; 1113 rdev->flags = 0;
@@ -1115,7 +1123,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1115 mddev->layout = le32_to_cpu(sb->layout); 1123 mddev->layout = le32_to_cpu(sb->layout);
1116 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1124 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1117 mddev->size = le64_to_cpu(sb->size)/2; 1125 mddev->size = le64_to_cpu(sb->size)/2;
1118 mddev->events = le64_to_cpu(sb->events); 1126 mddev->events = ev1;
1119 mddev->bitmap_offset = 0; 1127 mddev->bitmap_offset = 0;
1120 mddev->default_bitmap_offset = 1024 >> 9; 1128 mddev->default_bitmap_offset = 1024 >> 9;
1121 1129
@@ -1149,7 +1157,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1149 1157
1150 } else if (mddev->pers == NULL) { 1158 } else if (mddev->pers == NULL) {
1151 /* Insist on good event counter while assembling */ 1159
1152 __u64 ev1 = le64_to_cpu(sb->events);
1153 ++ev1; 1160 ++ev1;
1154 if (ev1 < mddev->events) 1161 if (ev1 < mddev->events)
1155 return -EINVAL; 1162 return -EINVAL;
@@ -1157,12 +1164,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1157 /* If adding to array with a bitmap, then we can accept an 1164 /* If adding to array with a bitmap, then we can accept an
1158 * older device, but not too old. 1165 * older device, but not too old.
1159 */ 1166 */
1160 __u64 ev1 = le64_to_cpu(sb->events);
1161 if (ev1 < mddev->bitmap->events_cleared) 1167 if (ev1 < mddev->bitmap->events_cleared)
1162 return 0; 1168 return 0;
1163 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1169 } else {
1164 return 0; 1170 if (ev1 < mddev->events)
1165 1171 /* just a hot-add of a new device, leave raid_disk at -1 */
1172 return 0;
1173 }
1166 if (mddev->level != LEVEL_MULTIPATH) { 1174 if (mddev->level != LEVEL_MULTIPATH) {
1167 int role; 1175 int role;
1168 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1176 rdev->desc_nr = le32_to_cpu(sb->dev_number);
@@ -1174,7 +1182,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1174 set_bit(Faulty, &rdev->flags); 1182 set_bit(Faulty, &rdev->flags);
1175 break; 1183 break;
1176 default: 1184 default:
1177 set_bit(In_sync, &rdev->flags); 1185 if ((le32_to_cpu(sb->feature_map) &
1186 MD_FEATURE_RECOVERY_OFFSET))
1187 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1188 else
1189 set_bit(In_sync, &rdev->flags);
1178 rdev->raid_disk = role; 1190 rdev->raid_disk = role;
1179 break; 1191 break;
1180 } 1192 }
@@ -1198,6 +1210,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1198 1210
1199 sb->feature_map = 0; 1211 sb->feature_map = 0;
1200 sb->pad0 = 0; 1212 sb->pad0 = 0;
1213 sb->recovery_offset = cpu_to_le64(0);
1201 memset(sb->pad1, 0, sizeof(sb->pad1)); 1214 memset(sb->pad1, 0, sizeof(sb->pad1));
1202 memset(sb->pad2, 0, sizeof(sb->pad2)); 1215 memset(sb->pad2, 0, sizeof(sb->pad2));
1203 memset(sb->pad3, 0, sizeof(sb->pad3)); 1216 memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1218,6 +1231,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1218 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1231 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1219 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1232 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1220 } 1233 }
1234
1235 if (rdev->raid_disk >= 0 &&
1236 !test_bit(In_sync, &rdev->flags) &&
1237 rdev->recovery_offset > 0) {
1238 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1239 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1240 }
1241
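For context (inferred from this patch as a whole, including the do_md_run hunk further down): MD_FEATURE_RECOVERY_OFFSET records how far an out-of-sync member's rebuild had progressed, so an interrupted recovery can resume from that point after a restart instead of starting from sector zero.

/* Round trip of the new field as wired up in this patch:
 *
 *   super_1_sync():      rdev->recovery_offset -> sb->recovery_offset
 *   super_1_validate():  sb->recovery_offset   -> rdev->recovery_offset
 *                        (In_sync stays clear for such a device)
 */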
1221 if (mddev->reshape_position != MaxSector) { 1242 if (mddev->reshape_position != MaxSector) {
1222 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1243 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1223 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1244 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1242,11 +1263,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1242 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1263 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1243 else if (test_bit(In_sync, &rdev2->flags)) 1264 else if (test_bit(In_sync, &rdev2->flags))
1244 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1265 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1266 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1245 else 1268 else
1246 sb->dev_roles[i] = cpu_to_le16(0xffff); 1269 sb->dev_roles[i] = cpu_to_le16(0xffff);
1247 } 1270 }
1248 1271
1249 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
1250 sb->sb_csum = calc_sb_1_csum(sb); 1272 sb->sb_csum = calc_sb_1_csum(sb);
1251} 1273}
1252 1274
@@ -1507,7 +1529,7 @@ static void print_rdev(mdk_rdev_t *rdev)
1507 printk(KERN_INFO "md: no rdev superblock!\n"); 1529 printk(KERN_INFO "md: no rdev superblock!\n");
1508} 1530}
1509 1531
1510void md_print_devices(void) 1532static void md_print_devices(void)
1511{ 1533{
1512 struct list_head *tmp, *tmp2; 1534 struct list_head *tmp, *tmp2;
1513 mdk_rdev_t *rdev; 1535 mdk_rdev_t *rdev;
@@ -1536,15 +1558,30 @@ void md_print_devices(void)
1536} 1558}
1537 1559
1538 1560
1539static void sync_sbs(mddev_t * mddev) 1561static void sync_sbs(mddev_t * mddev, int nospares)
1540{ 1562{
1563 /* Update each superblock (in-memory image), but
1564 * if we are allowed to, skip spares which already
1565 * have the right event counter, or have one earlier
1566 * (which would mean they aren't being marked as dirty
1567 * with the rest of the array)
1568 */
1541 mdk_rdev_t *rdev; 1569 mdk_rdev_t *rdev;
1542 struct list_head *tmp; 1570 struct list_head *tmp;
1543 1571
1544 ITERATE_RDEV(mddev,rdev,tmp) { 1572 ITERATE_RDEV(mddev,rdev,tmp) {
1545 super_types[mddev->major_version]. 1573 if (rdev->sb_events == mddev->events ||
1546 sync_super(mddev, rdev); 1574 (nospares &&
1547 rdev->sb_loaded = 1; 1575 rdev->raid_disk < 0 &&
1576 (rdev->sb_events&1)==0 &&
1577 rdev->sb_events+1 == mddev->events)) {
1578 /* Don't update this superblock */
1579 rdev->sb_loaded = 2;
1580 } else {
1581 super_types[mddev->major_version].
1582 sync_super(mddev, rdev);
1583 rdev->sb_loaded = 1;
1584 }
1548 } 1585 }
1549} 1586}
1550 1587
@@ -1554,12 +1591,42 @@ void md_update_sb(mddev_t * mddev)
1554 struct list_head *tmp; 1591 struct list_head *tmp;
1555 mdk_rdev_t *rdev; 1592 mdk_rdev_t *rdev;
1556 int sync_req; 1593 int sync_req;
1594 int nospares = 0;
1557 1595
1558repeat: 1596repeat:
1559 spin_lock_irq(&mddev->write_lock); 1597 spin_lock_irq(&mddev->write_lock);
1560 sync_req = mddev->in_sync; 1598 sync_req = mddev->in_sync;
1561 mddev->utime = get_seconds(); 1599 mddev->utime = get_seconds();
1562 mddev->events ++; 1600 if (mddev->sb_dirty == 3)
1601 /* just a clean<-> dirty transition, possibly leave spares alone,
1602 * though if events isn't the right even/odd, we will have to do
1603 * spares after all
1604 */
1605 nospares = 1;
1606
1607 /* If this is just a dirty<->clean transition, and the array is clean
1608 * and 'events' is odd, we can roll back to the previous clean state */
1609 if (mddev->sb_dirty == 3
1610 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1611 && (mddev->events & 1))
1612 mddev->events--;
1613 else {
1614 /* otherwise we have to go forward and ... */
1615 mddev->events ++;
1616 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1617 /* .. if the array isn't clean, insist on an odd 'events' */
1618 if ((mddev->events&1)==0) {
1619 mddev->events++;
1620 nospares = 0;
1621 }
1622 } else {
1623 /* otherwise insist on an even 'events' (for clean states) */
1624 if ((mddev->events&1)) {
1625 mddev->events++;
1626 nospares = 0;
1627 }
1628 }
1629 }
1563 1630
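A worked example of the even/odd convention implemented above (the numbers are illustrative):

/*
 *   events == 100 (even)   array clean; spares also record 100
 *   write arrives          events -> 101 (odd), members marked dirty;
 *                          spares skipped by sync_sbs() (100 is even
 *                          and 100 + 1 == 101)
 *   writes drain           clean transition rolls back to 100, which
 *                          the untouched spares already match
 *
 * Always advancing instead would leave spares one event behind and
 * force rewriting every spare superblock on each clean<->dirty bounce.
 */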
1564 if (!mddev->events) { 1631 if (!mddev->events) {
1565 /* 1632 /*
@@ -1571,7 +1638,7 @@ repeat:
1571 mddev->events --; 1638 mddev->events --;
1572 } 1639 }
1573 mddev->sb_dirty = 2; 1640 mddev->sb_dirty = 2;
1574 sync_sbs(mddev); 1641 sync_sbs(mddev, nospares);
1575 1642
1576 /* 1643 /*
1577 * do not write anything to disk if using 1644 * do not write anything to disk if using
@@ -1593,6 +1660,8 @@ repeat:
1593 ITERATE_RDEV(mddev,rdev,tmp) { 1660 ITERATE_RDEV(mddev,rdev,tmp) {
1594 char b[BDEVNAME_SIZE]; 1661 char b[BDEVNAME_SIZE];
1595 dprintk(KERN_INFO "md: "); 1662 dprintk(KERN_INFO "md: ");
1663 if (rdev->sb_loaded != 1)
1664 continue; /* no noise on spare devices */
1596 if (test_bit(Faulty, &rdev->flags)) 1665 if (test_bit(Faulty, &rdev->flags))
1597 dprintk("(skipping faulty "); 1666 dprintk("(skipping faulty ");
1598 1667
@@ -1604,6 +1673,7 @@ repeat:
1604 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1673 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1605 bdevname(rdev->bdev,b), 1674 bdevname(rdev->bdev,b),
1606 (unsigned long long)rdev->sb_offset); 1675 (unsigned long long)rdev->sb_offset);
1676 rdev->sb_events = mddev->events;
1607 1677
1608 } else 1678 } else
1609 dprintk(")\n"); 1679 dprintk(")\n");
@@ -1667,6 +1737,10 @@ state_show(mdk_rdev_t *rdev, char *page)
1667 len += sprintf(page+len, "%sin_sync",sep); 1737 len += sprintf(page+len, "%sin_sync",sep);
1668 sep = ","; 1738 sep = ",";
1669 } 1739 }
1740 if (test_bit(WriteMostly, &rdev->flags)) {
1741 len += sprintf(page+len, "%swrite_mostly",sep);
1742 sep = ",";
1743 }
1670 if (!test_bit(Faulty, &rdev->flags) && 1744 if (!test_bit(Faulty, &rdev->flags) &&
1671 !test_bit(In_sync, &rdev->flags)) { 1745 !test_bit(In_sync, &rdev->flags)) {
1672 len += sprintf(page+len, "%sspare", sep); 1746 len += sprintf(page+len, "%sspare", sep);
@@ -1675,8 +1749,40 @@ state_show(mdk_rdev_t *rdev, char *page)
1675 return len+sprintf(page+len, "\n"); 1749 return len+sprintf(page+len, "\n");
1676} 1750}
1677 1751
1752static ssize_t
1753state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1754{
1755 /* can write
 1756 * faulty - simulates an error
1757 * remove - disconnects the device
1758 * writemostly - sets write_mostly
1759 * -writemostly - clears write_mostly
1760 */
1761 int err = -EINVAL;
1762 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1763 md_error(rdev->mddev, rdev);
1764 err = 0;
1765 } else if (cmd_match(buf, "remove")) {
1766 if (rdev->raid_disk >= 0)
1767 err = -EBUSY;
1768 else {
1769 mddev_t *mddev = rdev->mddev;
1770 kick_rdev_from_array(rdev);
1771 md_update_sb(mddev);
1772 md_new_event(mddev);
1773 err = 0;
1774 }
1775 } else if (cmd_match(buf, "writemostly")) {
1776 set_bit(WriteMostly, &rdev->flags);
1777 err = 0;
1778 } else if (cmd_match(buf, "-writemostly")) {
1779 clear_bit(WriteMostly, &rdev->flags);
1780 err = 0;
1781 }
1782 return err ? err : len;
1783}
1678static struct rdev_sysfs_entry 1784static struct rdev_sysfs_entry
1679rdev_state = __ATTR_RO(state); 1785rdev_state = __ATTR(state, 0644, state_show, state_store);
1680 1786
1681static ssize_t 1787static ssize_t
1682super_show(mdk_rdev_t *rdev, char *page) 1788super_show(mdk_rdev_t *rdev, char *page)
@@ -1873,6 +1979,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1873 rdev->desc_nr = -1; 1979 rdev->desc_nr = -1;
1874 rdev->flags = 0; 1980 rdev->flags = 0;
1875 rdev->data_offset = 0; 1981 rdev->data_offset = 0;
1982 rdev->sb_events = 0;
1876 atomic_set(&rdev->nr_pending, 0); 1983 atomic_set(&rdev->nr_pending, 0);
1877 atomic_set(&rdev->read_errors, 0); 1984 atomic_set(&rdev->read_errors, 0);
1878 atomic_set(&rdev->corrected_errors, 0); 1985 atomic_set(&rdev->corrected_errors, 0);
@@ -1978,6 +2085,54 @@ static void analyze_sbs(mddev_t * mddev)
1978} 2085}
1979 2086
1980static ssize_t 2087static ssize_t
2088safe_delay_show(mddev_t *mddev, char *page)
2089{
2090 int msec = (mddev->safemode_delay*1000)/HZ;
2091 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2092}
2093static ssize_t
2094safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2095{
2096 int scale=1;
2097 int dot=0;
2098 int i;
2099 unsigned long msec;
2100 char buf[30];
2101 char *e;
2102 /* remove a period, and count digits after it */
2103 if (len >= sizeof(buf))
2104 return -EINVAL;
2105 strlcpy(buf, cbuf, len);
2106 buf[len] = 0;
2107 for (i=0; i<len; i++) {
2108 if (dot) {
2109 if (isdigit(buf[i])) {
2110 buf[i-1] = buf[i];
2111 scale *= 10;
2112 }
2113 buf[i] = 0;
2114 } else if (buf[i] == '.') {
2115 dot=1;
2116 buf[i] = 0;
2117 }
2118 }
2119 msec = simple_strtoul(buf, &e, 10);
2120 if (e == buf || (*e && *e != '\n'))
2121 return -EINVAL;
2122 msec = (msec * 1000) / scale;
2123 if (msec == 0)
2124 mddev->safemode_delay = 0;
2125 else {
2126 mddev->safemode_delay = (msec*HZ)/1000;
2127 if (mddev->safemode_delay == 0)
2128 mddev->safemode_delay = 1;
2129 }
2130 return len;
2131}
2132static struct md_sysfs_entry md_safe_delay =
2133__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
2134
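A worked example of the new attribute's parsing (HZ is configuration-dependent; 250 is assumed below): writing "0.200" shifts the post-period digits down, yielding msec = 200 with scale = 1000, i.e. 200 milliseconds, which the store converts to jiffies. A non-zero request that would truncate to 0 jiffies is rounded up to 1 so it never silently disables safemode.

/* Conversion in miniature (HZ value assumed for the example): */
unsigned long msec = 200;			/* parsed from "0.200"    */
unsigned long delay = (msec * 250) / 1000;	/* == 50 jiffies @ HZ=250 */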
2135static ssize_t
1981level_show(mddev_t *mddev, char *page) 2136level_show(mddev_t *mddev, char *page)
1982{ 2137{
1983 struct mdk_personality *p = mddev->pers; 2138 struct mdk_personality *p = mddev->pers;
@@ -2012,6 +2167,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2012static struct md_sysfs_entry md_level = 2167static struct md_sysfs_entry md_level =
2013__ATTR(level, 0644, level_show, level_store); 2168__ATTR(level, 0644, level_show, level_store);
2014 2169
2170
2171static ssize_t
2172layout_show(mddev_t *mddev, char *page)
2173{
2174 /* just a number, not meaningful for all levels */
2175 return sprintf(page, "%d\n", mddev->layout);
2176}
2177
2178static ssize_t
2179layout_store(mddev_t *mddev, const char *buf, size_t len)
2180{
2181 char *e;
2182 unsigned long n = simple_strtoul(buf, &e, 10);
2183 if (mddev->pers)
2184 return -EBUSY;
2185
2186 if (!*buf || (*e && *e != '\n'))
2187 return -EINVAL;
2188
2189 mddev->layout = n;
2190 return len;
2191}
2192static struct md_sysfs_entry md_layout =
2193__ATTR(layout, 0644, layout_show, layout_store);
2194
2195
2015static ssize_t 2196static ssize_t
2016raid_disks_show(mddev_t *mddev, char *page) 2197raid_disks_show(mddev_t *mddev, char *page)
2017{ 2198{
@@ -2067,6 +2248,200 @@ static struct md_sysfs_entry md_chunk_size =
2067__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); 2248__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2068 2249
2069static ssize_t 2250static ssize_t
2251resync_start_show(mddev_t *mddev, char *page)
2252{
2253 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2254}
2255
2256static ssize_t
2257resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2258{
 2259 /* can only set resync_start if array is not yet active */
2260 char *e;
2261 unsigned long long n = simple_strtoull(buf, &e, 10);
2262
2263 if (mddev->pers)
2264 return -EBUSY;
2265 if (!*buf || (*e && *e != '\n'))
2266 return -EINVAL;
2267
2268 mddev->recovery_cp = n;
2269 return len;
2270}
2271static struct md_sysfs_entry md_resync_start =
2272__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
2273
2274/*
2275 * The array state can be:
2276 *
2277 * clear
2278 * No devices, no size, no level
2279 * Equivalent to STOP_ARRAY ioctl
2280 * inactive
2281 * May have some settings, but array is not active
2282 * all IO results in error
2283 * When written, doesn't tear down array, but just stops it
2284 * suspended (not supported yet)
2285 * All IO requests will block. The array can be reconfigured.
 2286 * Writing this, if accepted, will block until array is quiescent
2287 * readonly
2288 * no resync can happen. no superblocks get written.
2289 * write requests fail
2290 * read-auto
2291 * like readonly, but behaves like 'clean' on a write request.
2292 *
2293 * clean - no pending writes, but otherwise active.
2294 * When written to inactive array, starts without resync
2295 * If a write request arrives then
2296 * if metadata is known, mark 'dirty' and switch to 'active'.
2297 * if not known, block and switch to write-pending
2298 * If written to an active array that has pending writes, then fails.
2299 * active
2300 * fully active: IO and resync can be happening.
2301 * When written to inactive array, starts with resync
2302 *
2303 * write-pending
2304 * clean, but writes are blocked waiting for 'active' to be written.
2305 *
2306 * active-idle
2307 * like active, but no writes have been seen for a while (100msec).
2308 *
2309 */
2310enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2311 write_pending, active_idle, bad_word};
2312static char *array_states[] = {
2313 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2314 "write-pending", "active-idle", NULL };
2315
2316static int match_word(const char *word, char **list)
2317{
2318 int n;
2319 for (n=0; list[n]; n++)
2320 if (cmd_match(word, list[n]))
2321 break;
2322 return n;
2323}
2324
2325static ssize_t
2326array_state_show(mddev_t *mddev, char *page)
2327{
2328 enum array_state st = inactive;
2329
2330 if (mddev->pers)
2331 switch(mddev->ro) {
2332 case 1:
2333 st = readonly;
2334 break;
2335 case 2:
2336 st = read_auto;
2337 break;
2338 case 0:
2339 if (mddev->in_sync)
2340 st = clean;
2341 else if (mddev->safemode)
2342 st = active_idle;
2343 else
2344 st = active;
2345 }
2346 else {
2347 if (list_empty(&mddev->disks) &&
2348 mddev->raid_disks == 0 &&
2349 mddev->size == 0)
2350 st = clear;
2351 else
2352 st = inactive;
2353 }
2354 return sprintf(page, "%s\n", array_states[st]);
2355}
2356
2357static int do_md_stop(mddev_t * mddev, int ro);
2358static int do_md_run(mddev_t * mddev);
2359static int restart_array(mddev_t *mddev);
2360
2361static ssize_t
2362array_state_store(mddev_t *mddev, const char *buf, size_t len)
2363{
2364 int err = -EINVAL;
2365 enum array_state st = match_word(buf, array_states);
2366 switch(st) {
2367 case bad_word:
2368 break;
2369 case clear:
2370 /* stopping an active array */
2371 if (mddev->pers) {
2372 if (atomic_read(&mddev->active) > 1)
2373 return -EBUSY;
2374 err = do_md_stop(mddev, 0);
2375 }
2376 break;
2377 case inactive:
2378 /* stopping an active array */
2379 if (mddev->pers) {
2380 if (atomic_read(&mddev->active) > 1)
2381 return -EBUSY;
2382 err = do_md_stop(mddev, 2);
2383 }
2384 break;
2385 case suspended:
2386 break; /* not supported yet */
2387 case readonly:
2388 if (mddev->pers)
2389 err = do_md_stop(mddev, 1);
2390 else {
2391 mddev->ro = 1;
2392 err = do_md_run(mddev);
2393 }
2394 break;
2395 case read_auto:
2396 /* stopping an active array */
2397 if (mddev->pers) {
2398 err = do_md_stop(mddev, 1);
2399 if (err == 0)
2400 mddev->ro = 2; /* FIXME mark devices writable */
2401 } else {
2402 mddev->ro = 2;
2403 err = do_md_run(mddev);
2404 }
2405 break;
2406 case clean:
2407 if (mddev->pers) {
2408 restart_array(mddev);
2409 spin_lock_irq(&mddev->write_lock);
2410 if (atomic_read(&mddev->writes_pending) == 0) {
2411 mddev->in_sync = 1;
2412 mddev->sb_dirty = 1;
2413 }
2414 spin_unlock_irq(&mddev->write_lock);
2415 } else {
2416 mddev->ro = 0;
2417 mddev->recovery_cp = MaxSector;
2418 err = do_md_run(mddev);
2419 }
2420 break;
2421 case active:
2422 if (mddev->pers) {
2423 restart_array(mddev);
2424 mddev->sb_dirty = 0;
2425 wake_up(&mddev->sb_wait);
2426 err = 0;
2427 } else {
2428 mddev->ro = 0;
2429 err = do_md_run(mddev);
2430 }
2431 break;
2432 case write_pending:
2433 case active_idle:
2434 /* these cannot be set */
2435 break;
2436 }
2437 if (err)
2438 return err;
2439 else
2440 return len;
2441}
2442static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
2443
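A hypothetical userspace exercise of the new attribute (the md0 path is an assumption; each md device exposes the file under /sys/block/<dev>/md/):

/* Userspace sketch: drive the array_state attribute. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int md_set_state(const char *state)	/* e.g. "readonly", "clean" */
{
	ssize_t n;
	int fd = open("/sys/block/md0/md/array_state", O_WRONLY);

	if (fd < 0)
		return -1;
	n = write(fd, state, strlen(state));	/* store returns len or -errno */
	close(fd);
	return n < 0 ? -1 : 0;
}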
2444static ssize_t
2070null_show(mddev_t *mddev, char *page) 2445null_show(mddev_t *mddev, char *page)
2071{ 2446{
2072 return -EINVAL; 2447 return -EINVAL;
@@ -2428,11 +2803,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2428 2803
2429static struct attribute *md_default_attrs[] = { 2804static struct attribute *md_default_attrs[] = {
2430 &md_level.attr, 2805 &md_level.attr,
2806 &md_layout.attr,
2431 &md_raid_disks.attr, 2807 &md_raid_disks.attr,
2432 &md_chunk_size.attr, 2808 &md_chunk_size.attr,
2433 &md_size.attr, 2809 &md_size.attr,
2810 &md_resync_start.attr,
2434 &md_metadata.attr, 2811 &md_metadata.attr,
2435 &md_new_device.attr, 2812 &md_new_device.attr,
2813 &md_safe_delay.attr,
2814 &md_array_state.attr,
2436 NULL, 2815 NULL,
2437}; 2816};
2438 2817
@@ -2553,8 +2932,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2553 return NULL; 2932 return NULL;
2554} 2933}
2555 2934
2556void md_wakeup_thread(mdk_thread_t *thread);
2557
2558static void md_safemode_timeout(unsigned long data) 2935static void md_safemode_timeout(unsigned long data)
2559{ 2936{
2560 mddev_t *mddev = (mddev_t *) data; 2937 mddev_t *mddev = (mddev_t *) data;
@@ -2708,7 +3085,7 @@ static int do_md_run(mddev_t * mddev)
2708 mddev->safemode = 0; 3085 mddev->safemode = 0;
2709 mddev->safemode_timer.function = md_safemode_timeout; 3086 mddev->safemode_timer.function = md_safemode_timeout;
2710 mddev->safemode_timer.data = (unsigned long) mddev; 3087 mddev->safemode_timer.data = (unsigned long) mddev;
2711 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 3088 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
2712 mddev->in_sync = 1; 3089 mddev->in_sync = 1;
2713 3090
2714 ITERATE_RDEV(mddev,rdev,tmp) 3091 ITERATE_RDEV(mddev,rdev,tmp)
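The default safe-mode delay grows from 20 to 200 msec here. The value is stored in jiffies, so the conversion is (msec * HZ)/1000 + 1, the +1 guaranteeing a non-zero delay (md_write_end() skips the timer entirely when safemode_delay is 0). A purely illustrative sketch of the conversion:

    /* Illustrative sketch of the msec-to-jiffies conversion above. */
    static unsigned long safemode_msec_to_jiffies(unsigned long msec,
                                                  unsigned long hz)
    {
        return (msec * hz) / 1000 + 1;  /* +1 keeps the delay non-zero */
    }
    /* e.g. HZ=250: 200 msec -> (200*250)/1000 + 1 = 51 jiffies */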
@@ -2736,6 +3113,36 @@ static int do_md_run(mddev_t * mddev)
2736 mddev->queue->queuedata = mddev; 3113 mddev->queue->queuedata = mddev;
2737 mddev->queue->make_request_fn = mddev->pers->make_request; 3114 mddev->queue->make_request_fn = mddev->pers->make_request;
2738 3115
3116 /* If there is a partially-recovered drive we need to
3117 * start recovery here. If we leave it to md_check_recovery,
3118 * it will remove the drives and not do the right thing
3119 */
3120 if (mddev->degraded) {
3121 struct list_head *rtmp;
3122 int spares = 0;
3123 ITERATE_RDEV(mddev,rdev,rtmp)
3124 if (rdev->raid_disk >= 0 &&
3125 !test_bit(In_sync, &rdev->flags) &&
3126 !test_bit(Faulty, &rdev->flags))
3127 /* complete an interrupted recovery */
3128 spares++;
3129 if (spares && mddev->pers->sync_request) {
3130 mddev->recovery = 0;
3131 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3132 mddev->sync_thread = md_register_thread(md_do_sync,
3133 mddev,
3134 "%s_resync");
3135 if (!mddev->sync_thread) {
3136 printk(KERN_ERR "%s: could not start resync"
3137 " thread...\n",
3138 mdname(mddev));
3139 /* leave the spares where they are, it shouldn't hurt */
3140 mddev->recovery = 0;
3141 } else
3142 md_wakeup_thread(mddev->sync_thread);
3143 }
3144 }
3145
2739 mddev->changed = 1; 3146 mddev->changed = 1;
2740 md_new_event(mddev); 3147 md_new_event(mddev);
2741 return 0; 3148 return 0;
@@ -2769,18 +3176,47 @@ static int restart_array(mddev_t *mddev)
2769 */ 3176 */
2770 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2771 md_wakeup_thread(mddev->thread); 3178 md_wakeup_thread(mddev->thread);
3179 md_wakeup_thread(mddev->sync_thread);
2772 err = 0; 3180 err = 0;
2773 } else { 3181 } else
2774 printk(KERN_ERR "md: %s has no personality assigned.\n",
2775 mdname(mddev));
2776 err = -EINVAL; 3182 err = -EINVAL;
2777 }
2778 3183
2779out: 3184out:
2780 return err; 3185 return err;
2781} 3186}
2782 3187
2783static int do_md_stop(mddev_t * mddev, int ro) 3188/* similar to deny_write_access, but accounts for our holding a reference
3189 * to the file ourselves */
3190static int deny_bitmap_write_access(struct file * file)
3191{
3192 struct inode *inode = file->f_mapping->host;
3193
3194 spin_lock(&inode->i_lock);
3195 if (atomic_read(&inode->i_writecount) > 1) {
3196 spin_unlock(&inode->i_lock);
3197 return -ETXTBSY;
3198 }
3199 atomic_set(&inode->i_writecount, -1);
3200 spin_unlock(&inode->i_lock);
3201
3202 return 0;
3203}
3204
3205static void restore_bitmap_write_access(struct file *file)
3206{
3207 struct inode *inode = file->f_mapping->host;
3208
3209 spin_lock(&inode->i_lock);
3210 atomic_set(&inode->i_writecount, 1);
3211 spin_unlock(&inode->i_lock);
3212}
3213
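deny_bitmap_write_access() parks i_writecount at -1, the same convention deny_write_access() uses for files being executed, so any later open of the bitmap file for writing fails with ETXTBSY until restore_bitmap_write_access() resets the count to 1. A hypothetical userspace sketch of the observable effect; the bitmap path is illustrative:

    /* Hypothetical sketch: while md holds the bitmap file, opening it
     * for write is expected to fail with ETXTBSY. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        int fd = open("/mnt/md0.bitmap", O_WRONLY);  /* illustrative path */
        if (fd < 0 && errno == ETXTBSY)
            printf("bitmap file is write-protected by md\n");
        return 0;
    }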
3214/* mode:
3215 * 0 - completely stop and disassemble array
3216 * 1 - switch to readonly
3217 * 2 - stop but do not disassemble array
3218 */
3219static int do_md_stop(mddev_t * mddev, int mode)
2784{ 3220{
2785 int err = 0; 3221 int err = 0;
2786 struct gendisk *disk = mddev->gendisk; 3222 struct gendisk *disk = mddev->gendisk;
@@ -2792,6 +3228,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2792 } 3228 }
2793 3229
2794 if (mddev->sync_thread) { 3230 if (mddev->sync_thread) {
3231 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2795 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3232 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2796 md_unregister_thread(mddev->sync_thread); 3233 md_unregister_thread(mddev->sync_thread);
2797 mddev->sync_thread = NULL; 3234 mddev->sync_thread = NULL;
@@ -2801,12 +3238,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2801 3238
2802 invalidate_partition(disk, 0); 3239 invalidate_partition(disk, 0);
2803 3240
2804 if (ro) { 3241 switch(mode) {
3242 case 1: /* readonly */
2805 err = -ENXIO; 3243 err = -ENXIO;
2806 if (mddev->ro==1) 3244 if (mddev->ro==1)
2807 goto out; 3245 goto out;
2808 mddev->ro = 1; 3246 mddev->ro = 1;
2809 } else { 3247 break;
3248 case 0: /* disassemble */
3249 case 2: /* stop */
2810 bitmap_flush(mddev); 3250 bitmap_flush(mddev);
2811 md_super_wait(mddev); 3251 md_super_wait(mddev);
2812 if (mddev->ro) 3252 if (mddev->ro)
@@ -2821,19 +3261,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
2821 if (mddev->ro) 3261 if (mddev->ro)
2822 mddev->ro = 0; 3262 mddev->ro = 0;
2823 } 3263 }
2824 if (!mddev->in_sync) { 3264 if (!mddev->in_sync || mddev->sb_dirty) {
2825 /* mark array as shutdown cleanly */ 3265 /* mark array as shutdown cleanly */
2826 mddev->in_sync = 1; 3266 mddev->in_sync = 1;
2827 md_update_sb(mddev); 3267 md_update_sb(mddev);
2828 } 3268 }
2829 if (ro) 3269 if (mode == 1)
2830 set_disk_ro(disk, 1); 3270 set_disk_ro(disk, 1);
3271 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2831 } 3272 }
2832 3273
2833 /* 3274 /*
2834 * Free resources if final stop 3275 * Free resources if final stop
2835 */ 3276 */
2836 if (!ro) { 3277 if (mode == 0) {
2837 mdk_rdev_t *rdev; 3278 mdk_rdev_t *rdev;
2838 struct list_head *tmp; 3279 struct list_head *tmp;
2839 struct gendisk *disk; 3280 struct gendisk *disk;
@@ -2841,7 +3282,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2841 3282
2842 bitmap_destroy(mddev); 3283 bitmap_destroy(mddev);
2843 if (mddev->bitmap_file) { 3284 if (mddev->bitmap_file) {
2844 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 3285 restore_bitmap_write_access(mddev->bitmap_file);
2845 fput(mddev->bitmap_file); 3286 fput(mddev->bitmap_file);
2846 mddev->bitmap_file = NULL; 3287 mddev->bitmap_file = NULL;
2847 } 3288 }
@@ -2857,11 +3298,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2857 export_array(mddev); 3298 export_array(mddev);
2858 3299
2859 mddev->array_size = 0; 3300 mddev->array_size = 0;
3301 mddev->size = 0;
3302 mddev->raid_disks = 0;
3303 mddev->recovery_cp = 0;
3304
2860 disk = mddev->gendisk; 3305 disk = mddev->gendisk;
2861 if (disk) 3306 if (disk)
2862 set_capacity(disk, 0); 3307 set_capacity(disk, 0);
2863 mddev->changed = 1; 3308 mddev->changed = 1;
2864 } else 3309 } else if (mddev->pers)
2865 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3310 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2866 mdname(mddev)); 3311 mdname(mddev));
2867 err = 0; 3312 err = 0;
@@ -3264,6 +3709,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3264 3709
3265 rdev->raid_disk = -1; 3710 rdev->raid_disk = -1;
3266 err = bind_rdev_to_array(rdev, mddev); 3711 err = bind_rdev_to_array(rdev, mddev);
3712 if (!err && !mddev->pers->hot_remove_disk) {
3713 /* If there is hot_add_disk but no hot_remove_disk
3714 * then added disks are for geometry changes,
3715 * and should be added immediately.
3716 */
3717 super_types[mddev->major_version].
3718 validate_super(mddev, rdev);
3719 err = mddev->pers->hot_add_disk(mddev, rdev);
3720 if (err)
3721 unbind_rdev_from_array(rdev);
3722 }
3267 if (err) 3723 if (err)
3268 export_rdev(rdev); 3724 export_rdev(rdev);
3269 3725
@@ -3434,23 +3890,6 @@ abort_export:
3434 return err; 3890 return err;
3435} 3891}
3436 3892
3437/* similar to deny_write_access, but accounts for our holding a reference
3438 * to the file ourselves */
3439static int deny_bitmap_write_access(struct file * file)
3440{
3441 struct inode *inode = file->f_mapping->host;
3442
3443 spin_lock(&inode->i_lock);
3444 if (atomic_read(&inode->i_writecount) > 1) {
3445 spin_unlock(&inode->i_lock);
3446 return -ETXTBSY;
3447 }
3448 atomic_set(&inode->i_writecount, -1);
3449 spin_unlock(&inode->i_lock);
3450
3451 return 0;
3452}
3453
3454static int set_bitmap_file(mddev_t *mddev, int fd) 3893static int set_bitmap_file(mddev_t *mddev, int fd)
3455{ 3894{
3456 int err; 3895 int err;
@@ -3491,12 +3930,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
3491 mddev->pers->quiesce(mddev, 1); 3930 mddev->pers->quiesce(mddev, 1);
3492 if (fd >= 0) 3931 if (fd >= 0)
3493 err = bitmap_create(mddev); 3932 err = bitmap_create(mddev);
3494 if (fd < 0 || err) 3933 if (fd < 0 || err) {
3495 bitmap_destroy(mddev); 3934 bitmap_destroy(mddev);
3935 fd = -1; /* make sure to put the file */
3936 }
3496 mddev->pers->quiesce(mddev, 0); 3937 mddev->pers->quiesce(mddev, 0);
3497 } else if (fd < 0) { 3938 }
3498 if (mddev->bitmap_file) 3939 if (fd < 0) {
3940 if (mddev->bitmap_file) {
3941 restore_bitmap_write_access(mddev->bitmap_file);
3499 fput(mddev->bitmap_file); 3942 fput(mddev->bitmap_file);
3943 }
3500 mddev->bitmap_file = NULL; 3944 mddev->bitmap_file = NULL;
3501 } 3945 }
3502 3946
@@ -3977,11 +4421,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
3977 goto done_unlock; 4421 goto done_unlock;
3978 4422
3979 default: 4423 default:
3980 if (_IOC_TYPE(cmd) == MD_MAJOR)
3981 printk(KERN_WARNING "md: %s(pid %d) used"
3982 " obsolete MD ioctl, upgrade your"
3983 " software to use new ictls.\n",
3984 current->comm, current->pid);
3985 err = -EINVAL; 4424 err = -EINVAL;
3986 goto abort_unlock; 4425 goto abort_unlock;
3987 } 4426 }
@@ -4586,7 +5025,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
4586 spin_lock_irq(&mddev->write_lock); 5025 spin_lock_irq(&mddev->write_lock);
4587 if (mddev->in_sync) { 5026 if (mddev->in_sync) {
4588 mddev->in_sync = 0; 5027 mddev->in_sync = 0;
4589 mddev->sb_dirty = 1; 5028 mddev->sb_dirty = 3;
4590 md_wakeup_thread(mddev->thread); 5029 md_wakeup_thread(mddev->thread);
4591 } 5030 }
4592 spin_unlock_irq(&mddev->write_lock); 5031 spin_unlock_irq(&mddev->write_lock);
@@ -4599,7 +5038,7 @@ void md_write_end(mddev_t *mddev)
4599 if (atomic_dec_and_test(&mddev->writes_pending)) { 5038 if (atomic_dec_and_test(&mddev->writes_pending)) {
4600 if (mddev->safemode == 2) 5039 if (mddev->safemode == 2)
4601 md_wakeup_thread(mddev->thread); 5040 md_wakeup_thread(mddev->thread);
4602 else 5041 else if (mddev->safemode_delay)
4603 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5042 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
4604 } 5043 }
4605} 5044}
@@ -4620,10 +5059,14 @@ void md_do_sync(mddev_t *mddev)
4620 struct list_head *tmp; 5059 struct list_head *tmp;
4621 sector_t last_check; 5060 sector_t last_check;
4622 int skipped = 0; 5061 int skipped = 0;
5062 struct list_head *rtmp;
5063 mdk_rdev_t *rdev;
4623 5064
4624 /* just in case the thread restarts... */ 5065 /* just in case the thread restarts... */
4625 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5066 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
4626 return; 5067 return;
5068 if (mddev->ro) /* never try to sync a read-only array */
5069 return;
4627 5070
4628 /* we overload curr_resync somewhat here. 5071 /* we overload curr_resync somewhat here.
4629 * 0 == not engaged in resync at all 5072 * 0 == not engaged in resync at all
@@ -4682,17 +5125,30 @@ void md_do_sync(mddev_t *mddev)
4682 } 5125 }
4683 } while (mddev->curr_resync < 2); 5126 } while (mddev->curr_resync < 2);
4684 5127
5128 j = 0;
4685 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5129 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4686 /* resync follows the size requested by the personality, 5130 /* resync follows the size requested by the personality,
4687 * which defaults to physical size, but can be virtual size 5131 * which defaults to physical size, but can be virtual size
4688 */ 5132 */
4689 max_sectors = mddev->resync_max_sectors; 5133 max_sectors = mddev->resync_max_sectors;
4690 mddev->resync_mismatches = 0; 5134 mddev->resync_mismatches = 0;
5135 /* we don't use the checkpoint if there's a bitmap */
5136 if (!mddev->bitmap &&
5137 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5138 j = mddev->recovery_cp;
4691 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5139 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4692 max_sectors = mddev->size << 1; 5140 max_sectors = mddev->size << 1;
4693 else 5141 else {
4694 /* recovery follows the physical size of devices */ 5142 /* recovery follows the physical size of devices */
4695 max_sectors = mddev->size << 1; 5143 max_sectors = mddev->size << 1;
5144 j = MaxSector;
5145 ITERATE_RDEV(mddev,rdev,rtmp)
5146 if (rdev->raid_disk >= 0 &&
5147 !test_bit(Faulty, &rdev->flags) &&
5148 !test_bit(In_sync, &rdev->flags) &&
5149 rdev->recovery_offset < j)
5150 j = rdev->recovery_offset;
5151 }
4696 5152
4697 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 5153 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4698 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 5154 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
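For plain recovery (as opposed to resync) the start point j is now the smallest recovery_offset across active devices that are neither Faulty nor In_sync, so an interrupted rebuild resumes where the least-advanced device left off rather than restarting at sector 0. An illustrative standalone sketch of that minimum scan, with plain arrays standing in for the ITERATE_RDEV list and ~0ULL for MaxSector:

    /* Illustrative sketch of the resume-point scan above. */
    static unsigned long long recovery_resume_point(
            const unsigned long long *recovery_offset,
            const int *recovering, int ndisks)
    {
        unsigned long long j = ~0ULL;  /* MaxSector stand-in */
        int i;
        for (i = 0; i < ndisks; i++)
            if (recovering[i] && recovery_offset[i] < j)
                j = recovery_offset[i];
        return j;
    }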
@@ -4702,12 +5158,7 @@ void md_do_sync(mddev_t *mddev)
4702 speed_max(mddev)); 5158 speed_max(mddev));
4703 5159
4704 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5160 is_mddev_idle(mddev); /* this also initializes IO event counters */
4705 /* we don't use the checkpoint if there's a bitmap */ 5161
4706 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
4707 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4708 j = mddev->recovery_cp;
4709 else
4710 j = 0;
4711 io_sectors = 0; 5162 io_sectors = 0;
4712 for (m = 0; m < SYNC_MARKS; m++) { 5163 for (m = 0; m < SYNC_MARKS; m++) {
4713 mark[m] = jiffies; 5164 mark[m] = jiffies;
@@ -4828,15 +5279,28 @@ void md_do_sync(mddev_t *mddev)
4828 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5279 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4829 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5280 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
4830 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5281 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
4831 mddev->curr_resync > 2 && 5282 mddev->curr_resync > 2) {
4832 mddev->curr_resync >= mddev->recovery_cp) { 5283 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4833 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5284 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4834 printk(KERN_INFO 5285 if (mddev->curr_resync >= mddev->recovery_cp) {
4835 "md: checkpointing recovery of %s.\n", 5286 printk(KERN_INFO
4836 mdname(mddev)); 5287 "md: checkpointing recovery of %s.\n",
4837 mddev->recovery_cp = mddev->curr_resync; 5288 mdname(mddev));
4838 } else 5289 mddev->recovery_cp = mddev->curr_resync;
4839 mddev->recovery_cp = MaxSector; 5290 }
5291 } else
5292 mddev->recovery_cp = MaxSector;
5293 } else {
5294 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5295 mddev->curr_resync = MaxSector;
5296 ITERATE_RDEV(mddev,rdev,rtmp)
5297 if (rdev->raid_disk >= 0 &&
5298 !test_bit(Faulty, &rdev->flags) &&
5299 !test_bit(In_sync, &rdev->flags) &&
5300 rdev->recovery_offset < mddev->curr_resync)
5301 rdev->recovery_offset = mddev->curr_resync;
5302 mddev->sb_dirty = 1;
5303 }
4840 } 5304 }
4841 5305
4842 skip: 5306 skip:
@@ -4908,7 +5372,7 @@ void md_check_recovery(mddev_t *mddev)
4908 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5372 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
4909 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5373 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
4910 mddev->in_sync = 1; 5374 mddev->in_sync = 1;
4911 mddev->sb_dirty = 1; 5375 mddev->sb_dirty = 3;
4912 } 5376 }
4913 if (mddev->safemode == 1) 5377 if (mddev->safemode == 1)
4914 mddev->safemode = 0; 5378 mddev->safemode = 0;
@@ -4957,6 +5421,8 @@ void md_check_recovery(mddev_t *mddev)
4957 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5421 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
4958 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5422 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4959 5423
5424 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5425 goto unlock;
4960 /* no recovery is running. 5426 /* no recovery is running.
4961 * remove any failed drives, then 5427 * remove any failed drives, then
4962 * add spares if possible. 5428 * add spares if possible.
@@ -4979,6 +5445,7 @@ void md_check_recovery(mddev_t *mddev)
4979 ITERATE_RDEV(mddev,rdev,rtmp) 5445 ITERATE_RDEV(mddev,rdev,rtmp)
4980 if (rdev->raid_disk < 0 5446 if (rdev->raid_disk < 0
4981 && !test_bit(Faulty, &rdev->flags)) { 5447 && !test_bit(Faulty, &rdev->flags)) {
5448 rdev->recovery_offset = 0;
4982 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5449 if (mddev->pers->hot_add_disk(mddev,rdev)) {
4983 char nm[20]; 5450 char nm[20];
4984 sprintf(nm, "rd%d", rdev->raid_disk); 5451 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5216,7 +5683,6 @@ EXPORT_SYMBOL(md_write_end);
5216EXPORT_SYMBOL(md_register_thread); 5683EXPORT_SYMBOL(md_register_thread);
5217EXPORT_SYMBOL(md_unregister_thread); 5684EXPORT_SYMBOL(md_unregister_thread);
5218EXPORT_SYMBOL(md_wakeup_thread); 5685EXPORT_SYMBOL(md_wakeup_thread);
5219EXPORT_SYMBOL(md_print_devices);
5220EXPORT_SYMBOL(md_check_recovery); 5686EXPORT_SYMBOL(md_check_recovery);
5221MODULE_LICENSE("GPL"); 5687MODULE_LICENSE("GPL");
5222MODULE_ALIAS("md"); 5688MODULE_ALIAS("md");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4070eff6f0f8..cead918578a7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -374,26 +374,26 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
374 * already. 374 * already.
375 */ 375 */
376 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
378 reschedule_retry(r1_bio); 378 reschedule_retry(r1_bio);
379 goto out; 379 else {
380 } 380 /* it really is the end of this request */
381 /* it really is the end of this request */ 381 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 382 /* free extra copy of the data pages */
383 /* free extra copy of the data pages */ 383 int i = bio->bi_vcnt;
384 int i = bio->bi_vcnt; 384 while (i--)
385 while (i--) 385 safe_put_page(bio->bi_io_vec[i].bv_page);
386 safe_put_page(bio->bi_io_vec[i].bv_page); 386 }
387 /* clear the bitmap if all writes complete successfully */
388 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
389 r1_bio->sectors,
390 !test_bit(R1BIO_Degraded, &r1_bio->state),
391 behind);
392 md_write_end(r1_bio->mddev);
393 raid_end_bio_io(r1_bio);
387 } 394 }
388 /* clear the bitmap if all writes complete successfully */
389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390 r1_bio->sectors,
391 !test_bit(R1BIO_Degraded, &r1_bio->state),
392 behind);
393 md_write_end(r1_bio->mddev);
394 raid_end_bio_io(r1_bio);
395 } 395 }
396 out: 396
397 if (to_put) 397 if (to_put)
398 bio_put(to_put); 398 bio_put(to_put);
399 399
@@ -1625,6 +1625,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1625 /* before building a request, check if we can skip these blocks.. 1625 /* before building a request, check if we can skip these blocks..
1626 * This call to bitmap_start_sync doesn't actually record anything 1626 * This call to bitmap_start_sync doesn't actually record anything
1627 */ 1627 */
1628 if (mddev->bitmap == NULL &&
1629 mddev->recovery_cp == MaxSector &&
1630 conf->fullsync == 0) {
1631 *skipped = 1;
1632 return max_sector - sector_nr;
1633 }
1628 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1634 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1629 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1635 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1630 /* We can skip this block, and probably several more */ 1636 /* We can skip this block, and probably several more */
@@ -1888,7 +1894,8 @@ static int run(mddev_t *mddev)
1888 1894
1889 disk = conf->mirrors + i; 1895 disk = conf->mirrors + i;
1890 1896
1891 if (!disk->rdev) { 1897 if (!disk->rdev ||
1898 !test_bit(In_sync, &disk->rdev->flags)) {
1892 disk->head_position = 0; 1899 disk->head_position = 0;
1893 mddev->degraded++; 1900 mddev->degraded++;
1894 } 1901 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1440935414e6..7f636283a1ba 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,6 +29,7 @@
29 * raid_disks 29 * raid_disks
30 * near_copies (stored in low byte of layout) 30 * near_copies (stored in low byte of layout)
31 * far_copies (stored in second byte of layout) 31 * far_copies (stored in second byte of layout)
32 * far_offset (stored in bit 16 of layout)
32 * 33 *
33 * The data to be stored is divided into chunks using chunksize. 34 * The data to be stored is divided into chunks using chunksize.
34 * Each device is divided into far_copies sections. 35 * Each device is divided into far_copies sections.
@@ -36,10 +37,14 @@
36 * near_copies copies of each chunk are stored (each on a different drive). 37 * near_copies copies of each chunk are stored (each on a different drive).
37 * The starting device for each section is offset near_copies from the starting 38 * The starting device for each section is offset near_copies from the starting
38 * device of the previous section. 39 * device of the previous section.
39 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different 40 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
40 * drive. 41 * drive.
41 * near_copies and far_copies must be at least one, and their product is at most 42 * near_copies and far_copies must be at least one, and their product is at most
42 * raid_disks. 43 * raid_disks.
44 *
45 * If far_offset is true, then the far_copies are handled a bit differently.
46 * The copies are still in different stripes, but instead of being very far apart
47 * on disk, they are in adjacent stripes.
43 */ 48 */
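The layout word thus packs three fields, decoded exactly this way in run() below: near_copies in the low byte, far_copies in the second byte, and far_offset in bit 16. A small illustrative decoder:

    /* Illustrative decoder for the raid10 layout word. */
    static void decode_raid10_layout(int layout, int *nc, int *fc, int *fo)
    {
        *nc = layout & 255;          /* near_copies */
        *fc = (layout >> 8) & 255;   /* far_copies */
        *fo = layout & (1 << 16);    /* far_offset flag */
    }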
44 49
45/* 50/*
@@ -357,8 +362,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
357 * With this layout, a block is never stored twice on the same device. 362 * With this layout, a block is never stored twice on the same device.
358 * 363 *
359 * raid10_find_phys finds the sector offset of a given virtual sector 364 * raid10_find_phys finds the sector offset of a given virtual sector
360 * on each device that it is on. If a block isn't on a device, 365 * on each device that it is on.
361 * that entry in the array is set to MaxSector.
362 * 366 *
363 * raid10_find_virt does the reverse mapping, from a device and a 367 * raid10_find_virt does the reverse mapping, from a device and a
364 * sector offset to a virtual address 368 * sector offset to a virtual address
@@ -381,6 +385,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
381 chunk *= conf->near_copies; 385 chunk *= conf->near_copies;
382 stripe = chunk; 386 stripe = chunk;
383 dev = sector_div(stripe, conf->raid_disks); 387 dev = sector_div(stripe, conf->raid_disks);
388 if (conf->far_offset)
389 stripe *= conf->far_copies;
384 390
385 sector += stripe << conf->chunk_shift; 391 sector += stripe << conf->chunk_shift;
386 392
@@ -414,16 +420,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
414{ 420{
415 sector_t offset, chunk, vchunk; 421 sector_t offset, chunk, vchunk;
416 422
417 while (sector > conf->stride) {
418 sector -= conf->stride;
419 if (dev < conf->near_copies)
420 dev += conf->raid_disks - conf->near_copies;
421 else
422 dev -= conf->near_copies;
423 }
424
425 offset = sector & conf->chunk_mask; 423 offset = sector & conf->chunk_mask;
426 chunk = sector >> conf->chunk_shift; 424 if (conf->far_offset) {
425 int fc;
426 chunk = sector >> conf->chunk_shift;
427 fc = sector_div(chunk, conf->far_copies);
428 dev -= fc * conf->near_copies;
429 if (dev < 0)
430 dev += conf->raid_disks;
431 } else {
432 while (sector > conf->stride) {
433 sector -= conf->stride;
434 if (dev < conf->near_copies)
435 dev += conf->raid_disks - conf->near_copies;
436 else
437 dev -= conf->near_copies;
438 }
439 chunk = sector >> conf->chunk_shift;
440 }
427 vchunk = chunk * conf->raid_disks + dev; 441 vchunk = chunk * conf->raid_disks + dev;
428 sector_div(vchunk, conf->near_copies); 442 sector_div(vchunk, conf->near_copies);
429 return (vchunk << conf->chunk_shift) + offset; 443 return (vchunk << conf->chunk_shift) + offset;
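In the new far_offset branch, dividing the physical chunk number by far_copies recovers both the logical stripe (the quotient) and which copy this is (the remainder fc); the device index is then rotated back by the fc * near_copies steps that raid10_find_phys applied. A self-contained sketch of that branch, with plain C stand-ins for sector_div() and the conf fields:

    /* Illustrative sketch of the far_offset branch of raid10_find_virt. */
    #include <stdint.h>

    static uint64_t find_virt_far_offset(uint64_t sector, int dev,
                                         int raid_disks, int near_copies,
                                         int far_copies, int chunk_shift)
    {
        uint64_t offset = sector & ((1ULL << chunk_shift) - 1);
        uint64_t chunk = sector >> chunk_shift;
        int fc = (int)(chunk % far_copies);  /* which far copy this is */
        chunk /= far_copies;                 /* logical stripe number */
        dev -= fc * near_copies;             /* undo per-copy rotation */
        if (dev < 0)
            dev += raid_disks;
        return (((chunk * raid_disks + dev) / near_copies) << chunk_shift)
               + offset;
    }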
@@ -900,9 +914,12 @@ static void status(struct seq_file *seq, mddev_t *mddev)
900 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); 914 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
901 if (conf->near_copies > 1) 915 if (conf->near_copies > 1)
902 seq_printf(seq, " %d near-copies", conf->near_copies); 916 seq_printf(seq, " %d near-copies", conf->near_copies);
903 if (conf->far_copies > 1) 917 if (conf->far_copies > 1) {
904 seq_printf(seq, " %d far-copies", conf->far_copies); 918 if (conf->far_offset)
905 919 seq_printf(seq, " %d offset-copies", conf->far_copies);
920 else
921 seq_printf(seq, " %d far-copies", conf->far_copies);
922 }
906 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 923 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
907 conf->working_disks); 924 conf->working_disks);
908 for (i = 0; i < conf->raid_disks; i++) 925 for (i = 0; i < conf->raid_disks; i++)
@@ -1915,7 +1932,7 @@ static int run(mddev_t *mddev)
1915 mirror_info_t *disk; 1932 mirror_info_t *disk;
1916 mdk_rdev_t *rdev; 1933 mdk_rdev_t *rdev;
1917 struct list_head *tmp; 1934 struct list_head *tmp;
1918 int nc, fc; 1935 int nc, fc, fo;
1919 sector_t stride, size; 1936 sector_t stride, size;
1920 1937
1921 if (mddev->chunk_size == 0) { 1938 if (mddev->chunk_size == 0) {
@@ -1925,8 +1942,9 @@ static int run(mddev_t *mddev)
1925 1942
1926 nc = mddev->layout & 255; 1943 nc = mddev->layout & 255;
1927 fc = (mddev->layout >> 8) & 255; 1944 fc = (mddev->layout >> 8) & 255;
1945 fo = mddev->layout & (1<<16);
1928 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 1946 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
1929 (mddev->layout >> 16)) { 1947 (mddev->layout >> 17)) {
1930 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 1948 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
1931 mdname(mddev), mddev->layout); 1949 mdname(mddev), mddev->layout);
1932 goto out; 1950 goto out;
@@ -1958,12 +1976,16 @@ static int run(mddev_t *mddev)
1958 conf->near_copies = nc; 1976 conf->near_copies = nc;
1959 conf->far_copies = fc; 1977 conf->far_copies = fc;
1960 conf->copies = nc*fc; 1978 conf->copies = nc*fc;
1979 conf->far_offset = fo;
1961 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 1980 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
1962 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 1981 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
1963 stride = mddev->size >> (conf->chunk_shift-1); 1982 if (fo)
1964 sector_div(stride, fc); 1983 conf->stride = 1 << conf->chunk_shift;
1965 conf->stride = stride << conf->chunk_shift; 1984 else {
1966 1985 stride = mddev->size >> (conf->chunk_shift-1);
1986 sector_div(stride, fc);
1987 conf->stride = stride << conf->chunk_shift;
1988 }
1967 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 1989 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
1968 r10bio_pool_free, conf); 1990 r10bio_pool_free, conf);
1969 if (!conf->r10bio_pool) { 1991 if (!conf->r10bio_pool) {
@@ -2015,7 +2037,8 @@ static int run(mddev_t *mddev)
2015 2037
2016 disk = conf->mirrors + i; 2038 disk = conf->mirrors + i;
2017 2039
2018 if (!disk->rdev) { 2040 if (!disk->rdev ||
2041 !test_bit(In_sync, &rdev->flags)) {
2019 disk->head_position = 0; 2042 disk->head_position = 0;
2020 mddev->degraded++; 2043 mddev->degraded++;
2021 } 2044 }
@@ -2037,7 +2060,13 @@ static int run(mddev_t *mddev)
2037 /* 2060 /*
2038 * Ok, everything is just fine now 2061 * Ok, everything is just fine now
2039 */ 2062 */
2040 size = conf->stride * conf->raid_disks; 2063 if (conf->far_offset) {
2064 size = mddev->size >> (conf->chunk_shift-1);
2065 size *= conf->raid_disks;
2066 size <<= conf->chunk_shift;
2067 sector_div(size, conf->far_copies);
2068 } else
2069 size = conf->stride * conf->raid_disks;
2041 sector_div(size, conf->near_copies); 2070 sector_div(size, conf->near_copies);
2042 mddev->array_size = size/2; 2071 mddev->array_size = size/2;
2043 mddev->resync_max_sectors = size; 2072 mddev->resync_max_sectors = size;
@@ -2050,7 +2079,7 @@ static int run(mddev_t *mddev)
2050 * maybe... 2079 * maybe...
2051 */ 2080 */
2052 { 2081 {
2053 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; 2082 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2054 stripe /= conf->near_copies; 2083 stripe /= conf->near_copies;
2055 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2084 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2056 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2085 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31843604049c..f920e50ea124 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2,8 +2,11 @@
2 * raid5.c : Multiple Devices driver for Linux 2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar 4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
5 * 6 *
6 * RAID-5 management functions. 7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
7 * 10 *
8 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
@@ -19,11 +22,11 @@
19#include <linux/config.h> 22#include <linux/config.h>
20#include <linux/module.h> 23#include <linux/module.h>
21#include <linux/slab.h> 24#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/bitops.h> 26#include <linux/bitops.h>
25#include <linux/kthread.h> 27#include <linux/kthread.h>
26#include <asm/atomic.h> 28#include <asm/atomic.h>
29#include "raid6.h"
27 30
28#include <linux/raid/bitmap.h> 31#include <linux/raid/bitmap.h>
29 32
@@ -68,6 +71,16 @@
68#define __inline__ 71#define __inline__
69#endif 72#endif
70 73
74#if !RAID6_USE_EMPTY_ZERO_PAGE
75/* In .bss so it's zeroed */
76const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
77#endif
78
79static inline int raid6_next_disk(int disk, int raid_disks)
80{
81 disk++;
82 return (disk < raid_disks) ? disk : 0;
83}
71static void print_raid5_conf (raid5_conf_t *conf); 84static void print_raid5_conf (raid5_conf_t *conf);
72 85
73static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 86static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -104,7 +117,7 @@ static void release_stripe(struct stripe_head *sh)
104{ 117{
105 raid5_conf_t *conf = sh->raid_conf; 118 raid5_conf_t *conf = sh->raid_conf;
106 unsigned long flags; 119 unsigned long flags;
107 120
108 spin_lock_irqsave(&conf->device_lock, flags); 121 spin_lock_irqsave(&conf->device_lock, flags);
109 __release_stripe(conf, sh); 122 __release_stripe(conf, sh);
110 spin_unlock_irqrestore(&conf->device_lock, flags); 123 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -117,7 +130,7 @@ static inline void remove_hash(struct stripe_head *sh)
117 hlist_del_init(&sh->hash); 130 hlist_del_init(&sh->hash);
118} 131}
119 132
120static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 133static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121{ 134{
122 struct hlist_head *hp = stripe_hash(conf, sh->sector); 135 struct hlist_head *hp = stripe_hash(conf, sh->sector);
123 136
@@ -190,7 +203,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
190 (unsigned long long)sh->sector); 203 (unsigned long long)sh->sector);
191 204
192 remove_hash(sh); 205 remove_hash(sh);
193 206
194 sh->sector = sector; 207 sh->sector = sector;
195 sh->pd_idx = pd_idx; 208 sh->pd_idx = pd_idx;
196 sh->state = 0; 209 sh->state = 0;
@@ -269,8 +282,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
269 } else { 282 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state)) 283 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes); 284 atomic_inc(&conf->active_stripes);
272 if (!list_empty(&sh->lru)) 285 if (list_empty(&sh->lru))
273 list_del_init(&sh->lru); 286 BUG();
287 list_del_init(&sh->lru);
274 } 288 }
275 } 289 }
276 } while (sh == NULL); 290 } while (sh == NULL);
@@ -321,10 +335,9 @@ static int grow_stripes(raid5_conf_t *conf, int num)
321 return 1; 335 return 1;
322 conf->slab_cache = sc; 336 conf->slab_cache = sc;
323 conf->pool_size = devs; 337 conf->pool_size = devs;
324 while (num--) { 338 while (num--)
325 if (!grow_one_stripe(conf)) 339 if (!grow_one_stripe(conf))
326 return 1; 340 return 1;
327 }
328 return 0; 341 return 0;
329} 342}
330 343
@@ -631,8 +644,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
631 dev->req.bi_private = sh; 644 dev->req.bi_private = sh;
632 645
633 dev->flags = 0; 646 dev->flags = 0;
634 if (i != sh->pd_idx) 647 dev->sector = compute_blocknr(sh, i);
635 dev->sector = compute_blocknr(sh, i);
636} 648}
637 649
638static void error(mddev_t *mddev, mdk_rdev_t *rdev) 650static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -659,7 +671,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
659 " Operation continuing on %d devices\n", 671 " Operation continuing on %d devices\n",
660 bdevname(rdev->bdev,b), conf->working_disks); 672 bdevname(rdev->bdev,b), conf->working_disks);
661 } 673 }
662} 674}
663 675
664/* 676/*
665 * Input: a 'big' sector number, 677 * Input: a 'big' sector number,
@@ -697,9 +709,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
697 /* 709 /*
698 * Select the parity disk based on the user selected algorithm. 710 * Select the parity disk based on the user selected algorithm.
699 */ 711 */
700 if (conf->level == 4) 712 switch(conf->level) {
713 case 4:
701 *pd_idx = data_disks; 714 *pd_idx = data_disks;
702 else switch (conf->algorithm) { 715 break;
716 case 5:
717 switch (conf->algorithm) {
703 case ALGORITHM_LEFT_ASYMMETRIC: 718 case ALGORITHM_LEFT_ASYMMETRIC:
704 *pd_idx = data_disks - stripe % raid_disks; 719 *pd_idx = data_disks - stripe % raid_disks;
705 if (*dd_idx >= *pd_idx) 720 if (*dd_idx >= *pd_idx)
@@ -721,6 +736,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
721 default: 736 default:
722 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 737 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723 conf->algorithm); 738 conf->algorithm);
739 }
740 break;
741 case 6:
742
743 /**** FIX THIS ****/
744 switch (conf->algorithm) {
745 case ALGORITHM_LEFT_ASYMMETRIC:
746 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
747 if (*pd_idx == raid_disks-1)
748 (*dd_idx)++; /* Q D D D P */
749 else if (*dd_idx >= *pd_idx)
750 (*dd_idx) += 2; /* D D P Q D */
751 break;
752 case ALGORITHM_RIGHT_ASYMMETRIC:
753 *pd_idx = stripe % raid_disks;
754 if (*pd_idx == raid_disks-1)
755 (*dd_idx)++; /* Q D D D P */
756 else if (*dd_idx >= *pd_idx)
757 (*dd_idx) += 2; /* D D P Q D */
758 break;
759 case ALGORITHM_LEFT_SYMMETRIC:
760 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
761 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
762 break;
763 case ALGORITHM_RIGHT_SYMMETRIC:
764 *pd_idx = stripe % raid_disks;
765 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
766 break;
767 default:
768 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
769 conf->algorithm);
770 }
771 break;
724 } 772 }
725 773
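For example, with the left-asymmetric algorithm on 5 drives, P sits on raid_disks - 1 - (stripe % raid_disks) and Q on the next drive modulo raid_disks: stripe 0 gives pd_idx=4, qd_idx=0 (the "Q D D D P" case, every data index shifted by one), while stripe 2 gives pd_idx=2, qd_idx=3 (the "D D P Q D" case, data indexes at or beyond P shifted by two). A minimal illustrative helper:

    /* Illustrative P/Q placement for the left-asymmetric RAID-6 layout. */
    static void raid6_la_parity(int stripe, int raid_disks,
                                int *pd_idx, int *qd_idx)
    {
        *pd_idx = raid_disks - 1 - (stripe % raid_disks);
        *qd_idx = (*pd_idx + 1) % raid_disks;
    }
    /* raid_disks=5: stripe 0 -> P=4, Q=0 (Q D D D P);
     *               stripe 2 -> P=2, Q=3 (D D P Q D). */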
726 /* 774 /*
@@ -742,12 +790,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
742 int chunk_number, dummy1, dummy2, dd_idx = i; 790 int chunk_number, dummy1, dummy2, dd_idx = i;
743 sector_t r_sector; 791 sector_t r_sector;
744 792
793
745 chunk_offset = sector_div(new_sector, sectors_per_chunk); 794 chunk_offset = sector_div(new_sector, sectors_per_chunk);
746 stripe = new_sector; 795 stripe = new_sector;
747 BUG_ON(new_sector != stripe); 796 BUG_ON(new_sector != stripe);
748 797
749 798 if (i == sh->pd_idx)
750 switch (conf->algorithm) { 799 return 0;
800 switch(conf->level) {
801 case 4: break;
802 case 5:
803 switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC: 804 case ALGORITHM_LEFT_ASYMMETRIC:
752 case ALGORITHM_RIGHT_ASYMMETRIC: 805 case ALGORITHM_RIGHT_ASYMMETRIC:
753 if (i > sh->pd_idx) 806 if (i > sh->pd_idx)
@@ -761,7 +814,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
761 break; 814 break;
762 default: 815 default:
763 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 816 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
817 conf->algorithm);
818 }
819 break;
820 case 6:
821 data_disks = raid_disks - 2;
822 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
823 return 0; /* It is the Q disk */
824 switch (conf->algorithm) {
825 case ALGORITHM_LEFT_ASYMMETRIC:
826 case ALGORITHM_RIGHT_ASYMMETRIC:
827 if (sh->pd_idx == raid_disks-1)
828 i--; /* Q D D D P */
829 else if (i > sh->pd_idx)
830 i -= 2; /* D D P Q D */
831 break;
832 case ALGORITHM_LEFT_SYMMETRIC:
833 case ALGORITHM_RIGHT_SYMMETRIC:
834 if (sh->pd_idx == raid_disks-1)
835 i--; /* Q D D D P */
836 else {
837 /* D D P Q D */
838 if (i < sh->pd_idx)
839 i += raid_disks;
840 i -= (sh->pd_idx + 2);
841 }
842 break;
843 default:
844 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
764 conf->algorithm); 845 conf->algorithm);
846 }
847 break;
765 } 848 }
766 849
767 chunk_number = stripe * data_disks + i; 850 chunk_number = stripe * data_disks + i;
@@ -778,10 +861,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
778 861
779 862
780/* 863/*
781 * Copy data between a page in the stripe cache, and a bio. 864 * Copy data between a page in the stripe cache, and one or more bion
782 * There are no alignment or size guarantees between the page or the 865 * The page could align with the middle of the bio, or there could be
783 * bio except that there is some overlap. 866 * several bion, each with several bio_vecs, which cover part of the page.
784 * All iovecs in the bio must be considered. 867 * Multiple bion are linked together on bi_next. There may be extras
868 * at the end of this list. We ignore them.
785 */ 869 */
786static void copy_data(int frombio, struct bio *bio, 870static void copy_data(int frombio, struct bio *bio,
787 struct page *page, 871 struct page *page,
@@ -810,7 +894,7 @@ static void copy_data(int frombio, struct bio *bio,
810 if (len > 0 && page_offset + len > STRIPE_SIZE) 894 if (len > 0 && page_offset + len > STRIPE_SIZE)
811 clen = STRIPE_SIZE - page_offset; 895 clen = STRIPE_SIZE - page_offset;
812 else clen = len; 896 else clen = len;
813 897
814 if (clen > 0) { 898 if (clen > 0) {
815 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 899 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816 if (frombio) 900 if (frombio)
@@ -862,14 +946,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
862 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 946 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863} 947}
864 948
865static void compute_parity(struct stripe_head *sh, int method) 949static void compute_parity5(struct stripe_head *sh, int method)
866{ 950{
867 raid5_conf_t *conf = sh->raid_conf; 951 raid5_conf_t *conf = sh->raid_conf;
868 int i, pd_idx = sh->pd_idx, disks = sh->disks, count; 952 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869 void *ptr[MAX_XOR_BLOCKS]; 953 void *ptr[MAX_XOR_BLOCKS];
870 struct bio *chosen; 954 struct bio *chosen;
871 955
872 PRINTK("compute_parity, stripe %llu, method %d\n", 956 PRINTK("compute_parity5, stripe %llu, method %d\n",
873 (unsigned long long)sh->sector, method); 957 (unsigned long long)sh->sector, method);
874 958
875 count = 1; 959 count = 1;
@@ -956,9 +1040,195 @@ static void compute_parity(struct stripe_head *sh, int method)
956 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1040 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
957} 1041}
958 1042
1043static void compute_parity6(struct stripe_head *sh, int method)
1044{
1045 raid6_conf_t *conf = sh->raid_conf;
1046 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
1047 struct bio *chosen;
1048 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1049 void *ptrs[disks];
1050
1051 qd_idx = raid6_next_disk(pd_idx, disks);
1052 d0_idx = raid6_next_disk(qd_idx, disks);
1053
1054 PRINTK("compute_parity, stripe %llu, method %d\n",
1055 (unsigned long long)sh->sector, method);
1056
1057 switch(method) {
1058 case READ_MODIFY_WRITE:
1059 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1060 case RECONSTRUCT_WRITE:
1061 for (i= disks; i-- ;)
1062 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1063 chosen = sh->dev[i].towrite;
1064 sh->dev[i].towrite = NULL;
1065
1066 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1067 wake_up(&conf->wait_for_overlap);
1068
1069 if (sh->dev[i].written) BUG();
1070 sh->dev[i].written = chosen;
1071 }
1072 break;
1073 case CHECK_PARITY:
1074 BUG(); /* Not implemented yet */
1075 }
1076
1077 for (i = disks; i--;)
1078 if (sh->dev[i].written) {
1079 sector_t sector = sh->dev[i].sector;
1080 struct bio *wbi = sh->dev[i].written;
1081 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1082 copy_data(1, wbi, sh->dev[i].page, sector);
1083 wbi = r5_next_bio(wbi, sector);
1084 }
1085
1086 set_bit(R5_LOCKED, &sh->dev[i].flags);
1087 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1088 }
1089
1090// switch(method) {
1091// case RECONSTRUCT_WRITE:
1092// case CHECK_PARITY:
1093// case UPDATE_PARITY:
1094 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1095 /* FIX: Is this ordering of drives even remotely optimal? */
1096 count = 0;
1097 i = d0_idx;
1098 do {
1099 ptrs[count++] = page_address(sh->dev[i].page);
1100 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1101 printk("block %d/%d not uptodate on parity calc\n", i,count);
1102 i = raid6_next_disk(i, disks);
1103 } while ( i != d0_idx );
1104// break;
1105// }
1106
1107 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1108
1109 switch(method) {
1110 case RECONSTRUCT_WRITE:
1111 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1112 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1113 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1114 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1115 break;
1116 case UPDATE_PARITY:
1117 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1118 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1119 break;
1120 }
1121}
1122
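The ordering matters because the raid6 library's gen_syndrome() treats the last two pointers as P and Q and everything before them as data. Starting the walk at d0_idx (= qd_idx + 1 = pd_idx + 2) and wrapping around the ring guarantees exactly that: with disks=5 and pd_idx=3, the walk visits 0,1,2,3,4, leaving ptrs = [d0, d1, d2, P, Q]. An illustrative sketch of the ring walk:

    /* Illustrative ring walk: P and Q always land in the last two
     * slots, the ordering gen_syndrome() expects. */
    static void syndrome_order(int pd_idx, int disks, int *order)
    {
        int qd_idx = (pd_idx + 1) % disks;
        int i = (qd_idx + 1) % disks;   /* d0_idx */
        int count = 0;
        do {
            order[count++] = i;
            i = (i + 1) % disks;
        } while (count < disks);
        /* order[disks-2] == pd_idx, order[disks-1] == qd_idx */
    }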
1123
1124/* Compute one missing block */
1125static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1126{
1127 raid6_conf_t *conf = sh->raid_conf;
1128 int i, count, disks = conf->raid_disks;
1129 void *ptr[MAX_XOR_BLOCKS], *p;
1130 int pd_idx = sh->pd_idx;
1131 int qd_idx = raid6_next_disk(pd_idx, disks);
1132
1133 PRINTK("compute_block_1, stripe %llu, idx %d\n",
1134 (unsigned long long)sh->sector, dd_idx);
1135
1136 if ( dd_idx == qd_idx ) {
1137 /* We're actually computing the Q drive */
1138 compute_parity6(sh, UPDATE_PARITY);
1139 } else {
1140 ptr[0] = page_address(sh->dev[dd_idx].page);
1141 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
1142 count = 1;
1143 for (i = disks ; i--; ) {
1144 if (i == dd_idx || i == qd_idx)
1145 continue;
1146 p = page_address(sh->dev[i].page);
1147 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1148 ptr[count++] = p;
1149 else
1150 printk("compute_block() %d, stripe %llu, %d"
1151 " not present\n", dd_idx,
1152 (unsigned long long)sh->sector, i);
1153
1154 check_xor();
1155 }
1156 if (count != 1)
1157 xor_block(count, STRIPE_SIZE, ptr);
1158 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1159 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1160 }
1161}
1162
1163/* Compute two missing blocks */
1164static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1165{
1166 raid6_conf_t *conf = sh->raid_conf;
1167 int i, count, disks = conf->raid_disks;
1168 int pd_idx = sh->pd_idx;
1169 int qd_idx = raid6_next_disk(pd_idx, disks);
1170 int d0_idx = raid6_next_disk(qd_idx, disks);
1171 int faila, failb;
1172
1173 /* faila and failb are disk numbers relative to d0_idx */
1174 /* pd_idx become disks-2 and qd_idx become disks-1 */
1175 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1176 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1177
1178 BUG_ON(faila == failb);
1179 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1180
1181 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1182 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1183
1184 if ( failb == disks-1 ) {
1185 /* Q disk is one of the missing disks */
1186 if ( faila == disks-2 ) {
1187 /* Missing P+Q, just recompute */
1188 compute_parity6(sh, UPDATE_PARITY);
1189 return;
1190 } else {
1191 /* We're missing D+Q; recompute D from P */
1192 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1193 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1194 return;
1195 }
1196 }
1197
1198 /* We're missing D+P or D+D; build pointer table */
1199 {
1200 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1201 void *ptrs[disks];
1202
1203 count = 0;
1204 i = d0_idx;
1205 do {
1206 ptrs[count++] = page_address(sh->dev[i].page);
1207 i = raid6_next_disk(i, disks);
1208 if (i != dd_idx1 && i != dd_idx2 &&
1209 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1210 printk("compute_2 with missing block %d/%d\n", count, i);
1211 } while ( i != d0_idx );
1212
1213 if ( failb == disks-2 ) {
1214 /* We're missing D+P. */
1215 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1216 } else {
1217 /* We're missing D+D. */
1218 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1219 }
1220
1221 /* Both the above update both missing blocks */
1222 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1223 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1224 }
1225}
1226
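Rebasing every index relative to d0_idx means P always maps to disks-2 and Q to disks-1; the failb == disks-1 (Q) and failb == disks-2 (P) tests above rely on exactly this. Worked example: disks=6, pd_idx=3 (so qd_idx=4, d0_idx=5) maps a failed data disk 1 to faila = 1 + (6-5) = 2, pd_idx to 4 = disks-2, and qd_idx to 5 = disks-1. An illustrative helper:

    /* Illustrative rebase of an absolute disk index relative to d0_idx. */
    static int rebase_to_d0(int idx, int d0_idx, int disks)
    {
        return (idx < d0_idx) ? idx + (disks - d0_idx) : idx - d0_idx;
    }
    /* disks=6, pd_idx=3 -> d0_idx=5: rebase(3)=4 (P=disks-2),
     * rebase(4)=5 (Q=disks-1), rebase(1)=2 (a data slot). */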
1227
1228
959/* 1229/*
960 * Each stripe/dev can have one or more bion attached. 1230 * Each stripe/dev can have one or more bion attached.
961 * toread/towrite point to the first in a chain. 1231 * toread/towrite point to the first in a chain.
962 * The bi_next chain must be in order. 1232 * The bi_next chain must be in order.
963 */ 1233 */
964static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1234static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
@@ -1031,6 +1301,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1031 1301
1032static void end_reshape(raid5_conf_t *conf); 1302static void end_reshape(raid5_conf_t *conf);
1033 1303
1304static int page_is_zero(struct page *p)
1305{
1306 char *a = page_address(p);
1307 return ((*(u32*)a) == 0 &&
1308 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1309}
1310
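page_is_zero() avoids comparing against a page of zeroes by using a self-overlapping memcmp: if the first four bytes are zero and every byte equals the byte four positions after it, then by induction the whole page is zero. The same idea in an illustrative standalone form:

    /* Illustrative standalone version of the self-overlapping compare:
     * a[0..3] == 0 plus a[i] == a[i+4] for all i implies all zero. */
    #include <string.h>

    static int buf_is_zero(const char *a, size_t len)
    {
        return *(const unsigned int *)a == 0 &&
               memcmp(a, a + 4, len - 4) == 0;
    }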
1034static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1311static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1035{ 1312{
1036 int sectors_per_chunk = conf->chunk_size >> 9; 1313 int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1062,7 +1339,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1062 * 1339 *
1063 */ 1340 */
1064 1341
1065static void handle_stripe(struct stripe_head *sh) 1342static void handle_stripe5(struct stripe_head *sh)
1066{ 1343{
1067 raid5_conf_t *conf = sh->raid_conf; 1344 raid5_conf_t *conf = sh->raid_conf;
1068 int disks = sh->disks; 1345 int disks = sh->disks;
@@ -1394,7 +1671,7 @@ static void handle_stripe(struct stripe_head *sh)
1394 if (locked == 0 && (rcw == 0 ||rmw == 0) && 1671 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1395 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 1672 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1396 PRINTK("Computing parity...\n"); 1673 PRINTK("Computing parity...\n");
1397 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1674 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1398 /* now every locked buffer is ready to be written */ 1675 /* now every locked buffer is ready to be written */
1399 for (i=disks; i--;) 1676 for (i=disks; i--;)
1400 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 1677 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
@@ -1421,13 +1698,10 @@ static void handle_stripe(struct stripe_head *sh)
1421 !test_bit(STRIPE_INSYNC, &sh->state)) { 1698 !test_bit(STRIPE_INSYNC, &sh->state)) {
1422 set_bit(STRIPE_HANDLE, &sh->state); 1699 set_bit(STRIPE_HANDLE, &sh->state);
1423 if (failed == 0) { 1700 if (failed == 0) {
1424 char *pagea;
1425 BUG_ON(uptodate != disks); 1701 BUG_ON(uptodate != disks);
1426 compute_parity(sh, CHECK_PARITY); 1702 compute_parity5(sh, CHECK_PARITY);
1427 uptodate--; 1703 uptodate--;
1428 pagea = page_address(sh->dev[sh->pd_idx].page); 1704 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1429 if ((*(u32*)pagea) == 0 &&
1430 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1431 /* parity is correct (on disc, not in buffer any more) */ 1705 /* parity is correct (on disc, not in buffer any more) */
1432 set_bit(STRIPE_INSYNC, &sh->state); 1706 set_bit(STRIPE_INSYNC, &sh->state);
1433 } else { 1707 } else {
@@ -1487,7 +1761,7 @@ static void handle_stripe(struct stripe_head *sh)
1487 /* Need to write out all blocks after computing parity */ 1761 /* Need to write out all blocks after computing parity */
1488 sh->disks = conf->raid_disks; 1762 sh->disks = conf->raid_disks;
1489 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 1763 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1490 compute_parity(sh, RECONSTRUCT_WRITE); 1764 compute_parity5(sh, RECONSTRUCT_WRITE);
1491 for (i= conf->raid_disks; i--;) { 1765 for (i= conf->raid_disks; i--;) {
1492 set_bit(R5_LOCKED, &sh->dev[i].flags); 1766 set_bit(R5_LOCKED, &sh->dev[i].flags);
1493 locked++; 1767 locked++;
@@ -1615,6 +1889,569 @@ static void handle_stripe(struct stripe_head *sh)
1615 } 1889 }
1616} 1890}
1617 1891
1892static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1893{
1894 raid6_conf_t *conf = sh->raid_conf;
1895 int disks = conf->raid_disks;
1896 struct bio *return_bi= NULL;
1897 struct bio *bi;
1898 int i;
1899 int syncing;
1900 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1901 int non_overwrite = 0;
1902 int failed_num[2] = {0, 0};
1903 struct r5dev *dev, *pdev, *qdev;
1904 int pd_idx = sh->pd_idx;
1905 int qd_idx = raid6_next_disk(pd_idx, disks);
1906 int p_failed, q_failed;
1907
1908 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1909 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1910 pd_idx, qd_idx);
1911
1912 spin_lock(&sh->lock);
1913 clear_bit(STRIPE_HANDLE, &sh->state);
1914 clear_bit(STRIPE_DELAYED, &sh->state);
1915
1916 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1917 /* Now to look around and see what can be done */
1918
1919 rcu_read_lock();
1920 for (i=disks; i--; ) {
1921 mdk_rdev_t *rdev;
1922 dev = &sh->dev[i];
1923 clear_bit(R5_Insync, &dev->flags);
1924
1925 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1926 i, dev->flags, dev->toread, dev->towrite, dev->written);
1927 /* maybe we can reply to a read */
1928 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1929 struct bio *rbi, *rbi2;
1930 PRINTK("Return read for disc %d\n", i);
1931 spin_lock_irq(&conf->device_lock);
1932 rbi = dev->toread;
1933 dev->toread = NULL;
1934 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1935 wake_up(&conf->wait_for_overlap);
1936 spin_unlock_irq(&conf->device_lock);
1937 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1938 copy_data(0, rbi, dev->page, dev->sector);
1939 rbi2 = r5_next_bio(rbi, dev->sector);
1940 spin_lock_irq(&conf->device_lock);
1941 if (--rbi->bi_phys_segments == 0) {
1942 rbi->bi_next = return_bi;
1943 return_bi = rbi;
1944 }
1945 spin_unlock_irq(&conf->device_lock);
1946 rbi = rbi2;
1947 }
1948 }
1949
1950 /* now count some things */
1951 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1952 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1953
1954
1955 if (dev->toread) to_read++;
1956 if (dev->towrite) {
1957 to_write++;
1958 if (!test_bit(R5_OVERWRITE, &dev->flags))
1959 non_overwrite++;
1960 }
1961 if (dev->written) written++;
1962 rdev = rcu_dereference(conf->disks[i].rdev);
1963 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1964 /* The ReadError flag will just be confusing now */
1965 clear_bit(R5_ReadError, &dev->flags);
1966 clear_bit(R5_ReWrite, &dev->flags);
1967 }
1968 if (!rdev || !test_bit(In_sync, &rdev->flags)
1969 || test_bit(R5_ReadError, &dev->flags)) {
1970 if ( failed < 2 )
1971 failed_num[failed] = i;
1972 failed++;
1973 } else
1974 set_bit(R5_Insync, &dev->flags);
1975 }
1976 rcu_read_unlock();
1977 PRINTK("locked=%d uptodate=%d to_read=%d"
1978 " to_write=%d failed=%d failed_num=%d,%d\n",
1979 locked, uptodate, to_read, to_write, failed,
1980 failed_num[0], failed_num[1]);
1981 /* check if the array has lost >2 devices and, if so, some requests might
1982 * need to be failed
1983 */
1984 if (failed > 2 && to_read+to_write+written) {
1985 for (i=disks; i--; ) {
1986 int bitmap_end = 0;
1987
1988 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1989 mdk_rdev_t *rdev;
1990 rcu_read_lock();
1991 rdev = rcu_dereference(conf->disks[i].rdev);
1992 if (rdev && test_bit(In_sync, &rdev->flags))
1993 /* multiple read failures in one stripe */
1994 md_error(conf->mddev, rdev);
1995 rcu_read_unlock();
1996 }
1997
1998 spin_lock_irq(&conf->device_lock);
1999 /* fail all writes first */
2000 bi = sh->dev[i].towrite;
2001 sh->dev[i].towrite = NULL;
2002 if (bi) { to_write--; bitmap_end = 1; }
2003
2004 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2005 wake_up(&conf->wait_for_overlap);
2006
2007 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2008 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2009 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2010 if (--bi->bi_phys_segments == 0) {
2011 md_write_end(conf->mddev);
2012 bi->bi_next = return_bi;
2013 return_bi = bi;
2014 }
2015 bi = nextbi;
2016 }
2017 /* and fail all 'written' */
2018 bi = sh->dev[i].written;
2019 sh->dev[i].written = NULL;
2020 if (bi) bitmap_end = 1;
2021 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2022 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2023 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2024 if (--bi->bi_phys_segments == 0) {
2025 md_write_end(conf->mddev);
2026 bi->bi_next = return_bi;
2027 return_bi = bi;
2028 }
2029 bi = bi2;
2030 }
2031
2032 /* fail any reads if this device is non-operational */
2033 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2034 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2035 bi = sh->dev[i].toread;
2036 sh->dev[i].toread = NULL;
2037 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2038 wake_up(&conf->wait_for_overlap);
2039 if (bi) to_read--;
2040 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2041 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2042 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2043 if (--bi->bi_phys_segments == 0) {
2044 bi->bi_next = return_bi;
2045 return_bi = bi;
2046 }
2047 bi = nextbi;
2048 }
2049 }
2050 spin_unlock_irq(&conf->device_lock);
2051 if (bitmap_end)
2052 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2053 STRIPE_SECTORS, 0, 0);
2054 }
2055 }
2056 if (failed > 2 && syncing) {
2057 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2058 clear_bit(STRIPE_SYNCING, &sh->state);
2059 syncing = 0;
2060 }
2061
2062 /*
2063 * might be able to return some write requests if the parity blocks
2064 * are safe, or on a failed drive
2065 */
2066 pdev = &sh->dev[pd_idx];
2067 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
2068 || (failed >= 2 && failed_num[1] == pd_idx);
2069 qdev = &sh->dev[qd_idx];
2070 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
2071 || (failed >= 2 && failed_num[1] == qd_idx);
2072
2073 if ( written &&
2074 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
2075 && !test_bit(R5_LOCKED, &pdev->flags)
2076 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
2077 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
2078 && !test_bit(R5_LOCKED, &qdev->flags)
2079 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
2080 /* any written block on an uptodate or failed drive can be
2081 * returned. Note that if we 'wrote' to a failed drive,
2082 * it will be UPTODATE, but never LOCKED, so we don't need
2083 * to test 'failed' directly.
2084 */
2085 for (i=disks; i--; )
2086 if (sh->dev[i].written) {
2087 dev = &sh->dev[i];
2088 if (!test_bit(R5_LOCKED, &dev->flags) &&
2089 test_bit(R5_UPTODATE, &dev->flags) ) {
2090 /* We can return any write requests */
2091 int bitmap_end = 0;
2092 struct bio *wbi, *wbi2;
2093 PRINTK("Return write for stripe %llu disc %d\n",
2094 (unsigned long long)sh->sector, i);
2095 spin_lock_irq(&conf->device_lock);
2096 wbi = dev->written;
2097 dev->written = NULL;
2098 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2099 wbi2 = r5_next_bio(wbi, dev->sector);
2100 if (--wbi->bi_phys_segments == 0) {
2101 md_write_end(conf->mddev);
2102 wbi->bi_next = return_bi;
2103 return_bi = wbi;
2104 }
2105 wbi = wbi2;
2106 }
2107 if (dev->towrite == NULL)
2108 bitmap_end = 1;
2109 spin_unlock_irq(&conf->device_lock);
2110 if (bitmap_end)
2111 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2112 STRIPE_SECTORS,
2113 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2114 }
2115 }
2116 }
2117
2118	/* Now we might consider reading some blocks, either to check/generate
2119	 * parity, to satisfy requests,
2120	 * or to load a block that is being partially written.
2121 */
2122 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
2123 for (i=disks; i--;) {
2124 dev = &sh->dev[i];
2125 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2126 (dev->toread ||
2127 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2128 syncing ||
2129 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2130 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2131 )
2132 ) {
2133 /* we would like to get this block, possibly
2134 * by computing it, but we might not be able to
2135 */
2136 if (uptodate == disks-1) {
2137 PRINTK("Computing stripe %llu block %d\n",
2138 (unsigned long long)sh->sector, i);
2139 compute_block_1(sh, i, 0);
2140 uptodate++;
2141 } else if ( uptodate == disks-2 && failed >= 2 ) {
2142 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2143 int other;
2144 for (other=disks; other--;) {
2145 if ( other == i )
2146 continue;
2147 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2148 break;
2149 }
2150 BUG_ON(other < 0);
2151 PRINTK("Computing stripe %llu blocks %d,%d\n",
2152 (unsigned long long)sh->sector, i, other);
2153 compute_block_2(sh, i, other);
2154 uptodate += 2;
2155 } else if (test_bit(R5_Insync, &dev->flags)) {
2156 set_bit(R5_LOCKED, &dev->flags);
2157 set_bit(R5_Wantread, &dev->flags);
2158#if 0
2159 /* if I am just reading this block and we don't have
2160 a failed drive, or any pending writes then sidestep the cache */
2161 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
2162 ! syncing && !failed && !to_write) {
2163 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
2164 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
2165 }
2166#endif
2167 locked++;
2168 PRINTK("Reading block %d (sync=%d)\n",
2169 i, syncing);
2170 }
2171 }
2172 }
2173 set_bit(STRIPE_HANDLE, &sh->state);
2174 }
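
The branch above encodes a preference order for obtaining a missing block: with disks-1 blocks up to date, one block can be reconstructed directly; with disks-2 up to date and two failures, the far more expensive two-block recovery is used; otherwise a read is scheduled on an in-sync device. A minimal userspace sketch of that ordering (choose_fetch and the enum are illustrative names, not kernel API):

#include <stdio.h>

enum fetch_action { COMPUTE_ONE, COMPUTE_TWO, READ_FROM_DISK, WAIT };

/* Prefer computing a missing block from the up-to-date ones before
 * issuing a read; two-block recovery only when two devices failed. */
static enum fetch_action choose_fetch(int disks, int uptodate,
                                      int failed, int insync)
{
    if (uptodate == disks - 1)
        return COMPUTE_ONE;          /* single-block reconstruction */
    if (uptodate == disks - 2 && failed >= 2)
        return COMPUTE_TWO;          /* expensive two-failure path */
    if (insync)
        return READ_FROM_DISK;       /* device is healthy: just read */
    return WAIT;                     /* nothing we can do right now */
}

int main(void)
{
    printf("%d\n", choose_fetch(6, 5, 1, 1)); /* 0: COMPUTE_ONE */
    printf("%d\n", choose_fetch(6, 4, 2, 0)); /* 1: COMPUTE_TWO */
    printf("%d\n", choose_fetch(6, 3, 1, 1)); /* 2: READ_FROM_DISK */
    return 0;
}
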
2175
2176	/* now consider writing, and whether anything else should be read */
2177 if (to_write) {
2178 int rcw=0, must_compute=0;
2179 for (i=disks ; i--;) {
2180 dev = &sh->dev[i];
2181 /* Would I have to read this buffer for reconstruct_write */
2182 if (!test_bit(R5_OVERWRITE, &dev->flags)
2183 && i != pd_idx && i != qd_idx
2184 && (!test_bit(R5_LOCKED, &dev->flags)
2185#if 0
2186 || sh->bh_page[i] != bh->b_page
2187#endif
2188 ) &&
2189 !test_bit(R5_UPTODATE, &dev->flags)) {
2190 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2191 else {
2192 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2193 must_compute++;
2194 }
2195 }
2196 }
2197 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2198 (unsigned long long)sh->sector, rcw, must_compute);
2199 set_bit(STRIPE_HANDLE, &sh->state);
2200
2201 if (rcw > 0)
2202 /* want reconstruct write, but need to get some data */
2203 for (i=disks; i--;) {
2204 dev = &sh->dev[i];
2205 if (!test_bit(R5_OVERWRITE, &dev->flags)
2206 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2207 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2208 test_bit(R5_Insync, &dev->flags)) {
2209 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2210 {
2211 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2212 (unsigned long long)sh->sector, i);
2213 set_bit(R5_LOCKED, &dev->flags);
2214 set_bit(R5_Wantread, &dev->flags);
2215 locked++;
2216 } else {
2217 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2218 (unsigned long long)sh->sector, i);
2219 set_bit(STRIPE_DELAYED, &sh->state);
2220 set_bit(STRIPE_HANDLE, &sh->state);
2221 }
2222 }
2223 }
2224 /* now if nothing is locked, and if we have enough data, we can start a write request */
2225 if (locked == 0 && rcw == 0 &&
2226 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2227 if ( must_compute > 0 ) {
2228 /* We have failed blocks and need to compute them */
2229 switch ( failed ) {
2230 case 0: BUG();
2231 case 1: compute_block_1(sh, failed_num[0], 0); break;
2232 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2233 default: BUG(); /* This request should have been failed? */
2234 }
2235 }
2236
2237 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2238 compute_parity6(sh, RECONSTRUCT_WRITE);
2239 /* now every locked buffer is ready to be written */
2240 for (i=disks; i--;)
2241 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2242 PRINTK("Writing stripe %llu block %d\n",
2243 (unsigned long long)sh->sector, i);
2244 locked++;
2245 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2246 }
2247 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2248 set_bit(STRIPE_INSYNC, &sh->state);
2249
2250 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2251 atomic_dec(&conf->preread_active_stripes);
2252 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2253 md_wakeup_thread(conf->mddev->thread);
2254 }
2255 }
2256 }
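
The loop above reduces a reconstruct-write to two counters: blocks that must be read from healthy devices first (rcw) and blocks that can only be computed because their device is gone (must_compute). A hedged userspace sketch of that counting pass, with illustrative flag names standing in for the R5_* bits:

#include <stdio.h>

#define F_OVERWRITE 0x1  /* bio fully overwrites this block */
#define F_LOCKED    0x2  /* I/O already in flight */
#define F_UPTODATE  0x4  /* cached copy is valid */
#define F_INSYNC    0x8  /* backing device is healthy */

/* Count what a reconstruct-write still needs: blocks to read
 * (healthy, but stale in cache) vs. blocks to compute (device
 * failed, so reading is impossible). Parity slots are skipped. */
static void count_rcw(const unsigned *flags, int disks, int pd, int qd,
                      int *rcw, int *must_compute)
{
    *rcw = *must_compute = 0;
    for (int i = 0; i < disks; i++) {
        if (i == pd || i == qd)
            continue;
        if (flags[i] & (F_OVERWRITE | F_LOCKED | F_UPTODATE))
            continue;
        if (flags[i] & F_INSYNC)
            (*rcw)++;
        else
            (*must_compute)++;
    }
}

int main(void)
{
    /* 6 disks, P on 4, Q on 5: one stale-but-healthy, one failed */
    unsigned flags[6] = { F_OVERWRITE, F_INSYNC, 0, F_UPTODATE, 0, 0 };
    int rcw, mc;
    count_rcw(flags, 6, 4, 5, &rcw, &mc);
    printf("rcw=%d must_compute=%d\n", rcw, mc); /* rcw=1 must_compute=1 */
    return 0;
}
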
2257
2258 /* maybe we need to check and possibly fix the parity for this stripe
2259 * Any reads will already have been scheduled, so we just see if enough data
2260 * is available
2261 */
2262 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
2263 int update_p = 0, update_q = 0;
2264 struct r5dev *dev;
2265
2266 set_bit(STRIPE_HANDLE, &sh->state);
2267
2268 BUG_ON(failed>2);
2269 BUG_ON(uptodate < disks);
2270 /* Want to check and possibly repair P and Q.
2271 * However there could be one 'failed' device, in which
2272 * case we can only check one of them, possibly using the
2273 * other to generate missing data
2274 */
2275
2276 /* If !tmp_page, we cannot do the calculations,
2277 * but as we have set STRIPE_HANDLE, we will soon be called
2278	 * by handle_stripe with a tmp_page - just wait until then.
2279 */
2280 if (tmp_page) {
2281 if (failed == q_failed) {
2282 /* The only possible failed device holds 'Q', so it makes
2283	 * sense to check P (if anything else had failed, we would
2284 * have used P to recreate it).
2285 */
2286 compute_block_1(sh, pd_idx, 1);
2287 if (!page_is_zero(sh->dev[pd_idx].page)) {
2288 compute_block_1(sh,pd_idx,0);
2289 update_p = 1;
2290 }
2291 }
2292 if (!q_failed && failed < 2) {
2293 /* q is not failed, and we didn't use it to generate
2294 * anything, so it makes sense to check it
2295 */
2296 memcpy(page_address(tmp_page),
2297 page_address(sh->dev[qd_idx].page),
2298 STRIPE_SIZE);
2299 compute_parity6(sh, UPDATE_PARITY);
2300 if (memcmp(page_address(tmp_page),
2301 page_address(sh->dev[qd_idx].page),
2302 STRIPE_SIZE)!= 0) {
2303 clear_bit(STRIPE_INSYNC, &sh->state);
2304 update_q = 1;
2305 }
2306 }
2307 if (update_p || update_q) {
2308 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2309 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2310 /* don't try to repair!! */
2311 update_p = update_q = 0;
2312 }
2313
2314 /* now write out any block on a failed drive,
2315 * or P or Q if they need it
2316 */
2317
2318 if (failed == 2) {
2319 dev = &sh->dev[failed_num[1]];
2320 locked++;
2321 set_bit(R5_LOCKED, &dev->flags);
2322 set_bit(R5_Wantwrite, &dev->flags);
2323 }
2324 if (failed >= 1) {
2325 dev = &sh->dev[failed_num[0]];
2326 locked++;
2327 set_bit(R5_LOCKED, &dev->flags);
2328 set_bit(R5_Wantwrite, &dev->flags);
2329 }
2330
2331 if (update_p) {
2332 dev = &sh->dev[pd_idx];
2333	locked++;
2334 set_bit(R5_LOCKED, &dev->flags);
2335 set_bit(R5_Wantwrite, &dev->flags);
2336 }
2337 if (update_q) {
2338 dev = &sh->dev[qd_idx];
2339 locked++;
2340 set_bit(R5_LOCKED, &dev->flags);
2341 set_bit(R5_Wantwrite, &dev->flags);
2342 }
2343 clear_bit(STRIPE_DEGRADED, &sh->state);
2344
2345 set_bit(STRIPE_INSYNC, &sh->state);
2346 }
2347 }
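
The check path above verifies a redundancy block by recomputing it and comparing against what is stored (page_is_zero after folding in P, memcmp against tmp_page for Q). A hedged userspace sketch of the same compute-then-compare idea, using plain XOR parity as a stand-in:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define NBLK 4
#define BLKSZ 16

/* XOR all data blocks together into p[] */
static void xor_parity(uint8_t data[NBLK][BLKSZ], uint8_t p[BLKSZ])
{
    memset(p, 0, BLKSZ);
    for (int d = 0; d < NBLK; d++)
        for (int i = 0; i < BLKSZ; i++)
            p[i] ^= data[d][i];
}

/* Recompute parity and memcmp against the stored block -- the same
 * compute-then-compare pattern the check path uses for Q. */
static int parity_matches(uint8_t data[NBLK][BLKSZ], uint8_t stored[BLKSZ])
{
    uint8_t fresh[BLKSZ];
    xor_parity(data, fresh);
    return memcmp(fresh, stored, BLKSZ) == 0;
}

int main(void)
{
    uint8_t data[NBLK][BLKSZ] = { {1, 2, 3}, {4, 5, 6}, {7}, {9} };
    uint8_t p[BLKSZ];
    xor_parity(data, p);
    printf("clean: %d\n", parity_matches(data, p)); /* 1: in sync */
    p[0] ^= 0xff;                                   /* corrupt stored P */
    printf("stale: %d\n", parity_matches(data, p)); /* 0: needs repair */
    return 0;
}
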
2348
2349 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2350 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2351 clear_bit(STRIPE_SYNCING, &sh->state);
2352 }
2353
2354	/* If the failed drives just have a ReadError, then we might need
2355 * to progress the repair/check process
2356 */
2357 if (failed <= 2 && ! conf->mddev->ro)
2358 for (i=0; i<failed;i++) {
2359 dev = &sh->dev[failed_num[i]];
2360 if (test_bit(R5_ReadError, &dev->flags)
2361 && !test_bit(R5_LOCKED, &dev->flags)
2362 && test_bit(R5_UPTODATE, &dev->flags)
2363 ) {
2364 if (!test_bit(R5_ReWrite, &dev->flags)) {
2365 set_bit(R5_Wantwrite, &dev->flags);
2366 set_bit(R5_ReWrite, &dev->flags);
2367 set_bit(R5_LOCKED, &dev->flags);
2368 } else {
2369 /* let's read it back */
2370 set_bit(R5_Wantread, &dev->flags);
2371 set_bit(R5_LOCKED, &dev->flags);
2372 }
2373 }
2374 }
2375 spin_unlock(&sh->lock);
2376
2377 while ((bi=return_bi)) {
2378 int bytes = bi->bi_size;
2379
2380 return_bi = bi->bi_next;
2381 bi->bi_next = NULL;
2382 bi->bi_size = 0;
2383 bi->bi_end_io(bi, bytes, 0);
2384 }
2385 for (i=disks; i-- ;) {
2386 int rw;
2387 struct bio *bi;
2388 mdk_rdev_t *rdev;
2389 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
2390 rw = 1;
2391 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
2392 rw = 0;
2393 else
2394 continue;
2395
2396 bi = &sh->dev[i].req;
2397
2398 bi->bi_rw = rw;
2399 if (rw)
2400 bi->bi_end_io = raid5_end_write_request;
2401 else
2402 bi->bi_end_io = raid5_end_read_request;
2403
2404 rcu_read_lock();
2405 rdev = rcu_dereference(conf->disks[i].rdev);
2406 if (rdev && test_bit(Faulty, &rdev->flags))
2407 rdev = NULL;
2408 if (rdev)
2409 atomic_inc(&rdev->nr_pending);
2410 rcu_read_unlock();
2411
2412 if (rdev) {
2413 if (syncing)
2414 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2415
2416 bi->bi_bdev = rdev->bdev;
2417 PRINTK("for %llu schedule op %ld on disc %d\n",
2418 (unsigned long long)sh->sector, bi->bi_rw, i);
2419 atomic_inc(&sh->count);
2420 bi->bi_sector = sh->sector + rdev->data_offset;
2421 bi->bi_flags = 1 << BIO_UPTODATE;
2422 bi->bi_vcnt = 1;
2423 bi->bi_max_vecs = 1;
2424 bi->bi_idx = 0;
2425 bi->bi_io_vec = &sh->dev[i].vec;
2426 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
2427 bi->bi_io_vec[0].bv_offset = 0;
2428 bi->bi_size = STRIPE_SIZE;
2429 bi->bi_next = NULL;
2430 if (rw == WRITE &&
2431 test_bit(R5_ReWrite, &sh->dev[i].flags))
2432 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2433 generic_make_request(bi);
2434 } else {
2435 if (rw == 1)
2436 set_bit(STRIPE_DEGRADED, &sh->state);
2437 PRINTK("skip op %ld on disc %d for sector %llu\n",
2438 bi->bi_rw, i, (unsigned long long)sh->sector);
2439 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2440 set_bit(STRIPE_HANDLE, &sh->state);
2441 }
2442 }
2443}
2444
2445static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
2446{
2447 if (sh->raid_conf->level == 6)
2448 handle_stripe6(sh, tmp_page);
2449 else
2450 handle_stripe5(sh);
2451}
2452
2453
2454
1618static void raid5_activate_delayed(raid5_conf_t *conf) 2455static void raid5_activate_delayed(raid5_conf_t *conf)
1619{ 2456{
1620 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 2457 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -1753,7 +2590,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1753 2590
1754 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 2591 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1755 DEFINE_WAIT(w); 2592 DEFINE_WAIT(w);
1756 int disks; 2593 int disks, data_disks;
1757 2594
1758 retry: 2595 retry:
1759 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 2596 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -1781,7 +2618,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
1781 } 2618 }
1782 spin_unlock_irq(&conf->device_lock); 2619 spin_unlock_irq(&conf->device_lock);
1783 } 2620 }
1784 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, 2621 data_disks = disks - conf->max_degraded;
2622
2623 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
1785 &dd_idx, &pd_idx, conf); 2624 &dd_idx, &pd_idx, conf);
1786 PRINTK("raid5: make_request, sector %llu logical %llu\n", 2625 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1787 (unsigned long long)new_sector, 2626 (unsigned long long)new_sector,
@@ -1833,7 +2672,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1833 } 2672 }
1834 finish_wait(&conf->wait_for_overlap, &w); 2673 finish_wait(&conf->wait_for_overlap, &w);
1835 raid5_plug_device(conf); 2674 raid5_plug_device(conf);
1836 handle_stripe(sh); 2675 handle_stripe(sh, NULL);
1837 release_stripe(sh); 2676 release_stripe(sh);
1838 } else { 2677 } else {
1839 /* cannot get stripe for read-ahead, just give-up */ 2678 /* cannot get stripe for read-ahead, just give-up */
@@ -1849,7 +2688,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1849 if (remaining == 0) { 2688 if (remaining == 0) {
1850 int bytes = bi->bi_size; 2689 int bytes = bi->bi_size;
1851 2690
1852 if ( bio_data_dir(bi) == WRITE ) 2691 if ( rw == WRITE )
1853 md_write_end(mddev); 2692 md_write_end(mddev);
1854 bi->bi_size = 0; 2693 bi->bi_size = 0;
1855 bi->bi_end_io(bi, bytes, 0); 2694 bi->bi_end_io(bi, bytes, 0);
@@ -1857,17 +2696,142 @@ static int make_request(request_queue_t *q, struct bio * bi)
1857 return 0; 2696 return 0;
1858} 2697}
1859 2698
1860/* FIXME go_faster isn't used */ 2699static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
1861static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1862{ 2700{
2701 /* reshaping is quite different to recovery/resync so it is
2702 * handled quite separately ... here.
2703 *
2704 * On each call to sync_request, we gather one chunk worth of
2705 * destination stripes and flag them as expanding.
2706 * Then we find all the source stripes and request reads.
2707 * As the reads complete, handle_stripe will copy the data
2708 * into the destination stripe and release that stripe.
2709 */
1863 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 2710 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1864 struct stripe_head *sh; 2711 struct stripe_head *sh;
1865 int pd_idx; 2712 int pd_idx;
1866 sector_t first_sector, last_sector; 2713 sector_t first_sector, last_sector;
2714 int raid_disks;
2715 int data_disks;
2716 int i;
2717 int dd_idx;
2718 sector_t writepos, safepos, gap;
2719
2720 if (sector_nr == 0 &&
2721 conf->expand_progress != 0) {
2722 /* restarting in the middle, skip the initial sectors */
2723 sector_nr = conf->expand_progress;
2724 sector_div(sector_nr, conf->raid_disks-1);
2725 *skipped = 1;
2726 return sector_nr;
2727 }
2728
2729 /* we update the metadata when there is more than 3Meg
2730 * in the block range (that is rather arbitrary, should
2731 * probably be time based) or when the data about to be
2732	 * copied would overwrite the source of the data at
2733	 * the front of the range,
2734	 * i.e. when the stripe one chunk ahead of expand_progress, mapped
2735	 * with the new layout, lands past where expand_lo maps in the old layout.
2736 */
2737 writepos = conf->expand_progress +
2738 conf->chunk_size/512*(conf->raid_disks-1);
2739 sector_div(writepos, conf->raid_disks-1);
2740 safepos = conf->expand_lo;
2741 sector_div(safepos, conf->previous_raid_disks-1);
2742 gap = conf->expand_progress - conf->expand_lo;
2743
2744 if (writepos >= safepos ||
2745 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
2746 /* Cannot proceed until we've updated the superblock... */
2747 wait_event(conf->wait_for_overlap,
2748 atomic_read(&conf->reshape_stripes)==0);
2749 mddev->reshape_position = conf->expand_progress;
2750 mddev->sb_dirty = 1;
2751 md_wakeup_thread(mddev->thread);
2752 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
2753 kthread_should_stop());
2754 spin_lock_irq(&conf->device_lock);
2755 conf->expand_lo = mddev->reshape_position;
2756 spin_unlock_irq(&conf->device_lock);
2757 wake_up(&conf->wait_for_overlap);
2758 }
2759
2760 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
2761 int j;
2762 int skipped = 0;
2763 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
2764 sh = get_active_stripe(conf, sector_nr+i,
2765 conf->raid_disks, pd_idx, 0);
2766 set_bit(STRIPE_EXPANDING, &sh->state);
2767 atomic_inc(&conf->reshape_stripes);
2768 /* If any of this stripe is beyond the end of the old
2769 * array, then we need to zero those blocks
2770 */
2771 for (j=sh->disks; j--;) {
2772 sector_t s;
2773 if (j == sh->pd_idx)
2774 continue;
2775 s = compute_blocknr(sh, j);
2776 if (s < (mddev->array_size<<1)) {
2777 skipped = 1;
2778 continue;
2779 }
2780 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
2781 set_bit(R5_Expanded, &sh->dev[j].flags);
2782 set_bit(R5_UPTODATE, &sh->dev[j].flags);
2783 }
2784 if (!skipped) {
2785 set_bit(STRIPE_EXPAND_READY, &sh->state);
2786 set_bit(STRIPE_HANDLE, &sh->state);
2787 }
2788 release_stripe(sh);
2789 }
2790 spin_lock_irq(&conf->device_lock);
2791 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
2792 spin_unlock_irq(&conf->device_lock);
2793	/* OK, those stripes are ready. We can start scheduling
2794 * reads on the source stripes.
2795 * The source stripes are determined by mapping the first and last
2796 * block on the destination stripes.
2797 */
2798 raid_disks = conf->previous_raid_disks;
2799 data_disks = raid_disks - 1;
2800 first_sector =
2801 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
2802 raid_disks, data_disks,
2803 &dd_idx, &pd_idx, conf);
2804 last_sector =
2805 raid5_compute_sector((sector_nr+conf->chunk_size/512)
2806 *(conf->raid_disks-1) -1,
2807 raid_disks, data_disks,
2808 &dd_idx, &pd_idx, conf);
2809 if (last_sector >= (mddev->size<<1))
2810 last_sector = (mddev->size<<1)-1;
2811 while (first_sector <= last_sector) {
2812 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
2813 sh = get_active_stripe(conf, first_sector,
2814 conf->previous_raid_disks, pd_idx, 0);
2815 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2816 set_bit(STRIPE_HANDLE, &sh->state);
2817 release_stripe(sh);
2818 first_sector += STRIPE_SECTORS;
2819 }
2820 return conf->chunk_size>>9;
2821}
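
A minimal userspace sketch of the checkpoint test above, assuming a growing array; all names and the ~3MB constant mirror the code but are illustrative, not kernel API:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Mirror of the two checkpoint triggers: update the superblock when
 * the new-layout write frontier would reach data the old layout still
 * owns, or when more than ~3MB per data disk has been copied since
 * the last recorded checkpoint. */
static int must_checkpoint(sector_t expand_progress, sector_t expand_lo,
                           int new_data_disks, int old_data_disks,
                           sector_t chunk_sectors)
{
    sector_t writepos = (expand_progress + chunk_sectors * new_data_disks)
                        / new_data_disks;
    sector_t safepos = expand_lo / old_data_disks;
    sector_t gap = expand_progress - expand_lo;

    return writepos >= safepos ||
           gap > (sector_t)new_data_disks * 3000 * 2 /* ~3MB per disk */;
}

int main(void)
{
    /* growing 4 -> 5 devices (3 -> 4 data disks), 64KB chunks */
    printf("%d\n", must_checkpoint(10000, 9000, 4, 3, 128));   /* 0: safe */
    printf("%d\n", must_checkpoint(10000, 2000, 4, 3, 128));   /* 1: frontier caught up */
    printf("%d\n", must_checkpoint(115000, 90000, 4, 3, 128)); /* 1: gap too large */
    return 0;
}
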
2822
2823/* FIXME go_faster isn't used */
2824static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2825{
2826 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
2827 struct stripe_head *sh;
2828 int pd_idx;
1867 int raid_disks = conf->raid_disks; 2829 int raid_disks = conf->raid_disks;
1868 int data_disks = raid_disks-1; 2830 int data_disks = raid_disks - conf->max_degraded;
1869 sector_t max_sector = mddev->size << 1; 2831 sector_t max_sector = mddev->size << 1;
1870 int sync_blocks; 2832 int sync_blocks;
2833 int still_degraded = 0;
2834 int i;
1871 2835
1872 if (sector_nr >= max_sector) { 2836 if (sector_nr >= max_sector) {
1873 /* just being told to finish up .. nothing much to do */ 2837 /* just being told to finish up .. nothing much to do */
@@ -1880,134 +2844,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1880 if (mddev->curr_resync < max_sector) /* aborted */ 2844 if (mddev->curr_resync < max_sector) /* aborted */
1881 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2845 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1882 &sync_blocks, 1); 2846 &sync_blocks, 1);
1883 else /* compelted sync */ 2847 else /* completed sync */
1884 conf->fullsync = 0; 2848 conf->fullsync = 0;
1885 bitmap_close_sync(mddev->bitmap); 2849 bitmap_close_sync(mddev->bitmap);
1886 2850
1887 return 0; 2851 return 0;
1888 } 2852 }
1889 2853
1890 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2854 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1891 /* reshaping is quite different to recovery/resync so it is 2855 return reshape_request(mddev, sector_nr, skipped);
1892 * handled quite separately ... here. 2856
1893	 *	2857	/* if there are too many failed drives and we are trying
1894 * On each call to sync_request, we gather one chunk worth of
1895 * destination stripes and flag them as expanding.
1896 * Then we find all the source stripes and request reads.
1897 * As the reads complete, handle_stripe will copy the data
1898 * into the destination stripe and release that stripe.
1899 */
1900 int i;
1901 int dd_idx;
1902 sector_t writepos, safepos, gap;
1903
1904 if (sector_nr == 0 &&
1905 conf->expand_progress != 0) {
1906 /* restarting in the middle, skip the initial sectors */
1907 sector_nr = conf->expand_progress;
1908 sector_div(sector_nr, conf->raid_disks-1);
1909 *skipped = 1;
1910 return sector_nr;
1911 }
1912
1913 /* we update the metadata when there is more than 3Meg
1914 * in the block range (that is rather arbitrary, should
1915 * probably be time based) or when the data about to be
1916 * copied would over-write the source of the data at
1917 * the front of the range.
1918 * i.e. one new_stripe forward from expand_progress new_maps
1919 * to after where expand_lo old_maps to
1920 */
1921 writepos = conf->expand_progress +
1922 conf->chunk_size/512*(conf->raid_disks-1);
1923 sector_div(writepos, conf->raid_disks-1);
1924 safepos = conf->expand_lo;
1925 sector_div(safepos, conf->previous_raid_disks-1);
1926 gap = conf->expand_progress - conf->expand_lo;
1927
1928 if (writepos >= safepos ||
1929 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
1930 /* Cannot proceed until we've updated the superblock... */
1931 wait_event(conf->wait_for_overlap,
1932 atomic_read(&conf->reshape_stripes)==0);
1933 mddev->reshape_position = conf->expand_progress;
1934 mddev->sb_dirty = 1;
1935 md_wakeup_thread(mddev->thread);
1936 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
1937 kthread_should_stop());
1938 spin_lock_irq(&conf->device_lock);
1939 conf->expand_lo = mddev->reshape_position;
1940 spin_unlock_irq(&conf->device_lock);
1941 wake_up(&conf->wait_for_overlap);
1942 }
1943
1944 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
1945 int j;
1946 int skipped = 0;
1947 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
1948 sh = get_active_stripe(conf, sector_nr+i,
1949 conf->raid_disks, pd_idx, 0);
1950 set_bit(STRIPE_EXPANDING, &sh->state);
1951 atomic_inc(&conf->reshape_stripes);
1952 /* If any of this stripe is beyond the end of the old
1953 * array, then we need to zero those blocks
1954 */
1955 for (j=sh->disks; j--;) {
1956 sector_t s;
1957 if (j == sh->pd_idx)
1958 continue;
1959 s = compute_blocknr(sh, j);
1960 if (s < (mddev->array_size<<1)) {
1961 skipped = 1;
1962 continue;
1963 }
1964 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
1965 set_bit(R5_Expanded, &sh->dev[j].flags);
1966 set_bit(R5_UPTODATE, &sh->dev[j].flags);
1967 }
1968 if (!skipped) {
1969 set_bit(STRIPE_EXPAND_READY, &sh->state);
1970 set_bit(STRIPE_HANDLE, &sh->state);
1971 }
1972 release_stripe(sh);
1973 }
1974 spin_lock_irq(&conf->device_lock);
1975 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
1976 spin_unlock_irq(&conf->device_lock);
1977 /* Ok, those stripe are ready. We can start scheduling
1978 * reads on the source stripes.
1979 * The source stripes are determined by mapping the first and last
1980 * block on the destination stripes.
1981 */
1982 raid_disks = conf->previous_raid_disks;
1983 data_disks = raid_disks - 1;
1984 first_sector =
1985 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
1986 raid_disks, data_disks,
1987 &dd_idx, &pd_idx, conf);
1988 last_sector =
1989 raid5_compute_sector((sector_nr+conf->chunk_size/512)
1990 *(conf->raid_disks-1) -1,
1991 raid_disks, data_disks,
1992 &dd_idx, &pd_idx, conf);
1993 if (last_sector >= (mddev->size<<1))
1994 last_sector = (mddev->size<<1)-1;
1995 while (first_sector <= last_sector) {
1996 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
1997 sh = get_active_stripe(conf, first_sector,
1998 conf->previous_raid_disks, pd_idx, 0);
1999 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2000 set_bit(STRIPE_HANDLE, &sh->state);
2001 release_stripe(sh);
2002 first_sector += STRIPE_SECTORS;
2003 }
2004 return conf->chunk_size>>9;
2005 }
2006 /* if there is 1 or more failed drives and we are trying
2007 * to resync, then assert that we are finished, because there is 2858 * to resync, then assert that we are finished, because there is
2008 * nothing we can do. 2859 * nothing we can do.
2009 */ 2860 */
2010 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2861 if (mddev->degraded >= conf->max_degraded &&
2862 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2011 sector_t rv = (mddev->size << 1) - sector_nr; 2863 sector_t rv = (mddev->size << 1) - sector_nr;
2012 *skipped = 1; 2864 *skipped = 1;
2013 return rv; 2865 return rv;
@@ -2026,17 +2878,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2026 if (sh == NULL) { 2878 if (sh == NULL) {
2027 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 2879 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
2028 /* make sure we don't swamp the stripe cache if someone else 2880 /* make sure we don't swamp the stripe cache if someone else
2029 * is trying to get access 2881 * is trying to get access
2030 */ 2882 */
2031 schedule_timeout_uninterruptible(1); 2883 schedule_timeout_uninterruptible(1);
2032 } 2884 }
2033 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); 2885 /* Need to check if array will still be degraded after recovery/resync
2034	spin_lock(&sh->lock);	2886	 * We don't need to check the 'failed' flag because, when it is set,
2887 * recovery aborts.
2888 */
2889 for (i=0; i<mddev->raid_disks; i++)
2890 if (conf->disks[i].rdev == NULL)
2891 still_degraded = 1;
2892
2893 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
2894
2895 spin_lock(&sh->lock);
2035 set_bit(STRIPE_SYNCING, &sh->state); 2896 set_bit(STRIPE_SYNCING, &sh->state);
2036 clear_bit(STRIPE_INSYNC, &sh->state); 2897 clear_bit(STRIPE_INSYNC, &sh->state);
2037 spin_unlock(&sh->lock); 2898 spin_unlock(&sh->lock);
2038 2899
2039 handle_stripe(sh); 2900 handle_stripe(sh, NULL);
2040 release_stripe(sh); 2901 release_stripe(sh);
2041 2902
2042 return STRIPE_SECTORS; 2903 return STRIPE_SECTORS;
@@ -2091,7 +2952,7 @@ static void raid5d (mddev_t *mddev)
2091 spin_unlock_irq(&conf->device_lock); 2952 spin_unlock_irq(&conf->device_lock);
2092 2953
2093 handled++; 2954 handled++;
2094 handle_stripe(sh); 2955 handle_stripe(sh, conf->spare_page);
2095 release_stripe(sh); 2956 release_stripe(sh);
2096 2957
2097 spin_lock_irq(&conf->device_lock); 2958 spin_lock_irq(&conf->device_lock);
@@ -2181,8 +3042,8 @@ static int run(mddev_t *mddev)
2181 struct disk_info *disk; 3042 struct disk_info *disk;
2182 struct list_head *tmp; 3043 struct list_head *tmp;
2183 3044
2184 if (mddev->level != 5 && mddev->level != 4) { 3045 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
2185 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", 3046 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
2186 mdname(mddev), mddev->level); 3047 mdname(mddev), mddev->level);
2187 return -EIO; 3048 return -EIO;
2188 } 3049 }
@@ -2251,6 +3112,11 @@ static int run(mddev_t *mddev)
2251 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 3112 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2252 goto abort; 3113 goto abort;
2253 3114
3115 if (mddev->level == 6) {
3116 conf->spare_page = alloc_page(GFP_KERNEL);
3117 if (!conf->spare_page)
3118 goto abort;
3119 }
2254 spin_lock_init(&conf->device_lock); 3120 spin_lock_init(&conf->device_lock);
2255 init_waitqueue_head(&conf->wait_for_stripe); 3121 init_waitqueue_head(&conf->wait_for_stripe);
2256 init_waitqueue_head(&conf->wait_for_overlap); 3122 init_waitqueue_head(&conf->wait_for_overlap);
@@ -2282,12 +3148,16 @@ static int run(mddev_t *mddev)
2282 } 3148 }
2283 3149
2284 /* 3150 /*
2285 * 0 for a fully functional array, 1 for a degraded array. 3151 * 0 for a fully functional array, 1 or 2 for a degraded array.
2286 */ 3152 */
2287 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; 3153 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2288 conf->mddev = mddev; 3154 conf->mddev = mddev;
2289 conf->chunk_size = mddev->chunk_size; 3155 conf->chunk_size = mddev->chunk_size;
2290 conf->level = mddev->level; 3156 conf->level = mddev->level;
3157 if (conf->level == 6)
3158 conf->max_degraded = 2;
3159 else
3160 conf->max_degraded = 1;
2291 conf->algorithm = mddev->layout; 3161 conf->algorithm = mddev->layout;
2292 conf->max_nr_stripes = NR_STRIPES; 3162 conf->max_nr_stripes = NR_STRIPES;
2293 conf->expand_progress = mddev->reshape_position; 3163 conf->expand_progress = mddev->reshape_position;
@@ -2296,6 +3166,11 @@ static int run(mddev_t *mddev)
2296 mddev->size &= ~(mddev->chunk_size/1024 -1); 3166 mddev->size &= ~(mddev->chunk_size/1024 -1);
2297 mddev->resync_max_sectors = mddev->size << 1; 3167 mddev->resync_max_sectors = mddev->size << 1;
2298 3168
3169 if (conf->level == 6 && conf->raid_disks < 4) {
3170 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
3171 mdname(mddev), conf->raid_disks);
3172 goto abort;
3173 }
2299 if (!conf->chunk_size || conf->chunk_size % 4) { 3174 if (!conf->chunk_size || conf->chunk_size % 4) {
2300 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 3175 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
2301 conf->chunk_size, mdname(mddev)); 3176 conf->chunk_size, mdname(mddev));
@@ -2307,14 +3182,14 @@ static int run(mddev_t *mddev)
2307 conf->algorithm, mdname(mddev)); 3182 conf->algorithm, mdname(mddev));
2308 goto abort; 3183 goto abort;
2309 } 3184 }
2310 if (mddev->degraded > 1) { 3185 if (mddev->degraded > conf->max_degraded) {
2311 printk(KERN_ERR "raid5: not enough operational devices for %s" 3186 printk(KERN_ERR "raid5: not enough operational devices for %s"
2312 " (%d/%d failed)\n", 3187 " (%d/%d failed)\n",
2313 mdname(mddev), conf->failed_disks, conf->raid_disks); 3188 mdname(mddev), conf->failed_disks, conf->raid_disks);
2314 goto abort; 3189 goto abort;
2315 } 3190 }
2316 3191
2317 if (mddev->degraded == 1 && 3192 if (mddev->degraded > 0 &&
2318 mddev->recovery_cp != MaxSector) { 3193 mddev->recovery_cp != MaxSector) {
2319 if (mddev->ok_start_degraded) 3194 if (mddev->ok_start_degraded)
2320 printk(KERN_WARNING 3195 printk(KERN_WARNING
@@ -2379,11 +3254,12 @@ static int run(mddev_t *mddev)
2379 } 3254 }
2380 3255
2381 /* read-ahead size must cover two whole stripes, which is 3256 /* read-ahead size must cover two whole stripes, which is
2382	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices	3257	 * 2 * (datadisks) * chunksize, where 'datadisks' is raid devices minus parity
2383 */ 3258 */
2384 { 3259 {
2385 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 3260 int data_disks = conf->previous_raid_disks - conf->max_degraded;
2386 / PAGE_SIZE; 3261 int stripe = data_disks *
3262 (mddev->chunk_size / PAGE_SIZE);
2387 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3263 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2388 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3264 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2389 } 3265 }
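
A worked example of the degraded-aware sizing above, with numbers chosen purely for illustration:

#include <stdio.h>

int main(void)
{
    /* RAID-6 over 6 devices with 64KB chunks and 4KB pages */
    int raid_disks = 6, max_degraded = 2;
    int chunk_size = 64 * 1024, page_size = 4096;

    int data_disks = raid_disks - max_degraded;          /* 4 */
    int stripe = data_disks * (chunk_size / page_size);  /* 4 * 16 = 64 pages */

    /* two whole stripes: 128 pages, i.e. 512KB of read-ahead */
    printf("ra_pages >= %d\n", 2 * stripe);
    return 0;
}
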
@@ -2393,12 +3269,14 @@ static int run(mddev_t *mddev)
2393 3269
2394 mddev->queue->unplug_fn = raid5_unplug_device; 3270 mddev->queue->unplug_fn = raid5_unplug_device;
2395 mddev->queue->issue_flush_fn = raid5_issue_flush; 3271 mddev->queue->issue_flush_fn = raid5_issue_flush;
2396 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); 3272 mddev->array_size = mddev->size * (conf->previous_raid_disks -
3273 conf->max_degraded);
2397 3274
2398 return 0; 3275 return 0;
2399abort: 3276abort:
2400 if (conf) { 3277 if (conf) {
2401 print_raid5_conf(conf); 3278 print_raid5_conf(conf);
3279 safe_put_page(conf->spare_page);
2402 kfree(conf->disks); 3280 kfree(conf->disks);
2403 kfree(conf->stripe_hashtbl); 3281 kfree(conf->stripe_hashtbl);
2404 kfree(conf); 3282 kfree(conf);
@@ -2427,23 +3305,23 @@ static int stop(mddev_t *mddev)
2427} 3305}
2428 3306
2429#if RAID5_DEBUG 3307#if RAID5_DEBUG
2430static void print_sh (struct stripe_head *sh) 3308static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2431{ 3309{
2432 int i; 3310 int i;
2433 3311
2434 printk("sh %llu, pd_idx %d, state %ld.\n", 3312 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2435 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 3313 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2436 printk("sh %llu, count %d.\n", 3314 seq_printf(seq, "sh %llu, count %d.\n",
2437 (unsigned long long)sh->sector, atomic_read(&sh->count)); 3315 (unsigned long long)sh->sector, atomic_read(&sh->count));
2438 printk("sh %llu, ", (unsigned long long)sh->sector); 3316 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2439 for (i = 0; i < sh->disks; i++) { 3317 for (i = 0; i < sh->disks; i++) {
2440 printk("(cache%d: %p %ld) ", 3318 seq_printf(seq, "(cache%d: %p %ld) ",
2441 i, sh->dev[i].page, sh->dev[i].flags); 3319 i, sh->dev[i].page, sh->dev[i].flags);
2442 } 3320 }
2443 printk("\n"); 3321 seq_printf(seq, "\n");
2444} 3322}
2445 3323
2446static void printall (raid5_conf_t *conf) 3324static void printall (struct seq_file *seq, raid5_conf_t *conf)
2447{ 3325{
2448 struct stripe_head *sh; 3326 struct stripe_head *sh;
2449 struct hlist_node *hn; 3327 struct hlist_node *hn;
@@ -2454,7 +3332,7 @@ static void printall (raid5_conf_t *conf)
2454 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 3332 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2455 if (sh->raid_conf != conf) 3333 if (sh->raid_conf != conf)
2456 continue; 3334 continue;
2457 print_sh(sh); 3335 print_sh(seq, sh);
2458 } 3336 }
2459 } 3337 }
2460 spin_unlock_irq(&conf->device_lock); 3338 spin_unlock_irq(&conf->device_lock);
@@ -2474,9 +3352,8 @@ static void status (struct seq_file *seq, mddev_t *mddev)
2474 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 3352 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2475 seq_printf (seq, "]"); 3353 seq_printf (seq, "]");
2476#if RAID5_DEBUG 3354#if RAID5_DEBUG
2477#define D(x) \ 3355 seq_printf (seq, "\n");
2478 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) 3356 printall(seq, conf);
2479 printall(conf);
2480#endif 3357#endif
2481} 3358}
2482 3359
@@ -2560,14 +3437,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2560 int disk; 3437 int disk;
2561 struct disk_info *p; 3438 struct disk_info *p;
2562 3439
2563 if (mddev->degraded > 1) 3440 if (mddev->degraded > conf->max_degraded)
2564 /* no point adding a device */ 3441 /* no point adding a device */
2565 return 0; 3442 return 0;
2566 3443
2567 /* 3444 /*
2568 * find the disk ... 3445 * find the disk ... but prefer rdev->saved_raid_disk
3446 * if possible.
2569 */ 3447 */
2570 for (disk=0; disk < conf->raid_disks; disk++) 3448 if (rdev->saved_raid_disk >= 0 &&
3449 conf->disks[rdev->saved_raid_disk].rdev == NULL)
3450 disk = rdev->saved_raid_disk;
3451 else
3452 disk = 0;
3453 for ( ; disk < conf->raid_disks; disk++)
2571 if ((p=conf->disks + disk)->rdev == NULL) { 3454 if ((p=conf->disks + disk)->rdev == NULL) {
2572 clear_bit(In_sync, &rdev->flags); 3455 clear_bit(In_sync, &rdev->flags);
2573 rdev->raid_disk = disk; 3456 rdev->raid_disk = disk;
@@ -2590,8 +3473,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2590 * any io in the removed space completes, but it hardly seems 3473 * any io in the removed space completes, but it hardly seems
2591 * worth it. 3474 * worth it.
2592 */ 3475 */
3476 raid5_conf_t *conf = mddev_to_conf(mddev);
3477
2593 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 3478 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2594 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; 3479 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
2595 set_capacity(mddev->gendisk, mddev->array_size << 1); 3480 set_capacity(mddev->gendisk, mddev->array_size << 1);
2596 mddev->changed = 1; 3481 mddev->changed = 1;
2597 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 3482 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
@@ -2680,6 +3565,7 @@ static int raid5_start_reshape(mddev_t *mddev)
2680 set_bit(In_sync, &rdev->flags); 3565 set_bit(In_sync, &rdev->flags);
2681 conf->working_disks++; 3566 conf->working_disks++;
2682 added_devices++; 3567 added_devices++;
3568 rdev->recovery_offset = 0;
2683 sprintf(nm, "rd%d", rdev->raid_disk); 3569 sprintf(nm, "rd%d", rdev->raid_disk);
2684 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3570 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2685 } else 3571 } else
@@ -2731,6 +3617,17 @@ static void end_reshape(raid5_conf_t *conf)
2731 conf->expand_progress = MaxSector; 3617 conf->expand_progress = MaxSector;
2732 spin_unlock_irq(&conf->device_lock); 3618 spin_unlock_irq(&conf->device_lock);
2733 conf->mddev->reshape_position = MaxSector; 3619 conf->mddev->reshape_position = MaxSector;
3620
3621 /* read-ahead size must cover two whole stripes, which is
3622 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
3623 */
3624 {
3625 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3626 int stripe = data_disks *
3627 (conf->mddev->chunk_size / PAGE_SIZE);
3628 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3629 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3630 }
2734 } 3631 }
2735} 3632}
2736 3633
@@ -2762,6 +3659,23 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2762 } 3659 }
2763} 3660}
2764 3661
3662static struct mdk_personality raid6_personality =
3663{
3664 .name = "raid6",
3665 .level = 6,
3666 .owner = THIS_MODULE,
3667 .make_request = make_request,
3668 .run = run,
3669 .stop = stop,
3670 .status = status,
3671 .error_handler = error,
3672 .hot_add_disk = raid5_add_disk,
3673 .hot_remove_disk= raid5_remove_disk,
3674 .spare_active = raid5_spare_active,
3675 .sync_request = sync_request,
3676 .resize = raid5_resize,
3677 .quiesce = raid5_quiesce,
3678};
2765static struct mdk_personality raid5_personality = 3679static struct mdk_personality raid5_personality =
2766{ 3680{
2767 .name = "raid5", 3681 .name = "raid5",
@@ -2804,6 +3718,12 @@ static struct mdk_personality raid4_personality =
2804 3718
2805static int __init raid5_init(void) 3719static int __init raid5_init(void)
2806{ 3720{
3721 int e;
3722
3723 e = raid6_select_algo();
3724 if ( e )
3725 return e;
3726 register_md_personality(&raid6_personality);
2807 register_md_personality(&raid5_personality); 3727 register_md_personality(&raid5_personality);
2808 register_md_personality(&raid4_personality); 3728 register_md_personality(&raid4_personality);
2809 return 0; 3729 return 0;
@@ -2811,6 +3731,7 @@ static int __init raid5_init(void)
2811 3731
2812static void raid5_exit(void) 3732static void raid5_exit(void)
2813{ 3733{
3734 unregister_md_personality(&raid6_personality);
2814 unregister_md_personality(&raid5_personality); 3735 unregister_md_personality(&raid5_personality);
2815 unregister_md_personality(&raid4_personality); 3736 unregister_md_personality(&raid4_personality);
2816} 3737}
@@ -2823,3 +3744,10 @@ MODULE_ALIAS("md-raid5");
2823MODULE_ALIAS("md-raid4"); 3744MODULE_ALIAS("md-raid4");
2824MODULE_ALIAS("md-level-5"); 3745MODULE_ALIAS("md-level-5");
2825MODULE_ALIAS("md-level-4"); 3746MODULE_ALIAS("md-level-4");
3747MODULE_ALIAS("md-personality-8"); /* RAID6 */
3748MODULE_ALIAS("md-raid6");
3749MODULE_ALIAS("md-level-6");
3750
3751/* This used to be two separate modules, they were: */
3752MODULE_ALIAS("raid5");
3753MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
deleted file mode 100644
index bc69355e0100..000000000000
--- a/drivers/md/raid6main.c
+++ /dev/null
@@ -1,2427 +0,0 @@
1/*
2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
9 *
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28#include <linux/bitops.h>
29#include <asm/atomic.h>
30#include "raid6.h"
31
32#include <linux/raid/bitmap.h>
33
34/*
35 * Stripe cache
36 */
37
38#define NR_STRIPES 256
39#define STRIPE_SIZE PAGE_SIZE
40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1
43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_MASK (NR_HASH - 1)
45
46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
47
48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer
53 * be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
56 */
57#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
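
A hedged userspace model of that rule (toy_bio is an illustrative stand-in for struct bio): the walk stops at the first bio whose extent reaches past the current stripe, since its bi_next may already belong to another device:

#include <stdio.h>
#include <stddef.h>

#define STRIPE_SECTORS 8

struct toy_bio {
    unsigned long long bi_sector;   /* start sector */
    unsigned int bi_size;           /* bytes */
    struct toy_bio *bi_next;
};

/* Same guard as r5_next_bio: never follow bi_next once a bio
 * extends past this stripe+device. */
static struct toy_bio *toy_next_bio(struct toy_bio *bio,
                                    unsigned long long sect)
{
    if (bio->bi_sector + (bio->bi_size >> 9) < sect + STRIPE_SECTORS)
        return bio->bi_next;
    return NULL;
}

int main(void)
{
    struct toy_bio b2 = { 12, 4 * 512, NULL };  /* ends past the stripe */
    struct toy_bio b1 = { 8, 2 * 512, &b2 };
    unsigned long long dev_sector = 8;
    /* visits the bio at 8 and the bio at 12, then stops */
    for (struct toy_bio *b = &b1; b; b = toy_next_bio(b, dev_sector))
        printf("bio at %llu\n", b->bi_sector);
    return 0;
}
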
58/*
59 * The following can be used to debug the driver
60 */
61#define RAID6_DEBUG 0 /* Extremely verbose printk */
62#define RAID6_PARANOIA 1 /* Check spinlocks */
63#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64#if RAID6_PARANOIA && defined(CONFIG_SMP)
65# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
66#else
67# define CHECK_DEVLOCK()
68#endif
69
70#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
71#if RAID6_DEBUG
72#undef inline
73#undef __inline__
74#define inline
75#define __inline__
76#endif
77
78#if !RAID6_USE_EMPTY_ZERO_PAGE
79/* In .bss so it's zeroed */
80const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
81#endif
82
83static inline int raid6_next_disk(int disk, int raid_disks)
84{
85 disk++;
86 return (disk < raid_disks) ? disk : 0;
87}
88
89static void print_raid6_conf (raid6_conf_t *conf);
90
91static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
92{
93 if (atomic_dec_and_test(&sh->count)) {
94 BUG_ON(!list_empty(&sh->lru));
95 BUG_ON(atomic_read(&conf->active_stripes)==0);
96 if (test_bit(STRIPE_HANDLE, &sh->state)) {
97 if (test_bit(STRIPE_DELAYED, &sh->state))
98 list_add_tail(&sh->lru, &conf->delayed_list);
99 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
100 conf->seq_write == sh->bm_seq)
101 list_add_tail(&sh->lru, &conf->bitmap_list);
102 else {
103 clear_bit(STRIPE_BIT_DELAY, &sh->state);
104 list_add_tail(&sh->lru, &conf->handle_list);
105 }
106 md_wakeup_thread(conf->mddev->thread);
107 } else {
108 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
109 atomic_dec(&conf->preread_active_stripes);
110 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
111 md_wakeup_thread(conf->mddev->thread);
112 }
113 list_add_tail(&sh->lru, &conf->inactive_list);
114 atomic_dec(&conf->active_stripes);
115 if (!conf->inactive_blocked ||
116 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
117 wake_up(&conf->wait_for_stripe);
118 }
119 }
120}
121static void release_stripe(struct stripe_head *sh)
122{
123 raid6_conf_t *conf = sh->raid_conf;
124 unsigned long flags;
125
126 spin_lock_irqsave(&conf->device_lock, flags);
127 __release_stripe(conf, sh);
128 spin_unlock_irqrestore(&conf->device_lock, flags);
129}
130
131static inline void remove_hash(struct stripe_head *sh)
132{
133 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
134
135 hlist_del_init(&sh->hash);
136}
137
138static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
139{
140 struct hlist_head *hp = stripe_hash(conf, sh->sector);
141
142 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
143
144 CHECK_DEVLOCK();
145 hlist_add_head(&sh->hash, hp);
146}
147
148
149/* find an idle stripe, make sure it is unhashed, and return it. */
150static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
151{
152 struct stripe_head *sh = NULL;
153 struct list_head *first;
154
155 CHECK_DEVLOCK();
156 if (list_empty(&conf->inactive_list))
157 goto out;
158 first = conf->inactive_list.next;
159 sh = list_entry(first, struct stripe_head, lru);
160 list_del_init(first);
161 remove_hash(sh);
162 atomic_inc(&conf->active_stripes);
163out:
164 return sh;
165}
166
167static void shrink_buffers(struct stripe_head *sh, int num)
168{
169 struct page *p;
170 int i;
171
172 for (i=0; i<num ; i++) {
173 p = sh->dev[i].page;
174 if (!p)
175 continue;
176 sh->dev[i].page = NULL;
177 put_page(p);
178 }
179}
180
181static int grow_buffers(struct stripe_head *sh, int num)
182{
183 int i;
184
185 for (i=0; i<num; i++) {
186 struct page *page;
187
188 if (!(page = alloc_page(GFP_KERNEL))) {
189 return 1;
190 }
191 sh->dev[i].page = page;
192 }
193 return 0;
194}
195
196static void raid6_build_block (struct stripe_head *sh, int i);
197
198static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
199{
200 raid6_conf_t *conf = sh->raid_conf;
201 int disks = conf->raid_disks, i;
202
203 BUG_ON(atomic_read(&sh->count) != 0);
204 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
205
206 CHECK_DEVLOCK();
207 PRINTK("init_stripe called, stripe %llu\n",
208 (unsigned long long)sh->sector);
209
210 remove_hash(sh);
211
212 sh->sector = sector;
213 sh->pd_idx = pd_idx;
214 sh->state = 0;
215
216 for (i=disks; i--; ) {
217 struct r5dev *dev = &sh->dev[i];
218
219 if (dev->toread || dev->towrite || dev->written ||
220 test_bit(R5_LOCKED, &dev->flags)) {
221 PRINTK("sector=%llx i=%d %p %p %p %d\n",
222 (unsigned long long)sh->sector, i, dev->toread,
223 dev->towrite, dev->written,
224 test_bit(R5_LOCKED, &dev->flags));
225 BUG();
226 }
227 dev->flags = 0;
228 raid6_build_block(sh, i);
229 }
230 insert_hash(conf, sh);
231}
232
233static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
234{
235 struct stripe_head *sh;
236 struct hlist_node *hn;
237
238 CHECK_DEVLOCK();
239 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
240 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
241 if (sh->sector == sector)
242 return sh;
243 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
244 return NULL;
245}
246
247static void unplug_slaves(mddev_t *mddev);
248
249static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
250 int pd_idx, int noblock)
251{
252 struct stripe_head *sh;
253
254 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
255
256 spin_lock_irq(&conf->device_lock);
257
258 do {
259 wait_event_lock_irq(conf->wait_for_stripe,
260 conf->quiesce == 0,
261 conf->device_lock, /* nothing */);
262 sh = __find_stripe(conf, sector);
263 if (!sh) {
264 if (!conf->inactive_blocked)
265 sh = get_free_stripe(conf);
266 if (noblock && sh == NULL)
267 break;
268 if (!sh) {
269 conf->inactive_blocked = 1;
270 wait_event_lock_irq(conf->wait_for_stripe,
271 !list_empty(&conf->inactive_list) &&
272 (atomic_read(&conf->active_stripes)
273 < (conf->max_nr_stripes *3/4)
274 || !conf->inactive_blocked),
275 conf->device_lock,
276 unplug_slaves(conf->mddev);
277 );
278 conf->inactive_blocked = 0;
279 } else
280 init_stripe(sh, sector, pd_idx);
281 } else {
282 if (atomic_read(&sh->count)) {
283 BUG_ON(!list_empty(&sh->lru));
284 } else {
285 if (!test_bit(STRIPE_HANDLE, &sh->state))
286 atomic_inc(&conf->active_stripes);
287 BUG_ON(list_empty(&sh->lru));
288 list_del_init(&sh->lru);
289 }
290 }
291 } while (sh == NULL);
292
293 if (sh)
294 atomic_inc(&sh->count);
295
296 spin_unlock_irq(&conf->device_lock);
297 return sh;
298}
299
300static int grow_one_stripe(raid6_conf_t *conf)
301{
302 struct stripe_head *sh;
303 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
304 if (!sh)
305 return 0;
306 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
307 sh->raid_conf = conf;
308 spin_lock_init(&sh->lock);
309
310 if (grow_buffers(sh, conf->raid_disks)) {
311 shrink_buffers(sh, conf->raid_disks);
312 kmem_cache_free(conf->slab_cache, sh);
313 return 0;
314 }
315 /* we just created an active stripe so... */
316 atomic_set(&sh->count, 1);
317 atomic_inc(&conf->active_stripes);
318 INIT_LIST_HEAD(&sh->lru);
319 release_stripe(sh);
320 return 1;
321}
322
323static int grow_stripes(raid6_conf_t *conf, int num)
324{
325 kmem_cache_t *sc;
326 int devs = conf->raid_disks;
327
328 sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
329
330 sc = kmem_cache_create(conf->cache_name[0],
331 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
332 0, 0, NULL, NULL);
333 if (!sc)
334 return 1;
335 conf->slab_cache = sc;
336 while (num--)
337 if (!grow_one_stripe(conf))
338 return 1;
339 return 0;
340}
341
342static int drop_one_stripe(raid6_conf_t *conf)
343{
344 struct stripe_head *sh;
345 spin_lock_irq(&conf->device_lock);
346 sh = get_free_stripe(conf);
347 spin_unlock_irq(&conf->device_lock);
348 if (!sh)
349 return 0;
350 BUG_ON(atomic_read(&sh->count));
351 shrink_buffers(sh, conf->raid_disks);
352 kmem_cache_free(conf->slab_cache, sh);
353 atomic_dec(&conf->active_stripes);
354 return 1;
355}
356
357static void shrink_stripes(raid6_conf_t *conf)
358{
359 while (drop_one_stripe(conf))
360 ;
361
362 if (conf->slab_cache)
363 kmem_cache_destroy(conf->slab_cache);
364 conf->slab_cache = NULL;
365}
366
367static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
368 int error)
369{
370 struct stripe_head *sh = bi->bi_private;
371 raid6_conf_t *conf = sh->raid_conf;
372 int disks = conf->raid_disks, i;
373 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
374
375 if (bi->bi_size)
376 return 1;
377
378 for (i=0 ; i<disks; i++)
379 if (bi == &sh->dev[i].req)
380 break;
381
382 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
383 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
384 uptodate);
385 if (i == disks) {
386 BUG();
387 return 0;
388 }
389
390 if (uptodate) {
391#if 0
392 struct bio *bio;
393 unsigned long flags;
394 spin_lock_irqsave(&conf->device_lock, flags);
395 /* we can return a buffer if we bypassed the cache or
396 * if the top buffer is not in highmem. If there are
397 * multiple buffers, leave the extra work to
398 * handle_stripe
399 */
400 buffer = sh->bh_read[i];
401 if (buffer &&
402 (!PageHighMem(buffer->b_page)
403 || buffer->b_page == bh->b_page )
404 ) {
405 sh->bh_read[i] = buffer->b_reqnext;
406 buffer->b_reqnext = NULL;
407 } else
408 buffer = NULL;
409 spin_unlock_irqrestore(&conf->device_lock, flags);
410 if (sh->bh_page[i]==bh->b_page)
411 set_buffer_uptodate(bh);
412 if (buffer) {
413 if (buffer->b_page != bh->b_page)
414 memcpy(buffer->b_data, bh->b_data, bh->b_size);
415 buffer->b_end_io(buffer, 1);
416 }
417#else
418 set_bit(R5_UPTODATE, &sh->dev[i].flags);
419#endif
420 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
421 printk(KERN_INFO "raid6: read error corrected!!\n");
422 clear_bit(R5_ReadError, &sh->dev[i].flags);
423 clear_bit(R5_ReWrite, &sh->dev[i].flags);
424 }
425 if (atomic_read(&conf->disks[i].rdev->read_errors))
426 atomic_set(&conf->disks[i].rdev->read_errors, 0);
427 } else {
428 int retry = 0;
429 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
430 atomic_inc(&conf->disks[i].rdev->read_errors);
431 if (conf->mddev->degraded)
432 printk(KERN_WARNING "raid6: read error not correctable.\n");
433 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
434 /* Oh, no!!! */
435 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
436 else if (atomic_read(&conf->disks[i].rdev->read_errors)
437 > conf->max_nr_stripes)
438 printk(KERN_WARNING
439 "raid6: Too many read errors, failing device.\n");
440 else
441 retry = 1;
442 if (retry)
443 set_bit(R5_ReadError, &sh->dev[i].flags);
444 else {
445 clear_bit(R5_ReadError, &sh->dev[i].flags);
446 clear_bit(R5_ReWrite, &sh->dev[i].flags);
447 md_error(conf->mddev, conf->disks[i].rdev);
448 }
449 }
450 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
451#if 0
452 /* must restore b_page before unlocking buffer... */
453 if (sh->bh_page[i] != bh->b_page) {
454 bh->b_page = sh->bh_page[i];
455 bh->b_data = page_address(bh->b_page);
456 clear_buffer_uptodate(bh);
457 }
458#endif
459 clear_bit(R5_LOCKED, &sh->dev[i].flags);
460 set_bit(STRIPE_HANDLE, &sh->state);
461 release_stripe(sh);
462 return 0;
463}
464
465static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
466 int error)
467{
468 struct stripe_head *sh = bi->bi_private;
469 raid6_conf_t *conf = sh->raid_conf;
470 int disks = conf->raid_disks, i;
471 unsigned long flags;
472 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
473
474 if (bi->bi_size)
475 return 1;
476
477 for (i=0 ; i<disks; i++)
478 if (bi == &sh->dev[i].req)
479 break;
480
481 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
482 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
483 uptodate);
484 if (i == disks) {
485 BUG();
486 return 0;
487 }
488
489 spin_lock_irqsave(&conf->device_lock, flags);
490 if (!uptodate)
491 md_error(conf->mddev, conf->disks[i].rdev);
492
493 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
494
495 clear_bit(R5_LOCKED, &sh->dev[i].flags);
496 set_bit(STRIPE_HANDLE, &sh->state);
497 __release_stripe(conf, sh);
498 spin_unlock_irqrestore(&conf->device_lock, flags);
499 return 0;
500}
501
502
503static sector_t compute_blocknr(struct stripe_head *sh, int i);
504
505static void raid6_build_block (struct stripe_head *sh, int i)
506{
507 struct r5dev *dev = &sh->dev[i];
508 int pd_idx = sh->pd_idx;
509 int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
510
511 bio_init(&dev->req);
512 dev->req.bi_io_vec = &dev->vec;
513 dev->req.bi_vcnt++;
514 dev->req.bi_max_vecs++;
515 dev->vec.bv_page = dev->page;
516 dev->vec.bv_len = STRIPE_SIZE;
517 dev->vec.bv_offset = 0;
518
519 dev->req.bi_sector = sh->sector;
520 dev->req.bi_private = sh;
521
522 dev->flags = 0;
523 if (i != pd_idx && i != qd_idx)
524 dev->sector = compute_blocknr(sh, i);
525}
526
527static void error(mddev_t *mddev, mdk_rdev_t *rdev)
528{
529 char b[BDEVNAME_SIZE];
530 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
531 PRINTK("raid6: error called\n");
532
533 if (!test_bit(Faulty, &rdev->flags)) {
534 mddev->sb_dirty = 1;
535 if (test_bit(In_sync, &rdev->flags)) {
536 conf->working_disks--;
537 mddev->degraded++;
538 conf->failed_disks++;
539 clear_bit(In_sync, &rdev->flags);
540 /*
541 * if recovery was running, make sure it aborts.
542 */
543 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
544 }
545 set_bit(Faulty, &rdev->flags);
546 printk (KERN_ALERT
547 "raid6: Disk failure on %s, disabling device."
548 " Operation continuing on %d devices\n",
549 bdevname(rdev->bdev,b), conf->working_disks);
550 }
551}
552
553/*
554 * Input: a 'big' sector number,
 555 * Output: indexes of the data and parity disks, and the sector # in them.
556 */
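/*
 * Worked example (illustrative, derived from the switch below): with
 * raid_disks = 5 and ALGORITHM_LEFT_ASYMMETRIC, the P/Q pair rotates one
 * position per stripe:
 *
 *	stripe 0:  Q D D D P	(pd_idx = 4, qd_idx = 0)
 *	stripe 1:  D D D P Q	(pd_idx = 3, qd_idx = 4)
 *	stripe 2:  D D P Q D	(pd_idx = 2, qd_idx = 3)
 *
 * dd_idx is then stepped past P and Q so that data indexes always name
 * actual data drives.
 */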
557static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
558 unsigned int data_disks, unsigned int * dd_idx,
559 unsigned int * pd_idx, raid6_conf_t *conf)
560{
561 long stripe;
562 unsigned long chunk_number;
563 unsigned int chunk_offset;
564 sector_t new_sector;
565 int sectors_per_chunk = conf->chunk_size >> 9;
566
567 /* First compute the information on this sector */
568
569 /*
570 * Compute the chunk number and the sector offset inside the chunk
571 */
572 chunk_offset = sector_div(r_sector, sectors_per_chunk);
573 chunk_number = r_sector;
574 if ( r_sector != chunk_number ) {
575 printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
576 (unsigned long long)r_sector, (unsigned long)chunk_number);
577 BUG();
578 }
579
580 /*
581 * Compute the stripe number
582 */
583 stripe = chunk_number / data_disks;
584
585 /*
586 * Compute the data disk and parity disk indexes inside the stripe
587 */
588 *dd_idx = chunk_number % data_disks;
589
590 /*
591 * Select the parity disk based on the user selected algorithm.
592 */
593
594 /**** FIX THIS ****/
595 switch (conf->algorithm) {
596 case ALGORITHM_LEFT_ASYMMETRIC:
597 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
598 if (*pd_idx == raid_disks-1)
599 (*dd_idx)++; /* Q D D D P */
600 else if (*dd_idx >= *pd_idx)
601 (*dd_idx) += 2; /* D D P Q D */
602 break;
603 case ALGORITHM_RIGHT_ASYMMETRIC:
604 *pd_idx = stripe % raid_disks;
605 if (*pd_idx == raid_disks-1)
606 (*dd_idx)++; /* Q D D D P */
607 else if (*dd_idx >= *pd_idx)
608 (*dd_idx) += 2; /* D D P Q D */
609 break;
610 case ALGORITHM_LEFT_SYMMETRIC:
611 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
612 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
613 break;
614 case ALGORITHM_RIGHT_SYMMETRIC:
615 *pd_idx = stripe % raid_disks;
616 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
617 break;
618 default:
619 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
620 conf->algorithm);
621 }
622
623 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
624 chunk_number, *pd_idx, *dd_idx);
625
626 /*
627 * Finally, compute the new sector number
628 */
629 new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
630 return new_sector;
631}
632
633
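/*
 * The inverse of raid6_compute_sector(): given a stripe_head and a disk
 * index, recover the array-wide logical sector.  The result is verified
 * by mapping it forward again before it is trusted.
 */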
634static sector_t compute_blocknr(struct stripe_head *sh, int i)
635{
636 raid6_conf_t *conf = sh->raid_conf;
637 int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
638 sector_t new_sector = sh->sector, check;
639 int sectors_per_chunk = conf->chunk_size >> 9;
640 sector_t stripe;
641 int chunk_offset;
642 int chunk_number, dummy1, dummy2, dd_idx = i;
643 sector_t r_sector;
644 int i0 = i;
645
646 chunk_offset = sector_div(new_sector, sectors_per_chunk);
647 stripe = new_sector;
648 if ( new_sector != stripe ) {
649 printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
650 (unsigned long long)new_sector, (unsigned long)stripe);
651 BUG();
652 }
653
654 switch (conf->algorithm) {
655 case ALGORITHM_LEFT_ASYMMETRIC:
656 case ALGORITHM_RIGHT_ASYMMETRIC:
657 if (sh->pd_idx == raid_disks-1)
658 i--; /* Q D D D P */
659 else if (i > sh->pd_idx)
660 i -= 2; /* D D P Q D */
661 break;
662 case ALGORITHM_LEFT_SYMMETRIC:
663 case ALGORITHM_RIGHT_SYMMETRIC:
664 if (sh->pd_idx == raid_disks-1)
665 i--; /* Q D D D P */
666 else {
667 /* D D P Q D */
668 if (i < sh->pd_idx)
669 i += raid_disks;
670 i -= (sh->pd_idx + 2);
671 }
672 break;
673 default:
674 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
675 conf->algorithm);
676 }
677
678 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
679
680 chunk_number = stripe * data_disks + i;
681 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
682
683 check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
684 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
685 printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
686 return 0;
687 }
688 return r_sector;
689}
690
691
692
693/*
 694 * Copy data between a page in the stripe cache and one or more bion.
 695 * The page could align with the middle of the bio, or there could be
 696 * several bion, each with several bio_vecs, which cover part of the page.
697 * Multiple bion are linked together on bi_next. There may be extras
698 * at the end of this list. We ignore them.
699 */
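/*
 * For illustration (assuming 512-byte sectors): a bio starting two
 * sectors before 'sector' gives page_offset = -1024, so b_offset skips
 * the first 1024 bytes of bio data before copying begins; a bio starting
 * inside the page gives a positive page_offset, and the copy length is
 * clipped at STRIPE_SIZE.
 */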
700static void copy_data(int frombio, struct bio *bio,
701 struct page *page,
702 sector_t sector)
703{
704 char *pa = page_address(page);
705 struct bio_vec *bvl;
706 int i;
707 int page_offset;
708
709 if (bio->bi_sector >= sector)
710 page_offset = (signed)(bio->bi_sector - sector) * 512;
711 else
712 page_offset = (signed)(sector - bio->bi_sector) * -512;
713 bio_for_each_segment(bvl, bio, i) {
714 int len = bio_iovec_idx(bio,i)->bv_len;
715 int clen;
716 int b_offset = 0;
717
718 if (page_offset < 0) {
719 b_offset = -page_offset;
720 page_offset += b_offset;
721 len -= b_offset;
722 }
723
724 if (len > 0 && page_offset + len > STRIPE_SIZE)
725 clen = STRIPE_SIZE - page_offset;
726 else clen = len;
727
728 if (clen > 0) {
729 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
730 if (frombio)
731 memcpy(pa+page_offset, ba+b_offset, clen);
732 else
733 memcpy(ba+b_offset, pa+page_offset, clen);
734 __bio_kunmap_atomic(ba, KM_USER0);
735 }
736 if (clen < len) /* hit end of page */
737 break;
738 page_offset += len;
739 }
740}
741
742#define check_xor() do { \
743 if (count == MAX_XOR_BLOCKS) { \
744 xor_block(count, STRIPE_SIZE, ptr); \
745 count = 1; \
746 } \
747 } while(0)
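/*
 * xor_block() folds at most MAX_XOR_BLOCKS source pages into ptr[0] per
 * call, so check_xor() flushes a full batch and resets count to 1: the
 * running result in ptr[0] stays in place as the implicit first source
 * of the next batch.
 */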
748
749/* Compute P and Q syndromes */
750static void compute_parity(struct stripe_head *sh, int method)
751{
752 raid6_conf_t *conf = sh->raid_conf;
753 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
754 struct bio *chosen;
755 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
756 void *ptrs[disks];
757
758 qd_idx = raid6_next_disk(pd_idx, disks);
759 d0_idx = raid6_next_disk(qd_idx, disks);
760
761 PRINTK("compute_parity, stripe %llu, method %d\n",
762 (unsigned long long)sh->sector, method);
763
764 switch(method) {
765 case READ_MODIFY_WRITE:
766 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
767 case RECONSTRUCT_WRITE:
768 for (i= disks; i-- ;)
769 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
770 chosen = sh->dev[i].towrite;
771 sh->dev[i].towrite = NULL;
772
773 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
774 wake_up(&conf->wait_for_overlap);
775
776 BUG_ON(sh->dev[i].written);
777 sh->dev[i].written = chosen;
778 }
779 break;
780 case CHECK_PARITY:
781 BUG(); /* Not implemented yet */
782 }
783
784 for (i = disks; i--;)
785 if (sh->dev[i].written) {
786 sector_t sector = sh->dev[i].sector;
787 struct bio *wbi = sh->dev[i].written;
788 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
789 copy_data(1, wbi, sh->dev[i].page, sector);
790 wbi = r5_next_bio(wbi, sector);
791 }
792
793 set_bit(R5_LOCKED, &sh->dev[i].flags);
794 set_bit(R5_UPTODATE, &sh->dev[i].flags);
795 }
796
797// switch(method) {
798// case RECONSTRUCT_WRITE:
799// case CHECK_PARITY:
800// case UPDATE_PARITY:
801 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
802 /* FIX: Is this ordering of drives even remotely optimal? */
803 count = 0;
804 i = d0_idx;
805 do {
806 ptrs[count++] = page_address(sh->dev[i].page);
807 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
808 printk("block %d/%d not uptodate on parity calc\n", i,count);
809 i = raid6_next_disk(i, disks);
810 } while ( i != d0_idx );
811// break;
812// }
813
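	/*
	 * What gen_syndrome() computes (standard RAID-6 algebra, cf. the
	 * kernel's raid6 documentation): over GF(2^8) with generator
	 * g = {02},
	 *
	 *	P = D_0 ^ D_1 ^ ... ^ D_(n-3)
	 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^(n-3)*D_(n-3)
	 *
	 * It expects ptrs[] to hold the data pages in order followed by
	 * P and Q, which the walk from d0_idx above guarantees.
	 */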
814 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
815
816 switch(method) {
817 case RECONSTRUCT_WRITE:
818 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
819 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
820 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
821 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
822 break;
823 case UPDATE_PARITY:
824 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
825 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
826 break;
827 }
828}
829
830/* Compute one missing block */
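/*
 * A missing data (or P) block is rebuilt RAID-5 style, by XORing P with
 * every surviving data block; only a missing Q needs the full syndrome
 * recompute.  With 'nozero' the target page is not cleared first, so
 * XORing the data on top of the current P leaves an all-zero page iff
 * the parity was correct - this is how the sync path checks P.
 */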
831static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
832{
833 raid6_conf_t *conf = sh->raid_conf;
834 int i, count, disks = conf->raid_disks;
835 void *ptr[MAX_XOR_BLOCKS], *p;
836 int pd_idx = sh->pd_idx;
837 int qd_idx = raid6_next_disk(pd_idx, disks);
838
839 PRINTK("compute_block_1, stripe %llu, idx %d\n",
840 (unsigned long long)sh->sector, dd_idx);
841
842 if ( dd_idx == qd_idx ) {
843 /* We're actually computing the Q drive */
844 compute_parity(sh, UPDATE_PARITY);
845 } else {
846 ptr[0] = page_address(sh->dev[dd_idx].page);
847 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
848 count = 1;
849 for (i = disks ; i--; ) {
850 if (i == dd_idx || i == qd_idx)
851 continue;
852 p = page_address(sh->dev[i].page);
853 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
854 ptr[count++] = p;
855 else
856 printk("compute_block() %d, stripe %llu, %d"
857 " not present\n", dd_idx,
858 (unsigned long long)sh->sector, i);
859
860 check_xor();
861 }
862 if (count != 1)
863 xor_block(count, STRIPE_SIZE, ptr);
864 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
865 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
866 }
867}
868
869/* Compute two missing blocks */
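/*
 * The interesting cases below solve two unknowns from the two syndromes
 * (standard RAID-6 recovery algebra): with data disks x < y missing,
 * the surviving blocks give
 *
 *	D_x ^ D_y         = P ^ P'
 *	g^x*D_x ^ g^y*D_y = Q ^ Q'
 *
 * over GF(2^8).  raid6_2data_recov() solves this pair for the D+D case;
 * raid6_datap_recov() rebuilds the D+P case using Q alone.
 */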
870static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871{
872 raid6_conf_t *conf = sh->raid_conf;
873 int i, count, disks = conf->raid_disks;
874 int pd_idx = sh->pd_idx;
875 int qd_idx = raid6_next_disk(pd_idx, disks);
876 int d0_idx = raid6_next_disk(qd_idx, disks);
877 int faila, failb;
878
879 /* faila and failb are disk numbers relative to d0_idx */
 880	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
881 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
882 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
883
884 BUG_ON(faila == failb);
885 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
886
887 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
888 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
889
890 if ( failb == disks-1 ) {
891 /* Q disk is one of the missing disks */
892 if ( faila == disks-2 ) {
893 /* Missing P+Q, just recompute */
894 compute_parity(sh, UPDATE_PARITY);
895 return;
896 } else {
897 /* We're missing D+Q; recompute D from P */
898 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
899 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
900 return;
901 }
902 }
903
904 /* We're missing D+P or D+D; build pointer table */
905 {
906 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
907 void *ptrs[disks];
908
909 count = 0;
910 i = d0_idx;
911 do {
912 ptrs[count++] = page_address(sh->dev[i].page);
913 i = raid6_next_disk(i, disks);
914 if (i != dd_idx1 && i != dd_idx2 &&
915 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
916 printk("compute_2 with missing block %d/%d\n", count, i);
917 } while ( i != d0_idx );
918
919 if ( failb == disks-2 ) {
920 /* We're missing D+P. */
921 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
922 } else {
923 /* We're missing D+D. */
924 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
925 }
926
 927		/* Both of the above calls update both missing blocks */
928 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
929 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
930 }
931}
932
933
934/*
935 * Each stripe/dev can have one or more bion attached.
936 * toread/towrite point to the first in a chain.
937 * The bi_next chain must be in order.
938 */
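/*
 * The insertion walk below keeps each chain sorted by bi_sector.  An
 * overlap exists when the bio preceding the insertion point extends past
 * our first sector, or the one following it starts before our last;
 * either case sets R5_Overlap and the caller must back off and retry.
 */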
939static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
940{
941 struct bio **bip;
942 raid6_conf_t *conf = sh->raid_conf;
943 int firstwrite=0;
944
945 PRINTK("adding bh b#%llu to stripe s#%llu\n",
946 (unsigned long long)bi->bi_sector,
947 (unsigned long long)sh->sector);
948
949
950 spin_lock(&sh->lock);
951 spin_lock_irq(&conf->device_lock);
952 if (forwrite) {
953 bip = &sh->dev[dd_idx].towrite;
954 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
955 firstwrite = 1;
956 } else
957 bip = &sh->dev[dd_idx].toread;
958 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
959 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
960 goto overlap;
961 bip = &(*bip)->bi_next;
962 }
963 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
964 goto overlap;
965
966 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
967 if (*bip)
968 bi->bi_next = *bip;
969 *bip = bi;
970 bi->bi_phys_segments ++;
971 spin_unlock_irq(&conf->device_lock);
972 spin_unlock(&sh->lock);
973
974 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
975 (unsigned long long)bi->bi_sector,
976 (unsigned long long)sh->sector, dd_idx);
977
978 if (conf->mddev->bitmap && firstwrite) {
979 sh->bm_seq = conf->seq_write;
980 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
981 STRIPE_SECTORS, 0);
982 set_bit(STRIPE_BIT_DELAY, &sh->state);
983 }
984
985 if (forwrite) {
986 /* check if page is covered */
987 sector_t sector = sh->dev[dd_idx].sector;
988 for (bi=sh->dev[dd_idx].towrite;
989 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
990 bi && bi->bi_sector <= sector;
991 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
992 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
993 sector = bi->bi_sector + (bi->bi_size>>9);
994 }
995 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
996 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
997 }
998 return 1;
999
1000 overlap:
1001 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1002 spin_unlock_irq(&conf->device_lock);
1003 spin_unlock(&sh->lock);
1004 return 0;
1005}
1006
1007
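/*
 * Cheap all-zero test: if the first 32-bit word is zero and every byte
 * equals the byte four positions before it (the overlapping memcmp), the
 * whole page must be zero - no second zero-filled buffer is needed.
 */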
1008static int page_is_zero(struct page *p)
1009{
1010 char *a = page_address(p);
1011 return ((*(u32*)a) == 0 &&
1012 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1013}
1014/*
1015 * handle_stripe - do things to a stripe.
1016 *
1017 * We lock the stripe and then examine the state of various bits
1018 * to see what needs to be done.
1019 * Possible results:
1020 * return some read requests which now have data
1021 * return some write requests which are safely on disc
1022 * schedule a read on some buffers
1023 * schedule a write of some buffers
1024 * return confirmation of parity correctness
1025 *
1026 * Parity calculations are done inside the stripe lock;
1027 * buffers are taken off read_list or write_list, and bh_cache buffers
1028 * get BH_Lock set before the stripe lock is released.
1029 *
1030 */
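/*
 * Rough order of play below: fail queued requests if more than two
 * devices are gone; complete writes whose parity blocks are safe;
 * schedule or compute missing blocks for reads, recovery and partial
 * writes; run a reconstruct-write once enough data is cached; check and
 * possibly repair P and Q when syncing; re-write or re-read blocks that
 * saw read errors; finally issue all I/O marked Wantread/Wantwrite.
 */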
1031
1032static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1033{
1034 raid6_conf_t *conf = sh->raid_conf;
1035 int disks = conf->raid_disks;
1036 struct bio *return_bi= NULL;
1037 struct bio *bi;
1038 int i;
1039 int syncing;
1040 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1041 int non_overwrite = 0;
1042 int failed_num[2] = {0, 0};
1043 struct r5dev *dev, *pdev, *qdev;
1044 int pd_idx = sh->pd_idx;
1045 int qd_idx = raid6_next_disk(pd_idx, disks);
1046 int p_failed, q_failed;
1047
1048 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1049 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1050 pd_idx, qd_idx);
1051
1052 spin_lock(&sh->lock);
1053 clear_bit(STRIPE_HANDLE, &sh->state);
1054 clear_bit(STRIPE_DELAYED, &sh->state);
1055
1056 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1057 /* Now to look around and see what can be done */
1058
1059 rcu_read_lock();
1060 for (i=disks; i--; ) {
1061 mdk_rdev_t *rdev;
1062 dev = &sh->dev[i];
1063 clear_bit(R5_Insync, &dev->flags);
1064
1065 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1066 i, dev->flags, dev->toread, dev->towrite, dev->written);
1067 /* maybe we can reply to a read */
1068 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1069 struct bio *rbi, *rbi2;
1070 PRINTK("Return read for disc %d\n", i);
1071 spin_lock_irq(&conf->device_lock);
1072 rbi = dev->toread;
1073 dev->toread = NULL;
1074 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1075 wake_up(&conf->wait_for_overlap);
1076 spin_unlock_irq(&conf->device_lock);
1077 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1078 copy_data(0, rbi, dev->page, dev->sector);
1079 rbi2 = r5_next_bio(rbi, dev->sector);
1080 spin_lock_irq(&conf->device_lock);
1081 if (--rbi->bi_phys_segments == 0) {
1082 rbi->bi_next = return_bi;
1083 return_bi = rbi;
1084 }
1085 spin_unlock_irq(&conf->device_lock);
1086 rbi = rbi2;
1087 }
1088 }
1089
1090 /* now count some things */
1091 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1092 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1093
1094
1095 if (dev->toread) to_read++;
1096 if (dev->towrite) {
1097 to_write++;
1098 if (!test_bit(R5_OVERWRITE, &dev->flags))
1099 non_overwrite++;
1100 }
1101 if (dev->written) written++;
1102 rdev = rcu_dereference(conf->disks[i].rdev);
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1104 /* The ReadError flag will just be confusing now */
1105 clear_bit(R5_ReadError, &dev->flags);
1106 clear_bit(R5_ReWrite, &dev->flags);
1107 }
1108 if (!rdev || !test_bit(In_sync, &rdev->flags)
1109 || test_bit(R5_ReadError, &dev->flags)) {
1110 if ( failed < 2 )
1111 failed_num[failed] = i;
1112 failed++;
1113 } else
1114 set_bit(R5_Insync, &dev->flags);
1115 }
1116 rcu_read_unlock();
1117 PRINTK("locked=%d uptodate=%d to_read=%d"
1118 " to_write=%d failed=%d failed_num=%d,%d\n",
1119 locked, uptodate, to_read, to_write, failed,
1120 failed_num[0], failed_num[1]);
1121	/* check if the array has lost more than 2 devices; if so, some
1122	 * requests might need to be failed
1123 */
1124 if (failed > 2 && to_read+to_write+written) {
1125 for (i=disks; i--; ) {
1126 int bitmap_end = 0;
1127
1128 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1129 mdk_rdev_t *rdev;
1130 rcu_read_lock();
1131 rdev = rcu_dereference(conf->disks[i].rdev);
1132 if (rdev && test_bit(In_sync, &rdev->flags))
1133 /* multiple read failures in one stripe */
1134 md_error(conf->mddev, rdev);
1135 rcu_read_unlock();
1136 }
1137
1138 spin_lock_irq(&conf->device_lock);
1139 /* fail all writes first */
1140 bi = sh->dev[i].towrite;
1141 sh->dev[i].towrite = NULL;
1142 if (bi) { to_write--; bitmap_end = 1; }
1143
1144 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1145 wake_up(&conf->wait_for_overlap);
1146
1147 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1148 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1149 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1150 if (--bi->bi_phys_segments == 0) {
1151 md_write_end(conf->mddev);
1152 bi->bi_next = return_bi;
1153 return_bi = bi;
1154 }
1155 bi = nextbi;
1156 }
1157 /* and fail all 'written' */
1158 bi = sh->dev[i].written;
1159 sh->dev[i].written = NULL;
1160 if (bi) bitmap_end = 1;
1161 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1162 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1163 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1164 if (--bi->bi_phys_segments == 0) {
1165 md_write_end(conf->mddev);
1166 bi->bi_next = return_bi;
1167 return_bi = bi;
1168 }
1169 bi = bi2;
1170 }
1171
1172 /* fail any reads if this device is non-operational */
1173 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1174 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1175 bi = sh->dev[i].toread;
1176 sh->dev[i].toread = NULL;
1177 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1178 wake_up(&conf->wait_for_overlap);
1179 if (bi) to_read--;
1180 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1181 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1182 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1183 if (--bi->bi_phys_segments == 0) {
1184 bi->bi_next = return_bi;
1185 return_bi = bi;
1186 }
1187 bi = nextbi;
1188 }
1189 }
1190 spin_unlock_irq(&conf->device_lock);
1191 if (bitmap_end)
1192 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1193 STRIPE_SECTORS, 0, 0);
1194 }
1195 }
1196 if (failed > 2 && syncing) {
1197 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1198 clear_bit(STRIPE_SYNCING, &sh->state);
1199 syncing = 0;
1200 }
1201
1202 /*
1203 * might be able to return some write requests if the parity blocks
1204 * are safe, or on a failed drive
1205 */
1206 pdev = &sh->dev[pd_idx];
1207 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
1208 || (failed >= 2 && failed_num[1] == pd_idx);
1209 qdev = &sh->dev[qd_idx];
1210 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
1211 || (failed >= 2 && failed_num[1] == qd_idx);
1212
1213 if ( written &&
1214 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
1215 && !test_bit(R5_LOCKED, &pdev->flags)
1216 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
1217 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
1218 && !test_bit(R5_LOCKED, &qdev->flags)
1219 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
1220 /* any written block on an uptodate or failed drive can be
1221 * returned. Note that if we 'wrote' to a failed drive,
1222 * it will be UPTODATE, but never LOCKED, so we don't need
1223 * to test 'failed' directly.
1224 */
1225 for (i=disks; i--; )
1226 if (sh->dev[i].written) {
1227 dev = &sh->dev[i];
1228 if (!test_bit(R5_LOCKED, &dev->flags) &&
1229 test_bit(R5_UPTODATE, &dev->flags) ) {
1230 /* We can return any write requests */
1231 int bitmap_end = 0;
1232 struct bio *wbi, *wbi2;
1233 PRINTK("Return write for stripe %llu disc %d\n",
1234 (unsigned long long)sh->sector, i);
1235 spin_lock_irq(&conf->device_lock);
1236 wbi = dev->written;
1237 dev->written = NULL;
1238 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1239 wbi2 = r5_next_bio(wbi, dev->sector);
1240 if (--wbi->bi_phys_segments == 0) {
1241 md_write_end(conf->mddev);
1242 wbi->bi_next = return_bi;
1243 return_bi = wbi;
1244 }
1245 wbi = wbi2;
1246 }
1247 if (dev->towrite == NULL)
1248 bitmap_end = 1;
1249 spin_unlock_irq(&conf->device_lock);
1250 if (bitmap_end)
1251 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1252 STRIPE_SECTORS,
1253 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1254 }
1255 }
1256 }
1257
1258 /* Now we might consider reading some blocks, either to check/generate
1259	 * parity, to satisfy requests,
1260	 * or to load a block that is being partially written.
1261 */
1262 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
1263 for (i=disks; i--;) {
1264 dev = &sh->dev[i];
1265 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1266 (dev->toread ||
1267 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1268 syncing ||
1269 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
1270 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
1271 )
1272 ) {
1273 /* we would like to get this block, possibly
1274 * by computing it, but we might not be able to
1275 */
1276 if (uptodate == disks-1) {
1277 PRINTK("Computing stripe %llu block %d\n",
1278 (unsigned long long)sh->sector, i);
1279 compute_block_1(sh, i, 0);
1280 uptodate++;
1281 } else if ( uptodate == disks-2 && failed >= 2 ) {
1282 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1283 int other;
1284 for (other=disks; other--;) {
1285 if ( other == i )
1286 continue;
1287 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
1288 break;
1289 }
1290 BUG_ON(other < 0);
1291 PRINTK("Computing stripe %llu blocks %d,%d\n",
1292 (unsigned long long)sh->sector, i, other);
1293 compute_block_2(sh, i, other);
1294 uptodate += 2;
1295 } else if (test_bit(R5_Insync, &dev->flags)) {
1296 set_bit(R5_LOCKED, &dev->flags);
1297 set_bit(R5_Wantread, &dev->flags);
1298#if 0
1299 /* if I am just reading this block and we don't have
1300 a failed drive, or any pending writes then sidestep the cache */
1301 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1302 ! syncing && !failed && !to_write) {
1303 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1304 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1305 }
1306#endif
1307 locked++;
1308 PRINTK("Reading block %d (sync=%d)\n",
1309 i, syncing);
1310 }
1311 }
1312 }
1313 set_bit(STRIPE_HANDLE, &sh->state);
1314 }
1315
1316	/* now consider writing, and what else, if anything, should be read */
1317 if (to_write) {
1318 int rcw=0, must_compute=0;
1319 for (i=disks ; i--;) {
1320 dev = &sh->dev[i];
1321 /* Would I have to read this buffer for reconstruct_write */
1322 if (!test_bit(R5_OVERWRITE, &dev->flags)
1323 && i != pd_idx && i != qd_idx
1324 && (!test_bit(R5_LOCKED, &dev->flags)
1325#if 0
1326 || sh->bh_page[i] != bh->b_page
1327#endif
1328 ) &&
1329 !test_bit(R5_UPTODATE, &dev->flags)) {
1330 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1331 else {
1332 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
1333 must_compute++;
1334 }
1335 }
1336 }
1337 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1338 (unsigned long long)sh->sector, rcw, must_compute);
1339 set_bit(STRIPE_HANDLE, &sh->state);
1340
1341 if (rcw > 0)
1342 /* want reconstruct write, but need to get some data */
1343 for (i=disks; i--;) {
1344 dev = &sh->dev[i];
1345 if (!test_bit(R5_OVERWRITE, &dev->flags)
1346 && !(failed == 0 && (i == pd_idx || i == qd_idx))
1347 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1348 test_bit(R5_Insync, &dev->flags)) {
1349 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1350 {
1351 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1352 (unsigned long long)sh->sector, i);
1353 set_bit(R5_LOCKED, &dev->flags);
1354 set_bit(R5_Wantread, &dev->flags);
1355 locked++;
1356 } else {
1357 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1358 (unsigned long long)sh->sector, i);
1359 set_bit(STRIPE_DELAYED, &sh->state);
1360 set_bit(STRIPE_HANDLE, &sh->state);
1361 }
1362 }
1363 }
1364 /* now if nothing is locked, and if we have enough data, we can start a write request */
1365 if (locked == 0 && rcw == 0 &&
1366 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1367 if ( must_compute > 0 ) {
1368 /* We have failed blocks and need to compute them */
1369 switch ( failed ) {
1370 case 0: BUG();
1371 case 1: compute_block_1(sh, failed_num[0], 0); break;
1372 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1373 default: BUG(); /* This request should have been failed? */
1374 }
1375 }
1376
1377 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
1378 compute_parity(sh, RECONSTRUCT_WRITE);
1379 /* now every locked buffer is ready to be written */
1380 for (i=disks; i--;)
1381 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1382 PRINTK("Writing stripe %llu block %d\n",
1383 (unsigned long long)sh->sector, i);
1384 locked++;
1385 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1386 }
1387 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1388 set_bit(STRIPE_INSYNC, &sh->state);
1389
1390 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1391 atomic_dec(&conf->preread_active_stripes);
1392 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1393 md_wakeup_thread(conf->mddev->thread);
1394 }
1395 }
1396 }
1397
1398 /* maybe we need to check and possibly fix the parity for this stripe
1399 * Any reads will already have been scheduled, so we just see if enough data
1400 * is available
1401 */
1402 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1403 int update_p = 0, update_q = 0;
1404 struct r5dev *dev;
1405
1406 set_bit(STRIPE_HANDLE, &sh->state);
1407
1408 BUG_ON(failed>2);
1409 BUG_ON(uptodate < disks);
1410 /* Want to check and possibly repair P and Q.
1411 * However there could be one 'failed' device, in which
1412 * case we can only check one of them, possibly using the
1413 * other to generate missing data
1414 */
1415
1416 /* If !tmp_page, we cannot do the calculations,
1417 * but as we have set STRIPE_HANDLE, we will soon be called
1418		 * by handle_stripe with a tmp_page - just wait until then.
1419 */
1420 if (tmp_page) {
1421 if (failed == q_failed) {
1422 /* The only possible failed device holds 'Q', so it makes
1423			 * sense to check P (if anything else had failed, we would
1424 * have used P to recreate it).
1425 */
1426 compute_block_1(sh, pd_idx, 1);
1427 if (!page_is_zero(sh->dev[pd_idx].page)) {
1428 compute_block_1(sh,pd_idx,0);
1429 update_p = 1;
1430 }
1431 }
1432 if (!q_failed && failed < 2) {
1433 /* q is not failed, and we didn't use it to generate
1434 * anything, so it makes sense to check it
1435 */
1436 memcpy(page_address(tmp_page),
1437 page_address(sh->dev[qd_idx].page),
1438 STRIPE_SIZE);
1439 compute_parity(sh, UPDATE_PARITY);
1440 if (memcmp(page_address(tmp_page),
1441 page_address(sh->dev[qd_idx].page),
1442 STRIPE_SIZE)!= 0) {
1443 clear_bit(STRIPE_INSYNC, &sh->state);
1444 update_q = 1;
1445 }
1446 }
1447 if (update_p || update_q) {
1448 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1449 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1450 /* don't try to repair!! */
1451 update_p = update_q = 0;
1452 }
1453
1454 /* now write out any block on a failed drive,
1455 * or P or Q if they need it
1456 */
1457
1458 if (failed == 2) {
1459 dev = &sh->dev[failed_num[1]];
1460 locked++;
1461 set_bit(R5_LOCKED, &dev->flags);
1462 set_bit(R5_Wantwrite, &dev->flags);
1463 }
1464 if (failed >= 1) {
1465 dev = &sh->dev[failed_num[0]];
1466 locked++;
1467 set_bit(R5_LOCKED, &dev->flags);
1468 set_bit(R5_Wantwrite, &dev->flags);
1469 }
1470
1471 if (update_p) {
1472 dev = &sh->dev[pd_idx];
1473 locked ++;
1474 set_bit(R5_LOCKED, &dev->flags);
1475 set_bit(R5_Wantwrite, &dev->flags);
1476 }
1477 if (update_q) {
1478 dev = &sh->dev[qd_idx];
1479 locked++;
1480 set_bit(R5_LOCKED, &dev->flags);
1481 set_bit(R5_Wantwrite, &dev->flags);
1482 }
1483 clear_bit(STRIPE_DEGRADED, &sh->state);
1484
1485 set_bit(STRIPE_INSYNC, &sh->state);
1486 }
1487 }
1488
1489 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1490 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1491 clear_bit(STRIPE_SYNCING, &sh->state);
1492 }
1493
1494 /* If the failed drives are just a ReadError, then we might need
1495 * to progress the repair/check process
1496 */
1497 if (failed <= 2 && ! conf->mddev->ro)
1498 for (i=0; i<failed;i++) {
1499 dev = &sh->dev[failed_num[i]];
1500 if (test_bit(R5_ReadError, &dev->flags)
1501 && !test_bit(R5_LOCKED, &dev->flags)
1502 && test_bit(R5_UPTODATE, &dev->flags)
1503 ) {
1504 if (!test_bit(R5_ReWrite, &dev->flags)) {
1505 set_bit(R5_Wantwrite, &dev->flags);
1506 set_bit(R5_ReWrite, &dev->flags);
1507 set_bit(R5_LOCKED, &dev->flags);
1508 } else {
1509 /* let's read it back */
1510 set_bit(R5_Wantread, &dev->flags);
1511 set_bit(R5_LOCKED, &dev->flags);
1512 }
1513 }
1514 }
1515 spin_unlock(&sh->lock);
1516
1517 while ((bi=return_bi)) {
1518 int bytes = bi->bi_size;
1519
1520 return_bi = bi->bi_next;
1521 bi->bi_next = NULL;
1522 bi->bi_size = 0;
1523 bi->bi_end_io(bi, bytes, 0);
1524 }
1525 for (i=disks; i-- ;) {
1526 int rw;
1527 struct bio *bi;
1528 mdk_rdev_t *rdev;
1529 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1530 rw = 1;
1531 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1532 rw = 0;
1533 else
1534 continue;
1535
1536 bi = &sh->dev[i].req;
1537
1538 bi->bi_rw = rw;
1539 if (rw)
1540 bi->bi_end_io = raid6_end_write_request;
1541 else
1542 bi->bi_end_io = raid6_end_read_request;
1543
1544 rcu_read_lock();
1545 rdev = rcu_dereference(conf->disks[i].rdev);
1546 if (rdev && test_bit(Faulty, &rdev->flags))
1547 rdev = NULL;
1548 if (rdev)
1549 atomic_inc(&rdev->nr_pending);
1550 rcu_read_unlock();
1551
1552 if (rdev) {
1553 if (syncing)
1554 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1555
1556 bi->bi_bdev = rdev->bdev;
1557 PRINTK("for %llu schedule op %ld on disc %d\n",
1558 (unsigned long long)sh->sector, bi->bi_rw, i);
1559 atomic_inc(&sh->count);
1560 bi->bi_sector = sh->sector + rdev->data_offset;
1561 bi->bi_flags = 1 << BIO_UPTODATE;
1562 bi->bi_vcnt = 1;
1563 bi->bi_max_vecs = 1;
1564 bi->bi_idx = 0;
1565 bi->bi_io_vec = &sh->dev[i].vec;
1566 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1567 bi->bi_io_vec[0].bv_offset = 0;
1568 bi->bi_size = STRIPE_SIZE;
1569 bi->bi_next = NULL;
1570 if (rw == WRITE &&
1571 test_bit(R5_ReWrite, &sh->dev[i].flags))
1572 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1573 generic_make_request(bi);
1574 } else {
1575 if (rw == 1)
1576 set_bit(STRIPE_DEGRADED, &sh->state);
1577 PRINTK("skip op %ld on disc %d for sector %llu\n",
1578 bi->bi_rw, i, (unsigned long long)sh->sector);
1579 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1580 set_bit(STRIPE_HANDLE, &sh->state);
1581 }
1582 }
1583}
1584
1585static void raid6_activate_delayed(raid6_conf_t *conf)
1586{
1587 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1588 while (!list_empty(&conf->delayed_list)) {
1589 struct list_head *l = conf->delayed_list.next;
1590 struct stripe_head *sh;
1591 sh = list_entry(l, struct stripe_head, lru);
1592 list_del_init(l);
1593 clear_bit(STRIPE_DELAYED, &sh->state);
1594 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1595 atomic_inc(&conf->preread_active_stripes);
1596 list_add_tail(&sh->lru, &conf->handle_list);
1597 }
1598 }
1599}
1600
1601static void activate_bit_delay(raid6_conf_t *conf)
1602{
1603 /* device_lock is held */
1604 struct list_head head;
1605 list_add(&head, &conf->bitmap_list);
1606 list_del_init(&conf->bitmap_list);
1607 while (!list_empty(&head)) {
1608 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1609 list_del_init(&sh->lru);
1610 atomic_inc(&sh->count);
1611 __release_stripe(conf, sh);
1612 }
1613}
1614
1615static void unplug_slaves(mddev_t *mddev)
1616{
1617 raid6_conf_t *conf = mddev_to_conf(mddev);
1618 int i;
1619
1620 rcu_read_lock();
1621 for (i=0; i<mddev->raid_disks; i++) {
1622 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1623 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1624 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1625
1626 atomic_inc(&rdev->nr_pending);
1627 rcu_read_unlock();
1628
1629 if (r_queue->unplug_fn)
1630 r_queue->unplug_fn(r_queue);
1631
1632 rdev_dec_pending(rdev, mddev);
1633 rcu_read_lock();
1634 }
1635 }
1636 rcu_read_unlock();
1637}
1638
1639static void raid6_unplug_device(request_queue_t *q)
1640{
1641 mddev_t *mddev = q->queuedata;
1642 raid6_conf_t *conf = mddev_to_conf(mddev);
1643 unsigned long flags;
1644
1645 spin_lock_irqsave(&conf->device_lock, flags);
1646
1647 if (blk_remove_plug(q)) {
1648 conf->seq_flush++;
1649 raid6_activate_delayed(conf);
1650 }
1651 md_wakeup_thread(mddev->thread);
1652
1653 spin_unlock_irqrestore(&conf->device_lock, flags);
1654
1655 unplug_slaves(mddev);
1656}
1657
1658static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1659 sector_t *error_sector)
1660{
1661 mddev_t *mddev = q->queuedata;
1662 raid6_conf_t *conf = mddev_to_conf(mddev);
1663 int i, ret = 0;
1664
1665 rcu_read_lock();
1666 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1667 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1668 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1669 struct block_device *bdev = rdev->bdev;
1670 request_queue_t *r_queue = bdev_get_queue(bdev);
1671
1672 if (!r_queue->issue_flush_fn)
1673 ret = -EOPNOTSUPP;
1674 else {
1675 atomic_inc(&rdev->nr_pending);
1676 rcu_read_unlock();
1677 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1678 error_sector);
1679 rdev_dec_pending(rdev, mddev);
1680 rcu_read_lock();
1681 }
1682 }
1683 }
1684 rcu_read_unlock();
1685 return ret;
1686}
1687
1688static inline void raid6_plug_device(raid6_conf_t *conf)
1689{
1690 spin_lock_irq(&conf->device_lock);
1691 blk_plug_device(conf->mddev->queue);
1692 spin_unlock_irq(&conf->device_lock);
1693}
1694
1695static int make_request (request_queue_t *q, struct bio * bi)
1696{
1697 mddev_t *mddev = q->queuedata;
1698 raid6_conf_t *conf = mddev_to_conf(mddev);
1699 const unsigned int raid_disks = conf->raid_disks;
1700 const unsigned int data_disks = raid_disks - 2;
1701 unsigned int dd_idx, pd_idx;
1702 sector_t new_sector;
1703 sector_t logical_sector, last_sector;
1704 struct stripe_head *sh;
1705 const int rw = bio_data_dir(bi);
1706
1707 if (unlikely(bio_barrier(bi))) {
1708 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1709 return 0;
1710 }
1711
1712 md_write_start(mddev, bi);
1713
1714 disk_stat_inc(mddev->gendisk, ios[rw]);
1715 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
1716
1717 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1718 last_sector = bi->bi_sector + (bi->bi_size>>9);
1719
1720 bi->bi_next = NULL;
1721 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1722
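	/*
	 * Illustration (assuming 4K pages, so STRIPE_SECTORS == 8): a bio
	 * at sector 13 spanning 16 sectors is walked as the stripe-aligned
	 * pieces at sectors 8, 16 and 24, each handed to its own
	 * stripe_head below.
	 */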
1723 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1724 DEFINE_WAIT(w);
1725
1726 new_sector = raid6_compute_sector(logical_sector,
1727 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1728
1729 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1730 (unsigned long long)new_sector,
1731 (unsigned long long)logical_sector);
1732
1733 retry:
1734 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1735 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1736 if (sh) {
1737 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1738 /* Add failed due to overlap. Flush everything
1739 * and wait a while
1740 */
1741 raid6_unplug_device(mddev->queue);
1742 release_stripe(sh);
1743 schedule();
1744 goto retry;
1745 }
1746 finish_wait(&conf->wait_for_overlap, &w);
1747 raid6_plug_device(conf);
1748 handle_stripe(sh, NULL);
1749 release_stripe(sh);
1750 } else {
1751 /* cannot get stripe for read-ahead, just give-up */
1752 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1753 finish_wait(&conf->wait_for_overlap, &w);
1754 break;
1755 }
1756
1757 }
1758 spin_lock_irq(&conf->device_lock);
1759 if (--bi->bi_phys_segments == 0) {
1760 int bytes = bi->bi_size;
1761
1762 if (rw == WRITE )
1763 md_write_end(mddev);
1764 bi->bi_size = 0;
1765 bi->bi_end_io(bi, bytes, 0);
1766 }
1767 spin_unlock_irq(&conf->device_lock);
1768 return 0;
1769}
1770
1771/* FIXME go_faster isn't used */
1772static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1773{
1774 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1775 struct stripe_head *sh;
1776 int sectors_per_chunk = conf->chunk_size >> 9;
1777 sector_t x;
1778 unsigned long stripe;
1779 int chunk_offset;
1780 int dd_idx, pd_idx;
1781 sector_t first_sector;
1782 int raid_disks = conf->raid_disks;
1783 int data_disks = raid_disks - 2;
1784 sector_t max_sector = mddev->size << 1;
1785 int sync_blocks;
1786 int still_degraded = 0;
1787 int i;
1788
1789 if (sector_nr >= max_sector) {
1790 /* just being told to finish up .. nothing much to do */
1791 unplug_slaves(mddev);
1792
1793 if (mddev->curr_resync < max_sector) /* aborted */
1794 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1795 &sync_blocks, 1);
1796 else /* completed sync */
1797 conf->fullsync = 0;
1798 bitmap_close_sync(mddev->bitmap);
1799
1800 return 0;
1801 }
1802 /* if there are 2 or more failed drives and we are trying
1803 * to resync, then assert that we are finished, because there is
1804 * nothing we can do.
1805 */
1806 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1807 sector_t rv = (mddev->size << 1) - sector_nr;
1808 *skipped = 1;
1809 return rv;
1810 }
1811 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1812 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1813 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1814 /* we can skip this block, and probably more */
1815 sync_blocks /= STRIPE_SECTORS;
1816 *skipped = 1;
1817 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1818 }
1819
1820 x = sector_nr;
1821 chunk_offset = sector_div(x, sectors_per_chunk);
1822 stripe = x;
1823 BUG_ON(x != stripe);
1824
1825 first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1826 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1827 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1828 if (sh == NULL) {
1829 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1830 /* make sure we don't swamp the stripe cache if someone else
1831 * is trying to get access
1832 */
1833 schedule_timeout_uninterruptible(1);
1834 }
1835 /* Need to check if array will still be degraded after recovery/resync
1836 * We don't need to check the 'failed' flag as when that gets set,
1837 * recovery aborts.
1838 */
1839 for (i=0; i<mddev->raid_disks; i++)
1840 if (conf->disks[i].rdev == NULL)
1841 still_degraded = 1;
1842
1843 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
1844
1845 spin_lock(&sh->lock);
1846 set_bit(STRIPE_SYNCING, &sh->state);
1847 clear_bit(STRIPE_INSYNC, &sh->state);
1848 spin_unlock(&sh->lock);
1849
1850 handle_stripe(sh, NULL);
1851 release_stripe(sh);
1852
1853 return STRIPE_SECTORS;
1854}
1855
1856/*
1857 * This is our raid6 kernel thread.
1858 *
1859 * We scan the hash table for stripes which can be handled now.
1860 * During the scan, completed stripes are saved for us by the interrupt
1861 * handler, so that they will not have to wait for our next wakeup.
1862 */
1863static void raid6d (mddev_t *mddev)
1864{
1865 struct stripe_head *sh;
1866 raid6_conf_t *conf = mddev_to_conf(mddev);
1867 int handled;
1868
1869 PRINTK("+++ raid6d active\n");
1870
1871 md_check_recovery(mddev);
1872
1873 handled = 0;
1874 spin_lock_irq(&conf->device_lock);
1875 while (1) {
1876 struct list_head *first;
1877
1878 if (conf->seq_flush - conf->seq_write > 0) {
1879 int seq = conf->seq_flush;
1880 spin_unlock_irq(&conf->device_lock);
1881 bitmap_unplug(mddev->bitmap);
1882 spin_lock_irq(&conf->device_lock);
1883 conf->seq_write = seq;
1884 activate_bit_delay(conf);
1885 }
1886
1887 if (list_empty(&conf->handle_list) &&
1888 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1889 !blk_queue_plugged(mddev->queue) &&
1890 !list_empty(&conf->delayed_list))
1891 raid6_activate_delayed(conf);
1892
1893 if (list_empty(&conf->handle_list))
1894 break;
1895
1896 first = conf->handle_list.next;
1897 sh = list_entry(first, struct stripe_head, lru);
1898
1899 list_del_init(first);
1900 atomic_inc(&sh->count);
1901 BUG_ON(atomic_read(&sh->count)!= 1);
1902 spin_unlock_irq(&conf->device_lock);
1903
1904 handled++;
1905 handle_stripe(sh, conf->spare_page);
1906 release_stripe(sh);
1907
1908 spin_lock_irq(&conf->device_lock);
1909 }
1910 PRINTK("%d stripes handled\n", handled);
1911
1912 spin_unlock_irq(&conf->device_lock);
1913
1914 unplug_slaves(mddev);
1915
1916 PRINTK("--- raid6d inactive\n");
1917}
1918
1919static ssize_t
1920raid6_show_stripe_cache_size(mddev_t *mddev, char *page)
1921{
1922 raid6_conf_t *conf = mddev_to_conf(mddev);
1923 if (conf)
1924 return sprintf(page, "%d\n", conf->max_nr_stripes);
1925 else
1926 return 0;
1927}
1928
1929static ssize_t
1930raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
1931{
1932 raid6_conf_t *conf = mddev_to_conf(mddev);
1933 char *end;
1934 int new;
1935 if (len >= PAGE_SIZE)
1936 return -EINVAL;
1937 if (!conf)
1938 return -ENODEV;
1939
1940 new = simple_strtoul(page, &end, 10);
1941 if (!*page || (*end && *end != '\n') )
1942 return -EINVAL;
1943 if (new <= 16 || new > 32768)
1944 return -EINVAL;
1945 while (new < conf->max_nr_stripes) {
1946 if (drop_one_stripe(conf))
1947 conf->max_nr_stripes--;
1948 else
1949 break;
1950 }
1951 while (new > conf->max_nr_stripes) {
1952 if (grow_one_stripe(conf))
1953 conf->max_nr_stripes++;
1954 else break;
1955 }
1956 return len;
1957}
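/*
 * Tunable via sysfs, e.g. (illustrative path):
 *	echo 512 > /sys/block/md0/md/stripe_cache_size
 * Growing the cache costs roughly one page plus one struct bio per
 * member disk per stripe - see the 'memory' estimate in run().
 */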
1958
1959static struct md_sysfs_entry
1960raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
1961 raid6_show_stripe_cache_size,
1962 raid6_store_stripe_cache_size);
1963
1964static ssize_t
1965stripe_cache_active_show(mddev_t *mddev, char *page)
1966{
1967 raid6_conf_t *conf = mddev_to_conf(mddev);
1968 if (conf)
1969 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
1970 else
1971 return 0;
1972}
1973
1974static struct md_sysfs_entry
1975raid6_stripecache_active = __ATTR_RO(stripe_cache_active);
1976
1977static struct attribute *raid6_attrs[] = {
1978 &raid6_stripecache_size.attr,
1979 &raid6_stripecache_active.attr,
1980 NULL,
1981};
1982static struct attribute_group raid6_attrs_group = {
1983 .name = NULL,
1984 .attrs = raid6_attrs,
1985};
1986
1987static int run(mddev_t *mddev)
1988{
1989 raid6_conf_t *conf;
1990 int raid_disk, memory;
1991 mdk_rdev_t *rdev;
1992 struct disk_info *disk;
1993 struct list_head *tmp;
1994
1995 if (mddev->level != 6) {
1996 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
1997 return -EIO;
1998 }
1999
2000 mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
2001 if ((conf = mddev->private) == NULL)
2002 goto abort;
2003 conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
2004 GFP_KERNEL);
2005 if (!conf->disks)
2006 goto abort;
2007
2008 conf->mddev = mddev;
2009
2010 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2011 goto abort;
2012
2013 conf->spare_page = alloc_page(GFP_KERNEL);
2014 if (!conf->spare_page)
2015 goto abort;
2016
2017 spin_lock_init(&conf->device_lock);
2018 init_waitqueue_head(&conf->wait_for_stripe);
2019 init_waitqueue_head(&conf->wait_for_overlap);
2020 INIT_LIST_HEAD(&conf->handle_list);
2021 INIT_LIST_HEAD(&conf->delayed_list);
2022 INIT_LIST_HEAD(&conf->bitmap_list);
2023 INIT_LIST_HEAD(&conf->inactive_list);
2024 atomic_set(&conf->active_stripes, 0);
2025 atomic_set(&conf->preread_active_stripes, 0);
2026
2027 PRINTK("raid6: run(%s) called.\n", mdname(mddev));
2028
2029 ITERATE_RDEV(mddev,rdev,tmp) {
2030 raid_disk = rdev->raid_disk;
2031 if (raid_disk >= mddev->raid_disks
2032 || raid_disk < 0)
2033 continue;
2034 disk = conf->disks + raid_disk;
2035
2036 disk->rdev = rdev;
2037
2038 if (test_bit(In_sync, &rdev->flags)) {
2039 char b[BDEVNAME_SIZE];
2040 printk(KERN_INFO "raid6: device %s operational as raid"
2041 " disk %d\n", bdevname(rdev->bdev,b),
2042 raid_disk);
2043 conf->working_disks++;
2044 }
2045 }
2046
2047 conf->raid_disks = mddev->raid_disks;
2048
2049 /*
2050 * 0 for a fully functional array, 1 or 2 for a degraded array.
2051 */
2052 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2053 conf->mddev = mddev;
2054 conf->chunk_size = mddev->chunk_size;
2055 conf->level = mddev->level;
2056 conf->algorithm = mddev->layout;
2057 conf->max_nr_stripes = NR_STRIPES;
2058
2059 /* device size must be a multiple of chunk size */
2060 mddev->size &= ~(mddev->chunk_size/1024 -1);
2061 mddev->resync_max_sectors = mddev->size << 1;
2062
2063 if (conf->raid_disks < 4) {
2064 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
2065 mdname(mddev), conf->raid_disks);
2066 goto abort;
2067 }
2068 if (!conf->chunk_size || conf->chunk_size % 4) {
2069 printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
2070 conf->chunk_size, mdname(mddev));
2071 goto abort;
2072 }
2073 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
2074 printk(KERN_ERR
2075 "raid6: unsupported parity algorithm %d for %s\n",
2076 conf->algorithm, mdname(mddev));
2077 goto abort;
2078 }
2079 if (mddev->degraded > 2) {
2080 printk(KERN_ERR "raid6: not enough operational devices for %s"
2081 " (%d/%d failed)\n",
2082 mdname(mddev), conf->failed_disks, conf->raid_disks);
2083 goto abort;
2084 }
2085
2086 if (mddev->degraded > 0 &&
2087 mddev->recovery_cp != MaxSector) {
2088 if (mddev->ok_start_degraded)
2089			printk(KERN_WARNING "raid6: starting dirty degraded array: %s"
2090			       " - data corruption possible.\n",
2091 mdname(mddev));
2092 else {
2093 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2094 " for %s\n", mdname(mddev));
2095 goto abort;
2096 }
2097 }
2098
2099 {
2100 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
2101 if (!mddev->thread) {
2102 printk(KERN_ERR
2103 "raid6: couldn't allocate thread for %s\n",
2104 mdname(mddev));
2105 goto abort;
2106 }
2107 }
2108
2109 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
2110 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
2111 if (grow_stripes(conf, conf->max_nr_stripes)) {
2112 printk(KERN_ERR
2113 "raid6: couldn't allocate %dkB for buffers\n", memory);
2114 shrink_stripes(conf);
2115 md_unregister_thread(mddev->thread);
2116 goto abort;
2117 } else
2118 printk(KERN_INFO "raid6: allocated %dkB for %s\n",
2119 memory, mdname(mddev));
2120
2121 if (mddev->degraded == 0)
2122 printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
2123 " devices, algorithm %d\n", conf->level, mdname(mddev),
2124 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
2125 conf->algorithm);
2126 else
2127 printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
2128 " out of %d devices, algorithm %d\n", conf->level,
2129 mdname(mddev), mddev->raid_disks - mddev->degraded,
2130 mddev->raid_disks, conf->algorithm);
2131
2132 print_raid6_conf(conf);
2133
2134 /* read-ahead size must cover two whole stripes, which is
2135 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
2136 */
2137 {
2138 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
2139 / PAGE_SIZE;
2140 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2141 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2142 }
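	/* e.g. (illustrative): 6 disks with 64K chunks and 4K pages give
	 * stripe = 4 * 65536 / 4096 = 64 pages, so ra_pages is raised to
	 * at least 128 pages (512K of read-ahead).
	 */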
2143
2144 /* Ok, everything is just fine now */
2145 sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
2146
2147 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
2148
2149 mddev->queue->unplug_fn = raid6_unplug_device;
2150 mddev->queue->issue_flush_fn = raid6_issue_flush;
2151 return 0;
2152abort:
2153 if (conf) {
2154 print_raid6_conf(conf);
2155 safe_put_page(conf->spare_page);
2156 kfree(conf->stripe_hashtbl);
2157 kfree(conf->disks);
2158 kfree(conf);
2159 }
2160 mddev->private = NULL;
2161 printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
2162 return -EIO;
2163}
2164
2165
2166
2167static int stop (mddev_t *mddev)
2168{
2169 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2170
2171 md_unregister_thread(mddev->thread);
2172 mddev->thread = NULL;
2173 shrink_stripes(conf);
2174 kfree(conf->stripe_hashtbl);
2175 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2176 sysfs_remove_group(&mddev->kobj, &raid6_attrs_group);
2177 kfree(conf);
2178 mddev->private = NULL;
2179 return 0;
2180}
2181
2182#if RAID6_DUMPSTATE
2183static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2184{
2185 int i;
2186
2187 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2188 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2189 seq_printf(seq, "sh %llu, count %d.\n",
2190 (unsigned long long)sh->sector, atomic_read(&sh->count));
2191 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2192 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
2193 seq_printf(seq, "(cache%d: %p %ld) ",
2194 i, sh->dev[i].page, sh->dev[i].flags);
2195 }
2196 seq_printf(seq, "\n");
2197}
2198
2199static void printall (struct seq_file *seq, raid6_conf_t *conf)
2200{
2201 struct stripe_head *sh;
2202 struct hlist_node *hn;
2203 int i;
2204
2205 spin_lock_irq(&conf->device_lock);
2206 for (i = 0; i < NR_HASH; i++) {
2207 sh = conf->stripe_hashtbl[i];
2208 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2209 if (sh->raid_conf != conf)
2210 continue;
2211 print_sh(seq, sh);
2212 }
2213 }
2214 spin_unlock_irq(&conf->device_lock);
2215}
2216#endif
2217
2218static void status (struct seq_file *seq, mddev_t *mddev)
2219{
2220 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2221 int i;
2222
2223 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
2224 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
2225 for (i = 0; i < conf->raid_disks; i++)
2226 seq_printf (seq, "%s",
2227 conf->disks[i].rdev &&
2228 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2229 seq_printf (seq, "]");
2230#if RAID6_DUMPSTATE
2231 seq_printf (seq, "\n");
2232 printall(seq, conf);
2233#endif
2234}
2235
2236static void print_raid6_conf (raid6_conf_t *conf)
2237{
2238 int i;
2239 struct disk_info *tmp;
2240
2241 printk("RAID6 conf printout:\n");
2242 if (!conf) {
2243 printk("(conf==NULL)\n");
2244 return;
2245 }
2246 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
2247 conf->working_disks, conf->failed_disks);
2248
2249 for (i = 0; i < conf->raid_disks; i++) {
2250 char b[BDEVNAME_SIZE];
2251 tmp = conf->disks + i;
2252 if (tmp->rdev)
2253 printk(" disk %d, o:%d, dev:%s\n",
2254 i, !test_bit(Faulty, &tmp->rdev->flags),
2255 bdevname(tmp->rdev->bdev,b));
2256 }
2257}
2258
2259static int raid6_spare_active(mddev_t *mddev)
2260{
2261 int i;
2262 raid6_conf_t *conf = mddev->private;
2263 struct disk_info *tmp;
2264
2265 for (i = 0; i < conf->raid_disks; i++) {
2266 tmp = conf->disks + i;
2267 if (tmp->rdev
2268 && !test_bit(Faulty, &tmp->rdev->flags)
2269 && !test_bit(In_sync, &tmp->rdev->flags)) {
2270 mddev->degraded--;
2271 conf->failed_disks--;
2272 conf->working_disks++;
2273 set_bit(In_sync, &tmp->rdev->flags);
2274 }
2275 }
2276 print_raid6_conf(conf);
2277 return 0;
2278}
2279
2280static int raid6_remove_disk(mddev_t *mddev, int number)
2281{
2282 raid6_conf_t *conf = mddev->private;
2283 int err = 0;
2284 mdk_rdev_t *rdev;
2285 struct disk_info *p = conf->disks + number;
2286
2287 print_raid6_conf(conf);
2288 rdev = p->rdev;
2289 if (rdev) {
2290 if (test_bit(In_sync, &rdev->flags) ||
2291 atomic_read(&rdev->nr_pending)) {
2292 err = -EBUSY;
2293 goto abort;
2294 }
2295 p->rdev = NULL;
2296 synchronize_rcu();
2297 if (atomic_read(&rdev->nr_pending)) {
2298 /* lost the race, try later */
2299 err = -EBUSY;
2300 p->rdev = rdev;
2301 }
2302 }
2303
2304abort:
2305
2306 print_raid6_conf(conf);
2307 return err;
2308}
2309
2310static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2311{
2312 raid6_conf_t *conf = mddev->private;
2313 int found = 0;
2314 int disk;
2315 struct disk_info *p;
2316
2317 if (mddev->degraded > 2)
2318 /* no point adding a device */
2319 return 0;
2320 /*
2321 * find the disk ... but prefer rdev->saved_raid_disk
2322 * if possible.
2323 */
2324 if (rdev->saved_raid_disk >= 0 &&
2325 conf->disks[rdev->saved_raid_disk].rdev == NULL)
2326 disk = rdev->saved_raid_disk;
2327 else
2328 disk = 0;
2329 for ( ; disk < mddev->raid_disks; disk++)
2330 if ((p=conf->disks + disk)->rdev == NULL) {
2331 clear_bit(In_sync, &rdev->flags);
2332 rdev->raid_disk = disk;
2333 found = 1;
2334 if (rdev->saved_raid_disk != disk)
2335 conf->fullsync = 1;
2336 rcu_assign_pointer(p->rdev, rdev);
2337 break;
2338 }
2339 print_raid6_conf(conf);
2340 return found;
2341}
2342
2343static int raid6_resize(mddev_t *mddev, sector_t sectors)
2344{
2345 /* no resync is happening, and there is enough space
2346 * on all devices, so we can resize.
2347 * We need to make sure resync covers any new space.
2348 * If the array is shrinking we should possibly wait until
2349 * any io in the removed space completes, but it hardly seems
2350 * worth it.
2351 */
2352 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2353 mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
2354 set_capacity(mddev->gendisk, mddev->array_size << 1);
2355 mddev->changed = 1;
2356 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
2357 mddev->recovery_cp = mddev->size << 1;
2358 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2359 }
2360 mddev->size = sectors /2;
2361 mddev->resync_max_sectors = sectors;
2362 return 0;
2363}
2364
2365static void raid6_quiesce(mddev_t *mddev, int state)
2366{
2367 raid6_conf_t *conf = mddev_to_conf(mddev);
2368
2369 switch(state) {
2370 case 1: /* stop all writes */
2371 spin_lock_irq(&conf->device_lock);
2372 conf->quiesce = 1;
2373 wait_event_lock_irq(conf->wait_for_stripe,
2374 atomic_read(&conf->active_stripes) == 0,
2375 conf->device_lock, /* nothing */);
2376 spin_unlock_irq(&conf->device_lock);
2377 break;
2378
2379 case 0: /* re-enable writes */
2380 spin_lock_irq(&conf->device_lock);
2381 conf->quiesce = 0;
2382 wake_up(&conf->wait_for_stripe);
2383 spin_unlock_irq(&conf->device_lock);
2384 break;
2385 }
2386}
2387
2388static struct mdk_personality raid6_personality =
2389{
2390 .name = "raid6",
2391 .level = 6,
2392 .owner = THIS_MODULE,
2393 .make_request = make_request,
2394 .run = run,
2395 .stop = stop,
2396 .status = status,
2397 .error_handler = error,
2398 .hot_add_disk = raid6_add_disk,
2399 .hot_remove_disk= raid6_remove_disk,
2400 .spare_active = raid6_spare_active,
2401 .sync_request = sync_request,
2402 .resize = raid6_resize,
2403 .quiesce = raid6_quiesce,
2404};
2405
2406static int __init raid6_init(void)
2407{
2408 int e;
2409
2410 e = raid6_select_algo();
2411 if ( e )
2412 return e;
2413
2414 return register_md_personality(&raid6_personality);
2415}
2416
2417static void raid6_exit (void)
2418{
2419 unregister_md_personality(&raid6_personality);
2420}
2421
2422module_init(raid6_init);
2423module_exit(raid6_exit);
2424MODULE_LICENSE("GPL");
2425MODULE_ALIAS("md-personality-8"); /* RAID6 */
2426MODULE_ALIAS("md-raid6");
2427MODULE_ALIAS("md-level-6");