Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                46
-rw-r--r--  drivers/md/Makefile                5
-rw-r--r--  drivers/md/bitmap.c              484
-rw-r--r--  drivers/md/dm-crypt.c            194
-rw-r--r--  drivers/md/dm-emc.c               40
-rw-r--r--  drivers/md/dm-exception-store.c   67
-rw-r--r--  drivers/md/dm-ioctl.c            139
-rw-r--r--  drivers/md/dm-linear.c             8
-rw-r--r--  drivers/md/dm-log.c              157
-rw-r--r--  drivers/md/dm-mpath.c             46
-rw-r--r--  drivers/md/dm-raid1.c            101
-rw-r--r--  drivers/md/dm-round-robin.c        6
-rw-r--r--  drivers/md/dm-snap.c              17
-rw-r--r--  drivers/md/dm-stripe.c            25
-rw-r--r--  drivers/md/dm-table.c             57
-rw-r--r--  drivers/md/dm-target.c             2
-rw-r--r--  drivers/md/dm-zero.c               8
-rw-r--r--  drivers/md/dm.c                  186
-rw-r--r--  drivers/md/dm.h                   81
-rw-r--r--  drivers/md/kcopyd.c                5
-rw-r--r--  drivers/md/linear.c               80
-rw-r--r--  drivers/md/md.c                  754
-rw-r--r--  drivers/md/raid1.c               104
-rw-r--r--  drivers/md/raid10.c               81
-rw-r--r--  drivers/md/raid5.c              1390
-rw-r--r--  drivers/md/raid6main.c          2427
26 files changed, 2758 insertions, 3752 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac25a48362ac..bf869ed03eed 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -90,7 +90,7 @@ config MD_RAID10
 	depends on BLK_DEV_MD && EXPERIMENTAL
 	---help---
 	  RAID-10 provides a combination of striping (RAID-0) and
-	  mirroring (RAID-1) with easier configuration and more flexable
+	  mirroring (RAID-1) with easier configuration and more flexible
 	  layout.
 	  Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
 	  be the same size (or at least, only as much as the smallest device
@@ -104,8 +104,8 @@ config MD_RAID10
 
 	  If unsure, say Y.
 
-config MD_RAID5
-	tristate "RAID-4/RAID-5 mode"
+config MD_RAID456
+	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -116,20 +116,28 @@ config MD_RAID5
 	  while a RAID-5 set distributes the parity across the drives in one
 	  of the available parity distribution methods.
 
+	  A RAID-6 set of N drives with a capacity of C MB per drive
+	  provides the capacity of C * (N - 2) MB, and protects
+	  against a failure of any two drives. For a given sector
+	  (row) number, (N - 2) drives contain data sectors, and two
+	  drives contains two independent redundancy syndromes. Like
+	  RAID-5, RAID-6 distributes the syndromes across the drives
+	  in one of the available parity distribution methods.
+
 	  Information about Software RAID on Linux is contained in the
 	  Software-RAID mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>. There you will also
 	  learn where to get the supporting user space utilities raidtools.
 
-	  If you want to use such a RAID-4/RAID-5 set, say Y. To
+	  If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
 	  compile this code as a module, choose M here: the module
-	  will be called raid5.
+	  will be called raid456.
 
 	  If unsure, say Y.
 
 config MD_RAID5_RESHAPE
 	bool "Support adding drives to a raid-5 array (experimental)"
-	depends on MD_RAID5 && EXPERIMENTAL
+	depends on MD_RAID456 && EXPERIMENTAL
 	---help---
 	  A RAID-5 set can be expanded by adding extra drives. This
 	  requires "restriping" the array which means (almost) every
@@ -139,7 +147,7 @@ config MD_RAID5_RESHAPE
 	  is online. However it is still EXPERIMENTAL code. It should
 	  work, but please be sure that you have backups.
 
-	  You will need mdadm verion 2.4.1 or later to use this
+	  You will need mdadm version 2.4.1 or later to use this
 	  feature safely. During the early stage of reshape there is
 	  a critical section where live data is being over-written. A
 	  crash during this time needs extra care for recovery. The
@@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE
 	  There should be enough spares already present to make the new
 	  array workable.
 
-config MD_RAID6
-	tristate "RAID-6 mode"
-	depends on BLK_DEV_MD
-	---help---
-	  A RAID-6 set of N drives with a capacity of C MB per drive
-	  provides the capacity of C * (N - 2) MB, and protects
-	  against a failure of any two drives. For a given sector
-	  (row) number, (N - 2) drives contain data sectors, and two
-	  drives contains two independent redundancy syndromes. Like
-	  RAID-5, RAID-6 distributes the syndromes across the drives
-	  in one of the available parity distribution methods.
-
-	  RAID-6 requires mdadm-1.5.0 or later, available at:
-
-	  ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
-
-	  If you want to use such a RAID-6 set, say Y. To compile
-	  this code as a module, choose M here: the module will be
-	  called raid6.
-
-	  If unsure, say Y.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
@@ -235,7 +221,7 @@ config DM_SNAPSHOT
 	tristate "Snapshot target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
 	---help---
-	  Allow volume managers to take writeable snapshots of a device.
+	  Allow volume managers to take writable snapshots of a device.
 
 config DM_MIRROR
 	tristate "Mirror target (EXPERIMENTAL)"
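As a quick aside on the capacity rule quoted in the merged help text above, a minimal user-space sketch (illustrative only, not part of the patch): a RAID-5 set gives up one drive's worth of space for parity, a RAID-6 set gives up two for its P and Q syndromes.

    #include <stdio.h>

    /* Usable capacity, in MB, for n equally sized drives of cap_mb each. */
    static unsigned long raid5_capacity(unsigned int n, unsigned long cap_mb)
    {
            return cap_mb * (n - 1);        /* one drive's worth of parity */
    }

    static unsigned long raid6_capacity(unsigned int n, unsigned long cap_mb)
    {
            return cap_mb * (n - 2);        /* two independent syndromes (P and Q) */
    }

    int main(void)
    {
            /* e.g. six 500000 MB drives */
            printf("RAID-5: %lu MB\n", raid5_capacity(6, 500000));  /* 2500000 */
            printf("RAID-6: %lu MB\n", raid6_capacity(6, 500000));  /* 2000000 */
            return 0;
    }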
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d3efedf6a6ad..34957a68d921 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
 dm-mirror-objs   := dm-log.o dm-raid1.o
 md-mod-objs      := md.o bitmap.o
-raid6-objs	:= raid6main.o raid6algos.o raid6recov.o raid6tables.o \
+raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
		   raid6int1.o raid6int2.o raid6int4.o \
		   raid6int8.o raid6int16.o raid6int32.o \
		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
@@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID5)		+= raid5.o xor.o
-obj-$(CONFIG_MD_RAID6)		+= raid6.o xor.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o xor.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8ffaee20ff8..ecc56765d949 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -7,7 +7,6 @@
  * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
  * - added disk storage for bitmap
  * - changes to allow various bitmap chunk sizes
- * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
  */
 
 /*
@@ -15,16 +14,12 @@
  *
  * flush after percent set rather than just time based. (maybe both).
  * wait if count gets too high, wake when it drops to half.
- * allow bitmap to be mirrored with superblock (before or after...)
- * allow hot-add to re-instate a current device.
- * allow hot-add of bitmap after quiessing device
  */
 
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/config.h>
 #include <linux/timer.h>
 #include <linux/sched.h>
 #include <linux/list.h>
@@ -73,24 +68,6 @@ static inline char * bmname(struct bitmap *bitmap)
 
 
 /*
- * test if the bitmap is active
- */
-int bitmap_active(struct bitmap *bitmap)
-{
-	unsigned long flags;
-	int res = 0;
-
-	if (!bitmap)
-		return res;
-	spin_lock_irqsave(&bitmap->lock, flags);
-	res = bitmap->flags & BITMAP_ACTIVE;
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-	return res;
-}
-
-#define WRITE_POOL_SIZE 256
-
-/*
  * just a placeholder - calls kmalloc for bitmap pages
  */
 static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
@@ -269,6 +246,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 	if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 		page->index = index;
+		attach_page_buffers(page, NULL); /* so that free_buffer will
+						  * quietly no-op */
 		return page;
 	}
 }
@@ -300,77 +279,132 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
  */
 static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	int ret = -ENOMEM;
+	struct buffer_head *bh;
 
 	if (bitmap->file == NULL)
 		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
 
-	flush_dcache_page(page); /* make sure visible to anyone reading the file */
+	bh = page_buffers(page);
 
-	if (wait)
-		lock_page(page);
-	else {
-		if (TestSetPageLocked(page))
-			return -EAGAIN; /* already locked */
-		if (PageWriteback(page)) {
-			unlock_page(page);
-			return -EAGAIN;
-		}
+	while (bh && bh->b_blocknr) {
+		atomic_inc(&bitmap->pending_writes);
+		set_buffer_locked(bh);
+		set_buffer_mapped(bh);
+		submit_bh(WRITE, bh);
+		bh = bh->b_this_page;
 	}
 
-	ret = page->mapping->a_ops->prepare_write(bitmap->file, page, 0, PAGE_SIZE);
-	if (!ret)
-		ret = page->mapping->a_ops->commit_write(bitmap->file, page, 0,
-			PAGE_SIZE);
-	if (ret) {
-		unlock_page(page);
-		return ret;
+	if (wait) {
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
+		return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
 	}
+	return 0;
+}
 
-	set_page_dirty(page); /* force it to be written out */
-
-	if (!wait) {
-		/* add to list to be waited for by daemon */
-		struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
-		item->page = page;
-		get_page(page);
-		spin_lock(&bitmap->write_lock);
-		list_add(&item->list, &bitmap->complete_pages);
-		spin_unlock(&bitmap->write_lock);
-		md_wakeup_thread(bitmap->writeback_daemon);
+static void end_bitmap_write(struct buffer_head *bh, int uptodate)
+{
+	struct bitmap *bitmap = bh->b_private;
+	unsigned long flags;
+
+	if (!uptodate) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		bitmap->flags |= BITMAP_WRITE_ERROR;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+	}
+	if (atomic_dec_and_test(&bitmap->pending_writes))
+		wake_up(&bitmap->write_wait);
+}
+
+/* copied from buffer.c */
+static void
+__clear_page_buffers(struct page *page)
+{
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+}
+static void free_buffers(struct page *page)
+{
+	struct buffer_head *bh = page_buffers(page);
+
+	while (bh) {
+		struct buffer_head *next = bh->b_this_page;
+		free_buffer_head(bh);
+		bh = next;
 	}
-	return write_one_page(page, wait);
+	__clear_page_buffers(page);
+	put_page(page);
 }
 
-/* read a page from a file, pinning it into cache, and return bytes_read */
+/* read a page from a file.
+ * We both read the page, and attach buffers to the page to record the
+ * address of each block (using bmap).  These addresses will be used
+ * to write the block later, completely bypassing the filesystem.
+ * This usage is similar to how swap files are handled, and allows us
+ * to write to a file with no concerns of memory allocation failing.
+ */
 static struct page *read_page(struct file *file, unsigned long index,
-			unsigned long *bytes_read)
+			      struct bitmap *bitmap,
+			      unsigned long count)
 {
-	struct inode *inode = file->f_mapping->host;
 	struct page *page = NULL;
-	loff_t isize = i_size_read(inode);
-	unsigned long end_index = isize >> PAGE_SHIFT;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct buffer_head *bh;
+	sector_t block;
 
 	PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT);
 
-	page = read_cache_page(inode->i_mapping, index,
-			(filler_t *)inode->i_mapping->a_ops->readpage, file);
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		page = ERR_PTR(-ENOMEM);
 	if (IS_ERR(page))
 		goto out;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page) || PageError(page)) {
+
+	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+	if (!bh) {
 		put_page(page);
-		page = ERR_PTR(-EIO);
+		page = ERR_PTR(-ENOMEM);
 		goto out;
 	}
+	attach_page_buffers(page, bh);
+	block = index << (PAGE_SHIFT - inode->i_blkbits);
+	while (bh) {
+		if (count == 0)
+			bh->b_blocknr = 0;
+		else {
+			bh->b_blocknr = bmap(inode, block);
+			if (bh->b_blocknr == 0) {
+				/* Cannot use this file! */
+				free_buffers(page);
+				page = ERR_PTR(-EINVAL);
+				goto out;
+			}
+			bh->b_bdev = inode->i_sb->s_bdev;
+			if (count < (1<<inode->i_blkbits))
+				count = 0;
+			else
+				count -= (1<<inode->i_blkbits);
+
+			bh->b_end_io = end_bitmap_write;
+			bh->b_private = bitmap;
+			atomic_inc(&bitmap->pending_writes);
+			set_buffer_locked(bh);
+			set_buffer_mapped(bh);
+			submit_bh(READ, bh);
+		}
+		block++;
+		bh = bh->b_this_page;
+	}
+	page->index = index;
 
-	if (index > end_index) /* we have read beyond EOF */
-		*bytes_read = 0;
-	else if (index == end_index) /* possible short read */
-		*bytes_read = isize & ~PAGE_MASK;
-	else
-		*bytes_read = PAGE_SIZE; /* got a full page */
+	wait_event(bitmap->write_wait,
+		   atomic_read(&bitmap->pending_writes)==0);
+	if (bitmap->flags & BITMAP_WRITE_ERROR) {
+		free_buffers(page);
+		page = ERR_PTR(-EIO);
+	}
 out:
 	if (IS_ERR(page))
 		printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
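The rewritten read_page() above records, via bmap(), the device block backing each filesystem block of the bitmap file, so later writes can go straight to the block device. A rough user-space analogue of that mapping (a hedged sketch only; it uses the FIBMAP ioctl, which needs root and is not what the kernel code calls, but it exposes the same logical-to-physical translation):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <linux/fs.h>               /* FIBMAP, FIGETBSZ */

    int main(int argc, char **argv)
    {
            int fd, bsz, i;
            struct stat st;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            fstat(fd, &st);
            ioctl(fd, FIGETBSZ, &bsz);  /* filesystem block size */

            for (i = 0; (long long)i * bsz < st.st_size; i++) {
                    int blk = i;        /* logical block in, physical block out */
                    if (ioctl(fd, FIBMAP, &blk) < 0)
                            return 1;
                    printf("logical %d -> device block %d\n", i, blk);
            }
            close(fd);
            return 0;
    }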
@@ -441,16 +475,14 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	char *reason = NULL;
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
-	unsigned long bytes_read;
 	unsigned long long events;
 	int err = -EINVAL;
 
 	/* page 0 is the superblock, read it... */
 	if (bitmap->file)
-		bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, PAGE_SIZE);
 	else {
 		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
-		bytes_read = PAGE_SIZE;
 	}
 	if (IS_ERR(bitmap->sb_page)) {
 		err = PTR_ERR(bitmap->sb_page);
@@ -460,13 +492,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
-	if (bytes_read < sizeof(*sb)) { /* short read */
-		printk(KERN_INFO "%s: bitmap file superblock truncated\n",
-			bmname(bitmap));
-		err = -ENOSPC;
-		goto out;
-	}
-
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
 	write_behind = le32_to_cpu(sb->write_behind);
@@ -550,7 +575,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 		return;
 	}
-	get_page(bitmap->sb_page);
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	switch (op) {
@@ -561,7 +585,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		default: BUG();
 	}
 	kunmap_atomic(sb, KM_USER0);
-	put_page(bitmap->sb_page);
 }
 
 /*
@@ -614,48 +637,17 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 
 	while (pages--)
 		if (map[pages]->index != 0) /* 0 is sb_page, release it below */
-			put_page(map[pages]);
+			free_buffers(map[pages]);
 	kfree(map);
 	kfree(attr);
 
-	safe_put_page(sb_page);
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap);
-
-/* dequeue the next item in a page list -- don't call from irq context */
-static struct page_list *dequeue_page(struct bitmap *bitmap)
-{
-	struct page_list *item = NULL;
-	struct list_head *head = &bitmap->complete_pages;
-
-	spin_lock(&bitmap->write_lock);
-	if (list_empty(head))
-		goto out;
-	item = list_entry(head->prev, struct page_list, list);
-	list_del(head->prev);
-out:
-	spin_unlock(&bitmap->write_lock);
-	return item;
-}
-
-static void drain_write_queues(struct bitmap *bitmap)
-{
-	struct page_list *item;
-
-	while ((item = dequeue_page(bitmap))) {
-		/* don't bother to wait */
-		put_page(item->page);
-		mempool_free(item, bitmap->write_pool);
-	}
-
-	wake_up(&bitmap->write_wait);
+	if (sb_page)
+		free_buffers(sb_page);
 }
 
 static void bitmap_file_put(struct bitmap *bitmap)
 {
 	struct file *file;
-	struct inode *inode;
 	unsigned long flags;
 
 	spin_lock_irqsave(&bitmap->lock, flags);
@@ -663,17 +655,14 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	bitmap->file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-	bitmap_stop_daemon(bitmap);
-
-	drain_write_queues(bitmap);
-
+	if (file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes)==0);
 	bitmap_file_unmap(bitmap);
 
 	if (file) {
-		inode = file->f_mapping->host;
-		spin_lock(&inode->i_lock);
-		atomic_set(&inode->i_writecount, 1); /* allow writes again */
-		spin_unlock(&inode->i_lock);
+		struct inode *inode = file->f_dentry->d_inode;
+		invalidate_inode_pages(inode->i_mapping);
 		fput(file);
 	}
 }
@@ -708,26 +697,27 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 }
 
 enum bitmap_page_attr {
-	BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
-	BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
-	BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
+	BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced
+	BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared
+	BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced
 };
 
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	bitmap->filemap_attr[page->index] |= attr;
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	bitmap->filemap_attr[page->index] &= ~attr;
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
 
-static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
+static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+					   enum bitmap_page_attr attr)
 {
-	return bitmap->filemap_attr[page->index];
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
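With the enum values above renumbered 0/1/2, each bitmap file page now owns a group of four consecutive bits in the filemap_attr bit array, addressed as (page->index << 2) + attr. A standalone sketch of that addressing (plain C bit operations standing in for the kernel's __set_bit/test_bit helpers, illustrative only):

    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)

    /* four attribute bits per page, packed into an unsigned long array */
    static void set_attr(unsigned long *attrs, unsigned long page, int attr)
    {
            unsigned long bit = (page << 2) + attr;
            attrs[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static int test_attr(unsigned long *attrs, unsigned long page, int attr)
    {
            unsigned long bit = (page << 2) + attr;
            return (attrs[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
            unsigned long attrs[4] = { 0 };

            set_attr(attrs, 17, 2);     /* mark page 17 as NEEDWRITE (= 2) */
            printf("%d %d\n", test_attr(attrs, 17, 2), test_attr(attrs, 17, 0));
            return 0;
    }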
@@ -751,11 +741,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	page = filemap_get_page(bitmap, chunk);
 	bit = file_page_offset(chunk);
 
-
-	/* make sure the page stays cached until it gets written out */
-	if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
-		get_page(page);
-
 	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
 	if (bitmap->flags & BITMAP_HOSTENDIAN)
@@ -775,7 +760,8 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 int bitmap_unplug(struct bitmap *bitmap)
 {
-	unsigned long i, attr, flags;
+	unsigned long i, flags;
+	int dirty, need_write;
 	struct page *page;
 	int wait = 0;
 	int err;
@@ -792,35 +778,26 @@ int bitmap_unplug(struct bitmap *bitmap)
 			return 0;
 		}
 		page = bitmap->filemap[i];
-		attr = get_page_attr(bitmap, page);
+		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-		if ((attr & BITMAP_PAGE_DIRTY))
+		if (dirty)
 			wait = 1;
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
-		if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
+		if (dirty | need_write)
 			err = write_page(bitmap, page, 0);
-			if (err == -EAGAIN) {
-				if (attr & BITMAP_PAGE_DIRTY)
-					err = write_page(bitmap, page, 1);
-				else
-					err = 0;
-			}
-			if (err)
-				return 1;
-		}
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->file) {
-			spin_lock_irq(&bitmap->write_lock);
-			wait_event_lock_irq(bitmap->write_wait,
-				list_empty(&bitmap->complete_pages), bitmap->write_lock,
-				wake_up_process(bitmap->writeback_daemon->tsk));
-			spin_unlock_irq(&bitmap->write_lock);
-		} else
+		if (bitmap->file)
+			wait_event(bitmap->write_wait,
+				   atomic_read(&bitmap->pending_writes)==0);
+		else
 			md_super_wait(bitmap->mddev);
 	}
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
+		bitmap_file_kick(bitmap);
 	return 0;
 }
 
@@ -842,7 +819,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	struct page *page = NULL, *oldpage = NULL;
 	unsigned long num_pages, bit_cnt = 0;
 	struct file *file;
-	unsigned long bytes, offset, dummy;
+	unsigned long bytes, offset;
 	int outofdate;
 	int ret = -ENOSPC;
 	void *paddr;
@@ -879,7 +856,12 @@
 	if (!bitmap->filemap)
 		goto out;
 
-	bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
+	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
+	bitmap->filemap_attr = kzalloc(
+		(((num_pages*4/8)+sizeof(unsigned long)-1)
+		 /sizeof(unsigned long))
+		*sizeof(unsigned long),
+		GFP_KERNEL);
 	if (!bitmap->filemap_attr)
 		goto out;
 
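The kzalloc() size in the hunk above follows the same layout: num_pages * 4 bits, converted to bytes and rounded up to a whole number of unsigned longs so the bit helpers never touch memory past the allocation. A small sketch of the arithmetic (values assume 8-byte longs):

    #include <stdio.h>

    /* bytes needed for 4 attribute bits per page, rounded up to whole longs */
    static unsigned long filemap_attr_bytes(unsigned long num_pages)
    {
            unsigned long bytes = num_pages * 4 / 8;   /* 4 bits = half a byte per page */

            return (bytes + sizeof(unsigned long) - 1)
                    / sizeof(unsigned long) * sizeof(unsigned long);
    }

    int main(void)
    {
            /* 100 pages -> 56 bytes, 1000 pages -> 504 bytes (with 8-byte longs) */
            printf("%lu %lu\n", filemap_attr_bytes(100), filemap_attr_bytes(1000));
            return 0;
    }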
@@ -890,7 +872,12 @@
 		index = file_page_index(i);
 		bit = file_page_offset(i);
 		if (index != oldindex) { /* this is a new page, read it in */
+			int count;
 			/* unmap the old page, we're done with it */
+			if (index == num_pages-1)
+				count = bytes - index * PAGE_SIZE;
+			else
+				count = PAGE_SIZE;
 			if (index == 0) {
 				/*
 				 * if we're here then the superblock page
@@ -900,7 +887,7 @@
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
 			} else if (file) {
-				page = read_page(file, index, &dummy);
+				page = read_page(file, index, bitmap, count);
 				offset = 0;
 			} else {
 				page = read_sb_page(bitmap->mddev, bitmap->offset, index);
@@ -971,12 +958,11 @@ void bitmap_write_all(struct bitmap *bitmap)
 	/* We don't actually write all bitmap blocks here,
 	 * just flag them as needing to be written
 	 */
+	int i;
 
-	unsigned long chunks = bitmap->chunks;
-	unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t);
-	unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE;
-	while (num_pages--)
-		bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE;
+	for (i=0; i < bitmap->file_pages; i++)
+		set_page_attr(bitmap, bitmap->filemap[i],
+			      BITMAP_PAGE_NEEDWRITE);
 }
 
 
@@ -1007,7 +993,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	struct page *page = NULL, *lastpage = NULL;
 	int err = 0;
 	int blocks;
-	int attr;
 	void *paddr;
 
 	if (bitmap == NULL)
@@ -1029,43 +1014,34 @@
 
 		if (page != lastpage) {
 			/* skip this page unless it's marked as needing cleaning */
-			if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
-				if (attr & BITMAP_PAGE_NEEDWRITE) {
-					get_page(page);
+			if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
+				int need_write = test_page_attr(bitmap, page,
+								BITMAP_PAGE_NEEDWRITE);
+				if (need_write)
 					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-				}
+
 				spin_unlock_irqrestore(&bitmap->lock, flags);
-				if (attr & BITMAP_PAGE_NEEDWRITE) {
+				if (need_write) {
 					switch (write_page(bitmap, page, 0)) {
-					case -EAGAIN:
-						set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-						break;
 					case 0:
 						break;
 					default:
 						bitmap_file_kick(bitmap);
 					}
-					put_page(page);
 				}
 				continue;
 			}
 
 			/* grab the new page, sync and release the old */
-			get_page(page);
 			if (lastpage != NULL) {
-				if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
+				if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 					err = write_page(bitmap, lastpage, 0);
-					if (err == -EAGAIN) {
-						err = 0;
-						set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-					}
 				} else {
 					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
-				put_page(lastpage);
 				if (err)
 					bitmap_file_kick(bitmap);
 			} else
@@ -1107,131 +1083,19 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	/* now sync the final page */
 	if (lastpage != NULL) {
 		spin_lock_irqsave(&bitmap->lock, flags);
-		if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
+		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			err = write_page(bitmap, lastpage, 0);
-			if (err == -EAGAIN) {
-				set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-				err = 0;
-			}
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
-
-		put_page(lastpage);
 	}
 
 	return err;
 }
 
-static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
-{
-	mdk_thread_t *dmn;
-	unsigned long flags;
-
-	/* if no one is waiting on us, we'll free the md thread struct
-	 * and exit, otherwise we let the waiter clean things up */
-	spin_lock_irqsave(&bitmap->lock, flags);
-	if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
-		*daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		kfree(dmn);
-		complete_and_exit(NULL, 0); /* do_exit not exported */
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-}
-
-static void bitmap_writeback_daemon(mddev_t *mddev)
-{
-	struct bitmap *bitmap = mddev->bitmap;
-	struct page *page;
-	struct page_list *item;
-	int err = 0;
-
-	if (signal_pending(current)) {
-		printk(KERN_INFO
-		       "%s: bitmap writeback daemon got signal, exiting...\n",
-		       bmname(bitmap));
-		err = -EINTR;
-		goto out;
-	}
-	if (bitmap == NULL)
-		/* about to be stopped. */
-		return;
-
-	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
-	/* wait on bitmap page writebacks */
-	while ((item = dequeue_page(bitmap))) {
-		page = item->page;
-		mempool_free(item, bitmap->write_pool);
-		PRINTK("wait on page writeback: %p\n", page);
-		wait_on_page_writeback(page);
-		PRINTK("finished page writeback: %p\n", page);
-
-		err = PageError(page);
-		put_page(page);
-		if (err) {
-			printk(KERN_WARNING "%s: bitmap file writeback "
-			       "failed (page %lu): %d\n",
-			       bmname(bitmap), page->index, err);
-			bitmap_file_kick(bitmap);
-			goto out;
-		}
-	}
- out:
-	wake_up(&bitmap->write_wait);
-	if (err) {
-		printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
-		       bmname(bitmap), err);
-		daemon_exit(bitmap, &bitmap->writeback_daemon);
-	}
-}
-
-static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
-					 void (*func)(mddev_t *), char *name)
-{
-	mdk_thread_t *daemon;
-	char namebuf[32];
-
-#ifdef INJECT_FATAL_FAULT_2
-	daemon = NULL;
-#else
-	sprintf(namebuf, "%%s_%s", name);
-	daemon = md_register_thread(func, bitmap->mddev, namebuf);
-#endif
-	if (!daemon) {
-		printk(KERN_ERR "%s: failed to start bitmap daemon\n",
-		       bmname(bitmap));
-		return ERR_PTR(-ECHILD);
-	}
-
-	md_wakeup_thread(daemon); /* start it running */
-
-	PRINTK("%s: %s daemon (pid %d) started...\n",
-	       bmname(bitmap), name, daemon->tsk->pid);
-
-	return daemon;
-}
-
-static void bitmap_stop_daemon(struct bitmap *bitmap)
-{
-	/* the daemon can't stop itself... it'll just exit instead... */
-	if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
-	    current->pid != bitmap->writeback_daemon->tsk->pid) {
-		mdk_thread_t *daemon;
-		unsigned long flags;
-
-		spin_lock_irqsave(&bitmap->lock, flags);
-		daemon = bitmap->writeback_daemon;
-		bitmap->writeback_daemon = NULL;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-		if (daemon && ! IS_ERR(daemon))
-			md_unregister_thread(daemon); /* destroy the thread */
-	}
-}
-
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, int *blocks,
 					    int create)
@@ -1500,8 +1364,6 @@ static void bitmap_free(struct bitmap *bitmap)
 
 	/* free all allocated memory */
 
-	mempool_destroy(bitmap->write_pool);
-
 	if (bp) /* deallocate the page memory */
 		for (k = 0; k < pages; k++)
 			if (bp[k].map && !bp[k].hijacked)
@@ -1549,20 +1411,20 @@ int bitmap_create(mddev_t *mddev)
 		return -ENOMEM;
 
 	spin_lock_init(&bitmap->lock);
-	bitmap->mddev = mddev;
-
-	spin_lock_init(&bitmap->write_lock);
-	INIT_LIST_HEAD(&bitmap->complete_pages);
+	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
-	bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
-						sizeof(struct page_list));
-	err = -ENOMEM;
-	if (!bitmap->write_pool)
-		goto error;
+
+	bitmap->mddev = mddev;
 
 	bitmap->file = file;
 	bitmap->offset = mddev->bitmap_offset;
-	if (file) get_file(file);
+	if (file) {
+		get_file(file);
+		do_sync_file_range(file, 0, LLONG_MAX,
+				   SYNC_FILE_RANGE_WAIT_BEFORE |
+				   SYNC_FILE_RANGE_WRITE |
+				   SYNC_FILE_RANGE_WAIT_AFTER);
+	}
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
@@ -1594,8 +1456,6 @@ int bitmap_create(mddev_t *mddev)
 	if (!bitmap->bp)
 		goto error;
 
-	bitmap->flags |= BITMAP_ACTIVE;
-
 	/* now that we have some pages available, initialize the in-memory
 	 * bitmap from the on-disk bitmap */
 	start = 0;
@@ -1613,15 +1473,6 @@
 
 	mddev->bitmap = bitmap;
 
-	if (file)
-		/* kick off the bitmap writeback daemon */
-		bitmap->writeback_daemon =
-			bitmap_start_daemon(bitmap,
-					    bitmap_writeback_daemon,
-					    "bitmap_wb");
-
-	if (IS_ERR(bitmap->writeback_daemon))
-		return PTR_ERR(bitmap->writeback_daemon);
 	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
 
 	return bitmap_update_sb(bitmap);
@@ -1638,4 +1489,3 @@ EXPORT_SYMBOL(bitmap_start_sync)
 EXPORT_SYMBOL(bitmap_end_sync);
 EXPORT_SYMBOL(bitmap_unplug);
 EXPORT_SYMBOL(bitmap_close_sync);
-EXPORT_SYMBOL(bitmap_daemon_work);
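Throughout the bitmap.c changes above, the dedicated writeback daemon and its page list are replaced by a simple completion pattern: each submitted buffer bumps an atomic pending_writes counter, end_bitmap_write() drops it and wakes a wait queue, and callers block until it reaches zero. A hedged pthread sketch of the same idea (user-space illustration only; the kernel code uses atomic_t, wait_event() and wake_up()):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static int pending_writes;          /* in-flight I/Os */

    static void submit_one(void)        /* before handing a buffer to the device */
    {
            pthread_mutex_lock(&lock);
            pending_writes++;
            pthread_mutex_unlock(&lock);
    }

    static void *completion(void *unused)   /* analogue of end_bitmap_write() */
    {
            pthread_mutex_lock(&lock);
            if (--pending_writes == 0)
                    pthread_cond_broadcast(&done);  /* analogue of wake_up() */
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    static void wait_for_writes(void)   /* analogue of wait_event(..., pending == 0) */
    {
            pthread_mutex_lock(&lock);
            while (pending_writes)
                    pthread_cond_wait(&done, &lock);
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            pthread_t t;

            submit_one();
            pthread_create(&t, NULL, completion, NULL);
            wait_for_writes();
            pthread_join(t, NULL);
            printf("all writes completed\n");
            return 0;
    }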
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 61a590bb6241..bdbd34993a80 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -5,6 +5,7 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include <linux/err.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/init.h> 10#include <linux/init.h>
10#include <linux/kernel.h> 11#include <linux/kernel.h>
@@ -20,7 +21,7 @@
20 21
21#include "dm.h" 22#include "dm.h"
22 23
23#define PFX "crypt: " 24#define DM_MSG_PREFIX "crypt"
24 25
25/* 26/*
26 * per bio private data 27 * per bio private data
@@ -78,11 +79,13 @@ struct crypt_config {
78 */ 79 */
79 struct crypt_iv_operations *iv_gen_ops; 80 struct crypt_iv_operations *iv_gen_ops;
80 char *iv_mode; 81 char *iv_mode;
81 void *iv_gen_private; 82 struct crypto_cipher *iv_gen_private;
82 sector_t iv_offset; 83 sector_t iv_offset;
83 unsigned int iv_size; 84 unsigned int iv_size;
84 85
85 struct crypto_tfm *tfm; 86 char cipher[CRYPTO_MAX_ALG_NAME];
87 char chainmode[CRYPTO_MAX_ALG_NAME];
88 struct crypto_blkcipher *tfm;
86 unsigned int key_size; 89 unsigned int key_size;
87 u8 key[0]; 90 u8 key[0];
88}; 91};
@@ -96,12 +99,12 @@ static kmem_cache_t *_crypt_io_pool;
96/* 99/*
97 * Different IV generation algorithms: 100 * Different IV generation algorithms:
98 * 101 *
99 * plain: the initial vector is the 32-bit low-endian version of the sector 102 * plain: the initial vector is the 32-bit little-endian version of the sector
100 * number, padded with zeros if neccessary. 103 * number, padded with zeros if neccessary.
101 * 104 *
102 * ess_iv: "encrypted sector|salt initial vector", the sector number is 105 * essiv: "encrypted sector|salt initial vector", the sector number is
103 * encrypted with the bulk cipher using a salt as key. The salt 106 * encrypted with the bulk cipher using a salt as key. The salt
104 * should be derived from the bulk cipher's key via hashing. 107 * should be derived from the bulk cipher's key via hashing.
105 * 108 *
106 * plumb: unimplemented, see: 109 * plumb: unimplemented, see:
107 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 110 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
@@ -118,88 +121,84 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
118static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 121static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
119 const char *opts) 122 const char *opts)
120{ 123{
121 struct crypto_tfm *essiv_tfm; 124 struct crypto_cipher *essiv_tfm;
122 struct crypto_tfm *hash_tfm; 125 struct crypto_hash *hash_tfm;
126 struct hash_desc desc;
123 struct scatterlist sg; 127 struct scatterlist sg;
124 unsigned int saltsize; 128 unsigned int saltsize;
125 u8 *salt; 129 u8 *salt;
130 int err;
126 131
127 if (opts == NULL) { 132 if (opts == NULL) {
128 ti->error = PFX "Digest algorithm missing for ESSIV mode"; 133 ti->error = "Digest algorithm missing for ESSIV mode";
129 return -EINVAL; 134 return -EINVAL;
130 } 135 }
131 136
132 /* Hash the cipher key with the given hash algorithm */ 137 /* Hash the cipher key with the given hash algorithm */
133 hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP); 138 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
134 if (hash_tfm == NULL) { 139 if (IS_ERR(hash_tfm)) {
135 ti->error = PFX "Error initializing ESSIV hash"; 140 ti->error = "Error initializing ESSIV hash";
136 return -EINVAL; 141 return PTR_ERR(hash_tfm);
137 } 142 }
138 143
139 if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { 144 saltsize = crypto_hash_digestsize(hash_tfm);
140 ti->error = PFX "Expected digest algorithm for ESSIV hash";
141 crypto_free_tfm(hash_tfm);
142 return -EINVAL;
143 }
144
145 saltsize = crypto_tfm_alg_digestsize(hash_tfm);
146 salt = kmalloc(saltsize, GFP_KERNEL); 145 salt = kmalloc(saltsize, GFP_KERNEL);
147 if (salt == NULL) { 146 if (salt == NULL) {
148 ti->error = PFX "Error kmallocing salt storage in ESSIV"; 147 ti->error = "Error kmallocing salt storage in ESSIV";
149 crypto_free_tfm(hash_tfm); 148 crypto_free_hash(hash_tfm);
150 return -ENOMEM; 149 return -ENOMEM;
151 } 150 }
152 151
153 sg_set_buf(&sg, cc->key, cc->key_size); 152 sg_set_buf(&sg, cc->key, cc->key_size);
154 crypto_digest_digest(hash_tfm, &sg, 1, salt); 153 desc.tfm = hash_tfm;
155 crypto_free_tfm(hash_tfm); 154 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
155 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
156 crypto_free_hash(hash_tfm);
157
158 if (err) {
159 ti->error = "Error calculating hash in ESSIV";
160 return err;
161 }
156 162
157 /* Setup the essiv_tfm with the given salt */ 163 /* Setup the essiv_tfm with the given salt */
158 essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm), 164 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
159 CRYPTO_TFM_MODE_ECB | 165 if (IS_ERR(essiv_tfm)) {
160 CRYPTO_TFM_REQ_MAY_SLEEP); 166 ti->error = "Error allocating crypto tfm for ESSIV";
161 if (essiv_tfm == NULL) {
162 ti->error = PFX "Error allocating crypto tfm for ESSIV";
163 kfree(salt); 167 kfree(salt);
164 return -EINVAL; 168 return PTR_ERR(essiv_tfm);
165 } 169 }
166 if (crypto_tfm_alg_blocksize(essiv_tfm) 170 if (crypto_cipher_blocksize(essiv_tfm) !=
167 != crypto_tfm_alg_ivsize(cc->tfm)) { 171 crypto_blkcipher_ivsize(cc->tfm)) {
168 ti->error = PFX "Block size of ESSIV cipher does " 172 ti->error = "Block size of ESSIV cipher does "
169 "not match IV size of block cipher"; 173 "not match IV size of block cipher";
170 crypto_free_tfm(essiv_tfm); 174 crypto_free_cipher(essiv_tfm);
171 kfree(salt); 175 kfree(salt);
172 return -EINVAL; 176 return -EINVAL;
173 } 177 }
174 if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { 178 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
175 ti->error = PFX "Failed to set key for ESSIV cipher"; 179 if (err) {
176 crypto_free_tfm(essiv_tfm); 180 ti->error = "Failed to set key for ESSIV cipher";
181 crypto_free_cipher(essiv_tfm);
177 kfree(salt); 182 kfree(salt);
178 return -EINVAL; 183 return err;
179 } 184 }
180 kfree(salt); 185 kfree(salt);
181 186
182 cc->iv_gen_private = (void *)essiv_tfm; 187 cc->iv_gen_private = essiv_tfm;
183 return 0; 188 return 0;
184} 189}
185 190
186static void crypt_iv_essiv_dtr(struct crypt_config *cc) 191static void crypt_iv_essiv_dtr(struct crypt_config *cc)
187{ 192{
188 crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private); 193 crypto_free_cipher(cc->iv_gen_private);
189 cc->iv_gen_private = NULL; 194 cc->iv_gen_private = NULL;
190} 195}
191 196
192static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 197static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
193{ 198{
194 struct scatterlist sg;
195
196 memset(iv, 0, cc->iv_size); 199 memset(iv, 0, cc->iv_size);
197 *(u64 *)iv = cpu_to_le64(sector); 200 *(u64 *)iv = cpu_to_le64(sector);
198 201 crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv);
199 sg_set_buf(&sg, iv, cc->iv_size);
200 crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
201 &sg, &sg, cc->iv_size);
202
203 return 0; 202 return 0;
204} 203}
205 204
@@ -220,6 +219,11 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
220 int write, sector_t sector) 219 int write, sector_t sector)
221{ 220{
222 u8 iv[cc->iv_size]; 221 u8 iv[cc->iv_size];
222 struct blkcipher_desc desc = {
223 .tfm = cc->tfm,
224 .info = iv,
225 .flags = CRYPTO_TFM_REQ_MAY_SLEEP,
226 };
223 int r; 227 int r;
224 228
225 if (cc->iv_gen_ops) { 229 if (cc->iv_gen_ops) {
@@ -228,14 +232,14 @@ crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
228 return r; 232 return r;
229 233
230 if (write) 234 if (write)
231 r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv); 235 r = crypto_blkcipher_encrypt_iv(&desc, out, in, length);
232 else 236 else
233 r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv); 237 r = crypto_blkcipher_decrypt_iv(&desc, out, in, length);
234 } else { 238 } else {
235 if (write) 239 if (write)
236 r = crypto_cipher_encrypt(cc->tfm, out, in, length); 240 r = crypto_blkcipher_encrypt(&desc, out, in, length);
237 else 241 else
238 r = crypto_cipher_decrypt(cc->tfm, out, in, length); 242 r = crypto_blkcipher_decrypt(&desc, out, in, length);
239 } 243 }
240 244
241 return r; 245 return r;
@@ -510,18 +514,17 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
510static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) 514static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
511{ 515{
512 struct crypt_config *cc; 516 struct crypt_config *cc;
513 struct crypto_tfm *tfm; 517 struct crypto_blkcipher *tfm;
514 char *tmp; 518 char *tmp;
515 char *cipher; 519 char *cipher;
516 char *chainmode; 520 char *chainmode;
517 char *ivmode; 521 char *ivmode;
518 char *ivopts; 522 char *ivopts;
519 unsigned int crypto_flags;
520 unsigned int key_size; 523 unsigned int key_size;
521 unsigned long long tmpll; 524 unsigned long long tmpll;
522 525
523 if (argc != 5) { 526 if (argc != 5) {
524 ti->error = PFX "Not enough arguments"; 527 ti->error = "Not enough arguments";
525 return -EINVAL; 528 return -EINVAL;
526 } 529 }
527 530
@@ -532,21 +535,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
532 ivmode = strsep(&ivopts, ":"); 535 ivmode = strsep(&ivopts, ":");
533 536
534 if (tmp) 537 if (tmp)
535 DMWARN(PFX "Unexpected additional cipher options"); 538 DMWARN("Unexpected additional cipher options");
536 539
537 key_size = strlen(argv[1]) >> 1; 540 key_size = strlen(argv[1]) >> 1;
538 541
539 cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); 542 cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
540 if (cc == NULL) { 543 if (cc == NULL) {
541 ti->error = 544 ti->error =
542 PFX "Cannot allocate transparent encryption context"; 545 "Cannot allocate transparent encryption context";
543 return -ENOMEM; 546 return -ENOMEM;
544 } 547 }
545 548
546 cc->key_size = key_size; 549 cc->key_size = key_size;
547 if ((!key_size && strcmp(argv[1], "-") != 0) || 550 if ((!key_size && strcmp(argv[1], "-") != 0) ||
548 (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { 551 (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
549 ti->error = PFX "Error decoding key"; 552 ti->error = "Error decoding key";
550 goto bad1; 553 goto bad1;
551 } 554 }
552 555
@@ -556,31 +559,25 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
556 ivmode = "plain"; 559 ivmode = "plain";
557 } 560 }
558 561
559 /* Choose crypto_flags according to chainmode */ 562 if (strcmp(chainmode, "ecb") && !ivmode) {
560 if (strcmp(chainmode, "cbc") == 0) 563 ti->error = "This chaining mode requires an IV mechanism";
561 crypto_flags = CRYPTO_TFM_MODE_CBC;
562 else if (strcmp(chainmode, "ecb") == 0)
563 crypto_flags = CRYPTO_TFM_MODE_ECB;
564 else {
565 ti->error = PFX "Unknown chaining mode";
566 goto bad1; 564 goto bad1;
567 } 565 }
568 566
569 if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { 567 if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", chainmode,
570 ti->error = PFX "This chaining mode requires an IV mechanism"; 568 cipher) >= CRYPTO_MAX_ALG_NAME) {
569 ti->error = "Chain mode + cipher name is too long";
571 goto bad1; 570 goto bad1;
572 } 571 }
573 572
574 tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP); 573 tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
575 if (!tfm) { 574 if (IS_ERR(tfm)) {
576 ti->error = PFX "Error allocating crypto tfm"; 575 ti->error = "Error allocating crypto tfm";
577 goto bad1; 576 goto bad1;
578 } 577 }
579 if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
580 ti->error = PFX "Expected cipher algorithm";
581 goto bad2;
582 }
583 578
579 strcpy(cc->cipher, cipher);
580 strcpy(cc->chainmode, chainmode);
584 cc->tfm = tfm; 581 cc->tfm = tfm;
585 582
586 /* 583 /*
@@ -595,7 +592,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
595 else if (strcmp(ivmode, "essiv") == 0) 592 else if (strcmp(ivmode, "essiv") == 0)
596 cc->iv_gen_ops = &crypt_iv_essiv_ops; 593 cc->iv_gen_ops = &crypt_iv_essiv_ops;
597 else { 594 else {
598 ti->error = PFX "Invalid IV mode"; 595 ti->error = "Invalid IV mode";
599 goto bad2; 596 goto bad2;
600 } 597 }
601 598
@@ -603,14 +600,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
603 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 600 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
604 goto bad2; 601 goto bad2;
605 602
606 if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) 603 cc->iv_size = crypto_blkcipher_ivsize(tfm);
604 if (cc->iv_size)
607 /* at least a 64 bit sector number should fit in our buffer */ 605 /* at least a 64 bit sector number should fit in our buffer */
608 cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), 606 cc->iv_size = max(cc->iv_size,
609 (unsigned int)(sizeof(u64) / sizeof(u8))); 607 (unsigned int)(sizeof(u64) / sizeof(u8)));
610 else { 608 else {
611 cc->iv_size = 0;
612 if (cc->iv_gen_ops) { 609 if (cc->iv_gen_ops) {
613 DMWARN(PFX "Selected cipher does not support IVs"); 610 DMWARN("Selected cipher does not support IVs");
614 if (cc->iv_gen_ops->dtr) 611 if (cc->iv_gen_ops->dtr)
615 cc->iv_gen_ops->dtr(cc); 612 cc->iv_gen_ops->dtr(cc);
616 cc->iv_gen_ops = NULL; 613 cc->iv_gen_ops = NULL;
@@ -619,36 +616,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
619 616
620 cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); 617 cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
621 if (!cc->io_pool) { 618 if (!cc->io_pool) {
622 ti->error = PFX "Cannot allocate crypt io mempool"; 619 ti->error = "Cannot allocate crypt io mempool";
623 goto bad3; 620 goto bad3;
624 } 621 }
625 622
626 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 623 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
627 if (!cc->page_pool) { 624 if (!cc->page_pool) {
628 ti->error = PFX "Cannot allocate page mempool"; 625 ti->error = "Cannot allocate page mempool";
629 goto bad4; 626 goto bad4;
630 } 627 }
631 628
632 if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { 629 if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
633 ti->error = PFX "Error setting key"; 630 ti->error = "Error setting key";
634 goto bad5; 631 goto bad5;
635 } 632 }
636 633
637 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 634 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
638 ti->error = PFX "Invalid iv_offset sector"; 635 ti->error = "Invalid iv_offset sector";
639 goto bad5; 636 goto bad5;
640 } 637 }
641 cc->iv_offset = tmpll; 638 cc->iv_offset = tmpll;
642 639
643 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 640 if (sscanf(argv[4], "%llu", &tmpll) != 1) {
644 ti->error = PFX "Invalid device sector"; 641 ti->error = "Invalid device sector";
645 goto bad5; 642 goto bad5;
646 } 643 }
647 cc->start = tmpll; 644 cc->start = tmpll;
648 645
649 if (dm_get_device(ti, argv[3], cc->start, ti->len, 646 if (dm_get_device(ti, argv[3], cc->start, ti->len,
650 dm_table_get_mode(ti->table), &cc->dev)) { 647 dm_table_get_mode(ti->table), &cc->dev)) {
651 ti->error = PFX "Device lookup failed"; 648 ti->error = "Device lookup failed";
652 goto bad5; 649 goto bad5;
653 } 650 }
654 651
@@ -657,7 +654,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
657 *(ivopts - 1) = ':'; 654 *(ivopts - 1) = ':';
658 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); 655 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
659 if (!cc->iv_mode) { 656 if (!cc->iv_mode) {
660 ti->error = PFX "Error kmallocing iv_mode string"; 657 ti->error = "Error kmallocing iv_mode string";
661 goto bad5; 658 goto bad5;
662 } 659 }
663 strcpy(cc->iv_mode, ivmode); 660 strcpy(cc->iv_mode, ivmode);
@@ -675,7 +672,7 @@ bad3:
675 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 672 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
676 cc->iv_gen_ops->dtr(cc); 673 cc->iv_gen_ops->dtr(cc);
677bad2: 674bad2:
678 crypto_free_tfm(tfm); 675 crypto_free_blkcipher(tfm);
679bad1: 676bad1:
680 /* Must zero key material before freeing */ 677 /* Must zero key material before freeing */
681 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); 678 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
@@ -693,7 +690,7 @@ static void crypt_dtr(struct dm_target *ti)
693 kfree(cc->iv_mode); 690 kfree(cc->iv_mode);
694 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 691 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
695 cc->iv_gen_ops->dtr(cc); 692 cc->iv_gen_ops->dtr(cc);
696 crypto_free_tfm(cc->tfm); 693 crypto_free_blkcipher(cc->tfm);
697 dm_put_device(ti, cc->dev); 694 dm_put_device(ti, cc->dev);
698 695
699 /* Must zero key material before freeing */ 696 /* Must zero key material before freeing */
@@ -858,18 +855,9 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
858 break; 855 break;
859 856
860 case STATUSTYPE_TABLE: 857 case STATUSTYPE_TABLE:
861 cipher = crypto_tfm_alg_name(cc->tfm); 858 cipher = crypto_blkcipher_name(cc->tfm);
862 859
863 switch(cc->tfm->crt_cipher.cit_mode) { 860 chainmode = cc->chainmode;
864 case CRYPTO_TFM_MODE_CBC:
865 chainmode = "cbc";
866 break;
867 case CRYPTO_TFM_MODE_ECB:
868 chainmode = "ecb";
869 break;
870 default:
871 BUG();
872 }
873 861
874 if (cc->iv_mode) 862 if (cc->iv_mode)
875 DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); 863 DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
@@ -918,13 +906,13 @@ static int __init dm_crypt_init(void)
918 _kcryptd_workqueue = create_workqueue("kcryptd"); 906 _kcryptd_workqueue = create_workqueue("kcryptd");
919 if (!_kcryptd_workqueue) { 907 if (!_kcryptd_workqueue) {
920 r = -ENOMEM; 908 r = -ENOMEM;
921 DMERR(PFX "couldn't create kcryptd"); 909 DMERR("couldn't create kcryptd");
922 goto bad1; 910 goto bad1;
923 } 911 }
924 912
925 r = dm_register_target(&crypt_target); 913 r = dm_register_target(&crypt_target);
926 if (r < 0) { 914 if (r < 0) {
927 DMERR(PFX "register failed %d", r); 915 DMERR("register failed %d", r);
928 goto bad2; 916 goto bad2;
929 } 917 }
930 918
@@ -942,7 +930,7 @@ static void __exit dm_crypt_exit(void)
942 int r = dm_unregister_target(&crypt_target); 930 int r = dm_unregister_target(&crypt_target);
943 931
944 if (r < 0) 932 if (r < 0)
945 DMERR(PFX "unregister failed %d", r); 933 DMERR("unregister failed %d", r);
946 934
947 destroy_workqueue(_kcryptd_workqueue); 935 destroy_workqueue(_kcryptd_workqueue);
948 kmem_cache_destroy(_crypt_io_pool); 936 kmem_cache_destroy(_crypt_io_pool);
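The dm-crypt hunks above complete the move from the legacy crypto_tfm interface (crt_cipher.cit_setkey, crypto_tfm_alg_name, CRYPTO_TFM_MODE_*) to the blkcipher API, and crypt_status() now reports the chaining mode from the string saved at construction time instead of decoding tfm flags. A minimal sketch of the matching setup path, assuming crypt_ctr()'s local names (cipher, chainmode, key_size) and illustrative allocation flags rather than text quoted from this patch:

#include <linux/crypto.h>
#include <linux/err.h>

/* Sketch only: chaining mode and cipher are combined into one
 * "cbc(aes)"-style algorithm name for the blkcipher API.        */
static int crypt_setup_cipher_sketch(struct crypt_config *cc, char *cipher,
				     char *chainmode, unsigned int key_size)
{
	char algname[CRYPTO_MAX_ALG_NAME];
	struct crypto_blkcipher *tfm;

	if (snprintf(algname, CRYPTO_MAX_ALG_NAME, "%s(%s)",
		     chainmode, cipher) >= CRYPTO_MAX_ALG_NAME)
		return -EINVAL;

	tfm = crypto_alloc_blkcipher(algname, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
		crypto_free_blkcipher(tfm);
		return -EINVAL;
	}

	cc->tfm = tfm;
	cc->chainmode = chainmode;	/* reported verbatim by crypt_status() */
	return 0;
}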
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index c7067674dcb7..2a374ccb30dd 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -12,6 +12,8 @@
12#include <scsi/scsi.h> 12#include <scsi/scsi.h>
13#include <scsi/scsi_cmnd.h> 13#include <scsi/scsi_cmnd.h>
14 14
15#define DM_MSG_PREFIX "multipath emc"
16
15struct emc_handler { 17struct emc_handler {
16 spinlock_t lock; 18 spinlock_t lock;
17 19
@@ -66,7 +68,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
66 68
67 bio = bio_alloc(GFP_ATOMIC, 1); 69 bio = bio_alloc(GFP_ATOMIC, 1);
68 if (!bio) { 70 if (!bio) {
69 DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); 71 DMERR("get_failover_bio: bio_alloc() failed.");
70 return NULL; 72 return NULL;
71 } 73 }
72 74
@@ -78,13 +80,13 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
78 80
79 page = alloc_page(GFP_ATOMIC); 81 page = alloc_page(GFP_ATOMIC);
80 if (!page) { 82 if (!page) {
81 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 83 DMERR("get_failover_bio: alloc_page() failed.");
82 bio_put(bio); 84 bio_put(bio);
83 return NULL; 85 return NULL;
84 } 86 }
85 87
86 if (bio_add_page(bio, page, data_size, 0) != data_size) { 88 if (bio_add_page(bio, page, data_size, 0) != data_size) {
87 DMERR("dm-emc: get_failover_bio: alloc_page() failed."); 89 DMERR("get_failover_bio: alloc_page() failed.");
88 __free_page(page); 90 __free_page(page);
89 bio_put(bio); 91 bio_put(bio);
90 return NULL; 92 return NULL;
@@ -103,7 +105,7 @@ static struct request *get_failover_req(struct emc_handler *h,
103 /* FIXME: Figure out why it fails with GFP_ATOMIC. */ 105 /* FIXME: Figure out why it fails with GFP_ATOMIC. */
104 rq = blk_get_request(q, WRITE, __GFP_WAIT); 106 rq = blk_get_request(q, WRITE, __GFP_WAIT);
105 if (!rq) { 107 if (!rq) {
106 DMERR("dm-emc: get_failover_req: blk_get_request failed"); 108 DMERR("get_failover_req: blk_get_request failed");
107 return NULL; 109 return NULL;
108 } 110 }
109 111
@@ -160,7 +162,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
160 162
161 bio = get_failover_bio(path, data_size); 163 bio = get_failover_bio(path, data_size);
162 if (!bio) { 164 if (!bio) {
163 DMERR("dm-emc: emc_trespass_get: no bio"); 165 DMERR("emc_trespass_get: no bio");
164 return NULL; 166 return NULL;
165 } 167 }
166 168
@@ -173,7 +175,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
173 /* get request for block layer packet command */ 175 /* get request for block layer packet command */
174 rq = get_failover_req(h, bio, path); 176 rq = get_failover_req(h, bio, path);
175 if (!rq) { 177 if (!rq) {
176 DMERR("dm-emc: emc_trespass_get: no rq"); 178 DMERR("emc_trespass_get: no rq");
177 free_bio(bio); 179 free_bio(bio);
178 return NULL; 180 return NULL;
179 } 181 }
@@ -200,18 +202,18 @@ static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
200 * initial state passed into us and then get an update here. 202 * initial state passed into us and then get an update here.
201 */ 203 */
202 if (!q) { 204 if (!q) {
203 DMINFO("dm-emc: emc_pg_init: no queue"); 205 DMINFO("emc_pg_init: no queue");
204 goto fail_path; 206 goto fail_path;
205 } 207 }
206 208
207 /* FIXME: The request should be pre-allocated. */ 209 /* FIXME: The request should be pre-allocated. */
208 rq = emc_trespass_get(hwh->context, path); 210 rq = emc_trespass_get(hwh->context, path);
209 if (!rq) { 211 if (!rq) {
210 DMERR("dm-emc: emc_pg_init: no rq"); 212 DMERR("emc_pg_init: no rq");
211 goto fail_path; 213 goto fail_path;
212 } 214 }
213 215
214 DMINFO("dm-emc: emc_pg_init: sending switch-over command"); 216 DMINFO("emc_pg_init: sending switch-over command");
215 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); 217 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
216 return; 218 return;
217 219
@@ -241,18 +243,18 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
241 hr = 0; 243 hr = 0;
242 short_trespass = 0; 244 short_trespass = 0;
243 } else if (argc != 2) { 245 } else if (argc != 2) {
244 DMWARN("dm-emc hwhandler: incorrect number of arguments"); 246 DMWARN("incorrect number of arguments");
245 return -EINVAL; 247 return -EINVAL;
246 } else { 248 } else {
247 if ((sscanf(argv[0], "%u", &short_trespass) != 1) 249 if ((sscanf(argv[0], "%u", &short_trespass) != 1)
248 || (short_trespass > 1)) { 250 || (short_trespass > 1)) {
249 DMWARN("dm-emc: invalid trespass mode selected"); 251 DMWARN("invalid trespass mode selected");
250 return -EINVAL; 252 return -EINVAL;
251 } 253 }
252 254
253 if ((sscanf(argv[1], "%u", &hr) != 1) 255 if ((sscanf(argv[1], "%u", &hr) != 1)
254 || (hr > 1)) { 256 || (hr > 1)) {
255 DMWARN("dm-emc: invalid honor reservation flag selected"); 257 DMWARN("invalid honor reservation flag selected");
256 return -EINVAL; 258 return -EINVAL;
257 } 259 }
258 } 260 }
@@ -264,14 +266,14 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
264 hwh->context = h; 266 hwh->context = h;
265 267
266 if ((h->short_trespass = short_trespass)) 268 if ((h->short_trespass = short_trespass))
267 DMWARN("dm-emc: short trespass command will be send"); 269 DMWARN("short trespass command will be send");
268 else 270 else
269 DMWARN("dm-emc: long trespass command will be send"); 271 DMWARN("long trespass command will be send");
270 272
271 if ((h->hr = hr)) 273 if ((h->hr = hr))
272 DMWARN("dm-emc: honor reservation bit will be set"); 274 DMWARN("honor reservation bit will be set");
273 else 275 else
274 DMWARN("dm-emc: honor reservation bit will not be set (default)"); 276 DMWARN("honor reservation bit will not be set (default)");
275 277
276 return 0; 278 return 0;
277} 279}
@@ -336,9 +338,9 @@ static int __init dm_emc_init(void)
336 int r = dm_register_hw_handler(&emc_hwh); 338 int r = dm_register_hw_handler(&emc_hwh);
337 339
338 if (r < 0) 340 if (r < 0)
339 DMERR("emc: register failed %d", r); 341 DMERR("register failed %d", r);
340 342
341 DMINFO("dm-emc version 0.0.3 loaded"); 343 DMINFO("version 0.0.3 loaded");
342 344
343 return r; 345 return r;
344} 346}
@@ -348,7 +350,7 @@ static void __exit dm_emc_exit(void)
348 int r = dm_unregister_hw_handler(&emc_hwh); 350 int r = dm_unregister_hw_handler(&emc_hwh);
349 351
350 if (r < 0) 352 if (r < 0)
351 DMERR("emc: unregister failed %d", r); 353 DMERR("unregister failed %d", r);
352} 354}
353 355
354module_init(dm_emc_init); 356module_init(dm_emc_init);
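Nearly every dm-emc.c change above is the same mechanical edit: the hand-written "dm-emc: " prefixes are dropped because the DM_MSG_PREFIX defined at the top of the file is now folded into the shared logging macros. Roughly how those macros re-add the prefix (a sketch of the dm.h definitions this series relies on, not text taken from this patch):

#define DM_NAME "device-mapper"

#define DMERR(f, arg...) \
	printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) \
	printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMINFO(f, arg...) \
	printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)

/* e.g. DMERR("register failed %d", r) in dm-emc.c now prints
 * "device-mapper: multipath emc: register failed -22".          */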
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index cc07bbebbb16..d12379b5cdb5 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -16,6 +16,8 @@
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18 18
19#define DM_MSG_PREFIX "snapshots"
20
19/*----------------------------------------------------------------- 21/*-----------------------------------------------------------------
20 * Persistent snapshots, by persistent we mean that the snapshot 22 * Persistent snapshots, by persistent we mean that the snapshot
21 * will survive a reboot. 23 * will survive a reboot.
@@ -91,7 +93,6 @@ struct pstore {
91 struct dm_snapshot *snap; /* up pointer to my snapshot */ 93 struct dm_snapshot *snap; /* up pointer to my snapshot */
92 int version; 94 int version;
93 int valid; 95 int valid;
94 uint32_t chunk_size;
95 uint32_t exceptions_per_area; 96 uint32_t exceptions_per_area;
96 97
97 /* 98 /*
@@ -133,7 +134,7 @@ static int alloc_area(struct pstore *ps)
133 int r = -ENOMEM; 134 int r = -ENOMEM;
134 size_t len; 135 size_t len;
135 136
136 len = ps->chunk_size << SECTOR_SHIFT; 137 len = ps->snap->chunk_size << SECTOR_SHIFT;
137 138
138 /* 139 /*
139 * Allocate the chunk_size block of memory that will hold 140 * Allocate the chunk_size block of memory that will hold
@@ -160,8 +161,8 @@ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
160 unsigned long bits; 161 unsigned long bits;
161 162
162 where.bdev = ps->snap->cow->bdev; 163 where.bdev = ps->snap->cow->bdev;
163 where.sector = ps->chunk_size * chunk; 164 where.sector = ps->snap->chunk_size * chunk;
164 where.count = ps->chunk_size; 165 where.count = ps->snap->chunk_size;
165 166
166 return dm_io_sync_vm(1, &where, rw, ps->area, &bits); 167 return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
167} 168}
@@ -188,7 +189,7 @@ static int area_io(struct pstore *ps, uint32_t area, int rw)
188 189
189static int zero_area(struct pstore *ps, uint32_t area) 190static int zero_area(struct pstore *ps, uint32_t area)
190{ 191{
191 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 192 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
192 return area_io(ps, area, WRITE); 193 return area_io(ps, area, WRITE);
193} 194}
194 195
@@ -196,6 +197,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
196{ 197{
197 int r; 198 int r;
198 struct disk_header *dh; 199 struct disk_header *dh;
200 chunk_t chunk_size;
199 201
200 r = chunk_io(ps, 0, READ); 202 r = chunk_io(ps, 0, READ);
201 if (r) 203 if (r)
@@ -210,8 +212,29 @@ static int read_header(struct pstore *ps, int *new_snapshot)
210 *new_snapshot = 0; 212 *new_snapshot = 0;
211 ps->valid = le32_to_cpu(dh->valid); 213 ps->valid = le32_to_cpu(dh->valid);
212 ps->version = le32_to_cpu(dh->version); 214 ps->version = le32_to_cpu(dh->version);
213 ps->chunk_size = le32_to_cpu(dh->chunk_size); 215 chunk_size = le32_to_cpu(dh->chunk_size);
214 216 if (ps->snap->chunk_size != chunk_size) {
217 DMWARN("chunk size %llu in device metadata overrides "
218 "table chunk size of %llu.",
219 (unsigned long long)chunk_size,
220 (unsigned long long)ps->snap->chunk_size);
221
222 /* We had a bogus chunk_size. Fix stuff up. */
223 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
224 free_area(ps);
225
226 ps->snap->chunk_size = chunk_size;
227 ps->snap->chunk_mask = chunk_size - 1;
228 ps->snap->chunk_shift = ffs(chunk_size) - 1;
229
230 r = alloc_area(ps);
231 if (r)
232 return r;
233
234 r = dm_io_get(sectors_to_pages(chunk_size));
235 if (r)
236 return r;
237 }
215 } else { 238 } else {
216 DMWARN("Invalid/corrupt snapshot"); 239 DMWARN("Invalid/corrupt snapshot");
217 r = -ENXIO; 240 r = -ENXIO;
@@ -224,13 +247,13 @@ static int write_header(struct pstore *ps)
224{ 247{
225 struct disk_header *dh; 248 struct disk_header *dh;
226 249
227 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); 250 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
228 251
229 dh = (struct disk_header *) ps->area; 252 dh = (struct disk_header *) ps->area;
230 dh->magic = cpu_to_le32(SNAP_MAGIC); 253 dh->magic = cpu_to_le32(SNAP_MAGIC);
231 dh->valid = cpu_to_le32(ps->valid); 254 dh->valid = cpu_to_le32(ps->valid);
232 dh->version = cpu_to_le32(ps->version); 255 dh->version = cpu_to_le32(ps->version);
233 dh->chunk_size = cpu_to_le32(ps->chunk_size); 256 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
234 257
235 return chunk_io(ps, 0, WRITE); 258 return chunk_io(ps, 0, WRITE);
236} 259}
@@ -365,7 +388,7 @@ static void persistent_destroy(struct exception_store *store)
365{ 388{
366 struct pstore *ps = get_info(store); 389 struct pstore *ps = get_info(store);
367 390
368 dm_io_put(sectors_to_pages(ps->chunk_size)); 391 dm_io_put(sectors_to_pages(ps->snap->chunk_size));
369 vfree(ps->callbacks); 392 vfree(ps->callbacks);
370 free_area(ps); 393 free_area(ps);
371 kfree(ps); 394 kfree(ps);
@@ -384,6 +407,16 @@ static int persistent_read_metadata(struct exception_store *store)
384 return r; 407 return r;
385 408
386 /* 409 /*
410 * Now we know correct chunk_size, complete the initialisation.
411 */
412 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
413 sizeof(struct disk_exception);
414 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
415 sizeof(*ps->callbacks));
416 if (!ps->callbacks)
417 return -ENOMEM;
418
419 /*
387 * Do we need to setup a new snapshot ? 420 * Do we need to setup a new snapshot ?
388 */ 421 */
389 if (new_snapshot) { 422 if (new_snapshot) {
@@ -533,9 +566,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
533 ps->snap = store->snap; 566 ps->snap = store->snap;
534 ps->valid = 1; 567 ps->valid = 1;
535 ps->version = SNAPSHOT_DISK_VERSION; 568 ps->version = SNAPSHOT_DISK_VERSION;
536 ps->chunk_size = chunk_size;
537 ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
538 sizeof(struct disk_exception);
539 ps->next_free = 2; /* skipping the header and first area */ 569 ps->next_free = 2; /* skipping the header and first area */
540 ps->current_committed = 0; 570 ps->current_committed = 0;
541 571
@@ -543,18 +573,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
543 if (r) 573 if (r)
544 goto bad; 574 goto bad;
545 575
546 /*
547 * Allocate space for all the callbacks.
548 */
549 ps->callback_count = 0; 576 ps->callback_count = 0;
550 atomic_set(&ps->pending_count, 0); 577 atomic_set(&ps->pending_count, 0);
551 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 578 ps->callbacks = NULL;
552 sizeof(*ps->callbacks));
553
554 if (!ps->callbacks) {
555 r = -ENOMEM;
556 goto bad;
557 }
558 579
559 store->destroy = persistent_destroy; 580 store->destroy = persistent_destroy;
560 store->read_metadata = persistent_read_metadata; 581 store->read_metadata = persistent_read_metadata;
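The exception-store change makes the chunk_size stored in the on-disk header authoritative: when it differs from the table-supplied value, read_header() reallocates the area buffer and re-derives the snapshot's chunk_mask and chunk_shift, and the per-area callback array is now sized only once the real chunk_size is known. The derived values with one worked example (chunk_size is a power of two; the 16-byte disk_exception size is an assumption used only for the arithmetic):

/* e.g. chunk_size = 16 sectors (8 KiB with 512-byte sectors):     */
ps->snap->chunk_size  = chunk_size;		/* 16                 */
ps->snap->chunk_mask  = chunk_size - 1;		/* 0xf                */
ps->snap->chunk_shift = ffs(chunk_size) - 1;	/* 4                  */

ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
			  sizeof(struct disk_exception);
						/* 8192 / 16 = 512    */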
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8edd6435414d..d13bb15a8a02 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -13,12 +13,12 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/wait.h> 14#include <linux/wait.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/devfs_fs_kernel.h>
17#include <linux/dm-ioctl.h> 16#include <linux/dm-ioctl.h>
18#include <linux/hdreg.h> 17#include <linux/hdreg.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21 20
21#define DM_MSG_PREFIX "ioctl"
22#define DM_DRIVER_EMAIL "dm-devel@redhat.com" 22#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
23 23
24/*----------------------------------------------------------------- 24/*-----------------------------------------------------------------
@@ -48,7 +48,7 @@ struct vers_iter {
48static struct list_head _name_buckets[NUM_BUCKETS]; 48static struct list_head _name_buckets[NUM_BUCKETS];
49static struct list_head _uuid_buckets[NUM_BUCKETS]; 49static struct list_head _uuid_buckets[NUM_BUCKETS];
50 50
51static void dm_hash_remove_all(void); 51static void dm_hash_remove_all(int keep_open_devices);
52 52
53/* 53/*
54 * Guards access to both hash tables. 54 * Guards access to both hash tables.
@@ -67,14 +67,12 @@ static int dm_hash_init(void)
67{ 67{
68 init_buckets(_name_buckets); 68 init_buckets(_name_buckets);
69 init_buckets(_uuid_buckets); 69 init_buckets(_uuid_buckets);
70 devfs_mk_dir(DM_DIR);
71 return 0; 70 return 0;
72} 71}
73 72
74static void dm_hash_exit(void) 73static void dm_hash_exit(void)
75{ 74{
76 dm_hash_remove_all(); 75 dm_hash_remove_all(0);
77 devfs_remove(DM_DIR);
78} 76}
79 77
80/*----------------------------------------------------------------- 78/*-----------------------------------------------------------------
@@ -102,8 +100,10 @@ static struct hash_cell *__get_name_cell(const char *str)
102 unsigned int h = hash_str(str); 100 unsigned int h = hash_str(str);
103 101
104 list_for_each_entry (hc, _name_buckets + h, name_list) 102 list_for_each_entry (hc, _name_buckets + h, name_list)
105 if (!strcmp(hc->name, str)) 103 if (!strcmp(hc->name, str)) {
104 dm_get(hc->md);
106 return hc; 105 return hc;
106 }
107 107
108 return NULL; 108 return NULL;
109} 109}
@@ -114,8 +114,10 @@ static struct hash_cell *__get_uuid_cell(const char *str)
114 unsigned int h = hash_str(str); 114 unsigned int h = hash_str(str);
115 115
116 list_for_each_entry (hc, _uuid_buckets + h, uuid_list) 116 list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
117 if (!strcmp(hc->uuid, str)) 117 if (!strcmp(hc->uuid, str)) {
118 dm_get(hc->md);
118 return hc; 119 return hc;
120 }
119 121
120 return NULL; 122 return NULL;
121} 123}
@@ -167,31 +169,12 @@ static void free_cell(struct hash_cell *hc)
167} 169}
168 170
169/* 171/*
170 * devfs stuff.
171 */
172static int register_with_devfs(struct hash_cell *hc)
173{
174 struct gendisk *disk = dm_disk(hc->md);
175
176 devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
177 S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
178 DM_DIR "/%s", hc->name);
179 return 0;
180}
181
182static int unregister_with_devfs(struct hash_cell *hc)
183{
184 devfs_remove(DM_DIR"/%s", hc->name);
185 return 0;
186}
187
188/*
189 * The kdev_t and uuid of a device can never change once it is 172 * The kdev_t and uuid of a device can never change once it is
190 * initially inserted. 173 * initially inserted.
191 */ 174 */
192static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) 175static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
193{ 176{
194 struct hash_cell *cell; 177 struct hash_cell *cell, *hc;
195 178
196 /* 179 /*
197 * Allocate the new cells. 180 * Allocate the new cells.
@@ -204,19 +187,23 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
204 * Insert the cell into both hash tables. 187 * Insert the cell into both hash tables.
205 */ 188 */
206 down_write(&_hash_lock); 189 down_write(&_hash_lock);
207 if (__get_name_cell(name)) 190 hc = __get_name_cell(name);
191 if (hc) {
192 dm_put(hc->md);
208 goto bad; 193 goto bad;
194 }
209 195
210 list_add(&cell->name_list, _name_buckets + hash_str(name)); 196 list_add(&cell->name_list, _name_buckets + hash_str(name));
211 197
212 if (uuid) { 198 if (uuid) {
213 if (__get_uuid_cell(uuid)) { 199 hc = __get_uuid_cell(uuid);
200 if (hc) {
214 list_del(&cell->name_list); 201 list_del(&cell->name_list);
202 dm_put(hc->md);
215 goto bad; 203 goto bad;
216 } 204 }
217 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 205 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
218 } 206 }
219 register_with_devfs(cell);
220 dm_get(md); 207 dm_get(md);
221 dm_set_mdptr(md, cell); 208 dm_set_mdptr(md, cell);
222 up_write(&_hash_lock); 209 up_write(&_hash_lock);
@@ -236,7 +223,6 @@ static void __hash_remove(struct hash_cell *hc)
236 /* remove from the dev hash */ 223 /* remove from the dev hash */
237 list_del(&hc->uuid_list); 224 list_del(&hc->uuid_list);
238 list_del(&hc->name_list); 225 list_del(&hc->name_list);
239 unregister_with_devfs(hc);
240 dm_set_mdptr(hc->md, NULL); 226 dm_set_mdptr(hc->md, NULL);
241 227
242 table = dm_get_table(hc->md); 228 table = dm_get_table(hc->md);
@@ -251,19 +237,41 @@ static void __hash_remove(struct hash_cell *hc)
251 free_cell(hc); 237 free_cell(hc);
252} 238}
253 239
254static void dm_hash_remove_all(void) 240static void dm_hash_remove_all(int keep_open_devices)
255{ 241{
256 int i; 242 int i, dev_skipped, dev_removed;
257 struct hash_cell *hc; 243 struct hash_cell *hc;
258 struct list_head *tmp, *n; 244 struct list_head *tmp, *n;
259 245
260 down_write(&_hash_lock); 246 down_write(&_hash_lock);
247
248retry:
249 dev_skipped = dev_removed = 0;
261 for (i = 0; i < NUM_BUCKETS; i++) { 250 for (i = 0; i < NUM_BUCKETS; i++) {
262 list_for_each_safe (tmp, n, _name_buckets + i) { 251 list_for_each_safe (tmp, n, _name_buckets + i) {
263 hc = list_entry(tmp, struct hash_cell, name_list); 252 hc = list_entry(tmp, struct hash_cell, name_list);
253
254 if (keep_open_devices &&
255 dm_lock_for_deletion(hc->md)) {
256 dev_skipped++;
257 continue;
258 }
264 __hash_remove(hc); 259 __hash_remove(hc);
260 dev_removed = 1;
265 } 261 }
266 } 262 }
263
264 /*
265 * Some mapped devices may be using other mapped devices, so if any
266 * still exist, repeat until we make no further progress.
267 */
268 if (dev_skipped) {
269 if (dev_removed)
270 goto retry;
271
272 DMWARN("remove_all left %d open device(s)", dev_skipped);
273 }
274
267 up_write(&_hash_lock); 275 up_write(&_hash_lock);
268} 276}
269 277
@@ -289,6 +297,7 @@ static int dm_hash_rename(const char *old, const char *new)
289 if (hc) { 297 if (hc) {
290 DMWARN("asked to rename to an already existing name %s -> %s", 298 DMWARN("asked to rename to an already existing name %s -> %s",
291 old, new); 299 old, new);
300 dm_put(hc->md);
292 up_write(&_hash_lock); 301 up_write(&_hash_lock);
293 kfree(new_name); 302 kfree(new_name);
294 return -EBUSY; 303 return -EBUSY;
@@ -309,16 +318,11 @@ static int dm_hash_rename(const char *old, const char *new)
309 /* 318 /*
310 * rename and move the name cell. 319 * rename and move the name cell.
311 */ 320 */
312 unregister_with_devfs(hc);
313
314 list_del(&hc->name_list); 321 list_del(&hc->name_list);
315 old_name = hc->name; 322 old_name = hc->name;
316 hc->name = new_name; 323 hc->name = new_name;
317 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 324 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
318 325
319 /* rename the device node in devfs */
320 register_with_devfs(hc);
321
322 /* 326 /*
323 * Wake up any dm event waiters. 327 * Wake up any dm event waiters.
324 */ 328 */
@@ -328,6 +332,7 @@ static int dm_hash_rename(const char *old, const char *new)
328 dm_table_put(table); 332 dm_table_put(table);
329 } 333 }
330 334
335 dm_put(hc->md);
331 up_write(&_hash_lock); 336 up_write(&_hash_lock);
332 kfree(old_name); 337 kfree(old_name);
333 return 0; 338 return 0;
@@ -344,7 +349,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
344 349
345static int remove_all(struct dm_ioctl *param, size_t param_size) 350static int remove_all(struct dm_ioctl *param, size_t param_size)
346{ 351{
347 dm_hash_remove_all(); 352 dm_hash_remove_all(1);
348 param->data_size = 0; 353 param->data_size = 0;
349 return 0; 354 return 0;
350} 355}
@@ -524,7 +529,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
524{ 529{
525 struct gendisk *disk = dm_disk(md); 530 struct gendisk *disk = dm_disk(md);
526 struct dm_table *table; 531 struct dm_table *table;
527 struct block_device *bdev;
528 532
529 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 533 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
530 DM_ACTIVE_PRESENT_FLAG); 534 DM_ACTIVE_PRESENT_FLAG);
@@ -534,20 +538,12 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
534 538
535 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); 539 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
536 540
537 if (!(param->flags & DM_SKIP_BDGET_FLAG)) { 541 /*
538 bdev = bdget_disk(disk, 0); 542 * Yes, this will be out of date by the time it gets back
539 if (!bdev) 543 * to userland, but it is still very useful for
540 return -ENXIO; 544 * debugging.
541 545 */
542 /* 546 param->open_count = dm_open_count(md);
543 * Yes, this will be out of date by the time it gets back
544 * to userland, but it is still very useful for
545 * debugging.
546 */
547 param->open_count = bdev->bd_openers;
548 bdput(bdev);
549 } else
550 param->open_count = -1;
551 547
552 if (disk->policy) 548 if (disk->policy)
553 param->flags |= DM_READONLY_FLAG; 549 param->flags |= DM_READONLY_FLAG;
@@ -567,7 +563,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
567 563
568static int dev_create(struct dm_ioctl *param, size_t param_size) 564static int dev_create(struct dm_ioctl *param, size_t param_size)
569{ 565{
570 int r; 566 int r, m = DM_ANY_MINOR;
571 struct mapped_device *md; 567 struct mapped_device *md;
572 568
573 r = check_name(param->name); 569 r = check_name(param->name);
@@ -575,10 +571,9 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
575 return r; 571 return r;
576 572
577 if (param->flags & DM_PERSISTENT_DEV_FLAG) 573 if (param->flags & DM_PERSISTENT_DEV_FLAG)
578 r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); 574 m = MINOR(huge_decode_dev(param->dev));
579 else
580 r = dm_create(&md);
581 575
576 r = dm_create(m, &md);
582 if (r) 577 if (r)
583 return r; 578 return r;
584 579
@@ -611,10 +606,8 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
611 return __get_name_cell(param->name); 606 return __get_name_cell(param->name);
612 607
613 md = dm_get_md(huge_decode_dev(param->dev)); 608 md = dm_get_md(huge_decode_dev(param->dev));
614 if (md) { 609 if (md)
615 mdptr = dm_get_mdptr(md); 610 mdptr = dm_get_mdptr(md);
616 dm_put(md);
617 }
618 611
619 return mdptr; 612 return mdptr;
620} 613}
@@ -628,7 +621,6 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
628 hc = __find_device_hash_cell(param); 621 hc = __find_device_hash_cell(param);
629 if (hc) { 622 if (hc) {
630 md = hc->md; 623 md = hc->md;
631 dm_get(md);
632 624
633 /* 625 /*
634 * Sneakily write in both the name and the uuid 626 * Sneakily write in both the name and the uuid
@@ -653,6 +645,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
653static int dev_remove(struct dm_ioctl *param, size_t param_size) 645static int dev_remove(struct dm_ioctl *param, size_t param_size)
654{ 646{
655 struct hash_cell *hc; 647 struct hash_cell *hc;
648 struct mapped_device *md;
649 int r;
656 650
657 down_write(&_hash_lock); 651 down_write(&_hash_lock);
658 hc = __find_device_hash_cell(param); 652 hc = __find_device_hash_cell(param);
@@ -663,8 +657,22 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
663 return -ENXIO; 657 return -ENXIO;
664 } 658 }
665 659
660 md = hc->md;
661
662 /*
663 * Ensure the device is not open and nothing further can open it.
664 */
665 r = dm_lock_for_deletion(md);
666 if (r) {
667 DMWARN("unable to remove open device %s", hc->name);
668 up_write(&_hash_lock);
669 dm_put(md);
670 return r;
671 }
672
666 __hash_remove(hc); 673 __hash_remove(hc);
667 up_write(&_hash_lock); 674 up_write(&_hash_lock);
675 dm_put(md);
668 param->data_size = 0; 676 param->data_size = 0;
669 return 0; 677 return 0;
670} 678}
@@ -790,7 +798,6 @@ static int do_resume(struct dm_ioctl *param)
790 } 798 }
791 799
792 md = hc->md; 800 md = hc->md;
793 dm_get(md);
794 801
795 new_map = hc->new_map; 802 new_map = hc->new_map;
796 hc->new_map = NULL; 803 hc->new_map = NULL;
@@ -1078,6 +1085,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1078{ 1085{
1079 int r; 1086 int r;
1080 struct hash_cell *hc; 1087 struct hash_cell *hc;
1088 struct mapped_device *md;
1081 1089
1082 down_write(&_hash_lock); 1090 down_write(&_hash_lock);
1083 1091
@@ -1096,7 +1104,9 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1096 param->flags &= ~DM_INACTIVE_PRESENT_FLAG; 1104 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
1097 1105
1098 r = __dev_status(hc->md, param); 1106 r = __dev_status(hc->md, param);
1107 md = hc->md;
1099 up_write(&_hash_lock); 1108 up_write(&_hash_lock);
1109 dm_put(md);
1100 return r; 1110 return r;
1101} 1111}
1102 1112
@@ -1462,7 +1472,6 @@ static struct file_operations _ctl_fops = {
1462static struct miscdevice _dm_misc = { 1472static struct miscdevice _dm_misc = {
1463 .minor = MISC_DYNAMIC_MINOR, 1473 .minor = MISC_DYNAMIC_MINOR,
1464 .name = DM_NAME, 1474 .name = DM_NAME,
1465 .devfs_name = "mapper/control",
1466 .fops = &_ctl_fops 1475 .fops = &_ctl_fops
1467}; 1476};
1468 1477
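The other recurring change in dm-ioctl.c is reference discipline: __get_name_cell() and __get_uuid_cell() now take a reference on the mapped device they return, dev_create() funnels both the persistent- and dynamic-minor cases through a single dm_create(m, &md), and dev_remove() calls dm_lock_for_deletion() so an open device can no longer be removed out from under its users. The contract the lookup helpers now impose on their callers, sketched with error handling trimmed:

down_write(&_hash_lock);
hc = __get_name_cell(name);	/* on success this also did dm_get(hc->md) */
if (!hc) {
	up_write(&_hash_lock);
	return -ENXIO;
}

md = hc->md;
/* ... work on hc / md while holding the hash lock ... */
up_write(&_hash_lock);

dm_put(md);			/* balance the reference taken by the lookup */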
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index daf586c0898d..47b3c62bbdb8 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "linear"
16
15/* 17/*
16 * Linear: maps a linear range of a device. 18 * Linear: maps a linear range of a device.
17 */ 19 */
@@ -29,7 +31,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29 unsigned long long tmp; 31 unsigned long long tmp;
30 32
31 if (argc != 2) { 33 if (argc != 2) {
32 ti->error = "dm-linear: Invalid argument count"; 34 ti->error = "Invalid argument count";
33 return -EINVAL; 35 return -EINVAL;
34 } 36 }
35 37
@@ -111,7 +113,7 @@ int __init dm_linear_init(void)
111 int r = dm_register_target(&linear_target); 113 int r = dm_register_target(&linear_target);
112 114
113 if (r < 0) 115 if (r < 0)
114 DMERR("linear: register failed %d", r); 116 DMERR("register failed %d", r);
115 117
116 return r; 118 return r;
117} 119}
@@ -121,5 +123,5 @@ void dm_linear_exit(void)
121 int r = dm_unregister_target(&linear_target); 123 int r = dm_unregister_target(&linear_target);
122 124
123 if (r < 0) 125 if (r < 0)
124 DMERR("linear: unregister failed %d", r); 126 DMERR("unregister failed %d", r);
125} 127}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index d73779a42417..64b764bd02cc 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -12,6 +12,8 @@
12#include "dm-log.h" 12#include "dm-log.h"
13#include "dm-io.h" 13#include "dm-io.h"
14 14
15#define DM_MSG_PREFIX "mirror log"
16
15static LIST_HEAD(_log_types); 17static LIST_HEAD(_log_types);
16static DEFINE_SPINLOCK(_lock); 18static DEFINE_SPINLOCK(_lock);
17 19
@@ -155,8 +157,6 @@ struct log_c {
155 157
156 struct io_region header_location; 158 struct io_region header_location;
157 struct log_header *disk_header; 159 struct log_header *disk_header;
158
159 struct io_region bits_location;
160}; 160};
161 161
162/* 162/*
@@ -241,43 +241,21 @@ static inline int write_header(struct log_c *log)
241} 241}
242 242
243/*---------------------------------------------------------------- 243/*----------------------------------------------------------------
244 * Bits IO
245 *--------------------------------------------------------------*/
246static int read_bits(struct log_c *log)
247{
248 int r;
249 unsigned long ebits;
250
251 r = dm_io_sync_vm(1, &log->bits_location, READ,
252 log->clean_bits, &ebits);
253 if (r)
254 return r;
255
256 return 0;
257}
258
259static int write_bits(struct log_c *log)
260{
261 unsigned long ebits;
262 return dm_io_sync_vm(1, &log->bits_location, WRITE,
263 log->clean_bits, &ebits);
264}
265
266/*----------------------------------------------------------------
267 * core log constructor/destructor 244 * core log constructor/destructor
268 * 245 *
269 * argv contains region_size followed optionally by [no]sync 246 * argv contains region_size followed optionally by [no]sync
270 *--------------------------------------------------------------*/ 247 *--------------------------------------------------------------*/
271#define BYTE_SHIFT 3 248#define BYTE_SHIFT 3
272static int core_ctr(struct dirty_log *log, struct dm_target *ti, 249static int create_log_context(struct dirty_log *log, struct dm_target *ti,
273 unsigned int argc, char **argv) 250 unsigned int argc, char **argv,
251 struct dm_dev *dev)
274{ 252{
275 enum sync sync = DEFAULTSYNC; 253 enum sync sync = DEFAULTSYNC;
276 254
277 struct log_c *lc; 255 struct log_c *lc;
278 uint32_t region_size; 256 uint32_t region_size;
279 unsigned int region_count; 257 unsigned int region_count;
280 size_t bitset_size; 258 size_t bitset_size, buf_size;
281 259
282 if (argc < 1 || argc > 2) { 260 if (argc < 1 || argc > 2) {
283 DMWARN("wrong number of arguments to mirror log"); 261 DMWARN("wrong number of arguments to mirror log");
@@ -319,22 +297,53 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
319 * Work out how many "unsigned long"s we need to hold the bitset. 297 * Work out how many "unsigned long"s we need to hold the bitset.
320 */ 298 */
321 bitset_size = dm_round_up(region_count, 299 bitset_size = dm_round_up(region_count,
322 sizeof(unsigned long) << BYTE_SHIFT); 300 sizeof(*lc->clean_bits) << BYTE_SHIFT);
323 bitset_size >>= BYTE_SHIFT; 301 bitset_size >>= BYTE_SHIFT;
324 302
325 lc->bitset_uint32_count = bitset_size / 4; 303 lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
326 lc->clean_bits = vmalloc(bitset_size); 304
327 if (!lc->clean_bits) { 305 /*
328 DMWARN("couldn't allocate clean bitset"); 306 * Disk log?
329 kfree(lc); 307 */
330 return -ENOMEM; 308 if (!dev) {
309 lc->clean_bits = vmalloc(bitset_size);
310 if (!lc->clean_bits) {
311 DMWARN("couldn't allocate clean bitset");
312 kfree(lc);
313 return -ENOMEM;
314 }
315 lc->disk_header = NULL;
316 } else {
317 lc->log_dev = dev;
318 lc->header_location.bdev = lc->log_dev->bdev;
319 lc->header_location.sector = 0;
320
321 /*
322 * Buffer holds both header and bitset.
323 */
324 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
325 bitset_size, ti->limits.hardsect_size);
326 lc->header_location.count = buf_size >> SECTOR_SHIFT;
327
328 lc->disk_header = vmalloc(buf_size);
329 if (!lc->disk_header) {
330 DMWARN("couldn't allocate disk log buffer");
331 kfree(lc);
332 return -ENOMEM;
333 }
334
335 lc->clean_bits = (void *)lc->disk_header +
336 (LOG_OFFSET << SECTOR_SHIFT);
331 } 337 }
338
332 memset(lc->clean_bits, -1, bitset_size); 339 memset(lc->clean_bits, -1, bitset_size);
333 340
334 lc->sync_bits = vmalloc(bitset_size); 341 lc->sync_bits = vmalloc(bitset_size);
335 if (!lc->sync_bits) { 342 if (!lc->sync_bits) {
336 DMWARN("couldn't allocate sync bitset"); 343 DMWARN("couldn't allocate sync bitset");
337 vfree(lc->clean_bits); 344 if (!dev)
345 vfree(lc->clean_bits);
346 vfree(lc->disk_header);
338 kfree(lc); 347 kfree(lc);
339 return -ENOMEM; 348 return -ENOMEM;
340 } 349 }
@@ -345,25 +354,40 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti,
345 if (!lc->recovering_bits) { 354 if (!lc->recovering_bits) {
346 DMWARN("couldn't allocate sync bitset"); 355 DMWARN("couldn't allocate sync bitset");
347 vfree(lc->sync_bits); 356 vfree(lc->sync_bits);
348 vfree(lc->clean_bits); 357 if (!dev)
358 vfree(lc->clean_bits);
359 vfree(lc->disk_header);
349 kfree(lc); 360 kfree(lc);
350 return -ENOMEM; 361 return -ENOMEM;
351 } 362 }
352 memset(lc->recovering_bits, 0, bitset_size); 363 memset(lc->recovering_bits, 0, bitset_size);
353 lc->sync_search = 0; 364 lc->sync_search = 0;
354 log->context = lc; 365 log->context = lc;
366
355 return 0; 367 return 0;
356} 368}
357 369
358static void core_dtr(struct dirty_log *log) 370static int core_ctr(struct dirty_log *log, struct dm_target *ti,
371 unsigned int argc, char **argv)
372{
373 return create_log_context(log, ti, argc, argv, NULL);
374}
375
376static void destroy_log_context(struct log_c *lc)
359{ 377{
360 struct log_c *lc = (struct log_c *) log->context;
361 vfree(lc->clean_bits);
362 vfree(lc->sync_bits); 378 vfree(lc->sync_bits);
363 vfree(lc->recovering_bits); 379 vfree(lc->recovering_bits);
364 kfree(lc); 380 kfree(lc);
365} 381}
366 382
383static void core_dtr(struct dirty_log *log)
384{
385 struct log_c *lc = (struct log_c *) log->context;
386
387 vfree(lc->clean_bits);
388 destroy_log_context(lc);
389}
390
367/*---------------------------------------------------------------- 391/*----------------------------------------------------------------
368 * disk log constructor/destructor 392 * disk log constructor/destructor
369 * 393 *
@@ -373,8 +397,6 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
373 unsigned int argc, char **argv) 397 unsigned int argc, char **argv)
374{ 398{
375 int r; 399 int r;
376 size_t size;
377 struct log_c *lc;
378 struct dm_dev *dev; 400 struct dm_dev *dev;
379 401
380 if (argc < 2 || argc > 3) { 402 if (argc < 2 || argc > 3) {
@@ -387,49 +409,22 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
387 if (r) 409 if (r)
388 return r; 410 return r;
389 411
390 r = core_ctr(log, ti, argc - 1, argv + 1); 412 r = create_log_context(log, ti, argc - 1, argv + 1, dev);
391 if (r) { 413 if (r) {
392 dm_put_device(ti, dev); 414 dm_put_device(ti, dev);
393 return r; 415 return r;
394 } 416 }
395 417
396 lc = (struct log_c *) log->context;
397 lc->log_dev = dev;
398
399 /* setup the disk header fields */
400 lc->header_location.bdev = lc->log_dev->bdev;
401 lc->header_location.sector = 0;
402 lc->header_location.count = 1;
403
404 /*
405 * We can't read less than this amount, even though we'll
406 * not be using most of this space.
407 */
408 lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
409 if (!lc->disk_header)
410 goto bad;
411
412 /* setup the disk bitset fields */
413 lc->bits_location.bdev = lc->log_dev->bdev;
414 lc->bits_location.sector = LOG_OFFSET;
415
416 size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
417 1 << SECTOR_SHIFT);
418 lc->bits_location.count = size >> SECTOR_SHIFT;
419 return 0; 418 return 0;
420
421 bad:
422 dm_put_device(ti, lc->log_dev);
423 core_dtr(log);
424 return -ENOMEM;
425} 419}
426 420
427static void disk_dtr(struct dirty_log *log) 421static void disk_dtr(struct dirty_log *log)
428{ 422{
429 struct log_c *lc = (struct log_c *) log->context; 423 struct log_c *lc = (struct log_c *) log->context;
424
430 dm_put_device(lc->ti, lc->log_dev); 425 dm_put_device(lc->ti, lc->log_dev);
431 vfree(lc->disk_header); 426 vfree(lc->disk_header);
432 core_dtr(log); 427 destroy_log_context(lc);
433} 428}
434 429
435static int count_bits32(uint32_t *addr, unsigned size) 430static int count_bits32(uint32_t *addr, unsigned size)
@@ -454,12 +449,7 @@ static int disk_resume(struct dirty_log *log)
454 if (r) 449 if (r)
455 return r; 450 return r;
456 451
457 /* read the bits */ 452 /* set or clear any new bits -- device has grown */
458 r = read_bits(lc);
459 if (r)
460 return r;
461
462 /* set or clear any new bits */
463 if (lc->sync == NOSYNC) 453 if (lc->sync == NOSYNC)
464 for (i = lc->header.nr_regions; i < lc->region_count; i++) 454 for (i = lc->header.nr_regions; i < lc->region_count; i++)
465 /* FIXME: amazingly inefficient */ 455 /* FIXME: amazingly inefficient */
@@ -469,15 +459,14 @@ static int disk_resume(struct dirty_log *log)
469 /* FIXME: amazingly inefficient */ 459 /* FIXME: amazingly inefficient */
470 log_clear_bit(lc, lc->clean_bits, i); 460 log_clear_bit(lc, lc->clean_bits, i);
471 461
462 /* clear any old bits -- device has shrunk */
463 for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
464 log_clear_bit(lc, lc->clean_bits, i);
465
472 /* copy clean across to sync */ 466 /* copy clean across to sync */
473 memcpy(lc->sync_bits, lc->clean_bits, size); 467 memcpy(lc->sync_bits, lc->clean_bits, size);
474 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); 468 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
475 469
476 /* write the bits */
477 r = write_bits(lc);
478 if (r)
479 return r;
480
481 /* set the correct number of regions in the header */ 470 /* set the correct number of regions in the header */
482 lc->header.nr_regions = lc->region_count; 471 lc->header.nr_regions = lc->region_count;
483 472
@@ -518,7 +507,7 @@ static int disk_flush(struct dirty_log *log)
518 if (!lc->touched) 507 if (!lc->touched)
519 return 0; 508 return 0;
520 509
521 r = write_bits(lc); 510 r = write_header(lc);
522 if (!r) 511 if (!r)
523 lc->touched = 0; 512 lc->touched = 0;
524 513
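The mirror-log rework above folds the separate header and bitset regions into one vmalloc'd buffer: the header sits at the start, the clean bitset begins LOG_OFFSET sectors in, and a single write_header() flushes both, which is why read_bits()/write_bits() could be deleted and why disk_flush() now calls write_header(). The layout arithmetic, with a worked example that assumes LOG_OFFSET is 2 sectors and a 512-byte hardsect_size:

/* One buffer holds header + bitset, rounded up to the hard sector
 * size so chunk_io() can move it in a single transfer.             */
bitset_size = dm_round_up(region_count,
			  sizeof(*lc->clean_bits) << BYTE_SHIFT) >> BYTE_SHIFT;

buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
		       ti->limits.hardsect_size);

lc->disk_header = vmalloc(buf_size);
lc->clean_bits  = (void *)lc->disk_header + (LOG_OFFSET << SECTOR_SHIFT);
lc->header_location.count = buf_size >> SECTOR_SHIFT;

/* e.g. region_count = 1000 with a 32-bit bitset word:
 *   bitset_size = 128 bytes, header area = 2 * 512 = 1024 bytes,
 *   buf_size    = round_up(1024 + 128, 512) = 1536 bytes = 3 sectors */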
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1816f30678ed..93f701ea87bc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -21,6 +21,7 @@
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <asm/atomic.h> 22#include <asm/atomic.h>
23 23
24#define DM_MSG_PREFIX "multipath"
24#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
25 26
26/* Path properties */ 27/* Path properties */
@@ -446,8 +447,6 @@ struct param {
446 char *error; 447 char *error;
447}; 448};
448 449
449#define ESTR(s) ("dm-multipath: " s)
450
451static int read_param(struct param *param, char *str, unsigned *v, char **error) 450static int read_param(struct param *param, char *str, unsigned *v, char **error)
452{ 451{
453 if (!str || 452 if (!str ||
@@ -495,12 +494,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
495 unsigned ps_argc; 494 unsigned ps_argc;
496 495
497 static struct param _params[] = { 496 static struct param _params[] = {
498 {0, 1024, ESTR("invalid number of path selector args")}, 497 {0, 1024, "invalid number of path selector args"},
499 }; 498 };
500 499
501 pst = dm_get_path_selector(shift(as)); 500 pst = dm_get_path_selector(shift(as));
502 if (!pst) { 501 if (!pst) {
503 ti->error = ESTR("unknown path selector type"); 502 ti->error = "unknown path selector type";
504 return -EINVAL; 503 return -EINVAL;
505 } 504 }
506 505
@@ -511,7 +510,7 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
511 r = pst->create(&pg->ps, ps_argc, as->argv); 510 r = pst->create(&pg->ps, ps_argc, as->argv);
512 if (r) { 511 if (r) {
513 dm_put_path_selector(pst); 512 dm_put_path_selector(pst);
514 ti->error = ESTR("path selector constructor failed"); 513 ti->error = "path selector constructor failed";
515 return r; 514 return r;
516 } 515 }
517 516
@@ -529,7 +528,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
529 528
530 /* we need at least a path arg */ 529 /* we need at least a path arg */
531 if (as->argc < 1) { 530 if (as->argc < 1) {
532 ti->error = ESTR("no device given"); 531 ti->error = "no device given";
533 return NULL; 532 return NULL;
534 } 533 }
535 534
@@ -540,7 +539,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
540 r = dm_get_device(ti, shift(as), ti->begin, ti->len, 539 r = dm_get_device(ti, shift(as), ti->begin, ti->len,
541 dm_table_get_mode(ti->table), &p->path.dev); 540 dm_table_get_mode(ti->table), &p->path.dev);
542 if (r) { 541 if (r) {
543 ti->error = ESTR("error getting device"); 542 ti->error = "error getting device";
544 goto bad; 543 goto bad;
545 } 544 }
546 545
@@ -562,8 +561,8 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
562 struct dm_target *ti) 561 struct dm_target *ti)
563{ 562{
564 static struct param _params[] = { 563 static struct param _params[] = {
565 {1, 1024, ESTR("invalid number of paths")}, 564 {1, 1024, "invalid number of paths"},
566 {0, 1024, ESTR("invalid number of selector args")} 565 {0, 1024, "invalid number of selector args"}
567 }; 566 };
568 567
569 int r; 568 int r;
@@ -572,13 +571,13 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
572 571
573 if (as->argc < 2) { 572 if (as->argc < 2) {
574 as->argc = 0; 573 as->argc = 0;
575 ti->error = ESTR("not enough priority group aruments"); 574 ti->error = "not enough priority group aruments";
576 return NULL; 575 return NULL;
577 } 576 }
578 577
579 pg = alloc_priority_group(); 578 pg = alloc_priority_group();
580 if (!pg) { 579 if (!pg) {
581 ti->error = ESTR("couldn't allocate priority group"); 580 ti->error = "couldn't allocate priority group";
582 return NULL; 581 return NULL;
583 } 582 }
584 pg->m = m; 583 pg->m = m;
@@ -633,7 +632,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
633 unsigned hw_argc; 632 unsigned hw_argc;
634 633
635 static struct param _params[] = { 634 static struct param _params[] = {
636 {0, 1024, ESTR("invalid number of hardware handler args")}, 635 {0, 1024, "invalid number of hardware handler args"},
637 }; 636 };
638 637
639 r = read_param(_params, shift(as), &hw_argc, &ti->error); 638 r = read_param(_params, shift(as), &hw_argc, &ti->error);
@@ -645,14 +644,14 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
645 644
646 hwht = dm_get_hw_handler(shift(as)); 645 hwht = dm_get_hw_handler(shift(as));
647 if (!hwht) { 646 if (!hwht) {
648 ti->error = ESTR("unknown hardware handler type"); 647 ti->error = "unknown hardware handler type";
649 return -EINVAL; 648 return -EINVAL;
650 } 649 }
651 650
652 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); 651 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
653 if (r) { 652 if (r) {
654 dm_put_hw_handler(hwht); 653 dm_put_hw_handler(hwht);
655 ti->error = ESTR("hardware handler constructor failed"); 654 ti->error = "hardware handler constructor failed";
656 return r; 655 return r;
657 } 656 }
658 657
@@ -669,7 +668,7 @@ static int parse_features(struct arg_set *as, struct multipath *m,
669 unsigned argc; 668 unsigned argc;
670 669
671 static struct param _params[] = { 670 static struct param _params[] = {
672 {0, 1, ESTR("invalid number of feature args")}, 671 {0, 1, "invalid number of feature args"},
673 }; 672 };
674 673
675 r = read_param(_params, shift(as), &argc, &ti->error); 674 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -692,8 +691,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
692{ 691{
693 /* target parameters */ 692 /* target parameters */
694 static struct param _params[] = { 693 static struct param _params[] = {
695 {1, 1024, ESTR("invalid number of priority groups")}, 694 {1, 1024, "invalid number of priority groups"},
696 {1, 1024, ESTR("invalid initial priority group number")}, 695 {1, 1024, "invalid initial priority group number"},
697 }; 696 };
698 697
699 int r; 698 int r;
@@ -707,10 +706,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
707 706
708 m = alloc_multipath(); 707 m = alloc_multipath();
709 if (!m) { 708 if (!m) {
710 ti->error = ESTR("can't allocate multipath"); 709 ti->error = "can't allocate multipath";
711 return -EINVAL; 710 return -EINVAL;
712 } 711 }
713 712
713 m->ti = ti;
714
714 r = parse_features(&as, m, ti); 715 r = parse_features(&as, m, ti);
715 if (r) 716 if (r)
716 goto bad; 717 goto bad;
@@ -746,13 +747,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
746 } 747 }
747 748
748 if (pg_count != m->nr_priority_groups) { 749 if (pg_count != m->nr_priority_groups) {
749 ti->error = ESTR("priority group count mismatch"); 750 ti->error = "priority group count mismatch";
750 r = -EINVAL; 751 r = -EINVAL;
751 goto bad; 752 goto bad;
752 } 753 }
753 754
754 ti->private = m; 755 ti->private = m;
755 m->ti = ti;
756 756
757 return 0; 757 return 0;
758 758
@@ -807,7 +807,7 @@ static int fail_path(struct pgpath *pgpath)
807 if (!pgpath->path.is_active) 807 if (!pgpath->path.is_active)
808 goto out; 808 goto out;
809 809
810 DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); 810 DMWARN("Failing path %s.", pgpath->path.dev->name);
811 811
812 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); 812 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
813 pgpath->path.is_active = 0; 813 pgpath->path.is_active = 0;
@@ -1250,7 +1250,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1250 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1250 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1251 dm_table_get_mode(ti->table), &dev); 1251 dm_table_get_mode(ti->table), &dev);
1252 if (r) { 1252 if (r) {
1253 DMWARN("dm-multipath message: error getting device %s", 1253 DMWARN("message: error getting device %s",
1254 argv[1]); 1254 argv[1]);
1255 return -EINVAL; 1255 return -EINVAL;
1256 } 1256 }
@@ -1309,7 +1309,7 @@ static int __init dm_multipath_init(void)
1309 return -ENOMEM; 1309 return -ENOMEM;
1310 } 1310 }
1311 1311
1312 DMINFO("dm-multipath version %u.%u.%u loaded", 1312 DMINFO("version %u.%u.%u loaded",
1313 multipath_target.version[0], multipath_target.version[1], 1313 multipath_target.version[0], multipath_target.version[1],
1314 multipath_target.version[2]); 1314 multipath_target.version[2]);
1315 1315
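Apart from dropping the ESTR() wrapper now that DM_MSG_PREFIX carries the "multipath" prefix, the functional change in multipath_ctr() above is that m->ti is set before parse_features() runs instead of at the end of the constructor. The param-table idiom all of those error strings feed into, paraphrased from the structure visible above rather than quoted verbatim:

struct param {
	unsigned min;
	unsigned max;
	char *error;		/* becomes ti->error on a parse or range failure */
};

static int read_param(struct param *param, char *str,
		      unsigned *v, char **error)
{
	if (!str || sscanf(str, "%u", v) != 1 ||
	    *v < param->min || *v > param->max) {
		*error = param->error;
		return -EINVAL;
	}
	return 0;
}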
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d12cf3e5e076..c54de989eb00 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,8 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22 22
23#define DM_MSG_PREFIX "raid1"
24
23static struct workqueue_struct *_kmirrord_wq; 25static struct workqueue_struct *_kmirrord_wq;
24static struct work_struct _kmirrord_work; 26static struct work_struct _kmirrord_work;
25 27
@@ -106,12 +108,42 @@ struct region {
106 struct bio_list delayed_bios; 108 struct bio_list delayed_bios;
107}; 109};
108 110
111
112/*-----------------------------------------------------------------
113 * Mirror set structures.
114 *---------------------------------------------------------------*/
115struct mirror {
116 atomic_t error_count;
117 struct dm_dev *dev;
118 sector_t offset;
119};
120
121struct mirror_set {
122 struct dm_target *ti;
123 struct list_head list;
124 struct region_hash rh;
125 struct kcopyd_client *kcopyd_client;
126
127 spinlock_t lock; /* protects the next two lists */
128 struct bio_list reads;
129 struct bio_list writes;
130
131 /* recovery */
132 region_t nr_regions;
133 int in_sync;
134
135 struct mirror *default_mirror; /* Default mirror */
136
137 unsigned int nr_mirrors;
138 struct mirror mirror[0];
139};
140
109/* 141/*
110 * Conversion fns 142 * Conversion fns
111 */ 143 */
112static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) 144static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
113{ 145{
114 return bio->bi_sector >> rh->region_shift; 146 return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
115} 147}
116 148
117static inline sector_t region_to_sector(struct region_hash *rh, region_t region) 149static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
@@ -223,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region)
223 struct region *reg, *nreg; 255 struct region *reg, *nreg;
224 256
225 read_unlock(&rh->hash_lock); 257 read_unlock(&rh->hash_lock);
226 nreg = mempool_alloc(rh->region_pool, GFP_NOIO); 258 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
259 if (unlikely(!nreg))
260 nreg = kmalloc(sizeof(struct region), GFP_NOIO);
227 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 261 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
228 RH_CLEAN : RH_NOSYNC; 262 RH_CLEAN : RH_NOSYNC;
229 nreg->rh = rh; 263 nreg->rh = rh;
@@ -458,11 +492,9 @@ static int __rh_recovery_prepare(struct region_hash *rh)
458 /* Already quiesced ? */ 492 /* Already quiesced ? */
459 if (atomic_read(&reg->pending)) 493 if (atomic_read(&reg->pending))
460 list_del_init(&reg->list); 494 list_del_init(&reg->list);
495 else
496 list_move(&reg->list, &rh->quiesced_regions);
461 497
462 else {
463 list_del_init(&reg->list);
464 list_add(&reg->list, &rh->quiesced_regions);
465 }
466 spin_unlock_irq(&rh->region_lock); 498 spin_unlock_irq(&rh->region_lock);
467 499
468 return 1; 500 return 1;
@@ -541,35 +573,6 @@ static void rh_start_recovery(struct region_hash *rh)
541 wake(); 573 wake();
542} 574}
543 575
544/*-----------------------------------------------------------------
545 * Mirror set structures.
546 *---------------------------------------------------------------*/
547struct mirror {
548 atomic_t error_count;
549 struct dm_dev *dev;
550 sector_t offset;
551};
552
553struct mirror_set {
554 struct dm_target *ti;
555 struct list_head list;
556 struct region_hash rh;
557 struct kcopyd_client *kcopyd_client;
558
559 spinlock_t lock; /* protects the next two lists */
560 struct bio_list reads;
561 struct bio_list writes;
562
563 /* recovery */
564 region_t nr_regions;
565 int in_sync;
566
567 struct mirror *default_mirror; /* Default mirror */
568
569 unsigned int nr_mirrors;
570 struct mirror mirror[0];
571};
572
573/* 576/*
574 * Every mirror should look like this one. 577 * Every mirror should look like this one.
575 */ 578 */
@@ -603,7 +606,7 @@ static void recovery_complete(int read_err, unsigned int write_err,
603 struct region *reg = (struct region *) context; 606 struct region *reg = (struct region *) context;
604 607
605 /* FIXME: better error handling */ 608 /* FIXME: better error handling */
606 rh_recovery_end(reg, read_err || write_err); 609 rh_recovery_end(reg, !(read_err || write_err));
607} 610}
608 611
609static int recover(struct mirror_set *ms, struct region *reg) 612static int recover(struct mirror_set *ms, struct region *reg)
@@ -893,7 +896,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
893 896
894 ms = kmalloc(len, GFP_KERNEL); 897 ms = kmalloc(len, GFP_KERNEL);
895 if (!ms) { 898 if (!ms) {
896 ti->error = "dm-mirror: Cannot allocate mirror context"; 899 ti->error = "Cannot allocate mirror context";
897 return NULL; 900 return NULL;
898 } 901 }
899 902
@@ -907,7 +910,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
907 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; 910 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
908 911
909 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 912 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
910 ti->error = "dm-mirror: Error creating dirty region hash"; 913 ti->error = "Error creating dirty region hash";
911 kfree(ms); 914 kfree(ms);
912 return NULL; 915 return NULL;
913 } 916 }
@@ -937,14 +940,14 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
937 unsigned long long offset; 940 unsigned long long offset;
938 941
939 if (sscanf(argv[1], "%llu", &offset) != 1) { 942 if (sscanf(argv[1], "%llu", &offset) != 1) {
940 ti->error = "dm-mirror: Invalid offset"; 943 ti->error = "Invalid offset";
941 return -EINVAL; 944 return -EINVAL;
942 } 945 }
943 946
944 if (dm_get_device(ti, argv[0], offset, ti->len, 947 if (dm_get_device(ti, argv[0], offset, ti->len,
945 dm_table_get_mode(ti->table), 948 dm_table_get_mode(ti->table),
946 &ms->mirror[mirror].dev)) { 949 &ms->mirror[mirror].dev)) {
947 ti->error = "dm-mirror: Device lookup failure"; 950 ti->error = "Device lookup failure";
948 return -ENXIO; 951 return -ENXIO;
949 } 952 }
950 953
@@ -981,30 +984,30 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
981 struct dirty_log *dl; 984 struct dirty_log *dl;
982 985
983 if (argc < 2) { 986 if (argc < 2) {
984 ti->error = "dm-mirror: Insufficient mirror log arguments"; 987 ti->error = "Insufficient mirror log arguments";
985 return NULL; 988 return NULL;
986 } 989 }
987 990
988 if (sscanf(argv[1], "%u", &param_count) != 1) { 991 if (sscanf(argv[1], "%u", &param_count) != 1) {
989 ti->error = "dm-mirror: Invalid mirror log argument count"; 992 ti->error = "Invalid mirror log argument count";
990 return NULL; 993 return NULL;
991 } 994 }
992 995
993 *args_used = 2 + param_count; 996 *args_used = 2 + param_count;
994 997
995 if (argc < *args_used) { 998 if (argc < *args_used) {
996 ti->error = "dm-mirror: Insufficient mirror log arguments"; 999 ti->error = "Insufficient mirror log arguments";
997 return NULL; 1000 return NULL;
998 } 1001 }
999 1002
1000 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); 1003 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
1001 if (!dl) { 1004 if (!dl) {
1002 ti->error = "dm-mirror: Error creating mirror dirty log"; 1005 ti->error = "Error creating mirror dirty log";
1003 return NULL; 1006 return NULL;
1004 } 1007 }
1005 1008
1006 if (!_check_region_size(ti, dl->type->get_region_size(dl))) { 1009 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
1007 ti->error = "dm-mirror: Invalid region size"; 1010 ti->error = "Invalid region size";
1008 dm_destroy_dirty_log(dl); 1011 dm_destroy_dirty_log(dl);
1009 return NULL; 1012 return NULL;
1010 } 1013 }
@@ -1038,7 +1041,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1038 1041
1039 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1042 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1040 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { 1043 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
1041 ti->error = "dm-mirror: Invalid number of mirrors"; 1044 ti->error = "Invalid number of mirrors";
1042 dm_destroy_dirty_log(dl); 1045 dm_destroy_dirty_log(dl);
1043 return -EINVAL; 1046 return -EINVAL;
1044 } 1047 }
@@ -1046,7 +1049,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1046 argv++, argc--; 1049 argv++, argc--;
1047 1050
1048 if (argc != nr_mirrors * 2) { 1051 if (argc != nr_mirrors * 2) {
1049 ti->error = "dm-mirror: Wrong number of mirror arguments"; 1052 ti->error = "Wrong number of mirror arguments";
1050 dm_destroy_dirty_log(dl); 1053 dm_destroy_dirty_log(dl);
1051 return -EINVAL; 1054 return -EINVAL;
1052 } 1055 }
@@ -1115,7 +1118,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1115 struct mirror *m; 1118 struct mirror *m;
1116 struct mirror_set *ms = ti->private; 1119 struct mirror_set *ms = ti->private;
1117 1120
1118 map_context->ll = bio->bi_sector >> ms->rh.region_shift; 1121 map_context->ll = bio_to_region(&ms->rh, bio);
1119 1122
1120 if (rw == WRITE) { 1123 if (rw == WRITE) {
1121 queue_bio(ms, bio, rw); 1124 queue_bio(ms, bio, rw);
@@ -1221,7 +1224,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1221 1224
1222static struct target_type mirror_target = { 1225static struct target_type mirror_target = {
1223 .name = "mirror", 1226 .name = "mirror",
1224 .version = {1, 0, 1}, 1227 .version = {1, 0, 2},
1225 .module = THIS_MODULE, 1228 .module = THIS_MODULE,
1226 .ctr = mirror_ctr, 1229 .ctr = mirror_ctr,
1227 .dtr = mirror_dtr, 1230 .dtr = mirror_dtr,
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index d0024865a789..c5a16c550122 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -14,6 +14,8 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#define DM_MSG_PREFIX "multipath round-robin"
18
17/*----------------------------------------------------------------- 19/*-----------------------------------------------------------------
18 * Path-handling code, paths are held in lists 20 * Path-handling code, paths are held in lists
19 *---------------------------------------------------------------*/ 21 *---------------------------------------------------------------*/
@@ -191,9 +193,9 @@ static int __init dm_rr_init(void)
191 int r = dm_register_path_selector(&rr_ps); 193 int r = dm_register_path_selector(&rr_ps);
192 194
193 if (r < 0) 195 if (r < 0)
194 DMERR("round-robin: register failed %d", r); 196 DMERR("register failed %d", r);
195 197
196 DMINFO("dm-round-robin version 1.0.0 loaded"); 198 DMINFO("version 1.0.0 loaded");
197 199
198 return r; 200 return r;
199} 201}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 08312b46463a..1d0fafda0f76 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/blkdev.h> 9#include <linux/blkdev.h>
10#include <linux/config.h>
11#include <linux/ctype.h> 10#include <linux/ctype.h>
12#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
13#include <linux/fs.h> 12#include <linux/fs.h>
@@ -23,6 +22,8 @@
23#include "dm-bio-list.h" 22#include "dm-bio-list.h"
24#include "kcopyd.h" 23#include "kcopyd.h"
25 24
25#define DM_MSG_PREFIX "snapshots"
26
26/* 27/*
27 * The percentage increment we will wake up users at 28 * The percentage increment we will wake up users at
28 */ 29 */
@@ -117,7 +118,7 @@ static int init_origin_hash(void)
117 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), 118 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
118 GFP_KERNEL); 119 GFP_KERNEL);
119 if (!_origins) { 120 if (!_origins) {
120 DMERR("Device mapper: Snapshot: unable to allocate memory"); 121 DMERR("unable to allocate memory");
121 return -ENOMEM; 122 return -ENOMEM;
122 } 123 }
123 124
@@ -412,7 +413,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
412 int blocksize; 413 int blocksize;
413 414
414 if (argc < 4) { 415 if (argc < 4) {
415 ti->error = "dm-snapshot: requires exactly 4 arguments"; 416 ti->error = "requires exactly 4 arguments";
416 r = -EINVAL; 417 r = -EINVAL;
417 goto bad1; 418 goto bad1;
418 } 419 }
@@ -530,7 +531,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
530 } 531 }
531 532
532 ti->private = s; 533 ti->private = s;
533 ti->split_io = chunk_size; 534 ti->split_io = s->chunk_size;
534 535
535 return 0; 536 return 0;
536 537
@@ -1127,7 +1128,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1127 struct dm_dev *dev; 1128 struct dm_dev *dev;
1128 1129
1129 if (argc != 1) { 1130 if (argc != 1) {
1130 ti->error = "dm-origin: incorrect number of arguments"; 1131 ti->error = "origin: incorrect number of arguments";
1131 return -EINVAL; 1132 return -EINVAL;
1132 } 1133 }
1133 1134
@@ -1204,7 +1205,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1204 1205
1205static struct target_type origin_target = { 1206static struct target_type origin_target = {
1206 .name = "snapshot-origin", 1207 .name = "snapshot-origin",
1207 .version = {1, 1, 0}, 1208 .version = {1, 4, 0},
1208 .module = THIS_MODULE, 1209 .module = THIS_MODULE,
1209 .ctr = origin_ctr, 1210 .ctr = origin_ctr,
1210 .dtr = origin_dtr, 1211 .dtr = origin_dtr,
@@ -1215,7 +1216,7 @@ static struct target_type origin_target = {
1215 1216
1216static struct target_type snapshot_target = { 1217static struct target_type snapshot_target = {
1217 .name = "snapshot", 1218 .name = "snapshot",
1218 .version = {1, 1, 0}, 1219 .version = {1, 4, 0},
1219 .module = THIS_MODULE, 1220 .module = THIS_MODULE,
1220 .ctr = snapshot_ctr, 1221 .ctr = snapshot_ctr,
1221 .dtr = snapshot_dtr, 1222 .dtr = snapshot_dtr,
@@ -1236,7 +1237,7 @@ static int __init dm_snapshot_init(void)
1236 1237
1237 r = dm_register_target(&origin_target); 1238 r = dm_register_target(&origin_target);
1238 if (r < 0) { 1239 if (r < 0) {
1239 DMERR("Device mapper: Origin: register failed %d\n", r); 1240 DMERR("Origin target register failed %d", r);
1240 goto bad1; 1241 goto bad1;
1241 } 1242 }
1242 1243
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 08328a8f5a3c..6c29fcecd892 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "striped"
16
15struct stripe { 17struct stripe {
16 struct dm_dev *dev; 18 struct dm_dev *dev;
17 sector_t physical_start; 19 sector_t physical_start;
@@ -78,19 +80,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
78 unsigned int i; 80 unsigned int i;
79 81
80 if (argc < 2) { 82 if (argc < 2) {
81 ti->error = "dm-stripe: Not enough arguments"; 83 ti->error = "Not enough arguments";
82 return -EINVAL; 84 return -EINVAL;
83 } 85 }
84 86
85 stripes = simple_strtoul(argv[0], &end, 10); 87 stripes = simple_strtoul(argv[0], &end, 10);
86 if (*end) { 88 if (*end) {
87 ti->error = "dm-stripe: Invalid stripe count"; 89 ti->error = "Invalid stripe count";
88 return -EINVAL; 90 return -EINVAL;
89 } 91 }
90 92
91 chunk_size = simple_strtoul(argv[1], &end, 10); 93 chunk_size = simple_strtoul(argv[1], &end, 10);
92 if (*end) { 94 if (*end) {
93 ti->error = "dm-stripe: Invalid chunk_size"; 95 ti->error = "Invalid chunk_size";
94 return -EINVAL; 96 return -EINVAL;
95 } 97 }
96 98
@@ -99,19 +101,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
99 */ 101 */
100 if (!chunk_size || (chunk_size & (chunk_size - 1)) || 102 if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
101 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { 103 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
102 ti->error = "dm-stripe: Invalid chunk size"; 104 ti->error = "Invalid chunk size";
103 return -EINVAL; 105 return -EINVAL;
104 } 106 }
105 107
106 if (ti->len & (chunk_size - 1)) { 108 if (ti->len & (chunk_size - 1)) {
107 ti->error = "dm-stripe: Target length not divisible by " 109 ti->error = "Target length not divisible by "
108 "chunk size"; 110 "chunk size";
109 return -EINVAL; 111 return -EINVAL;
110 } 112 }
111 113
112 width = ti->len; 114 width = ti->len;
113 if (sector_div(width, stripes)) { 115 if (sector_div(width, stripes)) {
114 ti->error = "dm-stripe: Target length not divisible by " 116 ti->error = "Target length not divisible by "
115 "number of stripes"; 117 "number of stripes";
116 return -EINVAL; 118 return -EINVAL;
117 } 119 }
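
The chunk_size validation earlier in this hunk relies on the usual power-of-two test: for a non-zero n, n & (n - 1) clears the lowest set bit, so the result is zero only when exactly one bit is set. A small standalone check of the same idiom, assuming 512-byte sectors and a 4 KiB page (the real code takes these from SECTOR_SHIFT and PAGE_SIZE):

#include <stdio.h>

#define SECTOR_SHIFT	9		/* 512-byte sectors */
#define PAGE_SIZE	4096UL		/* assumed page size for this example */

static int valid_chunk_size(unsigned long chunk_size)
{
	if (!chunk_size)
		return 0;
	if (chunk_size & (chunk_size - 1))		/* not a power of two */
		return 0;
	if (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))	/* smaller than one page */
		return 0;
	return 1;
}

int main(void)
{
	unsigned long sizes[] = { 0, 6, 8, 64, 96, 128 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%lu sectors: %s\n", sizes[i],
		       valid_chunk_size(sizes[i]) ? "ok" : "rejected");
	return 0;
}
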
@@ -120,14 +122,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
120 * Do we have enough arguments for that many stripes ? 122 * Do we have enough arguments for that many stripes ?
121 */ 123 */
122 if (argc != (2 + 2 * stripes)) { 124 if (argc != (2 + 2 * stripes)) {
123 ti->error = "dm-stripe: Not enough destinations " 125 ti->error = "Not enough destinations "
124 "specified"; 126 "specified";
125 return -EINVAL; 127 return -EINVAL;
126 } 128 }
127 129
128 sc = alloc_context(stripes); 130 sc = alloc_context(stripes);
129 if (!sc) { 131 if (!sc) {
130 ti->error = "dm-stripe: Memory allocation for striped context " 132 ti->error = "Memory allocation for striped context "
131 "failed"; 133 "failed";
132 return -ENOMEM; 134 return -ENOMEM;
133 } 135 }
@@ -149,8 +151,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
149 151
150 r = get_stripe(ti, sc, i, argv); 152 r = get_stripe(ti, sc, i, argv);
151 if (r < 0) { 153 if (r < 0) {
152 ti->error = "dm-stripe: Couldn't parse stripe " 154 ti->error = "Couldn't parse stripe destination";
153 "destination";
154 while (i--) 155 while (i--)
155 dm_put_device(ti, sc->stripe[i].dev); 156 dm_put_device(ti, sc->stripe[i].dev);
156 kfree(sc); 157 kfree(sc);
@@ -227,7 +228,7 @@ int __init dm_stripe_init(void)
227 228
228 r = dm_register_target(&stripe_target); 229 r = dm_register_target(&stripe_target);
229 if (r < 0) 230 if (r < 0)
230 DMWARN("striped target registration failed"); 231 DMWARN("target registration failed");
231 232
232 return r; 233 return r;
233} 234}
@@ -235,7 +236,7 @@ int __init dm_stripe_init(void)
235void dm_stripe_exit(void) 236void dm_stripe_exit(void)
236{ 237{
237 if (dm_unregister_target(&stripe_target)) 238 if (dm_unregister_target(&stripe_target))
238 DMWARN("striped target unregistration failed"); 239 DMWARN("target unregistration failed");
239 240
240 return; 241 return;
241} 242}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f56a54cf0ce..75fe9493e6af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,6 +17,8 @@
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <asm/atomic.h> 18#include <asm/atomic.h>
19 19
20#define DM_MSG_PREFIX "table"
21
20#define MAX_DEPTH 16 22#define MAX_DEPTH 16
21#define NODE_SIZE L1_CACHE_BYTES 23#define NODE_SIZE L1_CACHE_BYTES
22#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
@@ -237,6 +239,44 @@ int dm_table_create(struct dm_table **result, int mode,
237 return 0; 239 return 0;
238} 240}
239 241
242int dm_create_error_table(struct dm_table **result, struct mapped_device *md)
243{
244 struct dm_table *t;
245 sector_t dev_size = 1;
246 int r;
247
248 /*
249 * Find current size of device.
250 * Default to 1 sector if inactive.
251 */
252 t = dm_get_table(md);
253 if (t) {
254 dev_size = dm_table_get_size(t);
255 dm_table_put(t);
256 }
257
258 r = dm_table_create(&t, FMODE_READ, 1, md);
259 if (r)
260 return r;
261
262 r = dm_table_add_target(t, "error", 0, dev_size, NULL);
263 if (r)
264 goto out;
265
266 r = dm_table_complete(t);
267 if (r)
268 goto out;
269
270 *result = t;
271
272out:
273 if (r)
274 dm_table_put(t);
275
276 return r;
277}
278EXPORT_SYMBOL_GPL(dm_create_error_table);
279
240static void free_devices(struct list_head *devices) 280static void free_devices(struct list_head *devices)
241{ 281{
242 struct list_head *tmp, *next; 282 struct list_head *tmp, *next;
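
dm_create_error_table() builds a one-target table whose single "error" target spans the current device size, falling back to 1 sector when no table is loaded yet. A userspace model of that sizing-and-fallback logic; the structs and helpers below are stand-ins, not the device-mapper API:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long sector_t;

struct table {
	sector_t size;		/* length covered by the single target */
	const char *target;	/* target type name */
};

/* Stand-in for "what size is the device right now?": the size of the
 * currently loaded table, or 0 when nothing is loaded. */
static sector_t current_size(const struct table *live)
{
	return live ? live->size : 0;
}

static struct table *create_error_table(const struct table *live)
{
	struct table *t = malloc(sizeof(*t));

	if (!t)
		return NULL;
	/* Default to 1 sector if the device is inactive, like the helper above. */
	t->size = current_size(live) ? current_size(live) : 1;
	t->target = "error";
	return t;
}

int main(void)
{
	struct table live = { .size = 2097152, .target = "linear" };
	struct table *a = create_error_table(&live);	/* active device */
	struct table *b = create_error_table(NULL);	/* inactive device */

	if (!a || !b)
		return 1;
	printf("active:   %llu sectors of \"%s\"\n", a->size, a->target);
	printf("inactive: %llu sectors of \"%s\"\n", b->size, b->target);
	free(a);
	free(b);
	return 0;
}
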
@@ -590,6 +630,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
590 unsigned array_size = 0; 630 unsigned array_size = 0;
591 631
592 *argc = 0; 632 *argc = 0;
633
634 if (!input) {
635 *argvp = NULL;
636 return 0;
637 }
638
593 argv = realloc_argv(&array_size, argv); 639 argv = realloc_argv(&array_size, argv);
594 if (!argv) 640 if (!argv)
595 return -ENOMEM; 641 return -ENOMEM;
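
The new guard lets callers hand dm_split_args() a NULL parameter string and get back argc == 0 instead of a crash, which is exactly what dm_create_error_table() does when it adds the "error" target with NULL params. A simplified splitter with the same contract (plain whitespace splitting only; the real dm_split_args also handles quoting and backslash escapes):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int split_args(int *argc, char ***argvp, char *input)
{
	char **argv = NULL;
	size_t cap = 0;
	char *tok;

	*argc = 0;

	/* NULL input means "no optional arguments": report zero args. */
	if (!input) {
		*argvp = NULL;
		return 0;
	}

	for (tok = strtok(input, " \t"); tok; tok = strtok(NULL, " \t")) {
		if ((size_t)*argc == cap) {
			char **bigger;

			cap = cap ? cap * 2 : 4;
			bigger = realloc(argv, cap * sizeof(*argv));
			if (!bigger) {
				free(argv);
				return -1;
			}
			argv = bigger;
		}
		argv[(*argc)++] = tok;
	}
	*argvp = argv;
	return 0;
}

int main(void)
{
	char line[] = "core 1 1024";
	char **argv;
	int argc;

	split_args(&argc, &argv, NULL);	/* no params: argc stays 0 */
	printf("NULL input -> argc=%d\n", argc);

	split_args(&argc, &argv, line);
	printf("\"core 1 1024\" -> argc=%d, argv[0]=%s\n", argc, argv[0]);
	free(argv);
	return 0;
}
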
@@ -671,15 +717,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
671 memset(tgt, 0, sizeof(*tgt)); 717 memset(tgt, 0, sizeof(*tgt));
672 718
673 if (!len) { 719 if (!len) {
674 tgt->error = "zero-length target"; 720 DMERR("%s: zero-length target", dm_device_name(t->md));
675 DMERR("%s", tgt->error);
676 return -EINVAL; 721 return -EINVAL;
677 } 722 }
678 723
679 tgt->type = dm_get_target_type(type); 724 tgt->type = dm_get_target_type(type);
680 if (!tgt->type) { 725 if (!tgt->type) {
681 tgt->error = "unknown target type"; 726 DMERR("%s: %s: unknown target type", dm_device_name(t->md),
682 DMERR("%s", tgt->error); 727 type);
683 return -EINVAL; 728 return -EINVAL;
684 } 729 }
685 730
@@ -716,7 +761,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
716 return 0; 761 return 0;
717 762
718 bad: 763 bad:
719 DMERR("%s", tgt->error); 764 DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
720 dm_put_target_type(tgt->type); 765 dm_put_target_type(tgt->type);
721 return r; 766 return r;
722} 767}
@@ -802,7 +847,7 @@ sector_t dm_table_get_size(struct dm_table *t)
802 847
803struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 848struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
804{ 849{
805 if (index > t->num_targets) 850 if (index >= t->num_targets)
806 return NULL; 851 return NULL;
807 852
808 return t->targets + index; 853 return t->targets + index;
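
The bound check above changes > to >=: with num_targets entries, valid indices run from 0 to num_targets - 1, so index == num_targets must also return NULL rather than a pointer one element past the array. In sketch form:

#include <stdio.h>

struct target { const char *name; };

static struct target *get_target(struct target *targets,
				 unsigned int num_targets, unsigned int index)
{
	if (index >= num_targets)	/* ">" would let index == num_targets through */
		return NULL;
	return targets + index;
}

int main(void)
{
	struct target t[2] = { { "linear" }, { "error" } };

	printf("%s\n", get_target(t, 2, 1)->name);	/* last valid entry */
	printf("%p\n", (void *)get_target(t, 2, 2));	/* out of range -> NULL */
	return 0;
}
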
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 64fd8e79ea4c..477a041a41cf 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -12,6 +12,8 @@
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define DM_MSG_PREFIX "target"
16
15struct tt_internal { 17struct tt_internal {
16 struct target_type tt; 18 struct target_type tt;
17 19
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 51c0639b2487..ea569f7348d2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -10,13 +10,15 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/bio.h> 11#include <linux/bio.h>
12 12
13#define DM_MSG_PREFIX "zero"
14
13/* 15/*
14 * Construct a dummy mapping that only returns zeros 16 * Construct a dummy mapping that only returns zeros
15 */ 17 */
16static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) 18static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
17{ 19{
18 if (argc != 0) { 20 if (argc != 0) {
19 ti->error = "dm-zero: No arguments required"; 21 ti->error = "No arguments required";
20 return -EINVAL; 22 return -EINVAL;
21 } 23 }
22 24
@@ -60,7 +62,7 @@ static int __init dm_zero_init(void)
60 int r = dm_register_target(&zero_target); 62 int r = dm_register_target(&zero_target);
61 63
62 if (r < 0) 64 if (r < 0)
63 DMERR("zero: register failed %d", r); 65 DMERR("register failed %d", r);
64 66
65 return r; 67 return r;
66} 68}
@@ -70,7 +72,7 @@ static void __exit dm_zero_exit(void)
70 int r = dm_unregister_target(&zero_target); 72 int r = dm_unregister_target(&zero_target);
71 73
72 if (r < 0) 74 if (r < 0)
73 DMERR("zero: unregister failed %d", r); 75 DMERR("unregister failed %d", r);
74} 76}
75 77
76module_init(dm_zero_init) 78module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d710b7a133b..c99bf9f01759 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -21,11 +21,14 @@
21#include <linux/hdreg.h> 21#include <linux/hdreg.h>
22#include <linux/blktrace_api.h> 22#include <linux/blktrace_api.h>
23 23
24#define DM_MSG_PREFIX "core"
25
24static const char *_name = DM_NAME; 26static const char *_name = DM_NAME;
25 27
26static unsigned int major = 0; 28static unsigned int major = 0;
27static unsigned int _major = 0; 29static unsigned int _major = 0;
28 30
31static DEFINE_SPINLOCK(_minor_lock);
29/* 32/*
30 * One of these is allocated per bio. 33 * One of these is allocated per bio.
31 */ 34 */
@@ -49,23 +52,28 @@ struct target_io {
49 52
50union map_info *dm_get_mapinfo(struct bio *bio) 53union map_info *dm_get_mapinfo(struct bio *bio)
51{ 54{
52 if (bio && bio->bi_private) 55 if (bio && bio->bi_private)
53 return &((struct target_io *)bio->bi_private)->info; 56 return &((struct target_io *)bio->bi_private)->info;
54 return NULL; 57 return NULL;
55} 58}
56 59
60#define MINOR_ALLOCED ((void *)-1)
61
57/* 62/*
58 * Bits for the md->flags field. 63 * Bits for the md->flags field.
59 */ 64 */
60#define DMF_BLOCK_IO 0 65#define DMF_BLOCK_IO 0
61#define DMF_SUSPENDED 1 66#define DMF_SUSPENDED 1
62#define DMF_FROZEN 2 67#define DMF_FROZEN 2
68#define DMF_FREEING 3
69#define DMF_DELETING 4
63 70
64struct mapped_device { 71struct mapped_device {
65 struct rw_semaphore io_lock; 72 struct rw_semaphore io_lock;
66 struct semaphore suspend_lock; 73 struct semaphore suspend_lock;
67 rwlock_t map_lock; 74 rwlock_t map_lock;
68 atomic_t holders; 75 atomic_t holders;
76 atomic_t open_count;
69 77
70 unsigned long flags; 78 unsigned long flags;
71 79
@@ -159,7 +167,7 @@ static void local_exit(void)
159 bioset_free(dm_set); 167 bioset_free(dm_set);
160 168
161 if (unregister_blkdev(_major, _name) < 0) 169 if (unregister_blkdev(_major, _name) < 0)
162 DMERR("devfs_unregister_blkdev failed"); 170 DMERR("unregister_blkdev failed");
163 171
164 _major = 0; 172 _major = 0;
165 173
@@ -218,9 +226,25 @@ static int dm_blk_open(struct inode *inode, struct file *file)
218{ 226{
219 struct mapped_device *md; 227 struct mapped_device *md;
220 228
229 spin_lock(&_minor_lock);
230
221 md = inode->i_bdev->bd_disk->private_data; 231 md = inode->i_bdev->bd_disk->private_data;
232 if (!md)
233 goto out;
234
235 if (test_bit(DMF_FREEING, &md->flags) ||
236 test_bit(DMF_DELETING, &md->flags)) {
237 md = NULL;
238 goto out;
239 }
240
222 dm_get(md); 241 dm_get(md);
223 return 0; 242 atomic_inc(&md->open_count);
243
244out:
245 spin_unlock(&_minor_lock);
246
247 return md ? 0 : -ENXIO;
224} 248}
225 249
226static int dm_blk_close(struct inode *inode, struct file *file) 250static int dm_blk_close(struct inode *inode, struct file *file)
@@ -228,10 +252,35 @@ static int dm_blk_close(struct inode *inode, struct file *file)
228 struct mapped_device *md; 252 struct mapped_device *md;
229 253
230 md = inode->i_bdev->bd_disk->private_data; 254 md = inode->i_bdev->bd_disk->private_data;
255 atomic_dec(&md->open_count);
231 dm_put(md); 256 dm_put(md);
232 return 0; 257 return 0;
233} 258}
234 259
260int dm_open_count(struct mapped_device *md)
261{
262 return atomic_read(&md->open_count);
263}
264
265/*
266 * Guarantees nothing is using the device before it's deleted.
267 */
268int dm_lock_for_deletion(struct mapped_device *md)
269{
270 int r = 0;
271
272 spin_lock(&_minor_lock);
273
274 if (dm_open_count(md))
275 r = -EBUSY;
276 else
277 set_bit(DMF_DELETING, &md->flags);
278
279 spin_unlock(&_minor_lock);
280
281 return r;
282}
283
235static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 284static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
236{ 285{
237 struct mapped_device *md = bdev->bd_disk->private_data; 286 struct mapped_device *md = bdev->bd_disk->private_data;
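
dm_blk_open(), dm_blk_close() and dm_lock_for_deletion() above form a small interlock: the open count is maintained under _minor_lock, deletion only proceeds when that count is zero, and once DMF_DELETING (or DMF_FREEING) is set further opens fail. A userspace model of the same protocol, with a pthread mutex standing in for the spinlock and stand-in names throughout:

#include <pthread.h>
#include <stdio.h>

#define FLAG_DELETING	0x1

struct device {
	pthread_mutex_t lock;
	int open_count;
	unsigned flags;
};

static int dev_open(struct device *d)
{
	int r = 0;

	pthread_mutex_lock(&d->lock);
	if (d->flags & FLAG_DELETING)
		r = -1;			/* device is going away: refuse */
	else
		d->open_count++;
	pthread_mutex_unlock(&d->lock);
	return r;
}

static void dev_close(struct device *d)
{
	pthread_mutex_lock(&d->lock);
	d->open_count--;
	pthread_mutex_unlock(&d->lock);
}

/* Succeeds only when nobody has the device open; from then on opens fail. */
static int lock_for_deletion(struct device *d)
{
	int r = 0;

	pthread_mutex_lock(&d->lock);
	if (d->open_count)
		r = -1;			/* -EBUSY in the kernel version */
	else
		d->flags |= FLAG_DELETING;
	pthread_mutex_unlock(&d->lock);
	return r;
}

int main(void)
{
	struct device d = { .lock = PTHREAD_MUTEX_INITIALIZER };

	dev_open(&d);
	printf("delete while open: %d\n", lock_for_deletion(&d));	/* busy */
	dev_close(&d);
	printf("delete when idle:  %d\n", lock_for_deletion(&d));	/* ok */
	printf("open after delete: %d\n", dev_open(&d));		/* refused */
	return 0;
}
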
@@ -456,8 +505,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
456 if (r > 0) { 505 if (r > 0) {
457 /* the bio has been remapped so dispatch it */ 506 /* the bio has been remapped so dispatch it */
458 507
459 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 508 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
460 tio->io->bio->bi_bdev->bd_dev, sector, 509 tio->io->bio->bi_bdev->bd_dev, sector,
461 clone->bi_sector); 510 clone->bi_sector);
462 511
463 generic_make_request(clone); 512 generic_make_request(clone);
@@ -744,43 +793,39 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
744/*----------------------------------------------------------------- 793/*-----------------------------------------------------------------
745 * An IDR is used to keep track of allocated minor numbers. 794 * An IDR is used to keep track of allocated minor numbers.
746 *---------------------------------------------------------------*/ 795 *---------------------------------------------------------------*/
747static DEFINE_MUTEX(_minor_lock);
748static DEFINE_IDR(_minor_idr); 796static DEFINE_IDR(_minor_idr);
749 797
750static void free_minor(unsigned int minor) 798static void free_minor(int minor)
751{ 799{
752 mutex_lock(&_minor_lock); 800 spin_lock(&_minor_lock);
753 idr_remove(&_minor_idr, minor); 801 idr_remove(&_minor_idr, minor);
754 mutex_unlock(&_minor_lock); 802 spin_unlock(&_minor_lock);
755} 803}
756 804
757/* 805/*
758 * See if the device with a specific minor # is free. 806 * See if the device with a specific minor # is free.
759 */ 807 */
760static int specific_minor(struct mapped_device *md, unsigned int minor) 808static int specific_minor(struct mapped_device *md, int minor)
761{ 809{
762 int r, m; 810 int r, m;
763 811
764 if (minor >= (1 << MINORBITS)) 812 if (minor >= (1 << MINORBITS))
765 return -EINVAL; 813 return -EINVAL;
766 814
767 mutex_lock(&_minor_lock); 815 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
816 if (!r)
817 return -ENOMEM;
818
819 spin_lock(&_minor_lock);
768 820
769 if (idr_find(&_minor_idr, minor)) { 821 if (idr_find(&_minor_idr, minor)) {
770 r = -EBUSY; 822 r = -EBUSY;
771 goto out; 823 goto out;
772 } 824 }
773 825
774 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 826 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
775 if (!r) { 827 if (r)
776 r = -ENOMEM;
777 goto out;
778 }
779
780 r = idr_get_new_above(&_minor_idr, md, minor, &m);
781 if (r) {
782 goto out; 828 goto out;
783 }
784 829
785 if (m != minor) { 830 if (m != minor) {
786 idr_remove(&_minor_idr, m); 831 idr_remove(&_minor_idr, m);
@@ -789,24 +834,21 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
789 } 834 }
790 835
791out: 836out:
792 mutex_unlock(&_minor_lock); 837 spin_unlock(&_minor_lock);
793 return r; 838 return r;
794} 839}
795 840
796static int next_free_minor(struct mapped_device *md, unsigned int *minor) 841static int next_free_minor(struct mapped_device *md, int *minor)
797{ 842{
798 int r; 843 int r, m;
799 unsigned int m;
800
801 mutex_lock(&_minor_lock);
802 844
803 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 845 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
804 if (!r) { 846 if (!r)
805 r = -ENOMEM; 847 return -ENOMEM;
806 goto out; 848
807 } 849 spin_lock(&_minor_lock);
808 850
809 r = idr_get_new(&_minor_idr, md, &m); 851 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
810 if (r) { 852 if (r) {
811 goto out; 853 goto out;
812 } 854 }
@@ -820,7 +862,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
820 *minor = m; 862 *minor = m;
821 863
822out: 864out:
823 mutex_unlock(&_minor_lock); 865 spin_unlock(&_minor_lock);
824 return r; 866 return r;
825} 867}
826 868
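
With _minor_lock now a spinlock rather than a mutex, the GFP_KERNEL preallocation (idr_pre_get(), which may sleep) has to run before the lock is taken; only the non-sleeping insertion happens under it. A userspace sketch of that ordering, with malloc() standing in for the sleeping allocation and a pthread spinlock for _minor_lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t minor_lock;
static void *slots[16];			/* toy stand-in for the minor IDR */

static int assign_minor(int wanted)
{
	void *node = malloc(64);	/* "may sleep": done before locking */
	int r = 0;

	if (!node)
		return -1;

	pthread_spin_lock(&minor_lock);	/* no sleeping allowed from here on */
	if (slots[wanted])
		r = -1;			/* minor already in use (-EBUSY) */
	else
		slots[wanted] = node;
	pthread_spin_unlock(&minor_lock);

	if (r)
		free(node);		/* preallocation turned out unneeded */
	return r;
}

int main(void)
{
	pthread_spin_init(&minor_lock, PTHREAD_PROCESS_PRIVATE);
	printf("first claim of minor 3:  %d\n", assign_minor(3));
	printf("second claim of minor 3: %d\n", assign_minor(3));
	pthread_spin_destroy(&minor_lock);
	return 0;
}
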
@@ -829,18 +871,25 @@ static struct block_device_operations dm_blk_dops;
829/* 871/*
830 * Allocate and initialise a blank device with a given minor. 872 * Allocate and initialise a blank device with a given minor.
831 */ 873 */
832static struct mapped_device *alloc_dev(unsigned int minor, int persistent) 874static struct mapped_device *alloc_dev(int minor)
833{ 875{
834 int r; 876 int r;
835 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 877 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
878 void *old_md;
836 879
837 if (!md) { 880 if (!md) {
838 DMWARN("unable to allocate device, out of memory."); 881 DMWARN("unable to allocate device, out of memory.");
839 return NULL; 882 return NULL;
840 } 883 }
841 884
885 if (!try_module_get(THIS_MODULE))
886 goto bad0;
887
842 /* get a minor number for the dev */ 888 /* get a minor number for the dev */
843 r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); 889 if (minor == DM_ANY_MINOR)
890 r = next_free_minor(md, &minor);
891 else
892 r = specific_minor(md, minor);
844 if (r < 0) 893 if (r < 0)
845 goto bad1; 894 goto bad1;
846 895
@@ -849,6 +898,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
849 init_MUTEX(&md->suspend_lock); 898 init_MUTEX(&md->suspend_lock);
850 rwlock_init(&md->map_lock); 899 rwlock_init(&md->map_lock);
851 atomic_set(&md->holders, 1); 900 atomic_set(&md->holders, 1);
901 atomic_set(&md->open_count, 0);
852 atomic_set(&md->event_nr, 0); 902 atomic_set(&md->event_nr, 0);
853 903
854 md->queue = blk_alloc_queue(GFP_KERNEL); 904 md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -875,6 +925,10 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
875 if (!md->disk) 925 if (!md->disk)
876 goto bad4; 926 goto bad4;
877 927
928 atomic_set(&md->pending, 0);
929 init_waitqueue_head(&md->wait);
930 init_waitqueue_head(&md->eventq);
931
878 md->disk->major = _major; 932 md->disk->major = _major;
879 md->disk->first_minor = minor; 933 md->disk->first_minor = minor;
880 md->disk->fops = &dm_blk_dops; 934 md->disk->fops = &dm_blk_dops;
@@ -884,9 +938,12 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
884 add_disk(md->disk); 938 add_disk(md->disk);
885 format_dev_t(md->name, MKDEV(_major, minor)); 939 format_dev_t(md->name, MKDEV(_major, minor));
886 940
887 atomic_set(&md->pending, 0); 941 /* Populate the mapping, nobody knows we exist yet */
888 init_waitqueue_head(&md->wait); 942 spin_lock(&_minor_lock);
889 init_waitqueue_head(&md->eventq); 943 old_md = idr_replace(&_minor_idr, md, minor);
944 spin_unlock(&_minor_lock);
945
946 BUG_ON(old_md != MINOR_ALLOCED);
890 947
891 return md; 948 return md;
892 949
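
alloc_dev() now reserves the minor with the MINOR_ALLOCED sentinel, finishes building the device, and only then publishes the real pointer with idr_replace(); dm_find_md() treats the sentinel (and DMF_FREEING devices) as "not there". A userspace model of reserve-then-publish, with a plain array standing in for the IDR:

#include <stdio.h>
#include <stdlib.h>

#define MINOR_ALLOCED	((void *)-1)	/* slot reserved, device not ready yet */

static void *minors[8];			/* toy minor table */

struct mapped_device { char name[16]; };

static struct mapped_device *find_md(int minor)
{
	void *p = minors[minor];

	/* A reserved-but-unpublished slot must not be returned to lookups. */
	if (!p || p == MINOR_ALLOCED)
		return NULL;
	return p;
}

static struct mapped_device *alloc_dev(int minor)
{
	struct mapped_device *md;

	minors[minor] = MINOR_ALLOCED;		/* reserve the minor first */
	printf("during init, lookup sees: %p\n", (void *)find_md(minor));

	md = malloc(sizeof(*md));		/* ... long initialisation ... */
	if (!md) {
		minors[minor] = NULL;
		return NULL;
	}
	snprintf(md->name, sizeof(md->name), "dm-%d", minor);

	minors[minor] = md;			/* publish only once complete */
	return md;
}

int main(void)
{
	if (!alloc_dev(5))
		return 1;
	printf("after publish, lookup sees: %s\n", find_md(5)->name);
	return 0;
}

The teardown side in dm_put() below mirrors this: the final reference is dropped under _minor_lock, the slot is pointed back at the sentinel via idr_replace(), and DMF_FREEING stops concurrent lookups and opens from resurrecting the device.
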
@@ -898,13 +955,15 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
898 blk_cleanup_queue(md->queue); 955 blk_cleanup_queue(md->queue);
899 free_minor(minor); 956 free_minor(minor);
900 bad1: 957 bad1:
958 module_put(THIS_MODULE);
959 bad0:
901 kfree(md); 960 kfree(md);
902 return NULL; 961 return NULL;
903} 962}
904 963
905static void free_dev(struct mapped_device *md) 964static void free_dev(struct mapped_device *md)
906{ 965{
907 unsigned int minor = md->disk->first_minor; 966 int minor = md->disk->first_minor;
908 967
909 if (md->suspended_bdev) { 968 if (md->suspended_bdev) {
910 thaw_bdev(md->suspended_bdev, NULL); 969 thaw_bdev(md->suspended_bdev, NULL);
@@ -914,8 +973,14 @@ static void free_dev(struct mapped_device *md)
914 mempool_destroy(md->io_pool); 973 mempool_destroy(md->io_pool);
915 del_gendisk(md->disk); 974 del_gendisk(md->disk);
916 free_minor(minor); 975 free_minor(minor);
976
977 spin_lock(&_minor_lock);
978 md->disk->private_data = NULL;
979 spin_unlock(&_minor_lock);
980
917 put_disk(md->disk); 981 put_disk(md->disk);
918 blk_cleanup_queue(md->queue); 982 blk_cleanup_queue(md->queue);
983 module_put(THIS_MODULE);
919 kfree(md); 984 kfree(md);
920} 985}
921 986
@@ -984,12 +1049,11 @@ static void __unbind(struct mapped_device *md)
984/* 1049/*
985 * Constructor for a new device. 1050 * Constructor for a new device.
986 */ 1051 */
987static int create_aux(unsigned int minor, int persistent, 1052int dm_create(int minor, struct mapped_device **result)
988 struct mapped_device **result)
989{ 1053{
990 struct mapped_device *md; 1054 struct mapped_device *md;
991 1055
992 md = alloc_dev(minor, persistent); 1056 md = alloc_dev(minor);
993 if (!md) 1057 if (!md)
994 return -ENXIO; 1058 return -ENXIO;
995 1059
@@ -997,16 +1061,6 @@ static int create_aux(unsigned int minor, int persistent,
997 return 0; 1061 return 0;
998} 1062}
999 1063
1000int dm_create(struct mapped_device **result)
1001{
1002 return create_aux(0, 0, result);
1003}
1004
1005int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
1006{
1007 return create_aux(minor, 1, result);
1008}
1009
1010static struct mapped_device *dm_find_md(dev_t dev) 1064static struct mapped_device *dm_find_md(dev_t dev)
1011{ 1065{
1012 struct mapped_device *md; 1066 struct mapped_device *md;
@@ -1015,13 +1069,18 @@ static struct mapped_device *dm_find_md(dev_t dev)
1015 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1069 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1016 return NULL; 1070 return NULL;
1017 1071
1018 mutex_lock(&_minor_lock); 1072 spin_lock(&_minor_lock);
1019 1073
1020 md = idr_find(&_minor_idr, minor); 1074 md = idr_find(&_minor_idr, minor);
1021 if (!md || (dm_disk(md)->first_minor != minor)) 1075 if (md && (md == MINOR_ALLOCED ||
1076 (dm_disk(md)->first_minor != minor) ||
1077 test_bit(DMF_FREEING, &md->flags))) {
1022 md = NULL; 1078 md = NULL;
1079 goto out;
1080 }
1023 1081
1024 mutex_unlock(&_minor_lock); 1082out:
1083 spin_unlock(&_minor_lock);
1025 1084
1026 return md; 1085 return md;
1027} 1086}
@@ -1051,12 +1110,23 @@ void dm_get(struct mapped_device *md)
1051 atomic_inc(&md->holders); 1110 atomic_inc(&md->holders);
1052} 1111}
1053 1112
1113const char *dm_device_name(struct mapped_device *md)
1114{
1115 return md->name;
1116}
1117EXPORT_SYMBOL_GPL(dm_device_name);
1118
1054void dm_put(struct mapped_device *md) 1119void dm_put(struct mapped_device *md)
1055{ 1120{
1056 struct dm_table *map; 1121 struct dm_table *map;
1057 1122
1058 if (atomic_dec_and_test(&md->holders)) { 1123 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1124
1125 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1059 map = dm_get_table(md); 1126 map = dm_get_table(md);
1127 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1128 set_bit(DMF_FREEING, &md->flags);
1129 spin_unlock(&_minor_lock);
1060 if (!dm_suspended(md)) { 1130 if (!dm_suspended(md)) {
1061 dm_table_presuspend_targets(map); 1131 dm_table_presuspend_targets(map);
1062 dm_table_postsuspend_targets(map); 1132 dm_table_postsuspend_targets(map);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index fd90bc8f9e45..3c03c0ecab7e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -2,7 +2,7 @@
2 * Internal header file for device mapper 2 * Internal header file for device mapper
3 * 3 *
4 * Copyright (C) 2001, 2002 Sistina Software 4 * Copyright (C) 2001, 2002 Sistina Software
5 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 5 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
6 * 6 *
7 * This file is released under the LGPL. 7 * This file is released under the LGPL.
8 */ 8 */
@@ -17,9 +17,10 @@
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18 18
19#define DM_NAME "device-mapper" 19#define DM_NAME "device-mapper"
20#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) 20
21#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) 21#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
22#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) 22#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23 24
24#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 25#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
25 0 : scnprintf(result + sz, maxlen - sz, x)) 26 0 : scnprintf(result + sz, maxlen - sz, x))
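
The reworked DMERR/DMWARN/DMINFO macros splice a per-file DM_MSG_PREFIX between the driver name and the message, which is why the individual targets in this patch drop their hand-written "dm-mirror:", "zero:" and similar prefixes. A userspace approximation showing the expansion, with printf standing in for printk and the log levels:

#include <stdio.h>

#define DM_NAME		"device-mapper"
#define DM_MSG_PREFIX	"zero"		/* each source file picks its own */

/* Same shape as the kernel macros, with printf in place of printk. */
#define DMERR(f, arg...)  printf("error: "   DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) printf("warning: " DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)

int main(void)
{
	DMERR("register failed %d", -22);
	DMWARN("target registration failed");
	return 0;
}
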
@@ -39,83 +40,16 @@ struct dm_dev {
39}; 40};
40 41
41struct dm_table; 42struct dm_table;
42struct mapped_device;
43
44/*-----------------------------------------------------------------
45 * Functions for manipulating a struct mapped_device.
46 * Drop the reference with dm_put when you finish with the object.
47 *---------------------------------------------------------------*/
48int dm_create(struct mapped_device **md);
49int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
50void dm_set_mdptr(struct mapped_device *md, void *ptr);
51void *dm_get_mdptr(struct mapped_device *md);
52struct mapped_device *dm_get_md(dev_t dev);
53
54/*
55 * Reference counting for md.
56 */
57void dm_get(struct mapped_device *md);
58void dm_put(struct mapped_device *md);
59
60/*
61 * A device can still be used while suspended, but I/O is deferred.
62 */
63int dm_suspend(struct mapped_device *md, int with_lockfs);
64int dm_resume(struct mapped_device *md);
65
66/*
67 * The device must be suspended before calling this method.
68 */
69int dm_swap_table(struct mapped_device *md, struct dm_table *t);
70
71/*
72 * Drop a reference on the table when you've finished with the
73 * result.
74 */
75struct dm_table *dm_get_table(struct mapped_device *md);
76
77/*
78 * Event functions.
79 */
80uint32_t dm_get_event_nr(struct mapped_device *md);
81int dm_wait_event(struct mapped_device *md, int event_nr);
82
83/*
84 * Info functions.
85 */
86struct gendisk *dm_disk(struct mapped_device *md);
87int dm_suspended(struct mapped_device *md);
88
89/*
90 * Geometry functions.
91 */
92int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
93int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
94 43
95/*----------------------------------------------------------------- 44/*-----------------------------------------------------------------
96 * Functions for manipulating a table. Tables are also reference 45 * Internal table functions.
97 * counted.
98 *---------------------------------------------------------------*/ 46 *---------------------------------------------------------------*/
99int dm_table_create(struct dm_table **result, int mode,
100 unsigned num_targets, struct mapped_device *md);
101
102void dm_table_get(struct dm_table *t);
103void dm_table_put(struct dm_table *t);
104
105int dm_table_add_target(struct dm_table *t, const char *type,
106 sector_t start, sector_t len, char *params);
107int dm_table_complete(struct dm_table *t);
108void dm_table_event_callback(struct dm_table *t, 47void dm_table_event_callback(struct dm_table *t,
109 void (*fn)(void *), void *context); 48 void (*fn)(void *), void *context);
110void dm_table_event(struct dm_table *t);
111sector_t dm_table_get_size(struct dm_table *t);
112struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 49struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
113struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 50struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
114void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); 51void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
115unsigned int dm_table_get_num_targets(struct dm_table *t);
116struct list_head *dm_table_get_devices(struct dm_table *t); 52struct list_head *dm_table_get_devices(struct dm_table *t);
117int dm_table_get_mode(struct dm_table *t);
118struct mapped_device *dm_table_get_md(struct dm_table *t);
119void dm_table_presuspend_targets(struct dm_table *t); 53void dm_table_presuspend_targets(struct dm_table *t);
120void dm_table_postsuspend_targets(struct dm_table *t); 54void dm_table_postsuspend_targets(struct dm_table *t);
121void dm_table_resume_targets(struct dm_table *t); 55void dm_table_resume_targets(struct dm_table *t);
@@ -133,7 +67,6 @@ void dm_put_target_type(struct target_type *t);
133int dm_target_iterate(void (*iter_func)(struct target_type *tt, 67int dm_target_iterate(void (*iter_func)(struct target_type *tt,
134 void *param), void *param); 68 void *param), void *param);
135 69
136
137/*----------------------------------------------------------------- 70/*-----------------------------------------------------------------
138 * Useful inlines. 71 * Useful inlines.
139 *---------------------------------------------------------------*/ 72 *---------------------------------------------------------------*/
@@ -191,5 +124,7 @@ void dm_stripe_exit(void);
191 124
192void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); 125void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
193union map_info *dm_get_mapinfo(struct bio *bio); 126union map_info *dm_get_mapinfo(struct bio *bio);
127int dm_open_count(struct mapped_device *md);
128int dm_lock_for_deletion(struct mapped_device *md);
194 129
195#endif 130#endif
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 72480a48d88b..f1db6eff4857 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -12,7 +12,6 @@
12#include <asm/atomic.h> 12#include <asm/atomic.h>
13 13
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/config.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/list.h> 17#include <linux/list.h>
@@ -314,7 +313,7 @@ static void complete_io(unsigned long error, void *context)
314 313
315 if (error) { 314 if (error) {
316 if (job->rw == WRITE) 315 if (job->rw == WRITE)
317 job->write_err &= error; 316 job->write_err |= error;
318 else 317 else
319 job->read_err = 1; 318 job->read_err = 1;
320 319
@@ -460,7 +459,7 @@ static void segment_complete(int read_err,
460 job->read_err = 1; 459 job->read_err = 1;
461 460
462 if (write_err) 461 if (write_err)
463 job->write_err &= write_err; 462 job->write_err |= write_err;
464 463
465 /* 464 /*
466 * Only dispatch more work if there hasn't been an error. 465 * Only dispatch more work if there hasn't been an error.
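
The two kcopyd hunks change write_err accumulation from &= to |=. write_err collects one error bit per destination; since it starts at zero, AND-ing each new status into it keeps it zero forever and every failure is lost, while OR-ing retains each error bit as it arrives. A short demonstration:

#include <stdio.h>

int main(void)
{
	unsigned long and_err = 0, or_err = 0;	/* job->write_err starts at 0 */
	unsigned long errors[] = { 0x2, 0x4 };	/* failures on destinations 1 and 2 */

	for (int i = 0; i < 2; i++) {
		and_err &= errors[i];	/* old code: 0 & anything == 0, errors lost */
		or_err  |= errors[i];	/* new code: accumulate every error bit */
	}
	printf("with &=: %#lx   with |=: %#lx\n", and_err, or_err);
	return 0;
}
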
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 777585458c85..b99c19c7eb22 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,7 +111,7 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
111 return ret; 111 return ret;
112} 112}
113 113
114static int linear_run (mddev_t *mddev) 114static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115{ 115{
116 linear_conf_t *conf; 116 linear_conf_t *conf;
117 dev_info_t **table; 117 dev_info_t **table;
@@ -121,20 +121,21 @@ static int linear_run (mddev_t *mddev)
121 sector_t curr_offset; 121 sector_t curr_offset;
122 struct list_head *tmp; 122 struct list_head *tmp;
123 123
124 conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), 124 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
125 GFP_KERNEL); 125 GFP_KERNEL);
126 if (!conf) 126 if (!conf)
127 goto out; 127 return NULL;
128
128 mddev->private = conf; 129 mddev->private = conf;
129 130
130 cnt = 0; 131 cnt = 0;
131 mddev->array_size = 0; 132 conf->array_size = 0;
132 133
133 ITERATE_RDEV(mddev,rdev,tmp) { 134 ITERATE_RDEV(mddev,rdev,tmp) {
134 int j = rdev->raid_disk; 135 int j = rdev->raid_disk;
135 dev_info_t *disk = conf->disks + j; 136 dev_info_t *disk = conf->disks + j;
136 137
137 if (j < 0 || j > mddev->raid_disks || disk->rdev) { 138 if (j < 0 || j > raid_disks || disk->rdev) {
138 printk("linear: disk numbering problem. Aborting!\n"); 139 printk("linear: disk numbering problem. Aborting!\n");
139 goto out; 140 goto out;
140 } 141 }
@@ -152,16 +153,16 @@ static int linear_run (mddev_t *mddev)
152 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 153 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
153 154
154 disk->size = rdev->size; 155 disk->size = rdev->size;
155 mddev->array_size += rdev->size; 156 conf->array_size += rdev->size;
156 157
157 cnt++; 158 cnt++;
158 } 159 }
159 if (cnt != mddev->raid_disks) { 160 if (cnt != raid_disks) {
160 printk("linear: not enough drives present. Aborting!\n"); 161 printk("linear: not enough drives present. Aborting!\n");
161 goto out; 162 goto out;
162 } 163 }
163 164
164 min_spacing = mddev->array_size; 165 min_spacing = conf->array_size;
165 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); 166 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
166 167
167 /* min_spacing is the minimum spacing that will fit the hash 168 /* min_spacing is the minimum spacing that will fit the hash
@@ -170,7 +171,7 @@ static int linear_run (mddev_t *mddev)
170 * that is larger than min_spacing as use the size of that as 171 * that is larger than min_spacing as use the size of that as
171 * the actual spacing 172 * the actual spacing
172 */ 173 */
173 conf->hash_spacing = mddev->array_size; 174 conf->hash_spacing = conf->array_size;
174 for (i=0; i < cnt-1 ; i++) { 175 for (i=0; i < cnt-1 ; i++) {
175 sector_t sz = 0; 176 sector_t sz = 0;
176 int j; 177 int j;
@@ -200,7 +201,7 @@ static int linear_run (mddev_t *mddev)
200 unsigned round; 201 unsigned round;
201 unsigned long base; 202 unsigned long base;
202 203
203 sz = mddev->array_size >> conf->preshift; 204 sz = conf->array_size >> conf->preshift;
204 sz += 1; /* force round-up */ 205 sz += 1; /* force round-up */
205 base = conf->hash_spacing >> conf->preshift; 206 base = conf->hash_spacing >> conf->preshift;
206 round = sector_div(sz, base); 207 round = sector_div(sz, base);
@@ -227,7 +228,7 @@ static int linear_run (mddev_t *mddev)
227 curr_offset = 0; 228 curr_offset = 0;
228 i = 0; 229 i = 0;
229 for (curr_offset = 0; 230 for (curr_offset = 0;
230 curr_offset < mddev->array_size; 231 curr_offset < conf->array_size;
231 curr_offset += conf->hash_spacing) { 232 curr_offset += conf->hash_spacing) {
232 233
233 while (i < mddev->raid_disks-1 && 234 while (i < mddev->raid_disks-1 &&
@@ -247,14 +248,56 @@ static int linear_run (mddev_t *mddev)
247 248
248 BUG_ON(table - conf->hash_table > nb_zone); 249 BUG_ON(table - conf->hash_table > nb_zone);
249 250
251 return conf;
252
253out:
254 kfree(conf);
255 return NULL;
256}
257
258static int linear_run (mddev_t *mddev)
259{
260 linear_conf_t *conf;
261
262 conf = linear_conf(mddev, mddev->raid_disks);
263
264 if (!conf)
265 return 1;
266 mddev->private = conf;
267 mddev->array_size = conf->array_size;
268
250 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 269 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
251 mddev->queue->unplug_fn = linear_unplug; 270 mddev->queue->unplug_fn = linear_unplug;
252 mddev->queue->issue_flush_fn = linear_issue_flush; 271 mddev->queue->issue_flush_fn = linear_issue_flush;
253 return 0; 272 return 0;
273}
254 274
255out: 275static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
256 kfree(conf); 276{
257 return 1; 277 /* Adding a drive to a linear array allows the array to grow.
278 * It is permitted if the new drive has a matching superblock
279 * already on it, with raid_disk equal to raid_disks.
280 * It is achieved by creating a new linear_private_data structure
281 * and swapping it in, in place of the current one.
282 * The current one is never freed until the array is stopped.
283 * This avoids races.
284 */
285 linear_conf_t *newconf;
286
287 if (rdev->raid_disk != mddev->raid_disks)
288 return -EINVAL;
289
290 newconf = linear_conf(mddev,mddev->raid_disks+1);
291
292 if (!newconf)
293 return -ENOMEM;
294
295 newconf->prev = mddev_to_conf(mddev);
296 mddev->private = newconf;
297 mddev->raid_disks++;
298 mddev->array_size = newconf->array_size;
299 set_capacity(mddev->gendisk, mddev->array_size << 1);
300 return 0;
258} 301}
259 302
260static int linear_stop (mddev_t *mddev) 303static int linear_stop (mddev_t *mddev)
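
linear_add() grows the array by building a fresh configuration for raid_disks + 1, chaining the old one behind it through ->prev, and swapping the new one in; nothing is freed until linear_stop() walks the chain, so readers still holding the old pointer never touch freed memory. A userspace model of that publish-and-defer-free pattern (no locking shown; the kernel side relies on the old conf simply staying valid):

#include <stdio.h>
#include <stdlib.h>

struct conf {
	int disks;
	struct conf *prev;	/* older configs, kept alive until stop */
};

static struct conf *active;	/* what readers currently see */

static int grow(void)
{
	struct conf *newconf = malloc(sizeof(*newconf));

	if (!newconf)
		return -1;
	newconf->disks = active ? active->disks + 1 : 1;
	newconf->prev = active;		/* keep the old config reachable */
	active = newconf;		/* publish; old readers keep their pointer */
	return 0;
}

static void stop(void)
{
	/* Only now is it safe to free the whole chain. */
	while (active) {
		struct conf *prev = active->prev;

		free(active);
		active = prev;
	}
}

int main(void)
{
	grow();
	grow();
	grow();
	printf("active config has %d disks\n", active->disks);
	stop();
	return 0;
}
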
@@ -262,8 +305,12 @@ static int linear_stop (mddev_t *mddev)
262 linear_conf_t *conf = mddev_to_conf(mddev); 305 linear_conf_t *conf = mddev_to_conf(mddev);
263 306
264 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 307 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
265 kfree(conf->hash_table); 308 do {
266 kfree(conf); 309 linear_conf_t *t = conf->prev;
310 kfree(conf->hash_table);
311 kfree(conf);
312 conf = t;
313 } while (conf);
267 314
268 return 0; 315 return 0;
269} 316}
@@ -360,6 +407,7 @@ static struct mdk_personality linear_personality =
360 .run = linear_run, 407 .run = linear_run,
361 .stop = linear_stop, 408 .stop = linear_stop,
362 .status = linear_status, 409 .status = linear_status,
410 .hot_add_disk = linear_add,
363}; 411};
364 412
365static int __init linear_init (void) 413static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f19b874753a9..8dbab2ef3885 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,17 +33,16 @@
33*/ 33*/
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/config.h>
37#include <linux/kthread.h> 36#include <linux/kthread.h>
38#include <linux/linkage.h> 37#include <linux/linkage.h>
39#include <linux/raid/md.h> 38#include <linux/raid/md.h>
40#include <linux/raid/bitmap.h> 39#include <linux/raid/bitmap.h>
41#include <linux/sysctl.h> 40#include <linux/sysctl.h>
42#include <linux/devfs_fs_kernel.h>
43#include <linux/buffer_head.h> /* for invalidate_bdev */ 41#include <linux/buffer_head.h> /* for invalidate_bdev */
44#include <linux/suspend.h> 42#include <linux/suspend.h>
45#include <linux/poll.h> 43#include <linux/poll.h>
46#include <linux/mutex.h> 44#include <linux/mutex.h>
45#include <linux/ctype.h>
47 46
48#include <linux/init.h> 47#include <linux/init.h>
49 48
@@ -72,6 +71,10 @@ static void autostart_arrays (int part);
72static LIST_HEAD(pers_list); 71static LIST_HEAD(pers_list);
73static DEFINE_SPINLOCK(pers_lock); 72static DEFINE_SPINLOCK(pers_lock);
74 73
74static void md_print_devices(void);
75
76#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
77
75/* 78/*
76 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
77 * is 1000 KB/sec, so the extra system load does not show up that much. 80 * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -107,7 +110,7 @@ static ctl_table raid_table[] = {
107 .procname = "speed_limit_min", 110 .procname = "speed_limit_min",
108 .data = &sysctl_speed_limit_min, 111 .data = &sysctl_speed_limit_min,
109 .maxlen = sizeof(int), 112 .maxlen = sizeof(int),
110 .mode = 0644, 113 .mode = S_IRUGO|S_IWUSR,
111 .proc_handler = &proc_dointvec, 114 .proc_handler = &proc_dointvec,
112 }, 115 },
113 { 116 {
@@ -115,7 +118,7 @@ static ctl_table raid_table[] = {
115 .procname = "speed_limit_max", 118 .procname = "speed_limit_max",
116 .data = &sysctl_speed_limit_max, 119 .data = &sysctl_speed_limit_max,
117 .maxlen = sizeof(int), 120 .maxlen = sizeof(int),
118 .mode = 0644, 121 .mode = S_IRUGO|S_IWUSR,
119 .proc_handler = &proc_dointvec, 122 .proc_handler = &proc_dointvec,
120 }, 123 },
121 { .ctl_name = 0 } 124 { .ctl_name = 0 }
@@ -126,7 +129,7 @@ static ctl_table raid_dir_table[] = {
126 .ctl_name = DEV_RAID, 129 .ctl_name = DEV_RAID,
127 .procname = "raid", 130 .procname = "raid",
128 .maxlen = 0, 131 .maxlen = 0,
129 .mode = 0555, 132 .mode = S_IRUGO|S_IXUGO,
130 .child = raid_table, 133 .child = raid_table,
131 }, 134 },
132 { .ctl_name = 0 } 135 { .ctl_name = 0 }
@@ -170,7 +173,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
170/* Alternate version that can be called from interrupts 173/* Alternate version that can be called from interrupts
171 * when calling sysfs_notify isn't needed. 174 * when calling sysfs_notify isn't needed.
172 */ 175 */
173void md_new_event_inintr(mddev_t *mddev) 176static void md_new_event_inintr(mddev_t *mddev)
174{ 177{
175 atomic_inc(&md_event_count); 178 atomic_inc(&md_event_count);
176 wake_up(&md_event_waiters); 179 wake_up(&md_event_waiters);
@@ -732,6 +735,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
732{ 735{
733 mdp_disk_t *desc; 736 mdp_disk_t *desc;
734 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 737 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
738 __u64 ev1 = md_event(sb);
735 739
736 rdev->raid_disk = -1; 740 rdev->raid_disk = -1;
737 rdev->flags = 0; 741 rdev->flags = 0;
@@ -748,7 +752,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
748 mddev->layout = sb->layout; 752 mddev->layout = sb->layout;
749 mddev->raid_disks = sb->raid_disks; 753 mddev->raid_disks = sb->raid_disks;
750 mddev->size = sb->size; 754 mddev->size = sb->size;
751 mddev->events = md_event(sb); 755 mddev->events = ev1;
752 mddev->bitmap_offset = 0; 756 mddev->bitmap_offset = 0;
753 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 757 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
754 758
@@ -797,7 +801,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
797 801
798 } else if (mddev->pers == NULL) { 802 } else if (mddev->pers == NULL) {
799 /* Insist on good event counter while assembling */ 803 /* Insist on good event counter while assembling */
800 __u64 ev1 = md_event(sb);
801 ++ev1; 804 ++ev1;
802 if (ev1 < mddev->events) 805 if (ev1 < mddev->events)
803 return -EINVAL; 806 return -EINVAL;
@@ -805,19 +808,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
805 /* if adding to array with a bitmap, then we can accept an 808 /* if adding to array with a bitmap, then we can accept an
806 * older device ... but not too old. 809 * older device ... but not too old.
807 */ 810 */
808 __u64 ev1 = md_event(sb);
809 if (ev1 < mddev->bitmap->events_cleared) 811 if (ev1 < mddev->bitmap->events_cleared)
810 return 0; 812 return 0;
811 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 813 } else {
812 return 0; 814 if (ev1 < mddev->events)
815 /* just a hot-add of a new device, leave raid_disk at -1 */
816 return 0;
817 }
813 818
814 if (mddev->level != LEVEL_MULTIPATH) { 819 if (mddev->level != LEVEL_MULTIPATH) {
815 desc = sb->disks + rdev->desc_nr; 820 desc = sb->disks + rdev->desc_nr;
816 821
817 if (desc->state & (1<<MD_DISK_FAULTY)) 822 if (desc->state & (1<<MD_DISK_FAULTY))
818 set_bit(Faulty, &rdev->flags); 823 set_bit(Faulty, &rdev->flags);
819 else if (desc->state & (1<<MD_DISK_SYNC) && 824 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
820 desc->raid_disk < mddev->raid_disks) { 825 desc->raid_disk < mddev->raid_disks */) {
821 set_bit(In_sync, &rdev->flags); 826 set_bit(In_sync, &rdev->flags);
822 rdev->raid_disk = desc->raid_disk; 827 rdev->raid_disk = desc->raid_disk;
823 } 828 }
@@ -1057,6 +1062,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1057 if (rdev->sb_size & bmask) 1062 if (rdev->sb_size & bmask)
1058 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1063 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1059 1064
1065 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1066 rdev->desc_nr = -1;
1067 else
1068 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1069
1060 if (refdev == 0) 1070 if (refdev == 0)
1061 ret = 1; 1071 ret = 1;
1062 else { 1072 else {
@@ -1100,6 +1110,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1100static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1110static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1111{
1102 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1112 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1113 __u64 ev1 = le64_to_cpu(sb->events);
1103 1114
1104 rdev->raid_disk = -1; 1115 rdev->raid_disk = -1;
1105 rdev->flags = 0; 1116 rdev->flags = 0;
@@ -1115,7 +1126,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1115 mddev->layout = le32_to_cpu(sb->layout); 1126 mddev->layout = le32_to_cpu(sb->layout);
1116 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1127 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1117 mddev->size = le64_to_cpu(sb->size)/2; 1128 mddev->size = le64_to_cpu(sb->size)/2;
1118 mddev->events = le64_to_cpu(sb->events); 1129 mddev->events = ev1;
1119 mddev->bitmap_offset = 0; 1130 mddev->bitmap_offset = 0;
1120 mddev->default_bitmap_offset = 1024 >> 9; 1131 mddev->default_bitmap_offset = 1024 >> 9;
1121 1132
@@ -1149,7 +1160,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1149 1160
1150 } else if (mddev->pers == NULL) { 1161 } else if (mddev->pers == NULL) {
1151 /* Insist of good event counter while assembling */ 1162 /* Insist of good event counter while assembling */
1152 __u64 ev1 = le64_to_cpu(sb->events);
1153 ++ev1; 1163 ++ev1;
1154 if (ev1 < mddev->events) 1164 if (ev1 < mddev->events)
1155 return -EINVAL; 1165 return -EINVAL;
@@ -1157,15 +1167,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1157 /* If adding to array with a bitmap, then we can accept an 1167 /* If adding to array with a bitmap, then we can accept an
1158 * older device, but not too old. 1168 * older device, but not too old.
1159 */ 1169 */
1160 __u64 ev1 = le64_to_cpu(sb->events);
1161 if (ev1 < mddev->bitmap->events_cleared) 1170 if (ev1 < mddev->bitmap->events_cleared)
1162 return 0; 1171 return 0;
1163 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1172 } else {
1164 return 0; 1173 if (ev1 < mddev->events)
1165 1174 /* just a hot-add of a new device, leave raid_disk at -1 */
1175 return 0;
1176 }
1166 if (mddev->level != LEVEL_MULTIPATH) { 1177 if (mddev->level != LEVEL_MULTIPATH) {
1167 int role; 1178 int role;
1168 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1169 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1179 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1170 switch(role) { 1180 switch(role) {
1171 case 0xffff: /* spare */ 1181 case 0xffff: /* spare */
@@ -1174,7 +1184,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1174 set_bit(Faulty, &rdev->flags); 1184 set_bit(Faulty, &rdev->flags);
1175 break; 1185 break;
1176 default: 1186 default:
1177 set_bit(In_sync, &rdev->flags); 1187 if ((le32_to_cpu(sb->feature_map) &
1188 MD_FEATURE_RECOVERY_OFFSET))
1189 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1190 else
1191 set_bit(In_sync, &rdev->flags);
1178 rdev->raid_disk = role; 1192 rdev->raid_disk = role;
1179 break; 1193 break;
1180 } 1194 }
@@ -1198,6 +1212,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1198 1212
1199 sb->feature_map = 0; 1213 sb->feature_map = 0;
1200 sb->pad0 = 0; 1214 sb->pad0 = 0;
1215 sb->recovery_offset = cpu_to_le64(0);
1201 memset(sb->pad1, 0, sizeof(sb->pad1)); 1216 memset(sb->pad1, 0, sizeof(sb->pad1));
1202 memset(sb->pad2, 0, sizeof(sb->pad2)); 1217 memset(sb->pad2, 0, sizeof(sb->pad2));
1203 memset(sb->pad3, 0, sizeof(sb->pad3)); 1218 memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1218,6 +1233,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1218 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1233 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1219 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1234 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1220 } 1235 }
1236
1237 if (rdev->raid_disk >= 0 &&
1238 !test_bit(In_sync, &rdev->flags) &&
1239 rdev->recovery_offset > 0) {
1240 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1241 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1242 }
1243
1221 if (mddev->reshape_position != MaxSector) { 1244 if (mddev->reshape_position != MaxSector) {
1222 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1223 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1246 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1242,11 +1265,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1242 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1265 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1243 else if (test_bit(In_sync, &rdev2->flags)) 1266 else if (test_bit(In_sync, &rdev2->flags))
1244 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1268 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1269 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1245 else 1270 else
1246 sb->dev_roles[i] = cpu_to_le16(0xffff); 1271 sb->dev_roles[i] = cpu_to_le16(0xffff);
1247 } 1272 }
1248 1273
1249 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
1250 sb->sb_csum = calc_sb_1_csum(sb); 1274 sb->sb_csum = calc_sb_1_csum(sb);
1251} 1275}
1252 1276
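
Note: the super_1_validate()/super_1_sync() hunks above let an interrupted rebuild survive a restart by persisting a per-device recovery checkpoint in the version-1 superblock. A rough, standalone sketch of that rule follows; the struct, the flag value and the helper name are illustrative stand-ins, not the kernel's mdp_superblock_1 layout.

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical flag value, standing in for the kernel's MD_FEATURE_RECOVERY_OFFSET. */
    #define FEATURE_RECOVERY_OFFSET 0x1

    struct member {
        int      raid_disk;        /* slot in the array, or -1 */
        bool     faulty;
        bool     in_sync;          /* fully synced member */
        uint64_t recovery_offset;  /* sectors already rebuilt on this member */
    };

    /* Sketch of what super_1_sync now records in a member's own superblock. */
    static void record_member(const struct member *m,
                              uint32_t *feature_map, uint64_t *sb_recovery_offset,
                              uint16_t *role)
    {
        *sb_recovery_offset = 0;
        if (m->raid_disk >= 0 && !m->in_sync && m->recovery_offset > 0) {
            *feature_map |= FEATURE_RECOVERY_OFFSET;   /* remember the checkpoint */
            *sb_recovery_offset = m->recovery_offset;
        }
        if (m->faulty)
            *role = 0xfffe;                    /* faulty */
        else if (m->in_sync)
            *role = (uint16_t)m->raid_disk;    /* fully in-sync member */
        else if (m->raid_disk >= 0 && m->recovery_offset > 0)
            *role = (uint16_t)m->raid_disk;    /* rebuilding: keep the slot */
        else
            *role = 0xffff;                    /* spare */
    }

    int main(void)
    {
        struct member m = { .raid_disk = 2, .faulty = false,
                            .in_sync = false, .recovery_offset = 123456 };
        uint32_t fmap = 0;
        uint64_t off = 0;
        uint16_t role = 0;

        record_member(&m, &fmap, &off, &role);
        printf("role=%#x feature_map=%#x recovery_offset=%llu\n",
               role, fmap, (unsigned long long)off);
        return 0;
    }

On the read side the same information flows back: a non-spare role combined with the recovery-offset feature bit restores rdev->recovery_offset instead of setting In_sync, which is what later lets md_do_sync() resume from the checkpoint rather than from sector 0.
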
@@ -1384,7 +1408,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1384 struct block_device *bdev; 1408 struct block_device *bdev;
1385 char b[BDEVNAME_SIZE]; 1409 char b[BDEVNAME_SIZE];
1386 1410
1387 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1411 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1388 if (IS_ERR(bdev)) { 1412 if (IS_ERR(bdev)) {
1389 printk(KERN_ERR "md: could not open %s.\n", 1413 printk(KERN_ERR "md: could not open %s.\n",
1390 __bdevname(dev, b)); 1414 __bdevname(dev, b));
@@ -1394,7 +1418,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1394 if (err) { 1418 if (err) {
1395 printk(KERN_ERR "md: could not bd_claim %s.\n", 1419 printk(KERN_ERR "md: could not bd_claim %s.\n",
1396 bdevname(bdev, b)); 1420 bdevname(bdev, b));
1397 blkdev_put(bdev); 1421 blkdev_put_partition(bdev);
1398 return err; 1422 return err;
1399 } 1423 }
1400 rdev->bdev = bdev; 1424 rdev->bdev = bdev;
@@ -1408,7 +1432,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
1408 if (!bdev) 1432 if (!bdev)
1409 MD_BUG(); 1433 MD_BUG();
1410 bd_release(bdev); 1434 bd_release(bdev);
1411 blkdev_put(bdev); 1435 blkdev_put_partition(bdev);
1412} 1436}
1413 1437
1414void md_autodetect_dev(dev_t dev); 1438void md_autodetect_dev(dev_t dev);
@@ -1507,7 +1531,7 @@ static void print_rdev(mdk_rdev_t *rdev)
1507 printk(KERN_INFO "md: no rdev superblock!\n"); 1531 printk(KERN_INFO "md: no rdev superblock!\n");
1508} 1532}
1509 1533
1510void md_print_devices(void) 1534static void md_print_devices(void)
1511{ 1535{
1512 struct list_head *tmp, *tmp2; 1536 struct list_head *tmp, *tmp2;
1513 mdk_rdev_t *rdev; 1537 mdk_rdev_t *rdev;
@@ -1536,15 +1560,30 @@ void md_print_devices(void)
1536} 1560}
1537 1561
1538 1562
1539static void sync_sbs(mddev_t * mddev) 1563static void sync_sbs(mddev_t * mddev, int nospares)
1540{ 1564{
1565 /* Update each superblock (in-memory image), but
1566 * if we are allowed to, skip spares which already
1567 * have the right event counter, or have one earlier
1568 * (which would mean they aren't being marked as dirty
1569 * with the rest of the array)
1570 */
1541 mdk_rdev_t *rdev; 1571 mdk_rdev_t *rdev;
1542 struct list_head *tmp; 1572 struct list_head *tmp;
1543 1573
1544 ITERATE_RDEV(mddev,rdev,tmp) { 1574 ITERATE_RDEV(mddev,rdev,tmp) {
1545 super_types[mddev->major_version]. 1575 if (rdev->sb_events == mddev->events ||
1546 sync_super(mddev, rdev); 1576 (nospares &&
1547 rdev->sb_loaded = 1; 1577 rdev->raid_disk < 0 &&
1578 (rdev->sb_events&1)==0 &&
1579 rdev->sb_events+1 == mddev->events)) {
1580 /* Don't update this superblock */
1581 rdev->sb_loaded = 2;
1582 } else {
1583 super_types[mddev->major_version].
1584 sync_super(mddev, rdev);
1585 rdev->sb_loaded = 1;
1586 }
1548 } 1587 }
1549} 1588}
1550 1589
@@ -1554,12 +1593,55 @@ void md_update_sb(mddev_t * mddev)
1554 struct list_head *tmp; 1593 struct list_head *tmp;
1555 mdk_rdev_t *rdev; 1594 mdk_rdev_t *rdev;
1556 int sync_req; 1595 int sync_req;
1596 int nospares = 0;
1557 1597
1558repeat: 1598repeat:
1559 spin_lock_irq(&mddev->write_lock); 1599 spin_lock_irq(&mddev->write_lock);
1600
1601 if (mddev->degraded && mddev->sb_dirty == 3)
1602 /* If the array is degraded, then skipping spares is both
1603 * dangerous and fairly pointless.
1604 * Dangerous because a device that was removed from the array
1605 * might have an event_count that still looks up-to-date,
1606 * so it can be re-added without a resync.
1607 * Pointless because if there are any spares to skip,
1608 * then a recovery will happen and soon that array won't
1609 * be degraded any more and the spare can go back to sleep then.
1610 */
1611 mddev->sb_dirty = 1;
1612
1560 sync_req = mddev->in_sync; 1613 sync_req = mddev->in_sync;
1561 mddev->utime = get_seconds(); 1614 mddev->utime = get_seconds();
1562 mddev->events ++; 1615 if (mddev->sb_dirty == 3)
1616 /* just a clean <-> dirty transition, possibly leave spares alone,
1617 * though if 'events' ends up with the wrong parity (even/odd), we will
1618 * have to update the spares after all
1619 */
1620 nospares = 1;
1621
1622 /* If this is just a dirty<->clean transition, and the array is clean
1623 * and 'events' is odd, we can roll back to the previous clean state */
1624 if (mddev->sb_dirty == 3
1625 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1626 && (mddev->events & 1))
1627 mddev->events--;
1628 else {
1629 /* otherwise we have to go forward and ... */
1630 mddev->events ++;
1631 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1632 /* .. if the array isn't clean, insist on an odd 'events' */
1633 if ((mddev->events&1)==0) {
1634 mddev->events++;
1635 nospares = 0;
1636 }
1637 } else {
1638 /* otherwise insist on an even 'events' (for clean states) */
1639 if ((mddev->events&1)) {
1640 mddev->events++;
1641 nospares = 0;
1642 }
1643 }
1644 }
1563 1645
1564 if (!mddev->events) { 1646 if (!mddev->events) {
1565 /* 1647 /*
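
Note: the hunk above relies on a parity convention for the event counter that is easy to miss when reading the diff: an even 'events' value is written for clean states, an odd one for active/dirty states, and a pure clean<->dirty flip may step the counter back instead of forward so that spare superblocks written earlier stay valid. A minimal standalone model of that bookkeeping (hypothetical helper, plain integers instead of the mddev fields):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the event-counter bookkeeping: even = written for a clean state,
     * odd = written for a dirty one.  Returns true when spare superblocks may be
     * left untouched ("nospares").
     */
    static bool step_events(uint64_t *events, bool clean_dirty_only, bool array_clean)
    {
        bool nospares = clean_dirty_only;

        if (clean_dirty_only && array_clean && (*events & 1)) {
            (*events)--;          /* roll back to the previous clean state */
            return nospares;
        }
        (*events)++;
        if (!array_clean && (*events & 1) == 0) {
            (*events)++;          /* dirty states want an odd counter */
            nospares = false;
        } else if (array_clean && (*events & 1)) {
            (*events)++;          /* clean states want an even counter */
            nospares = false;
        }
        return nospares;
    }

    int main(void)
    {
        uint64_t ev = 41;                          /* odd: last write was "dirty" */
        bool skip = step_events(&ev, true, true);  /* dirty -> clean transition */
        printf("events=%llu nospares=%d\n", (unsigned long long)ev, skip);
        return 0;
    }

sync_sbs() then skips any spare whose recorded sb_events already equals the new counter, or, when nospares is set, sits at the even value exactly one below it, so a clean<->dirty flip no longer forces a write to every spare device.
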
@@ -1571,7 +1653,7 @@ repeat:
1571 mddev->events --; 1653 mddev->events --;
1572 } 1654 }
1573 mddev->sb_dirty = 2; 1655 mddev->sb_dirty = 2;
1574 sync_sbs(mddev); 1656 sync_sbs(mddev, nospares);
1575 1657
1576 /* 1658 /*
1577 * do not write anything to disk if using 1659 * do not write anything to disk if using
@@ -1593,6 +1675,8 @@ repeat:
1593 ITERATE_RDEV(mddev,rdev,tmp) { 1675 ITERATE_RDEV(mddev,rdev,tmp) {
1594 char b[BDEVNAME_SIZE]; 1676 char b[BDEVNAME_SIZE];
1595 dprintk(KERN_INFO "md: "); 1677 dprintk(KERN_INFO "md: ");
1678 if (rdev->sb_loaded != 1)
1679 continue; /* no noise on spare devices */
1596 if (test_bit(Faulty, &rdev->flags)) 1680 if (test_bit(Faulty, &rdev->flags))
1597 dprintk("(skipping faulty "); 1681 dprintk("(skipping faulty ");
1598 1682
@@ -1604,6 +1688,7 @@ repeat:
1604 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1688 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1605 bdevname(rdev->bdev,b), 1689 bdevname(rdev->bdev,b),
1606 (unsigned long long)rdev->sb_offset); 1690 (unsigned long long)rdev->sb_offset);
1691 rdev->sb_events = mddev->events;
1607 1692
1608 } else 1693 } else
1609 dprintk(")\n"); 1694 dprintk(")\n");
@@ -1667,6 +1752,10 @@ state_show(mdk_rdev_t *rdev, char *page)
1667 len += sprintf(page+len, "%sin_sync",sep); 1752 len += sprintf(page+len, "%sin_sync",sep);
1668 sep = ","; 1753 sep = ",";
1669 } 1754 }
1755 if (test_bit(WriteMostly, &rdev->flags)) {
1756 len += sprintf(page+len, "%swrite_mostly",sep);
1757 sep = ",";
1758 }
1670 if (!test_bit(Faulty, &rdev->flags) && 1759 if (!test_bit(Faulty, &rdev->flags) &&
1671 !test_bit(In_sync, &rdev->flags)) { 1760 !test_bit(In_sync, &rdev->flags)) {
1672 len += sprintf(page+len, "%sspare", sep); 1761 len += sprintf(page+len, "%sspare", sep);
@@ -1675,8 +1764,40 @@ state_show(mdk_rdev_t *rdev, char *page)
1675 return len+sprintf(page+len, "\n"); 1764 return len+sprintf(page+len, "\n");
1676} 1765}
1677 1766
1678static struct rdev_sysfs_entry 1767static ssize_t
1679rdev_state = __ATTR_RO(state); 1768state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1769{
1770 /* can write
1771 * faulty - simulates an error
1772 * remove - disconnects the device
1773 * writemostly - sets write_mostly
1774 * -writemostly - clears write_mostly
1775 */
1776 int err = -EINVAL;
1777 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1778 md_error(rdev->mddev, rdev);
1779 err = 0;
1780 } else if (cmd_match(buf, "remove")) {
1781 if (rdev->raid_disk >= 0)
1782 err = -EBUSY;
1783 else {
1784 mddev_t *mddev = rdev->mddev;
1785 kick_rdev_from_array(rdev);
1786 md_update_sb(mddev);
1787 md_new_event(mddev);
1788 err = 0;
1789 }
1790 } else if (cmd_match(buf, "writemostly")) {
1791 set_bit(WriteMostly, &rdev->flags);
1792 err = 0;
1793 } else if (cmd_match(buf, "-writemostly")) {
1794 clear_bit(WriteMostly, &rdev->flags);
1795 err = 0;
1796 }
1797 return err ? err : len;
1798}
1799static struct rdev_sysfs_entry rdev_state =
1800__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1680 1801
1681static ssize_t 1802static ssize_t
1682super_show(mdk_rdev_t *rdev, char *page) 1803super_show(mdk_rdev_t *rdev, char *page)
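
Note: the per-device 'state' attribute added above is meant to be driven from userspace. A hedged illustration follows; the sysfs path and device names are examples only and error handling is trimmed. 'remove' only succeeds once the personality has released the slot (raid_disk < 0), hence the pause after the simulated failure.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Write one keyword to an rdev state file, e.g.
     * /sys/block/md0/md/dev-sdb1/state (example path only).
     */
    static int write_state(const char *path, const char *word)
    {
        int fd = open(path, O_WRONLY);
        if (fd < 0) {
            perror(path);
            return -1;
        }
        ssize_t n = write(fd, word, strlen(word));
        close(fd);
        return n < 0 ? -1 : 0;
    }

    int main(void)
    {
        const char *dev = "/sys/block/md0/md/dev-sdb1/state";

        write_state(dev, "faulty");    /* simulate an error on this member */
        sleep(1);                      /* give md time to kick the device  */
        write_state(dev, "remove");    /* then detach it from the array    */
        return 0;
    }
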
@@ -1707,7 +1828,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1707 return -EINVAL; 1828 return -EINVAL;
1708} 1829}
1709static struct rdev_sysfs_entry rdev_errors = 1830static struct rdev_sysfs_entry rdev_errors =
1710__ATTR(errors, 0644, errors_show, errors_store); 1831__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1711 1832
1712static ssize_t 1833static ssize_t
1713slot_show(mdk_rdev_t *rdev, char *page) 1834slot_show(mdk_rdev_t *rdev, char *page)
@@ -1741,7 +1862,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1741 1862
1742 1863
1743static struct rdev_sysfs_entry rdev_slot = 1864static struct rdev_sysfs_entry rdev_slot =
1744__ATTR(slot, 0644, slot_show, slot_store); 1865__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1745 1866
1746static ssize_t 1867static ssize_t
1747offset_show(mdk_rdev_t *rdev, char *page) 1868offset_show(mdk_rdev_t *rdev, char *page)
@@ -1763,7 +1884,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1763} 1884}
1764 1885
1765static struct rdev_sysfs_entry rdev_offset = 1886static struct rdev_sysfs_entry rdev_offset =
1766__ATTR(offset, 0644, offset_show, offset_store); 1887__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1767 1888
1768static ssize_t 1889static ssize_t
1769rdev_size_show(mdk_rdev_t *rdev, char *page) 1890rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1787,7 +1908,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1787} 1908}
1788 1909
1789static struct rdev_sysfs_entry rdev_size = 1910static struct rdev_sysfs_entry rdev_size =
1790__ATTR(size, 0644, rdev_size_show, rdev_size_store); 1911__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1791 1912
1792static struct attribute *rdev_default_attrs[] = { 1913static struct attribute *rdev_default_attrs[] = {
1793 &rdev_state.attr, 1914 &rdev_state.attr,
@@ -1818,6 +1939,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1818 1939
1819 if (!entry->store) 1940 if (!entry->store)
1820 return -EIO; 1941 return -EIO;
1942 if (!capable(CAP_SYS_ADMIN))
1943 return -EACCES;
1821 return entry->store(rdev, page, length); 1944 return entry->store(rdev, page, length);
1822} 1945}
1823 1946
@@ -1873,6 +1996,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1873 rdev->desc_nr = -1; 1996 rdev->desc_nr = -1;
1874 rdev->flags = 0; 1997 rdev->flags = 0;
1875 rdev->data_offset = 0; 1998 rdev->data_offset = 0;
1999 rdev->sb_events = 0;
1876 atomic_set(&rdev->nr_pending, 0); 2000 atomic_set(&rdev->nr_pending, 0);
1877 atomic_set(&rdev->read_errors, 0); 2001 atomic_set(&rdev->read_errors, 0);
1878 atomic_set(&rdev->corrected_errors, 0); 2002 atomic_set(&rdev->corrected_errors, 0);
@@ -1978,6 +2102,54 @@ static void analyze_sbs(mddev_t * mddev)
1978} 2102}
1979 2103
1980static ssize_t 2104static ssize_t
2105safe_delay_show(mddev_t *mddev, char *page)
2106{
2107 int msec = (mddev->safemode_delay*1000)/HZ;
2108 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2109}
2110static ssize_t
2111safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2112{
2113 int scale=1;
2114 int dot=0;
2115 int i;
2116 unsigned long msec;
2117 char buf[30];
2118 char *e;
2119 /* remove a period, and count digits after it */
2120 if (len >= sizeof(buf))
2121 return -EINVAL;
2122 strlcpy(buf, cbuf, len);
2123 buf[len] = 0;
2124 for (i=0; i<len; i++) {
2125 if (dot) {
2126 if (isdigit(buf[i])) {
2127 buf[i-1] = buf[i];
2128 scale *= 10;
2129 }
2130 buf[i] = 0;
2131 } else if (buf[i] == '.') {
2132 dot=1;
2133 buf[i] = 0;
2134 }
2135 }
2136 msec = simple_strtoul(buf, &e, 10);
2137 if (e == buf || (*e && *e != '\n'))
2138 return -EINVAL;
2139 msec = (msec * 1000) / scale;
2140 if (msec == 0)
2141 mddev->safemode_delay = 0;
2142 else {
2143 mddev->safemode_delay = (msec*HZ)/1000;
2144 if (mddev->safemode_delay == 0)
2145 mddev->safemode_delay = 1;
2146 }
2147 return len;
2148}
2149static struct md_sysfs_entry md_safe_delay =
2150__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2151
2152static ssize_t
1981level_show(mddev_t *mddev, char *page) 2153level_show(mddev_t *mddev, char *page)
1982{ 2154{
1983 struct mdk_personality *p = mddev->pers; 2155 struct mdk_personality *p = mddev->pers;
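
Note: safe_mode_delay is parsed above as a decimal number of seconds with millisecond resolution and stored in jiffies, never rounding a non-zero request down to zero ticks; writing 0 disables the timer, and md_write_end() (further down in this patch) skips arming it in that case. A small worked sketch of the conversion, with HZ chosen only as an example:

    #include <stdio.h>

    #define HZ 250   /* example tick rate; the real value is a kernel build choice */

    /* "0.050" -> 50 ms -> 50*HZ/1000 jiffies, but never rounded down to zero. */
    static unsigned long msec_to_safemode_delay(unsigned long msec)
    {
        unsigned long delay;

        if (msec == 0)
            return 0;           /* 0 disables the safemode timer entirely */
        delay = (msec * HZ) / 1000;
        return delay ? delay : 1;
    }

    int main(void)
    {
        printf("%lu\n", msec_to_safemode_delay(50));   /* 12 jiffies at HZ=250 */
        printf("%lu\n", msec_to_safemode_delay(1));    /* clamps up to 1 jiffy */
        return 0;
    }
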
@@ -2010,7 +2182,33 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2010} 2182}
2011 2183
2012static struct md_sysfs_entry md_level = 2184static struct md_sysfs_entry md_level =
2013__ATTR(level, 0644, level_show, level_store); 2185__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2186
2187
2188static ssize_t
2189layout_show(mddev_t *mddev, char *page)
2190{
2191 /* just a number, not meaningful for all levels */
2192 return sprintf(page, "%d\n", mddev->layout);
2193}
2194
2195static ssize_t
2196layout_store(mddev_t *mddev, const char *buf, size_t len)
2197{
2198 char *e;
2199 unsigned long n = simple_strtoul(buf, &e, 10);
2200 if (mddev->pers)
2201 return -EBUSY;
2202
2203 if (!*buf || (*e && *e != '\n'))
2204 return -EINVAL;
2205
2206 mddev->layout = n;
2207 return len;
2208}
2209static struct md_sysfs_entry md_layout =
2210__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2211
2014 2212
2015static ssize_t 2213static ssize_t
2016raid_disks_show(mddev_t *mddev, char *page) 2214raid_disks_show(mddev_t *mddev, char *page)
@@ -2040,7 +2238,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2040 return rv ? rv : len; 2238 return rv ? rv : len;
2041} 2239}
2042static struct md_sysfs_entry md_raid_disks = 2240static struct md_sysfs_entry md_raid_disks =
2043__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); 2241__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2044 2242
2045static ssize_t 2243static ssize_t
2046chunk_size_show(mddev_t *mddev, char *page) 2244chunk_size_show(mddev_t *mddev, char *page)
@@ -2064,7 +2262,202 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2064 return len; 2262 return len;
2065} 2263}
2066static struct md_sysfs_entry md_chunk_size = 2264static struct md_sysfs_entry md_chunk_size =
2067__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); 2265__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2266
2267static ssize_t
2268resync_start_show(mddev_t *mddev, char *page)
2269{
2270 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2271}
2272
2273static ssize_t
2274resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2275{
2276 /* can only set resync_start if array is not yet active */
2277 char *e;
2278 unsigned long long n = simple_strtoull(buf, &e, 10);
2279
2280 if (mddev->pers)
2281 return -EBUSY;
2282 if (!*buf || (*e && *e != '\n'))
2283 return -EINVAL;
2284
2285 mddev->recovery_cp = n;
2286 return len;
2287}
2288static struct md_sysfs_entry md_resync_start =
2289__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2290
2291/*
2292 * The array state can be:
2293 *
2294 * clear
2295 * No devices, no size, no level
2296 * Equivalent to STOP_ARRAY ioctl
2297 * inactive
2298 * May have some settings, but array is not active
2299 * all IO results in error
2300 * When written, doesn't tear down array, but just stops it
2301 * suspended (not supported yet)
2302 * All IO requests will block. The array can be reconfigured.
2303 * Writing this, if accepted, will block until array is quiescent
2304 * readonly
2305 * no resync can happen. no superblocks get written.
2306 * write requests fail
2307 * read-auto
2308 * like readonly, but behaves like 'clean' on a write request.
2309 *
2310 * clean - no pending writes, but otherwise active.
2311 * When written to inactive array, starts without resync
2312 * If a write request arrives then
2313 * if metadata is known, mark 'dirty' and switch to 'active'.
2314 * if not known, block and switch to write-pending
2315 * If written to an active array that has pending writes, then fails.
2316 * active
2317 * fully active: IO and resync can be happening.
2318 * When written to inactive array, starts with resync
2319 *
2320 * write-pending
2321 * clean, but writes are blocked waiting for 'active' to be written.
2322 *
2323 * active-idle
2324 * like active, but no writes have been seen for a while (100msec).
2325 *
2326 */
2327enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2328 write_pending, active_idle, bad_word};
2329static char *array_states[] = {
2330 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2331 "write-pending", "active-idle", NULL };
2332
2333static int match_word(const char *word, char **list)
2334{
2335 int n;
2336 for (n=0; list[n]; n++)
2337 if (cmd_match(word, list[n]))
2338 break;
2339 return n;
2340}
2341
2342static ssize_t
2343array_state_show(mddev_t *mddev, char *page)
2344{
2345 enum array_state st = inactive;
2346
2347 if (mddev->pers)
2348 switch(mddev->ro) {
2349 case 1:
2350 st = readonly;
2351 break;
2352 case 2:
2353 st = read_auto;
2354 break;
2355 case 0:
2356 if (mddev->in_sync)
2357 st = clean;
2358 else if (mddev->safemode)
2359 st = active_idle;
2360 else
2361 st = active;
2362 }
2363 else {
2364 if (list_empty(&mddev->disks) &&
2365 mddev->raid_disks == 0 &&
2366 mddev->size == 0)
2367 st = clear;
2368 else
2369 st = inactive;
2370 }
2371 return sprintf(page, "%s\n", array_states[st]);
2372}
2373
2374static int do_md_stop(mddev_t * mddev, int ro);
2375static int do_md_run(mddev_t * mddev);
2376static int restart_array(mddev_t *mddev);
2377
2378static ssize_t
2379array_state_store(mddev_t *mddev, const char *buf, size_t len)
2380{
2381 int err = -EINVAL;
2382 enum array_state st = match_word(buf, array_states);
2383 switch(st) {
2384 case bad_word:
2385 break;
2386 case clear:
2387 /* stopping an active array */
2388 if (mddev->pers) {
2389 if (atomic_read(&mddev->active) > 1)
2390 return -EBUSY;
2391 err = do_md_stop(mddev, 0);
2392 }
2393 break;
2394 case inactive:
2395 /* stopping an active array */
2396 if (mddev->pers) {
2397 if (atomic_read(&mddev->active) > 1)
2398 return -EBUSY;
2399 err = do_md_stop(mddev, 2);
2400 }
2401 break;
2402 case suspended:
2403 break; /* not supported yet */
2404 case readonly:
2405 if (mddev->pers)
2406 err = do_md_stop(mddev, 1);
2407 else {
2408 mddev->ro = 1;
2409 err = do_md_run(mddev);
2410 }
2411 break;
2412 case read_auto:
2413 /* stopping an active array */
2414 if (mddev->pers) {
2415 err = do_md_stop(mddev, 1);
2416 if (err == 0)
2417 mddev->ro = 2; /* FIXME mark devices writable */
2418 } else {
2419 mddev->ro = 2;
2420 err = do_md_run(mddev);
2421 }
2422 break;
2423 case clean:
2424 if (mddev->pers) {
2425 restart_array(mddev);
2426 spin_lock_irq(&mddev->write_lock);
2427 if (atomic_read(&mddev->writes_pending) == 0) {
2428 mddev->in_sync = 1;
2429 mddev->sb_dirty = 1;
2430 }
2431 spin_unlock_irq(&mddev->write_lock);
2432 } else {
2433 mddev->ro = 0;
2434 mddev->recovery_cp = MaxSector;
2435 err = do_md_run(mddev);
2436 }
2437 break;
2438 case active:
2439 if (mddev->pers) {
2440 restart_array(mddev);
2441 mddev->sb_dirty = 0;
2442 wake_up(&mddev->sb_wait);
2443 err = 0;
2444 } else {
2445 mddev->ro = 0;
2446 err = do_md_run(mddev);
2447 }
2448 break;
2449 case write_pending:
2450 case active_idle:
2451 /* these cannot be set */
2452 break;
2453 }
2454 if (err)
2455 return err;
2456 else
2457 return len;
2458}
2459static struct md_sysfs_entry md_array_state =
2460__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2068 2461
2069static ssize_t 2462static ssize_t
2070null_show(mddev_t *mddev, char *page) 2463null_show(mddev_t *mddev, char *page)
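
Note: array_state gives scripts a single file that reflects, and can drive, the lifecycle described in the comment block above: writing 'readonly', 'read-auto', 'clear' or 'inactive' ends up in do_md_stop() with the matching mode, while 'clean' and 'active' either restart an assembled array or run an inactive one. A small illustrative poller (the sysfs path is an example) that waits for the array to report itself clean:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Poll an array_state file (example path) until the array reports "clean",
     * i.e. no writes are pending and the superblock matches the data.
     */
    int main(void)
    {
        char state[32];

        for (;;) {
            FILE *f = fopen("/sys/block/md0/md/array_state", "r");
            if (!f || !fgets(state, sizeof(state), f)) {
                if (f)
                    fclose(f);
                return 1;
            }
            fclose(f);
            state[strcspn(state, "\n")] = '\0';
            if (strcmp(state, "clean") == 0)
                break;
            sleep(1);
        }
        return 0;
    }
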
@@ -2124,7 +2517,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2124} 2517}
2125 2518
2126static struct md_sysfs_entry md_new_device = 2519static struct md_sysfs_entry md_new_device =
2127__ATTR(new_dev, 0200, null_show, new_dev_store); 2520__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2128 2521
2129static ssize_t 2522static ssize_t
2130size_show(mddev_t *mddev, char *page) 2523size_show(mddev_t *mddev, char *page)
@@ -2162,7 +2555,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
2162} 2555}
2163 2556
2164static struct md_sysfs_entry md_size = 2557static struct md_sysfs_entry md_size =
2165__ATTR(component_size, 0644, size_show, size_store); 2558__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2166 2559
2167 2560
2168 /* Metadata version. 2561 /* Metadata version.
@@ -2210,7 +2603,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
2210} 2603}
2211 2604
2212static struct md_sysfs_entry md_metadata = 2605static struct md_sysfs_entry md_metadata =
2213__ATTR(metadata_version, 0644, metadata_show, metadata_store); 2606__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2214 2607
2215static ssize_t 2608static ssize_t
2216action_show(mddev_t *mddev, char *page) 2609action_show(mddev_t *mddev, char *page)
@@ -2278,12 +2671,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
2278 (unsigned long long) mddev->resync_mismatches); 2671 (unsigned long long) mddev->resync_mismatches);
2279} 2672}
2280 2673
2281static struct md_sysfs_entry 2674static struct md_sysfs_entry md_scan_mode =
2282md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2675__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2283 2676
2284 2677
2285static struct md_sysfs_entry 2678static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2286md_mismatches = __ATTR_RO(mismatch_cnt);
2287 2679
2288static ssize_t 2680static ssize_t
2289sync_min_show(mddev_t *mddev, char *page) 2681sync_min_show(mddev_t *mddev, char *page)
@@ -2342,15 +2734,14 @@ static ssize_t
2342sync_speed_show(mddev_t *mddev, char *page) 2734sync_speed_show(mddev_t *mddev, char *page)
2343{ 2735{
2344 unsigned long resync, dt, db; 2736 unsigned long resync, dt, db;
2345 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2737 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2346 dt = ((jiffies - mddev->resync_mark) / HZ); 2738 dt = ((jiffies - mddev->resync_mark) / HZ);
2347 if (!dt) dt++; 2739 if (!dt) dt++;
2348 db = resync - (mddev->resync_mark_cnt); 2740 db = resync - (mddev->resync_mark_cnt);
2349 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2741 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2350} 2742}
2351 2743
2352static struct md_sysfs_entry 2744static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2353md_sync_speed = __ATTR_RO(sync_speed);
2354 2745
2355static ssize_t 2746static ssize_t
2356sync_completed_show(mddev_t *mddev, char *page) 2747sync_completed_show(mddev_t *mddev, char *page)
@@ -2366,8 +2757,7 @@ sync_completed_show(mddev_t *mddev, char *page)
2366 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2757 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2367} 2758}
2368 2759
2369static struct md_sysfs_entry 2760static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2370md_sync_completed = __ATTR_RO(sync_completed);
2371 2761
2372static ssize_t 2762static ssize_t
2373suspend_lo_show(mddev_t *mddev, char *page) 2763suspend_lo_show(mddev_t *mddev, char *page)
@@ -2428,11 +2818,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2428 2818
2429static struct attribute *md_default_attrs[] = { 2819static struct attribute *md_default_attrs[] = {
2430 &md_level.attr, 2820 &md_level.attr,
2821 &md_layout.attr,
2431 &md_raid_disks.attr, 2822 &md_raid_disks.attr,
2432 &md_chunk_size.attr, 2823 &md_chunk_size.attr,
2433 &md_size.attr, 2824 &md_size.attr,
2825 &md_resync_start.attr,
2434 &md_metadata.attr, 2826 &md_metadata.attr,
2435 &md_new_device.attr, 2827 &md_new_device.attr,
2828 &md_safe_delay.attr,
2829 &md_array_state.attr,
2436 NULL, 2830 NULL,
2437}; 2831};
2438 2832
@@ -2480,6 +2874,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
2480 2874
2481 if (!entry->store) 2875 if (!entry->store)
2482 return -EIO; 2876 return -EIO;
2877 if (!capable(CAP_SYS_ADMIN))
2878 return -EACCES;
2483 rv = mddev_lock(mddev); 2879 rv = mddev_lock(mddev);
2484 if (!rv) { 2880 if (!rv) {
2485 rv = entry->store(mddev, page, length); 2881 rv = entry->store(mddev, page, length);
@@ -2532,13 +2928,10 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2532 } 2928 }
2533 disk->major = MAJOR(dev); 2929 disk->major = MAJOR(dev);
2534 disk->first_minor = unit << shift; 2930 disk->first_minor = unit << shift;
2535 if (partitioned) { 2931 if (partitioned)
2536 sprintf(disk->disk_name, "md_d%d", unit); 2932 sprintf(disk->disk_name, "md_d%d", unit);
2537 sprintf(disk->devfs_name, "md/d%d", unit); 2933 else
2538 } else {
2539 sprintf(disk->disk_name, "md%d", unit); 2934 sprintf(disk->disk_name, "md%d", unit);
2540 sprintf(disk->devfs_name, "md/%d", unit);
2541 }
2542 disk->fops = &md_fops; 2935 disk->fops = &md_fops;
2543 disk->private_data = mddev; 2936 disk->private_data = mddev;
2544 disk->queue = mddev->queue; 2937 disk->queue = mddev->queue;
@@ -2553,8 +2946,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2553 return NULL; 2946 return NULL;
2554} 2947}
2555 2948
2556void md_wakeup_thread(mdk_thread_t *thread);
2557
2558static void md_safemode_timeout(unsigned long data) 2949static void md_safemode_timeout(unsigned long data)
2559{ 2950{
2560 mddev_t *mddev = (mddev_t *) data; 2951 mddev_t *mddev = (mddev_t *) data;
@@ -2708,7 +3099,7 @@ static int do_md_run(mddev_t * mddev)
2708 mddev->safemode = 0; 3099 mddev->safemode = 0;
2709 mddev->safemode_timer.function = md_safemode_timeout; 3100 mddev->safemode_timer.function = md_safemode_timeout;
2710 mddev->safemode_timer.data = (unsigned long) mddev; 3101 mddev->safemode_timer.data = (unsigned long) mddev;
2711 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 3102 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
2712 mddev->in_sync = 1; 3103 mddev->in_sync = 1;
2713 3104
2714 ITERATE_RDEV(mddev,rdev,tmp) 3105 ITERATE_RDEV(mddev,rdev,tmp)
@@ -2719,7 +3110,6 @@ static int do_md_run(mddev_t * mddev)
2719 } 3110 }
2720 3111
2721 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3112 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2722 md_wakeup_thread(mddev->thread);
2723 3113
2724 if (mddev->sb_dirty) 3114 if (mddev->sb_dirty)
2725 md_update_sb(mddev); 3115 md_update_sb(mddev);
@@ -2736,6 +3126,37 @@ static int do_md_run(mddev_t * mddev)
2736 mddev->queue->queuedata = mddev; 3126 mddev->queue->queuedata = mddev;
2737 mddev->queue->make_request_fn = mddev->pers->make_request; 3127 mddev->queue->make_request_fn = mddev->pers->make_request;
2738 3128
3129 /* If there is a partially-recovered drive we need to
3130 * start recovery here. If we leave it to md_check_recovery,
3131 * it will remove the drives and not do the right thing
3132 */
3133 if (mddev->degraded && !mddev->sync_thread) {
3134 struct list_head *rtmp;
3135 int spares = 0;
3136 ITERATE_RDEV(mddev,rdev,rtmp)
3137 if (rdev->raid_disk >= 0 &&
3138 !test_bit(In_sync, &rdev->flags) &&
3139 !test_bit(Faulty, &rdev->flags))
3140 /* complete an interrupted recovery */
3141 spares++;
3142 if (spares && mddev->pers->sync_request) {
3143 mddev->recovery = 0;
3144 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3145 mddev->sync_thread = md_register_thread(md_do_sync,
3146 mddev,
3147 "%s_resync");
3148 if (!mddev->sync_thread) {
3149 printk(KERN_ERR "%s: could not start resync"
3150 " thread...\n",
3151 mdname(mddev));
3152 /* leave the spares where they are, it shouldn't hurt */
3153 mddev->recovery = 0;
3154 }
3155 }
3156 }
3157 md_wakeup_thread(mddev->thread);
3158 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3159
2739 mddev->changed = 1; 3160 mddev->changed = 1;
2740 md_new_event(mddev); 3161 md_new_event(mddev);
2741 return 0; 3162 return 0;
@@ -2769,18 +3190,47 @@ static int restart_array(mddev_t *mddev)
2769 */ 3190 */
2770 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3191 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2771 md_wakeup_thread(mddev->thread); 3192 md_wakeup_thread(mddev->thread);
3193 md_wakeup_thread(mddev->sync_thread);
2772 err = 0; 3194 err = 0;
2773 } else { 3195 } else
2774 printk(KERN_ERR "md: %s has no personality assigned.\n",
2775 mdname(mddev));
2776 err = -EINVAL; 3196 err = -EINVAL;
2777 }
2778 3197
2779out: 3198out:
2780 return err; 3199 return err;
2781} 3200}
2782 3201
2783static int do_md_stop(mddev_t * mddev, int ro) 3202/* similar to deny_write_access, but accounts for our holding a reference
3203 * to the file ourselves */
3204static int deny_bitmap_write_access(struct file * file)
3205{
3206 struct inode *inode = file->f_mapping->host;
3207
3208 spin_lock(&inode->i_lock);
3209 if (atomic_read(&inode->i_writecount) > 1) {
3210 spin_unlock(&inode->i_lock);
3211 return -ETXTBSY;
3212 }
3213 atomic_set(&inode->i_writecount, -1);
3214 spin_unlock(&inode->i_lock);
3215
3216 return 0;
3217}
3218
3219static void restore_bitmap_write_access(struct file *file)
3220{
3221 struct inode *inode = file->f_mapping->host;
3222
3223 spin_lock(&inode->i_lock);
3224 atomic_set(&inode->i_writecount, 1);
3225 spin_unlock(&inode->i_lock);
3226}
3227
3228/* mode:
3229 * 0 - completely stop and dis-assemble array
3230 * 1 - switch to readonly
3231 * 2 - stop but do not disassemble array
3232 */
3233static int do_md_stop(mddev_t * mddev, int mode)
2784{ 3234{
2785 int err = 0; 3235 int err = 0;
2786 struct gendisk *disk = mddev->gendisk; 3236 struct gendisk *disk = mddev->gendisk;
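
Note: deny_bitmap_write_access(), now paired with restore_bitmap_write_access(), plays the same trick as deny_write_access(): a positive i_writecount counts write references (md itself holds one), while -1 means further write opens are refused. A toy model of the state transitions, using a plain integer and no locking, purely for illustration:

    #include <stdio.h>

    /* Toy model of the bitmap file's i_writecount:
     *   >0  writable; the value counts write references (md holds one)
     *   -1  all further write opens are refused
     */
    static int i_writecount = 1;    /* md's own reference after opening the file */

    static int deny_bitmap_write_access(void)
    {
        if (i_writecount > 1)
            return -1;              /* someone else has it open for write: -ETXTBSY */
        i_writecount = -1;          /* block any future write opens */
        return 0;
    }

    static void restore_bitmap_write_access(void)
    {
        i_writecount = 1;           /* back to "only md's reference" */
    }

    int main(void)
    {
        if (deny_bitmap_write_access() == 0)
            printf("bitmap file locked for md's exclusive use\n");
        restore_bitmap_write_access();   /* paired on stop / bitmap removal */
        return 0;
    }

do_md_stop() and set_bitmap_file() call the restore helper before fput() instead of poking i_writecount directly.
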
@@ -2792,6 +3242,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2792 } 3242 }
2793 3243
2794 if (mddev->sync_thread) { 3244 if (mddev->sync_thread) {
3245 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2795 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3246 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2796 md_unregister_thread(mddev->sync_thread); 3247 md_unregister_thread(mddev->sync_thread);
2797 mddev->sync_thread = NULL; 3248 mddev->sync_thread = NULL;
@@ -2801,12 +3252,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2801 3252
2802 invalidate_partition(disk, 0); 3253 invalidate_partition(disk, 0);
2803 3254
2804 if (ro) { 3255 switch(mode) {
3256 case 1: /* readonly */
2805 err = -ENXIO; 3257 err = -ENXIO;
2806 if (mddev->ro==1) 3258 if (mddev->ro==1)
2807 goto out; 3259 goto out;
2808 mddev->ro = 1; 3260 mddev->ro = 1;
2809 } else { 3261 break;
3262 case 0: /* disassemble */
3263 case 2: /* stop */
2810 bitmap_flush(mddev); 3264 bitmap_flush(mddev);
2811 md_super_wait(mddev); 3265 md_super_wait(mddev);
2812 if (mddev->ro) 3266 if (mddev->ro)
@@ -2821,19 +3275,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
2821 if (mddev->ro) 3275 if (mddev->ro)
2822 mddev->ro = 0; 3276 mddev->ro = 0;
2823 } 3277 }
2824 if (!mddev->in_sync) { 3278 if (!mddev->in_sync || mddev->sb_dirty) {
2825 /* mark array as shutdown cleanly */ 3279 /* mark array as shutdown cleanly */
2826 mddev->in_sync = 1; 3280 mddev->in_sync = 1;
2827 md_update_sb(mddev); 3281 md_update_sb(mddev);
2828 } 3282 }
2829 if (ro) 3283 if (mode == 1)
2830 set_disk_ro(disk, 1); 3284 set_disk_ro(disk, 1);
3285 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
2831 } 3286 }
2832 3287
2833 /* 3288 /*
2834 * Free resources if final stop 3289 * Free resources if final stop
2835 */ 3290 */
2836 if (!ro) { 3291 if (mode == 0) {
2837 mdk_rdev_t *rdev; 3292 mdk_rdev_t *rdev;
2838 struct list_head *tmp; 3293 struct list_head *tmp;
2839 struct gendisk *disk; 3294 struct gendisk *disk;
@@ -2841,7 +3296,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2841 3296
2842 bitmap_destroy(mddev); 3297 bitmap_destroy(mddev);
2843 if (mddev->bitmap_file) { 3298 if (mddev->bitmap_file) {
2844 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 3299 restore_bitmap_write_access(mddev->bitmap_file);
2845 fput(mddev->bitmap_file); 3300 fput(mddev->bitmap_file);
2846 mddev->bitmap_file = NULL; 3301 mddev->bitmap_file = NULL;
2847 } 3302 }
@@ -2857,11 +3312,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
2857 export_array(mddev); 3312 export_array(mddev);
2858 3313
2859 mddev->array_size = 0; 3314 mddev->array_size = 0;
3315 mddev->size = 0;
3316 mddev->raid_disks = 0;
3317 mddev->recovery_cp = 0;
3318
2860 disk = mddev->gendisk; 3319 disk = mddev->gendisk;
2861 if (disk) 3320 if (disk)
2862 set_capacity(disk, 0); 3321 set_capacity(disk, 0);
2863 mddev->changed = 1; 3322 mddev->changed = 1;
2864 } else 3323 } else if (mddev->pers)
2865 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3324 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2866 mdname(mddev)); 3325 mdname(mddev));
2867 err = 0; 3326 err = 0;
@@ -3264,6 +3723,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3264 3723
3265 rdev->raid_disk = -1; 3724 rdev->raid_disk = -1;
3266 err = bind_rdev_to_array(rdev, mddev); 3725 err = bind_rdev_to_array(rdev, mddev);
3726 if (!err && !mddev->pers->hot_remove_disk) {
3727 /* If there is hot_add_disk but no hot_remove_disk
3728 * then added disks are for geometry changes,
3729 * and should be added immediately.
3730 */
3731 super_types[mddev->major_version].
3732 validate_super(mddev, rdev);
3733 err = mddev->pers->hot_add_disk(mddev, rdev);
3734 if (err)
3735 unbind_rdev_from_array(rdev);
3736 }
3267 if (err) 3737 if (err)
3268 export_rdev(rdev); 3738 export_rdev(rdev);
3269 3739
@@ -3434,23 +3904,6 @@ abort_export:
3434 return err; 3904 return err;
3435} 3905}
3436 3906
3437/* similar to deny_write_access, but accounts for our holding a reference
3438 * to the file ourselves */
3439static int deny_bitmap_write_access(struct file * file)
3440{
3441 struct inode *inode = file->f_mapping->host;
3442
3443 spin_lock(&inode->i_lock);
3444 if (atomic_read(&inode->i_writecount) > 1) {
3445 spin_unlock(&inode->i_lock);
3446 return -ETXTBSY;
3447 }
3448 atomic_set(&inode->i_writecount, -1);
3449 spin_unlock(&inode->i_lock);
3450
3451 return 0;
3452}
3453
3454static int set_bitmap_file(mddev_t *mddev, int fd) 3907static int set_bitmap_file(mddev_t *mddev, int fd)
3455{ 3908{
3456 int err; 3909 int err;
@@ -3491,12 +3944,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
3491 mddev->pers->quiesce(mddev, 1); 3944 mddev->pers->quiesce(mddev, 1);
3492 if (fd >= 0) 3945 if (fd >= 0)
3493 err = bitmap_create(mddev); 3946 err = bitmap_create(mddev);
3494 if (fd < 0 || err) 3947 if (fd < 0 || err) {
3495 bitmap_destroy(mddev); 3948 bitmap_destroy(mddev);
3949 fd = -1; /* make sure to put the file */
3950 }
3496 mddev->pers->quiesce(mddev, 0); 3951 mddev->pers->quiesce(mddev, 0);
3497 } else if (fd < 0) { 3952 }
3498 if (mddev->bitmap_file) 3953 if (fd < 0) {
3954 if (mddev->bitmap_file) {
3955 restore_bitmap_write_access(mddev->bitmap_file);
3499 fput(mddev->bitmap_file); 3956 fput(mddev->bitmap_file);
3957 }
3500 mddev->bitmap_file = NULL; 3958 mddev->bitmap_file = NULL;
3501 } 3959 }
3502 3960
@@ -3977,11 +4435,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
3977 goto done_unlock; 4435 goto done_unlock;
3978 4436
3979 default: 4437 default:
3980 if (_IOC_TYPE(cmd) == MD_MAJOR)
3981 printk(KERN_WARNING "md: %s(pid %d) used"
3982 " obsolete MD ioctl, upgrade your"
3983 " software to use new ictls.\n",
3984 current->comm, current->pid);
3985 err = -EINVAL; 4438 err = -EINVAL;
3986 goto abort_unlock; 4439 goto abort_unlock;
3987 } 4440 }
@@ -4152,6 +4605,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4152 __builtin_return_address(0),__builtin_return_address(1), 4605 __builtin_return_address(0),__builtin_return_address(1),
4153 __builtin_return_address(2),__builtin_return_address(3)); 4606 __builtin_return_address(2),__builtin_return_address(3));
4154*/ 4607*/
4608 if (!mddev->pers)
4609 return;
4155 if (!mddev->pers->error_handler) 4610 if (!mddev->pers->error_handler)
4156 return; 4611 return;
4157 mddev->pers->error_handler(mddev,rdev); 4612 mddev->pers->error_handler(mddev,rdev);
@@ -4249,12 +4704,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
4249 */ 4704 */
4250 dt = ((jiffies - mddev->resync_mark) / HZ); 4705 dt = ((jiffies - mddev->resync_mark) / HZ);
4251 if (!dt) dt++; 4706 if (!dt) dt++;
4252 db = resync - (mddev->resync_mark_cnt/2); 4707 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4253 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100; 4708 - mddev->resync_mark_cnt;
4709 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4254 4710
4255 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4711 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4256 4712
4257 seq_printf(seq, " speed=%ldK/sec", db/dt); 4713 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4258} 4714}
4259 4715
4260static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4716static void *md_seq_start(struct seq_file *seq, loff_t *pos)
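
Note: both sync_speed and the /proc/mdstat estimate now start from curr_mark_cnt (sectors issued since the last mark) minus the still-outstanding recovery_active, and only then divide by two to convert 512-byte sectors into KiB. A standalone sketch of the arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long curr_mark_cnt   = 262144; /* sectors issued since resync_mark   */
        unsigned long recovery_active = 2048;   /* sectors submitted but not finished  */
        unsigned long resync_mark_cnt = 0;      /* sectors already counted at the mark */
        unsigned long dt = 30;                  /* seconds since resync_mark           */

        /* completed sectors in the window, then KiB (512-byte sectors / 2) */
        unsigned long db = (curr_mark_cnt - recovery_active) - resync_mark_cnt;
        unsigned long speed = db / 2 / dt;      /* KiB per second, as printed */

        /* remaining work and rough ETA, mirroring status_resync() */
        unsigned long max_blocks = 4194304, resync = 262144;   /* KiB totals */
        unsigned long rt = (dt * ((max_blocks - resync) / (db / 2 / 100 + 1))) / 100;

        printf("speed=%luK/sec finish=%lu.%lumin\n", speed, rt / 60, (rt % 60) / 6);
        return 0;
    }
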
@@ -4586,7 +5042,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
4586 spin_lock_irq(&mddev->write_lock); 5042 spin_lock_irq(&mddev->write_lock);
4587 if (mddev->in_sync) { 5043 if (mddev->in_sync) {
4588 mddev->in_sync = 0; 5044 mddev->in_sync = 0;
4589 mddev->sb_dirty = 1; 5045 mddev->sb_dirty = 3;
4590 md_wakeup_thread(mddev->thread); 5046 md_wakeup_thread(mddev->thread);
4591 } 5047 }
4592 spin_unlock_irq(&mddev->write_lock); 5048 spin_unlock_irq(&mddev->write_lock);
@@ -4599,7 +5055,7 @@ void md_write_end(mddev_t *mddev)
4599 if (atomic_dec_and_test(&mddev->writes_pending)) { 5055 if (atomic_dec_and_test(&mddev->writes_pending)) {
4600 if (mddev->safemode == 2) 5056 if (mddev->safemode == 2)
4601 md_wakeup_thread(mddev->thread); 5057 md_wakeup_thread(mddev->thread);
4602 else 5058 else if (mddev->safemode_delay)
4603 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5059 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
4604 } 5060 }
4605} 5061}
@@ -4620,10 +5076,14 @@ void md_do_sync(mddev_t *mddev)
4620 struct list_head *tmp; 5076 struct list_head *tmp;
4621 sector_t last_check; 5077 sector_t last_check;
4622 int skipped = 0; 5078 int skipped = 0;
5079 struct list_head *rtmp;
5080 mdk_rdev_t *rdev;
4623 5081
4624 /* just in case thread restarts... */ 5082 /* just in case thread restarts... */
4625 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5083 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
4626 return; 5084 return;
5085 if (mddev->ro) /* never try to sync a read-only array */
5086 return;
4627 5087
4628 /* we overload curr_resync somewhat here. 5088 /* we overload curr_resync somewhat here.
4629 * 0 == not engaged in resync at all 5089 * 0 == not engaged in resync at all
@@ -4682,17 +5142,30 @@ void md_do_sync(mddev_t *mddev)
4682 } 5142 }
4683 } while (mddev->curr_resync < 2); 5143 } while (mddev->curr_resync < 2);
4684 5144
5145 j = 0;
4685 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5146 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4686 /* resync follows the size requested by the personality, 5147 /* resync follows the size requested by the personality,
4687 * which defaults to physical size, but can be virtual size 5148 * which defaults to physical size, but can be virtual size
4688 */ 5149 */
4689 max_sectors = mddev->resync_max_sectors; 5150 max_sectors = mddev->resync_max_sectors;
4690 mddev->resync_mismatches = 0; 5151 mddev->resync_mismatches = 0;
5152 /* we don't use the checkpoint if there's a bitmap */
5153 if (!mddev->bitmap &&
5154 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5155 j = mddev->recovery_cp;
4691 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5156 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4692 max_sectors = mddev->size << 1; 5157 max_sectors = mddev->size << 1;
4693 else 5158 else {
4694 /* recovery follows the physical size of devices */ 5159 /* recovery follows the physical size of devices */
4695 max_sectors = mddev->size << 1; 5160 max_sectors = mddev->size << 1;
5161 j = MaxSector;
5162 ITERATE_RDEV(mddev,rdev,rtmp)
5163 if (rdev->raid_disk >= 0 &&
5164 !test_bit(Faulty, &rdev->flags) &&
5165 !test_bit(In_sync, &rdev->flags) &&
5166 rdev->recovery_offset < j)
5167 j = rdev->recovery_offset;
5168 }
4696 5169
4697 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 5170 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4698 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 5171 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
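
Note: with per-device recovery_offset available, a recovery pass no longer always starts at sector 0; it starts at the lowest checkpoint among members that are still rebuilding (resync, by contrast, still uses recovery_cp unless a bitmap is present). A simplified sketch of that selection, using a hypothetical member array instead of the rdev list:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SECTOR (~(uint64_t)0)   /* stand-in for the kernel's MaxSector */

    struct member {
        int      raid_disk;
        bool     faulty, in_sync;
        uint64_t recovery_offset;       /* sectors already rebuilt on this member */
    };

    /* Recovery restarts from the least-recovered member still being rebuilt. */
    static uint64_t recovery_start(const struct member *m, int n)
    {
        uint64_t j = MAX_SECTOR;
        for (int i = 0; i < n; i++)
            if (m[i].raid_disk >= 0 && !m[i].faulty && !m[i].in_sync &&
                m[i].recovery_offset < j)
                j = m[i].recovery_offset;
        return j == MAX_SECTOR ? 0 : j; /* no partial member: this sketch starts at 0 */
    }

    int main(void)
    {
        struct member m[] = {
            { 0, false, true,  0 },         /* healthy, fully in-sync member   */
            { 1, false, false, 1 << 20 },   /* rebuild has reached this sector */
        };
        printf("%llu\n", (unsigned long long)recovery_start(m, 2));
        return 0;
    }
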
@@ -4702,12 +5175,7 @@ void md_do_sync(mddev_t *mddev)
4702 speed_max(mddev)); 5175 speed_max(mddev));
4703 5176
4704 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5177 is_mddev_idle(mddev); /* this also initializes IO event counters */
4705 /* we don't use the checkpoint if there's a bitmap */ 5178
4706 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
4707 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4708 j = mddev->recovery_cp;
4709 else
4710 j = 0;
4711 io_sectors = 0; 5179 io_sectors = 0;
4712 for (m = 0; m < SYNC_MARKS; m++) { 5180 for (m = 0; m < SYNC_MARKS; m++) {
4713 mark[m] = jiffies; 5181 mark[m] = jiffies;
@@ -4753,6 +5221,7 @@ void md_do_sync(mddev_t *mddev)
4753 5221
4754 j += sectors; 5222 j += sectors;
4755 if (j>1) mddev->curr_resync = j; 5223 if (j>1) mddev->curr_resync = j;
5224 mddev->curr_mark_cnt = io_sectors;
4756 if (last_check == 0) 5225 if (last_check == 0)
4757 /* this is the earliest that rebuild will be 5226 /* this is the earliest that rebuild will be
4758 * visible in /proc/mdstat 5227 * visible in /proc/mdstat
@@ -4828,15 +5297,28 @@ void md_do_sync(mddev_t *mddev)
4828 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5297 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4829 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5298 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
4830 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5299 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
4831 mddev->curr_resync > 2 && 5300 mddev->curr_resync > 2) {
4832 mddev->curr_resync >= mddev->recovery_cp) { 5301 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4833 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5302 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4834 printk(KERN_INFO 5303 if (mddev->curr_resync >= mddev->recovery_cp) {
4835 "md: checkpointing recovery of %s.\n", 5304 printk(KERN_INFO
4836 mdname(mddev)); 5305 "md: checkpointing recovery of %s.\n",
4837 mddev->recovery_cp = mddev->curr_resync; 5306 mdname(mddev));
4838 } else 5307 mddev->recovery_cp = mddev->curr_resync;
4839 mddev->recovery_cp = MaxSector; 5308 }
5309 } else
5310 mddev->recovery_cp = MaxSector;
5311 } else {
5312 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5313 mddev->curr_resync = MaxSector;
5314 ITERATE_RDEV(mddev,rdev,rtmp)
5315 if (rdev->raid_disk >= 0 &&
5316 !test_bit(Faulty, &rdev->flags) &&
5317 !test_bit(In_sync, &rdev->flags) &&
5318 rdev->recovery_offset < mddev->curr_resync)
5319 rdev->recovery_offset = mddev->curr_resync;
5320 mddev->sb_dirty = 1;
5321 }
4840 } 5322 }
4841 5323
4842 skip: 5324 skip:
@@ -4908,7 +5390,7 @@ void md_check_recovery(mddev_t *mddev)
4908 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5390 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
4909 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5391 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
4910 mddev->in_sync = 1; 5392 mddev->in_sync = 1;
4911 mddev->sb_dirty = 1; 5393 mddev->sb_dirty = 3;
4912 } 5394 }
4913 if (mddev->safemode == 1) 5395 if (mddev->safemode == 1)
4914 mddev->safemode = 0; 5396 mddev->safemode = 0;
@@ -4957,6 +5439,8 @@ void md_check_recovery(mddev_t *mddev)
4957 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5439 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
4958 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5440 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4959 5441
5442 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5443 goto unlock;
4960 /* no recovery is running. 5444 /* no recovery is running.
4961 * remove any failed drives, then 5445 * remove any failed drives, then
4962 * add spares if possible. 5446 * add spares if possible.
@@ -4979,6 +5463,7 @@ void md_check_recovery(mddev_t *mddev)
4979 ITERATE_RDEV(mddev,rdev,rtmp) 5463 ITERATE_RDEV(mddev,rdev,rtmp)
4980 if (rdev->raid_disk < 0 5464 if (rdev->raid_disk < 0
4981 && !test_bit(Faulty, &rdev->flags)) { 5465 && !test_bit(Faulty, &rdev->flags)) {
5466 rdev->recovery_offset = 0;
4982 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5467 if (mddev->pers->hot_add_disk(mddev,rdev)) {
4983 char nm[20]; 5468 char nm[20];
4984 sprintf(nm, "rd%d", rdev->raid_disk); 5469 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5071,8 +5556,6 @@ static void md_geninit(void)
5071 5556
5072static int __init md_init(void) 5557static int __init md_init(void)
5073{ 5558{
5074 int minor;
5075
5076 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 5559 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5077 " MD_SB_DISKS=%d\n", 5560 " MD_SB_DISKS=%d\n",
5078 MD_MAJOR_VERSION, MD_MINOR_VERSION, 5561 MD_MAJOR_VERSION, MD_MINOR_VERSION,
@@ -5086,23 +5569,11 @@ static int __init md_init(void)
5086 unregister_blkdev(MAJOR_NR, "md"); 5569 unregister_blkdev(MAJOR_NR, "md");
5087 return -1; 5570 return -1;
5088 } 5571 }
5089 devfs_mk_dir("md");
5090 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 5572 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5091 md_probe, NULL, NULL); 5573 md_probe, NULL, NULL);
5092 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 5574 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5093 md_probe, NULL, NULL); 5575 md_probe, NULL, NULL);
5094 5576
5095 for (minor=0; minor < MAX_MD_DEVS; ++minor)
5096 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
5097 S_IFBLK|S_IRUSR|S_IWUSR,
5098 "md/%d", minor);
5099
5100 for (minor=0; minor < MAX_MD_DEVS; ++minor)
5101 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
5102 S_IFBLK|S_IRUSR|S_IWUSR,
5103 "md/mdp%d", minor);
5104
5105
5106 register_reboot_notifier(&md_notifier); 5577 register_reboot_notifier(&md_notifier);
5107 raid_table_header = register_sysctl_table(raid_root_table, 1); 5578 raid_table_header = register_sysctl_table(raid_root_table, 1);
5108 5579
@@ -5158,15 +5629,9 @@ static __exit void md_exit(void)
5158{ 5629{
5159 mddev_t *mddev; 5630 mddev_t *mddev;
5160 struct list_head *tmp; 5631 struct list_head *tmp;
5161 int i; 5632
5162 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 5633 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5163 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 5634 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5164 for (i=0; i < MAX_MD_DEVS; i++)
5165 devfs_remove("md/%d", i);
5166 for (i=0; i < MAX_MD_DEVS; i++)
5167 devfs_remove("md/d%d", i);
5168
5169 devfs_remove("md");
5170 5635
5171 unregister_blkdev(MAJOR_NR,"md"); 5636 unregister_blkdev(MAJOR_NR,"md");
5172 unregister_blkdev(mdp_major, "mdp"); 5637 unregister_blkdev(mdp_major, "mdp");
@@ -5203,8 +5668,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
5203 return -EINVAL; 5668 return -EINVAL;
5204} 5669}
5205 5670
5206module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 5671module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5207module_param(start_dirty_degraded, int, 0644); 5672module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
5208 5673
5209 5674
5210EXPORT_SYMBOL(register_md_personality); 5675EXPORT_SYMBOL(register_md_personality);
@@ -5216,7 +5681,6 @@ EXPORT_SYMBOL(md_write_end);
5216EXPORT_SYMBOL(md_register_thread); 5681EXPORT_SYMBOL(md_register_thread);
5217EXPORT_SYMBOL(md_unregister_thread); 5682EXPORT_SYMBOL(md_unregister_thread);
5218EXPORT_SYMBOL(md_wakeup_thread); 5683EXPORT_SYMBOL(md_wakeup_thread);
5219EXPORT_SYMBOL(md_print_devices);
5220EXPORT_SYMBOL(md_check_recovery); 5684EXPORT_SYMBOL(md_check_recovery);
5221MODULE_LICENSE("GPL"); 5685MODULE_LICENSE("GPL");
5222MODULE_ALIAS("md"); 5686MODULE_ALIAS("md");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4070eff6f0f8..3b4d69c05623 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -374,26 +374,26 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
374 * already. 374 * already.
375 */ 375 */
376 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
378 reschedule_retry(r1_bio); 378 reschedule_retry(r1_bio);
379 goto out; 379 else {
380 } 380 /* it really is the end of this request */
381 /* it really is the end of this request */ 381 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 382 /* free extra copy of the data pages */
383 /* free extra copy of the data pages */ 383 int i = bio->bi_vcnt;
384 int i = bio->bi_vcnt; 384 while (i--)
385 while (i--) 385 safe_put_page(bio->bi_io_vec[i].bv_page);
386 safe_put_page(bio->bi_io_vec[i].bv_page); 386 }
387 /* clear the bitmap if all writes complete successfully */
388 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
389 r1_bio->sectors,
390 !test_bit(R1BIO_Degraded, &r1_bio->state),
391 behind);
392 md_write_end(r1_bio->mddev);
393 raid_end_bio_io(r1_bio);
387 } 394 }
388 /* clear the bitmap if all writes complete successfully */
389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390 r1_bio->sectors,
391 !test_bit(R1BIO_Degraded, &r1_bio->state),
392 behind);
393 md_write_end(r1_bio->mddev);
394 raid_end_bio_io(r1_bio);
395 } 395 }
396 out: 396
397 if (to_put) 397 if (to_put)
398 bio_put(to_put); 398 bio_put(to_put);
399 399
@@ -930,10 +930,13 @@ static void status(struct seq_file *seq, mddev_t *mddev)
930 930
931 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 931 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
932 conf->working_disks); 932 conf->working_disks);
933 for (i = 0; i < conf->raid_disks; i++) 933 rcu_read_lock();
934 for (i = 0; i < conf->raid_disks; i++) {
935 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
934 seq_printf(seq, "%s", 936 seq_printf(seq, "%s",
935 conf->mirrors[i].rdev && 937 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
936 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 938 }
939 rcu_read_unlock();
937 seq_printf(seq, "]"); 940 seq_printf(seq, "]");
938} 941}
939 942
@@ -975,7 +978,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
975static void print_conf(conf_t *conf) 978static void print_conf(conf_t *conf)
976{ 979{
977 int i; 980 int i;
978 mirror_info_t *tmp;
979 981
980 printk("RAID1 conf printout:\n"); 982 printk("RAID1 conf printout:\n");
981 if (!conf) { 983 if (!conf) {
@@ -985,14 +987,17 @@ static void print_conf(conf_t *conf)
985 printk(" --- wd:%d rd:%d\n", conf->working_disks, 987 printk(" --- wd:%d rd:%d\n", conf->working_disks,
986 conf->raid_disks); 988 conf->raid_disks);
987 989
990 rcu_read_lock();
988 for (i = 0; i < conf->raid_disks; i++) { 991 for (i = 0; i < conf->raid_disks; i++) {
989 char b[BDEVNAME_SIZE]; 992 char b[BDEVNAME_SIZE];
990 tmp = conf->mirrors + i; 993 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
991 if (tmp->rdev) 994 if (rdev)
992 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 995 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
993 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), 996 i, !test_bit(In_sync, &rdev->flags),
994 bdevname(tmp->rdev->bdev,b)); 997 !test_bit(Faulty, &rdev->flags),
998 bdevname(rdev->bdev,b));
995 } 999 }
1000 rcu_read_unlock();
996} 1001}
997 1002
998static void close_sync(conf_t *conf) 1003static void close_sync(conf_t *conf)
@@ -1008,20 +1013,20 @@ static int raid1_spare_active(mddev_t *mddev)
1008{ 1013{
1009 int i; 1014 int i;
1010 conf_t *conf = mddev->private; 1015 conf_t *conf = mddev->private;
1011 mirror_info_t *tmp;
1012 1016
1013 /* 1017 /*
1014 * Find all failed disks within the RAID1 configuration 1018 * Find all failed disks within the RAID1 configuration
1015 * and mark them readable 1019 * and mark them readable.
1020 * Called under mddev lock, so rcu protection not needed.
1016 */ 1021 */
1017 for (i = 0; i < conf->raid_disks; i++) { 1022 for (i = 0; i < conf->raid_disks; i++) {
1018 tmp = conf->mirrors + i; 1023 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1019 if (tmp->rdev 1024 if (rdev
1020 && !test_bit(Faulty, &tmp->rdev->flags) 1025 && !test_bit(Faulty, &rdev->flags)
1021 && !test_bit(In_sync, &tmp->rdev->flags)) { 1026 && !test_bit(In_sync, &rdev->flags)) {
1022 conf->working_disks++; 1027 conf->working_disks++;
1023 mddev->degraded--; 1028 mddev->degraded--;
1024 set_bit(In_sync, &tmp->rdev->flags); 1029 set_bit(In_sync, &rdev->flags);
1025 } 1030 }
1026 } 1031 }
1027 1032
@@ -1145,7 +1150,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1145 long sectors_to_go = r1_bio->sectors; 1150 long sectors_to_go = r1_bio->sectors;
1146 /* make sure these bits don't get cleared. */ 1151 /* make sure these bits don't get cleared. */
1147 do { 1152 do {
1148 bitmap_end_sync(mddev->bitmap, r1_bio->sector, 1153 bitmap_end_sync(mddev->bitmap, s,
1149 &sync_blocks, 1); 1154 &sync_blocks, 1);
1150 s += sync_blocks; 1155 s += sync_blocks;
1151 sectors_to_go -= sync_blocks; 1156 sectors_to_go -= sync_blocks;
@@ -1237,7 +1242,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1237 /* ouch - failed to read all of that. 1242 /* ouch - failed to read all of that.
1238 * Try some synchronous reads of other devices to get 1243 * Try some synchronous reads of other devices to get
1239 * good data, much like with normal read errors. Only 1244 * good data, much like with normal read errors. Only
1240 * read into the pages we already have so they we don't 1245 * read into the pages we already have so we don't
1241 * need to re-issue the read request. 1246 * need to re-issue the read request.
1242 * We don't need to freeze the array, because being in an 1247 * We don't need to freeze the array, because being in an
1243 * active sync request, there is no normal IO, and 1248 * active sync request, there is no normal IO, and
@@ -1257,6 +1262,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1257 s = PAGE_SIZE >> 9; 1262 s = PAGE_SIZE >> 9;
1258 do { 1263 do {
1259 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1264 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1265 /* No rcu protection needed here; devices
1266 * can only be removed when no resync is
1267 * active, and resync is currently active
1268 */
1260 rdev = conf->mirrors[d].rdev; 1269 rdev = conf->mirrors[d].rdev;
1261 if (sync_page_io(rdev->bdev, 1270 if (sync_page_io(rdev->bdev,
1262 sect + rdev->data_offset, 1271 sect + rdev->data_offset,
@@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev)
1463 s = PAGE_SIZE >> 9; 1472 s = PAGE_SIZE >> 9;
1464 1473
1465 do { 1474 do {
1475 /* Note: no rcu protection needed here
1476 * as this is synchronous in the raid1d thread
1477 * which is the thread that might remove
1478 * a device. If raid1d ever becomes multi-threaded....
1479 */
1466 rdev = conf->mirrors[d].rdev; 1480 rdev = conf->mirrors[d].rdev;
1467 if (rdev && 1481 if (rdev &&
1468 test_bit(In_sync, &rdev->flags) && 1482 test_bit(In_sync, &rdev->flags) &&
@@ -1486,7 +1500,6 @@ static void raid1d(mddev_t *mddev)
1486 d = conf->raid_disks; 1500 d = conf->raid_disks;
1487 d--; 1501 d--;
1488 rdev = conf->mirrors[d].rdev; 1502 rdev = conf->mirrors[d].rdev;
1489 atomic_add(s, &rdev->corrected_errors);
1490 if (rdev && 1503 if (rdev &&
1491 test_bit(In_sync, &rdev->flags)) { 1504 test_bit(In_sync, &rdev->flags)) {
1492 if (sync_page_io(rdev->bdev, 1505 if (sync_page_io(rdev->bdev,
@@ -1509,6 +1522,11 @@ static void raid1d(mddev_t *mddev)
1509 s<<9, conf->tmppage, READ) == 0) 1522 s<<9, conf->tmppage, READ) == 0)
1510 /* Well, this device is dead */ 1523 /* Well, this device is dead */
1511 md_error(mddev, rdev); 1524 md_error(mddev, rdev);
1525 else {
1526 atomic_add(s, &rdev->corrected_errors);
1527 printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
1528 mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
1529 }
1512 } 1530 }
1513 } 1531 }
1514 } else { 1532 } else {
@@ -1622,6 +1640,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1622 return 0; 1640 return 0;
1623 } 1641 }
1624 1642
1643 if (mddev->bitmap == NULL &&
1644 mddev->recovery_cp == MaxSector &&
1645 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1646 conf->fullsync == 0) {
1647 *skipped = 1;
1648 return max_sector - sector_nr;
1649 }
1625 /* before building a request, check if we can skip these blocks.. 1650 /* before building a request, check if we can skip these blocks..
1626 * This call the bitmap_start_sync doesn't actually record anything 1651 * This call the bitmap_start_sync doesn't actually record anything
1627 */ 1652 */
@@ -1777,19 +1802,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1777 for (i=0; i<conf->raid_disks; i++) { 1802 for (i=0; i<conf->raid_disks; i++) {
1778 bio = r1_bio->bios[i]; 1803 bio = r1_bio->bios[i];
1779 if (bio->bi_end_io == end_sync_read) { 1804 if (bio->bi_end_io == end_sync_read) {
1780 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); 1805 md_sync_acct(bio->bi_bdev, nr_sectors);
1781 generic_make_request(bio); 1806 generic_make_request(bio);
1782 } 1807 }
1783 } 1808 }
1784 } else { 1809 } else {
1785 atomic_set(&r1_bio->remaining, 1); 1810 atomic_set(&r1_bio->remaining, 1);
1786 bio = r1_bio->bios[r1_bio->read_disk]; 1811 bio = r1_bio->bios[r1_bio->read_disk];
1787 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, 1812 md_sync_acct(bio->bi_bdev, nr_sectors);
1788 nr_sectors);
1789 generic_make_request(bio); 1813 generic_make_request(bio);
1790 1814
1791 } 1815 }
1792
1793 return nr_sectors; 1816 return nr_sectors;
1794} 1817}
1795 1818
@@ -1888,7 +1911,8 @@ static int run(mddev_t *mddev)
1888 1911
1889 disk = conf->mirrors + i; 1912 disk = conf->mirrors + i;
1890 1913
1891 if (!disk->rdev) { 1914 if (!disk->rdev ||
1915 !test_bit(In_sync, &disk->rdev->flags)) {
1892 disk->head_position = 0; 1916 disk->head_position = 0;
1893 mddev->degraded++; 1917 mddev->degraded++;
1894 } 1918 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1440935414e6..016ddb831c9b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,6 +29,7 @@
29 * raid_disks 29 * raid_disks
30 * near_copies (stored in low byte of layout) 30 * near_copies (stored in low byte of layout)
31 * far_copies (stored in second byte of layout) 31 * far_copies (stored in second byte of layout)
32 * far_offset (stored in bit 16 of layout)
32 * 33 *
33 * The data to be stored is divided into chunks using chunksize. 34 * The data to be stored is divided into chunks using chunksize.
34 * Each device is divided into far_copies sections. 35 * Each device is divided into far_copies sections.
@@ -36,10 +37,14 @@
36 * near_copies copies of each chunk is stored (each on a different drive). 37 * near_copies copies of each chunk is stored (each on a different drive).
37 * The starting device for each section is offset near_copies from the starting 38 * The starting device for each section is offset near_copies from the starting
38 * device of the previous section. 39 * device of the previous section.
39 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different 40 * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
40 * drive. 41 * drive.
41 * near_copies and far_copies must be at least one, and their product is at most 42 * near_copies and far_copies must be at least one, and their product is at most
42 * raid_disks. 43 * raid_disks.
44 *
45 * If far_offset is true, then the far_copies are handled a bit differently.
46 * The copies are still in different stripes, but instead of being very far apart
47 * on disk, they are in adjacent stripes.
43 */ 48 */
44 49
45/* 50/*
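
The layout rules described in the comment above are easier to see with a small
user-space model. The sketch below is illustrative only and is not part of the
patch; it follows the arithmetic of raid10_find_phys() for an assumed geometry
of 4 disks, 64-sector chunks, near_copies=2 and far_copies=2 with far_offset
set (layout word 0x10202), and "stride" stands in for conf->stride, the
per-device distance between far copies.

#include <stdio.h>

struct layout {
	int raid_disks;
	int near_copies;	/* low byte of mddev->layout */
	int far_copies;		/* second byte of mddev->layout */
	int far_offset;		/* bit 16 of mddev->layout */
	long chunk;		/* chunk size, in sectors */
	long stride;		/* distance between far copies, in sectors */
};

/* Print every physical (device, sector) copy of one virtual sector,
 * following the same steps as raid10_find_phys(). */
static void find_phys(const struct layout *l, long vsector)
{
	long chunk = vsector / l->chunk;
	long offset = vsector % l->chunk;
	long stripe, sector;
	int dev, n, f, copy = 0;

	chunk *= l->near_copies;
	stripe = chunk / l->raid_disks;
	dev = (int)(chunk % l->raid_disks);
	if (l->far_offset)		/* far copies sit on the very next stripes */
		stripe *= l->far_copies;
	sector = stripe * l->chunk + offset;

	for (n = 0; n < l->near_copies; n++) {
		int d = dev;
		long s = sector;

		for (f = 0; f < l->far_copies; f++) {
			printf("copy %d: device %d, sector %ld\n", copy++, d, s);
			d = (d + l->near_copies) % l->raid_disks;
			s += l->stride;
		}
		if (++dev >= l->raid_disks) {
			dev = 0;
			sector += l->chunk;	/* next near copy wraps to the next chunk */
		}
	}
}

int main(void)
{
	/* assumed geometry, not taken from the patch */
	struct layout l = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 2,
		.far_offset = 1, .chunk = 64, .stride = 64,
	};

	find_phys(&l, 200);	/* four copies of virtual sector 200 */
	return 0;
}

With these numbers the near copies of virtual sector 200 land on devices 2 and
3 at sector 136, and the offset copies land one chunk later, on devices 0 and 1
at sector 200, i.e. in adjacent stripes rather than in a far-away section of
each device.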
@@ -357,8 +362,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
357 * With this layout, and block is never stored twice on the one device. 362 * With this layout, and block is never stored twice on the one device.
358 * 363 *
359 * raid10_find_phys finds the sector offset of a given virtual sector 364 * raid10_find_phys finds the sector offset of a given virtual sector
360 * on each device that it is on. If a block isn't on a device, 365 * on each device that it is on.
361 * that entry in the array is set to MaxSector.
362 * 366 *
363 * raid10_find_virt does the reverse mapping, from a device and a 367 * raid10_find_virt does the reverse mapping, from a device and a
364 * sector offset to a virtual address 368 * sector offset to a virtual address
@@ -381,6 +385,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
381 chunk *= conf->near_copies; 385 chunk *= conf->near_copies;
382 stripe = chunk; 386 stripe = chunk;
383 dev = sector_div(stripe, conf->raid_disks); 387 dev = sector_div(stripe, conf->raid_disks);
388 if (conf->far_offset)
389 stripe *= conf->far_copies;
384 390
385 sector += stripe << conf->chunk_shift; 391 sector += stripe << conf->chunk_shift;
386 392
@@ -414,16 +420,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
414{ 420{
415 sector_t offset, chunk, vchunk; 421 sector_t offset, chunk, vchunk;
416 422
417 while (sector > conf->stride) {
418 sector -= conf->stride;
419 if (dev < conf->near_copies)
420 dev += conf->raid_disks - conf->near_copies;
421 else
422 dev -= conf->near_copies;
423 }
424
425 offset = sector & conf->chunk_mask; 423 offset = sector & conf->chunk_mask;
426 chunk = sector >> conf->chunk_shift; 424 if (conf->far_offset) {
425 int fc;
426 chunk = sector >> conf->chunk_shift;
427 fc = sector_div(chunk, conf->far_copies);
428 dev -= fc * conf->near_copies;
429 if (dev < 0)
430 dev += conf->raid_disks;
431 } else {
432 while (sector > conf->stride) {
433 sector -= conf->stride;
434 if (dev < conf->near_copies)
435 dev += conf->raid_disks - conf->near_copies;
436 else
437 dev -= conf->near_copies;
438 }
439 chunk = sector >> conf->chunk_shift;
440 }
427 vchunk = chunk * conf->raid_disks + dev; 441 vchunk = chunk * conf->raid_disks + dev;
428 sector_div(vchunk, conf->near_copies); 442 sector_div(vchunk, conf->near_copies);
429 return (vchunk << conf->chunk_shift) + offset; 443 return (vchunk << conf->chunk_shift) + offset;
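
A quick arithmetic check of the new far_offset branch, using the same assumed
geometry as the sketch above (4 disks, 64-sector chunks, near_copies=2,
far_copies=2): take the near copy that the forward map placed on device 3 at
physical sector 136. Then offset = 136 & 63 = 8, chunk = 136 >> 6 = 2,
sector_div(chunk, far_copies) leaves chunk = 1 with remainder fc = 0, so dev
stays 3; vchunk = 1 * raid_disks + 3 = 7, dividing by near_copies gives 3, and
(3 << 6) + 8 = 200, which is exactly the virtual sector the forward map
started from.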
@@ -900,9 +914,12 @@ static void status(struct seq_file *seq, mddev_t *mddev)
900 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); 914 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
901 if (conf->near_copies > 1) 915 if (conf->near_copies > 1)
902 seq_printf(seq, " %d near-copies", conf->near_copies); 916 seq_printf(seq, " %d near-copies", conf->near_copies);
903 if (conf->far_copies > 1) 917 if (conf->far_copies > 1) {
904 seq_printf(seq, " %d far-copies", conf->far_copies); 918 if (conf->far_offset)
905 919 seq_printf(seq, " %d offset-copies", conf->far_copies);
920 else
921 seq_printf(seq, " %d far-copies", conf->far_copies);
922 }
906 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 923 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
907 conf->working_disks); 924 conf->working_disks);
908 for (i = 0; i < conf->raid_disks; i++) 925 for (i = 0; i < conf->raid_disks; i++)
@@ -1475,6 +1492,10 @@ static void raid10d(mddev_t *mddev)
1475 s<<9, conf->tmppage, READ) == 0) 1492 s<<9, conf->tmppage, READ) == 0)
1476 /* Well, this device is dead */ 1493 /* Well, this device is dead */
1477 md_error(mddev, rdev); 1494 md_error(mddev, rdev);
1495 else
1496 printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
1497 mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
1498
1478 rdev_dec_pending(rdev, mddev); 1499 rdev_dec_pending(rdev, mddev);
1479 rcu_read_lock(); 1500 rcu_read_lock();
1480 } 1501 }
@@ -1915,7 +1936,7 @@ static int run(mddev_t *mddev)
1915 mirror_info_t *disk; 1936 mirror_info_t *disk;
1916 mdk_rdev_t *rdev; 1937 mdk_rdev_t *rdev;
1917 struct list_head *tmp; 1938 struct list_head *tmp;
1918 int nc, fc; 1939 int nc, fc, fo;
1919 sector_t stride, size; 1940 sector_t stride, size;
1920 1941
1921 if (mddev->chunk_size == 0) { 1942 if (mddev->chunk_size == 0) {
@@ -1925,8 +1946,9 @@ static int run(mddev_t *mddev)
1925 1946
1926 nc = mddev->layout & 255; 1947 nc = mddev->layout & 255;
1927 fc = (mddev->layout >> 8) & 255; 1948 fc = (mddev->layout >> 8) & 255;
1949 fo = mddev->layout & (1<<16);
1928 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 1950 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
1929 (mddev->layout >> 16)) { 1951 (mddev->layout >> 17)) {
1930 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 1952 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
1931 mdname(mddev), mddev->layout); 1953 mdname(mddev), mddev->layout);
1932 goto out; 1954 goto out;
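
For reference, the layout word assumed in the examples above decodes under
these rules as nc = 0x10202 & 255 = 2, fc = (0x10202 >> 8) & 255 = 2 and
fo = 0x10202 & (1<<16) set, which passes the check on a 4-disk array
(nc*fc = 4, and 0x10202 >> 17 == 0). Something like 0x30202 would be rejected
here because it has bits above bit 16 set (layout >> 17 != 0).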
@@ -1958,12 +1980,16 @@ static int run(mddev_t *mddev)
1958 conf->near_copies = nc; 1980 conf->near_copies = nc;
1959 conf->far_copies = fc; 1981 conf->far_copies = fc;
1960 conf->copies = nc*fc; 1982 conf->copies = nc*fc;
1983 conf->far_offset = fo;
1961 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 1984 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
1962 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 1985 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
1963 stride = mddev->size >> (conf->chunk_shift-1); 1986 if (fo)
1964 sector_div(stride, fc); 1987 conf->stride = 1 << conf->chunk_shift;
1965 conf->stride = stride << conf->chunk_shift; 1988 else {
1966 1989 stride = mddev->size >> (conf->chunk_shift-1);
1990 sector_div(stride, fc);
1991 conf->stride = stride << conf->chunk_shift;
1992 }
1967 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 1993 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
1968 r10bio_pool_free, conf); 1994 r10bio_pool_free, conf);
1969 if (!conf->r10bio_pool) { 1995 if (!conf->r10bio_pool) {
@@ -2015,7 +2041,8 @@ static int run(mddev_t *mddev)
2015 2041
2016 disk = conf->mirrors + i; 2042 disk = conf->mirrors + i;
2017 2043
2018 if (!disk->rdev) { 2044 if (!disk->rdev ||
2045 !test_bit(In_sync, &rdev->flags)) {
2019 disk->head_position = 0; 2046 disk->head_position = 0;
2020 mddev->degraded++; 2047 mddev->degraded++;
2021 } 2048 }
@@ -2037,7 +2064,13 @@ static int run(mddev_t *mddev)
2037 /* 2064 /*
2038 * Ok, everything is just fine now 2065 * Ok, everything is just fine now
2039 */ 2066 */
2040 size = conf->stride * conf->raid_disks; 2067 if (conf->far_offset) {
2068 size = mddev->size >> (conf->chunk_shift-1);
2069 size *= conf->raid_disks;
2070 size <<= conf->chunk_shift;
2071 sector_div(size, conf->far_copies);
2072 } else
2073 size = conf->stride * conf->raid_disks;
2041 sector_div(size, conf->near_copies); 2074 sector_div(size, conf->near_copies);
2042 mddev->array_size = size/2; 2075 mddev->array_size = size/2;
2043 mddev->resync_max_sectors = size; 2076 mddev->resync_max_sectors = size;
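
A worked example of the new far_offset sizing, with assumed numbers rather
than anything from the patch: 4 devices of mddev->size = 3200 (KB) each and
32 KB chunks (chunk_shift = 6) give mddev->size >> (chunk_shift-1) = 100
chunks per device; multiplying by raid_disks and shifting back up yields
4 * 100 * 64 = 25600 sectors of raw space, and dividing by far_copies (2) and
then near_copies (2) leaves 6400 sectors, so array_size becomes 3200 KB and
resync_max_sectors 6400. The non-offset branch reaches the same figure through
conf->stride, which already has the far_copies division folded in.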
@@ -2050,7 +2083,7 @@ static int run(mddev_t *mddev)
2050 * maybe... 2083 * maybe...
2051 */ 2084 */
2052 { 2085 {
2053 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; 2086 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2054 stripe /= conf->near_copies; 2087 stripe /= conf->near_copies;
2055 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2088 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2056 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2089 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 31843604049c..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2,8 +2,11 @@
2 * raid5.c : Multiple Devices driver for Linux 2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar 4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
5 * 6 *
6 * RAID-5 management functions. 7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
7 * 10 *
8 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
@@ -15,15 +18,38 @@
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 */ 19 */
17 20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to
32 * new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current
39 * batch.
40 * When we notice that bm_flush > bm_write, we write out all pending updates
41 * to the bitmap, and advance bm_write to where bm_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
18 45
19#include <linux/config.h>
20#include <linux/module.h> 46#include <linux/module.h>
21#include <linux/slab.h> 47#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 48#include <linux/highmem.h>
24#include <linux/bitops.h> 49#include <linux/bitops.h>
25#include <linux/kthread.h> 50#include <linux/kthread.h>
26#include <asm/atomic.h> 51#include <asm/atomic.h>
52#include "raid6.h"
27 53
28#include <linux/raid/bitmap.h> 54#include <linux/raid/bitmap.h>
29 55
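
The batch numbering described in the BITMAP UNPLUGGING comment above can be
modelled in a few lines of user-space C. This is an illustrative sketch only;
seq_flush and seq_write mirror the conf fields that add_stripe_bio() and
__release_stripe() use further down, and none of it is code from the patch.

#include <stdio.h>

static unsigned int seq_flush;	/* last batch closed to new additions */
static unsigned int seq_write;	/* last batch whose bitmap is on disk */

/* a stripe picking up its first write is stamped with the open batch */
static unsigned int stamp_stripe(void)	{ return seq_flush + 1; }
/* an unplug closes the current batch */
static void unplug(void)		{ seq_flush++; }
/* the bitmap writeout catches seq_write up to seq_flush */
static void bitmap_written(void)	{ seq_write = seq_flush; }
/* same test as __release_stripe(): delay while bm_seq is still ahead */
static int stripe_may_write(unsigned int bm_seq)
{
	return (int)(bm_seq - seq_write) <= 0;
}

int main(void)
{
	unsigned int s1 = stamp_stripe();	/* stripe joins batch 1 */

	printf("%d\n", stripe_may_write(s1));	/* 0: batch still open */
	unplug();				/* batch 1 closed */
	printf("%d\n", stripe_may_write(s1));	/* 0: bitmap not written yet */
	bitmap_written();			/* bitmap for batch 1 on disk */
	printf("%d\n", stripe_may_write(s1));	/* 1: safe to write the stripe */
	return 0;
}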
@@ -68,6 +94,16 @@
68#define __inline__ 94#define __inline__
69#endif 95#endif
70 96
97#if !RAID6_USE_EMPTY_ZERO_PAGE
98/* In .bss so it's zeroed */
99const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
100#endif
101
102static inline int raid6_next_disk(int disk, int raid_disks)
103{
104 disk++;
105 return (disk < raid_disks) ? disk : 0;
106}
71static void print_raid5_conf (raid5_conf_t *conf); 107static void print_raid5_conf (raid5_conf_t *conf);
72 108
73static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 109static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -76,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
76 BUG_ON(!list_empty(&sh->lru)); 112 BUG_ON(!list_empty(&sh->lru));
77 BUG_ON(atomic_read(&conf->active_stripes)==0); 113 BUG_ON(atomic_read(&conf->active_stripes)==0);
78 if (test_bit(STRIPE_HANDLE, &sh->state)) { 114 if (test_bit(STRIPE_HANDLE, &sh->state)) {
79 if (test_bit(STRIPE_DELAYED, &sh->state)) 115 if (test_bit(STRIPE_DELAYED, &sh->state)) {
80 list_add_tail(&sh->lru, &conf->delayed_list); 116 list_add_tail(&sh->lru, &conf->delayed_list);
81 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 117 blk_plug_device(conf->mddev->queue);
82 conf->seq_write == sh->bm_seq) 118 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
119 sh->bm_seq - conf->seq_write > 0) {
83 list_add_tail(&sh->lru, &conf->bitmap_list); 120 list_add_tail(&sh->lru, &conf->bitmap_list);
84 else { 121 blk_plug_device(conf->mddev->queue);
122 } else {
85 clear_bit(STRIPE_BIT_DELAY, &sh->state); 123 clear_bit(STRIPE_BIT_DELAY, &sh->state);
86 list_add_tail(&sh->lru, &conf->handle_list); 124 list_add_tail(&sh->lru, &conf->handle_list);
87 } 125 }
@@ -104,7 +142,7 @@ static void release_stripe(struct stripe_head *sh)
104{ 142{
105 raid5_conf_t *conf = sh->raid_conf; 143 raid5_conf_t *conf = sh->raid_conf;
106 unsigned long flags; 144 unsigned long flags;
107 145
108 spin_lock_irqsave(&conf->device_lock, flags); 146 spin_lock_irqsave(&conf->device_lock, flags);
109 __release_stripe(conf, sh); 147 __release_stripe(conf, sh);
110 spin_unlock_irqrestore(&conf->device_lock, flags); 148 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -117,7 +155,7 @@ static inline void remove_hash(struct stripe_head *sh)
117 hlist_del_init(&sh->hash); 155 hlist_del_init(&sh->hash);
118} 156}
119 157
120static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 158static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121{ 159{
122 struct hlist_head *hp = stripe_hash(conf, sh->sector); 160 struct hlist_head *hp = stripe_hash(conf, sh->sector);
123 161
@@ -190,7 +228,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
190 (unsigned long long)sh->sector); 228 (unsigned long long)sh->sector);
191 229
192 remove_hash(sh); 230 remove_hash(sh);
193 231
194 sh->sector = sector; 232 sh->sector = sector;
195 sh->pd_idx = pd_idx; 233 sh->pd_idx = pd_idx;
196 sh->state = 0; 234 sh->state = 0;
@@ -258,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
258 < (conf->max_nr_stripes *3/4) 296 < (conf->max_nr_stripes *3/4)
259 || !conf->inactive_blocked), 297 || !conf->inactive_blocked),
260 conf->device_lock, 298 conf->device_lock,
261 unplug_slaves(conf->mddev) 299 raid5_unplug_device(conf->mddev->queue)
262 ); 300 );
263 conf->inactive_blocked = 0; 301 conf->inactive_blocked = 0;
264 } else 302 } else
@@ -269,8 +307,10 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
269 } else { 307 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state)) 308 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes); 309 atomic_inc(&conf->active_stripes);
272 if (!list_empty(&sh->lru)) 310 if (list_empty(&sh->lru) &&
273 list_del_init(&sh->lru); 311 !test_bit(STRIPE_EXPANDING, &sh->state))
312 BUG();
313 list_del_init(&sh->lru);
274 } 314 }
275 } 315 }
276 } while (sh == NULL); 316 } while (sh == NULL);
@@ -321,10 +361,9 @@ static int grow_stripes(raid5_conf_t *conf, int num)
321 return 1; 361 return 1;
322 conf->slab_cache = sc; 362 conf->slab_cache = sc;
323 conf->pool_size = devs; 363 conf->pool_size = devs;
324 while (num--) { 364 while (num--)
325 if (!grow_one_stripe(conf)) 365 if (!grow_one_stripe(conf))
326 return 1; 366 return 1;
327 }
328 return 0; 367 return 0;
329} 368}
330 369
@@ -484,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
484 raid5_conf_t *conf = sh->raid_conf; 523 raid5_conf_t *conf = sh->raid_conf;
485 int disks = sh->disks, i; 524 int disks = sh->disks, i;
486 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 525 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
526 char b[BDEVNAME_SIZE];
527 mdk_rdev_t *rdev;
487 528
488 if (bi->bi_size) 529 if (bi->bi_size)
489 return 1; 530 return 1;
@@ -531,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
531 set_bit(R5_UPTODATE, &sh->dev[i].flags); 572 set_bit(R5_UPTODATE, &sh->dev[i].flags);
532#endif 573#endif
533 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 574 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
534 printk(KERN_INFO "raid5: read error corrected!!\n"); 575 rdev = conf->disks[i].rdev;
576 printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
577 mdname(conf->mddev), STRIPE_SECTORS,
578 (unsigned long long)sh->sector + rdev->data_offset,
579 bdevname(rdev->bdev, b));
535 clear_bit(R5_ReadError, &sh->dev[i].flags); 580 clear_bit(R5_ReadError, &sh->dev[i].flags);
536 clear_bit(R5_ReWrite, &sh->dev[i].flags); 581 clear_bit(R5_ReWrite, &sh->dev[i].flags);
537 } 582 }
538 if (atomic_read(&conf->disks[i].rdev->read_errors)) 583 if (atomic_read(&conf->disks[i].rdev->read_errors))
539 atomic_set(&conf->disks[i].rdev->read_errors, 0); 584 atomic_set(&conf->disks[i].rdev->read_errors, 0);
540 } else { 585 } else {
586 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
541 int retry = 0; 587 int retry = 0;
588 rdev = conf->disks[i].rdev;
589
542 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 590 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
543 atomic_inc(&conf->disks[i].rdev->read_errors); 591 atomic_inc(&rdev->read_errors);
544 if (conf->mddev->degraded) 592 if (conf->mddev->degraded)
545 printk(KERN_WARNING "raid5: read error not correctable.\n"); 593 printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
594 mdname(conf->mddev),
595 (unsigned long long)sh->sector + rdev->data_offset,
596 bdn);
546 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 597 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
547 /* Oh, no!!! */ 598 /* Oh, no!!! */
548 printk(KERN_WARNING "raid5: read error NOT corrected!!\n"); 599 printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
549 else if (atomic_read(&conf->disks[i].rdev->read_errors) 600 mdname(conf->mddev),
601 (unsigned long long)sh->sector + rdev->data_offset,
602 bdn);
603 else if (atomic_read(&rdev->read_errors)
550 > conf->max_nr_stripes) 604 > conf->max_nr_stripes)
551 printk(KERN_WARNING 605 printk(KERN_WARNING
552 "raid5: Too many read errors, failing device.\n"); 606 "raid5:%s: Too many read errors, failing device %s.\n",
607 mdname(conf->mddev), bdn);
553 else 608 else
554 retry = 1; 609 retry = 1;
555 if (retry) 610 if (retry)
@@ -557,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
557 else { 612 else {
558 clear_bit(R5_ReadError, &sh->dev[i].flags); 613 clear_bit(R5_ReadError, &sh->dev[i].flags);
559 clear_bit(R5_ReWrite, &sh->dev[i].flags); 614 clear_bit(R5_ReWrite, &sh->dev[i].flags);
560 md_error(conf->mddev, conf->disks[i].rdev); 615 md_error(conf->mddev, rdev);
561 } 616 }
562 } 617 }
563 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 618 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -631,8 +686,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
631 dev->req.bi_private = sh; 686 dev->req.bi_private = sh;
632 687
633 dev->flags = 0; 688 dev->flags = 0;
634 if (i != sh->pd_idx) 689 dev->sector = compute_blocknr(sh, i);
635 dev->sector = compute_blocknr(sh, i);
636} 690}
637 691
638static void error(mddev_t *mddev, mdk_rdev_t *rdev) 692static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -659,7 +713,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
659 " Operation continuing on %d devices\n", 713 " Operation continuing on %d devices\n",
660 bdevname(rdev->bdev,b), conf->working_disks); 714 bdevname(rdev->bdev,b), conf->working_disks);
661 } 715 }
662} 716}
663 717
664/* 718/*
665 * Input: a 'big' sector number, 719 * Input: a 'big' sector number,
@@ -697,9 +751,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
697 /* 751 /*
698 * Select the parity disk based on the user selected algorithm. 752 * Select the parity disk based on the user selected algorithm.
699 */ 753 */
700 if (conf->level == 4) 754 switch(conf->level) {
755 case 4:
701 *pd_idx = data_disks; 756 *pd_idx = data_disks;
702 else switch (conf->algorithm) { 757 break;
758 case 5:
759 switch (conf->algorithm) {
703 case ALGORITHM_LEFT_ASYMMETRIC: 760 case ALGORITHM_LEFT_ASYMMETRIC:
704 *pd_idx = data_disks - stripe % raid_disks; 761 *pd_idx = data_disks - stripe % raid_disks;
705 if (*dd_idx >= *pd_idx) 762 if (*dd_idx >= *pd_idx)
@@ -721,6 +778,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
721 default: 778 default:
722 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 779 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723 conf->algorithm); 780 conf->algorithm);
781 }
782 break;
783 case 6:
784
785 /**** FIX THIS ****/
786 switch (conf->algorithm) {
787 case ALGORITHM_LEFT_ASYMMETRIC:
788 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
789 if (*pd_idx == raid_disks-1)
790 (*dd_idx)++; /* Q D D D P */
791 else if (*dd_idx >= *pd_idx)
792 (*dd_idx) += 2; /* D D P Q D */
793 break;
794 case ALGORITHM_RIGHT_ASYMMETRIC:
795 *pd_idx = stripe % raid_disks;
796 if (*pd_idx == raid_disks-1)
797 (*dd_idx)++; /* Q D D D P */
798 else if (*dd_idx >= *pd_idx)
799 (*dd_idx) += 2; /* D D P Q D */
800 break;
801 case ALGORITHM_LEFT_SYMMETRIC:
802 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
803 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
804 break;
805 case ALGORITHM_RIGHT_SYMMETRIC:
806 *pd_idx = stripe % raid_disks;
807 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
808 break;
809 default:
810 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
811 conf->algorithm);
812 }
813 break;
724 } 814 }
725 815
726 /* 816 /*
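
To make the new RAID-6 placement rules concrete, the small user-space sketch
below (illustrative only, not part of the patch) prints where P, Q and the raw
data indices land for the first few stripes of an assumed 5-disk array using
ALGORITHM_LEFT_ASYMMETRIC:

#include <stdio.h>

int main(void)
{
	int raid_disks = 5, data_disks = raid_disks - 2;
	long stripe;

	for (stripe = 0; stripe < 5; stripe++) {
		int pd = raid_disks - 1 - (int)(stripe % raid_disks);
		int qd = (pd + 1) % raid_disks;		/* raid6_next_disk() */
		char row[8];
		int d;

		for (d = 0; d < raid_disks; d++)
			row[d] = 'D';
		row[pd] = 'P';
		row[qd] = 'Q';

		printf("stripe %ld: ", stripe);
		for (d = 0; d < raid_disks; d++)
			printf("%c ", row[d]);

		/* where does each raw data index end up, per the rules above? */
		for (d = 0; d < data_disks; d++) {
			int dd = d;
			if (pd == raid_disks - 1)
				dd++;			/* Q D D D P */
			else if (dd >= pd)
				dd += 2;		/* D D P Q D */
			printf(" d%d->disk%d", d, dd);
		}
		printf("\n");
	}
	return 0;
}

Stripe 0 comes out as Q D D D P (Q wraps around to disk 0 because P sits on
the last disk), and stripe 2 comes out as D D P Q D, matching the annotations
in the switch above.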
@@ -742,12 +832,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
742 int chunk_number, dummy1, dummy2, dd_idx = i; 832 int chunk_number, dummy1, dummy2, dd_idx = i;
743 sector_t r_sector; 833 sector_t r_sector;
744 834
835
745 chunk_offset = sector_div(new_sector, sectors_per_chunk); 836 chunk_offset = sector_div(new_sector, sectors_per_chunk);
746 stripe = new_sector; 837 stripe = new_sector;
747 BUG_ON(new_sector != stripe); 838 BUG_ON(new_sector != stripe);
748 839
749 840 if (i == sh->pd_idx)
750 switch (conf->algorithm) { 841 return 0;
842 switch(conf->level) {
843 case 4: break;
844 case 5:
845 switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC: 846 case ALGORITHM_LEFT_ASYMMETRIC:
752 case ALGORITHM_RIGHT_ASYMMETRIC: 847 case ALGORITHM_RIGHT_ASYMMETRIC:
753 if (i > sh->pd_idx) 848 if (i > sh->pd_idx)
@@ -761,7 +856,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
761 break; 856 break;
762 default: 857 default:
763 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 858 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
859 conf->algorithm);
860 }
861 break;
862 case 6:
863 data_disks = raid_disks - 2;
864 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
865 return 0; /* It is the Q disk */
866 switch (conf->algorithm) {
867 case ALGORITHM_LEFT_ASYMMETRIC:
868 case ALGORITHM_RIGHT_ASYMMETRIC:
869 if (sh->pd_idx == raid_disks-1)
870 i--; /* Q D D D P */
871 else if (i > sh->pd_idx)
872 i -= 2; /* D D P Q D */
873 break;
874 case ALGORITHM_LEFT_SYMMETRIC:
875 case ALGORITHM_RIGHT_SYMMETRIC:
876 if (sh->pd_idx == raid_disks-1)
877 i--; /* Q D D D P */
878 else {
879 /* D D P Q D */
880 if (i < sh->pd_idx)
881 i += raid_disks;
882 i -= (sh->pd_idx + 2);
883 }
884 break;
885 default:
886 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
764 conf->algorithm); 887 conf->algorithm);
888 }
889 break;
765 } 890 }
766 891
767 chunk_number = stripe * data_disks + i; 892 chunk_number = stripe * data_disks + i;
@@ -778,10 +903,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
778 903
779 904
780/* 905/*
781 * Copy data between a page in the stripe cache, and a bio. 906 * Copy data between a page in the stripe cache, and one or more bion
782 * There are no alignment or size guarantees between the page or the 907 * The page could align with the middle of the bio, or there could be
783 * bio except that there is some overlap. 908 * several bion, each with several bio_vecs, which cover part of the page
784 * All iovecs in the bio must be considered. 909 * Multiple bion are linked together on bi_next. There may be extras
910 * at the end of this list. We ignore them.
785 */ 911 */
786static void copy_data(int frombio, struct bio *bio, 912static void copy_data(int frombio, struct bio *bio,
787 struct page *page, 913 struct page *page,
@@ -810,7 +936,7 @@ static void copy_data(int frombio, struct bio *bio,
810 if (len > 0 && page_offset + len > STRIPE_SIZE) 936 if (len > 0 && page_offset + len > STRIPE_SIZE)
811 clen = STRIPE_SIZE - page_offset; 937 clen = STRIPE_SIZE - page_offset;
812 else clen = len; 938 else clen = len;
813 939
814 if (clen > 0) { 940 if (clen > 0) {
815 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 941 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816 if (frombio) 942 if (frombio)
@@ -862,14 +988,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
862 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 988 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863} 989}
864 990
865static void compute_parity(struct stripe_head *sh, int method) 991static void compute_parity5(struct stripe_head *sh, int method)
866{ 992{
867 raid5_conf_t *conf = sh->raid_conf; 993 raid5_conf_t *conf = sh->raid_conf;
868 int i, pd_idx = sh->pd_idx, disks = sh->disks, count; 994 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869 void *ptr[MAX_XOR_BLOCKS]; 995 void *ptr[MAX_XOR_BLOCKS];
870 struct bio *chosen; 996 struct bio *chosen;
871 997
872 PRINTK("compute_parity, stripe %llu, method %d\n", 998 PRINTK("compute_parity5, stripe %llu, method %d\n",
873 (unsigned long long)sh->sector, method); 999 (unsigned long long)sh->sector, method);
874 1000
875 count = 1; 1001 count = 1;
@@ -956,9 +1082,195 @@ static void compute_parity(struct stripe_head *sh, int method)
956 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1082 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
957} 1083}
958 1084
1085static void compute_parity6(struct stripe_head *sh, int method)
1086{
1087 raid6_conf_t *conf = sh->raid_conf;
1088 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
1089 struct bio *chosen;
1090 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1091 void *ptrs[disks];
1092
1093 qd_idx = raid6_next_disk(pd_idx, disks);
1094 d0_idx = raid6_next_disk(qd_idx, disks);
1095
1096 PRINTK("compute_parity, stripe %llu, method %d\n",
1097 (unsigned long long)sh->sector, method);
1098
1099 switch(method) {
1100 case READ_MODIFY_WRITE:
1101 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1102 case RECONSTRUCT_WRITE:
1103 for (i= disks; i-- ;)
1104 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1105 chosen = sh->dev[i].towrite;
1106 sh->dev[i].towrite = NULL;
1107
1108 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1109 wake_up(&conf->wait_for_overlap);
1110
1111 if (sh->dev[i].written) BUG();
1112 sh->dev[i].written = chosen;
1113 }
1114 break;
1115 case CHECK_PARITY:
1116 BUG(); /* Not implemented yet */
1117 }
1118
1119 for (i = disks; i--;)
1120 if (sh->dev[i].written) {
1121 sector_t sector = sh->dev[i].sector;
1122 struct bio *wbi = sh->dev[i].written;
1123 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1124 copy_data(1, wbi, sh->dev[i].page, sector);
1125 wbi = r5_next_bio(wbi, sector);
1126 }
1127
1128 set_bit(R5_LOCKED, &sh->dev[i].flags);
1129 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1130 }
1131
1132// switch(method) {
1133// case RECONSTRUCT_WRITE:
1134// case CHECK_PARITY:
1135// case UPDATE_PARITY:
1136 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1137 /* FIX: Is this ordering of drives even remotely optimal? */
1138 count = 0;
1139 i = d0_idx;
1140 do {
1141 ptrs[count++] = page_address(sh->dev[i].page);
1142 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1143 printk("block %d/%d not uptodate on parity calc\n", i,count);
1144 i = raid6_next_disk(i, disks);
1145 } while ( i != d0_idx );
1146// break;
1147// }
1148
1149 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1150
1151 switch(method) {
1152 case RECONSTRUCT_WRITE:
1153 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1154 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1155 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1156 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1157 break;
1158 case UPDATE_PARITY:
1159 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1160 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1161 break;
1162 }
1163}
1164
1165
1166/* Compute one missing block */
1167static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1168{
1169 raid6_conf_t *conf = sh->raid_conf;
1170 int i, count, disks = conf->raid_disks;
1171 void *ptr[MAX_XOR_BLOCKS], *p;
1172 int pd_idx = sh->pd_idx;
1173 int qd_idx = raid6_next_disk(pd_idx, disks);
1174
1175 PRINTK("compute_block_1, stripe %llu, idx %d\n",
1176 (unsigned long long)sh->sector, dd_idx);
1177
1178 if ( dd_idx == qd_idx ) {
1179 /* We're actually computing the Q drive */
1180 compute_parity6(sh, UPDATE_PARITY);
1181 } else {
1182 ptr[0] = page_address(sh->dev[dd_idx].page);
1183 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
1184 count = 1;
1185 for (i = disks ; i--; ) {
1186 if (i == dd_idx || i == qd_idx)
1187 continue;
1188 p = page_address(sh->dev[i].page);
1189 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1190 ptr[count++] = p;
1191 else
1192 printk("compute_block() %d, stripe %llu, %d"
1193 " not present\n", dd_idx,
1194 (unsigned long long)sh->sector, i);
1195
1196 check_xor();
1197 }
1198 if (count != 1)
1199 xor_block(count, STRIPE_SIZE, ptr);
1200 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1201 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1202 }
1203}
1204
1205/* Compute two missing blocks */
1206static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1207{
1208 raid6_conf_t *conf = sh->raid_conf;
1209 int i, count, disks = conf->raid_disks;
1210 int pd_idx = sh->pd_idx;
1211 int qd_idx = raid6_next_disk(pd_idx, disks);
1212 int d0_idx = raid6_next_disk(qd_idx, disks);
1213 int faila, failb;
1214
1215 /* faila and failb are disk numbers relative to d0_idx */
1216 /* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
1217 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1218 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1219
1220 BUG_ON(faila == failb);
1221 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1222
1223 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1224 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1225
1226 if ( failb == disks-1 ) {
1227 /* Q disk is one of the missing disks */
1228 if ( faila == disks-2 ) {
1229 /* Missing P+Q, just recompute */
1230 compute_parity6(sh, UPDATE_PARITY);
1231 return;
1232 } else {
1233 /* We're missing D+Q; recompute D from P */
1234 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1235 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1236 return;
1237 }
1238 }
1239
1240 /* We're missing D+P or D+D; build pointer table */
1241 {
1242 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1243 void *ptrs[disks];
1244
1245 count = 0;
1246 i = d0_idx;
1247 do {
1248 ptrs[count++] = page_address(sh->dev[i].page);
1249 i = raid6_next_disk(i, disks);
1250 if (i != dd_idx1 && i != dd_idx2 &&
1251 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1252 printk("compute_2 with missing block %d/%d\n", count, i);
1253 } while ( i != d0_idx );
1254
1255 if ( failb == disks-2 ) {
1256 /* We're missing D+P. */
1257 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1258 } else {
1259 /* We're missing D+D. */
1260 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1261 }
1262
1263 /* Both the above update both missing blocks */
1264 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1265 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1266 }
1267}
1268
1269
1270
959/* 1271/*
960 * Each stripe/dev can have one or more bion attached. 1272 * Each stripe/dev can have one or more bion attached.
961 * toread/towrite point to the first in a chain. 1273 * toread/towrite point to the first in a chain.
962 * The bi_next chain must be in order. 1274 * The bi_next chain must be in order.
963 */ 1275 */
964static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1276static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
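
One detail worth calling out in compute_parity6() and compute_block_2() above
is the disk walk that starts at d0_idx, the disk after Q: it is what guarantees
that the syndrome helpers always see the data blocks first, then P, then Q.
The user-space sketch below is illustrative only (an assumed 6-disk stripe with
P on disk 3) and just demonstrates that invariant:

#include <stdio.h>

static int next_disk(int disk, int raid_disks)
{
	disk++;
	return disk < raid_disks ? disk : 0;
}

int main(void)
{
	int disks = 6, pd_idx = 3;		/* assumed stripe geometry */
	int qd_idx = next_disk(pd_idx, disks);
	int d0_idx = next_disk(qd_idx, disks);
	int slot = 0, i = d0_idx;

	/* walk the disks the way compute_parity6() builds ptrs[] */
	do {
		const char *what = (i == pd_idx) ? "P" :
				   (i == qd_idx) ? "Q" : "data";
		printf("slot %d <- disk %d (%s)\n", slot++, i, what);
		i = next_disk(i, disks);
	} while (i != d0_idx);
	/* slots 0..disks-3 are data in order, slot disks-2 is P, disks-1 is Q */
	return 0;
}

This ordering is also why compute_block_2() can renumber the failed disks
relative to d0_idx (faila/failb) and then simply treat slot disks-2 as P and
slot disks-1 as Q when choosing between raid6_datap_recov() and
raid6_2data_recov().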
@@ -1001,9 +1313,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1001 (unsigned long long)sh->sector, dd_idx); 1313 (unsigned long long)sh->sector, dd_idx);
1002 1314
1003 if (conf->mddev->bitmap && firstwrite) { 1315 if (conf->mddev->bitmap && firstwrite) {
1004 sh->bm_seq = conf->seq_write;
1005 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 1316 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1006 STRIPE_SECTORS, 0); 1317 STRIPE_SECTORS, 0);
1318 sh->bm_seq = conf->seq_flush+1;
1007 set_bit(STRIPE_BIT_DELAY, &sh->state); 1319 set_bit(STRIPE_BIT_DELAY, &sh->state);
1008 } 1320 }
1009 1321
@@ -1031,6 +1343,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1031 1343
1032static void end_reshape(raid5_conf_t *conf); 1344static void end_reshape(raid5_conf_t *conf);
1033 1345
1346static int page_is_zero(struct page *p)
1347{
1348 char *a = page_address(p);
1349 return ((*(u32*)a) == 0 &&
1350 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1351}
1352
1034static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1353static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1035{ 1354{
1036 int sectors_per_chunk = conf->chunk_size >> 9; 1355 int sectors_per_chunk = conf->chunk_size >> 9;
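
The page_is_zero() helper added in the hunk above relies on an overlapping
compare: if the first 32-bit word of the page is zero and memcmp(a, a+4,
STRIPE_SIZE-4) finds the buffer equal to itself shifted by four bytes, then
byte n equals byte n+4 for every n, so every byte collapses back onto the
known-zero first word and the whole page must be zero. Later hunks use it to
replace the open-coded version of the same test in the RAID-5 parity-check
path, and handle_stripe6() reuses it when checking P.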
@@ -1062,7 +1381,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1062 * 1381 *
1063 */ 1382 */
1064 1383
1065static void handle_stripe(struct stripe_head *sh) 1384static void handle_stripe5(struct stripe_head *sh)
1066{ 1385{
1067 raid5_conf_t *conf = sh->raid_conf; 1386 raid5_conf_t *conf = sh->raid_conf;
1068 int disks = sh->disks; 1387 int disks = sh->disks;
@@ -1394,7 +1713,7 @@ static void handle_stripe(struct stripe_head *sh)
1394 if (locked == 0 && (rcw == 0 ||rmw == 0) && 1713 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1395 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 1714 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1396 PRINTK("Computing parity...\n"); 1715 PRINTK("Computing parity...\n");
1397 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1716 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1398 /* now every locked buffer is ready to be written */ 1717 /* now every locked buffer is ready to be written */
1399 for (i=disks; i--;) 1718 for (i=disks; i--;)
1400 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 1719 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
@@ -1421,13 +1740,10 @@ static void handle_stripe(struct stripe_head *sh)
1421 !test_bit(STRIPE_INSYNC, &sh->state)) { 1740 !test_bit(STRIPE_INSYNC, &sh->state)) {
1422 set_bit(STRIPE_HANDLE, &sh->state); 1741 set_bit(STRIPE_HANDLE, &sh->state);
1423 if (failed == 0) { 1742 if (failed == 0) {
1424 char *pagea;
1425 BUG_ON(uptodate != disks); 1743 BUG_ON(uptodate != disks);
1426 compute_parity(sh, CHECK_PARITY); 1744 compute_parity5(sh, CHECK_PARITY);
1427 uptodate--; 1745 uptodate--;
1428 pagea = page_address(sh->dev[sh->pd_idx].page); 1746 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1429 if ((*(u32*)pagea) == 0 &&
1430 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1431 /* parity is correct (on disc, not in buffer any more) */ 1747 /* parity is correct (on disc, not in buffer any more) */
1432 set_bit(STRIPE_INSYNC, &sh->state); 1748 set_bit(STRIPE_INSYNC, &sh->state);
1433 } else { 1749 } else {
@@ -1487,7 +1803,7 @@ static void handle_stripe(struct stripe_head *sh)
1487 /* Need to write out all blocks after computing parity */ 1803 /* Need to write out all blocks after computing parity */
1488 sh->disks = conf->raid_disks; 1804 sh->disks = conf->raid_disks;
1489 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 1805 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1490 compute_parity(sh, RECONSTRUCT_WRITE); 1806 compute_parity5(sh, RECONSTRUCT_WRITE);
1491 for (i= conf->raid_disks; i--;) { 1807 for (i= conf->raid_disks; i--;) {
1492 set_bit(R5_LOCKED, &sh->dev[i].flags); 1808 set_bit(R5_LOCKED, &sh->dev[i].flags);
1493 locked++; 1809 locked++;
@@ -1615,6 +1931,569 @@ static void handle_stripe(struct stripe_head *sh)
1615 } 1931 }
1616} 1932}
1617 1933
1934static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1935{
1936 raid6_conf_t *conf = sh->raid_conf;
1937 int disks = conf->raid_disks;
1938 struct bio *return_bi= NULL;
1939 struct bio *bi;
1940 int i;
1941 int syncing;
1942 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1943 int non_overwrite = 0;
1944 int failed_num[2] = {0, 0};
1945 struct r5dev *dev, *pdev, *qdev;
1946 int pd_idx = sh->pd_idx;
1947 int qd_idx = raid6_next_disk(pd_idx, disks);
1948 int p_failed, q_failed;
1949
1950 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1951 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1952 pd_idx, qd_idx);
1953
1954 spin_lock(&sh->lock);
1955 clear_bit(STRIPE_HANDLE, &sh->state);
1956 clear_bit(STRIPE_DELAYED, &sh->state);
1957
1958 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1959 /* Now to look around and see what can be done */
1960
1961 rcu_read_lock();
1962 for (i=disks; i--; ) {
1963 mdk_rdev_t *rdev;
1964 dev = &sh->dev[i];
1965 clear_bit(R5_Insync, &dev->flags);
1966
1967 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1968 i, dev->flags, dev->toread, dev->towrite, dev->written);
1969 /* maybe we can reply to a read */
1970 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1971 struct bio *rbi, *rbi2;
1972 PRINTK("Return read for disc %d\n", i);
1973 spin_lock_irq(&conf->device_lock);
1974 rbi = dev->toread;
1975 dev->toread = NULL;
1976 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1977 wake_up(&conf->wait_for_overlap);
1978 spin_unlock_irq(&conf->device_lock);
1979 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1980 copy_data(0, rbi, dev->page, dev->sector);
1981 rbi2 = r5_next_bio(rbi, dev->sector);
1982 spin_lock_irq(&conf->device_lock);
1983 if (--rbi->bi_phys_segments == 0) {
1984 rbi->bi_next = return_bi;
1985 return_bi = rbi;
1986 }
1987 spin_unlock_irq(&conf->device_lock);
1988 rbi = rbi2;
1989 }
1990 }
1991
1992 /* now count some things */
1993 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1994 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1995
1996
1997 if (dev->toread) to_read++;
1998 if (dev->towrite) {
1999 to_write++;
2000 if (!test_bit(R5_OVERWRITE, &dev->flags))
2001 non_overwrite++;
2002 }
2003 if (dev->written) written++;
2004 rdev = rcu_dereference(conf->disks[i].rdev);
2005 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2006 /* The ReadError flag will just be confusing now */
2007 clear_bit(R5_ReadError, &dev->flags);
2008 clear_bit(R5_ReWrite, &dev->flags);
2009 }
2010 if (!rdev || !test_bit(In_sync, &rdev->flags)
2011 || test_bit(R5_ReadError, &dev->flags)) {
2012 if ( failed < 2 )
2013 failed_num[failed] = i;
2014 failed++;
2015 } else
2016 set_bit(R5_Insync, &dev->flags);
2017 }
2018 rcu_read_unlock();
2019 PRINTK("locked=%d uptodate=%d to_read=%d"
2020 " to_write=%d failed=%d failed_num=%d,%d\n",
2021 locked, uptodate, to_read, to_write, failed,
2022 failed_num[0], failed_num[1]);
2023 /* check if the array has lost >2 devices and, if so, some requests might
2024 * need to be failed
2025 */
2026 if (failed > 2 && to_read+to_write+written) {
2027 for (i=disks; i--; ) {
2028 int bitmap_end = 0;
2029
2030 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2031 mdk_rdev_t *rdev;
2032 rcu_read_lock();
2033 rdev = rcu_dereference(conf->disks[i].rdev);
2034 if (rdev && test_bit(In_sync, &rdev->flags))
2035 /* multiple read failures in one stripe */
2036 md_error(conf->mddev, rdev);
2037 rcu_read_unlock();
2038 }
2039
2040 spin_lock_irq(&conf->device_lock);
2041 /* fail all writes first */
2042 bi = sh->dev[i].towrite;
2043 sh->dev[i].towrite = NULL;
2044 if (bi) { to_write--; bitmap_end = 1; }
2045
2046 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2047 wake_up(&conf->wait_for_overlap);
2048
2049 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2050 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2051 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2052 if (--bi->bi_phys_segments == 0) {
2053 md_write_end(conf->mddev);
2054 bi->bi_next = return_bi;
2055 return_bi = bi;
2056 }
2057 bi = nextbi;
2058 }
2059 /* and fail all 'written' */
2060 bi = sh->dev[i].written;
2061 sh->dev[i].written = NULL;
2062 if (bi) bitmap_end = 1;
2063 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2064 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2065 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2066 if (--bi->bi_phys_segments == 0) {
2067 md_write_end(conf->mddev);
2068 bi->bi_next = return_bi;
2069 return_bi = bi;
2070 }
2071 bi = bi2;
2072 }
2073
2074 /* fail any reads if this device is non-operational */
2075 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2076 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2077 bi = sh->dev[i].toread;
2078 sh->dev[i].toread = NULL;
2079 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2080 wake_up(&conf->wait_for_overlap);
2081 if (bi) to_read--;
2082 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2083 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2084 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2085 if (--bi->bi_phys_segments == 0) {
2086 bi->bi_next = return_bi;
2087 return_bi = bi;
2088 }
2089 bi = nextbi;
2090 }
2091 }
2092 spin_unlock_irq(&conf->device_lock);
2093 if (bitmap_end)
2094 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2095 STRIPE_SECTORS, 0, 0);
2096 }
2097 }
2098 if (failed > 2 && syncing) {
2099 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2100 clear_bit(STRIPE_SYNCING, &sh->state);
2101 syncing = 0;
2102 }
2103
2104 /*
2105 * might be able to return some write requests if the parity blocks
2106 * are safe, or on a failed drive
2107 */
2108 pdev = &sh->dev[pd_idx];
2109 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
2110 || (failed >= 2 && failed_num[1] == pd_idx);
2111 qdev = &sh->dev[qd_idx];
2112 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
2113 || (failed >= 2 && failed_num[1] == qd_idx);
2114
2115 if ( written &&
2116 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
2117 && !test_bit(R5_LOCKED, &pdev->flags)
2118 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
2119 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
2120 && !test_bit(R5_LOCKED, &qdev->flags)
2121 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
2122 /* any written block on an uptodate or failed drive can be
2123 * returned. Note that if we 'wrote' to a failed drive,
2124 * it will be UPTODATE, but never LOCKED, so we don't need
2125 * to test 'failed' directly.
2126 */
2127 for (i=disks; i--; )
2128 if (sh->dev[i].written) {
2129 dev = &sh->dev[i];
2130 if (!test_bit(R5_LOCKED, &dev->flags) &&
2131 test_bit(R5_UPTODATE, &dev->flags) ) {
2132 /* We can return any write requests */
2133 int bitmap_end = 0;
2134 struct bio *wbi, *wbi2;
2135 PRINTK("Return write for stripe %llu disc %d\n",
2136 (unsigned long long)sh->sector, i);
2137 spin_lock_irq(&conf->device_lock);
2138 wbi = dev->written;
2139 dev->written = NULL;
2140 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2141 wbi2 = r5_next_bio(wbi, dev->sector);
2142 if (--wbi->bi_phys_segments == 0) {
2143 md_write_end(conf->mddev);
2144 wbi->bi_next = return_bi;
2145 return_bi = wbi;
2146 }
2147 wbi = wbi2;
2148 }
2149 if (dev->towrite == NULL)
2150 bitmap_end = 1;
2151 spin_unlock_irq(&conf->device_lock);
2152 if (bitmap_end)
2153 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2154 STRIPE_SECTORS,
2155 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2156 }
2157 }
2158 }
2159
2160 /* Now we might consider reading some blocks, either to check/generate
2161 * parity, or to satisfy requests
2162 * or to load a block that is being partially written.
2163 */
2164 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
2165 for (i=disks; i--;) {
2166 dev = &sh->dev[i];
2167 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2168 (dev->toread ||
2169 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2170 syncing ||
2171 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2172 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2173 )
2174 ) {
2175 /* we would like to get this block, possibly
2176 * by computing it, but we might not be able to
2177 */
2178 if (uptodate == disks-1) {
2179 PRINTK("Computing stripe %llu block %d\n",
2180 (unsigned long long)sh->sector, i);
2181 compute_block_1(sh, i, 0);
2182 uptodate++;
2183 } else if ( uptodate == disks-2 && failed >= 2 ) {
2184 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2185 int other;
2186 for (other=disks; other--;) {
2187 if ( other == i )
2188 continue;
2189 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2190 break;
2191 }
2192 BUG_ON(other < 0);
2193 PRINTK("Computing stripe %llu blocks %d,%d\n",
2194 (unsigned long long)sh->sector, i, other);
2195 compute_block_2(sh, i, other);
2196 uptodate += 2;
2197 } else if (test_bit(R5_Insync, &dev->flags)) {
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200#if 0
2201 /* if I am just reading this block and we don't have
2202 a failed drive, or any pending writes then sidestep the cache */
2203 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
2204 ! syncing && !failed && !to_write) {
2205 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
2206 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
2207 }
2208#endif
2209 locked++;
2210 PRINTK("Reading block %d (sync=%d)\n",
2211 i, syncing);
2212 }
2213 }
2214 }
2215 set_bit(STRIPE_HANDLE, &sh->state);
2216 }
2217
2218 /* now to consider writing and what else, if anything should be read */
2219 if (to_write) {
2220 int rcw=0, must_compute=0;
2221 for (i=disks ; i--;) {
2222 dev = &sh->dev[i];
2223 /* Would I have to read this buffer for reconstruct_write */
2224 if (!test_bit(R5_OVERWRITE, &dev->flags)
2225 && i != pd_idx && i != qd_idx
2226 && (!test_bit(R5_LOCKED, &dev->flags)
2227#if 0
2228 || sh->bh_page[i] != bh->b_page
2229#endif
2230 ) &&
2231 !test_bit(R5_UPTODATE, &dev->flags)) {
2232 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2233 else {
2234 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2235 must_compute++;
2236 }
2237 }
2238 }
2239 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2240 (unsigned long long)sh->sector, rcw, must_compute);
2241 set_bit(STRIPE_HANDLE, &sh->state);
2242
2243 if (rcw > 0)
2244 /* want reconstruct write, but need to get some data */
2245 for (i=disks; i--;) {
2246 dev = &sh->dev[i];
2247 if (!test_bit(R5_OVERWRITE, &dev->flags)
2248 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2249 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2250 test_bit(R5_Insync, &dev->flags)) {
2251 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2252 {
2253 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2254 (unsigned long long)sh->sector, i);
2255 set_bit(R5_LOCKED, &dev->flags);
2256 set_bit(R5_Wantread, &dev->flags);
2257 locked++;
2258 } else {
2259 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2260 (unsigned long long)sh->sector, i);
2261 set_bit(STRIPE_DELAYED, &sh->state);
2262 set_bit(STRIPE_HANDLE, &sh->state);
2263 }
2264 }
2265 }
2266 /* now if nothing is locked, and if we have enough data, we can start a write request */
2267 if (locked == 0 && rcw == 0 &&
2268 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2269 if ( must_compute > 0 ) {
2270 /* We have failed blocks and need to compute them */
2271 switch ( failed ) {
2272 case 0: BUG();
2273 case 1: compute_block_1(sh, failed_num[0], 0); break;
2274 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2275 default: BUG(); /* This request should have been failed? */
2276 }
2277 }
2278
2279 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2280 compute_parity6(sh, RECONSTRUCT_WRITE);
2281 /* now every locked buffer is ready to be written */
2282 for (i=disks; i--;)
2283 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2284 PRINTK("Writing stripe %llu block %d\n",
2285 (unsigned long long)sh->sector, i);
2286 locked++;
2287 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2288 }
2289 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2290 set_bit(STRIPE_INSYNC, &sh->state);
2291
2292 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2293 atomic_dec(&conf->preread_active_stripes);
2294 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2295 md_wakeup_thread(conf->mddev->thread);
2296 }
2297 }
2298 }
2299
2300 /* maybe we need to check and possibly fix the parity for this stripe
2301 * Any reads will already have been scheduled, so we just see if enough data
2302 * is available
2303 */
2304 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
2305 int update_p = 0, update_q = 0;
2306 struct r5dev *dev;
2307
2308 set_bit(STRIPE_HANDLE, &sh->state);
2309
2310 BUG_ON(failed>2);
2311 BUG_ON(uptodate < disks);
2312 /* Want to check and possibly repair P and Q.
2313 * However there could be one 'failed' device, in which
2314 * case we can only check one of them, possibly using the
2315 * other to generate missing data
2316 */
2317
2318 /* If !tmp_page, we cannot do the calculations,
2319 * but as we have set STRIPE_HANDLE, we will soon be called
2320 * by handle_stripe with a tmp_page - just wait until then.
2321 */
2322 if (tmp_page) {
2323 if (failed == q_failed) {
2324 /* The only possible failed device holds 'Q', so it makes
2325 * sense to check P (If anything else were failed, we would
2326 * have used P to recreate it).
2327 */
2328 compute_block_1(sh, pd_idx, 1);
2329 if (!page_is_zero(sh->dev[pd_idx].page)) {
2330 compute_block_1(sh,pd_idx,0);
2331 update_p = 1;
2332 }
2333 }
2334 if (!q_failed && failed < 2) {
2335 /* q is not failed, and we didn't use it to generate
2336 * anything, so it makes sense to check it
2337 */
2338 memcpy(page_address(tmp_page),
2339 page_address(sh->dev[qd_idx].page),
2340 STRIPE_SIZE);
2341 compute_parity6(sh, UPDATE_PARITY);
2342 if (memcmp(page_address(tmp_page),
2343 page_address(sh->dev[qd_idx].page),
2344 STRIPE_SIZE)!= 0) {
2345 clear_bit(STRIPE_INSYNC, &sh->state);
2346 update_q = 1;
2347 }
2348 }
2349 if (update_p || update_q) {
2350 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2351 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2352 /* don't try to repair!! */
2353 update_p = update_q = 0;
2354 }
2355
2356 /* now write out any block on a failed drive,
2357 * or P or Q if they need it
2358 */
2359
2360 if (failed == 2) {
2361 dev = &sh->dev[failed_num[1]];
2362 locked++;
2363 set_bit(R5_LOCKED, &dev->flags);
2364 set_bit(R5_Wantwrite, &dev->flags);
2365 }
2366 if (failed >= 1) {
2367 dev = &sh->dev[failed_num[0]];
2368 locked++;
2369 set_bit(R5_LOCKED, &dev->flags);
2370 set_bit(R5_Wantwrite, &dev->flags);
2371 }
2372
2373 if (update_p) {
2374 dev = &sh->dev[pd_idx];
2375 locked ++;
2376 set_bit(R5_LOCKED, &dev->flags);
2377 set_bit(R5_Wantwrite, &dev->flags);
2378 }
2379 if (update_q) {
2380 dev = &sh->dev[qd_idx];
2381 locked++;
2382 set_bit(R5_LOCKED, &dev->flags);
2383 set_bit(R5_Wantwrite, &dev->flags);
2384 }
2385 clear_bit(STRIPE_DEGRADED, &sh->state);
2386
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 }
2389 }
2390
2391 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2392 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2393 clear_bit(STRIPE_SYNCING, &sh->state);
2394 }
2395
2396 /* If the failed drives are just a ReadError, then we might need
2397 * to progress the repair/check process
2398 */
2399 if (failed <= 2 && ! conf->mddev->ro)
2400 for (i=0; i<failed;i++) {
2401 dev = &sh->dev[failed_num[i]];
2402 if (test_bit(R5_ReadError, &dev->flags)
2403 && !test_bit(R5_LOCKED, &dev->flags)
2404 && test_bit(R5_UPTODATE, &dev->flags)
2405 ) {
2406 if (!test_bit(R5_ReWrite, &dev->flags)) {
2407 set_bit(R5_Wantwrite, &dev->flags);
2408 set_bit(R5_ReWrite, &dev->flags);
2409 set_bit(R5_LOCKED, &dev->flags);
2410 } else {
2411 /* let's read it back */
2412 set_bit(R5_Wantread, &dev->flags);
2413 set_bit(R5_LOCKED, &dev->flags);
2414 }
2415 }
2416 }
2417 spin_unlock(&sh->lock);
2418
2419 while ((bi=return_bi)) {
2420 int bytes = bi->bi_size;
2421
2422 return_bi = bi->bi_next;
2423 bi->bi_next = NULL;
2424 bi->bi_size = 0;
2425 bi->bi_end_io(bi, bytes, 0);
2426 }
2427 for (i=disks; i-- ;) {
2428 int rw;
2429 struct bio *bi;
2430 mdk_rdev_t *rdev;
2431 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
2432 rw = 1;
2433 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
2434 rw = 0;
2435 else
2436 continue;
2437
2438 bi = &sh->dev[i].req;
2439
2440 bi->bi_rw = rw;
2441 if (rw)
2442 bi->bi_end_io = raid5_end_write_request;
2443 else
2444 bi->bi_end_io = raid5_end_read_request;
2445
2446 rcu_read_lock();
2447 rdev = rcu_dereference(conf->disks[i].rdev);
2448 if (rdev && test_bit(Faulty, &rdev->flags))
2449 rdev = NULL;
2450 if (rdev)
2451 atomic_inc(&rdev->nr_pending);
2452 rcu_read_unlock();
2453
2454 if (rdev) {
2455 if (syncing)
2456 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2457
2458 bi->bi_bdev = rdev->bdev;
2459 PRINTK("for %llu schedule op %ld on disc %d\n",
2460 (unsigned long long)sh->sector, bi->bi_rw, i);
2461 atomic_inc(&sh->count);
2462 bi->bi_sector = sh->sector + rdev->data_offset;
2463 bi->bi_flags = 1 << BIO_UPTODATE;
2464 bi->bi_vcnt = 1;
2465 bi->bi_max_vecs = 1;
2466 bi->bi_idx = 0;
2467 bi->bi_io_vec = &sh->dev[i].vec;
2468 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
2469 bi->bi_io_vec[0].bv_offset = 0;
2470 bi->bi_size = STRIPE_SIZE;
2471 bi->bi_next = NULL;
2472 if (rw == WRITE &&
2473 test_bit(R5_ReWrite, &sh->dev[i].flags))
2474 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2475 generic_make_request(bi);
2476 } else {
2477 if (rw == 1)
2478 set_bit(STRIPE_DEGRADED, &sh->state);
2479 PRINTK("skip op %ld on disc %d for sector %llu\n",
2480 bi->bi_rw, i, (unsigned long long)sh->sector);
2481 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2482 set_bit(STRIPE_HANDLE, &sh->state);
2483 }
2484 }
2485}
2486
2487static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
2488{
2489 if (sh->raid_conf->level == 6)
2490 handle_stripe6(sh, tmp_page);
2491 else
2492 handle_stripe5(sh);
2493}
2494
2495
2496
1618static void raid5_activate_delayed(raid5_conf_t *conf) 2497static void raid5_activate_delayed(raid5_conf_t *conf)
1619{ 2498{
1620 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 2499 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -1718,13 +2597,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
1718 return ret; 2597 return ret;
1719} 2598}
1720 2599
1721static inline void raid5_plug_device(raid5_conf_t *conf)
1722{
1723 spin_lock_irq(&conf->device_lock);
1724 blk_plug_device(conf->mddev->queue);
1725 spin_unlock_irq(&conf->device_lock);
1726}
1727
1728static int make_request(request_queue_t *q, struct bio * bi) 2600static int make_request(request_queue_t *q, struct bio * bi)
1729{ 2601{
1730 mddev_t *mddev = q->queuedata; 2602 mddev_t *mddev = q->queuedata;
@@ -1753,7 +2625,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1753 2625
1754 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 2626 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1755 DEFINE_WAIT(w); 2627 DEFINE_WAIT(w);
1756 int disks; 2628 int disks, data_disks;
1757 2629
1758 retry: 2630 retry:
1759 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 2631 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -1781,7 +2653,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
1781 } 2653 }
1782 spin_unlock_irq(&conf->device_lock); 2654 spin_unlock_irq(&conf->device_lock);
1783 } 2655 }
1784 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, 2656 data_disks = disks - conf->max_degraded;
2657
2658 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
1785 &dd_idx, &pd_idx, conf); 2659 &dd_idx, &pd_idx, conf);
1786 PRINTK("raid5: make_request, sector %llu logical %llu\n", 2660 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1787 (unsigned long long)new_sector, 2661 (unsigned long long)new_sector,
@@ -1832,8 +2706,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1832 goto retry; 2706 goto retry;
1833 } 2707 }
1834 finish_wait(&conf->wait_for_overlap, &w); 2708 finish_wait(&conf->wait_for_overlap, &w);
1835 raid5_plug_device(conf); 2709 handle_stripe(sh, NULL);
1836 handle_stripe(sh);
1837 release_stripe(sh); 2710 release_stripe(sh);
1838 } else { 2711 } else {
1839 /* cannot get stripe for read-ahead, just give-up */ 2712 /* cannot get stripe for read-ahead, just give-up */
@@ -1849,7 +2722,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1849 if (remaining == 0) { 2722 if (remaining == 0) {
1850 int bytes = bi->bi_size; 2723 int bytes = bi->bi_size;
1851 2724
1852 if ( bio_data_dir(bi) == WRITE ) 2725 if ( rw == WRITE )
1853 md_write_end(mddev); 2726 md_write_end(mddev);
1854 bi->bi_size = 0; 2727 bi->bi_size = 0;
1855 bi->bi_end_io(bi, bytes, 0); 2728 bi->bi_end_io(bi, bytes, 0);
@@ -1857,17 +2730,141 @@ static int make_request(request_queue_t *q, struct bio * bi)
1857 return 0; 2730 return 0;
1858} 2731}
1859 2732
1860/* FIXME go_faster isn't used */ 2733static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
1861static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1862{ 2734{
2735 /* reshaping is quite different to recovery/resync so it is
2736 * handled quite separately ... here.
2737 *
2738 * On each call to sync_request, we gather one chunk worth of
2739 * destination stripes and flag them as expanding.
2740 * Then we find all the source stripes and request reads.
2741 * As the reads complete, handle_stripe will copy the data
2742 * into the destination stripe and release that stripe.
2743 */
1863 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 2744 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1864 struct stripe_head *sh; 2745 struct stripe_head *sh;
1865 int pd_idx; 2746 int pd_idx;
1866 sector_t first_sector, last_sector; 2747 sector_t first_sector, last_sector;
2748 int raid_disks;
2749 int data_disks;
2750 int i;
2751 int dd_idx;
2752 sector_t writepos, safepos, gap;
2753
2754 if (sector_nr == 0 &&
2755 conf->expand_progress != 0) {
2756 /* restarting in the middle, skip the initial sectors */
2757 sector_nr = conf->expand_progress;
2758 sector_div(sector_nr, conf->raid_disks-1);
2759 *skipped = 1;
2760 return sector_nr;
2761 }
2762
2763 /* we update the metadata when there is more than 3Meg
2764 * in the block range (that is rather arbitrary, should
2765 * probably be time based) or when the data about to be
2766 * copied would over-write the source of the data at
2767 * the front of the range.
2768 * i.e. one new_stripe forward from expand_progress new_maps
2769 * to after where expand_lo old_maps to
2770 */
2771 writepos = conf->expand_progress +
2772 conf->chunk_size/512*(conf->raid_disks-1);
2773 sector_div(writepos, conf->raid_disks-1);
2774 safepos = conf->expand_lo;
2775 sector_div(safepos, conf->previous_raid_disks-1);
2776 gap = conf->expand_progress - conf->expand_lo;
2777
2778 if (writepos >= safepos ||
2779 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
2780 /* Cannot proceed until we've updated the superblock... */
2781 wait_event(conf->wait_for_overlap,
2782 atomic_read(&conf->reshape_stripes)==0);
2783 mddev->reshape_position = conf->expand_progress;
2784 mddev->sb_dirty = 1;
2785 md_wakeup_thread(mddev->thread);
2786 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
2787 kthread_should_stop());
2788 spin_lock_irq(&conf->device_lock);
2789 conf->expand_lo = mddev->reshape_position;
2790 spin_unlock_irq(&conf->device_lock);
2791 wake_up(&conf->wait_for_overlap);
2792 }
2793
2794 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
2795 int j;
2796 int skipped = 0;
2797 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
2798 sh = get_active_stripe(conf, sector_nr+i,
2799 conf->raid_disks, pd_idx, 0);
2800 set_bit(STRIPE_EXPANDING, &sh->state);
2801 atomic_inc(&conf->reshape_stripes);
2802 /* If any of this stripe is beyond the end of the old
2803 * array, then we need to zero those blocks
2804 */
2805 for (j=sh->disks; j--;) {
2806 sector_t s;
2807 if (j == sh->pd_idx)
2808 continue;
2809 s = compute_blocknr(sh, j);
2810 if (s < (mddev->array_size<<1)) {
2811 skipped = 1;
2812 continue;
2813 }
2814 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
2815 set_bit(R5_Expanded, &sh->dev[j].flags);
2816 set_bit(R5_UPTODATE, &sh->dev[j].flags);
2817 }
2818 if (!skipped) {
2819 set_bit(STRIPE_EXPAND_READY, &sh->state);
2820 set_bit(STRIPE_HANDLE, &sh->state);
2821 }
2822 release_stripe(sh);
2823 }
2824 spin_lock_irq(&conf->device_lock);
2825 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
2826 spin_unlock_irq(&conf->device_lock);
 2827 /* Ok, those stripes are ready. We can start scheduling
2828 * reads on the source stripes.
2829 * The source stripes are determined by mapping the first and last
2830 * block on the destination stripes.
2831 */
2832 raid_disks = conf->previous_raid_disks;
2833 data_disks = raid_disks - 1;
2834 first_sector =
2835 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
2836 raid_disks, data_disks,
2837 &dd_idx, &pd_idx, conf);
2838 last_sector =
2839 raid5_compute_sector((sector_nr+conf->chunk_size/512)
2840 *(conf->raid_disks-1) -1,
2841 raid_disks, data_disks,
2842 &dd_idx, &pd_idx, conf);
2843 if (last_sector >= (mddev->size<<1))
2844 last_sector = (mddev->size<<1)-1;
2845 while (first_sector <= last_sector) {
2846 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
2847 sh = get_active_stripe(conf, first_sector,
2848 conf->previous_raid_disks, pd_idx, 0);
2849 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2850 set_bit(STRIPE_HANDLE, &sh->state);
2851 release_stripe(sh);
2852 first_sector += STRIPE_SECTORS;
2853 }
2854 return conf->chunk_size>>9;
2855}
2856
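For the superblock throttle above, both progress markers are array-relative sectors that have to be divided by the number of data disks before they can be compared as per-device positions. A stand-alone sketch of that arithmetic with illustrative numbers (a hypothetical grow from 4 to 5 drives with 64 KiB chunks; the figures are made up, this is not the kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long chunk_sectors = 128;		/* 64 KiB chunk, 512-byte sectors */
		int raid_disks = 5, previous_raid_disks = 4;	/* growing a 4-drive array to 5 */
		unsigned long long expand_progress = 4096;	/* array sectors already relocated */
		unsigned long long expand_lo = 1536;		/* safe point recorded in the superblock */

		/* one chunk ahead of expand_progress, expressed as a per-device sector */
		unsigned long long writepos =
			(expand_progress + chunk_sectors * (raid_disks - 1)) / (raid_disks - 1);
		/* where the old layout still reads from, as a per-device sector */
		unsigned long long safepos = expand_lo / (previous_raid_disks - 1);
		unsigned long long gap = expand_progress - expand_lo;

		printf("writepos=%llu safepos=%llu gap=%llu\n", writepos, safepos, gap);
		/* The driver refreshes the superblock when writepos >= safepos, or when
		 * gap exceeds (raid_disks-1)*3000*2 sectors (roughly 3 MB of moved data). */
		return 0;
	}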
2857/* FIXME go_faster isn't used */
2858static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2859{
2860 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
2861 struct stripe_head *sh;
2862 int pd_idx;
1867 int raid_disks = conf->raid_disks; 2863 int raid_disks = conf->raid_disks;
1868 int data_disks = raid_disks-1;
1869 sector_t max_sector = mddev->size << 1; 2864 sector_t max_sector = mddev->size << 1;
1870 int sync_blocks; 2865 int sync_blocks;
2866 int still_degraded = 0;
2867 int i;
1871 2868
1872 if (sector_nr >= max_sector) { 2869 if (sector_nr >= max_sector) {
1873 /* just being told to finish up .. nothing much to do */ 2870 /* just being told to finish up .. nothing much to do */
@@ -1880,134 +2877,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1880 if (mddev->curr_resync < max_sector) /* aborted */ 2877 if (mddev->curr_resync < max_sector) /* aborted */
1881 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2878 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1882 &sync_blocks, 1); 2879 &sync_blocks, 1);
1883 else /* compelted sync */ 2880 else /* completed sync */
1884 conf->fullsync = 0; 2881 conf->fullsync = 0;
1885 bitmap_close_sync(mddev->bitmap); 2882 bitmap_close_sync(mddev->bitmap);
1886 2883
1887 return 0; 2884 return 0;
1888 } 2885 }
1889 2886
1890 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2887 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1891 /* reshaping is quite different to recovery/resync so it is 2888 return reshape_request(mddev, sector_nr, skipped);
1892 * handled quite separately ... here. 2889
 1893 * 2890 /* if there are too many failed drives and we are trying
1894 * On each call to sync_request, we gather one chunk worth of
1895 * destination stripes and flag them as expanding.
1896 * Then we find all the source stripes and request reads.
1897 * As the reads complete, handle_stripe will copy the data
1898 * into the destination stripe and release that stripe.
1899 */
1900 int i;
1901 int dd_idx;
1902 sector_t writepos, safepos, gap;
1903
1904 if (sector_nr == 0 &&
1905 conf->expand_progress != 0) {
1906 /* restarting in the middle, skip the initial sectors */
1907 sector_nr = conf->expand_progress;
1908 sector_div(sector_nr, conf->raid_disks-1);
1909 *skipped = 1;
1910 return sector_nr;
1911 }
1912
1913 /* we update the metadata when there is more than 3Meg
1914 * in the block range (that is rather arbitrary, should
1915 * probably be time based) or when the data about to be
1916 * copied would over-write the source of the data at
1917 * the front of the range.
1918 * i.e. one new_stripe forward from expand_progress new_maps
1919 * to after where expand_lo old_maps to
1920 */
1921 writepos = conf->expand_progress +
1922 conf->chunk_size/512*(conf->raid_disks-1);
1923 sector_div(writepos, conf->raid_disks-1);
1924 safepos = conf->expand_lo;
1925 sector_div(safepos, conf->previous_raid_disks-1);
1926 gap = conf->expand_progress - conf->expand_lo;
1927
1928 if (writepos >= safepos ||
1929 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
1930 /* Cannot proceed until we've updated the superblock... */
1931 wait_event(conf->wait_for_overlap,
1932 atomic_read(&conf->reshape_stripes)==0);
1933 mddev->reshape_position = conf->expand_progress;
1934 mddev->sb_dirty = 1;
1935 md_wakeup_thread(mddev->thread);
1936 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
1937 kthread_should_stop());
1938 spin_lock_irq(&conf->device_lock);
1939 conf->expand_lo = mddev->reshape_position;
1940 spin_unlock_irq(&conf->device_lock);
1941 wake_up(&conf->wait_for_overlap);
1942 }
1943
1944 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
1945 int j;
1946 int skipped = 0;
1947 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
1948 sh = get_active_stripe(conf, sector_nr+i,
1949 conf->raid_disks, pd_idx, 0);
1950 set_bit(STRIPE_EXPANDING, &sh->state);
1951 atomic_inc(&conf->reshape_stripes);
1952 /* If any of this stripe is beyond the end of the old
1953 * array, then we need to zero those blocks
1954 */
1955 for (j=sh->disks; j--;) {
1956 sector_t s;
1957 if (j == sh->pd_idx)
1958 continue;
1959 s = compute_blocknr(sh, j);
1960 if (s < (mddev->array_size<<1)) {
1961 skipped = 1;
1962 continue;
1963 }
1964 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
1965 set_bit(R5_Expanded, &sh->dev[j].flags);
1966 set_bit(R5_UPTODATE, &sh->dev[j].flags);
1967 }
1968 if (!skipped) {
1969 set_bit(STRIPE_EXPAND_READY, &sh->state);
1970 set_bit(STRIPE_HANDLE, &sh->state);
1971 }
1972 release_stripe(sh);
1973 }
1974 spin_lock_irq(&conf->device_lock);
1975 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
1976 spin_unlock_irq(&conf->device_lock);
1977 /* Ok, those stripe are ready. We can start scheduling
1978 * reads on the source stripes.
1979 * The source stripes are determined by mapping the first and last
1980 * block on the destination stripes.
1981 */
1982 raid_disks = conf->previous_raid_disks;
1983 data_disks = raid_disks - 1;
1984 first_sector =
1985 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
1986 raid_disks, data_disks,
1987 &dd_idx, &pd_idx, conf);
1988 last_sector =
1989 raid5_compute_sector((sector_nr+conf->chunk_size/512)
1990 *(conf->raid_disks-1) -1,
1991 raid_disks, data_disks,
1992 &dd_idx, &pd_idx, conf);
1993 if (last_sector >= (mddev->size<<1))
1994 last_sector = (mddev->size<<1)-1;
1995 while (first_sector <= last_sector) {
1996 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
1997 sh = get_active_stripe(conf, first_sector,
1998 conf->previous_raid_disks, pd_idx, 0);
1999 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2000 set_bit(STRIPE_HANDLE, &sh->state);
2001 release_stripe(sh);
2002 first_sector += STRIPE_SECTORS;
2003 }
2004 return conf->chunk_size>>9;
2005 }
2006 /* if there is 1 or more failed drives and we are trying
2007 * to resync, then assert that we are finished, because there is 2891 * to resync, then assert that we are finished, because there is
2008 * nothing we can do. 2892 * nothing we can do.
2009 */ 2893 */
2010 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2894 if (mddev->degraded >= conf->max_degraded &&
2895 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2011 sector_t rv = (mddev->size << 1) - sector_nr; 2896 sector_t rv = (mddev->size << 1) - sector_nr;
2012 *skipped = 1; 2897 *skipped = 1;
2013 return rv; 2898 return rv;
@@ -2026,17 +2911,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2026 if (sh == NULL) { 2911 if (sh == NULL) {
2027 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 2912 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
2028 /* make sure we don't swamp the stripe cache if someone else 2913 /* make sure we don't swamp the stripe cache if someone else
2029 * is trying to get access 2914 * is trying to get access
2030 */ 2915 */
2031 schedule_timeout_uninterruptible(1); 2916 schedule_timeout_uninterruptible(1);
2032 } 2917 }
2033 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); 2918 /* Need to check if array will still be degraded after recovery/resync
2034 spin_lock(&sh->lock); 2919 * We don't need to check the 'failed' flag as when that gets set,
2920 * recovery aborts.
2921 */
2922 for (i=0; i<mddev->raid_disks; i++)
2923 if (conf->disks[i].rdev == NULL)
2924 still_degraded = 1;
2925
2926 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
2927
2928 spin_lock(&sh->lock);
2035 set_bit(STRIPE_SYNCING, &sh->state); 2929 set_bit(STRIPE_SYNCING, &sh->state);
2036 clear_bit(STRIPE_INSYNC, &sh->state); 2930 clear_bit(STRIPE_INSYNC, &sh->state);
2037 spin_unlock(&sh->lock); 2931 spin_unlock(&sh->lock);
2038 2932
2039 handle_stripe(sh); 2933 handle_stripe(sh, NULL);
2040 release_stripe(sh); 2934 release_stripe(sh);
2041 2935
2042 return STRIPE_SECTORS; 2936 return STRIPE_SECTORS;
@@ -2064,7 +2958,7 @@ static void raid5d (mddev_t *mddev)
2064 while (1) { 2958 while (1) {
2065 struct list_head *first; 2959 struct list_head *first;
2066 2960
2067 if (conf->seq_flush - conf->seq_write > 0) { 2961 if (conf->seq_flush != conf->seq_write) {
2068 int seq = conf->seq_flush; 2962 int seq = conf->seq_flush;
2069 spin_unlock_irq(&conf->device_lock); 2963 spin_unlock_irq(&conf->device_lock);
2070 bitmap_unplug(mddev->bitmap); 2964 bitmap_unplug(mddev->bitmap);
@@ -2091,7 +2985,7 @@ static void raid5d (mddev_t *mddev)
2091 spin_unlock_irq(&conf->device_lock); 2985 spin_unlock_irq(&conf->device_lock);
2092 2986
2093 handled++; 2987 handled++;
2094 handle_stripe(sh); 2988 handle_stripe(sh, conf->spare_page);
2095 release_stripe(sh); 2989 release_stripe(sh);
2096 2990
2097 spin_lock_irq(&conf->device_lock); 2991 spin_lock_irq(&conf->device_lock);
@@ -2181,8 +3075,8 @@ static int run(mddev_t *mddev)
2181 struct disk_info *disk; 3075 struct disk_info *disk;
2182 struct list_head *tmp; 3076 struct list_head *tmp;
2183 3077
2184 if (mddev->level != 5 && mddev->level != 4) { 3078 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
2185 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", 3079 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
2186 mdname(mddev), mddev->level); 3080 mdname(mddev), mddev->level);
2187 return -EIO; 3081 return -EIO;
2188 } 3082 }
@@ -2251,6 +3145,11 @@ static int run(mddev_t *mddev)
2251 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 3145 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2252 goto abort; 3146 goto abort;
2253 3147
3148 if (mddev->level == 6) {
3149 conf->spare_page = alloc_page(GFP_KERNEL);
3150 if (!conf->spare_page)
3151 goto abort;
3152 }
2254 spin_lock_init(&conf->device_lock); 3153 spin_lock_init(&conf->device_lock);
2255 init_waitqueue_head(&conf->wait_for_stripe); 3154 init_waitqueue_head(&conf->wait_for_stripe);
2256 init_waitqueue_head(&conf->wait_for_overlap); 3155 init_waitqueue_head(&conf->wait_for_overlap);
@@ -2282,12 +3181,16 @@ static int run(mddev_t *mddev)
2282 } 3181 }
2283 3182
2284 /* 3183 /*
2285 * 0 for a fully functional array, 1 for a degraded array. 3184 * 0 for a fully functional array, 1 or 2 for a degraded array.
2286 */ 3185 */
2287 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; 3186 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2288 conf->mddev = mddev; 3187 conf->mddev = mddev;
2289 conf->chunk_size = mddev->chunk_size; 3188 conf->chunk_size = mddev->chunk_size;
2290 conf->level = mddev->level; 3189 conf->level = mddev->level;
3190 if (conf->level == 6)
3191 conf->max_degraded = 2;
3192 else
3193 conf->max_degraded = 1;
2291 conf->algorithm = mddev->layout; 3194 conf->algorithm = mddev->layout;
2292 conf->max_nr_stripes = NR_STRIPES; 3195 conf->max_nr_stripes = NR_STRIPES;
2293 conf->expand_progress = mddev->reshape_position; 3196 conf->expand_progress = mddev->reshape_position;
@@ -2296,6 +3199,11 @@ static int run(mddev_t *mddev)
2296 mddev->size &= ~(mddev->chunk_size/1024 -1); 3199 mddev->size &= ~(mddev->chunk_size/1024 -1);
2297 mddev->resync_max_sectors = mddev->size << 1; 3200 mddev->resync_max_sectors = mddev->size << 1;
2298 3201
3202 if (conf->level == 6 && conf->raid_disks < 4) {
3203 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
3204 mdname(mddev), conf->raid_disks);
3205 goto abort;
3206 }
2299 if (!conf->chunk_size || conf->chunk_size % 4) { 3207 if (!conf->chunk_size || conf->chunk_size % 4) {
2300 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 3208 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
2301 conf->chunk_size, mdname(mddev)); 3209 conf->chunk_size, mdname(mddev));
@@ -2307,14 +3215,14 @@ static int run(mddev_t *mddev)
2307 conf->algorithm, mdname(mddev)); 3215 conf->algorithm, mdname(mddev));
2308 goto abort; 3216 goto abort;
2309 } 3217 }
2310 if (mddev->degraded > 1) { 3218 if (mddev->degraded > conf->max_degraded) {
2311 printk(KERN_ERR "raid5: not enough operational devices for %s" 3219 printk(KERN_ERR "raid5: not enough operational devices for %s"
2312 " (%d/%d failed)\n", 3220 " (%d/%d failed)\n",
2313 mdname(mddev), conf->failed_disks, conf->raid_disks); 3221 mdname(mddev), conf->failed_disks, conf->raid_disks);
2314 goto abort; 3222 goto abort;
2315 } 3223 }
2316 3224
2317 if (mddev->degraded == 1 && 3225 if (mddev->degraded > 0 &&
2318 mddev->recovery_cp != MaxSector) { 3226 mddev->recovery_cp != MaxSector) {
2319 if (mddev->ok_start_degraded) 3227 if (mddev->ok_start_degraded)
2320 printk(KERN_WARNING 3228 printk(KERN_WARNING
@@ -2373,17 +3281,15 @@ static int run(mddev_t *mddev)
2373 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3281 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2374 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3282 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2375 "%s_reshape"); 3283 "%s_reshape");
2376 /* FIXME if md_register_thread fails?? */
2377 md_wakeup_thread(mddev->sync_thread);
2378
2379 } 3284 }
2380 3285
2381 /* read-ahead size must cover two whole stripes, which is 3286 /* read-ahead size must cover two whole stripes, which is
 2382 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 3287 * 2 * (datadisks) * chunksize where 'datadisks' is the number of data devices
2383 */ 3288 */
2384 { 3289 {
2385 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 3290 int data_disks = conf->previous_raid_disks - conf->max_degraded;
2386 / PAGE_SIZE; 3291 int stripe = data_disks *
3292 (mddev->chunk_size / PAGE_SIZE);
2387 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3293 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2388 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3294 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2389 } 3295 }
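With max_degraded taken into account, the read-ahead sizing above works out as follows for, say, a hypothetical 6-drive RAID-6 with 64 KiB chunks and 4 KiB pages (a sketch with made-up numbers, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 6, max_degraded = 2;		/* 6-drive RAID-6 */
		int chunk_size = 64 * 1024, page_size = 4096;	/* illustrative values */
		int data_disks = raid_disks - max_degraded;	/* 4 data devices per stripe */
		int stripe = data_disks * (chunk_size / page_size);	/* pages in one full stripe */

		printf("ra_pages raised to at least %d pages (%d KiB)\n",
		       2 * stripe, 2 * stripe * page_size / 1024);
		return 0;
	}

Here stripe = 4 * 16 = 64 pages, so read-ahead is bumped to at least 128 pages (512 KiB).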
@@ -2393,12 +3299,14 @@ static int run(mddev_t *mddev)
2393 3299
2394 mddev->queue->unplug_fn = raid5_unplug_device; 3300 mddev->queue->unplug_fn = raid5_unplug_device;
2395 mddev->queue->issue_flush_fn = raid5_issue_flush; 3301 mddev->queue->issue_flush_fn = raid5_issue_flush;
2396 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); 3302 mddev->array_size = mddev->size * (conf->previous_raid_disks -
3303 conf->max_degraded);
2397 3304
2398 return 0; 3305 return 0;
2399abort: 3306abort:
2400 if (conf) { 3307 if (conf) {
2401 print_raid5_conf(conf); 3308 print_raid5_conf(conf);
3309 safe_put_page(conf->spare_page);
2402 kfree(conf->disks); 3310 kfree(conf->disks);
2403 kfree(conf->stripe_hashtbl); 3311 kfree(conf->stripe_hashtbl);
2404 kfree(conf); 3312 kfree(conf);
@@ -2427,23 +3335,23 @@ static int stop(mddev_t *mddev)
2427} 3335}
2428 3336
2429#if RAID5_DEBUG 3337#if RAID5_DEBUG
2430static void print_sh (struct stripe_head *sh) 3338static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2431{ 3339{
2432 int i; 3340 int i;
2433 3341
2434 printk("sh %llu, pd_idx %d, state %ld.\n", 3342 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2435 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 3343 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2436 printk("sh %llu, count %d.\n", 3344 seq_printf(seq, "sh %llu, count %d.\n",
2437 (unsigned long long)sh->sector, atomic_read(&sh->count)); 3345 (unsigned long long)sh->sector, atomic_read(&sh->count));
2438 printk("sh %llu, ", (unsigned long long)sh->sector); 3346 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2439 for (i = 0; i < sh->disks; i++) { 3347 for (i = 0; i < sh->disks; i++) {
2440 printk("(cache%d: %p %ld) ", 3348 seq_printf(seq, "(cache%d: %p %ld) ",
2441 i, sh->dev[i].page, sh->dev[i].flags); 3349 i, sh->dev[i].page, sh->dev[i].flags);
2442 } 3350 }
2443 printk("\n"); 3351 seq_printf(seq, "\n");
2444} 3352}
2445 3353
2446static void printall (raid5_conf_t *conf) 3354static void printall (struct seq_file *seq, raid5_conf_t *conf)
2447{ 3355{
2448 struct stripe_head *sh; 3356 struct stripe_head *sh;
2449 struct hlist_node *hn; 3357 struct hlist_node *hn;
@@ -2454,7 +3362,7 @@ static void printall (raid5_conf_t *conf)
2454 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 3362 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2455 if (sh->raid_conf != conf) 3363 if (sh->raid_conf != conf)
2456 continue; 3364 continue;
2457 print_sh(sh); 3365 print_sh(seq, sh);
2458 } 3366 }
2459 } 3367 }
2460 spin_unlock_irq(&conf->device_lock); 3368 spin_unlock_irq(&conf->device_lock);
@@ -2474,9 +3382,8 @@ static void status (struct seq_file *seq, mddev_t *mddev)
2474 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 3382 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2475 seq_printf (seq, "]"); 3383 seq_printf (seq, "]");
2476#if RAID5_DEBUG 3384#if RAID5_DEBUG
2477#define D(x) \ 3385 seq_printf (seq, "\n");
2478 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) 3386 printall(seq, conf);
2479 printall(conf);
2480#endif 3387#endif
2481} 3388}
2482 3389
@@ -2560,14 +3467,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2560 int disk; 3467 int disk;
2561 struct disk_info *p; 3468 struct disk_info *p;
2562 3469
2563 if (mddev->degraded > 1) 3470 if (mddev->degraded > conf->max_degraded)
2564 /* no point adding a device */ 3471 /* no point adding a device */
2565 return 0; 3472 return 0;
2566 3473
2567 /* 3474 /*
2568 * find the disk ... 3475 * find the disk ... but prefer rdev->saved_raid_disk
3476 * if possible.
2569 */ 3477 */
2570 for (disk=0; disk < conf->raid_disks; disk++) 3478 if (rdev->saved_raid_disk >= 0 &&
3479 conf->disks[rdev->saved_raid_disk].rdev == NULL)
3480 disk = rdev->saved_raid_disk;
3481 else
3482 disk = 0;
3483 for ( ; disk < conf->raid_disks; disk++)
2571 if ((p=conf->disks + disk)->rdev == NULL) { 3484 if ((p=conf->disks + disk)->rdev == NULL) {
2572 clear_bit(In_sync, &rdev->flags); 3485 clear_bit(In_sync, &rdev->flags);
2573 rdev->raid_disk = disk; 3486 rdev->raid_disk = disk;
@@ -2590,8 +3503,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2590 * any io in the removed space completes, but it hardly seems 3503 * any io in the removed space completes, but it hardly seems
2591 * worth it. 3504 * worth it.
2592 */ 3505 */
3506 raid5_conf_t *conf = mddev_to_conf(mddev);
3507
2593 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 3508 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2594 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; 3509 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
2595 set_capacity(mddev->gendisk, mddev->array_size << 1); 3510 set_capacity(mddev->gendisk, mddev->array_size << 1);
2596 mddev->changed = 1; 3511 mddev->changed = 1;
2597 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 3512 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
@@ -2680,6 +3595,7 @@ static int raid5_start_reshape(mddev_t *mddev)
2680 set_bit(In_sync, &rdev->flags); 3595 set_bit(In_sync, &rdev->flags);
2681 conf->working_disks++; 3596 conf->working_disks++;
2682 added_devices++; 3597 added_devices++;
3598 rdev->recovery_offset = 0;
2683 sprintf(nm, "rd%d", rdev->raid_disk); 3599 sprintf(nm, "rd%d", rdev->raid_disk);
2684 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3600 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2685 } else 3601 } else
@@ -2731,6 +3647,17 @@ static void end_reshape(raid5_conf_t *conf)
2731 conf->expand_progress = MaxSector; 3647 conf->expand_progress = MaxSector;
2732 spin_unlock_irq(&conf->device_lock); 3648 spin_unlock_irq(&conf->device_lock);
2733 conf->mddev->reshape_position = MaxSector; 3649 conf->mddev->reshape_position = MaxSector;
3650
3651 /* read-ahead size must cover two whole stripes, which is
 3652 * 2 * (datadisks) * chunksize where 'datadisks' is the number of data devices
3653 */
3654 {
3655 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3656 int stripe = data_disks *
3657 (conf->mddev->chunk_size / PAGE_SIZE);
3658 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3659 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3660 }
2734 } 3661 }
2735} 3662}
2736 3663
@@ -2762,6 +3689,23 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2762 } 3689 }
2763} 3690}
2764 3691
3692static struct mdk_personality raid6_personality =
3693{
3694 .name = "raid6",
3695 .level = 6,
3696 .owner = THIS_MODULE,
3697 .make_request = make_request,
3698 .run = run,
3699 .stop = stop,
3700 .status = status,
3701 .error_handler = error,
3702 .hot_add_disk = raid5_add_disk,
3703 .hot_remove_disk= raid5_remove_disk,
3704 .spare_active = raid5_spare_active,
3705 .sync_request = sync_request,
3706 .resize = raid5_resize,
3707 .quiesce = raid5_quiesce,
3708};
2765static struct mdk_personality raid5_personality = 3709static struct mdk_personality raid5_personality =
2766{ 3710{
2767 .name = "raid5", 3711 .name = "raid5",
@@ -2804,6 +3748,12 @@ static struct mdk_personality raid4_personality =
2804 3748
2805static int __init raid5_init(void) 3749static int __init raid5_init(void)
2806{ 3750{
3751 int e;
3752
3753 e = raid6_select_algo();
3754 if ( e )
3755 return e;
3756 register_md_personality(&raid6_personality);
2807 register_md_personality(&raid5_personality); 3757 register_md_personality(&raid5_personality);
2808 register_md_personality(&raid4_personality); 3758 register_md_personality(&raid4_personality);
2809 return 0; 3759 return 0;
@@ -2811,6 +3761,7 @@ static int __init raid5_init(void)
2811 3761
2812static void raid5_exit(void) 3762static void raid5_exit(void)
2813{ 3763{
3764 unregister_md_personality(&raid6_personality);
2814 unregister_md_personality(&raid5_personality); 3765 unregister_md_personality(&raid5_personality);
2815 unregister_md_personality(&raid4_personality); 3766 unregister_md_personality(&raid4_personality);
2816} 3767}
@@ -2823,3 +3774,10 @@ MODULE_ALIAS("md-raid5");
2823MODULE_ALIAS("md-raid4"); 3774MODULE_ALIAS("md-raid4");
2824MODULE_ALIAS("md-level-5"); 3775MODULE_ALIAS("md-level-5");
2825MODULE_ALIAS("md-level-4"); 3776MODULE_ALIAS("md-level-4");
3777MODULE_ALIAS("md-personality-8"); /* RAID6 */
3778MODULE_ALIAS("md-raid6");
3779MODULE_ALIAS("md-level-6");
3780
3781/* This used to be two separate modules, they were: */
3782MODULE_ALIAS("raid5");
3783MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
deleted file mode 100644
index bc69355e0100..000000000000
--- a/drivers/md/raid6main.c
+++ /dev/null
@@ -1,2427 +0,0 @@
1/*
2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
9 *
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28#include <linux/bitops.h>
29#include <asm/atomic.h>
30#include "raid6.h"
31
32#include <linux/raid/bitmap.h>
33
34/*
35 * Stripe cache
36 */
37
38#define NR_STRIPES 256
39#define STRIPE_SIZE PAGE_SIZE
40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1
43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_MASK (NR_HASH - 1)
45
46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
47
48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer
53 * be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
56 */
57#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
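A minimal user-space sketch of that rule, with a stub type standing in for struct bio (illustrative only): advance along bi_next only while the current bio still ends inside this stripe+device's STRIPE_SECTORS window.

	#include <stdio.h>

	#define STRIPE_SECTORS 8

	struct bio_stub { long long bi_sector; unsigned bi_size; struct bio_stub *bi_next; };

	/* Same test as r5_next_bio: does this bio end before the stripe window does? */
	static struct bio_stub *r5_next_bio_stub(struct bio_stub *bio, long long sect)
	{
		return (bio->bi_sector + (bio->bi_size >> 9) < sect + STRIPE_SECTORS)
		       ? bio->bi_next : NULL;
	}

	int main(void)
	{
		struct bio_stub b2 = { 108, 4 * 512, NULL };	/* extends past this stripe */
		struct bio_stub b1 = { 100, 4 * 512, &b2 };	/* ends at sector 104, inside [100,108) */
		long long dev_sector = 100;

		printf("%s\n", r5_next_bio_stub(&b1, dev_sector) ? "walk to next bio" : "stop");
		printf("%s\n", r5_next_bio_stub(&b2, dev_sector) ? "walk to next bio" : "stop");
		return 0;
	}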
58/*
59 * The following can be used to debug the driver
60 */
61#define RAID6_DEBUG 0 /* Extremely verbose printk */
62#define RAID6_PARANOIA 1 /* Check spinlocks */
63#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64#if RAID6_PARANOIA && defined(CONFIG_SMP)
65# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
66#else
67# define CHECK_DEVLOCK()
68#endif
69
70#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
71#if RAID6_DEBUG
72#undef inline
73#undef __inline__
74#define inline
75#define __inline__
76#endif
77
78#if !RAID6_USE_EMPTY_ZERO_PAGE
79/* In .bss so it's zeroed */
80const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
81#endif
82
83static inline int raid6_next_disk(int disk, int raid_disks)
84{
85 disk++;
86 return (disk < raid_disks) ? disk : 0;
87}
88
89static void print_raid6_conf (raid6_conf_t *conf);
90
91static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
92{
93 if (atomic_dec_and_test(&sh->count)) {
94 BUG_ON(!list_empty(&sh->lru));
95 BUG_ON(atomic_read(&conf->active_stripes)==0);
96 if (test_bit(STRIPE_HANDLE, &sh->state)) {
97 if (test_bit(STRIPE_DELAYED, &sh->state))
98 list_add_tail(&sh->lru, &conf->delayed_list);
99 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
100 conf->seq_write == sh->bm_seq)
101 list_add_tail(&sh->lru, &conf->bitmap_list);
102 else {
103 clear_bit(STRIPE_BIT_DELAY, &sh->state);
104 list_add_tail(&sh->lru, &conf->handle_list);
105 }
106 md_wakeup_thread(conf->mddev->thread);
107 } else {
108 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
109 atomic_dec(&conf->preread_active_stripes);
110 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
111 md_wakeup_thread(conf->mddev->thread);
112 }
113 list_add_tail(&sh->lru, &conf->inactive_list);
114 atomic_dec(&conf->active_stripes);
115 if (!conf->inactive_blocked ||
116 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
117 wake_up(&conf->wait_for_stripe);
118 }
119 }
120}
121static void release_stripe(struct stripe_head *sh)
122{
123 raid6_conf_t *conf = sh->raid_conf;
124 unsigned long flags;
125
126 spin_lock_irqsave(&conf->device_lock, flags);
127 __release_stripe(conf, sh);
128 spin_unlock_irqrestore(&conf->device_lock, flags);
129}
130
131static inline void remove_hash(struct stripe_head *sh)
132{
133 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
134
135 hlist_del_init(&sh->hash);
136}
137
138static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
139{
140 struct hlist_head *hp = stripe_hash(conf, sh->sector);
141
142 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
143
144 CHECK_DEVLOCK();
145 hlist_add_head(&sh->hash, hp);
146}
147
148
149/* find an idle stripe, make sure it is unhashed, and return it. */
150static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
151{
152 struct stripe_head *sh = NULL;
153 struct list_head *first;
154
155 CHECK_DEVLOCK();
156 if (list_empty(&conf->inactive_list))
157 goto out;
158 first = conf->inactive_list.next;
159 sh = list_entry(first, struct stripe_head, lru);
160 list_del_init(first);
161 remove_hash(sh);
162 atomic_inc(&conf->active_stripes);
163out:
164 return sh;
165}
166
167static void shrink_buffers(struct stripe_head *sh, int num)
168{
169 struct page *p;
170 int i;
171
172 for (i=0; i<num ; i++) {
173 p = sh->dev[i].page;
174 if (!p)
175 continue;
176 sh->dev[i].page = NULL;
177 put_page(p);
178 }
179}
180
181static int grow_buffers(struct stripe_head *sh, int num)
182{
183 int i;
184
185 for (i=0; i<num; i++) {
186 struct page *page;
187
188 if (!(page = alloc_page(GFP_KERNEL))) {
189 return 1;
190 }
191 sh->dev[i].page = page;
192 }
193 return 0;
194}
195
196static void raid6_build_block (struct stripe_head *sh, int i);
197
198static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
199{
200 raid6_conf_t *conf = sh->raid_conf;
201 int disks = conf->raid_disks, i;
202
203 BUG_ON(atomic_read(&sh->count) != 0);
204 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
205
206 CHECK_DEVLOCK();
207 PRINTK("init_stripe called, stripe %llu\n",
208 (unsigned long long)sh->sector);
209
210 remove_hash(sh);
211
212 sh->sector = sector;
213 sh->pd_idx = pd_idx;
214 sh->state = 0;
215
216 for (i=disks; i--; ) {
217 struct r5dev *dev = &sh->dev[i];
218
219 if (dev->toread || dev->towrite || dev->written ||
220 test_bit(R5_LOCKED, &dev->flags)) {
221 PRINTK("sector=%llx i=%d %p %p %p %d\n",
222 (unsigned long long)sh->sector, i, dev->toread,
223 dev->towrite, dev->written,
224 test_bit(R5_LOCKED, &dev->flags));
225 BUG();
226 }
227 dev->flags = 0;
228 raid6_build_block(sh, i);
229 }
230 insert_hash(conf, sh);
231}
232
233static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
234{
235 struct stripe_head *sh;
236 struct hlist_node *hn;
237
238 CHECK_DEVLOCK();
239 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
240 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
241 if (sh->sector == sector)
242 return sh;
243 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
244 return NULL;
245}
246
247static void unplug_slaves(mddev_t *mddev);
248
249static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
250 int pd_idx, int noblock)
251{
252 struct stripe_head *sh;
253
254 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
255
256 spin_lock_irq(&conf->device_lock);
257
258 do {
259 wait_event_lock_irq(conf->wait_for_stripe,
260 conf->quiesce == 0,
261 conf->device_lock, /* nothing */);
262 sh = __find_stripe(conf, sector);
263 if (!sh) {
264 if (!conf->inactive_blocked)
265 sh = get_free_stripe(conf);
266 if (noblock && sh == NULL)
267 break;
268 if (!sh) {
269 conf->inactive_blocked = 1;
270 wait_event_lock_irq(conf->wait_for_stripe,
271 !list_empty(&conf->inactive_list) &&
272 (atomic_read(&conf->active_stripes)
273 < (conf->max_nr_stripes *3/4)
274 || !conf->inactive_blocked),
275 conf->device_lock,
276 unplug_slaves(conf->mddev);
277 );
278 conf->inactive_blocked = 0;
279 } else
280 init_stripe(sh, sector, pd_idx);
281 } else {
282 if (atomic_read(&sh->count)) {
283 BUG_ON(!list_empty(&sh->lru));
284 } else {
285 if (!test_bit(STRIPE_HANDLE, &sh->state))
286 atomic_inc(&conf->active_stripes);
287 BUG_ON(list_empty(&sh->lru));
288 list_del_init(&sh->lru);
289 }
290 }
291 } while (sh == NULL);
292
293 if (sh)
294 atomic_inc(&sh->count);
295
296 spin_unlock_irq(&conf->device_lock);
297 return sh;
298}
299
300static int grow_one_stripe(raid6_conf_t *conf)
301{
302 struct stripe_head *sh;
303 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
304 if (!sh)
305 return 0;
306 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
307 sh->raid_conf = conf;
308 spin_lock_init(&sh->lock);
309
310 if (grow_buffers(sh, conf->raid_disks)) {
311 shrink_buffers(sh, conf->raid_disks);
312 kmem_cache_free(conf->slab_cache, sh);
313 return 0;
314 }
315 /* we just created an active stripe so... */
316 atomic_set(&sh->count, 1);
317 atomic_inc(&conf->active_stripes);
318 INIT_LIST_HEAD(&sh->lru);
319 release_stripe(sh);
320 return 1;
321}
322
323static int grow_stripes(raid6_conf_t *conf, int num)
324{
325 kmem_cache_t *sc;
326 int devs = conf->raid_disks;
327
328 sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
329
330 sc = kmem_cache_create(conf->cache_name[0],
331 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
332 0, 0, NULL, NULL);
333 if (!sc)
334 return 1;
335 conf->slab_cache = sc;
336 while (num--)
337 if (!grow_one_stripe(conf))
338 return 1;
339 return 0;
340}
341
342static int drop_one_stripe(raid6_conf_t *conf)
343{
344 struct stripe_head *sh;
345 spin_lock_irq(&conf->device_lock);
346 sh = get_free_stripe(conf);
347 spin_unlock_irq(&conf->device_lock);
348 if (!sh)
349 return 0;
350 BUG_ON(atomic_read(&sh->count));
351 shrink_buffers(sh, conf->raid_disks);
352 kmem_cache_free(conf->slab_cache, sh);
353 atomic_dec(&conf->active_stripes);
354 return 1;
355}
356
357static void shrink_stripes(raid6_conf_t *conf)
358{
359 while (drop_one_stripe(conf))
360 ;
361
362 if (conf->slab_cache)
363 kmem_cache_destroy(conf->slab_cache);
364 conf->slab_cache = NULL;
365}
366
367static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
368 int error)
369{
370 struct stripe_head *sh = bi->bi_private;
371 raid6_conf_t *conf = sh->raid_conf;
372 int disks = conf->raid_disks, i;
373 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
374
375 if (bi->bi_size)
376 return 1;
377
378 for (i=0 ; i<disks; i++)
379 if (bi == &sh->dev[i].req)
380 break;
381
382 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
383 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
384 uptodate);
385 if (i == disks) {
386 BUG();
387 return 0;
388 }
389
390 if (uptodate) {
391#if 0
392 struct bio *bio;
393 unsigned long flags;
394 spin_lock_irqsave(&conf->device_lock, flags);
395 /* we can return a buffer if we bypassed the cache or
396 * if the top buffer is not in highmem. If there are
397 * multiple buffers, leave the extra work to
398 * handle_stripe
399 */
400 buffer = sh->bh_read[i];
401 if (buffer &&
402 (!PageHighMem(buffer->b_page)
403 || buffer->b_page == bh->b_page )
404 ) {
405 sh->bh_read[i] = buffer->b_reqnext;
406 buffer->b_reqnext = NULL;
407 } else
408 buffer = NULL;
409 spin_unlock_irqrestore(&conf->device_lock, flags);
410 if (sh->bh_page[i]==bh->b_page)
411 set_buffer_uptodate(bh);
412 if (buffer) {
413 if (buffer->b_page != bh->b_page)
414 memcpy(buffer->b_data, bh->b_data, bh->b_size);
415 buffer->b_end_io(buffer, 1);
416 }
417#else
418 set_bit(R5_UPTODATE, &sh->dev[i].flags);
419#endif
420 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
421 printk(KERN_INFO "raid6: read error corrected!!\n");
422 clear_bit(R5_ReadError, &sh->dev[i].flags);
423 clear_bit(R5_ReWrite, &sh->dev[i].flags);
424 }
425 if (atomic_read(&conf->disks[i].rdev->read_errors))
426 atomic_set(&conf->disks[i].rdev->read_errors, 0);
427 } else {
428 int retry = 0;
429 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
430 atomic_inc(&conf->disks[i].rdev->read_errors);
431 if (conf->mddev->degraded)
432 printk(KERN_WARNING "raid6: read error not correctable.\n");
433 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
434 /* Oh, no!!! */
435 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
436 else if (atomic_read(&conf->disks[i].rdev->read_errors)
437 > conf->max_nr_stripes)
438 printk(KERN_WARNING
439 "raid6: Too many read errors, failing device.\n");
440 else
441 retry = 1;
442 if (retry)
443 set_bit(R5_ReadError, &sh->dev[i].flags);
444 else {
445 clear_bit(R5_ReadError, &sh->dev[i].flags);
446 clear_bit(R5_ReWrite, &sh->dev[i].flags);
447 md_error(conf->mddev, conf->disks[i].rdev);
448 }
449 }
450 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
451#if 0
452 /* must restore b_page before unlocking buffer... */
453 if (sh->bh_page[i] != bh->b_page) {
454 bh->b_page = sh->bh_page[i];
455 bh->b_data = page_address(bh->b_page);
456 clear_buffer_uptodate(bh);
457 }
458#endif
459 clear_bit(R5_LOCKED, &sh->dev[i].flags);
460 set_bit(STRIPE_HANDLE, &sh->state);
461 release_stripe(sh);
462 return 0;
463}
464
465static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
466 int error)
467{
468 struct stripe_head *sh = bi->bi_private;
469 raid6_conf_t *conf = sh->raid_conf;
470 int disks = conf->raid_disks, i;
471 unsigned long flags;
472 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
473
474 if (bi->bi_size)
475 return 1;
476
477 for (i=0 ; i<disks; i++)
478 if (bi == &sh->dev[i].req)
479 break;
480
481 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
482 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
483 uptodate);
484 if (i == disks) {
485 BUG();
486 return 0;
487 }
488
489 spin_lock_irqsave(&conf->device_lock, flags);
490 if (!uptodate)
491 md_error(conf->mddev, conf->disks[i].rdev);
492
493 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
494
495 clear_bit(R5_LOCKED, &sh->dev[i].flags);
496 set_bit(STRIPE_HANDLE, &sh->state);
497 __release_stripe(conf, sh);
498 spin_unlock_irqrestore(&conf->device_lock, flags);
499 return 0;
500}
501
502
503static sector_t compute_blocknr(struct stripe_head *sh, int i);
504
505static void raid6_build_block (struct stripe_head *sh, int i)
506{
507 struct r5dev *dev = &sh->dev[i];
508 int pd_idx = sh->pd_idx;
509 int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
510
511 bio_init(&dev->req);
512 dev->req.bi_io_vec = &dev->vec;
513 dev->req.bi_vcnt++;
514 dev->req.bi_max_vecs++;
515 dev->vec.bv_page = dev->page;
516 dev->vec.bv_len = STRIPE_SIZE;
517 dev->vec.bv_offset = 0;
518
519 dev->req.bi_sector = sh->sector;
520 dev->req.bi_private = sh;
521
522 dev->flags = 0;
523 if (i != pd_idx && i != qd_idx)
524 dev->sector = compute_blocknr(sh, i);
525}
526
527static void error(mddev_t *mddev, mdk_rdev_t *rdev)
528{
529 char b[BDEVNAME_SIZE];
530 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
531 PRINTK("raid6: error called\n");
532
533 if (!test_bit(Faulty, &rdev->flags)) {
534 mddev->sb_dirty = 1;
535 if (test_bit(In_sync, &rdev->flags)) {
536 conf->working_disks--;
537 mddev->degraded++;
538 conf->failed_disks++;
539 clear_bit(In_sync, &rdev->flags);
540 /*
541 * if recovery was running, make sure it aborts.
542 */
543 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
544 }
545 set_bit(Faulty, &rdev->flags);
546 printk (KERN_ALERT
547 "raid6: Disk failure on %s, disabling device."
548 " Operation continuing on %d devices\n",
549 bdevname(rdev->bdev,b), conf->working_disks);
550 }
551}
552
553/*
554 * Input: a 'big' sector number,
555 * Output: index of the data and parity disk, and the sector # in them.
556 */
557static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
558 unsigned int data_disks, unsigned int * dd_idx,
559 unsigned int * pd_idx, raid6_conf_t *conf)
560{
561 long stripe;
562 unsigned long chunk_number;
563 unsigned int chunk_offset;
564 sector_t new_sector;
565 int sectors_per_chunk = conf->chunk_size >> 9;
566
567 /* First compute the information on this sector */
568
569 /*
570 * Compute the chunk number and the sector offset inside the chunk
571 */
572 chunk_offset = sector_div(r_sector, sectors_per_chunk);
573 chunk_number = r_sector;
574 if ( r_sector != chunk_number ) {
575 printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
576 (unsigned long long)r_sector, (unsigned long)chunk_number);
577 BUG();
578 }
579
580 /*
581 * Compute the stripe number
582 */
583 stripe = chunk_number / data_disks;
584
585 /*
586 * Compute the data disk and parity disk indexes inside the stripe
587 */
588 *dd_idx = chunk_number % data_disks;
589
590 /*
591 * Select the parity disk based on the user selected algorithm.
592 */
593
594 /**** FIX THIS ****/
595 switch (conf->algorithm) {
596 case ALGORITHM_LEFT_ASYMMETRIC:
597 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
598 if (*pd_idx == raid_disks-1)
599 (*dd_idx)++; /* Q D D D P */
600 else if (*dd_idx >= *pd_idx)
601 (*dd_idx) += 2; /* D D P Q D */
602 break;
603 case ALGORITHM_RIGHT_ASYMMETRIC:
604 *pd_idx = stripe % raid_disks;
605 if (*pd_idx == raid_disks-1)
606 (*dd_idx)++; /* Q D D D P */
607 else if (*dd_idx >= *pd_idx)
608 (*dd_idx) += 2; /* D D P Q D */
609 break;
610 case ALGORITHM_LEFT_SYMMETRIC:
611 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
612 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
613 break;
614 case ALGORITHM_RIGHT_SYMMETRIC:
615 *pd_idx = stripe % raid_disks;
616 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
617 break;
618 default:
619 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
620 conf->algorithm);
621 }
622
623 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
624 chunk_number, *pd_idx, *dd_idx);
625
626 /*
627 * Finally, compute the new sector number
628 */
629 new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
630 return new_sector;
631}
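All four layouts above place Q on the drive immediately after P and push the data-disk index past both. A small stand-alone sketch of the ALGORITHM_LEFT_SYMMETRIC rotation for a hypothetical 5-drive set, printing which physical drive holds P, Q and each data block per stripe (illustrative only, not the kernel mapping code):

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 5, data_disks = raid_disks - 2;
		long stripe;

		for (stripe = 0; stripe < 5; stripe++) {
			int pd_idx = raid_disks - 1 - (int)(stripe % raid_disks);	/* P placement */
			int qd_idx = (pd_idx + 1) % raid_disks;				/* Q always follows P */
			int d;

			printf("stripe %ld: P=%d Q=%d data:", stripe, pd_idx, qd_idx);
			for (d = 0; d < data_disks; d++)
				printf(" %d", (pd_idx + 2 + d) % raid_disks);	/* LEFT_SYMMETRIC data rotation */
			printf("\n");
		}
		return 0;
	}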
632
633
634static sector_t compute_blocknr(struct stripe_head *sh, int i)
635{
636 raid6_conf_t *conf = sh->raid_conf;
637 int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
638 sector_t new_sector = sh->sector, check;
639 int sectors_per_chunk = conf->chunk_size >> 9;
640 sector_t stripe;
641 int chunk_offset;
642 int chunk_number, dummy1, dummy2, dd_idx = i;
643 sector_t r_sector;
644 int i0 = i;
645
646 chunk_offset = sector_div(new_sector, sectors_per_chunk);
647 stripe = new_sector;
648 if ( new_sector != stripe ) {
649 printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
650 (unsigned long long)new_sector, (unsigned long)stripe);
651 BUG();
652 }
653
654 switch (conf->algorithm) {
655 case ALGORITHM_LEFT_ASYMMETRIC:
656 case ALGORITHM_RIGHT_ASYMMETRIC:
657 if (sh->pd_idx == raid_disks-1)
658 i--; /* Q D D D P */
659 else if (i > sh->pd_idx)
660 i -= 2; /* D D P Q D */
661 break;
662 case ALGORITHM_LEFT_SYMMETRIC:
663 case ALGORITHM_RIGHT_SYMMETRIC:
664 if (sh->pd_idx == raid_disks-1)
665 i--; /* Q D D D P */
666 else {
667 /* D D P Q D */
668 if (i < sh->pd_idx)
669 i += raid_disks;
670 i -= (sh->pd_idx + 2);
671 }
672 break;
673 default:
674 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
675 conf->algorithm);
676 }
677
678 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
679
680 chunk_number = stripe * data_disks + i;
681 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
682
683 check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
684 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
685 printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
686 return 0;
687 }
688 return r_sector;
689}
690
691
692
693/*
694 * Copy data between a page in the stripe cache, and one or more bion
695 * The page could align with the middle of the bio, or there could be
696 * several bion, each with several bio_vecs, which cover part of the page
697 * Multiple bion are linked together on bi_next. There may be extras
698 * at the end of this list. We ignore them.
699 */
700static void copy_data(int frombio, struct bio *bio,
701 struct page *page,
702 sector_t sector)
703{
704 char *pa = page_address(page);
705 struct bio_vec *bvl;
706 int i;
707 int page_offset;
708
709 if (bio->bi_sector >= sector)
710 page_offset = (signed)(bio->bi_sector - sector) * 512;
711 else
712 page_offset = (signed)(sector - bio->bi_sector) * -512;
713 bio_for_each_segment(bvl, bio, i) {
714 int len = bio_iovec_idx(bio,i)->bv_len;
715 int clen;
716 int b_offset = 0;
717
718 if (page_offset < 0) {
719 b_offset = -page_offset;
720 page_offset += b_offset;
721 len -= b_offset;
722 }
723
724 if (len > 0 && page_offset + len > STRIPE_SIZE)
725 clen = STRIPE_SIZE - page_offset;
726 else clen = len;
727
728 if (clen > 0) {
729 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
730 if (frombio)
731 memcpy(pa+page_offset, ba+b_offset, clen);
732 else
733 memcpy(ba+b_offset, pa+page_offset, clen);
734 __bio_kunmap_atomic(ba, KM_USER0);
735 }
736 if (clen < len) /* hit end of page */
737 break;
738 page_offset += len;
739 }
740}
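The offset arithmetic above can be seen in isolation: the signed byte offset between the bio start and the stripe sector decides where in the page to copy, and each segment is clipped at the page boundary. A worked sketch with illustrative sector numbers (not kernel code):

	#include <stdio.h>

	#define STRIPE_SIZE 4096	/* one stripe page */

	int main(void)
	{
		long long bio_sector = 1028, stripe_sector = 1024;	/* bio starts 4 sectors into the page */
		int page_offset = (int)(bio_sector - stripe_sector) * 512;
		int seg_len = 8192;					/* length of the first bio_vec, in bytes */
		int clen = (page_offset + seg_len > STRIPE_SIZE)
			   ? STRIPE_SIZE - page_offset : seg_len;	/* clip at the page boundary */

		printf("copy %d bytes at page offset %d; %d bytes belong to later stripes\n",
		       clen, page_offset, seg_len - clen);
		return 0;
	}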
741
742#define check_xor() do { \
743 if (count == MAX_XOR_BLOCKS) { \
744 xor_block(count, STRIPE_SIZE, ptr); \
745 count = 1; \
746 } \
747 } while(0)
748
749/* Compute P and Q syndromes */
750static void compute_parity(struct stripe_head *sh, int method)
751{
752 raid6_conf_t *conf = sh->raid_conf;
753 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
754 struct bio *chosen;
755 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
756 void *ptrs[disks];
757
758 qd_idx = raid6_next_disk(pd_idx, disks);
759 d0_idx = raid6_next_disk(qd_idx, disks);
760
761 PRINTK("compute_parity, stripe %llu, method %d\n",
762 (unsigned long long)sh->sector, method);
763
764 switch(method) {
765 case READ_MODIFY_WRITE:
766 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
767 case RECONSTRUCT_WRITE:
768 for (i= disks; i-- ;)
769 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
770 chosen = sh->dev[i].towrite;
771 sh->dev[i].towrite = NULL;
772
773 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
774 wake_up(&conf->wait_for_overlap);
775
776 BUG_ON(sh->dev[i].written);
777 sh->dev[i].written = chosen;
778 }
779 break;
780 case CHECK_PARITY:
781 BUG(); /* Not implemented yet */
782 }
783
784 for (i = disks; i--;)
785 if (sh->dev[i].written) {
786 sector_t sector = sh->dev[i].sector;
787 struct bio *wbi = sh->dev[i].written;
788 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
789 copy_data(1, wbi, sh->dev[i].page, sector);
790 wbi = r5_next_bio(wbi, sector);
791 }
792
793 set_bit(R5_LOCKED, &sh->dev[i].flags);
794 set_bit(R5_UPTODATE, &sh->dev[i].flags);
795 }
796
797// switch(method) {
798// case RECONSTRUCT_WRITE:
799// case CHECK_PARITY:
800// case UPDATE_PARITY:
801 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
802 /* FIX: Is this ordering of drives even remotely optimal? */
803 count = 0;
804 i = d0_idx;
805 do {
806 ptrs[count++] = page_address(sh->dev[i].page);
807 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
808 printk("block %d/%d not uptodate on parity calc\n", i,count);
809 i = raid6_next_disk(i, disks);
810 } while ( i != d0_idx );
811// break;
812// }
813
814 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
815
816 switch(method) {
817 case RECONSTRUCT_WRITE:
818 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
819 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
820 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
821 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
822 break;
823 case UPDATE_PARITY:
824 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
825 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
826 break;
827 }
828}
829
830/* Compute one missing block */
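/*
 * When 'nozero' is set the target page is not cleared first: the XOR of
 * the other up-to-date blocks is accumulated on top of whatever the page
 * already holds.  The parity-check path relies on this (XOR the data
 * blocks into the existing P page, then test the result with
 * page_is_zero()), and R5_UPTODATE is cleared afterwards because the
 * page no longer contains valid block data.
 */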
831static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
832{
833 raid6_conf_t *conf = sh->raid_conf;
834 int i, count, disks = conf->raid_disks;
835 void *ptr[MAX_XOR_BLOCKS], *p;
836 int pd_idx = sh->pd_idx;
837 int qd_idx = raid6_next_disk(pd_idx, disks);
838
839 PRINTK("compute_block_1, stripe %llu, idx %d\n",
840 (unsigned long long)sh->sector, dd_idx);
841
842 if ( dd_idx == qd_idx ) {
843 /* We're actually computing the Q drive */
844 compute_parity(sh, UPDATE_PARITY);
845 } else {
846 ptr[0] = page_address(sh->dev[dd_idx].page);
847 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
848 count = 1;
849 for (i = disks ; i--; ) {
850 if (i == dd_idx || i == qd_idx)
851 continue;
852 p = page_address(sh->dev[i].page);
853 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
854 ptr[count++] = p;
855 else
856				printk("compute_block_1() %d, stripe %llu, %d"
857 " not present\n", dd_idx,
858 (unsigned long long)sh->sector, i);
859
860 check_xor();
861 }
862 if (count != 1)
863 xor_block(count, STRIPE_SIZE, ptr);
864 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
865 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
866 }
867}
868
869/* Compute two missing blocks */
870static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871{
872 raid6_conf_t *conf = sh->raid_conf;
873 int i, count, disks = conf->raid_disks;
874 int pd_idx = sh->pd_idx;
875 int qd_idx = raid6_next_disk(pd_idx, disks);
876 int d0_idx = raid6_next_disk(qd_idx, disks);
877 int faila, failb;
878
879 /* faila and failb are disk numbers relative to d0_idx */
880	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
881 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
882 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
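	/* e.g. (illustrative values) with disks == 6 and pd_idx == 2:
	 * qd_idx == 3 and d0_idx == 4, so dd_idx 0 maps to 0+(6-4) = 2,
	 * dd_idx 5 maps to 5-4 = 1, pd_idx maps to 4 (disks-2) and
	 * qd_idx maps to 5 (disks-1), matching the comment above.
	 */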
883
884 BUG_ON(faila == failb);
885 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
886
887 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
888 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
889
890 if ( failb == disks-1 ) {
891 /* Q disk is one of the missing disks */
892 if ( faila == disks-2 ) {
893 /* Missing P+Q, just recompute */
894 compute_parity(sh, UPDATE_PARITY);
895 return;
896 } else {
897 /* We're missing D+Q; recompute D from P */
898 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
899 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
900 return;
901 }
902 }
903
904 /* We're missing D+P or D+D; build pointer table */
905 {
906 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
907 void *ptrs[disks];
908
909 count = 0;
910 i = d0_idx;
911 do {
912 ptrs[count++] = page_address(sh->dev[i].page);
913 i = raid6_next_disk(i, disks);
914 if (i != dd_idx1 && i != dd_idx2 &&
915 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
916 printk("compute_2 with missing block %d/%d\n", count, i);
917 } while ( i != d0_idx );
918
919 if ( failb == disks-2 ) {
920 /* We're missing D+P. */
921 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
922 } else {
923 /* We're missing D+D. */
924 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
925 }
926
927		/* Either of the above calls updates both missing blocks */
928 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
929 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
930 }
931}
932
933
934/*
935 * Each stripe/dev can have one or more bion attached.
936 * toread/towrite point to the first in a chain.
937 * The bi_next chain must be in order.
938 */
939static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
940{
941 struct bio **bip;
942 raid6_conf_t *conf = sh->raid_conf;
943 int firstwrite=0;
944
945 PRINTK("adding bh b#%llu to stripe s#%llu\n",
946 (unsigned long long)bi->bi_sector,
947 (unsigned long long)sh->sector);
948
949
950 spin_lock(&sh->lock);
951 spin_lock_irq(&conf->device_lock);
952 if (forwrite) {
953 bip = &sh->dev[dd_idx].towrite;
954 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
955 firstwrite = 1;
956 } else
957 bip = &sh->dev[dd_idx].toread;
958 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
959 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
960 goto overlap;
961 bip = &(*bip)->bi_next;
962 }
963 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
964 goto overlap;
965
966 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
967 if (*bip)
968 bi->bi_next = *bip;
969 *bip = bi;
970 bi->bi_phys_segments ++;
971 spin_unlock_irq(&conf->device_lock);
972 spin_unlock(&sh->lock);
973
974 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
975 (unsigned long long)bi->bi_sector,
976 (unsigned long long)sh->sector, dd_idx);
977
978 if (conf->mddev->bitmap && firstwrite) {
979 sh->bm_seq = conf->seq_write;
980 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
981 STRIPE_SECTORS, 0);
982 set_bit(STRIPE_BIT_DELAY, &sh->state);
983 }
984
985 if (forwrite) {
986 /* check if page is covered */
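		/* e.g. (illustrative values) with STRIPE_SECTORS == 8 and
		 * a stripe sector of 64, two queued writes covering
		 * sectors 64-67 and 68-71 advance 'sector' to 72, which
		 * is >= 64 + 8, so the whole page is covered and
		 * R5_OVERWRITE is set.
		 */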
987 sector_t sector = sh->dev[dd_idx].sector;
988 for (bi=sh->dev[dd_idx].towrite;
989 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
990 bi && bi->bi_sector <= sector;
991 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
992 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
993 sector = bi->bi_sector + (bi->bi_size>>9);
994 }
995 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
996 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
997 }
998 return 1;
999
1000 overlap:
1001 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1002 spin_unlock_irq(&conf->device_lock);
1003 spin_unlock(&sh->lock);
1004 return 0;
1005}
1006
1007
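/*
 * Return true if the whole STRIPE_SIZE page is zero: check the first
 * word, then memcmp() the page against itself shifted by four bytes,
 * which covers every remaining byte without needing a zeroed buffer.
 */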
1008static int page_is_zero(struct page *p)
1009{
1010 char *a = page_address(p);
1011 return ((*(u32*)a) == 0 &&
1012 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1013}
1014/*
1015 * handle_stripe - do things to a stripe.
1016 *
1017 * We lock the stripe and then examine the state of various bits
1018 * to see what needs to be done.
1019 * Possible results:
1020 * return some read requests which now have data
1021 * return some write requests which are safely on disc
1022 * schedule a read on some buffers
1023 * schedule a write of some buffers
1024 * return confirmation of parity correctness
1025 *
1026 * Parity calculations are done inside the stripe lock
1027 * buffers are taken off the toread/towrite lists, and the stripe's
1028 * cache pages get R5_LOCKED set before the stripe lock is released.
1029 *
1030 */
1031
1032static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1033{
1034 raid6_conf_t *conf = sh->raid_conf;
1035 int disks = conf->raid_disks;
1036 struct bio *return_bi= NULL;
1037 struct bio *bi;
1038 int i;
1039 int syncing;
1040 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1041 int non_overwrite = 0;
1042 int failed_num[2] = {0, 0};
1043 struct r5dev *dev, *pdev, *qdev;
1044 int pd_idx = sh->pd_idx;
1045 int qd_idx = raid6_next_disk(pd_idx, disks);
1046 int p_failed, q_failed;
1047
1048 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1049 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1050 pd_idx, qd_idx);
1051
1052 spin_lock(&sh->lock);
1053 clear_bit(STRIPE_HANDLE, &sh->state);
1054 clear_bit(STRIPE_DELAYED, &sh->state);
1055
1056 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1057 /* Now to look around and see what can be done */
1058
1059 rcu_read_lock();
1060 for (i=disks; i--; ) {
1061 mdk_rdev_t *rdev;
1062 dev = &sh->dev[i];
1063 clear_bit(R5_Insync, &dev->flags);
1064
1065 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1066 i, dev->flags, dev->toread, dev->towrite, dev->written);
1067 /* maybe we can reply to a read */
1068 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1069 struct bio *rbi, *rbi2;
1070 PRINTK("Return read for disc %d\n", i);
1071 spin_lock_irq(&conf->device_lock);
1072 rbi = dev->toread;
1073 dev->toread = NULL;
1074 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1075 wake_up(&conf->wait_for_overlap);
1076 spin_unlock_irq(&conf->device_lock);
1077 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1078 copy_data(0, rbi, dev->page, dev->sector);
1079 rbi2 = r5_next_bio(rbi, dev->sector);
1080 spin_lock_irq(&conf->device_lock);
1081 if (--rbi->bi_phys_segments == 0) {
1082 rbi->bi_next = return_bi;
1083 return_bi = rbi;
1084 }
1085 spin_unlock_irq(&conf->device_lock);
1086 rbi = rbi2;
1087 }
1088 }
1089
1090 /* now count some things */
1091 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1092 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1093
1094
1095 if (dev->toread) to_read++;
1096 if (dev->towrite) {
1097 to_write++;
1098 if (!test_bit(R5_OVERWRITE, &dev->flags))
1099 non_overwrite++;
1100 }
1101 if (dev->written) written++;
1102 rdev = rcu_dereference(conf->disks[i].rdev);
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1104 /* The ReadError flag will just be confusing now */
1105 clear_bit(R5_ReadError, &dev->flags);
1106 clear_bit(R5_ReWrite, &dev->flags);
1107 }
1108 if (!rdev || !test_bit(In_sync, &rdev->flags)
1109 || test_bit(R5_ReadError, &dev->flags)) {
1110 if ( failed < 2 )
1111 failed_num[failed] = i;
1112 failed++;
1113 } else
1114 set_bit(R5_Insync, &dev->flags);
1115 }
1116 rcu_read_unlock();
1117 PRINTK("locked=%d uptodate=%d to_read=%d"
1118 " to_write=%d failed=%d failed_num=%d,%d\n",
1119 locked, uptodate, to_read, to_write, failed,
1120 failed_num[0], failed_num[1]);
1121 /* check if the array has lost >2 devices and, if so, some requests might
1122 * need to be failed
1123 */
1124 if (failed > 2 && to_read+to_write+written) {
1125 for (i=disks; i--; ) {
1126 int bitmap_end = 0;
1127
1128 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1129 mdk_rdev_t *rdev;
1130 rcu_read_lock();
1131 rdev = rcu_dereference(conf->disks[i].rdev);
1132 if (rdev && test_bit(In_sync, &rdev->flags))
1133 /* multiple read failures in one stripe */
1134 md_error(conf->mddev, rdev);
1135 rcu_read_unlock();
1136 }
1137
1138 spin_lock_irq(&conf->device_lock);
1139 /* fail all writes first */
1140 bi = sh->dev[i].towrite;
1141 sh->dev[i].towrite = NULL;
1142 if (bi) { to_write--; bitmap_end = 1; }
1143
1144 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1145 wake_up(&conf->wait_for_overlap);
1146
1147 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1148 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1149 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1150 if (--bi->bi_phys_segments == 0) {
1151 md_write_end(conf->mddev);
1152 bi->bi_next = return_bi;
1153 return_bi = bi;
1154 }
1155 bi = nextbi;
1156 }
1157 /* and fail all 'written' */
1158 bi = sh->dev[i].written;
1159 sh->dev[i].written = NULL;
1160 if (bi) bitmap_end = 1;
1161 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1162 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1163 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1164 if (--bi->bi_phys_segments == 0) {
1165 md_write_end(conf->mddev);
1166 bi->bi_next = return_bi;
1167 return_bi = bi;
1168 }
1169 bi = bi2;
1170 }
1171
1172 /* fail any reads if this device is non-operational */
1173 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1174 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1175 bi = sh->dev[i].toread;
1176 sh->dev[i].toread = NULL;
1177 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1178 wake_up(&conf->wait_for_overlap);
1179 if (bi) to_read--;
1180 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1181 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1182 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1183 if (--bi->bi_phys_segments == 0) {
1184 bi->bi_next = return_bi;
1185 return_bi = bi;
1186 }
1187 bi = nextbi;
1188 }
1189 }
1190 spin_unlock_irq(&conf->device_lock);
1191 if (bitmap_end)
1192 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1193 STRIPE_SECTORS, 0, 0);
1194 }
1195 }
1196 if (failed > 2 && syncing) {
1197 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1198 clear_bit(STRIPE_SYNCING, &sh->state);
1199 syncing = 0;
1200 }
1201
1202 /*
1203 * might be able to return some write requests if the parity blocks
1204 * are safe, or on a failed drive
1205 */
1206 pdev = &sh->dev[pd_idx];
1207 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
1208 || (failed >= 2 && failed_num[1] == pd_idx);
1209 qdev = &sh->dev[qd_idx];
1210 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
1211 || (failed >= 2 && failed_num[1] == qd_idx);
1212
1213 if ( written &&
1214 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
1215 && !test_bit(R5_LOCKED, &pdev->flags)
1216 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
1217 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
1218 && !test_bit(R5_LOCKED, &qdev->flags)
1219 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
1220 /* any written block on an uptodate or failed drive can be
1221 * returned. Note that if we 'wrote' to a failed drive,
1222 * it will be UPTODATE, but never LOCKED, so we don't need
1223 * to test 'failed' directly.
1224 */
1225 for (i=disks; i--; )
1226 if (sh->dev[i].written) {
1227 dev = &sh->dev[i];
1228 if (!test_bit(R5_LOCKED, &dev->flags) &&
1229 test_bit(R5_UPTODATE, &dev->flags) ) {
1230 /* We can return any write requests */
1231 int bitmap_end = 0;
1232 struct bio *wbi, *wbi2;
1233 PRINTK("Return write for stripe %llu disc %d\n",
1234 (unsigned long long)sh->sector, i);
1235 spin_lock_irq(&conf->device_lock);
1236 wbi = dev->written;
1237 dev->written = NULL;
1238 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1239 wbi2 = r5_next_bio(wbi, dev->sector);
1240 if (--wbi->bi_phys_segments == 0) {
1241 md_write_end(conf->mddev);
1242 wbi->bi_next = return_bi;
1243 return_bi = wbi;
1244 }
1245 wbi = wbi2;
1246 }
1247 if (dev->towrite == NULL)
1248 bitmap_end = 1;
1249 spin_unlock_irq(&conf->device_lock);
1250 if (bitmap_end)
1251 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1252 STRIPE_SECTORS,
1253 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1254 }
1255 }
1256 }
1257
1258 /* Now we might consider reading some blocks, either to check/generate
1259 * parity, or to satisfy requests
1260 * or to load a block that is being partially written.
1261 */
1262 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
1263 for (i=disks; i--;) {
1264 dev = &sh->dev[i];
1265 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1266 (dev->toread ||
1267 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1268 syncing ||
1269 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
1270 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
1271 )
1272 ) {
1273 /* we would like to get this block, possibly
1274 * by computing it, but we might not be able to
1275 */
1276 if (uptodate == disks-1) {
1277 PRINTK("Computing stripe %llu block %d\n",
1278 (unsigned long long)sh->sector, i);
1279 compute_block_1(sh, i, 0);
1280 uptodate++;
1281 } else if ( uptodate == disks-2 && failed >= 2 ) {
1282 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1283 int other;
1284 for (other=disks; other--;) {
1285 if ( other == i )
1286 continue;
1287 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
1288 break;
1289 }
1290 BUG_ON(other < 0);
1291 PRINTK("Computing stripe %llu blocks %d,%d\n",
1292 (unsigned long long)sh->sector, i, other);
1293 compute_block_2(sh, i, other);
1294 uptodate += 2;
1295 } else if (test_bit(R5_Insync, &dev->flags)) {
1296 set_bit(R5_LOCKED, &dev->flags);
1297 set_bit(R5_Wantread, &dev->flags);
1298#if 0
1299 /* if I am just reading this block and we don't have
1300 a failed drive, or any pending writes then sidestep the cache */
1301 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1302 ! syncing && !failed && !to_write) {
1303 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1304 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1305 }
1306#endif
1307 locked++;
1308 PRINTK("Reading block %d (sync=%d)\n",
1309 i, syncing);
1310 }
1311 }
1312 }
1313 set_bit(STRIPE_HANDLE, &sh->state);
1314 }
1315
1316 /* now to consider writing and what else, if anything should be read */
1317 if (to_write) {
1318 int rcw=0, must_compute=0;
1319 for (i=disks ; i--;) {
1320 dev = &sh->dev[i];
1321 /* Would I have to read this buffer for reconstruct_write */
1322 if (!test_bit(R5_OVERWRITE, &dev->flags)
1323 && i != pd_idx && i != qd_idx
1324 && (!test_bit(R5_LOCKED, &dev->flags)
1325#if 0
1326 || sh->bh_page[i] != bh->b_page
1327#endif
1328 ) &&
1329 !test_bit(R5_UPTODATE, &dev->flags)) {
1330 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1331 else {
1332 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
1333 must_compute++;
1334 }
1335 }
1336 }
1337 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1338 (unsigned long long)sh->sector, rcw, must_compute);
1339 set_bit(STRIPE_HANDLE, &sh->state);
1340
1341 if (rcw > 0)
1342 /* want reconstruct write, but need to get some data */
1343 for (i=disks; i--;) {
1344 dev = &sh->dev[i];
1345 if (!test_bit(R5_OVERWRITE, &dev->flags)
1346 && !(failed == 0 && (i == pd_idx || i == qd_idx))
1347 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1348 test_bit(R5_Insync, &dev->flags)) {
1349 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1350 {
1351 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1352 (unsigned long long)sh->sector, i);
1353 set_bit(R5_LOCKED, &dev->flags);
1354 set_bit(R5_Wantread, &dev->flags);
1355 locked++;
1356 } else {
1357 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1358 (unsigned long long)sh->sector, i);
1359 set_bit(STRIPE_DELAYED, &sh->state);
1360 set_bit(STRIPE_HANDLE, &sh->state);
1361 }
1362 }
1363 }
1364 /* now if nothing is locked, and if we have enough data, we can start a write request */
1365 if (locked == 0 && rcw == 0 &&
1366 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1367 if ( must_compute > 0 ) {
1368 /* We have failed blocks and need to compute them */
1369 switch ( failed ) {
1370 case 0: BUG();
1371 case 1: compute_block_1(sh, failed_num[0], 0); break;
1372 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1373 default: BUG(); /* This request should have been failed? */
1374 }
1375 }
1376
1377 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
1378 compute_parity(sh, RECONSTRUCT_WRITE);
1379 /* now every locked buffer is ready to be written */
1380 for (i=disks; i--;)
1381 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1382 PRINTK("Writing stripe %llu block %d\n",
1383 (unsigned long long)sh->sector, i);
1384 locked++;
1385 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1386 }
1387 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1388 set_bit(STRIPE_INSYNC, &sh->state);
1389
1390 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1391 atomic_dec(&conf->preread_active_stripes);
1392 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1393 md_wakeup_thread(conf->mddev->thread);
1394 }
1395 }
1396 }
1397
1398 /* maybe we need to check and possibly fix the parity for this stripe
1399 * Any reads will already have been scheduled, so we just see if enough data
1400 * is available
1401 */
1402 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1403 int update_p = 0, update_q = 0;
1404 struct r5dev *dev;
1405
1406 set_bit(STRIPE_HANDLE, &sh->state);
1407
1408 BUG_ON(failed>2);
1409 BUG_ON(uptodate < disks);
1410 /* Want to check and possibly repair P and Q.
1411 * However there could be one 'failed' device, in which
1412 * case we can only check one of them, possibly using the
1413 * other to generate missing data
1414 */
1415
1416 /* If !tmp_page, we cannot do the calculations,
1417 * but as we have set STRIPE_HANDLE, we will soon be called
1418	 * by handle_stripe with a tmp_page - just wait until then.
1419 */
1420 if (tmp_page) {
1421 if (failed == q_failed) {
1422 /* The only possible failed device holds 'Q', so it makes
1423 * sense to check P (If anything else were failed, we would
1424 * have used P to recreate it).
1425 */
1426 compute_block_1(sh, pd_idx, 1);
1427 if (!page_is_zero(sh->dev[pd_idx].page)) {
1428 compute_block_1(sh,pd_idx,0);
1429 update_p = 1;
1430 }
1431 }
1432 if (!q_failed && failed < 2) {
1433 /* q is not failed, and we didn't use it to generate
1434 * anything, so it makes sense to check it
1435 */
1436 memcpy(page_address(tmp_page),
1437 page_address(sh->dev[qd_idx].page),
1438 STRIPE_SIZE);
1439 compute_parity(sh, UPDATE_PARITY);
1440 if (memcmp(page_address(tmp_page),
1441 page_address(sh->dev[qd_idx].page),
1442 STRIPE_SIZE)!= 0) {
1443 clear_bit(STRIPE_INSYNC, &sh->state);
1444 update_q = 1;
1445 }
1446 }
1447 if (update_p || update_q) {
1448 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1449 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1450 /* don't try to repair!! */
1451 update_p = update_q = 0;
1452 }
1453
1454 /* now write out any block on a failed drive,
1455 * or P or Q if they need it
1456 */
1457
1458 if (failed == 2) {
1459 dev = &sh->dev[failed_num[1]];
1460 locked++;
1461 set_bit(R5_LOCKED, &dev->flags);
1462 set_bit(R5_Wantwrite, &dev->flags);
1463 }
1464 if (failed >= 1) {
1465 dev = &sh->dev[failed_num[0]];
1466 locked++;
1467 set_bit(R5_LOCKED, &dev->flags);
1468 set_bit(R5_Wantwrite, &dev->flags);
1469 }
1470
1471 if (update_p) {
1472 dev = &sh->dev[pd_idx];
1473 locked ++;
1474 set_bit(R5_LOCKED, &dev->flags);
1475 set_bit(R5_Wantwrite, &dev->flags);
1476 }
1477 if (update_q) {
1478 dev = &sh->dev[qd_idx];
1479 locked++;
1480 set_bit(R5_LOCKED, &dev->flags);
1481 set_bit(R5_Wantwrite, &dev->flags);
1482 }
1483 clear_bit(STRIPE_DEGRADED, &sh->state);
1484
1485 set_bit(STRIPE_INSYNC, &sh->state);
1486 }
1487 }
1488
1489 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1490 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1491 clear_bit(STRIPE_SYNCING, &sh->state);
1492 }
1493
1494 /* If the failed drives are just a ReadError, then we might need
1495 * to progress the repair/check process
1496 */
1497 if (failed <= 2 && ! conf->mddev->ro)
1498 for (i=0; i<failed;i++) {
1499 dev = &sh->dev[failed_num[i]];
1500 if (test_bit(R5_ReadError, &dev->flags)
1501 && !test_bit(R5_LOCKED, &dev->flags)
1502 && test_bit(R5_UPTODATE, &dev->flags)
1503 ) {
1504 if (!test_bit(R5_ReWrite, &dev->flags)) {
1505 set_bit(R5_Wantwrite, &dev->flags);
1506 set_bit(R5_ReWrite, &dev->flags);
1507 set_bit(R5_LOCKED, &dev->flags);
1508 } else {
1509 /* let's read it back */
1510 set_bit(R5_Wantread, &dev->flags);
1511 set_bit(R5_LOCKED, &dev->flags);
1512 }
1513 }
1514 }
1515 spin_unlock(&sh->lock);
1516
1517 while ((bi=return_bi)) {
1518 int bytes = bi->bi_size;
1519
1520 return_bi = bi->bi_next;
1521 bi->bi_next = NULL;
1522 bi->bi_size = 0;
1523 bi->bi_end_io(bi, bytes, 0);
1524 }
1525 for (i=disks; i-- ;) {
1526 int rw;
1527 struct bio *bi;
1528 mdk_rdev_t *rdev;
1529 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1530 rw = 1;
1531 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1532 rw = 0;
1533 else
1534 continue;
1535
1536 bi = &sh->dev[i].req;
1537
1538 bi->bi_rw = rw;
1539 if (rw)
1540 bi->bi_end_io = raid6_end_write_request;
1541 else
1542 bi->bi_end_io = raid6_end_read_request;
1543
1544 rcu_read_lock();
1545 rdev = rcu_dereference(conf->disks[i].rdev);
1546 if (rdev && test_bit(Faulty, &rdev->flags))
1547 rdev = NULL;
1548 if (rdev)
1549 atomic_inc(&rdev->nr_pending);
1550 rcu_read_unlock();
1551
1552 if (rdev) {
1553 if (syncing)
1554 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1555
1556 bi->bi_bdev = rdev->bdev;
1557 PRINTK("for %llu schedule op %ld on disc %d\n",
1558 (unsigned long long)sh->sector, bi->bi_rw, i);
1559 atomic_inc(&sh->count);
1560 bi->bi_sector = sh->sector + rdev->data_offset;
1561 bi->bi_flags = 1 << BIO_UPTODATE;
1562 bi->bi_vcnt = 1;
1563 bi->bi_max_vecs = 1;
1564 bi->bi_idx = 0;
1565 bi->bi_io_vec = &sh->dev[i].vec;
1566 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1567 bi->bi_io_vec[0].bv_offset = 0;
1568 bi->bi_size = STRIPE_SIZE;
1569 bi->bi_next = NULL;
1570 if (rw == WRITE &&
1571 test_bit(R5_ReWrite, &sh->dev[i].flags))
1572 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1573 generic_make_request(bi);
1574 } else {
1575 if (rw == 1)
1576 set_bit(STRIPE_DEGRADED, &sh->state);
1577 PRINTK("skip op %ld on disc %d for sector %llu\n",
1578 bi->bi_rw, i, (unsigned long long)sh->sector);
1579 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1580 set_bit(STRIPE_HANDLE, &sh->state);
1581 }
1582 }
1583}
1584
1585static void raid6_activate_delayed(raid6_conf_t *conf)
1586{
1587 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1588 while (!list_empty(&conf->delayed_list)) {
1589 struct list_head *l = conf->delayed_list.next;
1590 struct stripe_head *sh;
1591 sh = list_entry(l, struct stripe_head, lru);
1592 list_del_init(l);
1593 clear_bit(STRIPE_DELAYED, &sh->state);
1594 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1595 atomic_inc(&conf->preread_active_stripes);
1596 list_add_tail(&sh->lru, &conf->handle_list);
1597 }
1598 }
1599}
1600
1601static void activate_bit_delay(raid6_conf_t *conf)
1602{
1603 /* device_lock is held */
1604 struct list_head head;
1605 list_add(&head, &conf->bitmap_list);
1606 list_del_init(&conf->bitmap_list);
1607 while (!list_empty(&head)) {
1608 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1609 list_del_init(&sh->lru);
1610 atomic_inc(&sh->count);
1611 __release_stripe(conf, sh);
1612 }
1613}
1614
1615static void unplug_slaves(mddev_t *mddev)
1616{
1617 raid6_conf_t *conf = mddev_to_conf(mddev);
1618 int i;
1619
1620 rcu_read_lock();
1621 for (i=0; i<mddev->raid_disks; i++) {
1622 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1623 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1624 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1625
1626 atomic_inc(&rdev->nr_pending);
1627 rcu_read_unlock();
1628
1629 if (r_queue->unplug_fn)
1630 r_queue->unplug_fn(r_queue);
1631
1632 rdev_dec_pending(rdev, mddev);
1633 rcu_read_lock();
1634 }
1635 }
1636 rcu_read_unlock();
1637}
1638
1639static void raid6_unplug_device(request_queue_t *q)
1640{
1641 mddev_t *mddev = q->queuedata;
1642 raid6_conf_t *conf = mddev_to_conf(mddev);
1643 unsigned long flags;
1644
1645 spin_lock_irqsave(&conf->device_lock, flags);
1646
1647 if (blk_remove_plug(q)) {
1648 conf->seq_flush++;
1649 raid6_activate_delayed(conf);
1650 }
1651 md_wakeup_thread(mddev->thread);
1652
1653 spin_unlock_irqrestore(&conf->device_lock, flags);
1654
1655 unplug_slaves(mddev);
1656}
1657
1658static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1659 sector_t *error_sector)
1660{
1661 mddev_t *mddev = q->queuedata;
1662 raid6_conf_t *conf = mddev_to_conf(mddev);
1663 int i, ret = 0;
1664
1665 rcu_read_lock();
1666 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1667 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1668 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1669 struct block_device *bdev = rdev->bdev;
1670 request_queue_t *r_queue = bdev_get_queue(bdev);
1671
1672 if (!r_queue->issue_flush_fn)
1673 ret = -EOPNOTSUPP;
1674 else {
1675 atomic_inc(&rdev->nr_pending);
1676 rcu_read_unlock();
1677 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1678 error_sector);
1679 rdev_dec_pending(rdev, mddev);
1680 rcu_read_lock();
1681 }
1682 }
1683 }
1684 rcu_read_unlock();
1685 return ret;
1686}
1687
1688static inline void raid6_plug_device(raid6_conf_t *conf)
1689{
1690 spin_lock_irq(&conf->device_lock);
1691 blk_plug_device(conf->mddev->queue);
1692 spin_unlock_irq(&conf->device_lock);
1693}
1694
1695static int make_request (request_queue_t *q, struct bio * bi)
1696{
1697 mddev_t *mddev = q->queuedata;
1698 raid6_conf_t *conf = mddev_to_conf(mddev);
1699 const unsigned int raid_disks = conf->raid_disks;
1700 const unsigned int data_disks = raid_disks - 2;
1701 unsigned int dd_idx, pd_idx;
1702 sector_t new_sector;
1703 sector_t logical_sector, last_sector;
1704 struct stripe_head *sh;
1705 const int rw = bio_data_dir(bi);
1706
1707 if (unlikely(bio_barrier(bi))) {
1708 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1709 return 0;
1710 }
1711
1712 md_write_start(mddev, bi);
1713
1714 disk_stat_inc(mddev->gendisk, ios[rw]);
1715 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
1716
1717 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1718 last_sector = bi->bi_sector + (bi->bi_size>>9);
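	/* e.g. (illustrative values) a 16 KiB write at sector 1037 with
	 * STRIPE_SECTORS == 8 gives logical_sector = 1032 and
	 * last_sector = 1069, so the loop below visits the stripes
	 * starting at sectors 1032, 1040, 1048, 1056 and 1064.
	 */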
1719
1720 bi->bi_next = NULL;
1721 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1722
1723 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1724 DEFINE_WAIT(w);
1725
1726 new_sector = raid6_compute_sector(logical_sector,
1727 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1728
1729 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1730 (unsigned long long)new_sector,
1731 (unsigned long long)logical_sector);
1732
1733 retry:
1734 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1735 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1736 if (sh) {
1737 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1738 /* Add failed due to overlap. Flush everything
1739 * and wait a while
1740 */
1741 raid6_unplug_device(mddev->queue);
1742 release_stripe(sh);
1743 schedule();
1744 goto retry;
1745 }
1746 finish_wait(&conf->wait_for_overlap, &w);
1747 raid6_plug_device(conf);
1748 handle_stripe(sh, NULL);
1749 release_stripe(sh);
1750 } else {
1751			/* cannot get stripe for read-ahead, just give up */
1752 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1753 finish_wait(&conf->wait_for_overlap, &w);
1754 break;
1755 }
1756
1757 }
1758 spin_lock_irq(&conf->device_lock);
1759 if (--bi->bi_phys_segments == 0) {
1760 int bytes = bi->bi_size;
1761
1762 if (rw == WRITE )
1763 md_write_end(mddev);
1764 bi->bi_size = 0;
1765 bi->bi_end_io(bi, bytes, 0);
1766 }
1767 spin_unlock_irq(&conf->device_lock);
1768 return 0;
1769}
1770
1771/* FIXME go_faster isn't used */
1772static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1773{
1774 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1775 struct stripe_head *sh;
1776 int sectors_per_chunk = conf->chunk_size >> 9;
1777 sector_t x;
1778 unsigned long stripe;
1779 int chunk_offset;
1780 int dd_idx, pd_idx;
1781 sector_t first_sector;
1782 int raid_disks = conf->raid_disks;
1783 int data_disks = raid_disks - 2;
1784 sector_t max_sector = mddev->size << 1;
1785 int sync_blocks;
1786 int still_degraded = 0;
1787 int i;
1788
1789 if (sector_nr >= max_sector) {
1790 /* just being told to finish up .. nothing much to do */
1791 unplug_slaves(mddev);
1792
1793 if (mddev->curr_resync < max_sector) /* aborted */
1794 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1795 &sync_blocks, 1);
1796 else /* completed sync */
1797 conf->fullsync = 0;
1798 bitmap_close_sync(mddev->bitmap);
1799
1800 return 0;
1801 }
1802 /* if there are 2 or more failed drives and we are trying
1803 * to resync, then assert that we are finished, because there is
1804 * nothing we can do.
1805 */
1806 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1807 sector_t rv = (mddev->size << 1) - sector_nr;
1808 *skipped = 1;
1809 return rv;
1810 }
1811 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1812 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1813 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1814 /* we can skip this block, and probably more */
1815 sync_blocks /= STRIPE_SECTORS;
1816 *skipped = 1;
1817 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1818 }
1819
1820 x = sector_nr;
1821 chunk_offset = sector_div(x, sectors_per_chunk);
1822 stripe = x;
1823 BUG_ON(x != stripe);
1824
1825 first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1826 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
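	/* e.g. (illustrative values) with a 64 KiB chunk
	 * (sectors_per_chunk = 128), 6 devices (data_disks = 4) and
	 * sector_nr = 1000: stripe = 7 and chunk_offset = 104, so the
	 * sector handed to raid6_compute_sector() above is
	 * 7 * 4 * 128 + 104 = 3688.
	 */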
1827 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1828 if (sh == NULL) {
1829 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1830 /* make sure we don't swamp the stripe cache if someone else
1831 * is trying to get access
1832 */
1833 schedule_timeout_uninterruptible(1);
1834 }
1835 /* Need to check if array will still be degraded after recovery/resync
1836 * We don't need to check the 'failed' flag as when that gets set,
1837 * recovery aborts.
1838 */
1839 for (i=0; i<mddev->raid_disks; i++)
1840 if (conf->disks[i].rdev == NULL)
1841 still_degraded = 1;
1842
1843 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
1844
1845 spin_lock(&sh->lock);
1846 set_bit(STRIPE_SYNCING, &sh->state);
1847 clear_bit(STRIPE_INSYNC, &sh->state);
1848 spin_unlock(&sh->lock);
1849
1850 handle_stripe(sh, NULL);
1851 release_stripe(sh);
1852
1853 return STRIPE_SECTORS;
1854}
1855
1856/*
1857 * This is our raid6 kernel thread.
1858 *
1859 * We scan the hash table for stripes which can be handled now.
1860 * During the scan, completed stripes are saved for us by the interrupt
1861 * handler, so that they will not have to wait for our next wakeup.
1862 */
1863static void raid6d (mddev_t *mddev)
1864{
1865 struct stripe_head *sh;
1866 raid6_conf_t *conf = mddev_to_conf(mddev);
1867 int handled;
1868
1869 PRINTK("+++ raid6d active\n");
1870
1871 md_check_recovery(mddev);
1872
1873 handled = 0;
1874 spin_lock_irq(&conf->device_lock);
1875 while (1) {
1876 struct list_head *first;
1877
1878 if (conf->seq_flush - conf->seq_write > 0) {
1879 int seq = conf->seq_flush;
1880 spin_unlock_irq(&conf->device_lock);
1881 bitmap_unplug(mddev->bitmap);
1882 spin_lock_irq(&conf->device_lock);
1883 conf->seq_write = seq;
1884 activate_bit_delay(conf);
1885 }
1886
1887 if (list_empty(&conf->handle_list) &&
1888 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1889 !blk_queue_plugged(mddev->queue) &&
1890 !list_empty(&conf->delayed_list))
1891 raid6_activate_delayed(conf);
1892
1893 if (list_empty(&conf->handle_list))
1894 break;
1895
1896 first = conf->handle_list.next;
1897 sh = list_entry(first, struct stripe_head, lru);
1898
1899 list_del_init(first);
1900 atomic_inc(&sh->count);
1901 BUG_ON(atomic_read(&sh->count)!= 1);
1902 spin_unlock_irq(&conf->device_lock);
1903
1904 handled++;
1905 handle_stripe(sh, conf->spare_page);
1906 release_stripe(sh);
1907
1908 spin_lock_irq(&conf->device_lock);
1909 }
1910 PRINTK("%d stripes handled\n", handled);
1911
1912 spin_unlock_irq(&conf->device_lock);
1913
1914 unplug_slaves(mddev);
1915
1916 PRINTK("--- raid6d inactive\n");
1917}
1918
1919static ssize_t
1920raid6_show_stripe_cache_size(mddev_t *mddev, char *page)
1921{
1922 raid6_conf_t *conf = mddev_to_conf(mddev);
1923 if (conf)
1924 return sprintf(page, "%d\n", conf->max_nr_stripes);
1925 else
1926 return 0;
1927}
1928
1929static ssize_t
1930raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
1931{
1932 raid6_conf_t *conf = mddev_to_conf(mddev);
1933 char *end;
1934 int new;
1935 if (len >= PAGE_SIZE)
1936 return -EINVAL;
1937 if (!conf)
1938 return -ENODEV;
1939
1940 new = simple_strtoul(page, &end, 10);
1941 if (!*page || (*end && *end != '\n') )
1942 return -EINVAL;
1943 if (new <= 16 || new > 32768)
1944 return -EINVAL;
1945 while (new < conf->max_nr_stripes) {
1946 if (drop_one_stripe(conf))
1947 conf->max_nr_stripes--;
1948 else
1949 break;
1950 }
1951 while (new > conf->max_nr_stripes) {
1952 if (grow_one_stripe(conf))
1953 conf->max_nr_stripes++;
1954 else break;
1955 }
1956 return len;
1957}
1958
1959static struct md_sysfs_entry
1960raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
1961 raid6_show_stripe_cache_size,
1962 raid6_store_stripe_cache_size);
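/*
 * The attribute is created in the array's md sysfs directory, so e.g.
 * (illustrative)  echo 512 > /sys/block/md0/md/stripe_cache_size
 * grows or shrinks the cache one stripe_head at a time via the store
 * routine above.
 */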
1963
1964static ssize_t
1965stripe_cache_active_show(mddev_t *mddev, char *page)
1966{
1967 raid6_conf_t *conf = mddev_to_conf(mddev);
1968 if (conf)
1969 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
1970 else
1971 return 0;
1972}
1973
1974static struct md_sysfs_entry
1975raid6_stripecache_active = __ATTR_RO(stripe_cache_active);
1976
1977static struct attribute *raid6_attrs[] = {
1978 &raid6_stripecache_size.attr,
1979 &raid6_stripecache_active.attr,
1980 NULL,
1981};
1982static struct attribute_group raid6_attrs_group = {
1983 .name = NULL,
1984 .attrs = raid6_attrs,
1985};
1986
1987static int run(mddev_t *mddev)
1988{
1989 raid6_conf_t *conf;
1990 int raid_disk, memory;
1991 mdk_rdev_t *rdev;
1992 struct disk_info *disk;
1993 struct list_head *tmp;
1994
1995 if (mddev->level != 6) {
1996 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
1997 return -EIO;
1998 }
1999
2000 mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
2001 if ((conf = mddev->private) == NULL)
2002 goto abort;
2003 conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
2004 GFP_KERNEL);
2005 if (!conf->disks)
2006 goto abort;
2007
2008 conf->mddev = mddev;
2009
2010 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2011 goto abort;
2012
2013 conf->spare_page = alloc_page(GFP_KERNEL);
2014 if (!conf->spare_page)
2015 goto abort;
2016
2017 spin_lock_init(&conf->device_lock);
2018 init_waitqueue_head(&conf->wait_for_stripe);
2019 init_waitqueue_head(&conf->wait_for_overlap);
2020 INIT_LIST_HEAD(&conf->handle_list);
2021 INIT_LIST_HEAD(&conf->delayed_list);
2022 INIT_LIST_HEAD(&conf->bitmap_list);
2023 INIT_LIST_HEAD(&conf->inactive_list);
2024 atomic_set(&conf->active_stripes, 0);
2025 atomic_set(&conf->preread_active_stripes, 0);
2026
2027 PRINTK("raid6: run(%s) called.\n", mdname(mddev));
2028
2029 ITERATE_RDEV(mddev,rdev,tmp) {
2030 raid_disk = rdev->raid_disk;
2031 if (raid_disk >= mddev->raid_disks
2032 || raid_disk < 0)
2033 continue;
2034 disk = conf->disks + raid_disk;
2035
2036 disk->rdev = rdev;
2037
2038 if (test_bit(In_sync, &rdev->flags)) {
2039 char b[BDEVNAME_SIZE];
2040 printk(KERN_INFO "raid6: device %s operational as raid"
2041 " disk %d\n", bdevname(rdev->bdev,b),
2042 raid_disk);
2043 conf->working_disks++;
2044 }
2045 }
2046
2047 conf->raid_disks = mddev->raid_disks;
2048
2049 /*
2050 * 0 for a fully functional array, 1 or 2 for a degraded array.
2051 */
2052 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2053 conf->mddev = mddev;
2054 conf->chunk_size = mddev->chunk_size;
2055 conf->level = mddev->level;
2056 conf->algorithm = mddev->layout;
2057 conf->max_nr_stripes = NR_STRIPES;
2058
2059 /* device size must be a multiple of chunk size */
2060 mddev->size &= ~(mddev->chunk_size/1024 -1);
2061 mddev->resync_max_sectors = mddev->size << 1;
2062
2063 if (conf->raid_disks < 4) {
2064 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
2065 mdname(mddev), conf->raid_disks);
2066 goto abort;
2067 }
2068 if (!conf->chunk_size || conf->chunk_size % 4) {
2069 printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
2070 conf->chunk_size, mdname(mddev));
2071 goto abort;
2072 }
2073 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
2074 printk(KERN_ERR
2075 "raid6: unsupported parity algorithm %d for %s\n",
2076 conf->algorithm, mdname(mddev));
2077 goto abort;
2078 }
2079 if (mddev->degraded > 2) {
2080 printk(KERN_ERR "raid6: not enough operational devices for %s"
2081 " (%d/%d failed)\n",
2082 mdname(mddev), conf->failed_disks, conf->raid_disks);
2083 goto abort;
2084 }
2085
2086 if (mddev->degraded > 0 &&
2087 mddev->recovery_cp != MaxSector) {
2088 if (mddev->ok_start_degraded)
2089			printk(KERN_WARNING "raid6: starting dirty degraded array: %s"
2090			       " - data corruption possible.\n",
2091 mdname(mddev));
2092 else {
2093 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2094 " for %s\n", mdname(mddev));
2095 goto abort;
2096 }
2097 }
2098
2099 {
2100 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
2101 if (!mddev->thread) {
2102 printk(KERN_ERR
2103 "raid6: couldn't allocate thread for %s\n",
2104 mdname(mddev));
2105 goto abort;
2106 }
2107 }
2108
2109 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
2110 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
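	/* e.g. (illustrative values) with 256 cached stripes and 6 devices
	 * on a 4 KiB-page machine, the dominant term is
	 * 256 * 6 * PAGE_SIZE = 6144 kB (6 MiB) of cache pages, plus the
	 * stripe_head and bio overhead.
	 */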
2111 if (grow_stripes(conf, conf->max_nr_stripes)) {
2112 printk(KERN_ERR
2113 "raid6: couldn't allocate %dkB for buffers\n", memory);
2114 shrink_stripes(conf);
2115 md_unregister_thread(mddev->thread);
2116 goto abort;
2117 } else
2118 printk(KERN_INFO "raid6: allocated %dkB for %s\n",
2119 memory, mdname(mddev));
2120
2121 if (mddev->degraded == 0)
2122 printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
2123 " devices, algorithm %d\n", conf->level, mdname(mddev),
2124 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
2125 conf->algorithm);
2126 else
2127 printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
2128 " out of %d devices, algorithm %d\n", conf->level,
2129 mdname(mddev), mddev->raid_disks - mddev->degraded,
2130 mddev->raid_disks, conf->algorithm);
2131
2132 print_raid6_conf(conf);
2133
2134 /* read-ahead size must cover two whole stripes, which is
2135 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
2136 */
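	/* e.g. (illustrative values) with 6 devices and a 64 KiB chunk,
	 * stripe = 4 * 65536 / 4096 = 64 pages, so ra_pages is raised
	 * to at least 128 pages (512 KiB).
	 */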
2137 {
2138 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
2139 / PAGE_SIZE;
2140 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2141 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2142 }
2143
2144 /* Ok, everything is just fine now */
2145 sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
2146
2147 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
2148
2149 mddev->queue->unplug_fn = raid6_unplug_device;
2150 mddev->queue->issue_flush_fn = raid6_issue_flush;
2151 return 0;
2152abort:
2153 if (conf) {
2154 print_raid6_conf(conf);
2155 safe_put_page(conf->spare_page);
2156 kfree(conf->stripe_hashtbl);
2157 kfree(conf->disks);
2158 kfree(conf);
2159 }
2160 mddev->private = NULL;
2161 printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
2162 return -EIO;
2163}
2164
2165
2166
2167static int stop (mddev_t *mddev)
2168{
2169 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2170
2171 md_unregister_thread(mddev->thread);
2172 mddev->thread = NULL;
2173 shrink_stripes(conf);
2174 kfree(conf->stripe_hashtbl);
2175 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2176 sysfs_remove_group(&mddev->kobj, &raid6_attrs_group);
2177 kfree(conf);
2178 mddev->private = NULL;
2179 return 0;
2180}
2181
2182#if RAID6_DUMPSTATE
2183static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2184{
2185 int i;
2186
2187 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2188 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2189 seq_printf(seq, "sh %llu, count %d.\n",
2190 (unsigned long long)sh->sector, atomic_read(&sh->count));
2191 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2192 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
2193 seq_printf(seq, "(cache%d: %p %ld) ",
2194 i, sh->dev[i].page, sh->dev[i].flags);
2195 }
2196 seq_printf(seq, "\n");
2197}
2198
2199static void printall (struct seq_file *seq, raid6_conf_t *conf)
2200{
2201 struct stripe_head *sh;
2202 struct hlist_node *hn;
2203 int i;
2204
2205 spin_lock_irq(&conf->device_lock);
2206 for (i = 0; i < NR_HASH; i++) {
2207 sh = conf->stripe_hashtbl[i];
2208 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2209 if (sh->raid_conf != conf)
2210 continue;
2211 print_sh(seq, sh);
2212 }
2213 }
2214 spin_unlock_irq(&conf->device_lock);
2215}
2216#endif
2217
2218static void status (struct seq_file *seq, mddev_t *mddev)
2219{
2220 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2221 int i;
2222
2223 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
2224 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
2225 for (i = 0; i < conf->raid_disks; i++)
2226 seq_printf (seq, "%s",
2227 conf->disks[i].rdev &&
2228 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2229 seq_printf (seq, "]");
2230#if RAID6_DUMPSTATE
2231 seq_printf (seq, "\n");
2232 printall(seq, conf);
2233#endif
2234}
2235
2236static void print_raid6_conf (raid6_conf_t *conf)
2237{
2238 int i;
2239 struct disk_info *tmp;
2240
2241 printk("RAID6 conf printout:\n");
2242 if (!conf) {
2243 printk("(conf==NULL)\n");
2244 return;
2245 }
2246 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
2247 conf->working_disks, conf->failed_disks);
2248
2249 for (i = 0; i < conf->raid_disks; i++) {
2250 char b[BDEVNAME_SIZE];
2251 tmp = conf->disks + i;
2252 if (tmp->rdev)
2253 printk(" disk %d, o:%d, dev:%s\n",
2254 i, !test_bit(Faulty, &tmp->rdev->flags),
2255 bdevname(tmp->rdev->bdev,b));
2256 }
2257}
2258
2259static int raid6_spare_active(mddev_t *mddev)
2260{
2261 int i;
2262 raid6_conf_t *conf = mddev->private;
2263 struct disk_info *tmp;
2264
2265 for (i = 0; i < conf->raid_disks; i++) {
2266 tmp = conf->disks + i;
2267 if (tmp->rdev
2268 && !test_bit(Faulty, &tmp->rdev->flags)
2269 && !test_bit(In_sync, &tmp->rdev->flags)) {
2270 mddev->degraded--;
2271 conf->failed_disks--;
2272 conf->working_disks++;
2273 set_bit(In_sync, &tmp->rdev->flags);
2274 }
2275 }
2276 print_raid6_conf(conf);
2277 return 0;
2278}
2279
2280static int raid6_remove_disk(mddev_t *mddev, int number)
2281{
2282 raid6_conf_t *conf = mddev->private;
2283 int err = 0;
2284 mdk_rdev_t *rdev;
2285 struct disk_info *p = conf->disks + number;
2286
2287 print_raid6_conf(conf);
2288 rdev = p->rdev;
2289 if (rdev) {
2290 if (test_bit(In_sync, &rdev->flags) ||
2291 atomic_read(&rdev->nr_pending)) {
2292 err = -EBUSY;
2293 goto abort;
2294 }
2295 p->rdev = NULL;
2296 synchronize_rcu();
2297 if (atomic_read(&rdev->nr_pending)) {
2298 /* lost the race, try later */
2299 err = -EBUSY;
2300 p->rdev = rdev;
2301 }
2302 }
2303
2304abort:
2305
2306 print_raid6_conf(conf);
2307 return err;
2308}
2309
2310static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2311{
2312 raid6_conf_t *conf = mddev->private;
2313 int found = 0;
2314 int disk;
2315 struct disk_info *p;
2316
2317 if (mddev->degraded > 2)
2318 /* no point adding a device */
2319 return 0;
2320 /*
2321 * find the disk ... but prefer rdev->saved_raid_disk
2322 * if possible.
2323 */
2324 if (rdev->saved_raid_disk >= 0 &&
2325 conf->disks[rdev->saved_raid_disk].rdev == NULL)
2326 disk = rdev->saved_raid_disk;
2327 else
2328 disk = 0;
2329 for ( ; disk < mddev->raid_disks; disk++)
2330 if ((p=conf->disks + disk)->rdev == NULL) {
2331 clear_bit(In_sync, &rdev->flags);
2332 rdev->raid_disk = disk;
2333 found = 1;
2334 if (rdev->saved_raid_disk != disk)
2335 conf->fullsync = 1;
2336 rcu_assign_pointer(p->rdev, rdev);
2337 break;
2338 }
2339 print_raid6_conf(conf);
2340 return found;
2341}
2342
2343static int raid6_resize(mddev_t *mddev, sector_t sectors)
2344{
2345 /* no resync is happening, and there is enough space
2346 * on all devices, so we can resize.
2347 * We need to make sure resync covers any new space.
2348 * If the array is shrinking we should possibly wait until
2349 * any io in the removed space completes, but it hardly seems
2350 * worth it.
2351 */
2352 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2353 mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
2354 set_capacity(mddev->gendisk, mddev->array_size << 1);
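	/* e.g. (illustrative values) six 1 TiB members (2147483648 sectors
	 * each, already chunk-aligned) give
	 * array_size = 2147483648 * 4 / 2 = 4294967296 KB (4 TiB), and
	 * set_capacity() sets the gendisk to twice that in 512-byte sectors.
	 */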
2355 mddev->changed = 1;
2356 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
2357 mddev->recovery_cp = mddev->size << 1;
2358 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2359 }
2360 mddev->size = sectors /2;
2361 mddev->resync_max_sectors = sectors;
2362 return 0;
2363}
2364
2365static void raid6_quiesce(mddev_t *mddev, int state)
2366{
2367 raid6_conf_t *conf = mddev_to_conf(mddev);
2368
2369 switch(state) {
2370 case 1: /* stop all writes */
2371 spin_lock_irq(&conf->device_lock);
2372 conf->quiesce = 1;
2373 wait_event_lock_irq(conf->wait_for_stripe,
2374 atomic_read(&conf->active_stripes) == 0,
2375 conf->device_lock, /* nothing */);
2376 spin_unlock_irq(&conf->device_lock);
2377 break;
2378
2379 case 0: /* re-enable writes */
2380 spin_lock_irq(&conf->device_lock);
2381 conf->quiesce = 0;
2382 wake_up(&conf->wait_for_stripe);
2383 spin_unlock_irq(&conf->device_lock);
2384 break;
2385 }
2386}
2387
2388static struct mdk_personality raid6_personality =
2389{
2390 .name = "raid6",
2391 .level = 6,
2392 .owner = THIS_MODULE,
2393 .make_request = make_request,
2394 .run = run,
2395 .stop = stop,
2396 .status = status,
2397 .error_handler = error,
2398 .hot_add_disk = raid6_add_disk,
2399 .hot_remove_disk= raid6_remove_disk,
2400 .spare_active = raid6_spare_active,
2401 .sync_request = sync_request,
2402 .resize = raid6_resize,
2403 .quiesce = raid6_quiesce,
2404};
2405
2406static int __init raid6_init(void)
2407{
2408 int e;
2409
2410 e = raid6_select_algo();
2411 if ( e )
2412 return e;
2413
2414 return register_md_personality(&raid6_personality);
2415}
2416
2417static void raid6_exit (void)
2418{
2419 unregister_md_personality(&raid6_personality);
2420}
2421
2422module_init(raid6_init);
2423module_exit(raid6_exit);
2424MODULE_LICENSE("GPL");
2425MODULE_ALIAS("md-personality-8"); /* RAID6 */
2426MODULE_ALIAS("md-raid6");
2427MODULE_ALIAS("md-level-6");