Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                         13
-rw-r--r--  drivers/md/Makefile                         1
-rw-r--r--  drivers/md/bitmap.c                       499
-rw-r--r--  drivers/md/bitmap.h                        21
-rw-r--r--  drivers/md/dm-crypt.c                     212
-rw-r--r--  drivers/md/dm-delay.c                       8
-rw-r--r--  drivers/md/dm-exception-store.c            67
-rw-r--r--  drivers/md/dm-exception-store.h            70
-rw-r--r--  drivers/md/dm-io.c                        120
-rw-r--r--  drivers/md/dm-ioctl.c                     145
-rw-r--r--  drivers/md/dm-kcopyd.c                      5
-rw-r--r--  drivers/md/dm-linear.c                      3
-rw-r--r--  drivers/md/dm-log-userspace-base.c          3
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c     17
-rw-r--r--  drivers/md/dm-log.c                        80
-rw-r--r--  drivers/md/dm-mpath.c                     178
-rw-r--r--  drivers/md/dm-raid1.c                     232
-rw-r--r--  drivers/md/dm-region-hash.c                37
-rw-r--r--  drivers/md/dm-service-time.c                2
-rw-r--r--  drivers/md/dm-snap-persistent.c           213
-rw-r--r--  drivers/md/dm-snap-transient.c             24
-rw-r--r--  drivers/md/dm-snap.c                     1286
-rw-r--r--  drivers/md/dm-stripe.c                      5
-rw-r--r--  drivers/md/dm-sysfs.c                       4
-rw-r--r--  drivers/md/dm-table.c                      41
-rw-r--r--  drivers/md/dm-target.c                      1
-rw-r--r--  drivers/md/dm-uevent.c                     16
-rw-r--r--  drivers/md/dm.c                           684
-rw-r--r--  drivers/md/dm.h                            17
-rw-r--r--  drivers/md/faulty.c                        11
-rw-r--r--  drivers/md/linear.c                        52
-rw-r--r--  drivers/md/md.c                          1081
-rw-r--r--  drivers/md/md.h                            70
-rw-r--r--  drivers/md/multipath.c                     37
-rw-r--r--  drivers/md/raid0.c                        271
-rw-r--r--  drivers/md/raid1.c                        355
-rw-r--r--  drivers/md/raid1.h                          5
-rw-r--r--  drivers/md/raid10.c                       442
-rw-r--r--  drivers/md/raid10.h                         7
-rw-r--r--  drivers/md/raid5.c                        744
-rw-r--r--  drivers/md/raid5.h                         16
41 files changed, 4970 insertions(+), 2125 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index e27ae4604cef..bf1a95e31559 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -100,8 +100,8 @@ config MD_RAID1
100 If unsure, say Y. 100 If unsure, say Y.
101 101
102config MD_RAID10 102config MD_RAID10
103 tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" 103 tristate "RAID-10 (mirrored striping) mode"
104 depends on BLK_DEV_MD && EXPERIMENTAL 104 depends on BLK_DEV_MD
105 ---help--- 105 ---help---
106 RAID-10 provides a combination of striping (RAID-0) and 106 RAID-10 provides a combination of striping (RAID-0) and
107 mirroring (RAID-1) with easier configuration and more flexible 107 mirroring (RAID-1) with easier configuration and more flexible
@@ -169,11 +169,10 @@ config MD_MULTIPATH
169 tristate "Multipath I/O support" 169 tristate "Multipath I/O support"
170 depends on BLK_DEV_MD 170 depends on BLK_DEV_MD
171 help 171 help
172 Multipath-IO is the ability of certain devices to address the same 172 MD_MULTIPATH provides a simple multi-path personality for use
173 physical disk over multiple 'IO paths'. The code ensures that such 173 the MD framework. It is not under active development. New
174 paths can be defined and handled at runtime, and ensures that a 174 projects should consider using DM_MULTIPATH which has more
175 transparent failover to the backup path(s) happens if a IO errors 175 features and more testing.
176 arrives on the primary path.
177 176
178 If unsure, say N. 177 If unsure, say N.
179 178
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index c9b3a7843d83..5e3aac41919d 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -40,4 +40,3 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
40ifeq ($(CONFIG_DM_UEVENT),y) 40ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 41dm-mod-objs += dm-uevent.o
42endif 42endif
43
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6986b0059d23..1742435ce3ae 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
212 */ 212 */
213 213
214/* IO operations when bitmap is stored near all superblocks */ 214/* IO operations when bitmap is stored near all superblocks */
215static struct page *read_sb_page(mddev_t *mddev, long offset, 215static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
216 struct page *page, 216 struct page *page,
217 unsigned long index, int size) 217 unsigned long index, int size)
218{ 218{
@@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
287 287
288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
289 int size = PAGE_SIZE; 289 int size = PAGE_SIZE;
290 loff_t offset = mddev->bitmap_info.offset;
290 if (page->index == bitmap->file_pages-1) 291 if (page->index == bitmap->file_pages-1)
291 size = roundup(bitmap->last_page_size, 292 size = roundup(bitmap->last_page_size,
292 bdev_logical_block_size(rdev->bdev)); 293 bdev_logical_block_size(rdev->bdev));
293 /* Just make sure we aren't corrupting data or 294 /* Just make sure we aren't corrupting data or
294 * metadata 295 * metadata
295 */ 296 */
296 if (bitmap->offset < 0) { 297 if (mddev->external) {
298 /* Bitmap could be anywhere. */
299 if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) >
300 rdev->data_offset &&
301 rdev->sb_start + offset <
302 rdev->data_offset + mddev->dev_sectors +
303 (PAGE_SIZE/512))
304 goto bad_alignment;
305 } else if (offset < 0) {
297 /* DATA BITMAP METADATA */ 306 /* DATA BITMAP METADATA */
298 if (bitmap->offset 307 if (offset
299 + (long)(page->index * (PAGE_SIZE/512)) 308 + (long)(page->index * (PAGE_SIZE/512))
300 + size/512 > 0) 309 + size/512 > 0)
301 /* bitmap runs in to metadata */ 310 /* bitmap runs in to metadata */
302 goto bad_alignment; 311 goto bad_alignment;
303 if (rdev->data_offset + mddev->dev_sectors 312 if (rdev->data_offset + mddev->dev_sectors
304 > rdev->sb_start + bitmap->offset) 313 > rdev->sb_start + offset)
305 /* data runs in to bitmap */ 314 /* data runs in to bitmap */
306 goto bad_alignment; 315 goto bad_alignment;
307 } else if (rdev->sb_start < rdev->data_offset) { 316 } else if (rdev->sb_start < rdev->data_offset) {
308 /* METADATA BITMAP DATA */ 317 /* METADATA BITMAP DATA */
309 if (rdev->sb_start 318 if (rdev->sb_start
310 + bitmap->offset 319 + offset
311 + page->index*(PAGE_SIZE/512) + size/512 320 + page->index*(PAGE_SIZE/512) + size/512
312 > rdev->data_offset) 321 > rdev->data_offset)
313 /* bitmap runs in to data */ 322 /* bitmap runs in to data */
@@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
316 /* DATA METADATA BITMAP - no problems */ 325 /* DATA METADATA BITMAP - no problems */
317 } 326 }
318 md_super_write(mddev, rdev, 327 md_super_write(mddev, rdev,
319 rdev->sb_start + bitmap->offset 328 rdev->sb_start + offset
320 + page->index * (PAGE_SIZE/512), 329 + page->index * (PAGE_SIZE/512),
321 size, 330 size,
322 page); 331 page);
@@ -488,19 +497,24 @@ void bitmap_update_sb(struct bitmap *bitmap)
488 497
489 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 498 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
490 return; 499 return;
500 if (bitmap->mddev->bitmap_info.external)
501 return;
491 spin_lock_irqsave(&bitmap->lock, flags); 502 spin_lock_irqsave(&bitmap->lock, flags);
492 if (!bitmap->sb_page) { /* no superblock */ 503 if (!bitmap->sb_page) { /* no superblock */
493 spin_unlock_irqrestore(&bitmap->lock, flags); 504 spin_unlock_irqrestore(&bitmap->lock, flags);
494 return; 505 return;
495 } 506 }
496 spin_unlock_irqrestore(&bitmap->lock, flags); 507 spin_unlock_irqrestore(&bitmap->lock, flags);
497 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 508 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
498 sb->events = cpu_to_le64(bitmap->mddev->events); 509 sb->events = cpu_to_le64(bitmap->mddev->events);
499 if (bitmap->mddev->events < bitmap->events_cleared) { 510 if (bitmap->mddev->events < bitmap->events_cleared) {
500 /* rocking back to read-only */ 511 /* rocking back to read-only */
501 bitmap->events_cleared = bitmap->mddev->events; 512 bitmap->events_cleared = bitmap->mddev->events;
502 sb->events_cleared = cpu_to_le64(bitmap->events_cleared); 513 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
503 } 514 }
515 /* Just in case these have been changed via sysfs: */
516 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
517 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
504 kunmap_atomic(sb, KM_USER0); 518 kunmap_atomic(sb, KM_USER0);
505 write_page(bitmap, bitmap->sb_page, 1); 519 write_page(bitmap, bitmap->sb_page, 1);
506} 520}
@@ -512,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
512 526
513 if (!bitmap || !bitmap->sb_page) 527 if (!bitmap || !bitmap->sb_page)
514 return; 528 return;
515 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 529 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
516 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 530 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
517 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 531 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
518 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 532 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
550 564
551 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); 565 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
552 } else { 566 } else {
553 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 567 bitmap->sb_page = read_sb_page(bitmap->mddev,
568 bitmap->mddev->bitmap_info.offset,
554 NULL, 569 NULL,
555 0, sizeof(bitmap_super_t)); 570 0, sizeof(bitmap_super_t));
556 } 571 }
@@ -560,10 +575,10 @@ static int bitmap_read_sb(struct bitmap *bitmap)
560 return err; 575 return err;
561 } 576 }
562 577
563 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 578 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
564 579
565 chunksize = le32_to_cpu(sb->chunksize); 580 chunksize = le32_to_cpu(sb->chunksize);
566 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
567 write_behind = le32_to_cpu(sb->write_behind); 582 write_behind = le32_to_cpu(sb->write_behind);
568 583
569 /* verify that the bitmap-specific fields are valid */ 584 /* verify that the bitmap-specific fields are valid */
@@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
576 reason = "bitmap chunksize too small"; 591 reason = "bitmap chunksize too small";
577 else if ((1 << ffz(~chunksize)) != chunksize) 592 else if ((1 << ffz(~chunksize)) != chunksize)
578 reason = "bitmap chunksize not a power of 2"; 593 reason = "bitmap chunksize not a power of 2";
579 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) 594 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
580 reason = "daemon sleep period out of range"; 595 reason = "daemon sleep period out of range";
581 else if (write_behind > COUNTER_MAX) 596 else if (write_behind > COUNTER_MAX)
582 reason = "write-behind limit out of range (0 - 16383)"; 597 reason = "write-behind limit out of range (0 - 16383)";
@@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
610 } 625 }
611success: 626success:
612 /* assign fields using values from superblock */ 627 /* assign fields using values from superblock */
613 bitmap->chunksize = chunksize; 628 bitmap->mddev->bitmap_info.chunksize = chunksize;
614 bitmap->daemon_sleep = daemon_sleep; 629 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
615 bitmap->daemon_lastrun = jiffies; 630 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
616 bitmap->max_write_behind = write_behind;
617 bitmap->flags |= le32_to_cpu(sb->state); 631 bitmap->flags |= le32_to_cpu(sb->state);
618 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 632 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
619 bitmap->flags |= BITMAP_HOSTENDIAN; 633 bitmap->flags |= BITMAP_HOSTENDIAN;
@@ -647,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
647 return 0; 661 return 0;
648 } 662 }
649 spin_unlock_irqrestore(&bitmap->lock, flags); 663 spin_unlock_irqrestore(&bitmap->lock, flags);
650 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 664 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
651 old = le32_to_cpu(sb->state) & bits; 665 old = le32_to_cpu(sb->state) & bits;
652 switch (op) { 666 switch (op) {
653 case MASK_SET: sb->state |= cpu_to_le32(bits); 667 case MASK_SET: sb->state |= cpu_to_le32(bits);
@@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
664 * general bitmap file operations 678 * general bitmap file operations
665 */ 679 */
666 680
681/*
682 * on-disk bitmap:
683 *
684 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
685 * file a page at a time. There's a superblock at the start of the file.
686 */
667/* calculate the index of the page that contains this bit */ 687/* calculate the index of the page that contains this bit */
668static inline unsigned long file_page_index(unsigned long chunk) 688static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
669{ 689{
670 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; 690 if (!bitmap->mddev->bitmap_info.external)
691 chunk += sizeof(bitmap_super_t) << 3;
692 return chunk >> PAGE_BIT_SHIFT;
671} 693}
672 694
673/* calculate the (bit) offset of this bit within a page */ 695/* calculate the (bit) offset of this bit within a page */
674static inline unsigned long file_page_offset(unsigned long chunk) 696static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
675{ 697{
676 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); 698 if (!bitmap->mddev->bitmap_info.external)
699 chunk += sizeof(bitmap_super_t) << 3;
700 return chunk & (PAGE_BITS - 1);
677} 701}
678 702
679/* 703/*
@@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk)
686static inline struct page *filemap_get_page(struct bitmap *bitmap, 710static inline struct page *filemap_get_page(struct bitmap *bitmap,
687 unsigned long chunk) 711 unsigned long chunk)
688{ 712{
689 if (file_page_index(chunk) >= bitmap->file_pages) return NULL; 713 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL;
690 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; 714 return bitmap->filemap[file_page_index(bitmap, chunk)
715 - file_page_index(bitmap, 0)];
691} 716}
692 717
693 718
@@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
710 spin_unlock_irqrestore(&bitmap->lock, flags); 735 spin_unlock_irqrestore(&bitmap->lock, flags);
711 736
712 while (pages--) 737 while (pages--)
713 if (map[pages]->index != 0) /* 0 is sb_page, release it below */ 738 if (map[pages] != sb_page) /* 0 is sb_page, release it below */
714 free_buffers(map[pages]); 739 free_buffers(map[pages]);
715 kfree(map); 740 kfree(map);
716 kfree(attr); 741 kfree(attr);
@@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
821 846
822 page = filemap_get_page(bitmap, chunk); 847 page = filemap_get_page(bitmap, chunk);
823 if (!page) return; 848 if (!page) return;
824 bit = file_page_offset(chunk); 849 bit = file_page_offset(bitmap, chunk);
825 850
826 /* set the bit */ 851 /* set the bit */
827 kaddr = kmap_atomic(page, KM_USER0); 852 kaddr = kmap_atomic(page, KM_USER0);
@@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
907 chunks = bitmap->chunks; 932 chunks = bitmap->chunks;
908 file = bitmap->file; 933 file = bitmap->file;
909 934
910 BUG_ON(!file && !bitmap->offset); 935 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
911 936
912#ifdef INJECT_FAULTS_3 937#ifdef INJECT_FAULTS_3
913 outofdate = 1; 938 outofdate = 1;
@@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
919 "recovery\n", bmname(bitmap)); 944 "recovery\n", bmname(bitmap));
920 945
921 bytes = (chunks + 7) / 8; 946 bytes = (chunks + 7) / 8;
947 if (!bitmap->mddev->bitmap_info.external)
948 bytes += sizeof(bitmap_super_t);
922 949
923 num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; 950
951 num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
924 952
925 if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { 953 if (file && i_size_read(file->f_mapping->host) < bytes) {
926 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 954 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
927 bmname(bitmap), 955 bmname(bitmap),
928 (unsigned long) i_size_read(file->f_mapping->host), 956 (unsigned long) i_size_read(file->f_mapping->host),
929 bytes + sizeof(bitmap_super_t)); 957 bytes);
930 goto err; 958 goto err;
931 } 959 }
932 960
@@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
947 975
948 for (i = 0; i < chunks; i++) { 976 for (i = 0; i < chunks; i++) {
949 int b; 977 int b;
950 index = file_page_index(i); 978 index = file_page_index(bitmap, i);
951 bit = file_page_offset(i); 979 bit = file_page_offset(bitmap, i);
952 if (index != oldindex) { /* this is a new page, read it in */ 980 if (index != oldindex) { /* this is a new page, read it in */
953 int count; 981 int count;
954 /* unmap the old page, we're done with it */ 982 /* unmap the old page, we're done with it */
955 if (index == num_pages-1) 983 if (index == num_pages-1)
956 count = bytes + sizeof(bitmap_super_t) 984 count = bytes - index * PAGE_SIZE;
957 - index * PAGE_SIZE;
958 else 985 else
959 count = PAGE_SIZE; 986 count = PAGE_SIZE;
960 if (index == 0) { 987 if (index == 0 && bitmap->sb_page) {
961 /* 988 /*
962 * if we're here then the superblock page 989 * if we're here then the superblock page
963 * contains some bits (PAGE_SIZE != sizeof sb) 990 * contains some bits (PAGE_SIZE != sizeof sb)
@@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
967 offset = sizeof(bitmap_super_t); 994 offset = sizeof(bitmap_super_t);
968 if (!file) 995 if (!file)
969 read_sb_page(bitmap->mddev, 996 read_sb_page(bitmap->mddev,
970 bitmap->offset, 997 bitmap->mddev->bitmap_info.offset,
971 page, 998 page,
972 index, count); 999 index, count);
973 } else if (file) { 1000 } else if (file) {
974 page = read_page(file, index, bitmap, count); 1001 page = read_page(file, index, bitmap, count);
975 offset = 0; 1002 offset = 0;
976 } else { 1003 } else {
977 page = read_sb_page(bitmap->mddev, bitmap->offset, 1004 page = read_sb_page(bitmap->mddev,
1005 bitmap->mddev->bitmap_info.offset,
978 NULL, 1006 NULL,
979 index, count); 1007 index, count);
980 offset = 0; 1008 offset = 0;
@@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1078 * out to disk 1106 * out to disk
1079 */ 1107 */
1080 1108
1081void bitmap_daemon_work(struct bitmap *bitmap) 1109void bitmap_daemon_work(mddev_t *mddev)
1082{ 1110{
1111 struct bitmap *bitmap;
1083 unsigned long j; 1112 unsigned long j;
1084 unsigned long flags; 1113 unsigned long flags;
1085 struct page *page = NULL, *lastpage = NULL; 1114 struct page *page = NULL, *lastpage = NULL;
1086 int blocks; 1115 int blocks;
1087 void *paddr; 1116 void *paddr;
1088 1117
1089 if (bitmap == NULL) 1118 /* Use a mutex to guard daemon_work against
1119 * bitmap_destroy.
1120 */
1121 mutex_lock(&mddev->bitmap_info.mutex);
1122 bitmap = mddev->bitmap;
1123 if (bitmap == NULL) {
1124 mutex_unlock(&mddev->bitmap_info.mutex);
1090 return; 1125 return;
1091 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) 1126 }
1127 if (time_before(jiffies, bitmap->daemon_lastrun
1128 + bitmap->mddev->bitmap_info.daemon_sleep))
1092 goto done; 1129 goto done;
1093 1130
1094 bitmap->daemon_lastrun = jiffies; 1131 bitmap->daemon_lastrun = jiffies;
1095 if (bitmap->allclean) { 1132 if (bitmap->allclean) {
1096 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1133 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1097 return; 1134 goto done;
1098 } 1135 }
1099 bitmap->allclean = 1; 1136 bitmap->allclean = 1;
1100 1137
@@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1142 /* We are possibly going to clear some bits, so make 1179 /* We are possibly going to clear some bits, so make
1143 * sure that events_cleared is up-to-date. 1180 * sure that events_cleared is up-to-date.
1144 */ 1181 */
1145 if (bitmap->need_sync) { 1182 if (bitmap->need_sync &&
1183 bitmap->mddev->bitmap_info.external == 0) {
1146 bitmap_super_t *sb; 1184 bitmap_super_t *sb;
1147 bitmap->need_sync = 0; 1185 bitmap->need_sync = 0;
1148 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 1186 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1152 write_page(bitmap, bitmap->sb_page, 1); 1190 write_page(bitmap, bitmap->sb_page, 1);
1153 } 1191 }
1154 spin_lock_irqsave(&bitmap->lock, flags); 1192 spin_lock_irqsave(&bitmap->lock, flags);
1155 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1193 if (!bitmap->need_sync)
1194 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1156 } 1195 }
1157 bmc = bitmap_get_counter(bitmap, 1196 bmc = bitmap_get_counter(bitmap,
1158 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1197 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
@@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1167 if (*bmc == 2) { 1206 if (*bmc == 2) {
1168 *bmc=1; /* maybe clear the bit next time */ 1207 *bmc=1; /* maybe clear the bit next time */
1169 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1208 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1170 } else if (*bmc == 1) { 1209 } else if (*bmc == 1 && !bitmap->need_sync) {
1171 /* we can clear the bit */ 1210 /* we can clear the bit */
1172 *bmc = 0; 1211 *bmc = 0;
1173 bitmap_count_page(bitmap, 1212 bitmap_count_page(bitmap,
@@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1177 /* clear the bit */ 1216 /* clear the bit */
1178 paddr = kmap_atomic(page, KM_USER0); 1217 paddr = kmap_atomic(page, KM_USER0);
1179 if (bitmap->flags & BITMAP_HOSTENDIAN) 1218 if (bitmap->flags & BITMAP_HOSTENDIAN)
1180 clear_bit(file_page_offset(j), paddr); 1219 clear_bit(file_page_offset(bitmap, j),
1220 paddr);
1181 else 1221 else
1182 ext2_clear_bit(file_page_offset(j), paddr); 1222 ext2_clear_bit(file_page_offset(bitmap, j),
1223 paddr);
1183 kunmap_atomic(paddr, KM_USER0); 1224 kunmap_atomic(paddr, KM_USER0);
1184 } 1225 }
1185 } else 1226 } else
@@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1202 1243
1203 done: 1244 done:
1204 if (bitmap->allclean == 0) 1245 if (bitmap->allclean == 0)
1205 bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1246 bitmap->mddev->thread->timeout =
1247 bitmap->mddev->bitmap_info.daemon_sleep;
1248 mutex_unlock(&mddev->bitmap_info.mutex);
1206} 1249}
1207 1250
1208static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1251static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1249,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1249 if (!bitmap) return 0; 1292 if (!bitmap) return 0;
1250 1293
1251 if (behind) { 1294 if (behind) {
1295 int bw;
1252 atomic_inc(&bitmap->behind_writes); 1296 atomic_inc(&bitmap->behind_writes);
1297 bw = atomic_read(&bitmap->behind_writes);
1298 if (bw > bitmap->behind_writes_used)
1299 bitmap->behind_writes_used = bw;
1300
1253 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", 1301 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1254 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1302 bw, bitmap->max_write_behind);
1255 } 1303 }
1256 1304
1257 while (sectors) { 1305 while (sectors) {
@@ -1308,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1308{ 1356{
1309 if (!bitmap) return; 1357 if (!bitmap) return;
1310 if (behind) { 1358 if (behind) {
1311 atomic_dec(&bitmap->behind_writes); 1359 if (atomic_dec_and_test(&bitmap->behind_writes))
1360 wake_up(&bitmap->behind_wait);
1312 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", 1361 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1313 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1362 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1314 } 1363 }
@@ -1332,6 +1381,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1332 bitmap->events_cleared < bitmap->mddev->events) { 1381 bitmap->events_cleared < bitmap->mddev->events) {
1333 bitmap->events_cleared = bitmap->mddev->events; 1382 bitmap->events_cleared = bitmap->mddev->events;
1334 bitmap->need_sync = 1; 1383 bitmap->need_sync = 1;
1384 sysfs_notify_dirent(bitmap->sysfs_can_clear);
1335 } 1385 }
1336 1386
1337 if (!success && ! (*bmc & NEEDED_MASK)) 1387 if (!success && ! (*bmc & NEEDED_MASK))
@@ -1470,7 +1520,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1470 return; 1520 return;
1471 } 1521 }
1472 if (time_before(jiffies, (bitmap->last_end_sync 1522 if (time_before(jiffies, (bitmap->last_end_sync
1473 + bitmap->daemon_sleep * HZ))) 1523 + bitmap->mddev->bitmap_info.daemon_sleep)))
1474 return; 1524 return;
1475 wait_event(bitmap->mddev->recovery_wait, 1525 wait_event(bitmap->mddev->recovery_wait,
1476 atomic_read(&bitmap->mddev->recovery_active) == 0); 1526 atomic_read(&bitmap->mddev->recovery_active) == 0);
@@ -1522,6 +1572,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1522 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1572 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
1523 bitmap_set_memory_bits(bitmap, sec, 1); 1573 bitmap_set_memory_bits(bitmap, sec, 1);
1524 bitmap_file_set_bit(bitmap, sec); 1574 bitmap_file_set_bit(bitmap, sec);
1575 if (sec < bitmap->mddev->recovery_cp)
1576 /* We are asserting that the array is dirty,
1577 * so move the recovery_cp address back so
1578 * that it is obvious that it is dirty
1579 */
1580 bitmap->mddev->recovery_cp = sec;
1525 } 1581 }
1526} 1582}
1527 1583
@@ -1531,7 +1587,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1531void bitmap_flush(mddev_t *mddev) 1587void bitmap_flush(mddev_t *mddev)
1532{ 1588{
1533 struct bitmap *bitmap = mddev->bitmap; 1589 struct bitmap *bitmap = mddev->bitmap;
1534 int sleep; 1590 long sleep;
1535 1591
1536 if (!bitmap) /* there was no bitmap */ 1592 if (!bitmap) /* there was no bitmap */
1537 return; 1593 return;
@@ -1539,12 +1595,13 @@ void bitmap_flush(mddev_t *mddev)
1539 /* run the daemon_work three time to ensure everything is flushed 1595 /* run the daemon_work three time to ensure everything is flushed
1540 * that can be 1596 * that can be
1541 */ 1597 */
1542 sleep = bitmap->daemon_sleep; 1598 sleep = mddev->bitmap_info.daemon_sleep * 2;
1543 bitmap->daemon_sleep = 0; 1599 bitmap->daemon_lastrun -= sleep;
1544 bitmap_daemon_work(bitmap); 1600 bitmap_daemon_work(mddev);
1545 bitmap_daemon_work(bitmap); 1601 bitmap->daemon_lastrun -= sleep;
1546 bitmap_daemon_work(bitmap); 1602 bitmap_daemon_work(mddev);
1547 bitmap->daemon_sleep = sleep; 1603 bitmap->daemon_lastrun -= sleep;
1604 bitmap_daemon_work(mddev);
1548 bitmap_update_sb(bitmap); 1605 bitmap_update_sb(bitmap);
1549} 1606}
1550 1607
@@ -1574,6 +1631,7 @@ static void bitmap_free(struct bitmap *bitmap)
1574 kfree(bp); 1631 kfree(bp);
1575 kfree(bitmap); 1632 kfree(bitmap);
1576} 1633}
1634
1577void bitmap_destroy(mddev_t *mddev) 1635void bitmap_destroy(mddev_t *mddev)
1578{ 1636{
1579 struct bitmap *bitmap = mddev->bitmap; 1637 struct bitmap *bitmap = mddev->bitmap;
@@ -1581,10 +1639,15 @@ void bitmap_destroy(mddev_t *mddev)
1581 if (!bitmap) /* there was no bitmap */ 1639 if (!bitmap) /* there was no bitmap */
1582 return; 1640 return;
1583 1641
1642 mutex_lock(&mddev->bitmap_info.mutex);
1584 mddev->bitmap = NULL; /* disconnect from the md device */ 1643 mddev->bitmap = NULL; /* disconnect from the md device */
1644 mutex_unlock(&mddev->bitmap_info.mutex);
1585 if (mddev->thread) 1645 if (mddev->thread)
1586 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1646 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1587 1647
1648 if (bitmap->sysfs_can_clear)
1649 sysfs_put(bitmap->sysfs_can_clear);
1650
1588 bitmap_free(bitmap); 1651 bitmap_free(bitmap);
1589} 1652}
1590 1653
@@ -1598,16 +1661,17 @@ int bitmap_create(mddev_t *mddev)
1598 sector_t blocks = mddev->resync_max_sectors; 1661 sector_t blocks = mddev->resync_max_sectors;
1599 unsigned long chunks; 1662 unsigned long chunks;
1600 unsigned long pages; 1663 unsigned long pages;
1601 struct file *file = mddev->bitmap_file; 1664 struct file *file = mddev->bitmap_info.file;
1602 int err; 1665 int err;
1603 sector_t start; 1666 sector_t start;
1667 struct sysfs_dirent *bm;
1604 1668
1605 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1669 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1606 1670
1607 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ 1671 if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1608 return 0; 1672 return 0;
1609 1673
1610 BUG_ON(file && mddev->bitmap_offset); 1674 BUG_ON(file && mddev->bitmap_info.offset);
1611 1675
1612 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1676 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1613 if (!bitmap) 1677 if (!bitmap)
@@ -1617,24 +1681,42 @@ int bitmap_create(mddev_t *mddev)
1617 atomic_set(&bitmap->pending_writes, 0); 1681 atomic_set(&bitmap->pending_writes, 0);
1618 init_waitqueue_head(&bitmap->write_wait); 1682 init_waitqueue_head(&bitmap->write_wait);
1619 init_waitqueue_head(&bitmap->overflow_wait); 1683 init_waitqueue_head(&bitmap->overflow_wait);
1684 init_waitqueue_head(&bitmap->behind_wait);
1620 1685
1621 bitmap->mddev = mddev; 1686 bitmap->mddev = mddev;
1622 1687
1688 bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap");
1689 if (bm) {
1690 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear");
1691 sysfs_put(bm);
1692 } else
1693 bitmap->sysfs_can_clear = NULL;
1694
1623 bitmap->file = file; 1695 bitmap->file = file;
1624 bitmap->offset = mddev->bitmap_offset;
1625 if (file) { 1696 if (file) {
1626 get_file(file); 1697 get_file(file);
1627 do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX, 1698 /* As future accesses to this file will use bmap,
1628 SYNC_FILE_RANGE_WAIT_BEFORE | 1699 * and bypass the page cache, we must sync the file
1629 SYNC_FILE_RANGE_WRITE | 1700 * first.
1630 SYNC_FILE_RANGE_WAIT_AFTER); 1701 */
1702 vfs_fsync(file, 1);
1703 }
1704 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
1705 if (!mddev->bitmap_info.external)
1706 err = bitmap_read_sb(bitmap);
1707 else {
1708 err = 0;
1709 if (mddev->bitmap_info.chunksize == 0 ||
1710 mddev->bitmap_info.daemon_sleep == 0)
1711 /* chunksize and time_base need to be
1712 * set first. */
1713 err = -EINVAL;
1631 } 1714 }
1632 /* read superblock from bitmap file (this sets bitmap->chunksize) */
1633 err = bitmap_read_sb(bitmap);
1634 if (err) 1715 if (err)
1635 goto error; 1716 goto error;
1636 1717
1637 bitmap->chunkshift = ffz(~bitmap->chunksize); 1718 bitmap->daemon_lastrun = jiffies;
1719 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
1638 1720
1639 /* now that chunksize and chunkshift are set, we can use these macros */ 1721 /* now that chunksize and chunkshift are set, we can use these macros */
1640 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1722 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
@@ -1676,7 +1758,8 @@ int bitmap_create(mddev_t *mddev)
1676 1758
1677 mddev->bitmap = bitmap; 1759 mddev->bitmap = bitmap;
1678 1760
1679 mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1761 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1762 md_wakeup_thread(mddev->thread);
1680 1763
1681 bitmap_update_sb(bitmap); 1764 bitmap_update_sb(bitmap);
1682 1765
@@ -1687,6 +1770,286 @@ int bitmap_create(mddev_t *mddev)
1687 return err; 1770 return err;
1688} 1771}
1689 1772
1773static ssize_t
1774location_show(mddev_t *mddev, char *page)
1775{
1776 ssize_t len;
1777 if (mddev->bitmap_info.file) {
1778 len = sprintf(page, "file");
1779 } else if (mddev->bitmap_info.offset) {
1780 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
1781 } else
1782 len = sprintf(page, "none");
1783 len += sprintf(page+len, "\n");
1784 return len;
1785}
1786
1787static ssize_t
1788location_store(mddev_t *mddev, const char *buf, size_t len)
1789{
1790
1791 if (mddev->pers) {
1792 if (!mddev->pers->quiesce)
1793 return -EBUSY;
1794 if (mddev->recovery || mddev->sync_thread)
1795 return -EBUSY;
1796 }
1797
1798 if (mddev->bitmap || mddev->bitmap_info.file ||
1799 mddev->bitmap_info.offset) {
1800 /* bitmap already configured. Only option is to clear it */
1801 if (strncmp(buf, "none", 4) != 0)
1802 return -EBUSY;
1803 if (mddev->pers) {
1804 mddev->pers->quiesce(mddev, 1);
1805 bitmap_destroy(mddev);
1806 mddev->pers->quiesce(mddev, 0);
1807 }
1808 mddev->bitmap_info.offset = 0;
1809 if (mddev->bitmap_info.file) {
1810 struct file *f = mddev->bitmap_info.file;
1811 mddev->bitmap_info.file = NULL;
1812 restore_bitmap_write_access(f);
1813 fput(f);
1814 }
1815 } else {
1816 /* No bitmap, OK to set a location */
1817 long long offset;
1818 if (strncmp(buf, "none", 4) == 0)
1819 /* nothing to be done */;
1820 else if (strncmp(buf, "file:", 5) == 0) {
1821 /* Not supported yet */
1822 return -EINVAL;
1823 } else {
1824 int rv;
1825 if (buf[0] == '+')
1826 rv = strict_strtoll(buf+1, 10, &offset);
1827 else
1828 rv = strict_strtoll(buf, 10, &offset);
1829 if (rv)
1830 return rv;
1831 if (offset == 0)
1832 return -EINVAL;
1833 if (mddev->bitmap_info.external == 0 &&
1834 mddev->major_version == 0 &&
1835 offset != mddev->bitmap_info.default_offset)
1836 return -EINVAL;
1837 mddev->bitmap_info.offset = offset;
1838 if (mddev->pers) {
1839 mddev->pers->quiesce(mddev, 1);
1840 rv = bitmap_create(mddev);
1841 if (rv) {
1842 bitmap_destroy(mddev);
1843 mddev->bitmap_info.offset = 0;
1844 }
1845 mddev->pers->quiesce(mddev, 0);
1846 if (rv)
1847 return rv;
1848 }
1849 }
1850 }
1851 if (!mddev->external) {
1852 /* Ensure new bitmap info is stored in
1853 * metadata promptly.
1854 */
1855 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1856 md_wakeup_thread(mddev->thread);
1857 }
1858 return len;
1859}
1860
1861static struct md_sysfs_entry bitmap_location =
1862__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
1863
1864static ssize_t
1865timeout_show(mddev_t *mddev, char *page)
1866{
1867 ssize_t len;
1868 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
1869 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
1870
1871 len = sprintf(page, "%lu", secs);
1872 if (jifs)
1873 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
1874 len += sprintf(page+len, "\n");
1875 return len;
1876}
1877
1878static ssize_t
1879timeout_store(mddev_t *mddev, const char *buf, size_t len)
1880{
1881 /* timeout can be set at any time */
1882 unsigned long timeout;
1883 int rv = strict_strtoul_scaled(buf, &timeout, 4);
1884 if (rv)
1885 return rv;
1886
1887 /* just to make sure we don't overflow... */
1888 if (timeout >= LONG_MAX / HZ)
1889 return -EINVAL;
1890
1891 timeout = timeout * HZ / 10000;
1892
1893 if (timeout >= MAX_SCHEDULE_TIMEOUT)
1894 timeout = MAX_SCHEDULE_TIMEOUT-1;
1895 if (timeout < 1)
1896 timeout = 1;
1897 mddev->bitmap_info.daemon_sleep = timeout;
1898 if (mddev->thread) {
1899 /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
1900 * the bitmap is all clean and we don't need to
1901 * adjust the timeout right now
1902 */
1903 if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
1904 mddev->thread->timeout = timeout;
1905 md_wakeup_thread(mddev->thread);
1906 }
1907 }
1908 return len;
1909}
1910
1911static struct md_sysfs_entry bitmap_timeout =
1912__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
1913
1914static ssize_t
1915backlog_show(mddev_t *mddev, char *page)
1916{
1917 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
1918}
1919
1920static ssize_t
1921backlog_store(mddev_t *mddev, const char *buf, size_t len)
1922{
1923 unsigned long backlog;
1924 int rv = strict_strtoul(buf, 10, &backlog);
1925 if (rv)
1926 return rv;
1927 if (backlog > COUNTER_MAX)
1928 return -EINVAL;
1929 mddev->bitmap_info.max_write_behind = backlog;
1930 return len;
1931}
1932
1933static struct md_sysfs_entry bitmap_backlog =
1934__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
1935
1936static ssize_t
1937chunksize_show(mddev_t *mddev, char *page)
1938{
1939 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
1940}
1941
1942static ssize_t
1943chunksize_store(mddev_t *mddev, const char *buf, size_t len)
1944{
1945 /* Can only be changed when no bitmap is active */
1946 int rv;
1947 unsigned long csize;
1948 if (mddev->bitmap)
1949 return -EBUSY;
1950 rv = strict_strtoul(buf, 10, &csize);
1951 if (rv)
1952 return rv;
1953 if (csize < 512 ||
1954 !is_power_of_2(csize))
1955 return -EINVAL;
1956 mddev->bitmap_info.chunksize = csize;
1957 return len;
1958}
1959
1960static struct md_sysfs_entry bitmap_chunksize =
1961__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
1962
1963static ssize_t metadata_show(mddev_t *mddev, char *page)
1964{
1965 return sprintf(page, "%s\n", (mddev->bitmap_info.external
1966 ? "external" : "internal"));
1967}
1968
1969static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
1970{
1971 if (mddev->bitmap ||
1972 mddev->bitmap_info.file ||
1973 mddev->bitmap_info.offset)
1974 return -EBUSY;
1975 if (strncmp(buf, "external", 8) == 0)
1976 mddev->bitmap_info.external = 1;
1977 else if (strncmp(buf, "internal", 8) == 0)
1978 mddev->bitmap_info.external = 0;
1979 else
1980 return -EINVAL;
1981 return len;
1982}
1983
1984static struct md_sysfs_entry bitmap_metadata =
1985__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
1986
1987static ssize_t can_clear_show(mddev_t *mddev, char *page)
1988{
1989 int len;
1990 if (mddev->bitmap)
1991 len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
1992 "false" : "true"));
1993 else
1994 len = sprintf(page, "\n");
1995 return len;
1996}
1997
1998static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
1999{
2000 if (mddev->bitmap == NULL)
2001 return -ENOENT;
2002 if (strncmp(buf, "false", 5) == 0)
2003 mddev->bitmap->need_sync = 1;
2004 else if (strncmp(buf, "true", 4) == 0) {
2005 if (mddev->degraded)
2006 return -EBUSY;
2007 mddev->bitmap->need_sync = 0;
2008 } else
2009 return -EINVAL;
2010 return len;
2011}
2012
2013static struct md_sysfs_entry bitmap_can_clear =
2014__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2015
2016static ssize_t
2017behind_writes_used_show(mddev_t *mddev, char *page)
2018{
2019 if (mddev->bitmap == NULL)
2020 return sprintf(page, "0\n");
2021 return sprintf(page, "%lu\n",
2022 mddev->bitmap->behind_writes_used);
2023}
2024
2025static ssize_t
2026behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
2027{
2028 if (mddev->bitmap)
2029 mddev->bitmap->behind_writes_used = 0;
2030 return len;
2031}
2032
2033static struct md_sysfs_entry max_backlog_used =
2034__ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2035 behind_writes_used_show, behind_writes_used_reset);
2036
2037static struct attribute *md_bitmap_attrs[] = {
2038 &bitmap_location.attr,
2039 &bitmap_timeout.attr,
2040 &bitmap_backlog.attr,
2041 &bitmap_chunksize.attr,
2042 &bitmap_metadata.attr,
2043 &bitmap_can_clear.attr,
2044 &max_backlog_used.attr,
2045 NULL
2046};
2047struct attribute_group md_bitmap_group = {
2048 .name = "bitmap",
2049 .attrs = md_bitmap_attrs,
2050};
2051
2052
1690/* the bitmap API -- for raid personalities */ 2053/* the bitmap API -- for raid personalities */
1691EXPORT_SYMBOL(bitmap_startwrite); 2054EXPORT_SYMBOL(bitmap_startwrite);
1692EXPORT_SYMBOL(bitmap_endwrite); 2055EXPORT_SYMBOL(bitmap_endwrite);
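Editor's note: the bitmap.c changes above move the daemon sleep interval into mddev->bitmap_info.daemon_sleep, now stored in jiffies, and add a bitmap/time_base sysfs attribute whose store routine converts a decimal seconds value via strict_strtoul_scaled(buf, &timeout, 4) followed by timeout * HZ / 10000. The following is a minimal user-space sketch of that arithmetic, not kernel code; it assumes strict_strtoul_scaled() with a scale of 4 yields the written value multiplied by 10^4 (which is what the subsequent division by 10000 implies), and it pins HZ and MAX_SCHEDULE_TIMEOUT to stand-in values purely for illustration. The kernel version additionally rejects values that would overflow LONG_MAX / HZ.

/* Sketch only: mirrors the time_base -> jiffies conversion in timeout_store(). */
#include <stdio.h>

#define HZ 1000
#define MAX_SCHEDULE_TIMEOUT ((unsigned long)(~0UL >> 1))

static unsigned long time_base_to_jiffies(unsigned long scaled)
{
	/* "scaled" is the written value times 10^4, e.g. "5.5" -> 55000 */
	unsigned long timeout = scaled * HZ / 10000;

	if (timeout >= MAX_SCHEDULE_TIMEOUT)
		timeout = MAX_SCHEDULE_TIMEOUT - 1;
	if (timeout < 1)
		timeout = 1;	/* never let the daemon sleep forever by accident */
	return timeout;
}

int main(void)
{
	printf("5.5s    -> %lu jiffies\n", time_base_to_jiffies(55000)); /* 5500 with HZ=1000 */
	printf("0.0001s -> %lu jiffies\n", time_base_to_jiffies(1));     /* clamped to 1 */
	return 0;
}

Because the stored value is already in jiffies, bitmap_daemon_work() and bitmap_cond_end_sync() above can compare it directly against jiffies without multiplying by HZ again.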
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e98900671ca9..3797dea4723a 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t;
106#define BITMAP_BLOCK_SHIFT 9 106#define BITMAP_BLOCK_SHIFT 9
107 107
108/* how many blocks per chunk? (this is variable) */ 108/* how many blocks per chunk? (this is variable) */
109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) 109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) 110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) 111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
112 112
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) 118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) 119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
120 120
121/*
122 * on-disk bitmap:
123 *
124 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
125 * file a page at a time. There's a superblock at the start of the file.
126 */
127
128/* map chunks (bits) to file pages - offset by the size of the superblock */
129#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
130
131#endif 121#endif
132 122
133/* 123/*
@@ -209,7 +199,6 @@ struct bitmap {
209 int counter_bits; /* how many bits per block counter */ 199 int counter_bits; /* how many bits per block counter */
210 200
211 /* bitmap chunksize -- how much data does each bit represent? */ 201 /* bitmap chunksize -- how much data does each bit represent? */
212 unsigned long chunksize;
213 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 202 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
214 unsigned long chunks; /* total number of data chunks for the array */ 203 unsigned long chunks; /* total number of data chunks for the array */
215 204
@@ -226,7 +215,6 @@ struct bitmap {
226 /* bitmap spinlock */ 215 /* bitmap spinlock */
227 spinlock_t lock; 216 spinlock_t lock;
228 217
229 long offset; /* offset from superblock if file is NULL */
230 struct file *file; /* backing disk file */ 218 struct file *file; /* backing disk file */
231 struct page *sb_page; /* cached copy of the bitmap file superblock */ 219 struct page *sb_page; /* cached copy of the bitmap file superblock */
232 struct page **filemap; /* list of cache pages for the file */ 220 struct page **filemap; /* list of cache pages for the file */
@@ -238,22 +226,23 @@ struct bitmap {
238 226
239 int allclean; 227 int allclean;
240 228
241 unsigned long max_write_behind; /* write-behind mode */
242 atomic_t behind_writes; 229 atomic_t behind_writes;
230 unsigned long behind_writes_used; /* highest actual value at runtime */
243 231
244 /* 232 /*
245 * the bitmap daemon - periodically wakes up and sweeps the bitmap 233 * the bitmap daemon - periodically wakes up and sweeps the bitmap
246 * file, cleaning up bits and flushing out pages to disk as necessary 234 * file, cleaning up bits and flushing out pages to disk as necessary
247 */ 235 */
248 unsigned long daemon_lastrun; /* jiffies of last run */ 236 unsigned long daemon_lastrun; /* jiffies of last run */
249 unsigned long daemon_sleep; /* how many seconds between updates? */
250 unsigned long last_end_sync; /* when we lasted called end_sync to 237 unsigned long last_end_sync; /* when we lasted called end_sync to
251 * update bitmap with resync progress */ 238 * update bitmap with resync progress */
252 239
253 atomic_t pending_writes; /* pending writes to the bitmap file */ 240 atomic_t pending_writes; /* pending writes to the bitmap file */
254 wait_queue_head_t write_wait; 241 wait_queue_head_t write_wait;
255 wait_queue_head_t overflow_wait; 242 wait_queue_head_t overflow_wait;
243 wait_queue_head_t behind_wait;
256 244
245 struct sysfs_dirent *sysfs_can_clear;
257}; 246};
258 247
259/* the bitmap API */ 248/* the bitmap API */
@@ -282,7 +271,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
282void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 271void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
283 272
284void bitmap_unplug(struct bitmap *bitmap); 273void bitmap_unplug(struct bitmap *bitmap);
285void bitmap_daemon_work(struct bitmap *bitmap); 274void bitmap_daemon_work(mddev_t *mddev);
286#endif 275#endif
287 276
288#endif 277#endif
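Editor's note: with CHUNK_BIT_OFFSET removed from bitmap.h, the superblock offset is now applied inside file_page_index()/file_page_offset() in bitmap.c, and only when the bitmap is internal (mddev->bitmap_info.external == 0). The rough user-space model below is a sketch, not kernel code; PAGE_BITS and PAGE_BIT_SHIFT are not shown in this diff, so it assumes a 4096-byte page (32768 bits, shift of 15), and it uses the 256-byte bitmap_super_t size that bitmap_create() asserts with BUILD_BUG_ON.

/* Sketch only: models the new file_page_index()/file_page_offset() logic. */
#include <stdio.h>

#define PAGE_BITS	(4096UL * 8)	/* bits per bitmap file page (assumed) */
#define PAGE_BIT_SHIFT	15		/* log2(PAGE_BITS) (assumed) */
#define SB_BITS		(256UL << 3)	/* sizeof(bitmap_super_t) << 3 = 2048 bits */

static unsigned long page_index(unsigned long chunk, int external)
{
	if (!external)
		chunk += SB_BITS;	/* internal bitmaps start after the superblock */
	return chunk >> PAGE_BIT_SHIFT;
}

static unsigned long page_bit(unsigned long chunk, int external)
{
	if (!external)
		chunk += SB_BITS;
	return chunk & (PAGE_BITS - 1);
}

int main(void)
{
	/* internal: the first chunk sits right after the 2048 superblock bits */
	printf("internal chunk 0     -> page %lu bit %lu\n",
	       page_index(0, 0), page_bit(0, 0));
	/* external: no embedded superblock, so chunk 0 is simply bit 0 */
	printf("external chunk 0     -> page %lu bit %lu\n",
	       page_index(0, 1), page_bit(0, 1));
	/* internal: chunk 30720 is the first one pushed onto page 1 */
	printf("internal chunk 30720 -> page %lu bit %lu\n",
	       page_index(30720, 0), page_bit(30720, 0));
	return 0;
}

External bitmaps skip the 2048-bit adjustment entirely, which is the point of passing the bitmap pointer into these helpers.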
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ed1038164019..3bdbb6115702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This file is released under the GPL. 6 * This file is released under the GPL.
7 */ 7 */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti, 71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
72 const char *opts); 72 const char *opts);
73 void (*dtr)(struct crypt_config *cc); 73 void (*dtr)(struct crypt_config *cc);
74 const char *(*status)(struct crypt_config *cc); 74 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc);
75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
76}; 77};
77 78
79struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm;
82 u8 *salt;
83};
84
85struct iv_benbi_private {
86 int shift;
87};
88
78/* 89/*
79 * Crypt: maps a linear range of a block device 90 * Crypt: maps a linear range of a block device
80 * and encrypts / decrypts at the same time. 91 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
102 struct crypt_iv_operations *iv_gen_ops; 113 struct crypt_iv_operations *iv_gen_ops;
103 char *iv_mode; 114 char *iv_mode;
104 union { 115 union {
105 struct crypto_cipher *essiv_tfm; 116 struct iv_essiv_private essiv;
106 int benbi_shift; 117 struct iv_benbi_private benbi;
107 } iv_gen_private; 118 } iv_gen_private;
108 sector_t iv_offset; 119 sector_t iv_offset;
109 unsigned int iv_size; 120 unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
147 * plain: the initial vector is the 32-bit little-endian version of the sector 158 * plain: the initial vector is the 32-bit little-endian version of the sector
148 * number, padded with zeros if necessary. 159 * number, padded with zeros if necessary.
149 * 160 *
161 * plain64: the initial vector is the 64-bit little-endian version of the sector
162 * number, padded with zeros if necessary.
163 *
150 * essiv: "encrypted sector|salt initial vector", the sector number is 164 * essiv: "encrypted sector|salt initial vector", the sector number is
151 * encrypted with the bulk cipher using a salt as key. The salt 165 * encrypted with the bulk cipher using a salt as key. The salt
152 * should be derived from the bulk cipher's key via hashing. 166 * should be derived from the bulk cipher's key via hashing.
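Editor's note: as a quick illustration of the plain vs. plain64 behaviour described in the comment above (a stand-alone user-space sketch, not dm-crypt code): both IVs are the little-endian sector number zero-padded to the cipher's IV size, but plain keeps only the low 32 bits of the sector, while the plain64 mode added by this patch keeps all 64 bits, so sectors beyond 2^32 no longer repeat IVs. A 16-byte IV is assumed here.

/* Sketch only: layout of "plain" vs "plain64" IVs for a 16-byte IV. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void make_iv(uint8_t *iv, size_t iv_size, uint64_t sector, int use_64bit)
{
	uint64_t s = use_64bit ? sector : (uint32_t)sector;	/* plain truncates to 32 bits */

	memset(iv, 0, iv_size);
	for (size_t i = 0; i < sizeof(s) && i < iv_size; i++)
		iv[i] = (uint8_t)(s >> (8 * i));	/* little-endian store */
}

static void dump(const char *name, const uint8_t *iv, size_t n)
{
	printf("%-8s", name);
	for (size_t i = 0; i < n; i++)
		printf("%02x", iv[i]);
	printf("\n");
}

int main(void)
{
	uint8_t iv[16];
	uint64_t sector = 0x0000000123456789ULL;	/* above 2^32, so the two modes differ */

	make_iv(iv, sizeof(iv), sector, 0);
	dump("plain", iv, sizeof(iv));		/* 89674523 followed by zeros */
	make_iv(iv, sizeof(iv), sector, 1);
	dump("plain64", iv, sizeof(iv));	/* 8967452301000000 followed by zeros */
	return 0;
}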
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
169 return 0; 183 return 0;
170} 184}
171 185
172static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 186static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
173 const char *opts) 187 sector_t sector)
174{ 188{
175 struct crypto_cipher *essiv_tfm; 189 memset(iv, 0, cc->iv_size);
176 struct crypto_hash *hash_tfm; 190 *(u64 *)iv = cpu_to_le64(sector);
191
192 return 0;
193}
194
195/* Initialise ESSIV - compute salt but no local memory allocations */
196static int crypt_iv_essiv_init(struct crypt_config *cc)
197{
198 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
177 struct hash_desc desc; 199 struct hash_desc desc;
178 struct scatterlist sg; 200 struct scatterlist sg;
179 unsigned int saltsize;
180 u8 *salt;
181 int err; 201 int err;
182 202
183 if (opts == NULL) { 203 sg_init_one(&sg, cc->key, cc->key_size);
204 desc.tfm = essiv->hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206
207 err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
208 if (err)
209 return err;
210
211 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
212 crypto_hash_digestsize(essiv->hash_tfm));
213}
214
215/* Wipe salt and reset key derived from volume key */
216static int crypt_iv_essiv_wipe(struct crypt_config *cc)
217{
218 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
219 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
220
221 memset(essiv->salt, 0, salt_size);
222
223 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
224}
225
226static void crypt_iv_essiv_dtr(struct crypt_config *cc)
227{
228 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
229
230 crypto_free_cipher(essiv->tfm);
231 essiv->tfm = NULL;
232
233 crypto_free_hash(essiv->hash_tfm);
234 essiv->hash_tfm = NULL;
235
236 kzfree(essiv->salt);
237 essiv->salt = NULL;
238}
239
240static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
241 const char *opts)
242{
243 struct crypto_cipher *essiv_tfm = NULL;
244 struct crypto_hash *hash_tfm = NULL;
245 u8 *salt = NULL;
246 int err;
247
248 if (!opts) {
184 ti->error = "Digest algorithm missing for ESSIV mode"; 249 ti->error = "Digest algorithm missing for ESSIV mode";
185 return -EINVAL; 250 return -EINVAL;
186 } 251 }
187 252
188 /* Hash the cipher key with the given hash algorithm */ 253 /* Allocate hash algorithm */
189 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); 254 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
190 if (IS_ERR(hash_tfm)) { 255 if (IS_ERR(hash_tfm)) {
191 ti->error = "Error initializing ESSIV hash"; 256 ti->error = "Error initializing ESSIV hash";
192 return PTR_ERR(hash_tfm); 257 err = PTR_ERR(hash_tfm);
258 goto bad;
193 } 259 }
194 260
195 saltsize = crypto_hash_digestsize(hash_tfm); 261 salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
196 salt = kmalloc(saltsize, GFP_KERNEL); 262 if (!salt) {
197 if (salt == NULL) {
198 ti->error = "Error kmallocing salt storage in ESSIV"; 263 ti->error = "Error kmallocing salt storage in ESSIV";
199 crypto_free_hash(hash_tfm); 264 err = -ENOMEM;
200 return -ENOMEM; 265 goto bad;
201 } 266 }
202 267
203 sg_init_one(&sg, cc->key, cc->key_size); 268 /* Allocate essiv_tfm */
204 desc.tfm = hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
207 crypto_free_hash(hash_tfm);
208
209 if (err) {
210 ti->error = "Error calculating hash in ESSIV";
211 kfree(salt);
212 return err;
213 }
214
215 /* Setup the essiv_tfm with the given salt */
216 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); 269 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
217 if (IS_ERR(essiv_tfm)) { 270 if (IS_ERR(essiv_tfm)) {
218 ti->error = "Error allocating crypto tfm for ESSIV"; 271 ti->error = "Error allocating crypto tfm for ESSIV";
219 kfree(salt); 272 err = PTR_ERR(essiv_tfm);
220 return PTR_ERR(essiv_tfm); 273 goto bad;
221 } 274 }
222 if (crypto_cipher_blocksize(essiv_tfm) != 275 if (crypto_cipher_blocksize(essiv_tfm) !=
223 crypto_ablkcipher_ivsize(cc->tfm)) { 276 crypto_ablkcipher_ivsize(cc->tfm)) {
224 ti->error = "Block size of ESSIV cipher does " 277 ti->error = "Block size of ESSIV cipher does "
225 "not match IV size of block cipher"; 278 "not match IV size of block cipher";
226 crypto_free_cipher(essiv_tfm); 279 err = -EINVAL;
227 kfree(salt); 280 goto bad;
228 return -EINVAL;
229 } 281 }
230 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
231 if (err) {
232 ti->error = "Failed to set key for ESSIV cipher";
233 crypto_free_cipher(essiv_tfm);
234 kfree(salt);
235 return err;
236 }
237 kfree(salt);
238 282
239 cc->iv_gen_private.essiv_tfm = essiv_tfm; 283 cc->iv_gen_private.essiv.salt = salt;
284 cc->iv_gen_private.essiv.tfm = essiv_tfm;
285 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
286
240 return 0; 287 return 0;
241}
242 288
243static void crypt_iv_essiv_dtr(struct crypt_config *cc) 289bad:
244{ 290 if (essiv_tfm && !IS_ERR(essiv_tfm))
245 crypto_free_cipher(cc->iv_gen_private.essiv_tfm); 291 crypto_free_cipher(essiv_tfm);
246 cc->iv_gen_private.essiv_tfm = NULL; 292 if (hash_tfm && !IS_ERR(hash_tfm))
293 crypto_free_hash(hash_tfm);
294 kfree(salt);
295 return err;
247} 296}
248 297
249static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 298static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
250{ 299{
251 memset(iv, 0, cc->iv_size); 300 memset(iv, 0, cc->iv_size);
252 *(u64 *)iv = cpu_to_le64(sector); 301 *(u64 *)iv = cpu_to_le64(sector);
253 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); 302 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
254 return 0; 303 return 0;
255} 304}
256 305
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
273 return -EINVAL; 322 return -EINVAL;
274 } 323 }
275 324
276 cc->iv_gen_private.benbi_shift = 9 - log; 325 cc->iv_gen_private.benbi.shift = 9 - log;
277 326
278 return 0; 327 return 0;
279} 328}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
288 337
289 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 338 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
290 339
291 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); 340 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
292 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 341 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
293 342
294 return 0; 343 return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
305 .generator = crypt_iv_plain_gen 354 .generator = crypt_iv_plain_gen
306}; 355};
307 356
357static struct crypt_iv_operations crypt_iv_plain64_ops = {
358 .generator = crypt_iv_plain64_gen
359};
360
308static struct crypt_iv_operations crypt_iv_essiv_ops = { 361static struct crypt_iv_operations crypt_iv_essiv_ops = {
309 .ctr = crypt_iv_essiv_ctr, 362 .ctr = crypt_iv_essiv_ctr,
310 .dtr = crypt_iv_essiv_dtr, 363 .dtr = crypt_iv_essiv_dtr,
364 .init = crypt_iv_essiv_init,
365 .wipe = crypt_iv_essiv_wipe,
311 .generator = crypt_iv_essiv_gen 366 .generator = crypt_iv_essiv_gen
312}; 367};
313 368
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
934 989
935 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 990 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
936 991
937 return 0; 992 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
938} 993}
939 994
940static int crypt_wipe_key(struct crypt_config *cc) 995static int crypt_wipe_key(struct crypt_config *cc)
941{ 996{
942 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 997 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
943 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 998 memset(&cc->key, 0, cc->key_size * sizeof(u8));
944 return 0; 999 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
945} 1000}
946 1001
947/* 1002/*
@@ -983,12 +1038,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
983 return -ENOMEM; 1038 return -ENOMEM;
984 } 1039 }
985 1040
986 if (crypt_set_key(cc, argv[1])) { 1041 /* Compatibility mode for old dm-crypt cipher strings */
987 ti->error = "Error decoding key";
988 goto bad_cipher;
989 }
990
991 /* Compatiblity mode for old dm-crypt cipher strings */
992 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { 1042 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
993 chainmode = "cbc"; 1043 chainmode = "cbc";
994 ivmode = "plain"; 1044 ivmode = "plain";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1015 strcpy(cc->chainmode, chainmode); 1065 strcpy(cc->chainmode, chainmode);
1016 cc->tfm = tfm; 1066 cc->tfm = tfm;
1017 1067
1068 if (crypt_set_key(cc, argv[1]) < 0) {
1069 ti->error = "Error decoding and setting key";
1070 goto bad_ivmode;
1071 }
1072
1018 /* 1073 /*
1019 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". 1074 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
1020 * See comments at iv code 1075 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1024 cc->iv_gen_ops = NULL; 1079 cc->iv_gen_ops = NULL;
1025 else if (strcmp(ivmode, "plain") == 0) 1080 else if (strcmp(ivmode, "plain") == 0)
1026 cc->iv_gen_ops = &crypt_iv_plain_ops; 1081 cc->iv_gen_ops = &crypt_iv_plain_ops;
1082 else if (strcmp(ivmode, "plain64") == 0)
1083 cc->iv_gen_ops = &crypt_iv_plain64_ops;
1027 else if (strcmp(ivmode, "essiv") == 0) 1084 else if (strcmp(ivmode, "essiv") == 0)
1028 cc->iv_gen_ops = &crypt_iv_essiv_ops; 1085 cc->iv_gen_ops = &crypt_iv_essiv_ops;
1029 else if (strcmp(ivmode, "benbi") == 0) 1086 else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1039 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 1096 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
1040 goto bad_ivmode; 1097 goto bad_ivmode;
1041 1098
1099 if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
1100 cc->iv_gen_ops->init(cc) < 0) {
1101 ti->error = "Error initialising IV";
1102 goto bad_slab_pool;
1103 }
1104
1042 cc->iv_size = crypto_ablkcipher_ivsize(tfm); 1105 cc->iv_size = crypto_ablkcipher_ivsize(tfm);
1043 if (cc->iv_size) 1106 if (cc->iv_size)
1044 /* at least a 64 bit sector number should fit in our buffer */ 1107 /* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1085 goto bad_bs; 1148 goto bad_bs;
1086 } 1149 }
1087 1150
1088 if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
1089 ti->error = "Error setting key";
1090 goto bad_device;
1091 }
1092
1093 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1151 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1094 ti->error = "Invalid iv_offset sector"; 1152 ti->error = "Invalid iv_offset sector";
1095 goto bad_device; 1153 goto bad_device;
@@ -1102,8 +1160,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1102 } 1160 }
1103 cc->start = tmpll; 1161 cc->start = tmpll;
1104 1162
1105 if (dm_get_device(ti, argv[3], cc->start, ti->len, 1163 if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) {
1106 dm_table_get_mode(ti->table), &cc->dev)) {
1107 ti->error = "Device lookup failed"; 1164 ti->error = "Device lookup failed";
1108 goto bad_device; 1165 goto bad_device;
1109 } 1166 }
@@ -1278,6 +1335,7 @@ static void crypt_resume(struct dm_target *ti)
1278static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) 1335static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1279{ 1336{
1280 struct crypt_config *cc = ti->private; 1337 struct crypt_config *cc = ti->private;
1338 int ret = -EINVAL;
1281 1339
1282 if (argc < 2) 1340 if (argc < 2)
1283 goto error; 1341 goto error;
@@ -1287,10 +1345,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1287 DMWARN("not suspended during key manipulation."); 1345 DMWARN("not suspended during key manipulation.");
1288 return -EINVAL; 1346 return -EINVAL;
1289 } 1347 }
1290 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) 1348 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
1291 return crypt_set_key(cc, argv[2]); 1349 ret = crypt_set_key(cc, argv[2]);
1292 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) 1350 if (ret)
1351 return ret;
1352 if (cc->iv_gen_ops && cc->iv_gen_ops->init)
1353 ret = cc->iv_gen_ops->init(cc);
1354 return ret;
1355 }
1356 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
1357 if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
1358 ret = cc->iv_gen_ops->wipe(cc);
1359 if (ret)
1360 return ret;
1361 }
1293 return crypt_wipe_key(cc); 1362 return crypt_wipe_key(cc);
1363 }
1294 } 1364 }
1295 1365
1296error: 1366error:
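
The reworked "key set" / "key wipe" message handler ties the optional IV hooks into key management: after a new key is decoded and pushed into the cipher, the IV generator's init hook runs (for ESSIV, presumably re-deriving its key hash), and before a key is wiped the generator's wipe hook clears its own key-derived state first. The ordering is the point; the sketch below mirrors that flow with hypothetical ops and callback names, not the dm-crypt structures.

/* Hypothetical shape of the hooks, mirroring the ordering in the patch. */
struct iv_ops {
    int (*init)(void *ctx);   /* derive per-key IV state (e.g. an ESSIV salt) */
    int (*wipe)(void *ctx);   /* erase that per-key state */
};

static int message_set_key(struct iv_ops *ops, void *ctx,
                           int (*set_key)(void *ctx, const char *hexkey),
                           const char *hexkey)
{
    int ret = set_key(ctx, hexkey);     /* decode and load the new key first */

    if (ret)
        return ret;
    if (ops && ops->init)
        ret = ops->init(ctx);           /* then refresh key-derived IV state */
    return ret;
}

static int message_wipe_key(struct iv_ops *ops, void *ctx,
                            int (*wipe_key)(void *ctx))
{
    if (ops && ops->wipe) {
        int ret = ops->wipe(ctx);       /* clear IV state before the key goes */
        if (ret)
            return ret;
    }
    return wipe_key(ctx);
}
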
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index ebe7381f47c8..852052880d7a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -156,8 +156,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
156 goto bad; 156 goto bad;
157 } 157 }
158 158
159 if (dm_get_device(ti, argv[0], dc->start_read, ti->len, 159 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
160 dm_table_get_mode(ti->table), &dc->dev_read)) { 160 &dc->dev_read)) {
161 ti->error = "Device lookup failed"; 161 ti->error = "Device lookup failed";
162 goto bad; 162 goto bad;
163 } 163 }
@@ -177,8 +177,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
177 goto bad_dev_read; 177 goto bad_dev_read;
178 } 178 }
179 179
180 if (dm_get_device(ti, argv[3], dc->start_write, ti->len, 180 if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
181 dm_table_get_mode(ti->table), &dc->dev_write)) { 181 &dc->dev_write)) {
182 ti->error = "Write device lookup failed"; 182 ti->error = "Write device lookup failed";
183 goto bad_dev_read; 183 goto bad_dev_read;
184 } 184 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 556acff3952f..2b7907b6dd09 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -138,16 +138,6 @@ int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
138} 138}
139EXPORT_SYMBOL(dm_exception_store_type_unregister); 139EXPORT_SYMBOL(dm_exception_store_type_unregister);
140 140
141/*
142 * Round a number up to the nearest 'size' boundary. size must
143 * be a power of 2.
144 */
145static ulong round_up(ulong n, ulong size)
146{
147 size--;
148 return (n + size) & ~size;
149}
150
151static int set_chunk_size(struct dm_exception_store *store, 141static int set_chunk_size(struct dm_exception_store *store,
152 const char *chunk_size_arg, char **error) 142 const char *chunk_size_arg, char **error)
153{ 143{
@@ -155,7 +145,8 @@ static int set_chunk_size(struct dm_exception_store *store,
155 char *value; 145 char *value;
156 146
157 chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); 147 chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
158 if (*chunk_size_arg == '\0' || *value != '\0') { 148 if (*chunk_size_arg == '\0' || *value != '\0' ||
149 chunk_size_ulong > UINT_MAX) {
159 *error = "Invalid chunk size"; 150 *error = "Invalid chunk size";
160 return -EINVAL; 151 return -EINVAL;
161 } 152 }
@@ -165,45 +156,42 @@ static int set_chunk_size(struct dm_exception_store *store,
165 return 0; 156 return 0;
166 } 157 }
167 158
168 /* 159 return dm_exception_store_set_chunk_size(store,
169 * Chunk size must be multiple of page size. Silently 160 (unsigned) chunk_size_ulong,
170 * round up if it's not.
171 */
172 chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
173
174 return dm_exception_store_set_chunk_size(store, chunk_size_ulong,
175 error); 161 error);
176} 162}
177 163
178int dm_exception_store_set_chunk_size(struct dm_exception_store *store, 164int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
179 unsigned long chunk_size_ulong, 165 unsigned chunk_size,
180 char **error) 166 char **error)
181{ 167{
182 /* Check chunk_size is a power of 2 */ 168 /* Check chunk_size is a power of 2 */
183 if (!is_power_of_2(chunk_size_ulong)) { 169 if (!is_power_of_2(chunk_size)) {
184 *error = "Chunk size is not a power of 2"; 170 *error = "Chunk size is not a power of 2";
185 return -EINVAL; 171 return -EINVAL;
186 } 172 }
187 173
188 /* Validate the chunk size against the device block size */ 174 /* Validate the chunk size against the device block size */
189 if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { 175 if (chunk_size %
176 (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
190 *error = "Chunk size is not a multiple of device blocksize"; 177 *error = "Chunk size is not a multiple of device blocksize";
191 return -EINVAL; 178 return -EINVAL;
192 } 179 }
193 180
194 if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) { 181 if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
195 *error = "Chunk size is too high"; 182 *error = "Chunk size is too high";
196 return -EINVAL; 183 return -EINVAL;
197 } 184 }
198 185
199 store->chunk_size = chunk_size_ulong; 186 store->chunk_size = chunk_size;
200 store->chunk_mask = chunk_size_ulong - 1; 187 store->chunk_mask = chunk_size - 1;
201 store->chunk_shift = ffs(chunk_size_ulong) - 1; 188 store->chunk_shift = ffs(chunk_size) - 1;
202 189
203 return 0; 190 return 0;
204} 191}
205 192
206int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 193int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
194 struct dm_snapshot *snap,
207 unsigned *args_used, 195 unsigned *args_used,
208 struct dm_exception_store **store) 196 struct dm_exception_store **store)
209{ 197{
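
dm_exception_store_set_chunk_size now takes a plain unsigned and enforces the three constraints visible above: the chunk size (in 512-byte sectors) must be a power of two, a multiple of the COW device's logical block size expressed in sectors, and small enough that chunk_size << 9 still fits in an int; the mask and shift are then derived from it. A standalone sketch of those checks in plain C; the struct, helper names and error strings are illustrative, and a logical block size of at least 512 bytes is assumed.

#include <errno.h>
#include <limits.h>
#include <stdbool.h>

#define SECTOR_SHIFT 9

struct chunk_geometry {
    unsigned chunk_size;   /* in 512-byte sectors, power of two */
    unsigned chunk_mask;   /* chunk_size - 1 */
    unsigned chunk_shift;  /* log2(chunk_size) */
};

static bool is_power_of_2(unsigned n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

static int set_chunk_size(struct chunk_geometry *g, unsigned chunk_size,
                          unsigned logical_block_bytes, const char **error)
{
    if (!is_power_of_2(chunk_size)) {
        *error = "Chunk size is not a power of 2";
        return -EINVAL;
    }
    if (chunk_size % (logical_block_bytes >> SECTOR_SHIFT)) {
        *error = "Chunk size is not a multiple of device blocksize";
        return -EINVAL;
    }
    if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
        *error = "Chunk size is too high";
        return -EINVAL;
    }

    g->chunk_size = chunk_size;
    g->chunk_mask = chunk_size - 1;
    g->chunk_shift = __builtin_ctz(chunk_size);  /* same as ffs(chunk_size) - 1 */
    return 0;
}
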
@@ -212,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
212 struct dm_exception_store *tmp_store; 200 struct dm_exception_store *tmp_store;
213 char persistent; 201 char persistent;
214 202
215 if (argc < 3) { 203 if (argc < 2) {
216 ti->error = "Insufficient exception store arguments"; 204 ti->error = "Insufficient exception store arguments";
217 return -EINVAL; 205 return -EINVAL;
218 } 206 }
@@ -223,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
223 return -ENOMEM; 211 return -ENOMEM;
224 } 212 }
225 213
226 persistent = toupper(*argv[1]); 214 persistent = toupper(*argv[0]);
227 if (persistent == 'P') 215 if (persistent == 'P')
228 type = get_type("P"); 216 type = get_type("P");
229 else if (persistent == 'N') 217 else if (persistent == 'N')
230 type = get_type("N"); 218 type = get_type("N");
231 else { 219 else {
232 ti->error = "Persistent flag is not P or N"; 220 ti->error = "Persistent flag is not P or N";
233 return -EINVAL; 221 r = -EINVAL;
222 goto bad_type;
234 } 223 }
235 224
236 if (!type) { 225 if (!type) {
@@ -240,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
240 } 229 }
241 230
242 tmp_store->type = type; 231 tmp_store->type = type;
243 tmp_store->ti = ti; 232 tmp_store->snap = snap;
244
245 r = dm_get_device(ti, argv[0], 0, 0,
246 FMODE_READ | FMODE_WRITE, &tmp_store->cow);
247 if (r) {
248 ti->error = "Cannot get COW device";
249 goto bad_cow;
250 }
251 233
252 r = set_chunk_size(tmp_store, argv[2], &ti->error); 234 r = set_chunk_size(tmp_store, argv[1], &ti->error);
253 if (r) 235 if (r)
254 goto bad_cow; 236 goto bad;
255 237
256 r = type->ctr(tmp_store, 0, NULL); 238 r = type->ctr(tmp_store, 0, NULL);
257 if (r) { 239 if (r) {
258 ti->error = "Exception store type constructor failed"; 240 ti->error = "Exception store type constructor failed";
259 goto bad_ctr; 241 goto bad;
260 } 242 }
261 243
262 *args_used = 3; 244 *args_used = 2;
263 *store = tmp_store; 245 *store = tmp_store;
264 return 0; 246 return 0;
265 247
266bad_ctr: 248bad:
267 dm_put_device(ti, tmp_store->cow);
268bad_cow:
269 put_type(type); 249 put_type(type);
270bad_type: 250bad_type:
271 kfree(tmp_store); 251 kfree(tmp_store);
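
With the COW device now owned by the snapshot core (reached through dm_snap_cow), the exception store constructor parses only two arguments, the persistence flag and the chunk size, instead of three. The sketch below shows that reduced argument handling in plain C; the function and parameter names are illustrative and simple_strtoul is replaced by its userspace counterpart.

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

/*
 * Parse "<P|N> <chunk_size_in_sectors>", the two arguments the store now
 * consumes (args_used = 2 in the patched constructor).
 */
static int parse_store_args(int argc, char **argv,
                            char *persistent, unsigned long *chunk_sectors)
{
    char *end;

    if (argc < 2)
        return -EINVAL;                   /* "Insufficient exception store arguments" */

    *persistent = toupper((unsigned char)*argv[0]);
    if (*persistent != 'P' && *persistent != 'N')
        return -EINVAL;                   /* "Persistent flag is not P or N" */

    *chunk_sectors = strtoul(argv[1], &end, 10);
    if (*argv[1] == '\0' || *end != '\0' || *chunk_sectors > UINT_MAX)
        return -EINVAL;                   /* "Invalid chunk size" */

    return 0;
}
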
@@ -276,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
276void dm_exception_store_destroy(struct dm_exception_store *store) 256void dm_exception_store_destroy(struct dm_exception_store *store)
277{ 257{
278 store->type->dtr(store); 258 store->type->dtr(store);
279 dm_put_device(store->ti, store->cow);
280 put_type(store->type); 259 put_type(store->type);
281 kfree(store); 260 kfree(store);
282} 261}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 812c71872ba0..e8dfa06af3ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
26 * of chunks that follow contiguously. Remaining bits hold the number of the 26 * of chunks that follow contiguously. Remaining bits hold the number of the
27 * chunk within the device. 27 * chunk within the device.
28 */ 28 */
29struct dm_snap_exception { 29struct dm_exception {
30 struct list_head hash_list; 30 struct list_head hash_list;
31 31
32 chunk_t old_chunk; 32 chunk_t old_chunk;
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
64 * Find somewhere to store the next exception. 64 * Find somewhere to store the next exception.
65 */ 65 */
66 int (*prepare_exception) (struct dm_exception_store *store, 66 int (*prepare_exception) (struct dm_exception_store *store,
67 struct dm_snap_exception *e); 67 struct dm_exception *e);
68 68
69 /* 69 /*
70 * Update the metadata with this exception. 70 * Update the metadata with this exception.
71 */ 71 */
72 void (*commit_exception) (struct dm_exception_store *store, 72 void (*commit_exception) (struct dm_exception_store *store,
73 struct dm_snap_exception *e, 73 struct dm_exception *e,
74 void (*callback) (void *, int success), 74 void (*callback) (void *, int success),
75 void *callback_context); 75 void *callback_context);
76 76
77 /* 77 /*
78 * Returns 0 if the exception store is empty.
79 *
80 * If there are exceptions still to be merged, sets
81 * *last_old_chunk and *last_new_chunk to the most recent
82 * still-to-be-merged chunk and returns the number of
83 * consecutive previous ones.
84 */
85 int (*prepare_merge) (struct dm_exception_store *store,
86 chunk_t *last_old_chunk, chunk_t *last_new_chunk);
87
88 /*
89 * Clear the last n exceptions.
90 * nr_merged must be <= the value returned by prepare_merge.
91 */
92 int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
93
94 /*
78 * The snapshot is invalid, note this in the metadata. 95 * The snapshot is invalid, note this in the metadata.
79 */ 96 */
80 void (*drop_snapshot) (struct dm_exception_store *store); 97 void (*drop_snapshot) (struct dm_exception_store *store);
@@ -86,29 +103,34 @@ struct dm_exception_store_type {
86 /* 103 /*
87 * Return how full the snapshot is. 104 * Return how full the snapshot is.
88 */ 105 */
89 void (*fraction_full) (struct dm_exception_store *store, 106 void (*usage) (struct dm_exception_store *store,
90 sector_t *numerator, 107 sector_t *total_sectors, sector_t *sectors_allocated,
91 sector_t *denominator); 108 sector_t *metadata_sectors);
92 109
93 /* For internal device-mapper use only. */ 110 /* For internal device-mapper use only. */
94 struct list_head list; 111 struct list_head list;
95}; 112};
96 113
114struct dm_snapshot;
115
97struct dm_exception_store { 116struct dm_exception_store {
98 struct dm_exception_store_type *type; 117 struct dm_exception_store_type *type;
99 struct dm_target *ti; 118 struct dm_snapshot *snap;
100
101 struct dm_dev *cow;
102 119
103 /* Size of data blocks saved - must be a power of 2 */ 120 /* Size of data blocks saved - must be a power of 2 */
104 chunk_t chunk_size; 121 unsigned chunk_size;
105 chunk_t chunk_mask; 122 unsigned chunk_mask;
106 chunk_t chunk_shift; 123 unsigned chunk_shift;
107 124
108 void *context; 125 void *context;
109}; 126};
110 127
111/* 128/*
129 * Obtain the cow device used by a given snapshot.
130 */
131struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
132
133/*
112 * Funtions to manipulate consecutive chunks 134 * Funtions to manipulate consecutive chunks
112 * Functions to manipulate consecutive chunks 134
113 */ 135 */
114# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) 136# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
120 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); 142 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
121} 143}
122 144
123static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 145static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
124{ 146{
125 return e->new_chunk >> DM_CHUNK_NUMBER_BITS; 147 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
126} 148}
127 149
128static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 150static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
129{ 151{
130 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); 152 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
131 153
132 BUG_ON(!dm_consecutive_chunk_count(e)); 154 BUG_ON(!dm_consecutive_chunk_count(e));
133} 155}
134 156
157static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
158{
159 BUG_ON(!dm_consecutive_chunk_count(e));
160
161 e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
162}
163
135# else 164# else
136# define DM_CHUNK_CONSECUTIVE_BITS 0 165# define DM_CHUNK_CONSECUTIVE_BITS 0
137 166
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
140 return chunk; 169 return chunk;
141} 170}
142 171
143static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 172static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
144{ 173{
145 return 0; 174 return 0;
146} 175}
147 176
148static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 177static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
178{
179}
180
181static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
149{ 182{
150} 183}
151 184
@@ -162,17 +195,18 @@ static inline sector_t get_dev_size(struct block_device *bdev)
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 195static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
163 sector_t sector) 196 sector_t sector)
164{ 197{
165 return (sector & ~store->chunk_mask) >> store->chunk_shift; 198 return sector >> store->chunk_shift;
166} 199}
167 200
168int dm_exception_store_type_register(struct dm_exception_store_type *type); 201int dm_exception_store_type_register(struct dm_exception_store_type *type);
169int dm_exception_store_type_unregister(struct dm_exception_store_type *type); 202int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
170 203
171int dm_exception_store_set_chunk_size(struct dm_exception_store *store, 204int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 unsigned long chunk_size_ulong, 205 unsigned chunk_size,
173 char **error); 206 char **error);
174 207
175int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 208int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 struct dm_snapshot *snap,
176 unsigned *args_used, 210 unsigned *args_used,
177 struct dm_exception_store **store); 211 struct dm_exception_store **store);
178void dm_exception_store_destroy(struct dm_exception_store *store); 212void dm_exception_store_destroy(struct dm_exception_store *store);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..10f457ca6af2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h"
9
8#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
9 11
10#include <linux/bio.h> 12#include <linux/bio.h>
@@ -14,12 +16,19 @@
14#include <linux/slab.h> 16#include <linux/slab.h>
15#include <linux/dm-io.h> 17#include <linux/dm-io.h>
16 18
19#define DM_MSG_PREFIX "io"
20
21#define DM_IO_MAX_REGIONS BITS_PER_LONG
22
17struct dm_io_client { 23struct dm_io_client {
18 mempool_t *pool; 24 mempool_t *pool;
19 struct bio_set *bios; 25 struct bio_set *bios;
20}; 26};
21 27
22/* FIXME: can we shrink this ? */ 28/*
29 * Aligning 'struct io' reduces the number of bits required to store
30 * its address. Refer to store_io_and_region_in_bio() below.
31 */
23struct io { 32struct io {
24 unsigned long error_bits; 33 unsigned long error_bits;
25 unsigned long eopnotsupp_bits; 34 unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
28 struct dm_io_client *client; 37 struct dm_io_client *client;
29 io_notify_fn callback; 38 io_notify_fn callback;
30 void *context; 39 void *context;
31}; 40} __attribute__((aligned(DM_IO_MAX_REGIONS)));
41
42static struct kmem_cache *_dm_io_cache;
32 43
33/* 44/*
34 * io contexts are only dynamically allocated for asynchronous 45 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
53 if (!client) 64 if (!client)
54 return ERR_PTR(-ENOMEM); 65 return ERR_PTR(-ENOMEM);
55 66
56 client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); 67 client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
57 if (!client->pool) 68 if (!client->pool)
58 goto bad; 69 goto bad;
59 70
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
88 99
89/*----------------------------------------------------------------- 100/*-----------------------------------------------------------------
90 * We need to keep track of which region a bio is doing io for. 101 * We need to keep track of which region a bio is doing io for.
91 * In order to save a memory allocation we store this the last 102 * To avoid a memory allocation to store just 5 or 6 bits, we
92 * bvec which we know is unused (blech). 103 * ensure the 'struct io' pointer is aligned so enough low bits are
93 * XXX This is ugly and can OOPS with some configs... find another way. 104 * always zero and then combine it with the region number directly in
105 * bi_private.
94 *---------------------------------------------------------------*/ 106 *---------------------------------------------------------------*/
95static inline void bio_set_region(struct bio *bio, unsigned region) 107static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
108 unsigned region)
96{ 109{
97 bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; 110 if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
111 DMCRIT("Unaligned struct io pointer %p", io);
112 BUG();
113 }
114
115 bio->bi_private = (void *)((unsigned long)io | region);
98} 116}
99 117
100static inline unsigned bio_get_region(struct bio *bio) 118static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
119 unsigned *region)
101{ 120{
102 return bio->bi_io_vec[bio->bi_max_vecs].bv_len; 121 unsigned long val = (unsigned long)bio->bi_private;
122
123 *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
124 *region = val & (DM_IO_MAX_REGIONS - 1);
103} 125}
104 126
105/*----------------------------------------------------------------- 127/*-----------------------------------------------------------------
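
Instead of hiding the region number in a spare bio_vec, dm-io now aligns struct io to DM_IO_MAX_REGIONS (BITS_PER_LONG) bytes so the low bits of its address are guaranteed to be zero, and ORs the region number directly into bi_private; retrieval masks the two values apart again. The same pointer-tagging trick in standalone C, with an illustrative struct and names:

#include <assert.h>
#include <stdint.h>

#define MAX_REGIONS 64   /* power of two; plays the DM_IO_MAX_REGIONS role */

/* The alignment guarantees log2(MAX_REGIONS) zero low bits in the address. */
struct io_ctx {
    unsigned long error_bits;
    void *client;
} __attribute__((aligned(MAX_REGIONS)));

static void *pack(struct io_ctx *io, unsigned region)
{
    assert(((uintptr_t)io % MAX_REGIONS) == 0);   /* allocator must honour it */
    assert(region < MAX_REGIONS);
    return (void *)((uintptr_t)io | region);
}

static void unpack(void *tagged, struct io_ctx **io, unsigned *region)
{
    uintptr_t val = (uintptr_t)tagged;

    *io = (struct io_ctx *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
    *region = (unsigned)(val & (MAX_REGIONS - 1));
}
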
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
140 /* 162 /*
141 * The bio destructor in bio_put() may use the io object. 163 * The bio destructor in bio_put() may use the io object.
142 */ 164 */
143 io = bio->bi_private; 165 retrieve_io_and_region_from_bio(bio, &io, &region);
144 region = bio_get_region(bio);
145 166
146 bio->bi_max_vecs++;
147 bio_put(bio); 167 bio_put(bio);
148 168
149 dec_count(io, region, error); 169 dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
243 263
244static void dm_bio_destructor(struct bio *bio) 264static void dm_bio_destructor(struct bio *bio)
245{ 265{
246 struct io *io = bio->bi_private; 266 unsigned region;
267 struct io *io;
268
269 retrieve_io_and_region_from_bio(bio, &io, &region);
247 270
248 bio_free(bio, io->client->bios); 271 bio_free(bio, io->client->bios);
249} 272}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned num_bvecs; 309 unsigned num_bvecs;
287 sector_t remaining = where->count; 310 sector_t remaining = where->count;
288 311
289 while (remaining) { 312 /*
313 * where->count may be zero if rw holds a write barrier and we
314 * need to send a zero-sized barrier.
315 */
316 do {
290 /* 317 /*
291 * Allocate a suitably sized-bio: we add an extra 318 * Allocate a suitably sized-bio.
292 * bvec for bio_get/set_region() and decrement bi_max_vecs
293 * to hide it from bio_add_page().
294 */ 319 */
295 num_bvecs = dm_sector_div_up(remaining, 320 num_bvecs = dm_sector_div_up(remaining,
296 (PAGE_SIZE >> SECTOR_SHIFT)); 321 (PAGE_SIZE >> SECTOR_SHIFT));
297 num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), 322 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
298 num_bvecs);
299 if (unlikely(num_bvecs > BIO_MAX_PAGES))
300 num_bvecs = BIO_MAX_PAGES;
301 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 323 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
302 bio->bi_sector = where->sector + (where->count - remaining); 324 bio->bi_sector = where->sector + (where->count - remaining);
303 bio->bi_bdev = where->bdev; 325 bio->bi_bdev = where->bdev;
304 bio->bi_end_io = endio; 326 bio->bi_end_io = endio;
305 bio->bi_private = io;
306 bio->bi_destructor = dm_bio_destructor; 327 bio->bi_destructor = dm_bio_destructor;
307 bio->bi_max_vecs--; 328 store_io_and_region_in_bio(bio, io, region);
308 bio_set_region(bio, region);
309 329
310 /* 330 /*
311 * Try and add as many pages as possible. 331 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
323 343
324 atomic_inc(&io->count); 344 atomic_inc(&io->count);
325 submit_bio(rw, bio); 345 submit_bio(rw, bio);
326 } 346 } while (remaining);
327} 347}
328 348
329static void dispatch_io(int rw, unsigned int num_regions, 349static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
333 int i; 353 int i;
334 struct dpages old_pages = *dp; 354 struct dpages old_pages = *dp;
335 355
356 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
357
336 if (sync) 358 if (sync)
337 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 359 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
338 360
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
342 */ 364 */
343 for (i = 0; i < num_regions; i++) { 365 for (i = 0; i < num_regions; i++) {
344 *dp = old_pages; 366 *dp = old_pages;
345 if (where[i].count) 367 if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
346 do_region(rw, i, where + i, dp, io); 368 do_region(rw, i, where + i, dp, io);
347 } 369 }
348 370
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
357 struct dm_io_region *where, int rw, struct dpages *dp, 379 struct dm_io_region *where, int rw, struct dpages *dp,
358 unsigned long *error_bits) 380 unsigned long *error_bits)
359{ 381{
360 struct io io; 382 /*
383 * gcc <= 4.3 can't do the alignment for stack variables, so we must
384 * align it on our own.
385 * volatile prevents the optimizer from removing or reusing
386 * "io_" field from the stack frame (allowed in ANSI C).
387 */
388 volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
389 struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
361 390
362 if (num_regions > 1 && (rw & RW_MASK) != WRITE) { 391 if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
363 WARN_ON(1); 392 WARN_ON(1);
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
365 } 394 }
366 395
367retry: 396retry:
368 io.error_bits = 0; 397 io->error_bits = 0;
369 io.eopnotsupp_bits = 0; 398 io->eopnotsupp_bits = 0;
370 atomic_set(&io.count, 1); /* see dispatch_io() */ 399 atomic_set(&io->count, 1); /* see dispatch_io() */
371 io.sleeper = current; 400 io->sleeper = current;
372 io.client = client; 401 io->client = client;
373 402
374 dispatch_io(rw, num_regions, where, dp, &io, 1); 403 dispatch_io(rw, num_regions, where, dp, io, 1);
375 404
376 while (1) { 405 while (1) {
377 set_current_state(TASK_UNINTERRUPTIBLE); 406 set_current_state(TASK_UNINTERRUPTIBLE);
378 407
379 if (!atomic_read(&io.count)) 408 if (!atomic_read(&io->count))
380 break; 409 break;
381 410
382 io_schedule(); 411 io_schedule();
383 } 412 }
384 set_current_state(TASK_RUNNING); 413 set_current_state(TASK_RUNNING);
385 414
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { 415 if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER); 416 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry; 417 goto retry;
389 } 418 }
390 419
391 if (error_bits) 420 if (error_bits)
392 *error_bits = io.error_bits; 421 *error_bits = io->error_bits;
393 422
394 return io.error_bits ? -EIO : 0; 423 return io->error_bits ? -EIO : 0;
395} 424}
396 425
397static int async_io(struct dm_io_client *client, unsigned int num_regions, 426static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
472 &dp, io_req->notify.fn, io_req->notify.context); 501 &dp, io_req->notify.fn, io_req->notify.context);
473} 502}
474EXPORT_SYMBOL(dm_io); 503EXPORT_SYMBOL(dm_io);
504
505int __init dm_io_init(void)
506{
507 _dm_io_cache = KMEM_CACHE(io, 0);
508 if (!_dm_io_cache)
509 return -ENOMEM;
510
511 return 0;
512}
513
514void dm_io_exit(void)
515{
516 kmem_cache_destroy(_dm_io_cache);
517 _dm_io_cache = NULL;
518}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..d7500e1c26f2 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
56 */ 56 */
57static DECLARE_RWSEM(_hash_lock); 57static DECLARE_RWSEM(_hash_lock);
58 58
59/*
60 * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
61 */
62static DEFINE_MUTEX(dm_hash_cells_mutex);
63
59static void init_buckets(struct list_head *buckets) 64static void init_buckets(struct list_head *buckets)
60{ 65{
61 unsigned int i; 66 unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
206 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 211 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
207 } 212 }
208 dm_get(md); 213 dm_get(md);
214 mutex_lock(&dm_hash_cells_mutex);
209 dm_set_mdptr(md, cell); 215 dm_set_mdptr(md, cell);
216 mutex_unlock(&dm_hash_cells_mutex);
210 up_write(&_hash_lock); 217 up_write(&_hash_lock);
211 218
212 return 0; 219 return 0;
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
224 /* remove from the dev hash */ 231 /* remove from the dev hash */
225 list_del(&hc->uuid_list); 232 list_del(&hc->uuid_list);
226 list_del(&hc->name_list); 233 list_del(&hc->name_list);
234 mutex_lock(&dm_hash_cells_mutex);
227 dm_set_mdptr(hc->md, NULL); 235 dm_set_mdptr(hc->md, NULL);
236 mutex_unlock(&dm_hash_cells_mutex);
228 237
229 table = dm_get_table(hc->md); 238 table = dm_get_live_table(hc->md);
230 if (table) { 239 if (table) {
231 dm_table_event(table); 240 dm_table_event(table);
232 dm_table_put(table); 241 dm_table_put(table);
@@ -276,7 +285,8 @@ retry:
276 up_write(&_hash_lock); 285 up_write(&_hash_lock);
277} 286}
278 287
279static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) 288static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old,
289 const char *new)
280{ 290{
281 char *new_name, *old_name; 291 char *new_name, *old_name;
282 struct hash_cell *hc; 292 struct hash_cell *hc;
@@ -321,19 +331,22 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
321 */ 331 */
322 list_del(&hc->name_list); 332 list_del(&hc->name_list);
323 old_name = hc->name; 333 old_name = hc->name;
334 mutex_lock(&dm_hash_cells_mutex);
324 hc->name = new_name; 335 hc->name = new_name;
336 mutex_unlock(&dm_hash_cells_mutex);
325 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 337 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
326 338
327 /* 339 /*
328 * Wake up any dm event waiters. 340 * Wake up any dm event waiters.
329 */ 341 */
330 table = dm_get_table(hc->md); 342 table = dm_get_live_table(hc->md);
331 if (table) { 343 if (table) {
332 dm_table_event(table); 344 dm_table_event(table);
333 dm_table_put(table); 345 dm_table_put(table);
334 } 346 }
335 347
336 dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); 348 if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie))
349 *flags |= DM_UEVENT_GENERATED_FLAG;
337 350
338 dm_put(hc->md); 351 dm_put(hc->md);
339 up_write(&_hash_lock); 352 up_write(&_hash_lock);
@@ -512,8 +525,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
512 return 0; 525 return 0;
513} 526}
514 527
515
516
517static int check_name(const char *name) 528static int check_name(const char *name)
518{ 529{
519 if (strchr(name, '/')) { 530 if (strchr(name, '/')) {
@@ -525,6 +536,40 @@ static int check_name(const char *name)
525} 536}
526 537
527/* 538/*
539 * On successful return, the caller must not attempt to acquire
540 * _hash_lock without first calling dm_table_put, because dm_table_destroy
541 * waits for this dm_table_put and could be called under this lock.
542 */
543static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
544{
545 struct hash_cell *hc;
546 struct dm_table *table = NULL;
547
548 down_read(&_hash_lock);
549 hc = dm_get_mdptr(md);
550 if (!hc || hc->md != md) {
551 DMWARN("device has been removed from the dev hash table.");
552 goto out;
553 }
554
555 table = hc->new_map;
556 if (table)
557 dm_table_get(table);
558
559out:
560 up_read(&_hash_lock);
561
562 return table;
563}
564
565static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
566 struct dm_ioctl *param)
567{
568 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
569 dm_get_inactive_table(md) : dm_get_live_table(md);
570}
571
572/*
528 * Fills in a dm_ioctl structure, ready for sending back to 573 * Fills in a dm_ioctl structure, ready for sending back to
529 * userland. 574 * userland.
530 */ 575 */
@@ -536,7 +581,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
536 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 581 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
537 DM_ACTIVE_PRESENT_FLAG); 582 DM_ACTIVE_PRESENT_FLAG);
538 583
539 if (dm_suspended(md)) 584 if (dm_suspended_md(md))
540 param->flags |= DM_SUSPEND_FLAG; 585 param->flags |= DM_SUSPEND_FLAG;
541 586
542 param->dev = huge_encode_dev(disk_devt(disk)); 587 param->dev = huge_encode_dev(disk_devt(disk));
@@ -548,18 +593,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
548 */ 593 */
549 param->open_count = dm_open_count(md); 594 param->open_count = dm_open_count(md);
550 595
551 if (get_disk_ro(disk))
552 param->flags |= DM_READONLY_FLAG;
553
554 param->event_nr = dm_get_event_nr(md); 596 param->event_nr = dm_get_event_nr(md);
597 param->target_count = 0;
555 598
556 table = dm_get_table(md); 599 table = dm_get_live_table(md);
557 if (table) { 600 if (table) {
558 param->flags |= DM_ACTIVE_PRESENT_FLAG; 601 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
559 param->target_count = dm_table_get_num_targets(table); 602 if (get_disk_ro(disk))
603 param->flags |= DM_READONLY_FLAG;
604 param->target_count = dm_table_get_num_targets(table);
605 }
560 dm_table_put(table); 606 dm_table_put(table);
561 } else 607
562 param->target_count = 0; 608 param->flags |= DM_ACTIVE_PRESENT_FLAG;
609 }
610
611 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
612 table = dm_get_inactive_table(md);
613 if (table) {
614 if (!(dm_table_get_mode(table) & FMODE_WRITE))
615 param->flags |= DM_READONLY_FLAG;
616 param->target_count = dm_table_get_num_targets(table);
617 dm_table_put(table);
618 }
619 }
563 620
564 return 0; 621 return 0;
565} 622}
@@ -634,9 +691,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
634 * Sneakily write in both the name and the uuid 691 * Sneakily write in both the name and the uuid
635 * while we have the cell. 692 * while we have the cell.
636 */ 693 */
637 strncpy(param->name, hc->name, sizeof(param->name)); 694 strlcpy(param->name, hc->name, sizeof(param->name));
638 if (hc->uuid) 695 if (hc->uuid)
639 strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); 696 strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
640 else 697 else
641 param->uuid[0] = '\0'; 698 param->uuid[0] = '\0';
642 699
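
The switch from strncpy to strlcpy above closes two holes: strncpy does not NUL-terminate when the source fills the buffer, and it zero-pads the remainder, while strlcpy always terminates (truncating if needed) and returns the length of the source string. strlcpy is a BSD/kernel routine rather than ISO C, so the sketch below carries a minimal work-alike purely for illustration:

#include <stdio.h>
#include <string.h>

/* Minimal strlcpy work-alike: always NUL-terminates, returns strlen(src). */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
    size_t len = strlen(src);

    if (size) {
        size_t n = (len >= size) ? size - 1 : len;
        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return len;
}

int main(void)
{
    char a[8], b[8];

    strncpy(a, "0123456789", sizeof(a));     /* a is NOT NUL-terminated */
    my_strlcpy(b, "0123456789", sizeof(b));  /* b == "0123456" */

    printf("%.8s / %s\n", a, b);
    return 0;
}
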
@@ -681,10 +738,10 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
681 __hash_remove(hc); 738 __hash_remove(hc);
682 up_write(&_hash_lock); 739 up_write(&_hash_lock);
683 740
684 dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); 741 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
742 param->flags |= DM_UEVENT_GENERATED_FLAG;
685 743
686 dm_put(md); 744 dm_put(md);
687 param->data_size = 0;
688 return 0; 745 return 0;
689} 746}
690 747
@@ -718,7 +775,9 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
718 return r; 775 return r;
719 776
720 param->data_size = 0; 777 param->data_size = 0;
721 return dm_hash_rename(param->event_nr, param->name, new_name); 778
779 return dm_hash_rename(param->event_nr, &param->flags, param->name,
780 new_name);
722} 781}
723 782
724static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) 783static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -784,7 +843,7 @@ static int do_suspend(struct dm_ioctl *param)
784 if (param->flags & DM_NOFLUSH_FLAG) 843 if (param->flags & DM_NOFLUSH_FLAG)
785 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 844 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
786 845
787 if (!dm_suspended(md)) 846 if (!dm_suspended_md(md))
788 r = dm_suspend(md, suspend_flags); 847 r = dm_suspend(md, suspend_flags);
789 848
790 if (!r) 849 if (!r)
@@ -800,7 +859,7 @@ static int do_resume(struct dm_ioctl *param)
800 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; 859 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
801 struct hash_cell *hc; 860 struct hash_cell *hc;
802 struct mapped_device *md; 861 struct mapped_device *md;
803 struct dm_table *new_map; 862 struct dm_table *new_map, *old_map = NULL;
804 863
805 down_write(&_hash_lock); 864 down_write(&_hash_lock);
806 865
@@ -826,14 +885,14 @@ static int do_resume(struct dm_ioctl *param)
826 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; 885 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
827 if (param->flags & DM_NOFLUSH_FLAG) 886 if (param->flags & DM_NOFLUSH_FLAG)
828 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 887 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
829 if (!dm_suspended(md)) 888 if (!dm_suspended_md(md))
830 dm_suspend(md, suspend_flags); 889 dm_suspend(md, suspend_flags);
831 890
832 r = dm_swap_table(md, new_map); 891 old_map = dm_swap_table(md, new_map);
833 if (r) { 892 if (IS_ERR(old_map)) {
834 dm_table_destroy(new_map); 893 dm_table_destroy(new_map);
835 dm_put(md); 894 dm_put(md);
836 return r; 895 return PTR_ERR(old_map);
837 } 896 }
838 897
839 if (dm_table_get_mode(new_map) & FMODE_WRITE) 898 if (dm_table_get_mode(new_map) & FMODE_WRITE)
@@ -842,14 +901,17 @@ static int do_resume(struct dm_ioctl *param)
842 set_disk_ro(dm_disk(md), 1); 901 set_disk_ro(dm_disk(md), 1);
843 } 902 }
844 903
845 if (dm_suspended(md)) 904 if (dm_suspended_md(md)) {
846 r = dm_resume(md); 905 r = dm_resume(md);
906 if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr))
907 param->flags |= DM_UEVENT_GENERATED_FLAG;
908 }
847 909
910 if (old_map)
911 dm_table_destroy(old_map);
848 912
849 if (!r) { 913 if (!r)
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
851 r = __dev_status(md, param); 914 r = __dev_status(md, param);
852 }
853 915
854 dm_put(md); 916 dm_put(md);
855 return r; 917 return r;
@@ -982,7 +1044,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
982 if (r) 1044 if (r)
983 goto out; 1045 goto out;
984 1046
985 table = dm_get_table(md); 1047 table = dm_get_live_or_inactive_table(md, param);
986 if (table) { 1048 if (table) {
987 retrieve_status(table, param, param_size); 1049 retrieve_status(table, param, param_size);
988 dm_table_put(table); 1050 dm_table_put(table);
@@ -1215,7 +1277,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1215 if (r) 1277 if (r)
1216 goto out; 1278 goto out;
1217 1279
1218 table = dm_get_table(md); 1280 table = dm_get_live_or_inactive_table(md, param);
1219 if (table) { 1281 if (table) {
1220 retrieve_deps(table, param, param_size); 1282 retrieve_deps(table, param, param_size);
1221 dm_table_put(table); 1283 dm_table_put(table);
@@ -1244,13 +1306,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1244 if (r) 1306 if (r)
1245 goto out; 1307 goto out;
1246 1308
1247 table = dm_get_table(md); 1309 table = dm_get_live_or_inactive_table(md, param);
1248 if (table) { 1310 if (table) {
1249 retrieve_status(table, param, param_size); 1311 retrieve_status(table, param, param_size);
1250 dm_table_put(table); 1312 dm_table_put(table);
1251 } 1313 }
1252 1314
1253 out: 1315out:
1254 dm_put(md); 1316 dm_put(md);
1255 return r; 1317 return r;
1256} 1318}
@@ -1288,10 +1350,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1288 goto out; 1350 goto out;
1289 } 1351 }
1290 1352
1291 table = dm_get_table(md); 1353 table = dm_get_live_table(md);
1292 if (!table) 1354 if (!table)
1293 goto out_argv; 1355 goto out_argv;
1294 1356
1357 if (dm_deleting_md(md)) {
1358 r = -ENXIO;
1359 goto out_table;
1360 }
1361
1295 ti = dm_table_find_target(table, tmsg->sector); 1362 ti = dm_table_find_target(table, tmsg->sector);
1296 if (!dm_target_is_valid(ti)) { 1363 if (!dm_target_is_valid(ti)) {
1297 DMWARN("Target message sector outside device."); 1364 DMWARN("Target message sector outside device.");
@@ -1303,6 +1370,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1303 r = -EINVAL; 1370 r = -EINVAL;
1304 } 1371 }
1305 1372
1373 out_table:
1306 dm_table_put(table); 1374 dm_table_put(table);
1307 out_argv: 1375 out_argv:
1308 kfree(argv); 1376 kfree(argv);
@@ -1413,6 +1481,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1413{ 1481{
1414 /* Always clear this flag */ 1482 /* Always clear this flag */
1415 param->flags &= ~DM_BUFFER_FULL_FLAG; 1483 param->flags &= ~DM_BUFFER_FULL_FLAG;
1484 param->flags &= ~DM_UEVENT_GENERATED_FLAG;
1416 1485
1417 /* Ignores parameters */ 1486 /* Ignores parameters */
1418 if (cmd == DM_REMOVE_ALL_CMD || 1487 if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1582,8 +1651,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1582 if (!md) 1651 if (!md)
1583 return -ENXIO; 1652 return -ENXIO;
1584 1653
1585 dm_get(md); 1654 mutex_lock(&dm_hash_cells_mutex);
1586 down_read(&_hash_lock);
1587 hc = dm_get_mdptr(md); 1655 hc = dm_get_mdptr(md);
1588 if (!hc || hc->md != md) { 1656 if (!hc || hc->md != md) {
1589 r = -ENXIO; 1657 r = -ENXIO;
@@ -1596,8 +1664,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1596 strcpy(uuid, hc->uuid ? : ""); 1664 strcpy(uuid, hc->uuid ? : "");
1597 1665
1598out: 1666out:
1599 up_read(&_hash_lock); 1667 mutex_unlock(&dm_hash_cells_mutex);
1600 dm_put(md);
1601 1668
1602 return r; 1669 return r;
1603} 1670}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3e3fc06cb861..addf83475040 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
450{ 450{
451 struct dm_kcopyd_client *kc = job->kc; 451 struct dm_kcopyd_client *kc = job->kc;
452 atomic_inc(&kc->nr_jobs); 452 atomic_inc(&kc->nr_jobs);
453 push(&kc->pages_jobs, job); 453 if (unlikely(!job->source.count))
454 push(&kc->complete_jobs, job);
455 else
456 push(&kc->pages_jobs, job);
454 wake(kc); 457 wake(kc);
455} 458}
456 459
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 82f7d6e6b1ea..9200dbf2391a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -47,8 +47,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
47 } 47 }
48 lc->start = tmp; 48 lc->start = tmp;
49 49
50 if (dm_get_device(ti, argv[0], lc->start, ti->len, 50 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) {
51 dm_table_get_mode(ti->table), &lc->dev)) {
52 ti->error = "dm-linear: Device lookup failed"; 51 ti->error = "dm-linear: Device lookup failed";
53 goto bad; 52 goto bad;
54 } 53 }
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 652bd33109e3..1ed0094f064b 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -5,6 +5,7 @@
5 */ 5 */
6 6
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/slab.h>
8#include <linux/dm-dirty-log.h> 9#include <linux/dm-dirty-log.h>
9#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
10#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
@@ -156,7 +157,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
156 } 157 }
157 158
158 /* The ptr value is sufficient for local unique id */ 159 /* The ptr value is sufficient for local unique id */
159 lc->luid = (uint64_t)lc; 160 lc->luid = (unsigned long)lc;
160 161
161 lc->ti = ti; 162 lc->ti = ti;
162 163
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index ba0edad2d048..075cbcf8a9f5 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -6,6 +6,7 @@
6 6
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
9#include <net/sock.h> 10#include <net/sock.h>
10#include <linux/workqueue.h> 11#include <linux/workqueue.h>
11#include <linux/connector.h> 12#include <linux/connector.h>
@@ -129,11 +130,13 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
129 * This is the connector callback that delivers data 130 * This is the connector callback that delivers data
130 * that was sent from userspace. 131 * that was sent from userspace.
131 */ 132 */
132static void cn_ulog_callback(void *data) 133static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
133{ 134{
134 struct cn_msg *msg = (struct cn_msg *)data;
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); 135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136 136
137 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
138 return;
139
137 spin_lock(&receiving_list_lock); 140 spin_lock(&receiving_list_lock);
138 if (msg->len == 0) 141 if (msg->len == 0)
139 fill_pkg(msg, NULL); 142 fill_pkg(msg, NULL);
@@ -170,11 +173,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
170{ 173{
171 int r = 0; 174 int r = 0;
172 size_t dummy = 0; 175 size_t dummy = 0;
173 int overhead_size = 176 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
174 sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
175 struct dm_ulog_request *tfr = prealloced_ulog_tfr; 177 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
176 struct receiving_pkg pkg; 178 struct receiving_pkg pkg;
177 179
180 /*
181 * Given the space needed to hold the 'struct cn_msg' and
182 * 'struct dm_ulog_request' - do we have enough payload
183 * space remaining?
184 */
178 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { 185 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
179 DMINFO("Size of tfr exceeds preallocated size"); 186 DMINFO("Size of tfr exceeds preallocated size");
180 return -EINVAL; 187 return -EINVAL;
@@ -189,7 +196,7 @@ resend:
189 */ 196 */
190 mutex_lock(&dm_ulog_lock); 197 mutex_lock(&dm_ulog_lock);
191 198
192 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
193 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
194 tfr->luid = luid; 201 tfr->luid = luid;
195 tfr->seq = dm_ulog_seq++; 202 tfr->seq = dm_ulog_seq++;
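
The one-character bug fixed in the two hunks above is a classic: sizeof(struct dm_ulog_request *) is the size of a pointer, not of the structure, so the computed overhead undercounted the request header and the available-payload check was too permissive; the fix also widens the memset to cover the whole preallocated request area after the cn_msg header. A two-line illustration of the pitfall, using a hypothetical structure rather than the real dm_ulog_request layout:

#include <stdio.h>

struct request_header {
    char uuid[129];
    unsigned long long seq;
    unsigned int request_type;
};

int main(void)
{
    /* sizeof a pointer vs. sizeof the pointed-to structure */
    printf("sizeof(struct request_header *) = %zu\n", sizeof(struct request_header *));
    printf("sizeof(struct request_header)   = %zu\n", sizeof(struct request_header));
    return 0;
}
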
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede07..5a08be0222db 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
145EXPORT_SYMBOL(dm_dirty_log_type_unregister); 145EXPORT_SYMBOL(dm_dirty_log_type_unregister);
146 146
147struct dm_dirty_log *dm_dirty_log_create(const char *type_name, 147struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
148 struct dm_target *ti, 148 struct dm_target *ti,
149 unsigned int argc, char **argv) 149 int (*flush_callback_fn)(struct dm_target *ti),
150 unsigned int argc, char **argv)
150{ 151{
151 struct dm_dirty_log_type *type; 152 struct dm_dirty_log_type *type;
152 struct dm_dirty_log *log; 153 struct dm_dirty_log *log;
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
161 return NULL; 162 return NULL;
162 } 163 }
163 164
165 log->flush_callback_fn = flush_callback_fn;
164 log->type = type; 166 log->type = type;
165 if (type->ctr(log, ti, argc, argv)) { 167 if (type->ctr(log, ti, argc, argv)) {
166 kfree(log); 168 kfree(log);
@@ -208,7 +210,9 @@ struct log_header {
208 210
209struct log_c { 211struct log_c {
210 struct dm_target *ti; 212 struct dm_target *ti;
211 int touched; 213 int touched_dirtied;
214 int touched_cleaned;
215 int flush_failed;
212 uint32_t region_size; 216 uint32_t region_size;
213 unsigned int region_count; 217 unsigned int region_count;
214 region_t sync_count; 218 region_t sync_count;
@@ -233,6 +237,7 @@ struct log_c {
233 * Disk log fields 237 * Disk log fields
234 */ 238 */
235 int log_dev_failed; 239 int log_dev_failed;
240 int log_dev_flush_failed;
236 struct dm_dev *log_dev; 241 struct dm_dev *log_dev;
237 struct log_header header; 242 struct log_header header;
238 243
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
253 uint32_t *bs, unsigned bit) 258 uint32_t *bs, unsigned bit)
254{ 259{
255 ext2_set_bit(bit, (unsigned long *) bs); 260 ext2_set_bit(bit, (unsigned long *) bs);
256 l->touched = 1; 261 l->touched_cleaned = 1;
257} 262}
258 263
259static inline void log_clear_bit(struct log_c *l, 264static inline void log_clear_bit(struct log_c *l,
260 uint32_t *bs, unsigned bit) 265 uint32_t *bs, unsigned bit)
261{ 266{
262 ext2_clear_bit(bit, (unsigned long *) bs); 267 ext2_clear_bit(bit, (unsigned long *) bs);
263 l->touched = 1; 268 l->touched_dirtied = 1;
264} 269}
265 270
266/*---------------------------------------------------------------- 271/*----------------------------------------------------------------
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
287 return dm_io(&lc->io_req, 1, &lc->header_location, NULL); 292 return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
288} 293}
289 294
295static int flush_header(struct log_c *lc)
296{
297 struct dm_io_region null_location = {
298 .bdev = lc->header_location.bdev,
299 .sector = 0,
300 .count = 0,
301 };
302
303 lc->io_req.bi_rw = WRITE_BARRIER;
304
305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306}
307
290static int read_header(struct log_c *log) 308static int read_header(struct log_c *log)
291{ 309{
292 int r; 310 int r;
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
378 } 396 }
379 397
380 lc->ti = ti; 398 lc->ti = ti;
381 lc->touched = 0; 399 lc->touched_dirtied = 0;
400 lc->touched_cleaned = 0;
401 lc->flush_failed = 0;
382 lc->region_size = region_size; 402 lc->region_size = region_size;
383 lc->region_count = region_count; 403 lc->region_count = region_count;
384 lc->sync = sync; 404 lc->sync = sync;
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
406 } else { 426 } else {
407 lc->log_dev = dev; 427 lc->log_dev = dev;
408 lc->log_dev_failed = 0; 428 lc->log_dev_failed = 0;
429 lc->log_dev_flush_failed = 0;
409 lc->header_location.bdev = lc->log_dev->bdev; 430 lc->header_location.bdev = lc->log_dev->bdev;
410 lc->header_location.sector = 0; 431 lc->header_location.sector = 0;
411 432
@@ -522,8 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
522 return -EINVAL; 543 return -EINVAL;
523 } 544 }
524 545
525 r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, 546 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev);
526 FMODE_READ | FMODE_WRITE, &dev);
527 if (r) 547 if (r)
528 return r; 548 return r;
529 549
@@ -614,6 +634,11 @@ static int disk_resume(struct dm_dirty_log *log)
614 634
615 /* write the new header */ 635 /* write the new header */
616 r = rw_header(lc, WRITE); 636 r = rw_header(lc, WRITE);
637 if (!r) {
638 r = flush_header(lc);
639 if (r)
640 lc->log_dev_flush_failed = 1;
641 }
617 if (r) { 642 if (r) {
618 DMWARN("%s: Failed to write header on dirty region log device", 643 DMWARN("%s: Failed to write header on dirty region log device",
619 lc->log_dev->name); 644 lc->log_dev->name);
@@ -656,18 +681,40 @@ static int core_flush(struct dm_dirty_log *log)
656 681
657static int disk_flush(struct dm_dirty_log *log) 682static int disk_flush(struct dm_dirty_log *log)
658{ 683{
659 int r; 684 int r, i;
660 struct log_c *lc = (struct log_c *) log->context; 685 struct log_c *lc = log->context;
661 686
662 /* only write if the log has changed */ 687 /* only write if the log has changed */
663 if (!lc->touched) 688 if (!lc->touched_cleaned && !lc->touched_dirtied)
664 return 0; 689 return 0;
665 690
691 if (lc->touched_cleaned && log->flush_callback_fn &&
692 log->flush_callback_fn(lc->ti)) {
693 /*
694 * At this point it is impossible to determine which
695 * regions are clean and which are dirty (without
696 * re-reading the log off disk). So mark all of them
697 * dirty.
698 */
699 lc->flush_failed = 1;
700 for (i = 0; i < lc->region_count; i++)
701 log_clear_bit(lc, lc->clean_bits, i);
702 }
703
666 r = rw_header(lc, WRITE); 704 r = rw_header(lc, WRITE);
667 if (r) 705 if (r)
668 fail_log_device(lc); 706 fail_log_device(lc);
669 else 707 else {
670 lc->touched = 0; 708 if (lc->touched_dirtied) {
709 r = flush_header(lc);
710 if (r) {
711 lc->log_dev_flush_failed = 1;
712 fail_log_device(lc);
713 } else
714 lc->touched_dirtied = 0;
715 }
716 lc->touched_cleaned = 0;
717 }
671 718
672 return r; 719 return r;
673} 720}
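
disk_flush now distinguishes regions newly marked clean (touched_cleaned) from regions newly marked dirty (touched_dirtied). Before persisting clean bits it runs the flush callback supplied via the new flush_callback_fn argument to dm_dirty_log_create; if that fails, every region is conservatively marked dirty again. After the header write, a separate barrier write (flush_header) is issued only when dirty bits were added, and its failure is remembered in log_dev_flush_failed. A condensed model of that control flow; the struct, helpers and their trivial bodies are illustrative stand-ins, not the dm-log code.

#include <stdbool.h>

struct dirty_log {
    bool touched_cleaned;              /* regions newly marked clean */
    bool touched_dirtied;              /* regions newly marked dirty */
    bool flush_failed;
    bool dev_flush_failed;
    int (*flush_callback)(void *ti);   /* flush writes covered by clean bits */
    void *ti;
};

static int  write_header(struct dirty_log *l)           { (void)l; return 0; }
static int  flush_header(struct dirty_log *l)           { (void)l; return 0; }
static void mark_all_regions_dirty(struct dirty_log *l) { (void)l; }
static void fail_log_device(struct dirty_log *l)        { (void)l; }

static int disk_flush(struct dirty_log *l)
{
    int r;

    if (!l->touched_cleaned && !l->touched_dirtied)
        return 0;                               /* nothing to persist */

    /* Clean bits may only reach disk once the data they cover has. */
    if (l->touched_cleaned && l->flush_callback && l->flush_callback(l->ti)) {
        l->flush_failed = true;
        mark_all_regions_dirty(l);              /* conservative fallback */
    }

    r = write_header(l);
    if (r) {
        fail_log_device(l);
        return r;
    }

    if (l->touched_dirtied) {
        r = flush_header(l);                    /* zero-length barrier write */
        if (r) {
            l->dev_flush_failed = true;
            fail_log_device(l);
        } else {
            l->touched_dirtied = false;
        }
    }
    l->touched_cleaned = false;

    return r;
}
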
@@ -681,7 +728,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
681static void core_clear_region(struct dm_dirty_log *log, region_t region) 728static void core_clear_region(struct dm_dirty_log *log, region_t region)
682{ 729{
683 struct log_c *lc = (struct log_c *) log->context; 730 struct log_c *lc = (struct log_c *) log->context;
684 log_set_bit(lc, lc->clean_bits, region); 731 if (likely(!lc->flush_failed))
732 log_set_bit(lc, lc->clean_bits, region);
685} 733}
686 734
687static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) 735static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
@@ -762,7 +810,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
762 switch(status) { 810 switch(status) {
763 case STATUSTYPE_INFO: 811 case STATUSTYPE_INFO:
764 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, 812 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
765 lc->log_dev_failed ? 'D' : 'A'); 813 lc->log_dev_flush_failed ? 'F' :
814 lc->log_dev_failed ? 'D' :
815 'A');
766 break; 816 break;
767 817
768 case STATUSTYPE_TABLE: 818 case STATUSTYPE_TABLE:
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 32d0b878eccc..826bce7343b3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -69,6 +69,7 @@ struct multipath {
69 struct list_head priority_groups; 69 struct list_head priority_groups;
70 unsigned pg_init_required; /* pg_init needs calling? */ 70 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
72 73
73 unsigned nr_valid_paths; /* Total number of usable paths */ 74 unsigned nr_valid_paths; /* Total number of usable paths */
74 struct pgpath *current_pgpath; 75 struct pgpath *current_pgpath;
@@ -93,6 +94,8 @@ struct multipath {
93 * can resubmit bios on error. 94 * can resubmit bios on error.
94 */ 95 */
95 mempool_t *mpio_pool; 96 mempool_t *mpio_pool;
97
98 struct mutex work_mutex;
96}; 99};
97 100
98/* 101/*
@@ -198,6 +201,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m->queue_io = 1; 201 m->queue_io = 1;
199 INIT_WORK(&m->process_queued_ios, process_queued_ios); 202 INIT_WORK(&m->process_queued_ios, process_queued_ios);
200 INIT_WORK(&m->trigger_event, trigger_event); 203 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait);
205 mutex_init(&m->work_mutex);
201 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
202 if (!m->mpio_pool) { 207 if (!m->mpio_pool) {
203 kfree(m); 208 kfree(m);
@@ -230,6 +235,21 @@ static void free_multipath(struct multipath *m)
230 * Path selection 235 * Path selection
231 *-----------------------------------------------*/ 236 *-----------------------------------------------*/
232 237
238static void __pg_init_all_paths(struct multipath *m)
239{
240 struct pgpath *pgpath;
241
242 m->pg_init_count++;
243 m->pg_init_required = 0;
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */
246 if (!pgpath->is_active)
247 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path))
249 m->pg_init_in_progress++;
250 }
251}
252
233static void __switch_pg(struct multipath *m, struct pgpath *pgpath) 253static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
234{ 254{
235 m->current_pg = pgpath->pg; 255 m->current_pg = pgpath->pg;
@@ -434,7 +454,7 @@ static void process_queued_ios(struct work_struct *work)
434{ 454{
435 struct multipath *m = 455 struct multipath *m =
436 container_of(work, struct multipath, process_queued_ios); 456 container_of(work, struct multipath, process_queued_ios);
437 struct pgpath *pgpath = NULL, *tmp; 457 struct pgpath *pgpath = NULL;
438 unsigned must_queue = 1; 458 unsigned must_queue = 1;
439 unsigned long flags; 459 unsigned long flags;
440 460
@@ -452,14 +472,9 @@ static void process_queued_ios(struct work_struct *work)
452 (!pgpath && !m->queue_if_no_path)) 472 (!pgpath && !m->queue_if_no_path))
453 must_queue = 0; 473 must_queue = 0;
454 474
455 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { 475 if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
456 m->pg_init_count++; 476 __pg_init_all_paths(m);
457 m->pg_init_required = 0; 477
458 list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
459 if (queue_work(kmpath_handlerd, &tmp->activate_path))
460 m->pg_init_in_progress++;
461 }
462 }
463out: 478out:
464 spin_unlock_irqrestore(&m->lock, flags); 479 spin_unlock_irqrestore(&m->lock, flags);
465 if (!must_queue) 480 if (!must_queue)
@@ -592,8 +607,8 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
592 if (!p) 607 if (!p)
593 return ERR_PTR(-ENOMEM); 608 return ERR_PTR(-ENOMEM);
594 609
595 r = dm_get_device(ti, shift(as), ti->begin, ti->len, 610 r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
596 dm_table_get_mode(ti->table), &p->path.dev); 611 &p->path.dev);
597 if (r) { 612 if (r) {
598 ti->error = "error getting device"; 613 ti->error = "error getting device";
599 goto bad; 614 goto bad;
@@ -885,13 +900,43 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
885 return r; 900 return r;
886} 901}
887 902
888static void multipath_dtr(struct dm_target *ti) 903static void multipath_wait_for_pg_init_completion(struct multipath *m)
889{ 904{
890 struct multipath *m = (struct multipath *) ti->private; 905 DECLARE_WAITQUEUE(wait, current);
906 unsigned long flags;
907
908 add_wait_queue(&m->pg_init_wait, &wait);
891 909
910 while (1) {
911 set_current_state(TASK_UNINTERRUPTIBLE);
912
913 spin_lock_irqsave(&m->lock, flags);
914 if (!m->pg_init_in_progress) {
915 spin_unlock_irqrestore(&m->lock, flags);
916 break;
917 }
918 spin_unlock_irqrestore(&m->lock, flags);
919
920 io_schedule();
921 }
922 set_current_state(TASK_RUNNING);
923
924 remove_wait_queue(&m->pg_init_wait, &wait);
925}
926
927static void flush_multipath_work(struct multipath *m)
928{
892 flush_workqueue(kmpath_handlerd); 929 flush_workqueue(kmpath_handlerd);
930 multipath_wait_for_pg_init_completion(m);
893 flush_workqueue(kmultipathd); 931 flush_workqueue(kmultipathd);
894 flush_scheduled_work(); 932 flush_scheduled_work();
933}
934
935static void multipath_dtr(struct dm_target *ti)
936{
937 struct multipath *m = ti->private;
938
939 flush_multipath_work(m);
895 free_multipath(m); 940 free_multipath(m);
896} 941}
897 942
@@ -1116,9 +1161,9 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1116 return limit_reached; 1161 return limit_reached;
1117} 1162}
1118 1163
1119static void pg_init_done(struct dm_path *path, int errors) 1164static void pg_init_done(void *data, int errors)
1120{ 1165{
1121 struct pgpath *pgpath = path_to_pgpath(path); 1166 struct pgpath *pgpath = data;
1122 struct priority_group *pg = pgpath->pg; 1167 struct priority_group *pg = pgpath->pg;
1123 struct multipath *m = pg->m; 1168 struct multipath *m = pg->m;
1124 unsigned long flags; 1169 unsigned long flags;
@@ -1132,8 +1177,8 @@ static void pg_init_done(struct dm_path *path, int errors)
1132 errors = 0; 1177 errors = 0;
1133 break; 1178 break;
1134 } 1179 }
1135 DMERR("Cannot failover device because scsi_dh_%s was not " 1180 DMERR("Could not failover the device: Handler scsi_dh_%s "
1136 "loaded.", m->hw_handler_name); 1181 "Error %d.", m->hw_handler_name, errors);
1137 /* 1182 /*
1138 * Fail path for now, so we do not ping pong 1183 * Fail path for now, so we do not ping pong
1139 */ 1184 */
@@ -1170,25 +1215,34 @@ static void pg_init_done(struct dm_path *path, int errors)
1170 m->current_pgpath = NULL; 1215 m->current_pgpath = NULL;
1171 m->current_pg = NULL; 1216 m->current_pg = NULL;
1172 } 1217 }
1173 } else if (!m->pg_init_required) { 1218 } else if (!m->pg_init_required)
1174 m->queue_io = 0;
1175 pg->bypassed = 0; 1219 pg->bypassed = 0;
1176 }
1177 1220
1178 m->pg_init_in_progress--; 1221 if (--m->pg_init_in_progress)
1179 if (!m->pg_init_in_progress) 1222 /* Activations of other paths are still ongoing */
1180 queue_work(kmultipathd, &m->process_queued_ios); 1223 goto out;
1224
1225 if (!m->pg_init_required)
1226 m->queue_io = 0;
1227
1228 queue_work(kmultipathd, &m->process_queued_ios);
1229
1230 /*
1231 * Wake up any thread waiting to suspend.
1232 */
1233 wake_up(&m->pg_init_wait);
1234
1235out:
1181 spin_unlock_irqrestore(&m->lock, flags); 1236 spin_unlock_irqrestore(&m->lock, flags);
1182} 1237}
1183 1238
1184static void activate_path(struct work_struct *work) 1239static void activate_path(struct work_struct *work)
1185{ 1240{
1186 int ret;
1187 struct pgpath *pgpath = 1241 struct pgpath *pgpath =
1188 container_of(work, struct pgpath, activate_path); 1242 container_of(work, struct pgpath, activate_path);
1189 1243
1190 ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); 1244 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1191 pg_init_done(&pgpath->path, ret); 1245 pg_init_done, pgpath);
1192} 1246}
1193 1247
1194/* 1248/*
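The pg_init changes above pair a counter with a wait queue: each queued activate_path work item bumps pg_init_in_progress, pg_init_done() decrements it and calls wake_up() only when the last activation finishes, and multipath_wait_for_pg_init_completion() sleeps until the counter reaches zero. A hedged user-space analogue using pthreads rather than the kernel wait-queue API (all names are invented):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pg_init_wait = PTHREAD_COND_INITIALIZER;
static int pg_init_in_progress = 2;	/* two paths being activated */

static void *activate_path(void *arg)
{
	(void)arg;
	usleep(1000);			/* pretend the handler activation ran */

	pthread_mutex_lock(&lock);
	if (--pg_init_in_progress == 0)	/* last activation finished */
		pthread_cond_broadcast(&pg_init_wait);	/* the wake_up() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void wait_for_pg_init_completion(void)
{
	pthread_mutex_lock(&lock);
	while (pg_init_in_progress)	/* re-check after every wakeup */
		pthread_cond_wait(&pg_init_wait, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t[2];
	int i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, activate_path, NULL);
	wait_for_pg_init_completion();
	for (i = 0; i < 2; i++)
		pthread_join(&t[i], NULL);
	printf("all pg_init work done\n");
	return 0;
}

The while loop around the predicate mirrors the kernel's re-check after each wakeup, so an early or spurious wakeup is harmless.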
@@ -1261,6 +1315,15 @@ static void multipath_presuspend(struct dm_target *ti)
1261 queue_if_no_path(m, 0, 1); 1315 queue_if_no_path(m, 0, 1);
1262} 1316}
1263 1317
1318static void multipath_postsuspend(struct dm_target *ti)
1319{
1320 struct multipath *m = ti->private;
1321
1322 mutex_lock(&m->work_mutex);
1323 flush_multipath_work(m);
1324 mutex_unlock(&m->work_mutex);
1325}
1326
1264/* 1327/*
1265 * Restore the queue_if_no_path setting. 1328 * Restore the queue_if_no_path setting.
1266 */ 1329 */
@@ -1397,51 +1460,65 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 1460
1398static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1461static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1399{ 1462{
1400 int r; 1463 int r = -EINVAL;
1401 struct dm_dev *dev; 1464 struct dm_dev *dev;
1402 struct multipath *m = (struct multipath *) ti->private; 1465 struct multipath *m = (struct multipath *) ti->private;
1403 action_fn action; 1466 action_fn action;
1404 1467
1468 mutex_lock(&m->work_mutex);
1469
1470 if (dm_suspended(ti)) {
1471 r = -EBUSY;
1472 goto out;
1473 }
1474
1405 if (argc == 1) { 1475 if (argc == 1) {
1406 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) 1476 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
1407 return queue_if_no_path(m, 1, 0); 1477 r = queue_if_no_path(m, 1, 0);
1408 else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) 1478 goto out;
1409 return queue_if_no_path(m, 0, 0); 1479 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
1480 r = queue_if_no_path(m, 0, 0);
1481 goto out;
1482 }
1410 } 1483 }
1411 1484
1412 if (argc != 2) 1485 if (argc != 2) {
1413 goto error; 1486 DMWARN("Unrecognised multipath message received.");
1487 goto out;
1488 }
1414 1489
1415 if (!strnicmp(argv[0], MESG_STR("disable_group"))) 1490 if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
1416 return bypass_pg_num(m, argv[1], 1); 1491 r = bypass_pg_num(m, argv[1], 1);
1417 else if (!strnicmp(argv[0], MESG_STR("enable_group"))) 1492 goto out;
1418 return bypass_pg_num(m, argv[1], 0); 1493 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
1419 else if (!strnicmp(argv[0], MESG_STR("switch_group"))) 1494 r = bypass_pg_num(m, argv[1], 0);
1420 return switch_pg_num(m, argv[1]); 1495 goto out;
1421 else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1496 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
1497 r = switch_pg_num(m, argv[1]);
1498 goto out;
1499 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1422 action = reinstate_path; 1500 action = reinstate_path;
1423 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1501 else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1424 action = fail_path; 1502 action = fail_path;
1425 else 1503 else {
1426 goto error; 1504 DMWARN("Unrecognised multipath message received.");
1505 goto out;
1506 }
1427 1507
1428 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1508 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1429 dm_table_get_mode(ti->table), &dev);
1430 if (r) { 1509 if (r) {
1431 DMWARN("message: error getting device %s", 1510 DMWARN("message: error getting device %s",
1432 argv[1]); 1511 argv[1]);
1433 return -EINVAL; 1512 goto out;
1434 } 1513 }
1435 1514
1436 r = action_dev(m, dev, action); 1515 r = action_dev(m, dev, action);
1437 1516
1438 dm_put_device(ti, dev); 1517 dm_put_device(ti, dev);
1439 1518
1519out:
1520 mutex_unlock(&m->work_mutex);
1440 return r; 1521 return r;
1441
1442error:
1443 DMWARN("Unrecognised multipath message received.");
1444 return -EINVAL;
1445} 1522}
1446 1523
1447static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1524static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
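The rewritten multipath_message() above funnels every return through a single out: label so work_mutex is always released, and refuses messages with -EBUSY while the target is suspended, pairing with the new multipath_postsuspend() which flushes work under the same mutex. A small user-space sketch of that pattern; handle_message and the suspended flag are illustrative, not the dm API.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;
static int suspended;

static int handle_message(int argc, char **argv)
{
	int r = -EINVAL;

	pthread_mutex_lock(&work_mutex);

	if (suspended) {
		r = -EBUSY;		/* don't race with the postsuspend flush */
		goto out;
	}

	if (argc == 1 && !strcmp(argv[0], "queue_if_no_path")) {
		r = 0;			/* would call queue_if_no_path(m, 1, 0) */
		goto out;
	}

	fprintf(stderr, "Unrecognised multipath message received.\n");
out:
	pthread_mutex_unlock(&work_mutex);	/* single unlock for every path */
	return r;
}

int main(void)
{
	char *msg[] = { "queue_if_no_path" };

	printf("%d\n", handle_message(1, msg));	/* 0 */
	suspended = 1;
	printf("%d\n", handle_message(1, msg));	/* -EBUSY */
	return 0;
}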
@@ -1567,13 +1644,14 @@ out:
1567 *---------------------------------------------------------------*/ 1644 *---------------------------------------------------------------*/
1568static struct target_type multipath_target = { 1645static struct target_type multipath_target = {
1569 .name = "multipath", 1646 .name = "multipath",
1570 .version = {1, 1, 0}, 1647 .version = {1, 1, 1},
1571 .module = THIS_MODULE, 1648 .module = THIS_MODULE,
1572 .ctr = multipath_ctr, 1649 .ctr = multipath_ctr,
1573 .dtr = multipath_dtr, 1650 .dtr = multipath_dtr,
1574 .map_rq = multipath_map, 1651 .map_rq = multipath_map,
1575 .rq_end_io = multipath_end_io, 1652 .rq_end_io = multipath_end_io,
1576 .presuspend = multipath_presuspend, 1653 .presuspend = multipath_presuspend,
1654 .postsuspend = multipath_postsuspend,
1577 .resume = multipath_resume, 1655 .resume = multipath_resume,
1578 .status = multipath_status, 1656 .status = multipath_status,
1579 .message = multipath_message, 1657 .message = multipath_message,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ddda531723dc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
35 *---------------------------------------------------------------*/ 35 *---------------------------------------------------------------*/
36enum dm_raid1_error { 36enum dm_raid1_error {
37 DM_RAID1_WRITE_ERROR, 37 DM_RAID1_WRITE_ERROR,
38 DM_RAID1_FLUSH_ERROR,
38 DM_RAID1_SYNC_ERROR, 39 DM_RAID1_SYNC_ERROR,
39 DM_RAID1_READ_ERROR 40 DM_RAID1_READ_ERROR
40}; 41};
@@ -57,6 +58,7 @@ struct mirror_set {
57 struct bio_list reads; 58 struct bio_list reads;
58 struct bio_list writes; 59 struct bio_list writes;
59 struct bio_list failures; 60 struct bio_list failures;
61 struct bio_list holds; /* bios are waiting until suspend */
60 62
61 struct dm_region_hash *rh; 63 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 64 struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
67 region_t nr_regions; 69 region_t nr_regions;
68 int in_sync; 70 int in_sync;
69 int log_failure; 71 int log_failure;
72 int leg_failure;
70 atomic_t suspend; 73 atomic_t suspend;
71 74
72 atomic_t default_mirror; /* Default mirror */ 75 atomic_t default_mirror; /* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
179 atomic_set(&ms->default_mirror, m - m0); 182 atomic_set(&ms->default_mirror, m - m0);
180} 183}
181 184
185static struct mirror *get_valid_mirror(struct mirror_set *ms)
186{
187 struct mirror *m;
188
189 for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
190 if (!atomic_read(&m->error_count))
191 return m;
192
193 return NULL;
194}
195
182/* fail_mirror 196/* fail_mirror
183 * @m: mirror device to fail 197 * @m: mirror device to fail
184 * @error_type: one of the enum's, DM_RAID1_*_ERROR 198 * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
198 struct mirror_set *ms = m->ms; 212 struct mirror_set *ms = m->ms;
199 struct mirror *new; 213 struct mirror *new;
200 214
215 ms->leg_failure = 1;
216
201 /* 217 /*
202 * error_count is used for nothing more than a 218 * error_count is used for nothing more than a
203 * simple way to tell if a device has encountered 219 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
224 goto out; 240 goto out;
225 } 241 }
226 242
227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) 243 new = get_valid_mirror(ms);
228 if (!atomic_read(&new->error_count)) { 244 if (new)
229 set_default_mirror(new); 245 set_default_mirror(new);
230 break; 246 else
231 }
232
233 if (unlikely(new == ms->mirror + ms->nr_mirrors))
234 DMWARN("All sides of mirror have failed."); 247 DMWARN("All sides of mirror have failed.");
235 248
236out: 249out:
237 schedule_work(&ms->trigger_event); 250 schedule_work(&ms->trigger_event);
238} 251}
239 252
253static int mirror_flush(struct dm_target *ti)
254{
255 struct mirror_set *ms = ti->private;
256 unsigned long error_bits;
257
258 unsigned int i;
259 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m;
261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER,
263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL,
265 .client = ms->io_client,
266 };
267
268 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
269 io[i].bdev = m->dev->bdev;
270 io[i].sector = 0;
271 io[i].count = 0;
272 }
273
274 error_bits = -1;
275 dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
276 if (unlikely(error_bits != 0)) {
277 for (i = 0; i < ms->nr_mirrors; i++)
278 if (test_bit(i, &error_bits))
279 fail_mirror(ms->mirror + i,
280 DM_RAID1_FLUSH_ERROR);
281 return -EIO;
282 }
283
284 return 0;
285}
286
240/*----------------------------------------------------------------- 287/*-----------------------------------------------------------------
241 * Recovery. 288 * Recovery.
242 * 289 *
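mirror_flush() above issues an empty WRITE_BARRIER to every leg through dm_io() and then walks the returned error bitmask, failing each leg whose bit is set. A self-contained sketch of just the bitmask decoding; NR_MIRRORS and test_bit here are local stand-ins, not the kernel helpers.

#include <errno.h>
#include <stdio.h>

#define NR_MIRRORS 3

static int test_bit(unsigned int nr, const unsigned long *addr)
{
	return (int)((*addr >> nr) & 1UL);
}

static int decode_flush_errors(unsigned long error_bits)
{
	unsigned int i;
	int r = 0;

	for (i = 0; i < NR_MIRRORS; i++)
		if (test_bit(i, &error_bits)) {
			/* fail_mirror(..., DM_RAID1_FLUSH_ERROR) in the driver */
			printf("leg %u: flush failed\n", i);
			r = -EIO;
		}

	return r;
}

int main(void)
{
	/* Pretend legs 0 and 2 failed the empty-barrier write. */
	return decode_flush_errors(0x5UL) ? 1 : 0;
}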
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
396 */ 443 */
397static sector_t map_sector(struct mirror *m, struct bio *bio) 444static sector_t map_sector(struct mirror *m, struct bio *bio)
398{ 445{
446 if (unlikely(!bio->bi_size))
447 return 0;
399 return m->offset + (bio->bi_sector - m->ms->ti->begin); 448 return m->offset + (bio->bi_sector - m->ms->ti->begin);
400} 449}
401 450
@@ -413,6 +462,34 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
413 io->count = bio->bi_size >> 9; 462 io->count = bio->bi_size >> 9;
414} 463}
415 464
465static void hold_bio(struct mirror_set *ms, struct bio *bio)
466{
467 /*
468 * Lock is required to avoid race condition during suspend
469 * process.
470 */
471 spin_lock_irq(&ms->lock);
472
473 if (atomic_read(&ms->suspend)) {
474 spin_unlock_irq(&ms->lock);
475
476 /*
477 * If device is suspended, complete the bio.
478 */
479 if (dm_noflush_suspending(ms->ti))
480 bio_endio(bio, DM_ENDIO_REQUEUE);
481 else
482 bio_endio(bio, -EIO);
483 return;
484 }
485
486 /*
487 * Hold bio until the suspend is complete.
488 */
489 bio_list_add(&ms->holds, bio);
490 spin_unlock_irq(&ms->lock);
491}
492
416/*----------------------------------------------------------------- 493/*-----------------------------------------------------------------
417 * Reads 494 * Reads
418 *---------------------------------------------------------------*/ 495 *---------------------------------------------------------------*/
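hold_bio() above closes a race with suspend: the suspend flag and the hold list are only examined and updated under ms->lock, so once presuspend sets the flag no further bios can slip onto the list. A minimal user-space model of that check-or-queue step, assuming invented types:

#include <pthread.h>
#include <stdio.h>

struct bio { struct bio *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct bio *holds;
static int suspended;

static void hold_bio(struct bio *bio)
{
	pthread_mutex_lock(&lock);
	if (suspended) {
		pthread_mutex_unlock(&lock);
		/* device suspended: complete now (requeue or -EIO) */
		printf("bio %d completed immediately\n", bio->id);
		return;
	}
	bio->next = holds;		/* stand-in for bio_list_add() */
	holds = bio;
	pthread_mutex_unlock(&lock);
	printf("bio %d held until suspend\n", bio->id);
}

int main(void)
{
	struct bio a = { .next = NULL, .id = 1 };
	struct bio b = { .next = NULL, .id = 2 };

	hold_bio(&a);			/* goes on the hold list */
	suspended = 1;
	hold_bio(&b);			/* completed immediately */
	return 0;
}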
@@ -511,7 +588,6 @@ static void write_callback(unsigned long error, void *context)
511 unsigned i, ret = 0; 588 unsigned i, ret = 0;
512 struct bio *bio = (struct bio *) context; 589 struct bio *bio = (struct bio *) context;
513 struct mirror_set *ms; 590 struct mirror_set *ms;
514 int uptodate = 0;
515 int should_wake = 0; 591 int should_wake = 0;
516 unsigned long flags; 592 unsigned long flags;
517 593
@@ -524,36 +600,27 @@ static void write_callback(unsigned long error, void *context)
524 * This way we handle both writes to SYNC and NOSYNC 600 * This way we handle both writes to SYNC and NOSYNC
525 * regions with the same code. 601 * regions with the same code.
526 */ 602 */
527 if (likely(!error)) 603 if (likely(!error)) {
528 goto out; 604 bio_endio(bio, ret);
605 return;
606 }
529 607
530 for (i = 0; i < ms->nr_mirrors; i++) 608 for (i = 0; i < ms->nr_mirrors; i++)
531 if (test_bit(i, &error)) 609 if (test_bit(i, &error))
532 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); 610 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
533 else
534 uptodate = 1;
535 611
536 if (unlikely(!uptodate)) { 612 /*
537 DMERR("All replicated volumes dead, failing I/O"); 613 * Need to raise event. Since raising
538 /* None of the writes succeeded, fail the I/O. */ 614 * events can block, we need to do it in
539 ret = -EIO; 615 * the main thread.
540 } else if (errors_handled(ms)) { 616 */
541 /* 617 spin_lock_irqsave(&ms->lock, flags);
542 * Need to raise event. Since raising 618 if (!ms->failures.head)
543 * events can block, we need to do it in 619 should_wake = 1;
544 * the main thread. 620 bio_list_add(&ms->failures, bio);
545 */ 621 spin_unlock_irqrestore(&ms->lock, flags);
546 spin_lock_irqsave(&ms->lock, flags); 622 if (should_wake)
547 if (!ms->failures.head) 623 wakeup_mirrord(ms);
548 should_wake = 1;
549 bio_list_add(&ms->failures, bio);
550 spin_unlock_irqrestore(&ms->lock, flags);
551 if (should_wake)
552 wakeup_mirrord(ms);
553 return;
554 }
555out:
556 bio_endio(bio, ret);
557} 624}
558 625
559static void do_write(struct mirror_set *ms, struct bio *bio) 626static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
562 struct dm_io_region io[ms->nr_mirrors], *dest = io; 629 struct dm_io_region io[ms->nr_mirrors], *dest = io;
563 struct mirror *m; 630 struct mirror *m;
564 struct dm_io_request io_req = { 631 struct dm_io_request io_req = {
565 .bi_rw = WRITE, 632 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
566 .mem.type = DM_IO_BVEC, 633 .mem.type = DM_IO_BVEC,
567 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
568 .notify.fn = write_callback, 635 .notify.fn = write_callback,
@@ -603,6 +670,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
603 bio_list_init(&requeue); 670 bio_list_init(&requeue);
604 671
605 while ((bio = bio_list_pop(writes))) { 672 while ((bio = bio_list_pop(writes))) {
673 if (unlikely(bio_empty_barrier(bio))) {
674 bio_list_add(&sync, bio);
675 continue;
676 }
677
606 region = dm_rh_bio_to_region(ms->rh, bio); 678 region = dm_rh_bio_to_region(ms->rh, bio);
607 679
608 if (log->type->is_remote_recovering && 680 if (log->type->is_remote_recovering &&
@@ -659,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
659 /* 731 /*
660 * Dispatch io. 732 * Dispatch io.
661 */ 733 */
662 if (unlikely(ms->log_failure)) { 734 if (unlikely(ms->log_failure) && errors_handled(ms)) {
663 spin_lock_irq(&ms->lock); 735 spin_lock_irq(&ms->lock);
664 bio_list_merge(&ms->failures, &sync); 736 bio_list_merge(&ms->failures, &sync);
665 spin_unlock_irq(&ms->lock); 737 spin_unlock_irq(&ms->lock);
@@ -672,8 +744,15 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
672 dm_rh_delay(ms->rh, bio); 744 dm_rh_delay(ms->rh, bio);
673 745
674 while ((bio = bio_list_pop(&nosync))) { 746 while ((bio = bio_list_pop(&nosync))) {
675 map_bio(get_default_mirror(ms), bio); 747 if (unlikely(ms->leg_failure) && errors_handled(ms)) {
676 generic_make_request(bio); 748 spin_lock_irq(&ms->lock);
749 bio_list_add(&ms->failures, bio);
750 spin_unlock_irq(&ms->lock);
751 wakeup_mirrord(ms);
752 } else {
753 map_bio(get_default_mirror(ms), bio);
754 generic_make_request(bio);
755 }
677 } 756 }
678} 757}
679 758
@@ -681,20 +760,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
681{ 760{
682 struct bio *bio; 761 struct bio *bio;
683 762
684 if (!failures->head) 763 if (likely(!failures->head))
685 return;
686
687 if (!ms->log_failure) {
688 while ((bio = bio_list_pop(failures))) {
689 ms->in_sync = 0;
690 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
691 }
692 return; 764 return;
693 }
694 765
695 /* 766 /*
696 * If the log has failed, unattempted writes are being 767 * If the log has failed, unattempted writes are being
697 * put on the failures list. We can't issue those writes 768 * put on the holds list. We can't issue those writes
698 * until a log has been marked, so we must store them. 769 * until a log has been marked, so we must store them.
699 * 770 *
700 * If a 'noflush' suspend is in progress, we can requeue 771 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +780,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
709 * for us to treat them the same and requeue them 780 * for us to treat them the same and requeue them
710 * as well. 781 * as well.
711 */ 782 */
712 if (dm_noflush_suspending(ms->ti)) { 783 while ((bio = bio_list_pop(failures))) {
713 while ((bio = bio_list_pop(failures))) 784 if (!ms->log_failure) {
714 bio_endio(bio, DM_ENDIO_REQUEUE); 785 ms->in_sync = 0;
715 return; 786 dm_rh_mark_nosync(ms->rh, bio);
716 } 787 }
717 788
718 if (atomic_read(&ms->suspend)) { 789 /*
719 while ((bio = bio_list_pop(failures))) 790 * If all the legs are dead, fail the I/O.
791 * If we have been told to handle errors, hold the bio
792 * and wait for userspace to deal with the problem.
793 * Otherwise pretend that the I/O succeeded. (This would
794 * be wrong if the failed leg returned after reboot and
795 * got replicated back to the good legs.)
796 */
797 if (!get_valid_mirror(ms))
720 bio_endio(bio, -EIO); 798 bio_endio(bio, -EIO);
721 return; 799 else if (errors_handled(ms))
800 hold_bio(ms, bio);
801 else
802 bio_endio(bio, 0);
722 } 803 }
723
724 spin_lock_irq(&ms->lock);
725 bio_list_merge(&ms->failures, failures);
726 spin_unlock_irq(&ms->lock);
727
728 delayed_wake(ms);
729} 804}
730 805
731static void trigger_event(struct work_struct *work) 806static void trigger_event(struct work_struct *work)
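The rewritten do_failures() above makes a three-way choice for each failed write: complete it with -EIO when no leg is left, hold it for userspace when error handling is configured, otherwise report success. A sketch of that decision as a pure function (names are illustrative):

#include <stdio.h>

enum outcome { COMPLETE_EIO, HOLD_FOR_USERSPACE, COMPLETE_OK };

static enum outcome failed_write_outcome(int any_valid_mirror,
					 int errors_handled)
{
	if (!any_valid_mirror)
		return COMPLETE_EIO;		/* every leg is dead */
	if (errors_handled)
		return HOLD_FOR_USERSPACE;	/* hold_bio() until suspend */
	return COMPLETE_OK;			/* pretend the write succeeded */
}

int main(void)
{
	printf("%d %d %d\n",
	       failed_write_outcome(0, 1),	/* 0: -EIO */
	       failed_write_outcome(1, 1),	/* 1: held */
	       failed_write_outcome(1, 0));	/* 2: reported success */
	return 0;
}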
@@ -784,12 +859,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
784 } 859 }
785 860
786 spin_lock_init(&ms->lock); 861 spin_lock_init(&ms->lock);
862 bio_list_init(&ms->reads);
863 bio_list_init(&ms->writes);
864 bio_list_init(&ms->failures);
865 bio_list_init(&ms->holds);
787 866
788 ms->ti = ti; 867 ms->ti = ti;
789 ms->nr_mirrors = nr_mirrors; 868 ms->nr_mirrors = nr_mirrors;
790 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 869 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
791 ms->in_sync = 0; 870 ms->in_sync = 0;
792 ms->log_failure = 0; 871 ms->log_failure = 0;
872 ms->leg_failure = 0;
793 atomic_set(&ms->suspend, 0); 873 atomic_set(&ms->suspend, 0);
794 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 874 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
795 875
@@ -847,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
847 return -EINVAL; 927 return -EINVAL;
848 } 928 }
849 929
850 if (dm_get_device(ti, argv[0], offset, ti->len, 930 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
851 dm_table_get_mode(ti->table),
852 &ms->mirror[mirror].dev)) { 931 &ms->mirror[mirror].dev)) {
853 ti->error = "Device lookup failure"; 932 ti->error = "Device lookup failure";
854 return -ENXIO; 933 return -ENXIO;
@@ -889,7 +968,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
889 return NULL; 968 return NULL;
890 } 969 }
891 970
892 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); 971 dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
972 argv + 2);
893 if (!dl) { 973 if (!dl) {
894 ti->error = "Error creating mirror dirty log"; 974 ti->error = "Error creating mirror dirty log";
895 return NULL; 975 return NULL;
@@ -995,6 +1075,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
995 1075
996 ti->private = ms; 1076 ti->private = ms;
997 ti->split_io = dm_rh_get_region_size(ms->rh); 1077 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1;
998 1079
999 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1000 if (!ms->kmirrord_wq) { 1081 if (!ms->kmirrord_wq) {
@@ -1122,7 +1203,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1122 * We need to dec pending if this was a write. 1203 * We need to dec pending if this was a write.
1123 */ 1204 */
1124 if (rw == WRITE) { 1205 if (rw == WRITE) {
1125 dm_rh_dec(ms->rh, map_context->ll); 1206 if (likely(!bio_empty_barrier(bio)))
1207 dm_rh_dec(ms->rh, map_context->ll);
1126 return error; 1208 return error;
1127 } 1209 }
1128 1210
@@ -1180,9 +1262,26 @@ static void mirror_presuspend(struct dm_target *ti)
1180 struct mirror_set *ms = (struct mirror_set *) ti->private; 1262 struct mirror_set *ms = (struct mirror_set *) ti->private;
1181 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1263 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1182 1264
1265 struct bio_list holds;
1266 struct bio *bio;
1267
1183 atomic_set(&ms->suspend, 1); 1268 atomic_set(&ms->suspend, 1);
1184 1269
1185 /* 1270 /*
1271 * Process bios in the hold list to start recovery waiting
1272 * for bios in the hold list. After the process, no bio has
1273 * a chance to be added in the hold list because ms->suspend
1274 * is set.
1275 */
1276 spin_lock_irq(&ms->lock);
1277 holds = ms->holds;
1278 bio_list_init(&ms->holds);
1279 spin_unlock_irq(&ms->lock);
1280
1281 while ((bio = bio_list_pop(&holds)))
1282 hold_bio(ms, bio);
1283
1284 /*
1186 * We must finish up all the work that we've 1285 * We must finish up all the work that we've
1187 * generated (i.e. recovery work). 1286 * generated (i.e. recovery work).
1188 */ 1287 */
@@ -1244,7 +1343,8 @@ static char device_status_char(struct mirror *m)
1244 if (!atomic_read(&(m->error_count))) 1343 if (!atomic_read(&(m->error_count)))
1245 return 'A'; 1344 return 'A';
1246 1345
1247 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1346 return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
1347 (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1248 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1348 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1249 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1349 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1250} 1350}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd6..bd5c58b28868 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -11,6 +11,7 @@
11#include <linux/ctype.h> 11#include <linux/ctype.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
15 16
16#include "dm.h" 17#include "dm.h"
@@ -79,6 +80,11 @@ struct dm_region_hash {
79 struct list_head recovered_regions; 80 struct list_head recovered_regions;
80 struct list_head failed_recovered_regions; 81 struct list_head failed_recovered_regions;
81 82
83 /*
84 * If there was a barrier failure no regions can be marked clean.
85 */
86 int barrier_failure;
87
82 void *context; 88 void *context;
83 sector_t target_begin; 89 sector_t target_begin;
84 90
@@ -211,6 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
211 INIT_LIST_HEAD(&rh->quiesced_regions); 217 INIT_LIST_HEAD(&rh->quiesced_regions);
212 INIT_LIST_HEAD(&rh->recovered_regions); 218 INIT_LIST_HEAD(&rh->recovered_regions);
213 INIT_LIST_HEAD(&rh->failed_recovered_regions); 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220 rh->barrier_failure = 0;
214 221
215 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
216 sizeof(struct dm_region)); 223 sizeof(struct dm_region));
@@ -377,8 +384,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
377/* dm_rh_mark_nosync 384/* dm_rh_mark_nosync
378 * @ms 385 * @ms
379 * @bio 386 * @bio
380 * @done
381 * @error
382 * 387 *
383 * The bio was written on some mirror(s) but failed on other mirror(s). 388 * The bio was written on some mirror(s) but failed on other mirror(s).
384 * We can successfully endio the bio but should avoid the region being 389 * We can successfully endio the bio but should avoid the region being
@@ -386,8 +391,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
386 * 391 *
387 * This function is _not_ safe in interrupt context! 392 * This function is _not_ safe in interrupt context!
388 */ 393 */
389void dm_rh_mark_nosync(struct dm_region_hash *rh, 394void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
390 struct bio *bio, unsigned done, int error)
391{ 395{
392 unsigned long flags; 396 unsigned long flags;
393 struct dm_dirty_log *log = rh->log; 397 struct dm_dirty_log *log = rh->log;
@@ -395,6 +399,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
395 region_t region = dm_rh_bio_to_region(rh, bio); 399 region_t region = dm_rh_bio_to_region(rh, bio);
396 int recovering = 0; 400 int recovering = 0;
397 401
402 if (bio_empty_barrier(bio)) {
403 rh->barrier_failure = 1;
404 return;
405 }
406
398 /* We must inform the log that the sync count has changed. */ 407 /* We must inform the log that the sync count has changed. */
399 log->type->set_region_sync(log, region, 0); 408 log->type->set_region_sync(log, region, 0);
400 409
@@ -419,7 +428,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
419 BUG_ON(!list_empty(&reg->list)); 428 BUG_ON(!list_empty(&reg->list));
420 spin_unlock_irqrestore(&rh->region_lock, flags); 429 spin_unlock_irqrestore(&rh->region_lock, flags);
421 430
422 bio_endio(bio, error);
423 if (recovering) 431 if (recovering)
424 complete_resync_work(reg, 0); 432 complete_resync_work(reg, 0);
425} 433}
@@ -515,8 +523,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
515{ 523{
516 struct bio *bio; 524 struct bio *bio;
517 525
518 for (bio = bios->head; bio; bio = bio->bi_next) 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527 if (bio_empty_barrier(bio))
528 continue;
519 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 }
520} 531}
521EXPORT_SYMBOL_GPL(dm_rh_inc_pending); 532EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
522 533
@@ -544,7 +555,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
544 */ 555 */
545 556
546 /* do nothing for DM_RH_NOSYNC */ 557 /* do nothing for DM_RH_NOSYNC */
547 if (reg->state == DM_RH_RECOVERING) { 558 if (unlikely(rh->barrier_failure)) {
559 /*
560 * If a write barrier failed some time ago, we
561 * don't know whether or not this write made it
562 * to the disk, so we must resync the device.
563 */
564 reg->state = DM_RH_NOSYNC;
565 } else if (reg->state == DM_RH_RECOVERING) {
548 list_add_tail(&reg->list, &rh->quiesced_regions); 566 list_add_tail(&reg->list, &rh->quiesced_regions);
549 } else if (reg->state == DM_RH_DIRTY) { 567 } else if (reg->state == DM_RH_DIRTY) {
550 reg->state = DM_RH_CLEAN; 568 reg->state = DM_RH_CLEAN;
@@ -643,10 +661,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success)
643 spin_lock_irq(&rh->region_lock); 661 spin_lock_irq(&rh->region_lock);
644 if (success) 662 if (success)
645 list_add(&reg->list, &reg->rh->recovered_regions); 663 list_add(&reg->list, &reg->rh->recovered_regions);
646 else { 664 else
647 reg->state = DM_RH_NOSYNC;
648 list_add(&reg->list, &reg->rh->failed_recovered_regions); 665 list_add(&reg->list, &reg->rh->failed_recovered_regions);
649 } 666
650 spin_unlock_irq(&rh->region_lock); 667 spin_unlock_irq(&rh->region_lock);
651 668
652 rh->wakeup_workers(rh->context); 669 rh->wakeup_workers(rh->context);
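The region-hash hunks above record a barrier failure and, in dm_rh_dec(), force a completing region to NOSYNC instead of CLEAN so it gets resynced later. A small user-space model of that state decision, with invented enum names standing in for DM_RH_*:

#include <stdio.h>

enum rh_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

static enum rh_state region_state_on_last_write(enum rh_state cur,
						int barrier_failure)
{
	if (barrier_failure)
		return RH_NOSYNC;	/* unsure whether the writes hit disk */
	if (cur == RH_RECOVERING)
		return RH_RECOVERING;	/* region is quiesced so recovery proceeds */
	if (cur == RH_DIRTY)
		return RH_CLEAN;	/* normal path: region becomes clean */
	return cur;			/* NOSYNC: nothing to do */
}

int main(void)
{
	printf("%d %d\n",
	       region_state_on_last_write(RH_DIRTY, 0),		/* RH_CLEAN */
	       region_state_on_last_write(RH_DIRTY, 1));	/* RH_NOSYNC */
	return 0;
}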
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index cfa668f46c40..9c6c2e47ad62 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -11,6 +11,8 @@
11#include "dm.h" 11#include "dm.h"
12#include "dm-path-selector.h" 12#include "dm-path-selector.h"
13 13
14#include <linux/slab.h>
15
14#define DM_MSG_PREFIX "multipath service-time" 16#define DM_MSG_PREFIX "multipath service-time"
15#define ST_MIN_IO 1 17#define ST_MIN_IO 1
16#define ST_MAX_RELATIVE_THROUGHPUT 100 18#define ST_MAX_RELATIVE_THROUGHPUT 100
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index d5b2e08750d5..c097d8a4823d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -55,6 +55,8 @@
55 */ 55 */
56#define SNAPSHOT_DISK_VERSION 1 56#define SNAPSHOT_DISK_VERSION 1
57 57
58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59
58struct disk_header { 60struct disk_header {
59 uint32_t magic; 61 uint32_t magic;
60 62
@@ -120,7 +122,22 @@ struct pstore {
120 122
121 /* 123 /*
122 * The next free chunk for an exception. 124 * The next free chunk for an exception.
125 *
126 * When creating exceptions, all the chunks here and above are
127 * free. It holds the next chunk to be allocated. On rare
128 * occasions (e.g. after a system crash) holes can be left in
129 * the exception store because chunks can be committed out of
130 * order.
131 *
132 * When merging exceptions, it does not necessarily mean all the
133 * chunks here and above are free. It holds the value it would
134 * have held if all chunks had been committed in order of
135 * allocation. Consequently the value may occasionally be
136 * slightly too low, but since it's only used for 'status' and
137 * it can never reach its minimum value too early this doesn't
138 * matter.
123 */ 139 */
140
124 chunk_t next_free; 141 chunk_t next_free;
125 142
126 /* 143 /*
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
214 int metadata) 231 int metadata)
215{ 232{
216 struct dm_io_region where = { 233 struct dm_io_region where = {
217 .bdev = ps->store->cow->bdev, 234 .bdev = dm_snap_cow(ps->store->snap)->bdev,
218 .sector = ps->store->chunk_size * chunk, 235 .sector = ps->store->chunk_size * chunk,
219 .count = ps->store->chunk_size, 236 .count = ps->store->chunk_size,
220 }; 237 };
@@ -237,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
237 * Issue the synchronous I/O from a different thread 254 * Issue the synchronous I/O from a different thread
238 * to avoid generic_make_request recursion. 255 * to avoid generic_make_request recursion.
239 */ 256 */
240 INIT_WORK(&req.work, do_metadata); 257 INIT_WORK_ON_STACK(&req.work, do_metadata);
241 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
242 flush_workqueue(ps->metadata_wq); 259 flush_workqueue(ps->metadata_wq);
243 260
@@ -284,16 +301,18 @@ static int read_header(struct pstore *ps, int *new_snapshot)
284{ 301{
285 int r; 302 int r;
286 struct disk_header *dh; 303 struct disk_header *dh;
287 chunk_t chunk_size; 304 unsigned chunk_size;
288 int chunk_size_supplied = 1; 305 int chunk_size_supplied = 1;
289 char *chunk_err; 306 char *chunk_err;
290 307
291 /* 308 /*
292 * Use default chunk size (or hardsect_size, if larger) if none supplied 309 * Use default chunk size (or logical_block_size, if larger)
310 * if none supplied
293 */ 311 */
294 if (!ps->store->chunk_size) { 312 if (!ps->store->chunk_size) {
295 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 313 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
296 bdev_logical_block_size(ps->store->cow->bdev) >> 9); 314 bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
315 bdev) >> 9);
297 ps->store->chunk_mask = ps->store->chunk_size - 1; 316 ps->store->chunk_mask = ps->store->chunk_size - 1;
298 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 317 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
299 chunk_size_supplied = 0; 318 chunk_size_supplied = 0;
@@ -334,10 +353,9 @@ static int read_header(struct pstore *ps, int *new_snapshot)
334 return 0; 353 return 0;
335 354
336 if (chunk_size_supplied) 355 if (chunk_size_supplied)
337 DMWARN("chunk size %llu in device metadata overrides " 356 DMWARN("chunk size %u in device metadata overrides "
338 "table chunk size of %llu.", 357 "table chunk size of %u.",
339 (unsigned long long)chunk_size, 358 chunk_size, ps->store->chunk_size);
340 (unsigned long long)ps->store->chunk_size);
341 359
342 /* We had a bogus chunk_size. Fix stuff up. */ 360 /* We had a bogus chunk_size. Fix stuff up. */
343 free_area(ps); 361 free_area(ps);
@@ -345,8 +363,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
345 r = dm_exception_store_set_chunk_size(ps->store, chunk_size, 363 r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
346 &chunk_err); 364 &chunk_err);
347 if (r) { 365 if (r) {
348 DMERR("invalid on-disk chunk size %llu: %s.", 366 DMERR("invalid on-disk chunk size %u: %s.",
349 (unsigned long long)chunk_size, chunk_err); 367 chunk_size, chunk_err);
350 return r; 368 return r;
351 } 369 }
352 370
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
408 e->new_chunk = cpu_to_le64(de->new_chunk); 426 e->new_chunk = cpu_to_le64(de->new_chunk);
409} 427}
410 428
429static void clear_exception(struct pstore *ps, uint32_t index)
430{
431 struct disk_exception *e = get_exception(ps, index);
432
433 /* clear it */
434 e->old_chunk = 0;
435 e->new_chunk = 0;
436}
437
411/* 438/*
412 * Registers the exceptions that are present in the current area. 439 * Registers the exceptions that are present in the current area.
413 * 'full' is filled in to indicate if the area has been 440 * 'full' is filled in to indicate if the area has been
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
489 return (struct pstore *) store->context; 516 return (struct pstore *) store->context;
490} 517}
491 518
492static void persistent_fraction_full(struct dm_exception_store *store, 519static void persistent_usage(struct dm_exception_store *store,
493 sector_t *numerator, sector_t *denominator) 520 sector_t *total_sectors,
521 sector_t *sectors_allocated,
522 sector_t *metadata_sectors)
494{ 523{
495 *numerator = get_info(store)->next_free * store->chunk_size; 524 struct pstore *ps = get_info(store);
496 *denominator = get_dev_size(store->cow->bdev); 525
526 *sectors_allocated = ps->next_free * store->chunk_size;
527 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
528
529 /*
530 * First chunk is the fixed header.
531 * Then there are (ps->current_area + 1) metadata chunks, each one
532 * separated from the next by ps->exceptions_per_area data chunks.
533 */
534 *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
535 store->chunk_size;
497} 536}
498 537
499static void persistent_dtr(struct dm_exception_store *store) 538static void persistent_dtr(struct dm_exception_store *store)
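persistent_usage() above reports three figures instead of the old numerator/denominator pair; the metadata figure counts the header chunk plus one metadata chunk per area written so far. A worked example of the arithmetic with made-up chunk counts:

#include <stdio.h>

int main(void)
{
	unsigned long long chunk_size = 16;	/* sectors per chunk (8 KiB) */
	unsigned long long current_area = 2;	/* areas 0..2 are in use */
	unsigned long long hdr_chunks = 1;	/* NUM_SNAPSHOT_HDR_CHUNKS */
	unsigned long long next_free = 100;	/* next chunk to allocate */

	/* header chunk + one metadata chunk per area written so far */
	unsigned long long metadata_sectors =
		(current_area + 1 + hdr_chunks) * chunk_size;
	unsigned long long sectors_allocated = next_free * chunk_size;

	printf("metadata: %llu sectors, allocated: %llu sectors\n",
	       metadata_sectors, sectors_allocated);	/* 64 and 1600 */
	return 0;
}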
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
552 ps->current_area = 0; 591 ps->current_area = 0;
553 zero_memory_area(ps); 592 zero_memory_area(ps);
554 r = zero_disk_area(ps, 0); 593 r = zero_disk_area(ps, 0);
555 if (r) { 594 if (r)
556 DMWARN("zero_disk_area(0) failed"); 595 DMWARN("zero_disk_area(0) failed");
557 return r; 596 return r;
558 } 597 }
559 } else { 598 /*
560 /* 599 * Sanity checks.
561 * Sanity checks. 600 */
562 */ 601 if (ps->version != SNAPSHOT_DISK_VERSION) {
563 if (ps->version != SNAPSHOT_DISK_VERSION) { 602 DMWARN("unable to handle snapshot disk version %d",
564 DMWARN("unable to handle snapshot disk version %d", 603 ps->version);
565 ps->version); 604 return -EINVAL;
566 return -EINVAL; 605 }
567 }
568 606
569 /* 607 /*
570 * Metadata are valid, but snapshot is invalidated 608 * Metadata are valid, but snapshot is invalidated
571 */ 609 */
572 if (!ps->valid) 610 if (!ps->valid)
573 return 1; 611 return 1;
574 612
575 /* 613 /*
576 * Read the metadata. 614 * Read the metadata.
577 */ 615 */
578 r = read_exceptions(ps, callback, callback_context); 616 r = read_exceptions(ps, callback, callback_context);
579 if (r)
580 return r;
581 }
582 617
583 return 0; 618 return r;
584} 619}
585 620
586static int persistent_prepare_exception(struct dm_exception_store *store, 621static int persistent_prepare_exception(struct dm_exception_store *store,
587 struct dm_snap_exception *e) 622 struct dm_exception *e)
588{ 623{
589 struct pstore *ps = get_info(store); 624 struct pstore *ps = get_info(store);
590 uint32_t stride; 625 uint32_t stride;
591 chunk_t next_free; 626 chunk_t next_free;
592 sector_t size = get_dev_size(store->cow->bdev); 627 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
593 628
594 /* Is there enough room ? */ 629 /* Is there enough room ? */
595 if (size < ((ps->next_free + 1) * store->chunk_size)) 630 if (size < ((ps->next_free + 1) * store->chunk_size))
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
611} 646}
612 647
613static void persistent_commit_exception(struct dm_exception_store *store, 648static void persistent_commit_exception(struct dm_exception_store *store,
614 struct dm_snap_exception *e, 649 struct dm_exception *e,
615 void (*callback) (void *, int success), 650 void (*callback) (void *, int success),
616 void *callback_context) 651 void *callback_context)
617{ 652{
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
672 ps->callback_count = 0; 707 ps->callback_count = 0;
673} 708}
674 709
710static int persistent_prepare_merge(struct dm_exception_store *store,
711 chunk_t *last_old_chunk,
712 chunk_t *last_new_chunk)
713{
714 struct pstore *ps = get_info(store);
715 struct disk_exception de;
716 int nr_consecutive;
717 int r;
718
719 /*
720 * When current area is empty, move back to preceding area.
721 */
722 if (!ps->current_committed) {
723 /*
724 * Have we finished?
725 */
726 if (!ps->current_area)
727 return 0;
728
729 ps->current_area--;
730 r = area_io(ps, READ);
731 if (r < 0)
732 return r;
733 ps->current_committed = ps->exceptions_per_area;
734 }
735
736 read_exception(ps, ps->current_committed - 1, &de);
737 *last_old_chunk = de.old_chunk;
738 *last_new_chunk = de.new_chunk;
739
740 /*
741 * Find number of consecutive chunks within the current area,
742 * working backwards.
743 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
747 &de);
748 if (de.old_chunk != *last_old_chunk - nr_consecutive ||
749 de.new_chunk != *last_new_chunk - nr_consecutive)
750 break;
751 }
752
753 return nr_consecutive;
754}
755
756static int persistent_commit_merge(struct dm_exception_store *store,
757 int nr_merged)
758{
759 int r, i;
760 struct pstore *ps = get_info(store);
761
762 BUG_ON(nr_merged > ps->current_committed);
763
764 for (i = 0; i < nr_merged; i++)
765 clear_exception(ps, ps->current_committed - 1 - i);
766
767 r = area_io(ps, WRITE);
768 if (r < 0)
769 return r;
770
771 ps->current_committed -= nr_merged;
772
773 /*
774 * At this stage, only persistent_usage() uses ps->next_free, so
775 * we make no attempt to keep ps->next_free strictly accurate
776 * as exceptions may have been committed out-of-order originally.
777 * Once a snapshot has become merging, we set it to the value it
778 * would have held had all the exceptions been committed in order.
779 *
780 * ps->current_area does not get reduced by prepare_merge() until
781 * after commit_merge() has removed the nr_merged previous exceptions.
782 */
783 ps->next_free = (area_location(ps, ps->current_area) - 1) +
784 (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
785
786 return 0;
787}
788
675static void persistent_drop_snapshot(struct dm_exception_store *store) 789static void persistent_drop_snapshot(struct dm_exception_store *store)
676{ 790{
677 struct pstore *ps = get_info(store); 791 struct pstore *ps = get_info(store);
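persistent_prepare_merge() above walks the current area backwards from the last committed exception, counting how many entries form a run whose old and new chunks both decrease by one per step; commit_merge() can then retire that whole run at once. A stand-alone C sketch of the backwards scan, where an in-memory array replaces the on-disk area:

#include <stdio.h>

struct disk_exception { unsigned long long old_chunk, new_chunk; };

static int count_consecutive(const struct disk_exception *area,
			     int current_committed)
{
	const struct disk_exception *last = &area[current_committed - 1];
	int nr = 1;

	while (nr < current_committed) {
		const struct disk_exception *de =
			&area[current_committed - 1 - nr];

		/* stop as soon as either chunk number breaks the run */
		if (de->old_chunk != last->old_chunk - nr ||
		    de->new_chunk != last->new_chunk - nr)
			break;
		nr++;
	}
	return nr;
}

int main(void)
{
	struct disk_exception area[] = {
		{ 10, 40 }, { 11, 41 }, { 12, 42 }, { 20, 43 }, { 21, 44 },
	};

	/* Only the last two entries (20/43, 21/44) form a consecutive run. */
	printf("%d\n", count_consecutive(area, 5));	/* prints 2 */
	return 0;
}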
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
697 ps->area = NULL; 811 ps->area = NULL;
698 ps->zero_area = NULL; 812 ps->zero_area = NULL;
699 ps->header_area = NULL; 813 ps->header_area = NULL;
700 ps->next_free = 2; /* skipping the header and first area */ 814 ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
701 ps->current_committed = 0; 815 ps->current_committed = 0;
702 816
703 ps->callback_count = 0; 817 ps->callback_count = 0;
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
726 case STATUSTYPE_INFO: 840 case STATUSTYPE_INFO:
727 break; 841 break;
728 case STATUSTYPE_TABLE: 842 case STATUSTYPE_TABLE:
729 DMEMIT(" %s P %llu", store->cow->name, 843 DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
730 (unsigned long long)store->chunk_size);
731 } 844 }
732 845
733 return sz; 846 return sz;
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
741 .read_metadata = persistent_read_metadata, 854 .read_metadata = persistent_read_metadata,
742 .prepare_exception = persistent_prepare_exception, 855 .prepare_exception = persistent_prepare_exception,
743 .commit_exception = persistent_commit_exception, 856 .commit_exception = persistent_commit_exception,
857 .prepare_merge = persistent_prepare_merge,
858 .commit_merge = persistent_commit_merge,
744 .drop_snapshot = persistent_drop_snapshot, 859 .drop_snapshot = persistent_drop_snapshot,
745 .fraction_full = persistent_fraction_full, 860 .usage = persistent_usage,
746 .status = persistent_status, 861 .status = persistent_status,
747}; 862};
748 863
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
754 .read_metadata = persistent_read_metadata, 869 .read_metadata = persistent_read_metadata,
755 .prepare_exception = persistent_prepare_exception, 870 .prepare_exception = persistent_prepare_exception,
756 .commit_exception = persistent_commit_exception, 871 .commit_exception = persistent_commit_exception,
872 .prepare_merge = persistent_prepare_merge,
873 .commit_merge = persistent_commit_merge,
757 .drop_snapshot = persistent_drop_snapshot, 874 .drop_snapshot = persistent_drop_snapshot,
758 .fraction_full = persistent_fraction_full, 875 .usage = persistent_usage,
759 .status = persistent_status, 876 .status = persistent_status,
760}; 877};
761 878
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index cde5aa558e6d..a0898a66a2f8 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
36} 36}
37 37
38static int transient_prepare_exception(struct dm_exception_store *store, 38static int transient_prepare_exception(struct dm_exception_store *store,
39 struct dm_snap_exception *e) 39 struct dm_exception *e)
40{ 40{
41 struct transient_c *tc = store->context; 41 struct transient_c *tc = store->context;
42 sector_t size = get_dev_size(store->cow->bdev); 42 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
43 43
44 if (size < (tc->next_free + store->chunk_size)) 44 if (size < (tc->next_free + store->chunk_size))
45 return -1; 45 return -1;
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
51} 51}
52 52
53static void transient_commit_exception(struct dm_exception_store *store, 53static void transient_commit_exception(struct dm_exception_store *store,
54 struct dm_snap_exception *e, 54 struct dm_exception *e,
55 void (*callback) (void *, int success), 55 void (*callback) (void *, int success),
56 void *callback_context) 56 void *callback_context)
57{ 57{
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
59 callback(callback_context, 1); 59 callback(callback_context, 1);
60} 60}
61 61
62static void transient_fraction_full(struct dm_exception_store *store, 62static void transient_usage(struct dm_exception_store *store,
63 sector_t *numerator, sector_t *denominator) 63 sector_t *total_sectors,
64 sector_t *sectors_allocated,
65 sector_t *metadata_sectors)
64{ 66{
65 *numerator = ((struct transient_c *) store->context)->next_free; 67 *sectors_allocated = ((struct transient_c *) store->context)->next_free;
66 *denominator = get_dev_size(store->cow->bdev); 68 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
69 *metadata_sectors = 0;
67} 70}
68 71
69static int transient_ctr(struct dm_exception_store *store, 72static int transient_ctr(struct dm_exception_store *store,
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
91 case STATUSTYPE_INFO: 94 case STATUSTYPE_INFO:
92 break; 95 break;
93 case STATUSTYPE_TABLE: 96 case STATUSTYPE_TABLE:
94 DMEMIT(" %s N %llu", store->cow->name, 97 DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
95 (unsigned long long)store->chunk_size);
96 } 98 }
97 99
98 return sz; 100 return sz;
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
106 .read_metadata = transient_read_metadata, 108 .read_metadata = transient_read_metadata,
107 .prepare_exception = transient_prepare_exception, 109 .prepare_exception = transient_prepare_exception,
108 .commit_exception = transient_commit_exception, 110 .commit_exception = transient_commit_exception,
109 .fraction_full = transient_fraction_full, 111 .usage = transient_usage,
110 .status = transient_status, 112 .status = transient_status,
111}; 113};
112 114
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
118 .read_metadata = transient_read_metadata, 120 .read_metadata = transient_read_metadata,
119 .prepare_exception = transient_prepare_exception, 121 .prepare_exception = transient_prepare_exception,
120 .commit_exception = transient_commit_exception, 122 .commit_exception = transient_commit_exception,
121 .fraction_full = transient_fraction_full, 123 .usage = transient_usage,
122 .status = transient_status, 124 .status = transient_status,
123}; 125};
124 126
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 57f1bf7f3b7a..54853773510c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -25,6 +25,11 @@
25 25
26#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
27 27
28static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30#define dm_target_is_snapshot_merge(ti) \
31 ((ti)->type->name == dm_snapshot_merge_target_name)
32
28/* 33/*
29 * The percentage increment we will wake up users at 34 * The percentage increment we will wake up users at
30 */ 35 */
@@ -49,7 +54,7 @@
49#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 54#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
50 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 55 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
51 56
52struct exception_table { 57struct dm_exception_table {
53 uint32_t hash_mask; 58 uint32_t hash_mask;
54 unsigned hash_shift; 59 unsigned hash_shift;
55 struct list_head *table; 60 struct list_head *table;
@@ -59,22 +64,31 @@ struct dm_snapshot {
59 struct rw_semaphore lock; 64 struct rw_semaphore lock;
60 65
61 struct dm_dev *origin; 66 struct dm_dev *origin;
67 struct dm_dev *cow;
68
69 struct dm_target *ti;
62 70
63 /* List of snapshots per Origin */ 71 /* List of snapshots per Origin */
64 struct list_head list; 72 struct list_head list;
65 73
66 /* You can't use a snapshot if this is 0 (e.g. if full) */ 74 /*
75 * You can't use a snapshot if this is 0 (e.g. if full).
76 * A snapshot-merge target never clears this.
77 */
67 int valid; 78 int valid;
68 79
69 /* Origin writes don't trigger exceptions until this is set */ 80 /* Origin writes don't trigger exceptions until this is set */
70 int active; 81 int active;
71 82
72 mempool_t *pending_pool; 83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
73 85
74 atomic_t pending_exceptions_count; 86 atomic_t pending_exceptions_count;
75 87
76 struct exception_table pending; 88 mempool_t *pending_pool;
77 struct exception_table complete; 89
90 struct dm_exception_table pending;
91 struct dm_exception_table complete;
78 92
79 /* 93 /*
80 * pe_lock protects all pending_exception operations and access 94 * pe_lock protects all pending_exception operations and access
@@ -82,6 +96,11 @@ struct dm_snapshot {
82 */ 96 */
83 spinlock_t pe_lock; 97 spinlock_t pe_lock;
84 98
99 /* Chunks with outstanding reads */
100 spinlock_t tracked_chunk_lock;
101 mempool_t *tracked_chunk_pool;
102 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
103
85 /* The on disk metadata handler */ 104 /* The on disk metadata handler */
86 struct dm_exception_store *store; 105 struct dm_exception_store *store;
87 106
@@ -91,12 +110,50 @@ struct dm_snapshot {
91 struct bio_list queued_bios; 110 struct bio_list queued_bios;
92 struct work_struct queued_bios_work; 111 struct work_struct queued_bios_work;
93 112
94 /* Chunks with outstanding reads */ 113 /* Wait for events based on state_bits */
95 mempool_t *tracked_chunk_pool; 114 unsigned long state_bits;
96 spinlock_t tracked_chunk_lock; 115
97 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 116 /* Range of chunks currently being merged. */
117 chunk_t first_merging_chunk;
118 int num_merging_chunks;
119
120 /*
121 * The merge operation failed if this flag is set.
122 * Failure modes are handled as follows:
123 * - I/O error reading the header
124 * => don't load the target; abort.
125 * - Header does not have "valid" flag set
126 * => use the origin; forget about the snapshot.
127 * - I/O error when reading exceptions
128 * => don't load the target; abort.
129 * (We can't use the intermediate origin state.)
130 * - I/O error while merging
131 * => stop merging; set merge_failed; process I/O normally.
132 */
133 int merge_failed;
134
135 /*
136 * Incoming bios that overlap with chunks being merged must wait
137 * for them to be committed.
138 */
139 struct bio_list bios_queued_during_merge;
98}; 140};
99 141
142/*
143 * state_bits:
144 * RUNNING_MERGE - Merge operation is in progress.
145 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
146 * cleared afterwards.
147 */
148#define RUNNING_MERGE 0
149#define SHUTDOWN_MERGE 1
150
151struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
152{
153 return s->cow;
154}
155EXPORT_SYMBOL(dm_snap_cow);
156
100static struct workqueue_struct *ksnapd; 157static struct workqueue_struct *ksnapd;
101static void flush_queued_bios(struct work_struct *work); 158static void flush_queued_bios(struct work_struct *work);
102 159
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
116} 173}
117 174
118struct dm_snap_pending_exception { 175struct dm_snap_pending_exception {
119 struct dm_snap_exception e; 176 struct dm_exception e;
120 177
121 /* 178 /*
122 * Origin buffers waiting for this to complete are held 179 * Origin buffers waiting for this to complete are held
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception {
125 struct bio_list origin_bios; 182 struct bio_list origin_bios;
126 struct bio_list snapshot_bios; 183 struct bio_list snapshot_bios;
127 184
128 /*
129 * Short-term queue of pending exceptions prior to submission.
130 */
131 struct list_head list;
132
133 /*
134 * The primary pending_exception is the one that holds
135 * the ref_count and the list of origin_bios for a
136 * group of pending_exceptions. It is always last to get freed.
137 * These fields get set up when writing to the origin.
138 */
139 struct dm_snap_pending_exception *primary_pe;
140
141 /*
142 * Number of pending_exceptions processing this chunk.
143 * When this drops to zero we must complete the origin bios.
144 * If incrementing or decrementing this, hold pe->snap->lock for
145 * the sibling concerned and not pe->primary_pe->snap->lock unless
146 * they are the same.
147 */
148 atomic_t ref_count;
149
150 /* Pointer back to snapshot context */ 185 /* Pointer back to snapshot context */
151 struct dm_snapshot *snap; 186 struct dm_snapshot *snap;
152 187
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
222} 257}
223 258
224/* 259/*
260 * This conflicting I/O is extremely improbable in the caller,
261 * so msleep(1) is sufficient and there is no need for a wait queue.
262 */
263static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
264{
265 while (__chunk_is_tracked(s, chunk))
266 msleep(1);
267}
268
269/*
225 * One of these per registered origin, held in the snapshot_origins hash 270 * One of these per registered origin, held in the snapshot_origins hash
226 */ 271 */
227struct origin { 272struct origin {
@@ -243,6 +288,10 @@ struct origin {
243static struct list_head *_origins; 288static struct list_head *_origins;
244static struct rw_semaphore _origins_lock; 289static struct rw_semaphore _origins_lock;
245 290
291static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
292static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
293static uint64_t _pending_exceptions_done_count;
294
246static int init_origin_hash(void) 295static int init_origin_hash(void)
247{ 296{
248 int i; 297 int i;
@@ -291,21 +340,144 @@ static void __insert_origin(struct origin *o)
291} 340}
292 341
293/* 342/*
343 * _origins_lock must be held when calling this function.
344 * Returns number of snapshots registered using the supplied cow device, plus:
345 * snap_src - a snapshot suitable for use as a source of exception handover
346 * snap_dest - a snapshot capable of receiving exception handover.
347 * snap_merge - an existing snapshot-merge target linked to the same origin.
348 * There can be at most one snapshot-merge target. The parameter is optional.
349 *
350 * Possible return values and states of snap_src and snap_dest.
351 * 0: NULL, NULL - first new snapshot
352 * 1: snap_src, NULL - normal snapshot
353 * 2: snap_src, snap_dest - waiting for handover
354 * 2: snap_src, NULL - handed over, waiting for old to be deleted
355 * 1: NULL, snap_dest - source got destroyed without handover
356 */
357static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
358 struct dm_snapshot **snap_src,
359 struct dm_snapshot **snap_dest,
360 struct dm_snapshot **snap_merge)
361{
362 struct dm_snapshot *s;
363 struct origin *o;
364 int count = 0;
365 int active;
366
367 o = __lookup_origin(snap->origin->bdev);
368 if (!o)
369 goto out;
370
371 list_for_each_entry(s, &o->snapshots, list) {
372 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
373 *snap_merge = s;
374 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
375 continue;
376
377 down_read(&s->lock);
378 active = s->active;
379 up_read(&s->lock);
380
381 if (active) {
382 if (snap_src)
383 *snap_src = s;
384 } else if (snap_dest)
385 *snap_dest = s;
386
387 count++;
388 }
389
390out:
391 return count;
392}
393
394/*
395 * On success, returns 1 if this snapshot is a handover destination,
396 * otherwise returns 0.
397 */
398static int __validate_exception_handover(struct dm_snapshot *snap)
399{
400 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
401 struct dm_snapshot *snap_merge = NULL;
402
403 /* Does snapshot need exceptions handed over to it? */
404 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
405 &snap_merge) == 2) ||
406 snap_dest) {
407 snap->ti->error = "Snapshot cow pairing for exception "
408 "table handover failed";
409 return -EINVAL;
410 }
411
412 /*
413 * If no snap_src was found, snap cannot become a handover
414 * destination.
415 */
416 if (!snap_src)
417 return 0;
418
419 /*
420 * Non-snapshot-merge handover?
421 */
422 if (!dm_target_is_snapshot_merge(snap->ti))
423 return 1;
424
425 /*
426 * Do not allow more than one merging snapshot.
427 */
428 if (snap_merge) {
429 snap->ti->error = "A snapshot is already merging.";
430 return -EINVAL;
431 }
432
433 if (!snap_src->store->type->prepare_merge ||
434 !snap_src->store->type->commit_merge) {
435 snap->ti->error = "Snapshot exception store does not "
436 "support snapshot-merge.";
437 return -EINVAL;
438 }
439
440 return 1;
441}
442
443static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
444{
445 struct dm_snapshot *l;
446
447 /* Sort the list according to chunk size, largest-first smallest-last */
448 list_for_each_entry(l, &o->snapshots, list)
449 if (l->store->chunk_size < s->store->chunk_size)
450 break;
451 list_add_tail(&s->list, &l->list);
452}
453
454/*
294 * Make a note of the snapshot and its origin so we can look it 455 * Make a note of the snapshot and its origin so we can look it
295 * up when the origin has a write on it. 456 * up when the origin has a write on it.
457 *
458 * Also validate snapshot exception store handovers.
459 * On success, returns 1 if this registration is a handover destination,
460 * otherwise returns 0.
296 */ 461 */
297static int register_snapshot(struct dm_snapshot *snap) 462static int register_snapshot(struct dm_snapshot *snap)
298{ 463{
299 struct origin *o, *new_o; 464 struct origin *o, *new_o = NULL;
300 struct block_device *bdev = snap->origin->bdev; 465 struct block_device *bdev = snap->origin->bdev;
466 int r = 0;
301 467
302 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 468 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
303 if (!new_o) 469 if (!new_o)
304 return -ENOMEM; 470 return -ENOMEM;
305 471
306 down_write(&_origins_lock); 472 down_write(&_origins_lock);
307 o = __lookup_origin(bdev);
308 473
474 r = __validate_exception_handover(snap);
475 if (r < 0) {
476 kfree(new_o);
477 goto out;
478 }
479
480 o = __lookup_origin(bdev);
309 if (o) 481 if (o)
310 kfree(new_o); 482 kfree(new_o);
311 else { 483 else {
@@ -319,10 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap)
319 __insert_origin(o); 491 __insert_origin(o);
320 } 492 }
321 493
322 list_add_tail(&snap->list, &o->snapshots); 494 __insert_snapshot(o, snap);
495
496out:
497 up_write(&_origins_lock);
498
499 return r;
500}
501
502/*
503 * Move snapshot to correct place in list according to chunk size.
504 */
505static void reregister_snapshot(struct dm_snapshot *s)
506{
507 struct block_device *bdev = s->origin->bdev;
508
509 down_write(&_origins_lock);
510
511 list_del(&s->list);
512 __insert_snapshot(__lookup_origin(bdev), s);
323 513
324 up_write(&_origins_lock); 514 up_write(&_origins_lock);
325 return 0;
326} 515}
327 516
328static void unregister_snapshot(struct dm_snapshot *s) 517static void unregister_snapshot(struct dm_snapshot *s)
@@ -333,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
333 o = __lookup_origin(s->origin->bdev); 522 o = __lookup_origin(s->origin->bdev);
334 523
335 list_del(&s->list); 524 list_del(&s->list);
336 if (list_empty(&o->snapshots)) { 525 if (o && list_empty(&o->snapshots)) {
337 list_del(&o->hash_list); 526 list_del(&o->hash_list);
338 kfree(o); 527 kfree(o);
339 } 528 }
@@ -346,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s)
346 * The lowest hash_shift bits of the chunk number are ignored, allowing 535 * The lowest hash_shift bits of the chunk number are ignored, allowing
347 * some consecutive chunks to be grouped together. 536 * some consecutive chunks to be grouped together.
348 */ 537 */
349static int init_exception_table(struct exception_table *et, uint32_t size, 538static int dm_exception_table_init(struct dm_exception_table *et,
350 unsigned hash_shift) 539 uint32_t size, unsigned hash_shift)
351{ 540{
352 unsigned int i; 541 unsigned int i;
353 542
@@ -363,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
363 return 0; 552 return 0;
364} 553}
365 554
366static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) 555static void dm_exception_table_exit(struct dm_exception_table *et,
556 struct kmem_cache *mem)
367{ 557{
368 struct list_head *slot; 558 struct list_head *slot;
369 struct dm_snap_exception *ex, *next; 559 struct dm_exception *ex, *next;
370 int i, size; 560 int i, size;
371 561
372 size = et->hash_mask + 1; 562 size = et->hash_mask + 1;
@@ -380,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
380 vfree(et->table); 570 vfree(et->table);
381} 571}
382 572
383static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 573static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
384{ 574{
385 return (chunk >> et->hash_shift) & et->hash_mask; 575 return (chunk >> et->hash_shift) & et->hash_mask;
386} 576}
387 577
388static void insert_exception(struct exception_table *eh, 578static void dm_remove_exception(struct dm_exception *e)
389 struct dm_snap_exception *e)
390{
391 struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
392 list_add(&e->hash_list, l);
393}
394
395static void remove_exception(struct dm_snap_exception *e)
396{ 579{
397 list_del(&e->hash_list); 580 list_del(&e->hash_list);
398} 581}
@@ -401,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e)
401 * Return the exception data for a sector, or NULL if not 584 * Return the exception data for a sector, or NULL if not
402 * remapped. 585 * remapped.
403 */ 586 */
404static struct dm_snap_exception *lookup_exception(struct exception_table *et, 587static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
405 chunk_t chunk) 588 chunk_t chunk)
406{ 589{
407 struct list_head *slot; 590 struct list_head *slot;
408 struct dm_snap_exception *e; 591 struct dm_exception *e;
409 592
410 slot = &et->table[exception_hash(et, chunk)]; 593 slot = &et->table[exception_hash(et, chunk)];
411 list_for_each_entry (e, slot, hash_list) 594 list_for_each_entry (e, slot, hash_list)
@@ -416,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
416 return NULL; 599 return NULL;
417} 600}
418 601
419static struct dm_snap_exception *alloc_exception(void) 602static struct dm_exception *alloc_completed_exception(void)
420{ 603{
421 struct dm_snap_exception *e; 604 struct dm_exception *e;
422 605
423 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 606 e = kmem_cache_alloc(exception_cache, GFP_NOIO);
424 if (!e) 607 if (!e)
@@ -427,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void)
427 return e; 610 return e;
428} 611}
429 612
430static void free_exception(struct dm_snap_exception *e) 613static void free_completed_exception(struct dm_exception *e)
431{ 614{
432 kmem_cache_free(exception_cache, e); 615 kmem_cache_free(exception_cache, e);
433} 616}
@@ -452,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
452 atomic_dec(&s->pending_exceptions_count); 635 atomic_dec(&s->pending_exceptions_count);
453} 636}
454 637
455static void insert_completed_exception(struct dm_snapshot *s, 638static void dm_insert_exception(struct dm_exception_table *eh,
456 struct dm_snap_exception *new_e) 639 struct dm_exception *new_e)
457{ 640{
458 struct exception_table *eh = &s->complete;
459 struct list_head *l; 641 struct list_head *l;
460 struct dm_snap_exception *e = NULL; 642 struct dm_exception *e = NULL;
461 643
462 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 644 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
463 645
@@ -473,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
473 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 655 new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
474 dm_consecutive_chunk_count(e) + 1)) { 656 dm_consecutive_chunk_count(e) + 1)) {
475 dm_consecutive_chunk_count_inc(e); 657 dm_consecutive_chunk_count_inc(e);
476 free_exception(new_e); 658 free_completed_exception(new_e);
477 return; 659 return;
478 } 660 }
479 661
@@ -483,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
483 dm_consecutive_chunk_count_inc(e); 665 dm_consecutive_chunk_count_inc(e);
484 e->old_chunk--; 666 e->old_chunk--;
485 e->new_chunk--; 667 e->new_chunk--;
486 free_exception(new_e); 668 free_completed_exception(new_e);
487 return; 669 return;
488 } 670 }
489 671
@@ -502,9 +684,9 @@ out:
502static int dm_add_exception(void *context, chunk_t old, chunk_t new) 684static int dm_add_exception(void *context, chunk_t old, chunk_t new)
503{ 685{
504 struct dm_snapshot *s = context; 686 struct dm_snapshot *s = context;
505 struct dm_snap_exception *e; 687 struct dm_exception *e;
506 688
507 e = alloc_exception(); 689 e = alloc_completed_exception();
508 if (!e) 690 if (!e)
509 return -ENOMEM; 691 return -ENOMEM;
510 692
@@ -513,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
513 /* Consecutive_count is implicitly initialised to zero */ 695 /* Consecutive_count is implicitly initialised to zero */
514 e->new_chunk = new; 696 e->new_chunk = new;
515 697
516 insert_completed_exception(s, e); 698 dm_insert_exception(&s->complete, e);
517 699
518 return 0; 700 return 0;
519} 701}
520 702
703#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
704
705/*
706 * Return a minimum chunk size of all snapshots that have the specified origin.
707 * Return zero if the origin has no snapshots.
708 */
709static sector_t __minimum_chunk_size(struct origin *o)
710{
711 struct dm_snapshot *snap;
712 unsigned chunk_size = 0;
713
714 if (o)
715 list_for_each_entry(snap, &o->snapshots, list)
716 chunk_size = min_not_zero(chunk_size,
717 snap->store->chunk_size);
718
719 return chunk_size;
720}
721
521/* 722/*
522 * Hard coded magic. 723 * Hard coded magic.
523 */ 724 */
@@ -541,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s)
541 * Calculate based on the size of the original volume or 742 * Calculate based on the size of the original volume or
542 * the COW volume... 743 * the COW volume...
543 */ 744 */
544 cow_dev_size = get_dev_size(s->store->cow->bdev); 745 cow_dev_size = get_dev_size(s->cow->bdev);
545 origin_dev_size = get_dev_size(s->origin->bdev); 746 origin_dev_size = get_dev_size(s->origin->bdev);
546 max_buckets = calc_max_buckets(); 747 max_buckets = calc_max_buckets();
547 748
548 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; 749 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
549 hash_size = min(hash_size, max_buckets); 750 hash_size = min(hash_size, max_buckets);
550 751
752 if (hash_size < 64)
753 hash_size = 64;
551 hash_size = rounddown_pow_of_two(hash_size); 754 hash_size = rounddown_pow_of_two(hash_size);
552 if (init_exception_table(&s->complete, hash_size, 755 if (dm_exception_table_init(&s->complete, hash_size,
553 DM_CHUNK_CONSECUTIVE_BITS)) 756 DM_CHUNK_CONSECUTIVE_BITS))
554 return -ENOMEM; 757 return -ENOMEM;
555 758
556 /* 759 /*
@@ -561,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s)
561 if (hash_size < 64) 764 if (hash_size < 64)
562 hash_size = 64; 765 hash_size = 64;
563 766
564 if (init_exception_table(&s->pending, hash_size, 0)) { 767 if (dm_exception_table_init(&s->pending, hash_size, 0)) {
565 exit_exception_table(&s->complete, exception_cache); 768 dm_exception_table_exit(&s->complete, exception_cache);
566 return -ENOMEM; 769 return -ENOMEM;
567 } 770 }
568 771
569 return 0; 772 return 0;
570} 773}
571 774
775static void merge_shutdown(struct dm_snapshot *s)
776{
777 clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
778 smp_mb__after_clear_bit();
779 wake_up_bit(&s->state_bits, RUNNING_MERGE);
780}
781
782static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
783{
784 s->first_merging_chunk = 0;
785 s->num_merging_chunks = 0;
786
787 return bio_list_get(&s->bios_queued_during_merge);
788}
789
790/*
791 * Remove one chunk from the index of completed exceptions.
792 */
793static int __remove_single_exception_chunk(struct dm_snapshot *s,
794 chunk_t old_chunk)
795{
796 struct dm_exception *e;
797
798 e = dm_lookup_exception(&s->complete, old_chunk);
799 if (!e) {
800 DMERR("Corruption detected: exception for block %llu is "
801 "on disk but not in memory",
802 (unsigned long long)old_chunk);
803 return -EINVAL;
804 }
805
806 /*
807 * If this is the only chunk using this exception, remove exception.
808 */
809 if (!dm_consecutive_chunk_count(e)) {
810 dm_remove_exception(e);
811 free_completed_exception(e);
812 return 0;
813 }
814
815 /*
816 * The chunk may be either at the beginning or the end of a
817 * group of consecutive chunks - never in the middle. We are
818 * removing chunks in the opposite order to that in which they
819 * were added, so this should always be true.
820 * Decrement the consecutive chunk counter and adjust the
821 * starting point if necessary.
822 */
823 if (old_chunk == e->old_chunk) {
824 e->old_chunk++;
825 e->new_chunk++;
826 } else if (old_chunk != e->old_chunk +
827 dm_consecutive_chunk_count(e)) {
828 DMERR("Attempt to merge block %llu from the "
829 "middle of a chunk range [%llu - %llu]",
830 (unsigned long long)old_chunk,
831 (unsigned long long)e->old_chunk,
832 (unsigned long long)
833 e->old_chunk + dm_consecutive_chunk_count(e));
834 return -EINVAL;
835 }
836
837 dm_consecutive_chunk_count_dec(e);
838
839 return 0;
840}
841
842static void flush_bios(struct bio *bio);
843
844static int remove_single_exception_chunk(struct dm_snapshot *s)
845{
846 struct bio *b = NULL;
847 int r;
848 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
849
850 down_write(&s->lock);
851
852 /*
853 * Process chunks (and associated exceptions) in reverse order
854 * so that dm_consecutive_chunk_count_dec() accounting works.
855 */
856 do {
857 r = __remove_single_exception_chunk(s, old_chunk);
858 if (r)
859 goto out;
860 } while (old_chunk-- > s->first_merging_chunk);
861
862 b = __release_queued_bios_after_merge(s);
863
864out:
865 up_write(&s->lock);
866 if (b)
867 flush_bios(b);
868
869 return r;
870}
871
872static int origin_write_extent(struct dm_snapshot *merging_snap,
873 sector_t sector, unsigned chunk_size);
874
875static void merge_callback(int read_err, unsigned long write_err,
876 void *context);
877
878static uint64_t read_pending_exceptions_done_count(void)
879{
880 uint64_t pending_exceptions_done;
881
882 spin_lock(&_pending_exceptions_done_spinlock);
883 pending_exceptions_done = _pending_exceptions_done_count;
884 spin_unlock(&_pending_exceptions_done_spinlock);
885
886 return pending_exceptions_done;
887}
888
889static void increment_pending_exceptions_done_count(void)
890{
891 spin_lock(&_pending_exceptions_done_spinlock);
892 _pending_exceptions_done_count++;
893 spin_unlock(&_pending_exceptions_done_spinlock);
894
895 wake_up_all(&_pending_exceptions_done);
896}
897
898static void snapshot_merge_next_chunks(struct dm_snapshot *s)
899{
900 int i, linear_chunks;
901 chunk_t old_chunk, new_chunk;
902 struct dm_io_region src, dest;
903 sector_t io_size;
904 uint64_t previous_count;
905
906 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
907 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
908 goto shut;
909
910 /*
911 * valid flag never changes during merge, so no lock required.
912 */
913 if (!s->valid) {
914 DMERR("Snapshot is invalid: can't merge");
915 goto shut;
916 }
917
918 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
919 &new_chunk);
920 if (linear_chunks <= 0) {
921 if (linear_chunks < 0) {
922 DMERR("Read error in exception store: "
923 "shutting down merge");
924 down_write(&s->lock);
925 s->merge_failed = 1;
926 up_write(&s->lock);
927 }
928 goto shut;
929 }
930
931 /* Adjust old_chunk and new_chunk to reflect start of linear region */
932 old_chunk = old_chunk + 1 - linear_chunks;
933 new_chunk = new_chunk + 1 - linear_chunks;
934
935 /*
936 * Use one (potentially large) I/O to copy all 'linear_chunks'
937 * from the exception store to the origin
938 */
939 io_size = linear_chunks * s->store->chunk_size;
940
941 dest.bdev = s->origin->bdev;
942 dest.sector = chunk_to_sector(s->store, old_chunk);
943 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
944
945 src.bdev = s->cow->bdev;
946 src.sector = chunk_to_sector(s->store, new_chunk);
947 src.count = dest.count;
948
949 /*
950 * Reallocate any exceptions needed in other snapshots then
951 * wait for the pending exceptions to complete.
952 * Each time any pending exception (globally on the system)
953 * completes we are woken and repeat the process to find out
954 * if we can proceed. While this may not seem a particularly
955 * efficient algorithm, it is not expected to have any
956 * significant impact on performance.
957 */
958 previous_count = read_pending_exceptions_done_count();
959 while (origin_write_extent(s, dest.sector, io_size)) {
960 wait_event(_pending_exceptions_done,
961 (read_pending_exceptions_done_count() !=
962 previous_count));
963 /* Retry after the wait, until all exceptions are done. */
964 previous_count = read_pending_exceptions_done_count();
965 }
966
967 down_write(&s->lock);
968 s->first_merging_chunk = old_chunk;
969 s->num_merging_chunks = linear_chunks;
970 up_write(&s->lock);
971
972 /* Wait until writes to all 'linear_chunks' drain */
973 for (i = 0; i < linear_chunks; i++)
974 __check_for_conflicting_io(s, old_chunk + i);
975
976 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
977 return;
978
979shut:
980 merge_shutdown(s);
981}
982
983static void error_bios(struct bio *bio);
984
985static void merge_callback(int read_err, unsigned long write_err, void *context)
986{
987 struct dm_snapshot *s = context;
988 struct bio *b = NULL;
989
990 if (read_err || write_err) {
991 if (read_err)
992 DMERR("Read error: shutting down merge.");
993 else
994 DMERR("Write error: shutting down merge.");
995 goto shut;
996 }
997
998 if (s->store->type->commit_merge(s->store,
999 s->num_merging_chunks) < 0) {
1000 DMERR("Write error in exception store: shutting down merge");
1001 goto shut;
1002 }
1003
1004 if (remove_single_exception_chunk(s) < 0)
1005 goto shut;
1006
1007 snapshot_merge_next_chunks(s);
1008
1009 return;
1010
1011shut:
1012 down_write(&s->lock);
1013 s->merge_failed = 1;
1014 b = __release_queued_bios_after_merge(s);
1015 up_write(&s->lock);
1016 error_bios(b);
1017
1018 merge_shutdown(s);
1019}
1020
1021static void start_merge(struct dm_snapshot *s)
1022{
1023 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1024 snapshot_merge_next_chunks(s);
1025}
1026
1027static int wait_schedule(void *ptr)
1028{
1029 schedule();
1030
1031 return 0;
1032}
1033
1034/*
1035 * Stop the merging process and wait until it finishes.
1036 */
1037static void stop_merge(struct dm_snapshot *s)
1038{
1039 set_bit(SHUTDOWN_MERGE, &s->state_bits);
1040 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
1041 TASK_UNINTERRUPTIBLE);
1042 clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1043}
1044
572/* 1045/*
573 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 1046 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
574 */ 1047 */
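In device-mapper table terms the four arguments are the origin device, the COW device, the persistence flag and the chunk size in 512-byte sectors, so a hypothetical snapshot target line looks like "0 409600 snapshot /dev/vdb1 /dev/vdc1 P 16", and the merging variant introduced by this patch takes the same arguments under the "snapshot-merge" target name. As the constructor below shows, the merge variant additionally opens the origin for writing and reserves two flush requests, one routed to the origin and one to the COW device.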
@@ -577,50 +1050,72 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
577 struct dm_snapshot *s; 1050 struct dm_snapshot *s;
578 int i; 1051 int i;
579 int r = -EINVAL; 1052 int r = -EINVAL;
580 char *origin_path; 1053 char *origin_path, *cow_path;
581 struct dm_exception_store *store; 1054 unsigned args_used, num_flush_requests = 1;
582 unsigned args_used; 1055 fmode_t origin_mode = FMODE_READ;
583 1056
584 if (argc != 4) { 1057 if (argc != 4) {
585 ti->error = "requires exactly 4 arguments"; 1058 ti->error = "requires exactly 4 arguments";
586 r = -EINVAL; 1059 r = -EINVAL;
587 goto bad_args; 1060 goto bad;
1061 }
1062
1063 if (dm_target_is_snapshot_merge(ti)) {
1064 num_flush_requests = 2;
1065 origin_mode = FMODE_WRITE;
588 } 1066 }
589 1067
590 origin_path = argv[0]; 1068 origin_path = argv[0];
591 argv++; 1069 argv++;
592 argc--; 1070 argc--;
593 1071
594 r = dm_exception_store_create(ti, argc, argv, &args_used, &store); 1072 s = kmalloc(sizeof(*s), GFP_KERNEL);
1073 if (!s) {
1074 ti->error = "Cannot allocate snapshot context private "
1075 "structure";
1076 r = -ENOMEM;
1077 goto bad;
1078 }
1079
1080 cow_path = argv[0];
1081 argv++;
1082 argc--;
1083
1084 r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow);
1085 if (r) {
1086 ti->error = "Cannot get COW device";
1087 goto bad_cow;
1088 }
1089
1090 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
595 if (r) { 1091 if (r) {
596 ti->error = "Couldn't create exception store"; 1092 ti->error = "Couldn't create exception store";
597 r = -EINVAL; 1093 r = -EINVAL;
598 goto bad_args; 1094 goto bad_store;
599 } 1095 }
600 1096
601 argv += args_used; 1097 argv += args_used;
602 argc -= args_used; 1098 argc -= args_used;
603 1099
604 s = kmalloc(sizeof(*s), GFP_KERNEL); 1100 r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
605 if (!s) {
606 ti->error = "Cannot allocate snapshot context private "
607 "structure";
608 r = -ENOMEM;
609 goto bad_snap;
610 }
611
612 r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
613 if (r) { 1101 if (r) {
614 ti->error = "Cannot get origin device"; 1102 ti->error = "Cannot get origin device";
615 goto bad_origin; 1103 goto bad_origin;
616 } 1104 }
617 1105
618 s->store = store; 1106 s->ti = ti;
619 s->valid = 1; 1107 s->valid = 1;
620 s->active = 0; 1108 s->active = 0;
1109 s->suspended = 0;
621 atomic_set(&s->pending_exceptions_count, 0); 1110 atomic_set(&s->pending_exceptions_count, 0);
622 init_rwsem(&s->lock); 1111 init_rwsem(&s->lock);
1112 INIT_LIST_HEAD(&s->list);
623 spin_lock_init(&s->pe_lock); 1113 spin_lock_init(&s->pe_lock);
1114 s->state_bits = 0;
1115 s->merge_failed = 0;
1116 s->first_merging_chunk = 0;
1117 s->num_merging_chunks = 0;
1118 bio_list_init(&s->bios_queued_during_merge);
624 1119
625 /* Allocate hash table for COW data */ 1120 /* Allocate hash table for COW data */
626 if (init_hash_tables(s)) { 1121 if (init_hash_tables(s)) {
@@ -654,34 +1149,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
654 1149
655 spin_lock_init(&s->tracked_chunk_lock); 1150 spin_lock_init(&s->tracked_chunk_lock);
656 1151
657 /* Metadata must only be loaded into one table at once */ 1152 bio_list_init(&s->queued_bios);
1153 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1154
1155 ti->private = s;
1156 ti->num_flush_requests = num_flush_requests;
1157
1158 /* Add snapshot to the list of snapshots for this origin */
1159 /* Exceptions aren't triggered till snapshot_resume() is called */
1160 r = register_snapshot(s);
1161 if (r == -ENOMEM) {
1162 ti->error = "Snapshot origin struct allocation failed";
1163 goto bad_load_and_register;
1164 } else if (r < 0) {
1165 /* invalid handover, register_snapshot has set ti->error */
1166 goto bad_load_and_register;
1167 }
1168
1169 /*
1170 * Metadata must only be loaded into one table at once, so skip this
1171 * if metadata will be handed over during resume.
1172 * Chunk size will be set during the handover - set it to zero to
1173 * ensure it's ignored.
1174 */
1175 if (r > 0) {
1176 s->store->chunk_size = 0;
1177 return 0;
1178 }
1179
658 r = s->store->type->read_metadata(s->store, dm_add_exception, 1180 r = s->store->type->read_metadata(s->store, dm_add_exception,
659 (void *)s); 1181 (void *)s);
660 if (r < 0) { 1182 if (r < 0) {
661 ti->error = "Failed to read snapshot metadata"; 1183 ti->error = "Failed to read snapshot metadata";
662 goto bad_load_and_register; 1184 goto bad_read_metadata;
663 } else if (r > 0) { 1185 } else if (r > 0) {
664 s->valid = 0; 1186 s->valid = 0;
665 DMWARN("Snapshot is marked invalid."); 1187 DMWARN("Snapshot is marked invalid.");
666 } 1188 }
667 1189
668 bio_list_init(&s->queued_bios); 1190 if (!s->store->chunk_size) {
669 INIT_WORK(&s->queued_bios_work, flush_queued_bios); 1191 ti->error = "Chunk size not set";
670 1192 goto bad_read_metadata;
671 /* Add snapshot to the list of snapshots for this origin */
672 /* Exceptions aren't triggered till snapshot_resume() is called */
673 if (register_snapshot(s)) {
674 r = -EINVAL;
675 ti->error = "Cannot register snapshot origin";
676 goto bad_load_and_register;
677 } 1193 }
678
679 ti->private = s;
680 ti->split_io = s->store->chunk_size; 1194 ti->split_io = s->store->chunk_size;
681 ti->num_flush_requests = 1;
682 1195
683 return 0; 1196 return 0;
684 1197
1198bad_read_metadata:
1199 unregister_snapshot(s);
1200
685bad_load_and_register: 1201bad_load_and_register:
686 mempool_destroy(s->tracked_chunk_pool); 1202 mempool_destroy(s->tracked_chunk_pool);
687 1203
@@ -692,19 +1208,22 @@ bad_pending_pool:
692 dm_kcopyd_client_destroy(s->kcopyd_client); 1208 dm_kcopyd_client_destroy(s->kcopyd_client);
693 1209
694bad_kcopyd: 1210bad_kcopyd:
695 exit_exception_table(&s->pending, pending_cache); 1211 dm_exception_table_exit(&s->pending, pending_cache);
696 exit_exception_table(&s->complete, exception_cache); 1212 dm_exception_table_exit(&s->complete, exception_cache);
697 1213
698bad_hash_tables: 1214bad_hash_tables:
699 dm_put_device(ti, s->origin); 1215 dm_put_device(ti, s->origin);
700 1216
701bad_origin: 1217bad_origin:
702 kfree(s); 1218 dm_exception_store_destroy(s->store);
703 1219
704bad_snap: 1220bad_store:
705 dm_exception_store_destroy(store); 1221 dm_put_device(ti, s->cow);
1222
1223bad_cow:
1224 kfree(s);
706 1225
707bad_args: 1226bad:
708 return r; 1227 return r;
709} 1228}
710 1229
@@ -713,8 +1232,39 @@ static void __free_exceptions(struct dm_snapshot *s)
713 dm_kcopyd_client_destroy(s->kcopyd_client); 1232 dm_kcopyd_client_destroy(s->kcopyd_client);
714 s->kcopyd_client = NULL; 1233 s->kcopyd_client = NULL;
715 1234
716 exit_exception_table(&s->pending, pending_cache); 1235 dm_exception_table_exit(&s->pending, pending_cache);
717 exit_exception_table(&s->complete, exception_cache); 1236 dm_exception_table_exit(&s->complete, exception_cache);
1237}
1238
1239static void __handover_exceptions(struct dm_snapshot *snap_src,
1240 struct dm_snapshot *snap_dest)
1241{
1242 union {
1243 struct dm_exception_table table_swap;
1244 struct dm_exception_store *store_swap;
1245 } u;
1246
1247 /*
1248 * Swap all snapshot context information between the two instances.
1249 */
1250 u.table_swap = snap_dest->complete;
1251 snap_dest->complete = snap_src->complete;
1252 snap_src->complete = u.table_swap;
1253
1254 u.store_swap = snap_dest->store;
1255 snap_dest->store = snap_src->store;
1256 snap_src->store = u.store_swap;
1257
1258 snap_dest->store->snap = snap_dest;
1259 snap_src->store->snap = snap_src;
1260
1261 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1262 snap_dest->valid = snap_src->valid;
1263
1264 /*
1265 * Set source invalid to ensure it receives no further I/O.
1266 */
1267 snap_src->valid = 0;
718} 1268}
719 1269
720static void snapshot_dtr(struct dm_target *ti) 1270static void snapshot_dtr(struct dm_target *ti)
@@ -723,9 +1273,24 @@ static void snapshot_dtr(struct dm_target *ti)
723 int i; 1273 int i;
724#endif 1274#endif
725 struct dm_snapshot *s = ti->private; 1275 struct dm_snapshot *s = ti->private;
1276 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
726 1277
727 flush_workqueue(ksnapd); 1278 flush_workqueue(ksnapd);
728 1279
1280 down_read(&_origins_lock);
1281 /* Check whether exception handover must be cancelled */
1282 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1283 if (snap_src && snap_dest && (s == snap_src)) {
1284 down_write(&snap_dest->lock);
1285 snap_dest->valid = 0;
1286 up_write(&snap_dest->lock);
1287 DMERR("Cancelling snapshot handover.");
1288 }
1289 up_read(&_origins_lock);
1290
1291 if (dm_target_is_snapshot_merge(ti))
1292 stop_merge(s);
1293
729 /* Prevent further origin writes from using this snapshot. */ 1294 /* Prevent further origin writes from using this snapshot. */
730 /* After this returns there can be no new kcopyd jobs. */ 1295 /* After this returns there can be no new kcopyd jobs. */
731 unregister_snapshot(s); 1296 unregister_snapshot(s);
@@ -753,6 +1318,8 @@ static void snapshot_dtr(struct dm_target *ti)
753 1318
754 dm_exception_store_destroy(s->store); 1319 dm_exception_store_destroy(s->store);
755 1320
1321 dm_put_device(ti, s->cow);
1322
756 kfree(s); 1323 kfree(s);
757} 1324}
758 1325
@@ -785,6 +1352,26 @@ static void flush_queued_bios(struct work_struct *work)
785 flush_bios(queued_bios); 1352 flush_bios(queued_bios);
786} 1353}
787 1354
1355static int do_origin(struct dm_dev *origin, struct bio *bio);
1356
1357/*
1358 * Flush a list of buffers.
1359 */
1360static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1361{
1362 struct bio *n;
1363 int r;
1364
1365 while (bio) {
1366 n = bio->bi_next;
1367 bio->bi_next = NULL;
1368 r = do_origin(s->origin, bio);
1369 if (r == DM_MAPIO_REMAPPED)
1370 generic_make_request(bio);
1371 bio = n;
1372 }
1373}
1374
788/* 1375/*
789 * Error a list of buffers. 1376 * Error a list of buffers.
790 */ 1377 */
@@ -815,45 +1402,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
815 1402
816 s->valid = 0; 1403 s->valid = 0;
817 1404
818 dm_table_event(s->store->ti->table); 1405 dm_table_event(s->ti->table);
819}
820
821static void get_pending_exception(struct dm_snap_pending_exception *pe)
822{
823 atomic_inc(&pe->ref_count);
824}
825
826static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
827{
828 struct dm_snap_pending_exception *primary_pe;
829 struct bio *origin_bios = NULL;
830
831 primary_pe = pe->primary_pe;
832
833 /*
834 * If this pe is involved in a write to the origin and
835 * it is the last sibling to complete then release
836 * the bios for the original write to the origin.
837 */
838 if (primary_pe &&
839 atomic_dec_and_test(&primary_pe->ref_count)) {
840 origin_bios = bio_list_get(&primary_pe->origin_bios);
841 free_pending_exception(primary_pe);
842 }
843
844 /*
845 * Free the pe if it's not linked to an origin write or if
846 * it's not itself a primary pe.
847 */
848 if (!primary_pe || primary_pe != pe)
849 free_pending_exception(pe);
850
851 return origin_bios;
852} 1406}
853 1407
854static void pending_complete(struct dm_snap_pending_exception *pe, int success) 1408static void pending_complete(struct dm_snap_pending_exception *pe, int success)
855{ 1409{
856 struct dm_snap_exception *e; 1410 struct dm_exception *e;
857 struct dm_snapshot *s = pe->snap; 1411 struct dm_snapshot *s = pe->snap;
858 struct bio *origin_bios = NULL; 1412 struct bio *origin_bios = NULL;
859 struct bio *snapshot_bios = NULL; 1413 struct bio *snapshot_bios = NULL;
@@ -867,7 +1421,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
867 goto out; 1421 goto out;
868 } 1422 }
869 1423
870 e = alloc_exception(); 1424 e = alloc_completed_exception();
871 if (!e) { 1425 if (!e) {
872 down_write(&s->lock); 1426 down_write(&s->lock);
873 __invalidate_snapshot(s, -ENOMEM); 1427 __invalidate_snapshot(s, -ENOMEM);
@@ -878,28 +1432,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
878 1432
879 down_write(&s->lock); 1433 down_write(&s->lock);
880 if (!s->valid) { 1434 if (!s->valid) {
881 free_exception(e); 1435 free_completed_exception(e);
882 error = 1; 1436 error = 1;
883 goto out; 1437 goto out;
884 } 1438 }
885 1439
886 /* 1440 /* Check for conflicting reads */
887 * Check for conflicting reads. This is extremely improbable, 1441 __check_for_conflicting_io(s, pe->e.old_chunk);
888 * so msleep(1) is sufficient and there is no need for a wait queue.
889 */
890 while (__chunk_is_tracked(s, pe->e.old_chunk))
891 msleep(1);
892 1442
893 /* 1443 /*
894 * Add a proper exception, and remove the 1444 * Add a proper exception, and remove the
895 * in-flight exception from the list. 1445 * in-flight exception from the list.
896 */ 1446 */
897 insert_completed_exception(s, e); 1447 dm_insert_exception(&s->complete, e);
898 1448
899 out: 1449 out:
900 remove_exception(&pe->e); 1450 dm_remove_exception(&pe->e);
901 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1451 snapshot_bios = bio_list_get(&pe->snapshot_bios);
902 origin_bios = put_pending_exception(pe); 1452 origin_bios = bio_list_get(&pe->origin_bios);
1453 free_pending_exception(pe);
1454
1455 increment_pending_exceptions_done_count();
903 1456
904 up_write(&s->lock); 1457 up_write(&s->lock);
905 1458
@@ -909,7 +1462,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
909 else 1462 else
910 flush_bios(snapshot_bios); 1463 flush_bios(snapshot_bios);
911 1464
912 flush_bios(origin_bios); 1465 retry_origin_bios(s, origin_bios);
913} 1466}
914 1467
915static void commit_callback(void *context, int success) 1468static void commit_callback(void *context, int success)
@@ -951,9 +1504,9 @@ static void start_copy(struct dm_snap_pending_exception *pe)
951 1504
952 src.bdev = bdev; 1505 src.bdev = bdev;
953 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1506 src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
954 src.count = min(s->store->chunk_size, dev_size - src.sector); 1507 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
955 1508
956 dest.bdev = s->store->cow->bdev; 1509 dest.bdev = s->cow->bdev;
957 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1510 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
958 dest.count = src.count; 1511 dest.count = src.count;
959 1512
@@ -965,7 +1518,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
965static struct dm_snap_pending_exception * 1518static struct dm_snap_pending_exception *
966__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1519__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
967{ 1520{
968 struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); 1521 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
969 1522
970 if (!e) 1523 if (!e)
971 return NULL; 1524 return NULL;
@@ -996,8 +1549,6 @@ __find_pending_exception(struct dm_snapshot *s,
996 pe->e.old_chunk = chunk; 1549 pe->e.old_chunk = chunk;
997 bio_list_init(&pe->origin_bios); 1550 bio_list_init(&pe->origin_bios);
998 bio_list_init(&pe->snapshot_bios); 1551 bio_list_init(&pe->snapshot_bios);
999 pe->primary_pe = NULL;
1000 atomic_set(&pe->ref_count, 0);
1001 pe->started = 0; 1552 pe->started = 0;
1002 1553
1003 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1554 if (s->store->type->prepare_exception(s->store, &pe->e)) {
@@ -1005,16 +1556,15 @@ __find_pending_exception(struct dm_snapshot *s,
1005 return NULL; 1556 return NULL;
1006 } 1557 }
1007 1558
1008 get_pending_exception(pe); 1559 dm_insert_exception(&s->pending, &pe->e);
1009 insert_exception(&s->pending, &pe->e);
1010 1560
1011 return pe; 1561 return pe;
1012} 1562}
1013 1563
1014static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 1564static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1015 struct bio *bio, chunk_t chunk) 1565 struct bio *bio, chunk_t chunk)
1016{ 1566{
1017 bio->bi_bdev = s->store->cow->bdev; 1567 bio->bi_bdev = s->cow->bdev;
1018 bio->bi_sector = chunk_to_sector(s->store, 1568 bio->bi_sector = chunk_to_sector(s->store,
1019 dm_chunk_number(e->new_chunk) + 1569 dm_chunk_number(e->new_chunk) +
1020 (chunk - e->old_chunk)) + 1570 (chunk - e->old_chunk)) +
@@ -1025,14 +1575,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
1025static int snapshot_map(struct dm_target *ti, struct bio *bio, 1575static int snapshot_map(struct dm_target *ti, struct bio *bio,
1026 union map_info *map_context) 1576 union map_info *map_context)
1027{ 1577{
1028 struct dm_snap_exception *e; 1578 struct dm_exception *e;
1029 struct dm_snapshot *s = ti->private; 1579 struct dm_snapshot *s = ti->private;
1030 int r = DM_MAPIO_REMAPPED; 1580 int r = DM_MAPIO_REMAPPED;
1031 chunk_t chunk; 1581 chunk_t chunk;
1032 struct dm_snap_pending_exception *pe = NULL; 1582 struct dm_snap_pending_exception *pe = NULL;
1033 1583
1034 if (unlikely(bio_empty_barrier(bio))) { 1584 if (unlikely(bio_empty_barrier(bio))) {
1035 bio->bi_bdev = s->store->cow->bdev; 1585 bio->bi_bdev = s->cow->bdev;
1036 return DM_MAPIO_REMAPPED; 1586 return DM_MAPIO_REMAPPED;
1037 } 1587 }
1038 1588
@@ -1053,7 +1603,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1053 } 1603 }
1054 1604
1055 /* If the block is already remapped - use that, else remap it */ 1605 /* If the block is already remapped - use that, else remap it */
1056 e = lookup_exception(&s->complete, chunk); 1606 e = dm_lookup_exception(&s->complete, chunk);
1057 if (e) { 1607 if (e) {
1058 remap_exception(s, e, bio, chunk); 1608 remap_exception(s, e, bio, chunk);
1059 goto out_unlock; 1609 goto out_unlock;
@@ -1077,7 +1627,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1077 goto out_unlock; 1627 goto out_unlock;
1078 } 1628 }
1079 1629
1080 e = lookup_exception(&s->complete, chunk); 1630 e = dm_lookup_exception(&s->complete, chunk);
1081 if (e) { 1631 if (e) {
1082 free_pending_exception(pe); 1632 free_pending_exception(pe);
1083 remap_exception(s, e, bio, chunk); 1633 remap_exception(s, e, bio, chunk);
@@ -1115,6 +1665,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1115 return r; 1665 return r;
1116} 1666}
1117 1667
1668/*
1669 * A snapshot-merge target behaves like a combination of a snapshot
1670 * target and a snapshot-origin target. It only generates new
1671 * exceptions in other snapshots and not in the one that is being
1672 * merged.
1673 *
1674 * For each chunk, if there is an existing exception, it is used to
1675 * redirect I/O to the cow device. Otherwise I/O is sent to the origin,
1676 * which in turn might generate exceptions in other snapshots.
1677 * If merging is currently taking place on the chunk in question, the
1678 * I/O is deferred by adding it to s->bios_queued_during_merge.
1679 */
1680static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1681 union map_info *map_context)
1682{
1683 struct dm_exception *e;
1684 struct dm_snapshot *s = ti->private;
1685 int r = DM_MAPIO_REMAPPED;
1686 chunk_t chunk;
1687
1688 if (unlikely(bio_empty_barrier(bio))) {
1689 if (!map_context->flush_request)
1690 bio->bi_bdev = s->origin->bdev;
1691 else
1692 bio->bi_bdev = s->cow->bdev;
1693 map_context->ptr = NULL;
1694 return DM_MAPIO_REMAPPED;
1695 }
1696
1697 chunk = sector_to_chunk(s->store, bio->bi_sector);
1698
1699 down_write(&s->lock);
1700
1701 /* Full merging snapshots are redirected to the origin */
1702 if (!s->valid)
1703 goto redirect_to_origin;
1704
1705 /* If the block is already remapped - use that */
1706 e = dm_lookup_exception(&s->complete, chunk);
1707 if (e) {
1708 /* Queue writes overlapping with chunks being merged */
1709 if (bio_rw(bio) == WRITE &&
1710 chunk >= s->first_merging_chunk &&
1711 chunk < (s->first_merging_chunk +
1712 s->num_merging_chunks)) {
1713 bio->bi_bdev = s->origin->bdev;
1714 bio_list_add(&s->bios_queued_during_merge, bio);
1715 r = DM_MAPIO_SUBMITTED;
1716 goto out_unlock;
1717 }
1718
1719 remap_exception(s, e, bio, chunk);
1720
1721 if (bio_rw(bio) == WRITE)
1722 map_context->ptr = track_chunk(s, chunk);
1723 goto out_unlock;
1724 }
1725
1726redirect_to_origin:
1727 bio->bi_bdev = s->origin->bdev;
1728
1729 if (bio_rw(bio) == WRITE) {
1730 up_write(&s->lock);
1731 return do_origin(s->origin, bio);
1732 }
1733
1734out_unlock:
1735 up_write(&s->lock);
1736
1737 return r;
1738}
1739
1118static int snapshot_end_io(struct dm_target *ti, struct bio *bio, 1740static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1119 int error, union map_info *map_context) 1741 int error, union map_info *map_context)
1120{ 1742{
@@ -1127,15 +1749,101 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1127 return 0; 1749 return 0;
1128} 1750}
1129 1751
1752static void snapshot_merge_presuspend(struct dm_target *ti)
1753{
1754 struct dm_snapshot *s = ti->private;
1755
1756 stop_merge(s);
1757}
1758
1759static void snapshot_postsuspend(struct dm_target *ti)
1760{
1761 struct dm_snapshot *s = ti->private;
1762
1763 down_write(&s->lock);
1764 s->suspended = 1;
1765 up_write(&s->lock);
1766}
1767
1768static int snapshot_preresume(struct dm_target *ti)
1769{
1770 int r = 0;
1771 struct dm_snapshot *s = ti->private;
1772 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1773
1774 down_read(&_origins_lock);
1775 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1776 if (snap_src && snap_dest) {
1777 down_read(&snap_src->lock);
1778 if (s == snap_src) {
1779 DMERR("Unable to resume snapshot source until "
1780 "handover completes.");
1781 r = -EINVAL;
1782 } else if (!snap_src->suspended) {
1783 DMERR("Unable to perform snapshot handover until "
1784 "source is suspended.");
1785 r = -EINVAL;
1786 }
1787 up_read(&snap_src->lock);
1788 }
1789 up_read(&_origins_lock);
1790
1791 return r;
1792}
1793
1130static void snapshot_resume(struct dm_target *ti) 1794static void snapshot_resume(struct dm_target *ti)
1131{ 1795{
1132 struct dm_snapshot *s = ti->private; 1796 struct dm_snapshot *s = ti->private;
1797 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1798
1799 down_read(&_origins_lock);
1800 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1801 if (snap_src && snap_dest) {
1802 down_write(&snap_src->lock);
1803 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1804 __handover_exceptions(snap_src, snap_dest);
1805 up_write(&snap_dest->lock);
1806 up_write(&snap_src->lock);
1807 }
1808 up_read(&_origins_lock);
1809
1810 /* Now we have correct chunk size, reregister */
1811 reregister_snapshot(s);
1133 1812
1134 down_write(&s->lock); 1813 down_write(&s->lock);
1135 s->active = 1; 1814 s->active = 1;
1815 s->suspended = 0;
1136 up_write(&s->lock); 1816 up_write(&s->lock);
1137} 1817}
1138 1818
1819static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1820{
1821 sector_t min_chunksize;
1822
1823 down_read(&_origins_lock);
1824 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
1825 up_read(&_origins_lock);
1826
1827 return min_chunksize;
1828}
1829
1830static void snapshot_merge_resume(struct dm_target *ti)
1831{
1832 struct dm_snapshot *s = ti->private;
1833
1834 /*
1835 * Handover exceptions from existing snapshot.
1836 */
1837 snapshot_resume(ti);
1838
1839 /*
1840 * snapshot-merge acts as an origin, so set ti->split_io
1841 */
1842 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1843
1844 start_merge(s);
1845}
1846
1139static int snapshot_status(struct dm_target *ti, status_type_t type, 1847static int snapshot_status(struct dm_target *ti, status_type_t type,
1140 char *result, unsigned int maxlen) 1848 char *result, unsigned int maxlen)
1141{ 1849{
@@ -1144,21 +1852,32 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1144 1852
1145 switch (type) { 1853 switch (type) {
1146 case STATUSTYPE_INFO: 1854 case STATUSTYPE_INFO:
1855
1856 down_write(&snap->lock);
1857
1147 if (!snap->valid) 1858 if (!snap->valid)
1148 DMEMIT("Invalid"); 1859 DMEMIT("Invalid");
1860 else if (snap->merge_failed)
1861 DMEMIT("Merge failed");
1149 else { 1862 else {
1150 if (snap->store->type->fraction_full) { 1863 if (snap->store->type->usage) {
1151 sector_t numerator, denominator; 1864 sector_t total_sectors, sectors_allocated,
1152 snap->store->type->fraction_full(snap->store, 1865 metadata_sectors;
1153 &numerator, 1866 snap->store->type->usage(snap->store,
1154 &denominator); 1867 &total_sectors,
1155 DMEMIT("%llu/%llu", 1868 &sectors_allocated,
1156 (unsigned long long)numerator, 1869 &metadata_sectors);
1157 (unsigned long long)denominator); 1870 DMEMIT("%llu/%llu %llu",
1871 (unsigned long long)sectors_allocated,
1872 (unsigned long long)total_sectors,
1873 (unsigned long long)metadata_sectors);
1158 } 1874 }
1159 else 1875 else
1160 DMEMIT("Unknown"); 1876 DMEMIT("Unknown");
1161 } 1877 }
1878
1879 up_write(&snap->lock);
1880
1162 break; 1881 break;
1163 1882
1164 case STATUSTYPE_TABLE: 1883 case STATUSTYPE_TABLE:
@@ -1167,7 +1886,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1167 * to make private copies if the output is to 1886 * to make private copies if the output is to
1168 * make sense. 1887 * make sense.
1169 */ 1888 */
1170 DMEMIT("%s", snap->origin->name); 1889 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1171 snap->store->type->status(snap->store, type, result + sz, 1890 snap->store->type->status(snap->store, type, result + sz,
1172 maxlen - sz); 1891 maxlen - sz);
1173 break; 1892 break;
@@ -1188,17 +1907,36 @@ static int snapshot_iterate_devices(struct dm_target *ti,
1188/*----------------------------------------------------------------- 1907/*-----------------------------------------------------------------
1189 * Origin methods 1908 * Origin methods
1190 *---------------------------------------------------------------*/ 1909 *---------------------------------------------------------------*/
1191static int __origin_write(struct list_head *snapshots, struct bio *bio) 1910
1911/*
1912 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1913 * supplied bio was ignored. The caller may submit it immediately.
1914 * (No remapping actually occurs as the origin is always a direct linear
1915 * map.)
1916 *
1917 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1918 * and any supplied bio is added to a list to be submitted once all
1919 * the necessary exceptions exist.
1920 */
1921static int __origin_write(struct list_head *snapshots, sector_t sector,
1922 struct bio *bio)
1192{ 1923{
1193 int r = DM_MAPIO_REMAPPED, first = 0; 1924 int r = DM_MAPIO_REMAPPED;
1194 struct dm_snapshot *snap; 1925 struct dm_snapshot *snap;
1195 struct dm_snap_exception *e; 1926 struct dm_exception *e;
1196 struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; 1927 struct dm_snap_pending_exception *pe;
1928 struct dm_snap_pending_exception *pe_to_start_now = NULL;
1929 struct dm_snap_pending_exception *pe_to_start_last = NULL;
1197 chunk_t chunk; 1930 chunk_t chunk;
1198 LIST_HEAD(pe_queue);
1199 1931
1200 /* Do all the snapshots on this origin */ 1932 /* Do all the snapshots on this origin */
1201 list_for_each_entry (snap, snapshots, list) { 1933 list_for_each_entry (snap, snapshots, list) {
1934 /*
1935 * Don't make new exceptions in a merging snapshot
1936 * because it has effectively been deleted
1937 */
1938 if (dm_target_is_snapshot_merge(snap->ti))
1939 continue;
1202 1940
1203 down_write(&snap->lock); 1941 down_write(&snap->lock);
1204 1942
@@ -1207,24 +1945,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1207 goto next_snapshot; 1945 goto next_snapshot;
1208 1946
1209 /* Nothing to do if writing beyond end of snapshot */ 1947 /* Nothing to do if writing beyond end of snapshot */
1210 if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) 1948 if (sector >= dm_table_get_size(snap->ti->table))
1211 goto next_snapshot; 1949 goto next_snapshot;
1212 1950
1213 /* 1951 /*
1214 * Remember, different snapshots can have 1952 * Remember, different snapshots can have
1215 * different chunk sizes. 1953 * different chunk sizes.
1216 */ 1954 */
1217 chunk = sector_to_chunk(snap->store, bio->bi_sector); 1955 chunk = sector_to_chunk(snap->store, sector);
1218 1956
1219 /* 1957 /*
1220 * Check exception table to see if block 1958 * Check exception table to see if block
1221 * is already remapped in this snapshot 1959 * is already remapped in this snapshot
1222 * and trigger an exception if not. 1960 * and trigger an exception if not.
1223 *
1224 * ref_count is initialised to 1 so pending_complete()
1225 * won't destroy the primary_pe while we're inside this loop.
1226 */ 1961 */
1227 e = lookup_exception(&snap->complete, chunk); 1962 e = dm_lookup_exception(&snap->complete, chunk);
1228 if (e) 1963 if (e)
1229 goto next_snapshot; 1964 goto next_snapshot;
1230 1965
@@ -1239,7 +1974,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1239 goto next_snapshot; 1974 goto next_snapshot;
1240 } 1975 }
1241 1976
1242 e = lookup_exception(&snap->complete, chunk); 1977 e = dm_lookup_exception(&snap->complete, chunk);
1243 if (e) { 1978 if (e) {
1244 free_pending_exception(pe); 1979 free_pending_exception(pe);
1245 goto next_snapshot; 1980 goto next_snapshot;
@@ -1252,59 +1987,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1252 } 1987 }
1253 } 1988 }
1254 1989
1255 if (!primary_pe) { 1990 r = DM_MAPIO_SUBMITTED;
1256 /*
1257 * Either every pe here has same
1258 * primary_pe or none has one yet.
1259 */
1260 if (pe->primary_pe)
1261 primary_pe = pe->primary_pe;
1262 else {
1263 primary_pe = pe;
1264 first = 1;
1265 }
1266
1267 bio_list_add(&primary_pe->origin_bios, bio);
1268 1991
1269 r = DM_MAPIO_SUBMITTED; 1992 /*
1270 } 1993 * If an origin bio was supplied, queue it to wait for the
1994 * completion of this exception, and start this one last,
1995 * at the end of the function.
1996 */
1997 if (bio) {
1998 bio_list_add(&pe->origin_bios, bio);
1999 bio = NULL;
1271 2000
1272 if (!pe->primary_pe) { 2001 if (!pe->started) {
1273 pe->primary_pe = primary_pe; 2002 pe->started = 1;
1274 get_pending_exception(primary_pe); 2003 pe_to_start_last = pe;
2004 }
1275 } 2005 }
1276 2006
1277 if (!pe->started) { 2007 if (!pe->started) {
1278 pe->started = 1; 2008 pe->started = 1;
1279 list_add_tail(&pe->list, &pe_queue); 2009 pe_to_start_now = pe;
1280 } 2010 }
1281 2011
1282 next_snapshot: 2012 next_snapshot:
1283 up_write(&snap->lock); 2013 up_write(&snap->lock);
1284 }
1285
1286 if (!primary_pe)
1287 return r;
1288
1289 /*
1290 * If this is the first time we're processing this chunk and
1291 * ref_count is now 1 it means all the pending exceptions
1292 * got completed while we were in the loop above, so it falls to
1293 * us here to remove the primary_pe and submit any origin_bios.
1294 */
1295 2014
1296 if (first && atomic_dec_and_test(&primary_pe->ref_count)) { 2015 if (pe_to_start_now) {
1297 flush_bios(bio_list_get(&primary_pe->origin_bios)); 2016 start_copy(pe_to_start_now);
1298 free_pending_exception(primary_pe); 2017 pe_to_start_now = NULL;
1299 /* If we got here, pe_queue is necessarily empty. */ 2018 }
1300 return r;
1301 } 2019 }
1302 2020
1303 /* 2021 /*
1304 * Now that we have a complete pe list we can start the copying. 2022 * Submit the exception against which the bio is queued last,
2023 * to give the other exceptions a head start.
1305 */ 2024 */
1306 list_for_each_entry_safe(pe, next_pe, &pe_queue, list) 2025 if (pe_to_start_last)
1307 start_copy(pe); 2026 start_copy(pe_to_start_last);
1308 2027
1309 return r; 2028 return r;
1310} 2029}
@@ -1320,13 +2039,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
1320 down_read(&_origins_lock); 2039 down_read(&_origins_lock);
1321 o = __lookup_origin(origin->bdev); 2040 o = __lookup_origin(origin->bdev);
1322 if (o) 2041 if (o)
1323 r = __origin_write(&o->snapshots, bio); 2042 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1324 up_read(&_origins_lock); 2043 up_read(&_origins_lock);
1325 2044
1326 return r; 2045 return r;
1327} 2046}
1328 2047
1329/* 2048/*
2049 * Trigger exceptions in all non-merging snapshots.
2050 *
2051 * The chunk size of the merging snapshot may be larger than the chunk
2052 * size of some other snapshot so we may need to reallocate multiple
2053 * chunks in other snapshots.
2054 *
2055 * We scan all the overlapping exceptions in the other snapshots.
2056 * Returns 1 if anything was reallocated and must be waited for,
2057 * otherwise returns 0.
2058 *
2059 * size must be a multiple of merging_snap's chunk_size.
2060 */
2061static int origin_write_extent(struct dm_snapshot *merging_snap,
2062 sector_t sector, unsigned size)
2063{
2064 int must_wait = 0;
2065 sector_t n;
2066 struct origin *o;
2067
2068 /*
2069 * The origin's __minimum_chunk_size() got stored in split_io
2070 * by snapshot_merge_resume().
2071 */
2072 down_read(&_origins_lock);
2073 o = __lookup_origin(merging_snap->origin->bdev);
2074 for (n = 0; n < size; n += merging_snap->ti->split_io)
2075 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2076 DM_MAPIO_SUBMITTED)
2077 must_wait = 1;
2078 up_read(&_origins_lock);
2079
2080 return must_wait;
2081}
2082
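As an illustration of the loop in origin_write_extent() above (not part of the patch): assuming a merging snapshot whose chunk size is 8 sectors, so that ti->split_io == 8, a call with size == 24 invokes __origin_write() for sector, sector + 8 and sector + 16. If any of those calls returns DM_MAPIO_SUBMITTED, an exception copy was started in some other snapshot and the caller must wait for it before merging that area.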
2083/*
1330 * Origin: maps a linear range of a device, with hooks for snapshotting. 2084 * Origin: maps a linear range of a device, with hooks for snapshotting.
1331 */ 2085 */
1332 2086
@@ -1345,8 +2099,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1345 return -EINVAL; 2099 return -EINVAL;
1346 } 2100 }
1347 2101
1348 r = dm_get_device(ti, argv[0], 0, ti->len, 2102 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
1349 dm_table_get_mode(ti->table), &dev);
1350 if (r) { 2103 if (r) {
1351 ti->error = "Cannot get target device"; 2104 ti->error = "Cannot get target device";
1352 return r; 2105 return r;
@@ -1377,8 +2130,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1377 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 2130 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1378} 2131}
1379 2132
1380#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
1381
1382/* 2133/*
1383 * Set the target "split_io" field to the minimum of all the snapshots' 2134 * Set the target "split_io" field to the minimum of all the snapshots'
1384 * chunk sizes. 2135 * chunk sizes.
@@ -1386,19 +2137,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1386static void origin_resume(struct dm_target *ti) 2137static void origin_resume(struct dm_target *ti)
1387{ 2138{
1388 struct dm_dev *dev = ti->private; 2139 struct dm_dev *dev = ti->private;
1389 struct dm_snapshot *snap;
1390 struct origin *o;
1391 chunk_t chunk_size = 0;
1392 2140
1393 down_read(&_origins_lock); 2141 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
1394 o = __lookup_origin(dev->bdev);
1395 if (o)
1396 list_for_each_entry (snap, &o->snapshots, list)
1397 chunk_size = min_not_zero(chunk_size,
1398 snap->store->chunk_size);
1399 up_read(&_origins_lock);
1400
1401 ti->split_io = chunk_size;
1402} 2142}
1403 2143
1404static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2144static int origin_status(struct dm_target *ti, status_type_t type, char *result,
@@ -1441,17 +2181,35 @@ static struct target_type origin_target = {
1441 2181
1442static struct target_type snapshot_target = { 2182static struct target_type snapshot_target = {
1443 .name = "snapshot", 2183 .name = "snapshot",
1444 .version = {1, 7, 0}, 2184 .version = {1, 9, 0},
1445 .module = THIS_MODULE, 2185 .module = THIS_MODULE,
1446 .ctr = snapshot_ctr, 2186 .ctr = snapshot_ctr,
1447 .dtr = snapshot_dtr, 2187 .dtr = snapshot_dtr,
1448 .map = snapshot_map, 2188 .map = snapshot_map,
1449 .end_io = snapshot_end_io, 2189 .end_io = snapshot_end_io,
2190 .postsuspend = snapshot_postsuspend,
2191 .preresume = snapshot_preresume,
1450 .resume = snapshot_resume, 2192 .resume = snapshot_resume,
1451 .status = snapshot_status, 2193 .status = snapshot_status,
1452 .iterate_devices = snapshot_iterate_devices, 2194 .iterate_devices = snapshot_iterate_devices,
1453}; 2195};
1454 2196
2197static struct target_type merge_target = {
2198 .name = dm_snapshot_merge_target_name,
2199 .version = {1, 0, 0},
2200 .module = THIS_MODULE,
2201 .ctr = snapshot_ctr,
2202 .dtr = snapshot_dtr,
2203 .map = snapshot_merge_map,
2204 .end_io = snapshot_end_io,
2205 .presuspend = snapshot_merge_presuspend,
2206 .postsuspend = snapshot_postsuspend,
2207 .preresume = snapshot_preresume,
2208 .resume = snapshot_merge_resume,
2209 .status = snapshot_status,
2210 .iterate_devices = snapshot_iterate_devices,
2211};
2212
1455static int __init dm_snapshot_init(void) 2213static int __init dm_snapshot_init(void)
1456{ 2214{
1457 int r; 2215 int r;
@@ -1463,42 +2221,48 @@ static int __init dm_snapshot_init(void)
1463 } 2221 }
1464 2222
1465 r = dm_register_target(&snapshot_target); 2223 r = dm_register_target(&snapshot_target);
1466 if (r) { 2224 if (r < 0) {
1467 DMERR("snapshot target register failed %d", r); 2225 DMERR("snapshot target register failed %d", r);
1468 return r; 2226 goto bad_register_snapshot_target;
1469 } 2227 }
1470 2228
1471 r = dm_register_target(&origin_target); 2229 r = dm_register_target(&origin_target);
1472 if (r < 0) { 2230 if (r < 0) {
1473 DMERR("Origin target register failed %d", r); 2231 DMERR("Origin target register failed %d", r);
1474 goto bad1; 2232 goto bad_register_origin_target;
2233 }
2234
2235 r = dm_register_target(&merge_target);
2236 if (r < 0) {
2237 DMERR("Merge target register failed %d", r);
2238 goto bad_register_merge_target;
1475 } 2239 }
1476 2240
1477 r = init_origin_hash(); 2241 r = init_origin_hash();
1478 if (r) { 2242 if (r) {
1479 DMERR("init_origin_hash failed."); 2243 DMERR("init_origin_hash failed.");
1480 goto bad2; 2244 goto bad_origin_hash;
1481 } 2245 }
1482 2246
1483 exception_cache = KMEM_CACHE(dm_snap_exception, 0); 2247 exception_cache = KMEM_CACHE(dm_exception, 0);
1484 if (!exception_cache) { 2248 if (!exception_cache) {
1485 DMERR("Couldn't create exception cache."); 2249 DMERR("Couldn't create exception cache.");
1486 r = -ENOMEM; 2250 r = -ENOMEM;
1487 goto bad3; 2251 goto bad_exception_cache;
1488 } 2252 }
1489 2253
1490 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2254 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1491 if (!pending_cache) { 2255 if (!pending_cache) {
1492 DMERR("Couldn't create pending cache."); 2256 DMERR("Couldn't create pending cache.");
1493 r = -ENOMEM; 2257 r = -ENOMEM;
1494 goto bad4; 2258 goto bad_pending_cache;
1495 } 2259 }
1496 2260
1497 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); 2261 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1498 if (!tracked_chunk_cache) { 2262 if (!tracked_chunk_cache) {
1499 DMERR("Couldn't create cache to track chunks in use."); 2263 DMERR("Couldn't create cache to track chunks in use.");
1500 r = -ENOMEM; 2264 r = -ENOMEM;
1501 goto bad5; 2265 goto bad_tracked_chunk_cache;
1502 } 2266 }
1503 2267
1504 ksnapd = create_singlethread_workqueue("ksnapd"); 2268 ksnapd = create_singlethread_workqueue("ksnapd");
@@ -1512,16 +2276,21 @@ static int __init dm_snapshot_init(void)
1512 2276
1513bad_pending_pool: 2277bad_pending_pool:
1514 kmem_cache_destroy(tracked_chunk_cache); 2278 kmem_cache_destroy(tracked_chunk_cache);
1515bad5: 2279bad_tracked_chunk_cache:
1516 kmem_cache_destroy(pending_cache); 2280 kmem_cache_destroy(pending_cache);
1517bad4: 2281bad_pending_cache:
1518 kmem_cache_destroy(exception_cache); 2282 kmem_cache_destroy(exception_cache);
1519bad3: 2283bad_exception_cache:
1520 exit_origin_hash(); 2284 exit_origin_hash();
1521bad2: 2285bad_origin_hash:
2286 dm_unregister_target(&merge_target);
2287bad_register_merge_target:
1522 dm_unregister_target(&origin_target); 2288 dm_unregister_target(&origin_target);
1523bad1: 2289bad_register_origin_target:
1524 dm_unregister_target(&snapshot_target); 2290 dm_unregister_target(&snapshot_target);
2291bad_register_snapshot_target:
2292 dm_exception_store_exit();
2293
1525 return r; 2294 return r;
1526} 2295}
1527 2296
@@ -1531,6 +2300,7 @@ static void __exit dm_snapshot_exit(void)
1531 2300
1532 dm_unregister_target(&snapshot_target); 2301 dm_unregister_target(&snapshot_target);
1533 dm_unregister_target(&origin_target); 2302 dm_unregister_target(&origin_target);
2303 dm_unregister_target(&merge_target);
1534 2304
1535 exit_origin_hash(); 2305 exit_origin_hash();
1536 kmem_cache_destroy(pending_cache); 2306 kmem_cache_destroy(pending_cache);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e0efc1adcaff..e610725db766 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -80,8 +80,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
80 if (sscanf(argv[1], "%llu", &start) != 1) 80 if (sscanf(argv[1], "%llu", &start) != 1)
81 return -EINVAL; 81 return -EINVAL;
82 82
83 if (dm_get_device(ti, argv[0], start, sc->stripe_width, 83 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
84 dm_table_get_mode(ti->table),
85 &sc->stripe[stripe].dev)) 84 &sc->stripe[stripe].dev))
86 return -ENXIO; 85 return -ENXIO;
87 86
@@ -110,7 +109,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
110 } 109 }
111 110
112 stripes = simple_strtoul(argv[0], &end, 10); 111 stripes = simple_strtoul(argv[0], &end, 10);
113 if (*end) { 112 if (!stripes || *end) {
114 ti->error = "Invalid stripe count"; 113 ti->error = "Invalid stripe count";
115 return -EINVAL; 114 return -EINVAL;
116 } 115 }
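A hypothetical worked example of the tightened check: argv[0] == "0" is parsed successfully by simple_strtoul(), leaving stripes == 0 and *end == '\0', so the old test accepted it; the added !stripes test rejects a zero stripe count up front, before it is used in the stripe-width calculation later in the constructor.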
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e2..84d2b91e4efb 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) 60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{ 61{
62 sprintf(buf, "%d\n", dm_suspended(md)); 62 sprintf(buf, "%d\n", dm_suspended_md(md));
63 63
64 return strlen(buf); 64 return strlen(buf);
65} 65}
@@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = {
75 NULL, 75 NULL,
76}; 76};
77 77
78static struct sysfs_ops dm_sysfs_ops = { 78static const struct sysfs_ops dm_sysfs_ops = {
79 .show = dm_attr_show, 79 .show = dm_attr_show,
80}; 80};
81 81
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1a6cb3c7822e..9924ea23032d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -12,6 +12,7 @@
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13#include <linux/namei.h> 13#include <linux/namei.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/mutex.h> 18#include <linux/mutex.h>
@@ -237,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
237{ 238{
238 unsigned int i; 239 unsigned int i;
239 240
241 if (!t)
242 return;
243
240 while (atomic_read(&t->holders)) 244 while (atomic_read(&t->holders))
241 msleep(1); 245 msleep(1);
242 smp_mb(); 246 smp_mb();
@@ -425,8 +429,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
425 * it's already present. 429 * it's already present.
426 */ 430 */
427static int __table_get_device(struct dm_table *t, struct dm_target *ti, 431static int __table_get_device(struct dm_table *t, struct dm_target *ti,
428 const char *path, sector_t start, sector_t len, 432 const char *path, fmode_t mode, struct dm_dev **result)
429 fmode_t mode, struct dm_dev **result)
430{ 433{
431 int r; 434 int r;
432 dev_t uninitialized_var(dev); 435 dev_t uninitialized_var(dev);
@@ -499,16 +502,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
499 return 0; 502 return 0;
500 } 503 }
501 504
502 if (blk_stack_limits(limits, &q->limits, start << 9) < 0) 505 if (bdev_stack_limits(limits, bdev, start) < 0)
503 DMWARN("%s: target device %s is misaligned: " 506 DMWARN("%s: adding target device %s caused an alignment inconsistency: "
504 "physical_block_size=%u, logical_block_size=%u, " 507 "physical_block_size=%u, logical_block_size=%u, "
505 "alignment_offset=%u, start=%llu", 508 "alignment_offset=%u, start=%llu",
506 dm_device_name(ti->table->md), bdevname(bdev, b), 509 dm_device_name(ti->table->md), bdevname(bdev, b),
507 q->limits.physical_block_size, 510 q->limits.physical_block_size,
508 q->limits.logical_block_size, 511 q->limits.logical_block_size,
509 q->limits.alignment_offset, 512 q->limits.alignment_offset,
510 (unsigned long long) start << 9); 513 (unsigned long long) start << SECTOR_SHIFT);
511
512 514
513 /* 515 /*
514 * Check if merge fn is supported. 516 * Check if merge fn is supported.
@@ -524,11 +526,10 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
524} 526}
525EXPORT_SYMBOL_GPL(dm_set_device_limits); 527EXPORT_SYMBOL_GPL(dm_set_device_limits);
526 528
527int dm_get_device(struct dm_target *ti, const char *path, sector_t start, 529int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
528 sector_t len, fmode_t mode, struct dm_dev **result) 530 struct dm_dev **result)
529{ 531{
530 return __table_get_device(ti->table, ti, path, 532 return __table_get_device(ti->table, ti, path, mode, result);
531 start, len, mode, result);
532} 533}
533 534
534 535
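For reference, a minimal hypothetical target constructor using the reduced dm_get_device() signature introduced in this hunk: the start/len arguments are gone and the open mode is taken from the table, exactly as the converted callers in dm-snap.c and dm-stripe.c do above. example_ctr and its error string are illustrative names, not code from the patch.

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_dev *dev;

	if (argc < 1) {
		ti->error = "Missing device argument";
		return -EINVAL;
	}

	/* argv[0] is the backing device path; the mode comes from the table */
	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev)) {
		ti->error = "Device lookup failed";
		return -ENXIO;
	}

	ti->private = dev;
	return 0;
}

The matching destructor would simply call dm_put_device(ti, dev).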
@@ -600,11 +601,8 @@ int dm_split_args(int *argc, char ***argvp, char *input)
600 return -ENOMEM; 601 return -ENOMEM;
601 602
602 while (1) { 603 while (1) {
603 start = end;
604
605 /* Skip whitespace */ 604 /* Skip whitespace */
606 while (*start && isspace(*start)) 605 start = skip_spaces(end);
607 start++;
608 606
609 if (!*start) 607 if (!*start)
610 break; /* success, we hit the end */ 608 break; /* success, we hit the end */
@@ -1025,9 +1023,9 @@ combine_limits:
1025 * for the table. 1023 * for the table.
1026 */ 1024 */
1027 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1025 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1028 DMWARN("%s: target device " 1026 DMWARN("%s: adding target device "
1029 "(start sect %llu len %llu) " 1027 "(start sect %llu len %llu) "
1030 "is misaligned", 1028 "caused an alignment inconsistency",
1031 dm_device_name(table->md), 1029 dm_device_name(table->md),
1032 (unsigned long long) ti->begin, 1030 (unsigned long long) ti->begin,
1033 (unsigned long long) ti->len); 1031 (unsigned long long) ti->len);
@@ -1079,15 +1077,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1079 struct queue_limits *limits) 1077 struct queue_limits *limits)
1080{ 1078{
1081 /* 1079 /*
1082 * Each target device in the table has a data area that should normally
1083 * be aligned such that the DM device's alignment_offset is 0.
1084 * FIXME: Propagate alignment_offsets up the stack and warn of
1085 * sub-optimal or inconsistent settings.
1086 */
1087 limits->alignment_offset = 0;
1088 limits->misaligned = 0;
1089
1090 /*
1091 * Copy table's limits to the DM device's request_queue 1080 * Copy table's limits to the DM device's request_queue
1092 */ 1081 */
1093 q->limits = *limits; 1082 q->limits = *limits;
@@ -1240,8 +1229,6 @@ void dm_table_unplug_all(struct dm_table *t)
1240 1229
1241struct mapped_device *dm_table_get_md(struct dm_table *t) 1230struct mapped_device *dm_table_get_md(struct dm_table *t)
1242{ 1231{
1243 dm_get(t->md);
1244
1245 return t->md; 1232 return t->md;
1246} 1233}
1247 1234
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 04feccf2a997..11dea11dc0b6 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -10,7 +10,6 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h>
14 13
15#define DM_MSG_PREFIX "target" 14#define DM_MSG_PREFIX "target"
16 15
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef12..6b1e3b61b25e 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
139 list_del_init(&event->elist); 139 list_del_init(&event->elist);
140 140
141 /* 141 /*
142 * Need to call dm_copy_name_and_uuid from here for now. 142 * When a device is being removed this copy fails and we
143 * Context of previous var adds and locking used for 143 * discard these unsent events.
144 * hash_cell not compatable.
145 */ 144 */
146 if (dm_copy_name_and_uuid(event->md, event->name, 145 if (dm_copy_name_and_uuid(event->md, event->name,
147 event->uuid)) { 146 event->uuid)) {
148 DMERR("%s: dm_copy_name_and_uuid() failed", 147 DMINFO("%s: skipping sending uevent for lost device",
149 __func__); 148 __func__);
150 goto uevent_free; 149 goto uevent_free;
151 } 150 }
152 151
@@ -188,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
188 187
189 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { 188 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) {
190 DMERR("%s: Invalid event_type %d", __func__, event_type); 189 DMERR("%s: Invalid event_type %d", __func__, event_type);
191 goto out; 190 return;
192 } 191 }
193 192
194 event = dm_build_path_uevent(md, ti, 193 event = dm_build_path_uevent(md, ti,
@@ -196,12 +195,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
196 _dm_uevent_type_names[event_type].name, 195 _dm_uevent_type_names[event_type].name,
197 path, nr_valid_paths); 196 path, nr_valid_paths);
198 if (IS_ERR(event)) 197 if (IS_ERR(event))
199 goto out; 198 return;
200 199
201 dm_uevent_add(md, &event->elist); 200 dm_uevent_add(md, &event->elist);
202
203out:
204 dm_put(md);
205} 201}
206EXPORT_SYMBOL_GPL(dm_path_uevent); 202EXPORT_SYMBOL_GPL(dm_path_uevent);
207 203
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 376f1ab48a24..d21e1284604f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -47,6 +47,7 @@ struct dm_io {
47 atomic_t io_count; 47 atomic_t io_count;
48 struct bio *bio; 48 struct bio *bio;
49 unsigned long start_time; 49 unsigned long start_time;
50 spinlock_t endio_lock;
50}; 51};
51 52
52/* 53/*
@@ -142,9 +143,19 @@ struct mapped_device {
142 int barrier_error; 143 int barrier_error;
143 144
144 /* 145 /*
146 * Protect barrier_error from concurrent endio processing
147 * in request-based dm.
148 */
149 spinlock_t barrier_error_lock;
150
151 /*
145 * Processing queue (flush/barriers) 152 * Processing queue (flush/barriers)
146 */ 153 */
147 struct workqueue_struct *wq; 154 struct workqueue_struct *wq;
155 struct work_struct barrier_work;
156
157 /* A pointer to the currently processing pre/post flush request */
158 struct request *flush_request;
148 159
149 /* 160 /*
150 * The current mapping. 161 * The current mapping.
@@ -177,9 +188,6 @@ struct mapped_device {
177 /* forced geometry settings */ 188 /* forced geometry settings */
178 struct hd_geometry geometry; 189 struct hd_geometry geometry;
179 190
180 /* marker of flush suspend for request-based dm */
181 struct request suspend_rq;
182
183 /* For saving the address of __make_request for request based dm */ 191 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn; 192 make_request_fn *saved_make_request_fn;
185 193
@@ -274,6 +282,7 @@ static int (*_inits[])(void) __initdata = {
274 dm_target_init, 282 dm_target_init,
275 dm_linear_init, 283 dm_linear_init,
276 dm_stripe_init, 284 dm_stripe_init,
285 dm_io_init,
277 dm_kcopyd_init, 286 dm_kcopyd_init,
278 dm_interface_init, 287 dm_interface_init,
279}; 288};
@@ -283,6 +292,7 @@ static void (*_exits[])(void) = {
283 dm_target_exit, 292 dm_target_exit,
284 dm_linear_exit, 293 dm_linear_exit,
285 dm_stripe_exit, 294 dm_stripe_exit,
295 dm_io_exit,
286 dm_kcopyd_exit, 296 dm_kcopyd_exit,
287 dm_interface_exit, 297 dm_interface_exit,
288}; 298};
@@ -319,6 +329,11 @@ static void __exit dm_exit(void)
319/* 329/*
320 * Block device functions 330 * Block device functions
321 */ 331 */
332int dm_deleting_md(struct mapped_device *md)
333{
334 return test_bit(DMF_DELETING, &md->flags);
335}
336
322static int dm_blk_open(struct block_device *bdev, fmode_t mode) 337static int dm_blk_open(struct block_device *bdev, fmode_t mode)
323{ 338{
324 struct mapped_device *md; 339 struct mapped_device *md;
@@ -330,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
330 goto out; 345 goto out;
331 346
332 if (test_bit(DMF_FREEING, &md->flags) || 347 if (test_bit(DMF_FREEING, &md->flags) ||
333 test_bit(DMF_DELETING, &md->flags)) { 348 dm_deleting_md(md)) {
334 md = NULL; 349 md = NULL;
335 goto out; 350 goto out;
336 } 351 }
@@ -387,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
387 unsigned int cmd, unsigned long arg) 402 unsigned int cmd, unsigned long arg)
388{ 403{
389 struct mapped_device *md = bdev->bd_disk->private_data; 404 struct mapped_device *md = bdev->bd_disk->private_data;
390 struct dm_table *map = dm_get_table(md); 405 struct dm_table *map = dm_get_live_table(md);
391 struct dm_target *tgt; 406 struct dm_target *tgt;
392 int r = -ENOTTY; 407 int r = -ENOTTY;
393 408
@@ -400,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
400 415
401 tgt = dm_table_get_target(map, 0); 416 tgt = dm_table_get_target(map, 0);
402 417
403 if (dm_suspended(md)) { 418 if (dm_suspended_md(md)) {
404 r = -EAGAIN; 419 r = -EAGAIN;
405 goto out; 420 goto out;
406 } 421 }
@@ -429,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
429 mempool_free(tio, md->tio_pool); 444 mempool_free(tio, md->tio_pool);
430} 445}
431 446
432static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) 447static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
448 gfp_t gfp_mask)
433{ 449{
434 return mempool_alloc(md->tio_pool, GFP_ATOMIC); 450 return mempool_alloc(md->tio_pool, gfp_mask);
435} 451}
436 452
437static void free_rq_tio(struct dm_rq_target_io *tio) 453static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -449,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
449 mempool_free(info, info->tio->md->io_pool); 465 mempool_free(info, info->tio->md->io_pool);
450} 466}
451 467
468static int md_in_flight(struct mapped_device *md)
469{
470 return atomic_read(&md->pending[READ]) +
471 atomic_read(&md->pending[WRITE]);
472}
473
452static void start_io_acct(struct dm_io *io) 474static void start_io_acct(struct dm_io *io)
453{ 475{
454 struct mapped_device *md = io->md; 476 struct mapped_device *md = io->md;
@@ -511,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
511 * function to access the md->map field, and make sure they call 533 * function to access the md->map field, and make sure they call
512 * dm_table_put() when finished. 534 * dm_table_put() when finished.
513 */ 535 */
514struct dm_table *dm_get_table(struct mapped_device *md) 536struct dm_table *dm_get_live_table(struct mapped_device *md)
515{ 537{
516 struct dm_table *t; 538 struct dm_table *t;
517 unsigned long flags; 539 unsigned long flags;
@@ -578,8 +600,12 @@ static void dec_pending(struct dm_io *io, int error)
578 struct mapped_device *md = io->md; 600 struct mapped_device *md = io->md;
579 601
580 /* Push-back supersedes any I/O errors */ 602 /* Push-back supersedes any I/O errors */
581 if (error && !(io->error > 0 && __noflush_suspending(md))) 603 if (unlikely(error)) {
582 io->error = error; 604 spin_lock_irqsave(&io->endio_lock, flags);
605 if (!(io->error > 0 && __noflush_suspending(md)))
606 io->error = error;
607 spin_unlock_irqrestore(&io->endio_lock, flags);
608 }
583 609
584 if (atomic_dec_and_test(&io->io_count)) { 610 if (atomic_dec_and_test(&io->io_count)) {
585 if (io->error == DM_ENDIO_REQUEUE) { 611 if (io->error == DM_ENDIO_REQUEUE) {
@@ -609,8 +635,10 @@ static void dec_pending(struct dm_io *io, int error)
609 if (!md->barrier_error && io_error != -EOPNOTSUPP) 635 if (!md->barrier_error && io_error != -EOPNOTSUPP)
610 md->barrier_error = io_error; 636 md->barrier_error = io_error;
611 end_io_acct(io); 637 end_io_acct(io);
638 free_io(md, io);
612 } else { 639 } else {
613 end_io_acct(io); 640 end_io_acct(io);
641 free_io(md, io);
614 642
615 if (io_error != DM_ENDIO_REQUEUE) { 643 if (io_error != DM_ENDIO_REQUEUE) {
616 trace_block_bio_complete(md->queue, bio); 644 trace_block_bio_complete(md->queue, bio);
@@ -618,8 +646,6 @@ static void dec_pending(struct dm_io *io, int error)
618 bio_endio(bio, io_error); 646 bio_endio(bio, io_error);
619 } 647 }
620 } 648 }
621
622 free_io(md, io);
623 } 649 }
624} 650}
625 651
@@ -711,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error)
711 blk_update_request(tio->orig, 0, nr_bytes); 737 blk_update_request(tio->orig, 0, nr_bytes);
712} 738}
713 739
740static void store_barrier_error(struct mapped_device *md, int error)
741{
742 unsigned long flags;
743
744 spin_lock_irqsave(&md->barrier_error_lock, flags);
745 /*
746 * Basically, the first error is taken, but:
747 * -EOPNOTSUPP supersedes any I/O error.
748 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
749 */
750 if (!md->barrier_error || error == -EOPNOTSUPP ||
751 (md->barrier_error != -EOPNOTSUPP &&
752 error == DM_ENDIO_REQUEUE))
753 md->barrier_error = error;
754 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
755}
756
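Working through the precedence rules in store_barrier_error() with a hypothetical completion order of -EIO, DM_ENDIO_REQUEUE, -EOPNOTSUPP: barrier_error is first set to -EIO (the first error wins while it is still zero), then overwritten with DM_ENDIO_REQUEUE (requeue supersedes ordinary I/O errors), then overwritten with -EOPNOTSUPP, after which no later error or requeue changes it.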
714/* 757/*
715 * Don't touch any member of the md after calling this function because 758 * Don't touch any member of the md after calling this function because
716 * the md may be freed in dm_put() at the end of this function. 759 * the md may be freed in dm_put() at the end of this function.
717 * Or do dm_get() before calling this function and dm_put() later. 760 * Or do dm_get() before calling this function and dm_put() later.
718 */ 761 */
719static void rq_completed(struct mapped_device *md, int run_queue) 762static void rq_completed(struct mapped_device *md, int rw, int run_queue)
720{ 763{
721 int wakeup_waiters = 0; 764 atomic_dec(&md->pending[rw]);
722 struct request_queue *q = md->queue;
723 unsigned long flags;
724
725 spin_lock_irqsave(q->queue_lock, flags);
726 if (!queue_in_flight(q))
727 wakeup_waiters = 1;
728 spin_unlock_irqrestore(q->queue_lock, flags);
729 765
730 /* nudge anyone waiting on suspend queue */ 766 /* nudge anyone waiting on suspend queue */
731 if (wakeup_waiters) 767 if (!md_in_flight(md))
732 wake_up(&md->wait); 768 wake_up(&md->wait);
733 769
734 if (run_queue) 770 if (run_queue)
735 blk_run_queue(q); 771 blk_run_queue(md->queue);
736 772
737 /* 773 /*
738 * dm_put() must be at the end of this function. See the comment above 774 * dm_put() must be at the end of this function. See the comment above
@@ -748,6 +784,44 @@ static void free_rq_clone(struct request *clone)
748 free_rq_tio(tio); 784 free_rq_tio(tio);
749} 785}
750 786
787/*
788 * Complete the clone and the original request.
789 * Must be called without queue lock.
790 */
791static void dm_end_request(struct request *clone, int error)
792{
793 int rw = rq_data_dir(clone);
794 int run_queue = 1;
795 bool is_barrier = blk_barrier_rq(clone);
796 struct dm_rq_target_io *tio = clone->end_io_data;
797 struct mapped_device *md = tio->md;
798 struct request *rq = tio->orig;
799
800 if (blk_pc_request(rq) && !is_barrier) {
801 rq->errors = clone->errors;
802 rq->resid_len = clone->resid_len;
803
804 if (rq->sense)
805 /*
806 * We are using the sense buffer of the original
807 * request.
808 * So setting the length of the sense data is enough.
809 */
810 rq->sense_len = clone->sense_len;
811 }
812
813 free_rq_clone(clone);
814
815 if (unlikely(is_barrier)) {
816 if (unlikely(error))
817 store_barrier_error(md, error);
818 run_queue = 0;
819 } else
820 blk_end_request_all(rq, error);
821
822 rq_completed(md, rw, run_queue);
823}
824
751static void dm_unprep_request(struct request *rq) 825static void dm_unprep_request(struct request *rq)
752{ 826{
753 struct request *clone = rq->special; 827 struct request *clone = rq->special;
@@ -763,12 +837,23 @@ static void dm_unprep_request(struct request *rq)
763 */ 837 */
764void dm_requeue_unmapped_request(struct request *clone) 838void dm_requeue_unmapped_request(struct request *clone)
765{ 839{
840 int rw = rq_data_dir(clone);
766 struct dm_rq_target_io *tio = clone->end_io_data; 841 struct dm_rq_target_io *tio = clone->end_io_data;
767 struct mapped_device *md = tio->md; 842 struct mapped_device *md = tio->md;
768 struct request *rq = tio->orig; 843 struct request *rq = tio->orig;
769 struct request_queue *q = rq->q; 844 struct request_queue *q = rq->q;
770 unsigned long flags; 845 unsigned long flags;
771 846
847 if (unlikely(blk_barrier_rq(clone))) {
848 /*
849 * Barrier clones share an original request.
850 * Leave it to dm_end_request(), which handles this special
851 * case.
852 */
853 dm_end_request(clone, DM_ENDIO_REQUEUE);
854 return;
855 }
856
772 dm_unprep_request(rq); 857 dm_unprep_request(rq);
773 858
774 spin_lock_irqsave(q->queue_lock, flags); 859 spin_lock_irqsave(q->queue_lock, flags);
@@ -777,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone)
777 blk_requeue_request(q, rq); 862 blk_requeue_request(q, rq);
778 spin_unlock_irqrestore(q->queue_lock, flags); 863 spin_unlock_irqrestore(q->queue_lock, flags);
779 864
780 rq_completed(md, 0); 865 rq_completed(md, rw, 0);
781} 866}
782EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 867EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
783 868
@@ -810,34 +895,28 @@ static void start_queue(struct request_queue *q)
810 spin_unlock_irqrestore(q->queue_lock, flags); 895 spin_unlock_irqrestore(q->queue_lock, flags);
811} 896}
812 897
813/* 898static void dm_done(struct request *clone, int error, bool mapped)
814 * Complete the clone and the original request.
815 * Must be called without queue lock.
816 */
817static void dm_end_request(struct request *clone, int error)
818{ 899{
900 int r = error;
819 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
820 struct mapped_device *md = tio->md; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
821 struct request *rq = tio->orig;
822 903
823 if (blk_pc_request(rq)) { 904 if (mapped && rq_end_io)
824 rq->errors = clone->errors; 905 r = rq_end_io(tio->ti, clone, error, &tio->info);
825 rq->resid_len = clone->resid_len;
826 906
827 if (rq->sense) 907 if (r <= 0)
828 /* 908 /* The target wants to complete the I/O */
829 * We are using the sense buffer of the original 909 dm_end_request(clone, r);
830 * request. 910 else if (r == DM_ENDIO_INCOMPLETE)
831 * So setting the length of the sense data is enough. 911 /* The target will handle the I/O */
832 */ 912 return;
833 rq->sense_len = clone->sense_len; 913 else if (r == DM_ENDIO_REQUEUE)
914 /* The target wants to requeue the I/O */
915 dm_requeue_unmapped_request(clone);
916 else {
917 DMWARN("unimplemented target endio return value: %d", r);
918 BUG();
834 } 919 }
835
836 free_rq_clone(clone);
837
838 blk_end_request_all(rq, error);
839
840 rq_completed(md, 1);
841} 920}
842 921
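A sketch, not taken from the patch, of a request-based target's rq_end_io hook to show the return values dm_done() dispatches on: anything <= 0 completes both the clone and the original request, DM_ENDIO_INCOMPLETE means the target will finish the I/O itself, and DM_ENDIO_REQUEUE sends the original back to the queue. The -EBUSY policy below is purely illustrative.

static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	/* Hypothetical policy: ask device-mapper to retry busy conditions */
	if (error == -EBUSY)
		return DM_ENDIO_REQUEUE;

	/* Otherwise complete the clone and the original, keeping the error */
	return error;
}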
843/* 922/*
@@ -845,27 +924,14 @@ static void dm_end_request(struct request *clone, int error)
845 */ 924 */
846static void dm_softirq_done(struct request *rq) 925static void dm_softirq_done(struct request *rq)
847{ 926{
927 bool mapped = true;
848 struct request *clone = rq->completion_data; 928 struct request *clone = rq->completion_data;
849 struct dm_rq_target_io *tio = clone->end_io_data; 929 struct dm_rq_target_io *tio = clone->end_io_data;
850 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
851 int error = tio->error;
852 930
853 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) 931 if (rq->cmd_flags & REQ_FAILED)
854 error = rq_end_io(tio->ti, clone, error, &tio->info); 932 mapped = false;
855 933
856 if (error <= 0) 934 dm_done(clone, tio->error, mapped);
857 /* The target wants to complete the I/O */
858 dm_end_request(clone, error);
859 else if (error == DM_ENDIO_INCOMPLETE)
860 /* The target will handle the I/O */
861 return;
862 else if (error == DM_ENDIO_REQUEUE)
863 /* The target wants to requeue the I/O */
864 dm_requeue_unmapped_request(clone);
865 else {
866 DMWARN("unimplemented target endio return value: %d", error);
867 BUG();
868 }
869} 935}
870 936
871/* 937/*
@@ -877,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error)
877 struct dm_rq_target_io *tio = clone->end_io_data; 943 struct dm_rq_target_io *tio = clone->end_io_data;
878 struct request *rq = tio->orig; 944 struct request *rq = tio->orig;
879 945
946 if (unlikely(blk_barrier_rq(clone))) {
947 /*
948 * Barrier clones share an original request. So can't use
949 * softirq_done with the original.
950 * Pass the clone to dm_done() directly in this special case.
951 * It is safe (even if clone->q->queue_lock is held here)
952 * because there is no I/O dispatching during the completion
953 * of barrier clone.
954 */
955 dm_done(clone, error, true);
956 return;
957 }
958
880 tio->error = error; 959 tio->error = error;
881 rq->completion_data = clone; 960 rq->completion_data = clone;
882 blk_complete_request(rq); 961 blk_complete_request(rq);
@@ -893,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
893 struct dm_rq_target_io *tio = clone->end_io_data; 972 struct dm_rq_target_io *tio = clone->end_io_data;
894 struct request *rq = tio->orig; 973 struct request *rq = tio->orig;
895 974
975 if (unlikely(blk_barrier_rq(clone))) {
976 /*
977 * Barrier clones share an original request.
978 * Leave it to dm_end_request(), which handles this special
979 * case.
980 */
981 BUG_ON(error > 0);
982 dm_end_request(clone, error);
983 return;
984 }
985
896 rq->cmd_flags |= REQ_FAILED; 986 rq->cmd_flags |= REQ_FAILED;
897 dm_complete_request(clone, error); 987 dm_complete_request(clone, error);
898} 988}
@@ -1209,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1209 struct clone_info ci; 1299 struct clone_info ci;
1210 int error = 0; 1300 int error = 0;
1211 1301
1212 ci.map = dm_get_table(md); 1302 ci.map = dm_get_live_table(md);
1213 if (unlikely(!ci.map)) { 1303 if (unlikely(!ci.map)) {
1214 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1215 bio_io_error(bio); 1305 bio_io_error(bio);
@@ -1226,6 +1316,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1226 atomic_set(&ci.io->io_count, 1); 1316 atomic_set(&ci.io->io_count, 1);
1227 ci.io->bio = bio; 1317 ci.io->bio = bio;
1228 ci.io->md = md; 1318 ci.io->md = md;
1319 spin_lock_init(&ci.io->endio_lock);
1229 ci.sector = bio->bi_sector; 1320 ci.sector = bio->bi_sector;
1230 ci.sector_count = bio_sectors(bio); 1321 ci.sector_count = bio_sectors(bio);
1231 if (unlikely(bio_empty_barrier(bio))) 1322 if (unlikely(bio_empty_barrier(bio)))
@@ -1249,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q,
1249 struct bio_vec *biovec) 1340 struct bio_vec *biovec)
1250{ 1341{
1251 struct mapped_device *md = q->queuedata; 1342 struct mapped_device *md = q->queuedata;
1252 struct dm_table *map = dm_get_table(md); 1343 struct dm_table *map = dm_get_live_table(md);
1253 struct dm_target *ti; 1344 struct dm_target *ti;
1254 sector_t max_sectors; 1345 sector_t max_sectors;
1255 int max_size = 0; 1346 int max_size = 0;
@@ -1346,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1346{ 1437{
1347 struct mapped_device *md = q->queuedata; 1438 struct mapped_device *md = q->queuedata;
1348 1439
1349 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1350 bio_endio(bio, -EOPNOTSUPP);
1351 return 0;
1352 }
1353
1354 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1355} 1441}
1356 1442
@@ -1369,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1369 return _dm_request(q, bio); 1455 return _dm_request(q, bio);
1370} 1456}
1371 1457
1458/*
1459 * Mark this request as flush request, so that dm_request_fn() can
1460 * recognize.
1461 */
1462static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
1463{
1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
1465 rq->cmd[0] = REQ_LB_OP_FLUSH;
1466}
1467
1468static bool dm_rq_is_flush_request(struct request *rq)
1469{
1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
1471 rq->cmd[0] == REQ_LB_OP_FLUSH)
1472 return true;
1473 else
1474 return false;
1475}
1476
1372void dm_dispatch_request(struct request *rq) 1477void dm_dispatch_request(struct request *rq)
1373{ 1478{
1374 int r; 1479 int r;
@@ -1414,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1414static int setup_clone(struct request *clone, struct request *rq, 1519static int setup_clone(struct request *clone, struct request *rq,
1415 struct dm_rq_target_io *tio) 1520 struct dm_rq_target_io *tio)
1416{ 1521{
1417 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1522 int r;
1418 dm_rq_bio_constructor, tio);
1419 1523
1420 if (r) 1524 if (dm_rq_is_flush_request(rq)) {
1421 return r; 1525 blk_rq_init(NULL, clone);
1526 clone->cmd_type = REQ_TYPE_FS;
1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1528 } else {
1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1530 dm_rq_bio_constructor, tio);
1531 if (r)
1532 return r;
1533
1534 clone->cmd = rq->cmd;
1535 clone->cmd_len = rq->cmd_len;
1536 clone->sense = rq->sense;
1537 clone->buffer = rq->buffer;
1538 }
1422 1539
1423 clone->cmd = rq->cmd;
1424 clone->cmd_len = rq->cmd_len;
1425 clone->sense = rq->sense;
1426 clone->buffer = rq->buffer;
1427 clone->end_io = end_clone_request; 1540 clone->end_io = end_clone_request;
1428 clone->end_io_data = tio; 1541 clone->end_io_data = tio;
1429 1542
1430 return 0; 1543 return 0;
1431} 1544}
1432 1545
1433static int dm_rq_flush_suspending(struct mapped_device *md) 1546static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1547 gfp_t gfp_mask)
1434{ 1548{
1435 return !md->suspend_rq.special; 1549 struct request *clone;
1550 struct dm_rq_target_io *tio;
1551
1552 tio = alloc_rq_tio(md, gfp_mask);
1553 if (!tio)
1554 return NULL;
1555
1556 tio->md = md;
1557 tio->ti = NULL;
1558 tio->orig = rq;
1559 tio->error = 0;
1560 memset(&tio->info, 0, sizeof(tio->info));
1561
1562 clone = &tio->clone;
1563 if (setup_clone(clone, rq, tio)) {
1564 /* -ENOMEM */
1565 free_rq_tio(tio);
1566 return NULL;
1567 }
1568
1569 return clone;
1436} 1570}
1437 1571
1438/* 1572/*
@@ -1441,51 +1575,35 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
1441static int dm_prep_fn(struct request_queue *q, struct request *rq) 1575static int dm_prep_fn(struct request_queue *q, struct request *rq)
1442{ 1576{
1443 struct mapped_device *md = q->queuedata; 1577 struct mapped_device *md = q->queuedata;
1444 struct dm_rq_target_io *tio;
1445 struct request *clone; 1578 struct request *clone;
1446 1579
1447 if (unlikely(rq == &md->suspend_rq)) { 1580 if (unlikely(dm_rq_is_flush_request(rq)))
1448 if (dm_rq_flush_suspending(md)) 1581 return BLKPREP_OK;
1449 return BLKPREP_OK;
1450 else
1451 /* The flush suspend was interrupted */
1452 return BLKPREP_KILL;
1453 }
1454 1582
1455 if (unlikely(rq->special)) { 1583 if (unlikely(rq->special)) {
1456 DMWARN("Already has something in rq->special."); 1584 DMWARN("Already has something in rq->special.");
1457 return BLKPREP_KILL; 1585 return BLKPREP_KILL;
1458 } 1586 }
1459 1587
1460 tio = alloc_rq_tio(md); /* Only one for each original request */ 1588 clone = clone_rq(rq, md, GFP_ATOMIC);
1461 if (!tio) 1589 if (!clone)
1462 /* -ENOMEM */
1463 return BLKPREP_DEFER; 1590 return BLKPREP_DEFER;
1464 1591
1465 tio->md = md;
1466 tio->ti = NULL;
1467 tio->orig = rq;
1468 tio->error = 0;
1469 memset(&tio->info, 0, sizeof(tio->info));
1470
1471 clone = &tio->clone;
1472 if (setup_clone(clone, rq, tio)) {
1473 /* -ENOMEM */
1474 free_rq_tio(tio);
1475 return BLKPREP_DEFER;
1476 }
1477
1478 rq->special = clone; 1592 rq->special = clone;
1479 rq->cmd_flags |= REQ_DONTPREP; 1593 rq->cmd_flags |= REQ_DONTPREP;
1480 1594
1481 return BLKPREP_OK; 1595 return BLKPREP_OK;
1482} 1596}
1483 1597
1484static void map_request(struct dm_target *ti, struct request *rq, 1598/*
1485 struct mapped_device *md) 1599 * Returns:
1600 * 0 : the request has been processed (not requeued)
1601 * !0 : the request has been requeued
1602 */
1603static int map_request(struct dm_target *ti, struct request *clone,
1604 struct mapped_device *md)
1486{ 1605{
1487 int r; 1606 int r, requeued = 0;
1488 struct request *clone = rq->special;
1489 struct dm_rq_target_io *tio = clone->end_io_data; 1607 struct dm_rq_target_io *tio = clone->end_io_data;
1490 1608
1491 /* 1609 /*
@@ -1505,11 +1623,14 @@ static void map_request(struct dm_target *ti, struct request *rq,
1505 break; 1623 break;
1506 case DM_MAPIO_REMAPPED: 1624 case DM_MAPIO_REMAPPED:
1507 /* The target has remapped the I/O so dispatch it */ 1625 /* The target has remapped the I/O so dispatch it */
1626 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1627 blk_rq_pos(tio->orig));
1508 dm_dispatch_request(clone); 1628 dm_dispatch_request(clone);
1509 break; 1629 break;
1510 case DM_MAPIO_REQUEUE: 1630 case DM_MAPIO_REQUEUE:
1511 /* The target wants to requeue the I/O */ 1631 /* The target wants to requeue the I/O */
1512 dm_requeue_unmapped_request(clone); 1632 dm_requeue_unmapped_request(clone);
1633 requeued = 1;
1513 break; 1634 break;
1514 default: 1635 default:
1515 if (r > 0) { 1636 if (r > 0) {
@@ -1521,6 +1642,8 @@ static void map_request(struct dm_target *ti, struct request *rq,
1521 dm_kill_unmapped_request(clone, r); 1642 dm_kill_unmapped_request(clone, r);
1522 break; 1643 break;
1523 } 1644 }
1645
1646 return requeued;
1524} 1647}
1525 1648
1526/* 1649/*
@@ -1530,29 +1653,26 @@ static void map_request(struct dm_target *ti, struct request *rq,
1530static void dm_request_fn(struct request_queue *q) 1653static void dm_request_fn(struct request_queue *q)
1531{ 1654{
1532 struct mapped_device *md = q->queuedata; 1655 struct mapped_device *md = q->queuedata;
1533 struct dm_table *map = dm_get_table(md); 1656 struct dm_table *map = dm_get_live_table(md);
1534 struct dm_target *ti; 1657 struct dm_target *ti;
1535 struct request *rq; 1658 struct request *rq, *clone;
1536 1659
1537 /* 1660 /*
1538 * For noflush suspend, check blk_queue_stopped() to immediately 1661 * For suspend, check blk_queue_stopped() and increment
1539 * quit I/O dispatching. 1662 * ->pending within a single queue_lock not to increment the
1663 * number of in-flight I/Os after the queue is stopped in
1664 * dm_suspend().
1540 */ 1665 */
1541 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1666 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1542 rq = blk_peek_request(q); 1667 rq = blk_peek_request(q);
1543 if (!rq) 1668 if (!rq)
1544 goto plug_and_out; 1669 goto plug_and_out;
1545 1670
1546 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ 1671 if (unlikely(dm_rq_is_flush_request(rq))) {
1547 if (queue_in_flight(q)) 1672 BUG_ON(md->flush_request);
1548 /* Not quiet yet. Wait more */ 1673 md->flush_request = rq;
1549 goto plug_and_out;
1550
1551 /* This device should be quiet now */
1552 __stop_queue(q);
1553 blk_start_request(rq); 1674 blk_start_request(rq);
1554 __blk_end_request_all(rq, 0); 1675 queue_work(md->wq, &md->barrier_work);
1555 wake_up(&md->wait);
1556 goto out; 1676 goto out;
1557 } 1677 }
1558 1678
@@ -1561,13 +1681,21 @@ static void dm_request_fn(struct request_queue *q)
1561 goto plug_and_out; 1681 goto plug_and_out;
1562 1682
1563 blk_start_request(rq); 1683 blk_start_request(rq);
1684 clone = rq->special;
1685 atomic_inc(&md->pending[rq_data_dir(clone)]);
1686
1564 spin_unlock(q->queue_lock); 1687 spin_unlock(q->queue_lock);
1565 map_request(ti, rq, md); 1688 if (map_request(ti, clone, md))
1689 goto requeued;
1690
1566 spin_lock_irq(q->queue_lock); 1691 spin_lock_irq(q->queue_lock);
1567 } 1692 }
1568 1693
1569 goto out; 1694 goto out;
1570 1695
1696requeued:
1697 spin_lock_irq(q->queue_lock);
1698
1571plug_and_out: 1699plug_and_out:
1572 if (!elv_queue_empty(q)) 1700 if (!elv_queue_empty(q))
1573 /* Some requests still remain, retry later */ 1701 /* Some requests still remain, retry later */
@@ -1589,7 +1717,7 @@ static int dm_lld_busy(struct request_queue *q)
1589{ 1717{
1590 int r; 1718 int r;
1591 struct mapped_device *md = q->queuedata; 1719 struct mapped_device *md = q->queuedata;
1592 struct dm_table *map = dm_get_table(md); 1720 struct dm_table *map = dm_get_live_table(md);
1593 1721
1594 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1722 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1595 r = 1; 1723 r = 1;
@@ -1604,7 +1732,7 @@ static int dm_lld_busy(struct request_queue *q)
1604static void dm_unplug_all(struct request_queue *q) 1732static void dm_unplug_all(struct request_queue *q)
1605{ 1733{
1606 struct mapped_device *md = q->queuedata; 1734 struct mapped_device *md = q->queuedata;
1607 struct dm_table *map = dm_get_table(md); 1735 struct dm_table *map = dm_get_live_table(md);
1608 1736
1609 if (map) { 1737 if (map) {
1610 if (dm_request_based(md)) 1738 if (dm_request_based(md))
@@ -1622,7 +1750,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1622 struct dm_table *map; 1750 struct dm_table *map;
1623 1751
1624 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1752 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1625 map = dm_get_table(md); 1753 map = dm_get_live_table(md);
1626 if (map) { 1754 if (map) {
1627 /* 1755 /*
1628 * Request-based dm cares about only own queue for 1756 * Request-based dm cares about only own queue for
@@ -1719,6 +1847,7 @@ out:
1719static const struct block_device_operations dm_blk_dops; 1847static const struct block_device_operations dm_blk_dops;
1720 1848
1721static void dm_wq_work(struct work_struct *work); 1849static void dm_wq_work(struct work_struct *work);
1850static void dm_rq_barrier_work(struct work_struct *work);
1722 1851
1723/* 1852/*
1724 * Allocate and initialise a blank device with a given minor. 1853 * Allocate and initialise a blank device with a given minor.
@@ -1748,6 +1877,7 @@ static struct mapped_device *alloc_dev(int minor)
1748 init_rwsem(&md->io_lock); 1877 init_rwsem(&md->io_lock);
1749 mutex_init(&md->suspend_lock); 1878 mutex_init(&md->suspend_lock);
1750 spin_lock_init(&md->deferred_lock); 1879 spin_lock_init(&md->deferred_lock);
1880 spin_lock_init(&md->barrier_error_lock);
1751 rwlock_init(&md->map_lock); 1881 rwlock_init(&md->map_lock);
1752 atomic_set(&md->holders, 1); 1882 atomic_set(&md->holders, 1);
1753 atomic_set(&md->open_count, 0); 1883 atomic_set(&md->open_count, 0);
@@ -1782,6 +1912,8 @@ static struct mapped_device *alloc_dev(int minor)
1782 blk_queue_softirq_done(md->queue, dm_softirq_done); 1912 blk_queue_softirq_done(md->queue, dm_softirq_done);
1783 blk_queue_prep_rq(md->queue, dm_prep_fn); 1913 blk_queue_prep_rq(md->queue, dm_prep_fn);
1784 blk_queue_lld_busy(md->queue, dm_lld_busy); 1914 blk_queue_lld_busy(md->queue, dm_lld_busy);
1915 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
1916 dm_rq_prepare_flush);
1785 1917
1786 md->disk = alloc_disk(1); 1918 md->disk = alloc_disk(1);
1787 if (!md->disk) 1919 if (!md->disk)
@@ -1791,6 +1923,7 @@ static struct mapped_device *alloc_dev(int minor)
1791 atomic_set(&md->pending[1], 0); 1923 atomic_set(&md->pending[1], 0);
1792 init_waitqueue_head(&md->wait); 1924 init_waitqueue_head(&md->wait);
1793 INIT_WORK(&md->work, dm_wq_work); 1925 INIT_WORK(&md->work, dm_wq_work);
1926 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1794 init_waitqueue_head(&md->eventq); 1927 init_waitqueue_head(&md->eventq);
1795 1928
1796 md->disk->major = _major; 1929 md->disk->major = _major;
@@ -1822,6 +1955,7 @@ static struct mapped_device *alloc_dev(int minor)
1822bad_bdev: 1955bad_bdev:
1823 destroy_workqueue(md->wq); 1956 destroy_workqueue(md->wq);
1824bad_thread: 1957bad_thread:
1958 del_gendisk(md->disk);
1825 put_disk(md->disk); 1959 put_disk(md->disk);
1826bad_disk: 1960bad_disk:
1827 blk_cleanup_queue(md->queue); 1961 blk_cleanup_queue(md->queue);
@@ -1914,9 +2048,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
1914 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2048 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1915} 2049}
1916 2050
1917static int __bind(struct mapped_device *md, struct dm_table *t, 2051/*
1918 struct queue_limits *limits) 2052 * Returns old map, which caller must destroy.
2053 */
2054static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2055 struct queue_limits *limits)
1919{ 2056{
2057 struct dm_table *old_map;
1920 struct request_queue *q = md->queue; 2058 struct request_queue *q = md->queue;
1921 sector_t size; 2059 sector_t size;
1922 unsigned long flags; 2060 unsigned long flags;
@@ -1931,11 +2069,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1931 2069
1932 __set_size(md, size); 2070 __set_size(md, size);
1933 2071
1934 if (!size) {
1935 dm_table_destroy(t);
1936 return 0;
1937 }
1938
1939 dm_table_event_callback(t, event_callback, md); 2072 dm_table_event_callback(t, event_callback, md);
1940 2073
1941 /* 2074 /*
@@ -1951,26 +2084,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1951 __bind_mempools(md, t); 2084 __bind_mempools(md, t);
1952 2085
1953 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2087 old_map = md->map;
1954 md->map = t; 2088 md->map = t;
1955 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
1956 write_unlock_irqrestore(&md->map_lock, flags); 2090 write_unlock_irqrestore(&md->map_lock, flags);
1957 2091
1958 return 0; 2092 return old_map;
1959} 2093}
1960 2094
1961static void __unbind(struct mapped_device *md) 2095/*
2096 * Returns unbound table for the caller to free.
2097 */
2098static struct dm_table *__unbind(struct mapped_device *md)
1962{ 2099{
1963 struct dm_table *map = md->map; 2100 struct dm_table *map = md->map;
1964 unsigned long flags; 2101 unsigned long flags;
1965 2102
1966 if (!map) 2103 if (!map)
1967 return; 2104 return NULL;
1968 2105
1969 dm_table_event_callback(map, NULL, NULL); 2106 dm_table_event_callback(map, NULL, NULL);
1970 write_lock_irqsave(&md->map_lock, flags); 2107 write_lock_irqsave(&md->map_lock, flags);
1971 md->map = NULL; 2108 md->map = NULL;
1972 write_unlock_irqrestore(&md->map_lock, flags); 2109 write_unlock_irqrestore(&md->map_lock, flags);
1973 dm_table_destroy(map); 2110
2111 return map;
1974} 2112}
1975 2113
1976/* 2114/*
@@ -2052,18 +2190,18 @@ void dm_put(struct mapped_device *md)
2052 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2190 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2053 2191
2054 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2192 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
2055 map = dm_get_table(md); 2193 map = dm_get_live_table(md);
2056 idr_replace(&_minor_idr, MINOR_ALLOCED, 2194 idr_replace(&_minor_idr, MINOR_ALLOCED,
2057 MINOR(disk_devt(dm_disk(md)))); 2195 MINOR(disk_devt(dm_disk(md))));
2058 set_bit(DMF_FREEING, &md->flags); 2196 set_bit(DMF_FREEING, &md->flags);
2059 spin_unlock(&_minor_lock); 2197 spin_unlock(&_minor_lock);
2060 if (!dm_suspended(md)) { 2198 if (!dm_suspended_md(md)) {
2061 dm_table_presuspend_targets(map); 2199 dm_table_presuspend_targets(map);
2062 dm_table_postsuspend_targets(map); 2200 dm_table_postsuspend_targets(map);
2063 } 2201 }
2064 dm_sysfs_exit(md); 2202 dm_sysfs_exit(md);
2065 dm_table_put(map); 2203 dm_table_put(map);
2066 __unbind(md); 2204 dm_table_destroy(__unbind(md));
2067 free_dev(md); 2205 free_dev(md);
2068 } 2206 }
2069} 2207}
@@ -2073,8 +2211,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2073{ 2211{
2074 int r = 0; 2212 int r = 0;
2075 DECLARE_WAITQUEUE(wait, current); 2213 DECLARE_WAITQUEUE(wait, current);
2076 struct request_queue *q = md->queue;
2077 unsigned long flags;
2078 2214
2079 dm_unplug_all(md->queue); 2215 dm_unplug_all(md->queue);
2080 2216
@@ -2084,15 +2220,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2084 set_current_state(interruptible); 2220 set_current_state(interruptible);
2085 2221
2086 smp_mb(); 2222 smp_mb();
2087 if (dm_request_based(md)) { 2223 if (!md_in_flight(md))
2088 spin_lock_irqsave(q->queue_lock, flags);
2089 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2090 spin_unlock_irqrestore(q->queue_lock, flags);
2091 break;
2092 }
2093 spin_unlock_irqrestore(q->queue_lock, flags);
2094 } else if (!atomic_read(&md->pending[0]) &&
2095 !atomic_read(&md->pending[1]))
2096 break; 2224 break;
2097 2225
2098 if (interruptible == TASK_INTERRUPTIBLE && 2226 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -2187,98 +2315,106 @@ static void dm_queue_flush(struct mapped_device *md)
2187 queue_work(md->wq, &md->work); 2315 queue_work(md->wq, &md->work);
2188} 2316}
2189 2317
2190/* 2318static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
2191 * Swap in a new table (destroying old one).
2192 */
2193int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2194{ 2319{
2195 struct queue_limits limits; 2320 struct dm_rq_target_io *tio = clone->end_io_data;
2196 int r = -EINVAL;
2197 2321
2198 mutex_lock(&md->suspend_lock); 2322 tio->info.flush_request = flush_nr;
2323}
2199 2324
2200 /* device must be suspended */ 2325/* Issue barrier requests to targets and wait for their completion. */
2201 if (!dm_suspended(md)) 2326static int dm_rq_barrier(struct mapped_device *md)
2202 goto out; 2327{
2328 int i, j;
2329 struct dm_table *map = dm_get_live_table(md);
2330 unsigned num_targets = dm_table_get_num_targets(map);
2331 struct dm_target *ti;
2332 struct request *clone;
2203 2333
2204 r = dm_calculate_queue_limits(table, &limits); 2334 md->barrier_error = 0;
2205 if (r)
2206 goto out;
2207 2335
2208 /* cannot change the device type, once a table is bound */ 2336 for (i = 0; i < num_targets; i++) {
2209 if (md->map && 2337 ti = dm_table_get_target(map, i);
2210 (dm_table_get_type(md->map) != dm_table_get_type(table))) { 2338 for (j = 0; j < ti->num_flush_requests; j++) {
2211 DMWARN("can't change the device type after a table is bound"); 2339 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2212 goto out; 2340 dm_rq_set_flush_nr(clone, j);
2341 atomic_inc(&md->pending[rq_data_dir(clone)]);
2342 map_request(ti, clone, md);
2343 }
2213 } 2344 }
2214 2345
2215 __unbind(md); 2346 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2216 r = __bind(md, table, &limits); 2347 dm_table_put(map);
2217
2218out:
2219 mutex_unlock(&md->suspend_lock);
2220 return r;
2221}
2222 2348
2223static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) 2349 return md->barrier_error;
2224{
2225 md->suspend_rq.special = (void *)0x1;
2226} 2350}
2227 2351
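For example, a target registered with num_flush_requests == 2 is handed two barrier clones by dm_rq_barrier() for each flush, with tio->info.flush_request set to 0 and 1 by dm_rq_set_flush_nr(); md->pending is incremented once per clone, so the dm_wait_for_completion() call waits until every one of them has finished before the stored barrier_error is returned.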
2228static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) 2352static void dm_rq_barrier_work(struct work_struct *work)
2229{ 2353{
2354 int error;
2355 struct mapped_device *md = container_of(work, struct mapped_device,
2356 barrier_work);
2230 struct request_queue *q = md->queue; 2357 struct request_queue *q = md->queue;
2358 struct request *rq;
2231 unsigned long flags; 2359 unsigned long flags;
2232 2360
2233 spin_lock_irqsave(q->queue_lock, flags); 2361 /*
2234 if (!noflush) 2362 * Hold the md reference here and leave it at the last part so that
2235 dm_rq_invalidate_suspend_marker(md); 2363 * the md can't be deleted by device opener when the barrier request
2236 __start_queue(q); 2364 * completes.
2237 spin_unlock_irqrestore(q->queue_lock, flags); 2365 */
2238} 2366 dm_get(md);
2239 2367
2240static void dm_rq_start_suspend(struct mapped_device *md, int noflush) 2368 error = dm_rq_barrier(md);
2241{
2242 struct request *rq = &md->suspend_rq;
2243 struct request_queue *q = md->queue;
2244 2369
2245 if (noflush) 2370 rq = md->flush_request;
2246 stop_queue(q); 2371 md->flush_request = NULL;
2247 else { 2372
2248 blk_rq_init(q, rq); 2373 if (error == DM_ENDIO_REQUEUE) {
2249 blk_insert_request(q, rq, 0, NULL); 2374 spin_lock_irqsave(q->queue_lock, flags);
2250 } 2375 blk_requeue_request(q, rq);
2376 spin_unlock_irqrestore(q->queue_lock, flags);
2377 } else
2378 blk_end_request_all(rq, error);
2379
2380 blk_run_queue(q);
2381
2382 dm_put(md);
2251} 2383}
2252 2384
2253static int dm_rq_suspend_available(struct mapped_device *md, int noflush) 2385/*
2386 * Swap in a new table, returning the old one for the caller to destroy.
2387 */
2388struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2254{ 2389{
2255 int r = 1; 2390 struct dm_table *map = ERR_PTR(-EINVAL);
2256 struct request *rq = &md->suspend_rq; 2391 struct queue_limits limits;
2257 struct request_queue *q = md->queue; 2392 int r;
2258 unsigned long flags;
2259 2393
2260 if (noflush) 2394 mutex_lock(&md->suspend_lock);
2261 return r;
2262 2395
2263 /* The marker must be protected by queue lock if it is in use */ 2396 /* device must be suspended */
2264 spin_lock_irqsave(q->queue_lock, flags); 2397 if (!dm_suspended_md(md))
2265 if (unlikely(rq->ref_count)) { 2398 goto out;
2266 /* 2399
2267 * This can happen, when the previous flush suspend was 2400 r = dm_calculate_queue_limits(table, &limits);
2268 * interrupted, the marker is still in the queue and 2401 if (r) {
2269 * this flush suspend has been invoked, because we don't 2402 map = ERR_PTR(r);
2270 * remove the marker at the time of suspend interruption. 2403 goto out;
2271 * We have only one marker per mapped_device, so we can't
2272 * start another flush suspend while it is in use.
2273 */
2274 BUG_ON(!rq->special); /* The marker should be invalidated */
2275 DMWARN("Invalidating the previous flush suspend is still in"
2276 " progress. Please retry later.");
2277 r = 0;
2278 } 2404 }
2279 spin_unlock_irqrestore(q->queue_lock, flags);
2280 2405
2281 return r; 2406 /* cannot change the device type, once a table is bound */
2407 if (md->map &&
2408 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2409 DMWARN("can't change the device type after a table is bound");
2410 goto out;
2411 }
2412
2413 map = __bind(md, table, &limits);
2414
2415out:
2416 mutex_unlock(&md->suspend_lock);
2417 return map;
2282} 2418}
2283 2419
2284/* 2420/*
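The hunk above reworks dm_swap_table() so that it returns the previous live table (or an ERR_PTR() on failure) instead of an integer status, leaving destruction of the old table to the caller. A minimal caller sketch, assuming the usual in-tree context (linux/err.h and dm_table_destroy()); the wrapper name is hypothetical and not part of the patch:

static int hypothetical_replace_table(struct mapped_device *md,
                                      struct dm_table *new_table)
{
        struct dm_table *old_map;

        /* dm_swap_table() itself verifies that md is suspended */
        old_map = dm_swap_table(md, new_table);
        if (IS_ERR(old_map))
                return PTR_ERR(old_map);

        /* the caller now owns the previous table (possibly NULL) */
        if (old_map)
                dm_table_destroy(old_map);

        return 0;
}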
@@ -2323,49 +2459,11 @@ static void unlock_fs(struct mapped_device *md)
2323/* 2459/*
2324 * Suspend mechanism in request-based dm. 2460 * Suspend mechanism in request-based dm.
2325 * 2461 *
2326 * After the suspend starts, further incoming requests are kept in 2462 * 1. Flush all I/Os by lock_fs() if needed.
2327 * the request_queue and deferred. 2463 * 2. Stop dispatching any I/O by stopping the request_queue.
2328 * Remaining requests in the request_queue at the start of suspend are flushed 2464 * 3. Wait for all in-flight I/Os to be completed or requeued.
2329 * if it is flush suspend.
2330 * The suspend completes when the following conditions have been satisfied,
2331 * so wait for it:
2332 * 1. q->in_flight is 0 (which means no in_flight request)
2333 * 2. queue has been stopped (which means no request dispatching)
2334 * 2465 *
2335 * 2466 * To abort suspend, start the request_queue.
2336 * Noflush suspend
2337 * ---------------
2338 * Noflush suspend doesn't need to dispatch remaining requests.
2339 * So stop the queue immediately. Then, wait for all in_flight requests
2340 * to be completed or requeued.
2341 *
2342 * To abort noflush suspend, start the queue.
2343 *
2344 *
2345 * Flush suspend
2346 * -------------
2347 * Flush suspend needs to dispatch remaining requests. So stop the queue
2348 * after the remaining requests are completed. (Requeued request must be also
2349 * re-dispatched and completed. Until then, we can't stop the queue.)
2350 *
2351 * During flushing the remaining requests, further incoming requests are also
2352 * inserted to the same queue. To distinguish which requests are to be
2353 * flushed, we insert a marker request to the queue at the time of starting
2354 * flush suspend, like a barrier.
2355 * The dispatching is blocked when the marker is found on the top of the queue.
2356 * And the queue is stopped when all in_flight requests are completed, since
2357 * that means the remaining requests are completely flushed.
2358 * Then, the marker is removed from the queue.
2359 *
2360 * To abort flush suspend, we also need to take care of the marker, not only
2361 * starting the queue.
2362 * We don't remove the marker forcibly from the queue since it's against
2363 * the block-layer manner. Instead, we put an invalidated mark on the marker.
2364 * When the invalidated marker is found on the top of the queue, it is
2365 * immediately removed from the queue, so it doesn't block dispatching.
2366 * Because we have only one marker per mapped_device, we can't start another
2367 * flush suspend until the invalidated marker is removed from the queue.
2368 * So fail and return with -EBUSY in such a case.
2369 */ 2467 */
2370int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2468int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2371{ 2469{
@@ -2376,17 +2474,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2376 2474
2377 mutex_lock(&md->suspend_lock); 2475 mutex_lock(&md->suspend_lock);
2378 2476
2379 if (dm_suspended(md)) { 2477 if (dm_suspended_md(md)) {
2380 r = -EINVAL; 2478 r = -EINVAL;
2381 goto out_unlock; 2479 goto out_unlock;
2382 } 2480 }
2383 2481
2384 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { 2482 map = dm_get_live_table(md);
2385 r = -EBUSY;
2386 goto out_unlock;
2387 }
2388
2389 map = dm_get_table(md);
2390 2483
2391 /* 2484 /*
2392 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2485 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2399,8 +2492,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2399 dm_table_presuspend_targets(map); 2492 dm_table_presuspend_targets(map);
2400 2493
2401 /* 2494 /*
2402 * Flush I/O to the device. noflush supersedes do_lockfs, 2495 * Flush I/O to the device.
2403 * because lock_fs() needs to flush I/Os. 2496 * Any I/O submitted after lock_fs() may not be flushed.
2497 * noflush takes precedence over do_lockfs.
2498 * (lock_fs() flushes I/Os and waits for them to complete.)
2404 */ 2499 */
2405 if (!noflush && do_lockfs) { 2500 if (!noflush && do_lockfs) {
2406 r = lock_fs(md); 2501 r = lock_fs(md);
@@ -2429,10 +2524,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2429 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2524 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2430 up_write(&md->io_lock); 2525 up_write(&md->io_lock);
2431 2526
2432 flush_workqueue(md->wq); 2527 /*
2433 2528 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2529 * can be kicked until md->queue is stopped. So stop md->queue before
2530 * flushing md->wq.
2531 */
2434 if (dm_request_based(md)) 2532 if (dm_request_based(md))
2435 dm_rq_start_suspend(md, noflush); 2533 stop_queue(md->queue);
2534
2535 flush_workqueue(md->wq);
2436 2536
2437 /* 2537 /*
2438 * At this point no more requests are entering target request routines. 2538 * At this point no more requests are entering target request routines.
@@ -2451,7 +2551,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2451 dm_queue_flush(md); 2551 dm_queue_flush(md);
2452 2552
2453 if (dm_request_based(md)) 2553 if (dm_request_based(md))
2454 dm_rq_abort_suspend(md, noflush); 2554 start_queue(md->queue);
2455 2555
2456 unlock_fs(md); 2556 unlock_fs(md);
2457 goto out; /* pushback list is already flushed, so skip flush */ 2557 goto out; /* pushback list is already flushed, so skip flush */
@@ -2463,10 +2563,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2463 * requests are being added to md->deferred list. 2563 * requests are being added to md->deferred list.
2464 */ 2564 */
2465 2565
2466 dm_table_postsuspend_targets(map);
2467
2468 set_bit(DMF_SUSPENDED, &md->flags); 2566 set_bit(DMF_SUSPENDED, &md->flags);
2469 2567
2568 dm_table_postsuspend_targets(map);
2569
2470out: 2570out:
2471 dm_table_put(map); 2571 dm_table_put(map);
2472 2572
@@ -2481,10 +2581,10 @@ int dm_resume(struct mapped_device *md)
2481 struct dm_table *map = NULL; 2581 struct dm_table *map = NULL;
2482 2582
2483 mutex_lock(&md->suspend_lock); 2583 mutex_lock(&md->suspend_lock);
2484 if (!dm_suspended(md)) 2584 if (!dm_suspended_md(md))
2485 goto out; 2585 goto out;
2486 2586
2487 map = dm_get_table(md); 2587 map = dm_get_live_table(md);
2488 if (!map || !dm_table_get_size(map)) 2588 if (!map || !dm_table_get_size(map))
2489 goto out; 2589 goto out;
2490 2590
@@ -2518,18 +2618,19 @@ out:
2518/*----------------------------------------------------------------- 2618/*-----------------------------------------------------------------
2519 * Event notification. 2619 * Event notification.
2520 *---------------------------------------------------------------*/ 2620 *---------------------------------------------------------------*/
2521void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2621int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2522 unsigned cookie) 2622 unsigned cookie)
2523{ 2623{
2524 char udev_cookie[DM_COOKIE_LENGTH]; 2624 char udev_cookie[DM_COOKIE_LENGTH];
2525 char *envp[] = { udev_cookie, NULL }; 2625 char *envp[] = { udev_cookie, NULL };
2526 2626
2527 if (!cookie) 2627 if (!cookie)
2528 kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2628 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2529 else { 2629 else {
2530 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2630 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2531 DM_COOKIE_ENV_VAR_NAME, cookie); 2631 DM_COOKIE_ENV_VAR_NAME, cookie);
2532 kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); 2632 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2633 action, envp);
2533 } 2634 }
2534} 2635}
2535 2636
@@ -2585,26 +2686,27 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2585 return NULL; 2686 return NULL;
2586 2687
2587 if (test_bit(DMF_FREEING, &md->flags) || 2688 if (test_bit(DMF_FREEING, &md->flags) ||
2588 test_bit(DMF_DELETING, &md->flags)) 2689 dm_deleting_md(md))
2589 return NULL; 2690 return NULL;
2590 2691
2591 dm_get(md); 2692 dm_get(md);
2592 return md; 2693 return md;
2593} 2694}
2594 2695
2595int dm_suspended(struct mapped_device *md) 2696int dm_suspended_md(struct mapped_device *md)
2596{ 2697{
2597 return test_bit(DMF_SUSPENDED, &md->flags); 2698 return test_bit(DMF_SUSPENDED, &md->flags);
2598} 2699}
2599 2700
2600int dm_noflush_suspending(struct dm_target *ti) 2701int dm_suspended(struct dm_target *ti)
2601{ 2702{
2602 struct mapped_device *md = dm_table_get_md(ti->table); 2703 return dm_suspended_md(dm_table_get_md(ti->table));
2603 int r = __noflush_suspending(md); 2704}
2604 2705EXPORT_SYMBOL_GPL(dm_suspended);
2605 dm_put(md);
2606 2706
2607 return r; 2707int dm_noflush_suspending(struct dm_target *ti)
2708{
2709 return __noflush_suspending(dm_table_get_md(ti->table));
2608} 2710}
2609EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2711EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2610 2712
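dm_suspended() now takes a dm_target (the old mapped_device variant lives on as dm_suspended_md()), and both it and dm_noflush_suspending() are exported for targets. A rough sketch of a bio-based target's completion hook consulting them, assuming the dm_endio_fn signature and the DM_ENDIO_REQUEUE return code of this kernel; the target itself is hypothetical:

static int hypothetical_end_io(struct dm_target *ti, struct bio *bio,
                               int error, union map_info *map_context)
{
        /*
         * During a noflush suspend, push failed I/O back to the dm core
         * for requeueing so it is retried after resume (the pattern
         * dm-mpath and dm-raid1 follow) instead of completing it with
         * an error.
         */
        if (error && dm_noflush_suspending(ti))
                return DM_ENDIO_REQUEUE;

        return error;
}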
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..bad1724d4869 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
89int dm_split_args(int *argc, char ***argvp, char *input); 89int dm_split_args(int *argc, char ***argvp, char *input);
90 90
91/* 91/*
92 * Is this mapped_device being deleted?
93 */
94int dm_deleting_md(struct mapped_device *md);
95
96/*
97 * Is this mapped_device suspended?
98 */
99int dm_suspended_md(struct mapped_device *md);
100
101/*
92 * The device-mapper can be driven through one of two interfaces; 102 * The device-mapper can be driven through one of two interfaces;
93 * ioctl or filesystem, depending which patch you have applied. 103 * ioctl or filesystem, depending which patch you have applied.
94 */ 104 */
@@ -115,8 +125,11 @@ void dm_stripe_exit(void);
115int dm_open_count(struct mapped_device *md); 125int dm_open_count(struct mapped_device *md);
116int dm_lock_for_deletion(struct mapped_device *md); 126int dm_lock_for_deletion(struct mapped_device *md);
117 127
118void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 128int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
119 unsigned cookie); 129 unsigned cookie);
130
131int dm_io_init(void);
132void dm_io_exit(void);
120 133
121int dm_kcopyd_init(void); 134int dm_kcopyd_init(void);
122void dm_kcopyd_exit(void); 135void dm_kcopyd_exit(void);
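dm_kobject_uevent() now returns the kobject_uevent() status instead of void, presumably so callers (for instance in dm-ioctl) can notice and report a failed notification. A brief, hypothetical caller sketch:

static int hypothetical_notify_change(struct mapped_device *md,
                                      unsigned cookie)
{
        int r = dm_kobject_uevent(md, KOBJ_CHANGE, cookie);

        if (r)
                DMWARN("failed to send KOBJ_CHANGE uevent: %d", r);

        return r;
}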
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 87d88dbb667f..1a8987884614 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -64,6 +64,7 @@
64#define MaxFault 50 64#define MaxFault 50
65#include <linux/blkdev.h> 65#include <linux/blkdev.h>
66#include <linux/raid/md_u.h> 66#include <linux/raid/md_u.h>
67#include <linux/slab.h>
67#include "md.h" 68#include "md.h"
68#include <linux/seq_file.h> 69#include <linux/seq_file.h>
69 70
@@ -168,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode)
168 conf->nfaults = n+1; 169 conf->nfaults = n+1;
169} 170}
170 171
171static int make_request(struct request_queue *q, struct bio *bio) 172static int make_request(mddev_t *mddev, struct bio *bio)
172{ 173{
173 mddev_t *mddev = q->queuedata; 174 conf_t *conf = mddev->private;
174 conf_t *conf = (conf_t*)mddev->private;
175 int failit = 0; 175 int failit = 0;
176 176
177 if (bio_data_dir(bio) == WRITE) { 177 if (bio_data_dir(bio) == WRITE) {
@@ -224,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio)
224 224
225static void status(struct seq_file *seq, mddev_t *mddev) 225static void status(struct seq_file *seq, mddev_t *mddev)
226{ 226{
227 conf_t *conf = (conf_t*)mddev->private; 227 conf_t *conf = mddev->private;
228 int n; 228 int n;
229 229
230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) 230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -327,7 +327,7 @@ static int run(mddev_t *mddev)
327 327
328static int stop(mddev_t *mddev) 328static int stop(mddev_t *mddev)
329{ 329{
330 conf_t *conf = (conf_t *)mddev->private; 330 conf_t *conf = mddev->private;
331 331
332 kfree(conf); 332 kfree(conf);
333 mddev->private = NULL; 333 mddev->private = NULL;
@@ -360,6 +360,7 @@ static void raid_exit(void)
360module_init(raid_init); 360module_init(raid_init);
361module_exit(raid_exit); 361module_exit(raid_exit);
362MODULE_LICENSE("GPL"); 362MODULE_LICENSE("GPL");
363MODULE_DESCRIPTION("Fault injection personality for MD");
363MODULE_ALIAS("md-personality-10"); /* faulty */ 364MODULE_ALIAS("md-personality-10"); /* faulty */
364MODULE_ALIAS("md-faulty"); 365MODULE_ALIAS("md-faulty");
365MODULE_ALIAS("md-level--5"); 366MODULE_ALIAS("md-level--5");
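The faulty.c hunks show the new calling convention for MD personalities: make_request() now receives the mddev_t directly instead of fetching it from q->queuedata, and the per-CPU I/O accounting each personality used to do moves into md_make_request() (see the md.c hunks further down). A minimal hook under the new convention; the "demo" personality and its conf structure are purely illustrative:

struct demo_conf {
        struct block_device *member_bdev;       /* hypothetical single member */
};

static int demo_make_request(mddev_t *mddev, struct bio *bio)
{
        struct demo_conf *conf = mddev->private;        /* no q->queuedata hop */

        bio->bi_bdev = conf->member_bdev;       /* trivial 1:1 remap */

        /*
         * A non-zero return asks the block layer to resubmit the remapped
         * bio, the convention linear_make_request() below relies on when
         * it returns 1.
         */
        return 1;
}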
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ceceb334d5e..7e0e057db9a7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -19,6 +19,7 @@
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/raid/md_u.h> 20#include <linux/raid/md_u.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include "md.h" 23#include "md.h"
23#include "linear.h" 24#include "linear.h"
24 25
@@ -158,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
158 sector_t sectors; 159 sector_t sectors;
159 160
160 if (j < 0 || j >= raid_disks || disk->rdev) { 161 if (j < 0 || j >= raid_disks || disk->rdev) {
161 printk("linear: disk numbering problem. Aborting!\n"); 162 printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
163 mdname(mddev));
162 goto out; 164 goto out;
163 } 165 }
164 166
@@ -172,19 +174,22 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
172 disk_stack_limits(mddev->gendisk, rdev->bdev, 174 disk_stack_limits(mddev->gendisk, rdev->bdev,
173 rdev->data_offset << 9); 175 rdev->data_offset << 9);
174 /* as we don't honour merge_bvec_fn, we must never risk 176 /* as we don't honour merge_bvec_fn, we must never risk
175 * violating it, so limit ->max_sector to one PAGE, as 177 * violating it, so limit max_segments to 1 lying within
176 * a one page request is never in violation. 178 * a single page.
177 */ 179 */
178 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 180 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
179 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 181 blk_queue_max_segments(mddev->queue, 1);
180 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 182 blk_queue_segment_boundary(mddev->queue,
183 PAGE_CACHE_SIZE - 1);
184 }
181 185
182 conf->array_sectors += rdev->sectors; 186 conf->array_sectors += rdev->sectors;
183 cnt++; 187 cnt++;
184 188
185 } 189 }
186 if (cnt != raid_disks) { 190 if (cnt != raid_disks) {
187 printk("linear: not enough drives present. Aborting!\n"); 191 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
192 mdname(mddev));
188 goto out; 193 goto out;
189 } 194 }
190 195
@@ -279,29 +284,21 @@ static int linear_stop (mddev_t *mddev)
279 rcu_barrier(); 284 rcu_barrier();
280 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 285 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
281 kfree(conf); 286 kfree(conf);
287 mddev->private = NULL;
282 288
283 return 0; 289 return 0;
284} 290}
285 291
286static int linear_make_request (struct request_queue *q, struct bio *bio) 292static int linear_make_request (mddev_t *mddev, struct bio *bio)
287{ 293{
288 const int rw = bio_data_dir(bio);
289 mddev_t *mddev = q->queuedata;
290 dev_info_t *tmp_dev; 294 dev_info_t *tmp_dev;
291 sector_t start_sector; 295 sector_t start_sector;
292 int cpu;
293 296
294 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 297 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
295 bio_endio(bio, -EOPNOTSUPP); 298 md_barrier_request(mddev, bio);
296 return 0; 299 return 0;
297 } 300 }
298 301
299 cpu = part_stat_lock();
300 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
301 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
302 bio_sectors(bio));
303 part_stat_unlock();
304
305 rcu_read_lock(); 302 rcu_read_lock();
306 tmp_dev = which_dev(mddev, bio->bi_sector); 303 tmp_dev = which_dev(mddev, bio->bi_sector);
307 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; 304 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
@@ -311,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
311 || (bio->bi_sector < start_sector))) { 308 || (bio->bi_sector < start_sector))) {
312 char b[BDEVNAME_SIZE]; 309 char b[BDEVNAME_SIZE];
313 310
314 printk("linear_make_request: Sector %llu out of bounds on " 311 printk(KERN_ERR
315 "dev %s: %llu sectors, offset %llu\n", 312 "md/linear:%s: make_request: Sector %llu out of bounds on "
316 (unsigned long long)bio->bi_sector, 313 "dev %s: %llu sectors, offset %llu\n",
317 bdevname(tmp_dev->rdev->bdev, b), 314 mdname(mddev),
318 (unsigned long long)tmp_dev->rdev->sectors, 315 (unsigned long long)bio->bi_sector,
319 (unsigned long long)start_sector); 316 bdevname(tmp_dev->rdev->bdev, b),
317 (unsigned long long)tmp_dev->rdev->sectors,
318 (unsigned long long)start_sector);
320 rcu_read_unlock(); 319 rcu_read_unlock();
321 bio_io_error(bio); 320 bio_io_error(bio);
322 return 0; 321 return 0;
@@ -333,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
333 332
334 bp = bio_split(bio, end_sector - bio->bi_sector); 333 bp = bio_split(bio, end_sector - bio->bi_sector);
335 334
336 if (linear_make_request(q, &bp->bio1)) 335 if (linear_make_request(mddev, &bp->bio1))
337 generic_make_request(&bp->bio1); 336 generic_make_request(&bp->bio1);
338 if (linear_make_request(q, &bp->bio2)) 337 if (linear_make_request(mddev, &bp->bio2))
339 generic_make_request(&bp->bio2); 338 generic_make_request(&bp->bio2);
340 bio_pair_release(bp); 339 bio_pair_release(bp);
341 return 0; 340 return 0;
@@ -383,6 +382,7 @@ static void linear_exit (void)
383module_init(linear_init); 382module_init(linear_init);
384module_exit(linear_exit); 383module_exit(linear_exit);
385MODULE_LICENSE("GPL"); 384MODULE_LICENSE("GPL");
385MODULE_DESCRIPTION("Linear device concatenation personality for MD");
386MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ 386MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
387MODULE_ALIAS("md-linear"); 387MODULE_ALIAS("md-linear");
388MODULE_ALIAS("md-level--1"); 388MODULE_ALIAS("md-level--1");
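Rather than failing barrier bios with -EOPNOTSUPP, linear_make_request() above now hands them to md_barrier_request(), whose implementation appears in the md.c hunks that follow: new I/O is held off while mddev->barrier is set, empty WRITE_BARRIER bios go to every member device, the payload is resubmitted with the barrier flag cleared, and a second round of empty barriers completes the operation. Extending the hypothetical "demo" personality sketched after faulty.c, the per-personality boilerplate reduces to:

static int demo_make_request(mddev_t *mddev, struct bio *bio)
{
        if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
                /* delegate ordering to the md core */
                md_barrier_request(mddev, bio);
                return 0;
        }

        /* ... normal mapping path as in the earlier sketch ... */
        return 1;
}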
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 26ba42a79129..cb20d0b0555a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -39,14 +39,17 @@
39#include <linux/buffer_head.h> /* for invalidate_bdev */ 39#include <linux/buffer_head.h> /* for invalidate_bdev */
40#include <linux/poll.h> 40#include <linux/poll.h>
41#include <linux/ctype.h> 41#include <linux/ctype.h>
42#include <linux/string.h>
42#include <linux/hdreg.h> 43#include <linux/hdreg.h>
43#include <linux/proc_fs.h> 44#include <linux/proc_fs.h>
44#include <linux/random.h> 45#include <linux/random.h>
45#include <linux/reboot.h> 46#include <linux/reboot.h>
46#include <linux/file.h> 47#include <linux/file.h>
48#include <linux/compat.h>
47#include <linux/delay.h> 49#include <linux/delay.h>
48#include <linux/raid/md_p.h> 50#include <linux/raid/md_p.h>
49#include <linux/raid/md_u.h> 51#include <linux/raid/md_u.h>
52#include <linux/slab.h>
50#include "md.h" 53#include "md.h"
51#include "bitmap.h" 54#include "bitmap.h"
52 55
@@ -68,6 +71,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 71#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69 72
70/* 73/*
74 * Default number of read corrections we'll attempt on an rdev
75 * before ejecting it from the array. We divide the read error
76 * count by 2 for every hour elapsed between read errors.
77 */
78#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
79/*
71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72 * is 1000 KB/sec, so the extra system load does not show up that much. 81 * is 1000 KB/sec, so the extra system load does not show up that much.
73 * Increase it if you want to have more _guaranteed_ speed. Note that 82 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -98,44 +107,40 @@ static struct ctl_table_header *raid_table_header;
98 107
99static ctl_table raid_table[] = { 108static ctl_table raid_table[] = {
100 { 109 {
101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
102 .procname = "speed_limit_min", 110 .procname = "speed_limit_min",
103 .data = &sysctl_speed_limit_min, 111 .data = &sysctl_speed_limit_min,
104 .maxlen = sizeof(int), 112 .maxlen = sizeof(int),
105 .mode = S_IRUGO|S_IWUSR, 113 .mode = S_IRUGO|S_IWUSR,
106 .proc_handler = &proc_dointvec, 114 .proc_handler = proc_dointvec,
107 }, 115 },
108 { 116 {
109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
110 .procname = "speed_limit_max", 117 .procname = "speed_limit_max",
111 .data = &sysctl_speed_limit_max, 118 .data = &sysctl_speed_limit_max,
112 .maxlen = sizeof(int), 119 .maxlen = sizeof(int),
113 .mode = S_IRUGO|S_IWUSR, 120 .mode = S_IRUGO|S_IWUSR,
114 .proc_handler = &proc_dointvec, 121 .proc_handler = proc_dointvec,
115 }, 122 },
116 { .ctl_name = 0 } 123 { }
117}; 124};
118 125
119static ctl_table raid_dir_table[] = { 126static ctl_table raid_dir_table[] = {
120 { 127 {
121 .ctl_name = DEV_RAID,
122 .procname = "raid", 128 .procname = "raid",
123 .maxlen = 0, 129 .maxlen = 0,
124 .mode = S_IRUGO|S_IXUGO, 130 .mode = S_IRUGO|S_IXUGO,
125 .child = raid_table, 131 .child = raid_table,
126 }, 132 },
127 { .ctl_name = 0 } 133 { }
128}; 134};
129 135
130static ctl_table raid_root_table[] = { 136static ctl_table raid_root_table[] = {
131 { 137 {
132 .ctl_name = CTL_DEV,
133 .procname = "dev", 138 .procname = "dev",
134 .maxlen = 0, 139 .maxlen = 0,
135 .mode = 0555, 140 .mode = 0555,
136 .child = raid_dir_table, 141 .child = raid_dir_table,
137 }, 142 },
138 { .ctl_name = 0 } 143 { }
139}; 144};
140 145
141static const struct block_device_operations md_fops; 146static const struct block_device_operations md_fops;
@@ -210,19 +215,22 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
210 */ 215 */
211static int md_make_request(struct request_queue *q, struct bio *bio) 216static int md_make_request(struct request_queue *q, struct bio *bio)
212{ 217{
218 const int rw = bio_data_dir(bio);
213 mddev_t *mddev = q->queuedata; 219 mddev_t *mddev = q->queuedata;
214 int rv; 220 int rv;
221 int cpu;
222
215 if (mddev == NULL || mddev->pers == NULL) { 223 if (mddev == NULL || mddev->pers == NULL) {
216 bio_io_error(bio); 224 bio_io_error(bio);
217 return 0; 225 return 0;
218 } 226 }
219 rcu_read_lock(); 227 rcu_read_lock();
220 if (mddev->suspended) { 228 if (mddev->suspended || mddev->barrier) {
221 DEFINE_WAIT(__wait); 229 DEFINE_WAIT(__wait);
222 for (;;) { 230 for (;;) {
223 prepare_to_wait(&mddev->sb_wait, &__wait, 231 prepare_to_wait(&mddev->sb_wait, &__wait,
224 TASK_UNINTERRUPTIBLE); 232 TASK_UNINTERRUPTIBLE);
225 if (!mddev->suspended) 233 if (!mddev->suspended && !mddev->barrier)
226 break; 234 break;
227 rcu_read_unlock(); 235 rcu_read_unlock();
228 schedule(); 236 schedule();
@@ -232,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
232 } 240 }
233 atomic_inc(&mddev->active_io); 241 atomic_inc(&mddev->active_io);
234 rcu_read_unlock(); 242 rcu_read_unlock();
235 rv = mddev->pers->make_request(q, bio); 243
244 rv = mddev->pers->make_request(mddev, bio);
245
246 cpu = part_stat_lock();
247 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
248 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
249 bio_sectors(bio));
250 part_stat_unlock();
251
236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 252 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
237 wake_up(&mddev->sb_wait); 253 wake_up(&mddev->sb_wait);
238 254
239 return rv; 255 return rv;
240} 256}
241 257
258/* mddev_suspend makes sure no new requests are submitted
259 * to the device, and that any requests that have been submitted
260 * are completely handled.
261 * Once ->stop is called and completes, the module will be completely
262 * unused.
263 */
242static void mddev_suspend(mddev_t *mddev) 264static void mddev_suspend(mddev_t *mddev)
243{ 265{
244 BUG_ON(mddev->suspended); 266 BUG_ON(mddev->suspended);
@@ -246,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev)
246 synchronize_rcu(); 268 synchronize_rcu();
247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 269 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
248 mddev->pers->quiesce(mddev, 1); 270 mddev->pers->quiesce(mddev, 1);
249 md_unregister_thread(mddev->thread);
250 mddev->thread = NULL;
251 /* we now know that no code is executing in the personality module,
252 * except possibly the tail end of a ->bi_end_io function, but that
253 * is certain to complete before the module has a chance to get
254 * unloaded
255 */
256} 271}
257 272
258static void mddev_resume(mddev_t *mddev) 273static void mddev_resume(mddev_t *mddev)
@@ -264,10 +279,110 @@ static void mddev_resume(mddev_t *mddev)
264 279
265int mddev_congested(mddev_t *mddev, int bits) 280int mddev_congested(mddev_t *mddev, int bits)
266{ 281{
282 if (mddev->barrier)
283 return 1;
267 return mddev->suspended; 284 return mddev->suspended;
268} 285}
269EXPORT_SYMBOL(mddev_congested); 286EXPORT_SYMBOL(mddev_congested);
270 287
288/*
289 * Generic barrier handling for md
290 */
291
292#define POST_REQUEST_BARRIER ((void*)1)
293
294static void md_end_barrier(struct bio *bio, int err)
295{
296 mdk_rdev_t *rdev = bio->bi_private;
297 mddev_t *mddev = rdev->mddev;
298 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
299 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
300
301 rdev_dec_pending(rdev, mddev);
302
303 if (atomic_dec_and_test(&mddev->flush_pending)) {
304 if (mddev->barrier == POST_REQUEST_BARRIER) {
305 /* This was a post-request barrier */
306 mddev->barrier = NULL;
307 wake_up(&mddev->sb_wait);
308 } else
309 /* The pre-request barrier has finished */
310 schedule_work(&mddev->barrier_work);
311 }
312 bio_put(bio);
313}
314
315static void submit_barriers(mddev_t *mddev)
316{
317 mdk_rdev_t *rdev;
318
319 rcu_read_lock();
320 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
321 if (rdev->raid_disk >= 0 &&
322 !test_bit(Faulty, &rdev->flags)) {
323 /* Take two references, one is dropped
324 * when request finishes, one after
325 * we reclaim rcu_read_lock
326 */
327 struct bio *bi;
328 atomic_inc(&rdev->nr_pending);
329 atomic_inc(&rdev->nr_pending);
330 rcu_read_unlock();
331 bi = bio_alloc(GFP_KERNEL, 0);
332 bi->bi_end_io = md_end_barrier;
333 bi->bi_private = rdev;
334 bi->bi_bdev = rdev->bdev;
335 atomic_inc(&mddev->flush_pending);
336 submit_bio(WRITE_BARRIER, bi);
337 rcu_read_lock();
338 rdev_dec_pending(rdev, mddev);
339 }
340 rcu_read_unlock();
341}
342
343static void md_submit_barrier(struct work_struct *ws)
344{
345 mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
346 struct bio *bio = mddev->barrier;
347
348 atomic_set(&mddev->flush_pending, 1);
349
350 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
351 bio_endio(bio, -EOPNOTSUPP);
352 else if (bio->bi_size == 0)
353 /* an empty barrier - all done */
354 bio_endio(bio, 0);
355 else {
356 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
357 if (mddev->pers->make_request(mddev, bio))
358 generic_make_request(bio);
359 mddev->barrier = POST_REQUEST_BARRIER;
360 submit_barriers(mddev);
361 }
362 if (atomic_dec_and_test(&mddev->flush_pending)) {
363 mddev->barrier = NULL;
364 wake_up(&mddev->sb_wait);
365 }
366}
367
368void md_barrier_request(mddev_t *mddev, struct bio *bio)
369{
370 spin_lock_irq(&mddev->write_lock);
371 wait_event_lock_irq(mddev->sb_wait,
372 !mddev->barrier,
373 mddev->write_lock, /*nothing*/);
374 mddev->barrier = bio;
375 spin_unlock_irq(&mddev->write_lock);
376
377 atomic_set(&mddev->flush_pending, 1);
378 INIT_WORK(&mddev->barrier_work, md_submit_barrier);
379
380 submit_barriers(mddev);
381
382 if (atomic_dec_and_test(&mddev->flush_pending))
383 schedule_work(&mddev->barrier_work);
384}
385EXPORT_SYMBOL(md_barrier_request);
271 386
272static inline mddev_t *mddev_get(mddev_t *mddev) 387static inline mddev_t *mddev_get(mddev_t *mddev)
273{ 388{
@@ -282,7 +397,9 @@ static void mddev_put(mddev_t *mddev)
282 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 397 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
283 return; 398 return;
284 if (!mddev->raid_disks && list_empty(&mddev->disks) && 399 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
285 !mddev->hold_active) { 400 mddev->ctime == 0 && !mddev->hold_active) {
401 /* Array is not configured at all, and not held active,
402 * so destroy it */
286 list_del(&mddev->all_mddevs); 403 list_del(&mddev->all_mddevs);
287 if (mddev->gendisk) { 404 if (mddev->gendisk) {
288 /* we did a probe so need to clean up. 405 /* we did a probe so need to clean up.
@@ -299,6 +416,27 @@ static void mddev_put(mddev_t *mddev)
299 spin_unlock(&all_mddevs_lock); 416 spin_unlock(&all_mddevs_lock);
300} 417}
301 418
419static void mddev_init(mddev_t *mddev)
420{
421 mutex_init(&mddev->open_mutex);
422 mutex_init(&mddev->reconfig_mutex);
423 mutex_init(&mddev->bitmap_info.mutex);
424 INIT_LIST_HEAD(&mddev->disks);
425 INIT_LIST_HEAD(&mddev->all_mddevs);
426 init_timer(&mddev->safemode_timer);
427 atomic_set(&mddev->active, 1);
428 atomic_set(&mddev->openers, 0);
429 atomic_set(&mddev->active_io, 0);
430 spin_lock_init(&mddev->write_lock);
431 atomic_set(&mddev->flush_pending, 0);
432 init_waitqueue_head(&mddev->sb_wait);
433 init_waitqueue_head(&mddev->recovery_wait);
434 mddev->reshape_position = MaxSector;
435 mddev->resync_min = 0;
436 mddev->resync_max = MaxSector;
437 mddev->level = LEVEL_NONE;
438}
439
302static mddev_t * mddev_find(dev_t unit) 440static mddev_t * mddev_find(dev_t unit)
303{ 441{
304 mddev_t *mddev, *new = NULL; 442 mddev_t *mddev, *new = NULL;
@@ -365,21 +503,7 @@ static mddev_t * mddev_find(dev_t unit)
365 else 503 else
366 new->md_minor = MINOR(unit) >> MdpMinorShift; 504 new->md_minor = MINOR(unit) >> MdpMinorShift;
367 505
368 mutex_init(&new->open_mutex); 506 mddev_init(new);
369 mutex_init(&new->reconfig_mutex);
370 INIT_LIST_HEAD(&new->disks);
371 INIT_LIST_HEAD(&new->all_mddevs);
372 init_timer(&new->safemode_timer);
373 atomic_set(&new->active, 1);
374 atomic_set(&new->openers, 0);
375 atomic_set(&new->active_io, 0);
376 spin_lock_init(&new->write_lock);
377 init_waitqueue_head(&new->sb_wait);
378 init_waitqueue_head(&new->recovery_wait);
379 new->reshape_position = MaxSector;
380 new->resync_min = 0;
381 new->resync_max = MaxSector;
382 new->level = LEVEL_NONE;
383 507
384 goto retry; 508 goto retry;
385} 509}
@@ -399,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev)
399 return mutex_trylock(&mddev->reconfig_mutex); 523 return mutex_trylock(&mddev->reconfig_mutex);
400} 524}
401 525
402static inline void mddev_unlock(mddev_t * mddev) 526static struct attribute_group md_redundancy_group;
527
528static void mddev_unlock(mddev_t * mddev)
403{ 529{
404 mutex_unlock(&mddev->reconfig_mutex); 530 if (mddev->to_remove) {
531 /* These cannot be removed under reconfig_mutex as
532 * an access to the files will try to take reconfig_mutex
533 * while holding the file unremovable, which leads to
534 * a deadlock.
535 * So hold open_mutex instead - we are allowed to take
536 * it while holding reconfig_mutex, and md_run can
537 * use it to wait for the remove to complete.
538 */
539 struct attribute_group *to_remove = mddev->to_remove;
540 mddev->to_remove = NULL;
541 mutex_lock(&mddev->open_mutex);
542 mutex_unlock(&mddev->reconfig_mutex);
543
544 if (to_remove != &md_redundancy_group)
545 sysfs_remove_group(&mddev->kobj, to_remove);
546 if (mddev->pers == NULL ||
547 mddev->pers->sync_request == NULL) {
548 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
549 if (mddev->sysfs_action)
550 sysfs_put(mddev->sysfs_action);
551 mddev->sysfs_action = NULL;
552 }
553 mutex_unlock(&mddev->open_mutex);
554 } else
555 mutex_unlock(&mddev->reconfig_mutex);
405 556
406 md_wakeup_thread(mddev->thread); 557 md_wakeup_thread(mddev->thread);
407} 558}
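The mddev_unlock() change above adds ->to_remove so that sysfs attribute groups are torn down only after reconfig_mutex has been dropped (serialised by open_mutex instead), avoiding the deadlock described in the comment: an attribute accessor may itself take reconfig_mutex while keeping the file pinned. Code that holds the mutex therefore only queues the group for removal; a sketch, with the group pointer purely illustrative:

static void drop_group_deferred(mddev_t *mddev,
                                struct attribute_group *hypothetical_group)
{
        /* caller holds mddev->reconfig_mutex */
        mddev->to_remove = hypothetical_group;

        /*
         * The actual sysfs_remove_group() happens later, inside
         * mddev_unlock(), once reconfig_mutex has been released.
         */
}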
@@ -752,7 +903,7 @@ struct super_type {
752 */ 903 */
753int md_check_no_bitmap(mddev_t *mddev) 904int md_check_no_bitmap(mddev_t *mddev)
754{ 905{
755 if (!mddev->bitmap_file && !mddev->bitmap_offset) 906 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
756 return 0; 907 return 0;
757 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 908 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
758 mdname(mddev), mddev->pers->name); 909 mdname(mddev), mddev->pers->name);
@@ -880,8 +1031,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
880 mddev->raid_disks = sb->raid_disks; 1031 mddev->raid_disks = sb->raid_disks;
881 mddev->dev_sectors = sb->size * 2; 1032 mddev->dev_sectors = sb->size * 2;
882 mddev->events = ev1; 1033 mddev->events = ev1;
883 mddev->bitmap_offset = 0; 1034 mddev->bitmap_info.offset = 0;
884 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 1035 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
885 1036
886 if (mddev->minor_version >= 91) { 1037 if (mddev->minor_version >= 91) {
887 mddev->reshape_position = sb->reshape_position; 1038 mddev->reshape_position = sb->reshape_position;
@@ -915,14 +1066,18 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
915 mddev->max_disks = MD_SB_DISKS; 1066 mddev->max_disks = MD_SB_DISKS;
916 1067
917 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1068 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
918 mddev->bitmap_file == NULL) 1069 mddev->bitmap_info.file == NULL)
919 mddev->bitmap_offset = mddev->default_bitmap_offset; 1070 mddev->bitmap_info.offset =
1071 mddev->bitmap_info.default_offset;
920 1072
921 } else if (mddev->pers == NULL) { 1073 } else if (mddev->pers == NULL) {
922 /* Insist on good event counter while assembling */ 1074 /* Insist on good event counter while assembling, except
1075 * for spares (which don't need an event count) */
923 ++ev1; 1076 ++ev1;
924 if (ev1 < mddev->events) 1077 if (sb->disks[rdev->desc_nr].state & (
925 return -EINVAL; 1078 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1079 if (ev1 < mddev->events)
1080 return -EINVAL;
926 } else if (mddev->bitmap) { 1081 } else if (mddev->bitmap) {
927 /* if adding to array with a bitmap, then we can accept an 1082 /* if adding to array with a bitmap, then we can accept an
928 * older device ... but not too old. 1083 * older device ... but not too old.
@@ -944,6 +1099,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
944 desc->raid_disk < mddev->raid_disks */) { 1099 desc->raid_disk < mddev->raid_disks */) {
945 set_bit(In_sync, &rdev->flags); 1100 set_bit(In_sync, &rdev->flags);
946 rdev->raid_disk = desc->raid_disk; 1101 rdev->raid_disk = desc->raid_disk;
1102 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1103 /* active but not in sync implies recovery up to
1104 * reshape position. We don't know exactly where
1105 * that is, so set to zero for now */
1106 if (mddev->minor_version >= 91) {
1107 rdev->recovery_offset = 0;
1108 rdev->raid_disk = desc->raid_disk;
1109 }
947 } 1110 }
948 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1111 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
949 set_bit(WriteMostly, &rdev->flags); 1112 set_bit(WriteMostly, &rdev->flags);
@@ -1025,15 +1188,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1025 sb->layout = mddev->layout; 1188 sb->layout = mddev->layout;
1026 sb->chunk_size = mddev->chunk_sectors << 9; 1189 sb->chunk_size = mddev->chunk_sectors << 9;
1027 1190
1028 if (mddev->bitmap && mddev->bitmap_file == NULL) 1191 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1029 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1192 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1030 1193
1031 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1194 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1032 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1195 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1033 mdp_disk_t *d; 1196 mdp_disk_t *d;
1034 int desc_nr; 1197 int desc_nr;
1035 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1198 int is_active = test_bit(In_sync, &rdev2->flags);
1036 && !test_bit(Faulty, &rdev2->flags)) 1199
1200 if (rdev2->raid_disk >= 0 &&
1201 sb->minor_version >= 91)
1202 /* we have nowhere to store the recovery_offset,
1203 * but if it is not below the reshape_position,
1204 * we can piggy-back on that.
1205 */
1206 is_active = 1;
1207 if (rdev2->raid_disk < 0 ||
1208 test_bit(Faulty, &rdev2->flags))
1209 is_active = 0;
1210 if (is_active)
1037 desc_nr = rdev2->raid_disk; 1211 desc_nr = rdev2->raid_disk;
1038 else 1212 else
1039 desc_nr = next_spare++; 1213 desc_nr = next_spare++;
@@ -1043,16 +1217,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1043 d->number = rdev2->desc_nr; 1217 d->number = rdev2->desc_nr;
1044 d->major = MAJOR(rdev2->bdev->bd_dev); 1218 d->major = MAJOR(rdev2->bdev->bd_dev);
1045 d->minor = MINOR(rdev2->bdev->bd_dev); 1219 d->minor = MINOR(rdev2->bdev->bd_dev);
1046 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1220 if (is_active)
1047 && !test_bit(Faulty, &rdev2->flags))
1048 d->raid_disk = rdev2->raid_disk; 1221 d->raid_disk = rdev2->raid_disk;
1049 else 1222 else
1050 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1223 d->raid_disk = rdev2->desc_nr; /* compatibility */
1051 if (test_bit(Faulty, &rdev2->flags)) 1224 if (test_bit(Faulty, &rdev2->flags))
1052 d->state = (1<<MD_DISK_FAULTY); 1225 d->state = (1<<MD_DISK_FAULTY);
1053 else if (test_bit(In_sync, &rdev2->flags)) { 1226 else if (is_active) {
1054 d->state = (1<<MD_DISK_ACTIVE); 1227 d->state = (1<<MD_DISK_ACTIVE);
1055 d->state |= (1<<MD_DISK_SYNC); 1228 if (test_bit(In_sync, &rdev2->flags))
1229 d->state |= (1<<MD_DISK_SYNC);
1056 active++; 1230 active++;
1057 working++; 1231 working++;
1058 } else { 1232 } else {
@@ -1092,7 +1266,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1092{ 1266{
1093 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1267 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1094 return 0; /* component must fit device */ 1268 return 0; /* component must fit device */
1095 if (rdev->mddev->bitmap_offset) 1269 if (rdev->mddev->bitmap_info.offset)
1096 return 0; /* can't move bitmap */ 1270 return 0; /* can't move bitmap */
1097 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1271 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1098 if (!num_sectors || num_sectors > rdev->sb_start) 1272 if (!num_sectors || num_sectors > rdev->sb_start)
@@ -1271,8 +1445,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1271 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1445 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1272 mddev->dev_sectors = le64_to_cpu(sb->size); 1446 mddev->dev_sectors = le64_to_cpu(sb->size);
1273 mddev->events = ev1; 1447 mddev->events = ev1;
1274 mddev->bitmap_offset = 0; 1448 mddev->bitmap_info.offset = 0;
1275 mddev->default_bitmap_offset = 1024 >> 9; 1449 mddev->bitmap_info.default_offset = 1024 >> 9;
1276 1450
1277 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1451 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1278 memcpy(mddev->uuid, sb->set_uuid, 16); 1452 memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -1280,8 +1454,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1280 mddev->max_disks = (4096-256)/2; 1454 mddev->max_disks = (4096-256)/2;
1281 1455
1282 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1456 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1283 mddev->bitmap_file == NULL ) 1457 mddev->bitmap_info.file == NULL )
1284 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1458 mddev->bitmap_info.offset =
1459 (__s32)le32_to_cpu(sb->bitmap_offset);
1285 1460
1286 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1461 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1287 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1462 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1298,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1298 } 1473 }
1299 1474
1300 } else if (mddev->pers == NULL) { 1475 } else if (mddev->pers == NULL) {
1301 /* Insist on good event counter while assembling */ 1476 /* Insist on good event counter while assembling, except for
1477 * spares (which don't need an event count) */
1302 ++ev1; 1478 ++ev1;
1303 if (ev1 < mddev->events) 1479 if (rdev->desc_nr >= 0 &&
1304 return -EINVAL; 1480 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1481 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1482 if (ev1 < mddev->events)
1483 return -EINVAL;
1305 } else if (mddev->bitmap) { 1484 } else if (mddev->bitmap) {
1306 /* If adding to array with a bitmap, then we can accept an 1485 /* If adding to array with a bitmap, then we can accept an
1307 * older device, but not too old. 1486 * older device, but not too old.
@@ -1375,21 +1554,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1375 sb->level = cpu_to_le32(mddev->level); 1554 sb->level = cpu_to_le32(mddev->level);
1376 sb->layout = cpu_to_le32(mddev->layout); 1555 sb->layout = cpu_to_le32(mddev->layout);
1377 1556
1378 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1557 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1379 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1558 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1380 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1559 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1381 } 1560 }
1382 1561
1383 if (rdev->raid_disk >= 0 && 1562 if (rdev->raid_disk >= 0 &&
1384 !test_bit(In_sync, &rdev->flags)) { 1563 !test_bit(In_sync, &rdev->flags)) {
1385 if (mddev->curr_resync_completed > rdev->recovery_offset) 1564 sb->feature_map |=
1386 rdev->recovery_offset = mddev->curr_resync_completed; 1565 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1387 if (rdev->recovery_offset > 0) { 1566 sb->recovery_offset =
1388 sb->feature_map |= 1567 cpu_to_le64(rdev->recovery_offset);
1389 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1390 sb->recovery_offset =
1391 cpu_to_le64(rdev->recovery_offset);
1392 }
1393 } 1568 }
1394 1569
1395 if (mddev->reshape_position != MaxSector) { 1570 if (mddev->reshape_position != MaxSector) {
@@ -1423,7 +1598,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1423 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1598 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1424 else if (test_bit(In_sync, &rdev2->flags)) 1599 else if (test_bit(In_sync, &rdev2->flags))
1425 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1600 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1426 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1601 else if (rdev2->raid_disk >= 0)
1427 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1602 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1428 else 1603 else
1429 sb->dev_roles[i] = cpu_to_le16(0xffff); 1604 sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -1445,7 +1620,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1445 max_sectors -= rdev->data_offset; 1620 max_sectors -= rdev->data_offset;
1446 if (!num_sectors || num_sectors > max_sectors) 1621 if (!num_sectors || num_sectors > max_sectors)
1447 num_sectors = max_sectors; 1622 num_sectors = max_sectors;
1448 } else if (rdev->mddev->bitmap_offset) { 1623 } else if (rdev->mddev->bitmap_info.offset) {
1449 /* minor version 0 with bitmap we can't move */ 1624 /* minor version 0 with bitmap we can't move */
1450 return 0; 1625 return 0;
1451 } else { 1626 } else {
@@ -1640,7 +1815,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1640 kobject_del(&rdev->kobj); 1815 kobject_del(&rdev->kobj);
1641 goto fail; 1816 goto fail;
1642 } 1817 }
1643 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1818 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state");
1644 1819
1645 list_add_rcu(&rdev->same_set, &mddev->disks); 1820 list_add_rcu(&rdev->same_set, &mddev->disks);
1646 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1821 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
@@ -1813,15 +1988,11 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1813 1988
1814 uuid = sb->set_uuid; 1989 uuid = sb->set_uuid;
1815 printk(KERN_INFO 1990 printk(KERN_INFO
1816 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1991 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
1817 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1818 "md: Name: \"%s\" CT:%llu\n", 1992 "md: Name: \"%s\" CT:%llu\n",
1819 le32_to_cpu(sb->major_version), 1993 le32_to_cpu(sb->major_version),
1820 le32_to_cpu(sb->feature_map), 1994 le32_to_cpu(sb->feature_map),
1821 uuid[0], uuid[1], uuid[2], uuid[3], 1995 uuid,
1822 uuid[4], uuid[5], uuid[6], uuid[7],
1823 uuid[8], uuid[9], uuid[10], uuid[11],
1824 uuid[12], uuid[13], uuid[14], uuid[15],
1825 sb->set_name, 1996 sb->set_name,
1826 (unsigned long long)le64_to_cpu(sb->ctime) 1997 (unsigned long long)le64_to_cpu(sb->ctime)
1827 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1998 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
@@ -1830,8 +2001,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1830 printk(KERN_INFO 2001 printk(KERN_INFO
1831 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 2002 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1832 " RO:%llu\n" 2003 " RO:%llu\n"
1833 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 2004 "md: Dev:%08x UUID: %pU\n"
1834 ":%02x%02x%02x%02x%02x%02x\n"
1835 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 2005 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1836 "md: (MaxDev:%u) \n", 2006 "md: (MaxDev:%u) \n",
1837 le32_to_cpu(sb->level), 2007 le32_to_cpu(sb->level),
@@ -1844,10 +2014,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1844 (unsigned long long)le64_to_cpu(sb->super_offset), 2014 (unsigned long long)le64_to_cpu(sb->super_offset),
1845 (unsigned long long)le64_to_cpu(sb->recovery_offset), 2015 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1846 le32_to_cpu(sb->dev_number), 2016 le32_to_cpu(sb->dev_number),
1847 uuid[0], uuid[1], uuid[2], uuid[3], 2017 uuid,
1848 uuid[4], uuid[5], uuid[6], uuid[7],
1849 uuid[8], uuid[9], uuid[10], uuid[11],
1850 uuid[12], uuid[13], uuid[14], uuid[15],
1851 sb->devflags, 2018 sb->devflags,
1852 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 2019 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1853 (unsigned long long)le64_to_cpu(sb->events), 2020 (unsigned long long)le64_to_cpu(sb->events),
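The print_sb_1() cleanups above rely on the %pU printk extension, which takes a pointer to 16 raw bytes and renders them in canonical UUID form, replacing the sixteen separate %02x arguments. A stand-alone illustration (not taken from the patch):

static void print_array_uuid(const __u8 uuid[16])
{
        /* prints e.g. md: Array-ID:<xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx> */
        printk(KERN_INFO "md: Array-ID:<%pU>\n", uuid);
}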
@@ -1917,11 +2084,19 @@ static void sync_sbs(mddev_t * mddev, int nospares)
1917 */ 2084 */
1918 mdk_rdev_t *rdev; 2085 mdk_rdev_t *rdev;
1919 2086
2087 /* First make sure individual recovery_offsets are correct */
2088 list_for_each_entry(rdev, &mddev->disks, same_set) {
2089 if (rdev->raid_disk >= 0 &&
2090 mddev->delta_disks >= 0 &&
2091 !test_bit(In_sync, &rdev->flags) &&
2092 mddev->curr_resync_completed > rdev->recovery_offset)
2093 rdev->recovery_offset = mddev->curr_resync_completed;
2094
2095 }
1920 list_for_each_entry(rdev, &mddev->disks, same_set) { 2096 list_for_each_entry(rdev, &mddev->disks, same_set) {
1921 if (rdev->sb_events == mddev->events || 2097 if (rdev->sb_events == mddev->events ||
1922 (nospares && 2098 (nospares &&
1923 rdev->raid_disk < 0 && 2099 rdev->raid_disk < 0 &&
1924 (rdev->sb_events&1)==0 &&
1925 rdev->sb_events+1 == mddev->events)) { 2100 rdev->sb_events+1 == mddev->events)) {
1926 /* Don't update this superblock */ 2101 /* Don't update this superblock */
1927 rdev->sb_loaded = 2; 2102 rdev->sb_loaded = 2;
@@ -1974,22 +2149,14 @@ repeat:
1974 * and 'events' is odd, we can roll back to the previous clean state */ 2149 * and 'events' is odd, we can roll back to the previous clean state */
1975 if (nospares 2150 if (nospares
1976 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2151 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1977 && (mddev->events & 1) 2152 && mddev->can_decrease_events
1978 && mddev->events != 1) 2153 && mddev->events != 1) {
1979 mddev->events--; 2154 mddev->events--;
1980 else { 2155 mddev->can_decrease_events = 0;
2156 } else {
1981 /* otherwise we have to go forward and ... */ 2157 /* otherwise we have to go forward and ... */
1982 mddev->events ++; 2158 mddev->events ++;
1983 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2159 mddev->can_decrease_events = nospares;
1984 /* .. if the array isn't clean, an 'even' event must also go
1985 * to spares. */
1986 if ((mddev->events&1)==0)
1987 nospares = 0;
1988 } else {
1989 /* otherwise an 'odd' event must go to spares */
1990 if ((mddev->events&1))
1991 nospares = 0;
1992 }
1993 } 2160 }
1994 2161
1995 if (!mddev->events) { 2162 if (!mddev->events) {
@@ -2233,6 +2400,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2233 return err; 2400 return err;
2234 sprintf(nm, "rd%d", rdev->raid_disk); 2401 sprintf(nm, "rd%d", rdev->raid_disk);
2235 sysfs_remove_link(&rdev->mddev->kobj, nm); 2402 sysfs_remove_link(&rdev->mddev->kobj, nm);
2403 rdev->raid_disk = -1;
2236 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2404 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2237 md_wakeup_thread(rdev->mddev->thread); 2405 md_wakeup_thread(rdev->mddev->thread);
2238 } else if (rdev->mddev->pers) { 2406 } else if (rdev->mddev->pers) {
@@ -2421,12 +2589,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2421static struct rdev_sysfs_entry rdev_size = 2589static struct rdev_sysfs_entry rdev_size =
2422__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2590__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2423 2591
2592
2593static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2594{
2595 unsigned long long recovery_start = rdev->recovery_offset;
2596
2597 if (test_bit(In_sync, &rdev->flags) ||
2598 recovery_start == MaxSector)
2599 return sprintf(page, "none\n");
2600
2601 return sprintf(page, "%llu\n", recovery_start);
2602}
2603
2604static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2605{
2606 unsigned long long recovery_start;
2607
2608 if (cmd_match(buf, "none"))
2609 recovery_start = MaxSector;
2610 else if (strict_strtoull(buf, 10, &recovery_start))
2611 return -EINVAL;
2612
2613 if (rdev->mddev->pers &&
2614 rdev->raid_disk >= 0)
2615 return -EBUSY;
2616
2617 rdev->recovery_offset = recovery_start;
2618 if (recovery_start == MaxSector)
2619 set_bit(In_sync, &rdev->flags);
2620 else
2621 clear_bit(In_sync, &rdev->flags);
2622 return len;
2623}
2624
2625static struct rdev_sysfs_entry rdev_recovery_start =
2626__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2627
2424static struct attribute *rdev_default_attrs[] = { 2628static struct attribute *rdev_default_attrs[] = {
2425 &rdev_state.attr, 2629 &rdev_state.attr,
2426 &rdev_errors.attr, 2630 &rdev_errors.attr,
2427 &rdev_slot.attr, 2631 &rdev_slot.attr,
2428 &rdev_offset.attr, 2632 &rdev_offset.attr,
2429 &rdev_size.attr, 2633 &rdev_size.attr,
2634 &rdev_recovery_start.attr,
2430 NULL, 2635 NULL,
2431}; 2636};
2432static ssize_t 2637static ssize_t
@@ -2480,7 +2685,7 @@ static void rdev_free(struct kobject *ko)
2480 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2685 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2481 kfree(rdev); 2686 kfree(rdev);
2482} 2687}
2483static struct sysfs_ops rdev_sysfs_ops = { 2688static const struct sysfs_ops rdev_sysfs_ops = {
2484 .show = rdev_attr_show, 2689 .show = rdev_attr_show,
2485 .store = rdev_attr_store, 2690 .store = rdev_attr_store,
2486}; 2691};
@@ -2528,6 +2733,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2528 rdev->flags = 0; 2733 rdev->flags = 0;
2529 rdev->data_offset = 0; 2734 rdev->data_offset = 0;
2530 rdev->sb_events = 0; 2735 rdev->sb_events = 0;
2736 rdev->last_read_error.tv_sec = 0;
2737 rdev->last_read_error.tv_nsec = 0;
2531 atomic_set(&rdev->nr_pending, 0); 2738 atomic_set(&rdev->nr_pending, 0);
2532 atomic_set(&rdev->read_errors, 0); 2739 atomic_set(&rdev->read_errors, 0);
2533 atomic_set(&rdev->corrected_errors, 0); 2740 atomic_set(&rdev->corrected_errors, 0);
@@ -2609,8 +2816,9 @@ static void analyze_sbs(mddev_t * mddev)
2609 2816
2610 i = 0; 2817 i = 0;
2611 rdev_for_each(rdev, tmp, mddev) { 2818 rdev_for_each(rdev, tmp, mddev) {
2612 if (rdev->desc_nr >= mddev->max_disks || 2819 if (mddev->max_disks &&
2613 i > mddev->max_disks) { 2820 (rdev->desc_nr >= mddev->max_disks ||
2821 i > mddev->max_disks)) {
2614 printk(KERN_WARNING 2822 printk(KERN_WARNING
2615 "md: %s: %s: only %d devices permitted\n", 2823 "md: %s: %s: only %d devices permitted\n",
2616 mdname(mddev), bdevname(rdev->bdev, b), 2824 mdname(mddev), bdevname(rdev->bdev, b),
@@ -2631,13 +2839,54 @@ static void analyze_sbs(mddev_t * mddev)
2631 rdev->desc_nr = i++; 2839 rdev->desc_nr = i++;
2632 rdev->raid_disk = rdev->desc_nr; 2840 rdev->raid_disk = rdev->desc_nr;
2633 set_bit(In_sync, &rdev->flags); 2841 set_bit(In_sync, &rdev->flags);
2634 } else if (rdev->raid_disk >= mddev->raid_disks) { 2842 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
2635 rdev->raid_disk = -1; 2843 rdev->raid_disk = -1;
2636 clear_bit(In_sync, &rdev->flags); 2844 clear_bit(In_sync, &rdev->flags);
2637 } 2845 }
2638 } 2846 }
2639} 2847}
2640 2848
2849/* Read a fixed-point number.
2850 * Numbers in sysfs attributes should be in "standard" units where
2851 * possible, so time should be in seconds.
2852 * However we internally use a much smaller unit such as
2853 * milliseconds or jiffies.
2854 * This function takes a decimal number with a possible fractional
2855 * component, and produces an integer which is the result of
2856 * multiplying that number by 10^'scale',
2857 * all without any floating-point arithmetic.
2858 */
2859int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
2860{
2861 unsigned long result = 0;
2862 long decimals = -1;
2863 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
2864 if (*cp == '.')
2865 decimals = 0;
2866 else if (decimals < scale) {
2867 unsigned int value;
2868 value = *cp - '0';
2869 result = result * 10 + value;
2870 if (decimals >= 0)
2871 decimals++;
2872 }
2873 cp++;
2874 }
2875 if (*cp == '\n')
2876 cp++;
2877 if (*cp)
2878 return -EINVAL;
2879 if (decimals < 0)
2880 decimals = 0;
2881 while (decimals < scale) {
2882 result *= 10;
2883 decimals ++;
2884 }
2885 *res = result;
2886 return 0;
2887}
2888
2889
2641static void md_safemode_timeout(unsigned long data); 2890static void md_safemode_timeout(unsigned long data);
2642 2891
2643static ssize_t 2892static ssize_t
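strict_strtoul_scaled(), added above, parses a decimal string with an optional fractional part and returns the value multiplied by 10^scale using integer arithmetic only; safe_delay_store() below calls it with scale 3 to turn a seconds value into milliseconds. A short illustrative usage sketch (the wrapper function is not part of the patch):

static void example_parse(void)
{
        unsigned long msec;

        strict_strtoul_scaled("0.125", &msec, 3);       /* msec == 125   */
        strict_strtoul_scaled("20\n", &msec, 3);        /* msec == 20000 */

        /* malformed input such as "1.2.3" makes it return -EINVAL */
}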
@@ -2649,31 +2898,10 @@ safe_delay_show(mddev_t *mddev, char *page)
2649static ssize_t 2898static ssize_t
2650safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2899safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2651{ 2900{
2652 int scale=1;
2653 int dot=0;
2654 int i;
2655 unsigned long msec; 2901 unsigned long msec;
2656 char buf[30];
2657 2902
2658 /* remove a period, and count digits after it */ 2903 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2659 if (len >= sizeof(buf))
2660 return -EINVAL;
2661 strlcpy(buf, cbuf, sizeof(buf));
2662 for (i=0; i<len; i++) {
2663 if (dot) {
2664 if (isdigit(buf[i])) {
2665 buf[i-1] = buf[i];
2666 scale *= 10;
2667 }
2668 buf[i] = 0;
2669 } else if (buf[i] == '.') {
2670 dot=1;
2671 buf[i] = 0;
2672 }
2673 }
2674 if (strict_strtoul(buf, 10, &msec) < 0)
2675 return -EINVAL; 2904 return -EINVAL;
2676 msec = (msec * 1000) / scale;
2677 if (msec == 0) 2905 if (msec == 0)
2678 mddev->safemode_delay = 0; 2906 mddev->safemode_delay = 0;
2679 else { 2907 else {
@@ -2706,9 +2934,10 @@ level_show(mddev_t *mddev, char *page)
2706static ssize_t 2934static ssize_t
2707level_store(mddev_t *mddev, const char *buf, size_t len) 2935level_store(mddev_t *mddev, const char *buf, size_t len)
2708{ 2936{
2709 char level[16]; 2937 char clevel[16];
2710 ssize_t rv = len; 2938 ssize_t rv = len;
2711 struct mdk_personality *pers; 2939 struct mdk_personality *pers;
2940 long level;
2712 void *priv; 2941 void *priv;
2713 mdk_rdev_t *rdev; 2942 mdk_rdev_t *rdev;
2714 2943
@@ -2741,19 +2970,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2741 } 2970 }
2742 2971
2743 /* Now find the new personality */ 2972 /* Now find the new personality */
2744 if (len == 0 || len >= sizeof(level)) 2973 if (len == 0 || len >= sizeof(clevel))
2745 return -EINVAL; 2974 return -EINVAL;
2746 strncpy(level, buf, len); 2975 strncpy(clevel, buf, len);
2747 if (level[len-1] == '\n') 2976 if (clevel[len-1] == '\n')
2748 len--; 2977 len--;
2749 level[len] = 0; 2978 clevel[len] = 0;
2979 if (strict_strtol(clevel, 10, &level))
2980 level = LEVEL_NONE;
2750 2981
2751 request_module("md-%s", level); 2982 if (request_module("md-%s", clevel) != 0)
2983 request_module("md-level-%s", clevel);
2752 spin_lock(&pers_lock); 2984 spin_lock(&pers_lock);
2753 pers = find_pers(LEVEL_NONE, level); 2985 pers = find_pers(level, clevel);
2754 if (!pers || !try_module_get(pers->owner)) { 2986 if (!pers || !try_module_get(pers->owner)) {
2755 spin_unlock(&pers_lock); 2987 spin_unlock(&pers_lock);
2756 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2988 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
2757 return -EINVAL; 2989 return -EINVAL;
2758 } 2990 }
2759 spin_unlock(&pers_lock); 2991 spin_unlock(&pers_lock);
@@ -2766,10 +2998,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2766 if (!pers->takeover) { 2998 if (!pers->takeover) {
2767 module_put(pers->owner); 2999 module_put(pers->owner);
2768 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3000 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2769 mdname(mddev), level); 3001 mdname(mddev), clevel);
2770 return -EINVAL; 3002 return -EINVAL;
2771 } 3003 }
2772 3004
3005 list_for_each_entry(rdev, &mddev->disks, same_set)
3006 rdev->new_raid_disk = rdev->raid_disk;
3007
2773 /* ->takeover must set new_* and/or delta_disks 3008 /* ->takeover must set new_* and/or delta_disks
2774 * if it succeeds, and may set them when it fails. 3009 * if it succeeds, and may set them when it fails.
2775 */ 3010 */
@@ -2782,20 +3017,73 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2782 mddev->delta_disks = 0; 3017 mddev->delta_disks = 0;
2783 module_put(pers->owner); 3018 module_put(pers->owner);
2784 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3019 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2785 mdname(mddev), level); 3020 mdname(mddev), clevel);
2786 return PTR_ERR(priv); 3021 return PTR_ERR(priv);
2787 } 3022 }
2788 3023
2789 /* Looks like we have a winner */ 3024 /* Looks like we have a winner */
2790 mddev_suspend(mddev); 3025 mddev_suspend(mddev);
2791 mddev->pers->stop(mddev); 3026 mddev->pers->stop(mddev);
2792 module_put(mddev->pers->owner); 3027
2793 /* Invalidate devices that are now superfluous */ 3028 if (mddev->pers->sync_request == NULL &&
2794 list_for_each_entry(rdev, &mddev->disks, same_set) 3029 pers->sync_request != NULL) {
2795 if (rdev->raid_disk >= mddev->raid_disks) { 3030 /* need to add the md_redundancy_group */
2796 rdev->raid_disk = -1; 3031 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3032 printk(KERN_WARNING
3033 "md: cannot register extra attributes for %s\n",
3034 mdname(mddev));
3035 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
3036 }
3037 if (mddev->pers->sync_request != NULL &&
3038 pers->sync_request == NULL) {
3039 /* need to remove the md_redundancy_group */
3040 if (mddev->to_remove == NULL)
3041 mddev->to_remove = &md_redundancy_group;
3042 }
3043
3044 if (mddev->pers->sync_request == NULL &&
3045 mddev->external) {
3046 /* We are converting from a no-redundancy array
3047 * to a redundancy array and metadata is managed
3048 * externally so we need to be sure that writes
3049 * won't block due to a need to transition
3050 * clean->dirty
3051 * until external management is started.
3052 */
3053 mddev->in_sync = 0;
3054 mddev->safemode_delay = 0;
3055 mddev->safemode = 0;
3056 }
3057
3058 list_for_each_entry(rdev, &mddev->disks, same_set) {
3059 char nm[20];
3060 if (rdev->raid_disk < 0)
3061 continue;
3062 if (rdev->new_raid_disk > mddev->raid_disks)
3063 rdev->new_raid_disk = -1;
3064 if (rdev->new_raid_disk == rdev->raid_disk)
3065 continue;
3066 sprintf(nm, "rd%d", rdev->raid_disk);
3067 sysfs_remove_link(&mddev->kobj, nm);
3068 }
3069 list_for_each_entry(rdev, &mddev->disks, same_set) {
3070 if (rdev->raid_disk < 0)
3071 continue;
3072 if (rdev->new_raid_disk == rdev->raid_disk)
3073 continue;
3074 rdev->raid_disk = rdev->new_raid_disk;
3075 if (rdev->raid_disk < 0)
2797 clear_bit(In_sync, &rdev->flags); 3076 clear_bit(In_sync, &rdev->flags);
3077 else {
3078 char nm[20];
3079 sprintf(nm, "rd%d", rdev->raid_disk);
 3080 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
 3081 printk(KERN_WARNING "md: cannot register %s for %s after level change\n",
3082 nm, mdname(mddev));
2798 } 3083 }
3084 }
3085
3086 module_put(mddev->pers->owner);
2799 mddev->pers = pers; 3087 mddev->pers = pers;
2800 mddev->private = priv; 3088 mddev->private = priv;
2801 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3089 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -2803,11 +3091,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2803 mddev->layout = mddev->new_layout; 3091 mddev->layout = mddev->new_layout;
2804 mddev->chunk_sectors = mddev->new_chunk_sectors; 3092 mddev->chunk_sectors = mddev->new_chunk_sectors;
2805 mddev->delta_disks = 0; 3093 mddev->delta_disks = 0;
3094 if (mddev->pers->sync_request == NULL) {
3095 /* this is now an array without redundancy, so
3096 * it must always be in_sync
3097 */
3098 mddev->in_sync = 1;
3099 del_timer_sync(&mddev->safemode_timer);
3100 }
2806 pers->run(mddev); 3101 pers->run(mddev);
2807 mddev_resume(mddev); 3102 mddev_resume(mddev);
2808 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3103 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2809 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3104 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2810 md_wakeup_thread(mddev->thread); 3105 md_wakeup_thread(mddev->thread);
3106 sysfs_notify(&mddev->kobj, NULL, "level");
3107 md_new_event(mddev);
2811 return rv; 3108 return rv;
2812} 3109}
2813 3110
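
With the strict_strtol()/md-level-%s changes above, the level attribute now accepts a bare personality number as well as a name. The sketch below is illustrative only: the array path is a placeholder, and the write still fails unless the current personality supports takeover to the requested level.

#include <stdio.h>

int main(void)
{
	/* hypothetical array; adjust the path to the md device in question */
	FILE *f = fopen("/sys/block/md0/md/level", "w");

	if (!f)
		return 1;
	/* "5" is parsed by strict_strtol(), find_pers(5, "5") can match the
	 * raid5 personality by level, and md-level-5 is tried as a module
	 * alias when md-5 does not resolve. */
	fputs("5\n", f);
	return fclose(f) ? 1 : 0;
}
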
@@ -2949,7 +3246,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2949 3246
2950 if (mddev->pers) 3247 if (mddev->pers)
2951 return -EBUSY; 3248 return -EBUSY;
2952 if (!*buf || (*e && *e != '\n')) 3249 if (cmd_match(buf, "none"))
3250 n = MaxSector;
3251 else if (!*buf || (*e && *e != '\n'))
2953 return -EINVAL; 3252 return -EINVAL;
2954 3253
2955 mddev->recovery_cp = n; 3254 mddev->recovery_cp = n;
@@ -3044,6 +3343,7 @@ array_state_show(mddev_t *mddev, char *page)
3044} 3343}
3045 3344
3046static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3345static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3346static int md_set_readonly(mddev_t * mddev, int is_open);
3047static int do_md_run(mddev_t * mddev); 3347static int do_md_run(mddev_t * mddev);
3048static int restart_array(mddev_t *mddev); 3348static int restart_array(mddev_t *mddev);
3049 3349
@@ -3074,7 +3374,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3074 break; /* not supported yet */ 3374 break; /* not supported yet */
3075 case readonly: 3375 case readonly:
3076 if (mddev->pers) 3376 if (mddev->pers)
3077 err = do_md_stop(mddev, 1, 0); 3377 err = md_set_readonly(mddev, 0);
3078 else { 3378 else {
3079 mddev->ro = 1; 3379 mddev->ro = 1;
3080 set_disk_ro(mddev->gendisk, 1); 3380 set_disk_ro(mddev->gendisk, 1);
@@ -3084,7 +3384,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3084 case read_auto: 3384 case read_auto:
3085 if (mddev->pers) { 3385 if (mddev->pers) {
3086 if (mddev->ro == 0) 3386 if (mddev->ro == 0)
3087 err = do_md_stop(mddev, 1, 0); 3387 err = md_set_readonly(mddev, 0);
3088 else if (mddev->ro == 1) 3388 else if (mddev->ro == 1)
3089 err = restart_array(mddev); 3389 err = restart_array(mddev);
3090 if (err == 0) { 3390 if (err == 0) {
@@ -3145,6 +3445,29 @@ static struct md_sysfs_entry md_array_state =
3145__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3445__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3146 3446
3147static ssize_t 3447static ssize_t
3448max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3449 return sprintf(page, "%d\n",
3450 atomic_read(&mddev->max_corr_read_errors));
3451}
3452
3453static ssize_t
3454max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3455{
3456 char *e;
3457 unsigned long n = simple_strtoul(buf, &e, 10);
3458
3459 if (*buf && (*e == 0 || *e == '\n')) {
3460 atomic_set(&mddev->max_corr_read_errors, n);
3461 return len;
3462 }
3463 return -EINVAL;
3464}
3465
3466static struct md_sysfs_entry max_corr_read_errors =
3467__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3468 max_corrected_read_errors_store);
3469
3470static ssize_t
3148null_show(mddev_t *mddev, char *page) 3471null_show(mddev_t *mddev, char *page)
3149{ 3472{
3150 return -EINVAL; 3473 return -EINVAL;
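
The new attribute is added to md_default_attrs further down, so it should appear alongside the other per-array files. A minimal userspace sketch, assuming the usual /sys/block/<array>/md/ location (the array name is a placeholder):

#include <stdio.h>

int main(void)
{
	/* hypothetical array; the default limit comes from
	 * MD_DEFAULT_MAX_CORRECTED_READ_ERRORS, set in md_run() below */
	const char *path = "/sys/block/md0/md/max_read_errors";
	FILE *f = fopen(path, "r+");
	char buf[32];

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("current limit: %s", buf);
	rewind(f);
	fputs("50\n", f);	/* raise the array's corrected-read-error limit */
	fclose(f);
	return 0;
}
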
@@ -3225,8 +3548,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3225 } 3548 }
3226 if (*end && !isspace(*end)) break; 3549 if (*end && !isspace(*end)) break;
3227 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3550 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3228 buf = end; 3551 buf = skip_spaces(end);
3229 while (isspace(*buf)) buf++;
3230 } 3552 }
3231 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3553 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3232out: 3554out:
@@ -3769,6 +4091,7 @@ static struct attribute *md_default_attrs[] = {
3769 &md_array_state.attr, 4091 &md_array_state.attr,
3770 &md_reshape_position.attr, 4092 &md_reshape_position.attr,
3771 &md_array_size.attr, 4093 &md_array_size.attr,
4094 &max_corr_read_errors.attr,
3772 NULL, 4095 NULL,
3773}; 4096};
3774 4097
@@ -3850,7 +4173,7 @@ static void md_free(struct kobject *ko)
3850 kfree(mddev); 4173 kfree(mddev);
3851} 4174}
3852 4175
3853static struct sysfs_ops md_sysfs_ops = { 4176static const struct sysfs_ops md_sysfs_ops = {
3854 .show = md_attr_show, 4177 .show = md_attr_show,
3855 .store = md_attr_store, 4178 .store = md_attr_store,
3856}; 4179};
@@ -3866,13 +4189,7 @@ static void mddev_delayed_delete(struct work_struct *ws)
3866{ 4189{
3867 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4190 mddev_t *mddev = container_of(ws, mddev_t, del_work);
3868 4191
3869 if (mddev->private == &md_redundancy_group) { 4192 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
3870 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3871 if (mddev->sysfs_action)
3872 sysfs_put(mddev->sysfs_action);
3873 mddev->sysfs_action = NULL;
3874 mddev->private = NULL;
3875 }
3876 kobject_del(&mddev->kobj); 4193 kobject_del(&mddev->kobj);
3877 kobject_put(&mddev->kobj); 4194 kobject_put(&mddev->kobj);
3878} 4195}
@@ -3964,11 +4281,13 @@ static int md_alloc(dev_t dev, char *name)
3964 disk->disk_name); 4281 disk->disk_name);
3965 error = 0; 4282 error = 0;
3966 } 4283 }
4284 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4285 printk(KERN_DEBUG "pointless warning\n");
3967 abort: 4286 abort:
3968 mutex_unlock(&disks_mutex); 4287 mutex_unlock(&disks_mutex);
3969 if (!error) { 4288 if (!error) {
3970 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4289 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3971 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 4290 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state");
3972 } 4291 }
3973 mddev_put(mddev); 4292 mddev_put(mddev);
3974 return error; 4293 return error;
@@ -4013,11 +4332,10 @@ static void md_safemode_timeout(unsigned long data)
4013 4332
4014static int start_dirty_degraded; 4333static int start_dirty_degraded;
4015 4334
4016static int do_md_run(mddev_t * mddev) 4335static int md_run(mddev_t *mddev)
4017{ 4336{
4018 int err; 4337 int err;
4019 mdk_rdev_t *rdev; 4338 mdk_rdev_t *rdev;
4020 struct gendisk *disk;
4021 struct mdk_personality *pers; 4339 struct mdk_personality *pers;
4022 4340
4023 if (list_empty(&mddev->disks)) 4341 if (list_empty(&mddev->disks))
@@ -4027,6 +4345,13 @@ static int do_md_run(mddev_t * mddev)
4027 if (mddev->pers) 4345 if (mddev->pers)
4028 return -EBUSY; 4346 return -EBUSY;
4029 4347
4348 /* These two calls synchronise us with the
4349 * sysfs_remove_group calls in mddev_unlock,
4350 * so they must have completed.
4351 */
4352 mutex_lock(&mddev->open_mutex);
4353 mutex_unlock(&mddev->open_mutex);
4354
4030 /* 4355 /*
4031 * Analyze all RAID superblock(s) 4356 * Analyze all RAID superblock(s)
4032 */ 4357 */
@@ -4075,11 +4400,6 @@ static int do_md_run(mddev_t * mddev)
4075 sysfs_notify_dirent(rdev->sysfs_state); 4400 sysfs_notify_dirent(rdev->sysfs_state);
4076 } 4401 }
4077 4402
4078 md_probe(mddev->unit, NULL, NULL);
4079 disk = mddev->gendisk;
4080 if (!disk)
4081 return -ENOMEM;
4082
4083 spin_lock(&pers_lock); 4403 spin_lock(&pers_lock);
4084 pers = find_pers(mddev->level, mddev->clevel); 4404 pers = find_pers(mddev->level, mddev->clevel);
4085 if (!pers || !try_module_get(pers->owner)) { 4405 if (!pers || !try_module_get(pers->owner)) {
@@ -4145,7 +4465,7 @@ static int do_md_run(mddev_t * mddev)
4145 mddev->barriers_work = 1; 4465 mddev->barriers_work = 1;
4146 mddev->ok_start_degraded = start_dirty_degraded; 4466 mddev->ok_start_degraded = start_dirty_degraded;
4147 4467
4148 if (start_readonly) 4468 if (start_readonly && mddev->ro == 0)
4149 mddev->ro = 2; /* read-only, but switch on first write */ 4469 mddev->ro = 2; /* read-only, but switch on first write */
4150 4470
4151 err = mddev->pers->run(mddev); 4471 err = mddev->pers->run(mddev);
@@ -4180,11 +4500,13 @@ static int do_md_run(mddev_t * mddev)
4180 printk(KERN_WARNING 4500 printk(KERN_WARNING
4181 "md: cannot register extra attributes for %s\n", 4501 "md: cannot register extra attributes for %s\n",
4182 mdname(mddev)); 4502 mdname(mddev));
4183 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4503 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
4184 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4504 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4185 mddev->ro = 0; 4505 mddev->ro = 0;
4186 4506
4187 atomic_set(&mddev->writes_pending,0); 4507 atomic_set(&mddev->writes_pending,0);
4508 atomic_set(&mddev->max_corr_read_errors,
4509 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4188 mddev->safemode = 0; 4510 mddev->safemode = 0;
4189 mddev->safemode_timer.function = md_safemode_timeout; 4511 mddev->safemode_timer.function = md_safemode_timeout;
4190 mddev->safemode_timer.data = (unsigned long) mddev; 4512 mddev->safemode_timer.data = (unsigned long) mddev;
@@ -4205,49 +4527,32 @@ static int do_md_run(mddev_t * mddev)
4205 if (mddev->flags) 4527 if (mddev->flags)
4206 md_update_sb(mddev, 0); 4528 md_update_sb(mddev, 0);
4207 4529
4208 set_capacity(disk, mddev->array_sectors);
4209
4210 /* If there is a partially-recovered drive we need to
4211 * start recovery here. If we leave it to md_check_recovery,
4212 * it will remove the drives and not do the right thing
4213 */
4214 if (mddev->degraded && !mddev->sync_thread) {
4215 int spares = 0;
4216 list_for_each_entry(rdev, &mddev->disks, same_set)
4217 if (rdev->raid_disk >= 0 &&
4218 !test_bit(In_sync, &rdev->flags) &&
4219 !test_bit(Faulty, &rdev->flags))
4220 /* complete an interrupted recovery */
4221 spares++;
4222 if (spares && mddev->pers->sync_request) {
4223 mddev->recovery = 0;
4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4225 mddev->sync_thread = md_register_thread(md_do_sync,
4226 mddev,
4227 "resync");
4228 if (!mddev->sync_thread) {
4229 printk(KERN_ERR "%s: could not start resync"
4230 " thread...\n",
4231 mdname(mddev));
4232 /* leave the spares where they are, it shouldn't hurt */
4233 mddev->recovery = 0;
4234 }
4235 }
4236 }
4237 md_wakeup_thread(mddev->thread); 4530 md_wakeup_thread(mddev->thread);
4238 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4531 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4239 4532
4240 revalidate_disk(mddev->gendisk);
4241 mddev->changed = 1;
4242 md_new_event(mddev); 4533 md_new_event(mddev);
4243 sysfs_notify_dirent(mddev->sysfs_state); 4534 sysfs_notify_dirent(mddev->sysfs_state);
4244 if (mddev->sysfs_action) 4535 if (mddev->sysfs_action)
4245 sysfs_notify_dirent(mddev->sysfs_action); 4536 sysfs_notify_dirent(mddev->sysfs_action);
4246 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4537 sysfs_notify(&mddev->kobj, NULL, "degraded");
4247 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4248 return 0; 4538 return 0;
4249} 4539}
4250 4540
4541static int do_md_run(mddev_t *mddev)
4542{
4543 int err;
4544
4545 err = md_run(mddev);
4546 if (err)
4547 goto out;
4548
4549 set_capacity(mddev->gendisk, mddev->array_sectors);
4550 revalidate_disk(mddev->gendisk);
4551 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4552out:
4553 return err;
4554}
4555
4251static int restart_array(mddev_t *mddev) 4556static int restart_array(mddev_t *mddev)
4252{ 4557{
4253 struct gendisk *disk = mddev->gendisk; 4558 struct gendisk *disk = mddev->gendisk;
@@ -4289,7 +4594,7 @@ static int deny_bitmap_write_access(struct file * file)
4289 return 0; 4594 return 0;
4290} 4595}
4291 4596
4292static void restore_bitmap_write_access(struct file *file) 4597void restore_bitmap_write_access(struct file *file)
4293{ 4598{
4294 struct inode *inode = file->f_mapping->host; 4599 struct inode *inode = file->f_mapping->host;
4295 4600
@@ -4298,9 +4603,110 @@ static void restore_bitmap_write_access(struct file *file)
4298 spin_unlock(&inode->i_lock); 4603 spin_unlock(&inode->i_lock);
4299} 4604}
4300 4605
4606static void md_clean(mddev_t *mddev)
4607{
4608 mddev->array_sectors = 0;
4609 mddev->external_size = 0;
4610 mddev->dev_sectors = 0;
4611 mddev->raid_disks = 0;
4612 mddev->recovery_cp = 0;
4613 mddev->resync_min = 0;
4614 mddev->resync_max = MaxSector;
4615 mddev->reshape_position = MaxSector;
4616 mddev->external = 0;
4617 mddev->persistent = 0;
4618 mddev->level = LEVEL_NONE;
4619 mddev->clevel[0] = 0;
4620 mddev->flags = 0;
4621 mddev->ro = 0;
4622 mddev->metadata_type[0] = 0;
4623 mddev->chunk_sectors = 0;
4624 mddev->ctime = mddev->utime = 0;
4625 mddev->layout = 0;
4626 mddev->max_disks = 0;
4627 mddev->events = 0;
4628 mddev->can_decrease_events = 0;
4629 mddev->delta_disks = 0;
4630 mddev->new_level = LEVEL_NONE;
4631 mddev->new_layout = 0;
4632 mddev->new_chunk_sectors = 0;
4633 mddev->curr_resync = 0;
4634 mddev->resync_mismatches = 0;
4635 mddev->suspend_lo = mddev->suspend_hi = 0;
4636 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4637 mddev->recovery = 0;
4638 mddev->in_sync = 0;
4639 mddev->degraded = 0;
4640 mddev->barriers_work = 0;
4641 mddev->safemode = 0;
4642 mddev->bitmap_info.offset = 0;
4643 mddev->bitmap_info.default_offset = 0;
4644 mddev->bitmap_info.chunksize = 0;
4645 mddev->bitmap_info.daemon_sleep = 0;
4646 mddev->bitmap_info.max_write_behind = 0;
4647}
4648
4649static void md_stop_writes(mddev_t *mddev)
4650{
4651 if (mddev->sync_thread) {
4652 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4653 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4654 md_unregister_thread(mddev->sync_thread);
4655 mddev->sync_thread = NULL;
4656 }
4657
4658 del_timer_sync(&mddev->safemode_timer);
4659
4660 bitmap_flush(mddev);
4661 md_super_wait(mddev);
4662
4663 if (!mddev->in_sync || mddev->flags) {
4664 /* mark array as shutdown cleanly */
4665 mddev->in_sync = 1;
4666 md_update_sb(mddev, 1);
4667 }
4668}
4669
4670static void md_stop(mddev_t *mddev)
4671{
4672 md_stop_writes(mddev);
4673
4674 mddev->pers->stop(mddev);
4675 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4676 mddev->to_remove = &md_redundancy_group;
4677 module_put(mddev->pers->owner);
4678 mddev->pers = NULL;
4679 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4680}
4681
4682static int md_set_readonly(mddev_t *mddev, int is_open)
4683{
4684 int err = 0;
4685 mutex_lock(&mddev->open_mutex);
4686 if (atomic_read(&mddev->openers) > is_open) {
 4687 printk("md: %s still in use.\n", mdname(mddev));
4688 err = -EBUSY;
4689 goto out;
4690 }
4691 if (mddev->pers) {
4692 md_stop_writes(mddev);
4693
4694 err = -ENXIO;
4695 if (mddev->ro==1)
4696 goto out;
4697 mddev->ro = 1;
4698 set_disk_ro(mddev->gendisk, 1);
4699 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4700 sysfs_notify_dirent(mddev->sysfs_state);
4701 err = 0;
4702 }
4703out:
4704 mutex_unlock(&mddev->open_mutex);
4705 return err;
4706}
4707
4301/* mode: 4708/* mode:
4302 * 0 - completely stop and dis-assemble array 4709 * 0 - completely stop and dis-assemble array
4303 * 1 - switch to readonly
4304 * 2 - stop but do not disassemble array 4710 * 2 - stop but do not disassemble array
4305 */ 4711 */
4306static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4712static int do_md_stop(mddev_t * mddev, int mode, int is_open)
@@ -4315,64 +4721,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4315 err = -EBUSY; 4721 err = -EBUSY;
4316 } else if (mddev->pers) { 4722 } else if (mddev->pers) {
4317 4723
4318 if (mddev->sync_thread) { 4724 if (mddev->ro)
4319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4725 set_disk_ro(disk, 0);
4320 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4321 md_unregister_thread(mddev->sync_thread);
4322 mddev->sync_thread = NULL;
4323 }
4324
4325 del_timer_sync(&mddev->safemode_timer);
4326 4726
4327 switch(mode) { 4727 md_stop(mddev);
4328 case 1: /* readonly */ 4728 mddev->queue->merge_bvec_fn = NULL;
4329 err = -ENXIO; 4729 mddev->queue->unplug_fn = NULL;
4330 if (mddev->ro==1) 4730 mddev->queue->backing_dev_info.congested_fn = NULL;
4331 goto out;
4332 mddev->ro = 1;
4333 break;
4334 case 0: /* disassemble */
4335 case 2: /* stop */
4336 bitmap_flush(mddev);
4337 md_super_wait(mddev);
4338 if (mddev->ro)
4339 set_disk_ro(disk, 0);
4340 4731
4341 mddev->pers->stop(mddev); 4732 /* tell userspace to handle 'inactive' */
4342 mddev->queue->merge_bvec_fn = NULL; 4733 sysfs_notify_dirent(mddev->sysfs_state);
4343 mddev->queue->unplug_fn = NULL;
4344 mddev->queue->backing_dev_info.congested_fn = NULL;
4345 module_put(mddev->pers->owner);
4346 if (mddev->pers->sync_request)
4347 mddev->private = &md_redundancy_group;
4348 mddev->pers = NULL;
4349 /* tell userspace to handle 'inactive' */
4350 sysfs_notify_dirent(mddev->sysfs_state);
4351 4734
4352 list_for_each_entry(rdev, &mddev->disks, same_set) 4735 list_for_each_entry(rdev, &mddev->disks, same_set)
4353 if (rdev->raid_disk >= 0) { 4736 if (rdev->raid_disk >= 0) {
4354 char nm[20]; 4737 char nm[20];
4355 sprintf(nm, "rd%d", rdev->raid_disk); 4738 sprintf(nm, "rd%d", rdev->raid_disk);
4356 sysfs_remove_link(&mddev->kobj, nm); 4739 sysfs_remove_link(&mddev->kobj, nm);
4357 } 4740 }
4358 4741
4359 set_capacity(disk, 0); 4742 set_capacity(disk, 0);
4360 mddev->changed = 1; 4743 revalidate_disk(disk);
4361 4744
4362 if (mddev->ro) 4745 if (mddev->ro)
4363 mddev->ro = 0; 4746 mddev->ro = 0;
4364 } 4747
4365 if (!mddev->in_sync || mddev->flags) {
4366 /* mark array as shutdown cleanly */
4367 mddev->in_sync = 1;
4368 md_update_sb(mddev, 1);
4369 }
4370 if (mode == 1)
4371 set_disk_ro(disk, 1);
4372 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4373 err = 0; 4748 err = 0;
4374 } 4749 }
4375out:
4376 mutex_unlock(&mddev->open_mutex); 4750 mutex_unlock(&mddev->open_mutex);
4377 if (err) 4751 if (err)
4378 return err; 4752 return err;
@@ -4384,59 +4758,21 @@ out:
4384 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4758 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4385 4759
4386 bitmap_destroy(mddev); 4760 bitmap_destroy(mddev);
4387 if (mddev->bitmap_file) { 4761 if (mddev->bitmap_info.file) {
4388 restore_bitmap_write_access(mddev->bitmap_file); 4762 restore_bitmap_write_access(mddev->bitmap_info.file);
4389 fput(mddev->bitmap_file); 4763 fput(mddev->bitmap_info.file);
4390 mddev->bitmap_file = NULL; 4764 mddev->bitmap_info.file = NULL;
4391 } 4765 }
4392 mddev->bitmap_offset = 0; 4766 mddev->bitmap_info.offset = 0;
4393
4394 /* make sure all md_delayed_delete calls have finished */
4395 flush_scheduled_work();
4396 4767
4397 export_array(mddev); 4768 export_array(mddev);
4398 4769
4399 mddev->array_sectors = 0; 4770 md_clean(mddev);
4400 mddev->external_size = 0;
4401 mddev->dev_sectors = 0;
4402 mddev->raid_disks = 0;
4403 mddev->recovery_cp = 0;
4404 mddev->resync_min = 0;
4405 mddev->resync_max = MaxSector;
4406 mddev->reshape_position = MaxSector;
4407 mddev->external = 0;
4408 mddev->persistent = 0;
4409 mddev->level = LEVEL_NONE;
4410 mddev->clevel[0] = 0;
4411 mddev->flags = 0;
4412 mddev->ro = 0;
4413 mddev->metadata_type[0] = 0;
4414 mddev->chunk_sectors = 0;
4415 mddev->ctime = mddev->utime = 0;
4416 mddev->layout = 0;
4417 mddev->max_disks = 0;
4418 mddev->events = 0;
4419 mddev->delta_disks = 0;
4420 mddev->new_level = LEVEL_NONE;
4421 mddev->new_layout = 0;
4422 mddev->new_chunk_sectors = 0;
4423 mddev->curr_resync = 0;
4424 mddev->resync_mismatches = 0;
4425 mddev->suspend_lo = mddev->suspend_hi = 0;
4426 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4427 mddev->recovery = 0;
4428 mddev->in_sync = 0;
4429 mddev->changed = 0;
4430 mddev->degraded = 0;
4431 mddev->barriers_work = 0;
4432 mddev->safemode = 0;
4433 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4771 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4434 if (mddev->hold_active == UNTIL_STOP) 4772 if (mddev->hold_active == UNTIL_STOP)
4435 mddev->hold_active = 0; 4773 mddev->hold_active = 0;
4436 4774
4437 } else if (mddev->pers) 4775 }
4438 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4439 mdname(mddev));
4440 err = 0; 4776 err = 0;
4441 blk_integrity_unregister(disk); 4777 blk_integrity_unregister(disk);
4442 md_new_event(mddev); 4778 md_new_event(mddev);
@@ -4615,7 +4951,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4615 info.state = 0; 4951 info.state = 0;
4616 if (mddev->in_sync) 4952 if (mddev->in_sync)
4617 info.state = (1<<MD_SB_CLEAN); 4953 info.state = (1<<MD_SB_CLEAN);
4618 if (mddev->bitmap && mddev->bitmap_offset) 4954 if (mddev->bitmap && mddev->bitmap_info.offset)
4619 info.state = (1<<MD_SB_BITMAP_PRESENT); 4955 info.state = (1<<MD_SB_BITMAP_PRESENT);
4620 info.active_disks = insync; 4956 info.active_disks = insync;
4621 info.working_disks = working; 4957 info.working_disks = working;
@@ -4973,23 +5309,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
4973 if (fd >= 0) { 5309 if (fd >= 0) {
4974 if (mddev->bitmap) 5310 if (mddev->bitmap)
4975 return -EEXIST; /* cannot add when bitmap is present */ 5311 return -EEXIST; /* cannot add when bitmap is present */
4976 mddev->bitmap_file = fget(fd); 5312 mddev->bitmap_info.file = fget(fd);
4977 5313
4978 if (mddev->bitmap_file == NULL) { 5314 if (mddev->bitmap_info.file == NULL) {
4979 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5315 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4980 mdname(mddev)); 5316 mdname(mddev));
4981 return -EBADF; 5317 return -EBADF;
4982 } 5318 }
4983 5319
4984 err = deny_bitmap_write_access(mddev->bitmap_file); 5320 err = deny_bitmap_write_access(mddev->bitmap_info.file);
4985 if (err) { 5321 if (err) {
4986 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 5322 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4987 mdname(mddev)); 5323 mdname(mddev));
4988 fput(mddev->bitmap_file); 5324 fput(mddev->bitmap_info.file);
4989 mddev->bitmap_file = NULL; 5325 mddev->bitmap_info.file = NULL;
4990 return err; 5326 return err;
4991 } 5327 }
4992 mddev->bitmap_offset = 0; /* file overrides offset */ 5328 mddev->bitmap_info.offset = 0; /* file overrides offset */
4993 } else if (mddev->bitmap == NULL) 5329 } else if (mddev->bitmap == NULL)
4994 return -ENOENT; /* cannot remove what isn't there */ 5330 return -ENOENT; /* cannot remove what isn't there */
4995 err = 0; 5331 err = 0;
@@ -5004,11 +5340,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
5004 mddev->pers->quiesce(mddev, 0); 5340 mddev->pers->quiesce(mddev, 0);
5005 } 5341 }
5006 if (fd < 0) { 5342 if (fd < 0) {
5007 if (mddev->bitmap_file) { 5343 if (mddev->bitmap_info.file) {
5008 restore_bitmap_write_access(mddev->bitmap_file); 5344 restore_bitmap_write_access(mddev->bitmap_info.file);
5009 fput(mddev->bitmap_file); 5345 fput(mddev->bitmap_info.file);
5010 } 5346 }
5011 mddev->bitmap_file = NULL; 5347 mddev->bitmap_info.file = NULL;
5012 } 5348 }
5013 5349
5014 return err; 5350 return err;
@@ -5045,6 +5381,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5045 mddev->minor_version = info->minor_version; 5381 mddev->minor_version = info->minor_version;
5046 mddev->patch_version = info->patch_version; 5382 mddev->patch_version = info->patch_version;
5047 mddev->persistent = !info->not_persistent; 5383 mddev->persistent = !info->not_persistent;
5384 /* ensure mddev_put doesn't delete this now that there
5385 * is some minimal configuration.
5386 */
5387 mddev->ctime = get_seconds();
5048 return 0; 5388 return 0;
5049 } 5389 }
5050 mddev->major_version = MD_MAJOR_VERSION; 5390 mddev->major_version = MD_MAJOR_VERSION;
@@ -5075,8 +5415,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5075 mddev->flags = 0; 5415 mddev->flags = 0;
5076 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5416 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5077 5417
5078 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5418 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5079 mddev->bitmap_offset = 0; 5419 mddev->bitmap_info.offset = 0;
5080 5420
5081 mddev->reshape_position = MaxSector; 5421 mddev->reshape_position = MaxSector;
5082 5422
@@ -5150,7 +5490,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
5150 if (mddev->pers->check_reshape == NULL) 5490 if (mddev->pers->check_reshape == NULL)
5151 return -EINVAL; 5491 return -EINVAL;
5152 if (raid_disks <= 0 || 5492 if (raid_disks <= 0 ||
5153 raid_disks >= mddev->max_disks) 5493 (mddev->max_disks && raid_disks >= mddev->max_disks))
5154 return -EINVAL; 5494 return -EINVAL;
5155 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5495 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5156 return -EBUSY; 5496 return -EBUSY;
@@ -5176,7 +5516,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5176 int state = 0; 5516 int state = 0;
5177 5517
5178 /* calculate expected state,ignoring low bits */ 5518 /* calculate expected state,ignoring low bits */
5179 if (mddev->bitmap && mddev->bitmap_offset) 5519 if (mddev->bitmap && mddev->bitmap_info.offset)
5180 state |= (1 << MD_SB_BITMAP_PRESENT); 5520 state |= (1 << MD_SB_BITMAP_PRESENT);
5181 5521
5182 if (mddev->major_version != info->major_version || 5522 if (mddev->major_version != info->major_version ||
@@ -5235,9 +5575,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5235 /* add the bitmap */ 5575 /* add the bitmap */
5236 if (mddev->bitmap) 5576 if (mddev->bitmap)
5237 return -EEXIST; 5577 return -EEXIST;
5238 if (mddev->default_bitmap_offset == 0) 5578 if (mddev->bitmap_info.default_offset == 0)
5239 return -EINVAL; 5579 return -EINVAL;
5240 mddev->bitmap_offset = mddev->default_bitmap_offset; 5580 mddev->bitmap_info.offset =
5581 mddev->bitmap_info.default_offset;
5241 mddev->pers->quiesce(mddev, 1); 5582 mddev->pers->quiesce(mddev, 1);
5242 rv = bitmap_create(mddev); 5583 rv = bitmap_create(mddev);
5243 if (rv) 5584 if (rv)
@@ -5252,7 +5593,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5252 mddev->pers->quiesce(mddev, 1); 5593 mddev->pers->quiesce(mddev, 1);
5253 bitmap_destroy(mddev); 5594 bitmap_destroy(mddev);
5254 mddev->pers->quiesce(mddev, 0); 5595 mddev->pers->quiesce(mddev, 0);
5255 mddev->bitmap_offset = 0; 5596 mddev->bitmap_info.offset = 0;
5256 } 5597 }
5257 } 5598 }
5258 md_update_sb(mddev, 1); 5599 md_update_sb(mddev, 1);
@@ -5286,7 +5627,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5286 5627
5287 geo->heads = 2; 5628 geo->heads = 2;
5288 geo->sectors = 4; 5629 geo->sectors = 4;
5289 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5630 geo->cylinders = mddev->array_sectors / 8;
5290 return 0; 5631 return 0;
5291} 5632}
5292 5633
@@ -5296,6 +5637,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5296 int err = 0; 5637 int err = 0;
5297 void __user *argp = (void __user *)arg; 5638 void __user *argp = (void __user *)arg;
5298 mddev_t *mddev = NULL; 5639 mddev_t *mddev = NULL;
5640 int ro;
5299 5641
5300 if (!capable(CAP_SYS_ADMIN)) 5642 if (!capable(CAP_SYS_ADMIN))
5301 return -EACCES; 5643 return -EACCES;
@@ -5428,9 +5770,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5428 goto done_unlock; 5770 goto done_unlock;
5429 5771
5430 case STOP_ARRAY_RO: 5772 case STOP_ARRAY_RO:
5431 err = do_md_stop(mddev, 1, 1); 5773 err = md_set_readonly(mddev, 1);
5432 goto done_unlock; 5774 goto done_unlock;
5433 5775
5776 case BLKROSET:
5777 if (get_user(ro, (int __user *)(arg))) {
5778 err = -EFAULT;
5779 goto done_unlock;
5780 }
5781 err = -EINVAL;
5782
 5783 /* if the bdev is going read-only, the value of mddev->ro
5784 * does not matter, no writes are coming
5785 */
5786 if (ro)
5787 goto done_unlock;
5788
 5789 /* are we already prepared for writes? */
5790 if (mddev->ro != 1)
5791 goto done_unlock;
5792
5793 /* transitioning to readauto need only happen for
5794 * arrays that call md_write_start
5795 */
5796 if (mddev->pers) {
5797 err = restart_array(mddev);
5798 if (err == 0) {
5799 mddev->ro = 2;
5800 set_disk_ro(mddev->gendisk, 0);
5801 }
5802 }
5803 goto done_unlock;
5434 } 5804 }
5435 5805
5436 /* 5806 /*
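
For reference, the path that reaches the new BLKROSET case is the ordinary block-device ioctl issued by tools such as blockdev --setrw. A hedged userspace sketch, with the device path as a placeholder:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int ro = 0;				/* 0 = request read-write */
	int fd = open("/dev/md0", O_RDONLY);	/* hypothetical array */

	if (fd < 0)
		return 1;
	/* With this hunk, md handles the transition itself: an array with
	 * ro==1 is restarted and left in auto-readonly (ro==2), so the
	 * first real write flips it fully read-write. */
	if (ioctl(fd, BLKROSET, &ro) < 0)
		perror("BLKROSET");
	close(fd);
	return 0;
}
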
@@ -5503,6 +5873,25 @@ done:
5503abort: 5873abort:
5504 return err; 5874 return err;
5505} 5875}
5876#ifdef CONFIG_COMPAT
5877static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
5878 unsigned int cmd, unsigned long arg)
5879{
5880 switch (cmd) {
5881 case HOT_REMOVE_DISK:
5882 case HOT_ADD_DISK:
5883 case SET_DISK_FAULTY:
5884 case SET_BITMAP_FILE:
5885 /* These take in integer arg, do not convert */
5886 break;
5887 default:
5888 arg = (unsigned long)compat_ptr(arg);
5889 break;
5890 }
5891
5892 return md_ioctl(bdev, mode, cmd, arg);
5893}
5894#endif /* CONFIG_COMPAT */
5506 5895
5507static int md_open(struct block_device *bdev, fmode_t mode) 5896static int md_open(struct block_device *bdev, fmode_t mode)
5508{ 5897{
@@ -5532,7 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5532 atomic_inc(&mddev->openers); 5921 atomic_inc(&mddev->openers);
5533 mutex_unlock(&mddev->open_mutex); 5922 mutex_unlock(&mddev->open_mutex);
5534 5923
5535 check_disk_change(bdev); 5924 check_disk_size_change(mddev->gendisk, bdev);
5536 out: 5925 out:
5537 return err; 5926 return err;
5538} 5927}
@@ -5547,30 +5936,16 @@ static int md_release(struct gendisk *disk, fmode_t mode)
5547 5936
5548 return 0; 5937 return 0;
5549} 5938}
5550
5551static int md_media_changed(struct gendisk *disk)
5552{
5553 mddev_t *mddev = disk->private_data;
5554
5555 return mddev->changed;
5556}
5557
5558static int md_revalidate(struct gendisk *disk)
5559{
5560 mddev_t *mddev = disk->private_data;
5561
5562 mddev->changed = 0;
5563 return 0;
5564}
5565static const struct block_device_operations md_fops = 5939static const struct block_device_operations md_fops =
5566{ 5940{
5567 .owner = THIS_MODULE, 5941 .owner = THIS_MODULE,
5568 .open = md_open, 5942 .open = md_open,
5569 .release = md_release, 5943 .release = md_release,
5570 .ioctl = md_ioctl, 5944 .ioctl = md_ioctl,
5945#ifdef CONFIG_COMPAT
5946 .compat_ioctl = md_compat_ioctl,
5947#endif
5571 .getgeo = md_getgeo, 5948 .getgeo = md_getgeo,
5572 .media_changed = md_media_changed,
5573 .revalidate_disk= md_revalidate,
5574}; 5949};
5575 5950
5576static int md_thread(void * arg) 5951static int md_thread(void * arg)
@@ -5684,7 +6059,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5684 mddev->pers->error_handler(mddev,rdev); 6059 mddev->pers->error_handler(mddev,rdev);
5685 if (mddev->degraded) 6060 if (mddev->degraded)
5686 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6061 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5687 set_bit(StateChanged, &rdev->flags); 6062 sysfs_notify_dirent(rdev->sysfs_state);
5688 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6063 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5689 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6064 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5690 md_wakeup_thread(mddev->thread); 6065 md_wakeup_thread(mddev->thread);
@@ -5961,14 +6336,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
5961 unsigned long chunk_kb; 6336 unsigned long chunk_kb;
5962 unsigned long flags; 6337 unsigned long flags;
5963 spin_lock_irqsave(&bitmap->lock, flags); 6338 spin_lock_irqsave(&bitmap->lock, flags);
5964 chunk_kb = bitmap->chunksize >> 10; 6339 chunk_kb = mddev->bitmap_info.chunksize >> 10;
5965 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 6340 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5966 "%lu%s chunk", 6341 "%lu%s chunk",
5967 bitmap->pages - bitmap->missing_pages, 6342 bitmap->pages - bitmap->missing_pages,
5968 bitmap->pages, 6343 bitmap->pages,
5969 (bitmap->pages - bitmap->missing_pages) 6344 (bitmap->pages - bitmap->missing_pages)
5970 << (PAGE_SHIFT - 10), 6345 << (PAGE_SHIFT - 10),
5971 chunk_kb ? chunk_kb : bitmap->chunksize, 6346 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
5972 chunk_kb ? "KB" : "B"); 6347 chunk_kb ? "KB" : "B");
5973 if (bitmap->file) { 6348 if (bitmap->file) {
5974 seq_printf(seq, ", file: "); 6349 seq_printf(seq, ", file: ");
@@ -6254,10 +6629,11 @@ void md_do_sync(mddev_t *mddev)
6254 mddev->curr_resync = 2; 6629 mddev->curr_resync = 2;
6255 6630
6256 try_again: 6631 try_again:
6257 if (kthread_should_stop()) { 6632 if (kthread_should_stop())
6258 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6633 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6634
6635 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6259 goto skip; 6636 goto skip;
6260 }
6261 for_each_mddev(mddev2, tmp) { 6637 for_each_mddev(mddev2, tmp) {
6262 if (mddev2 == mddev) 6638 if (mddev2 == mddev)
6263 continue; 6639 continue;
@@ -6317,12 +6693,14 @@ void md_do_sync(mddev_t *mddev)
6317 /* recovery follows the physical size of devices */ 6693 /* recovery follows the physical size of devices */
6318 max_sectors = mddev->dev_sectors; 6694 max_sectors = mddev->dev_sectors;
6319 j = MaxSector; 6695 j = MaxSector;
6320 list_for_each_entry(rdev, &mddev->disks, same_set) 6696 rcu_read_lock();
6697 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6321 if (rdev->raid_disk >= 0 && 6698 if (rdev->raid_disk >= 0 &&
6322 !test_bit(Faulty, &rdev->flags) && 6699 !test_bit(Faulty, &rdev->flags) &&
6323 !test_bit(In_sync, &rdev->flags) && 6700 !test_bit(In_sync, &rdev->flags) &&
6324 rdev->recovery_offset < j) 6701 rdev->recovery_offset < j)
6325 j = rdev->recovery_offset; 6702 j = rdev->recovery_offset;
6703 rcu_read_unlock();
6326 } 6704 }
6327 6705
6328 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6706 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -6359,6 +6737,7 @@ void md_do_sync(mddev_t *mddev)
6359 desc, mdname(mddev)); 6737 desc, mdname(mddev));
6360 mddev->curr_resync = j; 6738 mddev->curr_resync = j;
6361 } 6739 }
6740 mddev->curr_resync_completed = mddev->curr_resync;
6362 6741
6363 while (j < max_sectors) { 6742 while (j < max_sectors) {
6364 sector_t sectors; 6743 sector_t sectors;
@@ -6491,21 +6870,30 @@ void md_do_sync(mddev_t *mddev)
6491 } else { 6870 } else {
6492 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6871 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6493 mddev->curr_resync = MaxSector; 6872 mddev->curr_resync = MaxSector;
6494 list_for_each_entry(rdev, &mddev->disks, same_set) 6873 rcu_read_lock();
6874 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6495 if (rdev->raid_disk >= 0 && 6875 if (rdev->raid_disk >= 0 &&
6876 mddev->delta_disks >= 0 &&
6496 !test_bit(Faulty, &rdev->flags) && 6877 !test_bit(Faulty, &rdev->flags) &&
6497 !test_bit(In_sync, &rdev->flags) && 6878 !test_bit(In_sync, &rdev->flags) &&
6498 rdev->recovery_offset < mddev->curr_resync) 6879 rdev->recovery_offset < mddev->curr_resync)
6499 rdev->recovery_offset = mddev->curr_resync; 6880 rdev->recovery_offset = mddev->curr_resync;
6881 rcu_read_unlock();
6500 } 6882 }
6501 } 6883 }
6502 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6884 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6503 6885
6504 skip: 6886 skip:
6887 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6888 /* We completed so min/max setting can be forgotten if used. */
6889 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6890 mddev->resync_min = 0;
6891 mddev->resync_max = MaxSector;
6892 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6893 mddev->resync_min = mddev->curr_resync_completed;
6505 mddev->curr_resync = 0; 6894 mddev->curr_resync = 0;
6506 mddev->curr_resync_completed = 0; 6895 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6507 mddev->resync_min = 0; 6896 mddev->curr_resync_completed = 0;
6508 mddev->resync_max = MaxSector;
6509 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6897 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6510 wake_up(&resync_wait); 6898 wake_up(&resync_wait);
6511 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6899 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
@@ -6568,6 +6956,7 @@ static int remove_and_add_spares(mddev_t *mddev)
6568 nm, mdname(mddev)); 6956 nm, mdname(mddev));
6569 spares++; 6957 spares++;
6570 md_new_event(mddev); 6958 md_new_event(mddev);
6959 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6571 } else 6960 } else
6572 break; 6961 break;
6573 } 6962 }
@@ -6603,7 +6992,7 @@ void md_check_recovery(mddev_t *mddev)
6603 6992
6604 6993
6605 if (mddev->bitmap) 6994 if (mddev->bitmap)
6606 bitmap_daemon_work(mddev->bitmap); 6995 bitmap_daemon_work(mddev);
6607 6996
6608 if (mddev->ro) 6997 if (mddev->ro)
6609 return; 6998 return;
@@ -6663,11 +7052,6 @@ void md_check_recovery(mddev_t *mddev)
6663 if (mddev->flags) 7052 if (mddev->flags)
6664 md_update_sb(mddev, 0); 7053 md_update_sb(mddev, 0);
6665 7054
6666 list_for_each_entry(rdev, &mddev->disks, same_set)
6667 if (test_and_clear_bit(StateChanged, &rdev->flags))
6668 sysfs_notify_dirent(rdev->sysfs_state);
6669
6670
6671 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7055 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6672 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7056 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6673 /* resync/recovery still happening */ 7057 /* resync/recovery still happening */
@@ -6804,7 +7188,7 @@ static int md_notify_reboot(struct notifier_block *this,
6804 * appears to still be in use. Hence 7188 * appears to still be in use. Hence
6805 * the '100'. 7189 * the '100'.
6806 */ 7190 */
6807 do_md_stop(mddev, 1, 100); 7191 md_set_readonly(mddev, 100);
6808 mddev_unlock(mddev); 7192 mddev_unlock(mddev);
6809 } 7193 }
6810 /* 7194 /*
@@ -6973,5 +7357,6 @@ EXPORT_SYMBOL(md_unregister_thread);
6973EXPORT_SYMBOL(md_wakeup_thread); 7357EXPORT_SYMBOL(md_wakeup_thread);
6974EXPORT_SYMBOL(md_check_recovery); 7358EXPORT_SYMBOL(md_check_recovery);
6975MODULE_LICENSE("GPL"); 7359MODULE_LICENSE("GPL");
7360MODULE_DESCRIPTION("MD RAID framework");
6976MODULE_ALIAS("md"); 7361MODULE_ALIAS("md");
6977MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 7362MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f184b69ef337..10597bfec000 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -74,13 +74,13 @@ struct mdk_rdev_s
 74#define Blocked 8 /* An error occurred on an externally 74#define Blocked 8 /* An error occurred on an externally
75 * managed array, don't allow writes 75 * managed array, don't allow writes
76 * until it is cleared */ 76 * until it is cleared */
77#define StateChanged 9 /* Faulty or Blocked has changed during
78 * interrupt, so it needs to be
79 * notified by the thread */
80 wait_queue_head_t blocked_wait; 77 wait_queue_head_t blocked_wait;
81 78
82 int desc_nr; /* descriptor index in the superblock */ 79 int desc_nr; /* descriptor index in the superblock */
83 int raid_disk; /* role of device in array */ 80 int raid_disk; /* role of device in array */
81 int new_raid_disk; /* role that the device will have in
82 * the array after a level-change completes.
83 */
84 int saved_raid_disk; /* role that device used to have in the 84 int saved_raid_disk; /* role that device used to have in the
85 * array and could again if we did a partial 85 * array and could again if we did a partial
86 * resync from the bitmap 86 * resync from the bitmap
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97 atomic_t read_errors; /* number of consecutive read errors that 97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore. 98 * we have tried to ignore.
99 */ 99 */
100 struct timespec last_read_error; /* monotonic time since our
101 * last read error
102 */
100 atomic_t corrected_errors; /* number of corrected read errors, 103 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing 104 * for reporting to userspace and storing
102 * in superblock. 105 * in superblock.
@@ -150,6 +153,12 @@ struct mddev_s
150 int external_size; /* size managed 153 int external_size; /* size managed
151 * externally */ 154 * externally */
152 __u64 events; 155 __u64 events;
156 /* If the last 'event' was simply a clean->dirty transition, and
157 * we didn't write it to the spares, then it is safe and simple
158 * to just decrement the event count on a dirty->clean transition.
159 * So we record that possibility here.
160 */
161 int can_decrease_events;
153 162
154 char uuid[16]; 163 char uuid[16];
155 164
@@ -237,7 +246,6 @@ struct mddev_s
237 atomic_t active; /* general refcount */ 246 atomic_t active; /* general refcount */
238 atomic_t openers; /* number of active opens */ 247 atomic_t openers; /* number of active opens */
239 248
240 int changed; /* true if we might need to reread partition info */
241 int degraded; /* whether md should consider 249 int degraded; /* whether md should consider
242 * adding a spare 250 * adding a spare
243 */ 251 */
@@ -276,21 +284,40 @@ struct mddev_s
276 atomic_t writes_pending; 284 atomic_t writes_pending;
277 struct request_queue *queue; /* for plugging ... */ 285 struct request_queue *queue; /* for plugging ... */
278 286
279 atomic_t write_behind; /* outstanding async IO */
280 unsigned int max_write_behind; /* 0 = sync */
281
282 struct bitmap *bitmap; /* the bitmap for the device */ 287 struct bitmap *bitmap; /* the bitmap for the device */
283 struct file *bitmap_file; /* the bitmap file */ 288 struct {
284 long bitmap_offset; /* offset from superblock of 289 struct file *file; /* the bitmap file */
285 * start of bitmap. May be 290 loff_t offset; /* offset from superblock of
286 * negative, but not '0' 291 * start of bitmap. May be
287 */ 292 * negative, but not '0'
288 long default_bitmap_offset; /* this is the offset to use when 293 * For external metadata, offset
289 * hot-adding a bitmap. It should 294 * from start of device.
290 * eventually be settable by sysfs. 295 */
291 */ 296 loff_t default_offset; /* this is the offset to use when
292 297 * hot-adding a bitmap. It should
298 * eventually be settable by sysfs.
299 */
300 struct mutex mutex;
301 unsigned long chunksize;
302 unsigned long daemon_sleep; /* how many seconds between updates? */
303 unsigned long max_write_behind; /* write-behind mode */
304 int external;
305 } bitmap_info;
306
307 atomic_t max_corr_read_errors; /* max read retries */
293 struct list_head all_mddevs; 308 struct list_head all_mddevs;
309
310 struct attribute_group *to_remove;
311 /* Generic barrier handling.
312 * If there is a pending barrier request, all other
313 * writes are blocked while the devices are flushed.
314 * The last to finish a flush schedules a worker to
315 * submit the barrier request (without the barrier flag),
316 * then submit more flush requests.
317 */
318 struct bio *barrier;
319 atomic_t flush_pending;
320 struct work_struct barrier_work;
294}; 321};
295 322
296 323
@@ -312,7 +339,7 @@ struct mdk_personality
312 int level; 339 int level;
313 struct list_head list; 340 struct list_head list;
314 struct module *owner; 341 struct module *owner;
315 int (*make_request)(struct request_queue *q, struct bio *bio); 342 int (*make_request)(mddev_t *mddev, struct bio *bio);
316 int (*run)(mddev_t *mddev); 343 int (*run)(mddev_t *mddev);
317 int (*stop)(mddev_t *mddev); 344 int (*stop)(mddev_t *mddev);
318 void (*status)(struct seq_file *seq, mddev_t *mddev); 345 void (*status)(struct seq_file *seq, mddev_t *mddev);
@@ -353,7 +380,7 @@ struct md_sysfs_entry {
353 ssize_t (*show)(mddev_t *, char *); 380 ssize_t (*show)(mddev_t *, char *);
354 ssize_t (*store)(mddev_t *, const char *, size_t); 381 ssize_t (*store)(mddev_t *, const char *, size_t);
355}; 382};
356 383extern struct attribute_group md_bitmap_group;
357 384
358static inline char * mdname (mddev_t * mddev) 385static inline char * mdname (mddev_t * mddev)
359{ 386{
@@ -431,6 +458,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 458extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
432 459
433extern int mddev_congested(mddev_t *mddev, int bits); 460extern int mddev_congested(mddev_t *mddev, int bits);
461extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
434extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 462extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
435 sector_t sector, int size, struct page *page); 463 sector_t sector, int size, struct page *page);
436extern void md_super_wait(mddev_t *mddev); 464extern void md_super_wait(mddev_t *mddev);
@@ -443,6 +471,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
443extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 471extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
444extern int md_check_no_bitmap(mddev_t *mddev); 472extern int md_check_no_bitmap(mddev_t *mddev);
445extern int md_integrity_register(mddev_t *mddev); 473extern int md_integrity_register(mddev_t *mddev);
446void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 474extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
475extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
476extern void restore_bitmap_write_access(struct file *file);
447 477
448#endif /* _MD_MD_H */ 478#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a0..410fb60699ac 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/raid/md_u.h> 23#include <linux/raid/md_u.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/slab.h>
25#include "md.h" 26#include "md.h"
26#include "multipath.h" 27#include "multipath.h"
27 28
@@ -84,7 +85,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
84static void multipath_end_request(struct bio *bio, int error) 85static void multipath_end_request(struct bio *bio, int error)
85{ 86{
86 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
87 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 88 struct multipath_bh *mp_bh = bio->bi_private;
88 multipath_conf_t *conf = mp_bh->mddev->private; 89 multipath_conf_t *conf = mp_bh->mddev->private;
89 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
90 91
@@ -135,17 +136,14 @@ static void multipath_unplug(struct request_queue *q)
135} 136}
136 137
137 138
138static int multipath_make_request (struct request_queue *q, struct bio * bio) 139static int multipath_make_request(mddev_t *mddev, struct bio * bio)
139{ 140{
140 mddev_t *mddev = q->queuedata;
141 multipath_conf_t *conf = mddev->private; 141 multipath_conf_t *conf = mddev->private;
142 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 143 struct multipath_info *multipath;
144 const int rw = bio_data_dir(bio);
145 int cpu;
146 144
147 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 145 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
148 bio_endio(bio, -EOPNOTSUPP); 146 md_barrier_request(mddev, bio);
149 return 0; 147 return 0;
150 } 148 }
151 149
@@ -154,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
154 mp_bh->master_bio = bio; 152 mp_bh->master_bio = bio;
155 mp_bh->mddev = mddev; 153 mp_bh->mddev = mddev;
156 154
157 cpu = part_stat_lock();
158 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
159 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
160 bio_sectors(bio));
161 part_stat_unlock();
162
163 mp_bh->path = multipath_map(conf); 155 mp_bh->path = multipath_map(conf);
164 if (mp_bh->path < 0) { 156 if (mp_bh->path < 0) {
165 bio_endio(bio, -EIO); 157 bio_endio(bio, -EIO);
@@ -301,14 +293,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
301 rdev->data_offset << 9); 293 rdev->data_offset << 9);
302 294
303 /* as we don't honour merge_bvec_fn, we must never risk 295 /* as we don't honour merge_bvec_fn, we must never risk
304 * violating it, so limit ->max_sector to one PAGE, as 296 * violating it, so limit ->max_segments to one, lying
305 * a one page request is never in violation. 297 * within a single page.
306 * (Note: it is very unlikely that a device with 298 * (Note: it is very unlikely that a device with
307 * merge_bvec_fn will be involved in multipath.) 299 * merge_bvec_fn will be involved in multipath.)
308 */ 300 */
309 if (q->merge_bvec_fn && 301 if (q->merge_bvec_fn) {
310 queue_max_sectors(q) > (PAGE_SIZE>>9)) 302 blk_queue_max_segments(mddev->queue, 1);
311 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 303 blk_queue_segment_boundary(mddev->queue,
304 PAGE_CACHE_SIZE - 1);
305 }
312 306
313 conf->working_disks++; 307 conf->working_disks++;
314 mddev->degraded--; 308 mddev->degraded--;
@@ -476,9 +470,11 @@ static int multipath_run (mddev_t *mddev)
476 /* as we don't honour merge_bvec_fn, we must never risk 470 /* as we don't honour merge_bvec_fn, we must never risk
477 * violating it, not that we ever expect a device with 471 * violating it, not that we ever expect a device with
478 * a merge_bvec_fn to be involved in multipath */ 472 * a merge_bvec_fn to be involved in multipath */
479 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 473 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
480 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 474 blk_queue_max_segments(mddev->queue, 1);
481 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 475 blk_queue_segment_boundary(mddev->queue,
476 PAGE_CACHE_SIZE - 1);
477 }
482 478
483 if (!test_bit(Faulty, &rdev->flags)) 479 if (!test_bit(Faulty, &rdev->flags))
484 conf->working_disks++; 480 conf->working_disks++;
@@ -581,6 +577,7 @@ static void __exit multipath_exit (void)
581module_init(multipath_init); 577module_init(multipath_init);
582module_exit(multipath_exit); 578module_exit(multipath_exit);
583MODULE_LICENSE("GPL"); 579MODULE_LICENSE("GPL");
580MODULE_DESCRIPTION("simple multi-path personality for MD");
584MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ 581MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
585MODULE_ALIAS("md-multipath"); 582MODULE_ALIAS("md-multipath");
586MODULE_ALIAS("md-level--4"); 583MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015a..563abed5a2cb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -20,17 +20,20 @@
20 20
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/slab.h>
23#include "md.h" 24#include "md.h"
24#include "raid0.h" 25#include "raid0.h"
26#include "raid5.h"
25 27
26static void raid0_unplug(struct request_queue *q) 28static void raid0_unplug(struct request_queue *q)
27{ 29{
28 mddev_t *mddev = q->queuedata; 30 mddev_t *mddev = q->queuedata;
29 raid0_conf_t *conf = mddev->private; 31 raid0_conf_t *conf = mddev->private;
30 mdk_rdev_t **devlist = conf->devlist; 32 mdk_rdev_t **devlist = conf->devlist;
33 int raid_disks = conf->strip_zone[0].nb_dev;
31 int i; 34 int i;
32 35
33 for (i=0; i<mddev->raid_disks; i++) { 36 for (i=0; i < raid_disks; i++) {
34 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); 37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
35 38
36 blk_unplug(r_queue); 39 blk_unplug(r_queue);
@@ -42,12 +45,13 @@ static int raid0_congested(void *data, int bits)
42 mddev_t *mddev = data; 45 mddev_t *mddev = data;
43 raid0_conf_t *conf = mddev->private; 46 raid0_conf_t *conf = mddev->private;
44 mdk_rdev_t **devlist = conf->devlist; 47 mdk_rdev_t **devlist = conf->devlist;
48 int raid_disks = conf->strip_zone[0].nb_dev;
45 int i, ret = 0; 49 int i, ret = 0;
46 50
47 if (mddev_congested(mddev, bits)) 51 if (mddev_congested(mddev, bits))
48 return 1; 52 return 1;
49 53
50 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 54 for (i = 0; i < raid_disks && !ret ; i++) {
51 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 55 struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
52 56
53 ret |= bdi_congested(&q->backing_dev_info, bits); 57 ret |= bdi_congested(&q->backing_dev_info, bits);
@@ -65,16 +69,17 @@ static void dump_zones(mddev_t *mddev)
65 sector_t zone_start = 0; 69 sector_t zone_start = 0;
66 char b[BDEVNAME_SIZE]; 70 char b[BDEVNAME_SIZE];
67 raid0_conf_t *conf = mddev->private; 71 raid0_conf_t *conf = mddev->private;
72 int raid_disks = conf->strip_zone[0].nb_dev;
68 printk(KERN_INFO "******* %s configuration *********\n", 73 printk(KERN_INFO "******* %s configuration *********\n",
69 mdname(mddev)); 74 mdname(mddev));
70 h = 0; 75 h = 0;
71 for (j = 0; j < conf->nr_strip_zones; j++) { 76 for (j = 0; j < conf->nr_strip_zones; j++) {
72 printk(KERN_INFO "zone%d=[", j); 77 printk(KERN_INFO "zone%d=[", j);
73 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 78 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
74 printk("%s/", 79 printk(KERN_CONT "%s/",
75 bdevname(conf->devlist[j*mddev->raid_disks 80 bdevname(conf->devlist[j*raid_disks
76 + k]->bdev, b)); 81 + k]->bdev, b));
77 printk("]\n"); 82 printk(KERN_CONT "]\n");
78 83
79 zone_size = conf->strip_zone[j].zone_end - zone_start; 84 zone_size = conf->strip_zone[j].zone_end - zone_start;
80 printk(KERN_INFO " zone offset=%llukb " 85 printk(KERN_INFO " zone offset=%llukb "
@@ -87,7 +92,7 @@ static void dump_zones(mddev_t *mddev)
87 printk(KERN_INFO "**********************************\n\n"); 92 printk(KERN_INFO "**********************************\n\n");
88} 93}
89 94
90static int create_strip_zones(mddev_t *mddev) 95static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
91{ 96{
92 int i, c, err; 97 int i, c, err;
93 sector_t curr_zone_end, sectors; 98 sector_t curr_zone_end, sectors;
@@ -100,8 +105,9 @@ static int create_strip_zones(mddev_t *mddev)
100 if (!conf) 105 if (!conf)
101 return -ENOMEM; 106 return -ENOMEM;
102 list_for_each_entry(rdev1, &mddev->disks, same_set) { 107 list_for_each_entry(rdev1, &mddev->disks, same_set) {
103 printk(KERN_INFO "raid0: looking at %s\n", 108 printk(KERN_INFO "md/raid0:%s: looking at %s\n",
104 bdevname(rdev1->bdev,b)); 109 mdname(mddev),
110 bdevname(rdev1->bdev, b));
105 c = 0; 111 c = 0;
106 112
107 /* round size to chunk_size */ 113 /* round size to chunk_size */
@@ -110,14 +116,16 @@ static int create_strip_zones(mddev_t *mddev)
110 rdev1->sectors = sectors * mddev->chunk_sectors; 116 rdev1->sectors = sectors * mddev->chunk_sectors;
111 117
112 list_for_each_entry(rdev2, &mddev->disks, same_set) { 118 list_for_each_entry(rdev2, &mddev->disks, same_set) {
113 printk(KERN_INFO "raid0: comparing %s(%llu)", 119 printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)",
120 mdname(mddev),
114 bdevname(rdev1->bdev,b), 121 bdevname(rdev1->bdev,b),
115 (unsigned long long)rdev1->sectors); 122 (unsigned long long)rdev1->sectors);
116 printk(KERN_INFO " with %s(%llu)\n", 123 printk(KERN_CONT " with %s(%llu)\n",
117 bdevname(rdev2->bdev,b), 124 bdevname(rdev2->bdev,b),
118 (unsigned long long)rdev2->sectors); 125 (unsigned long long)rdev2->sectors);
119 if (rdev2 == rdev1) { 126 if (rdev2 == rdev1) {
120 printk(KERN_INFO "raid0: END\n"); 127 printk(KERN_INFO "md/raid0:%s: END\n",
128 mdname(mddev));
121 break; 129 break;
122 } 130 }
123 if (rdev2->sectors == rdev1->sectors) { 131 if (rdev2->sectors == rdev1->sectors) {
@@ -125,20 +133,24 @@ static int create_strip_zones(mddev_t *mddev)
125 * Not unique, don't count it as a new 133 * Not unique, don't count it as a new
126 * group 134 * group
127 */ 135 */
128 printk(KERN_INFO "raid0: EQUAL\n"); 136 printk(KERN_INFO "md/raid0:%s: EQUAL\n",
137 mdname(mddev));
129 c = 1; 138 c = 1;
130 break; 139 break;
131 } 140 }
132 printk(KERN_INFO "raid0: NOT EQUAL\n"); 141 printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n",
142 mdname(mddev));
133 } 143 }
134 if (!c) { 144 if (!c) {
135 printk(KERN_INFO "raid0: ==> UNIQUE\n"); 145 printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n",
146 mdname(mddev));
136 conf->nr_strip_zones++; 147 conf->nr_strip_zones++;
137 printk(KERN_INFO "raid0: %d zones\n", 148 printk(KERN_INFO "md/raid0:%s: %d zones\n",
138 conf->nr_strip_zones); 149 mdname(mddev), conf->nr_strip_zones);
139 } 150 }
140 } 151 }
141 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); 152 printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
153 mdname(mddev), conf->nr_strip_zones);
142 err = -ENOMEM; 154 err = -ENOMEM;
143 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 155 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
144 conf->nr_strip_zones, GFP_KERNEL); 156 conf->nr_strip_zones, GFP_KERNEL);
@@ -161,14 +173,20 @@ static int create_strip_zones(mddev_t *mddev)
161 list_for_each_entry(rdev1, &mddev->disks, same_set) { 173 list_for_each_entry(rdev1, &mddev->disks, same_set) {
162 int j = rdev1->raid_disk; 174 int j = rdev1->raid_disk;
163 175
176 if (mddev->level == 10) {
177 /* taking over a raid10-n2 array */
178 j /= 2;
179 rdev1->new_raid_disk = j;
180 }
181
164 if (j < 0 || j >= mddev->raid_disks) { 182 if (j < 0 || j >= mddev->raid_disks) {
165 printk(KERN_ERR "raid0: bad disk number %d - " 183 printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
166 "aborting!\n", j); 184 "aborting!\n", mdname(mddev), j);
167 goto abort; 185 goto abort;
168 } 186 }
169 if (dev[j]) { 187 if (dev[j]) {
170 printk(KERN_ERR "raid0: multiple devices for %d - " 188 printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
171 "aborting!\n", j); 189 "aborting!\n", mdname(mddev), j);
172 goto abort; 190 goto abort;
173 } 191 }
174 dev[j] = rdev1; 192 dev[j] = rdev1;
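
The new level == 10 branch supports takeover from a 2-copy "near" raid10: members come in mirror pairs (slots 0/1 hold the two copies of the first chunk, 2/3 the next, and so on), and the takeover later in this diff requires every pair to already be degraded down to one member, so the survivor's raid0 slot is simply its old slot divided by two. A trivial standalone illustration of that renumbering (plain userspace C, made-up slot count):

#include <stdio.h>

int main(void)
{
	int old_slot;

	/* raid10-n2 slots 0..5 collapse onto raid0 slots 0..2 */
	for (old_slot = 0; old_slot < 6; old_slot++)
		printf("raid10-n2 slot %d -> raid0 slot %d\n",
		       old_slot, old_slot / 2);
	return 0;
}
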
@@ -176,21 +194,22 @@ static int create_strip_zones(mddev_t *mddev)
176 disk_stack_limits(mddev->gendisk, rdev1->bdev, 194 disk_stack_limits(mddev->gendisk, rdev1->bdev,
177 rdev1->data_offset << 9); 195 rdev1->data_offset << 9);
178 /* as we don't honour merge_bvec_fn, we must never risk 196 /* as we don't honour merge_bvec_fn, we must never risk
179 * violating it, so limit ->max_sector to one PAGE, as 197 * violating it, so limit ->max_segments to 1, lying within
180 * a one page request is never in violation. 198 * a single page.
181 */ 199 */
182 200
183 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && 201 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
184 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 202 blk_queue_max_segments(mddev->queue, 1);
185 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 203 blk_queue_segment_boundary(mddev->queue,
186 204 PAGE_CACHE_SIZE - 1);
205 }
187 if (!smallest || (rdev1->sectors < smallest->sectors)) 206 if (!smallest || (rdev1->sectors < smallest->sectors))
188 smallest = rdev1; 207 smallest = rdev1;
189 cnt++; 208 cnt++;
190 } 209 }
191 if (cnt != mddev->raid_disks) { 210 if (cnt != mddev->raid_disks) {
192 printk(KERN_ERR "raid0: too few disks (%d of %d) - " 211 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
193 "aborting!\n", cnt, mddev->raid_disks); 212 "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
194 goto abort; 213 goto abort;
195 } 214 }
196 zone->nb_dev = cnt; 215 zone->nb_dev = cnt;
@@ -206,39 +225,44 @@ static int create_strip_zones(mddev_t *mddev)
206 zone = conf->strip_zone + i; 225 zone = conf->strip_zone + i;
207 dev = conf->devlist + i * mddev->raid_disks; 226 dev = conf->devlist + i * mddev->raid_disks;
208 227
209 printk(KERN_INFO "raid0: zone %d\n", i); 228 printk(KERN_INFO "md/raid0:%s: zone %d\n",
229 mdname(mddev), i);
210 zone->dev_start = smallest->sectors; 230 zone->dev_start = smallest->sectors;
211 smallest = NULL; 231 smallest = NULL;
212 c = 0; 232 c = 0;
213 233
214 for (j=0; j<cnt; j++) { 234 for (j=0; j<cnt; j++) {
215 rdev = conf->devlist[j]; 235 rdev = conf->devlist[j];
216 printk(KERN_INFO "raid0: checking %s ...", 236 printk(KERN_INFO "md/raid0:%s: checking %s ...",
217 bdevname(rdev->bdev, b)); 237 mdname(mddev),
238 bdevname(rdev->bdev, b));
218 if (rdev->sectors <= zone->dev_start) { 239 if (rdev->sectors <= zone->dev_start) {
219 printk(KERN_INFO " nope.\n"); 240 printk(KERN_CONT " nope.\n");
220 continue; 241 continue;
221 } 242 }
222 printk(KERN_INFO " contained as device %d\n", c); 243 printk(KERN_CONT " contained as device %d\n", c);
223 dev[c] = rdev; 244 dev[c] = rdev;
224 c++; 245 c++;
225 if (!smallest || rdev->sectors < smallest->sectors) { 246 if (!smallest || rdev->sectors < smallest->sectors) {
226 smallest = rdev; 247 smallest = rdev;
227 printk(KERN_INFO " (%llu) is smallest!.\n", 248 printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n",
228 (unsigned long long)rdev->sectors); 249 mdname(mddev),
250 (unsigned long long)rdev->sectors);
229 } 251 }
230 } 252 }
231 253
232 zone->nb_dev = c; 254 zone->nb_dev = c;
233 sectors = (smallest->sectors - zone->dev_start) * c; 255 sectors = (smallest->sectors - zone->dev_start) * c;
234 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 256 printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
235 zone->nb_dev, (unsigned long long)sectors); 257 mdname(mddev),
258 zone->nb_dev, (unsigned long long)sectors);
236 259
237 curr_zone_end += sectors; 260 curr_zone_end += sectors;
238 zone->zone_end = curr_zone_end; 261 zone->zone_end = curr_zone_end;
239 262
240 printk(KERN_INFO "raid0: current zone start: %llu\n", 263 printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
241 (unsigned long long)smallest->sectors); 264 mdname(mddev),
265 (unsigned long long)smallest->sectors);
242 } 266 }
243 mddev->queue->unplug_fn = raid0_unplug; 267 mddev->queue->unplug_fn = raid0_unplug;
244 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 268 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
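
The zone loop above derives each strip zone from whatever capacity the member devices have left past the previous zone's end: the smallest remaining device bounds the zone, and the zone contributes (smallest - dev_start) * nb_dev sectors. A compact userspace re-statement of that calculation, handy for predicting the size of a raid0 built from unequal devices (the device sizes below are invented):

#include <stdio.h>

int main(void)
{
	/* hypothetical member sizes, in sectors */
	unsigned long long dev[] = { 1000, 1000, 4000, 8000 };
	int ndev = 4;
	unsigned long long zone_start = 0, total = 0;

	for (;;) {
		unsigned long long smallest = 0;
		int i, c = 0;

		/* devices that still extend past the previous zone */
		for (i = 0; i < ndev; i++) {
			if (dev[i] <= zone_start)
				continue;
			c++;
			if (!smallest || dev[i] < smallest)
				smallest = dev[i];
		}
		if (!c)
			break;
		total += (smallest - zone_start) * c;
		printf("zone [%llu..%llu) striped over %d device(s)\n",
		       zone_start, smallest, c);
		zone_start = smallest;
	}
	printf("array size: %llu sectors\n", total);
	return 0;
}
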
@@ -249,7 +273,7 @@ static int create_strip_zones(mddev_t *mddev)
249 * chunk size is a multiple of that sector size 273 * chunk size is a multiple of that sector size
250 */ 274 */
251 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { 275 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
252 printk(KERN_ERR "%s chunk_size of %d not valid\n", 276 printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
253 mdname(mddev), 277 mdname(mddev),
254 mddev->chunk_sectors << 9); 278 mddev->chunk_sectors << 9);
255 goto abort; 279 goto abort;
@@ -259,14 +283,15 @@ static int create_strip_zones(mddev_t *mddev)
259 blk_queue_io_opt(mddev->queue, 283 blk_queue_io_opt(mddev->queue,
260 (mddev->chunk_sectors << 9) * mddev->raid_disks); 284 (mddev->chunk_sectors << 9) * mddev->raid_disks);
261 285
262 printk(KERN_INFO "raid0: done.\n"); 286 printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
263 mddev->private = conf; 287 *private_conf = conf;
288
264 return 0; 289 return 0;
265abort: 290abort:
266 kfree(conf->strip_zone); 291 kfree(conf->strip_zone);
267 kfree(conf->devlist); 292 kfree(conf->devlist);
268 kfree(conf); 293 kfree(conf);
269 mddev->private = NULL; 294 *private_conf = NULL;
270 return err; 295 return err;
271} 296}
272 297
@@ -317,26 +342,34 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
317 342
318static int raid0_run(mddev_t *mddev) 343static int raid0_run(mddev_t *mddev)
319{ 344{
345 raid0_conf_t *conf;
320 int ret; 346 int ret;
321 347
322 if (mddev->chunk_sectors == 0) { 348 if (mddev->chunk_sectors == 0) {
323 printk(KERN_ERR "md/raid0: chunk size must be set.\n"); 349 printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
350 mdname(mddev));
324 return -EINVAL; 351 return -EINVAL;
325 } 352 }
326 if (md_check_no_bitmap(mddev)) 353 if (md_check_no_bitmap(mddev))
327 return -EINVAL; 354 return -EINVAL;
328 blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); 355 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
329 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 356 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
330 357
331 ret = create_strip_zones(mddev); 358 /* if private is not null, we are here after takeover */
332 if (ret < 0) 359 if (mddev->private == NULL) {
333 return ret; 360 ret = create_strip_zones(mddev, &conf);
361 if (ret < 0)
362 return ret;
363 mddev->private = conf;
364 }
365 conf = mddev->private;
334 366
335 /* calculate array device size */ 367 /* calculate array device size */
336 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 368 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
337 369
338 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 370 printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
339 (unsigned long long)mddev->array_sectors); 371 mdname(mddev),
372 (unsigned long long)mddev->array_sectors);
340 /* calculate the max read-ahead size. 373 /* calculate the max read-ahead size.
341 * For read-ahead of large files to be effective, we need to 374 * For read-ahead of large files to be effective, we need to
342 * readahead at least twice a whole stripe. i.e. number of devices 375 * readahead at least twice a whole stripe. i.e. number of devices
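
raid0_run() now has to tell a fresh assembly apart from a takeover: on a level change the core first calls the new personality's ->takeover() hook, which for raid0 builds the configuration via create_strip_zones(), and the returned conf is installed in mddev->private before run() is invoked, so run() must not rebuild it. Reduced to its control flow (a sketch under that understanding of the takeover sequence, not a drop-in raid0_run()):

static int raid0_run_skeleton(mddev_t *mddev)
{
	raid0_conf_t *conf;
	int ret;

	if (mddev->private == NULL) {		/* normal assembly */
		ret = create_strip_zones(mddev, &conf);
		if (ret < 0)
			return ret;
		mddev->private = conf;
	}
	conf = mddev->private;			/* takeover lands here */

	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
	return 0;
}
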
@@ -400,6 +433,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
400 unsigned int sect_in_chunk; 433 unsigned int sect_in_chunk;
401 sector_t chunk; 434 sector_t chunk;
402 raid0_conf_t *conf = mddev->private; 435 raid0_conf_t *conf = mddev->private;
436 int raid_disks = conf->strip_zone[0].nb_dev;
403 unsigned int chunk_sects = mddev->chunk_sectors; 437 unsigned int chunk_sects = mddev->chunk_sectors;
404 438
405 if (is_power_of_2(chunk_sects)) { 439 if (is_power_of_2(chunk_sects)) {
@@ -422,7 +456,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
422 * + the position in the chunk 456 * + the position in the chunk
423 */ 457 */
424 *sector_offset = (chunk * chunk_sects) + sect_in_chunk; 458 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
425 return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks 459 return conf->devlist[(zone - conf->strip_zone)*raid_disks
426 + sector_div(sector, zone->nb_dev)]; 460 + sector_div(sector, zone->nb_dev)];
427} 461}
428 462
@@ -442,27 +476,18 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev,
442 } 476 }
443} 477}
444 478
445static int raid0_make_request(struct request_queue *q, struct bio *bio) 479static int raid0_make_request(mddev_t *mddev, struct bio *bio)
446{ 480{
447 mddev_t *mddev = q->queuedata;
448 unsigned int chunk_sects; 481 unsigned int chunk_sects;
449 sector_t sector_offset; 482 sector_t sector_offset;
450 struct strip_zone *zone; 483 struct strip_zone *zone;
451 mdk_rdev_t *tmp_dev; 484 mdk_rdev_t *tmp_dev;
452 const int rw = bio_data_dir(bio);
453 int cpu;
454 485
455 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 486 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
456 bio_endio(bio, -EOPNOTSUPP); 487 md_barrier_request(mddev, bio);
457 return 0; 488 return 0;
458 } 489 }
459 490
460 cpu = part_stat_lock();
461 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
462 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
463 bio_sectors(bio));
464 part_stat_unlock();
465
466 chunk_sects = mddev->chunk_sectors; 491 chunk_sects = mddev->chunk_sectors;
467 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { 492 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
468 sector_t sector = bio->bi_sector; 493 sector_t sector = bio->bi_sector;
@@ -480,9 +505,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
480 else 505 else
481 bp = bio_split(bio, chunk_sects - 506 bp = bio_split(bio, chunk_sects -
482 sector_div(sector, chunk_sects)); 507 sector_div(sector, chunk_sects));
483 if (raid0_make_request(q, &bp->bio1)) 508 if (raid0_make_request(mddev, &bp->bio1))
484 generic_make_request(&bp->bio1); 509 generic_make_request(&bp->bio1);
485 if (raid0_make_request(q, &bp->bio2)) 510 if (raid0_make_request(mddev, &bp->bio2))
486 generic_make_request(&bp->bio2); 511 generic_make_request(&bp->bio2);
487 512
488 bio_pair_release(bp); 513 bio_pair_release(bp);
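
A bio that straddles a chunk boundary is split at the boundary and each half is pushed back through raid0_make_request(); the split length is chunk_sects minus the start sector's offset within its chunk, computed with a mask for power-of-two chunks and with sector_div() otherwise. A small standalone check of that arithmetic (userspace C; sector_div() is replaced by the plain modulo it performs here):

#include <stdio.h>

static unsigned long long split_sectors(unsigned long long sector,
					unsigned int chunk_sects,
					int power_of_2)
{
	unsigned long long in_chunk;

	if (power_of_2)
		in_chunk = sector & (chunk_sects - 1);
	else
		in_chunk = sector % chunk_sects;
	return chunk_sects - in_chunk;	/* sectors left before the boundary */
}

int main(void)
{
	/* 64KiB chunks = 128 sectors; bio starts 10 sectors short of a boundary */
	printf("split after %llu sectors\n", split_sectors(118, 128, 1));
	/* odd-sized 60KiB chunks = 120 sectors */
	printf("split after %llu sectors\n", split_sectors(118, 120, 0));
	return 0;
}
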
@@ -502,9 +527,10 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
502 return 1; 527 return 1;
503 528
504bad_map: 529bad_map:
505 printk("raid0_make_request bug: can't convert block across chunks" 530 printk("md/raid0:%s: make_request bug: can't convert block across chunks"
506 " or bigger than %dk %llu %d\n", chunk_sects / 2, 531 " or bigger than %dk %llu %d\n",
507 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 532 mdname(mddev), chunk_sects / 2,
533 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
508 534
509 bio_io_error(bio); 535 bio_io_error(bio);
510 return 0; 536 return 0;
@@ -517,6 +543,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
517 int j, k, h; 543 int j, k, h;
518 char b[BDEVNAME_SIZE]; 544 char b[BDEVNAME_SIZE];
519 raid0_conf_t *conf = mddev->private; 545 raid0_conf_t *conf = mddev->private;
546 int raid_disks = conf->strip_zone[0].nb_dev;
520 547
521 sector_t zone_size; 548 sector_t zone_size;
522 sector_t zone_start = 0; 549 sector_t zone_start = 0;
@@ -527,7 +554,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
527 seq_printf(seq, "=["); 554 seq_printf(seq, "=[");
528 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 555 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
529 seq_printf(seq, "%s/", bdevname( 556 seq_printf(seq, "%s/", bdevname(
530 conf->devlist[j*mddev->raid_disks + k] 557 conf->devlist[j*raid_disks + k]
531 ->bdev, b)); 558 ->bdev, b));
532 559
533 zone_size = conf->strip_zone[j].zone_end - zone_start; 560 zone_size = conf->strip_zone[j].zone_end - zone_start;
@@ -542,6 +569,109 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
542 return; 569 return;
543} 570}
544 571
572static void *raid0_takeover_raid45(mddev_t *mddev)
573{
574 mdk_rdev_t *rdev;
575 raid0_conf_t *priv_conf;
576
577 if (mddev->degraded != 1) {
578 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
579 mdname(mddev),
580 mddev->degraded);
581 return ERR_PTR(-EINVAL);
582 }
583
584 list_for_each_entry(rdev, &mddev->disks, same_set) {
585 /* check slot number for a disk */
586 if (rdev->raid_disk == mddev->raid_disks-1) {
587 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
588 mdname(mddev));
589 return ERR_PTR(-EINVAL);
590 }
591 }
592
593 /* Set new parameters */
594 mddev->new_level = 0;
595 mddev->new_layout = 0;
596 mddev->new_chunk_sectors = mddev->chunk_sectors;
597 mddev->raid_disks--;
598 mddev->delta_disks = -1;
599 /* make sure it will be not marked as dirty */
600 mddev->recovery_cp = MaxSector;
601
602 create_strip_zones(mddev, &priv_conf);
603 return priv_conf;
604}
605
606static void *raid0_takeover_raid10(mddev_t *mddev)
607{
608 raid0_conf_t *priv_conf;
609
610 /* Check layout:
611 * - far_copies must be 1
612 * - near_copies must be 2
613 * - disks number must be even
614 * - all mirrors must be already degraded
615 */
616 if (mddev->layout != ((1 << 8) + 2)) {
617 printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover layout: 0x%x\n",
618 mdname(mddev),
619 mddev->layout);
620 return ERR_PTR(-EINVAL);
621 }
622 if (mddev->raid_disks & 1) {
623 printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
624 mdname(mddev));
625 return ERR_PTR(-EINVAL);
626 }
627 if (mddev->degraded != (mddev->raid_disks>>1)) {
628 printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
629 mdname(mddev));
630 return ERR_PTR(-EINVAL);
631 }
632
633 /* Set new parameters */
634 mddev->new_level = 0;
635 mddev->new_layout = 0;
636 mddev->new_chunk_sectors = mddev->chunk_sectors;
637 mddev->delta_disks = - mddev->raid_disks / 2;
638 mddev->raid_disks += mddev->delta_disks;
639 mddev->degraded = 0;
640 /* make sure it will be not marked as dirty */
641 mddev->recovery_cp = MaxSector;
642
643 create_strip_zones(mddev, &priv_conf);
644 return priv_conf;
645}
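
The layout check above hard-codes (1 << 8) + 2, i.e. near_copies == 2 and far_copies == 1, the only raid10 geometry this takeover accepts. The raid10 layout word keeps near copies in the low byte, far copies in the next byte, and a far-offset flag above that (field packing as used by raid10 in this era); a quick decoder to sanity-check the constant:

#include <stdio.h>

int main(void)
{
	int layout = (1 << 8) + 2;		/* value required by the takeover */
	int near_copies = layout & 0xff;
	int far_copies  = (layout >> 8) & 0xff;
	int far_offset  = layout & (1 << 16);

	/* prints: near=2 far=1 far_offset=0, a plain 2-way near mirror */
	printf("near=%d far=%d far_offset=%d\n",
	       near_copies, far_copies, !!far_offset);
	return 0;
}
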
646
647static void *raid0_takeover(mddev_t *mddev)
648{
649 /* raid0 can take over:
650 * raid4 - if all data disks are active.
651 * raid5 - providing it is Raid4 layout and one disk is faulty
652 * raid10 - assuming we have all necessary active disks
653 */
654 if (mddev->level == 4)
655 return raid0_takeover_raid45(mddev);
656
657 if (mddev->level == 5) {
658 if (mddev->layout == ALGORITHM_PARITY_N)
659 return raid0_takeover_raid45(mddev);
660
661 printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
662 mdname(mddev), ALGORITHM_PARITY_N);
663 }
664
665 if (mddev->level == 10)
666 return raid0_takeover_raid10(mddev);
667
668 return ERR_PTR(-EINVAL);
669}
670
671static void raid0_quiesce(mddev_t *mddev, int state)
672{
673}
674
545static struct mdk_personality raid0_personality= 675static struct mdk_personality raid0_personality=
546{ 676{
547 .name = "raid0", 677 .name = "raid0",
@@ -552,6 +682,8 @@ static struct mdk_personality raid0_personality=
552 .stop = raid0_stop, 682 .stop = raid0_stop,
553 .status = raid0_status, 683 .status = raid0_status,
554 .size = raid0_size, 684 .size = raid0_size,
685 .takeover = raid0_takeover,
686 .quiesce = raid0_quiesce,
555}; 687};
556 688
557static int __init raid0_init (void) 689static int __init raid0_init (void)
@@ -567,6 +699,7 @@ static void raid0_exit (void)
567module_init(raid0_init); 699module_init(raid0_init);
568module_exit(raid0_exit); 700module_exit(raid0_exit);
569MODULE_LICENSE("GPL"); 701MODULE_LICENSE("GPL");
702MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
570MODULE_ALIAS("md-personality-2"); /* RAID0 */ 703MODULE_ALIAS("md-personality-2"); /* RAID0 */
571MODULE_ALIAS("md-raid0"); 704MODULE_ALIAS("md-raid0");
572MODULE_ALIAS("md-level-0"); 705MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d1b9bd5fd4f6..a948da8012de 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,6 +31,7 @@
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/delay.h> 35#include <linux/delay.h>
35#include <linux/blkdev.h> 36#include <linux/blkdev.h>
36#include <linux/seq_file.h> 37#include <linux/seq_file.h>
@@ -64,7 +65,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
64 65
65 /* allocate a r1bio with room for raid_disks entries in the bios array */ 66 /* allocate a r1bio with room for raid_disks entries in the bios array */
66 r1_bio = kzalloc(size, gfp_flags); 67 r1_bio = kzalloc(size, gfp_flags);
67 if (!r1_bio) 68 if (!r1_bio && pi->mddev)
68 unplug_slaves(pi->mddev); 69 unplug_slaves(pi->mddev);
69 70
70 return r1_bio; 71 return r1_bio;
@@ -262,7 +263,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio)
262static void raid1_end_read_request(struct bio *bio, int error) 263static void raid1_end_read_request(struct bio *bio, int error)
263{ 264{
264 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
265 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 266 r1bio_t *r1_bio = bio->bi_private;
266 int mirror; 267 int mirror;
267 conf_t *conf = r1_bio->mddev->private; 268 conf_t *conf = r1_bio->mddev->private;
268 269
@@ -296,7 +297,8 @@ static void raid1_end_read_request(struct bio *bio, int error)
296 */ 297 */
297 char b[BDEVNAME_SIZE]; 298 char b[BDEVNAME_SIZE];
298 if (printk_ratelimit()) 299 if (printk_ratelimit())
299 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 300 printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
301 mdname(conf->mddev),
300 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 302 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
301 reschedule_retry(r1_bio); 303 reschedule_retry(r1_bio);
302 } 304 }
@@ -307,7 +309,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
307static void raid1_end_write_request(struct bio *bio, int error) 309static void raid1_end_write_request(struct bio *bio, int error)
308{ 310{
309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 311 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 312 r1bio_t *r1_bio = bio->bi_private;
311 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 313 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
312 conf_t *conf = r1_bio->mddev->private; 314 conf_t *conf = r1_bio->mddev->private;
313 struct bio *to_put = NULL; 315 struct bio *to_put = NULL;
@@ -417,7 +419,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
417 */ 419 */
418static int read_balance(conf_t *conf, r1bio_t *r1_bio) 420static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419{ 421{
420 const unsigned long this_sector = r1_bio->sector; 422 const sector_t this_sector = r1_bio->sector;
421 int new_disk = conf->last_used, disk = new_disk; 423 int new_disk = conf->last_used, disk = new_disk;
422 int wonly_disk = -1; 424 int wonly_disk = -1;
423 const int sectors = r1_bio->sectors; 425 const int sectors = r1_bio->sectors;
@@ -433,7 +435,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 retry: 435 retry:
434 if (conf->mddev->recovery_cp < MaxSector && 436 if (conf->mddev->recovery_cp < MaxSector &&
435 (this_sector + sectors >= conf->next_resync)) { 437 (this_sector + sectors >= conf->next_resync)) {
436 /* Choose the first operation device, for consistancy */ 438 /* Choose the first operational device, for consistency */
437 new_disk = 0; 439 new_disk = 0;
438 440
439 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
@@ -677,6 +679,7 @@ static void raise_barrier(conf_t *conf)
677static void lower_barrier(conf_t *conf) 679static void lower_barrier(conf_t *conf)
678{ 680{
679 unsigned long flags; 681 unsigned long flags;
682 BUG_ON(conf->barrier <= 0);
680 spin_lock_irqsave(&conf->resync_lock, flags); 683 spin_lock_irqsave(&conf->resync_lock, flags);
681 conf->barrier--; 684 conf->barrier--;
682 spin_unlock_irqrestore(&conf->resync_lock, flags); 685 spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -772,9 +775,8 @@ do_sync_io:
772 return NULL; 775 return NULL;
773} 776}
774 777
775static int make_request(struct request_queue *q, struct bio * bio) 778static int make_request(mddev_t *mddev, struct bio * bio)
776{ 779{
777 mddev_t *mddev = q->queuedata;
778 conf_t *conf = mddev->private; 780 conf_t *conf = mddev->private;
779 mirror_info_t *mirror; 781 mirror_info_t *mirror;
780 r1bio_t *r1_bio; 782 r1bio_t *r1_bio;
@@ -786,7 +788,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
786 struct page **behind_pages = NULL; 788 struct page **behind_pages = NULL;
787 const int rw = bio_data_dir(bio); 789 const int rw = bio_data_dir(bio);
788 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); 790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
789 int cpu;
790 bool do_barriers; 791 bool do_barriers;
791 mdk_rdev_t *blocked_rdev; 792 mdk_rdev_t *blocked_rdev;
792 793
@@ -801,6 +802,25 @@ static int make_request(struct request_queue *q, struct bio * bio)
801 802
802 md_write_start(mddev, bio); /* wait on superblock update early */ 803 md_write_start(mddev, bio); /* wait on superblock update early */
803 804
805 if (bio_data_dir(bio) == WRITE &&
806 bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
807 bio->bi_sector < mddev->suspend_hi) {
808 /* As the suspend_* range is controlled by
809 * userspace, we want an interruptible
810 * wait.
811 */
812 DEFINE_WAIT(w);
813 for (;;) {
814 flush_signals(current);
815 prepare_to_wait(&conf->wait_barrier,
816 &w, TASK_INTERRUPTIBLE);
817 if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
818 bio->bi_sector >= mddev->suspend_hi)
819 break;
820 schedule();
821 }
822 finish_wait(&conf->wait_barrier, &w);
823 }
804 if (unlikely(!mddev->barriers_work && 824 if (unlikely(!mddev->barriers_work &&
805 bio_rw_flagged(bio, BIO_RW_BARRIER))) { 825 bio_rw_flagged(bio, BIO_RW_BARRIER))) {
806 if (rw == WRITE) 826 if (rw == WRITE)
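
The block added above makes a write that overlaps the userspace-controlled [suspend_lo, suspend_hi) window park until the window moves. Because the window is driven from user space, the sleep is TASK_INTERRUPTIBLE and the condition is re-checked after every wakeup; flush_signals() clears any pending signal so the interruptible sleep actually sleeps instead of spinning, since erroring the bio back to the caller is not an option here. The open-coded loop is essentially this idiom (sketch only; 'condition' stands for the suspend-range test in the hunk):

#define wait_on_range(wq, condition)				\
do {								\
	DEFINE_WAIT(__w);					\
	for (;;) {						\
		flush_signals(current);				\
		prepare_to_wait(&(wq), &__w, TASK_INTERRUPTIBLE);	\
		if (condition)					\
			break;					\
		schedule();					\
	}							\
	finish_wait(&(wq), &__w);				\
} while (0)
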
@@ -813,12 +833,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
813 833
814 bitmap = mddev->bitmap; 834 bitmap = mddev->bitmap;
815 835
816 cpu = part_stat_lock();
817 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
818 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
819 bio_sectors(bio));
820 part_stat_unlock();
821
822 /* 836 /*
823 * make_request() can abort the operation when READA is being 837 * make_request() can abort the operation when READA is being
824 * used and no empty request is available. 838 * used and no empty request is available.
@@ -845,6 +859,15 @@ static int make_request(struct request_queue *q, struct bio * bio)
845 } 859 }
846 mirror = conf->mirrors + rdisk; 860 mirror = conf->mirrors + rdisk;
847 861
862 if (test_bit(WriteMostly, &mirror->rdev->flags) &&
863 bitmap) {
864 /* Reading from a write-mostly device must
865 * take care not to over-take any writes
866 * that are 'behind'
867 */
868 wait_event(bitmap->behind_wait,
869 atomic_read(&bitmap->behind_writes) == 0);
870 }
848 r1_bio->read_disk = rdisk; 871 r1_bio->read_disk = rdisk;
849 872
850 read_bio = bio_clone(bio, GFP_NOIO); 873 read_bio = bio_clone(bio, GFP_NOIO);
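
The new wait closes an ordering hole between write-behind and write-mostly reads: a write can be reported complete while the copy to a WriteMostly device is still in flight, so a read served from that device could return stale data. The counter/waitqueue pair relied on here (bitmap->behind_writes plus the behind_wait queue this series adds to the bitmap code) behaves like a small completion barrier; the protocol, sketched outside bitmap.c:

/* Sketch of the protocol only, not the bitmap.c implementation. */
static void behind_write_start(struct bitmap *bitmap)
{
	atomic_inc(&bitmap->behind_writes);
}

static void behind_write_done(struct bitmap *bitmap)
{
	atomic_dec(&bitmap->behind_writes);
	wake_up(&bitmap->behind_wait);
}

/* reader side, before reading from a WriteMostly member */
static void wait_for_behind_writes(struct bitmap *bitmap)
{
	wait_event(bitmap->behind_wait,
		   atomic_read(&bitmap->behind_writes) == 0);
}
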
@@ -891,9 +914,10 @@ static int make_request(struct request_queue *q, struct bio * bio)
891 if (test_bit(Faulty, &rdev->flags)) { 914 if (test_bit(Faulty, &rdev->flags)) {
892 rdev_dec_pending(rdev, mddev); 915 rdev_dec_pending(rdev, mddev);
893 r1_bio->bios[i] = NULL; 916 r1_bio->bios[i] = NULL;
894 } else 917 } else {
895 r1_bio->bios[i] = bio; 918 r1_bio->bios[i] = bio;
896 targets++; 919 targets++;
920 }
897 } else 921 } else
898 r1_bio->bios[i] = NULL; 922 r1_bio->bios[i] = NULL;
899 } 923 }
@@ -921,9 +945,14 @@ static int make_request(struct request_queue *q, struct bio * bio)
921 set_bit(R1BIO_Degraded, &r1_bio->state); 945 set_bit(R1BIO_Degraded, &r1_bio->state);
922 } 946 }
923 947
924 /* do behind I/O ? */ 948 /* do behind I/O ?
949 * Not if there are too many, or cannot allocate memory,
950 * or a reader on WriteMostly is waiting for behind writes
951 * to flush */
925 if (bitmap && 952 if (bitmap &&
926 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 953 (atomic_read(&bitmap->behind_writes)
954 < mddev->bitmap_info.max_write_behind) &&
955 !waitqueue_active(&bitmap->behind_wait) &&
927 (behind_pages = alloc_behind_pages(bio)) != NULL) 956 (behind_pages = alloc_behind_pages(bio)) != NULL)
928 set_bit(R1BIO_BehindIO, &r1_bio->state); 957 set_bit(R1BIO_BehindIO, &r1_bio->state);
929 958
@@ -1048,21 +1077,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1048 } else 1077 } else
1049 set_bit(Faulty, &rdev->flags); 1078 set_bit(Faulty, &rdev->flags);
1050 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1079 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1051 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" 1080 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
1052 "raid1: Operation continuing on %d devices.\n", 1081 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
1053 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1082 mdname(mddev), bdevname(rdev->bdev, b),
1083 mdname(mddev), conf->raid_disks - mddev->degraded);
1054} 1084}
1055 1085
1056static void print_conf(conf_t *conf) 1086static void print_conf(conf_t *conf)
1057{ 1087{
1058 int i; 1088 int i;
1059 1089
1060 printk("RAID1 conf printout:\n"); 1090 printk(KERN_DEBUG "RAID1 conf printout:\n");
1061 if (!conf) { 1091 if (!conf) {
1062 printk("(!conf)\n"); 1092 printk(KERN_DEBUG "(!conf)\n");
1063 return; 1093 return;
1064 } 1094 }
1065 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1095 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1066 conf->raid_disks); 1096 conf->raid_disks);
1067 1097
1068 rcu_read_lock(); 1098 rcu_read_lock();
@@ -1070,7 +1100,7 @@ static void print_conf(conf_t *conf)
1070 char b[BDEVNAME_SIZE]; 1100 char b[BDEVNAME_SIZE];
1071 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 1101 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1072 if (rdev) 1102 if (rdev)
1073 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1103 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1074 i, !test_bit(In_sync, &rdev->flags), 1104 i, !test_bit(In_sync, &rdev->flags),
1075 !test_bit(Faulty, &rdev->flags), 1105 !test_bit(Faulty, &rdev->flags),
1076 bdevname(rdev->bdev,b)); 1106 bdevname(rdev->bdev,b));
@@ -1131,13 +1161,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1131 1161
1132 disk_stack_limits(mddev->gendisk, rdev->bdev, 1162 disk_stack_limits(mddev->gendisk, rdev->bdev,
1133 rdev->data_offset << 9); 1163 rdev->data_offset << 9);
1134 /* as we don't honour merge_bvec_fn, we must never risk 1164 /* as we don't honour merge_bvec_fn, we must
1135 * violating it, so limit ->max_sector to one PAGE, as 1165 * never risk violating it, so limit
1136 * a one page request is never in violation. 1166 * ->max_segments to one lying within a single
1167 * page, as a one page request is never in
1168 * violation.
1137 */ 1169 */
1138 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1170 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1139 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 1171 blk_queue_max_segments(mddev->queue, 1);
1140 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1172 blk_queue_segment_boundary(mddev->queue,
1173 PAGE_CACHE_SIZE - 1);
1174 }
1141 1175
1142 p->head_position = 0; 1176 p->head_position = 0;
1143 rdev->raid_disk = mirror; 1177 rdev->raid_disk = mirror;
@@ -1197,7 +1231,7 @@ abort:
1197 1231
1198static void end_sync_read(struct bio *bio, int error) 1232static void end_sync_read(struct bio *bio, int error)
1199{ 1233{
1200 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1234 r1bio_t *r1_bio = bio->bi_private;
1201 int i; 1235 int i;
1202 1236
1203 for (i=r1_bio->mddev->raid_disks; i--; ) 1237 for (i=r1_bio->mddev->raid_disks; i--; )
@@ -1220,7 +1254,7 @@ static void end_sync_read(struct bio *bio, int error)
1220static void end_sync_write(struct bio *bio, int error) 1254static void end_sync_write(struct bio *bio, int error)
1221{ 1255{
1222 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1223 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1257 r1bio_t *r1_bio = bio->bi_private;
1224 mddev_t *mddev = r1_bio->mddev; 1258 mddev_t *mddev = r1_bio->mddev;
1225 conf_t *conf = mddev->private; 1259 conf_t *conf = mddev->private;
1226 int i; 1260 int i;
@@ -1427,9 +1461,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1427 char b[BDEVNAME_SIZE]; 1461 char b[BDEVNAME_SIZE];
1428 /* Cannot read from anywhere, array is toast */ 1462 /* Cannot read from anywhere, array is toast */
1429 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1463 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1430 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1464 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1431 " for block %llu\n", 1465 " for block %llu\n",
1432 bdevname(bio->bi_bdev,b), 1466 mdname(mddev),
1467 bdevname(bio->bi_bdev, b),
1433 (unsigned long long)r1_bio->sector); 1468 (unsigned long long)r1_bio->sector);
1434 md_done_sync(mddev, r1_bio->sectors, 0); 1469 md_done_sync(mddev, r1_bio->sectors, 0);
1435 put_buf(r1_bio); 1470 put_buf(r1_bio);
@@ -1551,7 +1586,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
1551 else { 1586 else {
1552 atomic_add(s, &rdev->corrected_errors); 1587 atomic_add(s, &rdev->corrected_errors);
1553 printk(KERN_INFO 1588 printk(KERN_INFO
1554 "raid1:%s: read error corrected " 1589 "md/raid1:%s: read error corrected "
1555 "(%d sectors at %llu on %s)\n", 1590 "(%d sectors at %llu on %s)\n",
1556 mdname(mddev), s, 1591 mdname(mddev), s,
1557 (unsigned long long)(sect + 1592 (unsigned long long)(sect +
@@ -1650,13 +1685,15 @@ static void raid1d(mddev_t *mddev)
1650 r1_bio->sector, 1685 r1_bio->sector,
1651 r1_bio->sectors); 1686 r1_bio->sectors);
1652 unfreeze_array(conf); 1687 unfreeze_array(conf);
1653 } 1688 } else
1689 md_error(mddev,
1690 conf->mirrors[r1_bio->read_disk].rdev);
1654 1691
1655 bio = r1_bio->bios[r1_bio->read_disk]; 1692 bio = r1_bio->bios[r1_bio->read_disk];
1656 if ((disk=read_balance(conf, r1_bio)) == -1 || 1693 if ((disk=read_balance(conf, r1_bio)) == -1) {
1657 disk == r1_bio->read_disk) { 1694 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1658 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
1659 " read error for block %llu\n", 1695 " read error for block %llu\n",
1696 mdname(mddev),
1660 bdevname(bio->bi_bdev,b), 1697 bdevname(bio->bi_bdev,b),
1661 (unsigned long long)r1_bio->sector); 1698 (unsigned long long)r1_bio->sector);
1662 raid_end_bio_io(r1_bio); 1699 raid_end_bio_io(r1_bio);
@@ -1670,10 +1707,11 @@ static void raid1d(mddev_t *mddev)
1670 r1_bio->bios[r1_bio->read_disk] = bio; 1707 r1_bio->bios[r1_bio->read_disk] = bio;
1671 rdev = conf->mirrors[disk].rdev; 1708 rdev = conf->mirrors[disk].rdev;
1672 if (printk_ratelimit()) 1709 if (printk_ratelimit())
1673 printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1710 printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1674 " another mirror\n", 1711 " other mirror: %s\n",
1675 bdevname(rdev->bdev,b), 1712 mdname(mddev),
1676 (unsigned long long)r1_bio->sector); 1713 (unsigned long long)r1_bio->sector,
1714 bdevname(rdev->bdev,b));
1677 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1715 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1678 bio->bi_bdev = rdev->bdev; 1716 bio->bi_bdev = rdev->bdev;
1679 bio->bi_end_io = raid1_end_read_request; 1717 bio->bi_end_io = raid1_end_read_request;
@@ -1683,6 +1721,7 @@ static void raid1d(mddev_t *mddev)
1683 generic_make_request(bio); 1721 generic_make_request(bio);
1684 } 1722 }
1685 } 1723 }
1724 cond_resched();
1686 } 1725 }
1687 if (unplug) 1726 if (unplug)
1688 unplug_slaves(mddev); 1727 unplug_slaves(mddev);
@@ -1727,13 +1766,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1727 int still_degraded = 0; 1766 int still_degraded = 0;
1728 1767
1729 if (!conf->r1buf_pool) 1768 if (!conf->r1buf_pool)
1730 {
1731/*
1732 printk("sync start - bitmap %p\n", mddev->bitmap);
1733*/
1734 if (init_resync(conf)) 1769 if (init_resync(conf))
1735 return 0; 1770 return 0;
1736 }
1737 1771
1738 max_sector = mddev->dev_sectors; 1772 max_sector = mddev->dev_sectors;
1739 if (sector_nr >= max_sector) { 1773 if (sector_nr >= max_sector) {
@@ -1939,73 +1973,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1939 return mddev->dev_sectors; 1973 return mddev->dev_sectors;
1940} 1974}
1941 1975
1942static int run(mddev_t *mddev) 1976static conf_t *setup_conf(mddev_t *mddev)
1943{ 1977{
1944 conf_t *conf; 1978 conf_t *conf;
1945 int i, j, disk_idx; 1979 int i;
1946 mirror_info_t *disk; 1980 mirror_info_t *disk;
1947 mdk_rdev_t *rdev; 1981 mdk_rdev_t *rdev;
1982 int err = -ENOMEM;
1948 1983
1949 if (mddev->level != 1) {
1950 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1951 mdname(mddev), mddev->level);
1952 goto out;
1953 }
1954 if (mddev->reshape_position != MaxSector) {
1955 printk("raid1: %s: reshape_position set but not supported\n",
1956 mdname(mddev));
1957 goto out;
1958 }
1959 /*
1960 * copy the already verified devices into our private RAID1
1961 * bookkeeping area. [whatever we allocate in run(),
1962 * should be freed in stop()]
1963 */
1964 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 1984 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1965 mddev->private = conf;
1966 if (!conf) 1985 if (!conf)
1967 goto out_no_mem; 1986 goto abort;
1968 1987
1969 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1988 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1970 GFP_KERNEL); 1989 GFP_KERNEL);
1971 if (!conf->mirrors) 1990 if (!conf->mirrors)
1972 goto out_no_mem; 1991 goto abort;
1973 1992
1974 conf->tmppage = alloc_page(GFP_KERNEL); 1993 conf->tmppage = alloc_page(GFP_KERNEL);
1975 if (!conf->tmppage) 1994 if (!conf->tmppage)
1976 goto out_no_mem; 1995 goto abort;
1977 1996
1978 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1997 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1979 if (!conf->poolinfo) 1998 if (!conf->poolinfo)
1980 goto out_no_mem; 1999 goto abort;
1981 conf->poolinfo->mddev = mddev;
1982 conf->poolinfo->raid_disks = mddev->raid_disks; 2000 conf->poolinfo->raid_disks = mddev->raid_disks;
1983 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 2001 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1984 r1bio_pool_free, 2002 r1bio_pool_free,
1985 conf->poolinfo); 2003 conf->poolinfo);
1986 if (!conf->r1bio_pool) 2004 if (!conf->r1bio_pool)
1987 goto out_no_mem; 2005 goto abort;
1988 2006
1989 spin_lock_init(&conf->device_lock); 2007 conf->poolinfo->mddev = mddev;
1990 mddev->queue->queue_lock = &conf->device_lock;
1991 2008
2009 spin_lock_init(&conf->device_lock);
1992 list_for_each_entry(rdev, &mddev->disks, same_set) { 2010 list_for_each_entry(rdev, &mddev->disks, same_set) {
1993 disk_idx = rdev->raid_disk; 2011 int disk_idx = rdev->raid_disk;
1994 if (disk_idx >= mddev->raid_disks 2012 if (disk_idx >= mddev->raid_disks
1995 || disk_idx < 0) 2013 || disk_idx < 0)
1996 continue; 2014 continue;
1997 disk = conf->mirrors + disk_idx; 2015 disk = conf->mirrors + disk_idx;
1998 2016
1999 disk->rdev = rdev; 2017 disk->rdev = rdev;
2000 disk_stack_limits(mddev->gendisk, rdev->bdev,
2001 rdev->data_offset << 9);
2002 /* as we don't honour merge_bvec_fn, we must never risk
2003 * violating it, so limit ->max_sector to one PAGE, as
2004 * a one page request is never in violation.
2005 */
2006 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2007 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2008 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2009 2018
2010 disk->head_position = 0; 2019 disk->head_position = 0;
2011 } 2020 }
@@ -2019,8 +2028,7 @@ static int run(mddev_t *mddev)
2019 bio_list_init(&conf->pending_bio_list); 2028 bio_list_init(&conf->pending_bio_list);
2020 bio_list_init(&conf->flushing_bio_list); 2029 bio_list_init(&conf->flushing_bio_list);
2021 2030
2022 2031 conf->last_used = -1;
2023 mddev->degraded = 0;
2024 for (i = 0; i < conf->raid_disks; i++) { 2032 for (i = 0; i < conf->raid_disks; i++) {
2025 2033
2026 disk = conf->mirrors + i; 2034 disk = conf->mirrors + i;
@@ -2028,49 +2036,115 @@ static int run(mddev_t *mddev)
2028 if (!disk->rdev || 2036 if (!disk->rdev ||
2029 !test_bit(In_sync, &disk->rdev->flags)) { 2037 !test_bit(In_sync, &disk->rdev->flags)) {
2030 disk->head_position = 0; 2038 disk->head_position = 0;
2031 mddev->degraded++;
2032 if (disk->rdev) 2039 if (disk->rdev)
2033 conf->fullsync = 1; 2040 conf->fullsync = 1;
2034 } 2041 } else if (conf->last_used < 0)
2042 /*
2043 * The first working device is used as a
2044 * starting point to read balancing.
2045 */
2046 conf->last_used = i;
2035 } 2047 }
2036 if (mddev->degraded == conf->raid_disks) { 2048
2037 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2049 err = -EIO;
2038 mdname(mddev)); 2050 if (conf->last_used < 0) {
2039 goto out_free_conf; 2051 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2052 mdname(mddev));
2053 goto abort;
2054 }
2055 err = -ENOMEM;
2056 conf->thread = md_register_thread(raid1d, mddev, NULL);
2057 if (!conf->thread) {
2058 printk(KERN_ERR
2059 "md/raid1:%s: couldn't allocate thread\n",
2060 mdname(mddev));
2061 goto abort;
2040 } 2062 }
2041 if (conf->raid_disks - mddev->degraded == 1)
2042 mddev->recovery_cp = MaxSector;
2043 2063
2064 return conf;
2065
2066 abort:
2067 if (conf) {
2068 if (conf->r1bio_pool)
2069 mempool_destroy(conf->r1bio_pool);
2070 kfree(conf->mirrors);
2071 safe_put_page(conf->tmppage);
2072 kfree(conf->poolinfo);
2073 kfree(conf);
2074 }
2075 return ERR_PTR(err);
2076}
2077
2078static int run(mddev_t *mddev)
2079{
2080 conf_t *conf;
2081 int i;
2082 mdk_rdev_t *rdev;
2083
2084 if (mddev->level != 1) {
2085 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2086 mdname(mddev), mddev->level);
2087 return -EIO;
2088 }
2089 if (mddev->reshape_position != MaxSector) {
2090 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2091 mdname(mddev));
2092 return -EIO;
2093 }
2044 /* 2094 /*
2045 * find the first working one and use it as a starting point 2095 * copy the already verified devices into our private RAID1
2046 * to read balancing. 2096 * bookkeeping area. [whatever we allocate in run(),
2097 * should be freed in stop()]
2047 */ 2098 */
2048 for (j = 0; j < conf->raid_disks && 2099 if (mddev->private == NULL)
2049 (!conf->mirrors[j].rdev || 2100 conf = setup_conf(mddev);
2050 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) 2101 else
2051 /* nothing */; 2102 conf = mddev->private;
2052 conf->last_used = j;
2053 2103
2104 if (IS_ERR(conf))
2105 return PTR_ERR(conf);
2054 2106
2055 mddev->thread = md_register_thread(raid1d, mddev, NULL); 2107 mddev->queue->queue_lock = &conf->device_lock;
2056 if (!mddev->thread) { 2108 list_for_each_entry(rdev, &mddev->disks, same_set) {
2057 printk(KERN_ERR 2109 disk_stack_limits(mddev->gendisk, rdev->bdev,
2058 "raid1: couldn't allocate thread for %s\n", 2110 rdev->data_offset << 9);
2059 mdname(mddev)); 2111 /* as we don't honour merge_bvec_fn, we must never risk
2060 goto out_free_conf; 2112 * violating it, so limit ->max_segments to 1 lying within
2113 * a single page, as a one page request is never in violation.
2114 */
2115 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2116 blk_queue_max_segments(mddev->queue, 1);
2117 blk_queue_segment_boundary(mddev->queue,
2118 PAGE_CACHE_SIZE - 1);
2119 }
2061 } 2120 }
2062 2121
2122 mddev->degraded = 0;
2123 for (i=0; i < conf->raid_disks; i++)
2124 if (conf->mirrors[i].rdev == NULL ||
2125 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2126 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2127 mddev->degraded++;
2128
2129 if (conf->raid_disks - mddev->degraded == 1)
2130 mddev->recovery_cp = MaxSector;
2131
2063 if (mddev->recovery_cp != MaxSector) 2132 if (mddev->recovery_cp != MaxSector)
2064 printk(KERN_NOTICE "raid1: %s is not clean" 2133 printk(KERN_NOTICE "md/raid1:%s: not clean"
2065 " -- starting background reconstruction\n", 2134 " -- starting background reconstruction\n",
2066 mdname(mddev)); 2135 mdname(mddev));
2067 printk(KERN_INFO 2136 printk(KERN_INFO
2068 "raid1: raid set %s active with %d out of %d mirrors\n", 2137 "md/raid1:%s: active with %d out of %d mirrors\n",
2069 mdname(mddev), mddev->raid_disks - mddev->degraded, 2138 mdname(mddev), mddev->raid_disks - mddev->degraded,
2070 mddev->raid_disks); 2139 mddev->raid_disks);
2140
2071 /* 2141 /*
2072 * Ok, everything is just fine now 2142 * Ok, everything is just fine now
2073 */ 2143 */
2144 mddev->thread = conf->thread;
2145 conf->thread = NULL;
2146 mddev->private = conf;
2147
2074 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2148 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2075 2149
2076 mddev->queue->unplug_fn = raid1_unplug; 2150 mddev->queue->unplug_fn = raid1_unplug;
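
run() has been split so that everything which does not touch mddev->queue or the gendisk lives in setup_conf(). That is what allows raid1_takeover() further down to build a complete, validated conf_t, including its own md thread parked in conf->thread, while the array is still running under the old personality; run() then merely adopts the result. The control flow, stripped of the details shown in the hunks above (sketch, not the full function):

static int run_skeleton(mddev_t *mddev)
{
	conf_t *conf;

	if (mddev->private == NULL)
		conf = setup_conf(mddev);	/* fresh assembly */
	else
		conf = mddev->private;		/* built earlier by ->takeover() */
	if (IS_ERR(conf))
		return PTR_ERR(conf);

	/* only now touch queue limits, degraded count, read-ahead, ... */
	mddev->thread = conf->thread;		/* adopt the parked thread */
	conf->thread = NULL;
	mddev->private = conf;
	return 0;
}
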
@@ -2078,38 +2152,20 @@ static int run(mddev_t *mddev)
2078 mddev->queue->backing_dev_info.congested_data = mddev; 2152 mddev->queue->backing_dev_info.congested_data = mddev;
2079 md_integrity_register(mddev); 2153 md_integrity_register(mddev);
2080 return 0; 2154 return 0;
2081
2082out_no_mem:
2083 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
2084 mdname(mddev));
2085
2086out_free_conf:
2087 if (conf) {
2088 if (conf->r1bio_pool)
2089 mempool_destroy(conf->r1bio_pool);
2090 kfree(conf->mirrors);
2091 safe_put_page(conf->tmppage);
2092 kfree(conf->poolinfo);
2093 kfree(conf);
2094 mddev->private = NULL;
2095 }
2096out:
2097 return -EIO;
2098} 2155}
2099 2156
2100static int stop(mddev_t *mddev) 2157static int stop(mddev_t *mddev)
2101{ 2158{
2102 conf_t *conf = mddev->private; 2159 conf_t *conf = mddev->private;
2103 struct bitmap *bitmap = mddev->bitmap; 2160 struct bitmap *bitmap = mddev->bitmap;
2104 int behind_wait = 0;
2105 2161
2106 /* wait for behind writes to complete */ 2162 /* wait for behind writes to complete */
2107 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2163 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2108 behind_wait++; 2164 printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2109 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); 2165 mdname(mddev));
2110 set_current_state(TASK_UNINTERRUPTIBLE);
2111 schedule_timeout(HZ); /* wait a second */
2112 /* need to kick something here to make sure I/O goes? */ 2166 /* need to kick something here to make sure I/O goes? */
2167 wait_event(bitmap->behind_wait,
2168 atomic_read(&bitmap->behind_writes) == 0);
2113 } 2169 }
2114 2170
2115 raise_barrier(conf); 2171 raise_barrier(conf);
@@ -2140,7 +2196,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2140 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2196 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2141 return -EINVAL; 2197 return -EINVAL;
2142 set_capacity(mddev->gendisk, mddev->array_sectors); 2198 set_capacity(mddev->gendisk, mddev->array_sectors);
2143 mddev->changed = 1;
2144 revalidate_disk(mddev->gendisk); 2199 revalidate_disk(mddev->gendisk);
2145 if (sectors > mddev->dev_sectors && 2200 if (sectors > mddev->dev_sectors &&
2146 mddev->recovery_cp == MaxSector) { 2201 mddev->recovery_cp == MaxSector) {
@@ -2235,9 +2290,9 @@ static int raid1_reshape(mddev_t *mddev)
2235 if (sysfs_create_link(&mddev->kobj, 2290 if (sysfs_create_link(&mddev->kobj,
2236 &rdev->kobj, nm)) 2291 &rdev->kobj, nm))
2237 printk(KERN_WARNING 2292 printk(KERN_WARNING
2238 "md/raid1: cannot register " 2293 "md/raid1:%s: cannot register "
2239 "%s for %s\n", 2294 "%s\n",
2240 nm, mdname(mddev)); 2295 mdname(mddev), nm);
2241 } 2296 }
2242 if (rdev) 2297 if (rdev)
2243 newmirrors[d2++].rdev = rdev; 2298 newmirrors[d2++].rdev = rdev;
@@ -2268,6 +2323,9 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2268 conf_t *conf = mddev->private; 2323 conf_t *conf = mddev->private;
2269 2324
2270 switch(state) { 2325 switch(state) {
2326 case 2: /* wake for suspend */
2327 wake_up(&conf->wait_barrier);
2328 break;
2271 case 1: 2329 case 1:
2272 raise_barrier(conf); 2330 raise_barrier(conf);
2273 break; 2331 break;
@@ -2277,6 +2335,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2277 } 2335 }
2278} 2336}
2279 2337
2338static void *raid1_takeover(mddev_t *mddev)
2339{
2340 /* raid1 can take over:
2341 * raid5 with 2 devices, any layout or chunk size
2342 */
2343 if (mddev->level == 5 && mddev->raid_disks == 2) {
2344 conf_t *conf;
2345 mddev->new_level = 1;
2346 mddev->new_layout = 0;
2347 mddev->new_chunk_sectors = 0;
2348 conf = setup_conf(mddev);
2349 if (!IS_ERR(conf))
2350 conf->barrier = 1;
2351 return conf;
2352 }
2353 return ERR_PTR(-EINVAL);
2354}
2280 2355
2281static struct mdk_personality raid1_personality = 2356static struct mdk_personality raid1_personality =
2282{ 2357{
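
A two-device raid5 is a mirror in disguise: with a single data block per stripe the parity block is identical to it, so raid1 can adopt the array in place. raid1_takeover() builds the raid1 bookkeeping with setup_conf() and returns it with conf->barrier raised, which holds regular I/O off until the core finishes the switch and the array is resumed (presumably the resume-time quiesce(mddev, 0) is what lowers the barrier again). From user space such a conversion is typically requested with something like "mdadm --grow /dev/mdX --level=1" (shown for orientation only). The parity identity itself is easy to verify:

#include <stdio.h>

int main(void)
{
	/* one data block per stripe: parity is the XOR of a single block,
	 * i.e. the block itself, so both members are byte-for-byte copies */
	unsigned char data = 0x5a;
	unsigned char parity = data;	/* XOR over { data } */

	printf("data=0x%02x parity=0x%02x identical=%d\n",
	       data, parity, data == parity);
	return 0;
}
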
@@ -2296,6 +2371,7 @@ static struct mdk_personality raid1_personality =
2296 .size = raid1_size, 2371 .size = raid1_size,
2297 .check_reshape = raid1_reshape, 2372 .check_reshape = raid1_reshape,
2298 .quiesce = raid1_quiesce, 2373 .quiesce = raid1_quiesce,
2374 .takeover = raid1_takeover,
2299}; 2375};
2300 2376
2301static int __init raid_init(void) 2377static int __init raid_init(void)
@@ -2311,6 +2387,7 @@ static void raid_exit(void)
2311module_init(raid_init); 2387module_init(raid_init);
2312module_exit(raid_exit); 2388module_exit(raid_exit);
2313MODULE_LICENSE("GPL"); 2389MODULE_LICENSE("GPL");
2390MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2314MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2391MODULE_ALIAS("md-personality-3"); /* RAID1 */
2315MODULE_ALIAS("md-raid1"); 2392MODULE_ALIAS("md-raid1");
2316MODULE_ALIAS("md-level-1"); 2393MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84deff68..5f2d443ae28a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -59,6 +59,11 @@ struct r1_private_data_s {
59 59
60 mempool_t *r1bio_pool; 60 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool; 61 mempool_t *r1buf_pool;
62
63 /* When taking over an array from a different personality, we store
64 * the new thread here until we fully activate the array.
65 */
66 struct mdk_thread_s *thread;
62}; 67};
63 68
64typedef struct r1_private_data_s conf_t; 69typedef struct r1_private_data_s conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 51c4c5c4d87a..42e64e4e5e25 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,11 +18,13 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include <linux/slab.h>
21#include <linux/delay.h> 22#include <linux/delay.h>
22#include <linux/blkdev.h> 23#include <linux/blkdev.h>
23#include <linux/seq_file.h> 24#include <linux/seq_file.h>
24#include "md.h" 25#include "md.h"
25#include "raid10.h" 26#include "raid10.h"
27#include "raid0.h"
26#include "bitmap.h" 28#include "bitmap.h"
27 29
28/* 30/*
@@ -68,7 +70,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
68 70
69 /* allocate a r10bio with room for raid_disks entries in the bios array */ 71 /* allocate a r10bio with room for raid_disks entries in the bios array */
70 r10_bio = kzalloc(size, gfp_flags); 72 r10_bio = kzalloc(size, gfp_flags);
71 if (!r10_bio) 73 if (!r10_bio && conf->mddev)
72 unplug_slaves(conf->mddev); 74 unplug_slaves(conf->mddev);
73 75
74 return r10_bio; 76 return r10_bio;
@@ -254,7 +256,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
254static void raid10_end_read_request(struct bio *bio, int error) 256static void raid10_end_read_request(struct bio *bio, int error)
255{ 257{
256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
257 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 259 r10bio_t *r10_bio = bio->bi_private;
258 int slot, dev; 260 int slot, dev;
259 conf_t *conf = r10_bio->mddev->private; 261 conf_t *conf = r10_bio->mddev->private;
260 262
@@ -284,7 +286,8 @@ static void raid10_end_read_request(struct bio *bio, int error)
284 */ 286 */
285 char b[BDEVNAME_SIZE]; 287 char b[BDEVNAME_SIZE];
286 if (printk_ratelimit()) 288 if (printk_ratelimit())
287 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", 289 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
290 mdname(conf->mddev),
288 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 291 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
289 reschedule_retry(r10_bio); 292 reschedule_retry(r10_bio);
290 } 293 }
@@ -295,7 +298,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
295static void raid10_end_write_request(struct bio *bio, int error) 298static void raid10_end_write_request(struct bio *bio, int error)
296{ 299{
297 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 300 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
298 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 301 r10bio_t *r10_bio = bio->bi_private;
299 int slot, dev; 302 int slot, dev;
300 conf_t *conf = r10_bio->mddev->private; 303 conf_t *conf = r10_bio->mddev->private;
301 304
@@ -493,7 +496,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
493 */ 496 */
494static int read_balance(conf_t *conf, r10bio_t *r10_bio) 497static int read_balance(conf_t *conf, r10bio_t *r10_bio)
495{ 498{
496 const unsigned long this_sector = r10_bio->sector; 499 const sector_t this_sector = r10_bio->sector;
497 int disk, slot, nslot; 500 int disk, slot, nslot;
498 const int sectors = r10_bio->sectors; 501 const int sectors = r10_bio->sectors;
499 sector_t new_distance, current_distance; 502 sector_t new_distance, current_distance;
@@ -600,7 +603,7 @@ static void unplug_slaves(mddev_t *mddev)
600 int i; 603 int i;
601 604
602 rcu_read_lock(); 605 rcu_read_lock();
603 for (i=0; i<mddev->raid_disks; i++) { 606 for (i=0; i < conf->raid_disks; i++) {
604 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
605 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
606 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -634,7 +637,7 @@ static int raid10_congested(void *data, int bits)
634 if (mddev_congested(mddev, bits)) 637 if (mddev_congested(mddev, bits))
635 return 1; 638 return 1;
636 rcu_read_lock(); 639 rcu_read_lock();
637 for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 640 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
638 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 641 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
639 if (rdev && !test_bit(Faulty, &rdev->flags)) { 642 if (rdev && !test_bit(Faulty, &rdev->flags)) {
640 struct request_queue *q = bdev_get_queue(rdev->bdev); 643 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -787,14 +790,12 @@ static void unfreeze_array(conf_t *conf)
787 spin_unlock_irq(&conf->resync_lock); 790 spin_unlock_irq(&conf->resync_lock);
788} 791}
789 792
790static int make_request(struct request_queue *q, struct bio * bio) 793static int make_request(mddev_t *mddev, struct bio * bio)
791{ 794{
792 mddev_t *mddev = q->queuedata;
793 conf_t *conf = mddev->private; 795 conf_t *conf = mddev->private;
794 mirror_info_t *mirror; 796 mirror_info_t *mirror;
795 r10bio_t *r10_bio; 797 r10bio_t *r10_bio;
796 struct bio *read_bio; 798 struct bio *read_bio;
797 int cpu;
798 int i; 799 int i;
799 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
800 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
804 mdk_rdev_t *blocked_rdev; 805 mdk_rdev_t *blocked_rdev;
805 806
806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 807 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
807 bio_endio(bio, -EOPNOTSUPP); 808 md_barrier_request(mddev, bio);
808 return 0; 809 return 0;
809 } 810 }
810 811
@@ -824,16 +825,16 @@ static int make_request(struct request_queue *q, struct bio * bio)
824 */ 825 */
825 bp = bio_split(bio, 826 bp = bio_split(bio,
826 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
827 if (make_request(q, &bp->bio1)) 828 if (make_request(mddev, &bp->bio1))
828 generic_make_request(&bp->bio1); 829 generic_make_request(&bp->bio1);
829 if (make_request(q, &bp->bio2)) 830 if (make_request(mddev, &bp->bio2))
830 generic_make_request(&bp->bio2); 831 generic_make_request(&bp->bio2);
831 832
832 bio_pair_release(bp); 833 bio_pair_release(bp);
833 return 0; 834 return 0;
834 bad_map: 835 bad_map:
835 printk("raid10_make_request bug: can't convert block across chunks" 836 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
836 " or bigger than %dk %llu %d\n", chunk_sects/2, 837 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
837 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
838 839
839 bio_io_error(bio); 840 bio_io_error(bio);
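The split point a few lines above comes from chunk_sects - (bio->bi_sector & (chunk_sects - 1)): because the chunk size is a power of two, the AND gives the offset inside the current chunk and the subtraction gives how many sectors remain before the boundary a request must not cross. A small stand-alone illustration of that arithmetic (values are made up):

#include <stdio.h>

typedef unsigned long long sector_t;

/* Sectors left before the next chunk boundary; chunk_sects must be a
 * power of two, so "& (chunk_sects - 1)" is the offset inside the chunk.
 */
static unsigned int sectors_to_boundary(sector_t sector, unsigned int chunk_sects)
{
    return chunk_sects - (unsigned int)(sector & (chunk_sects - 1));
}

int main(void)
{
    unsigned int chunk_sects = 128;     /* 64 KiB chunks */
    sector_t sector = 1000;             /* 1000 % 128 == 104 */

    /* A request starting at sector 1000 may use 24 more sectors before
     * it would cross into the next chunk, so bio_split() cuts there.
     */
    printf("%u sectors to the boundary\n",
           sectors_to_boundary(sector, chunk_sects));
    return 0;
}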
@@ -849,12 +850,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
849 */ 850 */
850 wait_barrier(conf); 851 wait_barrier(conf);
851 852
852 cpu = part_stat_lock();
853 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
854 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
855 bio_sectors(bio));
856 part_stat_unlock();
857
858 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 853 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
859 854
860 r10_bio->master_bio = bio; 855 r10_bio->master_bio = bio;
@@ -1038,9 +1033,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1038 } 1033 }
1039 set_bit(Faulty, &rdev->flags); 1034 set_bit(Faulty, &rdev->flags);
1040 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1035 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1041 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" 1036 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
1042 "raid10: Operation continuing on %d devices.\n", 1037 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
1043 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1038 mdname(mddev), bdevname(rdev->bdev, b),
1039 mdname(mddev), conf->raid_disks - mddev->degraded);
1044} 1040}
1045 1041
1046static void print_conf(conf_t *conf) 1042static void print_conf(conf_t *conf)
@@ -1048,19 +1044,19 @@ static void print_conf(conf_t *conf)
1048 int i; 1044 int i;
1049 mirror_info_t *tmp; 1045 mirror_info_t *tmp;
1050 1046
1051 printk("RAID10 conf printout:\n"); 1047 printk(KERN_DEBUG "RAID10 conf printout:\n");
1052 if (!conf) { 1048 if (!conf) {
1053 printk("(!conf)\n"); 1049 printk(KERN_DEBUG "(!conf)\n");
1054 return; 1050 return;
1055 } 1051 }
1056 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1052 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1057 conf->raid_disks); 1053 conf->raid_disks);
1058 1054
1059 for (i = 0; i < conf->raid_disks; i++) { 1055 for (i = 0; i < conf->raid_disks; i++) {
1060 char b[BDEVNAME_SIZE]; 1056 char b[BDEVNAME_SIZE];
1061 tmp = conf->mirrors + i; 1057 tmp = conf->mirrors + i;
1062 if (tmp->rdev) 1058 if (tmp->rdev)
1063 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1059 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1064 i, !test_bit(In_sync, &tmp->rdev->flags), 1060 i, !test_bit(In_sync, &tmp->rdev->flags),
1065 !test_bit(Faulty, &tmp->rdev->flags), 1061 !test_bit(Faulty, &tmp->rdev->flags),
1066 bdevname(tmp->rdev->bdev,b)); 1062 bdevname(tmp->rdev->bdev,b));
@@ -1131,7 +1127,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1131 int mirror; 1127 int mirror;
1132 mirror_info_t *p; 1128 mirror_info_t *p;
1133 int first = 0; 1129 int first = 0;
1134 int last = mddev->raid_disks - 1; 1130 int last = conf->raid_disks - 1;
1135 1131
1136 if (mddev->recovery_cp < MaxSector) 1132 if (mddev->recovery_cp < MaxSector)
1137 /* only hot-add to in-sync arrays, as recovery is 1133 /* only hot-add to in-sync arrays, as recovery is
@@ -1155,13 +1151,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1155 1151
1156 disk_stack_limits(mddev->gendisk, rdev->bdev, 1152 disk_stack_limits(mddev->gendisk, rdev->bdev,
1157 rdev->data_offset << 9); 1153 rdev->data_offset << 9);
1158 /* as we don't honour merge_bvec_fn, we must never risk 1154 /* as we don't honour merge_bvec_fn, we must
1159 * violating it, so limit ->max_sector to one PAGE, as 1155 * never risk violating it, so limit
1160 * a one page request is never in violation. 1156 * ->max_segments to one lying with a single
1157 * page, as a one page request is never in
1158 * violation.
1161 */ 1159 */
1162 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1160 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1163 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 1161 blk_queue_max_segments(mddev->queue, 1);
1164 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1162 blk_queue_segment_boundary(mddev->queue,
1163 PAGE_CACHE_SIZE - 1);
1164 }
1165 1165
1166 p->head_position = 0; 1166 p->head_position = 0;
1167 rdev->raid_disk = mirror; 1167 rdev->raid_disk = mirror;
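Limiting the queue to a single segment and setting the segment boundary mask to PAGE_CACHE_SIZE - 1 together guarantee that any request reaching the member device fits inside one page, which is why the stacked merge_bvec_fn can never be violated. A user-space sketch of the boundary-mask check (4 KiB pages assumed; the helper name is invented):

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL
#define BOUNDARY_MASK (SKETCH_PAGE_SIZE - 1)   /* same role as PAGE_CACHE_SIZE - 1 */

/* A segment crosses a boundary if its first and last byte fall in
 * different (mask + 1)-aligned regions, i.e. different pages here.
 */
static int crosses_boundary(unsigned long start, unsigned long len)
{
    return (start & ~BOUNDARY_MASK) != ((start + len - 1) & ~BOUNDARY_MASK);
}

int main(void)
{
    printf("%d\n", crosses_boundary(4000, 512));    /* 1: spans two pages  */
    printf("%d\n", crosses_boundary(4096, 4096));   /* 0: exactly one page */
    return 0;
}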
@@ -1219,7 +1219,7 @@ abort:
1219 1219
1220static void end_sync_read(struct bio *bio, int error) 1220static void end_sync_read(struct bio *bio, int error)
1221{ 1221{
1222 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1222 r10bio_t *r10_bio = bio->bi_private;
1223 conf_t *conf = r10_bio->mddev->private; 1223 conf_t *conf = r10_bio->mddev->private;
1224 int i,d; 1224 int i,d;
1225 1225
@@ -1256,7 +1256,7 @@ static void end_sync_read(struct bio *bio, int error)
1256static void end_sync_write(struct bio *bio, int error) 1256static void end_sync_write(struct bio *bio, int error)
1257{ 1257{
1258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1259 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1259 r10bio_t *r10_bio = bio->bi_private;
1260 mddev_t *mddev = r10_bio->mddev; 1260 mddev_t *mddev = r10_bio->mddev;
1261 conf_t *conf = mddev->private; 1261 conf_t *conf = mddev->private;
1262 int i,d; 1262 int i,d;
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432 1432
1433 1433
1434/* 1434/*
1435 * Used by fix_read_error() to decay the per rdev read_errors.
1436 * We halve the read error count for every hour that has elapsed
1437 * since the last recorded read error.
1438 *
1439 */
1440static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1441{
1442 struct timespec cur_time_mon;
1443 unsigned long hours_since_last;
1444 unsigned int read_errors = atomic_read(&rdev->read_errors);
1445
1446 ktime_get_ts(&cur_time_mon);
1447
1448 if (rdev->last_read_error.tv_sec == 0 &&
1449 rdev->last_read_error.tv_nsec == 0) {
1450 /* first time we've seen a read error */
1451 rdev->last_read_error = cur_time_mon;
1452 return;
1453 }
1454
1455 hours_since_last = (cur_time_mon.tv_sec -
1456 rdev->last_read_error.tv_sec) / 3600;
1457
1458 rdev->last_read_error = cur_time_mon;
1459
1460 /*
1461 * if hours_since_last is > the number of bits in read_errors
1462 * just set read errors to 0. We do this to avoid
1463 * overflowing the shift of read_errors by hours_since_last.
1464 */
1465 if (hours_since_last >= 8 * sizeof(read_errors))
1466 atomic_set(&rdev->read_errors, 0);
1467 else
1468 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1469}
1470
1471/*
1435 * This is a kernel thread which: 1472 * This is a kernel thread which:
1436 * 1473 *
1437 * 1. Retries failed read operations on working mirrors. 1474 * 1. Retries failed read operations on working mirrors.
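check_decay_read_errors() above halves the accumulated count once per elapsed hour and clears it outright when the elapsed hours exceed the bit width of the counter, so the right shift can never overflow. The same arithmetic as a stand-alone sketch:

#include <stdio.h>
#include <time.h>

/* Halve 'errors' once per elapsed hour; clear it entirely when the
 * shift distance would exceed the counter width (mirrors the clamp
 * on hours_since_last in the driver).
 */
static unsigned int decay_read_errors(unsigned int errors,
                                      time_t last_error, time_t now)
{
    unsigned long hours = (unsigned long)(now - last_error) / 3600;

    if (hours >= 8 * sizeof(errors))
        return 0;
    return errors >> hours;
}

int main(void)
{
    time_t last = 0;

    printf("%u\n", decay_read_errors(40, last, last + 3 * 3600));   /* 40 >> 3 == 5 */
    printf("%u\n", decay_read_errors(40, last, last + 40 * 3600));  /* clamped to 0 */
    return 0;
}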
@@ -1444,6 +1481,44 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444 int sect = 0; /* Offset from r10_bio->sector */ 1481 int sect = 0; /* Offset from r10_bio->sector */
1445 int sectors = r10_bio->sectors; 1482 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev; 1483 mdk_rdev_t*rdev;
1484 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1485 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1486
1487 rcu_read_lock();
1488 rdev = rcu_dereference(conf->mirrors[d].rdev);
1489 if (rdev) { /* If rdev is not NULL */
1490 char b[BDEVNAME_SIZE];
1491 int cur_read_error_count = 0;
1492
1493 bdevname(rdev->bdev, b);
1494
1495 if (test_bit(Faulty, &rdev->flags)) {
1496 rcu_read_unlock();
1497 /* drive has already been failed, just ignore any
1498 more fix_read_error() attempts */
1499 return;
1500 }
1501
1502 check_decay_read_errors(mddev, rdev);
1503 atomic_inc(&rdev->read_errors);
1504 cur_read_error_count = atomic_read(&rdev->read_errors);
1505 if (cur_read_error_count > max_read_errors) {
1506 rcu_read_unlock();
1507 printk(KERN_NOTICE
1508 "md/raid10:%s: %s: Raid device exceeded "
1509 "read_error threshold "
1510 "[cur %d:max %d]\n",
1511 mdname(mddev),
1512 b, cur_read_error_count, max_read_errors);
1513 printk(KERN_NOTICE
1514 "md/raid10:%s: %s: Failing raid "
1515 "device\n", mdname(mddev), b);
1516 md_error(mddev, conf->mirrors[d].rdev);
1517 return;
1518 }
1519 }
1520 rcu_read_unlock();
1521
1447 while(sectors) { 1522 while(sectors) {
1448 int s = sectors; 1523 int s = sectors;
1449 int sl = r10_bio->read_slot; 1524 int sl = r10_bio->read_slot;
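The prologue added to fix_read_error() gives each mirror a bounded error budget: decay the counter, charge the current failure, and fail the whole device once the total passes max_corr_read_errors rather than correcting sectors forever. A compressed sketch of that decision (the per-device bookkeeping struct is hypothetical, and the budget of 20 is just an illustrative value):

#include <stdio.h>

struct dev_errs {
    unsigned int read_errors;   /* decayed + accumulated count */
    int faulty;
};

/* Returns 1 when the device should be failed rather than repaired. */
static int note_read_error(struct dev_errs *d, unsigned int max_errors)
{
    /* (decay step omitted here; see the previous sketch) */
    d->read_errors++;
    if (d->read_errors > max_errors) {
        d->faulty = 1;          /* md_error() in the driver */
        return 1;
    }
    return 0;                   /* proceed with the sector-by-sector repair */
}

int main(void)
{
    struct dev_errs d = { .read_errors = 20, .faulty = 0 };

    printf("fail now? %d\n", note_read_error(&d, 20));   /* 21 > 20 -> 1 */
    return 0;
}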
@@ -1455,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1455 1530
1456 rcu_read_lock(); 1531 rcu_read_lock();
1457 do { 1532 do {
1458 int d = r10_bio->devs[sl].devnum; 1533 d = r10_bio->devs[sl].devnum;
1459 rdev = rcu_dereference(conf->mirrors[d].rdev); 1534 rdev = rcu_dereference(conf->mirrors[d].rdev);
1460 if (rdev && 1535 if (rdev &&
1461 test_bit(In_sync, &rdev->flags)) { 1536 test_bit(In_sync, &rdev->flags)) {
@@ -1488,7 +1563,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1488 /* write it back and re-read */ 1563 /* write it back and re-read */
1489 rcu_read_lock(); 1564 rcu_read_lock();
1490 while (sl != r10_bio->read_slot) { 1565 while (sl != r10_bio->read_slot) {
1491 int d; 1566 char b[BDEVNAME_SIZE];
1567
1492 if (sl==0) 1568 if (sl==0)
1493 sl = conf->copies; 1569 sl = conf->copies;
1494 sl--; 1570 sl--;
@@ -1503,16 +1579,29 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1503 r10_bio->devs[sl].addr + 1579 r10_bio->devs[sl].addr +
1504 sect + rdev->data_offset, 1580 sect + rdev->data_offset,
1505 s<<9, conf->tmppage, WRITE) 1581 s<<9, conf->tmppage, WRITE)
1506 == 0) 1582 == 0) {
1507 /* Well, this device is dead */ 1583 /* Well, this device is dead */
1584 printk(KERN_NOTICE
1585 "md/raid10:%s: read correction "
1586 "write failed"
1587 " (%d sectors at %llu on %s)\n",
1588 mdname(mddev), s,
1589 (unsigned long long)(sect+
1590 rdev->data_offset),
1591 bdevname(rdev->bdev, b));
1592 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1593 "drive\n",
1594 mdname(mddev),
1595 bdevname(rdev->bdev, b));
1508 md_error(mddev, rdev); 1596 md_error(mddev, rdev);
1597 }
1509 rdev_dec_pending(rdev, mddev); 1598 rdev_dec_pending(rdev, mddev);
1510 rcu_read_lock(); 1599 rcu_read_lock();
1511 } 1600 }
1512 } 1601 }
1513 sl = start; 1602 sl = start;
1514 while (sl != r10_bio->read_slot) { 1603 while (sl != r10_bio->read_slot) {
1515 int d; 1604
1516 if (sl==0) 1605 if (sl==0)
1517 sl = conf->copies; 1606 sl = conf->copies;
1518 sl--; 1607 sl--;
@@ -1526,17 +1615,31 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1526 if (sync_page_io(rdev->bdev, 1615 if (sync_page_io(rdev->bdev,
1527 r10_bio->devs[sl].addr + 1616 r10_bio->devs[sl].addr +
1528 sect + rdev->data_offset, 1617 sect + rdev->data_offset,
1529 s<<9, conf->tmppage, READ) == 0) 1618 s<<9, conf->tmppage,
1619 READ) == 0) {
1530 /* Well, this device is dead */ 1620 /* Well, this device is dead */
1621 printk(KERN_NOTICE
1622 "md/raid10:%s: unable to read back "
1623 "corrected sectors"
1624 " (%d sectors at %llu on %s)\n",
1625 mdname(mddev), s,
1626 (unsigned long long)(sect+
1627 rdev->data_offset),
1628 bdevname(rdev->bdev, b));
1629 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1630 mdname(mddev),
1631 bdevname(rdev->bdev, b));
1632
1531 md_error(mddev, rdev); 1633 md_error(mddev, rdev);
1532 else 1634 } else {
1533 printk(KERN_INFO 1635 printk(KERN_INFO
1534 "raid10:%s: read error corrected" 1636 "md/raid10:%s: read error corrected"
1535 " (%d sectors at %llu on %s)\n", 1637 " (%d sectors at %llu on %s)\n",
1536 mdname(mddev), s, 1638 mdname(mddev), s,
1537 (unsigned long long)(sect+ 1639 (unsigned long long)(sect+
1538 rdev->data_offset), 1640 rdev->data_offset),
1539 bdevname(rdev->bdev, b)); 1641 bdevname(rdev->bdev, b));
1642 }
1540 1643
1541 rdev_dec_pending(rdev, mddev); 1644 rdev_dec_pending(rdev, mddev);
1542 rcu_read_lock(); 1645 rcu_read_lock();
@@ -1605,8 +1708,9 @@ static void raid10d(mddev_t *mddev)
1605 mddev->ro ? IO_BLOCKED : NULL; 1708 mddev->ro ? IO_BLOCKED : NULL;
1606 mirror = read_balance(conf, r10_bio); 1709 mirror = read_balance(conf, r10_bio);
1607 if (mirror == -1) { 1710 if (mirror == -1) {
1608 printk(KERN_ALERT "raid10: %s: unrecoverable I/O" 1711 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1609 " read error for block %llu\n", 1712 " read error for block %llu\n",
1713 mdname(mddev),
1610 bdevname(bio->bi_bdev,b), 1714 bdevname(bio->bi_bdev,b),
1611 (unsigned long long)r10_bio->sector); 1715 (unsigned long long)r10_bio->sector);
1612 raid_end_bio_io(r10_bio); 1716 raid_end_bio_io(r10_bio);
@@ -1616,8 +1720,9 @@ static void raid10d(mddev_t *mddev)
1616 bio_put(bio); 1720 bio_put(bio);
1617 rdev = conf->mirrors[mirror].rdev; 1721 rdev = conf->mirrors[mirror].rdev;
1618 if (printk_ratelimit()) 1722 if (printk_ratelimit())
1619 printk(KERN_ERR "raid10: %s: redirecting sector %llu to" 1723 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1620 " another mirror\n", 1724 " another mirror\n",
1725 mdname(mddev),
1621 bdevname(rdev->bdev,b), 1726 bdevname(rdev->bdev,b),
1622 (unsigned long long)r10_bio->sector); 1727 (unsigned long long)r10_bio->sector);
1623 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); 1728 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
@@ -1632,6 +1737,7 @@ static void raid10d(mddev_t *mddev)
1632 generic_make_request(bio); 1737 generic_make_request(bio);
1633 } 1738 }
1634 } 1739 }
1740 cond_resched();
1635 } 1741 }
1636 if (unplug) 1742 if (unplug)
1637 unplug_slaves(mddev); 1743 unplug_slaves(mddev);
@@ -1874,7 +1980,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1874 r10_bio = rb2; 1980 r10_bio = rb2;
1875 if (!test_and_set_bit(MD_RECOVERY_INTR, 1981 if (!test_and_set_bit(MD_RECOVERY_INTR,
1876 &mddev->recovery)) 1982 &mddev->recovery))
1877 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1983 printk(KERN_INFO "md/raid10:%s: insufficient "
1984 "working devices for recovery.\n",
1878 mdname(mddev)); 1985 mdname(mddev));
1879 break; 1986 break;
1880 } 1987 }
@@ -2034,9 +2141,9 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2034 conf_t *conf = mddev->private; 2141 conf_t *conf = mddev->private;
2035 2142
2036 if (!raid_disks) 2143 if (!raid_disks)
2037 raid_disks = mddev->raid_disks; 2144 raid_disks = conf->raid_disks;
2038 if (!sectors) 2145 if (!sectors)
2039 sectors = mddev->dev_sectors; 2146 sectors = conf->dev_sectors;
2040 2147
2041 size = sectors >> conf->chunk_shift; 2148 size = sectors >> conf->chunk_shift;
2042 sector_div(size, conf->far_copies); 2149 sector_div(size, conf->far_copies);
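raid10_size() now reads raid_disks and dev_sectors from the conf so a pending takeover cannot skew the result; the value itself counts whole chunks per device, divides out the far copies, scales by the number of devices and divides out the near copies. A stand-alone version of that arithmetic (the middle multiply/divide steps are reconstructed from the usual raid10 accounting, so treat this as a sketch):

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t raid10_capacity(sector_t dev_sectors, int chunk_shift,
                                int raid_disks, int near_copies, int far_copies)
{
    sector_t size = dev_sectors >> chunk_shift;  /* whole chunks per device */

    size /= far_copies;      /* far copies live on the same device */
    size *= raid_disks;      /* chunks across the whole array */
    size /= near_copies;     /* near copies sit on different devices */
    return size << chunk_shift;                  /* back to sectors */
}

int main(void)
{
    /* 4 devices of 1 TiB, 512 KiB chunks (1024 sectors), near=2, far=1:
     * usable space is half the raw space, i.e. 2 TiB.
     */
    sector_t dev = 2ULL * 1024 * 1024 * 1024;    /* 1 TiB in 512-byte sectors */

    printf("%llu sectors usable\n",
           raid10_capacity(dev, 10, 4, 2, 1));
    return 0;
}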
@@ -2046,63 +2153,61 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2046 return size << conf->chunk_shift; 2153 return size << conf->chunk_shift;
2047} 2154}
2048 2155
2049static int run(mddev_t *mddev) 2156
2157static conf_t *setup_conf(mddev_t *mddev)
2050{ 2158{
2051 conf_t *conf; 2159 conf_t *conf = NULL;
2052 int i, disk_idx, chunk_size;
2053 mirror_info_t *disk;
2054 mdk_rdev_t *rdev;
2055 int nc, fc, fo; 2160 int nc, fc, fo;
2056 sector_t stride, size; 2161 sector_t stride, size;
2162 int err = -EINVAL;
2057 2163
2058 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || 2164 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2059 !is_power_of_2(mddev->chunk_sectors)) { 2165 !is_power_of_2(mddev->new_chunk_sectors)) {
2060 printk(KERN_ERR "md/raid10: chunk size must be " 2166 printk(KERN_ERR "md/raid10:%s: chunk size must be "
2061 "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); 2167 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
2062 return -EINVAL; 2168 mdname(mddev), PAGE_SIZE);
2169 goto out;
2063 } 2170 }
2064 2171
2065 nc = mddev->layout & 255; 2172 nc = mddev->new_layout & 255;
2066 fc = (mddev->layout >> 8) & 255; 2173 fc = (mddev->new_layout >> 8) & 255;
2067 fo = mddev->layout & (1<<16); 2174 fo = mddev->new_layout & (1<<16);
2175
2068 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 2176 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2069 (mddev->layout >> 17)) { 2177 (mddev->new_layout >> 17)) {
2070 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 2178 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
2071 mdname(mddev), mddev->layout); 2179 mdname(mddev), mddev->new_layout);
2072 goto out; 2180 goto out;
2073 } 2181 }
2074 /* 2182
2075 * copy the already verified devices into our private RAID10 2183 err = -ENOMEM;
2076 * bookkeeping area. [whatever we allocate in run(),
2077 * should be freed in stop()]
2078 */
2079 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 2184 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2080 mddev->private = conf; 2185 if (!conf)
2081 if (!conf) {
2082 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2083 mdname(mddev));
2084 goto out; 2186 goto out;
2085 } 2187
2086 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 2188 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2087 GFP_KERNEL); 2189 GFP_KERNEL);
2088 if (!conf->mirrors) { 2190 if (!conf->mirrors)
2089 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2191 goto out;
2090 mdname(mddev));
2091 goto out_free_conf;
2092 }
2093 2192
2094 conf->tmppage = alloc_page(GFP_KERNEL); 2193 conf->tmppage = alloc_page(GFP_KERNEL);
2095 if (!conf->tmppage) 2194 if (!conf->tmppage)
2096 goto out_free_conf; 2195 goto out;
2196
2097 2197
2098 conf->mddev = mddev;
2099 conf->raid_disks = mddev->raid_disks; 2198 conf->raid_disks = mddev->raid_disks;
2100 conf->near_copies = nc; 2199 conf->near_copies = nc;
2101 conf->far_copies = fc; 2200 conf->far_copies = fc;
2102 conf->copies = nc*fc; 2201 conf->copies = nc*fc;
2103 conf->far_offset = fo; 2202 conf->far_offset = fo;
2104 conf->chunk_mask = mddev->chunk_sectors - 1; 2203 conf->chunk_mask = mddev->new_chunk_sectors - 1;
2105 conf->chunk_shift = ffz(~mddev->chunk_sectors); 2204 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2205
2206 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2207 r10bio_pool_free, conf);
2208 if (!conf->r10bio_pool)
2209 goto out;
2210
2106 size = mddev->dev_sectors >> conf->chunk_shift; 2211 size = mddev->dev_sectors >> conf->chunk_shift;
2107 sector_div(size, fc); 2212 sector_div(size, fc);
2108 size = size * conf->raid_disks; 2213 size = size * conf->raid_disks;
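setup_conf() decodes the geometry from the new_layout word: near copies live in bits 0-7, far copies in bits 8-15, and bit 16 selects the far-offset variant, which is exactly what the nc/fc/fo lines above extract. Encoding and decoding in isolation (helper names are made up):

#include <stdio.h>

/* raid10 layout word: near copies in bits 0-7, far copies in bits 8-15,
 * bit 16 selects the "offset" variant of the far layout.
 */
static int layout_near(int layout)   { return layout & 255; }
static int layout_far(int layout)    { return (layout >> 8) & 255; }
static int layout_offset(int layout) { return layout & (1 << 16); }

static int make_layout(int near, int far, int offset)
{
    return (offset ? (1 << 16) : 0) | (far << 8) | near;
}

int main(void)
{
    int n2 = make_layout(2, 1, 0);    /* the common "n2" layout */
    int f2 = make_layout(1, 2, 0);    /* "f2": two far copies   */

    printf("n2=0x%x near=%d far=%d\n", n2, layout_near(n2), layout_far(n2));
    printf("f2=0x%x near=%d far=%d offset=%d\n",
           f2, layout_near(f2), layout_far(f2), layout_offset(f2) != 0);
    return 0;
}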
@@ -2116,7 +2221,8 @@ static int run(mddev_t *mddev)
2116 */ 2221 */
2117 stride += conf->raid_disks - 1; 2222 stride += conf->raid_disks - 1;
2118 sector_div(stride, conf->raid_disks); 2223 sector_div(stride, conf->raid_disks);
2119 mddev->dev_sectors = stride << conf->chunk_shift; 2224
2225 conf->dev_sectors = stride << conf->chunk_shift;
2120 2226
2121 if (fo) 2227 if (fo)
2122 stride = 1; 2228 stride = 1;
@@ -2124,17 +2230,62 @@ static int run(mddev_t *mddev)
2124 sector_div(stride, fc); 2230 sector_div(stride, fc);
2125 conf->stride = stride << conf->chunk_shift; 2231 conf->stride = stride << conf->chunk_shift;
2126 2232
2127 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2128 r10bio_pool_free, conf);
2129 if (!conf->r10bio_pool) {
2130 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2131 mdname(mddev));
2132 goto out_free_conf;
2133 }
2134 2233
2135 spin_lock_init(&conf->device_lock); 2234 spin_lock_init(&conf->device_lock);
2235 INIT_LIST_HEAD(&conf->retry_list);
2236
2237 spin_lock_init(&conf->resync_lock);
2238 init_waitqueue_head(&conf->wait_barrier);
2239
2240 conf->thread = md_register_thread(raid10d, mddev, NULL);
2241 if (!conf->thread)
2242 goto out;
2243
2244 conf->mddev = mddev;
2245 return conf;
2246
2247 out:
2248 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
2249 mdname(mddev));
2250 if (conf) {
2251 if (conf->r10bio_pool)
2252 mempool_destroy(conf->r10bio_pool);
2253 kfree(conf->mirrors);
2254 safe_put_page(conf->tmppage);
2255 kfree(conf);
2256 }
2257 return ERR_PTR(err);
2258}
2259
2260static int run(mddev_t *mddev)
2261{
2262 conf_t *conf;
2263 int i, disk_idx, chunk_size;
2264 mirror_info_t *disk;
2265 mdk_rdev_t *rdev;
2266 sector_t size;
2267
2268 /*
2269 * copy the already verified devices into our private RAID10
2270 * bookkeeping area. [whatever we allocate in run(),
2271 * should be freed in stop()]
2272 */
2273
2274 if (mddev->private == NULL) {
2275 conf = setup_conf(mddev);
2276 if (IS_ERR(conf))
2277 return PTR_ERR(conf);
2278 mddev->private = conf;
2279 }
2280 conf = mddev->private;
2281 if (!conf)
2282 goto out;
2283
2136 mddev->queue->queue_lock = &conf->device_lock; 2284 mddev->queue->queue_lock = &conf->device_lock;
2137 2285
2286 mddev->thread = conf->thread;
2287 conf->thread = NULL;
2288
2138 chunk_size = mddev->chunk_sectors << 9; 2289 chunk_size = mddev->chunk_sectors << 9;
2139 blk_queue_io_min(mddev->queue, chunk_size); 2290 blk_queue_io_min(mddev->queue, chunk_size);
2140 if (conf->raid_disks % conf->near_copies) 2291 if (conf->raid_disks % conf->near_copies)
@@ -2145,7 +2296,7 @@ static int run(mddev_t *mddev)
2145 2296
2146 list_for_each_entry(rdev, &mddev->disks, same_set) { 2297 list_for_each_entry(rdev, &mddev->disks, same_set) {
2147 disk_idx = rdev->raid_disk; 2298 disk_idx = rdev->raid_disk;
2148 if (disk_idx >= mddev->raid_disks 2299 if (disk_idx >= conf->raid_disks
2149 || disk_idx < 0) 2300 || disk_idx < 0)
2150 continue; 2301 continue;
2151 disk = conf->mirrors + disk_idx; 2302 disk = conf->mirrors + disk_idx;
@@ -2154,23 +2305,20 @@ static int run(mddev_t *mddev)
2154 disk_stack_limits(mddev->gendisk, rdev->bdev, 2305 disk_stack_limits(mddev->gendisk, rdev->bdev,
2155 rdev->data_offset << 9); 2306 rdev->data_offset << 9);
2156 /* as we don't honour merge_bvec_fn, we must never risk 2307 /* as we don't honour merge_bvec_fn, we must never risk
2157 * violating it, so limit ->max_sector to one PAGE, as 2308 * violating it, so limit max_segments to 1 lying
2158 * a one page request is never in violation. 2309 * within a single page.
2159 */ 2310 */
2160 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 2311 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2161 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 2312 blk_queue_max_segments(mddev->queue, 1);
2162 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 2313 blk_queue_segment_boundary(mddev->queue,
2314 PAGE_CACHE_SIZE - 1);
2315 }
2163 2316
2164 disk->head_position = 0; 2317 disk->head_position = 0;
2165 } 2318 }
2166 INIT_LIST_HEAD(&conf->retry_list);
2167
2168 spin_lock_init(&conf->resync_lock);
2169 init_waitqueue_head(&conf->wait_barrier);
2170
2171 /* need to check that every block has at least one working mirror */ 2319 /* need to check that every block has at least one working mirror */
2172 if (!enough(conf)) { 2320 if (!enough(conf)) {
2173 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", 2321 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2174 mdname(mddev)); 2322 mdname(mddev));
2175 goto out_free_conf; 2323 goto out_free_conf;
2176 } 2324 }
@@ -2189,28 +2337,21 @@ static int run(mddev_t *mddev)
2189 } 2337 }
2190 } 2338 }
2191 2339
2192
2193 mddev->thread = md_register_thread(raid10d, mddev, NULL);
2194 if (!mddev->thread) {
2195 printk(KERN_ERR
2196 "raid10: couldn't allocate thread for %s\n",
2197 mdname(mddev));
2198 goto out_free_conf;
2199 }
2200
2201 if (mddev->recovery_cp != MaxSector) 2340 if (mddev->recovery_cp != MaxSector)
2202 printk(KERN_NOTICE "raid10: %s is not clean" 2341 printk(KERN_NOTICE "md/raid10:%s: not clean"
2203 " -- starting background reconstruction\n", 2342 " -- starting background reconstruction\n",
2204 mdname(mddev)); 2343 mdname(mddev));
2205 printk(KERN_INFO 2344 printk(KERN_INFO
2206 "raid10: raid set %s active with %d out of %d devices\n", 2345 "md/raid10:%s: active with %d out of %d devices\n",
2207 mdname(mddev), mddev->raid_disks - mddev->degraded, 2346 mdname(mddev), conf->raid_disks - mddev->degraded,
2208 mddev->raid_disks); 2347 conf->raid_disks);
2209 /* 2348 /*
2210 * Ok, everything is just fine now 2349 * Ok, everything is just fine now
2211 */ 2350 */
2212 md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); 2351 mddev->dev_sectors = conf->dev_sectors;
2213 mddev->resync_max_sectors = raid10_size(mddev, 0, 0); 2352 size = raid10_size(mddev, 0, 0);
2353 md_set_array_sectors(mddev, size);
2354 mddev->resync_max_sectors = size;
2214 2355
2215 mddev->queue->unplug_fn = raid10_unplug; 2356 mddev->queue->unplug_fn = raid10_unplug;
2216 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2357 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2228,7 +2369,7 @@ static int run(mddev_t *mddev)
2228 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2369 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2229 } 2370 }
2230 2371
2231 if (conf->near_copies < mddev->raid_disks) 2372 if (conf->near_copies < conf->raid_disks)
2232 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2373 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2233 md_integrity_register(mddev); 2374 md_integrity_register(mddev);
2234 return 0; 2375 return 0;
@@ -2240,6 +2381,7 @@ out_free_conf:
2240 kfree(conf->mirrors); 2381 kfree(conf->mirrors);
2241 kfree(conf); 2382 kfree(conf);
2242 mddev->private = NULL; 2383 mddev->private = NULL;
2384 md_unregister_thread(mddev->thread);
2243out: 2385out:
2244 return -EIO; 2386 return -EIO;
2245} 2387}
@@ -2274,13 +2416,57 @@ static void raid10_quiesce(mddev_t *mddev, int state)
2274 lower_barrier(conf); 2416 lower_barrier(conf);
2275 break; 2417 break;
2276 } 2418 }
2277 if (mddev->thread) { 2419}
2278 if (mddev->bitmap) 2420
2279 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; 2421static void *raid10_takeover_raid0(mddev_t *mddev)
2280 else 2422{
2281 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 2423 mdk_rdev_t *rdev;
2282 md_wakeup_thread(mddev->thread); 2424 conf_t *conf;
2425
2426 if (mddev->degraded > 0) {
2427 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
2428 mdname(mddev));
2429 return ERR_PTR(-EINVAL);
2430 }
2431
2432 /* Set new parameters */
2433 mddev->new_level = 10;
2434 /* new layout: far_copies = 1, near_copies = 2 */
2435 mddev->new_layout = (1<<8) + 2;
2436 mddev->new_chunk_sectors = mddev->chunk_sectors;
2437 mddev->delta_disks = mddev->raid_disks;
2438 mddev->raid_disks *= 2;
2439 /* make sure it will be not marked as dirty */
2440 mddev->recovery_cp = MaxSector;
2441
2442 conf = setup_conf(mddev);
2443 if (!IS_ERR(conf))
2444 list_for_each_entry(rdev, &mddev->disks, same_set)
2445 if (rdev->raid_disk >= 0)
2446 rdev->new_raid_disk = rdev->raid_disk * 2;
2447
2448 return conf;
2449}
2450
2451static void *raid10_takeover(mddev_t *mddev)
2452{
2453 struct raid0_private_data *raid0_priv;
2454
2455 /* raid10 can take over:
2456 * raid0 - providing it has only two drives
2457 */
2458 if (mddev->level == 0) {
2459 /* for raid0 takeover only one zone is supported */
2460 raid0_priv = mddev->private;
2461 if (raid0_priv->nr_strip_zones > 1) {
2462 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
2463 " with more than one zone.\n",
2464 mdname(mddev));
2465 return ERR_PTR(-EINVAL);
2466 }
2467 return raid10_takeover_raid0(mddev);
2283 } 2468 }
2469 return ERR_PTR(-EINVAL);
2284} 2470}
2285 2471
2286static struct mdk_personality raid10_personality = 2472static struct mdk_personality raid10_personality =
@@ -2299,6 +2485,7 @@ static struct mdk_personality raid10_personality =
2299 .sync_request = sync_request, 2485 .sync_request = sync_request,
2300 .quiesce = raid10_quiesce, 2486 .quiesce = raid10_quiesce,
2301 .size = raid10_size, 2487 .size = raid10_size,
2488 .takeover = raid10_takeover,
2302}; 2489};
2303 2490
2304static int __init raid_init(void) 2491static int __init raid_init(void)
@@ -2314,6 +2501,7 @@ static void raid_exit(void)
2314module_init(raid_init); 2501module_init(raid_init);
2315module_exit(raid_exit); 2502module_exit(raid_exit);
2316MODULE_LICENSE("GPL"); 2503MODULE_LICENSE("GPL");
2504MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2317MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2505MODULE_ALIAS("md-personality-9"); /* RAID10 */
2318MODULE_ALIAS("md-raid10"); 2506MODULE_ALIAS("md-raid10");
2319MODULE_ALIAS("md-level-10"); 2507MODULE_ALIAS("md-level-10");
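raid10_takeover_raid0() converts a single-zone raid0 in place: the level becomes 10, the layout asks for near=2/far=1 ((1 << 8) + 2), the chunk size is kept, raid_disks doubles, and every existing member is re-slotted at an even index so the odd slots are left free for the mirrors added later. Worked through for a hypothetical two-disk raid0:

#include <stdio.h>

int main(void)
{
    int raid0_disks = 2;
    int chunk_sectors = 1024;                /* 512 KiB chunks, unchanged */

    int new_level = 10;
    int new_layout = (1 << 8) + 2;           /* far=1, near=2 */
    int new_raid_disks = raid0_disks * 2;    /* delta_disks == raid0_disks */
    int old_slot;

    printf("level %d, layout 0x%x, %d devices, chunk %d sectors\n",
           new_level, new_layout, new_raid_disks, chunk_sectors);

    /* Existing raid0 members keep their data and move to even slots;
     * slots 1 and 3 stay empty until mirrors are hot-added.
     */
    for (old_slot = 0; old_slot < raid0_disks; old_slot++)
        printf("raid0 disk %d -> raid10 slot %d\n", old_slot, old_slot * 2);
    return 0;
}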
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 59cd1efb8d30..2316ac2e8e21 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,8 @@ struct r10_private_data_s {
33 * 1 stripe. 33 * 1 stripe.
34 */ 34 */
35 35
36 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
37
36 int chunk_shift; /* shift from chunks to sectors */ 38 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 39 sector_t chunk_mask;
38 40
@@ -57,6 +59,11 @@ struct r10_private_data_s {
57 mempool_t *r10bio_pool; 59 mempool_t *r10bio_pool;
58 mempool_t *r10buf_pool; 60 mempool_t *r10buf_pool;
59 struct page *tmppage; 61 struct page *tmppage;
62
63 /* When taking over an array from a different personality, we store
64 * the new thread here until we fully activate the array.
65 */
66 struct mdk_thread_s *thread;
60}; 67};
61 68
62typedef struct r10_private_data_s conf_t; 69typedef struct r10_private_data_s conf_t;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 94829804ab7f..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -50,8 +50,10 @@
50#include <linux/async.h> 50#include <linux/async.h>
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h> 52#include <linux/cpu.h>
53#include <linux/slab.h>
53#include "md.h" 54#include "md.h"
54#include "raid5.h" 55#include "raid5.h"
56#include "raid0.h"
55#include "bitmap.h" 57#include "bitmap.h"
56 58
57/* 59/*
@@ -156,13 +158,16 @@ static inline int raid6_next_disk(int disk, int raid_disks)
156static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 158static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
157 int *count, int syndrome_disks) 159 int *count, int syndrome_disks)
158{ 160{
159 int slot; 161 int slot = *count;
160 162
163 if (sh->ddf_layout)
164 (*count)++;
161 if (idx == sh->pd_idx) 165 if (idx == sh->pd_idx)
162 return syndrome_disks; 166 return syndrome_disks;
163 if (idx == sh->qd_idx) 167 if (idx == sh->qd_idx)
164 return syndrome_disks + 1; 168 return syndrome_disks + 1;
165 slot = (*count)++; 169 if (!sh->ddf_layout)
170 (*count)++;
166 return slot; 171 return slot;
167} 172}
168 173
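The reworked raid6_idx_to_slot() treats the two layout families differently: for the native md layouts only data devices advance the counter, so the syndrome sources end up densely packed in front of P and Q, while for DDF layouts every device advances it, so each data block keeps its physical position and the P/Q positions are simply left empty. A stand-alone rendition (it walks devices in plain index order rather than from the real d0 position, and the DDF call is given the full device count as syndrome_disks so P and Q land past all data slots):

#include <stdio.h>

static int idx_to_slot(int idx, int pd_idx, int qd_idx, int ddf_layout,
                       int *count, int syndrome_disks)
{
    int slot = *count;

    if (ddf_layout)
        (*count)++;
    if (idx == pd_idx)
        return syndrome_disks;
    if (idx == qd_idx)
        return syndrome_disks + 1;
    if (!ddf_layout)
        (*count)++;
    return slot;
}

int main(void)
{
    int disks = 5, pd_idx = 1, qd_idx = 2, idx, count;

    for (count = 0, idx = 0; idx < disks; idx++)   /* native: data packed at 0,1,2 */
        printf("native idx %d -> slot %d\n",
               idx, idx_to_slot(idx, pd_idx, qd_idx, 0, &count, disks - 2));

    for (count = 0, idx = 0; idx < disks; idx++)   /* ddf: data keeps slots 0,3,4 */
        printf("ddf    idx %d -> slot %d\n",
               idx, idx_to_slot(idx, pd_idx, qd_idx, 1, &count, disks));
    return 0;
}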
@@ -272,12 +277,13 @@ out:
272 return sh; 277 return sh;
273} 278}
274 279
275static void shrink_buffers(struct stripe_head *sh, int num) 280static void shrink_buffers(struct stripe_head *sh)
276{ 281{
277 struct page *p; 282 struct page *p;
278 int i; 283 int i;
284 int num = sh->raid_conf->pool_size;
279 285
280 for (i=0; i<num ; i++) { 286 for (i = 0; i < num ; i++) {
281 p = sh->dev[i].page; 287 p = sh->dev[i].page;
282 if (!p) 288 if (!p)
283 continue; 289 continue;
@@ -286,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
286 } 292 }
287} 293}
288 294
289static int grow_buffers(struct stripe_head *sh, int num) 295static int grow_buffers(struct stripe_head *sh)
290{ 296{
291 int i; 297 int i;
298 int num = sh->raid_conf->pool_size;
292 299
293 for (i=0; i<num; i++) { 300 for (i = 0; i < num; i++) {
294 struct page *page; 301 struct page *page;
295 302
296 if (!(page = alloc_page(GFP_KERNEL))) { 303 if (!(page = alloc_page(GFP_KERNEL))) {
@@ -359,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
359 return NULL; 366 return NULL;
360} 367}
361 368
369/*
370 * Need to check if array has failed when deciding whether to:
371 * - start an array
372 * - remove non-faulty devices
373 * - add a spare
374 * - allow a reshape
375 * This determination is simple when no reshape is happening.
376 * However if there is a reshape, we need to carefully check
377 * both the before and after sections.
378 * This is because some failed devices may only affect one
379 * of the two sections, and some non-in_sync devices may
380 * be insync in the section most affected by failed devices.
381 */
382static int has_failed(raid5_conf_t *conf)
383{
384 int degraded;
385 int i;
386 if (conf->mddev->reshape_position == MaxSector)
387 return conf->mddev->degraded > conf->max_degraded;
388
389 rcu_read_lock();
390 degraded = 0;
391 for (i = 0; i < conf->previous_raid_disks; i++) {
392 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
393 if (!rdev || test_bit(Faulty, &rdev->flags))
394 degraded++;
395 else if (test_bit(In_sync, &rdev->flags))
396 ;
397 else
398 /* not in-sync or faulty.
399 * If the reshape increases the number of devices,
400 * this is being recovered by the reshape, so
401 * this 'previous' section is not in_sync.
402 * If the number of devices is being reduced however,
403 * the device can only be part of the array if
404 * we are reverting a reshape, so this section will
405 * be in-sync.
406 */
407 if (conf->raid_disks >= conf->previous_raid_disks)
408 degraded++;
409 }
410 rcu_read_unlock();
411 if (degraded > conf->max_degraded)
412 return 1;
413 rcu_read_lock();
414 degraded = 0;
415 for (i = 0; i < conf->raid_disks; i++) {
416 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
417 if (!rdev || test_bit(Faulty, &rdev->flags))
418 degraded++;
419 else if (test_bit(In_sync, &rdev->flags))
420 ;
421 else
422 /* not in-sync or faulty.
423 * If reshape increases the number of devices, this
424 * section has already been recovered, else it
425 * almost certainly hasn't.
426 */
427 if (conf->raid_disks <= conf->previous_raid_disks)
428 degraded++;
429 }
430 rcu_read_unlock();
431 if (degraded > conf->max_degraded)
432 return 1;
433 return 0;
434}
435
362static void unplug_slaves(mddev_t *mddev); 436static void unplug_slaves(mddev_t *mddev);
363static void raid5_unplug_device(struct request_queue *q); 437static void raid5_unplug_device(struct request_queue *q);
364 438
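has_failed() has to judge both geometries while a reshape is in flight, because a device that is present but not yet in_sync only covers whichever section the reshape has already rebuilt. A condensed user-space rendition of the same two-pass rule (device states and helper names are illustrative):

#include <stdio.h>

enum state { MISSING, INSYNC, RECOVERING };

/* One pass over 'ndisks' devices; 'counts_recovering' says whether a
 * device that is present but not yet in_sync must be treated as failed
 * for this section.
 */
static int count_degraded(const enum state *d, int ndisks, int counts_recovering)
{
    int i, degraded = 0;

    for (i = 0; i < ndisks; i++)
        if (d[i] == MISSING || (d[i] == RECOVERING && counts_recovering))
            degraded++;
    return degraded;
}

static int array_has_failed(const enum state *d, int prev_disks, int new_disks,
                            int max_degraded)
{
    /* old section: recovering devices count when growing */
    if (count_degraded(d, prev_disks, new_disks >= prev_disks) > max_degraded)
        return 1;
    /* new section: recovering devices count when shrinking */
    return count_degraded(d, new_disks, new_disks <= prev_disks) > max_degraded;
}

int main(void)
{
    /* raid5 (max_degraded 1) growing 3 -> 4: one old member is missing
     * and another old member is still being rebuilt by the reshape, so
     * the not-yet-reshaped section is double degraded even though the
     * new geometry only shows one failed device.
     */
    enum state d[4] = { INSYNC, MISSING, RECOVERING, INSYNC };

    printf("failed: %d\n", array_has_failed(d, 3, 4, 1));   /* prints 1 */
    return 0;
}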
@@ -717,7 +791,7 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
717 int i; 791 int i;
718 792
719 for (i = 0; i < disks; i++) 793 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page; 794 srcs[i] = NULL;
721 795
722 count = 0; 796 count = 0;
723 i = d0_idx; 797 i = d0_idx;
@@ -727,9 +801,8 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
727 srcs[slot] = sh->dev[i].page; 801 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks); 802 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx); 803 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731 804
732 return count; 805 return syndrome_disks;
733} 806}
734 807
735static struct dma_async_tx_descriptor * 808static struct dma_async_tx_descriptor *
@@ -814,7 +887,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
814 * slot number conversion for 'faila' and 'failb' 887 * slot number conversion for 'faila' and 'failb'
815 */ 888 */
816 for (i = 0; i < disks ; i++) 889 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page; 890 blocks[i] = NULL;
818 count = 0; 891 count = 0;
819 i = d0_idx; 892 i = d0_idx;
820 do { 893 do {
@@ -828,7 +901,6 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
828 failb = slot; 901 failb = slot;
829 i = raid6_next_disk(i, disks); 902 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx); 903 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832 904
833 BUG_ON(faila == failb); 905 BUG_ON(faila == failb);
834 if (failb < faila) 906 if (failb < faila)
@@ -845,7 +917,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 917 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh, 918 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu)); 919 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2, 920 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
849 STRIPE_SIZE, &submit); 921 STRIPE_SIZE, &submit);
850 } else { 922 } else {
851 struct page *dest; 923 struct page *dest;
@@ -1139,7 +1211,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
1139 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1211 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1140} 1212}
1141 1213
1142static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1214static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1143{ 1215{
1144 int overlap_clear = 0, i, disks = sh->disks; 1216 int overlap_clear = 0, i, disks = sh->disks;
1145 struct dma_async_tx_descriptor *tx = NULL; 1217 struct dma_async_tx_descriptor *tx = NULL;
@@ -1204,22 +1276,54 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1204 put_cpu(); 1276 put_cpu();
1205} 1277}
1206 1278
1279#ifdef CONFIG_MULTICORE_RAID456
1280static void async_run_ops(void *param, async_cookie_t cookie)
1281{
1282 struct stripe_head *sh = param;
1283 unsigned long ops_request = sh->ops.request;
1284
1285 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1286 wake_up(&sh->ops.wait_for_ops);
1287
1288 __raid_run_ops(sh, ops_request);
1289 release_stripe(sh);
1290}
1291
1292static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1293{
1294 /* since handle_stripe can be called outside of raid5d context
1295 * we need to ensure sh->ops.request is de-staged before another
1296 * request arrives
1297 */
1298 wait_event(sh->ops.wait_for_ops,
1299 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1300 sh->ops.request = ops_request;
1301
1302 atomic_inc(&sh->count);
1303 async_schedule(async_run_ops, sh);
1304}
1305#else
1306#define raid_run_ops __raid_run_ops
1307#endif
1308
1207static int grow_one_stripe(raid5_conf_t *conf) 1309static int grow_one_stripe(raid5_conf_t *conf)
1208{ 1310{
1209 struct stripe_head *sh; 1311 struct stripe_head *sh;
1210 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1312 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1211 if (!sh) 1313 if (!sh)
1212 return 0; 1314 return 0;
1213 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); 1315 memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
1214 sh->raid_conf = conf; 1316 sh->raid_conf = conf;
1215 spin_lock_init(&sh->lock); 1317 spin_lock_init(&sh->lock);
1318 #ifdef CONFIG_MULTICORE_RAID456
1319 init_waitqueue_head(&sh->ops.wait_for_ops);
1320 #endif
1216 1321
1217 if (grow_buffers(sh, conf->raid_disks)) { 1322 if (grow_buffers(sh)) {
1218 shrink_buffers(sh, conf->raid_disks); 1323 shrink_buffers(sh);
1219 kmem_cache_free(conf->slab_cache, sh); 1324 kmem_cache_free(conf->slab_cache, sh);
1220 return 0; 1325 return 0;
1221 } 1326 }
1222 sh->disks = conf->raid_disks;
1223 /* we just created an active stripe so... */ 1327 /* we just created an active stripe so... */
1224 atomic_set(&sh->count, 1); 1328 atomic_set(&sh->count, 1);
1225 atomic_inc(&conf->active_stripes); 1329 atomic_inc(&conf->active_stripes);
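With CONFIG_MULTICORE_RAID456 the request is staged in sh->ops.request and the stripe is handed to an async context, so the next submitter has to wait until the worker has copied the request out ("de-staged" it) before the slot may be reused. The same single-slot handshake expressed with ordinary pthreads, purely as an illustration of the pattern:

#include <pthread.h>
#include <stdio.h>

/* One staging slot shared between submitters and the async worker. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  slot_free = PTHREAD_COND_INITIALIZER;
static int pending;                    /* mirrors STRIPE_OPS_REQ_PENDING */
static unsigned long staged_request;

static void submit(unsigned long ops_request)
{
    pthread_mutex_lock(&lock);
    while (pending)                    /* wait until the slot is de-staged */
        pthread_cond_wait(&slot_free, &lock);
    pending = 1;
    staged_request = ops_request;
    pthread_mutex_unlock(&lock);
    /* the real code would now async_schedule() the worker */
}

static void worker(void)
{
    unsigned long ops_request;

    pthread_mutex_lock(&lock);
    ops_request = staged_request;      /* copy the request out first ... */
    pending = 0;                       /* ... then free the slot */
    pthread_cond_broadcast(&slot_free);
    pthread_mutex_unlock(&lock);

    printf("processing request %#lx\n", ops_request);
}

int main(void)
{
    submit(0x1);
    worker();
    submit(0x2);    /* would have blocked until worker() freed the slot */
    worker();
    return 0;
}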
@@ -1231,7 +1335,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
1231static int grow_stripes(raid5_conf_t *conf, int num) 1335static int grow_stripes(raid5_conf_t *conf, int num)
1232{ 1336{
1233 struct kmem_cache *sc; 1337 struct kmem_cache *sc;
1234 int devs = conf->raid_disks; 1338 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1235 1339
1236 sprintf(conf->cache_name[0], 1340 sprintf(conf->cache_name[0],
1237 "raid%d-%s", conf->level, mdname(conf->mddev)); 1341 "raid%d-%s", conf->level, mdname(conf->mddev));
@@ -1329,6 +1433,9 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1329 1433
1330 nsh->raid_conf = conf; 1434 nsh->raid_conf = conf;
1331 spin_lock_init(&nsh->lock); 1435 spin_lock_init(&nsh->lock);
1436 #ifdef CONFIG_MULTICORE_RAID456
1437 init_waitqueue_head(&nsh->ops.wait_for_ops);
1438 #endif
1332 1439
1333 list_add(&nsh->lru, &newstripes); 1440 list_add(&nsh->lru, &newstripes);
1334 } 1441 }
@@ -1429,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
1429 if (!sh) 1536 if (!sh)
1430 return 0; 1537 return 0;
1431 BUG_ON(atomic_read(&sh->count)); 1538 BUG_ON(atomic_read(&sh->count));
1432 shrink_buffers(sh, conf->pool_size); 1539 shrink_buffers(sh);
1433 kmem_cache_free(conf->slab_cache, sh); 1540 kmem_cache_free(conf->slab_cache, sh);
1434 atomic_dec(&conf->active_stripes); 1541 atomic_dec(&conf->active_stripes);
1435 return 1; 1542 return 1;
@@ -1471,7 +1578,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1471 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1578 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1472 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1579 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1473 rdev = conf->disks[i].rdev; 1580 rdev = conf->disks[i].rdev;
1474 printk_rl(KERN_INFO "raid5:%s: read error corrected" 1581 printk_rl(KERN_INFO "md/raid:%s: read error corrected"
1475 " (%lu sectors at %llu on %s)\n", 1582 " (%lu sectors at %llu on %s)\n",
1476 mdname(conf->mddev), STRIPE_SECTORS, 1583 mdname(conf->mddev), STRIPE_SECTORS,
1477 (unsigned long long)(sh->sector 1584 (unsigned long long)(sh->sector
@@ -1489,9 +1596,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1489 1596
1490 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1597 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1491 atomic_inc(&rdev->read_errors); 1598 atomic_inc(&rdev->read_errors);
1492 if (conf->mddev->degraded) 1599 if (conf->mddev->degraded >= conf->max_degraded)
1493 printk_rl(KERN_WARNING 1600 printk_rl(KERN_WARNING
1494 "raid5:%s: read error not correctable " 1601 "md/raid:%s: read error not correctable "
1495 "(sector %llu on %s).\n", 1602 "(sector %llu on %s).\n",
1496 mdname(conf->mddev), 1603 mdname(conf->mddev),
1497 (unsigned long long)(sh->sector 1604 (unsigned long long)(sh->sector
@@ -1500,7 +1607,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1500 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1607 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1501 /* Oh, no!!! */ 1608 /* Oh, no!!! */
1502 printk_rl(KERN_WARNING 1609 printk_rl(KERN_WARNING
1503 "raid5:%s: read error NOT corrected!! " 1610 "md/raid:%s: read error NOT corrected!! "
1504 "(sector %llu on %s).\n", 1611 "(sector %llu on %s).\n",
1505 mdname(conf->mddev), 1612 mdname(conf->mddev),
1506 (unsigned long long)(sh->sector 1613 (unsigned long long)(sh->sector
@@ -1509,7 +1616,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1509 else if (atomic_read(&rdev->read_errors) 1616 else if (atomic_read(&rdev->read_errors)
1510 > conf->max_nr_stripes) 1617 > conf->max_nr_stripes)
1511 printk(KERN_WARNING 1618 printk(KERN_WARNING
1512 "raid5:%s: Too many read errors, failing device %s.\n", 1619 "md/raid:%s: Too many read errors, failing device %s.\n",
1513 mdname(conf->mddev), bdn); 1620 mdname(conf->mddev), bdn);
1514 else 1621 else
1515 retry = 1; 1622 retry = 1;
@@ -1581,8 +1688,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1581static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1688static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1582{ 1689{
1583 char b[BDEVNAME_SIZE]; 1690 char b[BDEVNAME_SIZE];
1584 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1691 raid5_conf_t *conf = mddev->private;
1585 pr_debug("raid5: error called\n"); 1692 pr_debug("raid456: error called\n");
1586 1693
1587 if (!test_bit(Faulty, &rdev->flags)) { 1694 if (!test_bit(Faulty, &rdev->flags)) {
1588 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1695 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1598,9 +1705,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1598 } 1705 }
1599 set_bit(Faulty, &rdev->flags); 1706 set_bit(Faulty, &rdev->flags);
1600 printk(KERN_ALERT 1707 printk(KERN_ALERT
1601 "raid5: Disk failure on %s, disabling device.\n" 1708 "md/raid:%s: Disk failure on %s, disabling device.\n"
1602 "raid5: Operation continuing on %d devices.\n", 1709 KERN_ALERT
1603 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1710 "md/raid:%s: Operation continuing on %d devices.\n",
1711 mdname(mddev),
1712 bdevname(rdev->bdev, b),
1713 mdname(mddev),
1714 conf->raid_disks - mddev->degraded);
1604 } 1715 }
1605} 1716}
1606 1717
@@ -1612,8 +1723,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1612 int previous, int *dd_idx, 1723 int previous, int *dd_idx,
1613 struct stripe_head *sh) 1724 struct stripe_head *sh)
1614{ 1725{
1615 long stripe; 1726 sector_t stripe, stripe2;
1616 unsigned long chunk_number; 1727 sector_t chunk_number;
1617 unsigned int chunk_offset; 1728 unsigned int chunk_offset;
1618 int pd_idx, qd_idx; 1729 int pd_idx, qd_idx;
1619 int ddf_layout = 0; 1730 int ddf_layout = 0;
@@ -1633,18 +1744,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1633 */ 1744 */
1634 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1745 chunk_offset = sector_div(r_sector, sectors_per_chunk);
1635 chunk_number = r_sector; 1746 chunk_number = r_sector;
1636 BUG_ON(r_sector != chunk_number);
1637 1747
1638 /* 1748 /*
1639 * Compute the stripe number 1749 * Compute the stripe number
1640 */ 1750 */
1641 stripe = chunk_number / data_disks; 1751 stripe = chunk_number;
1642 1752 *dd_idx = sector_div(stripe, data_disks);
1643 /* 1753 stripe2 = stripe;
1644 * Compute the data disk and parity disk indexes inside the stripe
1645 */
1646 *dd_idx = chunk_number % data_disks;
1647
1648 /* 1754 /*
1649 * Select the parity disk based on the user selected algorithm. 1755 * Select the parity disk based on the user selected algorithm.
1650 */ 1756 */
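Making stripe and chunk_number sector_t means every division now goes through sector_div(), which divides a 64-bit value in place and returns the 32-bit remainder, something a 32-bit kernel cannot get from a plain % on a 64-bit operand; stripe2 keeps an unmodified copy for the parity-rotation divisions that follow. A user-space stand-in for the helper and for the dd_idx/pd_idx computation (the parity formula used is the LEFT_ASYMMETRIC case from the hunk below):

#include <stdio.h>

typedef unsigned long long sector_t;

/* Same contract as the kernel's sector_div(): divide *v in place by
 * 'base' and return the remainder.
 */
static unsigned int sector_div_like(sector_t *v, unsigned int base)
{
    unsigned int rem = (unsigned int)(*v % base);

    *v /= base;
    return rem;
}

int main(void)
{
    sector_t chunk_number = 1000003;    /* chunk index inside the array */
    int data_disks = 7, raid_disks = 8;
    sector_t stripe, stripe2;
    int dd_idx, pd_idx;

    stripe = chunk_number;
    dd_idx = sector_div_like(&stripe, data_disks);   /* data disk inside the stripe */
    stripe2 = stripe;                                /* preserved for the next division */

    /* ALGORITHM_LEFT_ASYMMETRIC-style parity rotation */
    pd_idx = data_disks - sector_div_like(&stripe2, raid_disks);
    if (dd_idx >= pd_idx)
        dd_idx++;

    printf("stripe %llu dd_idx %d pd_idx %d\n", stripe, dd_idx, pd_idx);
    return 0;
}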
@@ -1656,21 +1762,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1656 case 5: 1762 case 5:
1657 switch (algorithm) { 1763 switch (algorithm) {
1658 case ALGORITHM_LEFT_ASYMMETRIC: 1764 case ALGORITHM_LEFT_ASYMMETRIC:
1659 pd_idx = data_disks - stripe % raid_disks; 1765 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1660 if (*dd_idx >= pd_idx) 1766 if (*dd_idx >= pd_idx)
1661 (*dd_idx)++; 1767 (*dd_idx)++;
1662 break; 1768 break;
1663 case ALGORITHM_RIGHT_ASYMMETRIC: 1769 case ALGORITHM_RIGHT_ASYMMETRIC:
1664 pd_idx = stripe % raid_disks; 1770 pd_idx = sector_div(stripe2, raid_disks);
1665 if (*dd_idx >= pd_idx) 1771 if (*dd_idx >= pd_idx)
1666 (*dd_idx)++; 1772 (*dd_idx)++;
1667 break; 1773 break;
1668 case ALGORITHM_LEFT_SYMMETRIC: 1774 case ALGORITHM_LEFT_SYMMETRIC:
1669 pd_idx = data_disks - stripe % raid_disks; 1775 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1670 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1776 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1671 break; 1777 break;
1672 case ALGORITHM_RIGHT_SYMMETRIC: 1778 case ALGORITHM_RIGHT_SYMMETRIC:
1673 pd_idx = stripe % raid_disks; 1779 pd_idx = sector_div(stripe2, raid_disks);
1674 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1780 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1675 break; 1781 break;
1676 case ALGORITHM_PARITY_0: 1782 case ALGORITHM_PARITY_0:
@@ -1681,8 +1787,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1681 pd_idx = data_disks; 1787 pd_idx = data_disks;
1682 break; 1788 break;
1683 default: 1789 default:
1684 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1685 algorithm);
1686 BUG(); 1790 BUG();
1687 } 1791 }
1688 break; 1792 break;
@@ -1690,7 +1794,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1690 1794
1691 switch (algorithm) { 1795 switch (algorithm) {
1692 case ALGORITHM_LEFT_ASYMMETRIC: 1796 case ALGORITHM_LEFT_ASYMMETRIC:
1693 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1797 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1694 qd_idx = pd_idx + 1; 1798 qd_idx = pd_idx + 1;
1695 if (pd_idx == raid_disks-1) { 1799 if (pd_idx == raid_disks-1) {
1696 (*dd_idx)++; /* Q D D D P */ 1800 (*dd_idx)++; /* Q D D D P */
@@ -1699,7 +1803,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1699 (*dd_idx) += 2; /* D D P Q D */ 1803 (*dd_idx) += 2; /* D D P Q D */
1700 break; 1804 break;
1701 case ALGORITHM_RIGHT_ASYMMETRIC: 1805 case ALGORITHM_RIGHT_ASYMMETRIC:
1702 pd_idx = stripe % raid_disks; 1806 pd_idx = sector_div(stripe2, raid_disks);
1703 qd_idx = pd_idx + 1; 1807 qd_idx = pd_idx + 1;
1704 if (pd_idx == raid_disks-1) { 1808 if (pd_idx == raid_disks-1) {
1705 (*dd_idx)++; /* Q D D D P */ 1809 (*dd_idx)++; /* Q D D D P */
@@ -1708,12 +1812,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1708 (*dd_idx) += 2; /* D D P Q D */ 1812 (*dd_idx) += 2; /* D D P Q D */
1709 break; 1813 break;
1710 case ALGORITHM_LEFT_SYMMETRIC: 1814 case ALGORITHM_LEFT_SYMMETRIC:
1711 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1815 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1712 qd_idx = (pd_idx + 1) % raid_disks; 1816 qd_idx = (pd_idx + 1) % raid_disks;
1713 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1817 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1714 break; 1818 break;
1715 case ALGORITHM_RIGHT_SYMMETRIC: 1819 case ALGORITHM_RIGHT_SYMMETRIC:
1716 pd_idx = stripe % raid_disks; 1820 pd_idx = sector_div(stripe2, raid_disks);
1717 qd_idx = (pd_idx + 1) % raid_disks; 1821 qd_idx = (pd_idx + 1) % raid_disks;
1718 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1822 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1719 break; 1823 break;
@@ -1732,7 +1836,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1732 /* Exactly the same as RIGHT_ASYMMETRIC, but or 1836 /* Exactly the same as RIGHT_ASYMMETRIC, but or
1733 * of blocks for computing Q is different. 1837 * of blocks for computing Q is different.
1734 */ 1838 */
1735 pd_idx = stripe % raid_disks; 1839 pd_idx = sector_div(stripe2, raid_disks);
1736 qd_idx = pd_idx + 1; 1840 qd_idx = pd_idx + 1;
1737 if (pd_idx == raid_disks-1) { 1841 if (pd_idx == raid_disks-1) {
1738 (*dd_idx)++; /* Q D D D P */ 1842 (*dd_idx)++; /* Q D D D P */
@@ -1747,7 +1851,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1747 * D D D P Q rather than 1851 * D D D P Q rather than
1748 * Q D D D P 1852 * Q D D D P
1749 */ 1853 */
1750 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); 1854 stripe2 += 1;
1855 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1751 qd_idx = pd_idx + 1; 1856 qd_idx = pd_idx + 1;
1752 if (pd_idx == raid_disks-1) { 1857 if (pd_idx == raid_disks-1) {
1753 (*dd_idx)++; /* Q D D D P */ 1858 (*dd_idx)++; /* Q D D D P */
@@ -1759,7 +1864,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1759 1864
1760 case ALGORITHM_ROTATING_N_CONTINUE: 1865 case ALGORITHM_ROTATING_N_CONTINUE:
1761 /* Same as left_symmetric but Q is before P */ 1866 /* Same as left_symmetric but Q is before P */
1762 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1867 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1763 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 1868 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1764 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1869 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1765 ddf_layout = 1; 1870 ddf_layout = 1;
@@ -1767,27 +1872,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1767 1872
1768 case ALGORITHM_LEFT_ASYMMETRIC_6: 1873 case ALGORITHM_LEFT_ASYMMETRIC_6:
1769 /* RAID5 left_asymmetric, with Q on last device */ 1874 /* RAID5 left_asymmetric, with Q on last device */
1770 pd_idx = data_disks - stripe % (raid_disks-1); 1875 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1771 if (*dd_idx >= pd_idx) 1876 if (*dd_idx >= pd_idx)
1772 (*dd_idx)++; 1877 (*dd_idx)++;
1773 qd_idx = raid_disks - 1; 1878 qd_idx = raid_disks - 1;
1774 break; 1879 break;
1775 1880
1776 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1881 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1777 pd_idx = stripe % (raid_disks-1); 1882 pd_idx = sector_div(stripe2, raid_disks-1);
1778 if (*dd_idx >= pd_idx) 1883 if (*dd_idx >= pd_idx)
1779 (*dd_idx)++; 1884 (*dd_idx)++;
1780 qd_idx = raid_disks - 1; 1885 qd_idx = raid_disks - 1;
1781 break; 1886 break;
1782 1887
1783 case ALGORITHM_LEFT_SYMMETRIC_6: 1888 case ALGORITHM_LEFT_SYMMETRIC_6:
1784 pd_idx = data_disks - stripe % (raid_disks-1); 1889 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1785 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1890 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1786 qd_idx = raid_disks - 1; 1891 qd_idx = raid_disks - 1;
1787 break; 1892 break;
1788 1893
1789 case ALGORITHM_RIGHT_SYMMETRIC_6: 1894 case ALGORITHM_RIGHT_SYMMETRIC_6:
1790 pd_idx = stripe % (raid_disks-1); 1895 pd_idx = sector_div(stripe2, raid_disks-1);
1791 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1896 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1792 qd_idx = raid_disks - 1; 1897 qd_idx = raid_disks - 1;
1793 break; 1898 break;
@@ -1798,10 +1903,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1798 qd_idx = raid_disks - 1; 1903 qd_idx = raid_disks - 1;
1799 break; 1904 break;
1800 1905
1801
1802 default: 1906 default:
1803 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1804 algorithm);
1805 BUG(); 1907 BUG();
1806 } 1908 }
1807 break; 1909 break;
@@ -1832,14 +1934,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1832 : conf->algorithm; 1934 : conf->algorithm;
1833 sector_t stripe; 1935 sector_t stripe;
1834 int chunk_offset; 1936 int chunk_offset;
1835 int chunk_number, dummy1, dd_idx = i; 1937 sector_t chunk_number;
1938 int dummy1, dd_idx = i;
1836 sector_t r_sector; 1939 sector_t r_sector;
1837 struct stripe_head sh2; 1940 struct stripe_head sh2;
1838 1941
1839 1942
1840 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1943 chunk_offset = sector_div(new_sector, sectors_per_chunk);
1841 stripe = new_sector; 1944 stripe = new_sector;
1842 BUG_ON(new_sector != stripe);
1843 1945
1844 if (i == sh->pd_idx) 1946 if (i == sh->pd_idx)
1845 return 0; 1947 return 0;
@@ -1864,8 +1966,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1864 case ALGORITHM_PARITY_N: 1966 case ALGORITHM_PARITY_N:
1865 break; 1967 break;
1866 default: 1968 default:
1867 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1868 algorithm);
1869 BUG(); 1969 BUG();
1870 } 1970 }
1871 break; 1971 break;
@@ -1899,10 +1999,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1899 case ALGORITHM_PARITY_N: 1999 case ALGORITHM_PARITY_N:
1900 break; 2000 break;
1901 case ALGORITHM_ROTATING_N_CONTINUE: 2001 case ALGORITHM_ROTATING_N_CONTINUE:
2002 /* Like left_symmetric, but P is before Q */
1902 if (sh->pd_idx == 0) 2003 if (sh->pd_idx == 0)
1903 i--; /* P D D D Q */ 2004 i--; /* P D D D Q */
1904 else if (i > sh->pd_idx) 2005 else {
1905 i -= 2; /* D D Q P D */ 2006 /* D D Q P D */
2007 if (i < sh->pd_idx)
2008 i += raid_disks;
2009 i -= (sh->pd_idx + 1);
2010 }
1906 break; 2011 break;
1907 case ALGORITHM_LEFT_ASYMMETRIC_6: 2012 case ALGORITHM_LEFT_ASYMMETRIC_6:
1908 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2013 case ALGORITHM_RIGHT_ASYMMETRIC_6:
@@ -1919,21 +2024,20 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1919 i -= 1; 2024 i -= 1;
1920 break; 2025 break;
1921 default: 2026 default:
1922 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1923 algorithm);
1924 BUG(); 2027 BUG();
1925 } 2028 }
1926 break; 2029 break;
1927 } 2030 }
1928 2031
1929 chunk_number = stripe * data_disks + i; 2032 chunk_number = stripe * data_disks + i;
1930 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 2033 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
1931 2034
1932 check = raid5_compute_sector(conf, r_sector, 2035 check = raid5_compute_sector(conf, r_sector,
1933 previous, &dummy1, &sh2); 2036 previous, &dummy1, &sh2);
1934 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2037 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1935 || sh2.qd_idx != sh->qd_idx) { 2038 || sh2.qd_idx != sh->qd_idx) {
1936 printk(KERN_ERR "compute_blocknr: map not correct\n"); 2039 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2040 mdname(conf->mddev));
1937 return 0; 2041 return 0;
1938 } 2042 }
1939 return r_sector; 2043 return r_sector;
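Two related changes run through compute_blocknr() above: chunk_number is now a sector_t, and the BUG_ON(new_sector != stripe) guard is gone. With a 32-bit chunk_number, stripe * data_disks + i wraps once the array holds more than 2^31 chunks, so the reverse mapping would hand back the wrong r_sector on very large arrays. A standalone illustration of the wrap (numbers chosen only for the demo):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t stripe = 1200000000ULL;   /* stripe index on a very large array */
        int data_disks = 4, i = 3;
        int sectors_per_chunk = 128, chunk_offset = 5;

        uint64_t chunk_number = stripe * (uint64_t)data_disks + i;
        uint32_t truncated = (uint32_t)chunk_number;   /* the old 'int' behaviour */

        printf("64-bit chunk_number %llu -> r_sector %llu\n",
               (unsigned long long)chunk_number,
               (unsigned long long)(chunk_number * sectors_per_chunk + chunk_offset));
        printf("32-bit chunk_number %u (wrapped: wrong block)\n", truncated);
        return 0;
}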
@@ -2896,7 +3000,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2896 * 3000 *
2897 */ 3001 */
2898 3002
2899static bool handle_stripe5(struct stripe_head *sh) 3003static void handle_stripe5(struct stripe_head *sh)
2900{ 3004{
2901 raid5_conf_t *conf = sh->raid_conf; 3005 raid5_conf_t *conf = sh->raid_conf;
2902 int disks = sh->disks, i; 3006 int disks = sh->disks, i;
@@ -2905,6 +3009,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2905 struct r5dev *dev; 3009 struct r5dev *dev;
2906 mdk_rdev_t *blocked_rdev = NULL; 3010 mdk_rdev_t *blocked_rdev = NULL;
2907 int prexor; 3011 int prexor;
3012 int dec_preread_active = 0;
2908 3013
2909 memset(&s, 0, sizeof(s)); 3014 memset(&s, 0, sizeof(s));
2910 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 3015 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -2926,7 +3031,6 @@ static bool handle_stripe5(struct stripe_head *sh)
2926 mdk_rdev_t *rdev; 3031 mdk_rdev_t *rdev;
2927 3032
2928 dev = &sh->dev[i]; 3033 dev = &sh->dev[i];
2929 clear_bit(R5_Insync, &dev->flags);
2930 3034
2931 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3035 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2932 "written %p\n", i, dev->flags, dev->toread, dev->read, 3036 "written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -2963,17 +3067,27 @@ static bool handle_stripe5(struct stripe_head *sh)
2963 blocked_rdev = rdev; 3067 blocked_rdev = rdev;
2964 atomic_inc(&rdev->nr_pending); 3068 atomic_inc(&rdev->nr_pending);
2965 } 3069 }
2966 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3070 clear_bit(R5_Insync, &dev->flags);
3071 if (!rdev)
3072 /* Not in-sync */;
3073 else if (test_bit(In_sync, &rdev->flags))
3074 set_bit(R5_Insync, &dev->flags);
3075 else {
3076 /* could be in-sync depending on recovery/reshape status */
3077 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3078 set_bit(R5_Insync, &dev->flags);
3079 }
3080 if (!test_bit(R5_Insync, &dev->flags)) {
2967 /* The ReadError flag will just be confusing now */ 3081 /* The ReadError flag will just be confusing now */
2968 clear_bit(R5_ReadError, &dev->flags); 3082 clear_bit(R5_ReadError, &dev->flags);
2969 clear_bit(R5_ReWrite, &dev->flags); 3083 clear_bit(R5_ReWrite, &dev->flags);
2970 } 3084 }
2971 if (!rdev || !test_bit(In_sync, &rdev->flags) 3085 if (test_bit(R5_ReadError, &dev->flags))
2972 || test_bit(R5_ReadError, &dev->flags)) { 3086 clear_bit(R5_Insync, &dev->flags);
3087 if (!test_bit(R5_Insync, &dev->flags)) {
2973 s.failed++; 3088 s.failed++;
2974 s.failed_num = i; 3089 s.failed_num = i;
2975 } else 3090 }
2976 set_bit(R5_Insync, &dev->flags);
2977 } 3091 }
2978 rcu_read_unlock(); 3092 rcu_read_unlock();
2979 3093
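The rewritten per-device loop above changes what counts as a failed device: a drive that is still recovering is no longer written off wholesale; R5_Insync is set whenever the stripe lies entirely below the device's recovery_offset, i.e. in the region that has already been rebuilt. The decision reduces to a small predicate, sketched here out of context (field names follow the kernel structs, the helper itself is hypothetical):

#include <stdbool.h>
#include <stdint.h>

#define STRIPE_SECTORS 8        /* 4KiB stripes, as in raid5.h on 4KiB pages */

struct rdev_view {
        bool     present;          /* rdev != NULL             */
        bool     in_sync;          /* In_sync flag set         */
        uint64_t recovery_offset;  /* sectors recovered so far */
};

/* Can this device serve the block it holds for the given stripe? */
static bool dev_insync_for_stripe(const struct rdev_view *rdev,
                                  uint64_t stripe_sector)
{
        if (!rdev->present)
                return false;
        if (rdev->in_sync)
                return true;
        /* partially recovered: usable below recovery_offset */
        return stripe_sector + STRIPE_SECTORS <= rdev->recovery_offset;
}

int main(void)
{
        struct rdev_view recovering = { true, false, 1024 };

        return dev_insync_for_stripe(&recovering, 512) ? 0 : 1;   /* 0: usable */
}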
@@ -3054,12 +3168,8 @@ static bool handle_stripe5(struct stripe_head *sh)
3054 set_bit(STRIPE_INSYNC, &sh->state); 3168 set_bit(STRIPE_INSYNC, &sh->state);
3055 } 3169 }
3056 } 3170 }
3057 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3171 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3058 atomic_dec(&conf->preread_active_stripes); 3172 dec_preread_active = 1;
3059 if (atomic_read(&conf->preread_active_stripes) <
3060 IO_THRESHOLD)
3061 md_wakeup_thread(conf->mddev->thread);
3062 }
3063 } 3173 }
3064 3174
3065 /* Now to consider new write requests and what else, if anything 3175 /* Now to consider new write requests and what else, if anything
@@ -3166,12 +3276,20 @@ static bool handle_stripe5(struct stripe_head *sh)
3166 3276
3167 ops_run_io(sh, &s); 3277 ops_run_io(sh, &s);
3168 3278
3279 if (dec_preread_active) {
3280 /* We delay this until after ops_run_io so that if make_request
3281 * is waiting on a barrier, it won't continue until the writes
3282 * have actually been submitted.
3283 */
3284 atomic_dec(&conf->preread_active_stripes);
3285 if (atomic_read(&conf->preread_active_stripes) <
3286 IO_THRESHOLD)
3287 md_wakeup_thread(conf->mddev->thread);
3288 }
3169 return_io(return_bi); 3289 return_io(return_bi);
3170
3171 return blocked_rdev == NULL;
3172} 3290}
3173 3291
3174static bool handle_stripe6(struct stripe_head *sh) 3292static void handle_stripe6(struct stripe_head *sh)
3175{ 3293{
3176 raid5_conf_t *conf = sh->raid_conf; 3294 raid5_conf_t *conf = sh->raid_conf;
3177 int disks = sh->disks; 3295 int disks = sh->disks;
@@ -3181,6 +3299,7 @@ static bool handle_stripe6(struct stripe_head *sh)
3181 struct r6_state r6s; 3299 struct r6_state r6s;
3182 struct r5dev *dev, *pdev, *qdev; 3300 struct r5dev *dev, *pdev, *qdev;
3183 mdk_rdev_t *blocked_rdev = NULL; 3301 mdk_rdev_t *blocked_rdev = NULL;
3302 int dec_preread_active = 0;
3184 3303
3185 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3304 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3186 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3305 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3202,7 +3321,6 @@ static bool handle_stripe6(struct stripe_head *sh)
3202 for (i=disks; i--; ) { 3321 for (i=disks; i--; ) {
3203 mdk_rdev_t *rdev; 3322 mdk_rdev_t *rdev;
3204 dev = &sh->dev[i]; 3323 dev = &sh->dev[i];
3205 clear_bit(R5_Insync, &dev->flags);
3206 3324
3207 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3325 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3208 i, dev->flags, dev->toread, dev->towrite, dev->written); 3326 i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3240,18 +3358,28 @@ static bool handle_stripe6(struct stripe_head *sh)
3240 blocked_rdev = rdev; 3358 blocked_rdev = rdev;
3241 atomic_inc(&rdev->nr_pending); 3359 atomic_inc(&rdev->nr_pending);
3242 } 3360 }
3243 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3361 clear_bit(R5_Insync, &dev->flags);
3362 if (!rdev)
3363 /* Not in-sync */;
3364 else if (test_bit(In_sync, &rdev->flags))
3365 set_bit(R5_Insync, &dev->flags);
3366 else {
3367 /* in sync if before recovery_offset */
3368 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3369 set_bit(R5_Insync, &dev->flags);
3370 }
3371 if (!test_bit(R5_Insync, &dev->flags)) {
3244 /* The ReadError flag will just be confusing now */ 3372 /* The ReadError flag will just be confusing now */
3245 clear_bit(R5_ReadError, &dev->flags); 3373 clear_bit(R5_ReadError, &dev->flags);
3246 clear_bit(R5_ReWrite, &dev->flags); 3374 clear_bit(R5_ReWrite, &dev->flags);
3247 } 3375 }
3248 if (!rdev || !test_bit(In_sync, &rdev->flags) 3376 if (test_bit(R5_ReadError, &dev->flags))
3249 || test_bit(R5_ReadError, &dev->flags)) { 3377 clear_bit(R5_Insync, &dev->flags);
3378 if (!test_bit(R5_Insync, &dev->flags)) {
3250 if (s.failed < 2) 3379 if (s.failed < 2)
3251 r6s.failed_num[s.failed] = i; 3380 r6s.failed_num[s.failed] = i;
3252 s.failed++; 3381 s.failed++;
3253 } else 3382 }
3254 set_bit(R5_Insync, &dev->flags);
3255 } 3383 }
3256 rcu_read_unlock(); 3384 rcu_read_unlock();
3257 3385
@@ -3318,7 +3446,6 @@ static bool handle_stripe6(struct stripe_head *sh)
3318 * completed 3446 * completed
3319 */ 3447 */
3320 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3448 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3321 int qd_idx = sh->qd_idx;
3322 3449
3323 sh->reconstruct_state = reconstruct_state_idle; 3450 sh->reconstruct_state = reconstruct_state_idle;
3324 /* All the 'written' buffers and the parity blocks are ready to 3451 /* All the 'written' buffers and the parity blocks are ready to
@@ -3340,12 +3467,8 @@ static bool handle_stripe6(struct stripe_head *sh)
3340 set_bit(STRIPE_INSYNC, &sh->state); 3467 set_bit(STRIPE_INSYNC, &sh->state);
3341 } 3468 }
3342 } 3469 }
3343 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3470 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3344 atomic_dec(&conf->preread_active_stripes); 3471 dec_preread_active = 1;
3345 if (atomic_read(&conf->preread_active_stripes) <
3346 IO_THRESHOLD)
3347 md_wakeup_thread(conf->mddev->thread);
3348 }
3349 } 3472 }
3350 3473
3351 /* Now to consider new write requests and what else, if anything 3474 /* Now to consider new write requests and what else, if anything
@@ -3454,18 +3577,27 @@ static bool handle_stripe6(struct stripe_head *sh)
3454 3577
3455 ops_run_io(sh, &s); 3578 ops_run_io(sh, &s);
3456 3579
3457 return_io(return_bi);
3458 3580
3459 return blocked_rdev == NULL; 3581 if (dec_preread_active) {
3582 /* We delay this until after ops_run_io so that if make_request
3583 * is waiting on a barrier, it won't continue until the writes
3584 * have actually been submitted.
3585 */
3586 atomic_dec(&conf->preread_active_stripes);
3587 if (atomic_read(&conf->preread_active_stripes) <
3588 IO_THRESHOLD)
3589 md_wakeup_thread(conf->mddev->thread);
3590 }
3591
3592 return_io(return_bi);
3460} 3593}
3461 3594
3462/* returns true if the stripe was handled */ 3595static void handle_stripe(struct stripe_head *sh)
3463static bool handle_stripe(struct stripe_head *sh)
3464{ 3596{
3465 if (sh->raid_conf->level == 6) 3597 if (sh->raid_conf->level == 6)
3466 return handle_stripe6(sh); 3598 handle_stripe6(sh);
3467 else 3599 else
3468 return handle_stripe5(sh); 3600 handle_stripe5(sh);
3469} 3601}
3470 3602
3471static void raid5_activate_delayed(raid5_conf_t *conf) 3603static void raid5_activate_delayed(raid5_conf_t *conf)
@@ -3503,9 +3635,10 @@ static void unplug_slaves(mddev_t *mddev)
3503{ 3635{
3504 raid5_conf_t *conf = mddev->private; 3636 raid5_conf_t *conf = mddev->private;
3505 int i; 3637 int i;
3638 int devs = max(conf->raid_disks, conf->previous_raid_disks);
3506 3639
3507 rcu_read_lock(); 3640 rcu_read_lock();
3508 for (i = 0; i < conf->raid_disks; i++) { 3641 for (i = 0; i < devs; i++) {
3509 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3642 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3510 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3643 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3511 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3644 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -3659,10 +3792,10 @@ static void raid5_align_endio(struct bio *bi, int error)
3659 3792
3660 bio_put(bi); 3793 bio_put(bi);
3661 3794
3662 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3663 conf = mddev->private;
3664 rdev = (void*)raid_bi->bi_next; 3795 rdev = (void*)raid_bi->bi_next;
3665 raid_bi->bi_next = NULL; 3796 raid_bi->bi_next = NULL;
3797 mddev = rdev->mddev;
3798 conf = mddev->private;
3666 3799
3667 rdev_dec_pending(rdev, conf->mddev); 3800 rdev_dec_pending(rdev, conf->mddev);
3668 3801
@@ -3686,7 +3819,7 @@ static int bio_fits_rdev(struct bio *bi)
3686 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3819 if ((bi->bi_size>>9) > queue_max_sectors(q))
3687 return 0; 3820 return 0;
3688 blk_recount_segments(q, bi); 3821 blk_recount_segments(q, bi);
3689 if (bi->bi_phys_segments > queue_max_phys_segments(q)) 3822 if (bi->bi_phys_segments > queue_max_segments(q))
3690 return 0; 3823 return 0;
3691 3824
3692 if (q->merge_bvec_fn) 3825 if (q->merge_bvec_fn)
@@ -3699,11 +3832,10 @@ static int bio_fits_rdev(struct bio *bi)
3699} 3832}
3700 3833
3701 3834
3702static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3835static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3703{ 3836{
3704 mddev_t *mddev = q->queuedata;
3705 raid5_conf_t *conf = mddev->private; 3837 raid5_conf_t *conf = mddev->private;
3706 unsigned int dd_idx; 3838 int dd_idx;
3707 struct bio* align_bi; 3839 struct bio* align_bi;
3708 mdk_rdev_t *rdev; 3840 mdk_rdev_t *rdev;
3709 3841
@@ -3816,33 +3948,32 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3816 return sh; 3948 return sh;
3817} 3949}
3818 3950
3819static int make_request(struct request_queue *q, struct bio * bi) 3951static int make_request(mddev_t *mddev, struct bio * bi)
3820{ 3952{
3821 mddev_t *mddev = q->queuedata;
3822 raid5_conf_t *conf = mddev->private; 3953 raid5_conf_t *conf = mddev->private;
3823 int dd_idx; 3954 int dd_idx;
3824 sector_t new_sector; 3955 sector_t new_sector;
3825 sector_t logical_sector, last_sector; 3956 sector_t logical_sector, last_sector;
3826 struct stripe_head *sh; 3957 struct stripe_head *sh;
3827 const int rw = bio_data_dir(bi); 3958 const int rw = bio_data_dir(bi);
3828 int cpu, remaining; 3959 int remaining;
3829 3960
3830 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3961 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3831 bio_endio(bi, -EOPNOTSUPP); 3962 /* Drain all pending writes. We only really need
3963 * to ensure they have been submitted, but this is
3964 * easier.
3965 */
3966 mddev->pers->quiesce(mddev, 1);
3967 mddev->pers->quiesce(mddev, 0);
3968 md_barrier_request(mddev, bi);
3832 return 0; 3969 return 0;
3833 } 3970 }
3834 3971
3835 md_write_start(mddev, bi); 3972 md_write_start(mddev, bi);
3836 3973
3837 cpu = part_stat_lock();
3838 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
3839 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
3840 bio_sectors(bi));
3841 part_stat_unlock();
3842
3843 if (rw == READ && 3974 if (rw == READ &&
3844 mddev->reshape_position == MaxSector && 3975 mddev->reshape_position == MaxSector &&
3845 chunk_aligned_read(q,bi)) 3976 chunk_aligned_read(mddev,bi))
3846 return 0; 3977 return 0;
3847 3978
3848 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3979 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
@@ -3890,7 +4021,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3890 new_sector = raid5_compute_sector(conf, logical_sector, 4021 new_sector = raid5_compute_sector(conf, logical_sector,
3891 previous, 4022 previous,
3892 &dd_idx, NULL); 4023 &dd_idx, NULL);
3893 pr_debug("raid5: make_request, sector %llu logical %llu\n", 4024 pr_debug("raid456: make_request, sector %llu logical %llu\n",
3894 (unsigned long long)new_sector, 4025 (unsigned long long)new_sector,
3895 (unsigned long long)logical_sector); 4026 (unsigned long long)logical_sector);
3896 4027
@@ -3952,6 +4083,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3952 finish_wait(&conf->wait_for_overlap, &w); 4083 finish_wait(&conf->wait_for_overlap, &w);
3953 set_bit(STRIPE_HANDLE, &sh->state); 4084 set_bit(STRIPE_HANDLE, &sh->state);
3954 clear_bit(STRIPE_DELAYED, &sh->state); 4085 clear_bit(STRIPE_DELAYED, &sh->state);
4086 if (mddev->barrier &&
4087 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4088 atomic_inc(&conf->preread_active_stripes);
3955 release_stripe(sh); 4089 release_stripe(sh);
3956 } else { 4090 } else {
3957 /* cannot get stripe for read-ahead, just give-up */ 4091 /* cannot get stripe for read-ahead, just give-up */
@@ -3971,6 +4105,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
3971 4105
3972 bio_endio(bi, 0); 4106 bio_endio(bi, 0);
3973 } 4107 }
4108
4109 if (mddev->barrier) {
4110 /* We need to wait for the stripes to all be handled.
4111 * So: wait for preread_active_stripes to drop to 0.
4112 */
4113 wait_event(mddev->thread->wqueue,
4114 atomic_read(&conf->preread_active_stripes) == 0);
4115 }
3974 return 0; 4116 return 0;
3975} 4117}
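Taken together with the dec_preread_active change in handle_stripe5/6 above, the barrier path in make_request now works by reference counting: while mddev->barrier is set, every stripe touched takes STRIPE_PREREAD_ACTIVE (bumping preread_active_stripes), handle_stripe drops that count only after ops_run_io() has issued the writes, and the wait_event() at the end of make_request blocks until the count reaches zero, so the barrier cannot proceed before the preceding writes are really on their way. The counting protocol in miniature (illustrative C11, not the kernel's primitives):

#include <assert.h>
#include <stdatomic.h>

/* Stripes whose writes are queued but not yet submitted to the devices. */
static atomic_int preread_active;

static void make_request_side(int nr_stripes)
{
        /* For a barrier request, every stripe touched takes a reference... */
        for (int i = 0; i < nr_stripes; i++)
                atomic_fetch_add(&preread_active, 1);
}

static void handle_stripe_side(void)
{
        /* ...which is dropped only after ops_run_io() has issued the I/O. */
        atomic_fetch_sub(&preread_active, 1);
}

int main(void)
{
        make_request_side(3);
        assert(atomic_load(&preread_active) == 3);   /* wait_event() would block here */

        for (int i = 0; i < 3; i++)
                handle_stripe_side();
        assert(atomic_load(&preread_active) == 0);   /* now the barrier may complete */
        return 0;
}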
3976 4118
@@ -3987,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3987 * As the reads complete, handle_stripe will copy the data 4129 * As the reads complete, handle_stripe will copy the data
3988 * into the destination stripe and release that stripe. 4130 * into the destination stripe and release that stripe.
3989 */ 4131 */
3990 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4132 raid5_conf_t *conf = mddev->private;
3991 struct stripe_head *sh; 4133 struct stripe_head *sh;
3992 sector_t first_sector, last_sector; 4134 sector_t first_sector, last_sector;
3993 int raid_disks = conf->previous_raid_disks; 4135 int raid_disks = conf->previous_raid_disks;
@@ -4011,6 +4153,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4011 sector_nr = conf->reshape_progress; 4153 sector_nr = conf->reshape_progress;
4012 sector_div(sector_nr, new_data_disks); 4154 sector_div(sector_nr, new_data_disks);
4013 if (sector_nr) { 4155 if (sector_nr) {
4156 mddev->curr_resync_completed = sector_nr;
4157 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4014 *skipped = 1; 4158 *skipped = 1;
4015 return sector_nr; 4159 return sector_nr;
4016 } 4160 }
@@ -4194,7 +4338,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4194/* FIXME go_faster isn't used */ 4338/* FIXME go_faster isn't used */
4195static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4339static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4196{ 4340{
4197 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4341 raid5_conf_t *conf = mddev->private;
4198 struct stripe_head *sh; 4342 struct stripe_head *sh;
4199 sector_t max_sector = mddev->dev_sectors; 4343 sector_t max_sector = mddev->dev_sectors;
4200 int sync_blocks; 4344 int sync_blocks;
@@ -4277,9 +4421,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4277 clear_bit(STRIPE_INSYNC, &sh->state); 4421 clear_bit(STRIPE_INSYNC, &sh->state);
4278 spin_unlock(&sh->lock); 4422 spin_unlock(&sh->lock);
4279 4423
4280 /* wait for any blocked device to be handled */ 4424 handle_stripe(sh);
4281 while (unlikely(!handle_stripe(sh)))
4282 ;
4283 release_stripe(sh); 4425 release_stripe(sh);
4284 4426
4285 return STRIPE_SECTORS; 4427 return STRIPE_SECTORS;
@@ -4349,37 +4491,6 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4349 return handled; 4491 return handled;
4350} 4492}
4351 4493
4352#ifdef CONFIG_MULTICORE_RAID456
4353static void __process_stripe(void *param, async_cookie_t cookie)
4354{
4355 struct stripe_head *sh = param;
4356
4357 handle_stripe(sh);
4358 release_stripe(sh);
4359}
4360
4361static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4362{
4363 async_schedule_domain(__process_stripe, sh, domain);
4364}
4365
4366static void synchronize_stripe_processing(struct list_head *domain)
4367{
4368 async_synchronize_full_domain(domain);
4369}
4370#else
4371static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4372{
4373 handle_stripe(sh);
4374 release_stripe(sh);
4375 cond_resched();
4376}
4377
4378static void synchronize_stripe_processing(struct list_head *domain)
4379{
4380}
4381#endif
4382
4383 4494
4384/* 4495/*
4385 * This is our raid5 kernel thread. 4496 * This is our raid5 kernel thread.
@@ -4393,7 +4504,6 @@ static void raid5d(mddev_t *mddev)
4393 struct stripe_head *sh; 4504 struct stripe_head *sh;
4394 raid5_conf_t *conf = mddev->private; 4505 raid5_conf_t *conf = mddev->private;
4395 int handled; 4506 int handled;
4396 LIST_HEAD(raid_domain);
4397 4507
4398 pr_debug("+++ raid5d active\n"); 4508 pr_debug("+++ raid5d active\n");
4399 4509
@@ -4430,7 +4540,9 @@ static void raid5d(mddev_t *mddev)
4430 spin_unlock_irq(&conf->device_lock); 4540 spin_unlock_irq(&conf->device_lock);
4431 4541
4432 handled++; 4542 handled++;
4433 process_stripe(sh, &raid_domain); 4543 handle_stripe(sh);
4544 release_stripe(sh);
4545 cond_resched();
4434 4546
4435 spin_lock_irq(&conf->device_lock); 4547 spin_lock_irq(&conf->device_lock);
4436 } 4548 }
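With the CONFIG_MULTICORE_RAID456 scheduling removed, raid5d is back to the classic work-loop shape visible above: pop a stripe from the list under device_lock, drop the lock for the actual handling, yield with cond_resched(), then re-take the lock before looking for more work. The same shape in user-space pthreads (a sketch of the pattern, not the kernel code):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct item { struct item *next; };

static struct item *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle(struct item *it)
{
        printf("handling %p\n", (void *)it);   /* stands in for handle_stripe() */
}

static void worker(void)
{
        pthread_mutex_lock(&list_lock);
        while (head) {
                struct item *it = head;

                head = it->next;
                pthread_mutex_unlock(&list_lock);

                handle(it);        /* the real work runs without the lock */
                sched_yield();     /* analogous to cond_resched()         */

                pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        struct item a = { NULL }, b = { &a };

        head = &b;
        worker();
        return 0;
}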
@@ -4438,7 +4550,6 @@ static void raid5d(mddev_t *mddev)
4438 4550
4439 spin_unlock_irq(&conf->device_lock); 4551 spin_unlock_irq(&conf->device_lock);
4440 4552
4441 synchronize_stripe_processing(&raid_domain);
4442 async_tx_issue_pending_all(); 4553 async_tx_issue_pending_all();
4443 unplug_slaves(mddev); 4554 unplug_slaves(mddev);
4444 4555
@@ -4558,13 +4669,9 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4558 4669
4559 if (!sectors) 4670 if (!sectors)
4560 sectors = mddev->dev_sectors; 4671 sectors = mddev->dev_sectors;
4561 if (!raid_disks) { 4672 if (!raid_disks)
4562 /* size is defined by the smallest of previous and new size */ 4673 /* size is defined by the smallest of previous and new size */
4563 if (conf->raid_disks < conf->previous_raid_disks) 4674 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4564 raid_disks = conf->raid_disks;
4565 else
4566 raid_disks = conf->previous_raid_disks;
4567 }
4568 4675
4569 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4676 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4570 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4677 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
@@ -4624,7 +4731,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4624 kfree(percpu->scribble); 4731 kfree(percpu->scribble);
4625 pr_err("%s: failed memory allocation for cpu%ld\n", 4732 pr_err("%s: failed memory allocation for cpu%ld\n",
4626 __func__, cpu); 4733 __func__, cpu);
4627 return NOTIFY_BAD; 4734 return notifier_from_errno(-ENOMEM);
4628 } 4735 }
4629 break; 4736 break;
4630 case CPU_DEAD: 4737 case CPU_DEAD:
@@ -4645,7 +4752,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
4645{ 4752{
4646 unsigned long cpu; 4753 unsigned long cpu;
4647 struct page *spare_page; 4754 struct page *spare_page;
4648 struct raid5_percpu *allcpus; 4755 struct raid5_percpu __percpu *allcpus;
4649 void *scribble; 4756 void *scribble;
4650 int err; 4757 int err;
4651 4758
@@ -4665,7 +4772,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
4665 } 4772 }
4666 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4773 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4667 } 4774 }
4668 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); 4775 scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4669 if (!scribble) { 4776 if (!scribble) {
4670 err = -ENOMEM; 4777 err = -ENOMEM;
4671 break; 4778 break;
@@ -4686,14 +4793,14 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
4686static raid5_conf_t *setup_conf(mddev_t *mddev) 4793static raid5_conf_t *setup_conf(mddev_t *mddev)
4687{ 4794{
4688 raid5_conf_t *conf; 4795 raid5_conf_t *conf;
4689 int raid_disk, memory; 4796 int raid_disk, memory, max_disks;
4690 mdk_rdev_t *rdev; 4797 mdk_rdev_t *rdev;
4691 struct disk_info *disk; 4798 struct disk_info *disk;
4692 4799
4693 if (mddev->new_level != 5 4800 if (mddev->new_level != 5
4694 && mddev->new_level != 4 4801 && mddev->new_level != 4
4695 && mddev->new_level != 6) { 4802 && mddev->new_level != 6) {
4696 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4803 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
4697 mdname(mddev), mddev->new_level); 4804 mdname(mddev), mddev->new_level);
4698 return ERR_PTR(-EIO); 4805 return ERR_PTR(-EIO);
4699 } 4806 }
@@ -4701,12 +4808,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4701 && !algorithm_valid_raid5(mddev->new_layout)) || 4808 && !algorithm_valid_raid5(mddev->new_layout)) ||
4702 (mddev->new_level == 6 4809 (mddev->new_level == 6
4703 && !algorithm_valid_raid6(mddev->new_layout))) { 4810 && !algorithm_valid_raid6(mddev->new_layout))) {
4704 printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4811 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
4705 mdname(mddev), mddev->new_layout); 4812 mdname(mddev), mddev->new_layout);
4706 return ERR_PTR(-EIO); 4813 return ERR_PTR(-EIO);
4707 } 4814 }
4708 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4815 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4709 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4816 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
4710 mdname(mddev), mddev->raid_disks); 4817 mdname(mddev), mddev->raid_disks);
4711 return ERR_PTR(-EINVAL); 4818 return ERR_PTR(-EINVAL);
4712 } 4819 }
@@ -4714,23 +4821,36 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4714 if (!mddev->new_chunk_sectors || 4821 if (!mddev->new_chunk_sectors ||
4715 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4822 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4716 !is_power_of_2(mddev->new_chunk_sectors)) { 4823 !is_power_of_2(mddev->new_chunk_sectors)) {
4717 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4824 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
4718 mddev->new_chunk_sectors << 9, mdname(mddev)); 4825 mdname(mddev), mddev->new_chunk_sectors << 9);
4719 return ERR_PTR(-EINVAL); 4826 return ERR_PTR(-EINVAL);
4720 } 4827 }
4721 4828
4722 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4829 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4723 if (conf == NULL) 4830 if (conf == NULL)
4724 goto abort; 4831 goto abort;
4832 spin_lock_init(&conf->device_lock);
4833 init_waitqueue_head(&conf->wait_for_stripe);
4834 init_waitqueue_head(&conf->wait_for_overlap);
4835 INIT_LIST_HEAD(&conf->handle_list);
4836 INIT_LIST_HEAD(&conf->hold_list);
4837 INIT_LIST_HEAD(&conf->delayed_list);
4838 INIT_LIST_HEAD(&conf->bitmap_list);
4839 INIT_LIST_HEAD(&conf->inactive_list);
4840 atomic_set(&conf->active_stripes, 0);
4841 atomic_set(&conf->preread_active_stripes, 0);
4842 atomic_set(&conf->active_aligned_reads, 0);
4843 conf->bypass_threshold = BYPASS_THRESHOLD;
4725 4844
4726 conf->raid_disks = mddev->raid_disks; 4845 conf->raid_disks = mddev->raid_disks;
4727 conf->scribble_len = scribble_len(conf->raid_disks);
4728 if (mddev->reshape_position == MaxSector) 4846 if (mddev->reshape_position == MaxSector)
4729 conf->previous_raid_disks = mddev->raid_disks; 4847 conf->previous_raid_disks = mddev->raid_disks;
4730 else 4848 else
4731 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4849 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4850 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
4851 conf->scribble_len = scribble_len(max_disks);
4732 4852
4733 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4853 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
4734 GFP_KERNEL); 4854 GFP_KERNEL);
4735 if (!conf->disks) 4855 if (!conf->disks)
4736 goto abort; 4856 goto abort;
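setup_conf() now sizes the per-device state by max_disks = max(raid_disks, previous_raid_disks) rather than by raid_disks alone, so conf->disks and the scribble region cover the larger of the pre- and post-reshape geometries whichever direction a reshape goes. A toy calculation of max_disks for both directions (values invented for illustration):

#include <stdio.h>

#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        /* growing 4 -> 6 devices */
        int raid_disks = 6, delta_disks = 2;
        int previous = raid_disks - delta_disks;

        printf("grow:   max_disks = %d\n", max(raid_disks, previous));   /* 6 */

        /* shrinking 6 -> 4 devices */
        raid_disks = 4;
        delta_disks = -2;
        previous = raid_disks - delta_disks;
        printf("shrink: max_disks = %d\n", max(raid_disks, previous));   /* 6 */
        return 0;
}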
@@ -4744,24 +4864,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4744 if (raid5_alloc_percpu(conf) != 0) 4864 if (raid5_alloc_percpu(conf) != 0)
4745 goto abort; 4865 goto abort;
4746 4866
4747 spin_lock_init(&conf->device_lock); 4867 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4748 init_waitqueue_head(&conf->wait_for_stripe);
4749 init_waitqueue_head(&conf->wait_for_overlap);
4750 INIT_LIST_HEAD(&conf->handle_list);
4751 INIT_LIST_HEAD(&conf->hold_list);
4752 INIT_LIST_HEAD(&conf->delayed_list);
4753 INIT_LIST_HEAD(&conf->bitmap_list);
4754 INIT_LIST_HEAD(&conf->inactive_list);
4755 atomic_set(&conf->active_stripes, 0);
4756 atomic_set(&conf->preread_active_stripes, 0);
4757 atomic_set(&conf->active_aligned_reads, 0);
4758 conf->bypass_threshold = BYPASS_THRESHOLD;
4759
4760 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4761 4868
4762 list_for_each_entry(rdev, &mddev->disks, same_set) { 4869 list_for_each_entry(rdev, &mddev->disks, same_set) {
4763 raid_disk = rdev->raid_disk; 4870 raid_disk = rdev->raid_disk;
4764 if (raid_disk >= conf->raid_disks 4871 if (raid_disk >= max_disks
4765 || raid_disk < 0) 4872 || raid_disk < 0)
4766 continue; 4873 continue;
4767 disk = conf->disks + raid_disk; 4874 disk = conf->disks + raid_disk;
@@ -4770,9 +4877,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4770 4877
4771 if (test_bit(In_sync, &rdev->flags)) { 4878 if (test_bit(In_sync, &rdev->flags)) {
4772 char b[BDEVNAME_SIZE]; 4879 char b[BDEVNAME_SIZE];
4773 printk(KERN_INFO "raid5: device %s operational as raid" 4880 printk(KERN_INFO "md/raid:%s: device %s operational as raid"
4774 " disk %d\n", bdevname(rdev->bdev,b), 4881 " disk %d\n",
4775 raid_disk); 4882 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
4776 } else 4883 } else
4777 /* Cannot rely on bitmap to complete recovery */ 4884 /* Cannot rely on bitmap to complete recovery */
4778 conf->fullsync = 1; 4885 conf->fullsync = 1;
@@ -4793,19 +4900,20 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4793 } 4900 }
4794 4901
4795 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4902 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4796 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4903 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4797 if (grow_stripes(conf, conf->max_nr_stripes)) { 4904 if (grow_stripes(conf, conf->max_nr_stripes)) {
4798 printk(KERN_ERR 4905 printk(KERN_ERR
4799 "raid5: couldn't allocate %dkB for buffers\n", memory); 4906 "md/raid:%s: couldn't allocate %dkB for buffers\n",
4907 mdname(mddev), memory);
4800 goto abort; 4908 goto abort;
4801 } else 4909 } else
4802 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4910 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
4803 memory, mdname(mddev)); 4911 mdname(mddev), memory);
4804 4912
4805 conf->thread = md_register_thread(raid5d, mddev, NULL); 4913 conf->thread = md_register_thread(raid5d, mddev, NULL);
4806 if (!conf->thread) { 4914 if (!conf->thread) {
4807 printk(KERN_ERR 4915 printk(KERN_ERR
4808 "raid5: couldn't allocate thread for %s\n", 4916 "md/raid:%s: couldn't allocate thread.\n",
4809 mdname(mddev)); 4917 mdname(mddev));
4810 goto abort; 4918 goto abort;
4811 } 4919 }
@@ -4820,14 +4928,43 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4820 return ERR_PTR(-ENOMEM); 4928 return ERR_PTR(-ENOMEM);
4821} 4929}
4822 4930
4931
4932static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
4933{
4934 switch (algo) {
4935 case ALGORITHM_PARITY_0:
4936 if (raid_disk < max_degraded)
4937 return 1;
4938 break;
4939 case ALGORITHM_PARITY_N:
4940 if (raid_disk >= raid_disks - max_degraded)
4941 return 1;
4942 break;
4943 case ALGORITHM_PARITY_0_6:
4944 if (raid_disk == 0 ||
4945 raid_disk == raid_disks - 1)
4946 return 1;
4947 break;
4948 case ALGORITHM_LEFT_ASYMMETRIC_6:
4949 case ALGORITHM_RIGHT_ASYMMETRIC_6:
4950 case ALGORITHM_LEFT_SYMMETRIC_6:
4951 case ALGORITHM_RIGHT_SYMMETRIC_6:
4952 if (raid_disk == raid_disks - 1)
4953 return 1;
4954 }
4955 return 0;
4956}
4957
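only_parity() answers one question: for the given layout, does this slot ever hold anything but parity? run() uses it below so that a device whose recovery has not reached reshape_offset is still tolerable on a dirty start when the un-recovered region would only have contained parity, which a resync recomputes anyway. A user-space transcription of two of the cases, with a worked example (the enum values here are demo placeholders; the kernel's ALGORITHM_* constants live in raid5.h):

#include <assert.h>
#include <stdio.h>

enum { ALG_PARITY_0 = 0, ALG_PARITY_N = 1 };   /* demo values only */

/* User-space transcription of two cases of only_parity() above. */
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
{
        switch (algo) {
        case ALG_PARITY_0:
                return raid_disk < max_degraded;
        case ALG_PARITY_N:
                return raid_disk >= raid_disks - max_degraded;
        }
        return 0;
}

int main(void)
{
        /* RAID5 (max_degraded 1), 5 devices, PARITY_N layout:
         * only the last slot is pure parity. */
        for (int d = 0; d < 5; d++)
                printf("disk %d: only_parity=%d\n",
                       d, only_parity(d, ALG_PARITY_N, 5, 1));

        assert(only_parity(4, ALG_PARITY_N, 5, 1));
        assert(!only_parity(0, ALG_PARITY_N, 5, 1));
        return 0;
}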
4823static int run(mddev_t *mddev) 4958static int run(mddev_t *mddev)
4824{ 4959{
4825 raid5_conf_t *conf; 4960 raid5_conf_t *conf;
4826 int working_disks = 0, chunk_size; 4961 int working_disks = 0, chunk_size;
4962 int dirty_parity_disks = 0;
4827 mdk_rdev_t *rdev; 4963 mdk_rdev_t *rdev;
4964 sector_t reshape_offset = 0;
4828 4965
4829 if (mddev->recovery_cp != MaxSector) 4966 if (mddev->recovery_cp != MaxSector)
4830 printk(KERN_NOTICE "raid5: %s is not clean" 4967 printk(KERN_NOTICE "md/raid:%s: not clean"
4831 " -- starting background reconstruction\n", 4968 " -- starting background reconstruction\n",
4832 mdname(mddev)); 4969 mdname(mddev));
4833 if (mddev->reshape_position != MaxSector) { 4970 if (mddev->reshape_position != MaxSector) {
@@ -4841,7 +4978,7 @@ static int run(mddev_t *mddev)
4841 int max_degraded = (mddev->level == 6 ? 2 : 1); 4978 int max_degraded = (mddev->level == 6 ? 2 : 1);
4842 4979
4843 if (mddev->new_level != mddev->level) { 4980 if (mddev->new_level != mddev->level) {
4844 printk(KERN_ERR "raid5: %s: unsupported reshape " 4981 printk(KERN_ERR "md/raid:%s: unsupported reshape "
4845 "required - aborting.\n", 4982 "required - aborting.\n",
4846 mdname(mddev)); 4983 mdname(mddev));
4847 return -EINVAL; 4984 return -EINVAL;
@@ -4854,10 +4991,11 @@ static int run(mddev_t *mddev)
4854 here_new = mddev->reshape_position; 4991 here_new = mddev->reshape_position;
4855 if (sector_div(here_new, mddev->new_chunk_sectors * 4992 if (sector_div(here_new, mddev->new_chunk_sectors *
4856 (mddev->raid_disks - max_degraded))) { 4993 (mddev->raid_disks - max_degraded))) {
4857 printk(KERN_ERR "raid5: reshape_position not " 4994 printk(KERN_ERR "md/raid:%s: reshape_position not "
4858 "on a stripe boundary\n"); 4995 "on a stripe boundary\n", mdname(mddev));
4859 return -EINVAL; 4996 return -EINVAL;
4860 } 4997 }
4998 reshape_offset = here_new * mddev->new_chunk_sectors;
4861 /* here_new is the stripe we will write to */ 4999 /* here_new is the stripe we will write to */
4862 here_old = mddev->reshape_position; 5000 here_old = mddev->reshape_position;
4863 sector_div(here_old, mddev->chunk_sectors * 5001 sector_div(here_old, mddev->chunk_sectors *
@@ -4875,8 +5013,9 @@ static int run(mddev_t *mddev)
4875 if ((here_new * mddev->new_chunk_sectors != 5013 if ((here_new * mddev->new_chunk_sectors !=
4876 here_old * mddev->chunk_sectors) || 5014 here_old * mddev->chunk_sectors) ||
4877 mddev->ro == 0) { 5015 mddev->ro == 0) {
4878 printk(KERN_ERR "raid5: in-place reshape must be started" 5016 printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
4879 " in read-only mode - aborting\n"); 5017 " in read-only mode - aborting\n",
5018 mdname(mddev));
4880 return -EINVAL; 5019 return -EINVAL;
4881 } 5020 }
4882 } else if (mddev->delta_disks < 0 5021 } else if (mddev->delta_disks < 0
@@ -4885,11 +5024,13 @@ static int run(mddev_t *mddev)
4885 : (here_new * mddev->new_chunk_sectors >= 5024 : (here_new * mddev->new_chunk_sectors >=
4886 here_old * mddev->chunk_sectors)) { 5025 here_old * mddev->chunk_sectors)) {
4887 /* Reading from the same stripe as writing to - bad */ 5026 /* Reading from the same stripe as writing to - bad */
4888 printk(KERN_ERR "raid5: reshape_position too early for " 5027 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
4889 "auto-recovery - aborting.\n"); 5028 "auto-recovery - aborting.\n",
5029 mdname(mddev));
4890 return -EINVAL; 5030 return -EINVAL;
4891 } 5031 }
4892 printk(KERN_INFO "raid5: reshape will continue\n"); 5032 printk(KERN_INFO "md/raid:%s: reshape will continue\n",
5033 mdname(mddev));
4893 /* OK, we should be able to continue; */ 5034 /* OK, we should be able to continue; */
4894 } else { 5035 } else {
4895 BUG_ON(mddev->level != mddev->new_level); 5036 BUG_ON(mddev->level != mddev->new_level);
@@ -4913,15 +5054,47 @@ static int run(mddev_t *mddev)
4913 /* 5054 /*
4914 * 0 for a fully functional array, 1 or 2 for a degraded array. 5055 * 0 for a fully functional array, 1 or 2 for a degraded array.
4915 */ 5056 */
4916 list_for_each_entry(rdev, &mddev->disks, same_set) 5057 list_for_each_entry(rdev, &mddev->disks, same_set) {
4917 if (rdev->raid_disk >= 0 && 5058 if (rdev->raid_disk < 0)
4918 test_bit(In_sync, &rdev->flags)) 5059 continue;
5060 if (test_bit(In_sync, &rdev->flags)) {
4919 working_disks++; 5061 working_disks++;
5062 continue;
5063 }
5064 /* This disc is not fully in-sync. However if it
5065 * just stored parity (beyond the recovery_offset),
5066 * then we don't need to be concerned about the
5067 * array being dirty.
5068 * When reshape goes 'backwards', we never have
5069 * partially completed devices, so we only need
5070 * to worry about reshape going forwards.
5071 */
5072 /* Hack because v0.91 doesn't store recovery_offset properly. */
5073 if (mddev->major_version == 0 &&
5074 mddev->minor_version > 90)
5075 rdev->recovery_offset = reshape_offset;
5076
5077 if (rdev->recovery_offset < reshape_offset) {
5078 /* We need to check old and new layout */
5079 if (!only_parity(rdev->raid_disk,
5080 conf->algorithm,
5081 conf->raid_disks,
5082 conf->max_degraded))
5083 continue;
5084 }
5085 if (!only_parity(rdev->raid_disk,
5086 conf->prev_algo,
5087 conf->previous_raid_disks,
5088 conf->max_degraded))
5089 continue;
5090 dirty_parity_disks++;
5091 }
4920 5092
4921 mddev->degraded = conf->raid_disks - working_disks; 5093 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
5094 - working_disks);
4922 5095
4923 if (mddev->degraded > conf->max_degraded) { 5096 if (has_failed(conf)) {
4924 printk(KERN_ERR "raid5: not enough operational devices for %s" 5097 printk(KERN_ERR "md/raid:%s: not enough operational devices"
4925 " (%d/%d failed)\n", 5098 " (%d/%d failed)\n",
4926 mdname(mddev), mddev->degraded, conf->raid_disks); 5099 mdname(mddev), mddev->degraded, conf->raid_disks);
4927 goto abort; 5100 goto abort;
@@ -4931,36 +5104,36 @@ static int run(mddev_t *mddev)
4931 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5104 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
4932 mddev->resync_max_sectors = mddev->dev_sectors; 5105 mddev->resync_max_sectors = mddev->dev_sectors;
4933 5106
4934 if (mddev->degraded > 0 && 5107 if (mddev->degraded > dirty_parity_disks &&
4935 mddev->recovery_cp != MaxSector) { 5108 mddev->recovery_cp != MaxSector) {
4936 if (mddev->ok_start_degraded) 5109 if (mddev->ok_start_degraded)
4937 printk(KERN_WARNING 5110 printk(KERN_WARNING
4938 "raid5: starting dirty degraded array: %s" 5111 "md/raid:%s: starting dirty degraded array"
4939 "- data corruption possible.\n", 5112 " - data corruption possible.\n",
4940 mdname(mddev)); 5113 mdname(mddev));
4941 else { 5114 else {
4942 printk(KERN_ERR 5115 printk(KERN_ERR
4943 "raid5: cannot start dirty degraded array for %s\n", 5116 "md/raid:%s: cannot start dirty degraded array.\n",
4944 mdname(mddev)); 5117 mdname(mddev));
4945 goto abort; 5118 goto abort;
4946 } 5119 }
4947 } 5120 }
4948 5121
4949 if (mddev->degraded == 0) 5122 if (mddev->degraded == 0)
4950 printk("raid5: raid level %d set %s active with %d out of %d" 5123 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
4951 " devices, algorithm %d\n", conf->level, mdname(mddev), 5124 " devices, algorithm %d\n", mdname(mddev), conf->level,
4952 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5125 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4953 mddev->new_layout); 5126 mddev->new_layout);
4954 else 5127 else
4955 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 5128 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
4956 " out of %d devices, algorithm %d\n", conf->level, 5129 " out of %d devices, algorithm %d\n",
4957 mdname(mddev), mddev->raid_disks - mddev->degraded, 5130 mdname(mddev), conf->level,
4958 mddev->raid_disks, mddev->new_layout); 5131 mddev->raid_disks - mddev->degraded,
5132 mddev->raid_disks, mddev->new_layout);
4959 5133
4960 print_raid5_conf(conf); 5134 print_raid5_conf(conf);
4961 5135
4962 if (conf->reshape_progress != MaxSector) { 5136 if (conf->reshape_progress != MaxSector) {
4963 printk("...ok start reshape thread\n");
4964 conf->reshape_safe = conf->reshape_progress; 5137 conf->reshape_safe = conf->reshape_progress;
4965 atomic_set(&conf->reshape_stripes, 0); 5138 atomic_set(&conf->reshape_stripes, 0);
4966 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5139 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -4983,9 +5156,11 @@ static int run(mddev_t *mddev)
4983 } 5156 }
4984 5157
4985 /* Ok, everything is just fine now */ 5158 /* Ok, everything is just fine now */
4986 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5159 if (mddev->to_remove == &raid5_attrs_group)
5160 mddev->to_remove = NULL;
5161 else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4987 printk(KERN_WARNING 5162 printk(KERN_WARNING
4988 "raid5: failed to create sysfs attributes for %s\n", 5163 "md/raid:%s: failed to create sysfs attributes.\n",
4989 mdname(mddev)); 5164 mdname(mddev));
4990 5165
4991 mddev->queue->queue_lock = &conf->device_lock; 5166 mddev->queue->queue_lock = &conf->device_lock;
@@ -5015,23 +5190,21 @@ abort:
5015 free_conf(conf); 5190 free_conf(conf);
5016 } 5191 }
5017 mddev->private = NULL; 5192 mddev->private = NULL;
5018 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5193 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
5019 return -EIO; 5194 return -EIO;
5020} 5195}
5021 5196
5022
5023
5024static int stop(mddev_t *mddev) 5197static int stop(mddev_t *mddev)
5025{ 5198{
5026 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5199 raid5_conf_t *conf = mddev->private;
5027 5200
5028 md_unregister_thread(mddev->thread); 5201 md_unregister_thread(mddev->thread);
5029 mddev->thread = NULL; 5202 mddev->thread = NULL;
5030 mddev->queue->backing_dev_info.congested_fn = NULL; 5203 mddev->queue->backing_dev_info.congested_fn = NULL;
5031 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5204 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5032 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
5033 free_conf(conf); 5205 free_conf(conf);
5034 mddev->private = NULL; 5206 mddev->private = NULL;
5207 mddev->to_remove = &raid5_attrs_group;
5035 return 0; 5208 return 0;
5036} 5209}
5037 5210
@@ -5072,7 +5245,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf)
5072 5245
5073static void status(struct seq_file *seq, mddev_t *mddev) 5246static void status(struct seq_file *seq, mddev_t *mddev)
5074{ 5247{
5075 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5248 raid5_conf_t *conf = mddev->private;
5076 int i; 5249 int i;
5077 5250
5078 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5251 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5094,21 +5267,22 @@ static void print_raid5_conf (raid5_conf_t *conf)
5094 int i; 5267 int i;
5095 struct disk_info *tmp; 5268 struct disk_info *tmp;
5096 5269
5097 printk("RAID5 conf printout:\n"); 5270 printk(KERN_DEBUG "RAID conf printout:\n");
5098 if (!conf) { 5271 if (!conf) {
5099 printk("(conf==NULL)\n"); 5272 printk("(conf==NULL)\n");
5100 return; 5273 return;
5101 } 5274 }
5102 printk(" --- rd:%d wd:%d\n", conf->raid_disks, 5275 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
5103 conf->raid_disks - conf->mddev->degraded); 5276 conf->raid_disks,
5277 conf->raid_disks - conf->mddev->degraded);
5104 5278
5105 for (i = 0; i < conf->raid_disks; i++) { 5279 for (i = 0; i < conf->raid_disks; i++) {
5106 char b[BDEVNAME_SIZE]; 5280 char b[BDEVNAME_SIZE];
5107 tmp = conf->disks + i; 5281 tmp = conf->disks + i;
5108 if (tmp->rdev) 5282 if (tmp->rdev)
5109 printk(" disk %d, o:%d, dev:%s\n", 5283 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
5110 i, !test_bit(Faulty, &tmp->rdev->flags), 5284 i, !test_bit(Faulty, &tmp->rdev->flags),
5111 bdevname(tmp->rdev->bdev,b)); 5285 bdevname(tmp->rdev->bdev, b));
5112 } 5286 }
5113} 5287}
5114 5288
@@ -5121,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
5121 for (i = 0; i < conf->raid_disks; i++) { 5295 for (i = 0; i < conf->raid_disks; i++) {
5122 tmp = conf->disks + i; 5296 tmp = conf->disks + i;
5123 if (tmp->rdev 5297 if (tmp->rdev
5298 && tmp->rdev->recovery_offset == MaxSector
5124 && !test_bit(Faulty, &tmp->rdev->flags) 5299 && !test_bit(Faulty, &tmp->rdev->flags)
5125 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5300 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5126 unsigned long flags; 5301 unsigned long flags;
@@ -5156,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
5156 * isn't possible. 5331 * isn't possible.
5157 */ 5332 */
5158 if (!test_bit(Faulty, &rdev->flags) && 5333 if (!test_bit(Faulty, &rdev->flags) &&
5159 mddev->degraded <= conf->max_degraded && 5334 !has_failed(conf) &&
5160 number < conf->raid_disks) { 5335 number < conf->raid_disks) {
5161 err = -EBUSY; 5336 err = -EBUSY;
5162 goto abort; 5337 goto abort;
@@ -5184,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5184 int first = 0; 5359 int first = 0;
5185 int last = conf->raid_disks - 1; 5360 int last = conf->raid_disks - 1;
5186 5361
5187 if (mddev->degraded > conf->max_degraded) 5362 if (has_failed(conf))
5188 /* no point adding a device */ 5363 /* no point adding a device */
5189 return -EINVAL; 5364 return -EINVAL;
5190 5365
@@ -5231,7 +5406,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
5231 raid5_size(mddev, sectors, mddev->raid_disks)) 5406 raid5_size(mddev, sectors, mddev->raid_disks))
5232 return -EINVAL; 5407 return -EINVAL;
5233 set_capacity(mddev->gendisk, mddev->array_sectors); 5408 set_capacity(mddev->gendisk, mddev->array_sectors);
5234 mddev->changed = 1;
5235 revalidate_disk(mddev->gendisk); 5409 revalidate_disk(mddev->gendisk);
5236 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5410 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
5237 mddev->recovery_cp = mddev->dev_sectors; 5411 mddev->recovery_cp = mddev->dev_sectors;
@@ -5257,7 +5431,8 @@ static int check_stripe_cache(mddev_t *mddev)
5257 > conf->max_nr_stripes || 5431 > conf->max_nr_stripes ||
5258 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5432 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5259 > conf->max_nr_stripes) { 5433 > conf->max_nr_stripes) {
5260 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 5434 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
5435 mdname(mddev),
5261 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5436 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5262 / STRIPE_SIZE)*4); 5437 / STRIPE_SIZE)*4);
5263 return 0; 5438 return 0;
@@ -5276,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
5276 if (mddev->bitmap) 5451 if (mddev->bitmap)
5277 /* Cannot grow a bitmap yet */ 5452 /* Cannot grow a bitmap yet */
5278 return -EBUSY; 5453 return -EBUSY;
5279 if (mddev->degraded > conf->max_degraded) 5454 if (has_failed(conf))
5280 return -EINVAL; 5455 return -EINVAL;
5281 if (mddev->delta_disks < 0) { 5456 if (mddev->delta_disks < 0) {
5282 /* We might be able to shrink, but the devices must 5457 /* We might be able to shrink, but the devices must
@@ -5328,7 +5503,7 @@ static int raid5_start_reshape(mddev_t *mddev)
5328 */ 5503 */
5329 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5504 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5330 < mddev->array_sectors) { 5505 < mddev->array_sectors) {
5331 printk(KERN_ERR "md: %s: array size must be reduced " 5506 printk(KERN_ERR "md/raid:%s: array size must be reduced "
5332 "before number of disks\n", mdname(mddev)); 5507 "before number of disks\n", mdname(mddev));
5333 return -EINVAL; 5508 return -EINVAL;
5334 } 5509 }
@@ -5351,29 +5526,39 @@ static int raid5_start_reshape(mddev_t *mddev)
5351 5526
5352 /* Add some new drives, as many as will fit. 5527 /* Add some new drives, as many as will fit.
5353 * We know there are enough to make the newly sized array work. 5528 * We know there are enough to make the newly sized array work.
5529 * Don't add devices if we are reducing the number of
5530 * devices in the array. This is because it is not possible
5531 * to correctly record the "partially reconstructed" state of
5532 * such devices during the reshape and confusion could result.
5354 */ 5533 */
5355 list_for_each_entry(rdev, &mddev->disks, same_set) 5534 if (mddev->delta_disks >= 0)
5535 list_for_each_entry(rdev, &mddev->disks, same_set)
5356 if (rdev->raid_disk < 0 && 5536 if (rdev->raid_disk < 0 &&
5357 !test_bit(Faulty, &rdev->flags)) { 5537 !test_bit(Faulty, &rdev->flags)) {
5358 if (raid5_add_disk(mddev, rdev) == 0) { 5538 if (raid5_add_disk(mddev, rdev) == 0) {
5359 char nm[20]; 5539 char nm[20];
5360 set_bit(In_sync, &rdev->flags); 5540 if (rdev->raid_disk >= conf->previous_raid_disks) {
5361 added_devices++; 5541 set_bit(In_sync, &rdev->flags);
5362 rdev->recovery_offset = 0; 5542 added_devices++;
5543 } else
5544 rdev->recovery_offset = 0;
5363 sprintf(nm, "rd%d", rdev->raid_disk); 5545 sprintf(nm, "rd%d", rdev->raid_disk);
5364 if (sysfs_create_link(&mddev->kobj, 5546 if (sysfs_create_link(&mddev->kobj,
5365 &rdev->kobj, nm)) 5547 &rdev->kobj, nm))
5366 printk(KERN_WARNING 5548 printk(KERN_WARNING
5367 "raid5: failed to create " 5549 "md/raid:%s: failed to create "
5368 " link %s for %s\n", 5550 " link %s\n",
5369 nm, mdname(mddev)); 5551 mdname(mddev), nm);
5370 } else 5552 } else
5371 break; 5553 break;
5372 } 5554 }
5373 5555
5556 /* When a reshape changes the number of devices, ->degraded
5557 * is measured against the larger of the pre and post number of
5558 * devices.*/
5374 if (mddev->delta_disks > 0) { 5559 if (mddev->delta_disks > 0) {
5375 spin_lock_irqsave(&conf->device_lock, flags); 5560 spin_lock_irqsave(&conf->device_lock, flags);
5376 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) 5561 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5377 - added_devices; 5562 - added_devices;
5378 spin_unlock_irqrestore(&conf->device_lock, flags); 5563 spin_unlock_irqrestore(&conf->device_lock, flags);
5379 } 5564 }
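The spare-adding hunk above also turns the degraded bookkeeping from an assignment into an increment measured against the post-reshape device count: growing from previous_raid_disks to raid_disks adds raid_disks - previous_raid_disks nominal slots, of which added_devices were actually filled by spares. A quick check of the arithmetic (standalone, numbers invented):

#include <stdio.h>

int main(void)
{
        int previous_raid_disks = 3, raid_disks = 5;
        int degraded = 0;                 /* array was clean before the grow */
        int added_devices;

        added_devices = 2;                /* two spares were available */
        printf("with 2 spares: degraded = %d\n",
               degraded + (raid_disks - previous_raid_disks) - added_devices);

        added_devices = 1;                /* only one spare */
        printf("with 1 spare:  degraded = %d\n",
               degraded + (raid_disks - previous_raid_disks) - added_devices);
        return 0;
}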
@@ -5440,7 +5625,6 @@ static void raid5_finish_reshape(mddev_t *mddev)
5440 if (mddev->delta_disks > 0) { 5625 if (mddev->delta_disks > 0) {
5441 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5626 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5442 set_capacity(mddev->gendisk, mddev->array_sectors); 5627 set_capacity(mddev->gendisk, mddev->array_sectors);
5443 mddev->changed = 1;
5444 revalidate_disk(mddev->gendisk); 5628 revalidate_disk(mddev->gendisk);
5445 } else { 5629 } else {
5446 int d; 5630 int d;
@@ -5505,6 +5689,29 @@ static void raid5_quiesce(mddev_t *mddev, int state)
5505} 5689}
5506 5690
5507 5691
5692static void *raid45_takeover_raid0(mddev_t *mddev, int level)
5693{
5694 struct raid0_private_data *raid0_priv = mddev->private;
5695
5696 /* for raid0 takeover only one zone is supported */
5697 if (raid0_priv->nr_strip_zones > 1) {
5698 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
5699 mdname(mddev));
5700 return ERR_PTR(-EINVAL);
5701 }
5702
5703 mddev->new_level = level;
5704 mddev->new_layout = ALGORITHM_PARITY_N;
5705 mddev->new_chunk_sectors = mddev->chunk_sectors;
5706 mddev->raid_disks += 1;
5707 mddev->delta_disks = 1;
5708 /* make sure it will be not marked as dirty */
5709 mddev->recovery_cp = MaxSector;
5710
5711 return setup_conf(mddev);
5712}
5713
5714
5508static void *raid5_takeover_raid1(mddev_t *mddev) 5715static void *raid5_takeover_raid1(mddev_t *mddev)
5509{ 5716{
5510 int chunksect; 5717 int chunksect;
@@ -5629,12 +5836,13 @@ static int raid6_check_reshape(mddev_t *mddev)
5629static void *raid5_takeover(mddev_t *mddev) 5836static void *raid5_takeover(mddev_t *mddev)
5630{ 5837{
5631 /* raid5 can take over: 5838 /* raid5 can take over:
5632 * raid0 - if all devices are the same - make it a raid4 layout 5839 * raid0 - if there is only one strip zone - make it a raid4 layout
5633 * raid1 - if there are two drives. We need to know the chunk size 5840 * raid1 - if there are two drives. We need to know the chunk size
5634 * raid4 - trivial - just use a raid4 layout. 5841 * raid4 - trivial - just use a raid4 layout.
5635 * raid6 - Providing it is a *_6 layout 5842 * raid6 - Providing it is a *_6 layout
5636 */ 5843 */
5637 5844 if (mddev->level == 0)
5845 return raid45_takeover_raid0(mddev, 5);
5638 if (mddev->level == 1) 5846 if (mddev->level == 1)
5639 return raid5_takeover_raid1(mddev); 5847 return raid5_takeover_raid1(mddev);
5640 if (mddev->level == 4) { 5848 if (mddev->level == 4) {
@@ -5648,6 +5856,22 @@ static void *raid5_takeover(mddev_t *mddev)
5648 return ERR_PTR(-EINVAL); 5856 return ERR_PTR(-EINVAL);
5649} 5857}
5650 5858
5859static void *raid4_takeover(mddev_t *mddev)
5860{
5861 /* raid4 can take over:
5862 * raid0 - if there is only one strip zone
5863 * raid5 - if layout is right
5864 */
5865 if (mddev->level == 0)
5866 return raid45_takeover_raid0(mddev, 4);
5867 if (mddev->level == 5 &&
5868 mddev->layout == ALGORITHM_PARITY_N) {
5869 mddev->new_layout = 0;
5870 mddev->new_level = 4;
5871 return setup_conf(mddev);
5872 }
5873 return ERR_PTR(-EINVAL);
5874}
5651 5875
5652static struct mdk_personality raid5_personality; 5876static struct mdk_personality raid5_personality;
5653 5877
@@ -5763,6 +5987,7 @@ static struct mdk_personality raid4_personality =
5763 .start_reshape = raid5_start_reshape, 5987 .start_reshape = raid5_start_reshape,
5764 .finish_reshape = raid5_finish_reshape, 5988 .finish_reshape = raid5_finish_reshape,
5765 .quiesce = raid5_quiesce, 5989 .quiesce = raid5_quiesce,
5990 .takeover = raid4_takeover,
5766}; 5991};
5767 5992
5768static int __init raid5_init(void) 5993static int __init raid5_init(void)
@@ -5783,6 +6008,7 @@ static void raid5_exit(void)
5783module_init(raid5_init); 6008module_init(raid5_init);
5784module_exit(raid5_exit); 6009module_exit(raid5_exit);
5785MODULE_LICENSE("GPL"); 6010MODULE_LICENSE("GPL");
6011MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
5786MODULE_ALIAS("md-personality-4"); /* RAID5 */ 6012MODULE_ALIAS("md-personality-4"); /* RAID5 */
5787MODULE_ALIAS("md-raid5"); 6013MODULE_ALIAS("md-raid5");
5788MODULE_ALIAS("md-raid4"); 6014MODULE_ALIAS("md-raid4");
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2390e0e83daf..0f86f5e36724 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -214,12 +214,20 @@ struct stripe_head {
214 int disks; /* disks in stripe */ 214 int disks; /* disks in stripe */
215 enum check_states check_state; 215 enum check_states check_state;
216 enum reconstruct_states reconstruct_state; 216 enum reconstruct_states reconstruct_state;
217 /* stripe_operations 217 /**
218 * struct stripe_operations
218 * @target - STRIPE_OP_COMPUTE_BLK target 219 * @target - STRIPE_OP_COMPUTE_BLK target
220 * @target2 - 2nd compute target in the raid6 case
221 * @zero_sum_result - P and Q verification flags
222 * @request - async service request flags for raid_run_ops
219 */ 223 */
220 struct stripe_operations { 224 struct stripe_operations {
221 int target, target2; 225 int target, target2;
222 enum sum_check_flags zero_sum_result; 226 enum sum_check_flags zero_sum_result;
227 #ifdef CONFIG_MULTICORE_RAID456
228 unsigned long request;
229 wait_queue_head_t wait_for_ops;
230 #endif
223 } ops; 231 } ops;
224 struct r5dev { 232 struct r5dev {
225 struct bio req; 233 struct bio req;
@@ -294,6 +302,8 @@ struct r6_state {
294#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 302#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
295#define STRIPE_BIOFILL_RUN 14 303#define STRIPE_BIOFILL_RUN 14
296#define STRIPE_COMPUTE_RUN 15 304#define STRIPE_COMPUTE_RUN 15
305#define STRIPE_OPS_REQ_PENDING 16
306
297/* 307/*
298 * Operation request flags 308 * Operation request flags
299 */ 309 */
@@ -395,7 +405,7 @@ struct raid5_private_data {
395 * lists and performing address 405 * lists and performing address
396 * conversions 406 * conversions
397 */ 407 */
398 } *percpu; 408 } __percpu *percpu;
399 size_t scribble_len; /* size of scribble region must be 409 size_t scribble_len; /* size of scribble region must be
400 * associated with conf to handle 410 * associated with conf to handle
401 * cpu hotplug while reshaping 411 * cpu hotplug while reshaping
@@ -478,7 +488,7 @@ static inline int algorithm_valid_raid6(int layout)
478{ 488{
479 return (layout >= 0 && layout <= 5) 489 return (layout >= 0 && layout <= 5)
480 || 490 ||
481 (layout == 8 || layout == 10) 491 (layout >= 8 && layout <= 10)
482 || 492 ||
483 (layout >= 16 && layout <= 20); 493 (layout >= 16 && layout <= 20);
484} 494}
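The widened range in algorithm_valid_raid6() folds layout 9 (ALGORITHM_ROTATING_N_RESTART, the DDF N-restart variant handled in raid5_compute_sector() earlier in this patch) into the valid set alongside 8 and 10. A user-space transcription of the check, handy for a quick sanity table (illustrative only):

#include <stdio.h>

/* User-space transcription of the widened check above. */
static int algorithm_valid_raid6(int layout)
{
        return (layout >= 0 && layout <= 5) ||
               (layout >= 8 && layout <= 10) ||
               (layout >= 16 && layout <= 20);
}

int main(void)
{
        for (int layout = 0; layout <= 20; layout++)
                if (algorithm_valid_raid6(layout))
                        printf("layout %d is a valid raid6 layout\n", layout);
        return 0;
}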