Diffstat (limited to 'drivers/md')
41 files changed, 4970 insertions, 2125 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e27ae4604cef..bf1a95e31559 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -100,8 +100,8 @@ config MD_RAID1 | |||
100 | If unsure, say Y. | 100 | If unsure, say Y. |
101 | 101 | ||
102 | config MD_RAID10 | 102 | config MD_RAID10 |
103 | tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" | 103 | tristate "RAID-10 (mirrored striping) mode" |
104 | depends on BLK_DEV_MD && EXPERIMENTAL | 104 | depends on BLK_DEV_MD |
105 | ---help--- | 105 | ---help--- |
106 | RAID-10 provides a combination of striping (RAID-0) and | 106 | RAID-10 provides a combination of striping (RAID-0) and |
107 | mirroring (RAID-1) with easier configuration and more flexible | 107 | mirroring (RAID-1) with easier configuration and more flexible |
@@ -169,11 +169,10 @@ config MD_MULTIPATH | |||
169 | tristate "Multipath I/O support" | 169 | tristate "Multipath I/O support" |
170 | depends on BLK_DEV_MD | 170 | depends on BLK_DEV_MD |
171 | help | 171 | help |
172 | Multipath-IO is the ability of certain devices to address the same | 172 | MD_MULTIPATH provides a simple multi-path personality for use |
173 | physical disk over multiple 'IO paths'. The code ensures that such | 173 | the MD framework. It is not under active development. New |
174 | paths can be defined and handled at runtime, and ensures that a | 174 | projects should consider using DM_MULTIPATH which has more |
175 | transparent failover to the backup path(s) happens if a IO errors | 175 | features and more testing. |
176 | arrives on the primary path. | ||
177 | 176 | ||
178 | If unsure, say N. | 177 | If unsure, say N. |
179 | 178 | ||
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index c9b3a7843d83..5e3aac41919d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -40,4 +40,3 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o | |||
40 | ifeq ($(CONFIG_DM_UEVENT),y) | 40 | ifeq ($(CONFIG_DM_UEVENT),y) |
41 | dm-mod-objs += dm-uevent.o | 41 | dm-mod-objs += dm-uevent.o |
42 | endif | 42 | endif |
43 | |||
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6986b0059d23..1742435ce3ae 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | |||
212 | */ | 212 | */ |
213 | 213 | ||
214 | /* IO operations when bitmap is stored near all superblocks */ | 214 | /* IO operations when bitmap is stored near all superblocks */ |
215 | static struct page *read_sb_page(mddev_t *mddev, long offset, | 215 | static struct page *read_sb_page(mddev_t *mddev, loff_t offset, |
216 | struct page *page, | 216 | struct page *page, |
217 | unsigned long index, int size) | 217 | unsigned long index, int size) |
218 | { | 218 | { |
@@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
287 | 287 | ||
288 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 288 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
289 | int size = PAGE_SIZE; | 289 | int size = PAGE_SIZE; |
290 | loff_t offset = mddev->bitmap_info.offset; | ||
290 | if (page->index == bitmap->file_pages-1) | 291 | if (page->index == bitmap->file_pages-1) |
291 | size = roundup(bitmap->last_page_size, | 292 | size = roundup(bitmap->last_page_size, |
292 | bdev_logical_block_size(rdev->bdev)); | 293 | bdev_logical_block_size(rdev->bdev)); |
293 | /* Just make sure we aren't corrupting data or | 294 | /* Just make sure we aren't corrupting data or |
294 | * metadata | 295 | * metadata |
295 | */ | 296 | */ |
296 | if (bitmap->offset < 0) { | 297 | if (mddev->external) { |
298 | /* Bitmap could be anywhere. */ | ||
299 | if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > | ||
300 | rdev->data_offset && | ||
301 | rdev->sb_start + offset < | ||
302 | rdev->data_offset + mddev->dev_sectors + | ||
303 | (PAGE_SIZE/512)) | ||
304 | goto bad_alignment; | ||
305 | } else if (offset < 0) { | ||
297 | /* DATA BITMAP METADATA */ | 306 | /* DATA BITMAP METADATA */ |
298 | if (bitmap->offset | 307 | if (offset |
299 | + (long)(page->index * (PAGE_SIZE/512)) | 308 | + (long)(page->index * (PAGE_SIZE/512)) |
300 | + size/512 > 0) | 309 | + size/512 > 0) |
301 | /* bitmap runs in to metadata */ | 310 | /* bitmap runs in to metadata */ |
302 | goto bad_alignment; | 311 | goto bad_alignment; |
303 | if (rdev->data_offset + mddev->dev_sectors | 312 | if (rdev->data_offset + mddev->dev_sectors |
304 | > rdev->sb_start + bitmap->offset) | 313 | > rdev->sb_start + offset) |
305 | /* data runs in to bitmap */ | 314 | /* data runs in to bitmap */ |
306 | goto bad_alignment; | 315 | goto bad_alignment; |
307 | } else if (rdev->sb_start < rdev->data_offset) { | 316 | } else if (rdev->sb_start < rdev->data_offset) { |
308 | /* METADATA BITMAP DATA */ | 317 | /* METADATA BITMAP DATA */ |
309 | if (rdev->sb_start | 318 | if (rdev->sb_start |
310 | + bitmap->offset | 319 | + offset |
311 | + page->index*(PAGE_SIZE/512) + size/512 | 320 | + page->index*(PAGE_SIZE/512) + size/512 |
312 | > rdev->data_offset) | 321 | > rdev->data_offset) |
313 | /* bitmap runs in to data */ | 322 | /* bitmap runs in to data */ |
@@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
316 | /* DATA METADATA BITMAP - no problems */ | 325 | /* DATA METADATA BITMAP - no problems */ |
317 | } | 326 | } |
318 | md_super_write(mddev, rdev, | 327 | md_super_write(mddev, rdev, |
319 | rdev->sb_start + bitmap->offset | 328 | rdev->sb_start + offset |
320 | + page->index * (PAGE_SIZE/512), | 329 | + page->index * (PAGE_SIZE/512), |
321 | size, | 330 | size, |
322 | page); | 331 | page); |
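The alignment checks in write_sb_page() above guard against the bitmap page overlapping either the superblock metadata or the array data, for each possible on-disk layout (DATA BITMAP METADATA, METADATA BITMAP DATA, and the unconstrained external case). A minimal userspace-style restatement of the METADATA BITMAP DATA case, assuming 4 KB pages and 512-byte sectors; the function name and the sector values are illustrative only, not part of the driver:

#include <stdbool.h>
#include <stdio.h>

/* All values are in 512-byte sectors except page_bytes. The page being
 * written must end at or before data_offset, otherwise the bitmap would
 * overwrite array data. */
static bool bitmap_page_fits(unsigned long long sb_start,
                             long long bitmap_offset,
                             unsigned long long data_offset,
                             unsigned long page_index,
                             unsigned long page_bytes)
{
    unsigned long long end = sb_start + bitmap_offset
                             + page_index * (4096 / 512)   /* PAGE_SIZE/512 */
                             + page_bytes / 512;
    return end <= data_offset;
}

int main(void)
{
    /* superblock at sector 8, bitmap 8 sectors after it, data at sector 2048 */
    printf("%d\n", bitmap_page_fits(8, 8, 2048, 0, 4096));  /* prints 1: fits */
    return 0;
}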
@@ -488,19 +497,24 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
488 | 497 | ||
489 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ | 498 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ |
490 | return; | 499 | return; |
500 | if (bitmap->mddev->bitmap_info.external) | ||
501 | return; | ||
491 | spin_lock_irqsave(&bitmap->lock, flags); | 502 | spin_lock_irqsave(&bitmap->lock, flags); |
492 | if (!bitmap->sb_page) { /* no superblock */ | 503 | if (!bitmap->sb_page) { /* no superblock */ |
493 | spin_unlock_irqrestore(&bitmap->lock, flags); | 504 | spin_unlock_irqrestore(&bitmap->lock, flags); |
494 | return; | 505 | return; |
495 | } | 506 | } |
496 | spin_unlock_irqrestore(&bitmap->lock, flags); | 507 | spin_unlock_irqrestore(&bitmap->lock, flags); |
497 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 508 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
498 | sb->events = cpu_to_le64(bitmap->mddev->events); | 509 | sb->events = cpu_to_le64(bitmap->mddev->events); |
499 | if (bitmap->mddev->events < bitmap->events_cleared) { | 510 | if (bitmap->mddev->events < bitmap->events_cleared) { |
500 | /* rocking back to read-only */ | 511 | /* rocking back to read-only */ |
501 | bitmap->events_cleared = bitmap->mddev->events; | 512 | bitmap->events_cleared = bitmap->mddev->events; |
502 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); | 513 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); |
503 | } | 514 | } |
515 | /* Just in case these have been changed via sysfs: */ | ||
516 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | ||
517 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | ||
504 | kunmap_atomic(sb, KM_USER0); | 518 | kunmap_atomic(sb, KM_USER0); |
505 | write_page(bitmap, bitmap->sb_page, 1); | 519 | write_page(bitmap, bitmap->sb_page, 1); |
506 | } | 520 | } |
@@ -512,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
512 | 526 | ||
513 | if (!bitmap || !bitmap->sb_page) | 527 | if (!bitmap || !bitmap->sb_page) |
514 | return; | 528 | return; |
515 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 529 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
516 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 530 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); |
517 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 531 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); |
518 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 532 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); |
@@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
550 | 564 | ||
551 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); | 565 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); |
552 | } else { | 566 | } else { |
553 | bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, | 567 | bitmap->sb_page = read_sb_page(bitmap->mddev, |
568 | bitmap->mddev->bitmap_info.offset, | ||
554 | NULL, | 569 | NULL, |
555 | 0, sizeof(bitmap_super_t)); | 570 | 0, sizeof(bitmap_super_t)); |
556 | } | 571 | } |
@@ -560,10 +575,10 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
560 | return err; | 575 | return err; |
561 | } | 576 | } |
562 | 577 | ||
563 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 578 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
564 | 579 | ||
565 | chunksize = le32_to_cpu(sb->chunksize); | 580 | chunksize = le32_to_cpu(sb->chunksize); |
566 | daemon_sleep = le32_to_cpu(sb->daemon_sleep); | 581 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
567 | write_behind = le32_to_cpu(sb->write_behind); | 582 | write_behind = le32_to_cpu(sb->write_behind); |
568 | 583 | ||
569 | /* verify that the bitmap-specific fields are valid */ | 584 | /* verify that the bitmap-specific fields are valid */ |
@@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
576 | reason = "bitmap chunksize too small"; | 591 | reason = "bitmap chunksize too small"; |
577 | else if ((1 << ffz(~chunksize)) != chunksize) | 592 | else if ((1 << ffz(~chunksize)) != chunksize) |
578 | reason = "bitmap chunksize not a power of 2"; | 593 | reason = "bitmap chunksize not a power of 2"; |
579 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) | 594 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) |
580 | reason = "daemon sleep period out of range"; | 595 | reason = "daemon sleep period out of range"; |
581 | else if (write_behind > COUNTER_MAX) | 596 | else if (write_behind > COUNTER_MAX) |
582 | reason = "write-behind limit out of range (0 - 16383)"; | 597 | reason = "write-behind limit out of range (0 - 16383)"; |
@@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
610 | } | 625 | } |
611 | success: | 626 | success: |
612 | /* assign fields using values from superblock */ | 627 | /* assign fields using values from superblock */ |
613 | bitmap->chunksize = chunksize; | 628 | bitmap->mddev->bitmap_info.chunksize = chunksize; |
614 | bitmap->daemon_sleep = daemon_sleep; | 629 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; |
615 | bitmap->daemon_lastrun = jiffies; | 630 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; |
616 | bitmap->max_write_behind = write_behind; | ||
617 | bitmap->flags |= le32_to_cpu(sb->state); | 631 | bitmap->flags |= le32_to_cpu(sb->state); |
618 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 632 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
619 | bitmap->flags |= BITMAP_HOSTENDIAN; | 633 | bitmap->flags |= BITMAP_HOSTENDIAN; |
@@ -647,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
647 | return 0; | 661 | return 0; |
648 | } | 662 | } |
649 | spin_unlock_irqrestore(&bitmap->lock, flags); | 663 | spin_unlock_irqrestore(&bitmap->lock, flags); |
650 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 664 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
651 | old = le32_to_cpu(sb->state) & bits; | 665 | old = le32_to_cpu(sb->state) & bits; |
652 | switch (op) { | 666 | switch (op) { |
653 | case MASK_SET: sb->state |= cpu_to_le32(bits); | 667 | case MASK_SET: sb->state |= cpu_to_le32(bits); |
@@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
664 | * general bitmap file operations | 678 | * general bitmap file operations |
665 | */ | 679 | */ |
666 | 680 | ||
681 | /* | ||
682 | * on-disk bitmap: | ||
683 | * | ||
684 | * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap | ||
685 | * file a page at a time. There's a superblock at the start of the file. | ||
686 | */ | ||
667 | /* calculate the index of the page that contains this bit */ | 687 | /* calculate the index of the page that contains this bit */ |
668 | static inline unsigned long file_page_index(unsigned long chunk) | 688 | static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) |
669 | { | 689 | { |
670 | return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; | 690 | if (!bitmap->mddev->bitmap_info.external) |
691 | chunk += sizeof(bitmap_super_t) << 3; | ||
692 | return chunk >> PAGE_BIT_SHIFT; | ||
671 | } | 693 | } |
672 | 694 | ||
673 | /* calculate the (bit) offset of this bit within a page */ | 695 | /* calculate the (bit) offset of this bit within a page */ |
674 | static inline unsigned long file_page_offset(unsigned long chunk) | 696 | static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) |
675 | { | 697 | { |
676 | return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); | 698 | if (!bitmap->mddev->bitmap_info.external) |
699 | chunk += sizeof(bitmap_super_t) << 3; | ||
700 | return chunk & (PAGE_BITS - 1); | ||
677 | } | 701 | } |
678 | 702 | ||
679 | /* | 703 | /* |
@@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk) | |||
686 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 710 | static inline struct page *filemap_get_page(struct bitmap *bitmap, |
687 | unsigned long chunk) | 711 | unsigned long chunk) |
688 | { | 712 | { |
689 | if (file_page_index(chunk) >= bitmap->file_pages) return NULL; | 713 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; |
690 | return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; | 714 | return bitmap->filemap[file_page_index(bitmap, chunk) |
715 | - file_page_index(bitmap, 0)]; | ||
691 | } | 716 | } |
692 | 717 | ||
693 | 718 | ||
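file_page_index() and file_page_offset() above map a chunk's bit to a page of the bitmap file and to a bit offset within that page; for an internal bitmap the first sizeof(bitmap_super_t) bytes of the file hold the superblock, so chunk bits start 2048 bits in. A small sketch of that arithmetic, assuming a 4 KB page size and the 256-byte superblock asserted by BUILD_BUG_ON in bitmap_create(); the helper names are hypothetical:

#include <stdio.h>

#define MY_PAGE_SIZE   4096UL                  /* assumption for the example */
#define MY_PAGE_BITS   (MY_PAGE_SIZE * 8)      /* bits per page */
#define SB_BYTES       256UL                   /* sizeof(bitmap_super_t) */

static unsigned long page_index(unsigned long chunk, int internal)
{
    if (internal)
        chunk += SB_BYTES << 3;                /* skip the superblock bits */
    return chunk / MY_PAGE_BITS;
}

static unsigned long page_bit_offset(unsigned long chunk, int internal)
{
    if (internal)
        chunk += SB_BYTES << 3;
    return chunk % MY_PAGE_BITS;
}

int main(void)
{
    /* chunk 0 of an internal bitmap lands at bit 2048 of page 0 */
    printf("%lu %lu\n", page_index(0, 1), page_bit_offset(0, 1));
    return 0;
}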
@@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap) | |||
710 | spin_unlock_irqrestore(&bitmap->lock, flags); | 735 | spin_unlock_irqrestore(&bitmap->lock, flags); |
711 | 736 | ||
712 | while (pages--) | 737 | while (pages--) |
713 | if (map[pages]->index != 0) /* 0 is sb_page, release it below */ | 738 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ |
714 | free_buffers(map[pages]); | 739 | free_buffers(map[pages]); |
715 | kfree(map); | 740 | kfree(map); |
716 | kfree(attr); | 741 | kfree(attr); |
@@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
821 | 846 | ||
822 | page = filemap_get_page(bitmap, chunk); | 847 | page = filemap_get_page(bitmap, chunk); |
823 | if (!page) return; | 848 | if (!page) return; |
824 | bit = file_page_offset(chunk); | 849 | bit = file_page_offset(bitmap, chunk); |
825 | 850 | ||
826 | /* set the bit */ | 851 | /* set the bit */ |
827 | kaddr = kmap_atomic(page, KM_USER0); | 852 | kaddr = kmap_atomic(page, KM_USER0); |
@@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
907 | chunks = bitmap->chunks; | 932 | chunks = bitmap->chunks; |
908 | file = bitmap->file; | 933 | file = bitmap->file; |
909 | 934 | ||
910 | BUG_ON(!file && !bitmap->offset); | 935 | BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); |
911 | 936 | ||
912 | #ifdef INJECT_FAULTS_3 | 937 | #ifdef INJECT_FAULTS_3 |
913 | outofdate = 1; | 938 | outofdate = 1; |
@@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
919 | "recovery\n", bmname(bitmap)); | 944 | "recovery\n", bmname(bitmap)); |
920 | 945 | ||
921 | bytes = (chunks + 7) / 8; | 946 | bytes = (chunks + 7) / 8; |
947 | if (!bitmap->mddev->bitmap_info.external) | ||
948 | bytes += sizeof(bitmap_super_t); | ||
922 | 949 | ||
923 | num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; | 950 | |
951 | num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | ||
924 | 952 | ||
925 | if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { | 953 | if (file && i_size_read(file->f_mapping->host) < bytes) { |
926 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 954 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", |
927 | bmname(bitmap), | 955 | bmname(bitmap), |
928 | (unsigned long) i_size_read(file->f_mapping->host), | 956 | (unsigned long) i_size_read(file->f_mapping->host), |
929 | bytes + sizeof(bitmap_super_t)); | 957 | bytes); |
930 | goto err; | 958 | goto err; |
931 | } | 959 | } |
932 | 960 | ||
@@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
947 | 975 | ||
948 | for (i = 0; i < chunks; i++) { | 976 | for (i = 0; i < chunks; i++) { |
949 | int b; | 977 | int b; |
950 | index = file_page_index(i); | 978 | index = file_page_index(bitmap, i); |
951 | bit = file_page_offset(i); | 979 | bit = file_page_offset(bitmap, i); |
952 | if (index != oldindex) { /* this is a new page, read it in */ | 980 | if (index != oldindex) { /* this is a new page, read it in */ |
953 | int count; | 981 | int count; |
954 | /* unmap the old page, we're done with it */ | 982 | /* unmap the old page, we're done with it */ |
955 | if (index == num_pages-1) | 983 | if (index == num_pages-1) |
956 | count = bytes + sizeof(bitmap_super_t) | 984 | count = bytes - index * PAGE_SIZE; |
957 | - index * PAGE_SIZE; | ||
958 | else | 985 | else |
959 | count = PAGE_SIZE; | 986 | count = PAGE_SIZE; |
960 | if (index == 0) { | 987 | if (index == 0 && bitmap->sb_page) { |
961 | /* | 988 | /* |
962 | * if we're here then the superblock page | 989 | * if we're here then the superblock page |
963 | * contains some bits (PAGE_SIZE != sizeof sb) | 990 | * contains some bits (PAGE_SIZE != sizeof sb) |
@@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
967 | offset = sizeof(bitmap_super_t); | 994 | offset = sizeof(bitmap_super_t); |
968 | if (!file) | 995 | if (!file) |
969 | read_sb_page(bitmap->mddev, | 996 | read_sb_page(bitmap->mddev, |
970 | bitmap->offset, | 997 | bitmap->mddev->bitmap_info.offset, |
971 | page, | 998 | page, |
972 | index, count); | 999 | index, count); |
973 | } else if (file) { | 1000 | } else if (file) { |
974 | page = read_page(file, index, bitmap, count); | 1001 | page = read_page(file, index, bitmap, count); |
975 | offset = 0; | 1002 | offset = 0; |
976 | } else { | 1003 | } else { |
977 | page = read_sb_page(bitmap->mddev, bitmap->offset, | 1004 | page = read_sb_page(bitmap->mddev, |
1005 | bitmap->mddev->bitmap_info.offset, | ||
978 | NULL, | 1006 | NULL, |
979 | index, count); | 1007 | index, count); |
980 | offset = 0; | 1008 | offset = 0; |
@@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | |||
1078 | * out to disk | 1106 | * out to disk |
1079 | */ | 1107 | */ |
1080 | 1108 | ||
1081 | void bitmap_daemon_work(struct bitmap *bitmap) | 1109 | void bitmap_daemon_work(mddev_t *mddev) |
1082 | { | 1110 | { |
1111 | struct bitmap *bitmap; | ||
1083 | unsigned long j; | 1112 | unsigned long j; |
1084 | unsigned long flags; | 1113 | unsigned long flags; |
1085 | struct page *page = NULL, *lastpage = NULL; | 1114 | struct page *page = NULL, *lastpage = NULL; |
1086 | int blocks; | 1115 | int blocks; |
1087 | void *paddr; | 1116 | void *paddr; |
1088 | 1117 | ||
1089 | if (bitmap == NULL) | 1118 | /* Use a mutex to guard daemon_work against |
1119 | * bitmap_destroy. | ||
1120 | */ | ||
1121 | mutex_lock(&mddev->bitmap_info.mutex); | ||
1122 | bitmap = mddev->bitmap; | ||
1123 | if (bitmap == NULL) { | ||
1124 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1090 | return; | 1125 | return; |
1091 | if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) | 1126 | } |
1127 | if (time_before(jiffies, bitmap->daemon_lastrun | ||
1128 | + bitmap->mddev->bitmap_info.daemon_sleep)) | ||
1092 | goto done; | 1129 | goto done; |
1093 | 1130 | ||
1094 | bitmap->daemon_lastrun = jiffies; | 1131 | bitmap->daemon_lastrun = jiffies; |
1095 | if (bitmap->allclean) { | 1132 | if (bitmap->allclean) { |
1096 | bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 1133 | bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; |
1097 | return; | 1134 | goto done; |
1098 | } | 1135 | } |
1099 | bitmap->allclean = 1; | 1136 | bitmap->allclean = 1; |
1100 | 1137 | ||
@@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1142 | /* We are possibly going to clear some bits, so make | 1179 | /* We are possibly going to clear some bits, so make |
1143 | * sure that events_cleared is up-to-date. | 1180 | * sure that events_cleared is up-to-date. |
1144 | */ | 1181 | */ |
1145 | if (bitmap->need_sync) { | 1182 | if (bitmap->need_sync && |
1183 | bitmap->mddev->bitmap_info.external == 0) { | ||
1146 | bitmap_super_t *sb; | 1184 | bitmap_super_t *sb; |
1147 | bitmap->need_sync = 0; | 1185 | bitmap->need_sync = 0; |
1148 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 1186 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
@@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1152 | write_page(bitmap, bitmap->sb_page, 1); | 1190 | write_page(bitmap, bitmap->sb_page, 1); |
1153 | } | 1191 | } |
1154 | spin_lock_irqsave(&bitmap->lock, flags); | 1192 | spin_lock_irqsave(&bitmap->lock, flags); |
1155 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | 1193 | if (!bitmap->need_sync) |
1194 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | ||
1156 | } | 1195 | } |
1157 | bmc = bitmap_get_counter(bitmap, | 1196 | bmc = bitmap_get_counter(bitmap, |
1158 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), | 1197 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), |
@@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1167 | if (*bmc == 2) { | 1206 | if (*bmc == 2) { |
1168 | *bmc=1; /* maybe clear the bit next time */ | 1207 | *bmc=1; /* maybe clear the bit next time */ |
1169 | set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | 1208 | set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); |
1170 | } else if (*bmc == 1) { | 1209 | } else if (*bmc == 1 && !bitmap->need_sync) { |
1171 | /* we can clear the bit */ | 1210 | /* we can clear the bit */ |
1172 | *bmc = 0; | 1211 | *bmc = 0; |
1173 | bitmap_count_page(bitmap, | 1212 | bitmap_count_page(bitmap, |
@@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1177 | /* clear the bit */ | 1216 | /* clear the bit */ |
1178 | paddr = kmap_atomic(page, KM_USER0); | 1217 | paddr = kmap_atomic(page, KM_USER0); |
1179 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1218 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1180 | clear_bit(file_page_offset(j), paddr); | 1219 | clear_bit(file_page_offset(bitmap, j), |
1220 | paddr); | ||
1181 | else | 1221 | else |
1182 | ext2_clear_bit(file_page_offset(j), paddr); | 1222 | ext2_clear_bit(file_page_offset(bitmap, j), |
1223 | paddr); | ||
1183 | kunmap_atomic(paddr, KM_USER0); | 1224 | kunmap_atomic(paddr, KM_USER0); |
1184 | } | 1225 | } |
1185 | } else | 1226 | } else |
@@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1202 | 1243 | ||
1203 | done: | 1244 | done: |
1204 | if (bitmap->allclean == 0) | 1245 | if (bitmap->allclean == 0) |
1205 | bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; | 1246 | bitmap->mddev->thread->timeout = |
1247 | bitmap->mddev->bitmap_info.daemon_sleep; | ||
1248 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1206 | } | 1249 | } |
1207 | 1250 | ||
1208 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1251 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
@@ -1249,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1249 | if (!bitmap) return 0; | 1292 | if (!bitmap) return 0; |
1250 | 1293 | ||
1251 | if (behind) { | 1294 | if (behind) { |
1295 | int bw; | ||
1252 | atomic_inc(&bitmap->behind_writes); | 1296 | atomic_inc(&bitmap->behind_writes); |
1297 | bw = atomic_read(&bitmap->behind_writes); | ||
1298 | if (bw > bitmap->behind_writes_used) | ||
1299 | bitmap->behind_writes_used = bw; | ||
1300 | |||
1253 | PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", | 1301 | PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", |
1254 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); | 1302 | bw, bitmap->max_write_behind); |
1255 | } | 1303 | } |
1256 | 1304 | ||
1257 | while (sectors) { | 1305 | while (sectors) { |
@@ -1308,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1308 | { | 1356 | { |
1309 | if (!bitmap) return; | 1357 | if (!bitmap) return; |
1310 | if (behind) { | 1358 | if (behind) { |
1311 | atomic_dec(&bitmap->behind_writes); | 1359 | if (atomic_dec_and_test(&bitmap->behind_writes)) |
1360 | wake_up(&bitmap->behind_wait); | ||
1312 | PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", | 1361 | PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", |
1313 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); | 1362 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); |
1314 | } | 1363 | } |
@@ -1332,6 +1381,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1332 | bitmap->events_cleared < bitmap->mddev->events) { | 1381 | bitmap->events_cleared < bitmap->mddev->events) { |
1333 | bitmap->events_cleared = bitmap->mddev->events; | 1382 | bitmap->events_cleared = bitmap->mddev->events; |
1334 | bitmap->need_sync = 1; | 1383 | bitmap->need_sync = 1; |
1384 | sysfs_notify_dirent(bitmap->sysfs_can_clear); | ||
1335 | } | 1385 | } |
1336 | 1386 | ||
1337 | if (!success && ! (*bmc & NEEDED_MASK)) | 1387 | if (!success && ! (*bmc & NEEDED_MASK)) |
@@ -1470,7 +1520,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1470 | return; | 1520 | return; |
1471 | } | 1521 | } |
1472 | if (time_before(jiffies, (bitmap->last_end_sync | 1522 | if (time_before(jiffies, (bitmap->last_end_sync |
1473 | + bitmap->daemon_sleep * HZ))) | 1523 | + bitmap->mddev->bitmap_info.daemon_sleep))) |
1474 | return; | 1524 | return; |
1475 | wait_event(bitmap->mddev->recovery_wait, | 1525 | wait_event(bitmap->mddev->recovery_wait, |
1476 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1526 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
@@ -1522,6 +1572,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1522 | sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); | 1572 | sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); |
1523 | bitmap_set_memory_bits(bitmap, sec, 1); | 1573 | bitmap_set_memory_bits(bitmap, sec, 1); |
1524 | bitmap_file_set_bit(bitmap, sec); | 1574 | bitmap_file_set_bit(bitmap, sec); |
1575 | if (sec < bitmap->mddev->recovery_cp) | ||
1576 | /* We are asserting that the array is dirty, | ||
1577 | * so move the recovery_cp address back so | ||
1578 | * that it is obvious that it is dirty | ||
1579 | */ | ||
1580 | bitmap->mddev->recovery_cp = sec; | ||
1525 | } | 1581 | } |
1526 | } | 1582 | } |
1527 | 1583 | ||
@@ -1531,7 +1587,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1531 | void bitmap_flush(mddev_t *mddev) | 1587 | void bitmap_flush(mddev_t *mddev) |
1532 | { | 1588 | { |
1533 | struct bitmap *bitmap = mddev->bitmap; | 1589 | struct bitmap *bitmap = mddev->bitmap; |
1534 | int sleep; | 1590 | long sleep; |
1535 | 1591 | ||
1536 | if (!bitmap) /* there was no bitmap */ | 1592 | if (!bitmap) /* there was no bitmap */ |
1537 | return; | 1593 | return; |
@@ -1539,12 +1595,13 @@ void bitmap_flush(mddev_t *mddev) | |||
1539 | /* run the daemon_work three time to ensure everything is flushed | 1595 | /* run the daemon_work three time to ensure everything is flushed |
1540 | * that can be | 1596 | * that can be |
1541 | */ | 1597 | */ |
1542 | sleep = bitmap->daemon_sleep; | 1598 | sleep = mddev->bitmap_info.daemon_sleep * 2; |
1543 | bitmap->daemon_sleep = 0; | 1599 | bitmap->daemon_lastrun -= sleep; |
1544 | bitmap_daemon_work(bitmap); | 1600 | bitmap_daemon_work(mddev); |
1545 | bitmap_daemon_work(bitmap); | 1601 | bitmap->daemon_lastrun -= sleep; |
1546 | bitmap_daemon_work(bitmap); | 1602 | bitmap_daemon_work(mddev); |
1547 | bitmap->daemon_sleep = sleep; | 1603 | bitmap->daemon_lastrun -= sleep; |
1604 | bitmap_daemon_work(mddev); | ||
1548 | bitmap_update_sb(bitmap); | 1605 | bitmap_update_sb(bitmap); |
1549 | } | 1606 | } |
1550 | 1607 | ||
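bitmap_flush() above now forces three immediate passes of bitmap_daemon_work() by rewinding daemon_lastrun instead of temporarily zeroing the sleep interval, which has moved into the shared mddev->bitmap_info. A brief sketch of why the rewind works, with HZ assumed to be 250 for the example:

#include <stdbool.h>
#include <stdio.h>

/* simplified time_before(): true if a is earlier than b */
static bool time_before(unsigned long a, unsigned long b)
{
    return (long)(a - b) < 0;
}

int main(void)
{
    unsigned long jiffies = 100000, sleep = 5 * 250;  /* 5 s at HZ=250 (assumed) */
    unsigned long lastrun = jiffies;                  /* daemon just ran */

    /* normally bitmap_daemon_work() would skip this pass ... */
    printf("skip: %d\n", time_before(jiffies, lastrun + sleep));   /* 1 */

    /* ... but bitmap_flush() rewinds lastrun, so the work runs now */
    lastrun -= sleep;
    printf("skip: %d\n", time_before(jiffies, lastrun + sleep));   /* 0 */
    return 0;
}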
@@ -1574,6 +1631,7 @@ static void bitmap_free(struct bitmap *bitmap) | |||
1574 | kfree(bp); | 1631 | kfree(bp); |
1575 | kfree(bitmap); | 1632 | kfree(bitmap); |
1576 | } | 1633 | } |
1634 | |||
1577 | void bitmap_destroy(mddev_t *mddev) | 1635 | void bitmap_destroy(mddev_t *mddev) |
1578 | { | 1636 | { |
1579 | struct bitmap *bitmap = mddev->bitmap; | 1637 | struct bitmap *bitmap = mddev->bitmap; |
@@ -1581,10 +1639,15 @@ void bitmap_destroy(mddev_t *mddev) | |||
1581 | if (!bitmap) /* there was no bitmap */ | 1639 | if (!bitmap) /* there was no bitmap */ |
1582 | return; | 1640 | return; |
1583 | 1641 | ||
1642 | mutex_lock(&mddev->bitmap_info.mutex); | ||
1584 | mddev->bitmap = NULL; /* disconnect from the md device */ | 1643 | mddev->bitmap = NULL; /* disconnect from the md device */ |
1644 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1585 | if (mddev->thread) | 1645 | if (mddev->thread) |
1586 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 1646 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; |
1587 | 1647 | ||
1648 | if (bitmap->sysfs_can_clear) | ||
1649 | sysfs_put(bitmap->sysfs_can_clear); | ||
1650 | |||
1588 | bitmap_free(bitmap); | 1651 | bitmap_free(bitmap); |
1589 | } | 1652 | } |
1590 | 1653 | ||
@@ -1598,16 +1661,17 @@ int bitmap_create(mddev_t *mddev) | |||
1598 | sector_t blocks = mddev->resync_max_sectors; | 1661 | sector_t blocks = mddev->resync_max_sectors; |
1599 | unsigned long chunks; | 1662 | unsigned long chunks; |
1600 | unsigned long pages; | 1663 | unsigned long pages; |
1601 | struct file *file = mddev->bitmap_file; | 1664 | struct file *file = mddev->bitmap_info.file; |
1602 | int err; | 1665 | int err; |
1603 | sector_t start; | 1666 | sector_t start; |
1667 | struct sysfs_dirent *bm; | ||
1604 | 1668 | ||
1605 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1669 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1606 | 1670 | ||
1607 | if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ | 1671 | if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ |
1608 | return 0; | 1672 | return 0; |
1609 | 1673 | ||
1610 | BUG_ON(file && mddev->bitmap_offset); | 1674 | BUG_ON(file && mddev->bitmap_info.offset); |
1611 | 1675 | ||
1612 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1676 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1613 | if (!bitmap) | 1677 | if (!bitmap) |
@@ -1617,24 +1681,42 @@ int bitmap_create(mddev_t *mddev) | |||
1617 | atomic_set(&bitmap->pending_writes, 0); | 1681 | atomic_set(&bitmap->pending_writes, 0); |
1618 | init_waitqueue_head(&bitmap->write_wait); | 1682 | init_waitqueue_head(&bitmap->write_wait); |
1619 | init_waitqueue_head(&bitmap->overflow_wait); | 1683 | init_waitqueue_head(&bitmap->overflow_wait); |
1684 | init_waitqueue_head(&bitmap->behind_wait); | ||
1620 | 1685 | ||
1621 | bitmap->mddev = mddev; | 1686 | bitmap->mddev = mddev; |
1622 | 1687 | ||
1688 | bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); | ||
1689 | if (bm) { | ||
1690 | bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); | ||
1691 | sysfs_put(bm); | ||
1692 | } else | ||
1693 | bitmap->sysfs_can_clear = NULL; | ||
1694 | |||
1623 | bitmap->file = file; | 1695 | bitmap->file = file; |
1624 | bitmap->offset = mddev->bitmap_offset; | ||
1625 | if (file) { | 1696 | if (file) { |
1626 | get_file(file); | 1697 | get_file(file); |
1627 | do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX, | 1698 | /* As future accesses to this file will use bmap, |
1628 | SYNC_FILE_RANGE_WAIT_BEFORE | | 1699 | * and bypass the page cache, we must sync the file |
1629 | SYNC_FILE_RANGE_WRITE | | 1700 | * first. |
1630 | SYNC_FILE_RANGE_WAIT_AFTER); | 1701 | */ |
1702 | vfs_fsync(file, 1); | ||
1703 | } | ||
1704 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ | ||
1705 | if (!mddev->bitmap_info.external) | ||
1706 | err = bitmap_read_sb(bitmap); | ||
1707 | else { | ||
1708 | err = 0; | ||
1709 | if (mddev->bitmap_info.chunksize == 0 || | ||
1710 | mddev->bitmap_info.daemon_sleep == 0) | ||
1711 | /* chunksize and time_base need to be | ||
1712 | * set first. */ | ||
1713 | err = -EINVAL; | ||
1631 | } | 1714 | } |
1632 | /* read superblock from bitmap file (this sets bitmap->chunksize) */ | ||
1633 | err = bitmap_read_sb(bitmap); | ||
1634 | if (err) | 1715 | if (err) |
1635 | goto error; | 1716 | goto error; |
1636 | 1717 | ||
1637 | bitmap->chunkshift = ffz(~bitmap->chunksize); | 1718 | bitmap->daemon_lastrun = jiffies; |
1719 | bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); | ||
1638 | 1720 | ||
1639 | /* now that chunksize and chunkshift are set, we can use these macros */ | 1721 | /* now that chunksize and chunkshift are set, we can use these macros */ |
1640 | chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> | 1722 | chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> |
@@ -1676,7 +1758,8 @@ int bitmap_create(mddev_t *mddev) | |||
1676 | 1758 | ||
1677 | mddev->bitmap = bitmap; | 1759 | mddev->bitmap = bitmap; |
1678 | 1760 | ||
1679 | mddev->thread->timeout = bitmap->daemon_sleep * HZ; | 1761 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; |
1762 | md_wakeup_thread(mddev->thread); | ||
1680 | 1763 | ||
1681 | bitmap_update_sb(bitmap); | 1764 | bitmap_update_sb(bitmap); |
1682 | 1765 | ||
@@ -1687,6 +1770,286 @@ int bitmap_create(mddev_t *mddev) | |||
1687 | return err; | 1770 | return err; |
1688 | } | 1771 | } |
1689 | 1772 | ||
1773 | static ssize_t | ||
1774 | location_show(mddev_t *mddev, char *page) | ||
1775 | { | ||
1776 | ssize_t len; | ||
1777 | if (mddev->bitmap_info.file) { | ||
1778 | len = sprintf(page, "file"); | ||
1779 | } else if (mddev->bitmap_info.offset) { | ||
1780 | len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); | ||
1781 | } else | ||
1782 | len = sprintf(page, "none"); | ||
1783 | len += sprintf(page+len, "\n"); | ||
1784 | return len; | ||
1785 | } | ||
1786 | |||
1787 | static ssize_t | ||
1788 | location_store(mddev_t *mddev, const char *buf, size_t len) | ||
1789 | { | ||
1790 | |||
1791 | if (mddev->pers) { | ||
1792 | if (!mddev->pers->quiesce) | ||
1793 | return -EBUSY; | ||
1794 | if (mddev->recovery || mddev->sync_thread) | ||
1795 | return -EBUSY; | ||
1796 | } | ||
1797 | |||
1798 | if (mddev->bitmap || mddev->bitmap_info.file || | ||
1799 | mddev->bitmap_info.offset) { | ||
1800 | /* bitmap already configured. Only option is to clear it */ | ||
1801 | if (strncmp(buf, "none", 4) != 0) | ||
1802 | return -EBUSY; | ||
1803 | if (mddev->pers) { | ||
1804 | mddev->pers->quiesce(mddev, 1); | ||
1805 | bitmap_destroy(mddev); | ||
1806 | mddev->pers->quiesce(mddev, 0); | ||
1807 | } | ||
1808 | mddev->bitmap_info.offset = 0; | ||
1809 | if (mddev->bitmap_info.file) { | ||
1810 | struct file *f = mddev->bitmap_info.file; | ||
1811 | mddev->bitmap_info.file = NULL; | ||
1812 | restore_bitmap_write_access(f); | ||
1813 | fput(f); | ||
1814 | } | ||
1815 | } else { | ||
1816 | /* No bitmap, OK to set a location */ | ||
1817 | long long offset; | ||
1818 | if (strncmp(buf, "none", 4) == 0) | ||
1819 | /* nothing to be done */; | ||
1820 | else if (strncmp(buf, "file:", 5) == 0) { | ||
1821 | /* Not supported yet */ | ||
1822 | return -EINVAL; | ||
1823 | } else { | ||
1824 | int rv; | ||
1825 | if (buf[0] == '+') | ||
1826 | rv = strict_strtoll(buf+1, 10, &offset); | ||
1827 | else | ||
1828 | rv = strict_strtoll(buf, 10, &offset); | ||
1829 | if (rv) | ||
1830 | return rv; | ||
1831 | if (offset == 0) | ||
1832 | return -EINVAL; | ||
1833 | if (mddev->bitmap_info.external == 0 && | ||
1834 | mddev->major_version == 0 && | ||
1835 | offset != mddev->bitmap_info.default_offset) | ||
1836 | return -EINVAL; | ||
1837 | mddev->bitmap_info.offset = offset; | ||
1838 | if (mddev->pers) { | ||
1839 | mddev->pers->quiesce(mddev, 1); | ||
1840 | rv = bitmap_create(mddev); | ||
1841 | if (rv) { | ||
1842 | bitmap_destroy(mddev); | ||
1843 | mddev->bitmap_info.offset = 0; | ||
1844 | } | ||
1845 | mddev->pers->quiesce(mddev, 0); | ||
1846 | if (rv) | ||
1847 | return rv; | ||
1848 | } | ||
1849 | } | ||
1850 | } | ||
1851 | if (!mddev->external) { | ||
1852 | /* Ensure new bitmap info is stored in | ||
1853 | * metadata promptly. | ||
1854 | */ | ||
1855 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1856 | md_wakeup_thread(mddev->thread); | ||
1857 | } | ||
1858 | return len; | ||
1859 | } | ||
1860 | |||
1861 | static struct md_sysfs_entry bitmap_location = | ||
1862 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); | ||
1863 | |||
1864 | static ssize_t | ||
1865 | timeout_show(mddev_t *mddev, char *page) | ||
1866 | { | ||
1867 | ssize_t len; | ||
1868 | unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; | ||
1869 | unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; | ||
1870 | |||
1871 | len = sprintf(page, "%lu", secs); | ||
1872 | if (jifs) | ||
1873 | len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); | ||
1874 | len += sprintf(page+len, "\n"); | ||
1875 | return len; | ||
1876 | } | ||
1877 | |||
1878 | static ssize_t | ||
1879 | timeout_store(mddev_t *mddev, const char *buf, size_t len) | ||
1880 | { | ||
1881 | /* timeout can be set at any time */ | ||
1882 | unsigned long timeout; | ||
1883 | int rv = strict_strtoul_scaled(buf, &timeout, 4); | ||
1884 | if (rv) | ||
1885 | return rv; | ||
1886 | |||
1887 | /* just to make sure we don't overflow... */ | ||
1888 | if (timeout >= LONG_MAX / HZ) | ||
1889 | return -EINVAL; | ||
1890 | |||
1891 | timeout = timeout * HZ / 10000; | ||
1892 | |||
1893 | if (timeout >= MAX_SCHEDULE_TIMEOUT) | ||
1894 | timeout = MAX_SCHEDULE_TIMEOUT-1; | ||
1895 | if (timeout < 1) | ||
1896 | timeout = 1; | ||
1897 | mddev->bitmap_info.daemon_sleep = timeout; | ||
1898 | if (mddev->thread) { | ||
1899 | /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then | ||
1900 | * the bitmap is all clean and we don't need to | ||
1901 | * adjust the timeout right now | ||
1902 | */ | ||
1903 | if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { | ||
1904 | mddev->thread->timeout = timeout; | ||
1905 | md_wakeup_thread(mddev->thread); | ||
1906 | } | ||
1907 | } | ||
1908 | return len; | ||
1909 | } | ||
1910 | |||
1911 | static struct md_sysfs_entry bitmap_timeout = | ||
1912 | __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); | ||
1913 | |||
1914 | static ssize_t | ||
1915 | backlog_show(mddev_t *mddev, char *page) | ||
1916 | { | ||
1917 | return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); | ||
1918 | } | ||
1919 | |||
1920 | static ssize_t | ||
1921 | backlog_store(mddev_t *mddev, const char *buf, size_t len) | ||
1922 | { | ||
1923 | unsigned long backlog; | ||
1924 | int rv = strict_strtoul(buf, 10, &backlog); | ||
1925 | if (rv) | ||
1926 | return rv; | ||
1927 | if (backlog > COUNTER_MAX) | ||
1928 | return -EINVAL; | ||
1929 | mddev->bitmap_info.max_write_behind = backlog; | ||
1930 | return len; | ||
1931 | } | ||
1932 | |||
1933 | static struct md_sysfs_entry bitmap_backlog = | ||
1934 | __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); | ||
1935 | |||
1936 | static ssize_t | ||
1937 | chunksize_show(mddev_t *mddev, char *page) | ||
1938 | { | ||
1939 | return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); | ||
1940 | } | ||
1941 | |||
1942 | static ssize_t | ||
1943 | chunksize_store(mddev_t *mddev, const char *buf, size_t len) | ||
1944 | { | ||
1945 | /* Can only be changed when no bitmap is active */ | ||
1946 | int rv; | ||
1947 | unsigned long csize; | ||
1948 | if (mddev->bitmap) | ||
1949 | return -EBUSY; | ||
1950 | rv = strict_strtoul(buf, 10, &csize); | ||
1951 | if (rv) | ||
1952 | return rv; | ||
1953 | if (csize < 512 || | ||
1954 | !is_power_of_2(csize)) | ||
1955 | return -EINVAL; | ||
1956 | mddev->bitmap_info.chunksize = csize; | ||
1957 | return len; | ||
1958 | } | ||
1959 | |||
1960 | static struct md_sysfs_entry bitmap_chunksize = | ||
1961 | __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); | ||
1962 | |||
1963 | static ssize_t metadata_show(mddev_t *mddev, char *page) | ||
1964 | { | ||
1965 | return sprintf(page, "%s\n", (mddev->bitmap_info.external | ||
1966 | ? "external" : "internal")); | ||
1967 | } | ||
1968 | |||
1969 | static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len) | ||
1970 | { | ||
1971 | if (mddev->bitmap || | ||
1972 | mddev->bitmap_info.file || | ||
1973 | mddev->bitmap_info.offset) | ||
1974 | return -EBUSY; | ||
1975 | if (strncmp(buf, "external", 8) == 0) | ||
1976 | mddev->bitmap_info.external = 1; | ||
1977 | else if (strncmp(buf, "internal", 8) == 0) | ||
1978 | mddev->bitmap_info.external = 0; | ||
1979 | else | ||
1980 | return -EINVAL; | ||
1981 | return len; | ||
1982 | } | ||
1983 | |||
1984 | static struct md_sysfs_entry bitmap_metadata = | ||
1985 | __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); | ||
1986 | |||
1987 | static ssize_t can_clear_show(mddev_t *mddev, char *page) | ||
1988 | { | ||
1989 | int len; | ||
1990 | if (mddev->bitmap) | ||
1991 | len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? | ||
1992 | "false" : "true")); | ||
1993 | else | ||
1994 | len = sprintf(page, "\n"); | ||
1995 | return len; | ||
1996 | } | ||
1997 | |||
1998 | static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) | ||
1999 | { | ||
2000 | if (mddev->bitmap == NULL) | ||
2001 | return -ENOENT; | ||
2002 | if (strncmp(buf, "false", 5) == 0) | ||
2003 | mddev->bitmap->need_sync = 1; | ||
2004 | else if (strncmp(buf, "true", 4) == 0) { | ||
2005 | if (mddev->degraded) | ||
2006 | return -EBUSY; | ||
2007 | mddev->bitmap->need_sync = 0; | ||
2008 | } else | ||
2009 | return -EINVAL; | ||
2010 | return len; | ||
2011 | } | ||
2012 | |||
2013 | static struct md_sysfs_entry bitmap_can_clear = | ||
2014 | __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); | ||
2015 | |||
2016 | static ssize_t | ||
2017 | behind_writes_used_show(mddev_t *mddev, char *page) | ||
2018 | { | ||
2019 | if (mddev->bitmap == NULL) | ||
2020 | return sprintf(page, "0\n"); | ||
2021 | return sprintf(page, "%lu\n", | ||
2022 | mddev->bitmap->behind_writes_used); | ||
2023 | } | ||
2024 | |||
2025 | static ssize_t | ||
2026 | behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) | ||
2027 | { | ||
2028 | if (mddev->bitmap) | ||
2029 | mddev->bitmap->behind_writes_used = 0; | ||
2030 | return len; | ||
2031 | } | ||
2032 | |||
2033 | static struct md_sysfs_entry max_backlog_used = | ||
2034 | __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, | ||
2035 | behind_writes_used_show, behind_writes_used_reset); | ||
2036 | |||
2037 | static struct attribute *md_bitmap_attrs[] = { | ||
2038 | &bitmap_location.attr, | ||
2039 | &bitmap_timeout.attr, | ||
2040 | &bitmap_backlog.attr, | ||
2041 | &bitmap_chunksize.attr, | ||
2042 | &bitmap_metadata.attr, | ||
2043 | &bitmap_can_clear.attr, | ||
2044 | &max_backlog_used.attr, | ||
2045 | NULL | ||
2046 | }; | ||
2047 | struct attribute_group md_bitmap_group = { | ||
2048 | .name = "bitmap", | ||
2049 | .attrs = md_bitmap_attrs, | ||
2050 | }; | ||
2051 | |||
2052 | |||
1690 | /* the bitmap API -- for raid personalities */ | 2053 | /* the bitmap API -- for raid personalities */ |
1691 | EXPORT_SYMBOL(bitmap_startwrite); | 2054 | EXPORT_SYMBOL(bitmap_startwrite); |
1692 | EXPORT_SYMBOL(bitmap_endwrite); | 2055 | EXPORT_SYMBOL(bitmap_endwrite); |
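The sysfs entries registered in md_bitmap_group above appear under the md device's "bitmap" directory and let userspace configure an externally managed bitmap: metadata, chunksize and time_base must be set before location creates the bitmap, while backlog and time_base can also be changed afterwards (time_base accepts fractional seconds, which timeout_store() scales by 10^4 before converting to jiffies). A hypothetical userspace sketch; the device name md0, the exact sysfs path and the values written are assumptions for illustration:

#include <stdio.h>

static int write_attr(const char *attr, const char *val)
{
    char path[256];
    FILE *f;

    /* "bitmap" matches the attribute_group name registered above */
    snprintf(path, sizeof(path), "/sys/block/md0/md/bitmap/%s", attr);
    f = fopen(path, "w");
    if (!f)
        return -1;
    fprintf(f, "%s\n", val);
    return fclose(f);
}

int main(void)
{
    write_attr("metadata", "external"); /* bitmap superblock managed outside md */
    write_attr("chunksize", "65536");   /* bytes per chunk, power of 2, >= 512 */
    write_attr("time_base", "5");       /* daemon sleep in seconds (fractions allowed) */
    write_attr("backlog", "256");       /* max outstanding write-behind requests */
    write_attr("location", "+8");       /* sectors past the superblock; creates the bitmap */
    return 0;
}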
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index e98900671ca9..3797dea4723a 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t; | |||
106 | #define BITMAP_BLOCK_SHIFT 9 | 106 | #define BITMAP_BLOCK_SHIFT 9 |
107 | 107 | ||
108 | /* how many blocks per chunk? (this is variable) */ | 108 | /* how many blocks per chunk? (this is variable) */ |
109 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) | 109 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) |
110 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) | 110 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) |
111 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) | 111 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) |
112 | 112 | ||
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t; | |||
118 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) | 118 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) |
119 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) | 119 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) |
120 | 120 | ||
121 | /* | ||
122 | * on-disk bitmap: | ||
123 | * | ||
124 | * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap | ||
125 | * file a page at a time. There's a superblock at the start of the file. | ||
126 | */ | ||
127 | |||
128 | /* map chunks (bits) to file pages - offset by the size of the superblock */ | ||
129 | #define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) | ||
130 | |||
131 | #endif | 121 | #endif |
132 | 122 | ||
133 | /* | 123 | /* |
@@ -209,7 +199,6 @@ struct bitmap { | |||
209 | int counter_bits; /* how many bits per block counter */ | 199 | int counter_bits; /* how many bits per block counter */ |
210 | 200 | ||
211 | /* bitmap chunksize -- how much data does each bit represent? */ | 201 | /* bitmap chunksize -- how much data does each bit represent? */ |
212 | unsigned long chunksize; | ||
213 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | 202 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ |
214 | unsigned long chunks; /* total number of data chunks for the array */ | 203 | unsigned long chunks; /* total number of data chunks for the array */ |
215 | 204 | ||
@@ -226,7 +215,6 @@ struct bitmap { | |||
226 | /* bitmap spinlock */ | 215 | /* bitmap spinlock */ |
227 | spinlock_t lock; | 216 | spinlock_t lock; |
228 | 217 | ||
229 | long offset; /* offset from superblock if file is NULL */ | ||
230 | struct file *file; /* backing disk file */ | 218 | struct file *file; /* backing disk file */ |
231 | struct page *sb_page; /* cached copy of the bitmap file superblock */ | 219 | struct page *sb_page; /* cached copy of the bitmap file superblock */ |
232 | struct page **filemap; /* list of cache pages for the file */ | 220 | struct page **filemap; /* list of cache pages for the file */ |
@@ -238,22 +226,23 @@ struct bitmap { | |||
238 | 226 | ||
239 | int allclean; | 227 | int allclean; |
240 | 228 | ||
241 | unsigned long max_write_behind; /* write-behind mode */ | ||
242 | atomic_t behind_writes; | 229 | atomic_t behind_writes; |
230 | unsigned long behind_writes_used; /* highest actual value at runtime */ | ||
243 | 231 | ||
244 | /* | 232 | /* |
245 | * the bitmap daemon - periodically wakes up and sweeps the bitmap | 233 | * the bitmap daemon - periodically wakes up and sweeps the bitmap |
246 | * file, cleaning up bits and flushing out pages to disk as necessary | 234 | * file, cleaning up bits and flushing out pages to disk as necessary |
247 | */ | 235 | */ |
248 | unsigned long daemon_lastrun; /* jiffies of last run */ | 236 | unsigned long daemon_lastrun; /* jiffies of last run */ |
249 | unsigned long daemon_sleep; /* how many seconds between updates? */ | ||
250 | unsigned long last_end_sync; /* when we lasted called end_sync to | 237 | unsigned long last_end_sync; /* when we lasted called end_sync to |
251 | * update bitmap with resync progress */ | 238 | * update bitmap with resync progress */ |
252 | 239 | ||
253 | atomic_t pending_writes; /* pending writes to the bitmap file */ | 240 | atomic_t pending_writes; /* pending writes to the bitmap file */ |
254 | wait_queue_head_t write_wait; | 241 | wait_queue_head_t write_wait; |
255 | wait_queue_head_t overflow_wait; | 242 | wait_queue_head_t overflow_wait; |
243 | wait_queue_head_t behind_wait; | ||
256 | 244 | ||
245 | struct sysfs_dirent *sysfs_can_clear; | ||
257 | }; | 246 | }; |
258 | 247 | ||
259 | /* the bitmap API */ | 248 | /* the bitmap API */ |
@@ -282,7 +271,7 @@ void bitmap_close_sync(struct bitmap *bitmap); | |||
282 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | 271 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); |
283 | 272 | ||
284 | void bitmap_unplug(struct bitmap *bitmap); | 273 | void bitmap_unplug(struct bitmap *bitmap); |
285 | void bitmap_daemon_work(struct bitmap *bitmap); | 274 | void bitmap_daemon_work(mddev_t *mddev); |
286 | #endif | 275 | #endif |
287 | 276 | ||
288 | #endif | 277 | #endif |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ed1038164019..3bdbb6115702 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> |
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
4 | * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. |
5 | * | 5 | * |
6 | * This file is released under the GPL. | 6 | * This file is released under the GPL. |
7 | */ | 7 | */ |
@@ -71,10 +71,21 @@ struct crypt_iv_operations { | |||
71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, | 71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, |
72 | const char *opts); | 72 | const char *opts); |
73 | void (*dtr)(struct crypt_config *cc); | 73 | void (*dtr)(struct crypt_config *cc); |
74 | const char *(*status)(struct crypt_config *cc); | 74 | int (*init)(struct crypt_config *cc); |
75 | int (*wipe)(struct crypt_config *cc); | ||
75 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | 76 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); |
76 | }; | 77 | }; |
77 | 78 | ||
79 | struct iv_essiv_private { | ||
80 | struct crypto_cipher *tfm; | ||
81 | struct crypto_hash *hash_tfm; | ||
82 | u8 *salt; | ||
83 | }; | ||
84 | |||
85 | struct iv_benbi_private { | ||
86 | int shift; | ||
87 | }; | ||
88 | |||
78 | /* | 89 | /* |
79 | * Crypt: maps a linear range of a block device | 90 | * Crypt: maps a linear range of a block device |
80 | * and encrypts / decrypts at the same time. | 91 | * and encrypts / decrypts at the same time. |
@@ -102,8 +113,8 @@ struct crypt_config { | |||
102 | struct crypt_iv_operations *iv_gen_ops; | 113 | struct crypt_iv_operations *iv_gen_ops; |
103 | char *iv_mode; | 114 | char *iv_mode; |
104 | union { | 115 | union { |
105 | struct crypto_cipher *essiv_tfm; | 116 | struct iv_essiv_private essiv; |
106 | int benbi_shift; | 117 | struct iv_benbi_private benbi; |
107 | } iv_gen_private; | 118 | } iv_gen_private; |
108 | sector_t iv_offset; | 119 | sector_t iv_offset; |
109 | unsigned int iv_size; | 120 | unsigned int iv_size; |
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); | |||
147 | * plain: the initial vector is the 32-bit little-endian version of the sector | 158 | * plain: the initial vector is the 32-bit little-endian version of the sector |
148 | * number, padded with zeros if necessary. | 159 | * number, padded with zeros if necessary. |
149 | * | 160 | * |
161 | * plain64: the initial vector is the 64-bit little-endian version of the sector | ||
162 | * number, padded with zeros if necessary. | ||
163 | * | ||
150 | * essiv: "encrypted sector|salt initial vector", the sector number is | 164 | * essiv: "encrypted sector|salt initial vector", the sector number is |
151 | * encrypted with the bulk cipher using a salt as key. The salt | 165 | * encrypted with the bulk cipher using a salt as key. The salt |
152 | * should be derived from the bulk cipher's key via hashing. | 166 | * should be derived from the bulk cipher's key via hashing. |
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | |||
169 | return 0; | 183 | return 0; |
170 | } | 184 | } |
171 | 185 | ||
172 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 186 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
173 | const char *opts) | 187 | sector_t sector) |
174 | { | 188 | { |
175 | struct crypto_cipher *essiv_tfm; | 189 | memset(iv, 0, cc->iv_size); |
176 | struct crypto_hash *hash_tfm; | 190 | *(u64 *)iv = cpu_to_le64(sector); |
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* Initialise ESSIV - compute salt but no local memory allocations */ | ||
196 | static int crypt_iv_essiv_init(struct crypt_config *cc) | ||
197 | { | ||
198 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
177 | struct hash_desc desc; | 199 | struct hash_desc desc; |
178 | struct scatterlist sg; | 200 | struct scatterlist sg; |
179 | unsigned int saltsize; | ||
180 | u8 *salt; | ||
181 | int err; | 201 | int err; |
182 | 202 | ||
183 | if (opts == NULL) { | 203 | sg_init_one(&sg, cc->key, cc->key_size); |
204 | desc.tfm = essiv->hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | |||
207 | err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); | ||
208 | if (err) | ||
209 | return err; | ||
210 | |||
211 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, | ||
212 | crypto_hash_digestsize(essiv->hash_tfm)); | ||
213 | } | ||
214 | |||
215 | /* Wipe salt and reset key derived from volume key */ | ||
216 | static int crypt_iv_essiv_wipe(struct crypt_config *cc) | ||
217 | { | ||
218 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
219 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | ||
220 | |||
221 | memset(essiv->salt, 0, salt_size); | ||
222 | |||
223 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); | ||
224 | } | ||
225 | |||
226 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
227 | { | ||
228 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
229 | |||
230 | crypto_free_cipher(essiv->tfm); | ||
231 | essiv->tfm = NULL; | ||
232 | |||
233 | crypto_free_hash(essiv->hash_tfm); | ||
234 | essiv->hash_tfm = NULL; | ||
235 | |||
236 | kzfree(essiv->salt); | ||
237 | essiv->salt = NULL; | ||
238 | } | ||
239 | |||
240 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
241 | const char *opts) | ||
242 | { | ||
243 | struct crypto_cipher *essiv_tfm = NULL; | ||
244 | struct crypto_hash *hash_tfm = NULL; | ||
245 | u8 *salt = NULL; | ||
246 | int err; | ||
247 | |||
248 | if (!opts) { | ||
184 | ti->error = "Digest algorithm missing for ESSIV mode"; | 249 | ti->error = "Digest algorithm missing for ESSIV mode"; |
185 | return -EINVAL; | 250 | return -EINVAL; |
186 | } | 251 | } |
187 | 252 | ||
188 | /* Hash the cipher key with the given hash algorithm */ | 253 | /* Allocate hash algorithm */ |
189 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); | 254 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); |
190 | if (IS_ERR(hash_tfm)) { | 255 | if (IS_ERR(hash_tfm)) { |
191 | ti->error = "Error initializing ESSIV hash"; | 256 | ti->error = "Error initializing ESSIV hash"; |
192 | return PTR_ERR(hash_tfm); | 257 | err = PTR_ERR(hash_tfm); |
258 | goto bad; | ||
193 | } | 259 | } |
194 | 260 | ||
195 | saltsize = crypto_hash_digestsize(hash_tfm); | 261 | salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); |
196 | salt = kmalloc(saltsize, GFP_KERNEL); | 262 | if (!salt) { |
197 | if (salt == NULL) { | ||
198 | ti->error = "Error kmallocing salt storage in ESSIV"; | 263 | ti->error = "Error kmallocing salt storage in ESSIV"; |
199 | crypto_free_hash(hash_tfm); | 264 | err = -ENOMEM; |
200 | return -ENOMEM; | 265 | goto bad; |
201 | } | 266 | } |
202 | 267 | ||
203 | sg_init_one(&sg, cc->key, cc->key_size); | 268 | /* Allocate essiv_tfm */ |
204 | desc.tfm = hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); | ||
207 | crypto_free_hash(hash_tfm); | ||
208 | |||
209 | if (err) { | ||
210 | ti->error = "Error calculating hash in ESSIV"; | ||
211 | kfree(salt); | ||
212 | return err; | ||
213 | } | ||
214 | |||
215 | /* Setup the essiv_tfm with the given salt */ | ||
216 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | 269 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); |
217 | if (IS_ERR(essiv_tfm)) { | 270 | if (IS_ERR(essiv_tfm)) { |
218 | ti->error = "Error allocating crypto tfm for ESSIV"; | 271 | ti->error = "Error allocating crypto tfm for ESSIV"; |
219 | kfree(salt); | 272 | err = PTR_ERR(essiv_tfm); |
220 | return PTR_ERR(essiv_tfm); | 273 | goto bad; |
221 | } | 274 | } |
222 | if (crypto_cipher_blocksize(essiv_tfm) != | 275 | if (crypto_cipher_blocksize(essiv_tfm) != |
223 | crypto_ablkcipher_ivsize(cc->tfm)) { | 276 | crypto_ablkcipher_ivsize(cc->tfm)) { |
224 | ti->error = "Block size of ESSIV cipher does " | 277 | ti->error = "Block size of ESSIV cipher does " |
225 | "not match IV size of block cipher"; | 278 | "not match IV size of block cipher"; |
226 | crypto_free_cipher(essiv_tfm); | 279 | err = -EINVAL; |
227 | kfree(salt); | 280 | goto bad; |
228 | return -EINVAL; | ||
229 | } | 281 | } |
230 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
231 | if (err) { | ||
232 | ti->error = "Failed to set key for ESSIV cipher"; | ||
233 | crypto_free_cipher(essiv_tfm); | ||
234 | kfree(salt); | ||
235 | return err; | ||
236 | } | ||
237 | kfree(salt); | ||
238 | 282 | ||
239 | cc->iv_gen_private.essiv_tfm = essiv_tfm; | 283 | cc->iv_gen_private.essiv.salt = salt; |
284 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
285 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | ||
286 | |||
240 | return 0; | 287 | return 0; |
241 | } | ||
242 | 288 | ||
243 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 289 | bad: |
244 | { | 290 | if (essiv_tfm && !IS_ERR(essiv_tfm)) |
245 | crypto_free_cipher(cc->iv_gen_private.essiv_tfm); | 291 | crypto_free_cipher(essiv_tfm); |
246 | cc->iv_gen_private.essiv_tfm = NULL; | 292 | if (hash_tfm && !IS_ERR(hash_tfm)) |
293 | crypto_free_hash(hash_tfm); | ||
294 | kfree(salt); | ||
295 | return err; | ||
247 | } | 296 | } |
248 | 297 | ||
249 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 298 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
250 | { | 299 | { |
251 | memset(iv, 0, cc->iv_size); | 300 | memset(iv, 0, cc->iv_size); |
252 | *(u64 *)iv = cpu_to_le64(sector); | 301 | *(u64 *)iv = cpu_to_le64(sector); |
253 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); | 302 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); |
254 | return 0; | 303 | return 0; |
255 | } | 304 | } |
256 | 305 | ||
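Editor's note: the hunk above splits ESSIV handling into a constructor that only allocates (hash tfm, cipher tfm, salt buffer) and an init hook that derives the salt, so the salt can be recomputed after a "key set" message and cleared by the new wipe hook. The underlying scheme is unchanged: salt = H(volume_key), and the per-sector IV is the little-endian sector number encrypted under a cipher keyed with that salt. Below is a minimal userspace sketch of that data flow; the "hash" and "block cipher" are trivial placeholders standing in for the kernel crypto API, not the real algorithms.

```c
/* Hedged sketch of ESSIV: salt = H(volume_key), iv = E_salt(le64(sector)).
 * toy_hash()/toy_encrypt() are placeholders, NOT real crypto primitives. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define IV_SIZE 16

/* Placeholder digest: folds the key into a 16-byte salt. */
static void toy_hash(const uint8_t *key, size_t len, uint8_t salt[IV_SIZE])
{
	memset(salt, 0, IV_SIZE);
	for (size_t i = 0; i < len; i++)
		salt[i % IV_SIZE] ^= (uint8_t)(key[i] + i);
}

/* Placeholder "block cipher": in dm-crypt this is
 * crypto_cipher_encrypt_one() on the salt-keyed ESSIV tfm. */
static void toy_encrypt(const uint8_t salt[IV_SIZE], uint8_t block[IV_SIZE])
{
	for (int i = 0; i < IV_SIZE; i++)
		block[i] ^= salt[i];
}

static void essiv_gen(const uint8_t salt[IV_SIZE], uint64_t sector,
		      uint8_t iv[IV_SIZE])
{
	memset(iv, 0, IV_SIZE);
	/* Sector number stored little-endian in the first 8 bytes. */
	for (int i = 0; i < 8; i++)
		iv[i] = (uint8_t)(sector >> (8 * i));
	toy_encrypt(salt, iv);
}

int main(void)
{
	const uint8_t volume_key[32] = "0123456789abcdef0123456789abcde";
	uint8_t salt[IV_SIZE], iv[IV_SIZE];

	toy_hash(volume_key, sizeof(volume_key), salt);	/* models ->init() */
	essiv_gen(salt, 42, iv);			/* models ->generator() */

	for (int i = 0; i < IV_SIZE; i++)
		printf("%02x", iv[i]);
	printf("\n");
	return 0;
}
```

The wipe hook in the patch simply zeroes the salt buffer and re-keys the ESSIV cipher with those zeros, so no key-derived material is left behind.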
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
273 | return -EINVAL; | 322 | return -EINVAL; |
274 | } | 323 | } |
275 | 324 | ||
276 | cc->iv_gen_private.benbi_shift = 9 - log; | 325 | cc->iv_gen_private.benbi.shift = 9 - log; |
277 | 326 | ||
278 | return 0; | 327 | return 0; |
279 | } | 328 | } |
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | |||
288 | 337 | ||
289 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 338 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
290 | 339 | ||
291 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); | 340 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); |
292 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 341 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
293 | 342 | ||
294 | return 0; | 343 | return 0; |
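Editor's note: the benbi changes above are only a rename into the new per-generator union, but the arithmetic is worth spelling out. The constructor stores shift = 9 - log2(cipher block size), and the generator writes ((sector << shift) + 1) big-endian into the last 8 bytes of the IV, with the rest zeroed, i.e. a 1-based count of narrow blocks. A plain-C sketch under those assumptions:

```c
/* Hedged sketch of the benbi IV: ((sector << shift) + 1) stored
 * big-endian in the last 8 bytes, remaining IV bytes zeroed. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void put_be64(uint8_t *p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		p[i] = (uint8_t)(v >> (8 * (7 - i)));
}

static void benbi_gen(uint8_t *iv, unsigned iv_size, uint64_t sector,
		      unsigned shift)
{
	uint64_t val = (sector << shift) + 1;	/* blocks numbered from 1 */

	memset(iv, 0, iv_size - 8);
	put_be64(iv + iv_size - 8, val);
}

int main(void)
{
	uint8_t iv[16];

	/* Example: 64-byte wide blocks, log2(64) = 6, shift = 9 - 6 = 3. */
	benbi_gen(iv, sizeof(iv), 42, 3);

	for (unsigned i = 0; i < sizeof(iv); i++)
		printf("%02x", iv[i]);
	printf("\n");
	return 0;
}
```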
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = { | |||
305 | .generator = crypt_iv_plain_gen | 354 | .generator = crypt_iv_plain_gen |
306 | }; | 355 | }; |
307 | 356 | ||
357 | static struct crypt_iv_operations crypt_iv_plain64_ops = { | ||
358 | .generator = crypt_iv_plain64_gen | ||
359 | }; | ||
360 | |||
308 | static struct crypt_iv_operations crypt_iv_essiv_ops = { | 361 | static struct crypt_iv_operations crypt_iv_essiv_ops = { |
309 | .ctr = crypt_iv_essiv_ctr, | 362 | .ctr = crypt_iv_essiv_ctr, |
310 | .dtr = crypt_iv_essiv_dtr, | 363 | .dtr = crypt_iv_essiv_dtr, |
364 | .init = crypt_iv_essiv_init, | ||
365 | .wipe = crypt_iv_essiv_wipe, | ||
311 | .generator = crypt_iv_essiv_gen | 366 | .generator = crypt_iv_essiv_gen |
312 | }; | 367 | }; |
313 | 368 | ||
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key) | |||
934 | 989 | ||
935 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 990 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
936 | 991 | ||
937 | return 0; | 992 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
938 | } | 993 | } |
939 | 994 | ||
940 | static int crypt_wipe_key(struct crypt_config *cc) | 995 | static int crypt_wipe_key(struct crypt_config *cc) |
941 | { | 996 | { |
942 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 997 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
943 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 998 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
944 | return 0; | 999 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
945 | } | 1000 | } |
946 | 1001 | ||
947 | /* | 1002 | /* |
@@ -983,12 +1038,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
983 | return -ENOMEM; | 1038 | return -ENOMEM; |
984 | } | 1039 | } |
985 | 1040 | ||
986 | if (crypt_set_key(cc, argv[1])) { | 1041 | /* Compatibility mode for old dm-crypt cipher strings */ |
987 | ti->error = "Error decoding key"; | ||
988 | goto bad_cipher; | ||
989 | } | ||
990 | |||
991 | /* Compatiblity mode for old dm-crypt cipher strings */ | ||
992 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { | 1042 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { |
993 | chainmode = "cbc"; | 1043 | chainmode = "cbc"; |
994 | ivmode = "plain"; | 1044 | ivmode = "plain"; |
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1015 | strcpy(cc->chainmode, chainmode); | 1065 | strcpy(cc->chainmode, chainmode); |
1016 | cc->tfm = tfm; | 1066 | cc->tfm = tfm; |
1017 | 1067 | ||
1068 | if (crypt_set_key(cc, argv[1]) < 0) { | ||
1069 | ti->error = "Error decoding and setting key"; | ||
1070 | goto bad_ivmode; | ||
1071 | } | ||
1072 | |||
1018 | /* | 1073 | /* |
1019 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". | 1074 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". |
1020 | * See comments at iv code | 1075 | * See comments at iv code |
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1024 | cc->iv_gen_ops = NULL; | 1079 | cc->iv_gen_ops = NULL; |
1025 | else if (strcmp(ivmode, "plain") == 0) | 1080 | else if (strcmp(ivmode, "plain") == 0) |
1026 | cc->iv_gen_ops = &crypt_iv_plain_ops; | 1081 | cc->iv_gen_ops = &crypt_iv_plain_ops; |
1082 | else if (strcmp(ivmode, "plain64") == 0) | ||
1083 | cc->iv_gen_ops = &crypt_iv_plain64_ops; | ||
1027 | else if (strcmp(ivmode, "essiv") == 0) | 1084 | else if (strcmp(ivmode, "essiv") == 0) |
1028 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | 1085 | cc->iv_gen_ops = &crypt_iv_essiv_ops; |
1029 | else if (strcmp(ivmode, "benbi") == 0) | 1086 | else if (strcmp(ivmode, "benbi") == 0) |
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1039 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | 1096 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) |
1040 | goto bad_ivmode; | 1097 | goto bad_ivmode; |
1041 | 1098 | ||
1099 | if (cc->iv_gen_ops && cc->iv_gen_ops->init && | ||
1100 | cc->iv_gen_ops->init(cc) < 0) { | ||
1101 | ti->error = "Error initialising IV"; | ||
1102 | goto bad_slab_pool; | ||
1103 | } | ||
1104 | |||
1042 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); | 1105 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); |
1043 | if (cc->iv_size) | 1106 | if (cc->iv_size) |
1044 | /* at least a 64 bit sector number should fit in our buffer */ | 1107 | /* at least a 64 bit sector number should fit in our buffer */ |
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1085 | goto bad_bs; | 1148 | goto bad_bs; |
1086 | } | 1149 | } |
1087 | 1150 | ||
1088 | if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { | ||
1089 | ti->error = "Error setting key"; | ||
1090 | goto bad_device; | ||
1091 | } | ||
1092 | |||
1093 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { | 1151 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { |
1094 | ti->error = "Invalid iv_offset sector"; | 1152 | ti->error = "Invalid iv_offset sector"; |
1095 | goto bad_device; | 1153 | goto bad_device; |
@@ -1102,8 +1160,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1102 | } | 1160 | } |
1103 | cc->start = tmpll; | 1161 | cc->start = tmpll; |
1104 | 1162 | ||
1105 | if (dm_get_device(ti, argv[3], cc->start, ti->len, | 1163 | if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { |
1106 | dm_table_get_mode(ti->table), &cc->dev)) { | ||
1107 | ti->error = "Device lookup failed"; | 1164 | ti->error = "Device lookup failed"; |
1108 | goto bad_device; | 1165 | goto bad_device; |
1109 | } | 1166 | } |
@@ -1278,6 +1335,7 @@ static void crypt_resume(struct dm_target *ti) | |||
1278 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | 1335 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) |
1279 | { | 1336 | { |
1280 | struct crypt_config *cc = ti->private; | 1337 | struct crypt_config *cc = ti->private; |
1338 | int ret = -EINVAL; | ||
1281 | 1339 | ||
1282 | if (argc < 2) | 1340 | if (argc < 2) |
1283 | goto error; | 1341 | goto error; |
@@ -1287,10 +1345,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1287 | DMWARN("not suspended during key manipulation."); | 1345 | DMWARN("not suspended during key manipulation."); |
1288 | return -EINVAL; | 1346 | return -EINVAL; |
1289 | } | 1347 | } |
1290 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) | 1348 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { |
1291 | return crypt_set_key(cc, argv[2]); | 1349 | ret = crypt_set_key(cc, argv[2]); |
1292 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) | 1350 | if (ret) |
1351 | return ret; | ||
1352 | if (cc->iv_gen_ops && cc->iv_gen_ops->init) | ||
1353 | ret = cc->iv_gen_ops->init(cc); | ||
1354 | return ret; | ||
1355 | } | ||
1356 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { | ||
1357 | if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { | ||
1358 | ret = cc->iv_gen_ops->wipe(cc); | ||
1359 | if (ret) | ||
1360 | return ret; | ||
1361 | } | ||
1293 | return crypt_wipe_key(cc); | 1362 | return crypt_wipe_key(cc); |
1363 | } | ||
1294 | } | 1364 | } |
1295 | 1365 | ||
1296 | error: | 1366 | error: |
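Editor's note: with init/wipe hooks on the IV generator, the message handler above has to sequence two steps. Setting a key must be followed by re-deriving key-dependent IV state (the ESSIV salt); wiping must clear that state before the key itself is zeroed. A stripped-down sketch of that ordering, using stub types rather than the real dm-crypt structures:

```c
/* Hedged sketch of the key-message ordering:
 * "set"  -> store key, then iv->init()
 * "wipe" -> iv->wipe(), then zero the key. */
#include <stdio.h>
#include <string.h>

struct iv_ops {
	int (*init)(void *ctx);	/* re-derive key-dependent IV state */
	int (*wipe)(void *ctx);	/* clear that state */
};

struct crypt_ctx {
	unsigned char key[32];
	const struct iv_ops *iv;
};

static int handle_key_message(struct crypt_ctx *cc, const char *cmd,
			      const unsigned char *new_key)
{
	int ret;

	if (!strcmp(cmd, "set")) {
		memcpy(cc->key, new_key, sizeof(cc->key));
		if (cc->iv && cc->iv->init) {
			ret = cc->iv->init(cc);	/* e.g. recompute ESSIV salt */
			if (ret)
				return ret;
		}
		return 0;
	}
	if (!strcmp(cmd, "wipe")) {
		if (cc->iv && cc->iv->wipe) {
			ret = cc->iv->wipe(cc);	/* clear salt first */
			if (ret)
				return ret;
		}
		memset(cc->key, 0, sizeof(cc->key));
		return 0;
	}
	return -1;
}

static int noop(void *ctx) { (void)ctx; return 0; }

int main(void)
{
	static const struct iv_ops ops = { .init = noop, .wipe = noop };
	struct crypt_ctx cc = { .iv = &ops };
	unsigned char k[32] = { 1, 2, 3 };

	printf("set: %d\n", handle_key_message(&cc, "set", k));
	printf("wipe: %d\n", handle_key_message(&cc, "wipe", NULL));
	return 0;
}
```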
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ebe7381f47c8..852052880d7a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -156,8 +156,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
156 | goto bad; | 156 | goto bad; |
157 | } | 157 | } |
158 | 158 | ||
159 | if (dm_get_device(ti, argv[0], dc->start_read, ti->len, | 159 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
160 | dm_table_get_mode(ti->table), &dc->dev_read)) { | 160 | &dc->dev_read)) { |
161 | ti->error = "Device lookup failed"; | 161 | ti->error = "Device lookup failed"; |
162 | goto bad; | 162 | goto bad; |
163 | } | 163 | } |
@@ -177,8 +177,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
177 | goto bad_dev_read; | 177 | goto bad_dev_read; |
178 | } | 178 | } |
179 | 179 | ||
180 | if (dm_get_device(ti, argv[3], dc->start_write, ti->len, | 180 | if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), |
181 | dm_table_get_mode(ti->table), &dc->dev_write)) { | 181 | &dc->dev_write)) { |
182 | ti->error = "Write device lookup failed"; | 182 | ti->error = "Write device lookup failed"; |
183 | goto bad_dev_read; | 183 | goto bad_dev_read; |
184 | } | 184 | } |
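Editor's note: dm-crypt, dm-delay and dm-linear all switch to the shorter dm_get_device() call here, passing only the path and the requested mode; the start/len pair is dropped, presumably because range validation now lives in the table layer. A hedged sketch of the resulting constructor shape, with stand-in types that are purely illustrative:

```c
/* Hedged sketch of the new target-constructor pattern:
 * get_device(ti, path, mode, &dev) with no start/len arguments. */
#include <stdio.h>
#include <stdlib.h>

struct stub_dev { const char *path; };
struct stub_target { const char *error; };

/* Stand-in for dm_get_device(ti, path, mode, &dev). */
static int stub_get_device(struct stub_target *ti, const char *path,
			   int mode, struct stub_dev **dev)
{
	(void)ti; (void)mode;
	*dev = malloc(sizeof(**dev));
	if (!*dev)
		return -1;
	(*dev)->path = path;
	return 0;
}

static int linear_ctr_sketch(struct stub_target *ti, int argc, char **argv)
{
	struct stub_dev *dev;
	unsigned long long start;
	char *end;

	if (argc != 2) {
		ti->error = "Invalid argument count";
		return -1;
	}
	start = strtoull(argv[1], &end, 10);
	if (*end != '\0') {
		ti->error = "Invalid device sector";
		return -1;
	}
	/* No start/len any more: only path and mode are passed. */
	if (stub_get_device(ti, argv[0], 0, &dev)) {
		ti->error = "Device lookup failed";
		return -1;
	}
	printf("mapped %s starting at sector %llu\n", dev->path, start);
	free(dev);
	return 0;
}

int main(void)
{
	char dev[] = "/dev/sdb1", off[] = "2048";
	char *argv[] = { dev, off };
	struct stub_target ti = { 0 };

	return linear_ctr_sketch(&ti, 2, argv);
}
```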
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 556acff3952f..2b7907b6dd09 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -138,16 +138,6 @@ int dm_exception_store_type_unregister(struct dm_exception_store_type *type) | |||
138 | } | 138 | } |
139 | EXPORT_SYMBOL(dm_exception_store_type_unregister); | 139 | EXPORT_SYMBOL(dm_exception_store_type_unregister); |
140 | 140 | ||
141 | /* | ||
142 | * Round a number up to the nearest 'size' boundary. size must | ||
143 | * be a power of 2. | ||
144 | */ | ||
145 | static ulong round_up(ulong n, ulong size) | ||
146 | { | ||
147 | size--; | ||
148 | return (n + size) & ~size; | ||
149 | } | ||
150 | |||
151 | static int set_chunk_size(struct dm_exception_store *store, | 141 | static int set_chunk_size(struct dm_exception_store *store, |
152 | const char *chunk_size_arg, char **error) | 142 | const char *chunk_size_arg, char **error) |
153 | { | 143 | { |
@@ -155,7 +145,8 @@ static int set_chunk_size(struct dm_exception_store *store, | |||
155 | char *value; | 145 | char *value; |
156 | 146 | ||
157 | chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); | 147 | chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); |
158 | if (*chunk_size_arg == '\0' || *value != '\0') { | 148 | if (*chunk_size_arg == '\0' || *value != '\0' || |
149 | chunk_size_ulong > UINT_MAX) { | ||
159 | *error = "Invalid chunk size"; | 150 | *error = "Invalid chunk size"; |
160 | return -EINVAL; | 151 | return -EINVAL; |
161 | } | 152 | } |
@@ -165,45 +156,42 @@ static int set_chunk_size(struct dm_exception_store *store, | |||
165 | return 0; | 156 | return 0; |
166 | } | 157 | } |
167 | 158 | ||
168 | /* | 159 | return dm_exception_store_set_chunk_size(store, |
169 | * Chunk size must be multiple of page size. Silently | 160 | (unsigned) chunk_size_ulong, |
170 | * round up if it's not. | ||
171 | */ | ||
172 | chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); | ||
173 | |||
174 | return dm_exception_store_set_chunk_size(store, chunk_size_ulong, | ||
175 | error); | 161 | error); |
176 | } | 162 | } |
177 | 163 | ||
178 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | 164 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, |
179 | unsigned long chunk_size_ulong, | 165 | unsigned chunk_size, |
180 | char **error) | 166 | char **error) |
181 | { | 167 | { |
182 | /* Check chunk_size is a power of 2 */ | 168 | /* Check chunk_size is a power of 2 */ |
183 | if (!is_power_of_2(chunk_size_ulong)) { | 169 | if (!is_power_of_2(chunk_size)) { |
184 | *error = "Chunk size is not a power of 2"; | 170 | *error = "Chunk size is not a power of 2"; |
185 | return -EINVAL; | 171 | return -EINVAL; |
186 | } | 172 | } |
187 | 173 | ||
188 | /* Validate the chunk size against the device block size */ | 174 | /* Validate the chunk size against the device block size */ |
189 | if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { | 175 | if (chunk_size % |
176 | (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { | ||
190 | *error = "Chunk size is not a multiple of device blocksize"; | 177 | *error = "Chunk size is not a multiple of device blocksize"; |
191 | return -EINVAL; | 178 | return -EINVAL; |
192 | } | 179 | } |
193 | 180 | ||
194 | if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) { | 181 | if (chunk_size > INT_MAX >> SECTOR_SHIFT) { |
195 | *error = "Chunk size is too high"; | 182 | *error = "Chunk size is too high"; |
196 | return -EINVAL; | 183 | return -EINVAL; |
197 | } | 184 | } |
198 | 185 | ||
199 | store->chunk_size = chunk_size_ulong; | 186 | store->chunk_size = chunk_size; |
200 | store->chunk_mask = chunk_size_ulong - 1; | 187 | store->chunk_mask = chunk_size - 1; |
201 | store->chunk_shift = ffs(chunk_size_ulong) - 1; | 188 | store->chunk_shift = ffs(chunk_size) - 1; |
202 | 189 | ||
203 | return 0; | 190 | return 0; |
204 | } | 191 | } |
205 | 192 | ||
206 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 193 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
194 | struct dm_snapshot *snap, | ||
207 | unsigned *args_used, | 195 | unsigned *args_used, |
208 | struct dm_exception_store **store) | 196 | struct dm_exception_store **store) |
209 | { | 197 | { |
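Editor's note: chunk-size handling above now rejects values larger than UINT_MAX at parse time, stores the size as a plain unsigned, and no longer silently rounds up to a page multiple; the mask and shift are derived from the validated value. A plain-C sketch of the checks (constants are examples only):

```c
/* Hedged sketch of chunk-size validation: power of two, multiple of
 * the device's logical block size in 512-byte sectors, and small
 * enough that chunk_size << 9 still fits in an int. */
#include <limits.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

struct store_sketch {
	unsigned chunk_size;	/* in 512-byte sectors */
	unsigned chunk_mask;
	unsigned chunk_shift;
};

static int set_chunk_size_sketch(struct store_sketch *s, unsigned chunk_size,
				 unsigned logical_block_sectors)
{
	if (chunk_size == 0 || (chunk_size & (chunk_size - 1)))
		return -1;	/* not a power of 2 */

	if (chunk_size % logical_block_sectors)
		return -1;	/* not a multiple of the device block size */

	if (chunk_size > INT_MAX >> SECTOR_SHIFT)
		return -1;	/* too large to express in bytes as an int */

	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	s->chunk_shift = __builtin_ffs(chunk_size) - 1;
	return 0;
}

int main(void)
{
	struct store_sketch s;

	/* 16-sector (8 KiB) chunks on a 512-byte-sector device. */
	if (!set_chunk_size_sketch(&s, 16, 1))
		printf("size=%u mask=%#x shift=%u\n",
		       s.chunk_size, s.chunk_mask, s.chunk_shift);
	return 0;
}
```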
@@ -212,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
212 | struct dm_exception_store *tmp_store; | 200 | struct dm_exception_store *tmp_store; |
213 | char persistent; | 201 | char persistent; |
214 | 202 | ||
215 | if (argc < 3) { | 203 | if (argc < 2) { |
216 | ti->error = "Insufficient exception store arguments"; | 204 | ti->error = "Insufficient exception store arguments"; |
217 | return -EINVAL; | 205 | return -EINVAL; |
218 | } | 206 | } |
@@ -223,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
223 | return -ENOMEM; | 211 | return -ENOMEM; |
224 | } | 212 | } |
225 | 213 | ||
226 | persistent = toupper(*argv[1]); | 214 | persistent = toupper(*argv[0]); |
227 | if (persistent == 'P') | 215 | if (persistent == 'P') |
228 | type = get_type("P"); | 216 | type = get_type("P"); |
229 | else if (persistent == 'N') | 217 | else if (persistent == 'N') |
230 | type = get_type("N"); | 218 | type = get_type("N"); |
231 | else { | 219 | else { |
232 | ti->error = "Persistent flag is not P or N"; | 220 | ti->error = "Persistent flag is not P or N"; |
233 | return -EINVAL; | 221 | r = -EINVAL; |
222 | goto bad_type; | ||
234 | } | 223 | } |
235 | 224 | ||
236 | if (!type) { | 225 | if (!type) { |
@@ -240,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
240 | } | 229 | } |
241 | 230 | ||
242 | tmp_store->type = type; | 231 | tmp_store->type = type; |
243 | tmp_store->ti = ti; | 232 | tmp_store->snap = snap; |
244 | |||
245 | r = dm_get_device(ti, argv[0], 0, 0, | ||
246 | FMODE_READ | FMODE_WRITE, &tmp_store->cow); | ||
247 | if (r) { | ||
248 | ti->error = "Cannot get COW device"; | ||
249 | goto bad_cow; | ||
250 | } | ||
251 | 233 | ||
252 | r = set_chunk_size(tmp_store, argv[2], &ti->error); | 234 | r = set_chunk_size(tmp_store, argv[1], &ti->error); |
253 | if (r) | 235 | if (r) |
254 | goto bad_cow; | 236 | goto bad; |
255 | 237 | ||
256 | r = type->ctr(tmp_store, 0, NULL); | 238 | r = type->ctr(tmp_store, 0, NULL); |
257 | if (r) { | 239 | if (r) { |
258 | ti->error = "Exception store type constructor failed"; | 240 | ti->error = "Exception store type constructor failed"; |
259 | goto bad_ctr; | 241 | goto bad; |
260 | } | 242 | } |
261 | 243 | ||
262 | *args_used = 3; | 244 | *args_used = 2; |
263 | *store = tmp_store; | 245 | *store = tmp_store; |
264 | return 0; | 246 | return 0; |
265 | 247 | ||
266 | bad_ctr: | 248 | bad: |
267 | dm_put_device(ti, tmp_store->cow); | ||
268 | bad_cow: | ||
269 | put_type(type); | 249 | put_type(type); |
270 | bad_type: | 250 | bad_type: |
271 | kfree(tmp_store); | 251 | kfree(tmp_store); |
@@ -276,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create); | |||
276 | void dm_exception_store_destroy(struct dm_exception_store *store) | 256 | void dm_exception_store_destroy(struct dm_exception_store *store) |
277 | { | 257 | { |
278 | store->type->dtr(store); | 258 | store->type->dtr(store); |
279 | dm_put_device(store->ti, store->cow); | ||
280 | put_type(store->type); | 259 | put_type(store->type); |
281 | kfree(store); | 260 | kfree(store); |
282 | } | 261 | } |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 812c71872ba0..e8dfa06af3ba 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
@@ -26,7 +26,7 @@ typedef sector_t chunk_t; | |||
26 | * of chunks that follow contiguously. Remaining bits hold the number of the | 26 | * of chunks that follow contiguously. Remaining bits hold the number of the |
27 | * chunk within the device. | 27 | * chunk within the device. |
28 | */ | 28 | */ |
29 | struct dm_snap_exception { | 29 | struct dm_exception { |
30 | struct list_head hash_list; | 30 | struct list_head hash_list; |
31 | 31 | ||
32 | chunk_t old_chunk; | 32 | chunk_t old_chunk; |
@@ -64,17 +64,34 @@ struct dm_exception_store_type { | |||
64 | * Find somewhere to store the next exception. | 64 | * Find somewhere to store the next exception. |
65 | */ | 65 | */ |
66 | int (*prepare_exception) (struct dm_exception_store *store, | 66 | int (*prepare_exception) (struct dm_exception_store *store, |
67 | struct dm_snap_exception *e); | 67 | struct dm_exception *e); |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Update the metadata with this exception. | 70 | * Update the metadata with this exception. |
71 | */ | 71 | */ |
72 | void (*commit_exception) (struct dm_exception_store *store, | 72 | void (*commit_exception) (struct dm_exception_store *store, |
73 | struct dm_snap_exception *e, | 73 | struct dm_exception *e, |
74 | void (*callback) (void *, int success), | 74 | void (*callback) (void *, int success), |
75 | void *callback_context); | 75 | void *callback_context); |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Returns 0 if the exception store is empty. | ||
79 | * | ||
80 | * If there are exceptions still to be merged, sets | ||
81 | * *last_old_chunk and *last_new_chunk to the most recent | ||
82 | * still-to-be-merged chunk and returns the number of | ||
83 | * consecutive previous ones. | ||
84 | */ | ||
85 | int (*prepare_merge) (struct dm_exception_store *store, | ||
86 | chunk_t *last_old_chunk, chunk_t *last_new_chunk); | ||
87 | |||
88 | /* | ||
89 | * Clear the last n exceptions. | ||
90 | * nr_merged must be <= the value returned by prepare_merge. | ||
91 | */ | ||
92 | int (*commit_merge) (struct dm_exception_store *store, int nr_merged); | ||
93 | |||
94 | /* | ||
78 | * The snapshot is invalid, note this in the metadata. | 95 | * The snapshot is invalid, note this in the metadata. |
79 | */ | 96 | */ |
80 | void (*drop_snapshot) (struct dm_exception_store *store); | 97 | void (*drop_snapshot) (struct dm_exception_store *store); |
@@ -86,29 +103,34 @@ struct dm_exception_store_type { | |||
86 | /* | 103 | /* |
87 | * Return how full the snapshot is. | 104 | * Return how full the snapshot is. |
88 | */ | 105 | */ |
89 | void (*fraction_full) (struct dm_exception_store *store, | 106 | void (*usage) (struct dm_exception_store *store, |
90 | sector_t *numerator, | 107 | sector_t *total_sectors, sector_t *sectors_allocated, |
91 | sector_t *denominator); | 108 | sector_t *metadata_sectors); |
92 | 109 | ||
93 | /* For internal device-mapper use only. */ | 110 | /* For internal device-mapper use only. */ |
94 | struct list_head list; | 111 | struct list_head list; |
95 | }; | 112 | }; |
96 | 113 | ||
114 | struct dm_snapshot; | ||
115 | |||
97 | struct dm_exception_store { | 116 | struct dm_exception_store { |
98 | struct dm_exception_store_type *type; | 117 | struct dm_exception_store_type *type; |
99 | struct dm_target *ti; | 118 | struct dm_snapshot *snap; |
100 | |||
101 | struct dm_dev *cow; | ||
102 | 119 | ||
103 | /* Size of data blocks saved - must be a power of 2 */ | 120 | /* Size of data blocks saved - must be a power of 2 */ |
104 | chunk_t chunk_size; | 121 | unsigned chunk_size; |
105 | chunk_t chunk_mask; | 122 | unsigned chunk_mask; |
106 | chunk_t chunk_shift; | 123 | unsigned chunk_shift; |
107 | 124 | ||
108 | void *context; | 125 | void *context; |
109 | }; | 126 | }; |
110 | 127 | ||
111 | /* | 128 | /* |
129 | * Obtain the cow device used by a given snapshot. | ||
130 | */ | ||
131 | struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); | ||
132 | |||
133 | /* | ||
112 | * Functions to manipulate consecutive chunks | 134 |
113 | */ | 135 | */ |
114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) | 136 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) | |||
120 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); | 142 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); |
121 | } | 143 | } |
122 | 144 | ||
123 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 145 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
124 | { | 146 | { |
125 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; | 147 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; |
126 | } | 148 | } |
127 | 149 | ||
128 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 150 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
129 | { | 151 | { |
130 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); | 152 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); |
131 | 153 | ||
132 | BUG_ON(!dm_consecutive_chunk_count(e)); | 154 | BUG_ON(!dm_consecutive_chunk_count(e)); |
133 | } | 155 | } |
134 | 156 | ||
157 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
158 | { | ||
159 | BUG_ON(!dm_consecutive_chunk_count(e)); | ||
160 | |||
161 | e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); | ||
162 | } | ||
163 | |||
135 | # else | 164 | # else |
136 | # define DM_CHUNK_CONSECUTIVE_BITS 0 | 165 | # define DM_CHUNK_CONSECUTIVE_BITS 0 |
137 | 166 | ||
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) | |||
140 | return chunk; | 169 | return chunk; |
141 | } | 170 | } |
142 | 171 | ||
143 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 172 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
144 | { | 173 | { |
145 | return 0; | 174 | return 0; |
146 | } | 175 | } |
147 | 176 | ||
148 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 177 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
149 | { | 182 | { |
150 | } | 183 | } |
151 | 184 | ||
@@ -162,17 +195,18 @@ static inline sector_t get_dev_size(struct block_device *bdev) | |||
162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 195 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
163 | sector_t sector) | 196 | sector_t sector) |
164 | { | 197 | { |
165 | return (sector & ~store->chunk_mask) >> store->chunk_shift; | 198 | return sector >> store->chunk_shift; |
166 | } | 199 | } |
167 | 200 | ||
168 | int dm_exception_store_type_register(struct dm_exception_store_type *type); | 201 | int dm_exception_store_type_register(struct dm_exception_store_type *type); |
169 | int dm_exception_store_type_unregister(struct dm_exception_store_type *type); | 202 | int dm_exception_store_type_unregister(struct dm_exception_store_type *type); |
170 | 203 | ||
171 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | 204 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, |
172 | unsigned long chunk_size_ulong, | 205 | unsigned chunk_size, |
173 | char **error); | 206 | char **error); |
174 | 207 | ||
175 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 208 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
209 | struct dm_snapshot *snap, | ||
176 | unsigned *args_used, | 210 | unsigned *args_used, |
177 | struct dm_exception_store **store); | 211 | struct dm_exception_store **store); |
178 | void dm_exception_store_destroy(struct dm_exception_store *store); | 212 | void dm_exception_store_destroy(struct dm_exception_store *store); |
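Editor's note: the sector_to_chunk() change above drops the explicit mask because (sector & ~chunk_mask) >> chunk_shift and sector >> chunk_shift are identical whenever chunk_mask == (1 << chunk_shift) - 1; the bits cleared by the mask are exactly the bits the shift discards. A quick check:

```c
/* Hedged check that masking before the shift is redundant when
 * chunk_mask == (1 << chunk_shift) - 1. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned chunk_shift = 4;			/* 16-sector chunks */
	uint64_t chunk_mask = (1ULL << chunk_shift) - 1;

	for (uint64_t sector = 0; sector < 100000; sector++)
		assert(((sector & ~chunk_mask) >> chunk_shift) ==
		       (sector >> chunk_shift));
	return 0;
}
```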
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3a2e6a2f8bdd..10f457ca6af2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -5,6 +5,8 @@ | |||
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "dm.h" | ||
9 | |||
8 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
9 | 11 | ||
10 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
@@ -14,12 +16,19 @@ | |||
14 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
15 | #include <linux/dm-io.h> | 17 | #include <linux/dm-io.h> |
16 | 18 | ||
19 | #define DM_MSG_PREFIX "io" | ||
20 | |||
21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG | ||
22 | |||
17 | struct dm_io_client { | 23 | struct dm_io_client { |
18 | mempool_t *pool; | 24 | mempool_t *pool; |
19 | struct bio_set *bios; | 25 | struct bio_set *bios; |
20 | }; | 26 | }; |
21 | 27 | ||
22 | /* FIXME: can we shrink this ? */ | 28 | /* |
29 | * Aligning 'struct io' reduces the number of bits required to store | ||
30 | * its address. Refer to store_io_and_region_in_bio() below. | ||
31 | */ | ||
23 | struct io { | 32 | struct io { |
24 | unsigned long error_bits; | 33 | unsigned long error_bits; |
25 | unsigned long eopnotsupp_bits; | 34 | unsigned long eopnotsupp_bits; |
@@ -28,7 +37,9 @@ struct io { | |||
28 | struct dm_io_client *client; | 37 | struct dm_io_client *client; |
29 | io_notify_fn callback; | 38 | io_notify_fn callback; |
30 | void *context; | 39 | void *context; |
31 | }; | 40 | } __attribute__((aligned(DM_IO_MAX_REGIONS))); |
41 | |||
42 | static struct kmem_cache *_dm_io_cache; | ||
32 | 43 | ||
33 | /* | 44 | /* |
34 | * io contexts are only dynamically allocated for asynchronous | 45 | * io contexts are only dynamically allocated for asynchronous |
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) | |||
53 | if (!client) | 64 | if (!client) |
54 | return ERR_PTR(-ENOMEM); | 65 | return ERR_PTR(-ENOMEM); |
55 | 66 | ||
56 | client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); | 67 | client->pool = mempool_create_slab_pool(ios, _dm_io_cache); |
57 | if (!client->pool) | 68 | if (!client->pool) |
58 | goto bad; | 69 | goto bad; |
59 | 70 | ||
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy); | |||
88 | 99 | ||
89 | /*----------------------------------------------------------------- | 100 | /*----------------------------------------------------------------- |
90 | * We need to keep track of which region a bio is doing io for. | 101 | * We need to keep track of which region a bio is doing io for. |
91 | * In order to save a memory allocation we store this the last | 102 | * To avoid a memory allocation to store just 5 or 6 bits, we |
92 | * bvec which we know is unused (blech). | 103 | * ensure the 'struct io' pointer is aligned so enough low bits are |
93 | * XXX This is ugly and can OOPS with some configs... find another way. | 104 | * always zero and then combine it with the region number directly in |
105 | * bi_private. | ||
94 | *---------------------------------------------------------------*/ | 106 | *---------------------------------------------------------------*/ |
95 | static inline void bio_set_region(struct bio *bio, unsigned region) | 107 | static void store_io_and_region_in_bio(struct bio *bio, struct io *io, |
108 | unsigned region) | ||
96 | { | 109 | { |
97 | bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; | 110 | if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { |
111 | DMCRIT("Unaligned struct io pointer %p", io); | ||
112 | BUG(); | ||
113 | } | ||
114 | |||
115 | bio->bi_private = (void *)((unsigned long)io | region); | ||
98 | } | 116 | } |
99 | 117 | ||
100 | static inline unsigned bio_get_region(struct bio *bio) | 118 | static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, |
119 | unsigned *region) | ||
101 | { | 120 | { |
102 | return bio->bi_io_vec[bio->bi_max_vecs].bv_len; | 121 | unsigned long val = (unsigned long)bio->bi_private; |
122 | |||
123 | *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); | ||
124 | *region = val & (DM_IO_MAX_REGIONS - 1); | ||
103 | } | 125 | } |
104 | 126 | ||
105 | /*----------------------------------------------------------------- | 127 | /*----------------------------------------------------------------- |
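Editor's note: rather than stashing the region number in an unused bvec, dm-io now aligns struct io to DM_IO_MAX_REGIONS (BITS_PER_LONG) so the low bits of its address are guaranteed zero, then ORs the region number into bi_private next to the pointer. The same pointer-tagging trick in plain C:

```c
/* Hedged sketch of pointer tagging: align the object to MAX_REGIONS so
 * the low log2(MAX_REGIONS) address bits are free, then pack
 * pointer|region into one pointer-sized field. */
#include <stdint.h>
#include <stdio.h>
#include <stdalign.h>

#define MAX_REGIONS 64	/* stands in for BITS_PER_LONG */

struct io_sketch {
	alignas(MAX_REGIONS) unsigned long error_bits;
	int count;
};

static void *pack(struct io_sketch *io, unsigned region)
{
	/* Alignment guarantees the low 6 address bits are zero. */
	return (void *)((uintptr_t)io | region);
}

static void unpack(void *priv, struct io_sketch **io, unsigned *region)
{
	uintptr_t val = (uintptr_t)priv;

	*io = (struct io_sketch *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
	*region = (unsigned)(val & (MAX_REGIONS - 1));
}

int main(void)
{
	static struct io_sketch io;	/* honours the declared alignment */
	struct io_sketch *got;
	unsigned region;

	void *priv = pack(&io, 5);	/* would live in bio->bi_private */
	unpack(priv, &got, &region);

	printf("io ok: %d, region: %u\n", got == &io, region);
	return 0;
}
```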
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error) | |||
140 | /* | 162 | /* |
141 | * The bio destructor in bio_put() may use the io object. | 163 | * The bio destructor in bio_put() may use the io object. |
142 | */ | 164 | */ |
143 | io = bio->bi_private; | 165 | retrieve_io_and_region_from_bio(bio, &io, ®ion); |
144 | region = bio_get_region(bio); | ||
145 | 166 | ||
146 | bio->bi_max_vecs++; | ||
147 | bio_put(bio); | 167 | bio_put(bio); |
148 | 168 | ||
149 | dec_count(io, region, error); | 169 | dec_count(io, region, error); |
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data) | |||
243 | 263 | ||
244 | static void dm_bio_destructor(struct bio *bio) | 264 | static void dm_bio_destructor(struct bio *bio) |
245 | { | 265 | { |
246 | struct io *io = bio->bi_private; | 266 | unsigned region; |
267 | struct io *io; | ||
268 | |||
269 | retrieve_io_and_region_from_bio(bio, &io, ®ion); | ||
247 | 270 | ||
248 | bio_free(bio, io->client->bios); | 271 | bio_free(bio, io->client->bios); |
249 | } | 272 | } |
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
286 | unsigned num_bvecs; | 309 | unsigned num_bvecs; |
287 | sector_t remaining = where->count; | 310 | sector_t remaining = where->count; |
288 | 311 | ||
289 | while (remaining) { | 312 | /* |
313 | * where->count may be zero if rw holds a write barrier and we | ||
314 | * need to send a zero-sized barrier. | ||
315 | */ | ||
316 | do { | ||
290 | /* | 317 | /* |
291 | * Allocate a suitably sized-bio: we add an extra | 318 | * Allocate a suitably sized-bio. |
292 | * bvec for bio_get/set_region() and decrement bi_max_vecs | ||
293 | * to hide it from bio_add_page(). | ||
294 | */ | 319 | */ |
295 | num_bvecs = dm_sector_div_up(remaining, | 320 | num_bvecs = dm_sector_div_up(remaining, |
296 | (PAGE_SIZE >> SECTOR_SHIFT)); | 321 | (PAGE_SIZE >> SECTOR_SHIFT)); |
297 | num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), | 322 | num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); |
298 | num_bvecs); | ||
299 | if (unlikely(num_bvecs > BIO_MAX_PAGES)) | ||
300 | num_bvecs = BIO_MAX_PAGES; | ||
301 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); | 323 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); |
302 | bio->bi_sector = where->sector + (where->count - remaining); | 324 | bio->bi_sector = where->sector + (where->count - remaining); |
303 | bio->bi_bdev = where->bdev; | 325 | bio->bi_bdev = where->bdev; |
304 | bio->bi_end_io = endio; | 326 | bio->bi_end_io = endio; |
305 | bio->bi_private = io; | ||
306 | bio->bi_destructor = dm_bio_destructor; | 327 | bio->bi_destructor = dm_bio_destructor; |
307 | bio->bi_max_vecs--; | 328 | store_io_and_region_in_bio(bio, io, region); |
308 | bio_set_region(bio, region); | ||
309 | 329 | ||
310 | /* | 330 | /* |
311 | * Try and add as many pages as possible. | 331 | * Try and add as many pages as possible. |
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
323 | 343 | ||
324 | atomic_inc(&io->count); | 344 | atomic_inc(&io->count); |
325 | submit_bio(rw, bio); | 345 | submit_bio(rw, bio); |
326 | } | 346 | } while (remaining); |
327 | } | 347 | } |
328 | 348 | ||
329 | static void dispatch_io(int rw, unsigned int num_regions, | 349 | static void dispatch_io(int rw, unsigned int num_regions, |
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
333 | int i; | 353 | int i; |
334 | struct dpages old_pages = *dp; | 354 | struct dpages old_pages = *dp; |
335 | 355 | ||
356 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); | ||
357 | |||
336 | if (sync) | 358 | if (sync) |
337 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); | 359 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); |
338 | 360 | ||
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
342 | */ | 364 | */ |
343 | for (i = 0; i < num_regions; i++) { | 365 | for (i = 0; i < num_regions; i++) { |
344 | *dp = old_pages; | 366 | *dp = old_pages; |
345 | if (where[i].count) | 367 | if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) |
346 | do_region(rw, i, where + i, dp, io); | 368 | do_region(rw, i, where + i, dp, io); |
347 | } | 369 | } |
348 | 370 | ||
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
357 | struct dm_io_region *where, int rw, struct dpages *dp, | 379 | struct dm_io_region *where, int rw, struct dpages *dp, |
358 | unsigned long *error_bits) | 380 | unsigned long *error_bits) |
359 | { | 381 | { |
360 | struct io io; | 382 | /* |
383 | * gcc <= 4.3 can't do the alignment for stack variables, so we must | ||
384 | * align it on our own. | ||
385 | * volatile prevents the optimizer from removing or reusing | ||
386 | * "io_" field from the stack frame (allowed in ANSI C). | ||
387 | */ | ||
388 | volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; | ||
389 | struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); | ||
361 | 390 | ||
362 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { | 391 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { |
363 | WARN_ON(1); | 392 | WARN_ON(1); |
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
365 | } | 394 | } |
366 | 395 | ||
367 | retry: | 396 | retry: |
368 | io.error_bits = 0; | 397 | io->error_bits = 0; |
369 | io.eopnotsupp_bits = 0; | 398 | io->eopnotsupp_bits = 0; |
370 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 399 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
371 | io.sleeper = current; | 400 | io->sleeper = current; |
372 | io.client = client; | 401 | io->client = client; |
373 | 402 | ||
374 | dispatch_io(rw, num_regions, where, dp, &io, 1); | 403 | dispatch_io(rw, num_regions, where, dp, io, 1); |
375 | 404 | ||
376 | while (1) { | 405 | while (1) { |
377 | set_current_state(TASK_UNINTERRUPTIBLE); | 406 | set_current_state(TASK_UNINTERRUPTIBLE); |
378 | 407 | ||
379 | if (!atomic_read(&io.count)) | 408 | if (!atomic_read(&io->count)) |
380 | break; | 409 | break; |
381 | 410 | ||
382 | io_schedule(); | 411 | io_schedule(); |
383 | } | 412 | } |
384 | set_current_state(TASK_RUNNING); | 413 | set_current_state(TASK_RUNNING); |
385 | 414 | ||
386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | 415 | if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { |
387 | rw &= ~(1 << BIO_RW_BARRIER); | 416 | rw &= ~(1 << BIO_RW_BARRIER); |
388 | goto retry; | 417 | goto retry; |
389 | } | 418 | } |
390 | 419 | ||
391 | if (error_bits) | 420 | if (error_bits) |
392 | *error_bits = io.error_bits; | 421 | *error_bits = io->error_bits; |
393 | 422 | ||
394 | return io.error_bits ? -EIO : 0; | 423 | return io->error_bits ? -EIO : 0; |
395 | } | 424 | } |
396 | 425 | ||
397 | static int async_io(struct dm_io_client *client, unsigned int num_regions, | 426 | static int async_io(struct dm_io_client *client, unsigned int num_regions, |
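Editor's note: sync_io() cannot rely on older compilers honouring the struct's alignment for a stack variable (the in-tree comment cites gcc <= 4.3), so it over-allocates a char array and rounds the pointer up by hand, with volatile keeping the array from being optimised away. The same PTR_ALIGN idea in portable C:

```c
/* Hedged sketch of manual stack alignment: reserve size + align - 1
 * bytes and round the pointer up, mirroring PTR_ALIGN(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IO_ALIGN 64

struct io_sketch {
	unsigned long error_bits;
	int count;
};

static void *ptr_align(void *p, uintptr_t align)
{
	return (void *)(((uintptr_t)p + align - 1) & ~(align - 1));
}

int main(void)
{
	/* Room for one struct plus worst-case misalignment. */
	char raw[sizeof(struct io_sketch) + IO_ALIGN - 1];
	struct io_sketch *io = ptr_align(raw, IO_ALIGN);

	memset(io, 0, sizeof(*io));
	io->count = 1;

	printf("aligned: %d\n", ((uintptr_t)io % IO_ALIGN) == 0);
	return 0;
}
```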
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, | |||
472 | &dp, io_req->notify.fn, io_req->notify.context); | 501 | &dp, io_req->notify.fn, io_req->notify.context); |
473 | } | 502 | } |
474 | EXPORT_SYMBOL(dm_io); | 503 | EXPORT_SYMBOL(dm_io); |
504 | |||
505 | int __init dm_io_init(void) | ||
506 | { | ||
507 | _dm_io_cache = KMEM_CACHE(io, 0); | ||
508 | if (!_dm_io_cache) | ||
509 | return -ENOMEM; | ||
510 | |||
511 | return 0; | ||
512 | } | ||
513 | |||
514 | void dm_io_exit(void) | ||
515 | { | ||
516 | kmem_cache_destroy(_dm_io_cache); | ||
517 | _dm_io_cache = NULL; | ||
518 | } | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a67942931582..d7500e1c26f2 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices); | |||
56 | */ | 56 | */ |
57 | static DECLARE_RWSEM(_hash_lock); | 57 | static DECLARE_RWSEM(_hash_lock); |
58 | 58 | ||
59 | /* | ||
60 | * Protects use of mdptr to obtain hash cell name and uuid from mapped device. | ||
61 | */ | ||
62 | static DEFINE_MUTEX(dm_hash_cells_mutex); | ||
63 | |||
59 | static void init_buckets(struct list_head *buckets) | 64 | static void init_buckets(struct list_head *buckets) |
60 | { | 65 | { |
61 | unsigned int i; | 66 | unsigned int i; |
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi | |||
206 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); | 211 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); |
207 | } | 212 | } |
208 | dm_get(md); | 213 | dm_get(md); |
214 | mutex_lock(&dm_hash_cells_mutex); | ||
209 | dm_set_mdptr(md, cell); | 215 | dm_set_mdptr(md, cell); |
216 | mutex_unlock(&dm_hash_cells_mutex); | ||
210 | up_write(&_hash_lock); | 217 | up_write(&_hash_lock); |
211 | 218 | ||
212 | return 0; | 219 | return 0; |
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc) | |||
224 | /* remove from the dev hash */ | 231 | /* remove from the dev hash */ |
225 | list_del(&hc->uuid_list); | 232 | list_del(&hc->uuid_list); |
226 | list_del(&hc->name_list); | 233 | list_del(&hc->name_list); |
234 | mutex_lock(&dm_hash_cells_mutex); | ||
227 | dm_set_mdptr(hc->md, NULL); | 235 | dm_set_mdptr(hc->md, NULL); |
236 | mutex_unlock(&dm_hash_cells_mutex); | ||
228 | 237 | ||
229 | table = dm_get_table(hc->md); | 238 | table = dm_get_live_table(hc->md); |
230 | if (table) { | 239 | if (table) { |
231 | dm_table_event(table); | 240 | dm_table_event(table); |
232 | dm_table_put(table); | 241 | dm_table_put(table); |
@@ -276,7 +285,8 @@ retry: | |||
276 | up_write(&_hash_lock); | 285 | up_write(&_hash_lock); |
277 | } | 286 | } |
278 | 287 | ||
279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) | 288 | static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, |
289 | const char *new) | ||
280 | { | 290 | { |
281 | char *new_name, *old_name; | 291 | char *new_name, *old_name; |
282 | struct hash_cell *hc; | 292 | struct hash_cell *hc; |
@@ -321,19 +331,22 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) | |||
321 | */ | 331 | */ |
322 | list_del(&hc->name_list); | 332 | list_del(&hc->name_list); |
323 | old_name = hc->name; | 333 | old_name = hc->name; |
334 | mutex_lock(&dm_hash_cells_mutex); | ||
324 | hc->name = new_name; | 335 | hc->name = new_name; |
336 | mutex_unlock(&dm_hash_cells_mutex); | ||
325 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | 337 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); |
326 | 338 | ||
327 | /* | 339 | /* |
328 | * Wake up any dm event waiters. | 340 | * Wake up any dm event waiters. |
329 | */ | 341 | */ |
330 | table = dm_get_table(hc->md); | 342 | table = dm_get_live_table(hc->md); |
331 | if (table) { | 343 | if (table) { |
332 | dm_table_event(table); | 344 | dm_table_event(table); |
333 | dm_table_put(table); | 345 | dm_table_put(table); |
334 | } | 346 | } |
335 | 347 | ||
336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); | 348 | if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie)) |
349 | *flags |= DM_UEVENT_GENERATED_FLAG; | ||
337 | 350 | ||
338 | dm_put(hc->md); | 351 | dm_put(hc->md); |
339 | up_write(&_hash_lock); | 352 | up_write(&_hash_lock); |
@@ -512,8 +525,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size) | |||
512 | return 0; | 525 | return 0; |
513 | } | 526 | } |
514 | 527 | ||
515 | |||
516 | |||
517 | static int check_name(const char *name) | 528 | static int check_name(const char *name) |
518 | { | 529 | { |
519 | if (strchr(name, '/')) { | 530 | if (strchr(name, '/')) { |
@@ -525,6 +536,40 @@ static int check_name(const char *name) | |||
525 | } | 536 | } |
526 | 537 | ||
527 | /* | 538 | /* |
539 | * On successful return, the caller must not attempt to acquire | ||
540 | * _hash_lock without first calling dm_table_put, because dm_table_destroy | ||
541 | * waits for this dm_table_put and could be called under this lock. | ||
542 | */ | ||
543 | static struct dm_table *dm_get_inactive_table(struct mapped_device *md) | ||
544 | { | ||
545 | struct hash_cell *hc; | ||
546 | struct dm_table *table = NULL; | ||
547 | |||
548 | down_read(&_hash_lock); | ||
549 | hc = dm_get_mdptr(md); | ||
550 | if (!hc || hc->md != md) { | ||
551 | DMWARN("device has been removed from the dev hash table."); | ||
552 | goto out; | ||
553 | } | ||
554 | |||
555 | table = hc->new_map; | ||
556 | if (table) | ||
557 | dm_table_get(table); | ||
558 | |||
559 | out: | ||
560 | up_read(&_hash_lock); | ||
561 | |||
562 | return table; | ||
563 | } | ||
564 | |||
565 | static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, | ||
566 | struct dm_ioctl *param) | ||
567 | { | ||
568 | return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? | ||
569 | dm_get_inactive_table(md) : dm_get_live_table(md); | ||
570 | } | ||
571 | |||
572 | /* | ||
528 | * Fills in a dm_ioctl structure, ready for sending back to | 573 | * Fills in a dm_ioctl structure, ready for sending back to |
529 | * userland. | 574 | * userland. |
530 | */ | 575 | */ |
@@ -536,7 +581,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | |||
536 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | | 581 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | |
537 | DM_ACTIVE_PRESENT_FLAG); | 582 | DM_ACTIVE_PRESENT_FLAG); |
538 | 583 | ||
539 | if (dm_suspended(md)) | 584 | if (dm_suspended_md(md)) |
540 | param->flags |= DM_SUSPEND_FLAG; | 585 | param->flags |= DM_SUSPEND_FLAG; |
541 | 586 | ||
542 | param->dev = huge_encode_dev(disk_devt(disk)); | 587 | param->dev = huge_encode_dev(disk_devt(disk)); |
@@ -548,18 +593,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | |||
548 | */ | 593 | */ |
549 | param->open_count = dm_open_count(md); | 594 | param->open_count = dm_open_count(md); |
550 | 595 | ||
551 | if (get_disk_ro(disk)) | ||
552 | param->flags |= DM_READONLY_FLAG; | ||
553 | |||
554 | param->event_nr = dm_get_event_nr(md); | 596 | param->event_nr = dm_get_event_nr(md); |
597 | param->target_count = 0; | ||
555 | 598 | ||
556 | table = dm_get_table(md); | 599 | table = dm_get_live_table(md); |
557 | if (table) { | 600 | if (table) { |
558 | param->flags |= DM_ACTIVE_PRESENT_FLAG; | 601 | if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { |
559 | param->target_count = dm_table_get_num_targets(table); | 602 | if (get_disk_ro(disk)) |
603 | param->flags |= DM_READONLY_FLAG; | ||
604 | param->target_count = dm_table_get_num_targets(table); | ||
605 | } | ||
560 | dm_table_put(table); | 606 | dm_table_put(table); |
561 | } else | 607 | |
562 | param->target_count = 0; | 608 | param->flags |= DM_ACTIVE_PRESENT_FLAG; |
609 | } | ||
610 | |||
611 | if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { | ||
612 | table = dm_get_inactive_table(md); | ||
613 | if (table) { | ||
614 | if (!(dm_table_get_mode(table) & FMODE_WRITE)) | ||
615 | param->flags |= DM_READONLY_FLAG; | ||
616 | param->target_count = dm_table_get_num_targets(table); | ||
617 | dm_table_put(table); | ||
618 | } | ||
619 | } | ||
563 | 620 | ||
564 | return 0; | 621 | return 0; |
565 | } | 622 | } |
@@ -634,9 +691,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param) | |||
634 | * Sneakily write in both the name and the uuid | 691 | * Sneakily write in both the name and the uuid |
635 | * while we have the cell. | 692 | * while we have the cell. |
636 | */ | 693 | */ |
637 | strncpy(param->name, hc->name, sizeof(param->name)); | 694 | strlcpy(param->name, hc->name, sizeof(param->name)); |
638 | if (hc->uuid) | 695 | if (hc->uuid) |
639 | strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); | 696 | strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); |
640 | else | 697 | else |
641 | param->uuid[0] = '\0'; | 698 | param->uuid[0] = '\0'; |
642 | 699 | ||
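Editor's note: the strncpy() to strlcpy() switch above closes two small holes when filling the fixed-size name/uuid fields: strncpy() does not NUL-terminate when the source fills the buffer, and it zero-pads short strings. strlcpy() always terminates and copies at most size - 1 bytes. It is not part of ISO C, so this sketch carries a minimal local definition:

```c
/* Hedged sketch: a minimal strlcpy() (not in ISO C) and why it is
 * preferred for fixed-size name/uuid buffers. */
#include <stdio.h>
#include <string.h>

static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len < size - 1 ? len : size - 1;

		memcpy(dst, src, n);
		dst[n] = '\0';	/* always terminated */
	}
	return len;		/* callers can detect truncation */
}

int main(void)
{
	char name[8];

	/* strncpy() would leave name without a trailing NUL here. */
	my_strlcpy(name, "a-very-long-device-name", sizeof(name));
	printf("%s\n", name);	/* prints "a-very-" */
	return 0;
}
```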
@@ -681,10 +738,10 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
681 | __hash_remove(hc); | 738 | __hash_remove(hc); |
682 | up_write(&_hash_lock); | 739 | up_write(&_hash_lock); |
683 | 740 | ||
684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | 741 | if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) |
742 | param->flags |= DM_UEVENT_GENERATED_FLAG; | ||
685 | 743 | ||
686 | dm_put(md); | 744 | dm_put(md); |
687 | param->data_size = 0; | ||
688 | return 0; | 745 | return 0; |
689 | } | 746 | } |
690 | 747 | ||
@@ -718,7 +775,9 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
718 | return r; | 775 | return r; |
719 | 776 | ||
720 | param->data_size = 0; | 777 | param->data_size = 0; |
721 | return dm_hash_rename(param->event_nr, param->name, new_name); | 778 | |
779 | return dm_hash_rename(param->event_nr, ¶m->flags, param->name, | ||
780 | new_name); | ||
722 | } | 781 | } |
723 | 782 | ||
724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 783 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
@@ -784,7 +843,7 @@ static int do_suspend(struct dm_ioctl *param) | |||
784 | if (param->flags & DM_NOFLUSH_FLAG) | 843 | if (param->flags & DM_NOFLUSH_FLAG) |
785 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 844 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
786 | 845 | ||
787 | if (!dm_suspended(md)) | 846 | if (!dm_suspended_md(md)) |
788 | r = dm_suspend(md, suspend_flags); | 847 | r = dm_suspend(md, suspend_flags); |
789 | 848 | ||
790 | if (!r) | 849 | if (!r) |
@@ -800,7 +859,7 @@ static int do_resume(struct dm_ioctl *param) | |||
800 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; | 859 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; |
801 | struct hash_cell *hc; | 860 | struct hash_cell *hc; |
802 | struct mapped_device *md; | 861 | struct mapped_device *md; |
803 | struct dm_table *new_map; | 862 | struct dm_table *new_map, *old_map = NULL; |
804 | 863 | ||
805 | down_write(&_hash_lock); | 864 | down_write(&_hash_lock); |
806 | 865 | ||
@@ -826,14 +885,14 @@ static int do_resume(struct dm_ioctl *param) | |||
826 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; | 885 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; |
827 | if (param->flags & DM_NOFLUSH_FLAG) | 886 | if (param->flags & DM_NOFLUSH_FLAG) |
828 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 887 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
829 | if (!dm_suspended(md)) | 888 | if (!dm_suspended_md(md)) |
830 | dm_suspend(md, suspend_flags); | 889 | dm_suspend(md, suspend_flags); |
831 | 890 | ||
832 | r = dm_swap_table(md, new_map); | 891 | old_map = dm_swap_table(md, new_map); |
833 | if (r) { | 892 | if (IS_ERR(old_map)) { |
834 | dm_table_destroy(new_map); | 893 | dm_table_destroy(new_map); |
835 | dm_put(md); | 894 | dm_put(md); |
836 | return r; | 895 | return PTR_ERR(old_map); |
837 | } | 896 | } |
838 | 897 | ||
839 | if (dm_table_get_mode(new_map) & FMODE_WRITE) | 898 | if (dm_table_get_mode(new_map) & FMODE_WRITE) |
@@ -842,14 +901,17 @@ static int do_resume(struct dm_ioctl *param) | |||
842 | set_disk_ro(dm_disk(md), 1); | 901 | set_disk_ro(dm_disk(md), 1); |
843 | } | 902 | } |
844 | 903 | ||
845 | if (dm_suspended(md)) | 904 | if (dm_suspended_md(md)) { |
846 | r = dm_resume(md); | 905 | r = dm_resume(md); |
906 | if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) | ||
907 | param->flags |= DM_UEVENT_GENERATED_FLAG; | ||
908 | } | ||
847 | 909 | ||
910 | if (old_map) | ||
911 | dm_table_destroy(old_map); | ||
848 | 912 | ||
849 | if (!r) { | 913 | if (!r) |
850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
851 | r = __dev_status(md, param); | 914 | r = __dev_status(md, param); |
852 | } | ||
853 | 915 | ||
854 | dm_put(md); | 916 | dm_put(md); |
855 | return r; | 917 | return r; |
@@ -982,7 +1044,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) | |||
982 | if (r) | 1044 | if (r) |
983 | goto out; | 1045 | goto out; |
984 | 1046 | ||
985 | table = dm_get_table(md); | 1047 | table = dm_get_live_or_inactive_table(md, param); |
986 | if (table) { | 1048 | if (table) { |
987 | retrieve_status(table, param, param_size); | 1049 | retrieve_status(table, param, param_size); |
988 | dm_table_put(table); | 1050 | dm_table_put(table); |
@@ -1215,7 +1277,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) | |||
1215 | if (r) | 1277 | if (r) |
1216 | goto out; | 1278 | goto out; |
1217 | 1279 | ||
1218 | table = dm_get_table(md); | 1280 | table = dm_get_live_or_inactive_table(md, param); |
1219 | if (table) { | 1281 | if (table) { |
1220 | retrieve_deps(table, param, param_size); | 1282 | retrieve_deps(table, param, param_size); |
1221 | dm_table_put(table); | 1283 | dm_table_put(table); |
@@ -1244,13 +1306,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size) | |||
1244 | if (r) | 1306 | if (r) |
1245 | goto out; | 1307 | goto out; |
1246 | 1308 | ||
1247 | table = dm_get_table(md); | 1309 | table = dm_get_live_or_inactive_table(md, param); |
1248 | if (table) { | 1310 | if (table) { |
1249 | retrieve_status(table, param, param_size); | 1311 | retrieve_status(table, param, param_size); |
1250 | dm_table_put(table); | 1312 | dm_table_put(table); |
1251 | } | 1313 | } |
1252 | 1314 | ||
1253 | out: | 1315 | out: |
1254 | dm_put(md); | 1316 | dm_put(md); |
1255 | return r; | 1317 | return r; |
1256 | } | 1318 | } |
@@ -1288,10 +1350,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1288 | goto out; | 1350 | goto out; |
1289 | } | 1351 | } |
1290 | 1352 | ||
1291 | table = dm_get_table(md); | 1353 | table = dm_get_live_table(md); |
1292 | if (!table) | 1354 | if (!table) |
1293 | goto out_argv; | 1355 | goto out_argv; |
1294 | 1356 | ||
1357 | if (dm_deleting_md(md)) { | ||
1358 | r = -ENXIO; | ||
1359 | goto out_table; | ||
1360 | } | ||
1361 | |||
1295 | ti = dm_table_find_target(table, tmsg->sector); | 1362 | ti = dm_table_find_target(table, tmsg->sector); |
1296 | if (!dm_target_is_valid(ti)) { | 1363 | if (!dm_target_is_valid(ti)) { |
1297 | DMWARN("Target message sector outside device."); | 1364 | DMWARN("Target message sector outside device."); |
@@ -1303,6 +1370,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1303 | r = -EINVAL; | 1370 | r = -EINVAL; |
1304 | } | 1371 | } |
1305 | 1372 | ||
1373 | out_table: | ||
1306 | dm_table_put(table); | 1374 | dm_table_put(table); |
1307 | out_argv: | 1375 | out_argv: |
1308 | kfree(argv); | 1376 | kfree(argv); |
@@ -1413,6 +1481,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1413 | { | 1481 | { |
1414 | /* Always clear this flag */ | 1482 | /* Always clear this flag */ |
1415 | param->flags &= ~DM_BUFFER_FULL_FLAG; | 1483 | param->flags &= ~DM_BUFFER_FULL_FLAG; |
1484 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; | ||
1416 | 1485 | ||
1417 | /* Ignores parameters */ | 1486 | /* Ignores parameters */ |
1418 | if (cmd == DM_REMOVE_ALL_CMD || | 1487 | if (cmd == DM_REMOVE_ALL_CMD || |
@@ -1582,8 +1651,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1582 | if (!md) | 1651 | if (!md) |
1583 | return -ENXIO; | 1652 | return -ENXIO; |
1584 | 1653 | ||
1585 | dm_get(md); | 1654 | mutex_lock(&dm_hash_cells_mutex); |
1586 | down_read(&_hash_lock); | ||
1587 | hc = dm_get_mdptr(md); | 1655 | hc = dm_get_mdptr(md); |
1588 | if (!hc || hc->md != md) { | 1656 | if (!hc || hc->md != md) { |
1589 | r = -ENXIO; | 1657 | r = -ENXIO; |
@@ -1596,8 +1664,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1596 | strcpy(uuid, hc->uuid ? : ""); | 1664 | strcpy(uuid, hc->uuid ? : ""); |
1597 | 1665 | ||
1598 | out: | 1666 | out: |
1599 | up_read(&_hash_lock); | 1667 | mutex_unlock(&dm_hash_cells_mutex); |
1600 | dm_put(md); | ||
1601 | 1668 | ||
1602 | return r; | 1669 | return r; |
1603 | } | 1670 | } |
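
The dm-ioctl hunks above switch the status, deps and table-status paths from dm_get_table() to dm_get_live_or_inactive_table(), so an ioctl can report on a table that has been loaded but not yet swapped in, and dm_copy_name_and_uuid() now relies on a dedicated dm_hash_cells_mutex instead of taking a device reference. A minimal sketch of what such a table-selection helper amounts to; the flag name DM_QUERY_INACTIVE_TABLE_FLAG and the dm_get_inactive_table() accessor are inferred from the call sites, not copied from the patch:

    static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
                                                          struct dm_ioctl *param)
    {
            /* Caller owns a reference either way and must dm_table_put() it. */
            return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
                    dm_get_inactive_table(md) : dm_get_live_table(md);
    }
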
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3e3fc06cb861..addf83475040 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job) | |||
450 | { | 450 | { |
451 | struct dm_kcopyd_client *kc = job->kc; | 451 | struct dm_kcopyd_client *kc = job->kc; |
452 | atomic_inc(&kc->nr_jobs); | 452 | atomic_inc(&kc->nr_jobs); |
453 | push(&kc->pages_jobs, job); | 453 | if (unlikely(!job->source.count)) |
454 | push(&kc->complete_jobs, job); | ||
455 | else | ||
456 | push(&kc->pages_jobs, job); | ||
454 | wake(kc); | 457 | wake(kc); |
455 | } | 458 | } |
456 | 459 | ||
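
dispatch_job() now routes jobs whose source region has a zero sector count straight onto complete_jobs, so they skip page allocation entirely and only run their notify function once earlier work has drained. A hedged sketch of how a client might rely on that; zero_job_done, kcopyd_sync_point and the on-stack completion are illustrative names, not part of the patch:

    #include <linux/completion.h>
    #include <linux/dm-io.h>
    #include <linux/dm-kcopyd.h>

    static void zero_job_done(int read_err, unsigned long write_err, void *context)
    {
            complete(context);
    }

    /* Queue a zero-size job: no data moves, the callback simply fires in turn. */
    static void kcopyd_sync_point(struct dm_kcopyd_client *kc,
                                  struct block_device *bdev)
    {
            struct dm_io_region zero = {
                    .bdev   = bdev,
                    .sector = 0,
                    .count  = 0,            /* the !job->source.count case above */
            };
            DECLARE_COMPLETION_ONSTACK(done);

            dm_kcopyd_copy(kc, &zero, 1, &zero, 0, zero_job_done, &done);
            wait_for_completion(&done);
    }
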
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 82f7d6e6b1ea..9200dbf2391a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -47,8 +47,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
47 | } | 47 | } |
48 | lc->start = tmp; | 48 | lc->start = tmp; |
49 | 49 | ||
50 | if (dm_get_device(ti, argv[0], lc->start, ti->len, | 50 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { |
51 | dm_table_get_mode(ti->table), &lc->dev)) { | ||
52 | ti->error = "dm-linear: Device lookup failed"; | 51 | ti->error = "dm-linear: Device lookup failed"; |
53 | goto bad; | 52 | goto bad; |
54 | } | 53 | } |
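
The dm-linear hunk reflects the dm_get_device() interface change that runs through this series: the start/len arguments are dropped and only the mode derived from the table is passed. A hedged sketch of a constructor written against the new four-argument form; struct my_lc, my_ctr and the argument layout are illustrative, not from the patch:

    #include <linux/device-mapper.h>
    #include <linux/slab.h>

    struct my_lc {
            struct dm_dev *dev;
            sector_t start;
    };

    static int my_ctr(struct dm_target *ti, unsigned int argc, char **argv)
    {
            struct my_lc *lc;
            unsigned long long tmp;

            if (argc != 2) {
                    ti->error = "Invalid argument count";
                    return -EINVAL;
            }

            lc = kmalloc(sizeof(*lc), GFP_KERNEL);
            if (!lc) {
                    ti->error = "Cannot allocate context";
                    return -ENOMEM;
            }

            if (sscanf(argv[1], "%llu", &tmp) != 1) {
                    ti->error = "Invalid device sector";
                    goto bad;
            }
            lc->start = tmp;

            /* New form: no start/len, just the mode from the table. */
            if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
                              &lc->dev)) {
                    ti->error = "Device lookup failed";
                    goto bad;
            }

            ti->private = lc;
            return 0;

    bad:
            kfree(lc);
            return -EINVAL;
    }
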
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 652bd33109e3..1ed0094f064b 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -5,6 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/bio.h> | 7 | #include <linux/bio.h> |
8 | #include <linux/slab.h> | ||
8 | #include <linux/dm-dirty-log.h> | 9 | #include <linux/dm-dirty-log.h> |
9 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
10 | #include <linux/dm-log-userspace.h> | 11 | #include <linux/dm-log-userspace.h> |
@@ -156,7 +157,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
156 | } | 157 | } |
157 | 158 | ||
158 | /* The ptr value is sufficient for local unique id */ | 159 | /* The ptr value is sufficient for local unique id */ |
159 | lc->luid = (uint64_t)lc; | 160 | lc->luid = (unsigned long)lc; |
160 | 161 | ||
161 | lc->ti = ti; | 162 | lc->ti = ti; |
162 | 163 | ||
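
The (uint64_t) to (unsigned long) change in the luid assignment is a 32-bit build fix: unsigned long is always pointer-sized in the kernel, so casting the context pointer through it avoids the "cast from pointer to integer of different size" warning while still widening cleanly into the 64-bit luid. A one-line illustration; ptr_to_luid is an invented name:

    #include <linux/types.h>

    static inline uint64_t ptr_to_luid(const void *p)
    {
            return (unsigned long)p;        /* pointer to ulong to u64, no warning */
    }
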
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index ba0edad2d048..075cbcf8a9f5 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/slab.h> | ||
9 | #include <net/sock.h> | 10 | #include <net/sock.h> |
10 | #include <linux/workqueue.h> | 11 | #include <linux/workqueue.h> |
11 | #include <linux/connector.h> | 12 | #include <linux/connector.h> |
@@ -129,11 +130,13 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | |||
129 | * This is the connector callback that delivers data | 130 | * This is the connector callback that delivers data |
130 | * that was sent from userspace. | 131 | * that was sent from userspace. |
131 | */ | 132 | */ |
132 | static void cn_ulog_callback(void *data) | 133 | static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) |
133 | { | 134 | { |
134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); |
136 | 136 | ||
137 | if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) | ||
138 | return; | ||
139 | |||
137 | spin_lock(&receiving_list_lock); | 140 | spin_lock(&receiving_list_lock); |
138 | if (msg->len == 0) | 141 | if (msg->len == 0) |
139 | fill_pkg(msg, NULL); | 142 | fill_pkg(msg, NULL); |
@@ -170,11 +173,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, | |||
170 | { | 173 | { |
171 | int r = 0; | 174 | int r = 0; |
172 | size_t dummy = 0; | 175 | size_t dummy = 0; |
173 | int overhead_size = | 176 | int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); |
174 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
175 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | 177 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; |
176 | struct receiving_pkg pkg; | 178 | struct receiving_pkg pkg; |
177 | 179 | ||
180 | /* | ||
181 | * Given the space needed to hold the 'struct cn_msg' and | ||
182 | * 'struct dm_ulog_request' - do we have enough payload | ||
183 | * space remaining? | ||
184 | */ | ||
178 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | 185 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { |
179 | DMINFO("Size of tfr exceeds preallocated size"); | 186 | DMINFO("Size of tfr exceeds preallocated size"); |
180 | return -EINVAL; | 187 | return -EINVAL; |
@@ -189,7 +196,7 @@ resend: | |||
189 | */ | 196 | */ |
190 | mutex_lock(&dm_ulog_lock); | 197 | mutex_lock(&dm_ulog_lock); |
191 | 198 | ||
192 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | 199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); |
193 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | 200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); |
194 | tfr->luid = luid; | 201 | tfr->luid = luid; |
195 | tfr->seq = dm_ulog_seq++; | 202 | tfr->seq = dm_ulog_seq++; |
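
Two separate fixes sit in the transfer hunks above: the connector callback now receives the message and netlink parms directly and rejects senders lacking CAP_SYS_ADMIN, and overhead_size no longer measures a pointer where the request header was meant, so the preallocated-buffer check stops under-counting. A hedged illustration of the sizeof slip; show_overhead is an invented helper:

    #include <linux/connector.h>
    #include <linux/dm-log-userspace.h>
    #include <linux/kernel.h>

    static void show_overhead(void)
    {
            /* Old: size of a pointer (4 or 8 bytes), not the header itself. */
            size_t wrong = sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
            /* New: the real header size, so the payload check is honest. */
            size_t right = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);

            pr_info("ulog overhead: wrong=%zu right=%zu\n", wrong, right);
    }
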
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 9443896ede07..5a08be0222db 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) | |||
145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); | 145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); |
146 | 146 | ||
147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | 147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, |
148 | struct dm_target *ti, | 148 | struct dm_target *ti, |
149 | unsigned int argc, char **argv) | 149 | int (*flush_callback_fn)(struct dm_target *ti), |
150 | unsigned int argc, char **argv) | ||
150 | { | 151 | { |
151 | struct dm_dirty_log_type *type; | 152 | struct dm_dirty_log_type *type; |
152 | struct dm_dirty_log *log; | 153 | struct dm_dirty_log *log; |
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | |||
161 | return NULL; | 162 | return NULL; |
162 | } | 163 | } |
163 | 164 | ||
165 | log->flush_callback_fn = flush_callback_fn; | ||
164 | log->type = type; | 166 | log->type = type; |
165 | if (type->ctr(log, ti, argc, argv)) { | 167 | if (type->ctr(log, ti, argc, argv)) { |
166 | kfree(log); | 168 | kfree(log); |
@@ -208,7 +210,9 @@ struct log_header { | |||
208 | 210 | ||
209 | struct log_c { | 211 | struct log_c { |
210 | struct dm_target *ti; | 212 | struct dm_target *ti; |
211 | int touched; | 213 | int touched_dirtied; |
214 | int touched_cleaned; | ||
215 | int flush_failed; | ||
212 | uint32_t region_size; | 216 | uint32_t region_size; |
213 | unsigned int region_count; | 217 | unsigned int region_count; |
214 | region_t sync_count; | 218 | region_t sync_count; |
@@ -233,6 +237,7 @@ struct log_c { | |||
233 | * Disk log fields | 237 | * Disk log fields |
234 | */ | 238 | */ |
235 | int log_dev_failed; | 239 | int log_dev_failed; |
240 | int log_dev_flush_failed; | ||
236 | struct dm_dev *log_dev; | 241 | struct dm_dev *log_dev; |
237 | struct log_header header; | 242 | struct log_header header; |
238 | 243 | ||
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l, | |||
253 | uint32_t *bs, unsigned bit) | 258 | uint32_t *bs, unsigned bit) |
254 | { | 259 | { |
255 | ext2_set_bit(bit, (unsigned long *) bs); | 260 | ext2_set_bit(bit, (unsigned long *) bs); |
256 | l->touched = 1; | 261 | l->touched_cleaned = 1; |
257 | } | 262 | } |
258 | 263 | ||
259 | static inline void log_clear_bit(struct log_c *l, | 264 | static inline void log_clear_bit(struct log_c *l, |
260 | uint32_t *bs, unsigned bit) | 265 | uint32_t *bs, unsigned bit) |
261 | { | 266 | { |
262 | ext2_clear_bit(bit, (unsigned long *) bs); | 267 | ext2_clear_bit(bit, (unsigned long *) bs); |
263 | l->touched = 1; | 268 | l->touched_dirtied = 1; |
264 | } | 269 | } |
265 | 270 | ||
266 | /*---------------------------------------------------------------- | 271 | /*---------------------------------------------------------------- |
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw) | |||
287 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); | 292 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); |
288 | } | 293 | } |
289 | 294 | ||
295 | static int flush_header(struct log_c *lc) | ||
296 | { | ||
297 | struct dm_io_region null_location = { | ||
298 | .bdev = lc->header_location.bdev, | ||
299 | .sector = 0, | ||
300 | .count = 0, | ||
301 | }; | ||
302 | |||
303 | lc->io_req.bi_rw = WRITE_BARRIER; | ||
304 | |||
305 | return dm_io(&lc->io_req, 1, &null_location, NULL); | ||
306 | } | ||
307 | |||
290 | static int read_header(struct log_c *log) | 308 | static int read_header(struct log_c *log) |
291 | { | 309 | { |
292 | int r; | 310 | int r; |
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
378 | } | 396 | } |
379 | 397 | ||
380 | lc->ti = ti; | 398 | lc->ti = ti; |
381 | lc->touched = 0; | 399 | lc->touched_dirtied = 0; |
400 | lc->touched_cleaned = 0; | ||
401 | lc->flush_failed = 0; | ||
382 | lc->region_size = region_size; | 402 | lc->region_size = region_size; |
383 | lc->region_count = region_count; | 403 | lc->region_count = region_count; |
384 | lc->sync = sync; | 404 | lc->sync = sync; |
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
406 | } else { | 426 | } else { |
407 | lc->log_dev = dev; | 427 | lc->log_dev = dev; |
408 | lc->log_dev_failed = 0; | 428 | lc->log_dev_failed = 0; |
429 | lc->log_dev_flush_failed = 0; | ||
409 | lc->header_location.bdev = lc->log_dev->bdev; | 430 | lc->header_location.bdev = lc->log_dev->bdev; |
410 | lc->header_location.sector = 0; | 431 | lc->header_location.sector = 0; |
411 | 432 | ||
@@ -522,8 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
522 | return -EINVAL; | 543 | return -EINVAL; |
523 | } | 544 | } |
524 | 545 | ||
525 | r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, | 546 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); |
526 | FMODE_READ | FMODE_WRITE, &dev); | ||
527 | if (r) | 547 | if (r) |
528 | return r; | 548 | return r; |
529 | 549 | ||
@@ -614,6 +634,11 @@ static int disk_resume(struct dm_dirty_log *log) | |||
614 | 634 | ||
615 | /* write the new header */ | 635 | /* write the new header */ |
616 | r = rw_header(lc, WRITE); | 636 | r = rw_header(lc, WRITE); |
637 | if (!r) { | ||
638 | r = flush_header(lc); | ||
639 | if (r) | ||
640 | lc->log_dev_flush_failed = 1; | ||
641 | } | ||
617 | if (r) { | 642 | if (r) { |
618 | DMWARN("%s: Failed to write header on dirty region log device", | 643 | DMWARN("%s: Failed to write header on dirty region log device", |
619 | lc->log_dev->name); | 644 | lc->log_dev->name); |
@@ -656,18 +681,40 @@ static int core_flush(struct dm_dirty_log *log) | |||
656 | 681 | ||
657 | static int disk_flush(struct dm_dirty_log *log) | 682 | static int disk_flush(struct dm_dirty_log *log) |
658 | { | 683 | { |
659 | int r; | 684 | int r, i; |
660 | struct log_c *lc = (struct log_c *) log->context; | 685 | struct log_c *lc = log->context; |
661 | 686 | ||
662 | /* only write if the log has changed */ | 687 | /* only write if the log has changed */ |
663 | if (!lc->touched) | 688 | if (!lc->touched_cleaned && !lc->touched_dirtied) |
664 | return 0; | 689 | return 0; |
665 | 690 | ||
691 | if (lc->touched_cleaned && log->flush_callback_fn && | ||
692 | log->flush_callback_fn(lc->ti)) { | ||
693 | /* | ||
694 | * At this point it is impossible to determine which | ||
695 | * regions are clean and which are dirty (without | ||
696 | * re-reading the log off disk). So mark all of them | ||
697 | * dirty. | ||
698 | */ | ||
699 | lc->flush_failed = 1; | ||
700 | for (i = 0; i < lc->region_count; i++) | ||
701 | log_clear_bit(lc, lc->clean_bits, i); | ||
702 | } | ||
703 | |||
666 | r = rw_header(lc, WRITE); | 704 | r = rw_header(lc, WRITE); |
667 | if (r) | 705 | if (r) |
668 | fail_log_device(lc); | 706 | fail_log_device(lc); |
669 | else | 707 | else { |
670 | lc->touched = 0; | 708 | if (lc->touched_dirtied) { |
709 | r = flush_header(lc); | ||
710 | if (r) { | ||
711 | lc->log_dev_flush_failed = 1; | ||
712 | fail_log_device(lc); | ||
713 | } else | ||
714 | lc->touched_dirtied = 0; | ||
715 | } | ||
716 | lc->touched_cleaned = 0; | ||
717 | } | ||
671 | 718 | ||
672 | return r; | 719 | return r; |
673 | } | 720 | } |
@@ -681,7 +728,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region) | |||
681 | static void core_clear_region(struct dm_dirty_log *log, region_t region) | 728 | static void core_clear_region(struct dm_dirty_log *log, region_t region) |
682 | { | 729 | { |
683 | struct log_c *lc = (struct log_c *) log->context; | 730 | struct log_c *lc = (struct log_c *) log->context; |
684 | log_set_bit(lc, lc->clean_bits, region); | 731 | if (likely(!lc->flush_failed)) |
732 | log_set_bit(lc, lc->clean_bits, region); | ||
685 | } | 733 | } |
686 | 734 | ||
687 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | 735 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) |
@@ -762,7 +810,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status, | |||
762 | switch(status) { | 810 | switch(status) { |
763 | case STATUSTYPE_INFO: | 811 | case STATUSTYPE_INFO: |
764 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, | 812 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, |
765 | lc->log_dev_failed ? 'D' : 'A'); | 813 | lc->log_dev_flush_failed ? 'F' : |
814 | lc->log_dev_failed ? 'D' : | ||
815 | 'A'); | ||
766 | break; | 816 | break; |
767 | 817 | ||
768 | case STATUSTYPE_TABLE: | 818 | case STATUSTYPE_TABLE: |
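
dm_dirty_log_create() gains a flush callback which disk_flush() invokes before committing newly cleaned regions; if the callback fails, every region is marked dirty again and flush_failed blocks further clears. A hedged sketch of a caller passing such a callback; my_flush_data and my_create_log are illustrative names, and dm-raid1 later in this patch wires in mirror_flush the same way:

    #include <linux/device-mapper.h>
    #include <linux/dm-dirty-log.h>

    /* Flush the target's data devices; return 0 only if they are durable. */
    static int my_flush_data(struct dm_target *ti)
    {
            /* e.g. issue empty barriers to every data device */
            return 0;
    }

    static struct dm_dirty_log *my_create_log(struct dm_target *ti,
                                              unsigned argc, char **argv)
    {
            /* argv[0] selects the log type, the rest are its own arguments. */
            return dm_dirty_log_create(argv[0], ti, my_flush_data,
                                       argc - 1, argv + 1);
    }
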
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 32d0b878eccc..826bce7343b3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -69,6 +69,7 @@ struct multipath { | |||
69 | struct list_head priority_groups; | 69 | struct list_head priority_groups; |
70 | unsigned pg_init_required; /* pg_init needs calling? */ | 70 | unsigned pg_init_required; /* pg_init needs calling? */ |
71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ | 71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ |
72 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | ||
72 | 73 | ||
73 | unsigned nr_valid_paths; /* Total number of usable paths */ | 74 | unsigned nr_valid_paths; /* Total number of usable paths */ |
74 | struct pgpath *current_pgpath; | 75 | struct pgpath *current_pgpath; |
@@ -93,6 +94,8 @@ struct multipath { | |||
93 | * can resubmit bios on error. | 94 | * can resubmit bios on error. |
94 | */ | 95 | */ |
95 | mempool_t *mpio_pool; | 96 | mempool_t *mpio_pool; |
97 | |||
98 | struct mutex work_mutex; | ||
96 | }; | 99 | }; |
97 | 100 | ||
98 | /* | 101 | /* |
@@ -198,6 +201,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
198 | m->queue_io = 1; | 201 | m->queue_io = 1; |
199 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 202 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
200 | INIT_WORK(&m->trigger_event, trigger_event); | 203 | INIT_WORK(&m->trigger_event, trigger_event); |
204 | init_waitqueue_head(&m->pg_init_wait); | ||
205 | mutex_init(&m->work_mutex); | ||
201 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
202 | if (!m->mpio_pool) { | 207 | if (!m->mpio_pool) { |
203 | kfree(m); | 208 | kfree(m); |
@@ -230,6 +235,21 @@ static void free_multipath(struct multipath *m) | |||
230 | * Path selection | 235 | * Path selection |
231 | *-----------------------------------------------*/ | 236 | *-----------------------------------------------*/ |
232 | 237 | ||
238 | static void __pg_init_all_paths(struct multipath *m) | ||
239 | { | ||
240 | struct pgpath *pgpath; | ||
241 | |||
242 | m->pg_init_count++; | ||
243 | m->pg_init_required = 0; | ||
244 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { | ||
245 | /* Skip failed paths */ | ||
246 | if (!pgpath->is_active) | ||
247 | continue; | ||
248 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
249 | m->pg_init_in_progress++; | ||
250 | } | ||
251 | } | ||
252 | |||
233 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | 253 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) |
234 | { | 254 | { |
235 | m->current_pg = pgpath->pg; | 255 | m->current_pg = pgpath->pg; |
@@ -434,7 +454,7 @@ static void process_queued_ios(struct work_struct *work) | |||
434 | { | 454 | { |
435 | struct multipath *m = | 455 | struct multipath *m = |
436 | container_of(work, struct multipath, process_queued_ios); | 456 | container_of(work, struct multipath, process_queued_ios); |
437 | struct pgpath *pgpath = NULL, *tmp; | 457 | struct pgpath *pgpath = NULL; |
438 | unsigned must_queue = 1; | 458 | unsigned must_queue = 1; |
439 | unsigned long flags; | 459 | unsigned long flags; |
440 | 460 | ||
@@ -452,14 +472,9 @@ static void process_queued_ios(struct work_struct *work) | |||
452 | (!pgpath && !m->queue_if_no_path)) | 472 | (!pgpath && !m->queue_if_no_path)) |
453 | must_queue = 0; | 473 | must_queue = 0; |
454 | 474 | ||
455 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 475 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) |
456 | m->pg_init_count++; | 476 | __pg_init_all_paths(m); |
457 | m->pg_init_required = 0; | 477 | |
458 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { | ||
459 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) | ||
460 | m->pg_init_in_progress++; | ||
461 | } | ||
462 | } | ||
463 | out: | 478 | out: |
464 | spin_unlock_irqrestore(&m->lock, flags); | 479 | spin_unlock_irqrestore(&m->lock, flags); |
465 | if (!must_queue) | 480 | if (!must_queue) |
@@ -592,8 +607,8 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
592 | if (!p) | 607 | if (!p) |
593 | return ERR_PTR(-ENOMEM); | 608 | return ERR_PTR(-ENOMEM); |
594 | 609 | ||
595 | r = dm_get_device(ti, shift(as), ti->begin, ti->len, | 610 | r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), |
596 | dm_table_get_mode(ti->table), &p->path.dev); | 611 | &p->path.dev); |
597 | if (r) { | 612 | if (r) { |
598 | ti->error = "error getting device"; | 613 | ti->error = "error getting device"; |
599 | goto bad; | 614 | goto bad; |
@@ -885,13 +900,43 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
885 | return r; | 900 | return r; |
886 | } | 901 | } |
887 | 902 | ||
888 | static void multipath_dtr(struct dm_target *ti) | 903 | static void multipath_wait_for_pg_init_completion(struct multipath *m) |
889 | { | 904 | { |
890 | struct multipath *m = (struct multipath *) ti->private; | 905 | DECLARE_WAITQUEUE(wait, current); |
906 | unsigned long flags; | ||
907 | |||
908 | add_wait_queue(&m->pg_init_wait, &wait); | ||
891 | 909 | ||
910 | while (1) { | ||
911 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
912 | |||
913 | spin_lock_irqsave(&m->lock, flags); | ||
914 | if (!m->pg_init_in_progress) { | ||
915 | spin_unlock_irqrestore(&m->lock, flags); | ||
916 | break; | ||
917 | } | ||
918 | spin_unlock_irqrestore(&m->lock, flags); | ||
919 | |||
920 | io_schedule(); | ||
921 | } | ||
922 | set_current_state(TASK_RUNNING); | ||
923 | |||
924 | remove_wait_queue(&m->pg_init_wait, &wait); | ||
925 | } | ||
926 | |||
927 | static void flush_multipath_work(struct multipath *m) | ||
928 | { | ||
892 | flush_workqueue(kmpath_handlerd); | 929 | flush_workqueue(kmpath_handlerd); |
930 | multipath_wait_for_pg_init_completion(m); | ||
893 | flush_workqueue(kmultipathd); | 931 | flush_workqueue(kmultipathd); |
894 | flush_scheduled_work(); | 932 | flush_scheduled_work(); |
933 | } | ||
934 | |||
935 | static void multipath_dtr(struct dm_target *ti) | ||
936 | { | ||
937 | struct multipath *m = ti->private; | ||
938 | |||
939 | flush_multipath_work(m); | ||
895 | free_multipath(m); | 940 | free_multipath(m); |
896 | } | 941 | } |
897 | 942 | ||
@@ -1116,9 +1161,9 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) | |||
1116 | return limit_reached; | 1161 | return limit_reached; |
1117 | } | 1162 | } |
1118 | 1163 | ||
1119 | static void pg_init_done(struct dm_path *path, int errors) | 1164 | static void pg_init_done(void *data, int errors) |
1120 | { | 1165 | { |
1121 | struct pgpath *pgpath = path_to_pgpath(path); | 1166 | struct pgpath *pgpath = data; |
1122 | struct priority_group *pg = pgpath->pg; | 1167 | struct priority_group *pg = pgpath->pg; |
1123 | struct multipath *m = pg->m; | 1168 | struct multipath *m = pg->m; |
1124 | unsigned long flags; | 1169 | unsigned long flags; |
@@ -1132,8 +1177,8 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1132 | errors = 0; | 1177 | errors = 0; |
1133 | break; | 1178 | break; |
1134 | } | 1179 | } |
1135 | DMERR("Cannot failover device because scsi_dh_%s was not " | 1180 | DMERR("Could not failover the device: Handler scsi_dh_%s " |
1136 | "loaded.", m->hw_handler_name); | 1181 | "Error %d.", m->hw_handler_name, errors); |
1137 | /* | 1182 | /* |
1138 | * Fail path for now, so we do not ping pong | 1183 | * Fail path for now, so we do not ping pong |
1139 | */ | 1184 | */ |
@@ -1170,25 +1215,34 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1170 | m->current_pgpath = NULL; | 1215 | m->current_pgpath = NULL; |
1171 | m->current_pg = NULL; | 1216 | m->current_pg = NULL; |
1172 | } | 1217 | } |
1173 | } else if (!m->pg_init_required) { | 1218 | } else if (!m->pg_init_required) |
1174 | m->queue_io = 0; | ||
1175 | pg->bypassed = 0; | 1219 | pg->bypassed = 0; |
1176 | } | ||
1177 | 1220 | ||
1178 | m->pg_init_in_progress--; | 1221 | if (--m->pg_init_in_progress) |
1179 | if (!m->pg_init_in_progress) | 1222 | /* Activations of other paths are still ongoing */ |
1180 | queue_work(kmultipathd, &m->process_queued_ios); | 1223 | goto out; |
1224 | |||
1225 | if (!m->pg_init_required) | ||
1226 | m->queue_io = 0; | ||
1227 | |||
1228 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1229 | |||
1230 | /* | ||
1231 | * Wake up any thread waiting to suspend. | ||
1232 | */ | ||
1233 | wake_up(&m->pg_init_wait); | ||
1234 | |||
1235 | out: | ||
1181 | spin_unlock_irqrestore(&m->lock, flags); | 1236 | spin_unlock_irqrestore(&m->lock, flags); |
1182 | } | 1237 | } |
1183 | 1238 | ||
1184 | static void activate_path(struct work_struct *work) | 1239 | static void activate_path(struct work_struct *work) |
1185 | { | 1240 | { |
1186 | int ret; | ||
1187 | struct pgpath *pgpath = | 1241 | struct pgpath *pgpath = |
1188 | container_of(work, struct pgpath, activate_path); | 1242 | container_of(work, struct pgpath, activate_path); |
1189 | 1243 | ||
1190 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); | 1244 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), |
1191 | pg_init_done(&pgpath->path, ret); | 1245 | pg_init_done, pgpath); |
1192 | } | 1246 | } |
1193 | 1247 | ||
1194 | /* | 1248 | /* |
@@ -1261,6 +1315,15 @@ static void multipath_presuspend(struct dm_target *ti) | |||
1261 | queue_if_no_path(m, 0, 1); | 1315 | queue_if_no_path(m, 0, 1); |
1262 | } | 1316 | } |
1263 | 1317 | ||
1318 | static void multipath_postsuspend(struct dm_target *ti) | ||
1319 | { | ||
1320 | struct multipath *m = ti->private; | ||
1321 | |||
1322 | mutex_lock(&m->work_mutex); | ||
1323 | flush_multipath_work(m); | ||
1324 | mutex_unlock(&m->work_mutex); | ||
1325 | } | ||
1326 | |||
1264 | /* | 1327 | /* |
1265 | * Restore the queue_if_no_path setting. | 1328 | * Restore the queue_if_no_path setting. |
1266 | */ | 1329 | */ |
@@ -1397,51 +1460,65 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1397 | 1460 | ||
1398 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | 1461 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) |
1399 | { | 1462 | { |
1400 | int r; | 1463 | int r = -EINVAL; |
1401 | struct dm_dev *dev; | 1464 | struct dm_dev *dev; |
1402 | struct multipath *m = (struct multipath *) ti->private; | 1465 | struct multipath *m = (struct multipath *) ti->private; |
1403 | action_fn action; | 1466 | action_fn action; |
1404 | 1467 | ||
1468 | mutex_lock(&m->work_mutex); | ||
1469 | |||
1470 | if (dm_suspended(ti)) { | ||
1471 | r = -EBUSY; | ||
1472 | goto out; | ||
1473 | } | ||
1474 | |||
1405 | if (argc == 1) { | 1475 | if (argc == 1) { |
1406 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) | 1476 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { |
1407 | return queue_if_no_path(m, 1, 0); | 1477 | r = queue_if_no_path(m, 1, 0); |
1408 | else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) | 1478 | goto out; |
1409 | return queue_if_no_path(m, 0, 0); | 1479 | } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { |
1480 | r = queue_if_no_path(m, 0, 0); | ||
1481 | goto out; | ||
1482 | } | ||
1410 | } | 1483 | } |
1411 | 1484 | ||
1412 | if (argc != 2) | 1485 | if (argc != 2) { |
1413 | goto error; | 1486 | DMWARN("Unrecognised multipath message received."); |
1487 | goto out; | ||
1488 | } | ||
1414 | 1489 | ||
1415 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) | 1490 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) { |
1416 | return bypass_pg_num(m, argv[1], 1); | 1491 | r = bypass_pg_num(m, argv[1], 1); |
1417 | else if (!strnicmp(argv[0], MESG_STR("enable_group"))) | 1492 | goto out; |
1418 | return bypass_pg_num(m, argv[1], 0); | 1493 | } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { |
1419 | else if (!strnicmp(argv[0], MESG_STR("switch_group"))) | 1494 | r = bypass_pg_num(m, argv[1], 0); |
1420 | return switch_pg_num(m, argv[1]); | 1495 | goto out; |
1421 | else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | 1496 | } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { |
1497 | r = switch_pg_num(m, argv[1]); | ||
1498 | goto out; | ||
1499 | } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | ||
1422 | action = reinstate_path; | 1500 | action = reinstate_path; |
1423 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | 1501 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) |
1424 | action = fail_path; | 1502 | action = fail_path; |
1425 | else | 1503 | else { |
1426 | goto error; | 1504 | DMWARN("Unrecognised multipath message received."); |
1505 | goto out; | ||
1506 | } | ||
1427 | 1507 | ||
1428 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, | 1508 | r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); |
1429 | dm_table_get_mode(ti->table), &dev); | ||
1430 | if (r) { | 1509 | if (r) { |
1431 | DMWARN("message: error getting device %s", | 1510 | DMWARN("message: error getting device %s", |
1432 | argv[1]); | 1511 | argv[1]); |
1433 | return -EINVAL; | 1512 | goto out; |
1434 | } | 1513 | } |
1435 | 1514 | ||
1436 | r = action_dev(m, dev, action); | 1515 | r = action_dev(m, dev, action); |
1437 | 1516 | ||
1438 | dm_put_device(ti, dev); | 1517 | dm_put_device(ti, dev); |
1439 | 1518 | ||
1519 | out: | ||
1520 | mutex_unlock(&m->work_mutex); | ||
1440 | return r; | 1521 | return r; |
1441 | |||
1442 | error: | ||
1443 | DMWARN("Unrecognised multipath message received."); | ||
1444 | return -EINVAL; | ||
1445 | } | 1522 | } |
1446 | 1523 | ||
1447 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | 1524 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, |
@@ -1567,13 +1644,14 @@ out: | |||
1567 | *---------------------------------------------------------------*/ | 1644 | *---------------------------------------------------------------*/ |
1568 | static struct target_type multipath_target = { | 1645 | static struct target_type multipath_target = { |
1569 | .name = "multipath", | 1646 | .name = "multipath", |
1570 | .version = {1, 1, 0}, | 1647 | .version = {1, 1, 1}, |
1571 | .module = THIS_MODULE, | 1648 | .module = THIS_MODULE, |
1572 | .ctr = multipath_ctr, | 1649 | .ctr = multipath_ctr, |
1573 | .dtr = multipath_dtr, | 1650 | .dtr = multipath_dtr, |
1574 | .map_rq = multipath_map, | 1651 | .map_rq = multipath_map, |
1575 | .rq_end_io = multipath_end_io, | 1652 | .rq_end_io = multipath_end_io, |
1576 | .presuspend = multipath_presuspend, | 1653 | .presuspend = multipath_presuspend, |
1654 | .postsuspend = multipath_postsuspend, | ||
1577 | .resume = multipath_resume, | 1655 | .resume = multipath_resume, |
1578 | .status = multipath_status, | 1656 | .status = multipath_status, |
1579 | .message = multipath_message, | 1657 | .message = multipath_message, |
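
multipath_wait_for_pg_init_completion() above open-codes its wait because pg_init_in_progress is protected by m->lock and is decremented from pg_init_done(), which also performs the wake_up(). Stripped of the locking, the pattern is the usual counter-plus-waitqueue handshake; a hedged, simplified restatement with invented names (the real code must keep the spinlock around the counter and initialise the waitqueue with init_waitqueue_head() at allocation time):

    #include <linux/sched.h>
    #include <linux/wait.h>

    struct pg_init_state {
            wait_queue_head_t wait;         /* init_waitqueue_head() at setup */
            unsigned in_progress;
    };

    /* Suspend path: block until every outstanding activation has finished. */
    static void wait_for_pg_init(struct pg_init_state *s)
    {
            wait_event(s->wait, s->in_progress == 0);
    }

    /* Completion path: the last activation out wakes the waiter. */
    static void pg_init_finished(struct pg_init_state *s)
    {
            if (!--s->in_progress)
                    wake_up(&s->wait);
    }
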
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index cc9dc79b0784..ddda531723dc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); | |||
35 | *---------------------------------------------------------------*/ | 35 | *---------------------------------------------------------------*/ |
36 | enum dm_raid1_error { | 36 | enum dm_raid1_error { |
37 | DM_RAID1_WRITE_ERROR, | 37 | DM_RAID1_WRITE_ERROR, |
38 | DM_RAID1_FLUSH_ERROR, | ||
38 | DM_RAID1_SYNC_ERROR, | 39 | DM_RAID1_SYNC_ERROR, |
39 | DM_RAID1_READ_ERROR | 40 | DM_RAID1_READ_ERROR |
40 | }; | 41 | }; |
@@ -57,6 +58,7 @@ struct mirror_set { | |||
57 | struct bio_list reads; | 58 | struct bio_list reads; |
58 | struct bio_list writes; | 59 | struct bio_list writes; |
59 | struct bio_list failures; | 60 | struct bio_list failures; |
61 | struct bio_list holds; /* bios are waiting until suspend */ | ||
60 | 62 | ||
61 | struct dm_region_hash *rh; | 63 | struct dm_region_hash *rh; |
62 | struct dm_kcopyd_client *kcopyd_client; | 64 | struct dm_kcopyd_client *kcopyd_client; |
@@ -67,6 +69,7 @@ struct mirror_set { | |||
67 | region_t nr_regions; | 69 | region_t nr_regions; |
68 | int in_sync; | 70 | int in_sync; |
69 | int log_failure; | 71 | int log_failure; |
72 | int leg_failure; | ||
70 | atomic_t suspend; | 73 | atomic_t suspend; |
71 | 74 | ||
72 | atomic_t default_mirror; /* Default mirror */ | 75 | atomic_t default_mirror; /* Default mirror */ |
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m) | |||
179 | atomic_set(&ms->default_mirror, m - m0); | 182 | atomic_set(&ms->default_mirror, m - m0); |
180 | } | 183 | } |
181 | 184 | ||
185 | static struct mirror *get_valid_mirror(struct mirror_set *ms) | ||
186 | { | ||
187 | struct mirror *m; | ||
188 | |||
189 | for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) | ||
190 | if (!atomic_read(&m->error_count)) | ||
191 | return m; | ||
192 | |||
193 | return NULL; | ||
194 | } | ||
195 | |||
182 | /* fail_mirror | 196 | /* fail_mirror |
183 | * @m: mirror device to fail | 197 | * @m: mirror device to fail |
184 | * @error_type: one of the enum's, DM_RAID1_*_ERROR | 198 | * @error_type: one of the enum's, DM_RAID1_*_ERROR |
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
198 | struct mirror_set *ms = m->ms; | 212 | struct mirror_set *ms = m->ms; |
199 | struct mirror *new; | 213 | struct mirror *new; |
200 | 214 | ||
215 | ms->leg_failure = 1; | ||
216 | |||
201 | /* | 217 | /* |
202 | * error_count is used for nothing more than a | 218 | * error_count is used for nothing more than a |
203 | * simple way to tell if a device has encountered | 219 | * simple way to tell if a device has encountered |
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
224 | goto out; | 240 | goto out; |
225 | } | 241 | } |
226 | 242 | ||
227 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) | 243 | new = get_valid_mirror(ms); |
228 | if (!atomic_read(&new->error_count)) { | 244 | if (new) |
229 | set_default_mirror(new); | 245 | set_default_mirror(new); |
230 | break; | 246 | else |
231 | } | ||
232 | |||
233 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) | ||
234 | DMWARN("All sides of mirror have failed."); | 247 | DMWARN("All sides of mirror have failed."); |
235 | 248 | ||
236 | out: | 249 | out: |
237 | schedule_work(&ms->trigger_event); | 250 | schedule_work(&ms->trigger_event); |
238 | } | 251 | } |
239 | 252 | ||
253 | static int mirror_flush(struct dm_target *ti) | ||
254 | { | ||
255 | struct mirror_set *ms = ti->private; | ||
256 | unsigned long error_bits; | ||
257 | |||
258 | unsigned int i; | ||
259 | struct dm_io_region io[ms->nr_mirrors]; | ||
260 | struct mirror *m; | ||
261 | struct dm_io_request io_req = { | ||
262 | .bi_rw = WRITE_BARRIER, | ||
263 | .mem.type = DM_IO_KMEM, | ||
264 | .mem.ptr.bvec = NULL, | ||
265 | .client = ms->io_client, | ||
266 | }; | ||
267 | |||
268 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { | ||
269 | io[i].bdev = m->dev->bdev; | ||
270 | io[i].sector = 0; | ||
271 | io[i].count = 0; | ||
272 | } | ||
273 | |||
274 | error_bits = -1; | ||
275 | dm_io(&io_req, ms->nr_mirrors, io, &error_bits); | ||
276 | if (unlikely(error_bits != 0)) { | ||
277 | for (i = 0; i < ms->nr_mirrors; i++) | ||
278 | if (test_bit(i, &error_bits)) | ||
279 | fail_mirror(ms->mirror + i, | ||
280 | DM_RAID1_FLUSH_ERROR); | ||
281 | return -EIO; | ||
282 | } | ||
283 | |||
284 | return 0; | ||
285 | } | ||
286 | |||
240 | /*----------------------------------------------------------------- | 287 | /*----------------------------------------------------------------- |
241 | * Recovery. | 288 | * Recovery. |
242 | * | 289 | * |
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) | |||
396 | */ | 443 | */ |
397 | static sector_t map_sector(struct mirror *m, struct bio *bio) | 444 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
398 | { | 445 | { |
446 | if (unlikely(!bio->bi_size)) | ||
447 | return 0; | ||
399 | return m->offset + (bio->bi_sector - m->ms->ti->begin); | 448 | return m->offset + (bio->bi_sector - m->ms->ti->begin); |
400 | } | 449 | } |
401 | 450 | ||
@@ -413,6 +462,34 @@ static void map_region(struct dm_io_region *io, struct mirror *m, | |||
413 | io->count = bio->bi_size >> 9; | 462 | io->count = bio->bi_size >> 9; |
414 | } | 463 | } |
415 | 464 | ||
465 | static void hold_bio(struct mirror_set *ms, struct bio *bio) | ||
466 | { | ||
467 | /* | ||
468 | * Lock is required to avoid race condition during suspend | ||
469 | * process. | ||
470 | */ | ||
471 | spin_lock_irq(&ms->lock); | ||
472 | |||
473 | if (atomic_read(&ms->suspend)) { | ||
474 | spin_unlock_irq(&ms->lock); | ||
475 | |||
476 | /* | ||
477 | * If device is suspended, complete the bio. | ||
478 | */ | ||
479 | if (dm_noflush_suspending(ms->ti)) | ||
480 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
481 | else | ||
482 | bio_endio(bio, -EIO); | ||
483 | return; | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Hold bio until the suspend is complete. | ||
488 | */ | ||
489 | bio_list_add(&ms->holds, bio); | ||
490 | spin_unlock_irq(&ms->lock); | ||
491 | } | ||
492 | |||
416 | /*----------------------------------------------------------------- | 493 | /*----------------------------------------------------------------- |
417 | * Reads | 494 | * Reads |
418 | *---------------------------------------------------------------*/ | 495 | *---------------------------------------------------------------*/ |
@@ -511,7 +588,6 @@ static void write_callback(unsigned long error, void *context) | |||
511 | unsigned i, ret = 0; | 588 | unsigned i, ret = 0; |
512 | struct bio *bio = (struct bio *) context; | 589 | struct bio *bio = (struct bio *) context; |
513 | struct mirror_set *ms; | 590 | struct mirror_set *ms; |
514 | int uptodate = 0; | ||
515 | int should_wake = 0; | 591 | int should_wake = 0; |
516 | unsigned long flags; | 592 | unsigned long flags; |
517 | 593 | ||
@@ -524,36 +600,27 @@ static void write_callback(unsigned long error, void *context) | |||
524 | * This way we handle both writes to SYNC and NOSYNC | 600 | * This way we handle both writes to SYNC and NOSYNC |
525 | * regions with the same code. | 601 | * regions with the same code. |
526 | */ | 602 | */ |
527 | if (likely(!error)) | 603 | if (likely(!error)) { |
528 | goto out; | 604 | bio_endio(bio, ret); |
605 | return; | ||
606 | } | ||
529 | 607 | ||
530 | for (i = 0; i < ms->nr_mirrors; i++) | 608 | for (i = 0; i < ms->nr_mirrors; i++) |
531 | if (test_bit(i, &error)) | 609 | if (test_bit(i, &error)) |
532 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); | 610 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); |
533 | else | ||
534 | uptodate = 1; | ||
535 | 611 | ||
536 | if (unlikely(!uptodate)) { | 612 | /* |
537 | DMERR("All replicated volumes dead, failing I/O"); | 613 | * Need to raise event. Since raising |
538 | /* None of the writes succeeded, fail the I/O. */ | 614 | * events can block, we need to do it in |
539 | ret = -EIO; | 615 | * the main thread. |
540 | } else if (errors_handled(ms)) { | 616 | */ |
541 | /* | 617 | spin_lock_irqsave(&ms->lock, flags); |
542 | * Need to raise event. Since raising | 618 | if (!ms->failures.head) |
543 | * events can block, we need to do it in | 619 | should_wake = 1; |
544 | * the main thread. | 620 | bio_list_add(&ms->failures, bio); |
545 | */ | 621 | spin_unlock_irqrestore(&ms->lock, flags); |
546 | spin_lock_irqsave(&ms->lock, flags); | 622 | if (should_wake) |
547 | if (!ms->failures.head) | 623 | wakeup_mirrord(ms); |
548 | should_wake = 1; | ||
549 | bio_list_add(&ms->failures, bio); | ||
550 | spin_unlock_irqrestore(&ms->lock, flags); | ||
551 | if (should_wake) | ||
552 | wakeup_mirrord(ms); | ||
553 | return; | ||
554 | } | ||
555 | out: | ||
556 | bio_endio(bio, ret); | ||
557 | } | 624 | } |
558 | 625 | ||
559 | static void do_write(struct mirror_set *ms, struct bio *bio) | 626 | static void do_write(struct mirror_set *ms, struct bio *bio) |
@@ -562,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
562 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 629 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
563 | struct mirror *m; | 630 | struct mirror *m; |
564 | struct dm_io_request io_req = { | 631 | struct dm_io_request io_req = { |
565 | .bi_rw = WRITE, | 632 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), |
566 | .mem.type = DM_IO_BVEC, | 633 | .mem.type = DM_IO_BVEC, |
567 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 634 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
568 | .notify.fn = write_callback, | 635 | .notify.fn = write_callback, |
@@ -603,6 +670,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
603 | bio_list_init(&requeue); | 670 | bio_list_init(&requeue); |
604 | 671 | ||
605 | while ((bio = bio_list_pop(writes))) { | 672 | while ((bio = bio_list_pop(writes))) { |
673 | if (unlikely(bio_empty_barrier(bio))) { | ||
674 | bio_list_add(&sync, bio); | ||
675 | continue; | ||
676 | } | ||
677 | |||
606 | region = dm_rh_bio_to_region(ms->rh, bio); | 678 | region = dm_rh_bio_to_region(ms->rh, bio); |
607 | 679 | ||
608 | if (log->type->is_remote_recovering && | 680 | if (log->type->is_remote_recovering && |
@@ -659,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
659 | /* | 731 | /* |
660 | * Dispatch io. | 732 | * Dispatch io. |
661 | */ | 733 | */ |
662 | if (unlikely(ms->log_failure)) { | 734 | if (unlikely(ms->log_failure) && errors_handled(ms)) { |
663 | spin_lock_irq(&ms->lock); | 735 | spin_lock_irq(&ms->lock); |
664 | bio_list_merge(&ms->failures, &sync); | 736 | bio_list_merge(&ms->failures, &sync); |
665 | spin_unlock_irq(&ms->lock); | 737 | spin_unlock_irq(&ms->lock); |
@@ -672,8 +744,15 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
672 | dm_rh_delay(ms->rh, bio); | 744 | dm_rh_delay(ms->rh, bio); |
673 | 745 | ||
674 | while ((bio = bio_list_pop(&nosync))) { | 746 | while ((bio = bio_list_pop(&nosync))) { |
675 | map_bio(get_default_mirror(ms), bio); | 747 | if (unlikely(ms->leg_failure) && errors_handled(ms)) { |
676 | generic_make_request(bio); | 748 | spin_lock_irq(&ms->lock); |
749 | bio_list_add(&ms->failures, bio); | ||
750 | spin_unlock_irq(&ms->lock); | ||
751 | wakeup_mirrord(ms); | ||
752 | } else { | ||
753 | map_bio(get_default_mirror(ms), bio); | ||
754 | generic_make_request(bio); | ||
755 | } | ||
677 | } | 756 | } |
678 | } | 757 | } |
679 | 758 | ||
@@ -681,20 +760,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
681 | { | 760 | { |
682 | struct bio *bio; | 761 | struct bio *bio; |
683 | 762 | ||
684 | if (!failures->head) | 763 | if (likely(!failures->head)) |
685 | return; | ||
686 | |||
687 | if (!ms->log_failure) { | ||
688 | while ((bio = bio_list_pop(failures))) { | ||
689 | ms->in_sync = 0; | ||
690 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); | ||
691 | } | ||
692 | return; | 764 | return; |
693 | } | ||
694 | 765 | ||
695 | /* | 766 | /* |
696 | * If the log has failed, unattempted writes are being | 767 | * If the log has failed, unattempted writes are being |
697 | * put on the failures list. We can't issue those writes | 768 | * put on the holds list. We can't issue those writes |
698 | * until a log has been marked, so we must store them. | 769 | * until a log has been marked, so we must store them. |
699 | * | 770 | * |
700 | * If a 'noflush' suspend is in progress, we can requeue | 771 | * If a 'noflush' suspend is in progress, we can requeue |
@@ -709,23 +780,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
709 | * for us to treat them the same and requeue them | 780 | * for us to treat them the same and requeue them |
710 | * as well. | 781 | * as well. |
711 | */ | 782 | */ |
712 | if (dm_noflush_suspending(ms->ti)) { | 783 | while ((bio = bio_list_pop(failures))) { |
713 | while ((bio = bio_list_pop(failures))) | 784 | if (!ms->log_failure) { |
714 | bio_endio(bio, DM_ENDIO_REQUEUE); | 785 | ms->in_sync = 0; |
715 | return; | 786 | dm_rh_mark_nosync(ms->rh, bio); |
716 | } | 787 | } |
717 | 788 | ||
718 | if (atomic_read(&ms->suspend)) { | 789 | /* |
719 | while ((bio = bio_list_pop(failures))) | 790 | * If all the legs are dead, fail the I/O. |
791 | * If we have been told to handle errors, hold the bio | ||
792 | * and wait for userspace to deal with the problem. | ||
793 | * Otherwise pretend that the I/O succeeded. (This would | ||
794 | * be wrong if the failed leg returned after reboot and | ||
795 | * got replicated back to the good legs.) | ||
796 | */ | ||
797 | if (!get_valid_mirror(ms)) | ||
720 | bio_endio(bio, -EIO); | 798 | bio_endio(bio, -EIO); |
721 | return; | 799 | else if (errors_handled(ms)) |
800 | hold_bio(ms, bio); | ||
801 | else | ||
802 | bio_endio(bio, 0); | ||
722 | } | 803 | } |
723 | |||
724 | spin_lock_irq(&ms->lock); | ||
725 | bio_list_merge(&ms->failures, failures); | ||
726 | spin_unlock_irq(&ms->lock); | ||
727 | |||
728 | delayed_wake(ms); | ||
729 | } | 804 | } |
730 | 805 | ||
731 | static void trigger_event(struct work_struct *work) | 806 | static void trigger_event(struct work_struct *work) |
@@ -784,12 +859,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
784 | } | 859 | } |
785 | 860 | ||
786 | spin_lock_init(&ms->lock); | 861 | spin_lock_init(&ms->lock); |
862 | bio_list_init(&ms->reads); | ||
863 | bio_list_init(&ms->writes); | ||
864 | bio_list_init(&ms->failures); | ||
865 | bio_list_init(&ms->holds); | ||
787 | 866 | ||
788 | ms->ti = ti; | 867 | ms->ti = ti; |
789 | ms->nr_mirrors = nr_mirrors; | 868 | ms->nr_mirrors = nr_mirrors; |
790 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | 869 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); |
791 | ms->in_sync = 0; | 870 | ms->in_sync = 0; |
792 | ms->log_failure = 0; | 871 | ms->log_failure = 0; |
872 | ms->leg_failure = 0; | ||
793 | atomic_set(&ms->suspend, 0); | 873 | atomic_set(&ms->suspend, 0); |
794 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); | 874 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); |
795 | 875 | ||
@@ -847,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
847 | return -EINVAL; | 927 | return -EINVAL; |
848 | } | 928 | } |
849 | 929 | ||
850 | if (dm_get_device(ti, argv[0], offset, ti->len, | 930 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
851 | dm_table_get_mode(ti->table), | ||
852 | &ms->mirror[mirror].dev)) { | 931 | &ms->mirror[mirror].dev)) { |
853 | ti->error = "Device lookup failure"; | 932 | ti->error = "Device lookup failure"; |
854 | return -ENXIO; | 933 | return -ENXIO; |
@@ -889,7 +968,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
889 | return NULL; | 968 | return NULL; |
890 | } | 969 | } |
891 | 970 | ||
892 | dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); | 971 | dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, |
972 | argv + 2); | ||
893 | if (!dl) { | 973 | if (!dl) { |
894 | ti->error = "Error creating mirror dirty log"; | 974 | ti->error = "Error creating mirror dirty log"; |
895 | return NULL; | 975 | return NULL; |
@@ -995,6 +1075,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
995 | 1075 | ||
996 | ti->private = ms; | 1076 | ti->private = ms; |
997 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1077 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1078 | ti->num_flush_requests = 1; | ||
998 | 1079 | ||
999 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1080 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); |
1000 | if (!ms->kmirrord_wq) { | 1081 | if (!ms->kmirrord_wq) { |
@@ -1122,7 +1203,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1122 | * We need to dec pending if this was a write. | 1203 | * We need to dec pending if this was a write. |
1123 | */ | 1204 | */ |
1124 | if (rw == WRITE) { | 1205 | if (rw == WRITE) { |
1125 | dm_rh_dec(ms->rh, map_context->ll); | 1206 | if (likely(!bio_empty_barrier(bio))) |
1207 | dm_rh_dec(ms->rh, map_context->ll); | ||
1126 | return error; | 1208 | return error; |
1127 | } | 1209 | } |
1128 | 1210 | ||
@@ -1180,9 +1262,26 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1180 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1262 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1181 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1263 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1182 | 1264 | ||
1265 | struct bio_list holds; | ||
1266 | struct bio *bio; | ||
1267 | |||
1183 | atomic_set(&ms->suspend, 1); | 1268 | atomic_set(&ms->suspend, 1); |
1184 | 1269 | ||
1185 | /* | 1270 | /* |
1271 | * Process bios in the hold list to start recovery waiting | ||
1272 | * for bios in the hold list. After the process, no bio has | ||
1273 | * a chance to be added in the hold list because ms->suspend | ||
1274 | * is set. | ||
1275 | */ | ||
1276 | spin_lock_irq(&ms->lock); | ||
1277 | holds = ms->holds; | ||
1278 | bio_list_init(&ms->holds); | ||
1279 | spin_unlock_irq(&ms->lock); | ||
1280 | |||
1281 | while ((bio = bio_list_pop(&holds))) | ||
1282 | hold_bio(ms, bio); | ||
1283 | |||
1284 | /* | ||
1186 | * We must finish up all the work that we've | 1285 | * We must finish up all the work that we've |
1187 | * generated (i.e. recovery work). | 1286 | * generated (i.e. recovery work). |
1188 | */ | 1287 | */ |
@@ -1244,7 +1343,8 @@ static char device_status_char(struct mirror *m) | |||
1244 | if (!atomic_read(&(m->error_count))) | 1343 | if (!atomic_read(&(m->error_count))) |
1245 | return 'A'; | 1344 | return 'A'; |
1246 | 1345 | ||
1247 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | 1346 | return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : |
1347 | (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | ||
1248 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : | 1348 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : |
1249 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; | 1349 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; |
1250 | } | 1350 | } |
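
mirror_ctr() now advertises ti->num_flush_requests = 1, so the core hands the target zero-size WRITE_BARRIER bios; that is why map_sector() bails out when bio->bi_size is zero and why do_writes() and mirror_end_io() special-case bio_empty_barrier(). A hedged sketch of what that means for a map path; struct my_dev and my_map are illustrative, not the mirror code:

    #include <linux/bio.h>
    #include <linux/device-mapper.h>

    struct my_dev {
            struct dm_dev *dev;
            sector_t offset;
    };

    static int my_map(struct dm_target *ti, struct bio *bio,
                      union map_info *map_context)
    {
            struct my_dev *d = ti->private;

            bio->bi_bdev = d->dev->bdev;
            if (bio->bi_size)       /* ordinary I/O: remap into the device */
                    bio->bi_sector = d->offset + (bio->bi_sector - ti->begin);
            /* bi_size == 0: empty barrier, the sector is meaningless */

            return DM_MAPIO_REMAPPED;
    }
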
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 36dbe29f2fd6..bd5c58b28868 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/ctype.h> | 11 | #include <linux/ctype.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/slab.h> | ||
14 | #include <linux/vmalloc.h> | 15 | #include <linux/vmalloc.h> |
15 | 16 | ||
16 | #include "dm.h" | 17 | #include "dm.h" |
@@ -79,6 +80,11 @@ struct dm_region_hash { | |||
79 | struct list_head recovered_regions; | 80 | struct list_head recovered_regions; |
80 | struct list_head failed_recovered_regions; | 81 | struct list_head failed_recovered_regions; |
81 | 82 | ||
83 | /* | ||
84 | * If there was a barrier failure no regions can be marked clean. | ||
85 | */ | ||
86 | int barrier_failure; | ||
87 | |||
82 | void *context; | 88 | void *context; |
83 | sector_t target_begin; | 89 | sector_t target_begin; |
84 | 90 | ||
@@ -211,6 +217,7 @@ struct dm_region_hash *dm_region_hash_create( | |||
211 | INIT_LIST_HEAD(&rh->quiesced_regions); | 217 | INIT_LIST_HEAD(&rh->quiesced_regions); |
212 | INIT_LIST_HEAD(&rh->recovered_regions); | 218 | INIT_LIST_HEAD(&rh->recovered_regions); |
213 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
220 | rh->barrier_failure = 0; | ||
214 | 221 | ||
215 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
216 | sizeof(struct dm_region)); | 223 | sizeof(struct dm_region)); |
@@ -377,8 +384,6 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
377 | /* dm_rh_mark_nosync | 384 | /* dm_rh_mark_nosync |
378 | * @ms | 385 | * @ms |
379 | * @bio | 386 | * @bio |
380 | * @done | ||
381 | * @error | ||
382 | * | 387 | * |
383 | * The bio was written on some mirror(s) but failed on other mirror(s). | 388 | * The bio was written on some mirror(s) but failed on other mirror(s). |
384 | * We can successfully endio the bio but should avoid the region being | 389 | * We can successfully endio the bio but should avoid the region being |
@@ -386,8 +391,7 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
386 | * | 391 | * |
387 | * This function is _not_ safe in interrupt context! | 392 | * This function is _not_ safe in interrupt context! |
388 | */ | 393 | */ |
389 | void dm_rh_mark_nosync(struct dm_region_hash *rh, | 394 | void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) |
390 | struct bio *bio, unsigned done, int error) | ||
391 | { | 395 | { |
392 | unsigned long flags; | 396 | unsigned long flags; |
393 | struct dm_dirty_log *log = rh->log; | 397 | struct dm_dirty_log *log = rh->log; |
@@ -395,6 +399,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
395 | region_t region = dm_rh_bio_to_region(rh, bio); | 399 | region_t region = dm_rh_bio_to_region(rh, bio); |
396 | int recovering = 0; | 400 | int recovering = 0; |
397 | 401 | ||
402 | if (bio_empty_barrier(bio)) { | ||
403 | rh->barrier_failure = 1; | ||
404 | return; | ||
405 | } | ||
406 | |||
398 | /* We must inform the log that the sync count has changed. */ | 407 | /* We must inform the log that the sync count has changed. */ |
399 | log->type->set_region_sync(log, region, 0); | 408 | log->type->set_region_sync(log, region, 0); |
400 | 409 | ||
@@ -419,7 +428,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
419 | BUG_ON(!list_empty(®->list)); | 428 | BUG_ON(!list_empty(®->list)); |
420 | spin_unlock_irqrestore(&rh->region_lock, flags); | 429 | spin_unlock_irqrestore(&rh->region_lock, flags); |
421 | 430 | ||
422 | bio_endio(bio, error); | ||
423 | if (recovering) | 431 | if (recovering) |
424 | complete_resync_work(reg, 0); | 432 | complete_resync_work(reg, 0); |
425 | } | 433 | } |
@@ -515,8 +523,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
515 | { | 523 | { |
516 | struct bio *bio; | 524 | struct bio *bio; |
517 | 525 | ||
518 | for (bio = bios->head; bio; bio = bio->bi_next) | 526 | for (bio = bios->head; bio; bio = bio->bi_next) { |
527 | if (bio_empty_barrier(bio)) | ||
528 | continue; | ||
519 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
530 | } | ||
520 | } | 531 | } |
521 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | 532 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); |
522 | 533 | ||
@@ -544,7 +555,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) | |||
544 | */ | 555 | */ |
545 | 556 | ||
546 | /* do nothing for DM_RH_NOSYNC */ | 557 | /* do nothing for DM_RH_NOSYNC */ |
547 | if (reg->state == DM_RH_RECOVERING) { | 558 | if (unlikely(rh->barrier_failure)) { |
559 | /* | ||
560 | * If a write barrier failed some time ago, we | ||
561 | * don't know whether or not this write made it | ||
562 | * to the disk, so we must resync the device. | ||
563 | */ | ||
564 | reg->state = DM_RH_NOSYNC; | ||
565 | } else if (reg->state == DM_RH_RECOVERING) { | ||
548 | list_add_tail(®->list, &rh->quiesced_regions); | 566 | list_add_tail(®->list, &rh->quiesced_regions); |
549 | } else if (reg->state == DM_RH_DIRTY) { | 567 | } else if (reg->state == DM_RH_DIRTY) { |
550 | reg->state = DM_RH_CLEAN; | 568 | reg->state = DM_RH_CLEAN; |
@@ -643,10 +661,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success) | |||
643 | spin_lock_irq(&rh->region_lock); | 661 | spin_lock_irq(&rh->region_lock); |
644 | if (success) | 662 | if (success) |
645 | list_add(®->list, ®->rh->recovered_regions); | 663 | list_add(®->list, ®->rh->recovered_regions); |
646 | else { | 664 | else |
647 | reg->state = DM_RH_NOSYNC; | ||
648 | list_add(®->list, ®->rh->failed_recovered_regions); | 665 | list_add(®->list, ®->rh->failed_recovered_regions); |
649 | } | 666 | |
650 | spin_unlock_irq(&rh->region_lock); | 667 | spin_unlock_irq(&rh->region_lock); |
651 | 668 | ||
652 | rh->wakeup_workers(rh->context); | 669 | rh->wakeup_workers(rh->context); |
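
dm_rh_mark_nosync() loses its done/error arguments and no longer completes the bio itself; a failed empty barrier now just sets rh->barrier_failure, and dm_rh_dec() later downgrades affected regions to DM_RH_NOSYNC so they are resynced. A hedged sketch of a caller under the new contract; the helper name is invented, and do_failures() in the dm-raid1 hunks above follows the same pattern:

    #include <linux/bio.h>
    #include <linux/dm-region-hash.h>

    /*
     * A write reached at least one leg but failed on another: mark the
     * region out-of-sync, then complete the bio ourselves, since the
     * region hash no longer calls bio_endio() on our behalf.
     */
    static void note_partial_write_failure(struct dm_region_hash *rh,
                                           struct bio *bio)
    {
            dm_rh_mark_nosync(rh, bio);
            bio_endio(bio, 0);
    }
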
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index cfa668f46c40..9c6c2e47ad62 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include "dm.h" | 11 | #include "dm.h" |
12 | #include "dm-path-selector.h" | 12 | #include "dm-path-selector.h" |
13 | 13 | ||
14 | #include <linux/slab.h> | ||
15 | |||
14 | #define DM_MSG_PREFIX "multipath service-time" | 16 | #define DM_MSG_PREFIX "multipath service-time" |
15 | #define ST_MIN_IO 1 | 17 | #define ST_MIN_IO 1 |
16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | 18 | #define ST_MAX_RELATIVE_THROUGHPUT 100 |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d5b2e08750d5..c097d8a4823d 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -55,6 +55,8 @@ | |||
55 | */ | 55 | */ |
56 | #define SNAPSHOT_DISK_VERSION 1 | 56 | #define SNAPSHOT_DISK_VERSION 1 |
57 | 57 | ||
58 | #define NUM_SNAPSHOT_HDR_CHUNKS 1 | ||
59 | |||
58 | struct disk_header { | 60 | struct disk_header { |
59 | uint32_t magic; | 61 | uint32_t magic; |
60 | 62 | ||
@@ -120,7 +122,22 @@ struct pstore { | |||
120 | 122 | ||
121 | /* | 123 | /* |
122 | * The next free chunk for an exception. | 124 | * The next free chunk for an exception. |
125 | * | ||
126 | * When creating exceptions, all the chunks here and above are | ||
127 | * free. It holds the next chunk to be allocated. On rare | ||
128 | * occasions (e.g. after a system crash) holes can be left in | ||
129 | * the exception store because chunks can be committed out of | ||
130 | * order. | ||
131 | * | ||
132 | * When merging exceptions, it does not necessarily mean all the | ||
133 | * chunks here and above are free. It holds the value it would | ||
134 | * have held if all chunks had been committed in order of | ||
135 | * allocation. Consequently the value may occasionally be | ||
136 | * slightly too low, but since it's only used for 'status' and | ||
137 | * it can never reach its minimum value too early this doesn't | ||
138 | * matter. | ||
123 | */ | 139 | */ |
140 | |||
124 | chunk_t next_free; | 141 | chunk_t next_free; |
125 | 142 | ||
126 | /* | 143 | /* |
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
214 | int metadata) | 231 | int metadata) |
215 | { | 232 | { |
216 | struct dm_io_region where = { | 233 | struct dm_io_region where = { |
217 | .bdev = ps->store->cow->bdev, | 234 | .bdev = dm_snap_cow(ps->store->snap)->bdev, |
218 | .sector = ps->store->chunk_size * chunk, | 235 | .sector = ps->store->chunk_size * chunk, |
219 | .count = ps->store->chunk_size, | 236 | .count = ps->store->chunk_size, |
220 | }; | 237 | }; |
@@ -237,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
237 | * Issue the synchronous I/O from a different thread | 254 | * Issue the synchronous I/O from a different thread |
238 | * to avoid generic_make_request recursion. | 255 | * to avoid generic_make_request recursion. |
239 | */ | 256 | */ |
240 | INIT_WORK(&req.work, do_metadata); | 257 | INIT_WORK_ON_STACK(&req.work, do_metadata); |
241 | queue_work(ps->metadata_wq, &req.work); | 258 | queue_work(ps->metadata_wq, &req.work); |
242 | flush_workqueue(ps->metadata_wq); | 259 | flush_workqueue(ps->metadata_wq); |
243 | 260 | ||
@@ -284,16 +301,18 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
284 | { | 301 | { |
285 | int r; | 302 | int r; |
286 | struct disk_header *dh; | 303 | struct disk_header *dh; |
287 | chunk_t chunk_size; | 304 | unsigned chunk_size; |
288 | int chunk_size_supplied = 1; | 305 | int chunk_size_supplied = 1; |
289 | char *chunk_err; | 306 | char *chunk_err; |
290 | 307 | ||
291 | /* | 308 | /* |
292 | * Use default chunk size (or hardsect_size, if larger) if none supplied | 309 | * Use default chunk size (or logical_block_size, if larger) |
310 | * if none supplied | ||
293 | */ | 311 | */ |
294 | if (!ps->store->chunk_size) { | 312 | if (!ps->store->chunk_size) { |
295 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 313 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
296 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); | 314 | bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> |
315 | bdev) >> 9); | ||
297 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 316 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
298 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 317 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
299 | chunk_size_supplied = 0; | 318 | chunk_size_supplied = 0; |
@@ -334,10 +353,9 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
334 | return 0; | 353 | return 0; |
335 | 354 | ||
336 | if (chunk_size_supplied) | 355 | if (chunk_size_supplied) |
337 | DMWARN("chunk size %llu in device metadata overrides " | 356 | DMWARN("chunk size %u in device metadata overrides " |
338 | "table chunk size of %llu.", | 357 | "table chunk size of %u.", |
339 | (unsigned long long)chunk_size, | 358 | chunk_size, ps->store->chunk_size); |
340 | (unsigned long long)ps->store->chunk_size); | ||
341 | 359 | ||
342 | /* We had a bogus chunk_size. Fix stuff up. */ | 360 | /* We had a bogus chunk_size. Fix stuff up. */ |
343 | free_area(ps); | 361 | free_area(ps); |
@@ -345,8 +363,8 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
345 | r = dm_exception_store_set_chunk_size(ps->store, chunk_size, | 363 | r = dm_exception_store_set_chunk_size(ps->store, chunk_size, |
346 | &chunk_err); | 364 | &chunk_err); |
347 | if (r) { | 365 | if (r) { |
348 | DMERR("invalid on-disk chunk size %llu: %s.", | 366 | DMERR("invalid on-disk chunk size %u: %s.", |
349 | (unsigned long long)chunk_size, chunk_err); | 367 | chunk_size, chunk_err); |
350 | return r; | 368 | return r; |
351 | } | 369 | } |
352 | 370 | ||
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps, | |||
408 | e->new_chunk = cpu_to_le64(de->new_chunk); | 426 | e->new_chunk = cpu_to_le64(de->new_chunk); |
409 | } | 427 | } |
410 | 428 | ||
429 | static void clear_exception(struct pstore *ps, uint32_t index) | ||
430 | { | ||
431 | struct disk_exception *e = get_exception(ps, index); | ||
432 | |||
433 | /* clear it */ | ||
434 | e->old_chunk = 0; | ||
435 | e->new_chunk = 0; | ||
436 | } | ||
437 | |||
411 | /* | 438 | /* |
412 | * Registers the exceptions that are present in the current area. | 439 | * Registers the exceptions that are present in the current area. |
413 | * 'full' is filled in to indicate if the area has been | 440 | * 'full' is filled in to indicate if the area has been |
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store) | |||
489 | return (struct pstore *) store->context; | 516 | return (struct pstore *) store->context; |
490 | } | 517 | } |
491 | 518 | ||
492 | static void persistent_fraction_full(struct dm_exception_store *store, | 519 | static void persistent_usage(struct dm_exception_store *store, |
493 | sector_t *numerator, sector_t *denominator) | 520 | sector_t *total_sectors, |
521 | sector_t *sectors_allocated, | ||
522 | sector_t *metadata_sectors) | ||
494 | { | 523 | { |
495 | *numerator = get_info(store)->next_free * store->chunk_size; | 524 | struct pstore *ps = get_info(store); |
496 | *denominator = get_dev_size(store->cow->bdev); | 525 | |
526 | *sectors_allocated = ps->next_free * store->chunk_size; | ||
527 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); | ||
528 | |||
529 | /* | ||
530 | * First chunk is the fixed header. | ||
531 | * Then there are (ps->current_area + 1) metadata chunks, each one | ||
532 | * separated from the next by ps->exceptions_per_area data chunks. | ||
533 | */ | ||
534 | *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * | ||
535 | store->chunk_size; | ||
497 | } | 536 | } |
498 | 537 | ||
499 | static void persistent_dtr(struct dm_exception_store *store) | 538 | static void persistent_dtr(struct dm_exception_store *store) |
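persistent_usage() above derives the metadata overhead from the layout described in its comment: one fixed header chunk, plus one metadata chunk per area written so far, with ps->exceptions_per_area data chunks between metadata chunks. As a worked example with made-up numbers (16-sector chunks, current_area == 3), the store has consumed (3 + 1 + 1) * 16 = 80 metadata sectors. The same arithmetic as a standalone check:

#include <assert.h>
#include <stdint.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

typedef uint64_t sector_t;

/* Metadata sectors used: header chunk plus one metadata chunk per area. */
static sector_t metadata_sectors(sector_t chunk_size, uint64_t current_area)
{
	return (current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size;
}

int main(void)
{
	/* 16-sector chunks, four areas written (areas 0..3). */
	assert(metadata_sectors(16, 3) == 80);
	/* A brand-new store has only the header and the first area. */
	assert(metadata_sectors(16, 0) == 32);
	return 0;
}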
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store, | |||
552 | ps->current_area = 0; | 591 | ps->current_area = 0; |
553 | zero_memory_area(ps); | 592 | zero_memory_area(ps); |
554 | r = zero_disk_area(ps, 0); | 593 | r = zero_disk_area(ps, 0); |
555 | if (r) { | 594 | if (r) |
556 | DMWARN("zero_disk_area(0) failed"); | 595 | DMWARN("zero_disk_area(0) failed"); |
557 | return r; | 596 | return r; |
558 | } | 597 | } |
559 | } else { | 598 | /* |
560 | /* | 599 | * Sanity checks. |
561 | * Sanity checks. | 600 | */ |
562 | */ | 601 | if (ps->version != SNAPSHOT_DISK_VERSION) { |
563 | if (ps->version != SNAPSHOT_DISK_VERSION) { | 602 | DMWARN("unable to handle snapshot disk version %d", |
564 | DMWARN("unable to handle snapshot disk version %d", | 603 | ps->version); |
565 | ps->version); | 604 | return -EINVAL; |
566 | return -EINVAL; | 605 | } |
567 | } | ||
568 | 606 | ||
569 | /* | 607 | /* |
570 | * Metadata are valid, but snapshot is invalidated | 608 | * Metadata are valid, but snapshot is invalidated |
571 | */ | 609 | */ |
572 | if (!ps->valid) | 610 | if (!ps->valid) |
573 | return 1; | 611 | return 1; |
574 | 612 | ||
575 | /* | 613 | /* |
576 | * Read the metadata. | 614 | * Read the metadata. |
577 | */ | 615 | */ |
578 | r = read_exceptions(ps, callback, callback_context); | 616 | r = read_exceptions(ps, callback, callback_context); |
579 | if (r) | ||
580 | return r; | ||
581 | } | ||
582 | 617 | ||
583 | return 0; | 618 | return r; |
584 | } | 619 | } |
585 | 620 | ||
586 | static int persistent_prepare_exception(struct dm_exception_store *store, | 621 | static int persistent_prepare_exception(struct dm_exception_store *store, |
587 | struct dm_snap_exception *e) | 622 | struct dm_exception *e) |
588 | { | 623 | { |
589 | struct pstore *ps = get_info(store); | 624 | struct pstore *ps = get_info(store); |
590 | uint32_t stride; | 625 | uint32_t stride; |
591 | chunk_t next_free; | 626 | chunk_t next_free; |
592 | sector_t size = get_dev_size(store->cow->bdev); | 627 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
593 | 628 | ||
594 | /* Is there enough room ? */ | 629 | /* Is there enough room ? */ |
595 | if (size < ((ps->next_free + 1) * store->chunk_size)) | 630 | if (size < ((ps->next_free + 1) * store->chunk_size)) |
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, | |||
611 | } | 646 | } |
612 | 647 | ||
613 | static void persistent_commit_exception(struct dm_exception_store *store, | 648 | static void persistent_commit_exception(struct dm_exception_store *store, |
614 | struct dm_snap_exception *e, | 649 | struct dm_exception *e, |
615 | void (*callback) (void *, int success), | 650 | void (*callback) (void *, int success), |
616 | void *callback_context) | 651 | void *callback_context) |
617 | { | 652 | { |
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
672 | ps->callback_count = 0; | 707 | ps->callback_count = 0; |
673 | } | 708 | } |
674 | 709 | ||
710 | static int persistent_prepare_merge(struct dm_exception_store *store, | ||
711 | chunk_t *last_old_chunk, | ||
712 | chunk_t *last_new_chunk) | ||
713 | { | ||
714 | struct pstore *ps = get_info(store); | ||
715 | struct disk_exception de; | ||
716 | int nr_consecutive; | ||
717 | int r; | ||
718 | |||
719 | /* | ||
720 | * When current area is empty, move back to preceding area. | ||
721 | */ | ||
722 | if (!ps->current_committed) { | ||
723 | /* | ||
724 | * Have we finished? | ||
725 | */ | ||
726 | if (!ps->current_area) | ||
727 | return 0; | ||
728 | |||
729 | ps->current_area--; | ||
730 | r = area_io(ps, READ); | ||
731 | if (r < 0) | ||
732 | return r; | ||
733 | ps->current_committed = ps->exceptions_per_area; | ||
734 | } | ||
735 | |||
736 | read_exception(ps, ps->current_committed - 1, &de); | ||
737 | *last_old_chunk = de.old_chunk; | ||
738 | *last_new_chunk = de.new_chunk; | ||
739 | |||
740 | /* | ||
741 | * Find number of consecutive chunks within the current area, | ||
742 | * working backwards. | ||
743 | */ | ||
744 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; | ||
745 | nr_consecutive++) { | ||
746 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, | ||
747 | &de); | ||
748 | if (de.old_chunk != *last_old_chunk - nr_consecutive || | ||
749 | de.new_chunk != *last_new_chunk - nr_consecutive) | ||
750 | break; | ||
751 | } | ||
752 | |||
753 | return nr_consecutive; | ||
754 | } | ||
755 | |||
756 | static int persistent_commit_merge(struct dm_exception_store *store, | ||
757 | int nr_merged) | ||
758 | { | ||
759 | int r, i; | ||
760 | struct pstore *ps = get_info(store); | ||
761 | |||
762 | BUG_ON(nr_merged > ps->current_committed); | ||
763 | |||
764 | for (i = 0; i < nr_merged; i++) | ||
765 | clear_exception(ps, ps->current_committed - 1 - i); | ||
766 | |||
767 | r = area_io(ps, WRITE); | ||
768 | if (r < 0) | ||
769 | return r; | ||
770 | |||
771 | ps->current_committed -= nr_merged; | ||
772 | |||
773 | /* | ||
774 | * At this stage, only persistent_usage() uses ps->next_free, so | ||
775 | * we make no attempt to keep ps->next_free strictly accurate | ||
776 | * as exceptions may have been committed out-of-order originally. | ||
777 | * Once a snapshot has become merging, we set it to the value it | ||
778 | * would have held had all the exceptions been committed in order. | ||
779 | * | ||
780 | * ps->current_area does not get reduced by prepare_merge() until | ||
781 | * after commit_merge() has removed the nr_merged previous exceptions. | ||
782 | */ | ||
783 | ps->next_free = (area_location(ps, ps->current_area) - 1) + | ||
784 | (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; | ||
785 | |||
786 | return 0; | ||
787 | } | ||
788 | |||
675 | static void persistent_drop_snapshot(struct dm_exception_store *store) | 789 | static void persistent_drop_snapshot(struct dm_exception_store *store) |
676 | { | 790 | { |
677 | struct pstore *ps = get_info(store); | 791 | struct pstore *ps = get_info(store); |
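persistent_prepare_merge() above walks backwards from the most recently committed exception and counts how many of the preceding entries have old_chunk and new_chunk values that decrease by exactly one per step; the caller can then merge that whole run with a single copy. A user-space model of the backwards scan (struct exc and trailing_consecutive are illustrative names, not kernel API):

#include <assert.h>
#include <stdint.h>

struct exc { uint64_t old_chunk, new_chunk; };

/*
 * Given exceptions committed in order e[0..n-1], count how many of the most
 * recent ones form a consecutive run (both chunk numbers decreasing by one
 * as we walk backwards), mirroring persistent_prepare_merge().
 */
static int trailing_consecutive(const struct exc *e, int n)
{
	int i;

	if (!n)
		return 0;
	for (i = 1; i < n; i++)
		if (e[n - 1 - i].old_chunk != e[n - 1].old_chunk - i ||
		    e[n - 1 - i].new_chunk != e[n - 1].new_chunk - i)
			break;
	return i;
}

int main(void)
{
	struct exc a[] = { {5, 9}, {10, 20}, {11, 21}, {12, 22} };

	/* The last three entries are consecutive; {5, 9} breaks the run. */
	assert(trailing_consecutive(a, 4) == 3);
	return 0;
}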
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store, | |||
697 | ps->area = NULL; | 811 | ps->area = NULL; |
698 | ps->zero_area = NULL; | 812 | ps->zero_area = NULL; |
699 | ps->header_area = NULL; | 813 | ps->header_area = NULL; |
700 | ps->next_free = 2; /* skipping the header and first area */ | 814 | ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ |
701 | ps->current_committed = 0; | 815 | ps->current_committed = 0; |
702 | 816 | ||
703 | ps->callback_count = 0; | 817 | ps->callback_count = 0; |
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store, | |||
726 | case STATUSTYPE_INFO: | 840 | case STATUSTYPE_INFO: |
727 | break; | 841 | break; |
728 | case STATUSTYPE_TABLE: | 842 | case STATUSTYPE_TABLE: |
729 | DMEMIT(" %s P %llu", store->cow->name, | 843 | DMEMIT(" P %llu", (unsigned long long)store->chunk_size); |
730 | (unsigned long long)store->chunk_size); | ||
731 | } | 844 | } |
732 | 845 | ||
733 | return sz; | 846 | return sz; |
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = { | |||
741 | .read_metadata = persistent_read_metadata, | 854 | .read_metadata = persistent_read_metadata, |
742 | .prepare_exception = persistent_prepare_exception, | 855 | .prepare_exception = persistent_prepare_exception, |
743 | .commit_exception = persistent_commit_exception, | 856 | .commit_exception = persistent_commit_exception, |
857 | .prepare_merge = persistent_prepare_merge, | ||
858 | .commit_merge = persistent_commit_merge, | ||
744 | .drop_snapshot = persistent_drop_snapshot, | 859 | .drop_snapshot = persistent_drop_snapshot, |
745 | .fraction_full = persistent_fraction_full, | 860 | .usage = persistent_usage, |
746 | .status = persistent_status, | 861 | .status = persistent_status, |
747 | }; | 862 | }; |
748 | 863 | ||
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = { | |||
754 | .read_metadata = persistent_read_metadata, | 869 | .read_metadata = persistent_read_metadata, |
755 | .prepare_exception = persistent_prepare_exception, | 870 | .prepare_exception = persistent_prepare_exception, |
756 | .commit_exception = persistent_commit_exception, | 871 | .commit_exception = persistent_commit_exception, |
872 | .prepare_merge = persistent_prepare_merge, | ||
873 | .commit_merge = persistent_commit_merge, | ||
757 | .drop_snapshot = persistent_drop_snapshot, | 874 | .drop_snapshot = persistent_drop_snapshot, |
758 | .fraction_full = persistent_fraction_full, | 875 | .usage = persistent_usage, |
759 | .status = persistent_status, | 876 | .status = persistent_status, |
760 | }; | 877 | }; |
761 | 878 | ||
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index cde5aa558e6d..a0898a66a2f8 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c | |||
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store, | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static int transient_prepare_exception(struct dm_exception_store *store, | 38 | static int transient_prepare_exception(struct dm_exception_store *store, |
39 | struct dm_snap_exception *e) | 39 | struct dm_exception *e) |
40 | { | 40 | { |
41 | struct transient_c *tc = store->context; | 41 | struct transient_c *tc = store->context; |
42 | sector_t size = get_dev_size(store->cow->bdev); | 42 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
43 | 43 | ||
44 | if (size < (tc->next_free + store->chunk_size)) | 44 | if (size < (tc->next_free + store->chunk_size)) |
45 | return -1; | 45 | return -1; |
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store, | |||
51 | } | 51 | } |
52 | 52 | ||
53 | static void transient_commit_exception(struct dm_exception_store *store, | 53 | static void transient_commit_exception(struct dm_exception_store *store, |
54 | struct dm_snap_exception *e, | 54 | struct dm_exception *e, |
55 | void (*callback) (void *, int success), | 55 | void (*callback) (void *, int success), |
56 | void *callback_context) | 56 | void *callback_context) |
57 | { | 57 | { |
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store, | |||
59 | callback(callback_context, 1); | 59 | callback(callback_context, 1); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void transient_fraction_full(struct dm_exception_store *store, | 62 | static void transient_usage(struct dm_exception_store *store, |
63 | sector_t *numerator, sector_t *denominator) | 63 | sector_t *total_sectors, |
64 | sector_t *sectors_allocated, | ||
65 | sector_t *metadata_sectors) | ||
64 | { | 66 | { |
65 | *numerator = ((struct transient_c *) store->context)->next_free; | 67 | *sectors_allocated = ((struct transient_c *) store->context)->next_free; |
66 | *denominator = get_dev_size(store->cow->bdev); | 68 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); |
69 | *metadata_sectors = 0; | ||
67 | } | 70 | } |
68 | 71 | ||
69 | static int transient_ctr(struct dm_exception_store *store, | 72 | static int transient_ctr(struct dm_exception_store *store, |
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store, | |||
91 | case STATUSTYPE_INFO: | 94 | case STATUSTYPE_INFO: |
92 | break; | 95 | break; |
93 | case STATUSTYPE_TABLE: | 96 | case STATUSTYPE_TABLE: |
94 | DMEMIT(" %s N %llu", store->cow->name, | 97 | DMEMIT(" N %llu", (unsigned long long)store->chunk_size); |
95 | (unsigned long long)store->chunk_size); | ||
96 | } | 98 | } |
97 | 99 | ||
98 | return sz; | 100 | return sz; |
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = { | |||
106 | .read_metadata = transient_read_metadata, | 108 | .read_metadata = transient_read_metadata, |
107 | .prepare_exception = transient_prepare_exception, | 109 | .prepare_exception = transient_prepare_exception, |
108 | .commit_exception = transient_commit_exception, | 110 | .commit_exception = transient_commit_exception, |
109 | .fraction_full = transient_fraction_full, | 111 | .usage = transient_usage, |
110 | .status = transient_status, | 112 | .status = transient_status, |
111 | }; | 113 | }; |
112 | 114 | ||
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = { | |||
118 | .read_metadata = transient_read_metadata, | 120 | .read_metadata = transient_read_metadata, |
119 | .prepare_exception = transient_prepare_exception, | 121 | .prepare_exception = transient_prepare_exception, |
120 | .commit_exception = transient_commit_exception, | 122 | .commit_exception = transient_commit_exception, |
121 | .fraction_full = transient_fraction_full, | 123 | .usage = transient_usage, |
122 | .status = transient_status, | 124 | .status = transient_status, |
123 | }; | 125 | }; |
124 | 126 | ||
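Both exception stores now report usage as three sector counts (total, allocated, metadata) instead of the old numerator/denominator pair, which lets the snapshot target expose metadata overhead separately. The exact status string is produced by snapshot_status() in dm-snap.c, outside these hunks, so the formatter below is only an assumed sketch of how the triple might be presented:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Hypothetical status formatter built on the new usage() triple. */
static int format_usage(char *buf, size_t len,
			sector_t total, sector_t allocated, sector_t metadata)
{
	return snprintf(buf, len, "%llu/%llu %llu",
			(unsigned long long)allocated,
			(unsigned long long)total,
			(unsigned long long)metadata);
}

int main(void)
{
	char buf[64];

	format_usage(buf, sizeof(buf), 204800, 80, 32);
	puts(buf);	/* prints "80/204800 32" */
	return 0;
}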
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 57f1bf7f3b7a..54853773510c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -25,6 +25,11 @@ | |||
25 | 25 | ||
26 | #define DM_MSG_PREFIX "snapshots" | 26 | #define DM_MSG_PREFIX "snapshots" |
27 | 27 | ||
28 | static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | ||
29 | |||
30 | #define dm_target_is_snapshot_merge(ti) \ | ||
31 | ((ti)->type->name == dm_snapshot_merge_target_name) | ||
32 | |||
28 | /* | 33 | /* |
29 | * The percentage increment we will wake up users at | 34 | * The percentage increment we will wake up users at |
30 | */ | 35 | */ |
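dm_target_is_snapshot_merge() compares the target type's name pointer against the static dm_snapshot_merge_target_name array instead of calling strcmp(). That only works if the snapshot-merge target_type is registered with that very array (its definition appears later in dm-snap.c, outside this excerpt), so the snippet below demonstrates the idiom with stand-in structures rather than the kernel ones:

#include <assert.h>
#include <string.h>

static const char merge_name[] = "snapshot-merge";

struct target_type { const char *name; };

int main(void)
{
	struct target_type merge = { merge_name };	/* registered with the same array */
	struct target_type other = { "snapshot" };

	assert(merge.name == merge_name);		/* pointer comparison suffices */
	assert(other.name != merge_name);
	assert(strcmp(merge_name, "snapshot-merge") == 0);	/* strcmp would also work */
	return 0;
}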
@@ -49,7 +54,7 @@ | |||
49 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ | 54 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ |
50 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) | 55 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) |
51 | 56 | ||
52 | struct exception_table { | 57 | struct dm_exception_table { |
53 | uint32_t hash_mask; | 58 | uint32_t hash_mask; |
54 | unsigned hash_shift; | 59 | unsigned hash_shift; |
55 | struct list_head *table; | 60 | struct list_head *table; |
@@ -59,22 +64,31 @@ struct dm_snapshot { | |||
59 | struct rw_semaphore lock; | 64 | struct rw_semaphore lock; |
60 | 65 | ||
61 | struct dm_dev *origin; | 66 | struct dm_dev *origin; |
67 | struct dm_dev *cow; | ||
68 | |||
69 | struct dm_target *ti; | ||
62 | 70 | ||
63 | /* List of snapshots per Origin */ | 71 | /* List of snapshots per Origin */ |
64 | struct list_head list; | 72 | struct list_head list; |
65 | 73 | ||
66 | /* You can't use a snapshot if this is 0 (e.g. if full) */ | 74 | /* |
75 | * You can't use a snapshot if this is 0 (e.g. if full). | ||
76 | * A snapshot-merge target never clears this. | ||
77 | */ | ||
67 | int valid; | 78 | int valid; |
68 | 79 | ||
69 | /* Origin writes don't trigger exceptions until this is set */ | 80 | /* Origin writes don't trigger exceptions until this is set */ |
70 | int active; | 81 | int active; |
71 | 82 | ||
72 | mempool_t *pending_pool; | 83 | /* Whether or not owning mapped_device is suspended */ |
84 | int suspended; | ||
73 | 85 | ||
74 | atomic_t pending_exceptions_count; | 86 | atomic_t pending_exceptions_count; |
75 | 87 | ||
76 | struct exception_table pending; | 88 | mempool_t *pending_pool; |
77 | struct exception_table complete; | 89 | |
90 | struct dm_exception_table pending; | ||
91 | struct dm_exception_table complete; | ||
78 | 92 | ||
79 | /* | 93 | /* |
80 | * pe_lock protects all pending_exception operations and access | 94 | * pe_lock protects all pending_exception operations and access |
@@ -82,6 +96,11 @@ struct dm_snapshot { | |||
82 | */ | 96 | */ |
83 | spinlock_t pe_lock; | 97 | spinlock_t pe_lock; |
84 | 98 | ||
99 | /* Chunks with outstanding reads */ | ||
100 | spinlock_t tracked_chunk_lock; | ||
101 | mempool_t *tracked_chunk_pool; | ||
102 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; | ||
103 | |||
85 | /* The on disk metadata handler */ | 104 | /* The on disk metadata handler */ |
86 | struct dm_exception_store *store; | 105 | struct dm_exception_store *store; |
87 | 106 | ||
@@ -91,12 +110,50 @@ struct dm_snapshot { | |||
91 | struct bio_list queued_bios; | 110 | struct bio_list queued_bios; |
92 | struct work_struct queued_bios_work; | 111 | struct work_struct queued_bios_work; |
93 | 112 | ||
94 | /* Chunks with outstanding reads */ | 113 | /* Wait for events based on state_bits */ |
95 | mempool_t *tracked_chunk_pool; | 114 | unsigned long state_bits; |
96 | spinlock_t tracked_chunk_lock; | 115 | |
97 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; | 116 | /* Range of chunks currently being merged. */ |
117 | chunk_t first_merging_chunk; | ||
118 | int num_merging_chunks; | ||
119 | |||
120 | /* | ||
121 | * The merge operation failed if this flag is set. | ||
122 | * Failure modes are handled as follows: | ||
123 | * - I/O error reading the header | ||
124 | * => don't load the target; abort. | ||
125 | * - Header does not have "valid" flag set | ||
126 | * => use the origin; forget about the snapshot. | ||
127 | * - I/O error when reading exceptions | ||
128 | * => don't load the target; abort. | ||
129 | * (We can't use the intermediate origin state.) | ||
130 | * - I/O error while merging | ||
131 | * => stop merging; set merge_failed; process I/O normally. | ||
132 | */ | ||
133 | int merge_failed; | ||
134 | |||
135 | /* | ||
136 | * Incoming bios that overlap with chunks being merged must wait | ||
137 | * for them to be committed. | ||
138 | */ | ||
139 | struct bio_list bios_queued_during_merge; | ||
98 | }; | 140 | }; |
99 | 141 | ||
142 | /* | ||
143 | * state_bits: | ||
144 | * RUNNING_MERGE - Merge operation is in progress. | ||
145 | * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; | ||
146 | * cleared afterwards. | ||
147 | */ | ||
148 | #define RUNNING_MERGE 0 | ||
149 | #define SHUTDOWN_MERGE 1 | ||
150 | |||
151 | struct dm_dev *dm_snap_cow(struct dm_snapshot *s) | ||
152 | { | ||
153 | return s->cow; | ||
154 | } | ||
155 | EXPORT_SYMBOL(dm_snap_cow); | ||
156 | |||
100 | static struct workqueue_struct *ksnapd; | 157 | static struct workqueue_struct *ksnapd; |
101 | static void flush_queued_bios(struct work_struct *work); | 158 | static void flush_queued_bios(struct work_struct *work); |
102 | 159 | ||
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs) | |||
116 | } | 173 | } |
117 | 174 | ||
118 | struct dm_snap_pending_exception { | 175 | struct dm_snap_pending_exception { |
119 | struct dm_snap_exception e; | 176 | struct dm_exception e; |
120 | 177 | ||
121 | /* | 178 | /* |
122 | * Origin buffers waiting for this to complete are held | 179 | * Origin buffers waiting for this to complete are held |
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception { | |||
125 | struct bio_list origin_bios; | 182 | struct bio_list origin_bios; |
126 | struct bio_list snapshot_bios; | 183 | struct bio_list snapshot_bios; |
127 | 184 | ||
128 | /* | ||
129 | * Short-term queue of pending exceptions prior to submission. | ||
130 | */ | ||
131 | struct list_head list; | ||
132 | |||
133 | /* | ||
134 | * The primary pending_exception is the one that holds | ||
135 | * the ref_count and the list of origin_bios for a | ||
136 | * group of pending_exceptions. It is always last to get freed. | ||
137 | * These fields get set up when writing to the origin. | ||
138 | */ | ||
139 | struct dm_snap_pending_exception *primary_pe; | ||
140 | |||
141 | /* | ||
142 | * Number of pending_exceptions processing this chunk. | ||
143 | * When this drops to zero we must complete the origin bios. | ||
144 | * If incrementing or decrementing this, hold pe->snap->lock for | ||
145 | * the sibling concerned and not pe->primary_pe->snap->lock unless | ||
146 | * they are the same. | ||
147 | */ | ||
148 | atomic_t ref_count; | ||
149 | |||
150 | /* Pointer back to snapshot context */ | 185 | /* Pointer back to snapshot context */ |
151 | struct dm_snapshot *snap; | 186 | struct dm_snapshot *snap; |
152 | 187 | ||
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) | |||
222 | } | 257 | } |
223 | 258 | ||
224 | /* | 259 | /* |
260 | * This conflicting I/O is extremely improbable in the caller, | ||
261 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
262 | */ | ||
263 | static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) | ||
264 | { | ||
265 | while (__chunk_is_tracked(s, chunk)) | ||
266 | msleep(1); | ||
267 | } | ||
268 | |||
269 | /* | ||
225 | * One of these per registered origin, held in the snapshot_origins hash | 270 | * One of these per registered origin, held in the snapshot_origins hash |
226 | */ | 271 | */ |
227 | struct origin { | 272 | struct origin { |
@@ -243,6 +288,10 @@ struct origin { | |||
243 | static struct list_head *_origins; | 288 | static struct list_head *_origins; |
244 | static struct rw_semaphore _origins_lock; | 289 | static struct rw_semaphore _origins_lock; |
245 | 290 | ||
291 | static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); | ||
292 | static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); | ||
293 | static uint64_t _pending_exceptions_done_count; | ||
294 | |||
246 | static int init_origin_hash(void) | 295 | static int init_origin_hash(void) |
247 | { | 296 | { |
248 | int i; | 297 | int i; |
@@ -291,21 +340,144 @@ static void __insert_origin(struct origin *o) | |||
291 | } | 340 | } |
292 | 341 | ||
293 | /* | 342 | /* |
343 | * _origins_lock must be held when calling this function. | ||
344 | * Returns number of snapshots registered using the supplied cow device, plus: | ||
345 | * snap_src - a snapshot suitable for use as a source of exception handover | ||
346 | * snap_dest - a snapshot capable of receiving exception handover. | ||
347 | * snap_merge - an existing snapshot-merge target linked to the same origin. | ||
348 | * There can be at most one snapshot-merge target. The parameter is optional. | ||
349 | * | ||
350 | * Possible return values and states of snap_src and snap_dest. | ||
351 | * 0: NULL, NULL - first new snapshot | ||
352 | * 1: snap_src, NULL - normal snapshot | ||
353 | * 2: snap_src, snap_dest - waiting for handover | ||
354 | * 2: snap_src, NULL - handed over, waiting for old to be deleted | ||
355 | * 1: NULL, snap_dest - source got destroyed without handover | ||
356 | */ | ||
357 | static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, | ||
358 | struct dm_snapshot **snap_src, | ||
359 | struct dm_snapshot **snap_dest, | ||
360 | struct dm_snapshot **snap_merge) | ||
361 | { | ||
362 | struct dm_snapshot *s; | ||
363 | struct origin *o; | ||
364 | int count = 0; | ||
365 | int active; | ||
366 | |||
367 | o = __lookup_origin(snap->origin->bdev); | ||
368 | if (!o) | ||
369 | goto out; | ||
370 | |||
371 | list_for_each_entry(s, &o->snapshots, list) { | ||
372 | if (dm_target_is_snapshot_merge(s->ti) && snap_merge) | ||
373 | *snap_merge = s; | ||
374 | if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) | ||
375 | continue; | ||
376 | |||
377 | down_read(&s->lock); | ||
378 | active = s->active; | ||
379 | up_read(&s->lock); | ||
380 | |||
381 | if (active) { | ||
382 | if (snap_src) | ||
383 | *snap_src = s; | ||
384 | } else if (snap_dest) | ||
385 | *snap_dest = s; | ||
386 | |||
387 | count++; | ||
388 | } | ||
389 | |||
390 | out: | ||
391 | return count; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * On success, returns 1 if this snapshot is a handover destination, | ||
396 | * otherwise returns 0. | ||
397 | */ | ||
398 | static int __validate_exception_handover(struct dm_snapshot *snap) | ||
399 | { | ||
400 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
401 | struct dm_snapshot *snap_merge = NULL; | ||
402 | |||
403 | /* Does snapshot need exceptions handed over to it? */ | ||
404 | if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, | ||
405 | &snap_merge) == 2) || | ||
406 | snap_dest) { | ||
407 | snap->ti->error = "Snapshot cow pairing for exception " | ||
408 | "table handover failed"; | ||
409 | return -EINVAL; | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * If no snap_src was found, snap cannot become a handover | ||
414 | * destination. | ||
415 | */ | ||
416 | if (!snap_src) | ||
417 | return 0; | ||
418 | |||
419 | /* | ||
420 | * Non-snapshot-merge handover? | ||
421 | */ | ||
422 | if (!dm_target_is_snapshot_merge(snap->ti)) | ||
423 | return 1; | ||
424 | |||
425 | /* | ||
426 | * Do not allow more than one merging snapshot. | ||
427 | */ | ||
428 | if (snap_merge) { | ||
429 | snap->ti->error = "A snapshot is already merging."; | ||
430 | return -EINVAL; | ||
431 | } | ||
432 | |||
433 | if (!snap_src->store->type->prepare_merge || | ||
434 | !snap_src->store->type->commit_merge) { | ||
435 | snap->ti->error = "Snapshot exception store does not " | ||
436 | "support snapshot-merge."; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | return 1; | ||
441 | } | ||
442 | |||
443 | static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) | ||
444 | { | ||
445 | struct dm_snapshot *l; | ||
446 | |||
447 | /* Sort the list according to chunk size, largest-first smallest-last */ | ||
448 | list_for_each_entry(l, &o->snapshots, list) | ||
449 | if (l->store->chunk_size < s->store->chunk_size) | ||
450 | break; | ||
451 | list_add_tail(&s->list, &l->list); | ||
452 | } | ||
453 | |||
454 | /* | ||
294 | * Make a note of the snapshot and its origin so we can look it | 455 | * Make a note of the snapshot and its origin so we can look it |
295 | * up when the origin has a write on it. | 456 | * up when the origin has a write on it. |
457 | * | ||
458 | * Also validate snapshot exception store handovers. | ||
459 | * On success, returns 1 if this registration is a handover destination, | ||
460 | * otherwise returns 0. | ||
296 | */ | 461 | */ |
297 | static int register_snapshot(struct dm_snapshot *snap) | 462 | static int register_snapshot(struct dm_snapshot *snap) |
298 | { | 463 | { |
299 | struct origin *o, *new_o; | 464 | struct origin *o, *new_o = NULL; |
300 | struct block_device *bdev = snap->origin->bdev; | 465 | struct block_device *bdev = snap->origin->bdev; |
466 | int r = 0; | ||
301 | 467 | ||
302 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); | 468 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); |
303 | if (!new_o) | 469 | if (!new_o) |
304 | return -ENOMEM; | 470 | return -ENOMEM; |
305 | 471 | ||
306 | down_write(&_origins_lock); | 472 | down_write(&_origins_lock); |
307 | o = __lookup_origin(bdev); | ||
308 | 473 | ||
474 | r = __validate_exception_handover(snap); | ||
475 | if (r < 0) { | ||
476 | kfree(new_o); | ||
477 | goto out; | ||
478 | } | ||
479 | |||
480 | o = __lookup_origin(bdev); | ||
309 | if (o) | 481 | if (o) |
310 | kfree(new_o); | 482 | kfree(new_o); |
311 | else { | 483 | else { |
@@ -319,10 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap) | |||
319 | __insert_origin(o); | 491 | __insert_origin(o); |
320 | } | 492 | } |
321 | 493 | ||
322 | list_add_tail(&snap->list, &o->snapshots); | 494 | __insert_snapshot(o, snap); |
495 | |||
496 | out: | ||
497 | up_write(&_origins_lock); | ||
498 | |||
499 | return r; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Move snapshot to correct place in list according to chunk size. | ||
504 | */ | ||
505 | static void reregister_snapshot(struct dm_snapshot *s) | ||
506 | { | ||
507 | struct block_device *bdev = s->origin->bdev; | ||
508 | |||
509 | down_write(&_origins_lock); | ||
510 | |||
511 | list_del(&s->list); | ||
512 | __insert_snapshot(__lookup_origin(bdev), s); | ||
323 | 513 | ||
324 | up_write(&_origins_lock); | 514 | up_write(&_origins_lock); |
325 | return 0; | ||
326 | } | 515 | } |
327 | 516 | ||
328 | static void unregister_snapshot(struct dm_snapshot *s) | 517 | static void unregister_snapshot(struct dm_snapshot *s) |
@@ -333,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
333 | o = __lookup_origin(s->origin->bdev); | 522 | o = __lookup_origin(s->origin->bdev); |
334 | 523 | ||
335 | list_del(&s->list); | 524 | list_del(&s->list); |
336 | if (list_empty(&o->snapshots)) { | 525 | if (o && list_empty(&o->snapshots)) { |
337 | list_del(&o->hash_list); | 526 | list_del(&o->hash_list); |
338 | kfree(o); | 527 | kfree(o); |
339 | } | 528 | } |
@@ -346,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
346 | * The lowest hash_shift bits of the chunk number are ignored, allowing | 535 | * The lowest hash_shift bits of the chunk number are ignored, allowing |
347 | * some consecutive chunks to be grouped together. | 536 | * some consecutive chunks to be grouped together. |
348 | */ | 537 | */ |
349 | static int init_exception_table(struct exception_table *et, uint32_t size, | 538 | static int dm_exception_table_init(struct dm_exception_table *et, |
350 | unsigned hash_shift) | 539 | uint32_t size, unsigned hash_shift) |
351 | { | 540 | { |
352 | unsigned int i; | 541 | unsigned int i; |
353 | 542 | ||
@@ -363,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size, | |||
363 | return 0; | 552 | return 0; |
364 | } | 553 | } |
365 | 554 | ||
366 | static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) | 555 | static void dm_exception_table_exit(struct dm_exception_table *et, |
556 | struct kmem_cache *mem) | ||
367 | { | 557 | { |
368 | struct list_head *slot; | 558 | struct list_head *slot; |
369 | struct dm_snap_exception *ex, *next; | 559 | struct dm_exception *ex, *next; |
370 | int i, size; | 560 | int i, size; |
371 | 561 | ||
372 | size = et->hash_mask + 1; | 562 | size = et->hash_mask + 1; |
@@ -380,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * | |||
380 | vfree(et->table); | 570 | vfree(et->table); |
381 | } | 571 | } |
382 | 572 | ||
383 | static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | 573 | static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) |
384 | { | 574 | { |
385 | return (chunk >> et->hash_shift) & et->hash_mask; | 575 | return (chunk >> et->hash_shift) & et->hash_mask; |
386 | } | 576 | } |
387 | 577 | ||
388 | static void insert_exception(struct exception_table *eh, | 578 | static void dm_remove_exception(struct dm_exception *e) |
389 | struct dm_snap_exception *e) | ||
390 | { | ||
391 | struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; | ||
392 | list_add(&e->hash_list, l); | ||
393 | } | ||
394 | |||
395 | static void remove_exception(struct dm_snap_exception *e) | ||
396 | { | 579 | { |
397 | list_del(&e->hash_list); | 580 | list_del(&e->hash_list); |
398 | } | 581 | } |
@@ -401,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e) | |||
401 | * Return the exception data for a sector, or NULL if not | 584 | * Return the exception data for a sector, or NULL if not |
402 | * remapped. | 585 | * remapped. |
403 | */ | 586 | */ |
404 | static struct dm_snap_exception *lookup_exception(struct exception_table *et, | 587 | static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, |
405 | chunk_t chunk) | 588 | chunk_t chunk) |
406 | { | 589 | { |
407 | struct list_head *slot; | 590 | struct list_head *slot; |
408 | struct dm_snap_exception *e; | 591 | struct dm_exception *e; |
409 | 592 | ||
410 | slot = &et->table[exception_hash(et, chunk)]; | 593 | slot = &et->table[exception_hash(et, chunk)]; |
411 | list_for_each_entry (e, slot, hash_list) | 594 | list_for_each_entry (e, slot, hash_list) |
@@ -416,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, | |||
416 | return NULL; | 599 | return NULL; |
417 | } | 600 | } |
418 | 601 | ||
419 | static struct dm_snap_exception *alloc_exception(void) | 602 | static struct dm_exception *alloc_completed_exception(void) |
420 | { | 603 | { |
421 | struct dm_snap_exception *e; | 604 | struct dm_exception *e; |
422 | 605 | ||
423 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | 606 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); |
424 | if (!e) | 607 | if (!e) |
@@ -427,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void) | |||
427 | return e; | 610 | return e; |
428 | } | 611 | } |
429 | 612 | ||
430 | static void free_exception(struct dm_snap_exception *e) | 613 | static void free_completed_exception(struct dm_exception *e) |
431 | { | 614 | { |
432 | kmem_cache_free(exception_cache, e); | 615 | kmem_cache_free(exception_cache, e); |
433 | } | 616 | } |
@@ -452,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) | |||
452 | atomic_dec(&s->pending_exceptions_count); | 635 | atomic_dec(&s->pending_exceptions_count); |
453 | } | 636 | } |
454 | 637 | ||
455 | static void insert_completed_exception(struct dm_snapshot *s, | 638 | static void dm_insert_exception(struct dm_exception_table *eh, |
456 | struct dm_snap_exception *new_e) | 639 | struct dm_exception *new_e) |
457 | { | 640 | { |
458 | struct exception_table *eh = &s->complete; | ||
459 | struct list_head *l; | 641 | struct list_head *l; |
460 | struct dm_snap_exception *e = NULL; | 642 | struct dm_exception *e = NULL; |
461 | 643 | ||
462 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; | 644 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; |
463 | 645 | ||
@@ -473,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
473 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + | 655 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + |
474 | dm_consecutive_chunk_count(e) + 1)) { | 656 | dm_consecutive_chunk_count(e) + 1)) { |
475 | dm_consecutive_chunk_count_inc(e); | 657 | dm_consecutive_chunk_count_inc(e); |
476 | free_exception(new_e); | 658 | free_completed_exception(new_e); |
477 | return; | 659 | return; |
478 | } | 660 | } |
479 | 661 | ||
@@ -483,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
483 | dm_consecutive_chunk_count_inc(e); | 665 | dm_consecutive_chunk_count_inc(e); |
484 | e->old_chunk--; | 666 | e->old_chunk--; |
485 | e->new_chunk--; | 667 | e->new_chunk--; |
486 | free_exception(new_e); | 668 | free_completed_exception(new_e); |
487 | return; | 669 | return; |
488 | } | 670 | } |
489 | 671 | ||
@@ -502,9 +684,9 @@ out: | |||
502 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) | 684 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) |
503 | { | 685 | { |
504 | struct dm_snapshot *s = context; | 686 | struct dm_snapshot *s = context; |
505 | struct dm_snap_exception *e; | 687 | struct dm_exception *e; |
506 | 688 | ||
507 | e = alloc_exception(); | 689 | e = alloc_completed_exception(); |
508 | if (!e) | 690 | if (!e) |
509 | return -ENOMEM; | 691 | return -ENOMEM; |
510 | 692 | ||
@@ -513,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
513 | /* Consecutive_count is implicitly initialised to zero */ | 695 | /* Consecutive_count is implicitly initialised to zero */ |
514 | e->new_chunk = new; | 696 | e->new_chunk = new; |
515 | 697 | ||
516 | insert_completed_exception(s, e); | 698 | dm_insert_exception(&s->complete, e); |
517 | 699 | ||
518 | return 0; | 700 | return 0; |
519 | } | 701 | } |
520 | 702 | ||
703 | #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | ||
704 | |||
705 | /* | ||
706 | * Return a minimum chunk size of all snapshots that have the specified origin. | ||
707 | * Return zero if the origin has no snapshots. | ||
708 | */ | ||
709 | static sector_t __minimum_chunk_size(struct origin *o) | ||
710 | { | ||
711 | struct dm_snapshot *snap; | ||
712 | unsigned chunk_size = 0; | ||
713 | |||
714 | if (o) | ||
715 | list_for_each_entry(snap, &o->snapshots, list) | ||
716 | chunk_size = min_not_zero(chunk_size, | ||
717 | snap->store->chunk_size); | ||
718 | |||
719 | return chunk_size; | ||
720 | } | ||
721 | |||
521 | /* | 722 | /* |
522 | * Hard coded magic. | 723 | * Hard coded magic. |
523 | */ | 724 | */ |
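The local min_not_zero() macro treats zero as "unset" rather than as a candidate minimum, which is what lets __minimum_chunk_size() fold over snapshots whose chunk size may legitimately still be zero (a handover destination sets store->chunk_size to 0 until resume, as seen in snapshot_ctr() below). A standalone check of the macro's behaviour:

#include <assert.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

int main(void)
{
	unsigned chunk_size = 0;			/* "no snapshots seen yet" */

	chunk_size = min_not_zero(chunk_size, 16u);	/* first real value wins */
	assert(chunk_size == 16);
	chunk_size = min_not_zero(chunk_size, 0u);	/* zero is skipped, not taken */
	assert(chunk_size == 16);
	chunk_size = min_not_zero(chunk_size, 8u);	/* smaller non-zero value wins */
	assert(chunk_size == 8);
	return 0;
}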
@@ -541,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
541 | * Calculate based on the size of the original volume or | 742 | * Calculate based on the size of the original volume or |
542 | * the COW volume... | 743 | * the COW volume... |
543 | */ | 744 | */ |
544 | cow_dev_size = get_dev_size(s->store->cow->bdev); | 745 | cow_dev_size = get_dev_size(s->cow->bdev); |
545 | origin_dev_size = get_dev_size(s->origin->bdev); | 746 | origin_dev_size = get_dev_size(s->origin->bdev); |
546 | max_buckets = calc_max_buckets(); | 747 | max_buckets = calc_max_buckets(); |
547 | 748 | ||
548 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; | 749 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; |
549 | hash_size = min(hash_size, max_buckets); | 750 | hash_size = min(hash_size, max_buckets); |
550 | 751 | ||
752 | if (hash_size < 64) | ||
753 | hash_size = 64; | ||
551 | hash_size = rounddown_pow_of_two(hash_size); | 754 | hash_size = rounddown_pow_of_two(hash_size); |
552 | if (init_exception_table(&s->complete, hash_size, | 755 | if (dm_exception_table_init(&s->complete, hash_size, |
553 | DM_CHUNK_CONSECUTIVE_BITS)) | 756 | DM_CHUNK_CONSECUTIVE_BITS)) |
554 | return -ENOMEM; | 757 | return -ENOMEM; |
555 | 758 | ||
556 | /* | 759 | /* |
@@ -561,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
561 | if (hash_size < 64) | 764 | if (hash_size < 64) |
562 | hash_size = 64; | 765 | hash_size = 64; |
563 | 766 | ||
564 | if (init_exception_table(&s->pending, hash_size, 0)) { | 767 | if (dm_exception_table_init(&s->pending, hash_size, 0)) { |
565 | exit_exception_table(&s->complete, exception_cache); | 768 | dm_exception_table_exit(&s->complete, exception_cache); |
566 | return -ENOMEM; | 769 | return -ENOMEM; |
567 | } | 770 | } |
568 | 771 | ||
569 | return 0; | 772 | return 0; |
570 | } | 773 | } |
571 | 774 | ||
775 | static void merge_shutdown(struct dm_snapshot *s) | ||
776 | { | ||
777 | clear_bit_unlock(RUNNING_MERGE, &s->state_bits); | ||
778 | smp_mb__after_clear_bit(); | ||
779 | wake_up_bit(&s->state_bits, RUNNING_MERGE); | ||
780 | } | ||
781 | |||
782 | static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) | ||
783 | { | ||
784 | s->first_merging_chunk = 0; | ||
785 | s->num_merging_chunks = 0; | ||
786 | |||
787 | return bio_list_get(&s->bios_queued_during_merge); | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Remove one chunk from the index of completed exceptions. | ||
792 | */ | ||
793 | static int __remove_single_exception_chunk(struct dm_snapshot *s, | ||
794 | chunk_t old_chunk) | ||
795 | { | ||
796 | struct dm_exception *e; | ||
797 | |||
798 | e = dm_lookup_exception(&s->complete, old_chunk); | ||
799 | if (!e) { | ||
800 | DMERR("Corruption detected: exception for block %llu is " | ||
801 | "on disk but not in memory", | ||
802 | (unsigned long long)old_chunk); | ||
803 | return -EINVAL; | ||
804 | } | ||
805 | |||
806 | /* | ||
807 | * If this is the only chunk using this exception, remove exception. | ||
808 | */ | ||
809 | if (!dm_consecutive_chunk_count(e)) { | ||
810 | dm_remove_exception(e); | ||
811 | free_completed_exception(e); | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * The chunk may be either at the beginning or the end of a | ||
817 | * group of consecutive chunks - never in the middle. We are | ||
818 | * removing chunks in the opposite order to that in which they | ||
819 | * were added, so this should always be true. | ||
820 | * Decrement the consecutive chunk counter and adjust the | ||
821 | * starting point if necessary. | ||
822 | */ | ||
823 | if (old_chunk == e->old_chunk) { | ||
824 | e->old_chunk++; | ||
825 | e->new_chunk++; | ||
826 | } else if (old_chunk != e->old_chunk + | ||
827 | dm_consecutive_chunk_count(e)) { | ||
828 | DMERR("Attempt to merge block %llu from the " | ||
829 | "middle of a chunk range [%llu - %llu]", | ||
830 | (unsigned long long)old_chunk, | ||
831 | (unsigned long long)e->old_chunk, | ||
832 | (unsigned long long) | ||
833 | e->old_chunk + dm_consecutive_chunk_count(e)); | ||
834 | return -EINVAL; | ||
835 | } | ||
836 | |||
837 | dm_consecutive_chunk_count_dec(e); | ||
838 | |||
839 | return 0; | ||
840 | } | ||
841 | |||
842 | static void flush_bios(struct bio *bio); | ||
843 | |||
844 | static int remove_single_exception_chunk(struct dm_snapshot *s) | ||
845 | { | ||
846 | struct bio *b = NULL; | ||
847 | int r; | ||
848 | chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; | ||
849 | |||
850 | down_write(&s->lock); | ||
851 | |||
852 | /* | ||
853 | * Process chunks (and associated exceptions) in reverse order | ||
854 | * so that dm_consecutive_chunk_count_dec() accounting works. | ||
855 | */ | ||
856 | do { | ||
857 | r = __remove_single_exception_chunk(s, old_chunk); | ||
858 | if (r) | ||
859 | goto out; | ||
860 | } while (old_chunk-- > s->first_merging_chunk); | ||
861 | |||
862 | b = __release_queued_bios_after_merge(s); | ||
863 | |||
864 | out: | ||
865 | up_write(&s->lock); | ||
866 | if (b) | ||
867 | flush_bios(b); | ||
868 | |||
869 | return r; | ||
870 | } | ||
871 | |||
872 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
873 | sector_t sector, unsigned chunk_size); | ||
874 | |||
875 | static void merge_callback(int read_err, unsigned long write_err, | ||
876 | void *context); | ||
877 | |||
878 | static uint64_t read_pending_exceptions_done_count(void) | ||
879 | { | ||
880 | uint64_t pending_exceptions_done; | ||
881 | |||
882 | spin_lock(&_pending_exceptions_done_spinlock); | ||
883 | pending_exceptions_done = _pending_exceptions_done_count; | ||
884 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
885 | |||
886 | return pending_exceptions_done; | ||
887 | } | ||
888 | |||
889 | static void increment_pending_exceptions_done_count(void) | ||
890 | { | ||
891 | spin_lock(&_pending_exceptions_done_spinlock); | ||
892 | _pending_exceptions_done_count++; | ||
893 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
894 | |||
895 | wake_up_all(&_pending_exceptions_done); | ||
896 | } | ||
897 | |||
898 | static void snapshot_merge_next_chunks(struct dm_snapshot *s) | ||
899 | { | ||
900 | int i, linear_chunks; | ||
901 | chunk_t old_chunk, new_chunk; | ||
902 | struct dm_io_region src, dest; | ||
903 | sector_t io_size; | ||
904 | uint64_t previous_count; | ||
905 | |||
906 | BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); | ||
907 | if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) | ||
908 | goto shut; | ||
909 | |||
910 | /* | ||
911 | * valid flag never changes during merge, so no lock required. | ||
912 | */ | ||
913 | if (!s->valid) { | ||
914 | DMERR("Snapshot is invalid: can't merge"); | ||
915 | goto shut; | ||
916 | } | ||
917 | |||
918 | linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, | ||
919 | &new_chunk); | ||
920 | if (linear_chunks <= 0) { | ||
921 | if (linear_chunks < 0) { | ||
922 | DMERR("Read error in exception store: " | ||
923 | "shutting down merge"); | ||
924 | down_write(&s->lock); | ||
925 | s->merge_failed = 1; | ||
926 | up_write(&s->lock); | ||
927 | } | ||
928 | goto shut; | ||
929 | } | ||
930 | |||
931 | /* Adjust old_chunk and new_chunk to reflect start of linear region */ | ||
932 | old_chunk = old_chunk + 1 - linear_chunks; | ||
933 | new_chunk = new_chunk + 1 - linear_chunks; | ||
934 | |||
935 | /* | ||
936 | * Use one (potentially large) I/O to copy all 'linear_chunks' | ||
937 | * from the exception store to the origin | ||
938 | */ | ||
939 | io_size = linear_chunks * s->store->chunk_size; | ||
940 | |||
941 | dest.bdev = s->origin->bdev; | ||
942 | dest.sector = chunk_to_sector(s->store, old_chunk); | ||
943 | dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); | ||
944 | |||
945 | src.bdev = s->cow->bdev; | ||
946 | src.sector = chunk_to_sector(s->store, new_chunk); | ||
947 | src.count = dest.count; | ||
948 | |||
949 | /* | ||
950 | * Reallocate any exceptions needed in other snapshots then | ||
951 | * wait for the pending exceptions to complete. | ||
952 | * Each time any pending exception (globally on the system) | ||
953 | * completes we are woken and repeat the process to find out | ||
954 | * if we can proceed. While this may not seem a particularly | ||
955 | * efficient algorithm, it is not expected to have any | ||
956 | * significant impact on performance. | ||
957 | */ | ||
958 | previous_count = read_pending_exceptions_done_count(); | ||
959 | while (origin_write_extent(s, dest.sector, io_size)) { | ||
960 | wait_event(_pending_exceptions_done, | ||
961 | (read_pending_exceptions_done_count() != | ||
962 | previous_count)); | ||
963 | /* Retry after the wait, until all exceptions are done. */ | ||
964 | previous_count = read_pending_exceptions_done_count(); | ||
965 | } | ||
966 | |||
967 | down_write(&s->lock); | ||
968 | s->first_merging_chunk = old_chunk; | ||
969 | s->num_merging_chunks = linear_chunks; | ||
970 | up_write(&s->lock); | ||
971 | |||
972 | /* Wait until writes to all 'linear_chunks' drain */ | ||
973 | for (i = 0; i < linear_chunks; i++) | ||
974 | __check_for_conflicting_io(s, old_chunk + i); | ||
975 | |||
976 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); | ||
977 | return; | ||
978 | |||
979 | shut: | ||
980 | merge_shutdown(s); | ||
981 | } | ||
982 | |||
983 | static void error_bios(struct bio *bio); | ||
984 | |||
985 | static void merge_callback(int read_err, unsigned long write_err, void *context) | ||
986 | { | ||
987 | struct dm_snapshot *s = context; | ||
988 | struct bio *b = NULL; | ||
989 | |||
990 | if (read_err || write_err) { | ||
991 | if (read_err) | ||
992 | DMERR("Read error: shutting down merge."); | ||
993 | else | ||
994 | DMERR("Write error: shutting down merge."); | ||
995 | goto shut; | ||
996 | } | ||
997 | |||
998 | if (s->store->type->commit_merge(s->store, | ||
999 | s->num_merging_chunks) < 0) { | ||
1000 | DMERR("Write error in exception store: shutting down merge"); | ||
1001 | goto shut; | ||
1002 | } | ||
1003 | |||
1004 | if (remove_single_exception_chunk(s) < 0) | ||
1005 | goto shut; | ||
1006 | |||
1007 | snapshot_merge_next_chunks(s); | ||
1008 | |||
1009 | return; | ||
1010 | |||
1011 | shut: | ||
1012 | down_write(&s->lock); | ||
1013 | s->merge_failed = 1; | ||
1014 | b = __release_queued_bios_after_merge(s); | ||
1015 | up_write(&s->lock); | ||
1016 | error_bios(b); | ||
1017 | |||
1018 | merge_shutdown(s); | ||
1019 | } | ||
1020 | |||
1021 | static void start_merge(struct dm_snapshot *s) | ||
1022 | { | ||
1023 | if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) | ||
1024 | snapshot_merge_next_chunks(s); | ||
1025 | } | ||
1026 | |||
1027 | static int wait_schedule(void *ptr) | ||
1028 | { | ||
1029 | schedule(); | ||
1030 | |||
1031 | return 0; | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Stop the merging process and wait until it finishes. | ||
1036 | */ | ||
1037 | static void stop_merge(struct dm_snapshot *s) | ||
1038 | { | ||
1039 | set_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1040 | wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, | ||
1041 | TASK_UNINTERRUPTIBLE); | ||
1042 | clear_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1043 | } | ||
1044 | |||
572 | /* | 1045 | /* |
573 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> | 1046 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> |
574 | */ | 1047 | */ |
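snapshot_merge_next_chunks() above must not copy a chunk back to the origin until every other snapshot of that origin holds its own copy, so it retries origin_write_extent() and, while exceptions are still outstanding, waits for the global _pending_exceptions_done_count to change before trying again. A single-threaded model of that loop shape, where try_reallocate() and wait_for_progress() are stand-ins rather than kernel calls:

#include <assert.h>
#include <stdint.h>

static uint64_t done_count;	/* bumped whenever any pending exception completes */
static int outstanding = 3;	/* pretend three exceptions still need copying */

/* Stand-in for origin_write_extent(): nonzero while work remains. */
static int try_reallocate(void)
{
	return outstanding;
}

/* Stand-in for wait_event(): here a completion simply happens immediately. */
static void wait_for_progress(uint64_t seen)
{
	assert(done_count == seen);	/* nothing has changed since we sampled */
	outstanding--;
	done_count++;			/* increment_pending_exceptions_done_count() */
}

int main(void)
{
	uint64_t previous = done_count;

	while (try_reallocate()) {
		wait_for_progress(previous);	/* block until the counter moves */
		previous = done_count;		/* re-sample, then retry */
	}
	assert(outstanding == 0);
	return 0;
}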
@@ -577,50 +1050,72 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
577 | struct dm_snapshot *s; | 1050 | struct dm_snapshot *s; |
578 | int i; | 1051 | int i; |
579 | int r = -EINVAL; | 1052 | int r = -EINVAL; |
580 | char *origin_path; | 1053 | char *origin_path, *cow_path; |
581 | struct dm_exception_store *store; | 1054 | unsigned args_used, num_flush_requests = 1; |
582 | unsigned args_used; | 1055 | fmode_t origin_mode = FMODE_READ; |
583 | 1056 | ||
584 | if (argc != 4) { | 1057 | if (argc != 4) { |
585 | ti->error = "requires exactly 4 arguments"; | 1058 | ti->error = "requires exactly 4 arguments"; |
586 | r = -EINVAL; | 1059 | r = -EINVAL; |
587 | goto bad_args; | 1060 | goto bad; |
1061 | } | ||
1062 | |||
1063 | if (dm_target_is_snapshot_merge(ti)) { | ||
1064 | num_flush_requests = 2; | ||
1065 | origin_mode = FMODE_WRITE; | ||
588 | } | 1066 | } |
589 | 1067 | ||
590 | origin_path = argv[0]; | 1068 | origin_path = argv[0]; |
591 | argv++; | 1069 | argv++; |
592 | argc--; | 1070 | argc--; |
593 | 1071 | ||
594 | r = dm_exception_store_create(ti, argc, argv, &args_used, &store); | 1072 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1073 | if (!s) { | ||
1074 | ti->error = "Cannot allocate snapshot context private " | ||
1075 | "structure"; | ||
1076 | r = -ENOMEM; | ||
1077 | goto bad; | ||
1078 | } | ||
1079 | |||
1080 | cow_path = argv[0]; | ||
1081 | argv++; | ||
1082 | argc--; | ||
1083 | |||
1084 | r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); | ||
1085 | if (r) { | ||
1086 | ti->error = "Cannot get COW device"; | ||
1087 | goto bad_cow; | ||
1088 | } | ||
1089 | |||
1090 | r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); | ||
595 | if (r) { | 1091 | if (r) { |
596 | ti->error = "Couldn't create exception store"; | 1092 | ti->error = "Couldn't create exception store"; |
597 | r = -EINVAL; | 1093 | r = -EINVAL; |
598 | goto bad_args; | 1094 | goto bad_store; |
599 | } | 1095 | } |
600 | 1096 | ||
601 | argv += args_used; | 1097 | argv += args_used; |
602 | argc -= args_used; | 1098 | argc -= args_used; |
603 | 1099 | ||
604 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1100 | r = dm_get_device(ti, origin_path, origin_mode, &s->origin); |
605 | if (!s) { | ||
606 | ti->error = "Cannot allocate snapshot context private " | ||
607 | "structure"; | ||
608 | r = -ENOMEM; | ||
609 | goto bad_snap; | ||
610 | } | ||
611 | |||
612 | r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); | ||
613 | if (r) { | 1101 | if (r) { |
614 | ti->error = "Cannot get origin device"; | 1102 | ti->error = "Cannot get origin device"; |
615 | goto bad_origin; | 1103 | goto bad_origin; |
616 | } | 1104 | } |
617 | 1105 | ||
618 | s->store = store; | 1106 | s->ti = ti; |
619 | s->valid = 1; | 1107 | s->valid = 1; |
620 | s->active = 0; | 1108 | s->active = 0; |
1109 | s->suspended = 0; | ||
621 | atomic_set(&s->pending_exceptions_count, 0); | 1110 | atomic_set(&s->pending_exceptions_count, 0); |
622 | init_rwsem(&s->lock); | 1111 | init_rwsem(&s->lock); |
1112 | INIT_LIST_HEAD(&s->list); | ||
623 | spin_lock_init(&s->pe_lock); | 1113 | spin_lock_init(&s->pe_lock); |
1114 | s->state_bits = 0; | ||
1115 | s->merge_failed = 0; | ||
1116 | s->first_merging_chunk = 0; | ||
1117 | s->num_merging_chunks = 0; | ||
1118 | bio_list_init(&s->bios_queued_during_merge); | ||
624 | 1119 | ||
625 | /* Allocate hash table for COW data */ | 1120 | /* Allocate hash table for COW data */ |
626 | if (init_hash_tables(s)) { | 1121 | if (init_hash_tables(s)) { |
@@ -654,34 +1149,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
654 | 1149 | ||
655 | spin_lock_init(&s->tracked_chunk_lock); | 1150 | spin_lock_init(&s->tracked_chunk_lock); |
656 | 1151 | ||
657 | /* Metadata must only be loaded into one table at once */ | 1152 | bio_list_init(&s->queued_bios); |
1153 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
1154 | |||
1155 | ti->private = s; | ||
1156 | ti->num_flush_requests = num_flush_requests; | ||
1157 | |||
1158 | /* Add snapshot to the list of snapshots for this origin */ | ||
1159 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
1160 | r = register_snapshot(s); | ||
1161 | if (r == -ENOMEM) { | ||
1162 | ti->error = "Snapshot origin struct allocation failed"; | ||
1163 | goto bad_load_and_register; | ||
1164 | } else if (r < 0) { | ||
1165 | /* invalid handover, register_snapshot has set ti->error */ | ||
1166 | goto bad_load_and_register; | ||
1167 | } | ||
1168 | |||
1169 | /* | ||
1170 | * Metadata must only be loaded into one table at once, so skip this | ||
1171 | * if metadata will be handed over during resume. | ||
1172 | * Chunk size will be set during the handover - set it to zero to | ||
1173 | * ensure it's ignored. | ||
1174 | */ | ||
1175 | if (r > 0) { | ||
1176 | s->store->chunk_size = 0; | ||
1177 | return 0; | ||
1178 | } | ||
1179 | |||
658 | r = s->store->type->read_metadata(s->store, dm_add_exception, | 1180 | r = s->store->type->read_metadata(s->store, dm_add_exception, |
659 | (void *)s); | 1181 | (void *)s); |
660 | if (r < 0) { | 1182 | if (r < 0) { |
661 | ti->error = "Failed to read snapshot metadata"; | 1183 | ti->error = "Failed to read snapshot metadata"; |
662 | goto bad_load_and_register; | 1184 | goto bad_read_metadata; |
663 | } else if (r > 0) { | 1185 | } else if (r > 0) { |
664 | s->valid = 0; | 1186 | s->valid = 0; |
665 | DMWARN("Snapshot is marked invalid."); | 1187 | DMWARN("Snapshot is marked invalid."); |
666 | } | 1188 | } |
667 | 1189 | ||
668 | bio_list_init(&s->queued_bios); | 1190 | if (!s->store->chunk_size) { |
669 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | 1191 | ti->error = "Chunk size not set"; |
670 | 1192 | goto bad_read_metadata; | |
671 | /* Add snapshot to the list of snapshots for this origin */ | ||
672 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
673 | if (register_snapshot(s)) { | ||
674 | r = -EINVAL; | ||
675 | ti->error = "Cannot register snapshot origin"; | ||
676 | goto bad_load_and_register; | ||
677 | } | 1193 | } |
678 | |||
679 | ti->private = s; | ||
680 | ti->split_io = s->store->chunk_size; | 1194 | ti->split_io = s->store->chunk_size; |
681 | ti->num_flush_requests = 1; | ||
682 | 1195 | ||
683 | return 0; | 1196 | return 0; |
684 | 1197 | ||
1198 | bad_read_metadata: | ||
1199 | unregister_snapshot(s); | ||
1200 | |||
685 | bad_load_and_register: | 1201 | bad_load_and_register: |
686 | mempool_destroy(s->tracked_chunk_pool); | 1202 | mempool_destroy(s->tracked_chunk_pool); |
687 | 1203 | ||
@@ -692,19 +1208,22 @@ bad_pending_pool: | |||
692 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1208 | dm_kcopyd_client_destroy(s->kcopyd_client); |
693 | 1209 | ||
694 | bad_kcopyd: | 1210 | bad_kcopyd: |
695 | exit_exception_table(&s->pending, pending_cache); | 1211 | dm_exception_table_exit(&s->pending, pending_cache); |
696 | exit_exception_table(&s->complete, exception_cache); | 1212 | dm_exception_table_exit(&s->complete, exception_cache); |
697 | 1213 | ||
698 | bad_hash_tables: | 1214 | bad_hash_tables: |
699 | dm_put_device(ti, s->origin); | 1215 | dm_put_device(ti, s->origin); |
700 | 1216 | ||
701 | bad_origin: | 1217 | bad_origin: |
702 | kfree(s); | 1218 | dm_exception_store_destroy(s->store); |
703 | 1219 | ||
704 | bad_snap: | 1220 | bad_store: |
705 | dm_exception_store_destroy(store); | 1221 | dm_put_device(ti, s->cow); |
1222 | |||
1223 | bad_cow: | ||
1224 | kfree(s); | ||
706 | 1225 | ||
707 | bad_args: | 1226 | bad: |
708 | return r; | 1227 | return r; |
709 | } | 1228 | } |
710 | 1229 | ||
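The reworked constructor error path above follows the usual goto ladder: each label undoes exactly what was acquired before the failing step, in reverse order (bad_store puts the COW device, bad_cow frees the context, bad just returns the error). A compilable sketch of the pattern with hypothetical acquire/release helpers standing in for dm_get_device() and friends:

#include <stdlib.h>

struct ctx { void *cow, *store, *origin; };

static void *acquire(void) { return malloc(1); }	/* hypothetical resource */
static void release(void *r) { free(r); }

static struct ctx *ctx_create(void)
{
	struct ctx *c = malloc(sizeof(*c));

	if (!c)
		goto bad;

	c->cow = acquire();
	if (!c->cow)
		goto bad_cow;

	c->store = acquire();
	if (!c->store)
		goto bad_store;

	c->origin = acquire();
	if (!c->origin)
		goto bad_origin;

	return c;

	/* unwind strictly in reverse order of acquisition */
bad_origin:
	release(c->store);
bad_store:
	release(c->cow);
bad_cow:
	free(c);
bad:
	return NULL;
}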
@@ -713,8 +1232,39 @@ static void __free_exceptions(struct dm_snapshot *s) | |||
713 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1232 | dm_kcopyd_client_destroy(s->kcopyd_client); |
714 | s->kcopyd_client = NULL; | 1233 | s->kcopyd_client = NULL; |
715 | 1234 | ||
716 | exit_exception_table(&s->pending, pending_cache); | 1235 | dm_exception_table_exit(&s->pending, pending_cache); |
717 | exit_exception_table(&s->complete, exception_cache); | 1236 | dm_exception_table_exit(&s->complete, exception_cache); |
1237 | } | ||
1238 | |||
1239 | static void __handover_exceptions(struct dm_snapshot *snap_src, | ||
1240 | struct dm_snapshot *snap_dest) | ||
1241 | { | ||
1242 | union { | ||
1243 | struct dm_exception_table table_swap; | ||
1244 | struct dm_exception_store *store_swap; | ||
1245 | } u; | ||
1246 | |||
1247 | /* | ||
1248 | * Swap all snapshot context information between the two instances. | ||
1249 | */ | ||
1250 | u.table_swap = snap_dest->complete; | ||
1251 | snap_dest->complete = snap_src->complete; | ||
1252 | snap_src->complete = u.table_swap; | ||
1253 | |||
1254 | u.store_swap = snap_dest->store; | ||
1255 | snap_dest->store = snap_src->store; | ||
1256 | snap_src->store = u.store_swap; | ||
1257 | |||
1258 | snap_dest->store->snap = snap_dest; | ||
1259 | snap_src->store->snap = snap_src; | ||
1260 | |||
1261 | snap_dest->ti->split_io = snap_dest->store->chunk_size; | ||
1262 | snap_dest->valid = snap_src->valid; | ||
1263 | |||
1264 | /* | ||
1265 | * Set source invalid to ensure it receives no further I/O. | ||
1266 | */ | ||
1267 | snap_src->valid = 0; | ||
718 | } | 1268 | } |
719 | 1269 | ||
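__handover_exceptions() above swaps the completed exception table and the exception store between the two snapshot instances, re-points each store's snap back-pointer at its new owner, copies the validity flag across, and finally invalidates the source so it receives no further I/O. The same swap in isolation, with the structures pared down to just the fields involved (split_io propagation is omitted):

struct table { void *slots; };		/* pared-down exception table */
struct snap;
struct exception_store { struct snap *snap; };
struct snap {
	struct table complete;
	struct exception_store *store;
	int valid;
};

static void handover(struct snap *src, struct snap *dest)
{
	struct table tmp_table = dest->complete;
	struct exception_store *tmp_store = dest->store;

	dest->complete = src->complete;
	src->complete = tmp_table;

	dest->store = src->store;
	src->store = tmp_store;

	/* each store must now point back at its new owner */
	dest->store->snap = dest;
	src->store->snap = src;

	dest->valid = src->valid;
	src->valid = 0;		/* source takes no further I/O */
}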
720 | static void snapshot_dtr(struct dm_target *ti) | 1270 | static void snapshot_dtr(struct dm_target *ti) |
@@ -723,9 +1273,24 @@ static void snapshot_dtr(struct dm_target *ti) | |||
723 | int i; | 1273 | int i; |
724 | #endif | 1274 | #endif |
725 | struct dm_snapshot *s = ti->private; | 1275 | struct dm_snapshot *s = ti->private; |
1276 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
726 | 1277 | ||
727 | flush_workqueue(ksnapd); | 1278 | flush_workqueue(ksnapd); |
728 | 1279 | ||
1280 | down_read(&_origins_lock); | ||
1281 | /* Check whether exception handover must be cancelled */ | ||
1282 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1283 | if (snap_src && snap_dest && (s == snap_src)) { | ||
1284 | down_write(&snap_dest->lock); | ||
1285 | snap_dest->valid = 0; | ||
1286 | up_write(&snap_dest->lock); | ||
1287 | DMERR("Cancelling snapshot handover."); | ||
1288 | } | ||
1289 | up_read(&_origins_lock); | ||
1290 | |||
1291 | if (dm_target_is_snapshot_merge(ti)) | ||
1292 | stop_merge(s); | ||
1293 | |||
729 | /* Prevent further origin writes from using this snapshot. */ | 1294 | /* Prevent further origin writes from using this snapshot. */ |
730 | /* After this returns there can be no new kcopyd jobs. */ | 1295 | /* After this returns there can be no new kcopyd jobs. */ |
731 | unregister_snapshot(s); | 1296 | unregister_snapshot(s); |
@@ -753,6 +1318,8 @@ static void snapshot_dtr(struct dm_target *ti) | |||
753 | 1318 | ||
754 | dm_exception_store_destroy(s->store); | 1319 | dm_exception_store_destroy(s->store); |
755 | 1320 | ||
1321 | dm_put_device(ti, s->cow); | ||
1322 | |||
756 | kfree(s); | 1323 | kfree(s); |
757 | } | 1324 | } |
758 | 1325 | ||
@@ -785,6 +1352,26 @@ static void flush_queued_bios(struct work_struct *work) | |||
785 | flush_bios(queued_bios); | 1352 | flush_bios(queued_bios); |
786 | } | 1353 | } |
787 | 1354 | ||
1355 | static int do_origin(struct dm_dev *origin, struct bio *bio); | ||
1356 | |||
1357 | /* | ||
1358 | * Flush a list of buffers. | ||
1359 | */ | ||
1360 | static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) | ||
1361 | { | ||
1362 | struct bio *n; | ||
1363 | int r; | ||
1364 | |||
1365 | while (bio) { | ||
1366 | n = bio->bi_next; | ||
1367 | bio->bi_next = NULL; | ||
1368 | r = do_origin(s->origin, bio); | ||
1369 | if (r == DM_MAPIO_REMAPPED) | ||
1370 | generic_make_request(bio); | ||
1371 | bio = n; | ||
1372 | } | ||
1373 | } | ||
1374 | |||
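retry_origin_bios() above drains a singly linked list of bios that were queued while exceptions completed: each bio's bi_next is saved and cleared before the bio is passed to do_origin(), because submission may requeue or free it. The detach-before-dispatch walk in isolation (node and handler are stand-ins for struct bio and do_origin()/generic_make_request()):

struct node { struct node *next; };

static void dispatch(struct node *n)
{
	(void)n;	/* stand-in for resubmitting the bio */
}

static void drain(struct node *head)
{
	struct node *next;

	while (head) {
		next = head->next;	/* save before dispatch may reuse the node */
		head->next = NULL;
		dispatch(head);
		head = next;
	}
}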
788 | /* | 1375 | /* |
789 | * Error a list of buffers. | 1376 | * Error a list of buffers. |
790 | */ | 1377 | */ |
@@ -815,45 +1402,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) | |||
815 | 1402 | ||
816 | s->valid = 0; | 1403 | s->valid = 0; |
817 | 1404 | ||
818 | dm_table_event(s->store->ti->table); | 1405 | dm_table_event(s->ti->table); |
819 | } | ||
820 | |||
821 | static void get_pending_exception(struct dm_snap_pending_exception *pe) | ||
822 | { | ||
823 | atomic_inc(&pe->ref_count); | ||
824 | } | ||
825 | |||
826 | static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) | ||
827 | { | ||
828 | struct dm_snap_pending_exception *primary_pe; | ||
829 | struct bio *origin_bios = NULL; | ||
830 | |||
831 | primary_pe = pe->primary_pe; | ||
832 | |||
833 | /* | ||
834 | * If this pe is involved in a write to the origin and | ||
835 | * it is the last sibling to complete then release | ||
836 | * the bios for the original write to the origin. | ||
837 | */ | ||
838 | if (primary_pe && | ||
839 | atomic_dec_and_test(&primary_pe->ref_count)) { | ||
840 | origin_bios = bio_list_get(&primary_pe->origin_bios); | ||
841 | free_pending_exception(primary_pe); | ||
842 | } | ||
843 | |||
844 | /* | ||
845 | * Free the pe if it's not linked to an origin write or if | ||
846 | * it's not itself a primary pe. | ||
847 | */ | ||
848 | if (!primary_pe || primary_pe != pe) | ||
849 | free_pending_exception(pe); | ||
850 | |||
851 | return origin_bios; | ||
852 | } | 1406 | } |
853 | 1407 | ||
854 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) | 1408 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) |
855 | { | 1409 | { |
856 | struct dm_snap_exception *e; | 1410 | struct dm_exception *e; |
857 | struct dm_snapshot *s = pe->snap; | 1411 | struct dm_snapshot *s = pe->snap; |
858 | struct bio *origin_bios = NULL; | 1412 | struct bio *origin_bios = NULL; |
859 | struct bio *snapshot_bios = NULL; | 1413 | struct bio *snapshot_bios = NULL; |
@@ -867,7 +1421,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
867 | goto out; | 1421 | goto out; |
868 | } | 1422 | } |
869 | 1423 | ||
870 | e = alloc_exception(); | 1424 | e = alloc_completed_exception(); |
871 | if (!e) { | 1425 | if (!e) { |
872 | down_write(&s->lock); | 1426 | down_write(&s->lock); |
873 | __invalidate_snapshot(s, -ENOMEM); | 1427 | __invalidate_snapshot(s, -ENOMEM); |
@@ -878,28 +1432,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
878 | 1432 | ||
879 | down_write(&s->lock); | 1433 | down_write(&s->lock); |
880 | if (!s->valid) { | 1434 | if (!s->valid) { |
881 | free_exception(e); | 1435 | free_completed_exception(e); |
882 | error = 1; | 1436 | error = 1; |
883 | goto out; | 1437 | goto out; |
884 | } | 1438 | } |
885 | 1439 | ||
886 | /* | 1440 | /* Check for conflicting reads */ |
887 | * Check for conflicting reads. This is extremely improbable, | 1441 | __check_for_conflicting_io(s, pe->e.old_chunk); |
888 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
889 | */ | ||
890 | while (__chunk_is_tracked(s, pe->e.old_chunk)) | ||
891 | msleep(1); | ||
892 | 1442 | ||
893 | /* | 1443 | /* |
894 | * Add a proper exception, and remove the | 1444 | * Add a proper exception, and remove the |
895 | * in-flight exception from the list. | 1445 | * in-flight exception from the list. |
896 | */ | 1446 | */ |
897 | insert_completed_exception(s, e); | 1447 | dm_insert_exception(&s->complete, e); |
898 | 1448 | ||
899 | out: | 1449 | out: |
900 | remove_exception(&pe->e); | 1450 | dm_remove_exception(&pe->e); |
901 | snapshot_bios = bio_list_get(&pe->snapshot_bios); | 1451 | snapshot_bios = bio_list_get(&pe->snapshot_bios); |
902 | origin_bios = put_pending_exception(pe); | 1452 | origin_bios = bio_list_get(&pe->origin_bios); |
1453 | free_pending_exception(pe); | ||
1454 | |||
1455 | increment_pending_exceptions_done_count(); | ||
903 | 1456 | ||
904 | up_write(&s->lock); | 1457 | up_write(&s->lock); |
905 | 1458 | ||
@@ -909,7 +1462,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
909 | else | 1462 | else |
910 | flush_bios(snapshot_bios); | 1463 | flush_bios(snapshot_bios); |
911 | 1464 | ||
912 | flush_bios(origin_bios); | 1465 | retry_origin_bios(s, origin_bios); |
913 | } | 1466 | } |
914 | 1467 | ||
915 | static void commit_callback(void *context, int success) | 1468 | static void commit_callback(void *context, int success) |
@@ -951,9 +1504,9 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
951 | 1504 | ||
952 | src.bdev = bdev; | 1505 | src.bdev = bdev; |
953 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); | 1506 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); |
954 | src.count = min(s->store->chunk_size, dev_size - src.sector); | 1507 | src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); |
955 | 1508 | ||
956 | dest.bdev = s->store->cow->bdev; | 1509 | dest.bdev = s->cow->bdev; |
957 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); | 1510 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); |
958 | dest.count = src.count; | 1511 | dest.count = src.count; |
959 | 1512 | ||
@@ -965,7 +1518,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
965 | static struct dm_snap_pending_exception * | 1518 | static struct dm_snap_pending_exception * |
966 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) | 1519 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) |
967 | { | 1520 | { |
968 | struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); | 1521 | struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); |
969 | 1522 | ||
970 | if (!e) | 1523 | if (!e) |
971 | return NULL; | 1524 | return NULL; |
@@ -996,8 +1549,6 @@ __find_pending_exception(struct dm_snapshot *s, | |||
996 | pe->e.old_chunk = chunk; | 1549 | pe->e.old_chunk = chunk; |
997 | bio_list_init(&pe->origin_bios); | 1550 | bio_list_init(&pe->origin_bios); |
998 | bio_list_init(&pe->snapshot_bios); | 1551 | bio_list_init(&pe->snapshot_bios); |
999 | pe->primary_pe = NULL; | ||
1000 | atomic_set(&pe->ref_count, 0); | ||
1001 | pe->started = 0; | 1552 | pe->started = 0; |
1002 | 1553 | ||
1003 | if (s->store->type->prepare_exception(s->store, &pe->e)) { | 1554 | if (s->store->type->prepare_exception(s->store, &pe->e)) { |
@@ -1005,16 +1556,15 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1005 | return NULL; | 1556 | return NULL; |
1006 | } | 1557 | } |
1007 | 1558 | ||
1008 | get_pending_exception(pe); | 1559 | dm_insert_exception(&s->pending, &pe->e); |
1009 | insert_exception(&s->pending, &pe->e); | ||
1010 | 1560 | ||
1011 | return pe; | 1561 | return pe; |
1012 | } | 1562 | } |
1013 | 1563 | ||
1014 | static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | 1564 | static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, |
1015 | struct bio *bio, chunk_t chunk) | 1565 | struct bio *bio, chunk_t chunk) |
1016 | { | 1566 | { |
1017 | bio->bi_bdev = s->store->cow->bdev; | 1567 | bio->bi_bdev = s->cow->bdev; |
1018 | bio->bi_sector = chunk_to_sector(s->store, | 1568 | bio->bi_sector = chunk_to_sector(s->store, |
1019 | dm_chunk_number(e->new_chunk) + | 1569 | dm_chunk_number(e->new_chunk) + |
1020 | (chunk - e->old_chunk)) + | 1570 | (chunk - e->old_chunk)) + |
@@ -1025,14 +1575,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | |||
1025 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | 1575 | static int snapshot_map(struct dm_target *ti, struct bio *bio, |
1026 | union map_info *map_context) | 1576 | union map_info *map_context) |
1027 | { | 1577 | { |
1028 | struct dm_snap_exception *e; | 1578 | struct dm_exception *e; |
1029 | struct dm_snapshot *s = ti->private; | 1579 | struct dm_snapshot *s = ti->private; |
1030 | int r = DM_MAPIO_REMAPPED; | 1580 | int r = DM_MAPIO_REMAPPED; |
1031 | chunk_t chunk; | 1581 | chunk_t chunk; |
1032 | struct dm_snap_pending_exception *pe = NULL; | 1582 | struct dm_snap_pending_exception *pe = NULL; |
1033 | 1583 | ||
1034 | if (unlikely(bio_empty_barrier(bio))) { | 1584 | if (unlikely(bio_empty_barrier(bio))) { |
1035 | bio->bi_bdev = s->store->cow->bdev; | 1585 | bio->bi_bdev = s->cow->bdev; |
1036 | return DM_MAPIO_REMAPPED; | 1586 | return DM_MAPIO_REMAPPED; |
1037 | } | 1587 | } |
1038 | 1588 | ||
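remap_exception(), updated above to point bios at s->cow, translates an origin sector into the COW device: the sector's chunk is mapped to dm_chunk_number(e->new_chunk) plus its distance from e->old_chunk (exceptions may describe runs of consecutive chunks), and the offset within the chunk is kept. A sketch of the arithmetic, assuming power-of-two chunk sizes and plain chunk numbers (the consecutive-chunk count packed into the top bits of new_chunk is ignored here):

#include <stdint.h>

typedef uint64_t sector_t;
typedef uint64_t chunk_t;

struct store {
	unsigned chunk_size;	/* sectors per chunk, power of two */
	unsigned chunk_mask;	/* chunk_size - 1                  */
	unsigned chunk_shift;	/* log2(chunk_size)                */
};

static chunk_t sector_to_chunk(const struct store *s, sector_t sector)
{
	return sector >> s->chunk_shift;
}

static sector_t chunk_to_sector(const struct store *s, chunk_t chunk)
{
	return chunk << s->chunk_shift;
}

/* remap an origin sector into the COW device, preserving the offset
 * within the chunk */
static sector_t remap_sector(const struct store *s, sector_t bio_sector,
			     chunk_t old_chunk, chunk_t new_chunk)
{
	chunk_t chunk = sector_to_chunk(s, bio_sector);

	return chunk_to_sector(s, new_chunk + (chunk - old_chunk)) +
	       (bio_sector & s->chunk_mask);
}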
@@ -1053,7 +1603,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1053 | } | 1603 | } |
1054 | 1604 | ||
1055 | /* If the block is already remapped - use that, else remap it */ | 1605 | /* If the block is already remapped - use that, else remap it */ |
1056 | e = lookup_exception(&s->complete, chunk); | 1606 | e = dm_lookup_exception(&s->complete, chunk); |
1057 | if (e) { | 1607 | if (e) { |
1058 | remap_exception(s, e, bio, chunk); | 1608 | remap_exception(s, e, bio, chunk); |
1059 | goto out_unlock; | 1609 | goto out_unlock; |
@@ -1077,7 +1627,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1077 | goto out_unlock; | 1627 | goto out_unlock; |
1078 | } | 1628 | } |
1079 | 1629 | ||
1080 | e = lookup_exception(&s->complete, chunk); | 1630 | e = dm_lookup_exception(&s->complete, chunk); |
1081 | if (e) { | 1631 | if (e) { |
1082 | free_pending_exception(pe); | 1632 | free_pending_exception(pe); |
1083 | remap_exception(s, e, bio, chunk); | 1633 | remap_exception(s, e, bio, chunk); |
@@ -1115,6 +1665,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1115 | return r; | 1665 | return r; |
1116 | } | 1666 | } |
1117 | 1667 | ||
1668 | /* | ||
1669 | * A snapshot-merge target behaves like a combination of a snapshot | ||
1670 | * target and a snapshot-origin target. It only generates new | ||
1671 | * exceptions in other snapshots and not in the one that is being | ||
1672 | * merged. | ||
1673 | * | ||
1674 | * For each chunk, if there is an existing exception, it is used to | ||
1675 | * redirect I/O to the cow device. Otherwise I/O is sent to the origin, | ||
1676 | * which in turn might generate exceptions in other snapshots. | ||
1677 | * If merging is currently taking place on the chunk in question, the | ||
1678 | * I/O is deferred by adding it to s->bios_queued_during_merge. | ||
1679 | */ | ||
1680 | static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, | ||
1681 | union map_info *map_context) | ||
1682 | { | ||
1683 | struct dm_exception *e; | ||
1684 | struct dm_snapshot *s = ti->private; | ||
1685 | int r = DM_MAPIO_REMAPPED; | ||
1686 | chunk_t chunk; | ||
1687 | |||
1688 | if (unlikely(bio_empty_barrier(bio))) { | ||
1689 | if (!map_context->flush_request) | ||
1690 | bio->bi_bdev = s->origin->bdev; | ||
1691 | else | ||
1692 | bio->bi_bdev = s->cow->bdev; | ||
1693 | map_context->ptr = NULL; | ||
1694 | return DM_MAPIO_REMAPPED; | ||
1695 | } | ||
1696 | |||
1697 | chunk = sector_to_chunk(s->store, bio->bi_sector); | ||
1698 | |||
1699 | down_write(&s->lock); | ||
1700 | |||
1701 | /* Full merging snapshots are redirected to the origin */ | ||
1702 | if (!s->valid) | ||
1703 | goto redirect_to_origin; | ||
1704 | |||
1705 | /* If the block is already remapped - use that */ | ||
1706 | e = dm_lookup_exception(&s->complete, chunk); | ||
1707 | if (e) { | ||
1708 | /* Queue writes overlapping with chunks being merged */ | ||
1709 | if (bio_rw(bio) == WRITE && | ||
1710 | chunk >= s->first_merging_chunk && | ||
1711 | chunk < (s->first_merging_chunk + | ||
1712 | s->num_merging_chunks)) { | ||
1713 | bio->bi_bdev = s->origin->bdev; | ||
1714 | bio_list_add(&s->bios_queued_during_merge, bio); | ||
1715 | r = DM_MAPIO_SUBMITTED; | ||
1716 | goto out_unlock; | ||
1717 | } | ||
1718 | |||
1719 | remap_exception(s, e, bio, chunk); | ||
1720 | |||
1721 | if (bio_rw(bio) == WRITE) | ||
1722 | map_context->ptr = track_chunk(s, chunk); | ||
1723 | goto out_unlock; | ||
1724 | } | ||
1725 | |||
1726 | redirect_to_origin: | ||
1727 | bio->bi_bdev = s->origin->bdev; | ||
1728 | |||
1729 | if (bio_rw(bio) == WRITE) { | ||
1730 | up_write(&s->lock); | ||
1731 | return do_origin(s->origin, bio); | ||
1732 | } | ||
1733 | |||
1734 | out_unlock: | ||
1735 | up_write(&s->lock); | ||
1736 | |||
1737 | return r; | ||
1738 | } | ||
1739 | |||
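snapshot_merge_map() above boils down to a three-way decision: a fully merged (invalidated) snapshot or a chunk with no exception sends the I/O to the origin; a chunk with an exception is normally remapped to the COW device; but a write that falls inside the window currently being merged is queued on bios_queued_during_merge until those chunks finish merging. The decision as a pure helper (the enum and parameter names are illustrative; field names follow the code above):

#include <stdint.h>

enum map_action { MAP_TO_ORIGIN, MAP_TO_COW, QUEUE_UNTIL_MERGED };

static enum map_action merge_map_decision(int snapshot_valid, int has_exception,
					  int is_write, uint64_t chunk,
					  uint64_t first_merging_chunk,
					  unsigned num_merging_chunks)
{
	if (!snapshot_valid || !has_exception)
		return MAP_TO_ORIGIN;

	if (is_write &&
	    chunk >= first_merging_chunk &&
	    chunk < first_merging_chunk + num_merging_chunks)
		return QUEUE_UNTIL_MERGED;

	return MAP_TO_COW;
}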
1118 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | 1740 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, |
1119 | int error, union map_info *map_context) | 1741 | int error, union map_info *map_context) |
1120 | { | 1742 | { |
@@ -1127,15 +1749,101 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | |||
1127 | return 0; | 1749 | return 0; |
1128 | } | 1750 | } |
1129 | 1751 | ||
1752 | static void snapshot_merge_presuspend(struct dm_target *ti) | ||
1753 | { | ||
1754 | struct dm_snapshot *s = ti->private; | ||
1755 | |||
1756 | stop_merge(s); | ||
1757 | } | ||
1758 | |||
1759 | static void snapshot_postsuspend(struct dm_target *ti) | ||
1760 | { | ||
1761 | struct dm_snapshot *s = ti->private; | ||
1762 | |||
1763 | down_write(&s->lock); | ||
1764 | s->suspended = 1; | ||
1765 | up_write(&s->lock); | ||
1766 | } | ||
1767 | |||
1768 | static int snapshot_preresume(struct dm_target *ti) | ||
1769 | { | ||
1770 | int r = 0; | ||
1771 | struct dm_snapshot *s = ti->private; | ||
1772 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1773 | |||
1774 | down_read(&_origins_lock); | ||
1775 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1776 | if (snap_src && snap_dest) { | ||
1777 | down_read(&snap_src->lock); | ||
1778 | if (s == snap_src) { | ||
1779 | DMERR("Unable to resume snapshot source until " | ||
1780 | "handover completes."); | ||
1781 | r = -EINVAL; | ||
1782 | } else if (!snap_src->suspended) { | ||
1783 | DMERR("Unable to perform snapshot handover until " | ||
1784 | "source is suspended."); | ||
1785 | r = -EINVAL; | ||
1786 | } | ||
1787 | up_read(&snap_src->lock); | ||
1788 | } | ||
1789 | up_read(&_origins_lock); | ||
1790 | |||
1791 | return r; | ||
1792 | } | ||
1793 | |||
1130 | static void snapshot_resume(struct dm_target *ti) | 1794 | static void snapshot_resume(struct dm_target *ti) |
1131 | { | 1795 | { |
1132 | struct dm_snapshot *s = ti->private; | 1796 | struct dm_snapshot *s = ti->private; |
1797 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1798 | |||
1799 | down_read(&_origins_lock); | ||
1800 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1801 | if (snap_src && snap_dest) { | ||
1802 | down_write(&snap_src->lock); | ||
1803 | down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); | ||
1804 | __handover_exceptions(snap_src, snap_dest); | ||
1805 | up_write(&snap_dest->lock); | ||
1806 | up_write(&snap_src->lock); | ||
1807 | } | ||
1808 | up_read(&_origins_lock); | ||
1809 | |||
1810 | /* Now we have correct chunk size, reregister */ | ||
1811 | reregister_snapshot(s); | ||
1133 | 1812 | ||
1134 | down_write(&s->lock); | 1813 | down_write(&s->lock); |
1135 | s->active = 1; | 1814 | s->active = 1; |
1815 | s->suspended = 0; | ||
1136 | up_write(&s->lock); | 1816 | up_write(&s->lock); |
1137 | } | 1817 | } |
1138 | 1818 | ||
1819 | static sector_t get_origin_minimum_chunksize(struct block_device *bdev) | ||
1820 | { | ||
1821 | sector_t min_chunksize; | ||
1822 | |||
1823 | down_read(&_origins_lock); | ||
1824 | min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); | ||
1825 | up_read(&_origins_lock); | ||
1826 | |||
1827 | return min_chunksize; | ||
1828 | } | ||
1829 | |||
1830 | static void snapshot_merge_resume(struct dm_target *ti) | ||
1831 | { | ||
1832 | struct dm_snapshot *s = ti->private; | ||
1833 | |||
1834 | /* | ||
1835 | * Handover exceptions from existing snapshot. | ||
1836 | */ | ||
1837 | snapshot_resume(ti); | ||
1838 | |||
1839 | /* | ||
1840 | * snapshot-merge acts as an origin, so set ti->split_io | ||
1841 | */ | ||
1842 | ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); | ||
1843 | |||
1844 | start_merge(s); | ||
1845 | } | ||
1846 | |||
1139 | static int snapshot_status(struct dm_target *ti, status_type_t type, | 1847 | static int snapshot_status(struct dm_target *ti, status_type_t type, |
1140 | char *result, unsigned int maxlen) | 1848 | char *result, unsigned int maxlen) |
1141 | { | 1849 | { |
@@ -1144,21 +1852,32 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, | |||
1144 | 1852 | ||
1145 | switch (type) { | 1853 | switch (type) { |
1146 | case STATUSTYPE_INFO: | 1854 | case STATUSTYPE_INFO: |
1855 | |||
1856 | down_write(&snap->lock); | ||
1857 | |||
1147 | if (!snap->valid) | 1858 | if (!snap->valid) |
1148 | DMEMIT("Invalid"); | 1859 | DMEMIT("Invalid"); |
1860 | else if (snap->merge_failed) | ||
1861 | DMEMIT("Merge failed"); | ||
1149 | else { | 1862 | else { |
1150 | if (snap->store->type->fraction_full) { | 1863 | if (snap->store->type->usage) { |
1151 | sector_t numerator, denominator; | 1864 | sector_t total_sectors, sectors_allocated, |
1152 | snap->store->type->fraction_full(snap->store, | 1865 | metadata_sectors; |
1153 | &numerator, | 1866 | snap->store->type->usage(snap->store, |
1154 | &denominator); | 1867 | &total_sectors, |
1155 | DMEMIT("%llu/%llu", | 1868 | §ors_allocated, |
1156 | (unsigned long long)numerator, | 1869 | &metadata_sectors); |
1157 | (unsigned long long)denominator); | 1870 | DMEMIT("%llu/%llu %llu", |
1871 | (unsigned long long)sectors_allocated, | ||
1872 | (unsigned long long)total_sectors, | ||
1873 | (unsigned long long)metadata_sectors); | ||
1158 | } | 1874 | } |
1159 | else | 1875 | else |
1160 | DMEMIT("Unknown"); | 1876 | DMEMIT("Unknown"); |
1161 | } | 1877 | } |
1878 | |||
1879 | up_write(&snap->lock); | ||
1880 | |||
1162 | break; | 1881 | break; |
1163 | 1882 | ||
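With fraction_full() replaced by usage(), the STATUSTYPE_INFO line now carries three values in sectors, formatted as "<allocated>/<total> <metadata>". A throwaway program showing the layout with made-up numbers:

#include <stdio.h>

int main(void)
{
	/* illustrative values only */
	unsigned long long total_sectors = 1024000;
	unsigned long long sectors_allocated = 16;
	unsigned long long metadata_sectors = 16;

	printf("%llu/%llu %llu\n",
	       sectors_allocated, total_sectors, metadata_sectors);	/* 16/1024000 16 */
	return 0;
}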
1164 | case STATUSTYPE_TABLE: | 1883 | case STATUSTYPE_TABLE: |
@@ -1167,7 +1886,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, | |||
1167 | * to make private copies if the output is to | 1886 | * to make private copies if the output is to |
1168 | * make sense. | 1887 | * make sense. |
1169 | */ | 1888 | */ |
1170 | DMEMIT("%s", snap->origin->name); | 1889 | DMEMIT("%s %s", snap->origin->name, snap->cow->name); |
1171 | snap->store->type->status(snap->store, type, result + sz, | 1890 | snap->store->type->status(snap->store, type, result + sz, |
1172 | maxlen - sz); | 1891 | maxlen - sz); |
1173 | break; | 1892 | break; |
@@ -1188,17 +1907,36 @@ static int snapshot_iterate_devices(struct dm_target *ti, | |||
1188 | /*----------------------------------------------------------------- | 1907 | /*----------------------------------------------------------------- |
1189 | * Origin methods | 1908 | * Origin methods |
1190 | *---------------------------------------------------------------*/ | 1909 | *---------------------------------------------------------------*/ |
1191 | static int __origin_write(struct list_head *snapshots, struct bio *bio) | 1910 | |
1911 | /* | ||
1912 | * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any | ||
1913 | * supplied bio was ignored. The caller may submit it immediately. | ||
1914 | * (No remapping actually occurs as the origin is always a direct linear | ||
1915 | * map.) | ||
1916 | * | ||
1917 | * If further exceptions are required, DM_MAPIO_SUBMITTED is returned | ||
1918 | * and any supplied bio is added to a list to be submitted once all | ||
1919 | * the necessary exceptions exist. | ||
1920 | */ | ||
1921 | static int __origin_write(struct list_head *snapshots, sector_t sector, | ||
1922 | struct bio *bio) | ||
1192 | { | 1923 | { |
1193 | int r = DM_MAPIO_REMAPPED, first = 0; | 1924 | int r = DM_MAPIO_REMAPPED; |
1194 | struct dm_snapshot *snap; | 1925 | struct dm_snapshot *snap; |
1195 | struct dm_snap_exception *e; | 1926 | struct dm_exception *e; |
1196 | struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; | 1927 | struct dm_snap_pending_exception *pe; |
1928 | struct dm_snap_pending_exception *pe_to_start_now = NULL; | ||
1929 | struct dm_snap_pending_exception *pe_to_start_last = NULL; | ||
1197 | chunk_t chunk; | 1930 | chunk_t chunk; |
1198 | LIST_HEAD(pe_queue); | ||
1199 | 1931 | ||
1200 | /* Do all the snapshots on this origin */ | 1932 | /* Do all the snapshots on this origin */ |
1201 | list_for_each_entry (snap, snapshots, list) { | 1933 | list_for_each_entry (snap, snapshots, list) { |
1934 | /* | ||
1935 | * Don't make new exceptions in a merging snapshot | ||
1936 | * because it has effectively been deleted | ||
1937 | */ | ||
1938 | if (dm_target_is_snapshot_merge(snap->ti)) | ||
1939 | continue; | ||
1202 | 1940 | ||
1203 | down_write(&snap->lock); | 1941 | down_write(&snap->lock); |
1204 | 1942 | ||
@@ -1207,24 +1945,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1207 | goto next_snapshot; | 1945 | goto next_snapshot; |
1208 | 1946 | ||
1209 | /* Nothing to do if writing beyond end of snapshot */ | 1947 | /* Nothing to do if writing beyond end of snapshot */ |
1210 | if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) | 1948 | if (sector >= dm_table_get_size(snap->ti->table)) |
1211 | goto next_snapshot; | 1949 | goto next_snapshot; |
1212 | 1950 | ||
1213 | /* | 1951 | /* |
1214 | * Remember, different snapshots can have | 1952 | * Remember, different snapshots can have |
1215 | * different chunk sizes. | 1953 | * different chunk sizes. |
1216 | */ | 1954 | */ |
1217 | chunk = sector_to_chunk(snap->store, bio->bi_sector); | 1955 | chunk = sector_to_chunk(snap->store, sector); |
1218 | 1956 | ||
1219 | /* | 1957 | /* |
1220 | * Check exception table to see if block | 1958 | * Check exception table to see if block |
1221 | * is already remapped in this snapshot | 1959 | * is already remapped in this snapshot |
1222 | * and trigger an exception if not. | 1960 | * and trigger an exception if not. |
1223 | * | ||
1224 | * ref_count is initialised to 1 so pending_complete() | ||
1225 | * won't destroy the primary_pe while we're inside this loop. | ||
1226 | */ | 1961 | */ |
1227 | e = lookup_exception(&snap->complete, chunk); | 1962 | e = dm_lookup_exception(&snap->complete, chunk); |
1228 | if (e) | 1963 | if (e) |
1229 | goto next_snapshot; | 1964 | goto next_snapshot; |
1230 | 1965 | ||
@@ -1239,7 +1974,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1239 | goto next_snapshot; | 1974 | goto next_snapshot; |
1240 | } | 1975 | } |
1241 | 1976 | ||
1242 | e = lookup_exception(&snap->complete, chunk); | 1977 | e = dm_lookup_exception(&snap->complete, chunk); |
1243 | if (e) { | 1978 | if (e) { |
1244 | free_pending_exception(pe); | 1979 | free_pending_exception(pe); |
1245 | goto next_snapshot; | 1980 | goto next_snapshot; |
@@ -1252,59 +1987,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1252 | } | 1987 | } |
1253 | } | 1988 | } |
1254 | 1989 | ||
1255 | if (!primary_pe) { | 1990 | r = DM_MAPIO_SUBMITTED; |
1256 | /* | ||
1257 | * Either every pe here has same | ||
1258 | * primary_pe or none has one yet. | ||
1259 | */ | ||
1260 | if (pe->primary_pe) | ||
1261 | primary_pe = pe->primary_pe; | ||
1262 | else { | ||
1263 | primary_pe = pe; | ||
1264 | first = 1; | ||
1265 | } | ||
1266 | |||
1267 | bio_list_add(&primary_pe->origin_bios, bio); | ||
1268 | 1991 | ||
1269 | r = DM_MAPIO_SUBMITTED; | 1992 | /* |
1270 | } | 1993 | * If an origin bio was supplied, queue it to wait for the |
1994 | * completion of this exception, and start this one last, | ||
1995 | * at the end of the function. | ||
1996 | */ | ||
1997 | if (bio) { | ||
1998 | bio_list_add(&pe->origin_bios, bio); | ||
1999 | bio = NULL; | ||
1271 | 2000 | ||
1272 | if (!pe->primary_pe) { | 2001 | if (!pe->started) { |
1273 | pe->primary_pe = primary_pe; | 2002 | pe->started = 1; |
1274 | get_pending_exception(primary_pe); | 2003 | pe_to_start_last = pe; |
2004 | } | ||
1275 | } | 2005 | } |
1276 | 2006 | ||
1277 | if (!pe->started) { | 2007 | if (!pe->started) { |
1278 | pe->started = 1; | 2008 | pe->started = 1; |
1279 | list_add_tail(&pe->list, &pe_queue); | 2009 | pe_to_start_now = pe; |
1280 | } | 2010 | } |
1281 | 2011 | ||
1282 | next_snapshot: | 2012 | next_snapshot: |
1283 | up_write(&snap->lock); | 2013 | up_write(&snap->lock); |
1284 | } | ||
1285 | |||
1286 | if (!primary_pe) | ||
1287 | return r; | ||
1288 | |||
1289 | /* | ||
1290 | * If this is the first time we're processing this chunk and | ||
1291 | * ref_count is now 1 it means all the pending exceptions | ||
1292 | * got completed while we were in the loop above, so it falls to | ||
1293 | * us here to remove the primary_pe and submit any origin_bios. | ||
1294 | */ | ||
1295 | 2014 | ||
1296 | if (first && atomic_dec_and_test(&primary_pe->ref_count)) { | 2015 | if (pe_to_start_now) { |
1297 | flush_bios(bio_list_get(&primary_pe->origin_bios)); | 2016 | start_copy(pe_to_start_now); |
1298 | free_pending_exception(primary_pe); | 2017 | pe_to_start_now = NULL; |
1299 | /* If we got here, pe_queue is necessarily empty. */ | 2018 | } |
1300 | return r; | ||
1301 | } | 2019 | } |
1302 | 2020 | ||
1303 | /* | 2021 | /* |
1304 | * Now that we have a complete pe list we can start the copying. | 2022 | * Submit the exception against which the bio is queued last, |
2023 | * to give the other exceptions a head start. | ||
1305 | */ | 2024 | */ |
1306 | list_for_each_entry_safe(pe, next_pe, &pe_queue, list) | 2025 | if (pe_to_start_last) |
1307 | start_copy(pe); | 2026 | start_copy(pe_to_start_last); |
1308 | 2027 | ||
1309 | return r; | 2028 | return r; |
1310 | } | 2029 | } |
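The rewritten __origin_write() above drops the primary_pe reference counting; instead it remembers at most two pending exceptions to kick off: any new exception that does not carry the caller's bio is started straight away, while the one the bio was queued against is started last so the other copies get a head start. A condensed sketch of that ordering over a flat array (the structures and copy stub are placeholders for the per-snapshot loop and start_copy()):

struct pe { int started; int carries_bio; };

static void start_copy_stub(struct pe *pe)
{
	(void)pe;	/* would hand the chunk to kcopyd */
}

static void start_pending(struct pe **pes, int n)
{
	struct pe *start_last = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct pe *pe = pes[i];

		if (pe->started)
			continue;
		pe->started = 1;

		if (pe->carries_bio)
			start_last = pe;	/* defer to the very end */
		else
			start_copy_stub(pe);	/* start immediately */
	}

	if (start_last)
		start_copy_stub(start_last);
}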
@@ -1320,13 +2039,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) | |||
1320 | down_read(&_origins_lock); | 2039 | down_read(&_origins_lock); |
1321 | o = __lookup_origin(origin->bdev); | 2040 | o = __lookup_origin(origin->bdev); |
1322 | if (o) | 2041 | if (o) |
1323 | r = __origin_write(&o->snapshots, bio); | 2042 | r = __origin_write(&o->snapshots, bio->bi_sector, bio); |
1324 | up_read(&_origins_lock); | 2043 | up_read(&_origins_lock); |
1325 | 2044 | ||
1326 | return r; | 2045 | return r; |
1327 | } | 2046 | } |
1328 | 2047 | ||
1329 | /* | 2048 | /* |
2049 | * Trigger exceptions in all non-merging snapshots. | ||
2050 | * | ||
2051 | * The chunk size of the merging snapshot may be larger than the chunk | ||
2052 | * size of some other snapshot so we may need to reallocate multiple | ||
2053 | * chunks in other snapshots. | ||
2054 | * | ||
2055 | * We scan all the overlapping exceptions in the other snapshots. | ||
2056 | * Returns 1 if anything was reallocated and must be waited for, | ||
2057 | * otherwise returns 0. | ||
2058 | * | ||
2059 | * size must be a multiple of merging_snap's chunk_size. | ||
2060 | */ | ||
2061 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
2062 | sector_t sector, unsigned size) | ||
2063 | { | ||
2064 | int must_wait = 0; | ||
2065 | sector_t n; | ||
2066 | struct origin *o; | ||
2067 | |||
2068 | /* | ||
2069 | * The origin's __minimum_chunk_size() got stored in split_io | ||
2070 | * by snapshot_merge_resume(). | ||
2071 | */ | ||
2072 | down_read(&_origins_lock); | ||
2073 | o = __lookup_origin(merging_snap->origin->bdev); | ||
2074 | for (n = 0; n < size; n += merging_snap->ti->split_io) | ||
2075 | if (__origin_write(&o->snapshots, sector + n, NULL) == | ||
2076 | DM_MAPIO_SUBMITTED) | ||
2077 | must_wait = 1; | ||
2078 | up_read(&_origins_lock); | ||
2079 | |||
2080 | return must_wait; | ||
2081 | } | ||
2082 | |||
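origin_write_extent() above walks the extent being merged in steps of the origin's minimum chunk size (stashed in ti->split_io by snapshot_merge_resume()) and calls __origin_write() with a NULL bio for each step; if any step reports DM_MAPIO_SUBMITTED, the merge must wait for those exceptions to complete before overwriting the origin. The loop shape in isolation, with a predicate standing in for __origin_write():

#include <stdint.h>

static int write_extent_must_wait(uint64_t sector, unsigned size, unsigned step,
				  int (*needs_exception)(uint64_t sector))
{
	int must_wait = 0;
	uint64_t n;

	for (n = 0; n < size; n += step)
		if (needs_exception(sector + n))
			must_wait = 1;	/* keep scanning so every step is triggered */

	return must_wait;
}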
2083 | /* | ||
1330 | * Origin: maps a linear range of a device, with hooks for snapshotting. | 2084 | * Origin: maps a linear range of a device, with hooks for snapshotting. |
1331 | */ | 2085 | */ |
1332 | 2086 | ||
@@ -1345,8 +2099,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1345 | return -EINVAL; | 2099 | return -EINVAL; |
1346 | } | 2100 | } |
1347 | 2101 | ||
1348 | r = dm_get_device(ti, argv[0], 0, ti->len, | 2102 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); |
1349 | dm_table_get_mode(ti->table), &dev); | ||
1350 | if (r) { | 2103 | if (r) { |
1351 | ti->error = "Cannot get target device"; | 2104 | ti->error = "Cannot get target device"; |
1352 | return r; | 2105 | return r; |
@@ -1377,8 +2130,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1377 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 2130 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
1378 | } | 2131 | } |
1379 | 2132 | ||
1380 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
1381 | |||
1382 | /* | 2133 | /* |
1383 | * Set the target "split_io" field to the minimum of all the snapshots' | 2134 | * Set the target "split_io" field to the minimum of all the snapshots' |
1384 | * chunk sizes. | 2135 | * chunk sizes. |
@@ -1386,19 +2137,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1386 | static void origin_resume(struct dm_target *ti) | 2137 | static void origin_resume(struct dm_target *ti) |
1387 | { | 2138 | { |
1388 | struct dm_dev *dev = ti->private; | 2139 | struct dm_dev *dev = ti->private; |
1389 | struct dm_snapshot *snap; | ||
1390 | struct origin *o; | ||
1391 | chunk_t chunk_size = 0; | ||
1392 | 2140 | ||
1393 | down_read(&_origins_lock); | 2141 | ti->split_io = get_origin_minimum_chunksize(dev->bdev); |
1394 | o = __lookup_origin(dev->bdev); | ||
1395 | if (o) | ||
1396 | list_for_each_entry (snap, &o->snapshots, list) | ||
1397 | chunk_size = min_not_zero(chunk_size, | ||
1398 | snap->store->chunk_size); | ||
1399 | up_read(&_origins_lock); | ||
1400 | |||
1401 | ti->split_io = chunk_size; | ||
1402 | } | 2142 | } |
1403 | 2143 | ||
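origin_resume() above no longer open-codes the scan with the min_not_zero() macro; the smallest chunk size across the snapshots on an origin now comes from get_origin_minimum_chunksize() via __minimum_chunk_size(). The underlying reduction is simply "smallest non-zero value, with zero meaning unset":

static unsigned long min_not_zero_ul(unsigned long a, unsigned long b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

/* minimum over a hypothetical array of per-snapshot chunk sizes;
 * returns 0 when no snapshot is registered */
static unsigned long minimum_chunk_size(const unsigned long *chunk_sizes, int n)
{
	unsigned long m = 0;
	int i;

	for (i = 0; i < n; i++)
		m = min_not_zero_ul(m, chunk_sizes[i]);

	return m;
}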
1404 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | 2144 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, |
@@ -1441,17 +2181,35 @@ static struct target_type origin_target = { | |||
1441 | 2181 | ||
1442 | static struct target_type snapshot_target = { | 2182 | static struct target_type snapshot_target = { |
1443 | .name = "snapshot", | 2183 | .name = "snapshot", |
1444 | .version = {1, 7, 0}, | 2184 | .version = {1, 9, 0}, |
1445 | .module = THIS_MODULE, | 2185 | .module = THIS_MODULE, |
1446 | .ctr = snapshot_ctr, | 2186 | .ctr = snapshot_ctr, |
1447 | .dtr = snapshot_dtr, | 2187 | .dtr = snapshot_dtr, |
1448 | .map = snapshot_map, | 2188 | .map = snapshot_map, |
1449 | .end_io = snapshot_end_io, | 2189 | .end_io = snapshot_end_io, |
2190 | .postsuspend = snapshot_postsuspend, | ||
2191 | .preresume = snapshot_preresume, | ||
1450 | .resume = snapshot_resume, | 2192 | .resume = snapshot_resume, |
1451 | .status = snapshot_status, | 2193 | .status = snapshot_status, |
1452 | .iterate_devices = snapshot_iterate_devices, | 2194 | .iterate_devices = snapshot_iterate_devices, |
1453 | }; | 2195 | }; |
1454 | 2196 | ||
2197 | static struct target_type merge_target = { | ||
2198 | .name = dm_snapshot_merge_target_name, | ||
2199 | .version = {1, 0, 0}, | ||
2200 | .module = THIS_MODULE, | ||
2201 | .ctr = snapshot_ctr, | ||
2202 | .dtr = snapshot_dtr, | ||
2203 | .map = snapshot_merge_map, | ||
2204 | .end_io = snapshot_end_io, | ||
2205 | .presuspend = snapshot_merge_presuspend, | ||
2206 | .postsuspend = snapshot_postsuspend, | ||
2207 | .preresume = snapshot_preresume, | ||
2208 | .resume = snapshot_merge_resume, | ||
2209 | .status = snapshot_status, | ||
2210 | .iterate_devices = snapshot_iterate_devices, | ||
2211 | }; | ||
2212 | |||
1455 | static int __init dm_snapshot_init(void) | 2213 | static int __init dm_snapshot_init(void) |
1456 | { | 2214 | { |
1457 | int r; | 2215 | int r; |
@@ -1463,42 +2221,48 @@ static int __init dm_snapshot_init(void) | |||
1463 | } | 2221 | } |
1464 | 2222 | ||
1465 | r = dm_register_target(&snapshot_target); | 2223 | r = dm_register_target(&snapshot_target); |
1466 | if (r) { | 2224 | if (r < 0) { |
1467 | DMERR("snapshot target register failed %d", r); | 2225 | DMERR("snapshot target register failed %d", r); |
1468 | return r; | 2226 | goto bad_register_snapshot_target; |
1469 | } | 2227 | } |
1470 | 2228 | ||
1471 | r = dm_register_target(&origin_target); | 2229 | r = dm_register_target(&origin_target); |
1472 | if (r < 0) { | 2230 | if (r < 0) { |
1473 | DMERR("Origin target register failed %d", r); | 2231 | DMERR("Origin target register failed %d", r); |
1474 | goto bad1; | 2232 | goto bad_register_origin_target; |
2233 | } | ||
2234 | |||
2235 | r = dm_register_target(&merge_target); | ||
2236 | if (r < 0) { | ||
2237 | DMERR("Merge target register failed %d", r); | ||
2238 | goto bad_register_merge_target; | ||
1475 | } | 2239 | } |
1476 | 2240 | ||
1477 | r = init_origin_hash(); | 2241 | r = init_origin_hash(); |
1478 | if (r) { | 2242 | if (r) { |
1479 | DMERR("init_origin_hash failed."); | 2243 | DMERR("init_origin_hash failed."); |
1480 | goto bad2; | 2244 | goto bad_origin_hash; |
1481 | } | 2245 | } |
1482 | 2246 | ||
1483 | exception_cache = KMEM_CACHE(dm_snap_exception, 0); | 2247 | exception_cache = KMEM_CACHE(dm_exception, 0); |
1484 | if (!exception_cache) { | 2248 | if (!exception_cache) { |
1485 | DMERR("Couldn't create exception cache."); | 2249 | DMERR("Couldn't create exception cache."); |
1486 | r = -ENOMEM; | 2250 | r = -ENOMEM; |
1487 | goto bad3; | 2251 | goto bad_exception_cache; |
1488 | } | 2252 | } |
1489 | 2253 | ||
1490 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); | 2254 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); |
1491 | if (!pending_cache) { | 2255 | if (!pending_cache) { |
1492 | DMERR("Couldn't create pending cache."); | 2256 | DMERR("Couldn't create pending cache."); |
1493 | r = -ENOMEM; | 2257 | r = -ENOMEM; |
1494 | goto bad4; | 2258 | goto bad_pending_cache; |
1495 | } | 2259 | } |
1496 | 2260 | ||
1497 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); | 2261 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); |
1498 | if (!tracked_chunk_cache) { | 2262 | if (!tracked_chunk_cache) { |
1499 | DMERR("Couldn't create cache to track chunks in use."); | 2263 | DMERR("Couldn't create cache to track chunks in use."); |
1500 | r = -ENOMEM; | 2264 | r = -ENOMEM; |
1501 | goto bad5; | 2265 | goto bad_tracked_chunk_cache; |
1502 | } | 2266 | } |
1503 | 2267 | ||
1504 | ksnapd = create_singlethread_workqueue("ksnapd"); | 2268 | ksnapd = create_singlethread_workqueue("ksnapd"); |
@@ -1512,16 +2276,21 @@ static int __init dm_snapshot_init(void) | |||
1512 | 2276 | ||
1513 | bad_pending_pool: | 2277 | bad_pending_pool: |
1514 | kmem_cache_destroy(tracked_chunk_cache); | 2278 | kmem_cache_destroy(tracked_chunk_cache); |
1515 | bad5: | 2279 | bad_tracked_chunk_cache: |
1516 | kmem_cache_destroy(pending_cache); | 2280 | kmem_cache_destroy(pending_cache); |
1517 | bad4: | 2281 | bad_pending_cache: |
1518 | kmem_cache_destroy(exception_cache); | 2282 | kmem_cache_destroy(exception_cache); |
1519 | bad3: | 2283 | bad_exception_cache: |
1520 | exit_origin_hash(); | 2284 | exit_origin_hash(); |
1521 | bad2: | 2285 | bad_origin_hash: |
2286 | dm_unregister_target(&merge_target); | ||
2287 | bad_register_merge_target: | ||
1522 | dm_unregister_target(&origin_target); | 2288 | dm_unregister_target(&origin_target); |
1523 | bad1: | 2289 | bad_register_origin_target: |
1524 | dm_unregister_target(&snapshot_target); | 2290 | dm_unregister_target(&snapshot_target); |
2291 | bad_register_snapshot_target: | ||
2292 | dm_exception_store_exit(); | ||
2293 | |||
1525 | return r; | 2294 | return r; |
1526 | } | 2295 | } |
1527 | 2296 | ||
@@ -1531,6 +2300,7 @@ static void __exit dm_snapshot_exit(void) | |||
1531 | 2300 | ||
1532 | dm_unregister_target(&snapshot_target); | 2301 | dm_unregister_target(&snapshot_target); |
1533 | dm_unregister_target(&origin_target); | 2302 | dm_unregister_target(&origin_target); |
2303 | dm_unregister_target(&merge_target); | ||
1534 | 2304 | ||
1535 | exit_origin_hash(); | 2305 | exit_origin_hash(); |
1536 | kmem_cache_destroy(pending_cache); | 2306 | kmem_cache_destroy(pending_cache); |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e0efc1adcaff..e610725db766 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -80,8 +80,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
80 | if (sscanf(argv[1], "%llu", &start) != 1) | 80 | if (sscanf(argv[1], "%llu", &start) != 1) |
81 | return -EINVAL; | 81 | return -EINVAL; |
82 | 82 | ||
83 | if (dm_get_device(ti, argv[0], start, sc->stripe_width, | 83 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
84 | dm_table_get_mode(ti->table), | ||
85 | &sc->stripe[stripe].dev)) | 84 | &sc->stripe[stripe].dev)) |
86 | return -ENXIO; | 85 | return -ENXIO; |
87 | 86 | ||
@@ -110,7 +109,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
110 | } | 109 | } |
111 | 110 | ||
112 | stripes = simple_strtoul(argv[0], &end, 10); | 111 | stripes = simple_strtoul(argv[0], &end, 10); |
113 | if (*end) { | 112 | if (!stripes || *end) { |
114 | ti->error = "Invalid stripe count"; | 113 | ti->error = "Invalid stripe count"; |
115 | return -EINVAL; | 114 | return -EINVAL; |
116 | } | 115 | } |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 4b045903a4e2..84d2b91e4efb 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
59 | 59 | ||
60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) | 60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) |
61 | { | 61 | { |
62 | sprintf(buf, "%d\n", dm_suspended(md)); | 62 | sprintf(buf, "%d\n", dm_suspended_md(md)); |
63 | 63 | ||
64 | return strlen(buf); | 64 | return strlen(buf); |
65 | } | 65 | } |
@@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = { | |||
75 | NULL, | 75 | NULL, |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static struct sysfs_ops dm_sysfs_ops = { | 78 | static const struct sysfs_ops dm_sysfs_ops = { |
79 | .show = dm_attr_show, | 79 | .show = dm_attr_show, |
80 | }; | 80 | }; |
81 | 81 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1a6cb3c7822e..9924ea23032d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
13 | #include <linux/namei.h> | 13 | #include <linux/namei.h> |
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/string.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
17 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
@@ -237,6 +238,9 @@ void dm_table_destroy(struct dm_table *t) | |||
237 | { | 238 | { |
238 | unsigned int i; | 239 | unsigned int i; |
239 | 240 | ||
241 | if (!t) | ||
242 | return; | ||
243 | |||
240 | while (atomic_read(&t->holders)) | 244 | while (atomic_read(&t->holders)) |
241 | msleep(1); | 245 | msleep(1); |
242 | smp_mb(); | 246 | smp_mb(); |
@@ -425,8 +429,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, | |||
425 | * it's already present. | 429 | * it's already present. |
426 | */ | 430 | */ |
427 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, | 431 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, |
428 | const char *path, sector_t start, sector_t len, | 432 | const char *path, fmode_t mode, struct dm_dev **result) |
429 | fmode_t mode, struct dm_dev **result) | ||
430 | { | 433 | { |
431 | int r; | 434 | int r; |
432 | dev_t uninitialized_var(dev); | 435 | dev_t uninitialized_var(dev); |
@@ -499,16 +502,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
499 | return 0; | 502 | return 0; |
500 | } | 503 | } |
501 | 504 | ||
502 | if (blk_stack_limits(limits, &q->limits, start << 9) < 0) | 505 | if (bdev_stack_limits(limits, bdev, start) < 0) |
503 | DMWARN("%s: target device %s is misaligned: " | 506 | DMWARN("%s: adding target device %s caused an alignment inconsistency: " |
504 | "physical_block_size=%u, logical_block_size=%u, " | 507 | "physical_block_size=%u, logical_block_size=%u, " |
505 | "alignment_offset=%u, start=%llu", | 508 | "alignment_offset=%u, start=%llu", |
506 | dm_device_name(ti->table->md), bdevname(bdev, b), | 509 | dm_device_name(ti->table->md), bdevname(bdev, b), |
507 | q->limits.physical_block_size, | 510 | q->limits.physical_block_size, |
508 | q->limits.logical_block_size, | 511 | q->limits.logical_block_size, |
509 | q->limits.alignment_offset, | 512 | q->limits.alignment_offset, |
510 | (unsigned long long) start << 9); | 513 | (unsigned long long) start << SECTOR_SHIFT); |
511 | |||
512 | 514 | ||
513 | /* | 515 | /* |
514 | * Check if merge fn is supported. | 516 | * Check if merge fn is supported. |
@@ -524,11 +526,10 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
524 | } | 526 | } |
525 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 527 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
526 | 528 | ||
527 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 529 | int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, |
528 | sector_t len, fmode_t mode, struct dm_dev **result) | 530 | struct dm_dev **result) |
529 | { | 531 | { |
530 | return __table_get_device(ti->table, ti, path, | 532 | return __table_get_device(ti->table, ti, path, mode, result); |
531 | start, len, mode, result); | ||
532 | } | 533 | } |
533 | 534 | ||
534 | 535 | ||
@@ -600,11 +601,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
600 | return -ENOMEM; | 601 | return -ENOMEM; |
601 | 602 | ||
602 | while (1) { | 603 | while (1) { |
603 | start = end; | ||
604 | |||
605 | /* Skip whitespace */ | 604 | /* Skip whitespace */ |
606 | while (*start && isspace(*start)) | 605 | start = skip_spaces(end); |
607 | start++; | ||
608 | 606 | ||
609 | if (!*start) | 607 | if (!*start) |
610 | break; /* success, we hit the end */ | 608 | break; /* success, we hit the end */ |
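dm_split_args() above now leans on the kernel's skip_spaces() helper (hence the linux/string.h include added earlier in this file) instead of the open-coded isspace() loop. A userspace rendition of the helper it relies on:

#include <ctype.h>

static char *skip_spaces_local(const char *str)
{
	while (isspace((unsigned char)*str))
		++str;
	return (char *)str;
}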
@@ -1025,9 +1023,9 @@ combine_limits: | |||
1025 | * for the table. | 1023 | * for the table. |
1026 | */ | 1024 | */ |
1027 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | 1025 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) |
1028 | DMWARN("%s: target device " | 1026 | DMWARN("%s: adding target device " |
1029 | "(start sect %llu len %llu) " | 1027 | "(start sect %llu len %llu) " |
1030 | "is misaligned", | 1028 | "caused an alignment inconsistency", |
1031 | dm_device_name(table->md), | 1029 | dm_device_name(table->md), |
1032 | (unsigned long long) ti->begin, | 1030 | (unsigned long long) ti->begin, |
1033 | (unsigned long long) ti->len); | 1031 | (unsigned long long) ti->len); |
@@ -1079,15 +1077,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1079 | struct queue_limits *limits) | 1077 | struct queue_limits *limits) |
1080 | { | 1078 | { |
1081 | /* | 1079 | /* |
1082 | * Each target device in the table has a data area that should normally | ||
1083 | * be aligned such that the DM device's alignment_offset is 0. | ||
1084 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
1085 | * sub-optimal or inconsistent settings. | ||
1086 | */ | ||
1087 | limits->alignment_offset = 0; | ||
1088 | limits->misaligned = 0; | ||
1089 | |||
1090 | /* | ||
1091 | * Copy table's limits to the DM device's request_queue | 1080 | * Copy table's limits to the DM device's request_queue |
1092 | */ | 1081 | */ |
1093 | q->limits = *limits; | 1082 | q->limits = *limits; |
@@ -1240,8 +1229,6 @@ void dm_table_unplug_all(struct dm_table *t) | |||
1240 | 1229 | ||
1241 | struct mapped_device *dm_table_get_md(struct dm_table *t) | 1230 | struct mapped_device *dm_table_get_md(struct dm_table *t) |
1242 | { | 1231 | { |
1243 | dm_get(t->md); | ||
1244 | |||
1245 | return t->md; | 1232 | return t->md; |
1246 | } | 1233 | } |
1247 | 1234 | ||
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 04feccf2a997..11dea11dc0b6 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/kmod.h> | 11 | #include <linux/kmod.h> |
12 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
13 | #include <linux/slab.h> | ||
14 | 13 | ||
15 | #define DM_MSG_PREFIX "target" | 14 | #define DM_MSG_PREFIX "target" |
16 | 15 | ||
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 6f65883aef12..6b1e3b61b25e 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c | |||
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) | |||
139 | list_del_init(&event->elist); | 139 | list_del_init(&event->elist); |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * Need to call dm_copy_name_and_uuid from here for now. | 142 | * When a device is being removed this copy fails and we |
143 | * Context of previous var adds and locking used for | 143 | * discard these unsent events. |
144 | * hash_cell not compatible. | ||
145 | */ | 144 | */ |
146 | if (dm_copy_name_and_uuid(event->md, event->name, | 145 | if (dm_copy_name_and_uuid(event->md, event->name, |
147 | event->uuid)) { | 146 | event->uuid)) { |
148 | DMERR("%s: dm_copy_name_and_uuid() failed", | 147 | DMINFO("%s: skipping sending uevent for lost device", |
149 | __func__); | 148 | __func__); |
150 | goto uevent_free; | 149 | goto uevent_free; |
151 | } | 150 | } |
152 | 151 | ||
@@ -188,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | |||
188 | 187 | ||
189 | if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { | 188 | if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { |
190 | DMERR("%s: Invalid event_type %d", __func__, event_type); | 189 | DMERR("%s: Invalid event_type %d", __func__, event_type); |
191 | goto out; | 190 | return; |
192 | } | 191 | } |
193 | 192 | ||
194 | event = dm_build_path_uevent(md, ti, | 193 | event = dm_build_path_uevent(md, ti, |
@@ -196,12 +195,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | |||
196 | _dm_uevent_type_names[event_type].name, | 195 | _dm_uevent_type_names[event_type].name, |
197 | path, nr_valid_paths); | 196 | path, nr_valid_paths); |
198 | if (IS_ERR(event)) | 197 | if (IS_ERR(event)) |
199 | goto out; | 198 | return; |
200 | 199 | ||
201 | dm_uevent_add(md, &event->elist); | 200 | dm_uevent_add(md, &event->elist); |
202 | |||
203 | out: | ||
204 | dm_put(md); | ||
205 | } | 201 | } |
206 | EXPORT_SYMBOL_GPL(dm_path_uevent); | 202 | EXPORT_SYMBOL_GPL(dm_path_uevent); |
207 | 203 | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 376f1ab48a24..d21e1284604f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -47,6 +47,7 @@ struct dm_io { | |||
47 | atomic_t io_count; | 47 | atomic_t io_count; |
48 | struct bio *bio; | 48 | struct bio *bio; |
49 | unsigned long start_time; | 49 | unsigned long start_time; |
50 | spinlock_t endio_lock; | ||
50 | }; | 51 | }; |
51 | 52 | ||
52 | /* | 53 | /* |
@@ -142,9 +143,19 @@ struct mapped_device { | |||
142 | int barrier_error; | 143 | int barrier_error; |
143 | 144 | ||
144 | /* | 145 | /* |
146 | * Protect barrier_error from concurrent endio processing | ||
147 | * in request-based dm. | ||
148 | */ | ||
149 | spinlock_t barrier_error_lock; | ||
150 | |||
151 | /* | ||
145 | * Processing queue (flush/barriers) | 152 | * Processing queue (flush/barriers) |
146 | */ | 153 | */ |
147 | struct workqueue_struct *wq; | 154 | struct workqueue_struct *wq; |
155 | struct work_struct barrier_work; | ||
156 | |||
157 | /* A pointer to the currently processing pre/post flush request */ | ||
158 | struct request *flush_request; | ||
148 | 159 | ||
149 | /* | 160 | /* |
150 | * The current mapping. | 161 | * The current mapping. |
@@ -177,9 +188,6 @@ struct mapped_device { | |||
177 | /* forced geometry settings */ | 188 | /* forced geometry settings */ |
178 | struct hd_geometry geometry; | 189 | struct hd_geometry geometry; |
179 | 190 | ||
180 | /* marker of flush suspend for request-based dm */ | ||
181 | struct request suspend_rq; | ||
182 | |||
183 | /* For saving the address of __make_request for request based dm */ | 191 | /* For saving the address of __make_request for request based dm */ |
184 | make_request_fn *saved_make_request_fn; | 192 | make_request_fn *saved_make_request_fn; |
185 | 193 | ||
@@ -274,6 +282,7 @@ static int (*_inits[])(void) __initdata = { | |||
274 | dm_target_init, | 282 | dm_target_init, |
275 | dm_linear_init, | 283 | dm_linear_init, |
276 | dm_stripe_init, | 284 | dm_stripe_init, |
285 | dm_io_init, | ||
277 | dm_kcopyd_init, | 286 | dm_kcopyd_init, |
278 | dm_interface_init, | 287 | dm_interface_init, |
279 | }; | 288 | }; |
@@ -283,6 +292,7 @@ static void (*_exits[])(void) = { | |||
283 | dm_target_exit, | 292 | dm_target_exit, |
284 | dm_linear_exit, | 293 | dm_linear_exit, |
285 | dm_stripe_exit, | 294 | dm_stripe_exit, |
295 | dm_io_exit, | ||
286 | dm_kcopyd_exit, | 296 | dm_kcopyd_exit, |
287 | dm_interface_exit, | 297 | dm_interface_exit, |
288 | }; | 298 | }; |
@@ -319,6 +329,11 @@ static void __exit dm_exit(void) | |||
319 | /* | 329 | /* |
320 | * Block device functions | 330 | * Block device functions |
321 | */ | 331 | */ |
332 | int dm_deleting_md(struct mapped_device *md) | ||
333 | { | ||
334 | return test_bit(DMF_DELETING, &md->flags); | ||
335 | } | ||
336 | |||
322 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) | 337 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) |
323 | { | 338 | { |
324 | struct mapped_device *md; | 339 | struct mapped_device *md; |
@@ -330,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
330 | goto out; | 345 | goto out; |
331 | 346 | ||
332 | if (test_bit(DMF_FREEING, &md->flags) || | 347 | if (test_bit(DMF_FREEING, &md->flags) || |
333 | test_bit(DMF_DELETING, &md->flags)) { | 348 | dm_deleting_md(md)) { |
334 | md = NULL; | 349 | md = NULL; |
335 | goto out; | 350 | goto out; |
336 | } | 351 | } |
@@ -387,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
387 | unsigned int cmd, unsigned long arg) | 402 | unsigned int cmd, unsigned long arg) |
388 | { | 403 | { |
389 | struct mapped_device *md = bdev->bd_disk->private_data; | 404 | struct mapped_device *md = bdev->bd_disk->private_data; |
390 | struct dm_table *map = dm_get_table(md); | 405 | struct dm_table *map = dm_get_live_table(md); |
391 | struct dm_target *tgt; | 406 | struct dm_target *tgt; |
392 | int r = -ENOTTY; | 407 | int r = -ENOTTY; |
393 | 408 | ||
@@ -400,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
400 | 415 | ||
401 | tgt = dm_table_get_target(map, 0); | 416 | tgt = dm_table_get_target(map, 0); |
402 | 417 | ||
403 | if (dm_suspended(md)) { | 418 | if (dm_suspended_md(md)) { |
404 | r = -EAGAIN; | 419 | r = -EAGAIN; |
405 | goto out; | 420 | goto out; |
406 | } | 421 | } |
@@ -429,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | |||
429 | mempool_free(tio, md->tio_pool); | 444 | mempool_free(tio, md->tio_pool); |
430 | } | 445 | } |
431 | 446 | ||
432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | 447 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, |
448 | gfp_t gfp_mask) | ||
433 | { | 449 | { |
434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); | 450 | return mempool_alloc(md->tio_pool, gfp_mask); |
435 | } | 451 | } |
436 | 452 | ||
437 | static void free_rq_tio(struct dm_rq_target_io *tio) | 453 | static void free_rq_tio(struct dm_rq_target_io *tio) |
@@ -449,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info) | |||
449 | mempool_free(info, info->tio->md->io_pool); | 465 | mempool_free(info, info->tio->md->io_pool); |
450 | } | 466 | } |
451 | 467 | ||
468 | static int md_in_flight(struct mapped_device *md) | ||
469 | { | ||
470 | return atomic_read(&md->pending[READ]) + | ||
471 | atomic_read(&md->pending[WRITE]); | ||
472 | } | ||
473 | |||
452 | static void start_io_acct(struct dm_io *io) | 474 | static void start_io_acct(struct dm_io *io) |
453 | { | 475 | { |
454 | struct mapped_device *md = io->md; | 476 | struct mapped_device *md = io->md; |
@@ -511,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
511 | * function to access the md->map field, and make sure they call | 533 | * function to access the md->map field, and make sure they call |
512 | * dm_table_put() when finished. | 534 | * dm_table_put() when finished. |
513 | */ | 535 | */ |
514 | struct dm_table *dm_get_table(struct mapped_device *md) | 536 | struct dm_table *dm_get_live_table(struct mapped_device *md) |
515 | { | 537 | { |
516 | struct dm_table *t; | 538 | struct dm_table *t; |
517 | unsigned long flags; | 539 | unsigned long flags; |
@@ -578,8 +600,12 @@ static void dec_pending(struct dm_io *io, int error) | |||
578 | struct mapped_device *md = io->md; | 600 | struct mapped_device *md = io->md; |
579 | 601 | ||
580 | /* Push-back supersedes any I/O errors */ | 602 | /* Push-back supersedes any I/O errors */ |
581 | if (error && !(io->error > 0 && __noflush_suspending(md))) | 603 | if (unlikely(error)) { |
582 | io->error = error; | 604 | spin_lock_irqsave(&io->endio_lock, flags); |
605 | if (!(io->error > 0 && __noflush_suspending(md))) | ||
606 | io->error = error; | ||
607 | spin_unlock_irqrestore(&io->endio_lock, flags); | ||
608 | } | ||
583 | 609 | ||
584 | if (atomic_dec_and_test(&io->io_count)) { | 610 | if (atomic_dec_and_test(&io->io_count)) { |
585 | if (io->error == DM_ENDIO_REQUEUE) { | 611 | if (io->error == DM_ENDIO_REQUEUE) { |
@@ -609,8 +635,10 @@ static void dec_pending(struct dm_io *io, int error) | |||
609 | if (!md->barrier_error && io_error != -EOPNOTSUPP) | 635 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
610 | md->barrier_error = io_error; | 636 | md->barrier_error = io_error; |
611 | end_io_acct(io); | 637 | end_io_acct(io); |
638 | free_io(md, io); | ||
612 | } else { | 639 | } else { |
613 | end_io_acct(io); | 640 | end_io_acct(io); |
641 | free_io(md, io); | ||
614 | 642 | ||
615 | if (io_error != DM_ENDIO_REQUEUE) { | 643 | if (io_error != DM_ENDIO_REQUEUE) { |
616 | trace_block_bio_complete(md->queue, bio); | 644 | trace_block_bio_complete(md->queue, bio); |
@@ -618,8 +646,6 @@ static void dec_pending(struct dm_io *io, int error) | |||
618 | bio_endio(bio, io_error); | 646 | bio_endio(bio, io_error); |
619 | } | 647 | } |
620 | } | 648 | } |
621 | |||
622 | free_io(md, io); | ||
623 | } | 649 | } |
624 | } | 650 | } |
625 | 651 | ||
@@ -711,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error) | |||
711 | blk_update_request(tio->orig, 0, nr_bytes); | 737 | blk_update_request(tio->orig, 0, nr_bytes); |
712 | } | 738 | } |
713 | 739 | ||
740 | static void store_barrier_error(struct mapped_device *md, int error) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | spin_lock_irqsave(&md->barrier_error_lock, flags); | ||
745 | /* | ||
746 | * Basically, the first error is taken, but: | ||
747 | * -EOPNOTSUPP supersedes any I/O error. | ||
748 | * Requeue request supersedes any I/O error but -EOPNOTSUPP. | ||
749 | */ | ||
750 | if (!md->barrier_error || error == -EOPNOTSUPP || | ||
751 | (md->barrier_error != -EOPNOTSUPP && | ||
752 | error == DM_ENDIO_REQUEUE)) | ||
753 | md->barrier_error = error; | ||
754 | spin_unlock_irqrestore(&md->barrier_error_lock, flags); | ||
755 | } | ||
756 | |||
714 | /* | 757 | /* |
715 | * Don't touch any member of the md after calling this function because | 758 | * Don't touch any member of the md after calling this function because |
716 | * the md may be freed in dm_put() at the end of this function. | 759 | * the md may be freed in dm_put() at the end of this function. |
717 | * Or do dm_get() before calling this function and dm_put() later. | 760 | * Or do dm_get() before calling this function and dm_put() later. |
718 | */ | 761 | */ |
719 | static void rq_completed(struct mapped_device *md, int run_queue) | 762 | static void rq_completed(struct mapped_device *md, int rw, int run_queue) |
720 | { | 763 | { |
721 | int wakeup_waiters = 0; | 764 | atomic_dec(&md->pending[rw]); |
722 | struct request_queue *q = md->queue; | ||
723 | unsigned long flags; | ||
724 | |||
725 | spin_lock_irqsave(q->queue_lock, flags); | ||
726 | if (!queue_in_flight(q)) | ||
727 | wakeup_waiters = 1; | ||
728 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
729 | 765 | ||
730 | /* nudge anyone waiting on suspend queue */ | 766 | /* nudge anyone waiting on suspend queue */ |
731 | if (wakeup_waiters) | 767 | if (!md_in_flight(md)) |
732 | wake_up(&md->wait); | 768 | wake_up(&md->wait); |
733 | 769 | ||
734 | if (run_queue) | 770 | if (run_queue) |
735 | blk_run_queue(q); | 771 | blk_run_queue(md->queue); |
736 | 772 | ||
737 | /* | 773 | /* |
738 | * dm_put() must be at the end of this function. See the comment above | 774 | * dm_put() must be at the end of this function. See the comment above |
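To make the precedence rule in store_barrier_error() explicit, here is a standalone restatement of the same logic (a sketch for illustration only, not part of the patch; DM_ENDIO_REQUEUE and -EOPNOTSUPP are the only special-cased values):

/* Value to keep when a new completion code arrives while 'stored'
 * already holds an earlier barrier error. */
static int sketch_merge_barrier_error(int stored, int error)
{
        if (!stored ||                          /* first error is taken */
            error == -EOPNOTSUPP ||             /* -EOPNOTSUPP beats everything */
            (stored != -EOPNOTSUPP &&
             error == DM_ENDIO_REQUEUE))        /* requeue beats plain I/O errors */
                return error;

        return stored;
}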
@@ -748,6 +784,44 @@ static void free_rq_clone(struct request *clone) | |||
748 | free_rq_tio(tio); | 784 | free_rq_tio(tio); |
749 | } | 785 | } |
750 | 786 | ||
787 | /* | ||
788 | * Complete the clone and the original request. | ||
789 | * Must be called without queue lock. | ||
790 | */ | ||
791 | static void dm_end_request(struct request *clone, int error) | ||
792 | { | ||
793 | int rw = rq_data_dir(clone); | ||
794 | int run_queue = 1; | ||
795 | bool is_barrier = blk_barrier_rq(clone); | ||
796 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
797 | struct mapped_device *md = tio->md; | ||
798 | struct request *rq = tio->orig; | ||
799 | |||
800 | if (blk_pc_request(rq) && !is_barrier) { | ||
801 | rq->errors = clone->errors; | ||
802 | rq->resid_len = clone->resid_len; | ||
803 | |||
804 | if (rq->sense) | ||
805 | /* | ||
806 | * We are using the sense buffer of the original | ||
807 | * request. | ||
808 | * So setting the length of the sense data is enough. | ||
809 | */ | ||
810 | rq->sense_len = clone->sense_len; | ||
811 | } | ||
812 | |||
813 | free_rq_clone(clone); | ||
814 | |||
815 | if (unlikely(is_barrier)) { | ||
816 | if (unlikely(error)) | ||
817 | store_barrier_error(md, error); | ||
818 | run_queue = 0; | ||
819 | } else | ||
820 | blk_end_request_all(rq, error); | ||
821 | |||
822 | rq_completed(md, rw, run_queue); | ||
823 | } | ||
824 | |||
751 | static void dm_unprep_request(struct request *rq) | 825 | static void dm_unprep_request(struct request *rq) |
752 | { | 826 | { |
753 | struct request *clone = rq->special; | 827 | struct request *clone = rq->special; |
@@ -763,12 +837,23 @@ static void dm_unprep_request(struct request *rq) | |||
763 | */ | 837 | */ |
764 | void dm_requeue_unmapped_request(struct request *clone) | 838 | void dm_requeue_unmapped_request(struct request *clone) |
765 | { | 839 | { |
840 | int rw = rq_data_dir(clone); | ||
766 | struct dm_rq_target_io *tio = clone->end_io_data; | 841 | struct dm_rq_target_io *tio = clone->end_io_data; |
767 | struct mapped_device *md = tio->md; | 842 | struct mapped_device *md = tio->md; |
768 | struct request *rq = tio->orig; | 843 | struct request *rq = tio->orig; |
769 | struct request_queue *q = rq->q; | 844 | struct request_queue *q = rq->q; |
770 | unsigned long flags; | 845 | unsigned long flags; |
771 | 846 | ||
847 | if (unlikely(blk_barrier_rq(clone))) { | ||
848 | /* | ||
849 | * Barrier clones share an original request. | ||
850 | * Leave it to dm_end_request(), which handles this special | ||
851 | * case. | ||
852 | */ | ||
853 | dm_end_request(clone, DM_ENDIO_REQUEUE); | ||
854 | return; | ||
855 | } | ||
856 | |||
772 | dm_unprep_request(rq); | 857 | dm_unprep_request(rq); |
773 | 858 | ||
774 | spin_lock_irqsave(q->queue_lock, flags); | 859 | spin_lock_irqsave(q->queue_lock, flags); |
@@ -777,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone) | |||
777 | blk_requeue_request(q, rq); | 862 | blk_requeue_request(q, rq); |
778 | spin_unlock_irqrestore(q->queue_lock, flags); | 863 | spin_unlock_irqrestore(q->queue_lock, flags); |
779 | 864 | ||
780 | rq_completed(md, 0); | 865 | rq_completed(md, rw, 0); |
781 | } | 866 | } |
782 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | 867 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); |
783 | 868 | ||
@@ -810,34 +895,28 @@ static void start_queue(struct request_queue *q) | |||
810 | spin_unlock_irqrestore(q->queue_lock, flags); | 895 | spin_unlock_irqrestore(q->queue_lock, flags); |
811 | } | 896 | } |
812 | 897 | ||
813 | /* | 898 | static void dm_done(struct request *clone, int error, bool mapped) |
814 | * Complete the clone and the original request. | ||
815 | * Must be called without queue lock. | ||
816 | */ | ||
817 | static void dm_end_request(struct request *clone, int error) | ||
818 | { | 899 | { |
900 | int r = error; | ||
819 | struct dm_rq_target_io *tio = clone->end_io_data; | 901 | struct dm_rq_target_io *tio = clone->end_io_data; |
820 | struct mapped_device *md = tio->md; | 902 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; |
821 | struct request *rq = tio->orig; | ||
822 | 903 | ||
823 | if (blk_pc_request(rq)) { | 904 | if (mapped && rq_end_io) |
824 | rq->errors = clone->errors; | 905 | r = rq_end_io(tio->ti, clone, error, &tio->info); |
825 | rq->resid_len = clone->resid_len; | ||
826 | 906 | ||
827 | if (rq->sense) | 907 | if (r <= 0) |
828 | /* | 908 | /* The target wants to complete the I/O */ |
829 | * We are using the sense buffer of the original | 909 | dm_end_request(clone, r); |
830 | * request. | 910 | else if (r == DM_ENDIO_INCOMPLETE) |
831 | * So setting the length of the sense data is enough. | 911 | /* The target will handle the I/O */ |
832 | */ | 912 | return; |
833 | rq->sense_len = clone->sense_len; | 913 | else if (r == DM_ENDIO_REQUEUE) |
914 | /* The target wants to requeue the I/O */ | ||
915 | dm_requeue_unmapped_request(clone); | ||
916 | else { | ||
917 | DMWARN("unimplemented target endio return value: %d", r); | ||
918 | BUG(); | ||
834 | } | 919 | } |
835 | |||
836 | free_rq_clone(clone); | ||
837 | |||
838 | blk_end_request_all(rq, error); | ||
839 | |||
840 | rq_completed(md, 1); | ||
841 | } | 920 | } |
842 | 921 | ||
843 | /* | 922 | /* |
@@ -845,27 +924,14 @@ static void dm_end_request(struct request *clone, int error) | |||
845 | */ | 924 | */ |
846 | static void dm_softirq_done(struct request *rq) | 925 | static void dm_softirq_done(struct request *rq) |
847 | { | 926 | { |
927 | bool mapped = true; | ||
848 | struct request *clone = rq->completion_data; | 928 | struct request *clone = rq->completion_data; |
849 | struct dm_rq_target_io *tio = clone->end_io_data; | 929 | struct dm_rq_target_io *tio = clone->end_io_data; |
850 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
851 | int error = tio->error; | ||
852 | 930 | ||
853 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | 931 | if (rq->cmd_flags & REQ_FAILED) |
854 | error = rq_end_io(tio->ti, clone, error, &tio->info); | 932 | mapped = false; |
855 | 933 | ||
856 | if (error <= 0) | 934 | dm_done(clone, tio->error, mapped); |
857 | /* The target wants to complete the I/O */ | ||
858 | dm_end_request(clone, error); | ||
859 | else if (error == DM_ENDIO_INCOMPLETE) | ||
860 | /* The target will handle the I/O */ | ||
861 | return; | ||
862 | else if (error == DM_ENDIO_REQUEUE) | ||
863 | /* The target wants to requeue the I/O */ | ||
864 | dm_requeue_unmapped_request(clone); | ||
865 | else { | ||
866 | DMWARN("unimplemented target endio return value: %d", error); | ||
867 | BUG(); | ||
868 | } | ||
869 | } | 935 | } |
870 | 936 | ||
871 | /* | 937 | /* |
@@ -877,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error) | |||
877 | struct dm_rq_target_io *tio = clone->end_io_data; | 943 | struct dm_rq_target_io *tio = clone->end_io_data; |
878 | struct request *rq = tio->orig; | 944 | struct request *rq = tio->orig; |
879 | 945 | ||
946 | if (unlikely(blk_barrier_rq(clone))) { | ||
947 | /* | ||
948 | * Barrier clones share an original request. So can't use | ||
949 | * softirq_done with the original. | ||
950 | * Pass the clone to dm_done() directly in this special case. | ||
951 | * It is safe (even if clone->q->queue_lock is held here) | ||
952 | * because there is no I/O dispatching during the completion | ||
953 | * of barrier clone. | ||
954 | */ | ||
955 | dm_done(clone, error, true); | ||
956 | return; | ||
957 | } | ||
958 | |||
880 | tio->error = error; | 959 | tio->error = error; |
881 | rq->completion_data = clone; | 960 | rq->completion_data = clone; |
882 | blk_complete_request(rq); | 961 | blk_complete_request(rq); |
@@ -893,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error) | |||
893 | struct dm_rq_target_io *tio = clone->end_io_data; | 972 | struct dm_rq_target_io *tio = clone->end_io_data; |
894 | struct request *rq = tio->orig; | 973 | struct request *rq = tio->orig; |
895 | 974 | ||
975 | if (unlikely(blk_barrier_rq(clone))) { | ||
976 | /* | ||
977 | * Barrier clones share an original request. | ||
978 | * Leave it to dm_end_request(), which handles this special | ||
979 | * case. | ||
980 | */ | ||
981 | BUG_ON(error > 0); | ||
982 | dm_end_request(clone, error); | ||
983 | return; | ||
984 | } | ||
985 | |||
896 | rq->cmd_flags |= REQ_FAILED; | 986 | rq->cmd_flags |= REQ_FAILED; |
897 | dm_complete_request(clone, error); | 987 | dm_complete_request(clone, error); |
898 | } | 988 | } |
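The reworked dm_done()/dm_complete_request() paths above define the contract for a request-based target's rq_end_io hook. A hedged target-side sketch of that contract follows; the retryable() helper is an assumption for illustration, not a DM API:

static int sketch_rq_end_io(struct dm_target *ti, struct request *clone,
                            int error, union map_info *map_context)
{
        if (error && retryable(error))
                return DM_ENDIO_REQUEUE;   /* dm_done() requeues the original */

        /* Returning DM_ENDIO_INCOMPLETE would mean "the target will
         * finish this I/O itself"; any value <= 0 completes it now. */
        return error;
}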
@@ -1209,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1209 | struct clone_info ci; | 1299 | struct clone_info ci; |
1210 | int error = 0; | 1300 | int error = 0; |
1211 | 1301 | ||
1212 | ci.map = dm_get_table(md); | 1302 | ci.map = dm_get_live_table(md); |
1213 | if (unlikely(!ci.map)) { | 1303 | if (unlikely(!ci.map)) { |
1214 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) | 1304 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) |
1215 | bio_io_error(bio); | 1305 | bio_io_error(bio); |
@@ -1226,6 +1316,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1226 | atomic_set(&ci.io->io_count, 1); | 1316 | atomic_set(&ci.io->io_count, 1); |
1227 | ci.io->bio = bio; | 1317 | ci.io->bio = bio; |
1228 | ci.io->md = md; | 1318 | ci.io->md = md; |
1319 | spin_lock_init(&ci.io->endio_lock); | ||
1229 | ci.sector = bio->bi_sector; | 1320 | ci.sector = bio->bi_sector; |
1230 | ci.sector_count = bio_sectors(bio); | 1321 | ci.sector_count = bio_sectors(bio); |
1231 | if (unlikely(bio_empty_barrier(bio))) | 1322 | if (unlikely(bio_empty_barrier(bio))) |
@@ -1249,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q, | |||
1249 | struct bio_vec *biovec) | 1340 | struct bio_vec *biovec) |
1250 | { | 1341 | { |
1251 | struct mapped_device *md = q->queuedata; | 1342 | struct mapped_device *md = q->queuedata; |
1252 | struct dm_table *map = dm_get_table(md); | 1343 | struct dm_table *map = dm_get_live_table(md); |
1253 | struct dm_target *ti; | 1344 | struct dm_target *ti; |
1254 | sector_t max_sectors; | 1345 | sector_t max_sectors; |
1255 | int max_size = 0; | 1346 | int max_size = 0; |
@@ -1346,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) | |||
1346 | { | 1437 | { |
1347 | struct mapped_device *md = q->queuedata; | 1438 | struct mapped_device *md = q->queuedata; |
1348 | 1439 | ||
1349 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | ||
1350 | bio_endio(bio, -EOPNOTSUPP); | ||
1351 | return 0; | ||
1352 | } | ||
1353 | |||
1354 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | 1440 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ |
1355 | } | 1441 | } |
1356 | 1442 | ||
@@ -1369,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
1369 | return _dm_request(q, bio); | 1455 | return _dm_request(q, bio); |
1370 | } | 1456 | } |
1371 | 1457 | ||
1458 | /* | ||
1459 | * Mark this request as flush request, so that dm_request_fn() can | ||
1460 | * recognize. | ||
1461 | */ | ||
1462 | static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) | ||
1463 | { | ||
1464 | rq->cmd_type = REQ_TYPE_LINUX_BLOCK; | ||
1465 | rq->cmd[0] = REQ_LB_OP_FLUSH; | ||
1466 | } | ||
1467 | |||
1468 | static bool dm_rq_is_flush_request(struct request *rq) | ||
1469 | { | ||
1470 | if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && | ||
1471 | rq->cmd[0] == REQ_LB_OP_FLUSH) | ||
1472 | return true; | ||
1473 | else | ||
1474 | return false; | ||
1475 | } | ||
1476 | |||
1372 | void dm_dispatch_request(struct request *rq) | 1477 | void dm_dispatch_request(struct request *rq) |
1373 | { | 1478 | { |
1374 | int r; | 1479 | int r; |
@@ -1414,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | |||
1414 | static int setup_clone(struct request *clone, struct request *rq, | 1519 | static int setup_clone(struct request *clone, struct request *rq, |
1415 | struct dm_rq_target_io *tio) | 1520 | struct dm_rq_target_io *tio) |
1416 | { | 1521 | { |
1417 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | 1522 | int r; |
1418 | dm_rq_bio_constructor, tio); | ||
1419 | 1523 | ||
1420 | if (r) | 1524 | if (dm_rq_is_flush_request(rq)) { |
1421 | return r; | 1525 | blk_rq_init(NULL, clone); |
1526 | clone->cmd_type = REQ_TYPE_FS; | ||
1527 | clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); | ||
1528 | } else { | ||
1529 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1530 | dm_rq_bio_constructor, tio); | ||
1531 | if (r) | ||
1532 | return r; | ||
1533 | |||
1534 | clone->cmd = rq->cmd; | ||
1535 | clone->cmd_len = rq->cmd_len; | ||
1536 | clone->sense = rq->sense; | ||
1537 | clone->buffer = rq->buffer; | ||
1538 | } | ||
1422 | 1539 | ||
1423 | clone->cmd = rq->cmd; | ||
1424 | clone->cmd_len = rq->cmd_len; | ||
1425 | clone->sense = rq->sense; | ||
1426 | clone->buffer = rq->buffer; | ||
1427 | clone->end_io = end_clone_request; | 1540 | clone->end_io = end_clone_request; |
1428 | clone->end_io_data = tio; | 1541 | clone->end_io_data = tio; |
1429 | 1542 | ||
1430 | return 0; | 1543 | return 0; |
1431 | } | 1544 | } |
1432 | 1545 | ||
1433 | static int dm_rq_flush_suspending(struct mapped_device *md) | 1546 | static struct request *clone_rq(struct request *rq, struct mapped_device *md, |
1547 | gfp_t gfp_mask) | ||
1434 | { | 1548 | { |
1435 | return !md->suspend_rq.special; | 1549 | struct request *clone; |
1550 | struct dm_rq_target_io *tio; | ||
1551 | |||
1552 | tio = alloc_rq_tio(md, gfp_mask); | ||
1553 | if (!tio) | ||
1554 | return NULL; | ||
1555 | |||
1556 | tio->md = md; | ||
1557 | tio->ti = NULL; | ||
1558 | tio->orig = rq; | ||
1559 | tio->error = 0; | ||
1560 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1561 | |||
1562 | clone = &tio->clone; | ||
1563 | if (setup_clone(clone, rq, tio)) { | ||
1564 | /* -ENOMEM */ | ||
1565 | free_rq_tio(tio); | ||
1566 | return NULL; | ||
1567 | } | ||
1568 | |||
1569 | return clone; | ||
1436 | } | 1570 | } |
1437 | 1571 | ||
1438 | /* | 1572 | /* |
@@ -1441,51 +1575,35 @@ static int dm_rq_flush_suspending(struct mapped_device *md) | |||
1441 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | 1575 | static int dm_prep_fn(struct request_queue *q, struct request *rq) |
1442 | { | 1576 | { |
1443 | struct mapped_device *md = q->queuedata; | 1577 | struct mapped_device *md = q->queuedata; |
1444 | struct dm_rq_target_io *tio; | ||
1445 | struct request *clone; | 1578 | struct request *clone; |
1446 | 1579 | ||
1447 | if (unlikely(rq == &md->suspend_rq)) { | 1580 | if (unlikely(dm_rq_is_flush_request(rq))) |
1448 | if (dm_rq_flush_suspending(md)) | 1581 | return BLKPREP_OK; |
1449 | return BLKPREP_OK; | ||
1450 | else | ||
1451 | /* The flush suspend was interrupted */ | ||
1452 | return BLKPREP_KILL; | ||
1453 | } | ||
1454 | 1582 | ||
1455 | if (unlikely(rq->special)) { | 1583 | if (unlikely(rq->special)) { |
1456 | DMWARN("Already has something in rq->special."); | 1584 | DMWARN("Already has something in rq->special."); |
1457 | return BLKPREP_KILL; | 1585 | return BLKPREP_KILL; |
1458 | } | 1586 | } |
1459 | 1587 | ||
1460 | tio = alloc_rq_tio(md); /* Only one for each original request */ | 1588 | clone = clone_rq(rq, md, GFP_ATOMIC); |
1461 | if (!tio) | 1589 | if (!clone) |
1462 | /* -ENOMEM */ | ||
1463 | return BLKPREP_DEFER; | 1590 | return BLKPREP_DEFER; |
1464 | 1591 | ||
1465 | tio->md = md; | ||
1466 | tio->ti = NULL; | ||
1467 | tio->orig = rq; | ||
1468 | tio->error = 0; | ||
1469 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1470 | |||
1471 | clone = &tio->clone; | ||
1472 | if (setup_clone(clone, rq, tio)) { | ||
1473 | /* -ENOMEM */ | ||
1474 | free_rq_tio(tio); | ||
1475 | return BLKPREP_DEFER; | ||
1476 | } | ||
1477 | |||
1478 | rq->special = clone; | 1592 | rq->special = clone; |
1479 | rq->cmd_flags |= REQ_DONTPREP; | 1593 | rq->cmd_flags |= REQ_DONTPREP; |
1480 | 1594 | ||
1481 | return BLKPREP_OK; | 1595 | return BLKPREP_OK; |
1482 | } | 1596 | } |
1483 | 1597 | ||
1484 | static void map_request(struct dm_target *ti, struct request *rq, | 1598 | /* |
1485 | struct mapped_device *md) | 1599 | * Returns: |
1600 | * 0 : the request has been processed (not requeued) | ||
1601 | * !0 : the request has been requeued | ||
1602 | */ | ||
1603 | static int map_request(struct dm_target *ti, struct request *clone, | ||
1604 | struct mapped_device *md) | ||
1486 | { | 1605 | { |
1487 | int r; | 1606 | int r, requeued = 0; |
1488 | struct request *clone = rq->special; | ||
1489 | struct dm_rq_target_io *tio = clone->end_io_data; | 1607 | struct dm_rq_target_io *tio = clone->end_io_data; |
1490 | 1608 | ||
1491 | /* | 1609 | /* |
@@ -1505,11 +1623,14 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1505 | break; | 1623 | break; |
1506 | case DM_MAPIO_REMAPPED: | 1624 | case DM_MAPIO_REMAPPED: |
1507 | /* The target has remapped the I/O so dispatch it */ | 1625 | /* The target has remapped the I/O so dispatch it */ |
1626 | trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), | ||
1627 | blk_rq_pos(tio->orig)); | ||
1508 | dm_dispatch_request(clone); | 1628 | dm_dispatch_request(clone); |
1509 | break; | 1629 | break; |
1510 | case DM_MAPIO_REQUEUE: | 1630 | case DM_MAPIO_REQUEUE: |
1511 | /* The target wants to requeue the I/O */ | 1631 | /* The target wants to requeue the I/O */ |
1512 | dm_requeue_unmapped_request(clone); | 1632 | dm_requeue_unmapped_request(clone); |
1633 | requeued = 1; | ||
1513 | break; | 1634 | break; |
1514 | default: | 1635 | default: |
1515 | if (r > 0) { | 1636 | if (r > 0) { |
@@ -1521,6 +1642,8 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1521 | dm_kill_unmapped_request(clone, r); | 1642 | dm_kill_unmapped_request(clone, r); |
1522 | break; | 1643 | break; |
1523 | } | 1644 | } |
1645 | |||
1646 | return requeued; | ||
1524 | } | 1647 | } |
1525 | 1648 | ||
1526 | /* | 1649 | /* |
@@ -1530,29 +1653,26 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1530 | static void dm_request_fn(struct request_queue *q) | 1653 | static void dm_request_fn(struct request_queue *q) |
1531 | { | 1654 | { |
1532 | struct mapped_device *md = q->queuedata; | 1655 | struct mapped_device *md = q->queuedata; |
1533 | struct dm_table *map = dm_get_table(md); | 1656 | struct dm_table *map = dm_get_live_table(md); |
1534 | struct dm_target *ti; | 1657 | struct dm_target *ti; |
1535 | struct request *rq; | 1658 | struct request *rq, *clone; |
1536 | 1659 | ||
1537 | /* | 1660 | /* |
1538 | * For noflush suspend, check blk_queue_stopped() to immediately | 1661 | * For suspend, check blk_queue_stopped() and increment |
1539 | * quit I/O dispatching. | 1662 | * ->pending within a single queue_lock not to increment the |
1663 | * number of in-flight I/Os after the queue is stopped in | ||
1664 | * dm_suspend(). | ||
1540 | */ | 1665 | */ |
1541 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | 1666 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { |
1542 | rq = blk_peek_request(q); | 1667 | rq = blk_peek_request(q); |
1543 | if (!rq) | 1668 | if (!rq) |
1544 | goto plug_and_out; | 1669 | goto plug_and_out; |
1545 | 1670 | ||
1546 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ | 1671 | if (unlikely(dm_rq_is_flush_request(rq))) { |
1547 | if (queue_in_flight(q)) | 1672 | BUG_ON(md->flush_request); |
1548 | /* Not quiet yet. Wait more */ | 1673 | md->flush_request = rq; |
1549 | goto plug_and_out; | ||
1550 | |||
1551 | /* This device should be quiet now */ | ||
1552 | __stop_queue(q); | ||
1553 | blk_start_request(rq); | 1674 | blk_start_request(rq); |
1554 | __blk_end_request_all(rq, 0); | 1675 | queue_work(md->wq, &md->barrier_work); |
1555 | wake_up(&md->wait); | ||
1556 | goto out; | 1676 | goto out; |
1557 | } | 1677 | } |
1558 | 1678 | ||
@@ -1561,13 +1681,21 @@ static void dm_request_fn(struct request_queue *q) | |||
1561 | goto plug_and_out; | 1681 | goto plug_and_out; |
1562 | 1682 | ||
1563 | blk_start_request(rq); | 1683 | blk_start_request(rq); |
1684 | clone = rq->special; | ||
1685 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
1686 | |||
1564 | spin_unlock(q->queue_lock); | 1687 | spin_unlock(q->queue_lock); |
1565 | map_request(ti, rq, md); | 1688 | if (map_request(ti, clone, md)) |
1689 | goto requeued; | ||
1690 | |||
1566 | spin_lock_irq(q->queue_lock); | 1691 | spin_lock_irq(q->queue_lock); |
1567 | } | 1692 | } |
1568 | 1693 | ||
1569 | goto out; | 1694 | goto out; |
1570 | 1695 | ||
1696 | requeued: | ||
1697 | spin_lock_irq(q->queue_lock); | ||
1698 | |||
1571 | plug_and_out: | 1699 | plug_and_out: |
1572 | if (!elv_queue_empty(q)) | 1700 | if (!elv_queue_empty(q)) |
1573 | /* Some requests still remain, retry later */ | 1701 | /* Some requests still remain, retry later */ |
@@ -1589,7 +1717,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1589 | { | 1717 | { |
1590 | int r; | 1718 | int r; |
1591 | struct mapped_device *md = q->queuedata; | 1719 | struct mapped_device *md = q->queuedata; |
1592 | struct dm_table *map = dm_get_table(md); | 1720 | struct dm_table *map = dm_get_live_table(md); |
1593 | 1721 | ||
1594 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | 1722 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) |
1595 | r = 1; | 1723 | r = 1; |
@@ -1604,7 +1732,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1604 | static void dm_unplug_all(struct request_queue *q) | 1732 | static void dm_unplug_all(struct request_queue *q) |
1605 | { | 1733 | { |
1606 | struct mapped_device *md = q->queuedata; | 1734 | struct mapped_device *md = q->queuedata; |
1607 | struct dm_table *map = dm_get_table(md); | 1735 | struct dm_table *map = dm_get_live_table(md); |
1608 | 1736 | ||
1609 | if (map) { | 1737 | if (map) { |
1610 | if (dm_request_based(md)) | 1738 | if (dm_request_based(md)) |
@@ -1622,7 +1750,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1622 | struct dm_table *map; | 1750 | struct dm_table *map; |
1623 | 1751 | ||
1624 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1752 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
1625 | map = dm_get_table(md); | 1753 | map = dm_get_live_table(md); |
1626 | if (map) { | 1754 | if (map) { |
1627 | /* | 1755 | /* |
1628 | * Request-based dm cares about only own queue for | 1756 | * Request-based dm cares about only own queue for |
@@ -1719,6 +1847,7 @@ out: | |||
1719 | static const struct block_device_operations dm_blk_dops; | 1847 | static const struct block_device_operations dm_blk_dops; |
1720 | 1848 | ||
1721 | static void dm_wq_work(struct work_struct *work); | 1849 | static void dm_wq_work(struct work_struct *work); |
1850 | static void dm_rq_barrier_work(struct work_struct *work); | ||
1722 | 1851 | ||
1723 | /* | 1852 | /* |
1724 | * Allocate and initialise a blank device with a given minor. | 1853 | * Allocate and initialise a blank device with a given minor. |
@@ -1748,6 +1877,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1748 | init_rwsem(&md->io_lock); | 1877 | init_rwsem(&md->io_lock); |
1749 | mutex_init(&md->suspend_lock); | 1878 | mutex_init(&md->suspend_lock); |
1750 | spin_lock_init(&md->deferred_lock); | 1879 | spin_lock_init(&md->deferred_lock); |
1880 | spin_lock_init(&md->barrier_error_lock); | ||
1751 | rwlock_init(&md->map_lock); | 1881 | rwlock_init(&md->map_lock); |
1752 | atomic_set(&md->holders, 1); | 1882 | atomic_set(&md->holders, 1); |
1753 | atomic_set(&md->open_count, 0); | 1883 | atomic_set(&md->open_count, 0); |
@@ -1782,6 +1912,8 @@ static struct mapped_device *alloc_dev(int minor) | |||
1782 | blk_queue_softirq_done(md->queue, dm_softirq_done); | 1912 | blk_queue_softirq_done(md->queue, dm_softirq_done); |
1783 | blk_queue_prep_rq(md->queue, dm_prep_fn); | 1913 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
1784 | blk_queue_lld_busy(md->queue, dm_lld_busy); | 1914 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
1915 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, | ||
1916 | dm_rq_prepare_flush); | ||
1785 | 1917 | ||
1786 | md->disk = alloc_disk(1); | 1918 | md->disk = alloc_disk(1); |
1787 | if (!md->disk) | 1919 | if (!md->disk) |
@@ -1791,6 +1923,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1791 | atomic_set(&md->pending[1], 0); | 1923 | atomic_set(&md->pending[1], 0); |
1792 | init_waitqueue_head(&md->wait); | 1924 | init_waitqueue_head(&md->wait); |
1793 | INIT_WORK(&md->work, dm_wq_work); | 1925 | INIT_WORK(&md->work, dm_wq_work); |
1926 | INIT_WORK(&md->barrier_work, dm_rq_barrier_work); | ||
1794 | init_waitqueue_head(&md->eventq); | 1927 | init_waitqueue_head(&md->eventq); |
1795 | 1928 | ||
1796 | md->disk->major = _major; | 1929 | md->disk->major = _major; |
@@ -1822,6 +1955,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1822 | bad_bdev: | 1955 | bad_bdev: |
1823 | destroy_workqueue(md->wq); | 1956 | destroy_workqueue(md->wq); |
1824 | bad_thread: | 1957 | bad_thread: |
1958 | del_gendisk(md->disk); | ||
1825 | put_disk(md->disk); | 1959 | put_disk(md->disk); |
1826 | bad_disk: | 1960 | bad_disk: |
1827 | blk_cleanup_queue(md->queue); | 1961 | blk_cleanup_queue(md->queue); |
@@ -1914,9 +2048,13 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1914 | mutex_unlock(&md->bdev->bd_inode->i_mutex); | 2048 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
1915 | } | 2049 | } |
1916 | 2050 | ||
1917 | static int __bind(struct mapped_device *md, struct dm_table *t, | 2051 | /* |
1918 | struct queue_limits *limits) | 2052 | * Returns old map, which caller must destroy. |
2053 | */ | ||
2054 | static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | ||
2055 | struct queue_limits *limits) | ||
1919 | { | 2056 | { |
2057 | struct dm_table *old_map; | ||
1920 | struct request_queue *q = md->queue; | 2058 | struct request_queue *q = md->queue; |
1921 | sector_t size; | 2059 | sector_t size; |
1922 | unsigned long flags; | 2060 | unsigned long flags; |
@@ -1931,11 +2069,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1931 | 2069 | ||
1932 | __set_size(md, size); | 2070 | __set_size(md, size); |
1933 | 2071 | ||
1934 | if (!size) { | ||
1935 | dm_table_destroy(t); | ||
1936 | return 0; | ||
1937 | } | ||
1938 | |||
1939 | dm_table_event_callback(t, event_callback, md); | 2072 | dm_table_event_callback(t, event_callback, md); |
1940 | 2073 | ||
1941 | /* | 2074 | /* |
@@ -1951,26 +2084,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1951 | __bind_mempools(md, t); | 2084 | __bind_mempools(md, t); |
1952 | 2085 | ||
1953 | write_lock_irqsave(&md->map_lock, flags); | 2086 | write_lock_irqsave(&md->map_lock, flags); |
2087 | old_map = md->map; | ||
1954 | md->map = t; | 2088 | md->map = t; |
1955 | dm_table_set_restrictions(t, q, limits); | 2089 | dm_table_set_restrictions(t, q, limits); |
1956 | write_unlock_irqrestore(&md->map_lock, flags); | 2090 | write_unlock_irqrestore(&md->map_lock, flags); |
1957 | 2091 | ||
1958 | return 0; | 2092 | return old_map; |
1959 | } | 2093 | } |
1960 | 2094 | ||
1961 | static void __unbind(struct mapped_device *md) | 2095 | /* |
2096 | * Returns unbound table for the caller to free. | ||
2097 | */ | ||
2098 | static struct dm_table *__unbind(struct mapped_device *md) | ||
1962 | { | 2099 | { |
1963 | struct dm_table *map = md->map; | 2100 | struct dm_table *map = md->map; |
1964 | unsigned long flags; | 2101 | unsigned long flags; |
1965 | 2102 | ||
1966 | if (!map) | 2103 | if (!map) |
1967 | return; | 2104 | return NULL; |
1968 | 2105 | ||
1969 | dm_table_event_callback(map, NULL, NULL); | 2106 | dm_table_event_callback(map, NULL, NULL); |
1970 | write_lock_irqsave(&md->map_lock, flags); | 2107 | write_lock_irqsave(&md->map_lock, flags); |
1971 | md->map = NULL; | 2108 | md->map = NULL; |
1972 | write_unlock_irqrestore(&md->map_lock, flags); | 2109 | write_unlock_irqrestore(&md->map_lock, flags); |
1973 | dm_table_destroy(map); | 2110 | |
2111 | return map; | ||
1974 | } | 2112 | } |
1975 | 2113 | ||
1976 | /* | 2114 | /* |
@@ -2052,18 +2190,18 @@ void dm_put(struct mapped_device *md) | |||
2052 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); | 2190 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); |
2053 | 2191 | ||
2054 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { | 2192 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { |
2055 | map = dm_get_table(md); | 2193 | map = dm_get_live_table(md); |
2056 | idr_replace(&_minor_idr, MINOR_ALLOCED, | 2194 | idr_replace(&_minor_idr, MINOR_ALLOCED, |
2057 | MINOR(disk_devt(dm_disk(md)))); | 2195 | MINOR(disk_devt(dm_disk(md)))); |
2058 | set_bit(DMF_FREEING, &md->flags); | 2196 | set_bit(DMF_FREEING, &md->flags); |
2059 | spin_unlock(&_minor_lock); | 2197 | spin_unlock(&_minor_lock); |
2060 | if (!dm_suspended(md)) { | 2198 | if (!dm_suspended_md(md)) { |
2061 | dm_table_presuspend_targets(map); | 2199 | dm_table_presuspend_targets(map); |
2062 | dm_table_postsuspend_targets(map); | 2200 | dm_table_postsuspend_targets(map); |
2063 | } | 2201 | } |
2064 | dm_sysfs_exit(md); | 2202 | dm_sysfs_exit(md); |
2065 | dm_table_put(map); | 2203 | dm_table_put(map); |
2066 | __unbind(md); | 2204 | dm_table_destroy(__unbind(md)); |
2067 | free_dev(md); | 2205 | free_dev(md); |
2068 | } | 2206 | } |
2069 | } | 2207 | } |
@@ -2073,8 +2211,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2073 | { | 2211 | { |
2074 | int r = 0; | 2212 | int r = 0; |
2075 | DECLARE_WAITQUEUE(wait, current); | 2213 | DECLARE_WAITQUEUE(wait, current); |
2076 | struct request_queue *q = md->queue; | ||
2077 | unsigned long flags; | ||
2078 | 2214 | ||
2079 | dm_unplug_all(md->queue); | 2215 | dm_unplug_all(md->queue); |
2080 | 2216 | ||
@@ -2084,15 +2220,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2084 | set_current_state(interruptible); | 2220 | set_current_state(interruptible); |
2085 | 2221 | ||
2086 | smp_mb(); | 2222 | smp_mb(); |
2087 | if (dm_request_based(md)) { | 2223 | if (!md_in_flight(md)) |
2088 | spin_lock_irqsave(q->queue_lock, flags); | ||
2089 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
2090 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2091 | break; | ||
2092 | } | ||
2093 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2094 | } else if (!atomic_read(&md->pending[0]) && | ||
2095 | !atomic_read(&md->pending[1])) | ||
2096 | break; | 2224 | break; |
2097 | 2225 | ||
2098 | if (interruptible == TASK_INTERRUPTIBLE && | 2226 | if (interruptible == TASK_INTERRUPTIBLE && |
@@ -2187,98 +2315,106 @@ static void dm_queue_flush(struct mapped_device *md) | |||
2187 | queue_work(md->wq, &md->work); | 2315 | queue_work(md->wq, &md->work); |
2188 | } | 2316 | } |
2189 | 2317 | ||
2190 | /* | 2318 | static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) |
2191 | * Swap in a new table (destroying old one). | ||
2192 | */ | ||
2193 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2194 | { | 2319 | { |
2195 | struct queue_limits limits; | 2320 | struct dm_rq_target_io *tio = clone->end_io_data; |
2196 | int r = -EINVAL; | ||
2197 | 2321 | ||
2198 | mutex_lock(&md->suspend_lock); | 2322 | tio->info.flush_request = flush_nr; |
2323 | } | ||
2199 | 2324 | ||
2200 | /* device must be suspended */ | 2325 | /* Issue barrier requests to targets and wait for their completion. */ |
2201 | if (!dm_suspended(md)) | 2326 | static int dm_rq_barrier(struct mapped_device *md) |
2202 | goto out; | 2327 | { |
2328 | int i, j; | ||
2329 | struct dm_table *map = dm_get_live_table(md); | ||
2330 | unsigned num_targets = dm_table_get_num_targets(map); | ||
2331 | struct dm_target *ti; | ||
2332 | struct request *clone; | ||
2203 | 2333 | ||
2204 | r = dm_calculate_queue_limits(table, &limits); | 2334 | md->barrier_error = 0; |
2205 | if (r) | ||
2206 | goto out; | ||
2207 | 2335 | ||
2208 | /* cannot change the device type, once a table is bound */ | 2336 | for (i = 0; i < num_targets; i++) { |
2209 | if (md->map && | 2337 | ti = dm_table_get_target(map, i); |
2210 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | 2338 | for (j = 0; j < ti->num_flush_requests; j++) { |
2211 | DMWARN("can't change the device type after a table is bound"); | 2339 | clone = clone_rq(md->flush_request, md, GFP_NOIO); |
2212 | goto out; | 2340 | dm_rq_set_flush_nr(clone, j); |
2341 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
2342 | map_request(ti, clone, md); | ||
2343 | } | ||
2213 | } | 2344 | } |
2214 | 2345 | ||
2215 | __unbind(md); | 2346 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
2216 | r = __bind(md, table, &limits); | 2347 | dm_table_put(map); |
2217 | |||
2218 | out: | ||
2219 | mutex_unlock(&md->suspend_lock); | ||
2220 | return r; | ||
2221 | } | ||
2222 | 2348 | ||
2223 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | 2349 | return md->barrier_error; |
2224 | { | ||
2225 | md->suspend_rq.special = (void *)0x1; | ||
2226 | } | 2350 | } |
2227 | 2351 | ||
2228 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | 2352 | static void dm_rq_barrier_work(struct work_struct *work) |
2229 | { | 2353 | { |
2354 | int error; | ||
2355 | struct mapped_device *md = container_of(work, struct mapped_device, | ||
2356 | barrier_work); | ||
2230 | struct request_queue *q = md->queue; | 2357 | struct request_queue *q = md->queue; |
2358 | struct request *rq; | ||
2231 | unsigned long flags; | 2359 | unsigned long flags; |
2232 | 2360 | ||
2233 | spin_lock_irqsave(q->queue_lock, flags); | 2361 | /* |
2234 | if (!noflush) | 2362 | * Hold the md reference here and leave it at the last part so that |
2235 | dm_rq_invalidate_suspend_marker(md); | 2363 | * the md can't be deleted by device opener when the barrier request |
2236 | __start_queue(q); | 2364 | * completes. |
2237 | spin_unlock_irqrestore(q->queue_lock, flags); | 2365 | */ |
2238 | } | 2366 | dm_get(md); |
2239 | 2367 | ||
2240 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | 2368 | error = dm_rq_barrier(md); |
2241 | { | ||
2242 | struct request *rq = &md->suspend_rq; | ||
2243 | struct request_queue *q = md->queue; | ||
2244 | 2369 | ||
2245 | if (noflush) | 2370 | rq = md->flush_request; |
2246 | stop_queue(q); | 2371 | md->flush_request = NULL; |
2247 | else { | 2372 | |
2248 | blk_rq_init(q, rq); | 2373 | if (error == DM_ENDIO_REQUEUE) { |
2249 | blk_insert_request(q, rq, 0, NULL); | 2374 | spin_lock_irqsave(q->queue_lock, flags); |
2250 | } | 2375 | blk_requeue_request(q, rq); |
2376 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2377 | } else | ||
2378 | blk_end_request_all(rq, error); | ||
2379 | |||
2380 | blk_run_queue(q); | ||
2381 | |||
2382 | dm_put(md); | ||
2251 | } | 2383 | } |
2252 | 2384 | ||
2253 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | 2385 | /* |
2386 | * Swap in a new table, returning the old one for the caller to destroy. | ||
2387 | */ | ||
2388 | struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2254 | { | 2389 | { |
2255 | int r = 1; | 2390 | struct dm_table *map = ERR_PTR(-EINVAL); |
2256 | struct request *rq = &md->suspend_rq; | 2391 | struct queue_limits limits; |
2257 | struct request_queue *q = md->queue; | 2392 | int r; |
2258 | unsigned long flags; | ||
2259 | 2393 | ||
2260 | if (noflush) | 2394 | mutex_lock(&md->suspend_lock); |
2261 | return r; | ||
2262 | 2395 | ||
2263 | /* The marker must be protected by queue lock if it is in use */ | 2396 | /* device must be suspended */ |
2264 | spin_lock_irqsave(q->queue_lock, flags); | 2397 | if (!dm_suspended_md(md)) |
2265 | if (unlikely(rq->ref_count)) { | 2398 | goto out; |
2266 | /* | 2399 | |
2267 | * This can happen, when the previous flush suspend was | 2400 | r = dm_calculate_queue_limits(table, &limits); |
2268 | * interrupted, the marker is still in the queue and | 2401 | if (r) { |
2269 | * this flush suspend has been invoked, because we don't | 2402 | map = ERR_PTR(r); |
2270 | * remove the marker at the time of suspend interruption. | 2403 | goto out; |
2271 | * We have only one marker per mapped_device, so we can't | ||
2272 | * start another flush suspend while it is in use. | ||
2273 | */ | ||
2274 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
2275 | DMWARN("Invalidating the previous flush suspend is still in" | ||
2276 | " progress. Please retry later."); | ||
2277 | r = 0; | ||
2278 | } | 2404 | } |
2279 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2280 | 2405 | ||
2281 | return r; | 2406 | /* cannot change the device type, once a table is bound */ |
2407 | if (md->map && | ||
2408 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
2409 | DMWARN("can't change the device type after a table is bound"); | ||
2410 | goto out; | ||
2411 | } | ||
2412 | |||
2413 | map = __bind(md, table, &limits); | ||
2414 | |||
2415 | out: | ||
2416 | mutex_unlock(&md->suspend_lock); | ||
2417 | return map; | ||
2282 | } | 2418 | } |
2283 | 2419 | ||
2284 | /* | 2420 | /* |
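With __bind()/__unbind() and dm_swap_table() now handing tables back instead of destroying them, cleanup belongs to the caller. A hedged sketch of the new calling convention (an illustrative helper, not the actual ioctl code):

static int sketch_replace_table(struct mapped_device *md, struct dm_table *t)
{
        struct dm_table *old_map = dm_swap_table(md, t);

        if (IS_ERR(old_map))
                return PTR_ERR(old_map);    /* 't' was not bound */

        if (old_map)
                dm_table_destroy(old_map);  /* caller destroys the old table */

        return 0;
}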
@@ -2323,49 +2459,11 @@ static void unlock_fs(struct mapped_device *md) | |||
2323 | /* | 2459 | /* |
2324 | * Suspend mechanism in request-based dm. | 2460 | * Suspend mechanism in request-based dm. |
2325 | * | 2461 | * |
2326 | * After the suspend starts, further incoming requests are kept in | 2462 | * 1. Flush all I/Os by lock_fs() if needed. |
2327 | * the request_queue and deferred. | 2463 | * 2. Stop dispatching any I/O by stopping the request_queue. |
2328 | * Remaining requests in the request_queue at the start of suspend are flushed | 2464 | * 3. Wait for all in-flight I/Os to be completed or requeued. |
2329 | * if it is flush suspend. | ||
2330 | * The suspend completes when the following conditions have been satisfied, | ||
2331 | * so wait for it: | ||
2332 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
2333 | * 2. queue has been stopped (which means no request dispatching) | ||
2334 | * | 2465 | * |
2335 | * | 2466 | * To abort suspend, start the request_queue. |
2336 | * Noflush suspend | ||
2337 | * --------------- | ||
2338 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
2339 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
2340 | * to be completed or requeued. | ||
2341 | * | ||
2342 | * To abort noflush suspend, start the queue. | ||
2343 | * | ||
2344 | * | ||
2345 | * Flush suspend | ||
2346 | * ------------- | ||
2347 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
2348 | * after the remaining requests are completed. (Requeued request must be also | ||
2349 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
2350 | * | ||
2351 | * During flushing the remaining requests, further incoming requests are also | ||
2352 | * inserted to the same queue. To distinguish which requests are to be | ||
2353 | * flushed, we insert a marker request to the queue at the time of starting | ||
2354 | * flush suspend, like a barrier. | ||
2355 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
2356 | * And the queue is stopped when all in_flight requests are completed, since | ||
2357 | * that means the remaining requests are completely flushed. | ||
2358 | * Then, the marker is removed from the queue. | ||
2359 | * | ||
2360 | * To abort flush suspend, we also need to take care of the marker, not only | ||
2361 | * starting the queue. | ||
2362 | * We don't remove the marker forcibly from the queue since it's against | ||
2363 | * the block-layer manner. Instead, we put a invalidated mark on the marker. | ||
2364 | * When the invalidated marker is found on the top of the queue, it is | ||
2365 | * immediately removed from the queue, so it doesn't block dispatching. | ||
2366 | * Because we have only one marker per mapped_device, we can't start another | ||
2367 | * flush suspend until the invalidated marker is removed from the queue. | ||
2368 | * So fail and return with -EBUSY in such a case. | ||
2369 | */ | 2467 | */ |
2370 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2468 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
2371 | { | 2469 | { |
@@ -2376,17 +2474,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2376 | 2474 | ||
2377 | mutex_lock(&md->suspend_lock); | 2475 | mutex_lock(&md->suspend_lock); |
2378 | 2476 | ||
2379 | if (dm_suspended(md)) { | 2477 | if (dm_suspended_md(md)) { |
2380 | r = -EINVAL; | 2478 | r = -EINVAL; |
2381 | goto out_unlock; | 2479 | goto out_unlock; |
2382 | } | 2480 | } |
2383 | 2481 | ||
2384 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | 2482 | map = dm_get_live_table(md); |
2385 | r = -EBUSY; | ||
2386 | goto out_unlock; | ||
2387 | } | ||
2388 | |||
2389 | map = dm_get_table(md); | ||
2390 | 2483 | ||
2391 | /* | 2484 | /* |
2392 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. | 2485 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. |
@@ -2399,8 +2492,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2399 | dm_table_presuspend_targets(map); | 2492 | dm_table_presuspend_targets(map); |
2400 | 2493 | ||
2401 | /* | 2494 | /* |
2402 | * Flush I/O to the device. noflush supersedes do_lockfs, | 2495 | * Flush I/O to the device. |
2403 | * because lock_fs() needs to flush I/Os. | 2496 | * Any I/O submitted after lock_fs() may not be flushed. |
2497 | * noflush takes precedence over do_lockfs. | ||
2498 | * (lock_fs() flushes I/Os and waits for them to complete.) | ||
2404 | */ | 2499 | */ |
2405 | if (!noflush && do_lockfs) { | 2500 | if (!noflush && do_lockfs) { |
2406 | r = lock_fs(md); | 2501 | r = lock_fs(md); |
@@ -2429,10 +2524,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2429 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | 2524 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); |
2430 | up_write(&md->io_lock); | 2525 | up_write(&md->io_lock); |
2431 | 2526 | ||
2432 | flush_workqueue(md->wq); | 2527 | /* |
2433 | 2528 | * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which | |
2529 | * can be kicked until md->queue is stopped. So stop md->queue before | ||
2530 | * flushing md->wq. | ||
2531 | */ | ||
2434 | if (dm_request_based(md)) | 2532 | if (dm_request_based(md)) |
2435 | dm_rq_start_suspend(md, noflush); | 2533 | stop_queue(md->queue); |
2534 | |||
2535 | flush_workqueue(md->wq); | ||
2436 | 2536 | ||
2437 | /* | 2537 | /* |
2438 | * At this point no more requests are entering target request routines. | 2538 | * At this point no more requests are entering target request routines. |
@@ -2451,7 +2551,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2451 | dm_queue_flush(md); | 2551 | dm_queue_flush(md); |
2452 | 2552 | ||
2453 | if (dm_request_based(md)) | 2553 | if (dm_request_based(md)) |
2454 | dm_rq_abort_suspend(md, noflush); | 2554 | start_queue(md->queue); |
2455 | 2555 | ||
2456 | unlock_fs(md); | 2556 | unlock_fs(md); |
2457 | goto out; /* pushback list is already flushed, so skip flush */ | 2557 | goto out; /* pushback list is already flushed, so skip flush */ |
@@ -2463,10 +2563,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2463 | * requests are being added to md->deferred list. | 2563 | * requests are being added to md->deferred list. |
2464 | */ | 2564 | */ |
2465 | 2565 | ||
2466 | dm_table_postsuspend_targets(map); | ||
2467 | |||
2468 | set_bit(DMF_SUSPENDED, &md->flags); | 2566 | set_bit(DMF_SUSPENDED, &md->flags); |
2469 | 2567 | ||
2568 | dm_table_postsuspend_targets(map); | ||
2569 | |||
2470 | out: | 2570 | out: |
2471 | dm_table_put(map); | 2571 | dm_table_put(map); |
2472 | 2572 | ||
@@ -2481,10 +2581,10 @@ int dm_resume(struct mapped_device *md) | |||
2481 | struct dm_table *map = NULL; | 2581 | struct dm_table *map = NULL; |
2482 | 2582 | ||
2483 | mutex_lock(&md->suspend_lock); | 2583 | mutex_lock(&md->suspend_lock); |
2484 | if (!dm_suspended(md)) | 2584 | if (!dm_suspended_md(md)) |
2485 | goto out; | 2585 | goto out; |
2486 | 2586 | ||
2487 | map = dm_get_table(md); | 2587 | map = dm_get_live_table(md); |
2488 | if (!map || !dm_table_get_size(map)) | 2588 | if (!map || !dm_table_get_size(map)) |
2489 | goto out; | 2589 | goto out; |
2490 | 2590 | ||
@@ -2518,18 +2618,19 @@ out: | |||
2518 | /*----------------------------------------------------------------- | 2618 | /*----------------------------------------------------------------- |
2519 | * Event notification. | 2619 | * Event notification. |
2520 | *---------------------------------------------------------------*/ | 2620 | *---------------------------------------------------------------*/ |
2521 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 2621 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
2522 | unsigned cookie) | 2622 | unsigned cookie) |
2523 | { | 2623 | { |
2524 | char udev_cookie[DM_COOKIE_LENGTH]; | 2624 | char udev_cookie[DM_COOKIE_LENGTH]; |
2525 | char *envp[] = { udev_cookie, NULL }; | 2625 | char *envp[] = { udev_cookie, NULL }; |
2526 | 2626 | ||
2527 | if (!cookie) | 2627 | if (!cookie) |
2528 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | 2628 | return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); |
2529 | else { | 2629 | else { |
2530 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | 2630 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", |
2531 | DM_COOKIE_ENV_VAR_NAME, cookie); | 2631 | DM_COOKIE_ENV_VAR_NAME, cookie); |
2532 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | 2632 | return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, |
2633 | action, envp); | ||
2533 | } | 2634 | } |
2534 | } | 2635 | } |
2535 | 2636 | ||
@@ -2585,26 +2686,27 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
2585 | return NULL; | 2686 | return NULL; |
2586 | 2687 | ||
2587 | if (test_bit(DMF_FREEING, &md->flags) || | 2688 | if (test_bit(DMF_FREEING, &md->flags) || |
2588 | test_bit(DMF_DELETING, &md->flags)) | 2689 | dm_deleting_md(md)) |
2589 | return NULL; | 2690 | return NULL; |
2590 | 2691 | ||
2591 | dm_get(md); | 2692 | dm_get(md); |
2592 | return md; | 2693 | return md; |
2593 | } | 2694 | } |
2594 | 2695 | ||
2595 | int dm_suspended(struct mapped_device *md) | 2696 | int dm_suspended_md(struct mapped_device *md) |
2596 | { | 2697 | { |
2597 | return test_bit(DMF_SUSPENDED, &md->flags); | 2698 | return test_bit(DMF_SUSPENDED, &md->flags); |
2598 | } | 2699 | } |
2599 | 2700 | ||
2600 | int dm_noflush_suspending(struct dm_target *ti) | 2701 | int dm_suspended(struct dm_target *ti) |
2601 | { | 2702 | { |
2602 | struct mapped_device *md = dm_table_get_md(ti->table); | 2703 | return dm_suspended_md(dm_table_get_md(ti->table)); |
2603 | int r = __noflush_suspending(md); | 2704 | } |
2604 | 2705 | EXPORT_SYMBOL_GPL(dm_suspended); | |
2605 | dm_put(md); | ||
2606 | 2706 | ||
2607 | return r; | 2707 | int dm_noflush_suspending(struct dm_target *ti) |
2708 | { | ||
2709 | return __noflush_suspending(dm_table_get_md(ti->table)); | ||
2608 | } | 2710 | } |
2609 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2711 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
2610 | 2712 | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a7663eba17e2..bad1724d4869 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt, | |||
89 | int dm_split_args(int *argc, char ***argvp, char *input); | 89 | int dm_split_args(int *argc, char ***argvp, char *input); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Is this mapped_device being deleted? | ||
93 | */ | ||
94 | int dm_deleting_md(struct mapped_device *md); | ||
95 | |||
96 | /* | ||
97 | * Is this mapped_device suspended? | ||
98 | */ | ||
99 | int dm_suspended_md(struct mapped_device *md); | ||
100 | |||
101 | /* | ||
92 | * The device-mapper can be driven through one of two interfaces; | 102 | * The device-mapper can be driven through one of two interfaces; |
93 | * ioctl or filesystem, depending which patch you have applied. | 103 | * ioctl or filesystem, depending which patch you have applied. |
94 | */ | 104 | */ |
@@ -115,8 +125,11 @@ void dm_stripe_exit(void); | |||
115 | int dm_open_count(struct mapped_device *md); | 125 | int dm_open_count(struct mapped_device *md); |
116 | int dm_lock_for_deletion(struct mapped_device *md); | 126 | int dm_lock_for_deletion(struct mapped_device *md); |
117 | 127 | ||
118 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 128 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
119 | unsigned cookie); | 129 | unsigned cookie); |
130 | |||
131 | int dm_io_init(void); | ||
132 | void dm_io_exit(void); | ||
120 | 133 | ||
121 | int dm_kcopyd_init(void); | 134 | int dm_kcopyd_init(void); |
122 | void dm_kcopyd_exit(void); | 135 | void dm_kcopyd_exit(void); |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 87d88dbb667f..1a8987884614 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -64,6 +64,7 @@ | |||
64 | #define MaxFault 50 | 64 | #define MaxFault 50 |
65 | #include <linux/blkdev.h> | 65 | #include <linux/blkdev.h> |
66 | #include <linux/raid/md_u.h> | 66 | #include <linux/raid/md_u.h> |
67 | #include <linux/slab.h> | ||
67 | #include "md.h" | 68 | #include "md.h" |
68 | #include <linux/seq_file.h> | 69 | #include <linux/seq_file.h> |
69 | 70 | ||
@@ -168,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode) | |||
168 | conf->nfaults = n+1; | 169 | conf->nfaults = n+1; |
169 | } | 170 | } |
170 | 171 | ||
171 | static int make_request(struct request_queue *q, struct bio *bio) | 172 | static int make_request(mddev_t *mddev, struct bio *bio) |
172 | { | 173 | { |
173 | mddev_t *mddev = q->queuedata; | 174 | conf_t *conf = mddev->private; |
174 | conf_t *conf = (conf_t*)mddev->private; | ||
175 | int failit = 0; | 175 | int failit = 0; |
176 | 176 | ||
177 | if (bio_data_dir(bio) == WRITE) { | 177 | if (bio_data_dir(bio) == WRITE) { |
@@ -224,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio) | |||
224 | 224 | ||
225 | static void status(struct seq_file *seq, mddev_t *mddev) | 225 | static void status(struct seq_file *seq, mddev_t *mddev) |
226 | { | 226 | { |
227 | conf_t *conf = (conf_t*)mddev->private; | 227 | conf_t *conf = mddev->private; |
228 | int n; | 228 | int n; |
229 | 229 | ||
230 | if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) | 230 | if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) |
@@ -327,7 +327,7 @@ static int run(mddev_t *mddev) | |||
327 | 327 | ||
328 | static int stop(mddev_t *mddev) | 328 | static int stop(mddev_t *mddev) |
329 | { | 329 | { |
330 | conf_t *conf = (conf_t *)mddev->private; | 330 | conf_t *conf = mddev->private; |
331 | 331 | ||
332 | kfree(conf); | 332 | kfree(conf); |
333 | mddev->private = NULL; | 333 | mddev->private = NULL; |
@@ -360,6 +360,7 @@ static void raid_exit(void) | |||
360 | module_init(raid_init); | 360 | module_init(raid_init); |
361 | module_exit(raid_exit); | 361 | module_exit(raid_exit); |
362 | MODULE_LICENSE("GPL"); | 362 | MODULE_LICENSE("GPL"); |
363 | MODULE_DESCRIPTION("Fault injection personality for MD"); | ||
363 | MODULE_ALIAS("md-personality-10"); /* faulty */ | 364 | MODULE_ALIAS("md-personality-10"); /* faulty */ |
364 | MODULE_ALIAS("md-faulty"); | 365 | MODULE_ALIAS("md-faulty"); |
365 | MODULE_ALIAS("md-level--5"); | 366 | MODULE_ALIAS("md-level--5"); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1ceceb334d5e..7e0e057db9a7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
20 | #include <linux/raid/md_u.h> | 20 | #include <linux/raid/md_u.h> |
21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
22 | #include <linux/slab.h> | ||
22 | #include "md.h" | 23 | #include "md.h" |
23 | #include "linear.h" | 24 | #include "linear.h" |
24 | 25 | ||
@@ -158,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
158 | sector_t sectors; | 159 | sector_t sectors; |
159 | 160 | ||
160 | if (j < 0 || j >= raid_disks || disk->rdev) { | 161 | if (j < 0 || j >= raid_disks || disk->rdev) { |
161 | printk("linear: disk numbering problem. Aborting!\n"); | 162 | printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", |
163 | mdname(mddev)); | ||
162 | goto out; | 164 | goto out; |
163 | } | 165 | } |
164 | 166 | ||
@@ -172,19 +174,22 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
172 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 174 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
173 | rdev->data_offset << 9); | 175 | rdev->data_offset << 9); |
174 | /* as we don't honour merge_bvec_fn, we must never risk | 176 | /* as we don't honour merge_bvec_fn, we must never risk |
175 | * violating it, so limit ->max_sector to one PAGE, as | 177 | * violating it, so limit max_segments to 1 lying within |
176 | * a one page request is never in violation. | 178 | * a single page. |
177 | */ | 179 | */ |
178 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 180 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
179 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 181 | blk_queue_max_segments(mddev->queue, 1); |
180 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 182 | blk_queue_segment_boundary(mddev->queue, |
183 | PAGE_CACHE_SIZE - 1); | ||
184 | } | ||
181 | 185 | ||
182 | conf->array_sectors += rdev->sectors; | 186 | conf->array_sectors += rdev->sectors; |
183 | cnt++; | 187 | cnt++; |
184 | 188 | ||
185 | } | 189 | } |
186 | if (cnt != raid_disks) { | 190 | if (cnt != raid_disks) { |
187 | printk("linear: not enough drives present. Aborting!\n"); | 191 | printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", |
192 | mdname(mddev)); | ||
188 | goto out; | 193 | goto out; |
189 | } | 194 | } |
190 | 195 | ||
@@ -279,29 +284,21 @@ static int linear_stop (mddev_t *mddev) | |||
279 | rcu_barrier(); | 284 | rcu_barrier(); |
280 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 285 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
281 | kfree(conf); | 286 | kfree(conf); |
287 | mddev->private = NULL; | ||
282 | 288 | ||
283 | return 0; | 289 | return 0; |
284 | } | 290 | } |
285 | 291 | ||
286 | static int linear_make_request (struct request_queue *q, struct bio *bio) | 292 | static int linear_make_request (mddev_t *mddev, struct bio *bio) |
287 | { | 293 | { |
288 | const int rw = bio_data_dir(bio); | ||
289 | mddev_t *mddev = q->queuedata; | ||
290 | dev_info_t *tmp_dev; | 294 | dev_info_t *tmp_dev; |
291 | sector_t start_sector; | 295 | sector_t start_sector; |
292 | int cpu; | ||
293 | 296 | ||
294 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 297 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
295 | bio_endio(bio, -EOPNOTSUPP); | 298 | md_barrier_request(mddev, bio); |
296 | return 0; | 299 | return 0; |
297 | } | 300 | } |
298 | 301 | ||
299 | cpu = part_stat_lock(); | ||
300 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
301 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
302 | bio_sectors(bio)); | ||
303 | part_stat_unlock(); | ||
304 | |||
305 | rcu_read_lock(); | 302 | rcu_read_lock(); |
306 | tmp_dev = which_dev(mddev, bio->bi_sector); | 303 | tmp_dev = which_dev(mddev, bio->bi_sector); |
307 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | 304 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; |
@@ -311,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
311 | || (bio->bi_sector < start_sector))) { | 308 | || (bio->bi_sector < start_sector))) { |
312 | char b[BDEVNAME_SIZE]; | 309 | char b[BDEVNAME_SIZE]; |
313 | 310 | ||
314 | printk("linear_make_request: Sector %llu out of bounds on " | 311 | printk(KERN_ERR |
315 | "dev %s: %llu sectors, offset %llu\n", | 312 | "md/linear:%s: make_request: Sector %llu out of bounds on " |
316 | (unsigned long long)bio->bi_sector, | 313 | "dev %s: %llu sectors, offset %llu\n", |
317 | bdevname(tmp_dev->rdev->bdev, b), | 314 | mdname(mddev), |
318 | (unsigned long long)tmp_dev->rdev->sectors, | 315 | (unsigned long long)bio->bi_sector, |
319 | (unsigned long long)start_sector); | 316 | bdevname(tmp_dev->rdev->bdev, b), |
317 | (unsigned long long)tmp_dev->rdev->sectors, | ||
318 | (unsigned long long)start_sector); | ||
320 | rcu_read_unlock(); | 319 | rcu_read_unlock(); |
321 | bio_io_error(bio); | 320 | bio_io_error(bio); |
322 | return 0; | 321 | return 0; |
@@ -333,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
333 | 332 | ||
334 | bp = bio_split(bio, end_sector - bio->bi_sector); | 333 | bp = bio_split(bio, end_sector - bio->bi_sector); |
335 | 334 | ||
336 | if (linear_make_request(q, &bp->bio1)) | 335 | if (linear_make_request(mddev, &bp->bio1)) |
337 | generic_make_request(&bp->bio1); | 336 | generic_make_request(&bp->bio1); |
338 | if (linear_make_request(q, &bp->bio2)) | 337 | if (linear_make_request(mddev, &bp->bio2)) |
339 | generic_make_request(&bp->bio2); | 338 | generic_make_request(&bp->bio2); |
340 | bio_pair_release(bp); | 339 | bio_pair_release(bp); |
341 | return 0; | 340 | return 0; |
@@ -383,6 +382,7 @@ static void linear_exit (void) | |||
383 | module_init(linear_init); | 382 | module_init(linear_init); |
384 | module_exit(linear_exit); | 383 | module_exit(linear_exit); |
385 | MODULE_LICENSE("GPL"); | 384 | MODULE_LICENSE("GPL"); |
385 | MODULE_DESCRIPTION("Linear device concatenation personality for MD"); | ||
386 | MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ | 386 | MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ |
387 | MODULE_ALIAS("md-linear"); | 387 | MODULE_ALIAS("md-linear"); |
388 | MODULE_ALIAS("md-level--1"); | 388 | MODULE_ALIAS("md-level--1"); |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 26ba42a79129..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -39,14 +39,17 @@ | |||
39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
40 | #include <linux/poll.h> | 40 | #include <linux/poll.h> |
41 | #include <linux/ctype.h> | 41 | #include <linux/ctype.h> |
42 | #include <linux/string.h> | ||
42 | #include <linux/hdreg.h> | 43 | #include <linux/hdreg.h> |
43 | #include <linux/proc_fs.h> | 44 | #include <linux/proc_fs.h> |
44 | #include <linux/random.h> | 45 | #include <linux/random.h> |
45 | #include <linux/reboot.h> | 46 | #include <linux/reboot.h> |
46 | #include <linux/file.h> | 47 | #include <linux/file.h> |
48 | #include <linux/compat.h> | ||
47 | #include <linux/delay.h> | 49 | #include <linux/delay.h> |
48 | #include <linux/raid/md_p.h> | 50 | #include <linux/raid/md_p.h> |
49 | #include <linux/raid/md_u.h> | 51 | #include <linux/raid/md_u.h> |
52 | #include <linux/slab.h> | ||
50 | #include "md.h" | 53 | #include "md.h" |
51 | #include "bitmap.h" | 54 | #include "bitmap.h" |
52 | 55 | ||
@@ -68,6 +71,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | |||
68 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 71 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
69 | 72 | ||
70 | /* | 73 | /* |
74 | * Default number of read corrections we'll attempt on an rdev | ||
75 | * before ejecting it from the array. We divide the read error | ||
76 | * count by 2 for every hour elapsed between read errors. | ||
77 | */ | ||
78 | #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 | ||
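The halving rule described above is applied by the RAID10 read-error handling added later in this series; a minimal sketch of the decay step (the helper name below is illustrative, not a function in this patch):

static unsigned int decay_read_errors(unsigned int errors, unsigned int hours_idle)
{
	/* e.g. 12 recorded errors followed by two quiet hours decay to
	 * 12 >> 2 = 3 before the new error is counted */
	if (hours_idle >= 32)		/* everything shifted away */
		return 0;
	return errors >> hours_idle;
}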
79 | /* | ||
71 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' | 80 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
72 | * is 1000 KB/sec, so the extra system load does not show up that much. | 81 | * is 1000 KB/sec, so the extra system load does not show up that much. |
73 | * Increase it if you want to have more _guaranteed_ speed. Note that | 82 | * Increase it if you want to have more _guaranteed_ speed. Note that |
@@ -98,44 +107,40 @@ static struct ctl_table_header *raid_table_header; | |||
98 | 107 | ||
99 | static ctl_table raid_table[] = { | 108 | static ctl_table raid_table[] = { |
100 | { | 109 | { |
101 | .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, | ||
102 | .procname = "speed_limit_min", | 110 | .procname = "speed_limit_min", |
103 | .data = &sysctl_speed_limit_min, | 111 | .data = &sysctl_speed_limit_min, |
104 | .maxlen = sizeof(int), | 112 | .maxlen = sizeof(int), |
105 | .mode = S_IRUGO|S_IWUSR, | 113 | .mode = S_IRUGO|S_IWUSR, |
106 | .proc_handler = &proc_dointvec, | 114 | .proc_handler = proc_dointvec, |
107 | }, | 115 | }, |
108 | { | 116 | { |
109 | .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, | ||
110 | .procname = "speed_limit_max", | 117 | .procname = "speed_limit_max", |
111 | .data = &sysctl_speed_limit_max, | 118 | .data = &sysctl_speed_limit_max, |
112 | .maxlen = sizeof(int), | 119 | .maxlen = sizeof(int), |
113 | .mode = S_IRUGO|S_IWUSR, | 120 | .mode = S_IRUGO|S_IWUSR, |
114 | .proc_handler = &proc_dointvec, | 121 | .proc_handler = proc_dointvec, |
115 | }, | 122 | }, |
116 | { .ctl_name = 0 } | 123 | { } |
117 | }; | 124 | }; |
118 | 125 | ||
119 | static ctl_table raid_dir_table[] = { | 126 | static ctl_table raid_dir_table[] = { |
120 | { | 127 | { |
121 | .ctl_name = DEV_RAID, | ||
122 | .procname = "raid", | 128 | .procname = "raid", |
123 | .maxlen = 0, | 129 | .maxlen = 0, |
124 | .mode = S_IRUGO|S_IXUGO, | 130 | .mode = S_IRUGO|S_IXUGO, |
125 | .child = raid_table, | 131 | .child = raid_table, |
126 | }, | 132 | }, |
127 | { .ctl_name = 0 } | 133 | { } |
128 | }; | 134 | }; |
129 | 135 | ||
130 | static ctl_table raid_root_table[] = { | 136 | static ctl_table raid_root_table[] = { |
131 | { | 137 | { |
132 | .ctl_name = CTL_DEV, | ||
133 | .procname = "dev", | 138 | .procname = "dev", |
134 | .maxlen = 0, | 139 | .maxlen = 0, |
135 | .mode = 0555, | 140 | .mode = 0555, |
136 | .child = raid_dir_table, | 141 | .child = raid_dir_table, |
137 | }, | 142 | }, |
138 | { .ctl_name = 0 } | 143 | { } |
139 | }; | 144 | }; |
140 | 145 | ||
141 | static const struct block_device_operations md_fops; | 146 | static const struct block_device_operations md_fops; |
@@ -210,19 +215,22 @@ static DEFINE_SPINLOCK(all_mddevs_lock); | |||
210 | */ | 215 | */ |
211 | static int md_make_request(struct request_queue *q, struct bio *bio) | 216 | static int md_make_request(struct request_queue *q, struct bio *bio) |
212 | { | 217 | { |
218 | const int rw = bio_data_dir(bio); | ||
213 | mddev_t *mddev = q->queuedata; | 219 | mddev_t *mddev = q->queuedata; |
214 | int rv; | 220 | int rv; |
221 | int cpu; | ||
222 | |||
215 | if (mddev == NULL || mddev->pers == NULL) { | 223 | if (mddev == NULL || mddev->pers == NULL) { |
216 | bio_io_error(bio); | 224 | bio_io_error(bio); |
217 | return 0; | 225 | return 0; |
218 | } | 226 | } |
219 | rcu_read_lock(); | 227 | rcu_read_lock(); |
220 | if (mddev->suspended) { | 228 | if (mddev->suspended || mddev->barrier) { |
221 | DEFINE_WAIT(__wait); | 229 | DEFINE_WAIT(__wait); |
222 | for (;;) { | 230 | for (;;) { |
223 | prepare_to_wait(&mddev->sb_wait, &__wait, | 231 | prepare_to_wait(&mddev->sb_wait, &__wait, |
224 | TASK_UNINTERRUPTIBLE); | 232 | TASK_UNINTERRUPTIBLE); |
225 | if (!mddev->suspended) | 233 | if (!mddev->suspended && !mddev->barrier) |
226 | break; | 234 | break; |
227 | rcu_read_unlock(); | 235 | rcu_read_unlock(); |
228 | schedule(); | 236 | schedule(); |
@@ -232,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
232 | } | 240 | } |
233 | atomic_inc(&mddev->active_io); | 241 | atomic_inc(&mddev->active_io); |
234 | rcu_read_unlock(); | 242 | rcu_read_unlock(); |
235 | rv = mddev->pers->make_request(q, bio); | 243 | |
244 | rv = mddev->pers->make_request(mddev, bio); | ||
245 | |||
246 | cpu = part_stat_lock(); | ||
247 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
248 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
249 | bio_sectors(bio)); | ||
250 | part_stat_unlock(); | ||
251 | |||
236 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) | 252 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) |
237 | wake_up(&mddev->sb_wait); | 253 | wake_up(&mddev->sb_wait); |
238 | 254 | ||
239 | return rv; | 255 | return rv; |
240 | } | 256 | } |
241 | 257 | ||
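For orientation: the personality ->make_request hook changes shape in this series, and the per-partition I/O accounting every personality used to duplicate now happens once in md_make_request() above. A sketch of the before/after signatures, as reflected in the faulty.c and linear.c hunks further down:

/* before: each personality fished the mddev out of the queue and did
 * its own part_stat accounting */
static int make_request(struct request_queue *q, struct bio *bio);

/* after: md_make_request() resolves the mddev, accounts the I/O, and
 * hands the bio straight to the personality */
static int make_request(mddev_t *mddev, struct bio *bio);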
258 | /* mddev_suspend makes sure no new requests are submitted | ||
259 | * to the device, and that any requests that have been submitted | ||
260 | * are completely handled. | ||
261 | * Once ->stop is called and completes, the module will be completely | ||
262 | * unused. | ||
263 | */ | ||
242 | static void mddev_suspend(mddev_t *mddev) | 264 | static void mddev_suspend(mddev_t *mddev) |
243 | { | 265 | { |
244 | BUG_ON(mddev->suspended); | 266 | BUG_ON(mddev->suspended); |
@@ -246,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev) | |||
246 | synchronize_rcu(); | 268 | synchronize_rcu(); |
247 | wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); | 269 | wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); |
248 | mddev->pers->quiesce(mddev, 1); | 270 | mddev->pers->quiesce(mddev, 1); |
249 | md_unregister_thread(mddev->thread); | ||
250 | mddev->thread = NULL; | ||
251 | /* we now know that no code is executing in the personality module, | ||
252 | * except possibly the tail end of a ->bi_end_io function, but that | ||
253 | * is certain to complete before the module has a chance to get | ||
254 | * unloaded | ||
255 | */ | ||
256 | } | 271 | } |
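Callers pair this with mddev_resume(); level_store() further down uses the pattern when swapping personalities. A condensed sketch (the matching mddev_resume() call lies outside the hunks shown here, so treat its placement as an assumption about the surrounding code):

mddev_suspend(mddev);		/* block new I/O and drain what's in flight */
mddev->pers->stop(mddev);	/* old personality is now completely idle */
/* ... install the new personality and its private data ... */
mddev_resume(mddev);		/* let I/O flow again */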
257 | 272 | ||
258 | static void mddev_resume(mddev_t *mddev) | 273 | static void mddev_resume(mddev_t *mddev) |
@@ -264,10 +279,110 @@ static void mddev_resume(mddev_t *mddev) | |||
264 | 279 | ||
265 | int mddev_congested(mddev_t *mddev, int bits) | 280 | int mddev_congested(mddev_t *mddev, int bits) |
266 | { | 281 | { |
282 | if (mddev->barrier) | ||
283 | return 1; | ||
267 | return mddev->suspended; | 284 | return mddev->suspended; |
268 | } | 285 | } |
269 | EXPORT_SYMBOL(mddev_congested); | 286 | EXPORT_SYMBOL(mddev_congested); |
270 | 287 | ||
288 | /* | ||
289 | * Generic barrier handling for md | ||
290 | */ | ||
291 | |||
292 | #define POST_REQUEST_BARRIER ((void*)1) | ||
293 | |||
294 | static void md_end_barrier(struct bio *bio, int err) | ||
295 | { | ||
296 | mdk_rdev_t *rdev = bio->bi_private; | ||
297 | mddev_t *mddev = rdev->mddev; | ||
298 | if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) | ||
299 | set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); | ||
300 | |||
301 | rdev_dec_pending(rdev, mddev); | ||
302 | |||
303 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
304 | if (mddev->barrier == POST_REQUEST_BARRIER) { | ||
305 | /* This was a post-request barrier */ | ||
306 | mddev->barrier = NULL; | ||
307 | wake_up(&mddev->sb_wait); | ||
308 | } else | ||
309 | /* The pre-request barrier has finished */ | ||
310 | schedule_work(&mddev->barrier_work); | ||
311 | } | ||
312 | bio_put(bio); | ||
313 | } | ||
314 | |||
315 | static void submit_barriers(mddev_t *mddev) | ||
316 | { | ||
317 | mdk_rdev_t *rdev; | ||
318 | |||
319 | rcu_read_lock(); | ||
320 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
321 | if (rdev->raid_disk >= 0 && | ||
322 | !test_bit(Faulty, &rdev->flags)) { | ||
323 | /* Take two references, one is dropped | ||
324 | * when request finishes, one after | ||
325 | * we reclaim rcu_read_lock | ||
326 | */ | ||
327 | struct bio *bi; | ||
328 | atomic_inc(&rdev->nr_pending); | ||
329 | atomic_inc(&rdev->nr_pending); | ||
330 | rcu_read_unlock(); | ||
331 | bi = bio_alloc(GFP_KERNEL, 0); | ||
332 | bi->bi_end_io = md_end_barrier; | ||
333 | bi->bi_private = rdev; | ||
334 | bi->bi_bdev = rdev->bdev; | ||
335 | atomic_inc(&mddev->flush_pending); | ||
336 | submit_bio(WRITE_BARRIER, bi); | ||
337 | rcu_read_lock(); | ||
338 | rdev_dec_pending(rdev, mddev); | ||
339 | } | ||
340 | rcu_read_unlock(); | ||
341 | } | ||
342 | |||
343 | static void md_submit_barrier(struct work_struct *ws) | ||
344 | { | ||
345 | mddev_t *mddev = container_of(ws, mddev_t, barrier_work); | ||
346 | struct bio *bio = mddev->barrier; | ||
347 | |||
348 | atomic_set(&mddev->flush_pending, 1); | ||
349 | |||
350 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
351 | bio_endio(bio, -EOPNOTSUPP); | ||
352 | else if (bio->bi_size == 0) | ||
353 | /* an empty barrier - all done */ | ||
354 | bio_endio(bio, 0); | ||
355 | else { | ||
356 | bio->bi_rw &= ~(1<<BIO_RW_BARRIER); | ||
357 | if (mddev->pers->make_request(mddev, bio)) | ||
358 | generic_make_request(bio); | ||
359 | mddev->barrier = POST_REQUEST_BARRIER; | ||
360 | submit_barriers(mddev); | ||
361 | } | ||
362 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
363 | mddev->barrier = NULL; | ||
364 | wake_up(&mddev->sb_wait); | ||
365 | } | ||
366 | } | ||
367 | |||
368 | void md_barrier_request(mddev_t *mddev, struct bio *bio) | ||
369 | { | ||
370 | spin_lock_irq(&mddev->write_lock); | ||
371 | wait_event_lock_irq(mddev->sb_wait, | ||
372 | !mddev->barrier, | ||
373 | mddev->write_lock, /*nothing*/); | ||
374 | mddev->barrier = bio; | ||
375 | spin_unlock_irq(&mddev->write_lock); | ||
376 | |||
377 | atomic_set(&mddev->flush_pending, 1); | ||
378 | INIT_WORK(&mddev->barrier_work, md_submit_barrier); | ||
379 | |||
380 | submit_barriers(mddev); | ||
381 | |||
382 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
383 | schedule_work(&mddev->barrier_work); | ||
384 | } | ||
385 | EXPORT_SYMBOL(md_barrier_request); | ||
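Taken together, the helpers above implement a pre-flush / payload / post-flush sequence. A rough sketch of the flow (commentary on the code above, not code from the patch):

/*
 * md_barrier_request(mddev, bio)
 *	record bio in mddev->barrier and send zero-length WRITE_BARRIER
 *	bios to every active rdev (the "pre" flush)
 * md_end_barrier()
 *	when the last pre-flush completes, schedule md_submit_barrier()
 * md_submit_barrier()
 *	clear BIO_RW_BARRIER on the original bio, pass it to the
 *	personality, set mddev->barrier = POST_REQUEST_BARRIER and send
 *	a second round of zero-length barriers (the "post" flush)
 * md_end_barrier()
 *	when the post flush drains, clear mddev->barrier and wake
 *	waiters on sb_wait so md_make_request() can admit new I/O
 */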
271 | 386 | ||
272 | static inline mddev_t *mddev_get(mddev_t *mddev) | 387 | static inline mddev_t *mddev_get(mddev_t *mddev) |
273 | { | 388 | { |
@@ -282,7 +397,9 @@ static void mddev_put(mddev_t *mddev) | |||
282 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | 397 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
283 | return; | 398 | return; |
284 | if (!mddev->raid_disks && list_empty(&mddev->disks) && | 399 | if (!mddev->raid_disks && list_empty(&mddev->disks) && |
285 | !mddev->hold_active) { | 400 | mddev->ctime == 0 && !mddev->hold_active) { |
401 | /* Array is not configured at all, and not held active, | ||
402 | * so destroy it */ | ||
286 | list_del(&mddev->all_mddevs); | 403 | list_del(&mddev->all_mddevs); |
287 | if (mddev->gendisk) { | 404 | if (mddev->gendisk) { |
288 | /* we did a probe so need to clean up. | 405 | /* we did a probe so need to clean up. |
@@ -299,6 +416,27 @@ static void mddev_put(mddev_t *mddev) | |||
299 | spin_unlock(&all_mddevs_lock); | 416 | spin_unlock(&all_mddevs_lock); |
300 | } | 417 | } |
301 | 418 | ||
419 | static void mddev_init(mddev_t *mddev) | ||
420 | { | ||
421 | mutex_init(&mddev->open_mutex); | ||
422 | mutex_init(&mddev->reconfig_mutex); | ||
423 | mutex_init(&mddev->bitmap_info.mutex); | ||
424 | INIT_LIST_HEAD(&mddev->disks); | ||
425 | INIT_LIST_HEAD(&mddev->all_mddevs); | ||
426 | init_timer(&mddev->safemode_timer); | ||
427 | atomic_set(&mddev->active, 1); | ||
428 | atomic_set(&mddev->openers, 0); | ||
429 | atomic_set(&mddev->active_io, 0); | ||
430 | spin_lock_init(&mddev->write_lock); | ||
431 | atomic_set(&mddev->flush_pending, 0); | ||
432 | init_waitqueue_head(&mddev->sb_wait); | ||
433 | init_waitqueue_head(&mddev->recovery_wait); | ||
434 | mddev->reshape_position = MaxSector; | ||
435 | mddev->resync_min = 0; | ||
436 | mddev->resync_max = MaxSector; | ||
437 | mddev->level = LEVEL_NONE; | ||
438 | } | ||
439 | |||
302 | static mddev_t * mddev_find(dev_t unit) | 440 | static mddev_t * mddev_find(dev_t unit) |
303 | { | 441 | { |
304 | mddev_t *mddev, *new = NULL; | 442 | mddev_t *mddev, *new = NULL; |
@@ -365,21 +503,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
365 | else | 503 | else |
366 | new->md_minor = MINOR(unit) >> MdpMinorShift; | 504 | new->md_minor = MINOR(unit) >> MdpMinorShift; |
367 | 505 | ||
368 | mutex_init(&new->open_mutex); | 506 | mddev_init(new); |
369 | mutex_init(&new->reconfig_mutex); | ||
370 | INIT_LIST_HEAD(&new->disks); | ||
371 | INIT_LIST_HEAD(&new->all_mddevs); | ||
372 | init_timer(&new->safemode_timer); | ||
373 | atomic_set(&new->active, 1); | ||
374 | atomic_set(&new->openers, 0); | ||
375 | atomic_set(&new->active_io, 0); | ||
376 | spin_lock_init(&new->write_lock); | ||
377 | init_waitqueue_head(&new->sb_wait); | ||
378 | init_waitqueue_head(&new->recovery_wait); | ||
379 | new->reshape_position = MaxSector; | ||
380 | new->resync_min = 0; | ||
381 | new->resync_max = MaxSector; | ||
382 | new->level = LEVEL_NONE; | ||
383 | 507 | ||
384 | goto retry; | 508 | goto retry; |
385 | } | 509 | } |
@@ -399,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev) | |||
399 | return mutex_trylock(&mddev->reconfig_mutex); | 523 | return mutex_trylock(&mddev->reconfig_mutex); |
400 | } | 524 | } |
401 | 525 | ||
402 | static inline void mddev_unlock(mddev_t * mddev) | 526 | static struct attribute_group md_redundancy_group; |
527 | |||
528 | static void mddev_unlock(mddev_t * mddev) | ||
403 | { | 529 | { |
404 | mutex_unlock(&mddev->reconfig_mutex); | 530 | if (mddev->to_remove) { |
531 | /* These cannot be removed under reconfig_mutex as | ||
532 | * an access to the files will try to take reconfig_mutex | ||
533 | * while holding the file unremovable, which leads to | ||
534 | * a deadlock. | ||
535 | * So hold open_mutex instead - we are allowed to take | ||
536 | * it while holding reconfig_mutex, and md_run can | ||
537 | * use it to wait for the remove to complete. | ||
538 | */ | ||
539 | struct attribute_group *to_remove = mddev->to_remove; | ||
540 | mddev->to_remove = NULL; | ||
541 | mutex_lock(&mddev->open_mutex); | ||
542 | mutex_unlock(&mddev->reconfig_mutex); | ||
543 | |||
544 | if (to_remove != &md_redundancy_group) | ||
545 | sysfs_remove_group(&mddev->kobj, to_remove); | ||
546 | if (mddev->pers == NULL || | ||
547 | mddev->pers->sync_request == NULL) { | ||
548 | sysfs_remove_group(&mddev->kobj, &md_redundancy_group); | ||
549 | if (mddev->sysfs_action) | ||
550 | sysfs_put(mddev->sysfs_action); | ||
551 | mddev->sysfs_action = NULL; | ||
552 | } | ||
553 | mutex_unlock(&mddev->open_mutex); | ||
554 | } else | ||
555 | mutex_unlock(&mddev->reconfig_mutex); | ||
405 | 556 | ||
406 | md_wakeup_thread(mddev->thread); | 557 | md_wakeup_thread(mddev->thread); |
407 | } | 558 | } |
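The inversion the comment above guards against can be sketched as follows (illustrative only):

/*
 * thread A (old scheme, in mddev_unlock)	thread B (sysfs ->store)
 *	holds reconfig_mutex			active in an md attribute file
 *	sysfs_remove_group()			mddev_lock()
 *	  waits for B to leave the file		  waits for A's reconfig_mutex
 *
 * Deferring the removal via ->to_remove and performing it here under
 * open_mutex, after reconfig_mutex is dropped, breaks the cycle.
 */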
@@ -752,7 +903,7 @@ struct super_type { | |||
752 | */ | 903 | */ |
753 | int md_check_no_bitmap(mddev_t *mddev) | 904 | int md_check_no_bitmap(mddev_t *mddev) |
754 | { | 905 | { |
755 | if (!mddev->bitmap_file && !mddev->bitmap_offset) | 906 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
756 | return 0; | 907 | return 0; |
757 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | 908 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", |
758 | mdname(mddev), mddev->pers->name); | 909 | mdname(mddev), mddev->pers->name); |
@@ -880,8 +1031,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
880 | mddev->raid_disks = sb->raid_disks; | 1031 | mddev->raid_disks = sb->raid_disks; |
881 | mddev->dev_sectors = sb->size * 2; | 1032 | mddev->dev_sectors = sb->size * 2; |
882 | mddev->events = ev1; | 1033 | mddev->events = ev1; |
883 | mddev->bitmap_offset = 0; | 1034 | mddev->bitmap_info.offset = 0; |
884 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | 1035 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
885 | 1036 | ||
886 | if (mddev->minor_version >= 91) { | 1037 | if (mddev->minor_version >= 91) { |
887 | mddev->reshape_position = sb->reshape_position; | 1038 | mddev->reshape_position = sb->reshape_position; |
@@ -915,14 +1066,18 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
915 | mddev->max_disks = MD_SB_DISKS; | 1066 | mddev->max_disks = MD_SB_DISKS; |
916 | 1067 | ||
917 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && | 1068 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
918 | mddev->bitmap_file == NULL) | 1069 | mddev->bitmap_info.file == NULL) |
919 | mddev->bitmap_offset = mddev->default_bitmap_offset; | 1070 | mddev->bitmap_info.offset = |
1071 | mddev->bitmap_info.default_offset; | ||
920 | 1072 | ||
921 | } else if (mddev->pers == NULL) { | 1073 | } else if (mddev->pers == NULL) { |
922 | /* Insist on good event counter while assembling */ | 1074 | /* Insist on good event counter while assembling, except |
1075 | * for spares (which don't need an event count) */ | ||
923 | ++ev1; | 1076 | ++ev1; |
924 | if (ev1 < mddev->events) | 1077 | if (sb->disks[rdev->desc_nr].state & ( |
925 | return -EINVAL; | 1078 | (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
1079 | if (ev1 < mddev->events) | ||
1080 | return -EINVAL; | ||
926 | } else if (mddev->bitmap) { | 1081 | } else if (mddev->bitmap) { |
927 | /* if adding to array with a bitmap, then we can accept an | 1082 | /* if adding to array with a bitmap, then we can accept an |
928 | * older device ... but not too old. | 1083 | * older device ... but not too old. |
@@ -944,6 +1099,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
944 | desc->raid_disk < mddev->raid_disks */) { | 1099 | desc->raid_disk < mddev->raid_disks */) { |
945 | set_bit(In_sync, &rdev->flags); | 1100 | set_bit(In_sync, &rdev->flags); |
946 | rdev->raid_disk = desc->raid_disk; | 1101 | rdev->raid_disk = desc->raid_disk; |
1102 | } else if (desc->state & (1<<MD_DISK_ACTIVE)) { | ||
1103 | /* active but not in sync implies recovery up to | ||
1104 | * reshape position. We don't know exactly where | ||
1105 | * that is, so set to zero for now */ | ||
1106 | if (mddev->minor_version >= 91) { | ||
1107 | rdev->recovery_offset = 0; | ||
1108 | rdev->raid_disk = desc->raid_disk; | ||
1109 | } | ||
947 | } | 1110 | } |
948 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) | 1111 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
949 | set_bit(WriteMostly, &rdev->flags); | 1112 | set_bit(WriteMostly, &rdev->flags); |
@@ -1025,15 +1188,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1025 | sb->layout = mddev->layout; | 1188 | sb->layout = mddev->layout; |
1026 | sb->chunk_size = mddev->chunk_sectors << 9; | 1189 | sb->chunk_size = mddev->chunk_sectors << 9; |
1027 | 1190 | ||
1028 | if (mddev->bitmap && mddev->bitmap_file == NULL) | 1191 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
1029 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1192 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
1030 | 1193 | ||
1031 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | 1194 | sb->disks[0].state = (1<<MD_DISK_REMOVED); |
1032 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 1195 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
1033 | mdp_disk_t *d; | 1196 | mdp_disk_t *d; |
1034 | int desc_nr; | 1197 | int desc_nr; |
1035 | if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) | 1198 | int is_active = test_bit(In_sync, &rdev2->flags); |
1036 | && !test_bit(Faulty, &rdev2->flags)) | 1199 | |
1200 | if (rdev2->raid_disk >= 0 && | ||
1201 | sb->minor_version >= 91) | ||
1202 | /* we have nowhere to store the recovery_offset, | ||
1203 | * but if it is not below the reshape_position, | ||
1204 | * we can piggy-back on that. | ||
1205 | */ | ||
1206 | is_active = 1; | ||
1207 | if (rdev2->raid_disk < 0 || | ||
1208 | test_bit(Faulty, &rdev2->flags)) | ||
1209 | is_active = 0; | ||
1210 | if (is_active) | ||
1037 | desc_nr = rdev2->raid_disk; | 1211 | desc_nr = rdev2->raid_disk; |
1038 | else | 1212 | else |
1039 | desc_nr = next_spare++; | 1213 | desc_nr = next_spare++; |
@@ -1043,16 +1217,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1043 | d->number = rdev2->desc_nr; | 1217 | d->number = rdev2->desc_nr; |
1044 | d->major = MAJOR(rdev2->bdev->bd_dev); | 1218 | d->major = MAJOR(rdev2->bdev->bd_dev); |
1045 | d->minor = MINOR(rdev2->bdev->bd_dev); | 1219 | d->minor = MINOR(rdev2->bdev->bd_dev); |
1046 | if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) | 1220 | if (is_active) |
1047 | && !test_bit(Faulty, &rdev2->flags)) | ||
1048 | d->raid_disk = rdev2->raid_disk; | 1221 | d->raid_disk = rdev2->raid_disk; |
1049 | else | 1222 | else |
1050 | d->raid_disk = rdev2->desc_nr; /* compatibility */ | 1223 | d->raid_disk = rdev2->desc_nr; /* compatibility */ |
1051 | if (test_bit(Faulty, &rdev2->flags)) | 1224 | if (test_bit(Faulty, &rdev2->flags)) |
1052 | d->state = (1<<MD_DISK_FAULTY); | 1225 | d->state = (1<<MD_DISK_FAULTY); |
1053 | else if (test_bit(In_sync, &rdev2->flags)) { | 1226 | else if (is_active) { |
1054 | d->state = (1<<MD_DISK_ACTIVE); | 1227 | d->state = (1<<MD_DISK_ACTIVE); |
1055 | d->state |= (1<<MD_DISK_SYNC); | 1228 | if (test_bit(In_sync, &rdev2->flags)) |
1229 | d->state |= (1<<MD_DISK_SYNC); | ||
1056 | active++; | 1230 | active++; |
1057 | working++; | 1231 | working++; |
1058 | } else { | 1232 | } else { |
@@ -1092,7 +1266,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1092 | { | 1266 | { |
1093 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) | 1267 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1094 | return 0; /* component must fit device */ | 1268 | return 0; /* component must fit device */ |
1095 | if (rdev->mddev->bitmap_offset) | 1269 | if (rdev->mddev->bitmap_info.offset) |
1096 | return 0; /* can't move bitmap */ | 1270 | return 0; /* can't move bitmap */ |
1097 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1271 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
1098 | if (!num_sectors || num_sectors > rdev->sb_start) | 1272 | if (!num_sectors || num_sectors > rdev->sb_start) |
@@ -1271,8 +1445,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1271 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); | 1445 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
1272 | mddev->dev_sectors = le64_to_cpu(sb->size); | 1446 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1273 | mddev->events = ev1; | 1447 | mddev->events = ev1; |
1274 | mddev->bitmap_offset = 0; | 1448 | mddev->bitmap_info.offset = 0; |
1275 | mddev->default_bitmap_offset = 1024 >> 9; | 1449 | mddev->bitmap_info.default_offset = 1024 >> 9; |
1276 | 1450 | ||
1277 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | 1451 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
1278 | memcpy(mddev->uuid, sb->set_uuid, 16); | 1452 | memcpy(mddev->uuid, sb->set_uuid, 16); |
@@ -1280,8 +1454,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1280 | mddev->max_disks = (4096-256)/2; | 1454 | mddev->max_disks = (4096-256)/2; |
1281 | 1455 | ||
1282 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && | 1456 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
1283 | mddev->bitmap_file == NULL ) | 1457 | mddev->bitmap_info.file == NULL ) |
1284 | mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); | 1458 | mddev->bitmap_info.offset = |
1459 | (__s32)le32_to_cpu(sb->bitmap_offset); | ||
1285 | 1460 | ||
1286 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { | 1461 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
1287 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); | 1462 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
@@ -1298,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1298 | } | 1473 | } |
1299 | 1474 | ||
1300 | } else if (mddev->pers == NULL) { | 1475 | } else if (mddev->pers == NULL) { |
1301 | /* Insist on good event counter while assembling */ | 1476 | /* Insist on good event counter while assembling, except for |
1477 | * spares (which don't need an event count) */ | ||
1302 | ++ev1; | 1478 | ++ev1; |
1303 | if (ev1 < mddev->events) | 1479 | if (rdev->desc_nr >= 0 && |
1304 | return -EINVAL; | 1480 | rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
1481 | le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) | ||
1482 | if (ev1 < mddev->events) | ||
1483 | return -EINVAL; | ||
1305 | } else if (mddev->bitmap) { | 1484 | } else if (mddev->bitmap) { |
1306 | /* If adding to array with a bitmap, then we can accept an | 1485 | /* If adding to array with a bitmap, then we can accept an |
1307 | * older device, but not too old. | 1486 | * older device, but not too old. |
@@ -1375,21 +1554,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1375 | sb->level = cpu_to_le32(mddev->level); | 1554 | sb->level = cpu_to_le32(mddev->level); |
1376 | sb->layout = cpu_to_le32(mddev->layout); | 1555 | sb->layout = cpu_to_le32(mddev->layout); |
1377 | 1556 | ||
1378 | if (mddev->bitmap && mddev->bitmap_file == NULL) { | 1557 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
1379 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); | 1558 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
1380 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); | 1559 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
1381 | } | 1560 | } |
1382 | 1561 | ||
1383 | if (rdev->raid_disk >= 0 && | 1562 | if (rdev->raid_disk >= 0 && |
1384 | !test_bit(In_sync, &rdev->flags)) { | 1563 | !test_bit(In_sync, &rdev->flags)) { |
1385 | if (mddev->curr_resync_completed > rdev->recovery_offset) | 1564 | sb->feature_map |= |
1386 | rdev->recovery_offset = mddev->curr_resync_completed; | 1565 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
1387 | if (rdev->recovery_offset > 0) { | 1566 | sb->recovery_offset = |
1388 | sb->feature_map |= | 1567 | cpu_to_le64(rdev->recovery_offset); |
1389 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | ||
1390 | sb->recovery_offset = | ||
1391 | cpu_to_le64(rdev->recovery_offset); | ||
1392 | } | ||
1393 | } | 1568 | } |
1394 | 1569 | ||
1395 | if (mddev->reshape_position != MaxSector) { | 1570 | if (mddev->reshape_position != MaxSector) { |
@@ -1423,7 +1598,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1423 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | 1598 | sb->dev_roles[i] = cpu_to_le16(0xfffe); |
1424 | else if (test_bit(In_sync, &rdev2->flags)) | 1599 | else if (test_bit(In_sync, &rdev2->flags)) |
1425 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | 1600 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
1426 | else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) | 1601 | else if (rdev2->raid_disk >= 0) |
1427 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | 1602 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
1428 | else | 1603 | else |
1429 | sb->dev_roles[i] = cpu_to_le16(0xffff); | 1604 | sb->dev_roles[i] = cpu_to_le16(0xffff); |
@@ -1445,7 +1620,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1445 | max_sectors -= rdev->data_offset; | 1620 | max_sectors -= rdev->data_offset; |
1446 | if (!num_sectors || num_sectors > max_sectors) | 1621 | if (!num_sectors || num_sectors > max_sectors) |
1447 | num_sectors = max_sectors; | 1622 | num_sectors = max_sectors; |
1448 | } else if (rdev->mddev->bitmap_offset) { | 1623 | } else if (rdev->mddev->bitmap_info.offset) { |
1449 | /* minor version 0 with bitmap we can't move */ | 1624 | /* minor version 0 with bitmap we can't move */ |
1450 | return 0; | 1625 | return 0; |
1451 | } else { | 1626 | } else { |
@@ -1640,7 +1815,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1640 | kobject_del(&rdev->kobj); | 1815 | kobject_del(&rdev->kobj); |
1641 | goto fail; | 1816 | goto fail; |
1642 | } | 1817 | } |
1643 | rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); | 1818 | rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state"); |
1644 | 1819 | ||
1645 | list_add_rcu(&rdev->same_set, &mddev->disks); | 1820 | list_add_rcu(&rdev->same_set, &mddev->disks); |
1646 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); | 1821 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); |
@@ -1813,15 +1988,11 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1813 | 1988 | ||
1814 | uuid = sb->set_uuid; | 1989 | uuid = sb->set_uuid; |
1815 | printk(KERN_INFO | 1990 | printk(KERN_INFO |
1816 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" | 1991 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" |
1817 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" | ||
1818 | "md: Name: \"%s\" CT:%llu\n", | 1992 | "md: Name: \"%s\" CT:%llu\n", |
1819 | le32_to_cpu(sb->major_version), | 1993 | le32_to_cpu(sb->major_version), |
1820 | le32_to_cpu(sb->feature_map), | 1994 | le32_to_cpu(sb->feature_map), |
1821 | uuid[0], uuid[1], uuid[2], uuid[3], | 1995 | uuid, |
1822 | uuid[4], uuid[5], uuid[6], uuid[7], | ||
1823 | uuid[8], uuid[9], uuid[10], uuid[11], | ||
1824 | uuid[12], uuid[13], uuid[14], uuid[15], | ||
1825 | sb->set_name, | 1996 | sb->set_name, |
1826 | (unsigned long long)le64_to_cpu(sb->ctime) | 1997 | (unsigned long long)le64_to_cpu(sb->ctime) |
1827 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); | 1998 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); |
@@ -1830,8 +2001,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1830 | printk(KERN_INFO | 2001 | printk(KERN_INFO |
1831 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | 2002 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" |
1832 | " RO:%llu\n" | 2003 | " RO:%llu\n" |
1833 | "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" | 2004 | "md: Dev:%08x UUID: %pU\n" |
1834 | ":%02x%02x%02x%02x%02x%02x\n" | ||
1835 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" | 2005 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" |
1836 | "md: (MaxDev:%u) \n", | 2006 | "md: (MaxDev:%u) \n", |
1837 | le32_to_cpu(sb->level), | 2007 | le32_to_cpu(sb->level), |
@@ -1844,10 +2014,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1844 | (unsigned long long)le64_to_cpu(sb->super_offset), | 2014 | (unsigned long long)le64_to_cpu(sb->super_offset), |
1845 | (unsigned long long)le64_to_cpu(sb->recovery_offset), | 2015 | (unsigned long long)le64_to_cpu(sb->recovery_offset), |
1846 | le32_to_cpu(sb->dev_number), | 2016 | le32_to_cpu(sb->dev_number), |
1847 | uuid[0], uuid[1], uuid[2], uuid[3], | 2017 | uuid, |
1848 | uuid[4], uuid[5], uuid[6], uuid[7], | ||
1849 | uuid[8], uuid[9], uuid[10], uuid[11], | ||
1850 | uuid[12], uuid[13], uuid[14], uuid[15], | ||
1851 | sb->devflags, | 2018 | sb->devflags, |
1852 | (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, | 2019 | (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, |
1853 | (unsigned long long)le64_to_cpu(sb->events), | 2020 | (unsigned long long)le64_to_cpu(sb->events), |
@@ -1917,11 +2084,19 @@ static void sync_sbs(mddev_t * mddev, int nospares) | |||
1917 | */ | 2084 | */ |
1918 | mdk_rdev_t *rdev; | 2085 | mdk_rdev_t *rdev; |
1919 | 2086 | ||
2087 | /* First make sure individual recovery_offsets are correct */ | ||
2088 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2089 | if (rdev->raid_disk >= 0 && | ||
2090 | mddev->delta_disks >= 0 && | ||
2091 | !test_bit(In_sync, &rdev->flags) && | ||
2092 | mddev->curr_resync_completed > rdev->recovery_offset) | ||
2093 | rdev->recovery_offset = mddev->curr_resync_completed; | ||
2094 | |||
2095 | } | ||
1920 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2096 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1921 | if (rdev->sb_events == mddev->events || | 2097 | if (rdev->sb_events == mddev->events || |
1922 | (nospares && | 2098 | (nospares && |
1923 | rdev->raid_disk < 0 && | 2099 | rdev->raid_disk < 0 && |
1924 | (rdev->sb_events&1)==0 && | ||
1925 | rdev->sb_events+1 == mddev->events)) { | 2100 | rdev->sb_events+1 == mddev->events)) { |
1926 | /* Don't update this superblock */ | 2101 | /* Don't update this superblock */ |
1927 | rdev->sb_loaded = 2; | 2102 | rdev->sb_loaded = 2; |
@@ -1974,22 +2149,14 @@ repeat: | |||
1974 | * and 'events' is odd, we can roll back to the previous clean state */ | 2149 | * and 'events' is odd, we can roll back to the previous clean state */ |
1975 | if (nospares | 2150 | if (nospares |
1976 | && (mddev->in_sync && mddev->recovery_cp == MaxSector) | 2151 | && (mddev->in_sync && mddev->recovery_cp == MaxSector) |
1977 | && (mddev->events & 1) | 2152 | && mddev->can_decrease_events |
1978 | && mddev->events != 1) | 2153 | && mddev->events != 1) { |
1979 | mddev->events--; | 2154 | mddev->events--; |
1980 | else { | 2155 | mddev->can_decrease_events = 0; |
2156 | } else { | ||
1981 | /* otherwise we have to go forward and ... */ | 2157 | /* otherwise we have to go forward and ... */ |
1982 | mddev->events ++; | 2158 | mddev->events ++; |
1983 | if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ | 2159 | mddev->can_decrease_events = nospares; |
1984 | /* .. if the array isn't clean, an 'even' event must also go | ||
1985 | * to spares. */ | ||
1986 | if ((mddev->events&1)==0) | ||
1987 | nospares = 0; | ||
1988 | } else { | ||
1989 | /* otherwise an 'odd' event must go to spares */ | ||
1990 | if ((mddev->events&1)) | ||
1991 | nospares = 0; | ||
1992 | } | ||
1993 | } | 2160 | } |
1994 | 2161 | ||
1995 | if (!mddev->events) { | 2162 | if (!mddev->events) { |
@@ -2233,6 +2400,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2233 | return err; | 2400 | return err; |
2234 | sprintf(nm, "rd%d", rdev->raid_disk); | 2401 | sprintf(nm, "rd%d", rdev->raid_disk); |
2235 | sysfs_remove_link(&rdev->mddev->kobj, nm); | 2402 | sysfs_remove_link(&rdev->mddev->kobj, nm); |
2403 | rdev->raid_disk = -1; | ||
2236 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2404 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2237 | md_wakeup_thread(rdev->mddev->thread); | 2405 | md_wakeup_thread(rdev->mddev->thread); |
2238 | } else if (rdev->mddev->pers) { | 2406 | } else if (rdev->mddev->pers) { |
@@ -2421,12 +2589,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2421 | static struct rdev_sysfs_entry rdev_size = | 2589 | static struct rdev_sysfs_entry rdev_size = |
2422 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); | 2590 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); |
2423 | 2591 | ||
2592 | |||
2593 | static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) | ||
2594 | { | ||
2595 | unsigned long long recovery_start = rdev->recovery_offset; | ||
2596 | |||
2597 | if (test_bit(In_sync, &rdev->flags) || | ||
2598 | recovery_start == MaxSector) | ||
2599 | return sprintf(page, "none\n"); | ||
2600 | |||
2601 | return sprintf(page, "%llu\n", recovery_start); | ||
2602 | } | ||
2603 | |||
2604 | static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) | ||
2605 | { | ||
2606 | unsigned long long recovery_start; | ||
2607 | |||
2608 | if (cmd_match(buf, "none")) | ||
2609 | recovery_start = MaxSector; | ||
2610 | else if (strict_strtoull(buf, 10, &recovery_start)) | ||
2611 | return -EINVAL; | ||
2612 | |||
2613 | if (rdev->mddev->pers && | ||
2614 | rdev->raid_disk >= 0) | ||
2615 | return -EBUSY; | ||
2616 | |||
2617 | rdev->recovery_offset = recovery_start; | ||
2618 | if (recovery_start == MaxSector) | ||
2619 | set_bit(In_sync, &rdev->flags); | ||
2620 | else | ||
2621 | clear_bit(In_sync, &rdev->flags); | ||
2622 | return len; | ||
2623 | } | ||
2624 | |||
2625 | static struct rdev_sysfs_entry rdev_recovery_start = | ||
2626 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | ||
2627 | |||
2424 | static struct attribute *rdev_default_attrs[] = { | 2628 | static struct attribute *rdev_default_attrs[] = { |
2425 | &rdev_state.attr, | 2629 | &rdev_state.attr, |
2426 | &rdev_errors.attr, | 2630 | &rdev_errors.attr, |
2427 | &rdev_slot.attr, | 2631 | &rdev_slot.attr, |
2428 | &rdev_offset.attr, | 2632 | &rdev_offset.attr, |
2429 | &rdev_size.attr, | 2633 | &rdev_size.attr, |
2634 | &rdev_recovery_start.attr, | ||
2430 | NULL, | 2635 | NULL, |
2431 | }; | 2636 | }; |
2432 | static ssize_t | 2637 | static ssize_t |
@@ -2480,7 +2685,7 @@ static void rdev_free(struct kobject *ko) | |||
2480 | mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); | 2685 | mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); |
2481 | kfree(rdev); | 2686 | kfree(rdev); |
2482 | } | 2687 | } |
2483 | static struct sysfs_ops rdev_sysfs_ops = { | 2688 | static const struct sysfs_ops rdev_sysfs_ops = { |
2484 | .show = rdev_attr_show, | 2689 | .show = rdev_attr_show, |
2485 | .store = rdev_attr_store, | 2690 | .store = rdev_attr_store, |
2486 | }; | 2691 | }; |
@@ -2528,6 +2733,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2528 | rdev->flags = 0; | 2733 | rdev->flags = 0; |
2529 | rdev->data_offset = 0; | 2734 | rdev->data_offset = 0; |
2530 | rdev->sb_events = 0; | 2735 | rdev->sb_events = 0; |
2736 | rdev->last_read_error.tv_sec = 0; | ||
2737 | rdev->last_read_error.tv_nsec = 0; | ||
2531 | atomic_set(&rdev->nr_pending, 0); | 2738 | atomic_set(&rdev->nr_pending, 0); |
2532 | atomic_set(&rdev->read_errors, 0); | 2739 | atomic_set(&rdev->read_errors, 0); |
2533 | atomic_set(&rdev->corrected_errors, 0); | 2740 | atomic_set(&rdev->corrected_errors, 0); |
@@ -2609,8 +2816,9 @@ static void analyze_sbs(mddev_t * mddev) | |||
2609 | 2816 | ||
2610 | i = 0; | 2817 | i = 0; |
2611 | rdev_for_each(rdev, tmp, mddev) { | 2818 | rdev_for_each(rdev, tmp, mddev) { |
2612 | if (rdev->desc_nr >= mddev->max_disks || | 2819 | if (mddev->max_disks && |
2613 | i > mddev->max_disks) { | 2820 | (rdev->desc_nr >= mddev->max_disks || |
2821 | i > mddev->max_disks)) { | ||
2614 | printk(KERN_WARNING | 2822 | printk(KERN_WARNING |
2615 | "md: %s: %s: only %d devices permitted\n", | 2823 | "md: %s: %s: only %d devices permitted\n", |
2616 | mdname(mddev), bdevname(rdev->bdev, b), | 2824 | mdname(mddev), bdevname(rdev->bdev, b), |
@@ -2631,13 +2839,54 @@ static void analyze_sbs(mddev_t * mddev) | |||
2631 | rdev->desc_nr = i++; | 2839 | rdev->desc_nr = i++; |
2632 | rdev->raid_disk = rdev->desc_nr; | 2840 | rdev->raid_disk = rdev->desc_nr; |
2633 | set_bit(In_sync, &rdev->flags); | 2841 | set_bit(In_sync, &rdev->flags); |
2634 | } else if (rdev->raid_disk >= mddev->raid_disks) { | 2842 | } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { |
2635 | rdev->raid_disk = -1; | 2843 | rdev->raid_disk = -1; |
2636 | clear_bit(In_sync, &rdev->flags); | 2844 | clear_bit(In_sync, &rdev->flags); |
2637 | } | 2845 | } |
2638 | } | 2846 | } |
2639 | } | 2847 | } |
2640 | 2848 | ||
2849 | /* Read a fixed-point number. | ||
2850 | * Numbers in sysfs attributes should be in "standard" units where | ||
2851 | * possible, so time should be in seconds. | ||
2852 | * However we internally use a much smaller unit such as | ||
2853 | * milliseconds or jiffies. | ||
2854 | * This function takes a decimal number with a possible fractional | ||
2855 | * component, and produces an integer which is the result of | ||
2856 | * multiplying that number by 10^'scale'. | ||
2857 | * all without any floating-point arithmetic. | ||
2858 | */ | ||
2859 | int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) | ||
2860 | { | ||
2861 | unsigned long result = 0; | ||
2862 | long decimals = -1; | ||
2863 | while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { | ||
2864 | if (*cp == '.') | ||
2865 | decimals = 0; | ||
2866 | else if (decimals < scale) { | ||
2867 | unsigned int value; | ||
2868 | value = *cp - '0'; | ||
2869 | result = result * 10 + value; | ||
2870 | if (decimals >= 0) | ||
2871 | decimals++; | ||
2872 | } | ||
2873 | cp++; | ||
2874 | } | ||
2875 | if (*cp == '\n') | ||
2876 | cp++; | ||
2877 | if (*cp) | ||
2878 | return -EINVAL; | ||
2879 | if (decimals < 0) | ||
2880 | decimals = 0; | ||
2881 | while (decimals < scale) { | ||
2882 | result *= 10; | ||
2883 | decimals ++; | ||
2884 | } | ||
2885 | *res = result; | ||
2886 | return 0; | ||
2887 | } | ||
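A minimal user-space sketch of the same parsing logic (the function below is illustrative, not part of the patch) shows the scaling in action: with scale 3, "1.5" becomes 1500 and "0.200" becomes 200, which is exactly how safe_delay_store() below turns seconds into milliseconds.

#include <ctype.h>
#include <stdio.h>

/* user-space rendition of the fixed-point parser; same semantics,
 * illustrative names */
static int parse_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;			/* -1 until the '.' is seen */

	while (isdigit((unsigned char)*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {	/* extra digits are dropped */
			result = result * 10 + (*cp - '0');
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -1;			/* trailing garbage */
	if (decimals < 0)
		decimals = 0;
	while (decimals++ < scale)		/* pad missing fractional digits */
		result *= 10;
	*res = result;
	return 0;
}

int main(void)
{
	unsigned long v;

	if (parse_scaled("1.5\n", &v, 3) == 0)
		printf("1.5   -> %lu\n", v);	/* prints 1500 */
	if (parse_scaled("0.200", &v, 3) == 0)
		printf("0.200 -> %lu\n", v);	/* prints 200 */
	return 0;
}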
2888 | |||
2889 | |||
2641 | static void md_safemode_timeout(unsigned long data); | 2890 | static void md_safemode_timeout(unsigned long data); |
2642 | 2891 | ||
2643 | static ssize_t | 2892 | static ssize_t |
@@ -2649,31 +2898,10 @@ safe_delay_show(mddev_t *mddev, char *page) | |||
2649 | static ssize_t | 2898 | static ssize_t |
2650 | safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) | 2899 | safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) |
2651 | { | 2900 | { |
2652 | int scale=1; | ||
2653 | int dot=0; | ||
2654 | int i; | ||
2655 | unsigned long msec; | 2901 | unsigned long msec; |
2656 | char buf[30]; | ||
2657 | 2902 | ||
2658 | /* remove a period, and count digits after it */ | 2903 | if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) |
2659 | if (len >= sizeof(buf)) | ||
2660 | return -EINVAL; | ||
2661 | strlcpy(buf, cbuf, sizeof(buf)); | ||
2662 | for (i=0; i<len; i++) { | ||
2663 | if (dot) { | ||
2664 | if (isdigit(buf[i])) { | ||
2665 | buf[i-1] = buf[i]; | ||
2666 | scale *= 10; | ||
2667 | } | ||
2668 | buf[i] = 0; | ||
2669 | } else if (buf[i] == '.') { | ||
2670 | dot=1; | ||
2671 | buf[i] = 0; | ||
2672 | } | ||
2673 | } | ||
2674 | if (strict_strtoul(buf, 10, &msec) < 0) | ||
2675 | return -EINVAL; | 2904 | return -EINVAL; |
2676 | msec = (msec * 1000) / scale; | ||
2677 | if (msec == 0) | 2905 | if (msec == 0) |
2678 | mddev->safemode_delay = 0; | 2906 | mddev->safemode_delay = 0; |
2679 | else { | 2907 | else { |
@@ -2706,9 +2934,10 @@ level_show(mddev_t *mddev, char *page) | |||
2706 | static ssize_t | 2934 | static ssize_t |
2707 | level_store(mddev_t *mddev, const char *buf, size_t len) | 2935 | level_store(mddev_t *mddev, const char *buf, size_t len) |
2708 | { | 2936 | { |
2709 | char level[16]; | 2937 | char clevel[16]; |
2710 | ssize_t rv = len; | 2938 | ssize_t rv = len; |
2711 | struct mdk_personality *pers; | 2939 | struct mdk_personality *pers; |
2940 | long level; | ||
2712 | void *priv; | 2941 | void *priv; |
2713 | mdk_rdev_t *rdev; | 2942 | mdk_rdev_t *rdev; |
2714 | 2943 | ||
@@ -2741,19 +2970,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2741 | } | 2970 | } |
2742 | 2971 | ||
2743 | /* Now find the new personality */ | 2972 | /* Now find the new personality */ |
2744 | if (len == 0 || len >= sizeof(level)) | 2973 | if (len == 0 || len >= sizeof(clevel)) |
2745 | return -EINVAL; | 2974 | return -EINVAL; |
2746 | strncpy(level, buf, len); | 2975 | strncpy(clevel, buf, len); |
2747 | if (level[len-1] == '\n') | 2976 | if (clevel[len-1] == '\n') |
2748 | len--; | 2977 | len--; |
2749 | level[len] = 0; | 2978 | clevel[len] = 0; |
2979 | if (strict_strtol(clevel, 10, &level)) | ||
2980 | level = LEVEL_NONE; | ||
2750 | 2981 | ||
2751 | request_module("md-%s", level); | 2982 | if (request_module("md-%s", clevel) != 0) |
2983 | request_module("md-level-%s", clevel); | ||
2752 | spin_lock(&pers_lock); | 2984 | spin_lock(&pers_lock); |
2753 | pers = find_pers(LEVEL_NONE, level); | 2985 | pers = find_pers(level, clevel); |
2754 | if (!pers || !try_module_get(pers->owner)) { | 2986 | if (!pers || !try_module_get(pers->owner)) { |
2755 | spin_unlock(&pers_lock); | 2987 | spin_unlock(&pers_lock); |
2756 | printk(KERN_WARNING "md: personality %s not loaded\n", level); | 2988 | printk(KERN_WARNING "md: personality %s not loaded\n", clevel); |
2757 | return -EINVAL; | 2989 | return -EINVAL; |
2758 | } | 2990 | } |
2759 | spin_unlock(&pers_lock); | 2991 | spin_unlock(&pers_lock); |
@@ -2766,10 +2998,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2766 | if (!pers->takeover) { | 2998 | if (!pers->takeover) { |
2767 | module_put(pers->owner); | 2999 | module_put(pers->owner); |
2768 | printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", | 3000 | printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", |
2769 | mdname(mddev), level); | 3001 | mdname(mddev), clevel); |
2770 | return -EINVAL; | 3002 | return -EINVAL; |
2771 | } | 3003 | } |
2772 | 3004 | ||
3005 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
3006 | rdev->new_raid_disk = rdev->raid_disk; | ||
3007 | |||
2773 | /* ->takeover must set new_* and/or delta_disks | 3008 | /* ->takeover must set new_* and/or delta_disks |
2774 | * if it succeeds, and may set them when it fails. | 3009 | * if it succeeds, and may set them when it fails. |
2775 | */ | 3010 | */ |
@@ -2782,20 +3017,73 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2782 | mddev->delta_disks = 0; | 3017 | mddev->delta_disks = 0; |
2783 | module_put(pers->owner); | 3018 | module_put(pers->owner); |
2784 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | 3019 | printk(KERN_WARNING "md: %s: %s would not accept array\n", |
2785 | mdname(mddev), level); | 3020 | mdname(mddev), clevel); |
2786 | return PTR_ERR(priv); | 3021 | return PTR_ERR(priv); |
2787 | } | 3022 | } |
2788 | 3023 | ||
2789 | /* Looks like we have a winner */ | 3024 | /* Looks like we have a winner */ |
2790 | mddev_suspend(mddev); | 3025 | mddev_suspend(mddev); |
2791 | mddev->pers->stop(mddev); | 3026 | mddev->pers->stop(mddev); |
2792 | module_put(mddev->pers->owner); | 3027 | |
2793 | /* Invalidate devices that are now superfluous */ | 3028 | if (mddev->pers->sync_request == NULL && |
2794 | list_for_each_entry(rdev, &mddev->disks, same_set) | 3029 | pers->sync_request != NULL) { |
2795 | if (rdev->raid_disk >= mddev->raid_disks) { | 3030 | /* need to add the md_redundancy_group */ |
2796 | rdev->raid_disk = -1; | 3031 | if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) |
3032 | printk(KERN_WARNING | ||
3033 | "md: cannot register extra attributes for %s\n", | ||
3034 | mdname(mddev)); | ||
3035 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); | ||
3036 | } | ||
3037 | if (mddev->pers->sync_request != NULL && | ||
3038 | pers->sync_request == NULL) { | ||
3039 | /* need to remove the md_redundancy_group */ | ||
3040 | if (mddev->to_remove == NULL) | ||
3041 | mddev->to_remove = &md_redundancy_group; | ||
3042 | } | ||
3043 | |||
3044 | if (mddev->pers->sync_request == NULL && | ||
3045 | mddev->external) { | ||
3046 | /* We are converting from a no-redundancy array | ||
3047 | * to a redundancy array and metadata is managed | ||
3048 | * externally so we need to be sure that writes | ||
3049 | * won't block due to a need to transition | ||
3050 | * clean->dirty | ||
3051 | * until external management is started. | ||
3052 | */ | ||
3053 | mddev->in_sync = 0; | ||
3054 | mddev->safemode_delay = 0; | ||
3055 | mddev->safemode = 0; | ||
3056 | } | ||
3057 | |||
3058 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
3059 | char nm[20]; | ||
3060 | if (rdev->raid_disk < 0) | ||
3061 | continue; | ||
3062 | if (rdev->new_raid_disk > mddev->raid_disks) | ||
3063 | rdev->new_raid_disk = -1; | ||
3064 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
3065 | continue; | ||
3066 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
3067 | sysfs_remove_link(&mddev->kobj, nm); | ||
3068 | } | ||
3069 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
3070 | if (rdev->raid_disk < 0) | ||
3071 | continue; | ||
3072 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
3073 | continue; | ||
3074 | rdev->raid_disk = rdev->new_raid_disk; | ||
3075 | if (rdev->raid_disk < 0) | ||
2797 | clear_bit(In_sync, &rdev->flags); | 3076 | clear_bit(In_sync, &rdev->flags); |
3077 | else { | ||
3078 | char nm[20]; | ||
3079 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
3080 | if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
3081 | printk("md: cannot register %s for %s after level change\n", | ||
3082 | nm, mdname(mddev)); | ||
2798 | } | 3083 | } |
3084 | } | ||
3085 | |||
3086 | module_put(mddev->pers->owner); | ||
2799 | mddev->pers = pers; | 3087 | mddev->pers = pers; |
2800 | mddev->private = priv; | 3088 | mddev->private = priv; |
2801 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 3089 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
@@ -2803,11 +3091,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2803 | mddev->layout = mddev->new_layout; | 3091 | mddev->layout = mddev->new_layout; |
2804 | mddev->chunk_sectors = mddev->new_chunk_sectors; | 3092 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
2805 | mddev->delta_disks = 0; | 3093 | mddev->delta_disks = 0; |
3094 | if (mddev->pers->sync_request == NULL) { | ||
3095 | /* this is now an array without redundancy, so | ||
3096 | * it must always be in_sync | ||
3097 | */ | ||
3098 | mddev->in_sync = 1; | ||
3099 | del_timer_sync(&mddev->safemode_timer); | ||
3100 | } | ||
2806 | pers->run(mddev); | 3101 | pers->run(mddev); |
2807 | mddev_resume(mddev); | 3102 | mddev_resume(mddev); |
2808 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3103 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
2809 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3104 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2810 | md_wakeup_thread(mddev->thread); | 3105 | md_wakeup_thread(mddev->thread); |
3106 | sysfs_notify(&mddev->kobj, NULL, "level"); | ||
3107 | md_new_event(mddev); | ||
2811 | return rv; | 3108 | return rv; |
2812 | } | 3109 | } |
2813 | 3110 | ||
@@ -2949,7 +3246,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) | |||
2949 | 3246 | ||
2950 | if (mddev->pers) | 3247 | if (mddev->pers) |
2951 | return -EBUSY; | 3248 | return -EBUSY; |
2952 | if (!*buf || (*e && *e != '\n')) | 3249 | if (cmd_match(buf, "none")) |
3250 | n = MaxSector; | ||
3251 | else if (!*buf || (*e && *e != '\n')) | ||
2953 | return -EINVAL; | 3252 | return -EINVAL; |
2954 | 3253 | ||
2955 | mddev->recovery_cp = n; | 3254 | mddev->recovery_cp = n; |
@@ -3044,6 +3343,7 @@ array_state_show(mddev_t *mddev, char *page) | |||
3044 | } | 3343 | } |
3045 | 3344 | ||
3046 | static int do_md_stop(mddev_t * mddev, int ro, int is_open); | 3345 | static int do_md_stop(mddev_t * mddev, int ro, int is_open); |
3346 | static int md_set_readonly(mddev_t * mddev, int is_open); | ||
3047 | static int do_md_run(mddev_t * mddev); | 3347 | static int do_md_run(mddev_t * mddev); |
3048 | static int restart_array(mddev_t *mddev); | 3348 | static int restart_array(mddev_t *mddev); |
3049 | 3349 | ||
@@ -3074,7 +3374,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
3074 | break; /* not supported yet */ | 3374 | break; /* not supported yet */ |
3075 | case readonly: | 3375 | case readonly: |
3076 | if (mddev->pers) | 3376 | if (mddev->pers) |
3077 | err = do_md_stop(mddev, 1, 0); | 3377 | err = md_set_readonly(mddev, 0); |
3078 | else { | 3378 | else { |
3079 | mddev->ro = 1; | 3379 | mddev->ro = 1; |
3080 | set_disk_ro(mddev->gendisk, 1); | 3380 | set_disk_ro(mddev->gendisk, 1); |
@@ -3084,7 +3384,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
3084 | case read_auto: | 3384 | case read_auto: |
3085 | if (mddev->pers) { | 3385 | if (mddev->pers) { |
3086 | if (mddev->ro == 0) | 3386 | if (mddev->ro == 0) |
3087 | err = do_md_stop(mddev, 1, 0); | 3387 | err = md_set_readonly(mddev, 0); |
3088 | else if (mddev->ro == 1) | 3388 | else if (mddev->ro == 1) |
3089 | err = restart_array(mddev); | 3389 | err = restart_array(mddev); |
3090 | if (err == 0) { | 3390 | if (err == 0) { |
@@ -3145,6 +3445,29 @@ static struct md_sysfs_entry md_array_state = | |||
3145 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); | 3445 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); |
3146 | 3446 | ||
3147 | static ssize_t | 3447 | static ssize_t |
3448 | max_corrected_read_errors_show(mddev_t *mddev, char *page) { | ||
3449 | return sprintf(page, "%d\n", | ||
3450 | atomic_read(&mddev->max_corr_read_errors)); | ||
3451 | } | ||
3452 | |||
3453 | static ssize_t | ||
3454 | max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) | ||
3455 | { | ||
3456 | char *e; | ||
3457 | unsigned long n = simple_strtoul(buf, &e, 10); | ||
3458 | |||
3459 | if (*buf && (*e == 0 || *e == '\n')) { | ||
3460 | atomic_set(&mddev->max_corr_read_errors, n); | ||
3461 | return len; | ||
3462 | } | ||
3463 | return -EINVAL; | ||
3464 | } | ||
3465 | |||
3466 | static struct md_sysfs_entry max_corr_read_errors = | ||
3467 | __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, | ||
3468 | max_corrected_read_errors_store); | ||
3469 | |||
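
The attribute pair above exposes the new per-array limit on corrected read errors as a read/write sysfs file named max_read_errors. A hypothetical user-space sketch of inspecting and tuning it (the sysfs path and the new limit of 50 are assumptions for illustration; md arrays normally expose their attributes under /sys/block/mdN/md/):

#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/block/md0/md/max_read_errors"; /* assumed path */
	unsigned long cur;
	FILE *f = fopen(attr, "r");

	if (!f) {
		perror(attr);
		return 1;
	}
	if (fscanf(f, "%lu", &cur) == 1)
		printf("current corrected-read-error limit: %lu\n", cur);
	fclose(f);

	f = fopen(attr, "w");
	if (f) {
		fprintf(f, "50\n");	/* raise the threshold of tolerated corrected read errors */
		fclose(f);
	}
	return 0;
}
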
3470 | static ssize_t | ||
3148 | null_show(mddev_t *mddev, char *page) | 3471 | null_show(mddev_t *mddev, char *page) |
3149 | { | 3472 | { |
3150 | return -EINVAL; | 3473 | return -EINVAL; |
@@ -3225,8 +3548,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len) | |||
3225 | } | 3548 | } |
3226 | if (*end && !isspace(*end)) break; | 3549 | if (*end && !isspace(*end)) break; |
3227 | bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); | 3550 | bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); |
3228 | buf = end; | 3551 | buf = skip_spaces(end); |
3229 | while (isspace(*buf)) buf++; | ||
3230 | } | 3552 | } |
3231 | bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ | 3553 | bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ |
3232 | out: | 3554 | out: |
@@ -3769,6 +4091,7 @@ static struct attribute *md_default_attrs[] = { | |||
3769 | &md_array_state.attr, | 4091 | &md_array_state.attr, |
3770 | &md_reshape_position.attr, | 4092 | &md_reshape_position.attr, |
3771 | &md_array_size.attr, | 4093 | &md_array_size.attr, |
4094 | &max_corr_read_errors.attr, | ||
3772 | NULL, | 4095 | NULL, |
3773 | }; | 4096 | }; |
3774 | 4097 | ||
@@ -3850,7 +4173,7 @@ static void md_free(struct kobject *ko) | |||
3850 | kfree(mddev); | 4173 | kfree(mddev); |
3851 | } | 4174 | } |
3852 | 4175 | ||
3853 | static struct sysfs_ops md_sysfs_ops = { | 4176 | static const struct sysfs_ops md_sysfs_ops = { |
3854 | .show = md_attr_show, | 4177 | .show = md_attr_show, |
3855 | .store = md_attr_store, | 4178 | .store = md_attr_store, |
3856 | }; | 4179 | }; |
@@ -3866,13 +4189,7 @@ static void mddev_delayed_delete(struct work_struct *ws) | |||
3866 | { | 4189 | { |
3867 | mddev_t *mddev = container_of(ws, mddev_t, del_work); | 4190 | mddev_t *mddev = container_of(ws, mddev_t, del_work); |
3868 | 4191 | ||
3869 | if (mddev->private == &md_redundancy_group) { | 4192 | sysfs_remove_group(&mddev->kobj, &md_bitmap_group); |
3870 | sysfs_remove_group(&mddev->kobj, &md_redundancy_group); | ||
3871 | if (mddev->sysfs_action) | ||
3872 | sysfs_put(mddev->sysfs_action); | ||
3873 | mddev->sysfs_action = NULL; | ||
3874 | mddev->private = NULL; | ||
3875 | } | ||
3876 | kobject_del(&mddev->kobj); | 4193 | kobject_del(&mddev->kobj); |
3877 | kobject_put(&mddev->kobj); | 4194 | kobject_put(&mddev->kobj); |
3878 | } | 4195 | } |
@@ -3964,11 +4281,13 @@ static int md_alloc(dev_t dev, char *name) | |||
3964 | disk->disk_name); | 4281 | disk->disk_name); |
3965 | error = 0; | 4282 | error = 0; |
3966 | } | 4283 | } |
4284 | if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | ||
4285 | printk(KERN_DEBUG "pointless warning\n"); | ||
3967 | abort: | 4286 | abort: |
3968 | mutex_unlock(&disks_mutex); | 4287 | mutex_unlock(&disks_mutex); |
3969 | if (!error) { | 4288 | if (!error) { |
3970 | kobject_uevent(&mddev->kobj, KOBJ_ADD); | 4289 | kobject_uevent(&mddev->kobj, KOBJ_ADD); |
3971 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); | 4290 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state"); |
3972 | } | 4291 | } |
3973 | mddev_put(mddev); | 4292 | mddev_put(mddev); |
3974 | return error; | 4293 | return error; |
@@ -4013,11 +4332,10 @@ static void md_safemode_timeout(unsigned long data) | |||
4013 | 4332 | ||
4014 | static int start_dirty_degraded; | 4333 | static int start_dirty_degraded; |
4015 | 4334 | ||
4016 | static int do_md_run(mddev_t * mddev) | 4335 | static int md_run(mddev_t *mddev) |
4017 | { | 4336 | { |
4018 | int err; | 4337 | int err; |
4019 | mdk_rdev_t *rdev; | 4338 | mdk_rdev_t *rdev; |
4020 | struct gendisk *disk; | ||
4021 | struct mdk_personality *pers; | 4339 | struct mdk_personality *pers; |
4022 | 4340 | ||
4023 | if (list_empty(&mddev->disks)) | 4341 | if (list_empty(&mddev->disks)) |
@@ -4027,6 +4345,13 @@ static int do_md_run(mddev_t * mddev) | |||
4027 | if (mddev->pers) | 4345 | if (mddev->pers) |
4028 | return -EBUSY; | 4346 | return -EBUSY; |
4029 | 4347 | ||
4348 | /* These two calls synchronise us with the | ||
4349 | * sysfs_remove_group calls in mddev_unlock, | ||
4350 | * so they must have completed. | ||
4351 | */ | ||
4352 | mutex_lock(&mddev->open_mutex); | ||
4353 | mutex_unlock(&mddev->open_mutex); | ||
4354 | |||
4030 | /* | 4355 | /* |
4031 | * Analyze all RAID superblock(s) | 4356 | * Analyze all RAID superblock(s) |
4032 | */ | 4357 | */ |
@@ -4075,11 +4400,6 @@ static int do_md_run(mddev_t * mddev) | |||
4075 | sysfs_notify_dirent(rdev->sysfs_state); | 4400 | sysfs_notify_dirent(rdev->sysfs_state); |
4076 | } | 4401 | } |
4077 | 4402 | ||
4078 | md_probe(mddev->unit, NULL, NULL); | ||
4079 | disk = mddev->gendisk; | ||
4080 | if (!disk) | ||
4081 | return -ENOMEM; | ||
4082 | |||
4083 | spin_lock(&pers_lock); | 4403 | spin_lock(&pers_lock); |
4084 | pers = find_pers(mddev->level, mddev->clevel); | 4404 | pers = find_pers(mddev->level, mddev->clevel); |
4085 | if (!pers || !try_module_get(pers->owner)) { | 4405 | if (!pers || !try_module_get(pers->owner)) { |
@@ -4145,7 +4465,7 @@ static int do_md_run(mddev_t * mddev) | |||
4145 | mddev->barriers_work = 1; | 4465 | mddev->barriers_work = 1; |
4146 | mddev->ok_start_degraded = start_dirty_degraded; | 4466 | mddev->ok_start_degraded = start_dirty_degraded; |
4147 | 4467 | ||
4148 | if (start_readonly) | 4468 | if (start_readonly && mddev->ro == 0) |
4149 | mddev->ro = 2; /* read-only, but switch on first write */ | 4469 | mddev->ro = 2; /* read-only, but switch on first write */ |
4150 | 4470 | ||
4151 | err = mddev->pers->run(mddev); | 4471 | err = mddev->pers->run(mddev); |
@@ -4180,11 +4500,13 @@ static int do_md_run(mddev_t * mddev) | |||
4180 | printk(KERN_WARNING | 4500 | printk(KERN_WARNING |
4181 | "md: cannot register extra attributes for %s\n", | 4501 | "md: cannot register extra attributes for %s\n", |
4182 | mdname(mddev)); | 4502 | mdname(mddev)); |
4183 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); | 4503 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); |
4184 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ | 4504 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ |
4185 | mddev->ro = 0; | 4505 | mddev->ro = 0; |
4186 | 4506 | ||
4187 | atomic_set(&mddev->writes_pending,0); | 4507 | atomic_set(&mddev->writes_pending,0); |
4508 | atomic_set(&mddev->max_corr_read_errors, | ||
4509 | MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); | ||
4188 | mddev->safemode = 0; | 4510 | mddev->safemode = 0; |
4189 | mddev->safemode_timer.function = md_safemode_timeout; | 4511 | mddev->safemode_timer.function = md_safemode_timeout; |
4190 | mddev->safemode_timer.data = (unsigned long) mddev; | 4512 | mddev->safemode_timer.data = (unsigned long) mddev; |
@@ -4205,49 +4527,32 @@ static int do_md_run(mddev_t * mddev) | |||
4205 | if (mddev->flags) | 4527 | if (mddev->flags) |
4206 | md_update_sb(mddev, 0); | 4528 | md_update_sb(mddev, 0); |
4207 | 4529 | ||
4208 | set_capacity(disk, mddev->array_sectors); | ||
4209 | |||
4210 | /* If there is a partially-recovered drive we need to | ||
4211 | * start recovery here. If we leave it to md_check_recovery, | ||
4212 | * it will remove the drives and not do the right thing | ||
4213 | */ | ||
4214 | if (mddev->degraded && !mddev->sync_thread) { | ||
4215 | int spares = 0; | ||
4216 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
4217 | if (rdev->raid_disk >= 0 && | ||
4218 | !test_bit(In_sync, &rdev->flags) && | ||
4219 | !test_bit(Faulty, &rdev->flags)) | ||
4220 | /* complete an interrupted recovery */ | ||
4221 | spares++; | ||
4222 | if (spares && mddev->pers->sync_request) { | ||
4223 | mddev->recovery = 0; | ||
4224 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
4225 | mddev->sync_thread = md_register_thread(md_do_sync, | ||
4226 | mddev, | ||
4227 | "resync"); | ||
4228 | if (!mddev->sync_thread) { | ||
4229 | printk(KERN_ERR "%s: could not start resync" | ||
4230 | " thread...\n", | ||
4231 | mdname(mddev)); | ||
4232 | /* leave the spares where they are, it shouldn't hurt */ | ||
4233 | mddev->recovery = 0; | ||
4234 | } | ||
4235 | } | ||
4236 | } | ||
4237 | md_wakeup_thread(mddev->thread); | 4530 | md_wakeup_thread(mddev->thread); |
4238 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | 4531 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
4239 | 4532 | ||
4240 | revalidate_disk(mddev->gendisk); | ||
4241 | mddev->changed = 1; | ||
4242 | md_new_event(mddev); | 4533 | md_new_event(mddev); |
4243 | sysfs_notify_dirent(mddev->sysfs_state); | 4534 | sysfs_notify_dirent(mddev->sysfs_state); |
4244 | if (mddev->sysfs_action) | 4535 | if (mddev->sysfs_action) |
4245 | sysfs_notify_dirent(mddev->sysfs_action); | 4536 | sysfs_notify_dirent(mddev->sysfs_action); |
4246 | sysfs_notify(&mddev->kobj, NULL, "degraded"); | 4537 | sysfs_notify(&mddev->kobj, NULL, "degraded"); |
4247 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | ||
4248 | return 0; | 4538 | return 0; |
4249 | } | 4539 | } |
4250 | 4540 | ||
4541 | static int do_md_run(mddev_t *mddev) | ||
4542 | { | ||
4543 | int err; | ||
4544 | |||
4545 | err = md_run(mddev); | ||
4546 | if (err) | ||
4547 | goto out; | ||
4548 | |||
4549 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
4550 | revalidate_disk(mddev->gendisk); | ||
4551 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | ||
4552 | out: | ||
4553 | return err; | ||
4554 | } | ||
4555 | |||
4251 | static int restart_array(mddev_t *mddev) | 4556 | static int restart_array(mddev_t *mddev) |
4252 | { | 4557 | { |
4253 | struct gendisk *disk = mddev->gendisk; | 4558 | struct gendisk *disk = mddev->gendisk; |
@@ -4289,7 +4594,7 @@ static int deny_bitmap_write_access(struct file * file) | |||
4289 | return 0; | 4594 | return 0; |
4290 | } | 4595 | } |
4291 | 4596 | ||
4292 | static void restore_bitmap_write_access(struct file *file) | 4597 | void restore_bitmap_write_access(struct file *file) |
4293 | { | 4598 | { |
4294 | struct inode *inode = file->f_mapping->host; | 4599 | struct inode *inode = file->f_mapping->host; |
4295 | 4600 | ||
@@ -4298,9 +4603,110 @@ static void restore_bitmap_write_access(struct file *file) | |||
4298 | spin_unlock(&inode->i_lock); | 4603 | spin_unlock(&inode->i_lock); |
4299 | } | 4604 | } |
4300 | 4605 | ||
4606 | static void md_clean(mddev_t *mddev) | ||
4607 | { | ||
4608 | mddev->array_sectors = 0; | ||
4609 | mddev->external_size = 0; | ||
4610 | mddev->dev_sectors = 0; | ||
4611 | mddev->raid_disks = 0; | ||
4612 | mddev->recovery_cp = 0; | ||
4613 | mddev->resync_min = 0; | ||
4614 | mddev->resync_max = MaxSector; | ||
4615 | mddev->reshape_position = MaxSector; | ||
4616 | mddev->external = 0; | ||
4617 | mddev->persistent = 0; | ||
4618 | mddev->level = LEVEL_NONE; | ||
4619 | mddev->clevel[0] = 0; | ||
4620 | mddev->flags = 0; | ||
4621 | mddev->ro = 0; | ||
4622 | mddev->metadata_type[0] = 0; | ||
4623 | mddev->chunk_sectors = 0; | ||
4624 | mddev->ctime = mddev->utime = 0; | ||
4625 | mddev->layout = 0; | ||
4626 | mddev->max_disks = 0; | ||
4627 | mddev->events = 0; | ||
4628 | mddev->can_decrease_events = 0; | ||
4629 | mddev->delta_disks = 0; | ||
4630 | mddev->new_level = LEVEL_NONE; | ||
4631 | mddev->new_layout = 0; | ||
4632 | mddev->new_chunk_sectors = 0; | ||
4633 | mddev->curr_resync = 0; | ||
4634 | mddev->resync_mismatches = 0; | ||
4635 | mddev->suspend_lo = mddev->suspend_hi = 0; | ||
4636 | mddev->sync_speed_min = mddev->sync_speed_max = 0; | ||
4637 | mddev->recovery = 0; | ||
4638 | mddev->in_sync = 0; | ||
4639 | mddev->degraded = 0; | ||
4640 | mddev->barriers_work = 0; | ||
4641 | mddev->safemode = 0; | ||
4642 | mddev->bitmap_info.offset = 0; | ||
4643 | mddev->bitmap_info.default_offset = 0; | ||
4644 | mddev->bitmap_info.chunksize = 0; | ||
4645 | mddev->bitmap_info.daemon_sleep = 0; | ||
4646 | mddev->bitmap_info.max_write_behind = 0; | ||
4647 | } | ||
4648 | |||
4649 | static void md_stop_writes(mddev_t *mddev) | ||
4650 | { | ||
4651 | if (mddev->sync_thread) { | ||
4652 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
4653 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
4654 | md_unregister_thread(mddev->sync_thread); | ||
4655 | mddev->sync_thread = NULL; | ||
4656 | } | ||
4657 | |||
4658 | del_timer_sync(&mddev->safemode_timer); | ||
4659 | |||
4660 | bitmap_flush(mddev); | ||
4661 | md_super_wait(mddev); | ||
4662 | |||
4663 | if (!mddev->in_sync || mddev->flags) { | ||
4664 | /* mark array as shutdown cleanly */ | ||
4665 | mddev->in_sync = 1; | ||
4666 | md_update_sb(mddev, 1); | ||
4667 | } | ||
4668 | } | ||
4669 | |||
4670 | static void md_stop(mddev_t *mddev) | ||
4671 | { | ||
4672 | md_stop_writes(mddev); | ||
4673 | |||
4674 | mddev->pers->stop(mddev); | ||
4675 | if (mddev->pers->sync_request && mddev->to_remove == NULL) | ||
4676 | mddev->to_remove = &md_redundancy_group; | ||
4677 | module_put(mddev->pers->owner); | ||
4678 | mddev->pers = NULL; | ||
4679 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
4680 | } | ||
4681 | |||
4682 | static int md_set_readonly(mddev_t *mddev, int is_open) | ||
4683 | { | ||
4684 | int err = 0; | ||
4685 | mutex_lock(&mddev->open_mutex); | ||
4686 | if (atomic_read(&mddev->openers) > is_open) { | ||
4687 | printk("md: %s still in use.\n", mdname(mddev)); | ||
4688 | err = -EBUSY; | ||
4689 | goto out; | ||
4690 | } | ||
4691 | if (mddev->pers) { | ||
4692 | md_stop_writes(mddev); | ||
4693 | |||
4694 | err = -ENXIO; | ||
4695 | if (mddev->ro==1) | ||
4696 | goto out; | ||
4697 | mddev->ro = 1; | ||
4698 | set_disk_ro(mddev->gendisk, 1); | ||
4699 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
4700 | sysfs_notify_dirent(mddev->sysfs_state); | ||
4701 | err = 0; | ||
4702 | } | ||
4703 | out: | ||
4704 | mutex_unlock(&mddev->open_mutex); | ||
4705 | return err; | ||
4706 | } | ||
4707 | |||
4301 | /* mode: | 4708 | /* mode: |
4302 | * 0 - completely stop and dis-assemble array | 4709 | * 0 - completely stop and dis-assemble array |
4303 | * 1 - switch to readonly | ||
4304 | * 2 - stop but do not disassemble array | 4710 | * 2 - stop but do not disassemble array |
4305 | */ | 4711 | */ |
4306 | static int do_md_stop(mddev_t * mddev, int mode, int is_open) | 4712 | static int do_md_stop(mddev_t * mddev, int mode, int is_open) |
@@ -4315,64 +4721,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4315 | err = -EBUSY; | 4721 | err = -EBUSY; |
4316 | } else if (mddev->pers) { | 4722 | } else if (mddev->pers) { |
4317 | 4723 | ||
4318 | if (mddev->sync_thread) { | 4724 | if (mddev->ro) |
4319 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4725 | set_disk_ro(disk, 0); |
4320 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
4321 | md_unregister_thread(mddev->sync_thread); | ||
4322 | mddev->sync_thread = NULL; | ||
4323 | } | ||
4324 | |||
4325 | del_timer_sync(&mddev->safemode_timer); | ||
4326 | 4726 | ||
4327 | switch(mode) { | 4727 | md_stop(mddev); |
4328 | case 1: /* readonly */ | 4728 | mddev->queue->merge_bvec_fn = NULL; |
4329 | err = -ENXIO; | 4729 | mddev->queue->unplug_fn = NULL; |
4330 | if (mddev->ro==1) | 4730 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4331 | goto out; | ||
4332 | mddev->ro = 1; | ||
4333 | break; | ||
4334 | case 0: /* disassemble */ | ||
4335 | case 2: /* stop */ | ||
4336 | bitmap_flush(mddev); | ||
4337 | md_super_wait(mddev); | ||
4338 | if (mddev->ro) | ||
4339 | set_disk_ro(disk, 0); | ||
4340 | 4731 | ||
4341 | mddev->pers->stop(mddev); | 4732 | /* tell userspace to handle 'inactive' */ |
4342 | mddev->queue->merge_bvec_fn = NULL; | 4733 | sysfs_notify_dirent(mddev->sysfs_state); |
4343 | mddev->queue->unplug_fn = NULL; | ||
4344 | mddev->queue->backing_dev_info.congested_fn = NULL; | ||
4345 | module_put(mddev->pers->owner); | ||
4346 | if (mddev->pers->sync_request) | ||
4347 | mddev->private = &md_redundancy_group; | ||
4348 | mddev->pers = NULL; | ||
4349 | /* tell userspace to handle 'inactive' */ | ||
4350 | sysfs_notify_dirent(mddev->sysfs_state); | ||
4351 | 4734 | ||
4352 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4735 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4353 | if (rdev->raid_disk >= 0) { | 4736 | if (rdev->raid_disk >= 0) { |
4354 | char nm[20]; | 4737 | char nm[20]; |
4355 | sprintf(nm, "rd%d", rdev->raid_disk); | 4738 | sprintf(nm, "rd%d", rdev->raid_disk); |
4356 | sysfs_remove_link(&mddev->kobj, nm); | 4739 | sysfs_remove_link(&mddev->kobj, nm); |
4357 | } | 4740 | } |
4358 | 4741 | ||
4359 | set_capacity(disk, 0); | 4742 | set_capacity(disk, 0); |
4360 | mddev->changed = 1; | 4743 | revalidate_disk(disk); |
4361 | 4744 | ||
4362 | if (mddev->ro) | 4745 | if (mddev->ro) |
4363 | mddev->ro = 0; | 4746 | mddev->ro = 0; |
4364 | } | 4747 | |
4365 | if (!mddev->in_sync || mddev->flags) { | ||
4366 | /* mark array as shutdown cleanly */ | ||
4367 | mddev->in_sync = 1; | ||
4368 | md_update_sb(mddev, 1); | ||
4369 | } | ||
4370 | if (mode == 1) | ||
4371 | set_disk_ro(disk, 1); | ||
4372 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
4373 | err = 0; | 4748 | err = 0; |
4374 | } | 4749 | } |
4375 | out: | ||
4376 | mutex_unlock(&mddev->open_mutex); | 4750 | mutex_unlock(&mddev->open_mutex); |
4377 | if (err) | 4751 | if (err) |
4378 | return err; | 4752 | return err; |
@@ -4384,59 +4758,21 @@ out: | |||
4384 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | 4758 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); |
4385 | 4759 | ||
4386 | bitmap_destroy(mddev); | 4760 | bitmap_destroy(mddev); |
4387 | if (mddev->bitmap_file) { | 4761 | if (mddev->bitmap_info.file) { |
4388 | restore_bitmap_write_access(mddev->bitmap_file); | 4762 | restore_bitmap_write_access(mddev->bitmap_info.file); |
4389 | fput(mddev->bitmap_file); | 4763 | fput(mddev->bitmap_info.file); |
4390 | mddev->bitmap_file = NULL; | 4764 | mddev->bitmap_info.file = NULL; |
4391 | } | 4765 | } |
4392 | mddev->bitmap_offset = 0; | 4766 | mddev->bitmap_info.offset = 0; |
4393 | |||
4394 | /* make sure all md_delayed_delete calls have finished */ | ||
4395 | flush_scheduled_work(); | ||
4396 | 4767 | ||
4397 | export_array(mddev); | 4768 | export_array(mddev); |
4398 | 4769 | ||
4399 | mddev->array_sectors = 0; | 4770 | md_clean(mddev); |
4400 | mddev->external_size = 0; | ||
4401 | mddev->dev_sectors = 0; | ||
4402 | mddev->raid_disks = 0; | ||
4403 | mddev->recovery_cp = 0; | ||
4404 | mddev->resync_min = 0; | ||
4405 | mddev->resync_max = MaxSector; | ||
4406 | mddev->reshape_position = MaxSector; | ||
4407 | mddev->external = 0; | ||
4408 | mddev->persistent = 0; | ||
4409 | mddev->level = LEVEL_NONE; | ||
4410 | mddev->clevel[0] = 0; | ||
4411 | mddev->flags = 0; | ||
4412 | mddev->ro = 0; | ||
4413 | mddev->metadata_type[0] = 0; | ||
4414 | mddev->chunk_sectors = 0; | ||
4415 | mddev->ctime = mddev->utime = 0; | ||
4416 | mddev->layout = 0; | ||
4417 | mddev->max_disks = 0; | ||
4418 | mddev->events = 0; | ||
4419 | mddev->delta_disks = 0; | ||
4420 | mddev->new_level = LEVEL_NONE; | ||
4421 | mddev->new_layout = 0; | ||
4422 | mddev->new_chunk_sectors = 0; | ||
4423 | mddev->curr_resync = 0; | ||
4424 | mddev->resync_mismatches = 0; | ||
4425 | mddev->suspend_lo = mddev->suspend_hi = 0; | ||
4426 | mddev->sync_speed_min = mddev->sync_speed_max = 0; | ||
4427 | mddev->recovery = 0; | ||
4428 | mddev->in_sync = 0; | ||
4429 | mddev->changed = 0; | ||
4430 | mddev->degraded = 0; | ||
4431 | mddev->barriers_work = 0; | ||
4432 | mddev->safemode = 0; | ||
4433 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | 4771 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); |
4434 | if (mddev->hold_active == UNTIL_STOP) | 4772 | if (mddev->hold_active == UNTIL_STOP) |
4435 | mddev->hold_active = 0; | 4773 | mddev->hold_active = 0; |
4436 | 4774 | ||
4437 | } else if (mddev->pers) | 4775 | } |
4438 | printk(KERN_INFO "md: %s switched to read-only mode.\n", | ||
4439 | mdname(mddev)); | ||
4440 | err = 0; | 4776 | err = 0; |
4441 | blk_integrity_unregister(disk); | 4777 | blk_integrity_unregister(disk); |
4442 | md_new_event(mddev); | 4778 | md_new_event(mddev); |
@@ -4615,7 +4951,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4615 | info.state = 0; | 4951 | info.state = 0; |
4616 | if (mddev->in_sync) | 4952 | if (mddev->in_sync) |
4617 | info.state = (1<<MD_SB_CLEAN); | 4953 | info.state = (1<<MD_SB_CLEAN); |
4618 | if (mddev->bitmap && mddev->bitmap_offset) | 4954 | if (mddev->bitmap && mddev->bitmap_info.offset) |
4619 | info.state = (1<<MD_SB_BITMAP_PRESENT); | 4955 | info.state = (1<<MD_SB_BITMAP_PRESENT); |
4620 | info.active_disks = insync; | 4956 | info.active_disks = insync; |
4621 | info.working_disks = working; | 4957 | info.working_disks = working; |
@@ -4973,23 +5309,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd) | |||
4973 | if (fd >= 0) { | 5309 | if (fd >= 0) { |
4974 | if (mddev->bitmap) | 5310 | if (mddev->bitmap) |
4975 | return -EEXIST; /* cannot add when bitmap is present */ | 5311 | return -EEXIST; /* cannot add when bitmap is present */ |
4976 | mddev->bitmap_file = fget(fd); | 5312 | mddev->bitmap_info.file = fget(fd); |
4977 | 5313 | ||
4978 | if (mddev->bitmap_file == NULL) { | 5314 | if (mddev->bitmap_info.file == NULL) { |
4979 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | 5315 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", |
4980 | mdname(mddev)); | 5316 | mdname(mddev)); |
4981 | return -EBADF; | 5317 | return -EBADF; |
4982 | } | 5318 | } |
4983 | 5319 | ||
4984 | err = deny_bitmap_write_access(mddev->bitmap_file); | 5320 | err = deny_bitmap_write_access(mddev->bitmap_info.file); |
4985 | if (err) { | 5321 | if (err) { |
4986 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | 5322 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", |
4987 | mdname(mddev)); | 5323 | mdname(mddev)); |
4988 | fput(mddev->bitmap_file); | 5324 | fput(mddev->bitmap_info.file); |
4989 | mddev->bitmap_file = NULL; | 5325 | mddev->bitmap_info.file = NULL; |
4990 | return err; | 5326 | return err; |
4991 | } | 5327 | } |
4992 | mddev->bitmap_offset = 0; /* file overrides offset */ | 5328 | mddev->bitmap_info.offset = 0; /* file overrides offset */ |
4993 | } else if (mddev->bitmap == NULL) | 5329 | } else if (mddev->bitmap == NULL) |
4994 | return -ENOENT; /* cannot remove what isn't there */ | 5330 | return -ENOENT; /* cannot remove what isn't there */ |
4995 | err = 0; | 5331 | err = 0; |
@@ -5004,11 +5340,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) | |||
5004 | mddev->pers->quiesce(mddev, 0); | 5340 | mddev->pers->quiesce(mddev, 0); |
5005 | } | 5341 | } |
5006 | if (fd < 0) { | 5342 | if (fd < 0) { |
5007 | if (mddev->bitmap_file) { | 5343 | if (mddev->bitmap_info.file) { |
5008 | restore_bitmap_write_access(mddev->bitmap_file); | 5344 | restore_bitmap_write_access(mddev->bitmap_info.file); |
5009 | fput(mddev->bitmap_file); | 5345 | fput(mddev->bitmap_info.file); |
5010 | } | 5346 | } |
5011 | mddev->bitmap_file = NULL; | 5347 | mddev->bitmap_info.file = NULL; |
5012 | } | 5348 | } |
5013 | 5349 | ||
5014 | return err; | 5350 | return err; |
@@ -5045,6 +5381,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5045 | mddev->minor_version = info->minor_version; | 5381 | mddev->minor_version = info->minor_version; |
5046 | mddev->patch_version = info->patch_version; | 5382 | mddev->patch_version = info->patch_version; |
5047 | mddev->persistent = !info->not_persistent; | 5383 | mddev->persistent = !info->not_persistent; |
5384 | /* ensure mddev_put doesn't delete this now that there | ||
5385 | * is some minimal configuration. | ||
5386 | */ | ||
5387 | mddev->ctime = get_seconds(); | ||
5048 | return 0; | 5388 | return 0; |
5049 | } | 5389 | } |
5050 | mddev->major_version = MD_MAJOR_VERSION; | 5390 | mddev->major_version = MD_MAJOR_VERSION; |
@@ -5075,8 +5415,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5075 | mddev->flags = 0; | 5415 | mddev->flags = 0; |
5076 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5416 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
5077 | 5417 | ||
5078 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | 5418 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
5079 | mddev->bitmap_offset = 0; | 5419 | mddev->bitmap_info.offset = 0; |
5080 | 5420 | ||
5081 | mddev->reshape_position = MaxSector; | 5421 | mddev->reshape_position = MaxSector; |
5082 | 5422 | ||
@@ -5150,7 +5490,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) | |||
5150 | if (mddev->pers->check_reshape == NULL) | 5490 | if (mddev->pers->check_reshape == NULL) |
5151 | return -EINVAL; | 5491 | return -EINVAL; |
5152 | if (raid_disks <= 0 || | 5492 | if (raid_disks <= 0 || |
5153 | raid_disks >= mddev->max_disks) | 5493 | (mddev->max_disks && raid_disks >= mddev->max_disks)) |
5154 | return -EINVAL; | 5494 | return -EINVAL; |
5155 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) | 5495 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) |
5156 | return -EBUSY; | 5496 | return -EBUSY; |
@@ -5176,7 +5516,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5176 | int state = 0; | 5516 | int state = 0; |
5177 | 5517 | ||
5178 | /* calculate expected state,ignoring low bits */ | 5518 | /* calculate expected state,ignoring low bits */ |
5179 | if (mddev->bitmap && mddev->bitmap_offset) | 5519 | if (mddev->bitmap && mddev->bitmap_info.offset) |
5180 | state |= (1 << MD_SB_BITMAP_PRESENT); | 5520 | state |= (1 << MD_SB_BITMAP_PRESENT); |
5181 | 5521 | ||
5182 | if (mddev->major_version != info->major_version || | 5522 | if (mddev->major_version != info->major_version || |
@@ -5235,9 +5575,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5235 | /* add the bitmap */ | 5575 | /* add the bitmap */ |
5236 | if (mddev->bitmap) | 5576 | if (mddev->bitmap) |
5237 | return -EEXIST; | 5577 | return -EEXIST; |
5238 | if (mddev->default_bitmap_offset == 0) | 5578 | if (mddev->bitmap_info.default_offset == 0) |
5239 | return -EINVAL; | 5579 | return -EINVAL; |
5240 | mddev->bitmap_offset = mddev->default_bitmap_offset; | 5580 | mddev->bitmap_info.offset = |
5581 | mddev->bitmap_info.default_offset; | ||
5241 | mddev->pers->quiesce(mddev, 1); | 5582 | mddev->pers->quiesce(mddev, 1); |
5242 | rv = bitmap_create(mddev); | 5583 | rv = bitmap_create(mddev); |
5243 | if (rv) | 5584 | if (rv) |
@@ -5252,7 +5593,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5252 | mddev->pers->quiesce(mddev, 1); | 5593 | mddev->pers->quiesce(mddev, 1); |
5253 | bitmap_destroy(mddev); | 5594 | bitmap_destroy(mddev); |
5254 | mddev->pers->quiesce(mddev, 0); | 5595 | mddev->pers->quiesce(mddev, 0); |
5255 | mddev->bitmap_offset = 0; | 5596 | mddev->bitmap_info.offset = 0; |
5256 | } | 5597 | } |
5257 | } | 5598 | } |
5258 | md_update_sb(mddev, 1); | 5599 | md_update_sb(mddev, 1); |
@@ -5286,7 +5627,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |||
5286 | 5627 | ||
5287 | geo->heads = 2; | 5628 | geo->heads = 2; |
5288 | geo->sectors = 4; | 5629 | geo->sectors = 4; |
5289 | geo->cylinders = get_capacity(mddev->gendisk) / 8; | 5630 | geo->cylinders = mddev->array_sectors / 8; |
5290 | return 0; | 5631 | return 0; |
5291 | } | 5632 | } |
5292 | 5633 | ||
@@ -5296,6 +5637,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
5296 | int err = 0; | 5637 | int err = 0; |
5297 | void __user *argp = (void __user *)arg; | 5638 | void __user *argp = (void __user *)arg; |
5298 | mddev_t *mddev = NULL; | 5639 | mddev_t *mddev = NULL; |
5640 | int ro; | ||
5299 | 5641 | ||
5300 | if (!capable(CAP_SYS_ADMIN)) | 5642 | if (!capable(CAP_SYS_ADMIN)) |
5301 | return -EACCES; | 5643 | return -EACCES; |
@@ -5428,9 +5770,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
5428 | goto done_unlock; | 5770 | goto done_unlock; |
5429 | 5771 | ||
5430 | case STOP_ARRAY_RO: | 5772 | case STOP_ARRAY_RO: |
5431 | err = do_md_stop(mddev, 1, 1); | 5773 | err = md_set_readonly(mddev, 1); |
5432 | goto done_unlock; | 5774 | goto done_unlock; |
5433 | 5775 | ||
5776 | case BLKROSET: | ||
5777 | if (get_user(ro, (int __user *)(arg))) { | ||
5778 | err = -EFAULT; | ||
5779 | goto done_unlock; | ||
5780 | } | ||
5781 | err = -EINVAL; | ||
5782 | |||
5783 | /* if the bdev is going readonly the value of mddev->ro | ||
5784 | * does not matter, no writes are coming | ||
5785 | */ | ||
5786 | if (ro) | ||
5787 | goto done_unlock; | ||
5788 | |||
5789 | /* are we already prepared for writes? */ | ||
5790 | if (mddev->ro != 1) | ||
5791 | goto done_unlock; | ||
5792 | |||
5793 | /* transitioning to readauto need only happen for | ||
5794 | * arrays that call md_write_start | ||
5795 | */ | ||
5796 | if (mddev->pers) { | ||
5797 | err = restart_array(mddev); | ||
5798 | if (err == 0) { | ||
5799 | mddev->ro = 2; | ||
5800 | set_disk_ro(mddev->gendisk, 0); | ||
5801 | } | ||
5802 | } | ||
5803 | goto done_unlock; | ||
5434 | } | 5804 | } |
5435 | 5805 | ||
5436 | /* | 5806 | /* |
@@ -5503,6 +5873,25 @@ done: | |||
5503 | abort: | 5873 | abort: |
5504 | return err; | 5874 | return err; |
5505 | } | 5875 | } |
5876 | #ifdef CONFIG_COMPAT | ||
5877 | static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, | ||
5878 | unsigned int cmd, unsigned long arg) | ||
5879 | { | ||
5880 | switch (cmd) { | ||
5881 | case HOT_REMOVE_DISK: | ||
5882 | case HOT_ADD_DISK: | ||
5883 | case SET_DISK_FAULTY: | ||
5884 | case SET_BITMAP_FILE: | ||
5885 | /* These take in integer arg, do not convert */ | ||
5886 | break; | ||
5887 | default: | ||
5888 | arg = (unsigned long)compat_ptr(arg); | ||
5889 | break; | ||
5890 | } | ||
5891 | |||
5892 | return md_ioctl(bdev, mode, cmd, arg); | ||
5893 | } | ||
5894 | #endif /* CONFIG_COMPAT */ | ||
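
The compat handler above distinguishes ioctls whose argument is a plain integer from those that pass a pointer needing compat_ptr() translation for 32-bit callers. A hypothetical user-space caller makes the difference concrete (device node, bitmap file path and error handling are illustrative assumptions only):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

int main(void)
{
	int md = open("/dev/md0", O_RDONLY);		/* assumed device node */
	mdu_array_info_t info;

	if (md < 0) {
		perror("open /dev/md0");
		return 1;
	}
	/* GET_ARRAY_INFO passes a pointer: a 32-bit caller's pointer must be
	 * converted with compat_ptr() in the kernel before use.
	 */
	if (ioctl(md, GET_ARRAY_INFO, &info) == 0)
		printf("level %d, raid_disks %d\n", info.level, info.raid_disks);

	/* SET_BITMAP_FILE passes a plain file descriptor, so md_compat_ioctl()
	 * must leave the value untouched.
	 */
	int bitmap_fd = open("/tmp/md0-bitmap", O_RDWR);	/* assumed path */
	if (bitmap_fd >= 0)
		ioctl(md, SET_BITMAP_FILE, bitmap_fd);
	return 0;
}
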
5506 | 5895 | ||
5507 | static int md_open(struct block_device *bdev, fmode_t mode) | 5896 | static int md_open(struct block_device *bdev, fmode_t mode) |
5508 | { | 5897 | { |
@@ -5532,7 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5532 | atomic_inc(&mddev->openers); | 5921 | atomic_inc(&mddev->openers); |
5533 | mutex_unlock(&mddev->open_mutex); | 5922 | mutex_unlock(&mddev->open_mutex); |
5534 | 5923 | ||
5535 | check_disk_change(bdev); | 5924 | check_disk_size_change(mddev->gendisk, bdev); |
5536 | out: | 5925 | out: |
5537 | return err; | 5926 | return err; |
5538 | } | 5927 | } |
@@ -5547,30 +5936,16 @@ static int md_release(struct gendisk *disk, fmode_t mode) | |||
5547 | 5936 | ||
5548 | return 0; | 5937 | return 0; |
5549 | } | 5938 | } |
5550 | |||
5551 | static int md_media_changed(struct gendisk *disk) | ||
5552 | { | ||
5553 | mddev_t *mddev = disk->private_data; | ||
5554 | |||
5555 | return mddev->changed; | ||
5556 | } | ||
5557 | |||
5558 | static int md_revalidate(struct gendisk *disk) | ||
5559 | { | ||
5560 | mddev_t *mddev = disk->private_data; | ||
5561 | |||
5562 | mddev->changed = 0; | ||
5563 | return 0; | ||
5564 | } | ||
5565 | static const struct block_device_operations md_fops = | 5939 | static const struct block_device_operations md_fops = |
5566 | { | 5940 | { |
5567 | .owner = THIS_MODULE, | 5941 | .owner = THIS_MODULE, |
5568 | .open = md_open, | 5942 | .open = md_open, |
5569 | .release = md_release, | 5943 | .release = md_release, |
5570 | .ioctl = md_ioctl, | 5944 | .ioctl = md_ioctl, |
5945 | #ifdef CONFIG_COMPAT | ||
5946 | .compat_ioctl = md_compat_ioctl, | ||
5947 | #endif | ||
5571 | .getgeo = md_getgeo, | 5948 | .getgeo = md_getgeo, |
5572 | .media_changed = md_media_changed, | ||
5573 | .revalidate_disk= md_revalidate, | ||
5574 | }; | 5949 | }; |
5575 | 5950 | ||
5576 | static int md_thread(void * arg) | 5951 | static int md_thread(void * arg) |
@@ -5684,7 +6059,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5684 | mddev->pers->error_handler(mddev,rdev); | 6059 | mddev->pers->error_handler(mddev,rdev); |
5685 | if (mddev->degraded) | 6060 | if (mddev->degraded) |
5686 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 6061 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
5687 | set_bit(StateChanged, &rdev->flags); | 6062 | sysfs_notify_dirent(rdev->sysfs_state); |
5688 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 6063 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
5689 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6064 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5690 | md_wakeup_thread(mddev->thread); | 6065 | md_wakeup_thread(mddev->thread); |
@@ -5961,14 +6336,14 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5961 | unsigned long chunk_kb; | 6336 | unsigned long chunk_kb; |
5962 | unsigned long flags; | 6337 | unsigned long flags; |
5963 | spin_lock_irqsave(&bitmap->lock, flags); | 6338 | spin_lock_irqsave(&bitmap->lock, flags); |
5964 | chunk_kb = bitmap->chunksize >> 10; | 6339 | chunk_kb = mddev->bitmap_info.chunksize >> 10; |
5965 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | 6340 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " |
5966 | "%lu%s chunk", | 6341 | "%lu%s chunk", |
5967 | bitmap->pages - bitmap->missing_pages, | 6342 | bitmap->pages - bitmap->missing_pages, |
5968 | bitmap->pages, | 6343 | bitmap->pages, |
5969 | (bitmap->pages - bitmap->missing_pages) | 6344 | (bitmap->pages - bitmap->missing_pages) |
5970 | << (PAGE_SHIFT - 10), | 6345 | << (PAGE_SHIFT - 10), |
5971 | chunk_kb ? chunk_kb : bitmap->chunksize, | 6346 | chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, |
5972 | chunk_kb ? "KB" : "B"); | 6347 | chunk_kb ? "KB" : "B"); |
5973 | if (bitmap->file) { | 6348 | if (bitmap->file) { |
5974 | seq_printf(seq, ", file: "); | 6349 | seq_printf(seq, ", file: "); |
@@ -6254,10 +6629,11 @@ void md_do_sync(mddev_t *mddev) | |||
6254 | mddev->curr_resync = 2; | 6629 | mddev->curr_resync = 2; |
6255 | 6630 | ||
6256 | try_again: | 6631 | try_again: |
6257 | if (kthread_should_stop()) { | 6632 | if (kthread_should_stop()) |
6258 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 6633 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
6634 | |||
6635 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6259 | goto skip; | 6636 | goto skip; |
6260 | } | ||
6261 | for_each_mddev(mddev2, tmp) { | 6637 | for_each_mddev(mddev2, tmp) { |
6262 | if (mddev2 == mddev) | 6638 | if (mddev2 == mddev) |
6263 | continue; | 6639 | continue; |
@@ -6317,12 +6693,14 @@ void md_do_sync(mddev_t *mddev) | |||
6317 | /* recovery follows the physical size of devices */ | 6693 | /* recovery follows the physical size of devices */ |
6318 | max_sectors = mddev->dev_sectors; | 6694 | max_sectors = mddev->dev_sectors; |
6319 | j = MaxSector; | 6695 | j = MaxSector; |
6320 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6696 | rcu_read_lock(); |
6697 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
6321 | if (rdev->raid_disk >= 0 && | 6698 | if (rdev->raid_disk >= 0 && |
6322 | !test_bit(Faulty, &rdev->flags) && | 6699 | !test_bit(Faulty, &rdev->flags) && |
6323 | !test_bit(In_sync, &rdev->flags) && | 6700 | !test_bit(In_sync, &rdev->flags) && |
6324 | rdev->recovery_offset < j) | 6701 | rdev->recovery_offset < j) |
6325 | j = rdev->recovery_offset; | 6702 | j = rdev->recovery_offset; |
6703 | rcu_read_unlock(); | ||
6326 | } | 6704 | } |
6327 | 6705 | ||
6328 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); | 6706 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); |
@@ -6359,6 +6737,7 @@ void md_do_sync(mddev_t *mddev) | |||
6359 | desc, mdname(mddev)); | 6737 | desc, mdname(mddev)); |
6360 | mddev->curr_resync = j; | 6738 | mddev->curr_resync = j; |
6361 | } | 6739 | } |
6740 | mddev->curr_resync_completed = mddev->curr_resync; | ||
6362 | 6741 | ||
6363 | while (j < max_sectors) { | 6742 | while (j < max_sectors) { |
6364 | sector_t sectors; | 6743 | sector_t sectors; |
@@ -6491,21 +6870,30 @@ void md_do_sync(mddev_t *mddev) | |||
6491 | } else { | 6870 | } else { |
6492 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 6871 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
6493 | mddev->curr_resync = MaxSector; | 6872 | mddev->curr_resync = MaxSector; |
6494 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6873 | rcu_read_lock(); |
6874 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
6495 | if (rdev->raid_disk >= 0 && | 6875 | if (rdev->raid_disk >= 0 && |
6876 | mddev->delta_disks >= 0 && | ||
6496 | !test_bit(Faulty, &rdev->flags) && | 6877 | !test_bit(Faulty, &rdev->flags) && |
6497 | !test_bit(In_sync, &rdev->flags) && | 6878 | !test_bit(In_sync, &rdev->flags) && |
6498 | rdev->recovery_offset < mddev->curr_resync) | 6879 | rdev->recovery_offset < mddev->curr_resync) |
6499 | rdev->recovery_offset = mddev->curr_resync; | 6880 | rdev->recovery_offset = mddev->curr_resync; |
6881 | rcu_read_unlock(); | ||
6500 | } | 6882 | } |
6501 | } | 6883 | } |
6502 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6884 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
6503 | 6885 | ||
6504 | skip: | 6886 | skip: |
6887 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
6888 | /* We completed so min/max setting can be forgotten if used. */ | ||
6889 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
6890 | mddev->resync_min = 0; | ||
6891 | mddev->resync_max = MaxSector; | ||
6892 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
6893 | mddev->resync_min = mddev->curr_resync_completed; | ||
6505 | mddev->curr_resync = 0; | 6894 | mddev->curr_resync = 0; |
6506 | mddev->curr_resync_completed = 0; | 6895 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
6507 | mddev->resync_min = 0; | 6896 | mddev->curr_resync_completed = 0; |
6508 | mddev->resync_max = MaxSector; | ||
6509 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6897 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6510 | wake_up(&resync_wait); | 6898 | wake_up(&resync_wait); |
6511 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 6899 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
@@ -6568,6 +6956,7 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
6568 | nm, mdname(mddev)); | 6956 | nm, mdname(mddev)); |
6569 | spares++; | 6957 | spares++; |
6570 | md_new_event(mddev); | 6958 | md_new_event(mddev); |
6959 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
6571 | } else | 6960 | } else |
6572 | break; | 6961 | break; |
6573 | } | 6962 | } |
@@ -6603,7 +6992,7 @@ void md_check_recovery(mddev_t *mddev) | |||
6603 | 6992 | ||
6604 | 6993 | ||
6605 | if (mddev->bitmap) | 6994 | if (mddev->bitmap) |
6606 | bitmap_daemon_work(mddev->bitmap); | 6995 | bitmap_daemon_work(mddev); |
6607 | 6996 | ||
6608 | if (mddev->ro) | 6997 | if (mddev->ro) |
6609 | return; | 6998 | return; |
@@ -6663,11 +7052,6 @@ void md_check_recovery(mddev_t *mddev) | |||
6663 | if (mddev->flags) | 7052 | if (mddev->flags) |
6664 | md_update_sb(mddev, 0); | 7053 | md_update_sb(mddev, 0); |
6665 | 7054 | ||
6666 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
6667 | if (test_and_clear_bit(StateChanged, &rdev->flags)) | ||
6668 | sysfs_notify_dirent(rdev->sysfs_state); | ||
6669 | |||
6670 | |||
6671 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 7055 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
6672 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 7056 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
6673 | /* resync/recovery still happening */ | 7057 | /* resync/recovery still happening */ |
@@ -6804,7 +7188,7 @@ static int md_notify_reboot(struct notifier_block *this, | |||
6804 | * appears to still be in use. Hence | 7188 | * appears to still be in use. Hence |
6805 | * the '100'. | 7189 | * the '100'. |
6806 | */ | 7190 | */ |
6807 | do_md_stop(mddev, 1, 100); | 7191 | md_set_readonly(mddev, 100); |
6808 | mddev_unlock(mddev); | 7192 | mddev_unlock(mddev); |
6809 | } | 7193 | } |
6810 | /* | 7194 | /* |
@@ -6973,5 +7357,6 @@ EXPORT_SYMBOL(md_unregister_thread); | |||
6973 | EXPORT_SYMBOL(md_wakeup_thread); | 7357 | EXPORT_SYMBOL(md_wakeup_thread); |
6974 | EXPORT_SYMBOL(md_check_recovery); | 7358 | EXPORT_SYMBOL(md_check_recovery); |
6975 | MODULE_LICENSE("GPL"); | 7359 | MODULE_LICENSE("GPL"); |
7360 | MODULE_DESCRIPTION("MD RAID framework"); | ||
6976 | MODULE_ALIAS("md"); | 7361 | MODULE_ALIAS("md"); |
6977 | MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); | 7362 | MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index f184b69ef337..10597bfec000 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -74,13 +74,13 @@ struct mdk_rdev_s | |||
74 | #define Blocked 8 /* An error occurred on an externally | 74 | #define Blocked 8 /* An error occurred on an externally |
75 | * managed array, don't allow writes | 75 | * managed array, don't allow writes |
76 | * until it is cleared */ | 76 | * until it is cleared */ |
77 | #define StateChanged 9 /* Faulty or Blocked has changed during | ||
78 | * interrupt, so it needs to be | ||
79 | * notified by the thread */ | ||
80 | wait_queue_head_t blocked_wait; | 77 | wait_queue_head_t blocked_wait; |
81 | 78 | ||
82 | int desc_nr; /* descriptor index in the superblock */ | 79 | int desc_nr; /* descriptor index in the superblock */ |
83 | int raid_disk; /* role of device in array */ | 80 | int raid_disk; /* role of device in array */ |
81 | int new_raid_disk; /* role that the device will have in | ||
82 | * the array after a level-change completes. | ||
83 | */ | ||
84 | int saved_raid_disk; /* role that device used to have in the | 84 | int saved_raid_disk; /* role that device used to have in the |
85 | * array and could again if we did a partial | 85 | * array and could again if we did a partial |
86 | * resync from the bitmap | 86 | * resync from the bitmap |
@@ -97,6 +97,9 @@ struct mdk_rdev_s | |||
97 | atomic_t read_errors; /* number of consecutive read errors that | 97 | atomic_t read_errors; /* number of consecutive read errors that |
98 | * we have tried to ignore. | 98 | * we have tried to ignore. |
99 | */ | 99 | */ |
100 | struct timespec last_read_error; /* monotonic time since our | ||
101 | * last read error | ||
102 | */ | ||
100 | atomic_t corrected_errors; /* number of corrected read errors, | 103 | atomic_t corrected_errors; /* number of corrected read errors, |
101 | * for reporting to userspace and storing | 104 | * for reporting to userspace and storing |
102 | * in superblock. | 105 | * in superblock. |
@@ -150,6 +153,12 @@ struct mddev_s | |||
150 | int external_size; /* size managed | 153 | int external_size; /* size managed |
151 | * externally */ | 154 | * externally */ |
152 | __u64 events; | 155 | __u64 events; |
156 | /* If the last 'event' was simply a clean->dirty transition, and | ||
157 | * we didn't write it to the spares, then it is safe and simple | ||
158 | * to just decrement the event count on a dirty->clean transition. | ||
159 | * So we record that possibility here. | ||
160 | */ | ||
161 | int can_decrease_events; | ||
153 | 162 | ||
154 | char uuid[16]; | 163 | char uuid[16]; |
155 | 164 | ||
@@ -237,7 +246,6 @@ struct mddev_s | |||
237 | atomic_t active; /* general refcount */ | 246 | atomic_t active; /* general refcount */ |
238 | atomic_t openers; /* number of active opens */ | 247 | atomic_t openers; /* number of active opens */ |
239 | 248 | ||
240 | int changed; /* true if we might need to reread partition info */ | ||
241 | int degraded; /* whether md should consider | 249 | int degraded; /* whether md should consider |
242 | * adding a spare | 250 | * adding a spare |
243 | */ | 251 | */ |
@@ -276,21 +284,40 @@ struct mddev_s | |||
276 | atomic_t writes_pending; | 284 | atomic_t writes_pending; |
277 | struct request_queue *queue; /* for plugging ... */ | 285 | struct request_queue *queue; /* for plugging ... */ |
278 | 286 | ||
279 | atomic_t write_behind; /* outstanding async IO */ | ||
280 | unsigned int max_write_behind; /* 0 = sync */ | ||
281 | |||
282 | struct bitmap *bitmap; /* the bitmap for the device */ | 287 | struct bitmap *bitmap; /* the bitmap for the device */ |
283 | struct file *bitmap_file; /* the bitmap file */ | 288 | struct { |
284 | long bitmap_offset; /* offset from superblock of | 289 | struct file *file; /* the bitmap file */ |
285 | * start of bitmap. May be | 290 | loff_t offset; /* offset from superblock of |
286 | * negative, but not '0' | 291 | * start of bitmap. May be |
287 | */ | 292 | * negative, but not '0' |
288 | long default_bitmap_offset; /* this is the offset to use when | 293 | * For external metadata, offset |
289 | * hot-adding a bitmap. It should | 294 | * from start of device. |
290 | * eventually be settable by sysfs. | 295 | */ |
291 | */ | 296 | loff_t default_offset; /* this is the offset to use when |
292 | 297 | * hot-adding a bitmap. It should | |
298 | * eventually be settable by sysfs. | ||
299 | */ | ||
300 | struct mutex mutex; | ||
301 | unsigned long chunksize; | ||
302 | unsigned long daemon_sleep; /* how many seconds between updates? */ | ||
303 | unsigned long max_write_behind; /* write-behind mode */ | ||
304 | int external; | ||
305 | } bitmap_info; | ||
306 | |||
307 | atomic_t max_corr_read_errors; /* max read retries */ | ||
293 | struct list_head all_mddevs; | 308 | struct list_head all_mddevs; |
309 | |||
310 | struct attribute_group *to_remove; | ||
311 | /* Generic barrier handling. | ||
312 | * If there is a pending barrier request, all other | ||
313 | * writes are blocked while the devices are flushed. | ||
314 | * The last to finish a flush schedules a worker to | ||
315 | * submit the barrier request (without the barrier flag), | ||
316 | * then submit more flush requests. | ||
317 | */ | ||
318 | struct bio *barrier; | ||
319 | atomic_t flush_pending; | ||
320 | struct work_struct barrier_work; | ||
294 | }; | 321 | }; |
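
The comment above describes the new barrier bookkeeping: each device flush decrements flush_pending, and whichever flush finishes last schedules barrier_work to resubmit the held request. A heavily simplified user-space sketch of that "last finisher triggers the follow-up" counting pattern (this is an illustration only, not the kernel's md_barrier_request implementation, which is not shown in this hunk):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int flush_pending;

static void flush_done(int dev)
{
	printf("flush finished on device %d\n", dev);
	/* atomic_fetch_sub returns the previous value; 1 means we were last */
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		printf("last flush done: submitting held barrier request\n");
}

int main(void)
{
	int ndevs = 3;

	atomic_store(&flush_pending, ndevs);
	for (int dev = 0; dev < ndevs; dev++)
		flush_done(dev);	/* in md these complete asynchronously */
	return 0;
}
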
295 | 322 | ||
296 | 323 | ||
@@ -312,7 +339,7 @@ struct mdk_personality | |||
312 | int level; | 339 | int level; |
313 | struct list_head list; | 340 | struct list_head list; |
314 | struct module *owner; | 341 | struct module *owner; |
315 | int (*make_request)(struct request_queue *q, struct bio *bio); | 342 | int (*make_request)(mddev_t *mddev, struct bio *bio); |
316 | int (*run)(mddev_t *mddev); | 343 | int (*run)(mddev_t *mddev); |
317 | int (*stop)(mddev_t *mddev); | 344 | int (*stop)(mddev_t *mddev); |
318 | void (*status)(struct seq_file *seq, mddev_t *mddev); | 345 | void (*status)(struct seq_file *seq, mddev_t *mddev); |
@@ -353,7 +380,7 @@ struct md_sysfs_entry { | |||
353 | ssize_t (*show)(mddev_t *, char *); | 380 | ssize_t (*show)(mddev_t *, char *); |
354 | ssize_t (*store)(mddev_t *, const char *, size_t); | 381 | ssize_t (*store)(mddev_t *, const char *, size_t); |
355 | }; | 382 | }; |
356 | 383 | extern struct attribute_group md_bitmap_group; | |
357 | 384 | ||
358 | static inline char * mdname (mddev_t * mddev) | 385 | static inline char * mdname (mddev_t * mddev) |
359 | { | 386 | { |
@@ -431,6 +458,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | |||
431 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 458 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
432 | 459 | ||
433 | extern int mddev_congested(mddev_t *mddev, int bits); | 460 | extern int mddev_congested(mddev_t *mddev, int bits); |
461 | extern void md_barrier_request(mddev_t *mddev, struct bio *bio); | ||
434 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 462 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
435 | sector_t sector, int size, struct page *page); | 463 | sector_t sector, int size, struct page *page); |
436 | extern void md_super_wait(mddev_t *mddev); | 464 | extern void md_super_wait(mddev_t *mddev); |
@@ -443,6 +471,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | |||
443 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 471 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
444 | extern int md_check_no_bitmap(mddev_t *mddev); | 472 | extern int md_check_no_bitmap(mddev_t *mddev); |
445 | extern int md_integrity_register(mddev_t *mddev); | 473 | extern int md_integrity_register(mddev_t *mddev); |
446 | void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 474 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
475 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); | ||
476 | extern void restore_bitmap_write_access(struct file *file); | ||
447 | 477 | ||
448 | #endif /* _MD_MD_H */ | 478 | #endif /* _MD_MD_H */ |
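The new barrier, flush_pending and barrier_work fields added to mddev_s above implement the counting scheme the in-struct comment describes: every member-device flush takes a reference, and whichever completion drops the count to zero schedules the worker that resubmits the original request without the barrier flag. The userspace sketch below only illustrates that counting idea; flush_complete, barrier_work and ndevs are invented names, and the real md.c flow differs in detail (it holds its own reference while iterating the rdevs, which the extra initial count models here).

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int flush_pending = 1;    /* submitter keeps one reference of its own */

    static void barrier_work(void)          /* stands in for the work_struct handler */
    {
            printf("all flushes done: resubmit the request without the barrier flag\n");
    }

    static void flush_complete(void)        /* called once per member-device flush */
    {
            if (atomic_fetch_sub(&flush_pending, 1) == 1)
                    barrier_work();
    }

    int main(void)
    {
            int ndevs = 3, i;

            for (i = 0; i < ndevs; i++)
                    atomic_fetch_add(&flush_pending, 1);    /* one flush per device */
            for (i = 0; i < ndevs; i++)
                    flush_complete();                       /* completions arrive */
            flush_complete();                               /* drop the submitter's reference */
            return 0;
    }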
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ee7646f974a0..410fb60699ac 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/raid/md_u.h> | 23 | #include <linux/raid/md_u.h> |
24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
25 | #include <linux/slab.h> | ||
25 | #include "md.h" | 26 | #include "md.h" |
26 | #include "multipath.h" | 27 | #include "multipath.h" |
27 | 28 | ||
@@ -84,7 +85,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | |||
84 | static void multipath_end_request(struct bio *bio, int error) | 85 | static void multipath_end_request(struct bio *bio, int error) |
85 | { | 86 | { |
86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 87 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | 88 | struct multipath_bh *mp_bh = bio->bi_private; |
88 | multipath_conf_t *conf = mp_bh->mddev->private; | 89 | multipath_conf_t *conf = mp_bh->mddev->private; |
89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | 90 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; |
90 | 91 | ||
@@ -135,17 +136,14 @@ static void multipath_unplug(struct request_queue *q) | |||
135 | } | 136 | } |
136 | 137 | ||
137 | 138 | ||
138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) | 139 | static int multipath_make_request(mddev_t *mddev, struct bio * bio) |
139 | { | 140 | { |
140 | mddev_t *mddev = q->queuedata; | ||
141 | multipath_conf_t *conf = mddev->private; | 141 | multipath_conf_t *conf = mddev->private; |
142 | struct multipath_bh * mp_bh; | 142 | struct multipath_bh * mp_bh; |
143 | struct multipath_info *multipath; | 143 | struct multipath_info *multipath; |
144 | const int rw = bio_data_dir(bio); | ||
145 | int cpu; | ||
146 | 144 | ||
147 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 145 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
148 | bio_endio(bio, -EOPNOTSUPP); | 146 | md_barrier_request(mddev, bio); |
149 | return 0; | 147 | return 0; |
150 | } | 148 | } |
151 | 149 | ||
@@ -154,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
154 | mp_bh->master_bio = bio; | 152 | mp_bh->master_bio = bio; |
155 | mp_bh->mddev = mddev; | 153 | mp_bh->mddev = mddev; |
156 | 154 | ||
157 | cpu = part_stat_lock(); | ||
158 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
159 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
160 | bio_sectors(bio)); | ||
161 | part_stat_unlock(); | ||
162 | |||
163 | mp_bh->path = multipath_map(conf); | 155 | mp_bh->path = multipath_map(conf); |
164 | if (mp_bh->path < 0) { | 156 | if (mp_bh->path < 0) { |
165 | bio_endio(bio, -EIO); | 157 | bio_endio(bio, -EIO); |
@@ -301,14 +293,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
301 | rdev->data_offset << 9); | 293 | rdev->data_offset << 9); |
302 | 294 | ||
303 | /* as we don't honour merge_bvec_fn, we must never risk | 295 | /* as we don't honour merge_bvec_fn, we must never risk |
304 | * violating it, so limit ->max_sector to one PAGE, as | 296 | * violating it, so limit ->max_segments to one, lying |
305 | * a one page request is never in violation. | 297 | * within a single page. |
306 | * (Note: it is very unlikely that a device with | 298 | * (Note: it is very unlikely that a device with |
307 | * merge_bvec_fn will be involved in multipath.) | 299 | * merge_bvec_fn will be involved in multipath.) |
308 | */ | 300 | */ |
309 | if (q->merge_bvec_fn && | 301 | if (q->merge_bvec_fn) { |
310 | queue_max_sectors(q) > (PAGE_SIZE>>9)) | 302 | blk_queue_max_segments(mddev->queue, 1); |
311 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 303 | blk_queue_segment_boundary(mddev->queue, |
304 | PAGE_CACHE_SIZE - 1); | ||
305 | } | ||
312 | 306 | ||
313 | conf->working_disks++; | 307 | conf->working_disks++; |
314 | mddev->degraded--; | 308 | mddev->degraded--; |
@@ -476,9 +470,11 @@ static int multipath_run (mddev_t *mddev) | |||
476 | /* as we don't honour merge_bvec_fn, we must never risk | 470 | /* as we don't honour merge_bvec_fn, we must never risk |
477 | * violating it, not that we ever expect a device with | 471 | * violating it, not that we ever expect a device with |
478 | * a merge_bvec_fn to be involved in multipath */ | 472 | * a merge_bvec_fn to be involved in multipath */ |
479 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 473 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
480 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 474 | blk_queue_max_segments(mddev->queue, 1); |
481 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 475 | blk_queue_segment_boundary(mddev->queue, |
476 | PAGE_CACHE_SIZE - 1); | ||
477 | } | ||
482 | 478 | ||
483 | if (!test_bit(Faulty, &rdev->flags)) | 479 | if (!test_bit(Faulty, &rdev->flags)) |
484 | conf->working_disks++; | 480 | conf->working_disks++; |
@@ -581,6 +577,7 @@ static void __exit multipath_exit (void) | |||
581 | module_init(multipath_init); | 577 | module_init(multipath_init); |
582 | module_exit(multipath_exit); | 578 | module_exit(multipath_exit); |
583 | MODULE_LICENSE("GPL"); | 579 | MODULE_LICENSE("GPL"); |
580 | MODULE_DESCRIPTION("simple multi-path personality for MD"); | ||
584 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ | 581 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ |
585 | MODULE_ALIAS("md-multipath"); | 582 | MODULE_ALIAS("md-multipath"); |
586 | MODULE_ALIAS("md-level--4"); | 583 | MODULE_ALIAS("md-level--4"); |
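The multipath hunks above follow the md.h change to the personality hook: make_request now receives the mddev_t directly, barrier bios are handed to md_barrier_request() instead of being rejected with -EOPNOTSUPP, and the per-personality part_stat accounting disappears, which suggests the shared md entry point now does that bookkeeping once before calling the hook. A small userspace sketch of that split follows; core_make_request and multipath_like_make_request are invented stand-ins, not the real md functions.

    #include <stdio.h>

    struct bio { long sector; int sectors; int barrier; };

    struct mddev {
            const char *name;
            long ios, sectors;                               /* per-array statistics */
            int (*make_request)(struct mddev *, struct bio *);
    };

    static int multipath_like_make_request(struct mddev *m, struct bio *b)
    {
            /* the personality only maps and forwards; no stats, no q->queuedata lookup */
            printf("%s: mapped %d sectors at %ld\n", m->name, b->sectors, b->sector);
            return 0;
    }

    static int core_make_request(struct mddev *m, struct bio *b)
    {
            if (b->barrier) {                                /* barriers handled centrally */
                    printf("%s: barrier queued by the core\n", m->name);
                    return 0;
            }
            m->ios++;                                        /* accounting done once, here */
            m->sectors += b->sectors;
            return m->make_request(m, b);                    /* hand off to the personality */
    }

    int main(void)
    {
            struct mddev m = { "md0", 0, 0, multipath_like_make_request };
            struct bio b = { 1024, 8, 0 };

            core_make_request(&m, &b);
            printf("%s: ios=%ld sectors=%ld\n", m.name, m.ios, m.sectors);
            return 0;
    }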
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d3a4ce06015a..563abed5a2cb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -20,17 +20,20 @@ | |||
20 | 20 | ||
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/slab.h> | ||
23 | #include "md.h" | 24 | #include "md.h" |
24 | #include "raid0.h" | 25 | #include "raid0.h" |
26 | #include "raid5.h" | ||
25 | 27 | ||
26 | static void raid0_unplug(struct request_queue *q) | 28 | static void raid0_unplug(struct request_queue *q) |
27 | { | 29 | { |
28 | mddev_t *mddev = q->queuedata; | 30 | mddev_t *mddev = q->queuedata; |
29 | raid0_conf_t *conf = mddev->private; | 31 | raid0_conf_t *conf = mddev->private; |
30 | mdk_rdev_t **devlist = conf->devlist; | 32 | mdk_rdev_t **devlist = conf->devlist; |
33 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
31 | int i; | 34 | int i; |
32 | 35 | ||
33 | for (i=0; i<mddev->raid_disks; i++) { | 36 | for (i=0; i < raid_disks; i++) { |
34 | struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); | 37 | struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); |
35 | 38 | ||
36 | blk_unplug(r_queue); | 39 | blk_unplug(r_queue); |
@@ -42,12 +45,13 @@ static int raid0_congested(void *data, int bits) | |||
42 | mddev_t *mddev = data; | 45 | mddev_t *mddev = data; |
43 | raid0_conf_t *conf = mddev->private; | 46 | raid0_conf_t *conf = mddev->private; |
44 | mdk_rdev_t **devlist = conf->devlist; | 47 | mdk_rdev_t **devlist = conf->devlist; |
48 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
45 | int i, ret = 0; | 49 | int i, ret = 0; |
46 | 50 | ||
47 | if (mddev_congested(mddev, bits)) | 51 | if (mddev_congested(mddev, bits)) |
48 | return 1; | 52 | return 1; |
49 | 53 | ||
50 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 54 | for (i = 0; i < raid_disks && !ret ; i++) { |
51 | struct request_queue *q = bdev_get_queue(devlist[i]->bdev); | 55 | struct request_queue *q = bdev_get_queue(devlist[i]->bdev); |
52 | 56 | ||
53 | ret |= bdi_congested(&q->backing_dev_info, bits); | 57 | ret |= bdi_congested(&q->backing_dev_info, bits); |
@@ -65,16 +69,17 @@ static void dump_zones(mddev_t *mddev) | |||
65 | sector_t zone_start = 0; | 69 | sector_t zone_start = 0; |
66 | char b[BDEVNAME_SIZE]; | 70 | char b[BDEVNAME_SIZE]; |
67 | raid0_conf_t *conf = mddev->private; | 71 | raid0_conf_t *conf = mddev->private; |
72 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
68 | printk(KERN_INFO "******* %s configuration *********\n", | 73 | printk(KERN_INFO "******* %s configuration *********\n", |
69 | mdname(mddev)); | 74 | mdname(mddev)); |
70 | h = 0; | 75 | h = 0; |
71 | for (j = 0; j < conf->nr_strip_zones; j++) { | 76 | for (j = 0; j < conf->nr_strip_zones; j++) { |
72 | printk(KERN_INFO "zone%d=[", j); | 77 | printk(KERN_INFO "zone%d=[", j); |
73 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 78 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
74 | printk("%s/", | 79 | printk(KERN_CONT "%s/", |
75 | bdevname(conf->devlist[j*mddev->raid_disks | 80 | bdevname(conf->devlist[j*raid_disks |
76 | + k]->bdev, b)); | 81 | + k]->bdev, b)); |
77 | printk("]\n"); | 82 | printk(KERN_CONT "]\n"); |
78 | 83 | ||
79 | zone_size = conf->strip_zone[j].zone_end - zone_start; | 84 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
80 | printk(KERN_INFO " zone offset=%llukb " | 85 | printk(KERN_INFO " zone offset=%llukb " |
@@ -87,7 +92,7 @@ static void dump_zones(mddev_t *mddev) | |||
87 | printk(KERN_INFO "**********************************\n\n"); | 92 | printk(KERN_INFO "**********************************\n\n"); |
88 | } | 93 | } |
89 | 94 | ||
90 | static int create_strip_zones(mddev_t *mddev) | 95 | static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) |
91 | { | 96 | { |
92 | int i, c, err; | 97 | int i, c, err; |
93 | sector_t curr_zone_end, sectors; | 98 | sector_t curr_zone_end, sectors; |
@@ -100,8 +105,9 @@ static int create_strip_zones(mddev_t *mddev) | |||
100 | if (!conf) | 105 | if (!conf) |
101 | return -ENOMEM; | 106 | return -ENOMEM; |
102 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 107 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
103 | printk(KERN_INFO "raid0: looking at %s\n", | 108 | printk(KERN_INFO "md/raid0:%s: looking at %s\n", |
104 | bdevname(rdev1->bdev,b)); | 109 | mdname(mddev), |
110 | bdevname(rdev1->bdev, b)); | ||
105 | c = 0; | 111 | c = 0; |
106 | 112 | ||
107 | /* round size to chunk_size */ | 113 | /* round size to chunk_size */ |
@@ -110,14 +116,16 @@ static int create_strip_zones(mddev_t *mddev) | |||
110 | rdev1->sectors = sectors * mddev->chunk_sectors; | 116 | rdev1->sectors = sectors * mddev->chunk_sectors; |
111 | 117 | ||
112 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 118 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
113 | printk(KERN_INFO "raid0: comparing %s(%llu)", | 119 | printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)", |
120 | mdname(mddev), | ||
114 | bdevname(rdev1->bdev,b), | 121 | bdevname(rdev1->bdev,b), |
115 | (unsigned long long)rdev1->sectors); | 122 | (unsigned long long)rdev1->sectors); |
116 | printk(KERN_INFO " with %s(%llu)\n", | 123 | printk(KERN_CONT " with %s(%llu)\n", |
117 | bdevname(rdev2->bdev,b), | 124 | bdevname(rdev2->bdev,b), |
118 | (unsigned long long)rdev2->sectors); | 125 | (unsigned long long)rdev2->sectors); |
119 | if (rdev2 == rdev1) { | 126 | if (rdev2 == rdev1) { |
120 | printk(KERN_INFO "raid0: END\n"); | 127 | printk(KERN_INFO "md/raid0:%s: END\n", |
128 | mdname(mddev)); | ||
121 | break; | 129 | break; |
122 | } | 130 | } |
123 | if (rdev2->sectors == rdev1->sectors) { | 131 | if (rdev2->sectors == rdev1->sectors) { |
@@ -125,20 +133,24 @@ static int create_strip_zones(mddev_t *mddev) | |||
125 | * Not unique, don't count it as a new | 133 | * Not unique, don't count it as a new |
126 | * group | 134 | * group |
127 | */ | 135 | */ |
128 | printk(KERN_INFO "raid0: EQUAL\n"); | 136 | printk(KERN_INFO "md/raid0:%s: EQUAL\n", |
137 | mdname(mddev)); | ||
129 | c = 1; | 138 | c = 1; |
130 | break; | 139 | break; |
131 | } | 140 | } |
132 | printk(KERN_INFO "raid0: NOT EQUAL\n"); | 141 | printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n", |
142 | mdname(mddev)); | ||
133 | } | 143 | } |
134 | if (!c) { | 144 | if (!c) { |
135 | printk(KERN_INFO "raid0: ==> UNIQUE\n"); | 145 | printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n", |
146 | mdname(mddev)); | ||
136 | conf->nr_strip_zones++; | 147 | conf->nr_strip_zones++; |
137 | printk(KERN_INFO "raid0: %d zones\n", | 148 | printk(KERN_INFO "md/raid0:%s: %d zones\n", |
138 | conf->nr_strip_zones); | 149 | mdname(mddev), conf->nr_strip_zones); |
139 | } | 150 | } |
140 | } | 151 | } |
141 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); | 152 | printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", |
153 | mdname(mddev), conf->nr_strip_zones); | ||
142 | err = -ENOMEM; | 154 | err = -ENOMEM; |
143 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* | 155 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* |
144 | conf->nr_strip_zones, GFP_KERNEL); | 156 | conf->nr_strip_zones, GFP_KERNEL); |
@@ -161,14 +173,20 @@ static int create_strip_zones(mddev_t *mddev) | |||
161 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 173 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
162 | int j = rdev1->raid_disk; | 174 | int j = rdev1->raid_disk; |
163 | 175 | ||
176 | if (mddev->level == 10) { | ||
177 | /* taking over a raid10-n2 array */ | ||
178 | j /= 2; | ||
179 | rdev1->new_raid_disk = j; | ||
180 | } | ||
181 | |||
164 | if (j < 0 || j >= mddev->raid_disks) { | 182 | if (j < 0 || j >= mddev->raid_disks) { |
165 | printk(KERN_ERR "raid0: bad disk number %d - " | 183 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
166 | "aborting!\n", j); | 184 | "aborting!\n", mdname(mddev), j); |
167 | goto abort; | 185 | goto abort; |
168 | } | 186 | } |
169 | if (dev[j]) { | 187 | if (dev[j]) { |
170 | printk(KERN_ERR "raid0: multiple devices for %d - " | 188 | printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " |
171 | "aborting!\n", j); | 189 | "aborting!\n", mdname(mddev), j); |
172 | goto abort; | 190 | goto abort; |
173 | } | 191 | } |
174 | dev[j] = rdev1; | 192 | dev[j] = rdev1; |
@@ -176,21 +194,22 @@ static int create_strip_zones(mddev_t *mddev) | |||
176 | disk_stack_limits(mddev->gendisk, rdev1->bdev, | 194 | disk_stack_limits(mddev->gendisk, rdev1->bdev, |
177 | rdev1->data_offset << 9); | 195 | rdev1->data_offset << 9); |
178 | /* as we don't honour merge_bvec_fn, we must never risk | 196 | /* as we don't honour merge_bvec_fn, we must never risk |
179 | * violating it, so limit ->max_sector to one PAGE, as | 197 | * violating it, so limit ->max_segments to 1, lying within |
180 | * a one page request is never in violation. | 198 | * a single page. |
181 | */ | 199 | */ |
182 | 200 | ||
183 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | 201 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { |
184 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 202 | blk_queue_max_segments(mddev->queue, 1); |
185 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 203 | blk_queue_segment_boundary(mddev->queue, |
186 | 204 | PAGE_CACHE_SIZE - 1); | |
205 | } | ||
187 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 206 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
188 | smallest = rdev1; | 207 | smallest = rdev1; |
189 | cnt++; | 208 | cnt++; |
190 | } | 209 | } |
191 | if (cnt != mddev->raid_disks) { | 210 | if (cnt != mddev->raid_disks) { |
192 | printk(KERN_ERR "raid0: too few disks (%d of %d) - " | 211 | printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " |
193 | "aborting!\n", cnt, mddev->raid_disks); | 212 | "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); |
194 | goto abort; | 213 | goto abort; |
195 | } | 214 | } |
196 | zone->nb_dev = cnt; | 215 | zone->nb_dev = cnt; |
@@ -206,39 +225,44 @@ static int create_strip_zones(mddev_t *mddev) | |||
206 | zone = conf->strip_zone + i; | 225 | zone = conf->strip_zone + i; |
207 | dev = conf->devlist + i * mddev->raid_disks; | 226 | dev = conf->devlist + i * mddev->raid_disks; |
208 | 227 | ||
209 | printk(KERN_INFO "raid0: zone %d\n", i); | 228 | printk(KERN_INFO "md/raid0:%s: zone %d\n", |
229 | mdname(mddev), i); | ||
210 | zone->dev_start = smallest->sectors; | 230 | zone->dev_start = smallest->sectors; |
211 | smallest = NULL; | 231 | smallest = NULL; |
212 | c = 0; | 232 | c = 0; |
213 | 233 | ||
214 | for (j=0; j<cnt; j++) { | 234 | for (j=0; j<cnt; j++) { |
215 | rdev = conf->devlist[j]; | 235 | rdev = conf->devlist[j]; |
216 | printk(KERN_INFO "raid0: checking %s ...", | 236 | printk(KERN_INFO "md/raid0:%s: checking %s ...", |
217 | bdevname(rdev->bdev, b)); | 237 | mdname(mddev), |
238 | bdevname(rdev->bdev, b)); | ||
218 | if (rdev->sectors <= zone->dev_start) { | 239 | if (rdev->sectors <= zone->dev_start) { |
219 | printk(KERN_INFO " nope.\n"); | 240 | printk(KERN_CONT " nope.\n"); |
220 | continue; | 241 | continue; |
221 | } | 242 | } |
222 | printk(KERN_INFO " contained as device %d\n", c); | 243 | printk(KERN_CONT " contained as device %d\n", c); |
223 | dev[c] = rdev; | 244 | dev[c] = rdev; |
224 | c++; | 245 | c++; |
225 | if (!smallest || rdev->sectors < smallest->sectors) { | 246 | if (!smallest || rdev->sectors < smallest->sectors) { |
226 | smallest = rdev; | 247 | smallest = rdev; |
227 | printk(KERN_INFO " (%llu) is smallest!.\n", | 248 | printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!\n", |
228 | (unsigned long long)rdev->sectors); | 249 | mdname(mddev), |
250 | (unsigned long long)rdev->sectors); | ||
229 | } | 251 | } |
230 | } | 252 | } |
231 | 253 | ||
232 | zone->nb_dev = c; | 254 | zone->nb_dev = c; |
233 | sectors = (smallest->sectors - zone->dev_start) * c; | 255 | sectors = (smallest->sectors - zone->dev_start) * c; |
234 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", | 256 | printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", |
235 | zone->nb_dev, (unsigned long long)sectors); | 257 | mdname(mddev), |
258 | zone->nb_dev, (unsigned long long)sectors); | ||
236 | 259 | ||
237 | curr_zone_end += sectors; | 260 | curr_zone_end += sectors; |
238 | zone->zone_end = curr_zone_end; | 261 | zone->zone_end = curr_zone_end; |
239 | 262 | ||
240 | printk(KERN_INFO "raid0: current zone start: %llu\n", | 263 | printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", |
241 | (unsigned long long)smallest->sectors); | 264 | mdname(mddev), |
265 | (unsigned long long)smallest->sectors); | ||
242 | } | 266 | } |
243 | mddev->queue->unplug_fn = raid0_unplug; | 267 | mddev->queue->unplug_fn = raid0_unplug; |
244 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 268 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
@@ -249,7 +273,7 @@ static int create_strip_zones(mddev_t *mddev) | |||
249 | * chunk size is a multiple of that sector size | 273 | * chunk size is a multiple of that sector size |
250 | */ | 274 | */ |
251 | if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { | 275 | if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { |
252 | printk(KERN_ERR "%s chunk_size of %d not valid\n", | 276 | printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", |
253 | mdname(mddev), | 277 | mdname(mddev), |
254 | mddev->chunk_sectors << 9); | 278 | mddev->chunk_sectors << 9); |
255 | goto abort; | 279 | goto abort; |
@@ -259,14 +283,15 @@ static int create_strip_zones(mddev_t *mddev) | |||
259 | blk_queue_io_opt(mddev->queue, | 283 | blk_queue_io_opt(mddev->queue, |
260 | (mddev->chunk_sectors << 9) * mddev->raid_disks); | 284 | (mddev->chunk_sectors << 9) * mddev->raid_disks); |
261 | 285 | ||
262 | printk(KERN_INFO "raid0: done.\n"); | 286 | printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); |
263 | mddev->private = conf; | 287 | *private_conf = conf; |
288 | |||
264 | return 0; | 289 | return 0; |
265 | abort: | 290 | abort: |
266 | kfree(conf->strip_zone); | 291 | kfree(conf->strip_zone); |
267 | kfree(conf->devlist); | 292 | kfree(conf->devlist); |
268 | kfree(conf); | 293 | kfree(conf); |
269 | mddev->private = NULL; | 294 | *private_conf = NULL; |
270 | return err; | 295 | return err; |
271 | } | 296 | } |
272 | 297 | ||
@@ -317,26 +342,34 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
317 | 342 | ||
318 | static int raid0_run(mddev_t *mddev) | 343 | static int raid0_run(mddev_t *mddev) |
319 | { | 344 | { |
345 | raid0_conf_t *conf; | ||
320 | int ret; | 346 | int ret; |
321 | 347 | ||
322 | if (mddev->chunk_sectors == 0) { | 348 | if (mddev->chunk_sectors == 0) { |
323 | printk(KERN_ERR "md/raid0: chunk size must be set.\n"); | 349 | printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", |
350 | mdname(mddev)); | ||
324 | return -EINVAL; | 351 | return -EINVAL; |
325 | } | 352 | } |
326 | if (md_check_no_bitmap(mddev)) | 353 | if (md_check_no_bitmap(mddev)) |
327 | return -EINVAL; | 354 | return -EINVAL; |
328 | blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); | 355 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); |
329 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 356 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
330 | 357 | ||
331 | ret = create_strip_zones(mddev); | 358 | /* if private is not null, we are here after takeover */ |
332 | if (ret < 0) | 359 | if (mddev->private == NULL) { |
333 | return ret; | 360 | ret = create_strip_zones(mddev, &conf); |
361 | if (ret < 0) | ||
362 | return ret; | ||
363 | mddev->private = conf; | ||
364 | } | ||
365 | conf = mddev->private; | ||
334 | 366 | ||
335 | /* calculate array device size */ | 367 | /* calculate array device size */ |
336 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 368 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
337 | 369 | ||
338 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", | 370 | printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", |
339 | (unsigned long long)mddev->array_sectors); | 371 | mdname(mddev), |
372 | (unsigned long long)mddev->array_sectors); | ||
340 | /* calculate the max read-ahead size. | 373 | /* calculate the max read-ahead size. |
341 | * For read-ahead of large files to be effective, we need to | 374 | * For read-ahead of large files to be effective, we need to |
342 | * readahead at least twice a whole stripe. i.e. number of devices | 375 | * readahead at least twice a whole stripe. i.e. number of devices |
@@ -400,6 +433,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, | |||
400 | unsigned int sect_in_chunk; | 433 | unsigned int sect_in_chunk; |
401 | sector_t chunk; | 434 | sector_t chunk; |
402 | raid0_conf_t *conf = mddev->private; | 435 | raid0_conf_t *conf = mddev->private; |
436 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
403 | unsigned int chunk_sects = mddev->chunk_sectors; | 437 | unsigned int chunk_sects = mddev->chunk_sectors; |
404 | 438 | ||
405 | if (is_power_of_2(chunk_sects)) { | 439 | if (is_power_of_2(chunk_sects)) { |
@@ -422,7 +456,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, | |||
422 | * + the position in the chunk | 456 | * + the position in the chunk |
423 | */ | 457 | */ |
424 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | 458 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; |
425 | return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks | 459 | return conf->devlist[(zone - conf->strip_zone)*raid_disks |
426 | + sector_div(sector, zone->nb_dev)]; | 460 | + sector_div(sector, zone->nb_dev)]; |
427 | } | 461 | } |
428 | 462 | ||
@@ -442,27 +476,18 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev, | |||
442 | } | 476 | } |
443 | } | 477 | } |
444 | 478 | ||
445 | static int raid0_make_request(struct request_queue *q, struct bio *bio) | 479 | static int raid0_make_request(mddev_t *mddev, struct bio *bio) |
446 | { | 480 | { |
447 | mddev_t *mddev = q->queuedata; | ||
448 | unsigned int chunk_sects; | 481 | unsigned int chunk_sects; |
449 | sector_t sector_offset; | 482 | sector_t sector_offset; |
450 | struct strip_zone *zone; | 483 | struct strip_zone *zone; |
451 | mdk_rdev_t *tmp_dev; | 484 | mdk_rdev_t *tmp_dev; |
452 | const int rw = bio_data_dir(bio); | ||
453 | int cpu; | ||
454 | 485 | ||
455 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 486 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
456 | bio_endio(bio, -EOPNOTSUPP); | 487 | md_barrier_request(mddev, bio); |
457 | return 0; | 488 | return 0; |
458 | } | 489 | } |
459 | 490 | ||
460 | cpu = part_stat_lock(); | ||
461 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
462 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
463 | bio_sectors(bio)); | ||
464 | part_stat_unlock(); | ||
465 | |||
466 | chunk_sects = mddev->chunk_sectors; | 491 | chunk_sects = mddev->chunk_sectors; |
467 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { | 492 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { |
468 | sector_t sector = bio->bi_sector; | 493 | sector_t sector = bio->bi_sector; |
@@ -480,9 +505,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) | |||
480 | else | 505 | else |
481 | bp = bio_split(bio, chunk_sects - | 506 | bp = bio_split(bio, chunk_sects - |
482 | sector_div(sector, chunk_sects)); | 507 | sector_div(sector, chunk_sects)); |
483 | if (raid0_make_request(q, &bp->bio1)) | 508 | if (raid0_make_request(mddev, &bp->bio1)) |
484 | generic_make_request(&bp->bio1); | 509 | generic_make_request(&bp->bio1); |
485 | if (raid0_make_request(q, &bp->bio2)) | 510 | if (raid0_make_request(mddev, &bp->bio2)) |
486 | generic_make_request(&bp->bio2); | 511 | generic_make_request(&bp->bio2); |
487 | 512 | ||
488 | bio_pair_release(bp); | 513 | bio_pair_release(bp); |
@@ -502,9 +527,10 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) | |||
502 | return 1; | 527 | return 1; |
503 | 528 | ||
504 | bad_map: | 529 | bad_map: |
505 | printk("raid0_make_request bug: can't convert block across chunks" | 530 | printk("md/raid0:%s: make_request bug: can't convert block across chunks" |
506 | " or bigger than %dk %llu %d\n", chunk_sects / 2, | 531 | " or bigger than %dk %llu %d\n", |
507 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | 532 | mdname(mddev), chunk_sects / 2, |
533 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | ||
508 | 534 | ||
509 | bio_io_error(bio); | 535 | bio_io_error(bio); |
510 | return 0; | 536 | return 0; |
@@ -517,6 +543,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) | |||
517 | int j, k, h; | 543 | int j, k, h; |
518 | char b[BDEVNAME_SIZE]; | 544 | char b[BDEVNAME_SIZE]; |
519 | raid0_conf_t *conf = mddev->private; | 545 | raid0_conf_t *conf = mddev->private; |
546 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
520 | 547 | ||
521 | sector_t zone_size; | 548 | sector_t zone_size; |
522 | sector_t zone_start = 0; | 549 | sector_t zone_start = 0; |
@@ -527,7 +554,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) | |||
527 | seq_printf(seq, "=["); | 554 | seq_printf(seq, "=["); |
528 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 555 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
529 | seq_printf(seq, "%s/", bdevname( | 556 | seq_printf(seq, "%s/", bdevname( |
530 | conf->devlist[j*mddev->raid_disks + k] | 557 | conf->devlist[j*raid_disks + k] |
531 | ->bdev, b)); | 558 | ->bdev, b)); |
532 | 559 | ||
533 | zone_size = conf->strip_zone[j].zone_end - zone_start; | 560 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
@@ -542,6 +569,109 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) | |||
542 | return; | 569 | return; |
543 | } | 570 | } |
544 | 571 | ||
572 | static void *raid0_takeover_raid45(mddev_t *mddev) | ||
573 | { | ||
574 | mdk_rdev_t *rdev; | ||
575 | raid0_conf_t *priv_conf; | ||
576 | |||
577 | if (mddev->degraded != 1) { | ||
578 | printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", | ||
579 | mdname(mddev), | ||
580 | mddev->degraded); | ||
581 | return ERR_PTR(-EINVAL); | ||
582 | } | ||
583 | |||
584 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
585 | /* check slot number for a disk */ | ||
586 | if (rdev->raid_disk == mddev->raid_disks-1) { | ||
587 | printk(KERN_ERR "md/raid0:%s: raid5 must have a missing parity disk!\n", | ||
588 | mdname(mddev)); | ||
589 | return ERR_PTR(-EINVAL); | ||
590 | } | ||
591 | } | ||
592 | |||
593 | /* Set new parameters */ | ||
594 | mddev->new_level = 0; | ||
595 | mddev->new_layout = 0; | ||
596 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
597 | mddev->raid_disks--; | ||
598 | mddev->delta_disks = -1; | ||
599 | /* make sure it will be not marked as dirty */ | ||
600 | mddev->recovery_cp = MaxSector; | ||
601 | |||
602 | create_strip_zones(mddev, &priv_conf); | ||
603 | return priv_conf; | ||
604 | } | ||
605 | |||
606 | static void *raid0_takeover_raid10(mddev_t *mddev) | ||
607 | { | ||
608 | raid0_conf_t *priv_conf; | ||
609 | |||
610 | /* Check layout: | ||
611 | * - far_copies must be 1 | ||
612 | * - near_copies must be 2 | ||
613 | * - disks number must be even | ||
614 | * - all mirrors must be already degraded | ||
615 | */ | ||
616 | if (mddev->layout != ((1 << 8) + 2)) { | ||
617 | printk(KERN_ERR "md/raid0:%s: Raid0 cannot take over layout: 0x%x\n", | ||
618 | mdname(mddev), | ||
619 | mddev->layout); | ||
620 | return ERR_PTR(-EINVAL); | ||
621 | } | ||
622 | if (mddev->raid_disks & 1) { | ||
623 | printk(KERN_ERR "md/raid0:%s: Raid0 cannot take over Raid10 with an odd disk number.\n", | ||
624 | mdname(mddev)); | ||
625 | return ERR_PTR(-EINVAL); | ||
626 | } | ||
627 | if (mddev->degraded != (mddev->raid_disks>>1)) { | ||
628 | printk(KERN_ERR "md/raid0:%s: All mirrors must already be degraded!\n", | ||
629 | mdname(mddev)); | ||
630 | return ERR_PTR(-EINVAL); | ||
631 | } | ||
632 | |||
633 | /* Set new parameters */ | ||
634 | mddev->new_level = 0; | ||
635 | mddev->new_layout = 0; | ||
636 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
637 | mddev->delta_disks = - mddev->raid_disks / 2; | ||
638 | mddev->raid_disks += mddev->delta_disks; | ||
639 | mddev->degraded = 0; | ||
640 | /* make sure it will be not marked as dirty */ | ||
641 | mddev->recovery_cp = MaxSector; | ||
642 | |||
643 | create_strip_zones(mddev, &priv_conf); | ||
644 | return priv_conf; | ||
645 | } | ||
646 | |||
647 | static void *raid0_takeover(mddev_t *mddev) | ||
648 | { | ||
649 | /* raid0 can take over: | ||
650 | * raid4 - if all data disks are active. | ||
651 | * raid5 - providing it is Raid4 layout and one disk is faulty | ||
652 | * raid10 - assuming we have all necessary active disks | ||
653 | */ | ||
654 | if (mddev->level == 4) | ||
655 | return raid0_takeover_raid45(mddev); | ||
656 | |||
657 | if (mddev->level == 5) { | ||
658 | if (mddev->layout == ALGORITHM_PARITY_N) | ||
659 | return raid0_takeover_raid45(mddev); | ||
660 | |||
661 | printk(KERN_ERR "md/raid0:%s: Raid0 can only take over Raid5 with layout: %d\n", | ||
662 | mdname(mddev), ALGORITHM_PARITY_N); | ||
663 | } | ||
664 | |||
665 | if (mddev->level == 10) | ||
666 | return raid0_takeover_raid10(mddev); | ||
667 | |||
668 | return ERR_PTR(-EINVAL); | ||
669 | } | ||
670 | |||
671 | static void raid0_quiesce(mddev_t *mddev, int state) | ||
672 | { | ||
673 | } | ||
674 | |||
545 | static struct mdk_personality raid0_personality= | 675 | static struct mdk_personality raid0_personality= |
546 | { | 676 | { |
547 | .name = "raid0", | 677 | .name = "raid0", |
@@ -552,6 +682,8 @@ static struct mdk_personality raid0_personality= | |||
552 | .stop = raid0_stop, | 682 | .stop = raid0_stop, |
553 | .status = raid0_status, | 683 | .status = raid0_status, |
554 | .size = raid0_size, | 684 | .size = raid0_size, |
685 | .takeover = raid0_takeover, | ||
686 | .quiesce = raid0_quiesce, | ||
555 | }; | 687 | }; |
556 | 688 | ||
557 | static int __init raid0_init (void) | 689 | static int __init raid0_init (void) |
@@ -567,6 +699,7 @@ static void raid0_exit (void) | |||
567 | module_init(raid0_init); | 699 | module_init(raid0_init); |
568 | module_exit(raid0_exit); | 700 | module_exit(raid0_exit); |
569 | MODULE_LICENSE("GPL"); | 701 | MODULE_LICENSE("GPL"); |
702 | MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); | ||
570 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ | 703 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ |
571 | MODULE_ALIAS("md-raid0"); | 704 | MODULE_ALIAS("md-raid0"); |
572 | MODULE_ALIAS("md-level-0"); | 705 | MODULE_ALIAS("md-level-0"); |
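raid0_takeover_raid10() above accepts only a near-2 raid10 array: the layout word must encode far_copies=1 and near_copies=2, the disk count must be even, and exactly half the disks must already be failed so one copy of every mirror remains. The standalone sketch below restates that eligibility test with plain integers; raid10_n2_can_become_raid0 is a hypothetical helper, not the kernel function.

    #include <stdbool.h>
    #include <stdio.h>

    /* layout check mirrors the hunk above: near_copies in the low byte,
     * far_copies in the next byte, so near-2/far-1 is (1 << 8) + 2 */
    static bool raid10_n2_can_become_raid0(int layout, int raid_disks, int degraded)
    {
            if (layout != ((1 << 8) + 2))
                    return false;
            if (raid_disks & 1)                  /* need an even number of disks */
                    return false;
            if (degraded != raid_disks / 2)      /* exactly one copy of each mirror must remain */
                    return false;
            return true;
    }

    int main(void)
    {
            printf("4-disk n2, 2 degraded: %s\n",
                   raid10_n2_can_become_raid0((1 << 8) + 2, 4, 2) ? "yes" : "no");
            printf("4-disk n2, 1 degraded: %s\n",
                   raid10_n2_can_become_raid0((1 << 8) + 2, 4, 1) ? "yes" : "no");
            return 0;
    }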
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d1b9bd5fd4f6..a948da8012de 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -31,6 +31,7 @@ | |||
31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
32 | */ | 32 | */ |
33 | 33 | ||
34 | #include <linux/slab.h> | ||
34 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
35 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
36 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
@@ -64,7 +65,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | |||
64 | 65 | ||
65 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | 66 | /* allocate a r1bio with room for raid_disks entries in the bios array */ |
66 | r1_bio = kzalloc(size, gfp_flags); | 67 | r1_bio = kzalloc(size, gfp_flags); |
67 | if (!r1_bio) | 68 | if (!r1_bio && pi->mddev) |
68 | unplug_slaves(pi->mddev); | 69 | unplug_slaves(pi->mddev); |
69 | 70 | ||
70 | return r1_bio; | 71 | return r1_bio; |
@@ -262,7 +263,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio) | |||
262 | static void raid1_end_read_request(struct bio *bio, int error) | 263 | static void raid1_end_read_request(struct bio *bio, int error) |
263 | { | 264 | { |
264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 265 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 266 | r1bio_t *r1_bio = bio->bi_private; |
266 | int mirror; | 267 | int mirror; |
267 | conf_t *conf = r1_bio->mddev->private; | 268 | conf_t *conf = r1_bio->mddev->private; |
268 | 269 | ||
@@ -296,7 +297,8 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
296 | */ | 297 | */ |
297 | char b[BDEVNAME_SIZE]; | 298 | char b[BDEVNAME_SIZE]; |
298 | if (printk_ratelimit()) | 299 | if (printk_ratelimit()) |
299 | printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", | 300 | printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", |
301 | mdname(conf->mddev), | ||
300 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | 302 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); |
301 | reschedule_retry(r1_bio); | 303 | reschedule_retry(r1_bio); |
302 | } | 304 | } |
@@ -307,7 +309,7 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
307 | static void raid1_end_write_request(struct bio *bio, int error) | 309 | static void raid1_end_write_request(struct bio *bio, int error) |
308 | { | 310 | { |
309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 311 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 312 | r1bio_t *r1_bio = bio->bi_private; |
311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | 313 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); |
312 | conf_t *conf = r1_bio->mddev->private; | 314 | conf_t *conf = r1_bio->mddev->private; |
313 | struct bio *to_put = NULL; | 315 | struct bio *to_put = NULL; |
@@ -417,7 +419,7 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
417 | */ | 419 | */ |
418 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 420 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) |
419 | { | 421 | { |
420 | const unsigned long this_sector = r1_bio->sector; | 422 | const sector_t this_sector = r1_bio->sector; |
421 | int new_disk = conf->last_used, disk = new_disk; | 423 | int new_disk = conf->last_used, disk = new_disk; |
422 | int wonly_disk = -1; | 424 | int wonly_disk = -1; |
423 | const int sectors = r1_bio->sectors; | 425 | const int sectors = r1_bio->sectors; |
@@ -433,7 +435,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
433 | retry: | 435 | retry: |
434 | if (conf->mddev->recovery_cp < MaxSector && | 436 | if (conf->mddev->recovery_cp < MaxSector && |
435 | (this_sector + sectors >= conf->next_resync)) { | 437 | (this_sector + sectors >= conf->next_resync)) { |
436 | /* Choose the first operation device, for consistancy */ | 438 | /* Choose the first operational device, for consistency */ |
437 | new_disk = 0; | 439 | new_disk = 0; |
438 | 440 | ||
439 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 441 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); |
@@ -677,6 +679,7 @@ static void raise_barrier(conf_t *conf) | |||
677 | static void lower_barrier(conf_t *conf) | 679 | static void lower_barrier(conf_t *conf) |
678 | { | 680 | { |
679 | unsigned long flags; | 681 | unsigned long flags; |
682 | BUG_ON(conf->barrier <= 0); | ||
680 | spin_lock_irqsave(&conf->resync_lock, flags); | 683 | spin_lock_irqsave(&conf->resync_lock, flags); |
681 | conf->barrier--; | 684 | conf->barrier--; |
682 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 685 | spin_unlock_irqrestore(&conf->resync_lock, flags); |
@@ -772,9 +775,8 @@ do_sync_io: | |||
772 | return NULL; | 775 | return NULL; |
773 | } | 776 | } |
774 | 777 | ||
775 | static int make_request(struct request_queue *q, struct bio * bio) | 778 | static int make_request(mddev_t *mddev, struct bio * bio) |
776 | { | 779 | { |
777 | mddev_t *mddev = q->queuedata; | ||
778 | conf_t *conf = mddev->private; | 780 | conf_t *conf = mddev->private; |
779 | mirror_info_t *mirror; | 781 | mirror_info_t *mirror; |
780 | r1bio_t *r1_bio; | 782 | r1bio_t *r1_bio; |
@@ -786,7 +788,6 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
786 | struct page **behind_pages = NULL; | 788 | struct page **behind_pages = NULL; |
787 | const int rw = bio_data_dir(bio); | 789 | const int rw = bio_data_dir(bio); |
788 | const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); | 790 | const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); |
789 | int cpu; | ||
790 | bool do_barriers; | 791 | bool do_barriers; |
791 | mdk_rdev_t *blocked_rdev; | 792 | mdk_rdev_t *blocked_rdev; |
792 | 793 | ||
@@ -801,6 +802,25 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
801 | 802 | ||
802 | md_write_start(mddev, bio); /* wait on superblock update early */ | 803 | md_write_start(mddev, bio); /* wait on superblock update early */ |
803 | 804 | ||
805 | if (bio_data_dir(bio) == WRITE && | ||
806 | bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && | ||
807 | bio->bi_sector < mddev->suspend_hi) { | ||
808 | /* As the suspend_* range is controlled by | ||
809 | * userspace, we want an interruptible | ||
810 | * wait. | ||
811 | */ | ||
812 | DEFINE_WAIT(w); | ||
813 | for (;;) { | ||
814 | flush_signals(current); | ||
815 | prepare_to_wait(&conf->wait_barrier, | ||
816 | &w, TASK_INTERRUPTIBLE); | ||
817 | if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || | ||
818 | bio->bi_sector >= mddev->suspend_hi) | ||
819 | break; | ||
820 | schedule(); | ||
821 | } | ||
822 | finish_wait(&conf->wait_barrier, &w); | ||
823 | } | ||
804 | if (unlikely(!mddev->barriers_work && | 824 | if (unlikely(!mddev->barriers_work && |
805 | bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 825 | bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
806 | if (rw == WRITE) | 826 | if (rw == WRITE) |
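The suspend_lo/suspend_hi wait added above holds a write back only while its sector range overlaps the userspace-controlled suspended window, re-checking the condition after every wakeup on wait_barrier. The check itself is a plain interval-intersection test; the sketch below shows it with a hypothetical helper name, using sectors in place of bi_size/512.

    #include <stdbool.h>
    #include <stdio.h>

    /* true while [sector, sector + sectors) intersects [suspend_lo, suspend_hi) */
    static bool write_overlaps_suspended(long long sector, long long sectors,
                                         long long suspend_lo, long long suspend_hi)
    {
            return sector + sectors > suspend_lo && sector < suspend_hi;
    }

    int main(void)
    {
            /* suspended window is [1000, 2000) */
            printf("%d\n", write_overlaps_suspended(900, 50, 1000, 2000));   /* 0: ends below lo */
            printf("%d\n", write_overlaps_suspended(990, 20, 1000, 2000));   /* 1: crosses lo */
            printf("%d\n", write_overlaps_suspended(2000, 8, 1000, 2000));   /* 0: starts at hi */
            return 0;
    }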
@@ -813,12 +833,6 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
813 | 833 | ||
814 | bitmap = mddev->bitmap; | 834 | bitmap = mddev->bitmap; |
815 | 835 | ||
816 | cpu = part_stat_lock(); | ||
817 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
818 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
819 | bio_sectors(bio)); | ||
820 | part_stat_unlock(); | ||
821 | |||
822 | /* | 836 | /* |
823 | * make_request() can abort the operation when READA is being | 837 | * make_request() can abort the operation when READA is being |
824 | * used and no empty request is available. | 838 | * used and no empty request is available. |
@@ -845,6 +859,15 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
845 | } | 859 | } |
846 | mirror = conf->mirrors + rdisk; | 860 | mirror = conf->mirrors + rdisk; |
847 | 861 | ||
862 | if (test_bit(WriteMostly, &mirror->rdev->flags) && | ||
863 | bitmap) { | ||
864 | /* Reading from a write-mostly device must | ||
865 | * take care not to over-take any writes | ||
866 | * that are 'behind' | ||
867 | */ | ||
868 | wait_event(bitmap->behind_wait, | ||
869 | atomic_read(&bitmap->behind_writes) == 0); | ||
870 | } | ||
848 | r1_bio->read_disk = rdisk; | 871 | r1_bio->read_disk = rdisk; |
849 | 872 | ||
850 | read_bio = bio_clone(bio, GFP_NOIO); | 873 | read_bio = bio_clone(bio, GFP_NOIO); |
@@ -891,9 +914,10 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
891 | if (test_bit(Faulty, &rdev->flags)) { | 914 | if (test_bit(Faulty, &rdev->flags)) { |
892 | rdev_dec_pending(rdev, mddev); | 915 | rdev_dec_pending(rdev, mddev); |
893 | r1_bio->bios[i] = NULL; | 916 | r1_bio->bios[i] = NULL; |
894 | } else | 917 | } else { |
895 | r1_bio->bios[i] = bio; | 918 | r1_bio->bios[i] = bio; |
896 | targets++; | 919 | targets++; |
920 | } | ||
897 | } else | 921 | } else |
898 | r1_bio->bios[i] = NULL; | 922 | r1_bio->bios[i] = NULL; |
899 | } | 923 | } |
@@ -921,9 +945,14 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
921 | set_bit(R1BIO_Degraded, &r1_bio->state); | 945 | set_bit(R1BIO_Degraded, &r1_bio->state); |
922 | } | 946 | } |
923 | 947 | ||
924 | /* do behind I/O ? */ | 948 | /* do behind I/O ? |
949 | * Not if there are too many, or cannot allocate memory, | ||
950 | * or a reader on WriteMostly is waiting for behind writes | ||
951 | * to flush */ | ||
925 | if (bitmap && | 952 | if (bitmap && |
926 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | 953 | (atomic_read(&bitmap->behind_writes) |
954 | < mddev->bitmap_info.max_write_behind) && | ||
955 | !waitqueue_active(&bitmap->behind_wait) && | ||
927 | (behind_pages = alloc_behind_pages(bio)) != NULL) | 956 | (behind_pages = alloc_behind_pages(bio)) != NULL) |
928 | set_bit(R1BIO_BehindIO, &r1_bio->state); | 957 | set_bit(R1BIO_BehindIO, &r1_bio->state); |
929 | 958 | ||
@@ -1048,21 +1077,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1048 | } else | 1077 | } else |
1049 | set_bit(Faulty, &rdev->flags); | 1078 | set_bit(Faulty, &rdev->flags); |
1050 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1079 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1051 | printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" | 1080 | printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" |
1052 | "raid1: Operation continuing on %d devices.\n", | 1081 | KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", |
1053 | bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); | 1082 | mdname(mddev), bdevname(rdev->bdev, b), |
1083 | mdname(mddev), conf->raid_disks - mddev->degraded); | ||
1054 | } | 1084 | } |
1055 | 1085 | ||
1056 | static void print_conf(conf_t *conf) | 1086 | static void print_conf(conf_t *conf) |
1057 | { | 1087 | { |
1058 | int i; | 1088 | int i; |
1059 | 1089 | ||
1060 | printk("RAID1 conf printout:\n"); | 1090 | printk(KERN_DEBUG "RAID1 conf printout:\n"); |
1061 | if (!conf) { | 1091 | if (!conf) { |
1062 | printk("(!conf)\n"); | 1092 | printk(KERN_DEBUG "(!conf)\n"); |
1063 | return; | 1093 | return; |
1064 | } | 1094 | } |
1065 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1095 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
1066 | conf->raid_disks); | 1096 | conf->raid_disks); |
1067 | 1097 | ||
1068 | rcu_read_lock(); | 1098 | rcu_read_lock(); |
@@ -1070,7 +1100,7 @@ static void print_conf(conf_t *conf) | |||
1070 | char b[BDEVNAME_SIZE]; | 1100 | char b[BDEVNAME_SIZE]; |
1071 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 1101 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
1072 | if (rdev) | 1102 | if (rdev) |
1073 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | 1103 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", |
1074 | i, !test_bit(In_sync, &rdev->flags), | 1104 | i, !test_bit(In_sync, &rdev->flags), |
1075 | !test_bit(Faulty, &rdev->flags), | 1105 | !test_bit(Faulty, &rdev->flags), |
1076 | bdevname(rdev->bdev,b)); | 1106 | bdevname(rdev->bdev,b)); |
@@ -1131,13 +1161,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1131 | 1161 | ||
1132 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1162 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1133 | rdev->data_offset << 9); | 1163 | rdev->data_offset << 9); |
1134 | /* as we don't honour merge_bvec_fn, we must never risk | 1164 | /* as we don't honour merge_bvec_fn, we must |
1135 | * violating it, so limit ->max_sector to one PAGE, as | 1165 | * never risk violating it, so limit |
1136 | * a one page request is never in violation. | 1166 | * ->max_segments to one lying within a single |
1167 | * page, as a one page request is never in | ||
1168 | * violation. | ||
1137 | */ | 1169 | */ |
1138 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1170 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
1139 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 1171 | blk_queue_max_segments(mddev->queue, 1); |
1140 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1172 | blk_queue_segment_boundary(mddev->queue, |
1173 | PAGE_CACHE_SIZE - 1); | ||
1174 | } | ||
1141 | 1175 | ||
1142 | p->head_position = 0; | 1176 | p->head_position = 0; |
1143 | rdev->raid_disk = mirror; | 1177 | rdev->raid_disk = mirror; |
@@ -1197,7 +1231,7 @@ abort: | |||
1197 | 1231 | ||
1198 | static void end_sync_read(struct bio *bio, int error) | 1232 | static void end_sync_read(struct bio *bio, int error) |
1199 | { | 1233 | { |
1200 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1234 | r1bio_t *r1_bio = bio->bi_private; |
1201 | int i; | 1235 | int i; |
1202 | 1236 | ||
1203 | for (i=r1_bio->mddev->raid_disks; i--; ) | 1237 | for (i=r1_bio->mddev->raid_disks; i--; ) |
@@ -1220,7 +1254,7 @@ static void end_sync_read(struct bio *bio, int error) | |||
1220 | static void end_sync_write(struct bio *bio, int error) | 1254 | static void end_sync_write(struct bio *bio, int error) |
1221 | { | 1255 | { |
1222 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
1223 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1257 | r1bio_t *r1_bio = bio->bi_private; |
1224 | mddev_t *mddev = r1_bio->mddev; | 1258 | mddev_t *mddev = r1_bio->mddev; |
1225 | conf_t *conf = mddev->private; | 1259 | conf_t *conf = mddev->private; |
1226 | int i; | 1260 | int i; |
@@ -1427,9 +1461,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | |||
1427 | char b[BDEVNAME_SIZE]; | 1461 | char b[BDEVNAME_SIZE]; |
1428 | /* Cannot read from anywhere, array is toast */ | 1462 | /* Cannot read from anywhere, array is toast */ |
1429 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | 1463 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); |
1430 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" | 1464 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" |
1431 | " for block %llu\n", | 1465 | " for block %llu\n", |
1432 | bdevname(bio->bi_bdev,b), | 1466 | mdname(mddev), |
1467 | bdevname(bio->bi_bdev, b), | ||
1433 | (unsigned long long)r1_bio->sector); | 1468 | (unsigned long long)r1_bio->sector); |
1434 | md_done_sync(mddev, r1_bio->sectors, 0); | 1469 | md_done_sync(mddev, r1_bio->sectors, 0); |
1435 | put_buf(r1_bio); | 1470 | put_buf(r1_bio); |
@@ -1551,7 +1586,7 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1551 | else { | 1586 | else { |
1552 | atomic_add(s, &rdev->corrected_errors); | 1587 | atomic_add(s, &rdev->corrected_errors); |
1553 | printk(KERN_INFO | 1588 | printk(KERN_INFO |
1554 | "raid1:%s: read error corrected " | 1589 | "md/raid1:%s: read error corrected " |
1555 | "(%d sectors at %llu on %s)\n", | 1590 | "(%d sectors at %llu on %s)\n", |
1556 | mdname(mddev), s, | 1591 | mdname(mddev), s, |
1557 | (unsigned long long)(sect + | 1592 | (unsigned long long)(sect + |
@@ -1650,13 +1685,15 @@ static void raid1d(mddev_t *mddev) | |||
1650 | r1_bio->sector, | 1685 | r1_bio->sector, |
1651 | r1_bio->sectors); | 1686 | r1_bio->sectors); |
1652 | unfreeze_array(conf); | 1687 | unfreeze_array(conf); |
1653 | } | 1688 | } else |
1689 | md_error(mddev, | ||
1690 | conf->mirrors[r1_bio->read_disk].rdev); | ||
1654 | 1691 | ||
1655 | bio = r1_bio->bios[r1_bio->read_disk]; | 1692 | bio = r1_bio->bios[r1_bio->read_disk]; |
1656 | if ((disk=read_balance(conf, r1_bio)) == -1 || | 1693 | if ((disk=read_balance(conf, r1_bio)) == -1) { |
1657 | disk == r1_bio->read_disk) { | 1694 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" |
1658 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" | ||
1659 | " read error for block %llu\n", | 1695 | " read error for block %llu\n", |
1696 | mdname(mddev), | ||
1660 | bdevname(bio->bi_bdev,b), | 1697 | bdevname(bio->bi_bdev,b), |
1661 | (unsigned long long)r1_bio->sector); | 1698 | (unsigned long long)r1_bio->sector); |
1662 | raid_end_bio_io(r1_bio); | 1699 | raid_end_bio_io(r1_bio); |
@@ -1670,10 +1707,11 @@ static void raid1d(mddev_t *mddev) | |||
1670 | r1_bio->bios[r1_bio->read_disk] = bio; | 1707 | r1_bio->bios[r1_bio->read_disk] = bio; |
1671 | rdev = conf->mirrors[disk].rdev; | 1708 | rdev = conf->mirrors[disk].rdev; |
1672 | if (printk_ratelimit()) | 1709 | if (printk_ratelimit()) |
1673 | printk(KERN_ERR "raid1: %s: redirecting sector %llu to" | 1710 | printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" |
1674 | " another mirror\n", | 1711 | " other mirror: %s\n", |
1675 | bdevname(rdev->bdev,b), | 1712 | mdname(mddev), |
1676 | (unsigned long long)r1_bio->sector); | 1713 | (unsigned long long)r1_bio->sector, |
1714 | bdevname(rdev->bdev,b)); | ||
1677 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | 1715 | bio->bi_sector = r1_bio->sector + rdev->data_offset; |
1678 | bio->bi_bdev = rdev->bdev; | 1716 | bio->bi_bdev = rdev->bdev; |
1679 | bio->bi_end_io = raid1_end_read_request; | 1717 | bio->bi_end_io = raid1_end_read_request; |
@@ -1683,6 +1721,7 @@ static void raid1d(mddev_t *mddev) | |||
1683 | generic_make_request(bio); | 1721 | generic_make_request(bio); |
1684 | } | 1722 | } |
1685 | } | 1723 | } |
1724 | cond_resched(); | ||
1686 | } | 1725 | } |
1687 | if (unplug) | 1726 | if (unplug) |
1688 | unplug_slaves(mddev); | 1727 | unplug_slaves(mddev); |
@@ -1727,13 +1766,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1727 | int still_degraded = 0; | 1766 | int still_degraded = 0; |
1728 | 1767 | ||
1729 | if (!conf->r1buf_pool) | 1768 | if (!conf->r1buf_pool) |
1730 | { | ||
1731 | /* | ||
1732 | printk("sync start - bitmap %p\n", mddev->bitmap); | ||
1733 | */ | ||
1734 | if (init_resync(conf)) | 1769 | if (init_resync(conf)) |
1735 | return 0; | 1770 | return 0; |
1736 | } | ||
1737 | 1771 | ||
1738 | max_sector = mddev->dev_sectors; | 1772 | max_sector = mddev->dev_sectors; |
1739 | if (sector_nr >= max_sector) { | 1773 | if (sector_nr >= max_sector) { |
@@ -1939,73 +1973,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
1939 | return mddev->dev_sectors; | 1973 | return mddev->dev_sectors; |
1940 | } | 1974 | } |
1941 | 1975 | ||
1942 | static int run(mddev_t *mddev) | 1976 | static conf_t *setup_conf(mddev_t *mddev) |
1943 | { | 1977 | { |
1944 | conf_t *conf; | 1978 | conf_t *conf; |
1945 | int i, j, disk_idx; | 1979 | int i; |
1946 | mirror_info_t *disk; | 1980 | mirror_info_t *disk; |
1947 | mdk_rdev_t *rdev; | 1981 | mdk_rdev_t *rdev; |
1982 | int err = -ENOMEM; | ||
1948 | 1983 | ||
1949 | if (mddev->level != 1) { | ||
1950 | printk("raid1: %s: raid level not set to mirroring (%d)\n", | ||
1951 | mdname(mddev), mddev->level); | ||
1952 | goto out; | ||
1953 | } | ||
1954 | if (mddev->reshape_position != MaxSector) { | ||
1955 | printk("raid1: %s: reshape_position set but not supported\n", | ||
1956 | mdname(mddev)); | ||
1957 | goto out; | ||
1958 | } | ||
1959 | /* | ||
1960 | * copy the already verified devices into our private RAID1 | ||
1961 | * bookkeeping area. [whatever we allocate in run(), | ||
1962 | * should be freed in stop()] | ||
1963 | */ | ||
1964 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); | 1984 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); |
1965 | mddev->private = conf; | ||
1966 | if (!conf) | 1985 | if (!conf) |
1967 | goto out_no_mem; | 1986 | goto abort; |
1968 | 1987 | ||
1969 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 1988 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, |
1970 | GFP_KERNEL); | 1989 | GFP_KERNEL); |
1971 | if (!conf->mirrors) | 1990 | if (!conf->mirrors) |
1972 | goto out_no_mem; | 1991 | goto abort; |
1973 | 1992 | ||
1974 | conf->tmppage = alloc_page(GFP_KERNEL); | 1993 | conf->tmppage = alloc_page(GFP_KERNEL); |
1975 | if (!conf->tmppage) | 1994 | if (!conf->tmppage) |
1976 | goto out_no_mem; | 1995 | goto abort; |
1977 | 1996 | ||
1978 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | 1997 | conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); |
1979 | if (!conf->poolinfo) | 1998 | if (!conf->poolinfo) |
1980 | goto out_no_mem; | 1999 | goto abort; |
1981 | conf->poolinfo->mddev = mddev; | ||
1982 | conf->poolinfo->raid_disks = mddev->raid_disks; | 2000 | conf->poolinfo->raid_disks = mddev->raid_disks; |
1983 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | 2001 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, |
1984 | r1bio_pool_free, | 2002 | r1bio_pool_free, |
1985 | conf->poolinfo); | 2003 | conf->poolinfo); |
1986 | if (!conf->r1bio_pool) | 2004 | if (!conf->r1bio_pool) |
1987 | goto out_no_mem; | 2005 | goto abort; |
1988 | 2006 | ||
1989 | spin_lock_init(&conf->device_lock); | 2007 | conf->poolinfo->mddev = mddev; |
1990 | mddev->queue->queue_lock = &conf->device_lock; | ||
1991 | 2008 | ||
2009 | spin_lock_init(&conf->device_lock); | ||
1992 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2010 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1993 | disk_idx = rdev->raid_disk; | 2011 | int disk_idx = rdev->raid_disk; |
1994 | if (disk_idx >= mddev->raid_disks | 2012 | if (disk_idx >= mddev->raid_disks |
1995 | || disk_idx < 0) | 2013 | || disk_idx < 0) |
1996 | continue; | 2014 | continue; |
1997 | disk = conf->mirrors + disk_idx; | 2015 | disk = conf->mirrors + disk_idx; |
1998 | 2016 | ||
1999 | disk->rdev = rdev; | 2017 | disk->rdev = rdev; |
2000 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
2001 | rdev->data_offset << 9); | ||
2002 | /* as we don't honour merge_bvec_fn, we must never risk | ||
2003 | * violating it, so limit ->max_sector to one PAGE, as | ||
2004 | * a one page request is never in violation. | ||
2005 | */ | ||
2006 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
2007 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | ||
2008 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
2009 | 2018 | ||
2010 | disk->head_position = 0; | 2019 | disk->head_position = 0; |
2011 | } | 2020 | } |
@@ -2019,8 +2028,7 @@ static int run(mddev_t *mddev) | |||
2019 | bio_list_init(&conf->pending_bio_list); | 2028 | bio_list_init(&conf->pending_bio_list); |
2020 | bio_list_init(&conf->flushing_bio_list); | 2029 | bio_list_init(&conf->flushing_bio_list); |
2021 | 2030 | ||
2022 | 2031 | conf->last_used = -1; | |
2023 | mddev->degraded = 0; | ||
2024 | for (i = 0; i < conf->raid_disks; i++) { | 2032 | for (i = 0; i < conf->raid_disks; i++) { |
2025 | 2033 | ||
2026 | disk = conf->mirrors + i; | 2034 | disk = conf->mirrors + i; |
@@ -2028,49 +2036,115 @@ static int run(mddev_t *mddev) | |||
2028 | if (!disk->rdev || | 2036 | if (!disk->rdev || |
2029 | !test_bit(In_sync, &disk->rdev->flags)) { | 2037 | !test_bit(In_sync, &disk->rdev->flags)) { |
2030 | disk->head_position = 0; | 2038 | disk->head_position = 0; |
2031 | mddev->degraded++; | ||
2032 | if (disk->rdev) | 2039 | if (disk->rdev) |
2033 | conf->fullsync = 1; | 2040 | conf->fullsync = 1; |
2034 | } | 2041 | } else if (conf->last_used < 0) |
2042 | /* | ||
2043 | * The first working device is used as a | ||
2044 | * starting point to read balancing. | ||
2045 | */ | ||
2046 | conf->last_used = i; | ||
2035 | } | 2047 | } |
2036 | if (mddev->degraded == conf->raid_disks) { | 2048 | |
2037 | printk(KERN_ERR "raid1: no operational mirrors for %s\n", | 2049 | err = -EIO; |
2038 | mdname(mddev)); | 2050 | if (conf->last_used < 0) { |
2039 | goto out_free_conf; | 2051 | printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", |
2052 | mdname(mddev)); | ||
2053 | goto abort; | ||
2054 | } | ||
2055 | err = -ENOMEM; | ||
2056 | conf->thread = md_register_thread(raid1d, mddev, NULL); | ||
2057 | if (!conf->thread) { | ||
2058 | printk(KERN_ERR | ||
2059 | "md/raid1:%s: couldn't allocate thread\n", | ||
2060 | mdname(mddev)); | ||
2061 | goto abort; | ||
2040 | } | 2062 | } |
2041 | if (conf->raid_disks - mddev->degraded == 1) | ||
2042 | mddev->recovery_cp = MaxSector; | ||
2043 | 2063 | ||
2064 | return conf; | ||
2065 | |||
2066 | abort: | ||
2067 | if (conf) { | ||
2068 | if (conf->r1bio_pool) | ||
2069 | mempool_destroy(conf->r1bio_pool); | ||
2070 | kfree(conf->mirrors); | ||
2071 | safe_put_page(conf->tmppage); | ||
2072 | kfree(conf->poolinfo); | ||
2073 | kfree(conf); | ||
2074 | } | ||
2075 | return ERR_PTR(err); | ||
2076 | } | ||
2077 | |||
2078 | static int run(mddev_t *mddev) | ||
2079 | { | ||
2080 | conf_t *conf; | ||
2081 | int i; | ||
2082 | mdk_rdev_t *rdev; | ||
2083 | |||
2084 | if (mddev->level != 1) { | ||
2085 | printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", | ||
2086 | mdname(mddev), mddev->level); | ||
2087 | return -EIO; | ||
2088 | } | ||
2089 | if (mddev->reshape_position != MaxSector) { | ||
2090 | printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", | ||
2091 | mdname(mddev)); | ||
2092 | return -EIO; | ||
2093 | } | ||
2044 | /* | 2094 | /* |
2045 | * find the first working one and use it as a starting point | 2095 | * copy the already verified devices into our private RAID1 |
2046 | * to read balancing. | 2096 | * bookkeeping area. [whatever we allocate in run(), |
2097 | * should be freed in stop()] | ||
2047 | */ | 2098 | */ |
2048 | for (j = 0; j < conf->raid_disks && | 2099 | if (mddev->private == NULL) |
2049 | (!conf->mirrors[j].rdev || | 2100 | conf = setup_conf(mddev); |
2050 | !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) | 2101 | else |
2051 | /* nothing */; | 2102 | conf = mddev->private; |
2052 | conf->last_used = j; | ||
2053 | 2103 | ||
2104 | if (IS_ERR(conf)) | ||
2105 | return PTR_ERR(conf); | ||
2054 | 2106 | ||
2055 | mddev->thread = md_register_thread(raid1d, mddev, NULL); | 2107 | mddev->queue->queue_lock = &conf->device_lock; |
2056 | if (!mddev->thread) { | 2108 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2057 | printk(KERN_ERR | 2109 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2058 | "raid1: couldn't allocate thread for %s\n", | 2110 | rdev->data_offset << 9); |
2059 | mdname(mddev)); | 2111 | /* as we don't honour merge_bvec_fn, we must never risk |
2060 | goto out_free_conf; | 2112 | * violating it, so limit ->max_segments to 1 lying within |
2113 | * a single page, as a one page request is never in violation. | ||
2114 | */ | ||
2115 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
2116 | blk_queue_max_segments(mddev->queue, 1); | ||
2117 | blk_queue_segment_boundary(mddev->queue, | ||
2118 | PAGE_CACHE_SIZE - 1); | ||
2119 | } | ||
2061 | } | 2120 | } |
2062 | 2121 | ||
2122 | mddev->degraded = 0; | ||
2123 | for (i=0; i < conf->raid_disks; i++) | ||
2124 | if (conf->mirrors[i].rdev == NULL || | ||
2125 | !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || | ||
2126 | test_bit(Faulty, &conf->mirrors[i].rdev->flags)) | ||
2127 | mddev->degraded++; | ||
2128 | |||
2129 | if (conf->raid_disks - mddev->degraded == 1) | ||
2130 | mddev->recovery_cp = MaxSector; | ||
2131 | |||
2063 | if (mddev->recovery_cp != MaxSector) | 2132 | if (mddev->recovery_cp != MaxSector) |
2064 | printk(KERN_NOTICE "raid1: %s is not clean" | 2133 | printk(KERN_NOTICE "md/raid1:%s: not clean" |
2065 | " -- starting background reconstruction\n", | 2134 | " -- starting background reconstruction\n", |
2066 | mdname(mddev)); | 2135 | mdname(mddev)); |
2067 | printk(KERN_INFO | 2136 | printk(KERN_INFO |
2068 | "raid1: raid set %s active with %d out of %d mirrors\n", | 2137 | "md/raid1:%s: active with %d out of %d mirrors\n", |
2069 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2138 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
2070 | mddev->raid_disks); | 2139 | mddev->raid_disks); |
2140 | |||
2071 | /* | 2141 | /* |
2072 | * Ok, everything is just fine now | 2142 | * Ok, everything is just fine now |
2073 | */ | 2143 | */ |
2144 | mddev->thread = conf->thread; | ||
2145 | conf->thread = NULL; | ||
2146 | mddev->private = conf; | ||
2147 | |||
2074 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 2148 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2075 | 2149 | ||
2076 | mddev->queue->unplug_fn = raid1_unplug; | 2150 | mddev->queue->unplug_fn = raid1_unplug; |
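Editor's note: the hunk above splits raid1's old monolithic run() into a setup_conf() that builds the per-array state (and stashes the md thread in conf->thread) and a run() that activates it, with failures reported as an ERR_PTR-encoded errno. A hedged user-space sketch of that pattern; the ERR_PTR/IS_ERR/PTR_ERR helpers are simplified stand-ins for the kernel macros and struct conf is a toy placeholder, not the real conf_t:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)(intptr_t)error; }
static inline long PTR_ERR(const void *ptr) { return (long)(intptr_t)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct conf {
	int raid_disks;
	int last_used;	/* first in-sync disk, used to seed read balancing */
};

static struct conf *setup_conf(int in_sync_disks, int raid_disks)
{
	struct conf *conf = calloc(1, sizeof(*conf));

	if (!conf)
		return ERR_PTR(-ENOMEM);
	conf->raid_disks = raid_disks;
	conf->last_used = in_sync_disks ? 0 : -1;
	if (conf->last_used < 0) {	/* "no operational mirrors" case */
		free(conf);
		return ERR_PTR(-EIO);
	}
	return conf;
}

static int run(int in_sync_disks, int raid_disks)
{
	struct conf *conf = setup_conf(in_sync_disks, raid_disks);

	if (IS_ERR(conf))
		return (int)PTR_ERR(conf);
	printf("active with %d out of %d mirrors\n", in_sync_disks, raid_disks);
	free(conf);
	return 0;
}

int main(void)
{
	printf("run(2,2) -> %d\n", run(2, 2));
	printf("run(0,2) -> %d\n", run(0, 2));	/* expect -EIO (-5) */
	return 0;
}

The point of the split is that setup_conf() can also be called from a takeover path, before the new personality's run() is invoked, which is why the thread is parked in conf->thread until run() moves it to mddev->thread.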
@@ -2078,38 +2152,20 @@ static int run(mddev_t *mddev) | |||
2078 | mddev->queue->backing_dev_info.congested_data = mddev; | 2152 | mddev->queue->backing_dev_info.congested_data = mddev; |
2079 | md_integrity_register(mddev); | 2153 | md_integrity_register(mddev); |
2080 | return 0; | 2154 | return 0; |
2081 | |||
2082 | out_no_mem: | ||
2083 | printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", | ||
2084 | mdname(mddev)); | ||
2085 | |||
2086 | out_free_conf: | ||
2087 | if (conf) { | ||
2088 | if (conf->r1bio_pool) | ||
2089 | mempool_destroy(conf->r1bio_pool); | ||
2090 | kfree(conf->mirrors); | ||
2091 | safe_put_page(conf->tmppage); | ||
2092 | kfree(conf->poolinfo); | ||
2093 | kfree(conf); | ||
2094 | mddev->private = NULL; | ||
2095 | } | ||
2096 | out: | ||
2097 | return -EIO; | ||
2098 | } | 2155 | } |
2099 | 2156 | ||
2100 | static int stop(mddev_t *mddev) | 2157 | static int stop(mddev_t *mddev) |
2101 | { | 2158 | { |
2102 | conf_t *conf = mddev->private; | 2159 | conf_t *conf = mddev->private; |
2103 | struct bitmap *bitmap = mddev->bitmap; | 2160 | struct bitmap *bitmap = mddev->bitmap; |
2104 | int behind_wait = 0; | ||
2105 | 2161 | ||
2106 | /* wait for behind writes to complete */ | 2162 | /* wait for behind writes to complete */ |
2107 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | 2163 | if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { |
2108 | behind_wait++; | 2164 | printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", |
2109 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | 2165 | mdname(mddev)); |
2110 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2111 | schedule_timeout(HZ); /* wait a second */ | ||
2112 | /* need to kick something here to make sure I/O goes? */ | 2166 | /* need to kick something here to make sure I/O goes? */ |
2167 | wait_event(bitmap->behind_wait, | ||
2168 | atomic_read(&bitmap->behind_writes) == 0); | ||
2113 | } | 2169 | } |
2114 | 2170 | ||
2115 | raise_barrier(conf); | 2171 | raise_barrier(conf); |
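Editor's note: the stop() change above replaces a once-per-second polling loop (schedule_timeout(HZ)) with a proper sleep on bitmap->behind_wait until behind_writes reaches zero. A user-space model of that switch; a pthread condition variable stands in for wait_event()/wake_up(), and the counts and thread are purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t behind_wait = PTHREAD_COND_INITIALIZER;
static int behind_writes = 3;

static void *behind_write_done(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (behind_writes > 0) {
		behind_writes--;			/* a behind write completes */
		pthread_cond_broadcast(&behind_wait);	/* wake_up(&bitmap->behind_wait) */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, behind_write_done, NULL);

	/* models wait_event(behind_wait, atomic_read(&behind_writes) == 0) */
	pthread_mutex_lock(&lock);
	while (behind_writes > 0)
		pthread_cond_wait(&behind_wait, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("behind writes drained, array can stop\n");
	return 0;
}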
@@ -2140,7 +2196,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2140 | if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) | 2196 | if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) |
2141 | return -EINVAL; | 2197 | return -EINVAL; |
2142 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2198 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2143 | mddev->changed = 1; | ||
2144 | revalidate_disk(mddev->gendisk); | 2199 | revalidate_disk(mddev->gendisk); |
2145 | if (sectors > mddev->dev_sectors && | 2200 | if (sectors > mddev->dev_sectors && |
2146 | mddev->recovery_cp == MaxSector) { | 2201 | mddev->recovery_cp == MaxSector) { |
@@ -2235,9 +2290,9 @@ static int raid1_reshape(mddev_t *mddev) | |||
2235 | if (sysfs_create_link(&mddev->kobj, | 2290 | if (sysfs_create_link(&mddev->kobj, |
2236 | &rdev->kobj, nm)) | 2291 | &rdev->kobj, nm)) |
2237 | printk(KERN_WARNING | 2292 | printk(KERN_WARNING |
2238 | "md/raid1: cannot register " | 2293 | "md/raid1:%s: cannot register " |
2239 | "%s for %s\n", | 2294 | "%s\n", |
2240 | nm, mdname(mddev)); | 2295 | mdname(mddev), nm); |
2241 | } | 2296 | } |
2242 | if (rdev) | 2297 | if (rdev) |
2243 | newmirrors[d2++].rdev = rdev; | 2298 | newmirrors[d2++].rdev = rdev; |
@@ -2268,6 +2323,9 @@ static void raid1_quiesce(mddev_t *mddev, int state) | |||
2268 | conf_t *conf = mddev->private; | 2323 | conf_t *conf = mddev->private; |
2269 | 2324 | ||
2270 | switch(state) { | 2325 | switch(state) { |
2326 | case 2: /* wake for suspend */ | ||
2327 | wake_up(&conf->wait_barrier); | ||
2328 | break; | ||
2271 | case 1: | 2329 | case 1: |
2272 | raise_barrier(conf); | 2330 | raise_barrier(conf); |
2273 | break; | 2331 | break; |
@@ -2277,6 +2335,23 @@ static void raid1_quiesce(mddev_t *mddev, int state) | |||
2277 | } | 2335 | } |
2278 | } | 2336 | } |
2279 | 2337 | ||
2338 | static void *raid1_takeover(mddev_t *mddev) | ||
2339 | { | ||
2340 | /* raid1 can take over: | ||
2341 | * raid5 with 2 devices, any layout or chunk size | ||
2342 | */ | ||
2343 | if (mddev->level == 5 && mddev->raid_disks == 2) { | ||
2344 | conf_t *conf; | ||
2345 | mddev->new_level = 1; | ||
2346 | mddev->new_layout = 0; | ||
2347 | mddev->new_chunk_sectors = 0; | ||
2348 | conf = setup_conf(mddev); | ||
2349 | if (!IS_ERR(conf)) | ||
2350 | conf->barrier = 1; | ||
2351 | return conf; | ||
2352 | } | ||
2353 | return ERR_PTR(-EINVAL); | ||
2354 | } | ||
2280 | 2355 | ||
2281 | static struct mdk_personality raid1_personality = | 2356 | static struct mdk_personality raid1_personality = |
2282 | { | 2357 | { |
@@ -2296,6 +2371,7 @@ static struct mdk_personality raid1_personality = | |||
2296 | .size = raid1_size, | 2371 | .size = raid1_size, |
2297 | .check_reshape = raid1_reshape, | 2372 | .check_reshape = raid1_reshape, |
2298 | .quiesce = raid1_quiesce, | 2373 | .quiesce = raid1_quiesce, |
2374 | .takeover = raid1_takeover, | ||
2299 | }; | 2375 | }; |
2300 | 2376 | ||
2301 | static int __init raid_init(void) | 2377 | static int __init raid_init(void) |
@@ -2311,6 +2387,7 @@ static void raid_exit(void) | |||
2311 | module_init(raid_init); | 2387 | module_init(raid_init); |
2312 | module_exit(raid_exit); | 2388 | module_exit(raid_exit); |
2313 | MODULE_LICENSE("GPL"); | 2389 | MODULE_LICENSE("GPL"); |
2390 | MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); | ||
2314 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ | 2391 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ |
2315 | MODULE_ALIAS("md-raid1"); | 2392 | MODULE_ALIAS("md-raid1"); |
2316 | MODULE_ALIAS("md-level-1"); | 2393 | MODULE_ALIAS("md-level-1"); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e87b84deff68..5f2d443ae28a 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -59,6 +59,11 @@ struct r1_private_data_s { | |||
59 | 59 | ||
60 | mempool_t *r1bio_pool; | 60 | mempool_t *r1bio_pool; |
61 | mempool_t *r1buf_pool; | 61 | mempool_t *r1buf_pool; |
62 | |||
63 | /* When taking over an array from a different personality, we store | ||
64 | * the new thread here until we fully activate the array. | ||
65 | */ | ||
66 | struct mdk_thread_s *thread; | ||
62 | }; | 67 | }; |
63 | 68 | ||
64 | typedef struct r1_private_data_s conf_t; | 69 | typedef struct r1_private_data_s conf_t; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 51c4c5c4d87a..42e64e4e5e25 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -18,11 +18,13 @@ | |||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/slab.h> | ||
21 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
22 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
23 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
24 | #include "md.h" | 25 | #include "md.h" |
25 | #include "raid10.h" | 26 | #include "raid10.h" |
27 | #include "raid0.h" | ||
26 | #include "bitmap.h" | 28 | #include "bitmap.h" |
27 | 29 | ||
28 | /* | 30 | /* |
@@ -68,7 +70,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | |||
68 | 70 | ||
69 | /* allocate a r10bio with room for raid_disks entries in the bios array */ | 71 | /* allocate a r10bio with room for raid_disks entries in the bios array */ |
70 | r10_bio = kzalloc(size, gfp_flags); | 72 | r10_bio = kzalloc(size, gfp_flags); |
71 | if (!r10_bio) | 73 | if (!r10_bio && conf->mddev) |
72 | unplug_slaves(conf->mddev); | 74 | unplug_slaves(conf->mddev); |
73 | 75 | ||
74 | return r10_bio; | 76 | return r10_bio; |
@@ -254,7 +256,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) | |||
254 | static void raid10_end_read_request(struct bio *bio, int error) | 256 | static void raid10_end_read_request(struct bio *bio, int error) |
255 | { | 257 | { |
256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 258 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 259 | r10bio_t *r10_bio = bio->bi_private; |
258 | int slot, dev; | 260 | int slot, dev; |
259 | conf_t *conf = r10_bio->mddev->private; | 261 | conf_t *conf = r10_bio->mddev->private; |
260 | 262 | ||
@@ -284,7 +286,8 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
284 | */ | 286 | */ |
285 | char b[BDEVNAME_SIZE]; | 287 | char b[BDEVNAME_SIZE]; |
286 | if (printk_ratelimit()) | 288 | if (printk_ratelimit()) |
287 | printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", | 289 | printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", |
290 | mdname(conf->mddev), | ||
288 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 291 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); |
289 | reschedule_retry(r10_bio); | 292 | reschedule_retry(r10_bio); |
290 | } | 293 | } |
@@ -295,7 +298,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
295 | static void raid10_end_write_request(struct bio *bio, int error) | 298 | static void raid10_end_write_request(struct bio *bio, int error) |
296 | { | 299 | { |
297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 300 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 301 | r10bio_t *r10_bio = bio->bi_private; |
299 | int slot, dev; | 302 | int slot, dev; |
300 | conf_t *conf = r10_bio->mddev->private; | 303 | conf_t *conf = r10_bio->mddev->private; |
301 | 304 | ||
@@ -493,7 +496,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
493 | */ | 496 | */ |
494 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 497 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) |
495 | { | 498 | { |
496 | const unsigned long this_sector = r10_bio->sector; | 499 | const sector_t this_sector = r10_bio->sector; |
497 | int disk, slot, nslot; | 500 | int disk, slot, nslot; |
498 | const int sectors = r10_bio->sectors; | 501 | const int sectors = r10_bio->sectors; |
499 | sector_t new_distance, current_distance; | 502 | sector_t new_distance, current_distance; |
@@ -600,7 +603,7 @@ static void unplug_slaves(mddev_t *mddev) | |||
600 | int i; | 603 | int i; |
601 | 604 | ||
602 | rcu_read_lock(); | 605 | rcu_read_lock(); |
603 | for (i=0; i<mddev->raid_disks; i++) { | 606 | for (i=0; i < conf->raid_disks; i++) { |
604 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 607 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
605 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | 608 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { |
606 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | 609 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); |
@@ -634,7 +637,7 @@ static int raid10_congested(void *data, int bits) | |||
634 | if (mddev_congested(mddev, bits)) | 637 | if (mddev_congested(mddev, bits)) |
635 | return 1; | 638 | return 1; |
636 | rcu_read_lock(); | 639 | rcu_read_lock(); |
637 | for (i = 0; i < mddev->raid_disks && ret == 0; i++) { | 640 | for (i = 0; i < conf->raid_disks && ret == 0; i++) { |
638 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 641 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
639 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 642 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
640 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 643 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -787,14 +790,12 @@ static void unfreeze_array(conf_t *conf) | |||
787 | spin_unlock_irq(&conf->resync_lock); | 790 | spin_unlock_irq(&conf->resync_lock); |
788 | } | 791 | } |
789 | 792 | ||
790 | static int make_request(struct request_queue *q, struct bio * bio) | 793 | static int make_request(mddev_t *mddev, struct bio * bio) |
791 | { | 794 | { |
792 | mddev_t *mddev = q->queuedata; | ||
793 | conf_t *conf = mddev->private; | 795 | conf_t *conf = mddev->private; |
794 | mirror_info_t *mirror; | 796 | mirror_info_t *mirror; |
795 | r10bio_t *r10_bio; | 797 | r10bio_t *r10_bio; |
796 | struct bio *read_bio; | 798 | struct bio *read_bio; |
797 | int cpu; | ||
798 | int i; | 799 | int i; |
799 | int chunk_sects = conf->chunk_mask + 1; | 800 | int chunk_sects = conf->chunk_mask + 1; |
800 | const int rw = bio_data_dir(bio); | 801 | const int rw = bio_data_dir(bio); |
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
804 | mdk_rdev_t *blocked_rdev; | 805 | mdk_rdev_t *blocked_rdev; |
805 | 806 | ||
806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 807 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
807 | bio_endio(bio, -EOPNOTSUPP); | 808 | md_barrier_request(mddev, bio); |
808 | return 0; | 809 | return 0; |
809 | } | 810 | } |
810 | 811 | ||
@@ -824,16 +825,16 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
824 | */ | 825 | */ |
825 | bp = bio_split(bio, | 826 | bp = bio_split(bio, |
826 | chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | 827 | chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); |
827 | if (make_request(q, &bp->bio1)) | 828 | if (make_request(mddev, &bp->bio1)) |
828 | generic_make_request(&bp->bio1); | 829 | generic_make_request(&bp->bio1); |
829 | if (make_request(q, &bp->bio2)) | 830 | if (make_request(mddev, &bp->bio2)) |
830 | generic_make_request(&bp->bio2); | 831 | generic_make_request(&bp->bio2); |
831 | 832 | ||
832 | bio_pair_release(bp); | 833 | bio_pair_release(bp); |
833 | return 0; | 834 | return 0; |
834 | bad_map: | 835 | bad_map: |
835 | printk("raid10_make_request bug: can't convert block across chunks" | 836 | printk("md/raid10:%s: make_request bug: can't convert block across chunks" |
836 | " or bigger than %dk %llu %d\n", chunk_sects/2, | 837 | " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, |
837 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | 838 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); |
838 | 839 | ||
839 | bio_io_error(bio); | 840 | bio_io_error(bio); |
@@ -849,12 +850,6 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
849 | */ | 850 | */ |
850 | wait_barrier(conf); | 851 | wait_barrier(conf); |
851 | 852 | ||
852 | cpu = part_stat_lock(); | ||
853 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
854 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
855 | bio_sectors(bio)); | ||
856 | part_stat_unlock(); | ||
857 | |||
858 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | 853 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); |
859 | 854 | ||
860 | r10_bio->master_bio = bio; | 855 | r10_bio->master_bio = bio; |
@@ -1038,9 +1033,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1038 | } | 1033 | } |
1039 | set_bit(Faulty, &rdev->flags); | 1034 | set_bit(Faulty, &rdev->flags); |
1040 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1035 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1041 | printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" | 1036 | printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" |
1042 | "raid10: Operation continuing on %d devices.\n", | 1037 | KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", |
1043 | bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); | 1038 | mdname(mddev), bdevname(rdev->bdev, b), |
1039 | mdname(mddev), conf->raid_disks - mddev->degraded); | ||
1044 | } | 1040 | } |
1045 | 1041 | ||
1046 | static void print_conf(conf_t *conf) | 1042 | static void print_conf(conf_t *conf) |
@@ -1048,19 +1044,19 @@ static void print_conf(conf_t *conf) | |||
1048 | int i; | 1044 | int i; |
1049 | mirror_info_t *tmp; | 1045 | mirror_info_t *tmp; |
1050 | 1046 | ||
1051 | printk("RAID10 conf printout:\n"); | 1047 | printk(KERN_DEBUG "RAID10 conf printout:\n"); |
1052 | if (!conf) { | 1048 | if (!conf) { |
1053 | printk("(!conf)\n"); | 1049 | printk(KERN_DEBUG "(!conf)\n"); |
1054 | return; | 1050 | return; |
1055 | } | 1051 | } |
1056 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1052 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
1057 | conf->raid_disks); | 1053 | conf->raid_disks); |
1058 | 1054 | ||
1059 | for (i = 0; i < conf->raid_disks; i++) { | 1055 | for (i = 0; i < conf->raid_disks; i++) { |
1060 | char b[BDEVNAME_SIZE]; | 1056 | char b[BDEVNAME_SIZE]; |
1061 | tmp = conf->mirrors + i; | 1057 | tmp = conf->mirrors + i; |
1062 | if (tmp->rdev) | 1058 | if (tmp->rdev) |
1063 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | 1059 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", |
1064 | i, !test_bit(In_sync, &tmp->rdev->flags), | 1060 | i, !test_bit(In_sync, &tmp->rdev->flags), |
1065 | !test_bit(Faulty, &tmp->rdev->flags), | 1061 | !test_bit(Faulty, &tmp->rdev->flags), |
1066 | bdevname(tmp->rdev->bdev,b)); | 1062 | bdevname(tmp->rdev->bdev,b)); |
@@ -1131,7 +1127,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1131 | int mirror; | 1127 | int mirror; |
1132 | mirror_info_t *p; | 1128 | mirror_info_t *p; |
1133 | int first = 0; | 1129 | int first = 0; |
1134 | int last = mddev->raid_disks - 1; | 1130 | int last = conf->raid_disks - 1; |
1135 | 1131 | ||
1136 | if (mddev->recovery_cp < MaxSector) | 1132 | if (mddev->recovery_cp < MaxSector) |
1137 | /* only hot-add to in-sync arrays, as recovery is | 1133 | /* only hot-add to in-sync arrays, as recovery is |
@@ -1155,13 +1151,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1155 | 1151 | ||
1156 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1152 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1157 | rdev->data_offset << 9); | 1153 | rdev->data_offset << 9); |
1158 | /* as we don't honour merge_bvec_fn, we must never risk | 1154 | /* as we don't honour merge_bvec_fn, we must |
1159 | * violating it, so limit ->max_sector to one PAGE, as | 1155 | * never risk violating it, so limit |
1160 | * a one page request is never in violation. | 1156 | * ->max_segments to one lying within a single |

1157 | * page, as a one page request is never in | ||
1158 | * violation. | ||
1161 | */ | 1159 | */ |
1162 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
1163 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 1161 | blk_queue_max_segments(mddev->queue, 1); |
1164 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1162 | blk_queue_segment_boundary(mddev->queue, |
1163 | PAGE_CACHE_SIZE - 1); | ||
1164 | } | ||
1165 | 1165 | ||
1166 | p->head_position = 0; | 1166 | p->head_position = 0; |
1167 | rdev->raid_disk = mirror; | 1167 | rdev->raid_disk = mirror; |
@@ -1219,7 +1219,7 @@ abort: | |||
1219 | 1219 | ||
1220 | static void end_sync_read(struct bio *bio, int error) | 1220 | static void end_sync_read(struct bio *bio, int error) |
1221 | { | 1221 | { |
1222 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1222 | r10bio_t *r10_bio = bio->bi_private; |
1223 | conf_t *conf = r10_bio->mddev->private; | 1223 | conf_t *conf = r10_bio->mddev->private; |
1224 | int i,d; | 1224 | int i,d; |
1225 | 1225 | ||
@@ -1256,7 +1256,7 @@ static void end_sync_read(struct bio *bio, int error) | |||
1256 | static void end_sync_write(struct bio *bio, int error) | 1256 | static void end_sync_write(struct bio *bio, int error) |
1257 | { | 1257 | { |
1258 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1258 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
1259 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1259 | r10bio_t *r10_bio = bio->bi_private; |
1260 | mddev_t *mddev = r10_bio->mddev; | 1260 | mddev_t *mddev = r10_bio->mddev; |
1261 | conf_t *conf = mddev->private; | 1261 | conf_t *conf = mddev->private; |
1262 | int i,d; | 1262 | int i,d; |
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1432 | 1432 | ||
1433 | 1433 | ||
1434 | /* | 1434 | /* |
1435 | * Used by fix_read_error() to decay the per rdev read_errors. | ||
1436 | * We halve the read error count for every hour that has elapsed | ||
1437 | * since the last recorded read error. | ||
1438 | * | ||
1439 | */ | ||
1440 | static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1441 | { | ||
1442 | struct timespec cur_time_mon; | ||
1443 | unsigned long hours_since_last; | ||
1444 | unsigned int read_errors = atomic_read(&rdev->read_errors); | ||
1445 | |||
1446 | ktime_get_ts(&cur_time_mon); | ||
1447 | |||
1448 | if (rdev->last_read_error.tv_sec == 0 && | ||
1449 | rdev->last_read_error.tv_nsec == 0) { | ||
1450 | /* first time we've seen a read error */ | ||
1451 | rdev->last_read_error = cur_time_mon; | ||
1452 | return; | ||
1453 | } | ||
1454 | |||
1455 | hours_since_last = (cur_time_mon.tv_sec - | ||
1456 | rdev->last_read_error.tv_sec) / 3600; | ||
1457 | |||
1458 | rdev->last_read_error = cur_time_mon; | ||
1459 | |||
1460 | /* | ||
1461 | * if hours_since_last is > the number of bits in read_errors | ||
1462 | * just set read errors to 0. We do this to avoid | ||
1463 | * overflowing the shift of read_errors by hours_since_last. | ||
1464 | */ | ||
1465 | if (hours_since_last >= 8 * sizeof(read_errors)) | ||
1466 | atomic_set(&rdev->read_errors, 0); | ||
1467 | else | ||
1468 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | ||
1469 | } | ||
1470 | |||
1471 | /* | ||
1435 | * This is a kernel thread which: | 1472 | * This is a kernel thread which: |
1436 | * | 1473 | * |
1437 | * 1. Retries failed read operations on working mirrors. | 1474 | * 1. Retries failed read operations on working mirrors. |
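Editor's note: check_decay_read_errors(), added in the hunk above, halves the per-rdev read error count for every full hour since the last recorded error and clears it outright if the shift would exceed the counter's width; fix_read_error() then bumps the count and fails the device once it exceeds mddev->max_corr_read_errors. A stand-alone model of the decay step; plain time_t arithmetic replaces ktime_get_ts() and the struct is illustrative:

#include <stdio.h>
#include <time.h>

struct dev_errors {
	unsigned int read_errors;
	time_t last_read_error;		/* 0 means "no error seen yet" */
};

static void check_decay_read_errors(struct dev_errors *d, time_t now)
{
	unsigned long hours_since_last;

	if (d->last_read_error == 0) {
		d->last_read_error = now;	/* first error: just record time */
		return;
	}

	hours_since_last = (unsigned long)(now - d->last_read_error) / 3600;
	d->last_read_error = now;

	if (hours_since_last >= 8 * sizeof(d->read_errors))
		d->read_errors = 0;		/* avoid over-shifting */
	else
		d->read_errors >>= hours_since_last;
}

int main(void)
{
	struct dev_errors d = { .read_errors = 40, .last_read_error = 1000 };

	check_decay_read_errors(&d, 1000 + 3 * 3600);	/* 3 hours later */
	printf("after 3h: %u\n", d.read_errors);	/* 40 >> 3 == 5 */
	return 0;
}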
@@ -1444,6 +1481,44 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1444 | int sect = 0; /* Offset from r10_bio->sector */ | 1481 | int sect = 0; /* Offset from r10_bio->sector */ |
1445 | int sectors = r10_bio->sectors; | 1482 | int sectors = r10_bio->sectors; |
1446 | mdk_rdev_t*rdev; | 1483 | mdk_rdev_t*rdev; |
1484 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | ||
1485 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | ||
1486 | |||
1487 | rcu_read_lock(); | ||
1488 | rdev = rcu_dereference(conf->mirrors[d].rdev); | ||
1489 | if (rdev) { /* If rdev is not NULL */ | ||
1490 | char b[BDEVNAME_SIZE]; | ||
1491 | int cur_read_error_count = 0; | ||
1492 | |||
1493 | bdevname(rdev->bdev, b); | ||
1494 | |||
1495 | if (test_bit(Faulty, &rdev->flags)) { | ||
1496 | rcu_read_unlock(); | ||
1497 | /* drive has already been failed, just ignore any | ||
1498 | more fix_read_error() attempts */ | ||
1499 | return; | ||
1500 | } | ||
1501 | |||
1502 | check_decay_read_errors(mddev, rdev); | ||
1503 | atomic_inc(&rdev->read_errors); | ||
1504 | cur_read_error_count = atomic_read(&rdev->read_errors); | ||
1505 | if (cur_read_error_count > max_read_errors) { | ||
1506 | rcu_read_unlock(); | ||
1507 | printk(KERN_NOTICE | ||
1508 | "md/raid10:%s: %s: Raid device exceeded " | ||
1509 | "read_error threshold " | ||
1510 | "[cur %d:max %d]\n", | ||
1511 | mdname(mddev), | ||
1512 | b, cur_read_error_count, max_read_errors); | ||
1513 | printk(KERN_NOTICE | ||
1514 | "md/raid10:%s: %s: Failing raid " | ||
1515 | "device\n", mdname(mddev), b); | ||
1516 | md_error(mddev, conf->mirrors[d].rdev); | ||
1517 | return; | ||
1518 | } | ||
1519 | } | ||
1520 | rcu_read_unlock(); | ||
1521 | |||
1447 | while(sectors) { | 1522 | while(sectors) { |
1448 | int s = sectors; | 1523 | int s = sectors; |
1449 | int sl = r10_bio->read_slot; | 1524 | int sl = r10_bio->read_slot; |
@@ -1455,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1455 | 1530 | ||
1456 | rcu_read_lock(); | 1531 | rcu_read_lock(); |
1457 | do { | 1532 | do { |
1458 | int d = r10_bio->devs[sl].devnum; | 1533 | d = r10_bio->devs[sl].devnum; |
1459 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1534 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1460 | if (rdev && | 1535 | if (rdev && |
1461 | test_bit(In_sync, &rdev->flags)) { | 1536 | test_bit(In_sync, &rdev->flags)) { |
@@ -1488,7 +1563,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1488 | /* write it back and re-read */ | 1563 | /* write it back and re-read */ |
1489 | rcu_read_lock(); | 1564 | rcu_read_lock(); |
1490 | while (sl != r10_bio->read_slot) { | 1565 | while (sl != r10_bio->read_slot) { |
1491 | int d; | 1566 | char b[BDEVNAME_SIZE]; |
1567 | |||
1492 | if (sl==0) | 1568 | if (sl==0) |
1493 | sl = conf->copies; | 1569 | sl = conf->copies; |
1494 | sl--; | 1570 | sl--; |
@@ -1503,16 +1579,29 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1503 | r10_bio->devs[sl].addr + | 1579 | r10_bio->devs[sl].addr + |
1504 | sect + rdev->data_offset, | 1580 | sect + rdev->data_offset, |
1505 | s<<9, conf->tmppage, WRITE) | 1581 | s<<9, conf->tmppage, WRITE) |
1506 | == 0) | 1582 | == 0) { |
1507 | /* Well, this device is dead */ | 1583 | /* Well, this device is dead */ |
1584 | printk(KERN_NOTICE | ||
1585 | "md/raid10:%s: read correction " | ||
1586 | "write failed" | ||
1587 | " (%d sectors at %llu on %s)\n", | ||
1588 | mdname(mddev), s, | ||
1589 | (unsigned long long)(sect+ | ||
1590 | rdev->data_offset), | ||
1591 | bdevname(rdev->bdev, b)); | ||
1592 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
1593 | "drive\n", | ||
1594 | mdname(mddev), | ||
1595 | bdevname(rdev->bdev, b)); | ||
1508 | md_error(mddev, rdev); | 1596 | md_error(mddev, rdev); |
1597 | } | ||
1509 | rdev_dec_pending(rdev, mddev); | 1598 | rdev_dec_pending(rdev, mddev); |
1510 | rcu_read_lock(); | 1599 | rcu_read_lock(); |
1511 | } | 1600 | } |
1512 | } | 1601 | } |
1513 | sl = start; | 1602 | sl = start; |
1514 | while (sl != r10_bio->read_slot) { | 1603 | while (sl != r10_bio->read_slot) { |
1515 | int d; | 1604 | |
1516 | if (sl==0) | 1605 | if (sl==0) |
1517 | sl = conf->copies; | 1606 | sl = conf->copies; |
1518 | sl--; | 1607 | sl--; |
@@ -1526,17 +1615,31 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1526 | if (sync_page_io(rdev->bdev, | 1615 | if (sync_page_io(rdev->bdev, |
1527 | r10_bio->devs[sl].addr + | 1616 | r10_bio->devs[sl].addr + |
1528 | sect + rdev->data_offset, | 1617 | sect + rdev->data_offset, |
1529 | s<<9, conf->tmppage, READ) == 0) | 1618 | s<<9, conf->tmppage, |
1619 | READ) == 0) { | ||
1530 | /* Well, this device is dead */ | 1620 | /* Well, this device is dead */ |
1621 | printk(KERN_NOTICE | ||
1622 | "md/raid10:%s: unable to read back " | ||
1623 | "corrected sectors" | ||
1624 | " (%d sectors at %llu on %s)\n", | ||
1625 | mdname(mddev), s, | ||
1626 | (unsigned long long)(sect+ | ||
1627 | rdev->data_offset), | ||
1628 | bdevname(rdev->bdev, b)); | ||
1629 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | ||
1630 | mdname(mddev), | ||
1631 | bdevname(rdev->bdev, b)); | ||
1632 | |||
1531 | md_error(mddev, rdev); | 1633 | md_error(mddev, rdev); |
1532 | else | 1634 | } else { |
1533 | printk(KERN_INFO | 1635 | printk(KERN_INFO |
1534 | "raid10:%s: read error corrected" | 1636 | "md/raid10:%s: read error corrected" |
1535 | " (%d sectors at %llu on %s)\n", | 1637 | " (%d sectors at %llu on %s)\n", |
1536 | mdname(mddev), s, | 1638 | mdname(mddev), s, |
1537 | (unsigned long long)(sect+ | 1639 | (unsigned long long)(sect+ |
1538 | rdev->data_offset), | 1640 | rdev->data_offset), |
1539 | bdevname(rdev->bdev, b)); | 1641 | bdevname(rdev->bdev, b)); |
1642 | } | ||
1540 | 1643 | ||
1541 | rdev_dec_pending(rdev, mddev); | 1644 | rdev_dec_pending(rdev, mddev); |
1542 | rcu_read_lock(); | 1645 | rcu_read_lock(); |
@@ -1605,8 +1708,9 @@ static void raid10d(mddev_t *mddev) | |||
1605 | mddev->ro ? IO_BLOCKED : NULL; | 1708 | mddev->ro ? IO_BLOCKED : NULL; |
1606 | mirror = read_balance(conf, r10_bio); | 1709 | mirror = read_balance(conf, r10_bio); |
1607 | if (mirror == -1) { | 1710 | if (mirror == -1) { |
1608 | printk(KERN_ALERT "raid10: %s: unrecoverable I/O" | 1711 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" |
1609 | " read error for block %llu\n", | 1712 | " read error for block %llu\n", |
1713 | mdname(mddev), | ||
1610 | bdevname(bio->bi_bdev,b), | 1714 | bdevname(bio->bi_bdev,b), |
1611 | (unsigned long long)r10_bio->sector); | 1715 | (unsigned long long)r10_bio->sector); |
1612 | raid_end_bio_io(r10_bio); | 1716 | raid_end_bio_io(r10_bio); |
@@ -1616,8 +1720,9 @@ static void raid10d(mddev_t *mddev) | |||
1616 | bio_put(bio); | 1720 | bio_put(bio); |
1617 | rdev = conf->mirrors[mirror].rdev; | 1721 | rdev = conf->mirrors[mirror].rdev; |
1618 | if (printk_ratelimit()) | 1722 | if (printk_ratelimit()) |
1619 | printk(KERN_ERR "raid10: %s: redirecting sector %llu to" | 1723 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" |
1620 | " another mirror\n", | 1724 | " another mirror\n", |
1725 | mdname(mddev), | ||
1621 | bdevname(rdev->bdev,b), | 1726 | bdevname(rdev->bdev,b), |
1622 | (unsigned long long)r10_bio->sector); | 1727 | (unsigned long long)r10_bio->sector); |
1623 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); | 1728 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); |
@@ -1632,6 +1737,7 @@ static void raid10d(mddev_t *mddev) | |||
1632 | generic_make_request(bio); | 1737 | generic_make_request(bio); |
1633 | } | 1738 | } |
1634 | } | 1739 | } |
1740 | cond_resched(); | ||
1635 | } | 1741 | } |
1636 | if (unplug) | 1742 | if (unplug) |
1637 | unplug_slaves(mddev); | 1743 | unplug_slaves(mddev); |
@@ -1874,7 +1980,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1874 | r10_bio = rb2; | 1980 | r10_bio = rb2; |
1875 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 1981 | if (!test_and_set_bit(MD_RECOVERY_INTR, |
1876 | &mddev->recovery)) | 1982 | &mddev->recovery)) |
1877 | printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", | 1983 | printk(KERN_INFO "md/raid10:%s: insufficient " |
1984 | "working devices for recovery.\n", | ||
1878 | mdname(mddev)); | 1985 | mdname(mddev)); |
1879 | break; | 1986 | break; |
1880 | } | 1987 | } |
@@ -2034,9 +2141,9 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
2034 | conf_t *conf = mddev->private; | 2141 | conf_t *conf = mddev->private; |
2035 | 2142 | ||
2036 | if (!raid_disks) | 2143 | if (!raid_disks) |
2037 | raid_disks = mddev->raid_disks; | 2144 | raid_disks = conf->raid_disks; |
2038 | if (!sectors) | 2145 | if (!sectors) |
2039 | sectors = mddev->dev_sectors; | 2146 | sectors = conf->dev_sectors; |
2040 | 2147 | ||
2041 | size = sectors >> conf->chunk_shift; | 2148 | size = sectors >> conf->chunk_shift; |
2042 | sector_div(size, conf->far_copies); | 2149 | sector_div(size, conf->far_copies); |
@@ -2046,63 +2153,61 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
2046 | return size << conf->chunk_shift; | 2153 | return size << conf->chunk_shift; |
2047 | } | 2154 | } |
2048 | 2155 | ||
2049 | static int run(mddev_t *mddev) | 2156 | |
2157 | static conf_t *setup_conf(mddev_t *mddev) | ||
2050 | { | 2158 | { |
2051 | conf_t *conf; | 2159 | conf_t *conf = NULL; |
2052 | int i, disk_idx, chunk_size; | ||
2053 | mirror_info_t *disk; | ||
2054 | mdk_rdev_t *rdev; | ||
2055 | int nc, fc, fo; | 2160 | int nc, fc, fo; |
2056 | sector_t stride, size; | 2161 | sector_t stride, size; |
2162 | int err = -EINVAL; | ||
2057 | 2163 | ||
2058 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || | 2164 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || |
2059 | !is_power_of_2(mddev->chunk_sectors)) { | 2165 | !is_power_of_2(mddev->new_chunk_sectors)) { |
2060 | printk(KERN_ERR "md/raid10: chunk size must be " | 2166 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
2061 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); | 2167 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
2062 | return -EINVAL; | 2168 | mdname(mddev), PAGE_SIZE); |
2169 | goto out; | ||
2063 | } | 2170 | } |
2064 | 2171 | ||
2065 | nc = mddev->layout & 255; | 2172 | nc = mddev->new_layout & 255; |
2066 | fc = (mddev->layout >> 8) & 255; | 2173 | fc = (mddev->new_layout >> 8) & 255; |
2067 | fo = mddev->layout & (1<<16); | 2174 | fo = mddev->new_layout & (1<<16); |
2175 | |||
2068 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | 2176 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || |
2069 | (mddev->layout >> 17)) { | 2177 | (mddev->new_layout >> 17)) { |
2070 | printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", | 2178 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
2071 | mdname(mddev), mddev->layout); | 2179 | mdname(mddev), mddev->new_layout); |
2072 | goto out; | 2180 | goto out; |
2073 | } | 2181 | } |
2074 | /* | 2182 | |
2075 | * copy the already verified devices into our private RAID10 | 2183 | err = -ENOMEM; |
2076 | * bookkeeping area. [whatever we allocate in run(), | ||
2077 | * should be freed in stop()] | ||
2078 | */ | ||
2079 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); | 2184 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); |
2080 | mddev->private = conf; | 2185 | if (!conf) |
2081 | if (!conf) { | ||
2082 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
2083 | mdname(mddev)); | ||
2084 | goto out; | 2186 | goto out; |
2085 | } | 2187 | |
2086 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 2188 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, |
2087 | GFP_KERNEL); | 2189 | GFP_KERNEL); |
2088 | if (!conf->mirrors) { | 2190 | if (!conf->mirrors) |
2089 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | 2191 | goto out; |
2090 | mdname(mddev)); | ||
2091 | goto out_free_conf; | ||
2092 | } | ||
2093 | 2192 | ||
2094 | conf->tmppage = alloc_page(GFP_KERNEL); | 2193 | conf->tmppage = alloc_page(GFP_KERNEL); |
2095 | if (!conf->tmppage) | 2194 | if (!conf->tmppage) |
2096 | goto out_free_conf; | 2195 | goto out; |
2196 | |||
2097 | 2197 | ||
2098 | conf->mddev = mddev; | ||
2099 | conf->raid_disks = mddev->raid_disks; | 2198 | conf->raid_disks = mddev->raid_disks; |
2100 | conf->near_copies = nc; | 2199 | conf->near_copies = nc; |
2101 | conf->far_copies = fc; | 2200 | conf->far_copies = fc; |
2102 | conf->copies = nc*fc; | 2201 | conf->copies = nc*fc; |
2103 | conf->far_offset = fo; | 2202 | conf->far_offset = fo; |
2104 | conf->chunk_mask = mddev->chunk_sectors - 1; | 2203 | conf->chunk_mask = mddev->new_chunk_sectors - 1; |
2105 | conf->chunk_shift = ffz(~mddev->chunk_sectors); | 2204 | conf->chunk_shift = ffz(~mddev->new_chunk_sectors); |
2205 | |||
2206 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | ||
2207 | r10bio_pool_free, conf); | ||
2208 | if (!conf->r10bio_pool) | ||
2209 | goto out; | ||
2210 | |||
2106 | size = mddev->dev_sectors >> conf->chunk_shift; | 2211 | size = mddev->dev_sectors >> conf->chunk_shift; |
2107 | sector_div(size, fc); | 2212 | sector_div(size, fc); |
2108 | size = size * conf->raid_disks; | 2213 | size = size * conf->raid_disks; |
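Editor's note: raid10's new setup_conf() above reads the geometry from mddev->new_layout and new_chunk_sectors so it also works mid-takeover. The layout word packs near copies in bits 0-7, far copies in bits 8-15, and the far-offset flag in bit 16; anything above bit 16 is rejected. A helper-style sketch of that decoding (illustrative wrapper, not a kernel function):

#include <stdio.h>

struct r10_layout {
	int near_copies;
	int far_copies;
	int far_offset;
};

static int decode_raid10_layout(int layout, int raid_disks, struct r10_layout *l)
{
	l->near_copies = layout & 255;
	l->far_copies = (layout >> 8) & 255;
	l->far_offset = layout & (1 << 16);

	if (l->near_copies * l->far_copies < 2 ||
	    l->near_copies * l->far_copies > raid_disks ||
	    (layout >> 17))
		return -1;		/* "unsupported raid10 layout" */
	return 0;
}

int main(void)
{
	struct r10_layout l;

	if (decode_raid10_layout((1 << 8) + 2, 4, &l) == 0)
		printf("n%d f%d offset=%d\n",
		       l.near_copies, l.far_copies, l.far_offset != 0);
	return 0;
}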
@@ -2116,7 +2221,8 @@ static int run(mddev_t *mddev) | |||
2116 | */ | 2221 | */ |
2117 | stride += conf->raid_disks - 1; | 2222 | stride += conf->raid_disks - 1; |
2118 | sector_div(stride, conf->raid_disks); | 2223 | sector_div(stride, conf->raid_disks); |
2119 | mddev->dev_sectors = stride << conf->chunk_shift; | 2224 | |
2225 | conf->dev_sectors = stride << conf->chunk_shift; | ||
2120 | 2226 | ||
2121 | if (fo) | 2227 | if (fo) |
2122 | stride = 1; | 2228 | stride = 1; |
@@ -2124,17 +2230,62 @@ static int run(mddev_t *mddev) | |||
2124 | sector_div(stride, fc); | 2230 | sector_div(stride, fc); |
2125 | conf->stride = stride << conf->chunk_shift; | 2231 | conf->stride = stride << conf->chunk_shift; |
2126 | 2232 | ||
2127 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | ||
2128 | r10bio_pool_free, conf); | ||
2129 | if (!conf->r10bio_pool) { | ||
2130 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
2131 | mdname(mddev)); | ||
2132 | goto out_free_conf; | ||
2133 | } | ||
2134 | 2233 | ||
2135 | spin_lock_init(&conf->device_lock); | 2234 | spin_lock_init(&conf->device_lock); |
2235 | INIT_LIST_HEAD(&conf->retry_list); | ||
2236 | |||
2237 | spin_lock_init(&conf->resync_lock); | ||
2238 | init_waitqueue_head(&conf->wait_barrier); | ||
2239 | |||
2240 | conf->thread = md_register_thread(raid10d, mddev, NULL); | ||
2241 | if (!conf->thread) | ||
2242 | goto out; | ||
2243 | |||
2244 | conf->mddev = mddev; | ||
2245 | return conf; | ||
2246 | |||
2247 | out: | ||
2248 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | ||
2249 | mdname(mddev)); | ||
2250 | if (conf) { | ||
2251 | if (conf->r10bio_pool) | ||
2252 | mempool_destroy(conf->r10bio_pool); | ||
2253 | kfree(conf->mirrors); | ||
2254 | safe_put_page(conf->tmppage); | ||
2255 | kfree(conf); | ||
2256 | } | ||
2257 | return ERR_PTR(err); | ||
2258 | } | ||
2259 | |||
2260 | static int run(mddev_t *mddev) | ||
2261 | { | ||
2262 | conf_t *conf; | ||
2263 | int i, disk_idx, chunk_size; | ||
2264 | mirror_info_t *disk; | ||
2265 | mdk_rdev_t *rdev; | ||
2266 | sector_t size; | ||
2267 | |||
2268 | /* | ||
2269 | * copy the already verified devices into our private RAID10 | ||
2270 | * bookkeeping area. [whatever we allocate in run(), | ||
2271 | * should be freed in stop()] | ||
2272 | */ | ||
2273 | |||
2274 | if (mddev->private == NULL) { | ||
2275 | conf = setup_conf(mddev); | ||
2276 | if (IS_ERR(conf)) | ||
2277 | return PTR_ERR(conf); | ||
2278 | mddev->private = conf; | ||
2279 | } | ||
2280 | conf = mddev->private; | ||
2281 | if (!conf) | ||
2282 | goto out; | ||
2283 | |||
2136 | mddev->queue->queue_lock = &conf->device_lock; | 2284 | mddev->queue->queue_lock = &conf->device_lock; |
2137 | 2285 | ||
2286 | mddev->thread = conf->thread; | ||
2287 | conf->thread = NULL; | ||
2288 | |||
2138 | chunk_size = mddev->chunk_sectors << 9; | 2289 | chunk_size = mddev->chunk_sectors << 9; |
2139 | blk_queue_io_min(mddev->queue, chunk_size); | 2290 | blk_queue_io_min(mddev->queue, chunk_size); |
2140 | if (conf->raid_disks % conf->near_copies) | 2291 | if (conf->raid_disks % conf->near_copies) |
@@ -2145,7 +2296,7 @@ static int run(mddev_t *mddev) | |||
2145 | 2296 | ||
2146 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2297 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2147 | disk_idx = rdev->raid_disk; | 2298 | disk_idx = rdev->raid_disk; |
2148 | if (disk_idx >= mddev->raid_disks | 2299 | if (disk_idx >= conf->raid_disks |
2149 | || disk_idx < 0) | 2300 | || disk_idx < 0) |
2150 | continue; | 2301 | continue; |
2151 | disk = conf->mirrors + disk_idx; | 2302 | disk = conf->mirrors + disk_idx; |
@@ -2154,23 +2305,20 @@ static int run(mddev_t *mddev) | |||
2154 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 2305 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2155 | rdev->data_offset << 9); | 2306 | rdev->data_offset << 9); |
2156 | /* as we don't honour merge_bvec_fn, we must never risk | 2307 | /* as we don't honour merge_bvec_fn, we must never risk |
2157 | * violating it, so limit ->max_sector to one PAGE, as | 2308 | * violating it, so limit max_segments to 1 lying |
2158 | * a one page request is never in violation. | 2309 | * within a single page. |
2159 | */ | 2310 | */ |
2160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2311 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
2161 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 2312 | blk_queue_max_segments(mddev->queue, 1); |
2162 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2313 | blk_queue_segment_boundary(mddev->queue, |
2314 | PAGE_CACHE_SIZE - 1); | ||
2315 | } | ||
2163 | 2316 | ||
2164 | disk->head_position = 0; | 2317 | disk->head_position = 0; |
2165 | } | 2318 | } |
2166 | INIT_LIST_HEAD(&conf->retry_list); | ||
2167 | |||
2168 | spin_lock_init(&conf->resync_lock); | ||
2169 | init_waitqueue_head(&conf->wait_barrier); | ||
2170 | |||
2171 | /* need to check that every block has at least one working mirror */ | 2319 | /* need to check that every block has at least one working mirror */ |
2172 | if (!enough(conf)) { | 2320 | if (!enough(conf)) { |
2173 | printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", | 2321 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
2174 | mdname(mddev)); | 2322 | mdname(mddev)); |
2175 | goto out_free_conf; | 2323 | goto out_free_conf; |
2176 | } | 2324 | } |
@@ -2189,28 +2337,21 @@ static int run(mddev_t *mddev) | |||
2189 | } | 2337 | } |
2190 | } | 2338 | } |
2191 | 2339 | ||
2192 | |||
2193 | mddev->thread = md_register_thread(raid10d, mddev, NULL); | ||
2194 | if (!mddev->thread) { | ||
2195 | printk(KERN_ERR | ||
2196 | "raid10: couldn't allocate thread for %s\n", | ||
2197 | mdname(mddev)); | ||
2198 | goto out_free_conf; | ||
2199 | } | ||
2200 | |||
2201 | if (mddev->recovery_cp != MaxSector) | 2340 | if (mddev->recovery_cp != MaxSector) |
2202 | printk(KERN_NOTICE "raid10: %s is not clean" | 2341 | printk(KERN_NOTICE "md/raid10:%s: not clean" |
2203 | " -- starting background reconstruction\n", | 2342 | " -- starting background reconstruction\n", |
2204 | mdname(mddev)); | 2343 | mdname(mddev)); |
2205 | printk(KERN_INFO | 2344 | printk(KERN_INFO |
2206 | "raid10: raid set %s active with %d out of %d devices\n", | 2345 | "md/raid10:%s: active with %d out of %d devices\n", |
2207 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2346 | mdname(mddev), conf->raid_disks - mddev->degraded, |
2208 | mddev->raid_disks); | 2347 | conf->raid_disks); |
2209 | /* | 2348 | /* |
2210 | * Ok, everything is just fine now | 2349 | * Ok, everything is just fine now |
2211 | */ | 2350 | */ |
2212 | md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); | 2351 | mddev->dev_sectors = conf->dev_sectors; |
2213 | mddev->resync_max_sectors = raid10_size(mddev, 0, 0); | 2352 | size = raid10_size(mddev, 0, 0); |
2353 | md_set_array_sectors(mddev, size); | ||
2354 | mddev->resync_max_sectors = size; | ||
2214 | 2355 | ||
2215 | mddev->queue->unplug_fn = raid10_unplug; | 2356 | mddev->queue->unplug_fn = raid10_unplug; |
2216 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 2357 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; |
@@ -2228,7 +2369,7 @@ static int run(mddev_t *mddev) | |||
2228 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 2369 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
2229 | } | 2370 | } |
2230 | 2371 | ||
2231 | if (conf->near_copies < mddev->raid_disks) | 2372 | if (conf->near_copies < conf->raid_disks) |
2232 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 2373 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
2233 | md_integrity_register(mddev); | 2374 | md_integrity_register(mddev); |
2234 | return 0; | 2375 | return 0; |
@@ -2240,6 +2381,7 @@ out_free_conf: | |||
2240 | kfree(conf->mirrors); | 2381 | kfree(conf->mirrors); |
2241 | kfree(conf); | 2382 | kfree(conf); |
2242 | mddev->private = NULL; | 2383 | mddev->private = NULL; |
2384 | md_unregister_thread(mddev->thread); | ||
2243 | out: | 2385 | out: |
2244 | return -EIO; | 2386 | return -EIO; |
2245 | } | 2387 | } |
@@ -2274,13 +2416,57 @@ static void raid10_quiesce(mddev_t *mddev, int state) | |||
2274 | lower_barrier(conf); | 2416 | lower_barrier(conf); |
2275 | break; | 2417 | break; |
2276 | } | 2418 | } |
2277 | if (mddev->thread) { | 2419 | } |
2278 | if (mddev->bitmap) | 2420 | |
2279 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | 2421 | static void *raid10_takeover_raid0(mddev_t *mddev) |
2280 | else | 2422 | { |
2281 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 2423 | mdk_rdev_t *rdev; |
2282 | md_wakeup_thread(mddev->thread); | 2424 | conf_t *conf; |
2425 | |||
2426 | if (mddev->degraded > 0) { | ||
2427 | printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", | ||
2428 | mdname(mddev)); | ||
2429 | return ERR_PTR(-EINVAL); | ||
2430 | } | ||
2431 | |||
2432 | /* Set new parameters */ | ||
2433 | mddev->new_level = 10; | ||
2434 | /* new layout: far_copies = 1, near_copies = 2 */ | ||
2435 | mddev->new_layout = (1<<8) + 2; | ||
2436 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
2437 | mddev->delta_disks = mddev->raid_disks; | ||
2438 | mddev->raid_disks *= 2; | ||
2439 | /* make sure it will be not marked as dirty */ | ||
2440 | mddev->recovery_cp = MaxSector; | ||
2441 | |||
2442 | conf = setup_conf(mddev); | ||
2443 | if (!IS_ERR(conf)) | ||
2444 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
2445 | if (rdev->raid_disk >= 0) | ||
2446 | rdev->new_raid_disk = rdev->raid_disk * 2; | ||
2447 | |||
2448 | return conf; | ||
2449 | } | ||
2450 | |||
2451 | static void *raid10_takeover(mddev_t *mddev) | ||
2452 | { | ||
2453 | struct raid0_private_data *raid0_priv; | ||
2454 | |||
2455 | /* raid10 can take over: | ||
2456 | * raid0 - providing it has only two drives | ||
2457 | */ | ||
2458 | if (mddev->level == 0) { | ||
2459 | /* for raid0 takeover only one zone is supported */ | ||
2460 | raid0_priv = mddev->private; | ||
2461 | if (raid0_priv->nr_strip_zones > 1) { | ||
2462 | printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" | ||
2463 | " with more than one zone.\n", | ||
2464 | mdname(mddev)); | ||
2465 | return ERR_PTR(-EINVAL); | ||
2466 | } | ||
2467 | return raid10_takeover_raid0(mddev); | ||
2283 | } | 2468 | } |
2469 | return ERR_PTR(-EINVAL); | ||
2284 | } | 2470 | } |
2285 | 2471 | ||
2286 | static struct mdk_personality raid10_personality = | 2472 | static struct mdk_personality raid10_personality = |
@@ -2299,6 +2485,7 @@ static struct mdk_personality raid10_personality = | |||
2299 | .sync_request = sync_request, | 2485 | .sync_request = sync_request, |
2300 | .quiesce = raid10_quiesce, | 2486 | .quiesce = raid10_quiesce, |
2301 | .size = raid10_size, | 2487 | .size = raid10_size, |
2488 | .takeover = raid10_takeover, | ||
2302 | }; | 2489 | }; |
2303 | 2490 | ||
2304 | static int __init raid_init(void) | 2491 | static int __init raid_init(void) |
@@ -2314,6 +2501,7 @@ static void raid_exit(void) | |||
2314 | module_init(raid_init); | 2501 | module_init(raid_init); |
2315 | module_exit(raid_exit); | 2502 | module_exit(raid_exit); |
2316 | MODULE_LICENSE("GPL"); | 2503 | MODULE_LICENSE("GPL"); |
2504 | MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); | ||
2317 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ | 2505 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ |
2318 | MODULE_ALIAS("md-raid10"); | 2506 | MODULE_ALIAS("md-raid10"); |
2319 | MODULE_ALIAS("md-level-10"); | 2507 | MODULE_ALIAS("md-level-10"); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 59cd1efb8d30..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -33,6 +33,8 @@ struct r10_private_data_s { | |||
33 | * 1 stripe. | 33 | * 1 stripe. |
34 | */ | 34 | */ |
35 | 35 | ||
36 | sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ | ||
37 | |||
36 | int chunk_shift; /* shift from chunks to sectors */ | 38 | int chunk_shift; /* shift from chunks to sectors */ |
37 | sector_t chunk_mask; | 39 | sector_t chunk_mask; |
38 | 40 | ||
@@ -57,6 +59,11 @@ struct r10_private_data_s { | |||
57 | mempool_t *r10bio_pool; | 59 | mempool_t *r10bio_pool; |
58 | mempool_t *r10buf_pool; | 60 | mempool_t *r10buf_pool; |
59 | struct page *tmppage; | 61 | struct page *tmppage; |
62 | |||
63 | /* When taking over an array from a different personality, we store | ||
64 | * the new thread here until we fully activate the array. | ||
65 | */ | ||
66 | struct mdk_thread_s *thread; | ||
60 | }; | 67 | }; |
61 | 68 | ||
62 | typedef struct r10_private_data_s conf_t; | 69 | typedef struct r10_private_data_s conf_t; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94829804ab7f..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -50,8 +50,10 @@ | |||
50 | #include <linux/async.h> | 50 | #include <linux/async.h> |
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
53 | #include <linux/slab.h> | ||
53 | #include "md.h" | 54 | #include "md.h" |
54 | #include "raid5.h" | 55 | #include "raid5.h" |
56 | #include "raid0.h" | ||
55 | #include "bitmap.h" | 57 | #include "bitmap.h" |
56 | 58 | ||
57 | /* | 59 | /* |
@@ -156,13 +158,16 @@ static inline int raid6_next_disk(int disk, int raid_disks) | |||
156 | static int raid6_idx_to_slot(int idx, struct stripe_head *sh, | 158 | static int raid6_idx_to_slot(int idx, struct stripe_head *sh, |
157 | int *count, int syndrome_disks) | 159 | int *count, int syndrome_disks) |
158 | { | 160 | { |
159 | int slot; | 161 | int slot = *count; |
160 | 162 | ||
163 | if (sh->ddf_layout) | ||
164 | (*count)++; | ||
161 | if (idx == sh->pd_idx) | 165 | if (idx == sh->pd_idx) |
162 | return syndrome_disks; | 166 | return syndrome_disks; |
163 | if (idx == sh->qd_idx) | 167 | if (idx == sh->qd_idx) |
164 | return syndrome_disks + 1; | 168 | return syndrome_disks + 1; |
165 | slot = (*count)++; | 169 | if (!sh->ddf_layout) |
170 | (*count)++; | ||
166 | return slot; | 171 | return slot; |
167 | } | 172 | } |
168 | 173 | ||
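The rewritten raid6_idx_to_slot() above captures the count before any increment so that, in DDF layouts, the P and Q devices consume a slot in the rotation, while in native layouts only data devices advance the count. A stand-alone model of that logic (simplified struct, invented disk counts):

#include <stdio.h>

struct sh_model {
	int pd_idx;	/* index of the P (parity) device */
	int qd_idx;	/* index of the Q (syndrome) device */
	int ddf_layout;	/* 1 if P/Q occupy rotating slots (DDF), 0 otherwise */
};

static int idx_to_slot(int idx, struct sh_model *sh, int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)		/* DDF: P/Q consume a slot in the rotation */
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)		/* native: only data devices advance the count */
		(*count)++;
	return slot;
}

int main(void)
{
	struct sh_model sh = { .pd_idx = 3, .qd_idx = 4, .ddf_layout = 1 };
	int count = 0, disks = 5, syndrome_disks = 3;

	for (int idx = 0; idx < disks; idx++)
		printf("disk %d -> slot %d\n", idx,
		       idx_to_slot(idx, &sh, &count, syndrome_disks));
	return 0;
}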
@@ -272,12 +277,13 @@ out: | |||
272 | return sh; | 277 | return sh; |
273 | } | 278 | } |
274 | 279 | ||
275 | static void shrink_buffers(struct stripe_head *sh, int num) | 280 | static void shrink_buffers(struct stripe_head *sh) |
276 | { | 281 | { |
277 | struct page *p; | 282 | struct page *p; |
278 | int i; | 283 | int i; |
284 | int num = sh->raid_conf->pool_size; | ||
279 | 285 | ||
280 | for (i=0; i<num ; i++) { | 286 | for (i = 0; i < num ; i++) { |
281 | p = sh->dev[i].page; | 287 | p = sh->dev[i].page; |
282 | if (!p) | 288 | if (!p) |
283 | continue; | 289 | continue; |
@@ -286,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) | |||
286 | } | 292 | } |
287 | } | 293 | } |
288 | 294 | ||
289 | static int grow_buffers(struct stripe_head *sh, int num) | 295 | static int grow_buffers(struct stripe_head *sh) |
290 | { | 296 | { |
291 | int i; | 297 | int i; |
298 | int num = sh->raid_conf->pool_size; | ||
292 | 299 | ||
293 | for (i=0; i<num; i++) { | 300 | for (i = 0; i < num; i++) { |
294 | struct page *page; | 301 | struct page *page; |
295 | 302 | ||
296 | if (!(page = alloc_page(GFP_KERNEL))) { | 303 | if (!(page = alloc_page(GFP_KERNEL))) { |
@@ -359,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, | |||
359 | return NULL; | 366 | return NULL; |
360 | } | 367 | } |
361 | 368 | ||
369 | /* | ||
370 | * Need to check if array has failed when deciding whether to: | ||
371 | * - start an array | ||
372 | * - remove non-faulty devices | ||
373 | * - add a spare | ||
374 | * - allow a reshape | ||
375 | * This determination is simple when no reshape is happening. | ||
376 | * However if there is a reshape, we need to carefully check | ||
377 | * both the before and after sections. | ||
378 | * This is because some failed devices may only affect one | ||
379 | * of the two sections, and some non-in_sync devices may | ||
380 | * be insync in the section most affected by failed devices. | ||
381 | */ | ||
382 | static int has_failed(raid5_conf_t *conf) | ||
383 | { | ||
384 | int degraded; | ||
385 | int i; | ||
386 | if (conf->mddev->reshape_position == MaxSector) | ||
387 | return conf->mddev->degraded > conf->max_degraded; | ||
388 | |||
389 | rcu_read_lock(); | ||
390 | degraded = 0; | ||
391 | for (i = 0; i < conf->previous_raid_disks; i++) { | ||
392 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
393 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
394 | degraded++; | ||
395 | else if (test_bit(In_sync, &rdev->flags)) | ||
396 | ; | ||
397 | else | ||
398 | /* not in-sync or faulty. | ||
399 | * If the reshape increases the number of devices, | ||
400 | * this is being recovered by the reshape, so | ||
401 | * this 'previous' section is not in_sync. | ||
402 | * If the number of devices is being reduced however, | ||
403 | * the device can only be part of the array if | ||
404 | * we are reverting a reshape, so this section will | ||
405 | * be in-sync. | ||
406 | */ | ||
407 | if (conf->raid_disks >= conf->previous_raid_disks) | ||
408 | degraded++; | ||
409 | } | ||
410 | rcu_read_unlock(); | ||
411 | if (degraded > conf->max_degraded) | ||
412 | return 1; | ||
413 | rcu_read_lock(); | ||
414 | degraded = 0; | ||
415 | for (i = 0; i < conf->raid_disks; i++) { | ||
416 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
417 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
418 | degraded++; | ||
419 | else if (test_bit(In_sync, &rdev->flags)) | ||
420 | ; | ||
421 | else | ||
422 | /* not in-sync or faulty. | ||
423 | * If reshape increases the number of devices, this | ||
424 | * section has already been recovered, else it | ||
425 | * almost certainly hasn't. | ||
426 | */ | ||
427 | if (conf->raid_disks <= conf->previous_raid_disks) | ||
428 | degraded++; | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | if (degraded > conf->max_degraded) | ||
432 | return 1; | ||
433 | return 0; | ||
434 | } | ||
435 | |||
362 | static void unplug_slaves(mddev_t *mddev); | 436 | static void unplug_slaves(mddev_t *mddev); |
363 | static void raid5_unplug_device(struct request_queue *q); | 437 | static void raid5_unplug_device(struct request_queue *q); |
364 | 438 | ||
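has_failed() encodes the rule spelled out in its comment: during a reshape, a not-in-sync device counts as degraded in the old geometry when the array is growing, and as degraded in the new geometry when it is shrinking. A self-contained model of that two-pass count (device states and geometry sizes are invented for illustration):

#include <stdio.h>

enum dev_state { MISSING, FAULTY, IN_SYNC, RECOVERING };

static int degraded_in(const enum dev_state *dev, int ndisks, int counts_recovering)
{
	int degraded = 0;

	for (int i = 0; i < ndisks; i++) {
		if (dev[i] == MISSING || dev[i] == FAULTY)
			degraded++;
		else if (dev[i] == RECOVERING && counts_recovering)
			degraded++;
	}
	return degraded;
}

int main(void)
{
	/* 5-disk RAID5 growing to 6 disks; one device still recovering */
	enum dev_state dev[6] = { IN_SYNC, IN_SYNC, RECOVERING,
				  IN_SYNC, IN_SYNC, IN_SYNC };
	int previous_raid_disks = 5, raid_disks = 6, max_degraded = 1;

	/* Growing: a recovering device is NOT in-sync in the old section... */
	int deg_old = degraded_in(dev, previous_raid_disks,
				  raid_disks >= previous_raid_disks);
	/* ...but has already been recovered in the new section. */
	int deg_new = degraded_in(dev, raid_disks,
				  raid_disks <= previous_raid_disks);

	printf("failed = %d\n",
	       deg_old > max_degraded || deg_new > max_degraded);
	return 0;
}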
@@ -717,7 +791,7 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | |||
717 | int i; | 791 | int i; |
718 | 792 | ||
719 | for (i = 0; i < disks; i++) | 793 | for (i = 0; i < disks; i++) |
720 | srcs[i] = (void *)raid6_empty_zero_page; | 794 | srcs[i] = NULL; |
721 | 795 | ||
722 | count = 0; | 796 | count = 0; |
723 | i = d0_idx; | 797 | i = d0_idx; |
@@ -727,9 +801,8 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | |||
727 | srcs[slot] = sh->dev[i].page; | 801 | srcs[slot] = sh->dev[i].page; |
728 | i = raid6_next_disk(i, disks); | 802 | i = raid6_next_disk(i, disks); |
729 | } while (i != d0_idx); | 803 | } while (i != d0_idx); |
730 | BUG_ON(count != syndrome_disks); | ||
731 | 804 | ||
732 | return count; | 805 | return syndrome_disks; |
733 | } | 806 | } |
734 | 807 | ||
735 | static struct dma_async_tx_descriptor * | 808 | static struct dma_async_tx_descriptor * |
@@ -814,7 +887,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
814 | * slot number conversion for 'faila' and 'failb' | 887 | * slot number conversion for 'faila' and 'failb' |
815 | */ | 888 | */ |
816 | for (i = 0; i < disks ; i++) | 889 | for (i = 0; i < disks ; i++) |
817 | blocks[i] = (void *)raid6_empty_zero_page; | 890 | blocks[i] = NULL; |
818 | count = 0; | 891 | count = 0; |
819 | i = d0_idx; | 892 | i = d0_idx; |
820 | do { | 893 | do { |
@@ -828,7 +901,6 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
828 | failb = slot; | 901 | failb = slot; |
829 | i = raid6_next_disk(i, disks); | 902 | i = raid6_next_disk(i, disks); |
830 | } while (i != d0_idx); | 903 | } while (i != d0_idx); |
831 | BUG_ON(count != syndrome_disks); | ||
832 | 904 | ||
833 | BUG_ON(faila == failb); | 905 | BUG_ON(faila == failb); |
834 | if (failb < faila) | 906 | if (failb < faila) |
@@ -845,7 +917,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
845 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 917 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
846 | ops_complete_compute, sh, | 918 | ops_complete_compute, sh, |
847 | to_addr_conv(sh, percpu)); | 919 | to_addr_conv(sh, percpu)); |
848 | return async_gen_syndrome(blocks, 0, count+2, | 920 | return async_gen_syndrome(blocks, 0, syndrome_disks+2, |
849 | STRIPE_SIZE, &submit); | 921 | STRIPE_SIZE, &submit); |
850 | } else { | 922 | } else { |
851 | struct page *dest; | 923 | struct page *dest; |
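Both hunks above stop counting the populated sources; instead, absent slots simply stay NULL and syndrome_disks + 2 pages are always passed down, leaving the async layer to cope with NULL entries. A trivial stand-in showing the fixed-size, NULL-padded source array (strings in place of struct page pointers):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	enum { SYNDROME_DISKS = 4 };
	const char *srcs[SYNDROME_DISKS + 2] = { NULL };	/* all slots absent */

	/* pretend only data slots 0 and 2 plus the P/Q slots are populated */
	srcs[0] = "D0";
	srcs[2] = "D2";
	srcs[SYNDROME_DISKS] = "P";
	srcs[SYNDROME_DISKS + 1] = "Q";

	/* always hand over syndrome_disks + 2 entries, NULLs included */
	for (int i = 0; i < SYNDROME_DISKS + 2; i++)
		printf("slot %d: %s\n", i, srcs[i] ? srcs[i] : "(absent)");
	return 0;
}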
@@ -1139,7 +1211,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu | |||
1139 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | 1211 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); |
1140 | } | 1212 | } |
1141 | 1213 | ||
1142 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1214 | static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
1143 | { | 1215 | { |
1144 | int overlap_clear = 0, i, disks = sh->disks; | 1216 | int overlap_clear = 0, i, disks = sh->disks; |
1145 | struct dma_async_tx_descriptor *tx = NULL; | 1217 | struct dma_async_tx_descriptor *tx = NULL; |
@@ -1204,22 +1276,54 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1204 | put_cpu(); | 1276 | put_cpu(); |
1205 | } | 1277 | } |
1206 | 1278 | ||
1279 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1280 | static void async_run_ops(void *param, async_cookie_t cookie) | ||
1281 | { | ||
1282 | struct stripe_head *sh = param; | ||
1283 | unsigned long ops_request = sh->ops.request; | ||
1284 | |||
1285 | clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); | ||
1286 | wake_up(&sh->ops.wait_for_ops); | ||
1287 | |||
1288 | __raid_run_ops(sh, ops_request); | ||
1289 | release_stripe(sh); | ||
1290 | } | ||
1291 | |||
1292 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | ||
1293 | { | ||
1294 | /* since handle_stripe can be called outside of raid5d context | ||
1295 | * we need to ensure sh->ops.request is de-staged before another | ||
1296 | * request arrives | ||
1297 | */ | ||
1298 | wait_event(sh->ops.wait_for_ops, | ||
1299 | !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); | ||
1300 | sh->ops.request = ops_request; | ||
1301 | |||
1302 | atomic_inc(&sh->count); | ||
1303 | async_schedule(async_run_ops, sh); | ||
1304 | } | ||
1305 | #else | ||
1306 | #define raid_run_ops __raid_run_ops | ||
1307 | #endif | ||
1308 | |||
1207 | static int grow_one_stripe(raid5_conf_t *conf) | 1309 | static int grow_one_stripe(raid5_conf_t *conf) |
1208 | { | 1310 | { |
1209 | struct stripe_head *sh; | 1311 | struct stripe_head *sh; |
1210 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1312 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); |
1211 | if (!sh) | 1313 | if (!sh) |
1212 | return 0; | 1314 | return 0; |
1213 | memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); | 1315 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); |
1214 | sh->raid_conf = conf; | 1316 | sh->raid_conf = conf; |
1215 | spin_lock_init(&sh->lock); | 1317 | spin_lock_init(&sh->lock); |
1318 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1319 | init_waitqueue_head(&sh->ops.wait_for_ops); | ||
1320 | #endif | ||
1216 | 1321 | ||
1217 | if (grow_buffers(sh, conf->raid_disks)) { | 1322 | if (grow_buffers(sh)) { |
1218 | shrink_buffers(sh, conf->raid_disks); | 1323 | shrink_buffers(sh); |
1219 | kmem_cache_free(conf->slab_cache, sh); | 1324 | kmem_cache_free(conf->slab_cache, sh); |
1220 | return 0; | 1325 | return 0; |
1221 | } | 1326 | } |
1222 | sh->disks = conf->raid_disks; | ||
1223 | /* we just created an active stripe so... */ | 1327 | /* we just created an active stripe so... */ |
1224 | atomic_set(&sh->count, 1); | 1328 | atomic_set(&sh->count, 1); |
1225 | atomic_inc(&conf->active_stripes); | 1329 | atomic_inc(&conf->active_stripes); |
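Under CONFIG_MULTICORE_RAID456 the new raid_run_ops() must not overwrite sh->ops.request while a previous request is still staged, hence the STRIPE_OPS_REQ_PENDING bit-lock and the wait_for_ops queue. Below is a user-space (pthreads) model of that hand-off; the names mirror the kernel code, but the synchronisation primitives are plain POSIX, not the kernel ones.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  destaged = PTHREAD_COND_INITIALIZER;
static int req_pending;
static unsigned long staged_request;

static void submit_ops(unsigned long ops_request)
{
	pthread_mutex_lock(&lock);
	while (req_pending)		/* like wait_event(...wait_for_ops...) */
		pthread_cond_wait(&destaged, &lock);
	req_pending = 1;		/* like test_and_set_bit_lock() */
	staged_request = ops_request;
	pthread_mutex_unlock(&lock);
	/* the real code now async_schedule()s the worker */
}

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	unsigned long ops = staged_request;	/* de-stage the request */
	req_pending = 0;			/* like clear_bit_unlock() */
	pthread_cond_broadcast(&destaged);	/* like wake_up() */
	pthread_mutex_unlock(&lock);
	printf("running ops %#lx\n", ops);	/* like __raid_run_ops() */
	return NULL;
}

int main(void)
{
	pthread_t t;

	submit_ops(0x5);
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	submit_ops(0x9);	/* does not block: the worker already de-staged */
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}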
@@ -1231,7 +1335,7 @@ static int grow_one_stripe(raid5_conf_t *conf) | |||
1231 | static int grow_stripes(raid5_conf_t *conf, int num) | 1335 | static int grow_stripes(raid5_conf_t *conf, int num) |
1232 | { | 1336 | { |
1233 | struct kmem_cache *sc; | 1337 | struct kmem_cache *sc; |
1234 | int devs = conf->raid_disks; | 1338 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
1235 | 1339 | ||
1236 | sprintf(conf->cache_name[0], | 1340 | sprintf(conf->cache_name[0], |
1237 | "raid%d-%s", conf->level, mdname(conf->mddev)); | 1341 | "raid%d-%s", conf->level, mdname(conf->mddev)); |
@@ -1329,6 +1433,9 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1329 | 1433 | ||
1330 | nsh->raid_conf = conf; | 1434 | nsh->raid_conf = conf; |
1331 | spin_lock_init(&nsh->lock); | 1435 | spin_lock_init(&nsh->lock); |
1436 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1437 | init_waitqueue_head(&nsh->ops.wait_for_ops); | ||
1438 | #endif | ||
1332 | 1439 | ||
1333 | list_add(&nsh->lru, &newstripes); | 1440 | list_add(&nsh->lru, &newstripes); |
1334 | } | 1441 | } |
@@ -1429,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) | |||
1429 | if (!sh) | 1536 | if (!sh) |
1430 | return 0; | 1537 | return 0; |
1431 | BUG_ON(atomic_read(&sh->count)); | 1538 | BUG_ON(atomic_read(&sh->count)); |
1432 | shrink_buffers(sh, conf->pool_size); | 1539 | shrink_buffers(sh); |
1433 | kmem_cache_free(conf->slab_cache, sh); | 1540 | kmem_cache_free(conf->slab_cache, sh); |
1434 | atomic_dec(&conf->active_stripes); | 1541 | atomic_dec(&conf->active_stripes); |
1435 | return 1; | 1542 | return 1; |
@@ -1471,7 +1578,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1471 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1578 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1472 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1579 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
1473 | rdev = conf->disks[i].rdev; | 1580 | rdev = conf->disks[i].rdev; |
1474 | printk_rl(KERN_INFO "raid5:%s: read error corrected" | 1581 | printk_rl(KERN_INFO "md/raid:%s: read error corrected" |
1475 | " (%lu sectors at %llu on %s)\n", | 1582 | " (%lu sectors at %llu on %s)\n", |
1476 | mdname(conf->mddev), STRIPE_SECTORS, | 1583 | mdname(conf->mddev), STRIPE_SECTORS, |
1477 | (unsigned long long)(sh->sector | 1584 | (unsigned long long)(sh->sector |
@@ -1489,9 +1596,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1489 | 1596 | ||
1490 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1597 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1491 | atomic_inc(&rdev->read_errors); | 1598 | atomic_inc(&rdev->read_errors); |
1492 | if (conf->mddev->degraded) | 1599 | if (conf->mddev->degraded >= conf->max_degraded) |
1493 | printk_rl(KERN_WARNING | 1600 | printk_rl(KERN_WARNING |
1494 | "raid5:%s: read error not correctable " | 1601 | "md/raid:%s: read error not correctable " |
1495 | "(sector %llu on %s).\n", | 1602 | "(sector %llu on %s).\n", |
1496 | mdname(conf->mddev), | 1603 | mdname(conf->mddev), |
1497 | (unsigned long long)(sh->sector | 1604 | (unsigned long long)(sh->sector |
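The changed condition reflects that a read error stays correctable as long as some redundancy is left, i.e. while degraded < max_degraded, so a singly-degraded RAID6 can still rewrite a bad block. A trivial model of the new test:

#include <stdio.h>

static int read_error_correctable(int degraded, int max_degraded)
{
	return degraded < max_degraded;
}

int main(void)
{
	printf("raid5, 0 failed: %d\n", read_error_correctable(0, 1)); /* 1 */
	printf("raid5, 1 failed: %d\n", read_error_correctable(1, 1)); /* 0 */
	printf("raid6, 1 failed: %d\n", read_error_correctable(1, 2)); /* 1 */
	return 0;
}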
@@ -1500,7 +1607,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1500 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1607 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
1501 | /* Oh, no!!! */ | 1608 | /* Oh, no!!! */ |
1502 | printk_rl(KERN_WARNING | 1609 | printk_rl(KERN_WARNING |
1503 | "raid5:%s: read error NOT corrected!! " | 1610 | "md/raid:%s: read error NOT corrected!! " |
1504 | "(sector %llu on %s).\n", | 1611 | "(sector %llu on %s).\n", |
1505 | mdname(conf->mddev), | 1612 | mdname(conf->mddev), |
1506 | (unsigned long long)(sh->sector | 1613 | (unsigned long long)(sh->sector |
@@ -1509,7 +1616,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1509 | else if (atomic_read(&rdev->read_errors) | 1616 | else if (atomic_read(&rdev->read_errors) |
1510 | > conf->max_nr_stripes) | 1617 | > conf->max_nr_stripes) |
1511 | printk(KERN_WARNING | 1618 | printk(KERN_WARNING |
1512 | "raid5:%s: Too many read errors, failing device %s.\n", | 1619 | "md/raid:%s: Too many read errors, failing device %s.\n", |
1513 | mdname(conf->mddev), bdn); | 1620 | mdname(conf->mddev), bdn); |
1514 | else | 1621 | else |
1515 | retry = 1; | 1622 | retry = 1; |
@@ -1581,8 +1688,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) | |||
1581 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1688 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
1582 | { | 1689 | { |
1583 | char b[BDEVNAME_SIZE]; | 1690 | char b[BDEVNAME_SIZE]; |
1584 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 1691 | raid5_conf_t *conf = mddev->private; |
1585 | pr_debug("raid5: error called\n"); | 1692 | pr_debug("raid456: error called\n"); |
1586 | 1693 | ||
1587 | if (!test_bit(Faulty, &rdev->flags)) { | 1694 | if (!test_bit(Faulty, &rdev->flags)) { |
1588 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1695 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
@@ -1598,9 +1705,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1598 | } | 1705 | } |
1599 | set_bit(Faulty, &rdev->flags); | 1706 | set_bit(Faulty, &rdev->flags); |
1600 | printk(KERN_ALERT | 1707 | printk(KERN_ALERT |
1601 | "raid5: Disk failure on %s, disabling device.\n" | 1708 | "md/raid:%s: Disk failure on %s, disabling device.\n" |
1602 | "raid5: Operation continuing on %d devices.\n", | 1709 | KERN_ALERT |
1603 | bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); | 1710 | "md/raid:%s: Operation continuing on %d devices.\n", |
1711 | mdname(mddev), | ||
1712 | bdevname(rdev->bdev, b), | ||
1713 | mdname(mddev), | ||
1714 | conf->raid_disks - mddev->degraded); | ||
1604 | } | 1715 | } |
1605 | } | 1716 | } |
1606 | 1717 | ||
@@ -1612,8 +1723,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1612 | int previous, int *dd_idx, | 1723 | int previous, int *dd_idx, |
1613 | struct stripe_head *sh) | 1724 | struct stripe_head *sh) |
1614 | { | 1725 | { |
1615 | long stripe; | 1726 | sector_t stripe, stripe2; |
1616 | unsigned long chunk_number; | 1727 | sector_t chunk_number; |
1617 | unsigned int chunk_offset; | 1728 | unsigned int chunk_offset; |
1618 | int pd_idx, qd_idx; | 1729 | int pd_idx, qd_idx; |
1619 | int ddf_layout = 0; | 1730 | int ddf_layout = 0; |
@@ -1633,18 +1744,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1633 | */ | 1744 | */ |
1634 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | 1745 | chunk_offset = sector_div(r_sector, sectors_per_chunk); |
1635 | chunk_number = r_sector; | 1746 | chunk_number = r_sector; |
1636 | BUG_ON(r_sector != chunk_number); | ||
1637 | 1747 | ||
1638 | /* | 1748 | /* |
1639 | * Compute the stripe number | 1749 | * Compute the stripe number |
1640 | */ | 1750 | */ |
1641 | stripe = chunk_number / data_disks; | 1751 | stripe = chunk_number; |
1642 | 1752 | *dd_idx = sector_div(stripe, data_disks); | |
1643 | /* | 1753 | stripe2 = stripe; |
1644 | * Compute the data disk and parity disk indexes inside the stripe | ||
1645 | */ | ||
1646 | *dd_idx = chunk_number % data_disks; | ||
1647 | |||
1648 | /* | 1754 | /* |
1649 | * Select the parity disk based on the user selected algorithm. | 1755 | * Select the parity disk based on the user selected algorithm. |
1650 | */ | 1756 | */ |
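The conversions above rely on sector_div(), which divides its first argument in place and returns the remainder, giving a 64-bit-safe replacement for the old '%' on a long. A user-space stand-in (the real sector_div is a macro acting on the lvalue itself rather than taking a pointer):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* model of sector_div(): quotient left in *dividend, remainder returned */
static uint32_t sector_div_model(sector_t *dividend, uint32_t divisor)
{
	uint32_t rem = (uint32_t)(*dividend % divisor);

	*dividend /= divisor;
	return rem;
}

int main(void)
{
	sector_t r_sector = 1234567890123ULL;	/* arbitrary large sector */
	int sectors_per_chunk = 128, data_disks = 4, raid_disks = 5;

	int chunk_offset = (int)sector_div_model(&r_sector, sectors_per_chunk);
	sector_t stripe = r_sector;		/* now the chunk number */
	int dd_idx = (int)sector_div_model(&stripe, data_disks);
	sector_t stripe2 = stripe;
	/* e.g. ALGORITHM_LEFT_ASYMMETRIC parity placement */
	int pd_idx = data_disks - (int)sector_div_model(&stripe2, raid_disks);

	printf("chunk_offset=%d dd_idx=%d pd_idx=%d stripe=%llu\n",
	       chunk_offset, dd_idx, pd_idx, (unsigned long long)stripe);
	return 0;
}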
@@ -1656,21 +1762,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1656 | case 5: | 1762 | case 5: |
1657 | switch (algorithm) { | 1763 | switch (algorithm) { |
1658 | case ALGORITHM_LEFT_ASYMMETRIC: | 1764 | case ALGORITHM_LEFT_ASYMMETRIC: |
1659 | pd_idx = data_disks - stripe % raid_disks; | 1765 | pd_idx = data_disks - sector_div(stripe2, raid_disks); |
1660 | if (*dd_idx >= pd_idx) | 1766 | if (*dd_idx >= pd_idx) |
1661 | (*dd_idx)++; | 1767 | (*dd_idx)++; |
1662 | break; | 1768 | break; |
1663 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1769 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1664 | pd_idx = stripe % raid_disks; | 1770 | pd_idx = sector_div(stripe2, raid_disks); |
1665 | if (*dd_idx >= pd_idx) | 1771 | if (*dd_idx >= pd_idx) |
1666 | (*dd_idx)++; | 1772 | (*dd_idx)++; |
1667 | break; | 1773 | break; |
1668 | case ALGORITHM_LEFT_SYMMETRIC: | 1774 | case ALGORITHM_LEFT_SYMMETRIC: |
1669 | pd_idx = data_disks - stripe % raid_disks; | 1775 | pd_idx = data_disks - sector_div(stripe2, raid_disks); |
1670 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1776 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1671 | break; | 1777 | break; |
1672 | case ALGORITHM_RIGHT_SYMMETRIC: | 1778 | case ALGORITHM_RIGHT_SYMMETRIC: |
1673 | pd_idx = stripe % raid_disks; | 1779 | pd_idx = sector_div(stripe2, raid_disks); |
1674 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1780 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1675 | break; | 1781 | break; |
1676 | case ALGORITHM_PARITY_0: | 1782 | case ALGORITHM_PARITY_0: |
@@ -1681,8 +1787,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1681 | pd_idx = data_disks; | 1787 | pd_idx = data_disks; |
1682 | break; | 1788 | break; |
1683 | default: | 1789 | default: |
1684 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", | ||
1685 | algorithm); | ||
1686 | BUG(); | 1790 | BUG(); |
1687 | } | 1791 | } |
1688 | break; | 1792 | break; |
@@ -1690,7 +1794,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1690 | 1794 | ||
1691 | switch (algorithm) { | 1795 | switch (algorithm) { |
1692 | case ALGORITHM_LEFT_ASYMMETRIC: | 1796 | case ALGORITHM_LEFT_ASYMMETRIC: |
1693 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1797 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1694 | qd_idx = pd_idx + 1; | 1798 | qd_idx = pd_idx + 1; |
1695 | if (pd_idx == raid_disks-1) { | 1799 | if (pd_idx == raid_disks-1) { |
1696 | (*dd_idx)++; /* Q D D D P */ | 1800 | (*dd_idx)++; /* Q D D D P */ |
@@ -1699,7 +1803,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1699 | (*dd_idx) += 2; /* D D P Q D */ | 1803 | (*dd_idx) += 2; /* D D P Q D */ |
1700 | break; | 1804 | break; |
1701 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1805 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1702 | pd_idx = stripe % raid_disks; | 1806 | pd_idx = sector_div(stripe2, raid_disks); |
1703 | qd_idx = pd_idx + 1; | 1807 | qd_idx = pd_idx + 1; |
1704 | if (pd_idx == raid_disks-1) { | 1808 | if (pd_idx == raid_disks-1) { |
1705 | (*dd_idx)++; /* Q D D D P */ | 1809 | (*dd_idx)++; /* Q D D D P */ |
@@ -1708,12 +1812,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1708 | (*dd_idx) += 2; /* D D P Q D */ | 1812 | (*dd_idx) += 2; /* D D P Q D */ |
1709 | break; | 1813 | break; |
1710 | case ALGORITHM_LEFT_SYMMETRIC: | 1814 | case ALGORITHM_LEFT_SYMMETRIC: |
1711 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1815 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1712 | qd_idx = (pd_idx + 1) % raid_disks; | 1816 | qd_idx = (pd_idx + 1) % raid_disks; |
1713 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | 1817 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; |
1714 | break; | 1818 | break; |
1715 | case ALGORITHM_RIGHT_SYMMETRIC: | 1819 | case ALGORITHM_RIGHT_SYMMETRIC: |
1716 | pd_idx = stripe % raid_disks; | 1820 | pd_idx = sector_div(stripe2, raid_disks); |
1717 | qd_idx = (pd_idx + 1) % raid_disks; | 1821 | qd_idx = (pd_idx + 1) % raid_disks; |
1718 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | 1822 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; |
1719 | break; | 1823 | break; |
@@ -1732,7 +1836,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1732 | /* Exactly the same as RIGHT_ASYMMETRIC, but order | 1836 | /* Exactly the same as RIGHT_ASYMMETRIC, but order |
1733 | * of blocks for computing Q is different. | 1837 | * of blocks for computing Q is different. |
1734 | */ | 1838 | */ |
1735 | pd_idx = stripe % raid_disks; | 1839 | pd_idx = sector_div(stripe2, raid_disks); |
1736 | qd_idx = pd_idx + 1; | 1840 | qd_idx = pd_idx + 1; |
1737 | if (pd_idx == raid_disks-1) { | 1841 | if (pd_idx == raid_disks-1) { |
1738 | (*dd_idx)++; /* Q D D D P */ | 1842 | (*dd_idx)++; /* Q D D D P */ |
@@ -1747,7 +1851,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1747 | * D D D P Q rather than | 1851 | * D D D P Q rather than |
1748 | * Q D D D P | 1852 | * Q D D D P |
1749 | */ | 1853 | */ |
1750 | pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); | 1854 | stripe2 += 1; |
1855 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); | ||
1751 | qd_idx = pd_idx + 1; | 1856 | qd_idx = pd_idx + 1; |
1752 | if (pd_idx == raid_disks-1) { | 1857 | if (pd_idx == raid_disks-1) { |
1753 | (*dd_idx)++; /* Q D D D P */ | 1858 | (*dd_idx)++; /* Q D D D P */ |
@@ -1759,7 +1864,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1759 | 1864 | ||
1760 | case ALGORITHM_ROTATING_N_CONTINUE: | 1865 | case ALGORITHM_ROTATING_N_CONTINUE: |
1761 | /* Same as left_symmetric but Q is before P */ | 1866 | /* Same as left_symmetric but Q is before P */ |
1762 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1867 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1763 | qd_idx = (pd_idx + raid_disks - 1) % raid_disks; | 1868 | qd_idx = (pd_idx + raid_disks - 1) % raid_disks; |
1764 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1869 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1765 | ddf_layout = 1; | 1870 | ddf_layout = 1; |
@@ -1767,27 +1872,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1767 | 1872 | ||
1768 | case ALGORITHM_LEFT_ASYMMETRIC_6: | 1873 | case ALGORITHM_LEFT_ASYMMETRIC_6: |
1769 | /* RAID5 left_asymmetric, with Q on last device */ | 1874 | /* RAID5 left_asymmetric, with Q on last device */ |
1770 | pd_idx = data_disks - stripe % (raid_disks-1); | 1875 | pd_idx = data_disks - sector_div(stripe2, raid_disks-1); |
1771 | if (*dd_idx >= pd_idx) | 1876 | if (*dd_idx >= pd_idx) |
1772 | (*dd_idx)++; | 1877 | (*dd_idx)++; |
1773 | qd_idx = raid_disks - 1; | 1878 | qd_idx = raid_disks - 1; |
1774 | break; | 1879 | break; |
1775 | 1880 | ||
1776 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | 1881 | case ALGORITHM_RIGHT_ASYMMETRIC_6: |
1777 | pd_idx = stripe % (raid_disks-1); | 1882 | pd_idx = sector_div(stripe2, raid_disks-1); |
1778 | if (*dd_idx >= pd_idx) | 1883 | if (*dd_idx >= pd_idx) |
1779 | (*dd_idx)++; | 1884 | (*dd_idx)++; |
1780 | qd_idx = raid_disks - 1; | 1885 | qd_idx = raid_disks - 1; |
1781 | break; | 1886 | break; |
1782 | 1887 | ||
1783 | case ALGORITHM_LEFT_SYMMETRIC_6: | 1888 | case ALGORITHM_LEFT_SYMMETRIC_6: |
1784 | pd_idx = data_disks - stripe % (raid_disks-1); | 1889 | pd_idx = data_disks - sector_div(stripe2, raid_disks-1); |
1785 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | 1890 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); |
1786 | qd_idx = raid_disks - 1; | 1891 | qd_idx = raid_disks - 1; |
1787 | break; | 1892 | break; |
1788 | 1893 | ||
1789 | case ALGORITHM_RIGHT_SYMMETRIC_6: | 1894 | case ALGORITHM_RIGHT_SYMMETRIC_6: |
1790 | pd_idx = stripe % (raid_disks-1); | 1895 | pd_idx = sector_div(stripe2, raid_disks-1); |
1791 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | 1896 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); |
1792 | qd_idx = raid_disks - 1; | 1897 | qd_idx = raid_disks - 1; |
1793 | break; | 1898 | break; |
@@ -1798,10 +1903,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1798 | qd_idx = raid_disks - 1; | 1903 | qd_idx = raid_disks - 1; |
1799 | break; | 1904 | break; |
1800 | 1905 | ||
1801 | |||
1802 | default: | 1906 | default: |
1803 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
1804 | algorithm); | ||
1805 | BUG(); | 1907 | BUG(); |
1806 | } | 1908 | } |
1807 | break; | 1909 | break; |
@@ -1832,14 +1934,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1832 | : conf->algorithm; | 1934 | : conf->algorithm; |
1833 | sector_t stripe; | 1935 | sector_t stripe; |
1834 | int chunk_offset; | 1936 | int chunk_offset; |
1835 | int chunk_number, dummy1, dd_idx = i; | 1937 | sector_t chunk_number; |
1938 | int dummy1, dd_idx = i; | ||
1836 | sector_t r_sector; | 1939 | sector_t r_sector; |
1837 | struct stripe_head sh2; | 1940 | struct stripe_head sh2; |
1838 | 1941 | ||
1839 | 1942 | ||
1840 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | 1943 | chunk_offset = sector_div(new_sector, sectors_per_chunk); |
1841 | stripe = new_sector; | 1944 | stripe = new_sector; |
1842 | BUG_ON(new_sector != stripe); | ||
1843 | 1945 | ||
1844 | if (i == sh->pd_idx) | 1946 | if (i == sh->pd_idx) |
1845 | return 0; | 1947 | return 0; |
@@ -1864,8 +1966,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1864 | case ALGORITHM_PARITY_N: | 1966 | case ALGORITHM_PARITY_N: |
1865 | break; | 1967 | break; |
1866 | default: | 1968 | default: |
1867 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", | ||
1868 | algorithm); | ||
1869 | BUG(); | 1969 | BUG(); |
1870 | } | 1970 | } |
1871 | break; | 1971 | break; |
@@ -1899,10 +1999,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1899 | case ALGORITHM_PARITY_N: | 1999 | case ALGORITHM_PARITY_N: |
1900 | break; | 2000 | break; |
1901 | case ALGORITHM_ROTATING_N_CONTINUE: | 2001 | case ALGORITHM_ROTATING_N_CONTINUE: |
2002 | /* Like left_symmetric, but P is before Q */ | ||
1902 | if (sh->pd_idx == 0) | 2003 | if (sh->pd_idx == 0) |
1903 | i--; /* P D D D Q */ | 2004 | i--; /* P D D D Q */ |
1904 | else if (i > sh->pd_idx) | 2005 | else { |
1905 | i -= 2; /* D D Q P D */ | 2006 | /* D D Q P D */ |
2007 | if (i < sh->pd_idx) | ||
2008 | i += raid_disks; | ||
2009 | i -= (sh->pd_idx + 1); | ||
2010 | } | ||
1906 | break; | 2011 | break; |
1907 | case ALGORITHM_LEFT_ASYMMETRIC_6: | 2012 | case ALGORITHM_LEFT_ASYMMETRIC_6: |
1908 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | 2013 | case ALGORITHM_RIGHT_ASYMMETRIC_6: |
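The new ALGORITHM_ROTATING_N_CONTINUE branch maps a physical device index back to its data index by rotating around P: data starts just after pd_idx and wraps, so the old "i -= 2" was wrong for wrapped devices. A worked example with invented geometry (raid_disks = 5, pd_idx = 2):

#include <stdio.h>

int main(void)
{
	int raid_disks = 5, pd_idx = 2;
	int qd_idx = (pd_idx + raid_disks - 1) % raid_disks;	/* Q just before P */

	for (int i = 0; i < raid_disks; i++) {
		int d;

		if (i == pd_idx || i == qd_idx)
			continue;		/* parity device, no data block */
		d = i;
		if (d < pd_idx)			/* wrapped-around data device */
			d += raid_disks;
		d -= (pd_idx + 1);
		printf("device %d holds data block %d  (layout D Q P D D)\n",
		       i, d);
	}
	return 0;
}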
@@ -1919,21 +2024,20 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1919 | i -= 1; | 2024 | i -= 1; |
1920 | break; | 2025 | break; |
1921 | default: | 2026 | default: |
1922 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
1923 | algorithm); | ||
1924 | BUG(); | 2027 | BUG(); |
1925 | } | 2028 | } |
1926 | break; | 2029 | break; |
1927 | } | 2030 | } |
1928 | 2031 | ||
1929 | chunk_number = stripe * data_disks + i; | 2032 | chunk_number = stripe * data_disks + i; |
1930 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | 2033 | r_sector = chunk_number * sectors_per_chunk + chunk_offset; |
1931 | 2034 | ||
1932 | check = raid5_compute_sector(conf, r_sector, | 2035 | check = raid5_compute_sector(conf, r_sector, |
1933 | previous, &dummy1, &sh2); | 2036 | previous, &dummy1, &sh2); |
1934 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx | 2037 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx |
1935 | || sh2.qd_idx != sh->qd_idx) { | 2038 | || sh2.qd_idx != sh->qd_idx) { |
1936 | printk(KERN_ERR "compute_blocknr: map not correct\n"); | 2039 | printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", |
2040 | mdname(conf->mddev)); | ||
1937 | return 0; | 2041 | return 0; |
1938 | } | 2042 | } |
1939 | return r_sector; | 2043 | return r_sector; |
@@ -2896,7 +3000,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2896 | * | 3000 | * |
2897 | */ | 3001 | */ |
2898 | 3002 | ||
2899 | static bool handle_stripe5(struct stripe_head *sh) | 3003 | static void handle_stripe5(struct stripe_head *sh) |
2900 | { | 3004 | { |
2901 | raid5_conf_t *conf = sh->raid_conf; | 3005 | raid5_conf_t *conf = sh->raid_conf; |
2902 | int disks = sh->disks, i; | 3006 | int disks = sh->disks, i; |
@@ -2905,6 +3009,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2905 | struct r5dev *dev; | 3009 | struct r5dev *dev; |
2906 | mdk_rdev_t *blocked_rdev = NULL; | 3010 | mdk_rdev_t *blocked_rdev = NULL; |
2907 | int prexor; | 3011 | int prexor; |
3012 | int dec_preread_active = 0; | ||
2908 | 3013 | ||
2909 | memset(&s, 0, sizeof(s)); | 3014 | memset(&s, 0, sizeof(s)); |
2910 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | 3015 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " |
@@ -2926,7 +3031,6 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2926 | mdk_rdev_t *rdev; | 3031 | mdk_rdev_t *rdev; |
2927 | 3032 | ||
2928 | dev = &sh->dev[i]; | 3033 | dev = &sh->dev[i]; |
2929 | clear_bit(R5_Insync, &dev->flags); | ||
2930 | 3034 | ||
2931 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3035 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " |
2932 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3036 | "written %p\n", i, dev->flags, dev->toread, dev->read, |
@@ -2963,17 +3067,27 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2963 | blocked_rdev = rdev; | 3067 | blocked_rdev = rdev; |
2964 | atomic_inc(&rdev->nr_pending); | 3068 | atomic_inc(&rdev->nr_pending); |
2965 | } | 3069 | } |
2966 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3070 | clear_bit(R5_Insync, &dev->flags); |
3071 | if (!rdev) | ||
3072 | /* Not in-sync */; | ||
3073 | else if (test_bit(In_sync, &rdev->flags)) | ||
3074 | set_bit(R5_Insync, &dev->flags); | ||
3075 | else { | ||
3076 | /* could be in-sync depending on recovery/reshape status */ | ||
3077 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3078 | set_bit(R5_Insync, &dev->flags); | ||
3079 | } | ||
3080 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
2967 | /* The ReadError flag will just be confusing now */ | 3081 | /* The ReadError flag will just be confusing now */ |
2968 | clear_bit(R5_ReadError, &dev->flags); | 3082 | clear_bit(R5_ReadError, &dev->flags); |
2969 | clear_bit(R5_ReWrite, &dev->flags); | 3083 | clear_bit(R5_ReWrite, &dev->flags); |
2970 | } | 3084 | } |
2971 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3085 | if (test_bit(R5_ReadError, &dev->flags)) |
2972 | || test_bit(R5_ReadError, &dev->flags)) { | 3086 | clear_bit(R5_Insync, &dev->flags); |
3087 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
2973 | s.failed++; | 3088 | s.failed++; |
2974 | s.failed_num = i; | 3089 | s.failed_num = i; |
2975 | } else | 3090 | } |
2976 | set_bit(R5_Insync, &dev->flags); | ||
2977 | } | 3091 | } |
2978 | rcu_read_unlock(); | 3092 | rcu_read_unlock(); |
2979 | 3093 | ||
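The reworked loop above replaces the plain In_sync test with one that also accepts a recovering device for stripes that lie entirely below its recovery_offset. A minimal model of that decision (STRIPE_SECTORS and the sample offsets are illustrative values):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;
#define STRIPE_SECTORS 8

static int r5_insync(int have_rdev, int in_sync,
		     sector_t sector, sector_t recovery_offset)
{
	if (!have_rdev)
		return 0;			/* no device: not in-sync */
	if (in_sync)
		return 1;			/* fully in-sync device */
	/* recovering: in-sync only for stripes already rebuilt */
	return sector + STRIPE_SECTORS <= recovery_offset;
}

int main(void)
{
	/* device recovered up to sector 1024 */
	printf("%d\n", r5_insync(1, 0, 1000, 1024));	/* 1: stripe rebuilt */
	printf("%d\n", r5_insync(1, 0, 1020, 1024));	/* 0: stripe straddles */
	printf("%d\n", r5_insync(1, 1, 5000, 0));	/* 1: In_sync flag set */
	return 0;
}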
@@ -3054,12 +3168,8 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3054 | set_bit(STRIPE_INSYNC, &sh->state); | 3168 | set_bit(STRIPE_INSYNC, &sh->state); |
3055 | } | 3169 | } |
3056 | } | 3170 | } |
3057 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 3171 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3058 | atomic_dec(&conf->preread_active_stripes); | 3172 | dec_preread_active = 1; |
3059 | if (atomic_read(&conf->preread_active_stripes) < | ||
3060 | IO_THRESHOLD) | ||
3061 | md_wakeup_thread(conf->mddev->thread); | ||
3062 | } | ||
3063 | } | 3173 | } |
3064 | 3174 | ||
3065 | /* Now to consider new write requests and what else, if anything | 3175 | /* Now to consider new write requests and what else, if anything |
@@ -3166,12 +3276,20 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3166 | 3276 | ||
3167 | ops_run_io(sh, &s); | 3277 | ops_run_io(sh, &s); |
3168 | 3278 | ||
3279 | if (dec_preread_active) { | ||
3280 | /* We delay this until after ops_run_io so that if make_request | ||
3281 | * is waiting on a barrier, it won't continue until the writes | ||
3282 | * have actually been submitted. | ||
3283 | */ | ||
3284 | atomic_dec(&conf->preread_active_stripes); | ||
3285 | if (atomic_read(&conf->preread_active_stripes) < | ||
3286 | IO_THRESHOLD) | ||
3287 | md_wakeup_thread(conf->mddev->thread); | ||
3288 | } | ||
3169 | return_io(return_bi); | 3289 | return_io(return_bi); |
3170 | |||
3171 | return blocked_rdev == NULL; | ||
3172 | } | 3290 | } |
3173 | 3291 | ||
3174 | static bool handle_stripe6(struct stripe_head *sh) | 3292 | static void handle_stripe6(struct stripe_head *sh) |
3175 | { | 3293 | { |
3176 | raid5_conf_t *conf = sh->raid_conf; | 3294 | raid5_conf_t *conf = sh->raid_conf; |
3177 | int disks = sh->disks; | 3295 | int disks = sh->disks; |
@@ -3181,6 +3299,7 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3181 | struct r6_state r6s; | 3299 | struct r6_state r6s; |
3182 | struct r5dev *dev, *pdev, *qdev; | 3300 | struct r5dev *dev, *pdev, *qdev; |
3183 | mdk_rdev_t *blocked_rdev = NULL; | 3301 | mdk_rdev_t *blocked_rdev = NULL; |
3302 | int dec_preread_active = 0; | ||
3184 | 3303 | ||
3185 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3304 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3186 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3305 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
@@ -3202,7 +3321,6 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3202 | for (i=disks; i--; ) { | 3321 | for (i=disks; i--; ) { |
3203 | mdk_rdev_t *rdev; | 3322 | mdk_rdev_t *rdev; |
3204 | dev = &sh->dev[i]; | 3323 | dev = &sh->dev[i]; |
3205 | clear_bit(R5_Insync, &dev->flags); | ||
3206 | 3324 | ||
3207 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3325 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3208 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3326 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
@@ -3240,18 +3358,28 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3240 | blocked_rdev = rdev; | 3358 | blocked_rdev = rdev; |
3241 | atomic_inc(&rdev->nr_pending); | 3359 | atomic_inc(&rdev->nr_pending); |
3242 | } | 3360 | } |
3243 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3361 | clear_bit(R5_Insync, &dev->flags); |
3362 | if (!rdev) | ||
3363 | /* Not in-sync */; | ||
3364 | else if (test_bit(In_sync, &rdev->flags)) | ||
3365 | set_bit(R5_Insync, &dev->flags); | ||
3366 | else { | ||
3367 | /* in sync if before recovery_offset */ | ||
3368 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3369 | set_bit(R5_Insync, &dev->flags); | ||
3370 | } | ||
3371 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3244 | /* The ReadError flag will just be confusing now */ | 3372 | /* The ReadError flag will just be confusing now */ |
3245 | clear_bit(R5_ReadError, &dev->flags); | 3373 | clear_bit(R5_ReadError, &dev->flags); |
3246 | clear_bit(R5_ReWrite, &dev->flags); | 3374 | clear_bit(R5_ReWrite, &dev->flags); |
3247 | } | 3375 | } |
3248 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3376 | if (test_bit(R5_ReadError, &dev->flags)) |
3249 | || test_bit(R5_ReadError, &dev->flags)) { | 3377 | clear_bit(R5_Insync, &dev->flags); |
3378 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3250 | if (s.failed < 2) | 3379 | if (s.failed < 2) |
3251 | r6s.failed_num[s.failed] = i; | 3380 | r6s.failed_num[s.failed] = i; |
3252 | s.failed++; | 3381 | s.failed++; |
3253 | } else | 3382 | } |
3254 | set_bit(R5_Insync, &dev->flags); | ||
3255 | } | 3383 | } |
3256 | rcu_read_unlock(); | 3384 | rcu_read_unlock(); |
3257 | 3385 | ||
@@ -3318,7 +3446,6 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3318 | * completed | 3446 | * completed |
3319 | */ | 3447 | */ |
3320 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3448 | if (sh->reconstruct_state == reconstruct_state_drain_result) { |
3321 | int qd_idx = sh->qd_idx; | ||
3322 | 3449 | ||
3323 | sh->reconstruct_state = reconstruct_state_idle; | 3450 | sh->reconstruct_state = reconstruct_state_idle; |
3324 | /* All the 'written' buffers and the parity blocks are ready to | 3451 | /* All the 'written' buffers and the parity blocks are ready to |
@@ -3340,12 +3467,8 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3340 | set_bit(STRIPE_INSYNC, &sh->state); | 3467 | set_bit(STRIPE_INSYNC, &sh->state); |
3341 | } | 3468 | } |
3342 | } | 3469 | } |
3343 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 3470 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3344 | atomic_dec(&conf->preread_active_stripes); | 3471 | dec_preread_active = 1; |
3345 | if (atomic_read(&conf->preread_active_stripes) < | ||
3346 | IO_THRESHOLD) | ||
3347 | md_wakeup_thread(conf->mddev->thread); | ||
3348 | } | ||
3349 | } | 3472 | } |
3350 | 3473 | ||
3351 | /* Now to consider new write requests and what else, if anything | 3474 | /* Now to consider new write requests and what else, if anything |
@@ -3454,18 +3577,27 @@ static bool handle_stripe6(struct stripe_head *sh) | |||
3454 | 3577 | ||
3455 | ops_run_io(sh, &s); | 3578 | ops_run_io(sh, &s); |
3456 | 3579 | ||
3457 | return_io(return_bi); | ||
3458 | 3580 | ||
3459 | return blocked_rdev == NULL; | 3581 | if (dec_preread_active) { |
3582 | /* We delay this until after ops_run_io so that if make_request | ||
3583 | * is waiting on a barrier, it won't continue until the writes | ||
3584 | * have actually been submitted. | ||
3585 | */ | ||
3586 | atomic_dec(&conf->preread_active_stripes); | ||
3587 | if (atomic_read(&conf->preread_active_stripes) < | ||
3588 | IO_THRESHOLD) | ||
3589 | md_wakeup_thread(conf->mddev->thread); | ||
3590 | } | ||
3591 | |||
3592 | return_io(return_bi); | ||
3460 | } | 3593 | } |
3461 | 3594 | ||
3462 | /* returns true if the stripe was handled */ | 3595 | static void handle_stripe(struct stripe_head *sh) |
3463 | static bool handle_stripe(struct stripe_head *sh) | ||
3464 | { | 3596 | { |
3465 | if (sh->raid_conf->level == 6) | 3597 | if (sh->raid_conf->level == 6) |
3466 | return handle_stripe6(sh); | 3598 | handle_stripe6(sh); |
3467 | else | 3599 | else |
3468 | return handle_stripe5(sh); | 3600 | handle_stripe5(sh); |
3469 | } | 3601 | } |
3470 | 3602 | ||
3471 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3603 | static void raid5_activate_delayed(raid5_conf_t *conf) |
@@ -3503,9 +3635,10 @@ static void unplug_slaves(mddev_t *mddev) | |||
3503 | { | 3635 | { |
3504 | raid5_conf_t *conf = mddev->private; | 3636 | raid5_conf_t *conf = mddev->private; |
3505 | int i; | 3637 | int i; |
3638 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | ||
3506 | 3639 | ||
3507 | rcu_read_lock(); | 3640 | rcu_read_lock(); |
3508 | for (i = 0; i < conf->raid_disks; i++) { | 3641 | for (i = 0; i < devs; i++) { |
3509 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | 3642 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); |
3510 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | 3643 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { |
3511 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | 3644 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); |
@@ -3659,10 +3792,10 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
3659 | 3792 | ||
3660 | bio_put(bi); | 3793 | bio_put(bi); |
3661 | 3794 | ||
3662 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; | ||
3663 | conf = mddev->private; | ||
3664 | rdev = (void*)raid_bi->bi_next; | 3795 | rdev = (void*)raid_bi->bi_next; |
3665 | raid_bi->bi_next = NULL; | 3796 | raid_bi->bi_next = NULL; |
3797 | mddev = rdev->mddev; | ||
3798 | conf = mddev->private; | ||
3666 | 3799 | ||
3667 | rdev_dec_pending(rdev, conf->mddev); | 3800 | rdev_dec_pending(rdev, conf->mddev); |
3668 | 3801 | ||
@@ -3686,7 +3819,7 @@ static int bio_fits_rdev(struct bio *bi) | |||
3686 | if ((bi->bi_size>>9) > queue_max_sectors(q)) | 3819 | if ((bi->bi_size>>9) > queue_max_sectors(q)) |
3687 | return 0; | 3820 | return 0; |
3688 | blk_recount_segments(q, bi); | 3821 | blk_recount_segments(q, bi); |
3689 | if (bi->bi_phys_segments > queue_max_phys_segments(q)) | 3822 | if (bi->bi_phys_segments > queue_max_segments(q)) |
3690 | return 0; | 3823 | return 0; |
3691 | 3824 | ||
3692 | if (q->merge_bvec_fn) | 3825 | if (q->merge_bvec_fn) |
@@ -3699,11 +3832,10 @@ static int bio_fits_rdev(struct bio *bi) | |||
3699 | } | 3832 | } |
3700 | 3833 | ||
3701 | 3834 | ||
3702 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | 3835 | static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) |
3703 | { | 3836 | { |
3704 | mddev_t *mddev = q->queuedata; | ||
3705 | raid5_conf_t *conf = mddev->private; | 3837 | raid5_conf_t *conf = mddev->private; |
3706 | unsigned int dd_idx; | 3838 | int dd_idx; |
3707 | struct bio* align_bi; | 3839 | struct bio* align_bi; |
3708 | mdk_rdev_t *rdev; | 3840 | mdk_rdev_t *rdev; |
3709 | 3841 | ||
@@ -3816,33 +3948,32 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) | |||
3816 | return sh; | 3948 | return sh; |
3817 | } | 3949 | } |
3818 | 3950 | ||
3819 | static int make_request(struct request_queue *q, struct bio * bi) | 3951 | static int make_request(mddev_t *mddev, struct bio * bi) |
3820 | { | 3952 | { |
3821 | mddev_t *mddev = q->queuedata; | ||
3822 | raid5_conf_t *conf = mddev->private; | 3953 | raid5_conf_t *conf = mddev->private; |
3823 | int dd_idx; | 3954 | int dd_idx; |
3824 | sector_t new_sector; | 3955 | sector_t new_sector; |
3825 | sector_t logical_sector, last_sector; | 3956 | sector_t logical_sector, last_sector; |
3826 | struct stripe_head *sh; | 3957 | struct stripe_head *sh; |
3827 | const int rw = bio_data_dir(bi); | 3958 | const int rw = bio_data_dir(bi); |
3828 | int cpu, remaining; | 3959 | int remaining; |
3829 | 3960 | ||
3830 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { | 3961 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { |
3831 | bio_endio(bi, -EOPNOTSUPP); | 3962 | /* Drain all pending writes. We only really need |
3963 | * to ensure they have been submitted, but this is | ||
3964 | * easier. | ||
3965 | */ | ||
3966 | mddev->pers->quiesce(mddev, 1); | ||
3967 | mddev->pers->quiesce(mddev, 0); | ||
3968 | md_barrier_request(mddev, bi); | ||
3832 | return 0; | 3969 | return 0; |
3833 | } | 3970 | } |
3834 | 3971 | ||
3835 | md_write_start(mddev, bi); | 3972 | md_write_start(mddev, bi); |
3836 | 3973 | ||
3837 | cpu = part_stat_lock(); | ||
3838 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | ||
3839 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | ||
3840 | bio_sectors(bi)); | ||
3841 | part_stat_unlock(); | ||
3842 | |||
3843 | if (rw == READ && | 3974 | if (rw == READ && |
3844 | mddev->reshape_position == MaxSector && | 3975 | mddev->reshape_position == MaxSector && |
3845 | chunk_aligned_read(q,bi)) | 3976 | chunk_aligned_read(mddev,bi)) |
3846 | return 0; | 3977 | return 0; |
3847 | 3978 | ||
3848 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 3979 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
@@ -3890,7 +4021,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3890 | new_sector = raid5_compute_sector(conf, logical_sector, | 4021 | new_sector = raid5_compute_sector(conf, logical_sector, |
3891 | previous, | 4022 | previous, |
3892 | &dd_idx, NULL); | 4023 | &dd_idx, NULL); |
3893 | pr_debug("raid5: make_request, sector %llu logical %llu\n", | 4024 | pr_debug("raid456: make_request, sector %llu logical %llu\n", |
3894 | (unsigned long long)new_sector, | 4025 | (unsigned long long)new_sector, |
3895 | (unsigned long long)logical_sector); | 4026 | (unsigned long long)logical_sector); |
3896 | 4027 | ||
@@ -3952,6 +4083,9 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3952 | finish_wait(&conf->wait_for_overlap, &w); | 4083 | finish_wait(&conf->wait_for_overlap, &w); |
3953 | set_bit(STRIPE_HANDLE, &sh->state); | 4084 | set_bit(STRIPE_HANDLE, &sh->state); |
3954 | clear_bit(STRIPE_DELAYED, &sh->state); | 4085 | clear_bit(STRIPE_DELAYED, &sh->state); |
4086 | if (mddev->barrier && | ||
4087 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
4088 | atomic_inc(&conf->preread_active_stripes); | ||
3955 | release_stripe(sh); | 4089 | release_stripe(sh); |
3956 | } else { | 4090 | } else { |
3957 | /* cannot get stripe for read-ahead, just give-up */ | 4091 | /* cannot get stripe for read-ahead, just give-up */ |
@@ -3971,6 +4105,14 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3971 | 4105 | ||
3972 | bio_endio(bi, 0); | 4106 | bio_endio(bi, 0); |
3973 | } | 4107 | } |
4108 | |||
4109 | if (mddev->barrier) { | ||
4110 | /* We need to wait for the stripes to all be handled. | ||
4111 | * So: wait for preread_active_stripes to drop to 0. | ||
4112 | */ | ||
4113 | wait_event(mddev->thread->wqueue, | ||
4114 | atomic_read(&conf->preread_active_stripes) == 0); | ||
4115 | } | ||
3974 | return 0; | 4116 | return 0; |
3975 | } | 4117 | } |
3976 | 4118 | ||
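Taken together, the barrier changes in make_request() and the earlier dec_preread_active blocks behave like a simple counter: each stripe touched by a barrier write raises preread_active_stripes, the count is only dropped after ops_run_io() has submitted the writes, and the barrier waits for it to reach zero. A stand-alone sketch of just that bookkeeping (a plain atomic counter, no real waiting):

#include <stdio.h>
#include <stdatomic.h>

static atomic_int preread_active_stripes;

static void queue_stripe(void)
{
	/* like setting STRIPE_PREREAD_ACTIVE on a barrier write */
	atomic_fetch_add(&preread_active_stripes, 1);
}

static void handle_stripe_model(void)
{
	/* ...ops_run_io() submits the writes first... */
	atomic_fetch_sub(&preread_active_stripes, 1);	/* then dec_preread_active */
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		queue_stripe();
	for (int i = 0; i < 4; i++)
		handle_stripe_model();
	/* barrier path: wait_event(..., preread_active_stripes == 0) */
	printf("barrier may complete: %d\n",
	       atomic_load(&preread_active_stripes) == 0);
	return 0;
}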
@@ -3987,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3987 | * As the reads complete, handle_stripe will copy the data | 4129 | * As the reads complete, handle_stripe will copy the data |
3988 | * into the destination stripe and release that stripe. | 4130 | * into the destination stripe and release that stripe. |
3989 | */ | 4131 | */ |
3990 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 4132 | raid5_conf_t *conf = mddev->private; |
3991 | struct stripe_head *sh; | 4133 | struct stripe_head *sh; |
3992 | sector_t first_sector, last_sector; | 4134 | sector_t first_sector, last_sector; |
3993 | int raid_disks = conf->previous_raid_disks; | 4135 | int raid_disks = conf->previous_raid_disks; |
@@ -4011,6 +4153,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4011 | sector_nr = conf->reshape_progress; | 4153 | sector_nr = conf->reshape_progress; |
4012 | sector_div(sector_nr, new_data_disks); | 4154 | sector_div(sector_nr, new_data_disks); |
4013 | if (sector_nr) { | 4155 | if (sector_nr) { |
4156 | mddev->curr_resync_completed = sector_nr; | ||
4157 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
4014 | *skipped = 1; | 4158 | *skipped = 1; |
4015 | return sector_nr; | 4159 | return sector_nr; |
4016 | } | 4160 | } |
@@ -4194,7 +4338,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4194 | /* FIXME go_faster isn't used */ | 4338 | /* FIXME go_faster isn't used */ |
4195 | static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 4339 | static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
4196 | { | 4340 | { |
4197 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 4341 | raid5_conf_t *conf = mddev->private; |
4198 | struct stripe_head *sh; | 4342 | struct stripe_head *sh; |
4199 | sector_t max_sector = mddev->dev_sectors; | 4343 | sector_t max_sector = mddev->dev_sectors; |
4200 | int sync_blocks; | 4344 | int sync_blocks; |
@@ -4277,9 +4421,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4277 | clear_bit(STRIPE_INSYNC, &sh->state); | 4421 | clear_bit(STRIPE_INSYNC, &sh->state); |
4278 | spin_unlock(&sh->lock); | 4422 | spin_unlock(&sh->lock); |
4279 | 4423 | ||
4280 | /* wait for any blocked device to be handled */ | 4424 | handle_stripe(sh); |
4281 | while (unlikely(!handle_stripe(sh))) | ||
4282 | ; | ||
4283 | release_stripe(sh); | 4425 | release_stripe(sh); |
4284 | 4426 | ||
4285 | return STRIPE_SECTORS; | 4427 | return STRIPE_SECTORS; |
@@ -4349,37 +4491,6 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4349 | return handled; | 4491 | return handled; |
4350 | } | 4492 | } |
4351 | 4493 | ||
4352 | #ifdef CONFIG_MULTICORE_RAID456 | ||
4353 | static void __process_stripe(void *param, async_cookie_t cookie) | ||
4354 | { | ||
4355 | struct stripe_head *sh = param; | ||
4356 | |||
4357 | handle_stripe(sh); | ||
4358 | release_stripe(sh); | ||
4359 | } | ||
4360 | |||
4361 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4362 | { | ||
4363 | async_schedule_domain(__process_stripe, sh, domain); | ||
4364 | } | ||
4365 | |||
4366 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4367 | { | ||
4368 | async_synchronize_full_domain(domain); | ||
4369 | } | ||
4370 | #else | ||
4371 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4372 | { | ||
4373 | handle_stripe(sh); | ||
4374 | release_stripe(sh); | ||
4375 | cond_resched(); | ||
4376 | } | ||
4377 | |||
4378 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4379 | { | ||
4380 | } | ||
4381 | #endif | ||
4382 | |||
4383 | 4494 | ||
4384 | /* | 4495 | /* |
4385 | * This is our raid5 kernel thread. | 4496 | * This is our raid5 kernel thread. |
@@ -4393,7 +4504,6 @@ static void raid5d(mddev_t *mddev) | |||
4393 | struct stripe_head *sh; | 4504 | struct stripe_head *sh; |
4394 | raid5_conf_t *conf = mddev->private; | 4505 | raid5_conf_t *conf = mddev->private; |
4395 | int handled; | 4506 | int handled; |
4396 | LIST_HEAD(raid_domain); | ||
4397 | 4507 | ||
4398 | pr_debug("+++ raid5d active\n"); | 4508 | pr_debug("+++ raid5d active\n"); |
4399 | 4509 | ||
@@ -4430,7 +4540,9 @@ static void raid5d(mddev_t *mddev) | |||
4430 | spin_unlock_irq(&conf->device_lock); | 4540 | spin_unlock_irq(&conf->device_lock); |
4431 | 4541 | ||
4432 | handled++; | 4542 | handled++; |
4433 | process_stripe(sh, &raid_domain); | 4543 | handle_stripe(sh); |
4544 | release_stripe(sh); | ||
4545 | cond_resched(); | ||
4434 | 4546 | ||
4435 | spin_lock_irq(&conf->device_lock); | 4547 | spin_lock_irq(&conf->device_lock); |
4436 | } | 4548 | } |
@@ -4438,7 +4550,6 @@ static void raid5d(mddev_t *mddev) | |||
4438 | 4550 | ||
4439 | spin_unlock_irq(&conf->device_lock); | 4551 | spin_unlock_irq(&conf->device_lock); |
4440 | 4552 | ||
4441 | synchronize_stripe_processing(&raid_domain); | ||
4442 | async_tx_issue_pending_all(); | 4553 | async_tx_issue_pending_all(); |
4443 | unplug_slaves(mddev); | 4554 | unplug_slaves(mddev); |
4444 | 4555 | ||
@@ -4558,13 +4669,9 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4558 | 4669 | ||
4559 | if (!sectors) | 4670 | if (!sectors) |
4560 | sectors = mddev->dev_sectors; | 4671 | sectors = mddev->dev_sectors; |
4561 | if (!raid_disks) { | 4672 | if (!raid_disks) |
4562 | /* size is defined by the smallest of previous and new size */ | 4673 | /* size is defined by the smallest of previous and new size */ |
4563 | if (conf->raid_disks < conf->previous_raid_disks) | 4674 | raid_disks = min(conf->raid_disks, conf->previous_raid_disks); |
4564 | raid_disks = conf->raid_disks; | ||
4565 | else | ||
4566 | raid_disks = conf->previous_raid_disks; | ||
4567 | } | ||
4568 | 4675 | ||
4569 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); | 4676 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
4570 | sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); | 4677 | sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); |
@@ -4624,7 +4731,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
4624 | kfree(percpu->scribble); | 4731 | kfree(percpu->scribble); |
4625 | pr_err("%s: failed memory allocation for cpu%ld\n", | 4732 | pr_err("%s: failed memory allocation for cpu%ld\n", |
4626 | __func__, cpu); | 4733 | __func__, cpu); |
4627 | return NOTIFY_BAD; | 4734 | return notifier_from_errno(-ENOMEM); |
4628 | } | 4735 | } |
4629 | break; | 4736 | break; |
4630 | case CPU_DEAD: | 4737 | case CPU_DEAD: |
@@ -4645,7 +4752,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) | |||
4645 | { | 4752 | { |
4646 | unsigned long cpu; | 4753 | unsigned long cpu; |
4647 | struct page *spare_page; | 4754 | struct page *spare_page; |
4648 | struct raid5_percpu *allcpus; | 4755 | struct raid5_percpu __percpu *allcpus; |
4649 | void *scribble; | 4756 | void *scribble; |
4650 | int err; | 4757 | int err; |
4651 | 4758 | ||
@@ -4665,7 +4772,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) | |||
4665 | } | 4772 | } |
4666 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | 4773 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; |
4667 | } | 4774 | } |
4668 | scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); | 4775 | scribble = kmalloc(conf->scribble_len, GFP_KERNEL); |
4669 | if (!scribble) { | 4776 | if (!scribble) { |
4670 | err = -ENOMEM; | 4777 | err = -ENOMEM; |
4671 | break; | 4778 | break; |
@@ -4686,14 +4793,14 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) | |||
4686 | static raid5_conf_t *setup_conf(mddev_t *mddev) | 4793 | static raid5_conf_t *setup_conf(mddev_t *mddev) |
4687 | { | 4794 | { |
4688 | raid5_conf_t *conf; | 4795 | raid5_conf_t *conf; |
4689 | int raid_disk, memory; | 4796 | int raid_disk, memory, max_disks; |
4690 | mdk_rdev_t *rdev; | 4797 | mdk_rdev_t *rdev; |
4691 | struct disk_info *disk; | 4798 | struct disk_info *disk; |
4692 | 4799 | ||
4693 | if (mddev->new_level != 5 | 4800 | if (mddev->new_level != 5 |
4694 | && mddev->new_level != 4 | 4801 | && mddev->new_level != 4 |
4695 | && mddev->new_level != 6) { | 4802 | && mddev->new_level != 6) { |
4696 | printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", | 4803 | printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", |
4697 | mdname(mddev), mddev->new_level); | 4804 | mdname(mddev), mddev->new_level); |
4698 | return ERR_PTR(-EIO); | 4805 | return ERR_PTR(-EIO); |
4699 | } | 4806 | } |
@@ -4701,12 +4808,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4701 | && !algorithm_valid_raid5(mddev->new_layout)) || | 4808 | && !algorithm_valid_raid5(mddev->new_layout)) || |
4702 | (mddev->new_level == 6 | 4809 | (mddev->new_level == 6 |
4703 | && !algorithm_valid_raid6(mddev->new_layout))) { | 4810 | && !algorithm_valid_raid6(mddev->new_layout))) { |
4704 | printk(KERN_ERR "raid5: %s: layout %d not supported\n", | 4811 | printk(KERN_ERR "md/raid:%s: layout %d not supported\n", |
4705 | mdname(mddev), mddev->new_layout); | 4812 | mdname(mddev), mddev->new_layout); |
4706 | return ERR_PTR(-EIO); | 4813 | return ERR_PTR(-EIO); |
4707 | } | 4814 | } |
4708 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { | 4815 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { |
4709 | printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", | 4816 | printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", |
4710 | mdname(mddev), mddev->raid_disks); | 4817 | mdname(mddev), mddev->raid_disks); |
4711 | return ERR_PTR(-EINVAL); | 4818 | return ERR_PTR(-EINVAL); |
4712 | } | 4819 | } |
@@ -4714,23 +4821,36 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4714 | if (!mddev->new_chunk_sectors || | 4821 | if (!mddev->new_chunk_sectors || |
4715 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | 4822 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || |
4716 | !is_power_of_2(mddev->new_chunk_sectors)) { | 4823 | !is_power_of_2(mddev->new_chunk_sectors)) { |
4717 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | 4824 | printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", |
4718 | mddev->new_chunk_sectors << 9, mdname(mddev)); | 4825 | mdname(mddev), mddev->new_chunk_sectors << 9); |
4719 | return ERR_PTR(-EINVAL); | 4826 | return ERR_PTR(-EINVAL); |
4720 | } | 4827 | } |
4721 | 4828 | ||
4722 | conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); | 4829 | conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); |
4723 | if (conf == NULL) | 4830 | if (conf == NULL) |
4724 | goto abort; | 4831 | goto abort; |
4832 | spin_lock_init(&conf->device_lock); | ||
4833 | init_waitqueue_head(&conf->wait_for_stripe); | ||
4834 | init_waitqueue_head(&conf->wait_for_overlap); | ||
4835 | INIT_LIST_HEAD(&conf->handle_list); | ||
4836 | INIT_LIST_HEAD(&conf->hold_list); | ||
4837 | INIT_LIST_HEAD(&conf->delayed_list); | ||
4838 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
4839 | INIT_LIST_HEAD(&conf->inactive_list); | ||
4840 | atomic_set(&conf->active_stripes, 0); | ||
4841 | atomic_set(&conf->preread_active_stripes, 0); | ||
4842 | atomic_set(&conf->active_aligned_reads, 0); | ||
4843 | conf->bypass_threshold = BYPASS_THRESHOLD; | ||
4725 | 4844 | ||
4726 | conf->raid_disks = mddev->raid_disks; | 4845 | conf->raid_disks = mddev->raid_disks; |
4727 | conf->scribble_len = scribble_len(conf->raid_disks); | ||
4728 | if (mddev->reshape_position == MaxSector) | 4846 | if (mddev->reshape_position == MaxSector) |
4729 | conf->previous_raid_disks = mddev->raid_disks; | 4847 | conf->previous_raid_disks = mddev->raid_disks; |
4730 | else | 4848 | else |
4731 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; | 4849 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; |
4850 | max_disks = max(conf->raid_disks, conf->previous_raid_disks); | ||
4851 | conf->scribble_len = scribble_len(max_disks); | ||
4732 | 4852 | ||
4733 | conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), | 4853 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), |
4734 | GFP_KERNEL); | 4854 | GFP_KERNEL); |
4735 | if (!conf->disks) | 4855 | if (!conf->disks) |
4736 | goto abort; | 4856 | goto abort; |
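
setup_conf() now sizes both conf->disks and the scribble region from max_disks, the larger of the pre- and post-reshape device counts, so neither array can be indexed past while a reshape is in flight. A small sketch of the sizing rule; scribble_len() here is a rough stand-in for the raid5.c helper (which counts a page pointer and an address-conversion slot per device plus two spares for the P/Q destinations):

#include <stddef.h>
#include <stdio.h>

/* Rough stand-in for raid5.c's scribble_len(); the kernel uses real
 * page-pointer and address-conversion types instead of void *. */
static size_t scribble_len(int num)
{
	return (sizeof(void *) + sizeof(void *)) * (num + 2);
}

int main(void)
{
	int raid_disks = 4;		/* geometry after the reshape  */
	int previous_raid_disks = 6;	/* geometry before (shrinking) */
	int max_disks = raid_disks > previous_raid_disks ?
			raid_disks : previous_raid_disks;

	/* Allocate for the wider geometry so both layouts stay
	 * addressable while the reshape is only partly done. */
	printf("disk_info slots: %d, scribble bytes: %zu\n",
	       max_disks, scribble_len(max_disks));
	return 0;
}
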
@@ -4744,24 +4864,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4744 | if (raid5_alloc_percpu(conf) != 0) | 4864 | if (raid5_alloc_percpu(conf) != 0) |
4745 | goto abort; | 4865 | goto abort; |
4746 | 4866 | ||
4747 | spin_lock_init(&conf->device_lock); | 4867 | pr_debug("raid456: run(%s) called.\n", mdname(mddev)); |
4748 | init_waitqueue_head(&conf->wait_for_stripe); | ||
4749 | init_waitqueue_head(&conf->wait_for_overlap); | ||
4750 | INIT_LIST_HEAD(&conf->handle_list); | ||
4751 | INIT_LIST_HEAD(&conf->hold_list); | ||
4752 | INIT_LIST_HEAD(&conf->delayed_list); | ||
4753 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
4754 | INIT_LIST_HEAD(&conf->inactive_list); | ||
4755 | atomic_set(&conf->active_stripes, 0); | ||
4756 | atomic_set(&conf->preread_active_stripes, 0); | ||
4757 | atomic_set(&conf->active_aligned_reads, 0); | ||
4758 | conf->bypass_threshold = BYPASS_THRESHOLD; | ||
4759 | |||
4760 | pr_debug("raid5: run(%s) called.\n", mdname(mddev)); | ||
4761 | 4868 | ||
4762 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 4869 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
4763 | raid_disk = rdev->raid_disk; | 4870 | raid_disk = rdev->raid_disk; |
4764 | if (raid_disk >= conf->raid_disks | 4871 | if (raid_disk >= max_disks |
4765 | || raid_disk < 0) | 4872 | || raid_disk < 0) |
4766 | continue; | 4873 | continue; |
4767 | disk = conf->disks + raid_disk; | 4874 | disk = conf->disks + raid_disk; |
@@ -4770,9 +4877,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4770 | 4877 | ||
4771 | if (test_bit(In_sync, &rdev->flags)) { | 4878 | if (test_bit(In_sync, &rdev->flags)) { |
4772 | char b[BDEVNAME_SIZE]; | 4879 | char b[BDEVNAME_SIZE]; |
4773 | printk(KERN_INFO "raid5: device %s operational as raid" | 4880 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" |
4774 | " disk %d\n", bdevname(rdev->bdev,b), | 4881 | " disk %d\n", |
4775 | raid_disk); | 4882 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); |
4776 | } else | 4883 | } else |
4777 | /* Cannot rely on bitmap to complete recovery */ | 4884 | /* Cannot rely on bitmap to complete recovery */ |
4778 | conf->fullsync = 1; | 4885 | conf->fullsync = 1; |
@@ -4793,19 +4900,20 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4793 | } | 4900 | } |
4794 | 4901 | ||
4795 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 4902 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + |
4796 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 4903 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
4797 | if (grow_stripes(conf, conf->max_nr_stripes)) { | 4904 | if (grow_stripes(conf, conf->max_nr_stripes)) { |
4798 | printk(KERN_ERR | 4905 | printk(KERN_ERR |
4799 | "raid5: couldn't allocate %dkB for buffers\n", memory); | 4906 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
4907 | mdname(mddev), memory); | ||
4800 | goto abort; | 4908 | goto abort; |
4801 | } else | 4909 | } else |
4802 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", | 4910 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", |
4803 | memory, mdname(mddev)); | 4911 | mdname(mddev), memory); |
4804 | 4912 | ||
4805 | conf->thread = md_register_thread(raid5d, mddev, NULL); | 4913 | conf->thread = md_register_thread(raid5d, mddev, NULL); |
4806 | if (!conf->thread) { | 4914 | if (!conf->thread) { |
4807 | printk(KERN_ERR | 4915 | printk(KERN_ERR |
4808 | "raid5: couldn't allocate thread for %s\n", | 4916 | "md/raid:%s: couldn't allocate thread.\n", |
4809 | mdname(mddev)); | 4917 | mdname(mddev)); |
4810 | goto abort; | 4918 | goto abort; |
4811 | } | 4919 | } |
@@ -4820,14 +4928,43 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4820 | return ERR_PTR(-ENOMEM); | 4928 | return ERR_PTR(-ENOMEM); |
4821 | } | 4929 | } |
4822 | 4930 | ||
4931 | |||
4932 | static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) | ||
4933 | { | ||
4934 | switch (algo) { | ||
4935 | case ALGORITHM_PARITY_0: | ||
4936 | if (raid_disk < max_degraded) | ||
4937 | return 1; | ||
4938 | break; | ||
4939 | case ALGORITHM_PARITY_N: | ||
4940 | if (raid_disk >= raid_disks - max_degraded) | ||
4941 | return 1; | ||
4942 | break; | ||
4943 | case ALGORITHM_PARITY_0_6: | ||
4944 | if (raid_disk == 0 || | ||
4945 | raid_disk == raid_disks - 1) | ||
4946 | return 1; | ||
4947 | break; | ||
4948 | case ALGORITHM_LEFT_ASYMMETRIC_6: | ||
4949 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | ||
4950 | case ALGORITHM_LEFT_SYMMETRIC_6: | ||
4951 | case ALGORITHM_RIGHT_SYMMETRIC_6: | ||
4952 | if (raid_disk == raid_disks - 1) | ||
4953 | return 1; | ||
4954 | } | ||
4955 | return 0; | ||
4956 | } | ||
4957 | |||
4823 | static int run(mddev_t *mddev) | 4958 | static int run(mddev_t *mddev) |
4824 | { | 4959 | { |
4825 | raid5_conf_t *conf; | 4960 | raid5_conf_t *conf; |
4826 | int working_disks = 0, chunk_size; | 4961 | int working_disks = 0, chunk_size; |
4962 | int dirty_parity_disks = 0; | ||
4827 | mdk_rdev_t *rdev; | 4963 | mdk_rdev_t *rdev; |
4964 | sector_t reshape_offset = 0; | ||
4828 | 4965 | ||
4829 | if (mddev->recovery_cp != MaxSector) | 4966 | if (mddev->recovery_cp != MaxSector) |
4830 | printk(KERN_NOTICE "raid5: %s is not clean" | 4967 | printk(KERN_NOTICE "md/raid:%s: not clean" |
4831 | " -- starting background reconstruction\n", | 4968 | " -- starting background reconstruction\n", |
4832 | mdname(mddev)); | 4969 | mdname(mddev)); |
4833 | if (mddev->reshape_position != MaxSector) { | 4970 | if (mddev->reshape_position != MaxSector) { |
@@ -4841,7 +4978,7 @@ static int run(mddev_t *mddev) | |||
4841 | int max_degraded = (mddev->level == 6 ? 2 : 1); | 4978 | int max_degraded = (mddev->level == 6 ? 2 : 1); |
4842 | 4979 | ||
4843 | if (mddev->new_level != mddev->level) { | 4980 | if (mddev->new_level != mddev->level) { |
4844 | printk(KERN_ERR "raid5: %s: unsupported reshape " | 4981 | printk(KERN_ERR "md/raid:%s: unsupported reshape " |
4845 | "required - aborting.\n", | 4982 | "required - aborting.\n", |
4846 | mdname(mddev)); | 4983 | mdname(mddev)); |
4847 | return -EINVAL; | 4984 | return -EINVAL; |
@@ -4854,10 +4991,11 @@ static int run(mddev_t *mddev) | |||
4854 | here_new = mddev->reshape_position; | 4991 | here_new = mddev->reshape_position; |
4855 | if (sector_div(here_new, mddev->new_chunk_sectors * | 4992 | if (sector_div(here_new, mddev->new_chunk_sectors * |
4856 | (mddev->raid_disks - max_degraded))) { | 4993 | (mddev->raid_disks - max_degraded))) { |
4857 | printk(KERN_ERR "raid5: reshape_position not " | 4994 | printk(KERN_ERR "md/raid:%s: reshape_position not " |
4858 | "on a stripe boundary\n"); | 4995 | "on a stripe boundary\n", mdname(mddev)); |
4859 | return -EINVAL; | 4996 | return -EINVAL; |
4860 | } | 4997 | } |
4998 | reshape_offset = here_new * mddev->new_chunk_sectors; | ||
4861 | /* here_new is the stripe we will write to */ | 4999 | /* here_new is the stripe we will write to */ |
4862 | here_old = mddev->reshape_position; | 5000 | here_old = mddev->reshape_position; |
4863 | sector_div(here_old, mddev->chunk_sectors * | 5001 | sector_div(here_old, mddev->chunk_sectors * |
@@ -4875,8 +5013,9 @@ static int run(mddev_t *mddev) | |||
4875 | if ((here_new * mddev->new_chunk_sectors != | 5013 | if ((here_new * mddev->new_chunk_sectors != |
4876 | here_old * mddev->chunk_sectors) || | 5014 | here_old * mddev->chunk_sectors) || |
4877 | mddev->ro == 0) { | 5015 | mddev->ro == 0) { |
4878 | printk(KERN_ERR "raid5: in-place reshape must be started" | 5016 | printk(KERN_ERR "md/raid:%s: in-place reshape must be started" |
4879 | " in read-only mode - aborting\n"); | 5017 | " in read-only mode - aborting\n", |
5018 | mdname(mddev)); | ||
4880 | return -EINVAL; | 5019 | return -EINVAL; |
4881 | } | 5020 | } |
4882 | } else if (mddev->delta_disks < 0 | 5021 | } else if (mddev->delta_disks < 0 |
@@ -4885,11 +5024,13 @@ static int run(mddev_t *mddev) | |||
4885 | : (here_new * mddev->new_chunk_sectors >= | 5024 | : (here_new * mddev->new_chunk_sectors >= |
4886 | here_old * mddev->chunk_sectors)) { | 5025 | here_old * mddev->chunk_sectors)) { |
4887 | /* Reading from the same stripe as writing to - bad */ | 5026 | /* Reading from the same stripe as writing to - bad */ |
4888 | printk(KERN_ERR "raid5: reshape_position too early for " | 5027 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " |
4889 | "auto-recovery - aborting.\n"); | 5028 | "auto-recovery - aborting.\n", |
5029 | mdname(mddev)); | ||
4890 | return -EINVAL; | 5030 | return -EINVAL; |
4891 | } | 5031 | } |
4892 | printk(KERN_INFO "raid5: reshape will continue\n"); | 5032 | printk(KERN_INFO "md/raid:%s: reshape will continue\n", |
5033 | mdname(mddev)); | ||
4893 | /* OK, we should be able to continue; */ | 5034 | /* OK, we should be able to continue; */ |
4894 | } else { | 5035 | } else { |
4895 | BUG_ON(mddev->level != mddev->new_level); | 5036 | BUG_ON(mddev->level != mddev->new_level); |
@@ -4913,15 +5054,47 @@ static int run(mddev_t *mddev) | |||
4913 | /* | 5054 | /* |
4914 | * 0 for a fully functional array, 1 or 2 for a degraded array. | 5055 | * 0 for a fully functional array, 1 or 2 for a degraded array. |
4915 | */ | 5056 | */ |
4916 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5057 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
4917 | if (rdev->raid_disk >= 0 && | 5058 | if (rdev->raid_disk < 0) |
4918 | test_bit(In_sync, &rdev->flags)) | 5059 | continue; |
5060 | if (test_bit(In_sync, &rdev->flags)) { | ||
4919 | working_disks++; | 5061 | working_disks++; |
5062 | continue; | ||
5063 | } | ||
5064 | /* This disc is not fully in-sync. However if it | ||
5065 | * just stored parity (beyond the recovery_offset), | ||
5066 | * when we don't need to be concerned about the | ||
5066 | * then we don't need to be concerned about the | ||
5067 | * array being dirty. | ||
5068 | * When reshape goes 'backwards', we never have | ||
5069 | * partially completed devices, so we only need | ||
5070 | * to worry about reshape going forwards. | ||
5071 | */ | ||
5072 | /* Hack because v0.91 doesn't store recovery_offset properly. */ | ||
5073 | if (mddev->major_version == 0 && | ||
5074 | mddev->minor_version > 90) | ||
5075 | rdev->recovery_offset = reshape_offset; | ||
5076 | |||
5077 | if (rdev->recovery_offset < reshape_offset) { | ||
5078 | /* We need to check old and new layout */ | ||
5079 | if (!only_parity(rdev->raid_disk, | ||
5080 | conf->algorithm, | ||
5081 | conf->raid_disks, | ||
5082 | conf->max_degraded)) | ||
5083 | continue; | ||
5084 | } | ||
5085 | if (!only_parity(rdev->raid_disk, | ||
5086 | conf->prev_algo, | ||
5087 | conf->previous_raid_disks, | ||
5088 | conf->max_degraded)) | ||
5089 | continue; | ||
5090 | dirty_parity_disks++; | ||
5091 | } | ||
4920 | 5092 | ||
4921 | mddev->degraded = conf->raid_disks - working_disks; | 5093 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) |
5094 | - working_disks); | ||
4922 | 5095 | ||
4923 | if (mddev->degraded > conf->max_degraded) { | 5096 | if (has_failed(conf)) { |
4924 | printk(KERN_ERR "raid5: not enough operational devices for %s" | 5097 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
4925 | " (%d/%d failed)\n", | 5098 | " (%d/%d failed)\n", |
4926 | mdname(mddev), mddev->degraded, conf->raid_disks); | 5099 | mdname(mddev), mddev->degraded, conf->raid_disks); |
4927 | goto abort; | 5100 | goto abort; |
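
With this change ->degraded is measured against the larger of the old and new device counts during a reshape, and members that are stale only in parity regions are tallied separately as dirty_parity_disks so they do not trip the dirty-degraded refusal later in run(). A worked example of the arithmetic, with illustrative numbers:

#include <stdio.h>

int main(void)
{
	/* Growing a 4-disk RAID5 to 6 disks; five members are fully
	 * in sync and one is stale only beyond its recovery_offset,
	 * in regions that hold nothing but parity. */
	int raid_disks = 6;		/* post-reshape geometry */
	int previous_raid_disks = 4;	/* pre-reshape geometry  */
	int working_disks = 5;		/* fully In_sync members */
	int dirty_parity_disks = 1;	/* parity-only staleness */
	int max_degraded = 1;		/* RAID5 */

	int max_disks = raid_disks > previous_raid_disks ?
			raid_disks : previous_raid_disks;
	int degraded = max_disks - working_disks;

	printf("degraded = %d (failure limit %d)\n", degraded, max_degraded);

	/* The dirty-start refusal only triggers when more devices are
	 * missing than parity-only staleness can account for. */
	if (degraded > dirty_parity_disks)
		printf("not clean: refuse to start unless "
		       "ok_start_degraded is set\n");
	else
		printf("safe to start even though recovery_cp != MaxSector\n");
	return 0;
}
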
@@ -4931,36 +5104,36 @@ static int run(mddev_t *mddev) | |||
4931 | mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); | 5104 | mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); |
4932 | mddev->resync_max_sectors = mddev->dev_sectors; | 5105 | mddev->resync_max_sectors = mddev->dev_sectors; |
4933 | 5106 | ||
4934 | if (mddev->degraded > 0 && | 5107 | if (mddev->degraded > dirty_parity_disks && |
4935 | mddev->recovery_cp != MaxSector) { | 5108 | mddev->recovery_cp != MaxSector) { |
4936 | if (mddev->ok_start_degraded) | 5109 | if (mddev->ok_start_degraded) |
4937 | printk(KERN_WARNING | 5110 | printk(KERN_WARNING |
4938 | "raid5: starting dirty degraded array: %s" | 5111 | "md/raid:%s: starting dirty degraded array" |
4939 | "- data corruption possible.\n", | 5112 | " - data corruption possible.\n", |
4940 | mdname(mddev)); | 5113 | mdname(mddev)); |
4941 | else { | 5114 | else { |
4942 | printk(KERN_ERR | 5115 | printk(KERN_ERR |
4943 | "raid5: cannot start dirty degraded array for %s\n", | 5116 | "md/raid:%s: cannot start dirty degraded array.\n", |
4944 | mdname(mddev)); | 5117 | mdname(mddev)); |
4945 | goto abort; | 5118 | goto abort; |
4946 | } | 5119 | } |
4947 | } | 5120 | } |
4948 | 5121 | ||
4949 | if (mddev->degraded == 0) | 5122 | if (mddev->degraded == 0) |
4950 | printk("raid5: raid level %d set %s active with %d out of %d" | 5123 | printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" |
4951 | " devices, algorithm %d\n", conf->level, mdname(mddev), | 5124 | " devices, algorithm %d\n", mdname(mddev), conf->level, |
4952 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | 5125 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, |
4953 | mddev->new_layout); | 5126 | mddev->new_layout); |
4954 | else | 5127 | else |
4955 | printk(KERN_ALERT "raid5: raid level %d set %s active with %d" | 5128 | printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" |
4956 | " out of %d devices, algorithm %d\n", conf->level, | 5129 | " out of %d devices, algorithm %d\n", |
4957 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 5130 | mdname(mddev), conf->level, |
4958 | mddev->raid_disks, mddev->new_layout); | 5131 | mddev->raid_disks - mddev->degraded, |
5132 | mddev->raid_disks, mddev->new_layout); | ||
4959 | 5133 | ||
4960 | print_raid5_conf(conf); | 5134 | print_raid5_conf(conf); |
4961 | 5135 | ||
4962 | if (conf->reshape_progress != MaxSector) { | 5136 | if (conf->reshape_progress != MaxSector) { |
4963 | printk("...ok start reshape thread\n"); | ||
4964 | conf->reshape_safe = conf->reshape_progress; | 5137 | conf->reshape_safe = conf->reshape_progress; |
4965 | atomic_set(&conf->reshape_stripes, 0); | 5138 | atomic_set(&conf->reshape_stripes, 0); |
4966 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 5139 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
@@ -4983,9 +5156,11 @@ static int run(mddev_t *mddev) | |||
4983 | } | 5156 | } |
4984 | 5157 | ||
4985 | /* Ok, everything is just fine now */ | 5158 | /* Ok, everything is just fine now */ |
4986 | if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) | 5159 | if (mddev->to_remove == &raid5_attrs_group) |
5160 | mddev->to_remove = NULL; | ||
5161 | else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) | ||
4987 | printk(KERN_WARNING | 5162 | printk(KERN_WARNING |
4988 | "raid5: failed to create sysfs attributes for %s\n", | 5163 | "md/raid:%s: failed to create sysfs attributes.\n", |
4989 | mdname(mddev)); | 5164 | mdname(mddev)); |
4990 | 5165 | ||
4991 | mddev->queue->queue_lock = &conf->device_lock; | 5166 | mddev->queue->queue_lock = &conf->device_lock; |
@@ -5015,23 +5190,21 @@ abort: | |||
5015 | free_conf(conf); | 5190 | free_conf(conf); |
5016 | } | 5191 | } |
5017 | mddev->private = NULL; | 5192 | mddev->private = NULL; |
5018 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | 5193 | printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); |
5019 | return -EIO; | 5194 | return -EIO; |
5020 | } | 5195 | } |
5021 | 5196 | ||
5022 | |||
5023 | |||
5024 | static int stop(mddev_t *mddev) | 5197 | static int stop(mddev_t *mddev) |
5025 | { | 5198 | { |
5026 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 5199 | raid5_conf_t *conf = mddev->private; |
5027 | 5200 | ||
5028 | md_unregister_thread(mddev->thread); | 5201 | md_unregister_thread(mddev->thread); |
5029 | mddev->thread = NULL; | 5202 | mddev->thread = NULL; |
5030 | mddev->queue->backing_dev_info.congested_fn = NULL; | 5203 | mddev->queue->backing_dev_info.congested_fn = NULL; |
5031 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 5204 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
5032 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); | ||
5033 | free_conf(conf); | 5205 | free_conf(conf); |
5034 | mddev->private = NULL; | 5206 | mddev->private = NULL; |
5207 | mddev->to_remove = &raid5_attrs_group; | ||
5035 | return 0; | 5208 | return 0; |
5036 | } | 5209 | } |
5037 | 5210 | ||
@@ -5072,7 +5245,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf) | |||
5072 | 5245 | ||
5073 | static void status(struct seq_file *seq, mddev_t *mddev) | 5246 | static void status(struct seq_file *seq, mddev_t *mddev) |
5074 | { | 5247 | { |
5075 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 5248 | raid5_conf_t *conf = mddev->private; |
5076 | int i; | 5249 | int i; |
5077 | 5250 | ||
5078 | seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, | 5251 | seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, |
@@ -5094,21 +5267,22 @@ static void print_raid5_conf (raid5_conf_t *conf) | |||
5094 | int i; | 5267 | int i; |
5095 | struct disk_info *tmp; | 5268 | struct disk_info *tmp; |
5096 | 5269 | ||
5097 | printk("RAID5 conf printout:\n"); | 5270 | printk(KERN_DEBUG "RAID conf printout:\n"); |
5098 | if (!conf) { | 5271 | if (!conf) { |
5099 | printk("(conf==NULL)\n"); | 5272 | printk("(conf==NULL)\n"); |
5100 | return; | 5273 | return; |
5101 | } | 5274 | } |
5102 | printk(" --- rd:%d wd:%d\n", conf->raid_disks, | 5275 | printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, |
5103 | conf->raid_disks - conf->mddev->degraded); | 5276 | conf->raid_disks, |
5277 | conf->raid_disks - conf->mddev->degraded); | ||
5104 | 5278 | ||
5105 | for (i = 0; i < conf->raid_disks; i++) { | 5279 | for (i = 0; i < conf->raid_disks; i++) { |
5106 | char b[BDEVNAME_SIZE]; | 5280 | char b[BDEVNAME_SIZE]; |
5107 | tmp = conf->disks + i; | 5281 | tmp = conf->disks + i; |
5108 | if (tmp->rdev) | 5282 | if (tmp->rdev) |
5109 | printk(" disk %d, o:%d, dev:%s\n", | 5283 | printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", |
5110 | i, !test_bit(Faulty, &tmp->rdev->flags), | 5284 | i, !test_bit(Faulty, &tmp->rdev->flags), |
5111 | bdevname(tmp->rdev->bdev,b)); | 5285 | bdevname(tmp->rdev->bdev, b)); |
5112 | } | 5286 | } |
5113 | } | 5287 | } |
5114 | 5288 | ||
@@ -5121,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
5121 | for (i = 0; i < conf->raid_disks; i++) { | 5295 | for (i = 0; i < conf->raid_disks; i++) { |
5122 | tmp = conf->disks + i; | 5296 | tmp = conf->disks + i; |
5123 | if (tmp->rdev | 5297 | if (tmp->rdev |
5298 | && tmp->rdev->recovery_offset == MaxSector | ||
5124 | && !test_bit(Faulty, &tmp->rdev->flags) | 5299 | && !test_bit(Faulty, &tmp->rdev->flags) |
5125 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5300 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
5126 | unsigned long flags; | 5301 | unsigned long flags; |
@@ -5156,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
5156 | * isn't possible. | 5331 | * isn't possible. |
5157 | */ | 5332 | */ |
5158 | if (!test_bit(Faulty, &rdev->flags) && | 5333 | if (!test_bit(Faulty, &rdev->flags) && |
5159 | mddev->degraded <= conf->max_degraded && | 5334 | !has_failed(conf) && |
5160 | number < conf->raid_disks) { | 5335 | number < conf->raid_disks) { |
5161 | err = -EBUSY; | 5336 | err = -EBUSY; |
5162 | goto abort; | 5337 | goto abort; |
@@ -5184,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5184 | int first = 0; | 5359 | int first = 0; |
5185 | int last = conf->raid_disks - 1; | 5360 | int last = conf->raid_disks - 1; |
5186 | 5361 | ||
5187 | if (mddev->degraded > conf->max_degraded) | 5362 | if (has_failed(conf)) |
5188 | /* no point adding a device */ | 5363 | /* no point adding a device */ |
5189 | return -EINVAL; | 5364 | return -EINVAL; |
5190 | 5365 | ||
@@ -5231,7 +5406,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5231 | raid5_size(mddev, sectors, mddev->raid_disks)) | 5406 | raid5_size(mddev, sectors, mddev->raid_disks)) |
5232 | return -EINVAL; | 5407 | return -EINVAL; |
5233 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5408 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5234 | mddev->changed = 1; | ||
5235 | revalidate_disk(mddev->gendisk); | 5409 | revalidate_disk(mddev->gendisk); |
5236 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { | 5410 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { |
5237 | mddev->recovery_cp = mddev->dev_sectors; | 5411 | mddev->recovery_cp = mddev->dev_sectors; |
@@ -5257,7 +5431,8 @@ static int check_stripe_cache(mddev_t *mddev) | |||
5257 | > conf->max_nr_stripes || | 5431 | > conf->max_nr_stripes || |
5258 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | 5432 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 |
5259 | > conf->max_nr_stripes) { | 5433 | > conf->max_nr_stripes) { |
5260 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | 5434 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", |
5435 | mdname(mddev), | ||
5261 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | 5436 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) |
5262 | / STRIPE_SIZE)*4); | 5437 | / STRIPE_SIZE)*4); |
5263 | return 0; | 5438 | return 0; |
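
check_stripe_cache() refuses a reshape unless the stripe cache can hold four full chunks of both the old and the new geometry; the message now also names the array. A quick sketch of the arithmetic, assuming STRIPE_SIZE is one 4KiB page and the commonly used default cache of 256 stripes (both values are assumptions here):

#include <stdio.h>

int main(void)
{
	unsigned int stripe_size = 4096;	/* STRIPE_SIZE == PAGE_SIZE   */
	unsigned int max_nr_stripes = 256;	/* assumed default cache size */
	unsigned int chunk_sectors = 128;	/* 64KiB current chunk        */
	unsigned int new_chunk_sectors = 1024;	/* 512KiB requested chunk     */

	unsigned int need_old = ((chunk_sectors << 9) / stripe_size) * 4;
	unsigned int need_new = ((new_chunk_sectors << 9) / stripe_size) * 4;
	unsigned int need = need_old > need_new ? need_old : need_new;

	printf("need %u stripes, have %u\n", need, max_nr_stripes);
	if (need > max_nr_stripes)
		printf("reshape refused: raise stripe_cache_size to at "
		       "least %u first\n", need);
	return 0;
}
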
@@ -5276,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) | |||
5276 | if (mddev->bitmap) | 5451 | if (mddev->bitmap) |
5277 | /* Cannot grow a bitmap yet */ | 5452 | /* Cannot grow a bitmap yet */ |
5278 | return -EBUSY; | 5453 | return -EBUSY; |
5279 | if (mddev->degraded > conf->max_degraded) | 5454 | if (has_failed(conf)) |
5280 | return -EINVAL; | 5455 | return -EINVAL; |
5281 | if (mddev->delta_disks < 0) { | 5456 | if (mddev->delta_disks < 0) { |
5282 | /* We might be able to shrink, but the devices must | 5457 | /* We might be able to shrink, but the devices must |
@@ -5328,7 +5503,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5328 | */ | 5503 | */ |
5329 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) | 5504 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) |
5330 | < mddev->array_sectors) { | 5505 | < mddev->array_sectors) { |
5331 | printk(KERN_ERR "md: %s: array size must be reduced " | 5506 | printk(KERN_ERR "md/raid:%s: array size must be reduced " |
5332 | "before number of disks\n", mdname(mddev)); | 5507 | "before number of disks\n", mdname(mddev)); |
5333 | return -EINVAL; | 5508 | return -EINVAL; |
5334 | } | 5509 | } |
@@ -5351,29 +5526,39 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5351 | 5526 | ||
5352 | /* Add some new drives, as many as will fit. | 5527 | /* Add some new drives, as many as will fit. |
5353 | * We know there are enough to make the newly sized array work. | 5528 | * We know there are enough to make the newly sized array work. |
5529 | * Don't add devices if we are reducing the number of | ||
5530 | * devices in the array. This is because it is not possible | ||
5531 | * to correctly record the "partially reconstructed" state of | ||
5532 | * such devices during the reshape and confusion could result. | ||
5354 | */ | 5533 | */ |
5355 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5534 | if (mddev->delta_disks >= 0) |
5535 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
5356 | if (rdev->raid_disk < 0 && | 5536 | if (rdev->raid_disk < 0 && |
5357 | !test_bit(Faulty, &rdev->flags)) { | 5537 | !test_bit(Faulty, &rdev->flags)) { |
5358 | if (raid5_add_disk(mddev, rdev) == 0) { | 5538 | if (raid5_add_disk(mddev, rdev) == 0) { |
5359 | char nm[20]; | 5539 | char nm[20]; |
5360 | set_bit(In_sync, &rdev->flags); | 5540 | if (rdev->raid_disk >= conf->previous_raid_disks) { |
5361 | added_devices++; | 5541 | set_bit(In_sync, &rdev->flags); |
5362 | rdev->recovery_offset = 0; | 5542 | added_devices++; |
5543 | } else | ||
5544 | rdev->recovery_offset = 0; | ||
5363 | sprintf(nm, "rd%d", rdev->raid_disk); | 5545 | sprintf(nm, "rd%d", rdev->raid_disk); |
5364 | if (sysfs_create_link(&mddev->kobj, | 5546 | if (sysfs_create_link(&mddev->kobj, |
5365 | &rdev->kobj, nm)) | 5547 | &rdev->kobj, nm)) |
5366 | printk(KERN_WARNING | 5548 | printk(KERN_WARNING |
5367 | "raid5: failed to create " | 5549 | "md/raid:%s: failed to create " |
5368 | " link %s for %s\n", | 5550 | " link %s\n", |
5369 | nm, mdname(mddev)); | 5551 | mdname(mddev), nm); |
5370 | } else | 5552 | } else |
5371 | break; | 5553 | break; |
5372 | } | 5554 | } |
5373 | 5555 | ||
5556 | /* When a reshape changes the number of devices, ->degraded | ||
5557 | * is measured against the larger of the pre and post number of | ||
5558 | * devices.*/ | ||
5374 | if (mddev->delta_disks > 0) { | 5559 | if (mddev->delta_disks > 0) { |
5375 | spin_lock_irqsave(&conf->device_lock, flags); | 5560 | spin_lock_irqsave(&conf->device_lock, flags); |
5376 | mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) | 5561 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) |
5377 | - added_devices; | 5562 | - added_devices; |
5378 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5563 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5379 | } | 5564 | } |
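
raid5_start_reshape() now adds to ->degraded instead of overwriting it, and spares are only pulled in when the device count is growing; slots at or beyond previous_raid_disks are marked In_sync immediately, while re-added old slots start recovery from offset 0. A small sketch of the degraded adjustment when growing, with illustrative numbers:

#include <stdio.h>

int main(void)
{
	int previous_raid_disks = 4;
	int raid_disks = 6;		/* growing by delta_disks = 2   */
	int added_devices = 1;		/* only one spare was available */
	int degraded = 0;		/* array was clean beforehand   */

	/* One of the two new slots has no device yet, so the array is
	 * one short of the post-reshape geometry. */
	degraded += (raid_disks - previous_raid_disks) - added_devices;
	printf("degraded after start_reshape = %d\n", degraded);
	return 0;
}
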
@@ -5440,7 +5625,6 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5440 | if (mddev->delta_disks > 0) { | 5625 | if (mddev->delta_disks > 0) { |
5441 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 5626 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
5442 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5627 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5443 | mddev->changed = 1; | ||
5444 | revalidate_disk(mddev->gendisk); | 5628 | revalidate_disk(mddev->gendisk); |
5445 | } else { | 5629 | } else { |
5446 | int d; | 5630 | int d; |
@@ -5505,6 +5689,29 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
5505 | } | 5689 | } |
5506 | 5690 | ||
5507 | 5691 | ||
5692 | static void *raid45_takeover_raid0(mddev_t *mddev, int level) | ||
5693 | { | ||
5694 | struct raid0_private_data *raid0_priv = mddev->private; | ||
5695 | |||
5696 | /* for raid0 takeover only one zone is supported */ | ||
5697 | if (raid0_priv->nr_strip_zones > 1) { | ||
5698 | printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", | ||
5699 | mdname(mddev)); | ||
5700 | return ERR_PTR(-EINVAL); | ||
5701 | } | ||
5702 | |||
5703 | mddev->new_level = level; | ||
5704 | mddev->new_layout = ALGORITHM_PARITY_N; | ||
5705 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
5706 | mddev->raid_disks += 1; | ||
5707 | mddev->delta_disks = 1; | ||
5708 | /* make sure it will be not marked as dirty */ | ||
5709 | /* make sure it will not be marked as dirty */ | ||
5709 | mddev->recovery_cp = MaxSector; | ||
5710 | |||
5711 | return setup_conf(mddev); | ||
5712 | } | ||
5713 | |||
5714 | |||
5508 | static void *raid5_takeover_raid1(mddev_t *mddev) | 5715 | static void *raid5_takeover_raid1(mddev_t *mddev) |
5509 | { | 5716 | { |
5510 | int chunksect; | 5717 | int chunksect; |
@@ -5629,12 +5836,13 @@ static int raid6_check_reshape(mddev_t *mddev) | |||
5629 | static void *raid5_takeover(mddev_t *mddev) | 5836 | static void *raid5_takeover(mddev_t *mddev) |
5630 | { | 5837 | { |
5631 | /* raid5 can take over: | 5838 | /* raid5 can take over: |
5632 | * raid0 - if all devices are the same - make it a raid4 layout | 5839 | * raid0 - if there is only one strip zone - make it a raid4 layout |
5633 | * raid1 - if there are two drives. We need to know the chunk size | 5840 | * raid1 - if there are two drives. We need to know the chunk size |
5634 | * raid4 - trivial - just use a raid4 layout. | 5841 | * raid4 - trivial - just use a raid4 layout. |
5635 | * raid6 - Providing it is a *_6 layout | 5842 | * raid6 - Providing it is a *_6 layout |
5636 | */ | 5843 | */ |
5637 | 5844 | if (mddev->level == 0) | |
5845 | return raid45_takeover_raid0(mddev, 5); | ||
5638 | if (mddev->level == 1) | 5846 | if (mddev->level == 1) |
5639 | return raid5_takeover_raid1(mddev); | 5847 | return raid5_takeover_raid1(mddev); |
5640 | if (mddev->level == 4) { | 5848 | if (mddev->level == 4) { |
@@ -5648,6 +5856,22 @@ static void *raid5_takeover(mddev_t *mddev) | |||
5648 | return ERR_PTR(-EINVAL); | 5856 | return ERR_PTR(-EINVAL); |
5649 | } | 5857 | } |
5650 | 5858 | ||
5859 | static void *raid4_takeover(mddev_t *mddev) | ||
5860 | { | ||
5861 | /* raid4 can take over: | ||
5862 | * raid0 - if there is only one strip zone | ||
5863 | * raid5 - if layout is right | ||
5864 | */ | ||
5865 | if (mddev->level == 0) | ||
5866 | return raid45_takeover_raid0(mddev, 4); | ||
5867 | if (mddev->level == 5 && | ||
5868 | mddev->layout == ALGORITHM_PARITY_N) { | ||
5869 | mddev->new_layout = 0; | ||
5870 | mddev->new_level = 4; | ||
5871 | return setup_conf(mddev); | ||
5872 | } | ||
5873 | return ERR_PTR(-EINVAL); | ||
5874 | } | ||
5651 | 5875 | ||
5652 | static struct mdk_personality raid5_personality; | 5876 | static struct mdk_personality raid5_personality; |
5653 | 5877 | ||
@@ -5763,6 +5987,7 @@ static struct mdk_personality raid4_personality = | |||
5763 | .start_reshape = raid5_start_reshape, | 5987 | .start_reshape = raid5_start_reshape, |
5764 | .finish_reshape = raid5_finish_reshape, | 5988 | .finish_reshape = raid5_finish_reshape, |
5765 | .quiesce = raid5_quiesce, | 5989 | .quiesce = raid5_quiesce, |
5990 | .takeover = raid4_takeover, | ||
5766 | }; | 5991 | }; |
5767 | 5992 | ||
5768 | static int __init raid5_init(void) | 5993 | static int __init raid5_init(void) |
@@ -5783,6 +6008,7 @@ static void raid5_exit(void) | |||
5783 | module_init(raid5_init); | 6008 | module_init(raid5_init); |
5784 | module_exit(raid5_exit); | 6009 | module_exit(raid5_exit); |
5785 | MODULE_LICENSE("GPL"); | 6010 | MODULE_LICENSE("GPL"); |
6011 | MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); | ||
5786 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ | 6012 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ |
5787 | MODULE_ALIAS("md-raid5"); | 6013 | MODULE_ALIAS("md-raid5"); |
5788 | MODULE_ALIAS("md-raid4"); | 6014 | MODULE_ALIAS("md-raid4"); |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2390e0e83daf..0f86f5e36724 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -214,12 +214,20 @@ struct stripe_head { | |||
214 | int disks; /* disks in stripe */ | 214 | int disks; /* disks in stripe */ |
215 | enum check_states check_state; | 215 | enum check_states check_state; |
216 | enum reconstruct_states reconstruct_state; | 216 | enum reconstruct_states reconstruct_state; |
217 | /* stripe_operations | 217 | /** |
218 | * struct stripe_operations | ||
218 | * @target - STRIPE_OP_COMPUTE_BLK target | 219 | * @target - STRIPE_OP_COMPUTE_BLK target |
220 | * @target2 - 2nd compute target in the raid6 case | ||
221 | * @zero_sum_result - P and Q verification flags | ||
222 | * @request - async service request flags for raid_run_ops | ||
219 | */ | 223 | */ |
220 | struct stripe_operations { | 224 | struct stripe_operations { |
221 | int target, target2; | 225 | int target, target2; |
222 | enum sum_check_flags zero_sum_result; | 226 | enum sum_check_flags zero_sum_result; |
227 | #ifdef CONFIG_MULTICORE_RAID456 | ||
228 | unsigned long request; | ||
229 | wait_queue_head_t wait_for_ops; | ||
230 | #endif | ||
223 | } ops; | 231 | } ops; |
224 | struct r5dev { | 232 | struct r5dev { |
225 | struct bio req; | 233 | struct bio req; |
@@ -294,6 +302,8 @@ struct r6_state { | |||
294 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 302 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ |
295 | #define STRIPE_BIOFILL_RUN 14 | 303 | #define STRIPE_BIOFILL_RUN 14 |
296 | #define STRIPE_COMPUTE_RUN 15 | 304 | #define STRIPE_COMPUTE_RUN 15 |
305 | #define STRIPE_OPS_REQ_PENDING 16 | ||
306 | |||
297 | /* | 307 | /* |
298 | * Operation request flags | 308 | * Operation request flags |
299 | */ | 309 | */ |
@@ -395,7 +405,7 @@ struct raid5_private_data { | |||
395 | * lists and performing address | 405 | * lists and performing address |
396 | * conversions | 406 | * conversions |
397 | */ | 407 | */ |
398 | } *percpu; | 408 | } __percpu *percpu; |
399 | size_t scribble_len; /* size of scribble region must be | 409 | size_t scribble_len; /* size of scribble region must be |
400 | * associated with conf to handle | 410 | * associated with conf to handle |
401 | * cpu hotplug while reshaping | 411 | * cpu hotplug while reshaping |
@@ -478,7 +488,7 @@ static inline int algorithm_valid_raid6(int layout) | |||
478 | { | 488 | { |
479 | return (layout >= 0 && layout <= 5) | 489 | return (layout >= 0 && layout <= 5) |
480 | || | 490 | || |
481 | (layout == 8 || layout == 10) | 491 | (layout >= 8 && layout <= 10) |
482 | || | 492 | || |
483 | (layout >= 16 && layout <= 20); | 493 | (layout >= 16 && layout <= 20); |
484 | } | 494 | } |
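
The last hunk widens the RAID6 layout check from "layout == 8 || layout == 10" to the full 8..10 range, so the middle DDF layout (ALGORITHM_ROTATING_N_RESTART, value 9 in raid5.h) is no longer rejected. A quick enumeration with the corrected predicate:

#include <stdio.h>

/* Corrected predicate from the hunk above. */
static int algorithm_valid_raid6(int layout)
{
	return (layout >= 0 && layout <= 5)
		||
		(layout >= 8 && layout <= 10)
		||
		(layout >= 16 && layout <= 20);
}

int main(void)
{
	int layout;

	/* Layout 9 now appears in the list; the old '== 8 || == 10'
	 * test wrongly refused it. */
	printf("valid raid6 layouts:");
	for (layout = 0; layout <= 20; layout++)
		if (algorithm_valid_raid6(layout))
			printf(" %d", layout);
	printf("\n");
	return 0;
}
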